Whamcloud - gitweb
Landing b_hd_newconfig on HEAD
author: nathan <nathan>
Sat, 10 Feb 2007 00:05:05 +0000 (00:05 +0000)
committer: nathan <nathan>
Sat, 10 Feb 2007 00:05:05 +0000 (00:05 +0000)
349 files changed:
lnet/ChangeLog
lnet/Kernelenv.in
lnet/Kernelenv.mk
lnet/Makefile.in
lnet/autoMakefile.am
lnet/autoconf/Makefile.am
lnet/autoconf/lustre-lnet.m4
lnet/include/Makefile.am
lnet/include/libcfs/Makefile.am
lnet/include/libcfs/curproc.h
lnet/include/libcfs/darwin/Makefile.am
lnet/include/libcfs/darwin/darwin-fs.h
lnet/include/libcfs/darwin/darwin-lock.h
lnet/include/libcfs/darwin/darwin-mem.h
lnet/include/libcfs/darwin/darwin-prim.h
lnet/include/libcfs/darwin/darwin-sync.h
lnet/include/libcfs/darwin/darwin-tcpip.h [new file with mode: 0644]
lnet/include/libcfs/darwin/darwin-time.h
lnet/include/libcfs/darwin/darwin-types.h
lnet/include/libcfs/darwin/darwin-utils.h
lnet/include/libcfs/darwin/kp30.h
lnet/include/libcfs/darwin/libcfs.h
lnet/include/libcfs/darwin/lltrace.h
lnet/include/libcfs/kp30.h
lnet/include/libcfs/libcfs.h
lnet/include/libcfs/linux/Makefile.am
lnet/include/libcfs/linux/kp30.h
lnet/include/libcfs/linux/libcfs.h
lnet/include/libcfs/linux/linux-fs.h
lnet/include/libcfs/linux/linux-lock.h
lnet/include/libcfs/linux/linux-mem.h
lnet/include/libcfs/linux/linux-prim.h
lnet/include/libcfs/linux/linux-tcpip.h [new file with mode: 0644]
lnet/include/libcfs/linux/linux-time.h
lnet/include/libcfs/linux/lltrace.h
lnet/include/libcfs/linux/portals_compat25.h
lnet/include/libcfs/linux/portals_utils.h
lnet/include/libcfs/list.h
lnet/include/libcfs/lltrace.h
lnet/include/libcfs/portals_lib.h [deleted file]
lnet/include/libcfs/portals_utils.h
lnet/include/libcfs/types.h [new file with mode: 0755]
lnet/include/libcfs/user-lock.h
lnet/include/libcfs/user-prim.h
lnet/include/libcfs/user-time.h
lnet/include/libcfs/winnt/kp30.h [new file with mode: 0644]
lnet/include/libcfs/winnt/libcfs.h [new file with mode: 0644]
lnet/include/libcfs/winnt/lltrace.h [moved from lnet/include/libcfs/darwin/portals_lib.h with 69% similarity]
lnet/include/libcfs/winnt/portals_compat25.h [moved from lnet/include/libcfs/linux/portals_lib.h with 65% similarity]
lnet/include/libcfs/winnt/portals_utils.h [new file with mode: 0644]
lnet/include/libcfs/winnt/winnt-fs.h [new file with mode: 0644]
lnet/include/libcfs/winnt/winnt-lock.h [new file with mode: 0644]
lnet/include/libcfs/winnt/winnt-mem.h [new file with mode: 0644]
lnet/include/libcfs/winnt/winnt-prim.h [new file with mode: 0644]
lnet/include/libcfs/winnt/winnt-tcpip.h [new file with mode: 0644]
lnet/include/libcfs/winnt/winnt-time.h [new file with mode: 0644]
lnet/include/libcfs/winnt/winnt-types.h [new file with mode: 0644]
lnet/include/lnet/Makefile.am
lnet/include/lnet/api-support.h
lnet/include/lnet/api.h
lnet/include/lnet/build_check.h [deleted file]
lnet/include/lnet/darwin/Makefile.am
lnet/include/lnet/darwin/api-support.h [new file with mode: 0644]
lnet/include/lnet/darwin/lib-lnet.h
lnet/include/lnet/darwin/lib-p30.h [deleted file]
lnet/include/lnet/darwin/lib-types.h
lnet/include/lnet/darwin/lnet.h
lnet/include/lnet/darwin/p30.h [deleted file]
lnet/include/lnet/errno.h [deleted file]
lnet/include/lnet/internal.h [deleted file]
lnet/include/lnet/kpr.h [deleted file]
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-p30.h [deleted file]
lnet/include/lnet/lib-types.h
lnet/include/lnet/linux/Makefile.am
lnet/include/lnet/linux/api-support.h [new file with mode: 0644]
lnet/include/lnet/linux/lib-lnet.h
lnet/include/lnet/linux/lib-p30.h [deleted file]
lnet/include/lnet/linux/lib-types.h
lnet/include/lnet/linux/lnet.h
lnet/include/lnet/linux/p30.h [deleted file]
lnet/include/lnet/lnet.h
lnet/include/lnet/lnetctl.h
lnet/include/lnet/myrnal.h [deleted file]
lnet/include/lnet/nal.h [deleted file]
lnet/include/lnet/nalids.h [deleted file]
lnet/include/lnet/p30.h [deleted file]
lnet/include/lnet/ptlctl.h [deleted file]
lnet/include/lnet/ptllnd.h [new file with mode: 0755]
lnet/include/lnet/ptllnd_wire.h [new file with mode: 0644]
lnet/include/lnet/socklnd.h
lnet/include/lnet/stringtab.h [deleted file]
lnet/include/lnet/types.h
lnet/include/lnet/winnt/api-support.h [new file with mode: 0644]
lnet/include/lnet/winnt/lib-lnet.h [new file with mode: 0644]
lnet/include/lnet/winnt/lib-types.h [new file with mode: 0644]
lnet/include/lnet/winnt/lnet.h [new file with mode: 0644]
lnet/klnds/Makefile.in
lnet/klnds/autoMakefile.am
lnet/klnds/ciblnd/.cvsignore [moved from lnet/klnds/lolnd/.cvsignore with 100% similarity]
lnet/klnds/ciblnd/Makefile.in [new file with mode: 0644]
lnet/klnds/ciblnd/autoMakefile.am [moved from lnet/klnds/lolnd/autoMakefile.am with 54% similarity]
lnet/klnds/ciblnd/ciblnd.c [new file with mode: 0644]
lnet/klnds/ciblnd/ciblnd_cb.c [new file with mode: 0644]
lnet/klnds/ciblnd/ciblnd_modparams.c [new file with mode: 0644]
lnet/klnds/gmlnd/Makefile.in
lnet/klnds/gmlnd/README [new file with mode: 0644]
lnet/klnds/gmlnd/autoMakefile.am
lnet/klnds/gmlnd/gm-reg-phys.patch [new file with mode: 0644]
lnet/klnds/gmlnd/gmlnd.h
lnet/klnds/gmlnd/gmlnd_api.c
lnet/klnds/gmlnd/gmlnd_cb.c
lnet/klnds/gmlnd/gmlnd_comm.c
lnet/klnds/gmlnd/gmlnd_module.c
lnet/klnds/gmlnd/gmlnd_utils.c
lnet/klnds/iiblnd/Makefile.in
lnet/klnds/iiblnd/autoMakefile.am
lnet/klnds/iiblnd/iiblnd.c
lnet/klnds/iiblnd/iiblnd.h
lnet/klnds/iiblnd/iiblnd_cb.c
lnet/klnds/iiblnd/iiblnd_modparams.c [new file with mode: 0644]
lnet/klnds/lolnd/Makefile.in [deleted file]
lnet/klnds/lolnd/lolnd.c [deleted file]
lnet/klnds/lolnd/lolnd.h [deleted file]
lnet/klnds/lolnd/lolnd_cb.c [deleted file]
lnet/klnds/mxlnd/.cvsignore [new file with mode: 0644]
lnet/klnds/mxlnd/Makefile.in [new file with mode: 0644]
lnet/klnds/mxlnd/README [new file with mode: 0644]
lnet/klnds/mxlnd/autoMakefile.am [moved from lnet/router/autoMakefile.am with 52% similarity]
lnet/klnds/mxlnd/mxlnd.c [new file with mode: 0644]
lnet/klnds/mxlnd/mxlnd.h [new file with mode: 0644]
lnet/klnds/mxlnd/mxlnd_cb.c [new file with mode: 0644]
lnet/klnds/mxlnd/mxlnd_modparams.c [new file with mode: 0644]
lnet/klnds/mxlnd/mxlnd_wire.h [new file with mode: 0644]
lnet/klnds/o2iblnd/.cvsignore [new file with mode: 0644]
lnet/klnds/o2iblnd/Makefile.in [new file with mode: 0644]
lnet/klnds/o2iblnd/autoMakefile.am [new file with mode: 0644]
lnet/klnds/o2iblnd/o2iblnd.c [new file with mode: 0644]
lnet/klnds/o2iblnd/o2iblnd.h [new file with mode: 0644]
lnet/klnds/o2iblnd/o2iblnd_cb.c [new file with mode: 0644]
lnet/klnds/o2iblnd/o2iblnd_modparams.c [new file with mode: 0644]
lnet/klnds/openiblnd/Makefile.in
lnet/klnds/openiblnd/autoMakefile.am
lnet/klnds/openiblnd/openiblnd.c
lnet/klnds/openiblnd/openiblnd.h
lnet/klnds/openiblnd/openiblnd_cb.c
lnet/klnds/openiblnd/openiblnd_modparams.c [new file with mode: 0644]
lnet/klnds/ptllnd/.cvsignore [new file with mode: 0644]
lnet/klnds/ptllnd/Makefile.in [new file with mode: 0755]
lnet/klnds/ptllnd/README [new file with mode: 0644]
lnet/klnds/ptllnd/autoMakefile.am [new file with mode: 0755]
lnet/klnds/ptllnd/ptllnd.c [new file with mode: 0755]
lnet/klnds/ptllnd/ptllnd.h [new file with mode: 0755]
lnet/klnds/ptllnd/ptllnd_cb.c [new file with mode: 0644]
lnet/klnds/ptllnd/ptllnd_modparams.c [new file with mode: 0644]
lnet/klnds/ptllnd/ptllnd_peer.c [new file with mode: 0644]
lnet/klnds/ptllnd/ptllnd_ptltrace.c [new file with mode: 0644]
lnet/klnds/ptllnd/ptllnd_rx_buf.c [new file with mode: 0644]
lnet/klnds/ptllnd/ptllnd_tx.c [new file with mode: 0644]
lnet/klnds/ptllnd/wirecheck.c [new file with mode: 0644]
lnet/klnds/qswlnd/Makefile.in
lnet/klnds/qswlnd/autoMakefile.am
lnet/klnds/qswlnd/qswlnd.c
lnet/klnds/qswlnd/qswlnd.h
lnet/klnds/qswlnd/qswlnd_cb.c
lnet/klnds/qswlnd/qswlnd_modparams.c [new file with mode: 0644]
lnet/klnds/ralnd/Makefile.in
lnet/klnds/ralnd/autoMakefile.am
lnet/klnds/ralnd/ralnd.c
lnet/klnds/ralnd/ralnd.h
lnet/klnds/ralnd/ralnd_cb.c
lnet/klnds/ralnd/ralnd_modparams.c [new file with mode: 0644]
lnet/klnds/socklnd/Info.plist
lnet/klnds/socklnd/Makefile.in
lnet/klnds/socklnd/autoMakefile.am
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd.h
lnet/klnds/socklnd/socklnd_cb.c
lnet/klnds/socklnd/socklnd_lib-darwin.c
lnet/klnds/socklnd/socklnd_lib-darwin.h
lnet/klnds/socklnd/socklnd_lib-linux.c
lnet/klnds/socklnd/socklnd_lib-linux.h
lnet/klnds/socklnd/socklnd_lib-winnt.c [new file with mode: 0755]
lnet/klnds/socklnd/socklnd_lib-winnt.h [new file with mode: 0755]
lnet/klnds/socklnd/socklnd_modparams.c [new file with mode: 0644]
lnet/klnds/viblnd/Makefile.in
lnet/klnds/viblnd/autoMakefile.am
lnet/klnds/viblnd/viblnd.c
lnet/klnds/viblnd/viblnd.h
lnet/klnds/viblnd/viblnd_cb.c
lnet/klnds/viblnd/viblnd_modparams.c [new file with mode: 0644]
lnet/klnds/viblnd/viblnd_wire.h
lnet/klnds/viblnd/wirecheck.c
lnet/libcfs/Info.plist
lnet/libcfs/Makefile.in
lnet/libcfs/autoMakefile.am
lnet/libcfs/darwin/Makefile.am
lnet/libcfs/darwin/darwin-curproc.c
lnet/libcfs/darwin/darwin-debug.c
lnet/libcfs/darwin/darwin-fs.c
lnet/libcfs/darwin/darwin-internal.h [new file with mode: 0644]
lnet/libcfs/darwin/darwin-mem.c
lnet/libcfs/darwin/darwin-module.c
lnet/libcfs/darwin/darwin-prim.c
lnet/libcfs/darwin/darwin-proc.c
lnet/libcfs/darwin/darwin-sync.c
lnet/libcfs/darwin/darwin-tcpip.c [new file with mode: 0644]
lnet/libcfs/darwin/darwin-tracefile.c
lnet/libcfs/darwin/darwin-utils.c
lnet/libcfs/debug.c
lnet/libcfs/linux/Makefile.am
lnet/libcfs/linux/linux-curproc.c
lnet/libcfs/linux/linux-debug.c
lnet/libcfs/linux/linux-fs.c
lnet/libcfs/linux/linux-lock.c
lnet/libcfs/linux/linux-lwt.c
lnet/libcfs/linux/linux-mem.c
lnet/libcfs/linux/linux-module.c
lnet/libcfs/linux/linux-prim.c
lnet/libcfs/linux/linux-proc.c
lnet/libcfs/linux/linux-sync.c
lnet/libcfs/linux/linux-tcpip.c [new file with mode: 0644]
lnet/libcfs/linux/linux-tracefile.c
lnet/libcfs/linux/linux-utils.c
lnet/libcfs/lwt.c
lnet/libcfs/misc.c [new file with mode: 0644]
lnet/libcfs/module.c
lnet/libcfs/nidstrings.c [new file with mode: 0644]
lnet/libcfs/tracefile.c
lnet/libcfs/tracefile.h
lnet/libcfs/user-lock.c
lnet/libcfs/user-prim.c
lnet/libcfs/watchdog.c
lnet/libcfs/winnt/winnt-curproc.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-debug.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-fs.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-lock.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-lwt.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-mem.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-module.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-prim.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-proc.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-sync.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-tcpip.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-tracefile.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-usr.c [new file with mode: 0644]
lnet/libcfs/winnt/winnt-utils.c [new file with mode: 0644]
lnet/lnet/Info.plist
lnet/lnet/Makefile.in
lnet/lnet/acceptor.c [new file with mode: 0644]
lnet/lnet/api-errno.c
lnet/lnet/api-ni.c
lnet/lnet/api-wrap.c [deleted file]
lnet/lnet/autoMakefile.am
lnet/lnet/config.c [new file with mode: 0644]
lnet/lnet/lib-eq.c
lnet/lnet/lib-init.c [deleted file]
lnet/lnet/lib-md.c
lnet/lnet/lib-me.c
lnet/lnet/lib-move.c
lnet/lnet/lib-msg.c
lnet/lnet/lib-ni.c [deleted file]
lnet/lnet/lib-pid.c [deleted file]
lnet/lnet/lo.c [new file with mode: 0644]
lnet/lnet/module.c
lnet/lnet/peer.c [new file with mode: 0644]
lnet/lnet/router.c [new file with mode: 0644]
lnet/lnet/router_proc.c [new file with mode: 0644]
lnet/router/Makefile.in [deleted file]
lnet/router/proc.c [deleted file]
lnet/router/router.c [deleted file]
lnet/router/router.h [deleted file]
lnet/tests/Makefile.in
lnet/tests/autoMakefile.am
lnet/tests/ping.h
lnet/tests/ping_cli.c
lnet/tests/ping_cli/Info.plist
lnet/tests/ping_cli/winnt-pingcli.c [new file with mode: 0644]
lnet/tests/ping_srv.c
lnet/tests/ping_srv/Info.plist
lnet/tests/ping_srv/winnt-pingsrv.c [new file with mode: 0644]
lnet/tests/sping_cli.c [deleted file]
lnet/tests/sping_srv.c [deleted file]
lnet/tests/startclient.sh
lnet/tests/startserver.sh
lnet/tests/stopclient.sh
lnet/tests/stopserver.sh
lnet/tests/ut.README [new file with mode: 0644]
lnet/tests/ut.h [new file with mode: 0644]
lnet/tests/ut_cli.c [new file with mode: 0644]
lnet/tests/ut_srv.c [new file with mode: 0644]
lnet/ulnds/.cvsignore
lnet/ulnds/Makefile.am [deleted file]
lnet/ulnds/Makefile.in [new file with mode: 0644]
lnet/ulnds/README [deleted file]
lnet/ulnds/address.c [deleted file]
lnet/ulnds/autoMakefile.am [new file with mode: 0644]
lnet/ulnds/bridge.h [deleted file]
lnet/ulnds/connection.c [deleted file]
lnet/ulnds/connection.h [deleted file]
lnet/ulnds/debug.c [deleted file]
lnet/ulnds/dispatch.h [deleted file]
lnet/ulnds/ipmap.h [deleted file]
lnet/ulnds/pqtimer.c [deleted file]
lnet/ulnds/pqtimer.h [deleted file]
lnet/ulnds/procapi.c [deleted file]
lnet/ulnds/procbridge.h [deleted file]
lnet/ulnds/proclib.c [deleted file]
lnet/ulnds/ptllnd/.cvsignore [new file with mode: 0644]
lnet/ulnds/ptllnd/Makefile.am [new file with mode: 0644]
lnet/ulnds/ptllnd/ptllnd.c [new file with mode: 0644]
lnet/ulnds/ptllnd/ptllnd.h [new file with mode: 0644]
lnet/ulnds/ptllnd/ptllnd_cb.c [new file with mode: 0644]
lnet/ulnds/select.c [deleted file]
lnet/ulnds/socklnd/.cvsignore [new file with mode: 0644]
lnet/ulnds/socklnd/Makefile.am
lnet/ulnds/socklnd/address.c [deleted file]
lnet/ulnds/socklnd/bridge.h
lnet/ulnds/socklnd/connection.c
lnet/ulnds/socklnd/connection.h
lnet/ulnds/socklnd/debug.c [deleted file]
lnet/ulnds/socklnd/dispatch.h
lnet/ulnds/socklnd/ipmap.h [deleted file]
lnet/ulnds/socklnd/procapi.c
lnet/ulnds/socklnd/procbridge.h
lnet/ulnds/socklnd/proclib.c
lnet/ulnds/socklnd/select.c
lnet/ulnds/socklnd/table.c
lnet/ulnds/socklnd/table.h
lnet/ulnds/socklnd/tcplnd.c
lnet/ulnds/socklnd/utypes.h [deleted file]
lnet/ulnds/table.c [deleted file]
lnet/ulnds/table.h [deleted file]
lnet/ulnds/tcplnd.c [deleted file]
lnet/ulnds/timer.h [deleted file]
lnet/ulnds/utypes.h [deleted file]
lnet/utils/.cvsignore
lnet/utils/Makefile.am
lnet/utils/acceptor.c [deleted file]
lnet/utils/debug.c
lnet/utils/debugctl.c
lnet/utils/gmlndnid.c
lnet/utils/l_ioctl.c
lnet/utils/lbstats [new file with mode: 0755]
lnet/utils/parser.c
lnet/utils/portals.c
lnet/utils/ptlctl.c
lnet/utils/routerstat.c
lnet/utils/wirecheck.c

index fed4790..79ca961 100644 (file)
@@ -1,4 +1,204 @@
-tba  Cluster File Systems, Inc. <info@clusterfs.com>
+TBD         Cluster File Systems, Inc. <info@clusterfs.com>
+       * version 1.4.10
+       * Support for networks:
+       socklnd   - kernels up to 2.6.15 (SLES 10)
+       qswlnd    - Qsnet kernel modules 5.20 and later
+       openiblnd - IbGold 1.8.2
+       o2iblnd   - OFED 1.1
+       viblnd    - Voltaire ibhost 3.4.5 and later
+       ciblnd    - Topspin 3.2.0
+       iiblnd    - Infiniserv 3.3 + PathBits patch
+       gmlnd     - GM 2.1.22 and later
+       mxlnd     - MX 1.2.1 or later
+       ptllnd    - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x
+       * bug fixes
+
+Severity   : major
+Frequency  : rare      
+Bugzilla   : 11616
+Description: o2iblnd handle early RDMA_CM_EVENT_DISCONNECTED.
+Details    : If the fabric is lossy, an RDMA_CM_EVENT_DISCONNECTED
+            callback can occur before a connection has actually been
+            established.  This caused an assertion failure previously.
+
+Severity   : enhancement
+Bugzilla   : 11094
+Description: Multiple instances for o2iblnd
+Details    : Allow multiple instances of o2iblnd to enable networking over
+            multiple HCAs and routing between them.
+
+Severity   : major
+Bugzilla   : 11201
+Description: lnet deadlock in router_checker
+Details    : turned ksnd_connd_lock, ksnd_reaper_lock, and ksock_net_t:ksnd_lock
+            into BH locks to eliminate potential deadlock caused by
+            ksocknal_data_ready() preempting code holding these locks.
+
+Severity   : major
+Bugzilla   : 11126
+Description: Millions of failed socklnd connection attempts cause a very slow FS
+Details    : added a new route flag ksnr_scheduled to distinguish from
+            ksnr_connecting, so that a peer connection request is only turned
+            down for race concerns when an active connection to the same peer
+            is under progress (instead of just being scheduled).
+
+------------------------------------------------------------------------------
+
+2007-02-09  Cluster File Systems, Inc. <info@clusterfs.com>
+       * version 1.4.9
+       * Support for networks:
+       socklnd   - kernels up to 2.6.15 (SLES 10)
+       qswlnd    - Qsnet kernel modules 5.20 and later
+       openiblnd - IbGold 1.8.2
+       o2iblnd   - OFED 1.1
+       viblnd    - Voltaire ibhost 3.4.5 and later
+       ciblnd    - Topspin 3.2.0
+       iiblnd    - Infiniserv 3.3 + PathBits patch
+       gmlnd     - GM 2.1.22 and later
+       mxlnd     - MX 1.2.1 or later
+       ptllnd    - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x
+       * bug fixes
+
+Severity   : major on XT3
+Bugzilla   : none
+Description: libcfs overwrites /proc/sys/portals
+Details    : libcfs created a symlink from /proc/sys/portals to
+            /proc/sys/lnet for backwards compatibility.  This is no
+            longer required and makes the Cray portals /proc variables
+            inaccessible.
+            
+Severity   : minor
+Bugzilla   : 11312
+Description: OFED FMR API change
+Details    : This changes parameter usage to reflect a change in
+            ib_fmr_pool_map_phys() between OFED 1.0 and OFED 1.1.  Note
+            that FMR support is only used in experimental versions of the
+            o2iblnd - this change does not affect standard usage at all.
+            
+Severity   : enhancement
+Bugzilla   : 11245
+Description: new ko2iblnd module parameter: ib_mtu
+Details    : the default IB MTU of 2048 performs badly on 23108 Tavor
+            HCAs.  You can avoid this problem by setting the MTU to 1024
+            using this module parameter.
+            
+Severity   : enhancement
+Bugzilla   : 11118/11620
+Description: ptllnd small request message buffer alignment fix
+Details    : Set the PTL_MD_LOCAL_ALIGN8 option on small message receives.
+            Round up small message size on sends in case this option
+            is not supported.  11620 was a defect in the initial
+            implementation which effectively asserted all peers had to be
+            running the correct protocol version which was fixed by always
+            NAK-ing such requests and handling any misalignments they
+            introduce.
+            
+Severity   : minor
+Frequency  : rarely
+Description: When kib(nal|lnd)_del_peer() is called upon a peer whose
+            ibp_tx_queue is not empty, kib(nal|lnd)_destroy_peer()'s
+            'LASSERT(list_empty(&peer->ibp_tx_queue))' will fail.
+
+Severity   : enhancement
+Bugzilla   : 11250
+Description: Patchless ZC(zero copy) socklnd
+Details    : New protocol for socklnd, socklnd can support zero copy without
+            kernel patch, it's compatible with old socklnd. Checksum is 
+            moved from tunables to modparams.
+
+Severity   : minor
+Frequency  : rarely
+Description: When ksocknal_del_peer() is called upon a peer whose
+            ksnp_tx_queue is not empty, ksocknal_destroy_peer()'s
+            'LASSERT(list_empty(&peer->ksnp_tx_queue))' will fail.
+
+Severity   : normal
+Frequency  : when ptlrpc is under heavy use and runs out of request buffer
+Bugzilla   : 11318
+Description: In lnet_match_blocked_msg(), md can be used without holding a
+            ref on it.
+
+Severity   : minor
+Frequency  : very rarely
+Bugzilla   : 10727
+Description: If ksocknal_lib_setup_sock() fails, a ref on peer is lost.
+            If connd connects a route which has been closed by
+            ksocknal_shutdown(), ksocknal_create_routes() may create new
+            routes which hold references on the peer, causing shutdown
+            process to wait for peer to disappear forever.
+
+Severity   : enhancement
+Bugzilla   : 11234
+Description: Dump XT3 portals traces on kptllnd timeout
+Details    : Set the kptllnd module parameter "ptltrace_on_timeout=1" to
+            dump Cray portals debug traces to a file.  The kptllnd module
+            parameter "ptltrace_basename", default "/tmp/lnet-ptltrace",
+            is the basename of the dump file.
+       
+Severity   : major
+Frequency  : infrequent
+Bugzilla   : 11308
+Description: kernel ptllnd fix bug in connection re-establishment
+Details    : Kernel ptllnd could produce protocol errors e.g. illegal
+            matchbits and/or violate the credit flow protocol when trying
+            to re-establish a connection with a peer after an error or
+            timeout. 
+       
+Severity   : enhancement
+Bugzilla   : 10316
+Description: Allow /proc/sys/lnet/debug to be set symbolically
+Details    : Allow debug and subsystem debug values to be read/set by name
+            in addition to numerically, for ease of use.
+
+Severity   : normal
+Frequency  : only in configurations with LNET routers
+Bugzilla   : 10316
+Description: routes automatically marked down and recovered
+Details    : In configurations with LNET routers if a router fails routers
+            now actively try to recover routes that are down, unless they
+            are marked down by an administrator.
+
+------------------------------------------------------------------------------
+
+2006-07-31  Cluster File Systems, Inc. <info@clusterfs.com>
+       * version 1.4.7
+       - rework CDEBUG messages rate-limiting mechanism b=10375
+       - add per-socket tunables for socklnd if the kernel is patched b=10327
+
+------------------------------------------------------------------------------
+
+2006-02-15  Cluster File Systems, Inc. <info@clusterfs.com>
+       * version 1.4.6
+       - fix use of portals/lnet pid to avoid dropping RPCs b=10074
+       - iiblnd wasn't mapping all memory, resulting in comms errors b=9776
+       - quiet LNET startup LNI message for liblustre b=10128
+       - Better console error messages if 'ip2nets' can't match an IP address
+       - Fixed overflow/use-before-set bugs in linux-time.h
+       - Fixed ptllnd bug that wasn't initialising rx descriptors completely
+       - LNET teardown failed an assertion about the route table being empty
+       - Fixed a crash in LNetEQPoll(<invalid handle>)
+       - Future protocol compatibility work (b_rls146_lnetprotovrsn)
+       - improve debug message for liblustre/Catamount nodes (b=10116)
+
+2005-10-10  Cluster File Systems, Inc. <info@clusterfs.com>
+       * Configuration change for the XT3
+            The PTLLND is now used to run Lustre over Portals on the XT3.
+            The configure option --with-cray-portals is no longer
+            used.  Rather, --with-portals=<path-to-portals-includes> is
+            used to enable building on the XT3.  In addition, to enable
+            XT3-specific features the option --enable-cray-xt3 must be
+            used.
+
+2005-10-10  Cluster File Systems, Inc. <info@clusterfs.com>
+       * Portals has been removed, replaced by LNET.
+          LNET is the new networking infrastructure for Lustre; it includes a
+          reorganized network configuration mode (see the user
+          documentation for full details) as well as support for routing
+          between different network fabrics.  Lustre Networking Devices
+          (LNDs) for the supported network fabrics have also been created
+          for this new infrastructure.
+       
+2005-08-08  Cluster File Systems, Inc. <info@clusterfs.com>
        * version 1.4.4
        * bug fixes
 
@@ -6,9 +206,9 @@ Severity   : major
 Frequency  : rare (large Voltaire clusters only)
 Bugzilla   : 6993
 Description: the default number of reserved transmit descriptors was too low
-             for some large clusters
+            for some large clusters
 Details    : As a workaround, the number was increased.  A proper fix includes
-             a run-time tunable.
+            a run-time tunable.
 
 2005-06-02  Cluster File Systems, Inc. <info@clusterfs.com>
        * version 1.4.3
@@ -18,14 +218,14 @@ Severity   : major
 Frequency  : occasional (large-scale events, cluster reboot, network failure)
 Bugzilla   : 6411
 Description: too many error messages on console obscure actual problem and
-             can slow down/panic server, or cause recovery to fail repeatedly
+            can slow down/panic server, or cause recovery to fail repeatedly
 Details    : enable rate-limiting of console error messages, and some messages
-             that were console errors now only go to the kernel log
+            that were console errors now only go to the kernel log
 
 Severity   : enhancement
 Bugzilla   : 1693
 Description: add /proc/sys/portals/catastrophe entry which will report if
-             that node has previously LBUGged
+            that node has previously LBUGged
 
 2005-04-06  Cluster File Systems, Inc. <info@clusterfs.com>
        * bugs
index 7a48c58..59eda30 100644 (file)
@@ -1,5 +1,5 @@
-EXTRA_CFLAGS := -Ifs/lustre/include -Ifs/lustre/portals/include
-# portals/utils/debug.c wants <linux/version.h> from userspace.  sigh.
+EXTRA_CFLAGS := -Ifs/lustre/include -Ifs/lustre/lnet/include
+# lnet/utils/debug.c wants <linux/version.h> from userspace.  sigh.
 HOSTCFLAGS := -I@LINUX@/include $(EXTRA_CFLAGS)
 LIBREADLINE := @LIBREADLINE@
 # 2.5's makefiles aren't nice to cross dir libraries in host programs
index 7c66dfa..d973e5d 100644 (file)
@@ -1,4 +1,4 @@
-EXTRA_CFLAGS := -Ifs/lustre/include -Ifs/lustre/portals/include
+EXTRA_CFLAGS := -Ifs/lustre/include -Ifs/lustre/lnet/include
 HOSTCFLAGS := $(EXTRA_CFLAGS)
 # the kernel doesn't want us to build archives for host binaries :/
 PTLCTLOBJS := debug.o l_ioctl.o parser.o portals.o
index 71d0dc8..553578c 100644 (file)
@@ -1,9 +1,8 @@
 subdir-m += libcfs
 
-cray-subdirs += portals
-cray-subdirs += knals
-cray-subdirs += router
-cray-subdirs += tests
-@CRAY_PORTALS_FALSE@subdir-m += $(cray-subdirs)
+lnet-subdirs += lnet
+lnet-subdirs += klnds
+lnet-subdirs += tests
+subdir-m += $(lnet-subdirs)
 
 @INCLUDE_RULES@
index b49b8d4..27a60a8 100644 (file)
@@ -3,7 +3,7 @@
 # This code is issued under the GNU General Public License.
 # See the file COPYING in this distribution
 
-SUBDIRS = libcfs portals knals unals router tests doc utils include    \
+SUBDIRS = libcfs lnet klnds ulnds tests doc utils include      \
        autoconf
 
 sources:
index f65d2c0..171634a 100644 (file)
@@ -1 +1 @@
-EXTRA_DIST := lustre-portals.m4
+EXTRA_DIST := lustre-lnet.m4
index 9897290..479a1f5 100644 (file)
@@ -1,9 +1,31 @@
 #
-# LP_CHECK_GCC_VERSION
+# LN_CONFIG_MAX_PAYLOAD
+#
+# configure maximum payload
+#
+AC_DEFUN([LN_CONFIG_MAX_PAYLOAD],
+[AC_MSG_CHECKING([for non-default maximum LNET payload])
+AC_ARG_WITH([max-payload-mb],
+       AC_HELP_STRING([--with-max-payload-mb=MBytes],
+                       [set maximum lnet payload in MBytes]),
+        [
+               AC_MSG_RESULT([$with_max_payload_mb])
+               LNET_MAX_PAYLOAD_MB=$with_max_payload_mb
+               LNET_MAX_PAYLOAD="(($with_max_payload_mb)<<20)"
+       ], [
+               AC_MSG_RESULT([no])
+               LNET_MAX_PAYLOAD="LNET_MTU"
+       ])
+        AC_DEFINE_UNQUOTED(LNET_MAX_PAYLOAD, $LNET_MAX_PAYLOAD,
+                          [Max LNET payload])
+])
+
+#
+# LN_CHECK_GCC_VERSION
 #
 # Check compiler version
 #
-AC_DEFUN([LP_CHECK_GCC_VERSION],
+AC_DEFUN([LN_CHECK_GCC_VERSION],
 [AC_MSG_CHECKING([compiler version])
 PTL_CC_VERSION=`$CC --version | awk '/^gcc/{print $ 3}'`
 PTL_MIN_CC_VERSION="3.2.2"
@@ -20,35 +42,58 @@ fi
 ])
 
 #
-# LP_CONFIG_ZEROCOPY
+# LN_CONFIG_ZEROCOPY
 #
 # check if zerocopy is available/wanted
 #
-AC_DEFUN([LP_CONFIG_ZEROCOPY],
-[AC_MSG_CHECKING([for zero-copy TCP support])
-AC_ARG_ENABLE([zerocopy],
+AC_DEFUN([LN_CONFIG_ZEROCOPY],
+[AC_ARG_ENABLE([zerocopy],
        AC_HELP_STRING([--disable-zerocopy],
-                      [disable socknal zerocopy]),
+                      [disable socklnd zerocopy]),
        [],[enable_zerocopy='yes'])
+AC_MSG_CHECKING([for zero-copy TCP support])
 if test x$enable_zerocopy = xno ; then
        AC_MSG_RESULT([no (by request)])
 else
        ZCCD="`grep -c zccd $LINUX/include/linux/skbuff.h`"
-       if test "$ZCCD" != 0 ; then
-               AC_DEFINE(SOCKNAL_ZC, 1, [use zero-copy TCP])
-               AC_MSG_RESULT(yes)
-       else
+       if test "$ZCCD" = 0 ; then
                AC_MSG_RESULT([no (no kernel support)])
+       else
+               AC_MSG_RESULT([yes])
+               AC_MSG_CHECKING([for up-to-date tcp zero-copy patch])
+               LB_LINUX_TRY_COMPILE([
+                       #include <linux/config.h>
+                       #include <linux/kernel.h>
+                       #include <linux/sched.h>
+                       #include <linux/types.h>
+                       #include <linux/in.h>
+                       #include <linux/string.h>
+                       #include <linux/init.h>
+                       #include <linux/errno.h>
+                       #include <linux/interrupt.h>
+                       #include <linux/netdevice.h>
+                       #include <linux/skbuff.h>
+               ],[
+                       struct zccd zc = {0};
+
+                       return atomic_read(&zc.zccd_refcount);
+               ],[
+                       AC_MSG_RESULT([yes])
+                       AC_DEFINE(SOCKNAL_ZC, 1, [enable zero-copy support])
+               ],[
+                       AC_MSG_RESULT([no])
+                       AC_MSG_ERROR([old TCP zero-copy in kernel (bug 10889) - use --disable-zerocopy to continue ])
+               ])
        fi
 fi
 ])
 
 #
-# LP_CONFIG_AFFINITY
+# LN_CONFIG_AFFINITY
 #
 # check if cpu affinity is available/wanted
 #
-AC_DEFUN([LP_CONFIG_AFFINITY],
+AC_DEFUN([LN_CONFIG_AFFINITY],
 [AC_ARG_ENABLE([affinity],
        AC_HELP_STRING([--disable-affinity],
                       [disable process/irq affinity]),
@@ -62,11 +107,11 @@ else
                #include <linux/sched.h>
        ],[
                struct task_struct t;
-               #ifdef CPU_ARRAY_SIZE
-               cpumask_t m;
-               #else
-               unsigned long m;
-               #endif
+               #if HAVE_CPUMASK_T
+               cpumask_t     m;
+               #else
+               unsigned long m;
+               #endif
                set_cpus_allowed(&t, m);
        ],[
                AC_DEFINE(CPU_AFFINITY, 1, [kernel has cpu affinity support])
@@ -78,11 +123,162 @@ fi
 ])
 
 #
-# LP_CONFIG_QUADRICS
+# LN_CONFIG_PORTALS
+#
+# configure support for Portals
+#
#
# LN_CONFIG_PORTALS: process --with-portals.
# Sets ENABLEPORTALS (0/1), PORTALS (source path) and substitutes
# PTLLNDCPPFLAGS for the ptllnd build.  Aborts configure on a bad path.
#
AC_DEFUN([LN_CONFIG_PORTALS],
[AC_MSG_CHECKING([for portals])
AC_ARG_WITH([portals],
	AC_HELP_STRING([--with-portals=path],
		       [set path to portals]),
	[
		# --with-portals=no disables; any other value is the source path
		case $with_portals in
			no)	ENABLEPORTALS=0
				;;
			*)	PORTALS="${with_portals}"
				ENABLEPORTALS=1
				;;
		esac
	], [
		ENABLEPORTALS=0
	])
PTLLNDCPPFLAGS=""
if test $ENABLEPORTALS -eq 0; then
	AC_MSG_RESULT([no])
elif test ! \( -f ${PORTALS}/include/portals/p30.h \); then
	AC_MSG_RESULT([no])
	AC_MSG_ERROR([bad --with-portals path])
else
	AC_MSG_RESULT([$PORTALS])
	PTLLNDCPPFLAGS="-I${PORTALS}/include"
fi
AC_SUBST(PTLLNDCPPFLAGS)
])
+
+#
+# LN_CONFIG_BACKOFF
+#
+# check if tunable tcp backoff is available/wanted
+#
#
# LN_CONFIG_BACKOFF: check if tunable TCP backoff is available/wanted.
# Defines SOCKNAL_BACKOFF when the kernel's tcp.h carries the TCP_BACKOFF
# patch and the user has not passed --disable-backoff.
#
AC_DEFUN([LN_CONFIG_BACKOFF],
[AC_MSG_CHECKING([for tunable backoff TCP support])
AC_ARG_ENABLE([backoff],
	AC_HELP_STRING([--disable-backoff],
		       [disable socknal tunable backoff]),
	[],[enable_backoff='yes'])
if test x$enable_backoff = xno ; then
	AC_MSG_RESULT([no (by request)])
else
	# Count TCP_BACKOFF occurrences.  A missing/unreadable tcp.h makes
	# grep produce an empty string, which the old string comparison
	# ("" != 0) treated as SUCCESS — guard against that explicitly and
	# silence grep's stderr.
	BOCD="`grep -c TCP_BACKOFF $LINUX/include/linux/tcp.h 2>/dev/null`"
	if test -n "$BOCD" -a "$BOCD" != 0 ; then
		AC_DEFINE(SOCKNAL_BACKOFF, 1, [use tunable backoff TCP])
		AC_MSG_RESULT(yes)
	else
		AC_MSG_RESULT([no (no kernel support)])
	fi
fi
])
+
+#
+# LN_CONFIG_PANIC_DUMPLOG
+#
+# check if tunable panic_dumplog is wanted
+#
#
# LN_CONFIG_PANIC_DUMPLOG: opt-in switch, off unless --enable-panic_dumplog
# is given; when enabled, defines LNET_DUMP_ON_PANIC.
#
AC_DEFUN([LN_CONFIG_PANIC_DUMPLOG],
[AC_MSG_CHECKING([for tunable panic_dumplog support])
AC_ARG_ENABLE([panic_dumplog],
	AC_HELP_STRING([--enable-panic_dumplog],
		       [enable panic_dumplog]),
	[],[enable_panic_dumplog='no'])
if test x$enable_panic_dumplog != xyes ; then
	AC_MSG_RESULT([no])
else
	AC_DEFINE(LNET_DUMP_ON_PANIC, 1, [use dumplog on panic])
	AC_MSG_RESULT([yes (by request)])
fi
])
+
+#
+# LN_CONFIG_PTLLND
+#
+# configure support for Portals LND
+#
#
# LN_CONFIG_PTLLND: decide whether to build the kernel portals LND.
# Substitutes PTLLND ("ptllnd" or empty).
#
AC_DEFUN([LN_CONFIG_PTLLND],
[
# run the portals probe exactly once across PTLLND/UPTLLND configuration
if test -z "$ENABLEPORTALS"; then
	LN_CONFIG_PORTALS
fi

AC_MSG_CHECKING([whether to build the kernel portals LND])

PTLLND=""
case $ENABLEPORTALS in
0)
	AC_MSG_RESULT([no])
	;;
*)
	PTLLND="ptllnd"
	AC_MSG_RESULT([yes])
	;;
esac
AC_SUBST(PTLLND)
])
+
+#
+# LN_CONFIG_UPTLLND
+#
+# configure support for Portals LND
+#
#
# LN_CONFIG_UPTLLND: decide whether to build the userspace portals LND.
# Substitutes UPTLLND ("ptllnd" or empty).
#
AC_DEFUN([LN_CONFIG_UPTLLND],
[
# run the portals probe once if LN_CONFIG_PTLLND has not already done so
if test -z "$ENABLEPORTALS"; then
	LN_CONFIG_PORTALS
fi

AC_MSG_CHECKING([whether to build the userspace portals LND])

UPTLLND=""
if test $ENABLEPORTALS -eq 0; then
	AC_MSG_RESULT([no])
else
	UPTLLND="ptllnd"
	AC_MSG_RESULT([yes])
fi
AC_SUBST(UPTLLND)
])
+
+#
+# LN_CONFIG_USOCKLND
+#
+# configure support for userspace TCP/IP LND
+#
#
# LN_CONFIG_USOCKLND: decide whether to build the userspace TCP/IP LND.
# Requires libpthread (ENABLE_LIBPTHREAD, set by the -lpthread check).
# Substitutes USOCKLND ("usocklnd" or empty).
#
AC_DEFUN([LN_CONFIG_USOCKLND],
[AC_MSG_CHECKING([whether to build usocklnd])
AC_ARG_ENABLE([usocklnd],
		AC_HELP_STRING([--disable-usocklnd],
			[disable usocklnd]),
		[],[enable_usocklnd='yes'])

USOCKLND=""
if test x$enable_usocklnd != xyes ; then
	AC_MSG_RESULT([no (disabled explicitly)])
elif test "$ENABLE_LIBPTHREAD" != "yes" ; then
	AC_MSG_RESULT([no (libpthread not present or disabled)])
else
	USOCKLND="usocklnd"
	AC_MSG_RESULT([yes])
fi
AC_SUBST(USOCKLND)
])
+
+#
+# LN_CONFIG_QUADRICS
 #
 # check if quadrics support is in this kernel
 #
-AC_DEFUN([LP_CONFIG_QUADRICS],
+AC_DEFUN([LN_CONFIG_QUADRICS],
 [AC_MSG_CHECKING([for QsNet sources])
 AC_ARG_WITH([qsnet],
        AC_HELP_STRING([--with-qsnet=path],
@@ -94,78 +290,304 @@ AC_MSG_RESULT([$QSNET])
 AC_MSG_CHECKING([if quadrics kernel headers are present])
 if test -d $QSNET/drivers/net/qsnet ; then
        AC_MSG_RESULT([yes])
-       QSWNAL="qswnal"
+       QSWLND="qswlnd"
        AC_MSG_CHECKING([for multirail EKC])
        if test -f $QSNET/include/elan/epcomms.h; then
                AC_MSG_RESULT([supported])
                QSWCPPFLAGS="-I$QSNET/include -DMULTIRAIL_EKC=1"
        else
                AC_MSG_RESULT([not supported])
-               if test -d $QSNET/drivers/net/qsnet/include; then
-                       QSWCPPFLAGS="-I$QSNET/drivers/net/qsnet/include"
-               else
-                       QSWCPPFLAGS="-I$QSNET/include/linux"
-               fi
+               AC_MSG_ERROR([Need multirail EKC])
        fi
 
        if test x$QSNET = x$LINUX ; then
                LB_LINUX_CONFIG([QSNET],[],[
                        LB_LINUX_CONFIG([QSNET_MODULE],[],[
-                               AC_MSG_WARN([QSNET is not enabled in this kernel; not building qswnal.])
-                               QSWNAL=""
+                               AC_MSG_WARN([QSNET is not enabled in this kernel; not building qswlnd.])
+                               QSWLND=""
                                QSWCPPFLAGS=""
                        ])
                ])
        fi
 else
        AC_MSG_RESULT([no])
-       QSWNAL=""
+       QSWLND=""
        QSWCPPFLAGS=""
 fi
 AC_SUBST(QSWCPPFLAGS)
-AC_SUBST(QSWNAL)
+AC_SUBST(QSWLND)
 ])
 
 #
-# LP_CONFIG_GM
+# LN_CONFIG_GM
 #
 # check if GM support is available
 #
-AC_DEFUN([LP_CONFIG_GM],
-[LB_ARG_LIBS_INCLUDES([Myrinet],[gm])
-if test x$gm_includes != x ; then
-       GMCPPFLAGS="-I$gm_includes"
-       if test -d "$gm/drivers" ; then
-               GMCPPFLAGS="$GMCPPFLAGS -I$gm/drivers -I$gm/drivers/linux/gm"
-       fi
#
# LN_CONFIG_GM: configure the Myrinet GM LND.
# --with-gm points at the GM source tree, --with-gm-install (default
# /opt/gm) at the installed libraries.  Substitutes GMCPPFLAGS, GMLIBS,
# GMLND.  Any inconsistency in the given paths aborts configure.
#
AC_DEFUN([LN_CONFIG_GM],[
AC_MSG_CHECKING([whether to enable GM support])
AC_ARG_WITH([gm],
	AC_HELP_STRING([--with-gm=path-to-gm-source-tree],
		       [build gmlnd against path]),
	[
		case $with_gm in
		no)	ENABLE_GM=0
			;;
		*)	ENABLE_GM=1
			GM_SRC="$with_gm"
			;;
		esac
	],[
		ENABLE_GM=0
	])
AC_ARG_WITH([gm-install],
	AC_HELP_STRING([--with-gm-install=path-to-gm-install-tree],
		       [say where GM has been installed]),
	[
		GM_INSTALL=$with_gm_install
	],[
		GM_INSTALL="/opt/gm"
	])
if test $ENABLE_GM -eq 0; then
	AC_MSG_RESULT([no])
else
	AC_MSG_RESULT([yes])

	GMLND="gmlnd"
	GMCPPFLAGS="-I$GM_SRC/include -I$GM_SRC/drivers -I$GM_SRC/drivers/linux/gm"

	# libgm may live in lib or lib64 depending on the architecture
	if test -f $GM_INSTALL/lib/libgm.a -o \
		-f $GM_INSTALL/lib64/libgm.a; then
		GMLIBS="-L$GM_INSTALL/lib -L$GM_INSTALL/lib64"
	else
		AC_MSG_ERROR([Can't find GM libraries under $GM_INSTALL])
	fi

	EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
	EXTRA_KCFLAGS="$GMCPPFLAGS -DGM_KERNEL $EXTRA_KCFLAGS"

	AC_MSG_CHECKING([that code using GM compiles with given path])
	LB_LINUX_TRY_COMPILE([
		#define GM_STRONG_TYPES 1
		#ifdef VERSION
		#undef VERSION
		#endif
		#include "gm.h"
		#include "gm_internal.h"
	],[
		struct gm_port *port = NULL;
		gm_recv_event_t *rxevent = gm_blocking_receive_no_spin(port);
		return 0;
	],[
		AC_MSG_RESULT([yes])
	],[
		AC_MSG_RESULT([no])
		AC_MSG_ERROR([Bad --with-gm path])
	])

	# gm_register_memory_ex_phys() only exists in patched GM sources;
	# gmlnd cannot be built without it
	AC_MSG_CHECKING([that GM has gm_register_memory_ex_phys()])
	LB_LINUX_TRY_COMPILE([
		#define GM_STRONG_TYPES 1
		#ifdef VERSION
		#undef VERSION
		#endif
		#include "gm.h"
		#include "gm_internal.h"
	],[
		gm_status_t     gmrc;
		struct gm_port *port = NULL;
		gm_u64_t        phys = 0;
		gm_up_t         pvma = 0;

		gmrc = gm_register_memory_ex_phys(port, phys, 100, pvma);
		return 0;
	],[
		AC_MSG_RESULT([yes])
	],[
		AC_MSG_RESULT([no.
Please patch the GM sources as follows...
    cd $GM_SRC
    patch -p0 < $PWD/lnet/klnds/gmlnd/gm-reg-phys.patch
...then rebuild and re-install them])
		AC_MSG_ERROR([Can't build GM without gm_register_memory_ex_phys()])
	])

	EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
fi
AC_SUBST(GMCPPFLAGS)
AC_SUBST(GMLIBS)
AC_SUBST(GMLND)
])
+
 
-if test x$gm_libs != x ; then
-       GMLIBS="-L$gm_libs"
+#
+# LN_CONFIG_MX
+#
#
# LN_CONFIG_MX: configure the Myrinet MX LND.
# Substitutes MXCPPFLAGS, MXLIBS, MXLND.
#
AC_DEFUN([LN_CONFIG_MX],
[AC_MSG_CHECKING([whether to enable Myrinet MX support])
# set default
MXPATH="/opt/mx"
AC_ARG_WITH([mx],
	AC_HELP_STRING([--with-mx=path],
		       [build mxlnd against path]),
	[
		case $with_mx in
		yes)	ENABLEMX=2
			;;
		no)	ENABLEMX=0
			;;
		*)	MXPATH=$with_mx
			ENABLEMX=3
			;;
		esac
	],[
		ENABLEMX=1
	])
# ENABLEMX: 0=disabled, 1=autodetect, 2=--with-mx=yes, 3=explicit path;
# only modes 2 and 3 turn a probe failure into a hard error
if test $ENABLEMX -eq 0; then
	AC_MSG_RESULT([disabled])
elif test ! \( -f ${MXPATH}/include/myriexpress.h -a \
	       -f ${MXPATH}/include/mx_kernel_api.h -a \
	       -f ${MXPATH}/include/mx_pin.h \); then
	AC_MSG_RESULT([no])
	case $ENABLEMX in
	1) ;;
	2) AC_MSG_ERROR([Myrinet MX kernel headers not present]);;
	3) AC_MSG_ERROR([bad --with-mx path]);;
	*) AC_MSG_ERROR([internal error]);;
	esac
else
	MXCPPFLAGS="-I$MXPATH/include"
	EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
	EXTRA_KCFLAGS="$EXTRA_KCFLAGS $MXCPPFLAGS"
	MXLIBS="-L$MXPATH/lib"
	LB_LINUX_TRY_COMPILE([
		#define MX_KERNEL 1
		#include <mx_extensions.h>
		#include <myriexpress.h>
	],[
		mx_endpoint_t   end;
		mx_status_t     status;
		mx_request_t    request;
		int             result;

		mx_init();
		mx_open_endpoint(MX_ANY_NIC, MX_ANY_ENDPOINT, 0, NULL, 0, &end);
		mx_register_unexp_handler(end, (mx_unexp_handler_t) NULL, NULL);
		mx_wait_any(end, MX_INFINITE, 0LL, 0LL, &status, &result);
		mx_iconnect(end, 0LL, 0, 0, 0, NULL, &request);
		return 0;
	],[
		AC_MSG_RESULT([yes])
		MXLND="mxlnd"
	],[
		AC_MSG_RESULT([no])
		case $ENABLEMX in
		1) ;;
		2) AC_MSG_ERROR([can't compile with Myrinet MX kernel headers]);;
		3) AC_MSG_ERROR([can't compile with Myrinet MX headers under $MXPATH]);;
		*) AC_MSG_ERROR([internal error]);;
		esac
		MXLND=""
		MXCPPFLAGS=""
	])
	EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
fi
AC_SUBST(MXCPPFLAGS)
AC_SUBST(MXLIBS)
AC_SUBST(MXLND)
])
+
+
+
+#
+# LN_CONFIG_O2IB
+#
#
# LN_CONFIG_O2IB: configure the OpenIB gen2 (OFED) LND.
# Substitutes O2IBCPPFLAGS, O2IBLND.
#
AC_DEFUN([LN_CONFIG_O2IB],[
AC_MSG_CHECKING([whether to enable OpenIB gen2 support])
# set default
O2IBPATH="$LINUX/drivers/infiniband"
AC_ARG_WITH([o2ib],
	AC_HELP_STRING([--with-o2ib=path],
		       [build o2iblnd against path]),
	[
		case $with_o2ib in
		yes)	ENABLEO2IB=2
			;;
		no)	ENABLEO2IB=0
			;;
		*)	O2IBPATH=$with_o2ib
			ENABLEO2IB=3
			;;
		esac
	],[
		ENABLEO2IB=1
	])
# ENABLEO2IB: 0=disabled, 1=autodetect, 2=--with-o2ib=yes, 3=explicit path;
# only modes 2 and 3 turn a probe failure into a hard error
if test $ENABLEO2IB -eq 0; then
	AC_MSG_RESULT([disabled])
elif test ! \( -f ${O2IBPATH}/include/rdma/rdma_cm.h -a \
	       -f ${O2IBPATH}/include/rdma/ib_cm.h -a \
	       -f ${O2IBPATH}/include/rdma/ib_verbs.h -a \
	       -f ${O2IBPATH}/include/rdma/ib_fmr_pool.h \); then
	AC_MSG_RESULT([no])
	case $ENABLEO2IB in
	1) ;;
	2) AC_MSG_ERROR([kernel OpenIB gen2 headers not present]);;
	3) AC_MSG_ERROR([bad --with-o2ib path]);;
	*) AC_MSG_ERROR([internal error]);;
	esac
else
	O2IBCPPFLAGS="-I$O2IBPATH/include"
	EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
	EXTRA_KCFLAGS="$EXTRA_KCFLAGS $O2IBCPPFLAGS"
	LB_LINUX_TRY_COMPILE([
		#include <linux/version.h>
		/* older kernels lack gfp_t; HAVE_GFP_T comes from LN_TYPE_GFP_T */
		#if !HAVE_GFP_T
		typedef int gfp_t;
		#endif
		#include <rdma/rdma_cm.h>
		#include <rdma/ib_cm.h>
		#include <rdma/ib_verbs.h>
		#include <rdma/ib_fmr_pool.h>
	],[
		struct rdma_cm_id          *cm_id;
		struct rdma_conn_param      conn_param;
		struct ib_device_attr       device_attr;
		struct ib_qp_attr           qp_attr;
		struct ib_pool_fmr          pool_fmr;
		enum   ib_cm_rej_reason     rej_reason;

		cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
		return PTR_ERR(cm_id);
	],[
		AC_MSG_RESULT([yes])
		O2IBLND="o2iblnd"
	],[
		AC_MSG_RESULT([no])
		case $ENABLEO2IB in
		1) ;;
		2) AC_MSG_ERROR([can't compile with kernel OpenIB gen2 headers]);;
		3) AC_MSG_ERROR([can't compile with OpenIB gen2 headers under $O2IBPATH]);;
		*) AC_MSG_ERROR([internal error]);;
		esac
		O2IBLND=""
		O2IBCPPFLAGS=""
	])
	EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
fi
AC_SUBST(O2IBCPPFLAGS)
AC_SUBST(O2IBLND)
])
 
 #
-# LP_CONFIG_OPENIB
+# LN_CONFIG_OPENIB
 #
 # check for OpenIB in the kernel
-AC_DEFUN([LP_CONFIG_OPENIB],[
+AC_DEFUN([LN_CONFIG_OPENIB],[
 AC_MSG_CHECKING([whether to enable OpenIB support])
 # set default
 OPENIBPATH="$LINUX/drivers/infiniband"
 AC_ARG_WITH([openib],
        AC_HELP_STRING([--with-openib=path],
-                      [build openibnal against path]),
+                      [build openiblnd against path]),
        [
                case $with_openib in
                yes)    ENABLEOPENIB=2
@@ -198,6 +620,7 @@ else
        *)   AC_MSG_RESULT([no])
             AC_MSG_ERROR([internal error]);;
        esac
+       OPENIBCPPFLAGS="$OPENIBCPPFLAGS -DIB_NTXRXPARAMS=4"
        EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
        EXTRA_KCFLAGS="$EXTRA_KCFLAGS $OPENIBCPPFLAGS"
        LB_LINUX_TRY_COMPILE([
@@ -215,7 +638,7 @@ else
                return 0;
        ],[
                AC_MSG_RESULT([yes])
-               OPENIBNAL="openibnal"
+               OPENIBLND="openiblnd"
        ],[
                AC_MSG_RESULT([no])
                case $ENABLEOPENIB in
@@ -224,32 +647,82 @@ else
                3) AC_MSG_ERROR([can't compile with OpenIB headers under $OPENIBPATH]);;
                *) AC_MSG_ERROR([internal error]);;
                esac
-               OPENIBNAL=""
+               OPENIBLND=""
                OPENIBCPPFLAGS=""
        ])
        EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
 fi
 AC_SUBST(OPENIBCPPFLAGS)
-AC_SUBST(OPENIBNAL)
+AC_SUBST(OPENIBLND)
 ])
 
 #
-# LP_CONFIG_IIB
-#
-# check for infinicon infiniband support
# LN_CONFIG_CIB
 #
#
# LN_CONFIG_CIB: configure the Cisco/TopSpin IB LND.
# Substitutes CIBCPPFLAGS, CIBLND.  Disabled unless --with-cib=path is given.
#
AC_DEFUN([LN_CONFIG_CIB],[
AC_MSG_CHECKING([whether to enable Cisco/TopSpin IB support])
# set default
CIBPATH=""
CIBLND=""
AC_ARG_WITH([cib],
	AC_HELP_STRING([--with-cib=path],
		       [build ciblnd against path]),
	[
		case $with_cib in
		no)	AC_MSG_RESULT([no]);;
		*)	CIBPATH="$with_cib"
			if test -d "$CIBPATH"; then
				AC_MSG_RESULT([yes])
			else
				AC_MSG_RESULT([no])
				AC_MSG_ERROR([No directory $CIBPATH])
			fi;;
		esac
	],[
		AC_MSG_RESULT([no])
	])
if test -n "$CIBPATH"; then
	CIBCPPFLAGS="-I${CIBPATH}/ib/ts_api_ng/include -I${CIBPATH}/all/kernel_services/include -DUSING_TSAPI"
	CIBCPPFLAGS="$CIBCPPFLAGS -DIB_NTXRXPARAMS=3"
	EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
	EXTRA_KCFLAGS="$EXTRA_KCFLAGS $CIBCPPFLAGS"
	LB_LINUX_TRY_COMPILE([
		#include <ts_ib_core.h>
		#include <ts_ib_cm.h>
		#include <ts_ib_sa_client.h>
	],[
		struct ib_device_properties dev_props;
		struct ib_cm_active_param   cm_active_params;
		tTS_IB_CLIENT_QUERY_TID     tid;
		int                         enum1 = TS_IB_QP_ATTRIBUTE_STATE;
		int                         enum2 = TS_IB_ACCESS_LOCAL_WRITE;
		int                         enum3 = TS_IB_CQ_CALLBACK_INTERRUPT;
		int                         enum4 = TS_IB_CQ_PROVIDER_REARM;
		return 0;
	],[
		CIBLND="ciblnd"
	],[
		# clear the flags BEFORE bailing out: AC_MSG_ERROR never
		# returns, so a statement placed after it is dead code
		CIBCPPFLAGS=""
		AC_MSG_ERROR([can't compile ciblnd with given path])
	])
	EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
fi
AC_SUBST(CIBCPPFLAGS)
AC_SUBST(CIBLND)
])
+
 #
-# LP_CONFIG_IIB
+# LN_CONFIG_IIB
 #
 # check for infinicon infiniband support
 #
-AC_DEFUN([LP_CONFIG_IIB],[
+AC_DEFUN([LN_CONFIG_IIB],[
 AC_MSG_CHECKING([whether to enable Infinicon support])
 # set default
 IIBPATH="/usr/include"
 AC_ARG_WITH([iib],
        AC_HELP_STRING([--with-iib=path],
-                      [build iibnal against path]),
+                      [build iiblnd against path]),
        [
                case $with_iib in
                yes)    ENABLEIIB=2
@@ -293,7 +766,7 @@ else
                return rc == FSUCCESS ? 0 : 1;
        ],[
                AC_MSG_RESULT([yes])
-               IIBNAL="iibnal"
+               IIBLND="iiblnd"
        ],[
                AC_MSG_RESULT([no])
                case $ENABLEIIB in
@@ -302,26 +775,26 @@ else
                3) AC_MSG_ERROR([can't compile with Infinicon headers under $IIBPATH]);;
                *) AC_MSG_ERROR([internal error]);;
                esac
-               IIBNAL=""
+               IIBLND=""
                IIBCPPFLAGS=""
        ])
        EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
 fi
 AC_SUBST(IIBCPPFLAGS)
-AC_SUBST(IIBNAL)
+AC_SUBST(IIBLND)
 ])
 
 #
-# LP_CONFIG_VIB
+# LN_CONFIG_VIB
 #
 # check for Voltaire infiniband support
 #
-AC_DEFUN([LP_CONFIG_VIB],
+AC_DEFUN([LN_CONFIG_VIB],
 [AC_MSG_CHECKING([whether to enable Voltaire IB support])
 VIBPATH=""
 AC_ARG_WITH([vib],
        AC_HELP_STRING([--with-vib=path],
-                      [build vibnal against path]),
+                      [build viblnd against path]),
        [
                case $with_vib in
                no)     AC_MSG_RESULT([no]);;
@@ -337,7 +810,7 @@ AC_ARG_WITH([vib],
                AC_MSG_RESULT([no])
        ])
 if test -z "$VIBPATH"; then
-       VIBNAL=""
+       VIBLND=""
 else
        VIBCPPFLAGS="-I${VIBPATH}/include -I${VIBPATH}/cm"
        EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
@@ -371,13 +844,13 @@ else
                                           NULL, 0);
                return 0;
        ],[
-               VIBNAL="vibnal"
+               VIBLND="viblnd"
        ],[
-               AC_MSG_ERROR([can't compile vibnal with given path])
+               AC_MSG_ERROR([can't compile viblnd with given path])
        ])
        EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
 fi
-if test -n "$VIBNAL"; then
+if test -n "$VIBLND"; then
        EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS"
        EXTRA_KCFLAGS="$EXTRA_KCFLAGS $VIBCPPFLAGS"
        AC_MSG_CHECKING([if Voltaire still uses void * sg addresses])
@@ -405,39 +878,18 @@ if test -n "$VIBNAL"; then
        ],[
                AC_MSG_RESULT([no])
        ])
-       AC_MSG_CHECKING([if page_to_phys() must avoid sign extension])
-       LB_LINUX_TRY_COMPILE([
-               #include <linux/kernel.h>
-               #include <linux/mm.h>
-               #include <linux/unistd.h>
-               #include <asm/system.h>
-               #include <asm/io.h>
-       ],[
-               struct page p;
-
-               switch (42) {
-               case 0:
-               case (sizeof(typeof(page_to_phys(&p))) < 8):
-                       break;
-               }
-       ],[
-               AC_MSG_RESULT([yes])
-               VIBCPPFLAGS="$VIBCPPFLAGS -DIBNAL_32BIT_PAGE2PHYS=1"
-       ],[
-               AC_MSG_RESULT([no])
-       ])
        EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
 fi
 AC_SUBST(VIBCPPFLAGS)
-AC_SUBST(VIBNAL)
+AC_SUBST(VIBLND)
 ])
 
 #
-# LP_CONFIG_RANAL
+# LN_CONFIG_RALND
 #
-# check whether to use the RapidArray nal
+# check whether to use the RapidArray lnd
 #
-AC_DEFUN([LP_CONFIG_RANAL],
+AC_DEFUN([LN_CONFIG_RALND],
 [#### Rapid Array
 AC_MSG_CHECKING([if RapidArray kernel headers are present])
 # placeholder
@@ -456,23 +908,23 @@ LB_LINUX_TRY_COMPILE([
        return rc == RAP_SUCCESS ? 0 : 1;
 ],[
        AC_MSG_RESULT([yes])
-       RANAL="ranal"
+       RALND="ralnd"
 ],[
        AC_MSG_RESULT([no])
-       RANAL=""
+       RALND=""
        RACPPFLAGS=""
 ])
 EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save"
 AC_SUBST(RACPPFLAGS)
-AC_SUBST(RANAL)
+AC_SUBST(RALND)
 ])
 
 #
-# LP_STRUCT_PAGE_LIST
+# LN_STRUCT_PAGE_LIST
 #
 # 2.6.4 no longer has page->list
 #
-AC_DEFUN([LP_STRUCT_PAGE_LIST],
+AC_DEFUN([LN_STRUCT_PAGE_LIST],
 [AC_MSG_CHECKING([if struct page has a list field])
 LB_LINUX_TRY_COMPILE([
        #include <linux/mm.h>
@@ -488,11 +940,11 @@ LB_LINUX_TRY_COMPILE([
 ])
 
 #
-# LP_STRUCT_SIGHAND
+# LN_STRUCT_SIGHAND
 #
 # red hat 2.4 adds sighand to struct task_struct
 #
-AC_DEFUN([LP_STRUCT_SIGHAND],
+AC_DEFUN([LN_STRUCT_SIGHAND],
 [AC_MSG_CHECKING([if task_struct has a sighand field])
 LB_LINUX_TRY_COMPILE([
        #include <linux/sched.h>
@@ -508,11 +960,11 @@ LB_LINUX_TRY_COMPILE([
 ])
 
 #
-# LP_FUNC_CPU_ONLINE
+# LN_FUNC_CPU_ONLINE
 #
 # cpu_online is different in rh 2.4, vanilla 2.4, and 2.6
 #
-AC_DEFUN([LP_FUNC_CPU_ONLINE],
+AC_DEFUN([LN_FUNC_CPU_ONLINE],
 [AC_MSG_CHECKING([if kernel defines cpu_online()])
 LB_LINUX_TRY_COMPILE([
        #include <linux/sched.h>
@@ -527,11 +979,30 @@ LB_LINUX_TRY_COMPILE([
 ])
 
 #
-# LP_TYPE_CPUMASK_T
+# LN_TYPE_GFP_T
+#
+# check if gfp_t is typedef-ed
+#
#
# LN_TYPE_GFP_T: probe whether the kernel typedefs gfp_t; defines
# HAVE_GFP_T on success (consumed by later compile tests).
#
AC_DEFUN([LN_TYPE_GFP_T],
[AC_MSG_CHECKING([if kernel defines gfp_t])
LB_LINUX_TRY_COMPILE([
	#include <linux/gfp.h>
],[
	gfp_t mask = 0;
	return sizeof(mask);
],[
	AC_MSG_RESULT([yes])
	AC_DEFINE(HAVE_GFP_T, 1, [gfp_t found])
],[
	AC_MSG_RESULT([no])
])
])
+
+#
+# LN_TYPE_CPUMASK_T
 #
 # same goes for cpumask_t
 #
-AC_DEFUN([LP_TYPE_CPUMASK_T],
+AC_DEFUN([LN_TYPE_CPUMASK_T],
 [AC_MSG_CHECKING([if kernel defines cpumask_t])
 LB_LINUX_TRY_COMPILE([
        #include <linux/sched.h>
@@ -546,11 +1017,11 @@ LB_LINUX_TRY_COMPILE([
 ])
 
 #
-# LP_FUNC_SHOW_TASK
+# LN_FUNC_SHOW_TASK
 #
 # we export show_task(), but not all kernels have it (yet)
 #
-AC_DEFUN([LP_FUNC_SHOW_TASK],
+AC_DEFUN([LN_FUNC_SHOW_TASK],
 [AC_MSG_CHECKING([if kernel exports show_task])
 have_show_task=0
 for file in ksyms sched ; do
@@ -568,57 +1039,120 @@ else
 fi
 ])
 
+# LN_TASKLIST_LOCK
+# 2.6.18 remove tasklist_lock export
AC_DEFUN([LN_TASKLIST_LOCK],
[AC_MSG_CHECKING([kernel export tasklist_lock])
# 2.6.18 dropped the tasklist_lock export; look for it in kernel/fork.c
if grep -q "EXPORT_SYMBOL(tasklist_lock)" \
	"$LINUX/kernel/fork.c" 2>/dev/null ; then
	AC_DEFINE(HAVE_TASKLIST_LOCK, 1,
		  [tasklist_lock exported])
	AC_MSG_RESULT([yes])
else
	AC_MSG_RESULT([no])
fi
])
+
# 2.6.19 API change:
# kmem_cache_destroy(cachep) returns void instead of int
AC_DEFUN([LN_KMEM_CACHE_DESTROY_INT],
[AC_MSG_CHECKING([kmem_cache_destroy(cachep) return int])
LB_LINUX_TRY_COMPILE([
	#include <linux/slab.h>
],[
	int i = kmem_cache_destroy(NULL);
],[
	AC_MSG_RESULT(yes)
	AC_DEFINE(HAVE_KMEM_CACHE_DESTROY_INT, 1,
		[kmem_cache_destroy(cachep) return int])
],[
	AC_MSG_RESULT(no)
])
])
+
# 2.6.19 API change:
# panic_notifier_list uses atomic_notifier operations
#
AC_DEFUN([LN_ATOMIC_PANIC_NOTIFIER],
[AC_MSG_CHECKING([panic_notifier_list is atomic])
LB_LINUX_TRY_COMPILE([
	#include <linux/notifier.h>
	#include <linux/kernel.h>
],[
	struct atomic_notifier_head panic_notifier_list;
],[
	AC_MSG_RESULT(yes)
	AC_DEFINE(HAVE_ATOMIC_PANIC_NOTIFIER, 1,
		[panic_notifier_list is atomic_notifier_head])
],[
	AC_MSG_RESULT(no)
])
])
+
 #
-# LP_PROG_LINUX
+# LN_PROG_LINUX
 #
-# Portals linux kernel checks
+# LNet linux kernel checks
 #
-AC_DEFUN([LP_PROG_LINUX],
-[LP_CONFIG_ZEROCOPY
-LP_CONFIG_AFFINITY
-LP_CONFIG_QUADRICS
-LP_CONFIG_GM
-LP_CONFIG_OPENIB
-LP_CONFIG_VIB
-LP_CONFIG_IIB
-LP_CONFIG_RANAL
AC_DEFUN([LN_PROG_LINUX],
[LN_CONFIG_ZEROCOPY
# NOTE: call order matters here — LN_TYPE_GFP_T defines HAVE_GFP_T (used by
# LN_CONFIG_O2IB's test program) and LN_TYPE_CPUMASK_T defines HAVE_CPUMASK_T
# (used by LN_CONFIG_AFFINITY's test program), so the type probes must run
# before the LND configuration macros that consume them
LN_FUNC_CPU_ONLINE
LN_TYPE_GFP_T
LN_TYPE_CPUMASK_T
LN_CONFIG_AFFINITY
LN_CONFIG_BACKOFF
LN_CONFIG_PANIC_DUMPLOG
# per-fabric LND configuration
LN_CONFIG_QUADRICS
LN_CONFIG_GM
LN_CONFIG_OPENIB
LN_CONFIG_CIB
LN_CONFIG_VIB
LN_CONFIG_IIB
LN_CONFIG_O2IB
LN_CONFIG_RALND
LN_CONFIG_PTLLND
LN_CONFIG_MX

LN_STRUCT_PAGE_LIST
LN_STRUCT_SIGHAND
LN_FUNC_SHOW_TASK
# 2.6.18
LN_TASKLIST_LOCK
# 2.6.19
LN_KMEM_CACHE_DESTROY_INT
LN_ATOMIC_PANIC_NOTIFIER
])
 
 #
-# LP_PROG_DARWIN
+# LN_PROG_DARWIN
 #
 # Darwin checks
 #
-AC_DEFUN([LP_PROG_DARWIN],
AC_DEFUN([LN_PROG_DARWIN],
[# only check currently needed on Darwin: get_preemption_level()
LB_DARWIN_CHECK_FUNCS([get_preemption_level])
])
 
 #
-# LP_PATH_DEFAULTS
+# LN_PATH_DEFAULTS
 #
 # default paths for installed files
 #
-AC_DEFUN([LP_PATH_DEFAULTS],
AC_DEFUN([LN_PATH_DEFAULTS],
[# intentionally empty: no LNet-specific installation-path defaults at present
])
 
 #
-# LP_CONFIGURE
+# LN_CONFIGURE
 #
 # other configure checks
 #
-AC_DEFUN([LP_CONFIGURE],
-[# portals/utils/portals.c
+AC_DEFUN([LN_CONFIGURE],
+[# lnet/utils/portals.c
 AC_CHECK_HEADERS([netdb.h netinet/tcp.h asm/types.h endian.h sys/ioctl.h])
 AC_CHECK_FUNCS([gethostbyname socket connect])
 
-# portals/utils/debug.c
+# lnet/utils/debug.c
 AC_CHECK_HEADERS([linux/version.h])
 
 AC_CHECK_TYPE([spinlock_t],
@@ -626,20 +1160,37 @@ AC_CHECK_TYPE([spinlock_t],
        [],
        [#include <linux/spinlock.h>])
 
-# portals/utils/wirecheck.c
+# lnet/utils/wirecheck.c
 AC_CHECK_FUNCS([strnlen])
 
 # --------  Check for required packages  --------------
 
-LIBS_save="$LIBS"
-LIBS="-lncurses $LIBS"
-AC_CHECK_LIB([readline],[readline],[
+#
+# LC_CONFIG_READLINE
+#
+# Build with readline
+#
+AC_MSG_CHECKING([whether to enable readline support])
+AC_ARG_ENABLE(readline,
+        AC_HELP_STRING([--disable-readline],
+                        [disable readline support]),
+        [],[enable_readline='yes'])
+AC_MSG_RESULT([$enable_readline])
+
+# -------- check for readline if enabled ----
+if test x$enable_readline = xyes ; then
+       LIBS_save="$LIBS"
+       LIBS="-lncurses $LIBS"
+       AC_CHECK_LIB([readline],[readline],[
        LIBREADLINE="-lreadline -lncurses"
        AC_DEFINE(HAVE_LIBREADLINE, 1, [readline library is available])
-],[
+       ],[
        LIBREADLINE=""
-])
-LIBS="$LIBS_save"
+       ])
+       LIBS="$LIBS_save"
+else
+       LIBREADLINE=""
+fi
 AC_SUBST(LIBREADLINE)
 
 AC_MSG_CHECKING([if efence debugging support is requested])
@@ -651,7 +1202,7 @@ AC_MSG_RESULT([$enable_efence])
 if test "$enable_efence" = "yes" ; then
        LIBEFENCE="-lefence"
        AC_DEFINE(HAVE_LIBEFENCE, 1, [libefence support is requested])
-else 
+else
        LIBEFENCE=""
 fi
 AC_SUBST(LIBEFENCE)
@@ -674,6 +1225,31 @@ else
 fi
 AC_SUBST(LIBWRAP)
 
+# -------- check for -lpthread support ----
+AC_MSG_CHECKING([whether to use libpthread for lnet library])
+AC_ARG_ENABLE([libpthread],
+               AC_HELP_STRING([--disable-libpthread],
+                       [disable libpthread]),
+               [],[enable_libpthread=yes])
+if test "$enable_libpthread" = "yes" ; then
+       AC_CHECK_LIB([pthread], [pthread_create],
+               [ENABLE_LIBPTHREAD="yes"],
+               [ENABLE_LIBPTHREAD="no"])
+       if test "$ENABLE_LIBPTHREAD" = "yes" ; then
+               AC_MSG_RESULT([$ENABLE_LIBPTHREAD])
+               PTHREAD_LIBS="-lpthread"
+               AC_DEFINE([HAVE_LIBPTHREAD], 1, [use libpthread])
+       else
+               PTHREAD_LIBS=""
+               AC_MSG_RESULT([no libpthread is found])
+       fi
+       AC_SUBST(PTHREAD_LIBS)
+else
+       AC_MSG_RESULT([no (disabled explicitly)])
+       ENABLE_LIBPTHREAD="no"
+fi
+AC_SUBST(ENABLE_LIBPTHREAD)
+
 # ----------------------------------------
 # some tests for catamount-like systems
 # ----------------------------------------
@@ -697,92 +1273,105 @@ if test x$enable_urandom != xno ; then
        AC_DEFINE([LIBLUSTRE_USE_URANDOM], 1, [use /dev/urandom for random data])
 fi
 
-# -------- check for -lcap and -lpthread ----
+# -------- check for -lcap support ----
 if test x$enable_liblustre = xyes ; then
        AC_CHECK_LIB([cap], [cap_get_proc],
                [
                        CAP_LIBS="-lcap"
                        AC_DEFINE([HAVE_LIBCAP], 1, [use libcap])
                ],
-               [CAP_LIBS=""])
-       AC_SUBST(CAP_LIBS)
-       AC_CHECK_LIB([pthread], [pthread_create],
                [
-                       PTHREAD_LIBS="-lpthread"
-                       AC_DEFINE([HAVE_LIBPTHREAD], 1, [use libpthread])
-               ],
-               [PTHREAD_LIBS=""])
-       AC_SUBST(PTHREAD_LIBS)
+                       CAP_LIBS=""
+               ])
+       AC_SUBST(CAP_LIBS)
+
 fi
+
+LN_CONFIG_MAX_PAYLOAD
+LN_CONFIG_UPTLLND
+LN_CONFIG_USOCKLND
 ])
 
 #
-# LP_CONDITIONALS
+# LN_CONDITIONALS
 #
-# AM_CONDITOINAL defines for portals
+# AM_CONDITOINAL defines for lnet
 #
-AC_DEFUN([LP_CONDITIONALS],
-[AM_CONDITIONAL(BUILD_QSWNAL, test x$QSWNAL = "xqswnal")
-AM_CONDITIONAL(BUILD_GMNAL, test x$GMNAL = "xgmnal")
-AM_CONDITIONAL(BUILD_OPENIBNAL, test x$OPENIBNAL = "xopenibnal")
-AM_CONDITIONAL(BUILD_IIBNAL, test x$IIBNAL = "xiibnal")
-AM_CONDITIONAL(BUILD_VIBNAL, test x$VIBNAL = "xvibnal")
-AM_CONDITIONAL(BUILD_RANAL, test x$RANAL = "xranal")
+AC_DEFUN([LN_CONDITIONALS],
+[AM_CONDITIONAL(BUILD_QSWLND, test x$QSWLND = "xqswlnd")
+AM_CONDITIONAL(BUILD_GMLND, test x$GMLND = "xgmlnd")
+AM_CONDITIONAL(BUILD_MXLND, test x$MXLND = "xmxlnd")
+AM_CONDITIONAL(BUILD_O2IBLND, test x$O2IBLND = "xo2iblnd")
+AM_CONDITIONAL(BUILD_OPENIBLND, test x$OPENIBLND = "xopeniblnd")
+AM_CONDITIONAL(BUILD_CIBLND, test x$CIBLND = "xciblnd")
+AM_CONDITIONAL(BUILD_IIBLND, test x$IIBLND = "xiiblnd")
+AM_CONDITIONAL(BUILD_VIBLND, test x$VIBLND = "xviblnd")
+AM_CONDITIONAL(BUILD_RALND, test x$RALND = "xralnd")
+AM_CONDITIONAL(BUILD_PTLLND, test x$PTLLND = "xptllnd")
+AM_CONDITIONAL(BUILD_UPTLLND, test x$UPTLLND = "xptllnd")
+AM_CONDITIONAL(BUILD_USOCKLND, test x$USOCKLND = "xusocklnd")
 ])
 
 #
-# LP_CONFIG_FILES
+# LN_CONFIG_FILES
 #
 # files that should be generated with AC_OUTPUT
 #
-AC_DEFUN([LP_CONFIG_FILES],
+AC_DEFUN([LN_CONFIG_FILES],
 [AC_CONFIG_FILES([
-portals/Kernelenv
-portals/Makefile
-portals/autoMakefile
-portals/autoconf/Makefile
-portals/doc/Makefile
-portals/include/Makefile
-portals/include/libcfs/Makefile
-portals/include/libcfs/linux/Makefile
-portals/include/portals/Makefile
-portals/include/portals/linux/Makefile
-portals/knals/Makefile
-portals/knals/autoMakefile
-portals/knals/gmnal/Makefile
-portals/knals/gmnal/autoMakefile
-portals/knals/openibnal/Makefile
-portals/knals/openibnal/autoMakefile
-portals/knals/iibnal/Makefile
-portals/knals/iibnal/autoMakefile
-portals/knals/vibnal/Makefile
-portals/knals/vibnal/autoMakefile
-portals/knals/lonal/Makefile
-portals/knals/lonal/autoMakefile
-portals/knals/qswnal/Makefile
-portals/knals/qswnal/autoMakefile
-portals/knals/ranal/Makefile
-portals/knals/ranal/autoMakefile
-portals/knals/socknal/Makefile
-portals/knals/socknal/autoMakefile
-portals/libcfs/Makefile
-portals/libcfs/autoMakefile
-portals/libcfs/linux/Makefile
-portals/portals/Makefile
-portals/portals/autoMakefile
-portals/router/Makefile
-portals/router/autoMakefile
-portals/tests/Makefile
-portals/tests/autoMakefile
-portals/unals/Makefile
-portals/utils/Makefile
+lnet/Kernelenv
+lnet/Makefile
+lnet/autoMakefile
+lnet/autoconf/Makefile
+lnet/doc/Makefile
+lnet/include/Makefile
+lnet/include/libcfs/Makefile
+lnet/include/libcfs/linux/Makefile
+lnet/include/lnet/Makefile
+lnet/include/lnet/linux/Makefile
+lnet/klnds/Makefile
+lnet/klnds/autoMakefile
+lnet/klnds/gmlnd/Makefile
+lnet/klnds/mxlnd/autoMakefile
+lnet/klnds/mxlnd/Makefile
+lnet/klnds/gmlnd/autoMakefile
+lnet/klnds/openiblnd/Makefile
+lnet/klnds/openiblnd/autoMakefile
+lnet/klnds/o2iblnd/Makefile
+lnet/klnds/o2iblnd/autoMakefile
+lnet/klnds/ciblnd/Makefile
+lnet/klnds/ciblnd/autoMakefile
+lnet/klnds/iiblnd/Makefile
+lnet/klnds/iiblnd/autoMakefile
+lnet/klnds/viblnd/Makefile
+lnet/klnds/viblnd/autoMakefile
+lnet/klnds/qswlnd/Makefile
+lnet/klnds/qswlnd/autoMakefile
+lnet/klnds/ralnd/Makefile
+lnet/klnds/ralnd/autoMakefile
+lnet/klnds/socklnd/Makefile
+lnet/klnds/socklnd/autoMakefile
+lnet/klnds/ptllnd/Makefile
+lnet/klnds/ptllnd/autoMakefile
+lnet/libcfs/Makefile
+lnet/libcfs/autoMakefile
+lnet/libcfs/linux/Makefile
+lnet/lnet/Makefile
+lnet/lnet/autoMakefile
+lnet/tests/Makefile
+lnet/tests/autoMakefile
+lnet/ulnds/Makefile
+lnet/ulnds/autoMakefile
+lnet/ulnds/socklnd/Makefile
+lnet/ulnds/ptllnd/Makefile
+lnet/utils/Makefile
 ])
 case $lb_target_os in
        darwin)
                AC_CONFIG_FILES([
-portals/include/libcfs/darwin/Makefile
-portals/include/portals/darwin/Makefile
-portals/libcfs/darwin/Makefile
+lnet/include/libcfs/darwin/Makefile
+lnet/include/lnet/darwin/Makefile
+lnet/libcfs/darwin/Makefile
 ])
                ;;
 esac
index dd6db1d..006180b 100644 (file)
@@ -1,3 +1,3 @@
-SUBDIRS = libcfs portals
+SUBDIRS = libcfs lnet
 
 EXTRA_DIST = cygwin-ioctl.h
index 50e377a..2874a52 100644 (file)
@@ -4,5 +4,5 @@ SUBDIRS += darwin
 endif
 DIST_SUBDIRS := $(SUBDIRS)
 
-EXTRA_DIST := curproc.h kp30.h libcfs.h list.h lltrace.h portals_lib.h \
-       portals_utils.h user-lock.h user-prim.h user-time.h
+EXTRA_DIST := curproc.h kp30.h libcfs.h list.h lltrace.h \
+       portals_utils.h types.h user-lock.h user-prim.h user-time.h
index 630912d..6495c66 100644 (file)
@@ -20,6 +20,7 @@
 #ifndef __LIBCFS_CURPROC_H__
 #define __LIBCFS_CURPROC_H__
 
+#ifdef __KERNEL__
 /*
  * Portable API to access common characteristics of "current" UNIX process.
  *
@@ -48,6 +49,7 @@ char  *cfs_curproc_comm(void);
  */
 cfs_kernel_cap_t cfs_curproc_cap_get(void);
 void cfs_curproc_cap_set(cfs_kernel_cap_t cap);
+#endif
 
 /* __LIBCFS_CURPROC_H__ */
 #endif
index 4ff2072..f2f217a 100644 (file)
@@ -1,3 +1,3 @@
 EXTRA_DIST := darwin-mem.h darwin-types.h libcfs.h portals_utils.h     \
        darwin-fs.h darwin-prim.h darwin-utils.h lltrace.h              \
-       darwin-lock.h darwin-sync.h kp30.h portals_lib.h
+       darwin-lock.h darwin-sync.h darwin-tcpip.h kp30.h
index 32244e7..5eed9ef 100644 (file)
@@ -1,5 +1,24 @@
-#ifndef __LIBCFS_DARWIN_CFS_FS_H__
-#define __LIBCFS_DARWIN_CFS_FS_H__
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Implementation of standard file system interfaces for XNU kernel.
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+#ifndef __LIBCFS_DARWIN_FS_H__
+#define __LIBCFS_DARWIN_FS_H__
 
 #ifndef __LIBCFS_LIBCFS_H__
 #error Do not #include this file directly. #include <libcfs/libcfs.h> instead
 
 #include <sys/types.h>
 #include <sys/systm.h>
-/*
- * __APPLE_API_PRIVATE is defined before include user.h
- * Doing this way to get the define of uthread, it's not good
- * but I do need to know what's inside uthread.
- */
-#ifndef __APPLE_API_PRIVATE
-#define __APPLE_API_PRIVATE
-#include <sys/vnode.h>
-#undef __APPLE_API_PRIVATE
-#else
-#include <sys/vnode.h>
-#endif
 
 #include <sys/kernel.h>
 #include <sys/file.h>
 #include <sys/time.h>
 #include <sys/filedesc.h>
-#include <sys/stat.h>
 #include <sys/mount.h>
+#include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/ubc.h>
 #include <sys/mbuf.h>
@@ -37,7 +44,6 @@
 #include <stdarg.h>
 
 #include <mach/mach_types.h>
-#include <mach/mach_traps.h>
 #include <mach/time_value.h>
 #include <kern/clock.h>
 #include <sys/param.h>
 /*
  * File operating APIs in kernel
  */
+#ifdef __DARWIN8__
+/*
+ * Kernel file descriptor
+ */
+typedef struct cfs_kern_file {
+        int             f_flags;
+        vnode_t         f_vp;
+        vfs_context_t   f_ctxt;
+} cfs_file_t;
+
+#else
+
 typedef struct file cfs_file_t;
 
-int    filp_node_size(cfs_file_t *fp, off_t    *size);
+#endif
+
+int    kern_file_size(cfs_file_t *fp, off_t    *size);
 #define cfs_filp_size(fp)                      \
        ({                                      \
                off_t           __size;         \
-               filp_node_size((fp), &__size);  \
+               kern_file_size((fp), &__size);  \
                __size;                         \
         })
 #define cfs_filp_poff(fp)               (NULL)
 
-cfs_file_t *filp_open(const char *name, int flags, int mode, int *err);
-int filp_close(cfs_file_t *fp);
-int filp_read(cfs_file_t *fp, void *buf, size_t nbytes, off_t *pos);
-int filp_write(cfs_file_t *fp, void *buf, size_t nbytes, off_t *pos);
-int filp_fsync(cfs_file_t *fp);
+cfs_file_t *kern_file_open(const char *name, int flags, int mode, int *err);
+int kern_file_close(cfs_file_t *fp);
+int kern_file_read(cfs_file_t *fp, void *buf, size_t nbytes, off_t *pos);
+int kern_file_write(cfs_file_t *fp, void *buf, size_t nbytes, off_t *pos);
+int kern_file_sync(cfs_file_t *fp);
 
-#define cfs_filp_open(n, f, m, e)      filp_open(n, f, m, e)
-#define cfs_filp_close(f)              filp_close(f)
-#define cfs_filp_read(f, b, n, p)      filp_read(f, b, n, p)
-#define cfs_filp_write(f, b, n, p)     filp_write(f, b, n, p)
-#define cfs_filp_fsync(f)              filp_fsync(f)
+#define cfs_filp_open(n, f, m, e)      kern_file_open(n, f, m, e)
+#define cfs_filp_close(f)              kern_file_close(f)
+#define cfs_filp_read(f, b, n, p)      kern_file_read(f, b, n, p)
+#define cfs_filp_write(f, b, n, p)     kern_file_write(f, b, n, p)
+#define cfs_filp_fsync(f)              kern_file_sync(f)
 
 int ref_file(cfs_file_t *fp);
 int rele_file(cfs_file_t *fp);
@@ -85,25 +105,25 @@ int file_count(cfs_file_t *fp);
 #define CFS_OFFSET_MAX                 CFS_INT_LIMIT(loff_t)
 
 typedef struct flock                   cfs_flock_t;
-#define CFS_FLOCK_TYPE(fl)             ((fl)->l_type)
-#define CFS_FLOCK_SET_TYPE(fl, type)   do { (fl)->l_type = (type); } while(0)
-#define CFS_FLOCK_PID(fl)              ((fl)->l_pid)
-#define CFS_FLOCK_SET_PID(fl, pid)     do { (fl)->l_pid = (pid); } while(0)
-#define CFS_FLOCK_START(fl)            ((fl)->l_start)
-#define CFS_FLOCK_SET_START(fl, start) do { (fl)->l_start = (start); } while(0)
-#define CFS_FLOCK_END(fl)              ((fl)->l_len == 0? CFS_OFFSET_MAX: ((fl)->l_start + (fl)->l_en))
-#define CFS_FLOCK_SET_END(fl, end)             \
-       do {                                    \
-               if (end == CFS_OFFSET_MAX)      \
-                       (fl)->l_len = 0;        \
-               else                            \
-                       (fl)->l_len = (end) - (fl)->l_start;\
-       } while(0)
-
-typedef struct {
-       void    *d;
-} cfs_dentry_t;
-typedef unsigned short umode_t;
+#define cfs_flock_type(fl)             ((fl)->l_type)
+#define cfs_flock_set_type(fl, type)   do { (fl)->l_type = (type); } while(0)
+#define cfs_flock_pid(fl)              ((fl)->l_pid)
+#define cfs_flock_set_pid(fl, pid)     do { (fl)->l_pid = (pid); } while(0)
+#define cfs_flock_start(fl)            ((fl)->l_start)
+#define cfs_flock_set_start(fl, start) do { (fl)->l_start = (start); } while(0)
+
+static inline loff_t cfs_flock_end(cfs_flock_t *fl)
+{
+        return (fl->l_len == 0 ? CFS_OFFSET_MAX: (fl->l_start + fl->l_len));
+}
+
+static inline void cfs_flock_set_end(cfs_flock_t *fl, loff_t end)
+{
+        if (end == CFS_OFFSET_MAX)
+                fl->l_len = 0;
+        else
+                fl->l_len = end - fl->l_start;
+}
 
 #define ATTR_MODE       0x0001
 #define ATTR_UID        0x0002
@@ -119,13 +139,59 @@ typedef unsigned short umode_t;
 #define ATTR_RAW        0x0800  /* file system, not vfs will massage attrs */
 #define ATTR_FROM_OPEN  0x1000  /* called from open path, ie O_TRUNC */
 #define ATTR_CTIME_SET  0x2000
+#define ATTR_BLOCKS     0x4000
 
 #define in_group_p(x)  (0)
 
-#endif
+struct posix_acl_entry {
+        short                   e_tag;
+        unsigned short          e_perm;
+        unsigned int            e_id;
+};
+
+struct posix_acl {
+        atomic_t                a_refcount;
+        unsigned int            a_count;
+        struct posix_acl_entry  a_entries[0];
+};
+
+struct posix_acl *posix_acl_alloc(int count, int flags);
+static inline struct posix_acl *posix_acl_from_xattr(const void *value, 
+                                                     size_t size)
+{ 
+        return posix_acl_alloc(0, 0);
+}
+static inline void posix_acl_release(struct posix_acl *acl) {};
+static inline int posix_acl_valid(const struct posix_acl *acl) { return 0; }
+static inline struct posix_acl * posix_acl_dup(struct posix_acl *acl) 
+{ 
+        return acl;
+}
+
+/*
+ * portable UNIX device file identification.
+ */
+
+typedef dev_t cfs_rdev_t;
+
+#else  /* !__KERNEL__ */
+
+typedef struct file cfs_file_t;
 
+#endif /* END __KERNEL__ */
+
+typedef struct {
+       void    *d;
+} cfs_dentry_t;
+
+#ifndef O_SYNC
 #define O_SYNC                                 0
+#endif
+#ifndef O_DIRECTORY
 #define O_DIRECTORY                            0
+#endif
+#ifndef O_LARGEFILE
 #define O_LARGEFILE                            0
+#endif
 
 #endif
index da16418..f826fef 100644 (file)
@@ -9,10 +9,6 @@
 #include <mach/sync_policy.h>
 #include <mach/task.h>
 #include <mach/semaphore.h>
-#include <mach/mach_traps.h>
-
-/* spin lock types and operations */
-#include <kern/simple_lock.h>
 #include <kern/assert.h>
 #include <kern/thread.h>
 
@@ -56,12 +52,18 @@ static inline int spin_trylock(spinlock_t *lock)
        return kspin_trylock(&lock->spin);
 }
 
+static inline void spin_lock_done(spinlock_t *lock)
+{
+       kspin_done(&lock->spin);
+}
+
+#error "does this lock out timer callbacks?"
 #define spin_lock_bh(x)                spin_lock(x)
 #define spin_unlock_bh(x)      spin_unlock(x)
 #define spin_lock_bh_init(x)   spin_lock_init(x)
 
 extern boolean_t ml_set_interrupts_enabled(boolean_t enable);
-#define __disable_irq()         (spl_t) ml_set_interrupts_enabled(FALSE)
+#define __disable_irq()         ml_set_interrupts_enabled(FALSE)
 #define __enable_irq(x)         (void) ml_set_interrupts_enabled(x)
 
 #define spin_lock_irqsave(s, f)                do{                     \
@@ -165,6 +167,11 @@ static inline void init_rwsem(struct rw_semaphore *s)
        krw_sem_init(&s->s);
 }
 
+static inline void fini_rwsem(struct rw_semaphore *s)
+{
+       krw_sem_done(&s->s);
+}
+
 static inline void down_read(struct rw_semaphore *s)
 {
        krw_sem_down_r(&s->s);
@@ -173,7 +180,7 @@ static inline void down_read(struct rw_semaphore *s)
 static inline int down_read_trylock(struct rw_semaphore *s)
 {
        int ret = krw_sem_down_r_try(&s->s);
-       return ret == 0? 1: 0;
+       return ret == 0;
 }
 
 static inline void down_write(struct rw_semaphore *s)
@@ -184,7 +191,7 @@ static inline void down_write(struct rw_semaphore *s)
 static inline int down_write_trylock(struct rw_semaphore *s)
 {
        int ret = krw_sem_down_w_try(&s->s);
-       return ret == 0? 1: 0;
+       return ret == 0;
 }
 
 static inline void up_read(struct rw_semaphore *s)
@@ -199,7 +206,6 @@ static inline void up_write(struct rw_semaphore *s)
 
 /* 
  * read-write lock : Need to be investigated more!!
- * XXX nikita: for now, let rwlock_t to be identical to rw_semaphore
  *
  * - DECLARE_RWLOCK(l)
  * - rwlock_init(x)
@@ -208,14 +214,14 @@ static inline void up_write(struct rw_semaphore *s)
  * - write_lock(x)
  * - write_unlock(x)
  */
-typedef struct rw_semaphore rwlock_t;
+typedef struct krw_spin rwlock_t;
 
-#define rwlock_init(pl)                init_rwsem(pl)
+#define rwlock_init(pl)                        krw_spin_init(pl)
 
-#define read_lock(l)           down_read(l)
-#define read_unlock(l)         up_read(l)
-#define write_lock(l)          down_write(l)
-#define write_unlock(l)                up_write(l)
+#define read_lock(l)                   krw_spin_down_r(l)
+#define read_unlock(l)                 krw_spin_up_r(l)
+#define write_lock(l)                  krw_spin_down_w(l)
+#define write_unlock(l)                        krw_spin_up_w(l)
 
 #define write_lock_irqsave(l, f)       do{                     \
                                        f = __disable_irq();    \
@@ -232,12 +238,23 @@ typedef struct rw_semaphore rwlock_t;
 #define read_unlock_irqrestore(l, f)   do{                     \
                                        read_unlock(l);         \
                                        __enable_irq(f);}while(0)
-
 /*
  * Funnel: 
  *
  * Safe funnel in/out
  */
+#ifdef __DARWIN8__
+
+#define CFS_DECL_FUNNEL_DATA
+#define CFS_DECL_CONE_DATA              DECLARE_FUNNEL_DATA
+#define CFS_DECL_NET_DATA               DECLARE_FUNNEL_DATA
+#define CFS_CONE_IN                     do {} while(0)
+#define CFS_CONE_EX                     do {} while(0)
+
+#define CFS_NET_IN                      do {} while(0)
+#define CFS_NET_EX                      do {} while(0)
+
+#else
 
 #define CFS_DECL_FUNNEL_DATA                   \
         boolean_t    __funnel_state = FALSE;   \
@@ -257,8 +274,11 @@ void lustre_net_ex(boolean_t state, funnel_t *cone);
 #define CFS_NET_IN  lustre_net_in(&__funnel_state, &__funnel)
 #define CFS_NET_EX  lustre_net_ex(__funnel_state, __funnel)
 
-/* __KERNEL__ */
 #endif
 
+#else
+#include <libcfs/user-lock.h>
+#endif /* __KERNEL__ */
+
 /* __XNU_CFS_LOCK_H */
 #endif
index 922a1b8..5ffcd4e 100644 (file)
 #include <libcfs/list.h>
 
 /*
- * Page of OSX
- *
- * There is no page in OSX, however, we need page in lustre.
- */
-#define PAGE_MASK                              (~(PAGE_SIZE-1))
-#define _ALIGN_UP(addr,size)                   (((addr)+((size)-1))&(~((size)-1)))
-#define _ALIGN(addr,size)                      _ALIGN_UP(addr,size)
-#define PAGE_ALIGN(addr)                       _ALIGN(addr, PAGE_SIZE)
-
-/*
  * Basic xnu_page struct, should be binary compatibility with
  * all page types in xnu (we have only xnu_raw_page, xll_page now)
  */
 
 /* Variable sized pages are not supported */
 
+#ifdef PAGE_SHIFT
+#define CFS_PAGE_SHIFT PAGE_SHIFT
+#else
 #define CFS_PAGE_SHIFT 12
-#define CFS_PAGE_SIZE  (1 << CFS_PAGE_SHIFT)
-#define PAGE_CACHE_SIZE CFS_PAGE_SIZE
-#define CFS_PAGE_MASK  (~(CFS_PAGE_SIZE - 1))
+#endif
+
+#define CFS_PAGE_SIZE  (1UL << CFS_PAGE_SHIFT)
+
+#define CFS_PAGE_MASK  (~((__u64)CFS_PAGE_SIZE - 1))
 
 enum {
        XNU_PAGE_RAW,
@@ -98,20 +93,16 @@ void xnu_page_ops_unregister(int type);
  * raw page, no cache object, just like buffer
  */
 struct xnu_raw_page {
-       struct xnu_page header;
-       vm_address_t    virtual;
-       upl_t           upl;
-       int             order;
-       atomic_t        count;
-       void           *private;
+       struct xnu_page  header;
+       void            *virtual;
+       atomic_t         count;
+       struct list_head link;
 };
 
 /*
  * Public interface to lustre
  *
- * - cfs_alloc_pages(f, o)
  * - cfs_alloc_page(f)
- * - cfs_free_pages(p, o)
  * - cfs_free_page(p)
  * - cfs_kmap(p)
  * - cfs_kunmap(p)
@@ -124,14 +115,12 @@ struct xnu_raw_page {
  * pages only.
  */
 
-cfs_page_t *cfs_alloc_pages(u_int32_t flags, u_int32_t order);
 cfs_page_t *cfs_alloc_page(u_int32_t flags);
-void cfs_free_pages(cfs_page_t *pages, int order);
 void cfs_free_page(cfs_page_t *page);
 void cfs_get_page(cfs_page_t *page);
 int cfs_put_page_testzero(cfs_page_t *page);
 int cfs_page_count(cfs_page_t *page);
-void cfs_set_page_count(cfs_page_t *page, int v);
+#define cfs_page_index(pg)     (0)
 
 void *cfs_page_address(cfs_page_t *pg);
 void *cfs_kmap(cfs_page_t *pg);
@@ -141,48 +130,84 @@ void cfs_kunmap(cfs_page_t *pg);
  * Memory allocator
  */
 
-extern void *cfs_alloc(size_t nr_bytes, u_int32_t flags);
-extern void  cfs_free(void *addr);
+void *cfs_alloc(size_t nr_bytes, u_int32_t flags);
+void  cfs_free(void *addr);
+
+void *cfs_alloc_large(size_t nr_bytes);
+void  cfs_free_large(void *addr);
 
-extern void *cfs_alloc_large(size_t nr_bytes);
-extern void  cfs_free_large(void *addr);
+extern int get_preemption_level(void);
+
+#define CFS_ALLOC_ATOMIC_TRY                                    \
+       (get_preemption_level() != 0 ? CFS_ALLOC_ATOMIC : 0)
 
 /*
  * Slab:
  *
- * No slab in OSX, use zone allocator to fake slab
+ * No slab in OSX, use zone allocator to simulate slab
  */
 #define SLAB_HWCACHE_ALIGN             0
 
+#ifdef __DARWIN8__
+/* 
+ * In Darwin8, we cannot use zalloc_noblock(not exported by kernel),
+ * also, direct using of zone allocator is not recommended.
+ */
+#define CFS_INDIVIDUAL_ZONE     (0)
+
+#if !CFS_INDIVIDUAL_ZONE
+#include <libkern/OSMalloc.h>
+typedef        OSMallocTag     mem_cache_t;
+#else
+typedef                void*           zone_t;
+typedef                zone_t          mem_cache_t;
+#endif
+
+#else /* !__DARWIN8__ */
+
+#define CFS_INDIVIDUAL_ZONE     (1)
+
+typedef        zone_t          mem_cache_t;
+
+#endif /* !__DARWIN8__ */
+
+#define MC_NAME_MAX_LEN                64
+
 typedef struct cfs_mem_cache {
-       struct list_head        link;
-       zone_t                  zone;
-       int                     size;
-       char                    name [ZONE_NAME_MAX_LEN];
+       int                     mc_size;
+       mem_cache_t             mc_cache;
+       struct list_head        mc_link;
+       char                    mc_name [MC_NAME_MAX_LEN];
 } cfs_mem_cache_t;
 
 #define KMEM_CACHE_MAX_COUNT   64
 #define KMEM_MAX_ZONE          8192
 
-extern cfs_mem_cache_t * cfs_mem_cache_create (const char *, size_t, size_t, unsigned long,
-                                              void (*)(void *, cfs_mem_cache_t *, unsigned long),
-                                              void (*)(void *, cfs_mem_cache_t *, unsigned long));
-extern int cfs_mem_cache_destroy ( cfs_mem_cache_t * );
-extern void *cfs_mem_cache_alloc ( cfs_mem_cache_t *, int);
-extern void cfs_mem_cache_free ( cfs_mem_cache_t *, void *);
+cfs_mem_cache_t * cfs_mem_cache_create (const char *, size_t, size_t, unsigned long);
+int cfs_mem_cache_destroy ( cfs_mem_cache_t * );
+void *cfs_mem_cache_alloc ( cfs_mem_cache_t *, int);
+void cfs_mem_cache_free ( cfs_mem_cache_t *, void *);
 
 /*
  * Misc
  */
-/* XXX fix me */
+/* XXX Liang: num_physpages... fix me */
 #define num_physpages                  (64 * 1024)
 
 #define CFS_DECL_MMSPACE               
 #define CFS_MMSPACE_OPEN               do {} while(0)
 #define CFS_MMSPACE_CLOSE              do {} while(0)
 
-#define copy_from_user(kaddr, uaddr, size)     copyin((caddr_t)uaddr, (caddr_t)kaddr, size)
-#define copy_to_user(uaddr, kaddr, size)       copyout((caddr_t)kaddr, (caddr_t)uaddr, size)
+#define copy_from_user(kaddr, uaddr, size)     copyin(CAST_USER_ADDR_T(uaddr), (caddr_t)kaddr, size)
+#define copy_to_user(uaddr, kaddr, size)       copyout((caddr_t)kaddr, CAST_USER_ADDR_T(uaddr), size)
+
+#if 0
+static inline int strncpy_from_user(char *kaddr, char *uaddr, int size)
+{
+       size_t count;
+       return copyinstr((const user_addr_t)uaddr, (void *)kaddr, size, &count);
+}
+#endif
 
 #if defined (__ppc__)
 #define mb()  __asm__ __volatile__ ("sync" : : : "memory")
@@ -198,9 +223,10 @@ extern void cfs_mem_cache_free ( cfs_mem_cache_t *, void *);
 
 #else  /* !__KERNEL__ */
 
-typedef struct cfs_page{
-       void    *foo;
-} cfs_page_t;
+#define CFS_CACHE_SHIFT 12
+#define PAGE_CACHE_SIZE (1 << CFS_CACHE_SHIFT)
+#include <libcfs/user-prim.h>
+
 #endif /* __KERNEL__ */
 
 #endif /* __XNU_CFS_MEM_H__ */
index ec9be59..00fbeed 100644 (file)
@@ -9,25 +9,29 @@
 #include <sys/types.h>
 #include <sys/systm.h>
 
-#ifndef __APPLE_API_PRIVATE
-#define __APPLE_API_PRIVATE
-#include <sys/user.h>
-#undef __APPLE_API_PRIVATE
-#else
-#include <sys/user.h>
-#endif
+#ifndef __DARWIN8__
+# ifndef __APPLE_API_PRIVATE
+#  define __APPLE_API_PRIVATE
+#  include <sys/user.h>
+#  undef __APPLE_API_PRIVATE
+# else
+#  include <sys/user.h>
+# endif
+# include <mach/mach_traps.h>
+# include <mach/thread_switch.h>
+# include <machine/cpu_number.h>
+#endif /* !__DARWIN8__ */
 
 #include <sys/kernel.h>
 
 #include <mach/thread_act.h>
 #include <mach/mach_types.h>
-#include <mach/mach_traps.h>
-#include <mach/thread_switch.h>
 #include <mach/time_value.h>
 #include <kern/sched_prim.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <mach/machine/vm_param.h>
+#include <machine/machine_routines.h>
 #include <kern/clock.h>
 #include <kern/thread_call.h>
 #include <sys/param.h>
@@ -63,17 +67,19 @@ extern kern_return_t            cfs_symbol_put(const char *);
  * User can register/unregister a list of sysctl_oids
  * sysctl_oid is data struct of osx's sysctl-entry
  */
+#define        CONFIG_SYSCTL   1
+
 typedef struct sysctl_oid *     cfs_sysctl_table_t;
 typedef cfs_sysctl_table_t      cfs_sysctl_table_header_t;
-cfs_sysctl_table_header_t      *register_cfs_sysctl_table (cfs_sysctl_table_t *table, int arg);
-void unregister_cfs_sysctl_table (cfs_sysctl_table_header_t *table);
+cfs_sysctl_table_header_t      *cfs_register_sysctl_table (cfs_sysctl_table_t *table, int arg);
+void cfs_unregister_sysctl_table (cfs_sysctl_table_header_t *table);
 
 /*
  * Proc file system APIs, no /proc fs support in OSX
  */
-typedef struct cfs_proc_dir_entry{
+typedef struct cfs_proc_dir_entry {
        void            *data;
-}cfs_proc_dir_entry_t;
+} cfs_proc_dir_entry_t;
 
 cfs_proc_dir_entry_t * cfs_create_proc_entry(char *name, int mod,
                                          cfs_proc_dir_entry_t *parent);
@@ -111,12 +117,23 @@ extern kern_return_t            cfs_psdev_deregister(cfs_psdev_t *);
 extern boolean_t        assert_wait_possible(void);
 extern void             *get_bsdtask_info(task_t);
 
+#ifdef __DARWIN8__
+
+typedef struct {}              cfs_task_t;
+#define cfs_current()          ((cfs_task_t *)current_thread())
+#else  /* !__DARWIN8__ */
+
 typedef struct uthread         cfs_task_t;
+
 #define current_uthread()       ((struct uthread *)get_bsdthread_info(current_act()))
 #define cfs_current()          current_uthread()
 
+#endif /* !__DARWIN8__ */
+
+#define cfs_task_lock(t)       do {;} while (0)
+#define cfs_task_unlock(t)     do {;} while (0)
+
 #define set_current_state(s)   do {;} while (0)
-#define reparent_to_init()     do {;} while (0)
 
 #define CFS_DECL_JOURNAL_DATA  
 #define CFS_PUSH_JOURNAL       do {;} while(0)
@@ -128,109 +145,12 @@ typedef struct uthread           cfs_task_t;
  *
  * OSX kernel thread can not be created with args,
  * so we have to implement new APIs to create thread with args
- *
- * All requests to create kernel thread will create a new
- * thread instance of cfs_thread_agent, one by one.
- * cfs_thread_agent will call the caller's thread function
- * with argument supplied by caller.
  */
 
 typedef int (*cfs_thread_t)(void *);
 
 extern task_t  kernel_task;
 
-struct kernel_thread_arg
-{
-       spinlock_t      lock;
-       atomic_t        inuse;
-       cfs_thread_t    func;
-       void            *arg;
-};
-
-extern struct kernel_thread_arg cfs_thread_arg;
-extern void cfs_thread_agent(void);
-
-#define THREAD_ARG_FREE                        0
-#define THREAD_ARG_HOLD                        1
-#define THREAD_ARG_RECV                        2
-
-#define set_targ_stat(a, v)            atomic_set(&(a)->inuse, v)
-#define get_targ_stat(a)               atomic_read(&(a)->inuse)
-
-/*
- * Hold the thread argument and set the status of thread_status
- * to THREAD_ARG_HOLD, if the thread argument is held by other
- * threads (It's THREAD_ARG_HOLD already), current-thread has to wait.
- */
-#define thread_arg_hold(pta, _func, _arg)                      \
-       do {                                                    \
-               spin_lock(&(pta)->lock);                        \
-               if (get_targ_stat(pta) == THREAD_ARG_FREE) {    \
-                       set_targ_stat((pta), THREAD_ARG_HOLD);  \
-                       (pta)->arg = (void *)_arg;              \
-                       (pta)->func = _func;                    \
-                       spin_unlock(&(pta)->lock);              \
-                       break;                                  \
-               }                                               \
-               spin_unlock(&(pta)->lock);                      \
-               schedule();                                     \
-       } while(1);                                             \
-
-/*
- * Release the thread argument if the thread argument has been
- * received by the child-thread (Status of thread_args is
- * THREAD_ARG_RECV), otherwise current-thread has to wait.
- * After release, the thread_args' status will be set to
- * THREAD_ARG_FREE, and others can re-use the thread_args to
- * create new kernel_thread.
- */
-#define thread_arg_release(pta)                                        \
-       do {                                                    \
-               spin_lock(&(pta)->lock);                        \
-               if (get_targ_stat(pta) == THREAD_ARG_RECV) {    \
-                       (pta)->arg = NULL;                      \
-                       (pta)->func = NULL;                     \
-                       set_targ_stat(pta, THREAD_ARG_FREE);    \
-                       spin_unlock(&(pta)->lock);              \
-                       break;                                  \
-               }                                               \
-               spin_unlock(&(pta)->lock);                      \
-               schedule();                                     \
-       } while(1)
-
-/*
- * Receive thread argument (Used in child thread), set the status
- * of thread_args to THREAD_ARG_RECV.
- */
-#define __thread_arg_recv_fin(pta, _func, _arg, fin)           \
-       do {                                                    \
-               spin_lock(&(pta)->lock);                        \
-               if (get_targ_stat(pta) == THREAD_ARG_HOLD) {    \
-                       if (fin)                                \
-                           set_targ_stat(pta, THREAD_ARG_RECV);\
-                       _arg = (pta)->arg;                      \
-                       _func = (pta)->func;                    \
-                       spin_unlock(&(pta)->lock);              \
-                       break;                                  \
-               }                                               \
-               spin_unlock(&(pta)->lock);                      \
-               schedule();                                     \
-       } while (1);                                            \
-
-/*
- * Just set the thread_args' status to THREAD_ARG_RECV
- */
-#define thread_arg_fin(pta)                                    \
-       do {                                                    \
-               spin_lock(&(pta)->lock);                        \
-               assert( get_targ_stat(pta) == THREAD_ARG_HOLD); \
-               set_targ_stat(pta, THREAD_ARG_RECV);            \
-               spin_unlock(&(pta)->lock);                      \
-       } while(0)
-
-#define thread_arg_recv(pta, f, a)     __thread_arg_recv_fin(pta, f, a, 1)
-#define thread_arg_keep(pta, f, a)     __thread_arg_recv_fin(pta, f, a, 0)
-
 /*
  * cloning flags, no use in OSX, just copy them from Linux
  */
@@ -265,11 +185,16 @@ typedef struct cfs_waitlink {
        struct ksleep_link  wl_ksleep_link;
 } cfs_waitlink_t;
 
+typedef int cfs_task_state_t;
+
+#define CFS_TASK_INTERRUPTIBLE THREAD_ABORTSAFE
+#define CFS_TASK_UNINT         THREAD_UNINT
+
 void cfs_waitq_init(struct cfs_waitq *waitq);
 void cfs_waitlink_init(struct cfs_waitlink *link);
 
 void cfs_waitq_add(struct cfs_waitq *waitq, struct cfs_waitlink *link);
-void cfs_waitq_add_exclusive(struct cfs_waitq *waitq, 
+void cfs_waitq_add_exclusive(struct cfs_waitq *waitq,
                             struct cfs_waitlink *link);
 void cfs_waitq_forward(struct cfs_waitlink *link, struct cfs_waitq *waitq);
 void cfs_waitq_del(struct cfs_waitq *waitq, struct cfs_waitlink *link);
@@ -279,29 +204,37 @@ void cfs_waitq_signal(struct cfs_waitq *waitq);
 void cfs_waitq_signal_nr(struct cfs_waitq *waitq, int nr);
 void cfs_waitq_broadcast(struct cfs_waitq *waitq);
 
-void cfs_waitq_wait(struct cfs_waitlink *link);
-cfs_duration_t cfs_waitq_timedwait(struct cfs_waitlink *link, 
+void cfs_waitq_wait(struct cfs_waitlink *link, cfs_task_state_t state);
+cfs_duration_t cfs_waitq_timedwait(struct cfs_waitlink *link,
+                                  cfs_task_state_t state, 
                                   cfs_duration_t timeout);
 
 /*
  * Thread schedule APIs.
  */
 #define MAX_SCHEDULE_TIMEOUT    ((long)(~0UL>>12))
+extern void thread_set_timer_deadline(uint64_t deadline);
+extern void thread_cancel_timer(void);
 
-static inline int schedule_timeout(int64_t timeout)
+static inline int cfs_schedule_timeout(int state, int64_t timeout)
 {
        int          result;
        
-       AbsoluteTime clock_current;
-       AbsoluteTime clock_delay;
-       result = assert_wait((event_t)current_uthread(), THREAD_UNINT);
-       clock_get_uptime(&clock_current);
-       nanoseconds_to_absolutetime(timeout, &clock_delay);
-       ADD_ABSOLUTETIME(&clock_current, &clock_delay);
-       thread_set_timer_deadline(clock_current);
+#ifdef __DARWIN8__
+       result = assert_wait((event_t)current_thread(), state);
+#else
+       result = assert_wait((event_t)current_uthread(), state);
+#endif
+       if (timeout > 0) {
+               uint64_t expire;
+               nanoseconds_to_absolutetime(timeout, &expire);
+               clock_absolutetime_interval_to_deadline(expire, &expire);
+               thread_set_timer_deadline(expire);
+       }
        if (result == THREAD_WAITING)
                result = thread_block(THREAD_CONTINUE_NULL);
-       thread_cancel_timer();
+       if (timeout > 0)
+               thread_cancel_timer();
        if (result == THREAD_TIMED_OUT)
                result = 0;
        else
@@ -309,47 +242,80 @@ static inline int schedule_timeout(int64_t timeout)
        return result;
 }
 
-#define schedule()                              \
-       do {                                    \
-               if (assert_wait_possible())     \
-                       schedule_timeout(1);    \
-               else                            \
-                       schedule_timeout(0);    \
-       } while (0)
+#define cfs_schedule() cfs_schedule_timeout(CFS_TASK_UNINT, CFS_TICK)
+#define cfs_pause(tick)        cfs_schedule_timeout(CFS_TASK_UNINT, tick)
+
+#define __wait_event(wq, condition)                            \
+do {                                                           \
+       struct cfs_waitlink __wait;                             \
+                                                               \
+       cfs_waitlink_init(&__wait);                             \
+       for (;;) {                                              \
+               cfs_waitq_add(&wq, &__wait);                    \
+               if (condition)                                  \
+                       break;                                  \
+               cfs_waitq_wait(&__wait, CFS_TASK_UNINT);        \
+               cfs_waitq_del(&wq, &__wait);                    \
+       }                                                       \
+       cfs_waitq_del(&wq, &__wait);                            \
+} while (0)
 
-#define __wait_event(wq, condition)            \
-do {                                           \
-       struct cfs_waitlink __wait;             \
-                                               \
-       cfs_waitlink_init(&__wait);             \
-       for (;;) {                              \
-               cfs_waitq_add(&wq, &__wait);    \
-               if (condition)                  \
-                       break;                  \
-               cfs_waitq_wait(&__wait);        \
-               cfs_waitq_del(&wq, &__wait);    \
-       }                                       \
-       cfs_waitq_del(&wq, &__wait);            \
+#define wait_event(wq, condition)                              \
+do {                                                           \
+       if (condition)                                          \
+               break;                                          \
+       __wait_event(wq, condition);                            \
 } while (0)
 
-#define wait_event(wq, condition)                                      \
-do {                                                                   \
-       if (condition)                                                  \
-               break;                                                  \
-       __wait_event(wq, condition);                                    \
+#define __wait_event_interruptible(wq, condition, ex, ret)     \
+do {                                                           \
+       struct cfs_waitlink __wait;                             \
+                                                               \
+       cfs_waitlink_init(&__wait);                             \
+       for (;;) {                                              \
+               if (ex == 0)                                    \
+                       cfs_waitq_add(&wq, &__wait);            \
+               else                                            \
+                       cfs_waitq_add_exclusive(&wq, &__wait);  \
+               if (condition)                                  \
+                       break;                                  \
+               if (!cfs_signal_pending()) {                    \
+                       cfs_waitq_wait(&__wait,                 \
+                                      CFS_TASK_INTERRUPTIBLE); \
+                       cfs_waitq_del(&wq, &__wait);            \
+                       continue;                               \
+               }                                               \
+               ret = -ERESTARTSYS;                             \
+               break;                                          \
+       }                                                       \
+       cfs_waitq_del(&wq, &__wait);                            \
 } while (0)
 
-#define wait_event_interruptible(wq, condition)        \
-({                                             \
-       wait_event(wq, condition);              \
-       0;                                      \
+#define wait_event_interruptible(wq, condition)                        \
+({                                                             \
+       int __ret = 0;                                          \
+       if (!condition)                                         \
+               __wait_event_interruptible(wq, condition,       \
+                                          0, __ret);           \
+       __ret;                                                  \
 })
 
+#define wait_event_interruptible_exclusive(wq, condition)      \
+({                                                             \
+       int __ret = 0;                                          \
+       if (!condition)                                         \
+               __wait_event_interruptible(wq, condition,       \
+                                          1, __ret);           \
+       __ret;                                                  \
+})
+
+#ifndef __DARWIN8__
 extern void    wakeup_one __P((void * chan));
+#endif
 /* only used in tests */
-#define wake_up_process(p)                     \
-       do {                                    \
-               wakeup_one(p);                  \
+#define wake_up_process(p)                                     \
+       do {                                                    \
+               wakeup_one((caddr_t)p);                         \
        } while (0)
        
 /* used in couple of places */
@@ -359,48 +325,19 @@ static inline void sleep_on(cfs_waitq_t *waitq)
        
        cfs_waitlink_init(&link);
        cfs_waitq_add(waitq, &link);
-       cfs_waitq_wait(&link);
+       cfs_waitq_wait(&link, CFS_TASK_UNINT);
        cfs_waitq_del(waitq, &link);
 }
 
 /*
- * XXX
  * Signal
  */
-#define cfs_sigmask_lock(t, f)         do { f = 0; } while(0)
-#define cfs_sigmask_unlock(t, f)       do { f = 0; } while(0)
-#define cfs_signal_pending(t)          (0)
-
-#define cfs_siginitset(pmask, sigs)                            \
-       do {                                                    \
-               sigset_t __sigs = sigs & (~sigcantmask);        \
-               *(pmask) = __sigs;                              \
-       } while(0)
-
-#define cfs_siginitsetinv(pmask, sigs)                          \
-       do {                                                    \
-               sigset_t __sigs = ~(sigs | sigcantmask);        \
-               *(pmask) = __sigs;                              \
-       } while(0)
-
-#define cfs_recalc_sigpending(ut)                              \
-        do {                                                   \
-               (ut)->uu_siglist = (ut)->uu_siglist & ~(ut)->uu_sigmask;\
-       } while (0)
-#define cfs_sigfillset(s)                                      \
-       do {                                                    \
-               memset((s), -1, sizeof(sigset_t));              \
-       } while(0)
-
-#define cfs_set_sig_blocked(ut, b)             do {(ut)->uu_sigmask = b;} while(0)
-#define cfs_get_sig_blocked(ut)                        (&(ut)->uu_sigmask)
+typedef sigset_t       cfs_sigset_t;
 
 #define SIGNAL_MASK_ASSERT()
-
 /*
  * Timer
  */
-
 typedef struct cfs_timer {
        struct ktimer t;
 } cfs_timer_t;
@@ -434,20 +371,27 @@ cfs_time_t cfs_timer_deadline(struct cfs_timer *t);
 /*
  * CPU
  */
-#include <machine/cpu_number.h>
 /* Run in PowerG5 who is PPC64 */
 #define SMP_CACHE_BYTES                         128
 #define __cacheline_aligned                     __attribute__((__aligned__(SMP_CACHE_BYTES)))
-/* XXX How to get the value of NCPUS from xnu ? */
 #define NR_CPUS                                        2
-#define smp_processor_id()                     cpu_number()
-#define smp_num_cpus                           NR_CPUS
+
+/* 
+ * XXX Liang: patch xnu and export current_processor()?
+ *
+ * #define smp_processor_id()                  current_processor()
+ */
+#define smp_processor_id()                     0
 /* XXX smp_call_function is not supported in xnu */
 #define smp_call_function(f, a, n, w)          do {} while(0)
+int cfs_online_cpus(void);
+#define smp_num_cpus                           cfs_online_cpus()
 
 /*
  * Misc
  */
+extern int is_suser(void);
+
 #ifndef likely
 #define likely(exp) (exp)
 #endif
@@ -458,11 +402,9 @@ cfs_time_t cfs_timer_deadline(struct cfs_timer *t);
 #define lock_kernel()                          do {} while(0)
 #define unlock_kernel()                                do {} while(0)
 
-#define exit_mm(t)                             do {} while(0)
-#define exit_files(t)                          do {} while(0)
-
-#define CAP_SYS_ADMIN                           0
-#define capable(a)             suser(current_proc()->p_ucred, &(current_proc()->p_acflag))
+#define CAP_SYS_BOOT                           0
+#define CAP_SYS_ADMIN                           1
+#define capable(a)                             ((a) == CAP_SYS_BOOT ? is_suser(): is_suser1())
 
 #define USERMODEHELPER(path, argv, envp)       (0)
 
@@ -499,6 +441,11 @@ struct __dummy_ ## name ## _struct {}
 #define inter_module_get(n)                    cfs_symbol_get(n)
 #define inter_module_put(n)                    cfs_symbol_put(n)
 
+static inline int request_module(char *name)
+{
+       return (-EINVAL);
+}
+
 #ifndef __exit
 #define __exit
 #endif
@@ -517,7 +464,7 @@ struct __dummy_ ## name ## _struct {}
 #define LINUX_VERSION_CODE KERNEL_VERSION(2,5,0)
 
 #define NR_IRQS                                512
-#define in_interrupt()                 (0)
+#define in_interrupt()                 ml_at_interrupt_context()
 
 #define KERN_EMERG      "<0>"   /* system is unusable                   */
 #define KERN_ALERT      "<1>"   /* action must be taken immediately     */
@@ -534,21 +481,47 @@ static inline long PTR_ERR(const void *ptr)
 }
 
 #define ERR_PTR(err)   ((void *)err)
+#define IS_ERR(p)      ((unsigned long)(p) + 1000 < 1000)
+
+#else  /* !__KERNEL__ */
 
-/* XXX */
-#define IS_ERR(p)      (0)
+typedef struct cfs_proc_dir_entry {
+       void            *data;
+} cfs_proc_dir_entry_t;
+
+#include <libcfs/user-prim.h>
+#define __WORDSIZE     32
 
+#endif /* END __KERNEL__ */
 /*
  * Error number
  */
+#ifndef EPROTO
+#define EPROTO          EPROTOTYPE
+#endif
+#ifndef EBADR
 #define EBADR          EBADRPC
-#define ERESTARTSYS    ERESTART
+#endif
+#ifndef ERESTARTSYS
+#define ERESTARTSYS    512
+#endif
+#ifndef EDEADLOCK
 #define EDEADLOCK      EDEADLK
+#endif
+#ifndef ECOMM
 #define ECOMM          EINVAL
+#endif
+#ifndef ENODATA
 #define ENODATA                EINVAL
+#endif
+#ifndef ENOTSUPP
+#define ENOTSUPP       EINVAL
+#endif
 
+#if BYTE_ORDER == BIG_ENDIAN
+# define __BIG_ENDIAN
 #else
-#define __WORDSIZE     32
-#endif /* __KERNEL__ */
+# define __LITTLE_ENDIAN
+#endif
 
-#endif /* __LINUX__ */
+#endif /* __LIBCFS_DARWIN_CFS_PRIM_H__ */
index 3374f43..5a3fabd 100644 (file)
@@ -1,7 +1,8 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- * Lustre Light Super operations
+ * Implementation of standard libcfs synchronization primitives for XNU
+ * kernel.
  *
  *  Copyright (c) 2004 Cluster File Systems, Inc.
  *
@@ -33,7 +34,7 @@
 #error Do not #include this file directly. #include <libcfs/libcfs.h> instead
 #endif
 
-#define XNU_SYNC_DEBUG (0)
+#define XNU_SYNC_DEBUG (1)
 
 #if XNU_SYNC_DEBUG
 #define ON_SYNC_DEBUG(e) e
@@ -48,6 +49,7 @@ enum {
        KCOND_MAGIC = 0xb01dface,
        KRW_MAGIC   = 0xdabb1edd,
        KSPIN_MAGIC = 0xca11ab1e,
+        KRW_SPIN_MAGIC    = 0xbabeface,
        KSLEEP_CHAN_MAGIC = 0x0debac1e,
        KSLEEP_LINK_MAGIC = 0xacc01ade,
        KTIMER_MAGIC      = 0xbefadd1e
@@ -60,25 +62,63 @@ enum {
  */
 #define SMP (1)
 
+#include <libcfs/list.h>
+
+#ifdef __DARWIN8__
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <kern/locks.h>
+
+/*
+ * hw_lock is not available in Darwin8 (hw_lock_* are not exported at all), 
+ * so use lck_spin_t. we can hack out lck_spin_t easily, it's the only 
+ * hacking in Darwin8.x. We did so because it'll take a lot of time to 
+ * add lock_done for all locks, maybe it should be done in the future.
+ * If lock_done for all locks were added, we can:
+ *
+ * typedef lck_spin_t      *xnu_spin_t;
+ */
+#if defined (__ppc__)
+typedef struct {
+        unsigned int    opaque[3];
+} xnu_spin_t;
+#elif defined (__i386__)
+typedef struct {
+        unsigned int    opaque[10];
+} xnu_spin_t;
+#endif
+
+/* 
+ * wait_queue is not available in Darwin8 (wait_queue_* are not exported), 
+ * use assert_wait/wakeup/wake_one (wait_queue in kernel hash).
+ */
+typedef void * xnu_wait_queue_t;
+
+/* DARWIN8 */
+#else
+
+#include <mach/mach_types.h>
+#include <sys/types.h>
 #include <kern/simple_lock.h>
 
-#include <libcfs/list.h>
+typedef hw_lock_data_t          xnu_spin_t;
+typedef struct wait_queue       xnu_wait_queue_t;
+
+/* DARWIN8 */
+#endif
 
 struct kspin {
 #if SMP
-       hw_lock_data_t lock;
+       xnu_spin_t      lock;
 #endif
 #if XNU_SYNC_DEBUG
-       unsigned magic;
-       thread_t owner;
+       unsigned        magic;
+       thread_t        owner;
 #endif
 };
 
-/*
- * XXX nikita: we cannot use simple_* functions, because bsd/sys/lock.h
- * redefines them to nothing. Use low-level hw_lock_* instead.
- */
-
 void kspin_init(struct kspin *spin);
 void kspin_done(struct kspin *spin);
 void kspin_lock(struct kspin *spin);
@@ -98,11 +138,27 @@ int kspin_isnotlocked(struct kspin *spin);
 #define kspin_isnotlocked(s) (1)
 #endif
 
+/* ------------------------- rw spinlock ----------------------- */
+struct krw_spin {
+        struct kspin      guard;
+        int               count;
+#if XNU_SYNC_DEBUG
+        unsigned          magic;
+#endif
+};
+
+void krw_spin_init(struct krw_spin *sem);
+void krw_spin_done(struct krw_spin *sem);
+void krw_spin_down_r(struct krw_spin *sem);
+void krw_spin_down_w(struct krw_spin *sem);
+void krw_spin_up_r(struct krw_spin *sem);
+void krw_spin_up_w(struct krw_spin *sem);
+
 /* ------------------------- semaphore ------------------------- */
 
 struct ksem {
         struct kspin      guard;
-        struct wait_queue q;
+        xnu_wait_queue_t  q;
         int               value;
 #if XNU_SYNC_DEBUG
         unsigned          magic;
@@ -225,20 +281,20 @@ void ksleep_link_done(struct ksleep_link *link);
 void ksleep_add(struct ksleep_chan *chan, struct ksleep_link *link);
 void ksleep_del(struct ksleep_chan *chan, struct ksleep_link *link);
 
-void ksleep_wait(struct ksleep_chan *chan);
-int64_t  ksleep_timedwait(struct ksleep_chan *chan, uint64_t timeout);
+void ksleep_wait(struct ksleep_chan *chan, int state);
+int64_t  ksleep_timedwait(struct ksleep_chan *chan, int state, uint64_t timeout);
 
 void ksleep_wake(struct ksleep_chan *chan);
 void ksleep_wake_all(struct ksleep_chan *chan);
 void ksleep_wake_nr(struct ksleep_chan *chan, int nr);
 
-#define KSLEEP_LINK_DECLARE(name)                      \
-{                                                      \
-       .flags   = 0,                                   \
-       .event   = 0,                                   \
-       .hits    = 0,                                   \
-       .linkage = PTL_LIST_HEAD_INIT(name.linkage),    \
-       .magic   = KSLEEP_LINK_MAGIC                    \
+#define KSLEEP_LINK_DECLARE(name)               \
+{                                               \
+       .flags   = 0,                           \
+       .event   = 0,                           \
+       .hits    = 0,                           \
+       .linkage = CFS_LIST_HEAD(name.linkage), \
+       .magic   = KSLEEP_LINK_MAGIC            \
 }
 
 /* ------------------------- timer ------------------------- */
diff --git a/lnet/include/libcfs/darwin/darwin-tcpip.h b/lnet/include/libcfs/darwin/darwin-tcpip.h
new file mode 100644 (file)
index 0000000..1a73891
--- /dev/null
@@ -0,0 +1,90 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines. 
+ *
+ */
+
+#ifndef __LIBCFS_DARWIN_TCPIP_H__
+#define __LIBCFS_DARWIN_TCPIP_H__
+
+#ifdef __KERNEL__
+#include <sys/socket.h>
+
+#ifdef __DARWIN8__
+
+struct socket;
+
+typedef void    (*so_upcall)(socket_t sock, void* arg, int waitf);
+
+#define CFS_SOCK_UPCALL         0x1
+#define CFS_SOCK_DOWN           0x2
+
+#define CFS_SOCK_MAGIC          0xbabeface
+
+typedef struct cfs_socket {
+        socket_t        s_so;
+        int             s_magic;
+        int             s_flags;
+        so_upcall       s_upcall;
+        void           *s_upcallarg;
+} cfs_socket_t;
+
+
+/* cfs_socket_t to bsd socket */
+#define C2B_SOCK(s)             ((s)->s_so)     
+
+static inline int get_sock_intopt(socket_t so, int opt)
+{
+        int     val, len;
+        int     rc;
+
+        /*
+         * sock_getsockopt will take a lock(mutex) for socket,
+         * so it can be blocked. So be careful while using 
+         * them.
+         */
+        len = sizeof(val);
+        rc = sock_getsockopt(so, SOL_SOCKET, opt, &val, &len);
+        assert(rc == 0);
+        return val;
+}
+
+#define SOCK_ERROR(s)           get_sock_intopt(C2B_SOCK(s), SO_ERROR)        
+/* #define SOCK_WMEM_QUEUED(s)     (0) */
+#define SOCK_WMEM_QUEUED(s)     get_sock_intopt(C2B_SOCK(s), SO_NWRITE)
+/* XXX Liang: no reliable way to get it in Darwin8.x */
+#define SOCK_TEST_NOSPACE(s)    (0)
+
+void libcfs_sock_set_cb(cfs_socket_t *sock, so_upcall callback, void *arg);
+void libcfs_sock_reset_cb(cfs_socket_t *sock);
+
+#else /* !__DARWIN8__ */
+
+#define SOCK_WMEM_QUEUED(so)    ((so)->so_snd.sb_cc)
+#define SOCK_ERROR(so)          ((so)->so_error)
+
+#define SOCK_TEST_NOSPACE(so)   (sbspace(&(so)->so_snd) < (so)->so_snd.sb_lowat)
+
+#endif /* !__DARWIN8__ */
+
+#endif /* __KERNEL END */
+
+#endif  /* __XNU_CFS_TYPES_H__ */
index d6230ad..43ad274 100644 (file)
  *  int            cfs_fs_time_before (cfs_fs_time_t *, cfs_fs_time_t *);
  *  int            cfs_fs_time_beforeq(cfs_fs_time_t *, cfs_fs_time_t *);
  *
- *  cfs_duration_t cfs_time_minimal_timeout(void)
- *
  *  CFS_TIME_FORMAT
  *  CFS_DURATION_FORMAT
  *
  */
 
 #define ONE_BILLION ((u_int64_t)1000000000)
-#define ONE_MILLION ((u_int64_t)   1000000)
+#define ONE_MILLION 1000000
 
 #ifdef __KERNEL__
 #include <sys/types.h>
 #include <sys/systm.h>
 
-#ifndef __APPLE_API_PRIVATE
-#define __APPLE_API_PRIVATE
-#include <sys/user.h>
-#undef __APPLE_API_PRIVATE
-#else
-#include <sys/user.h>
-#endif
-
 #include <sys/kernel.h>
 
-#include <mach/thread_act.h>
 #include <mach/mach_types.h>
-#include <mach/mach_traps.h>
-#include <mach/thread_switch.h>
 #include <mach/time_value.h>
-#include <kern/sched_prim.h>
-#include <vm/pmap.h>
-#include <vm/vm_kern.h>
-#include <mach/machine/vm_param.h>
 #include <kern/clock.h>
-#include <kern/thread_call.h>
 #include <sys/param.h>
-#include <sys/vm.h>
 
 #include <libcfs/darwin/darwin-types.h>
 #include <libcfs/darwin/darwin-utils.h>
 #include <libcfs/darwin/darwin-lock.h>
 
+/*
+ * There are three way to measure time in OS X:
+ * 1. nanoseconds
+ * 2. absolute time (abstime unit equal to the length of one bus cycle),
+ *    schedule of thread/timer are counted by absolute time, but abstime
+ *    in different mac can be different also, so we wouldn't use it.
+ * 3. clock interval (1sec = 100hz). But clock interval only taken by KPI
+ *    like tsleep().
+ *
+ * We use nanoseconds (uptime, not calendar time)
+ *
+ * clock_get_uptime()   :get absolute time since bootup.
+ * nanouptime()         :get nanoseconds since bootup
+ * microuptime()        :get microseonds since bootup
+ * nanotime()           :get nanoseconds since epoch
+ * microtime()          :get microseconds since epoch
+ */
 typedef u_int64_t cfs_time_t; /* nanoseconds */
 typedef int64_t cfs_duration_t;
 
@@ -118,15 +116,15 @@ static inline cfs_time_t cfs_time_current(void)
 {
         struct timespec instant;
 
-        nanotime(&instant);
-        return ((u_int64_t)instant.tv_sec) * ONE_BILLION + instant.tv_nsec;
+        nanouptime(&instant);
+        return ((u_int64_t)instant.tv_sec) * NSEC_PER_SEC + instant.tv_nsec;
 }
 
 static inline time_t cfs_time_current_sec(void)
 {
         struct timespec instant;
 
-        nanotime(&instant);
+        nanouptime(&instant);
        return instant.tv_sec;
 }
 
@@ -152,7 +150,7 @@ static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2)
 
 static inline void cfs_fs_time_current(cfs_fs_time_t *t)
 {
-        *t = time;
+        microtime((struct timeval *)t);
 }
 
 static inline time_t cfs_fs_time_sec(cfs_fs_time_t *t)
@@ -160,12 +158,6 @@ static inline time_t cfs_fs_time_sec(cfs_fs_time_t *t)
         return t->tv_sec;
 }
 
-static inline cfs_duration_t cfs_duration_build(int64_t nano)
-{
-        return nano;
-}
-
-
 static inline void cfs_fs_time_usec(cfs_fs_time_t *t, struct timeval *v)
 {
         *v = *t;
@@ -174,17 +166,12 @@ static inline void cfs_fs_time_usec(cfs_fs_time_t *t, struct timeval *v)
 static inline void cfs_fs_time_nsec(cfs_fs_time_t *t, struct timespec *s)
 {
         s->tv_sec  = t->tv_sec;
-        s->tv_nsec = t->tv_usec * 1000;
+        s->tv_nsec = t->tv_usec * NSEC_PER_USEC;
 }
 
 static inline cfs_duration_t cfs_time_seconds(int seconds)
 {
-       return cfs_duration_build(ONE_BILLION * (int64_t)seconds);
-}
-
-static inline cfs_time_t cfs_time_shift(int seconds)
-{
-       return cfs_time_add(cfs_time_current(), cfs_time_seconds(seconds));
+       return (NSEC_PER_SEC * (int64_t)seconds);
 }
 
 /*
@@ -192,7 +179,7 @@ static inline cfs_time_t cfs_time_shift(int seconds)
  */
 static inline int64_t __cfs_fs_time_flat(cfs_fs_time_t *t)
 {
-        return ((int64_t)t->tv_sec) * ONE_BILLION + t->tv_usec;
+        return ((int64_t)t->tv_sec)*NSEC_PER_SEC + t->tv_usec*NSEC_PER_USEC;
 }
 
 static inline int cfs_fs_time_before(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
@@ -207,29 +194,33 @@ static inline int cfs_fs_time_beforeq(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
 
 static inline time_t cfs_duration_sec(cfs_duration_t d)
 {
-        return d / ONE_BILLION;
+        return d / NSEC_PER_SEC;
 }
 
 static inline void cfs_duration_usec(cfs_duration_t d, struct timeval *s)
 {
-        s->tv_sec = d / ONE_BILLION;
-        s->tv_usec = (d - s->tv_sec * ONE_BILLION) / 1000;
+        s->tv_sec = d / NSEC_PER_SEC;
+        s->tv_usec = (d - ((int64_t)s->tv_sec) * NSEC_PER_SEC) / NSEC_PER_USEC;
 }
 
 static inline void cfs_duration_nsec(cfs_duration_t d, struct timespec *s)
 {
-        s->tv_sec = d / ONE_BILLION;
-        s->tv_nsec = d - ((int64_t)s->tv_sec) * ONE_BILLION;
+        s->tv_sec = d / NSEC_PER_SEC;
+        s->tv_nsec = d - ((int64_t)s->tv_sec) * NSEC_PER_SEC;
 }
 
-static inline cfs_duration_t cfs_time_minimal_timeout(void)
-{
-        return ONE_BILLION / (u_int64_t)hz;
-}
+#define cfs_time_current_64 cfs_time_current
+#define cfs_time_add_64     cfs_time_add
+#define cfs_time_shift_64   cfs_time_shift
+#define cfs_time_before_64  cfs_time_before
 
-/* inline function cfs_time_minimal_timeout() can not be used to
- * initiallize static variable */
-#define CFS_MIN_DELAY          (ONE_BILLION / (u_int64_t)100)
+/* 
+ * One jiffy (in nanoseconds)
+ *
+ * osfmk/kern/sched_prim.c
+ * #define DEFAULT_PREEMPTION_RATE      100
+ */
+#define CFS_TICK               (NSEC_PER_SEC / (u_int64_t)100)
 
 #define LTIME_S(t)             (t)
 
index b2762c0..0fd2966 100644 (file)
 #include <mach/mach_types.h>
 #include <sys/types.h>
 
-typedef u_int8_t  __u8;
-typedef u_int16_t __u16;
-typedef u_int32_t __u32;
-typedef u_int64_t __u64;
+#ifndef _BLKID_TYPES_H
+#define _BLKID_TYPES_H
+#endif
+
+typedef u_int8_t        __u8;
+typedef u_int16_t       __u16;
+typedef u_int32_t       __u32;
+typedef u_int64_t       __u64;
+typedef int8_t          __s8;
+typedef int16_t         __s16;
+typedef int32_t         __s32;
+typedef int64_t         __s64;
 
 #ifdef __KERNEL__
 
 #include <kern/kern_types.h>
 
-#ifndef __s32
-typedef __signed__ int                 __s32;
-#endif
-#ifndef __s64
-typedef __signed__ long long           __s64;
-#endif
 
 typedef struct { int e; }              event_chan_t;
 typedef dev_t                          kdev_t;
@@ -61,22 +63,30 @@ typedef struct { volatile uint32_t counter; }       atomic_t;
 #define ATOMIC_INIT(i)                 { (i) }
 #define atomic_read(a)                 ((a)->counter)
 #define atomic_set(a, v)               (((a)->counter) = (v))
+#ifdef __DARWIN8__
+#define atomic_add(v, a)               OSAddAtomic(v, (SInt32 *)&((a)->counter))
+#define atomic_sub(v, a)               OSAddAtomic(-(v), (SInt32 *)&((a)->counter))
+#define atomic_inc(a)                  OSIncrementAtomic((SInt32 *)&((a)->counter))
+#define atomic_dec(a)                  OSDecrementAtomic((SInt32 *)&((a)->counter))
+#else /* !__DARWIN8__ */
 #define atomic_add(v, a)               hw_atomic_add((uint32_t *)&((a)->counter), v)
 #define atomic_sub(v, a)               hw_atomic_sub((uint32_t *)&((a)->counter), v)
 #define atomic_inc(a)                  atomic_add(1, a)
 #define atomic_dec(a)                  atomic_sub(1, a)
-#define atomic_sub_and_test(v, a)      ( atomic_sub(v, a) == 0 )
-#define atomic_dec_and_test(a)         ( atomic_dec(a) == 0 )
+#endif /* !__DARWIN8__ */
+#define atomic_sub_and_test(v, a)      ( atomic_sub(v, a) == -(a) )
+#define atomic_dec_and_test(a)         ( atomic_dec(a) == 1 )
 
 #include <libsa/mach/mach.h>
-typedef uint64_t                       loff_t;
+typedef off_t                          loff_t;
 
 #else  /* !__KERNEL__ */
 
 #include <stdint.h>
 
-typedef uint64_t                       loff_t;
+typedef off_t                          loff_t;
 
 #endif /* __KERNEL END */
+typedef unsigned short                  umode_t;
 
 #endif  /* __XNU_CFS_TYPES_H__ */
index 4e91db9..0f808a2 100644 (file)
@@ -1,5 +1,5 @@
-#ifndef __LIBCFS_DARWIN_XNU_UTILS_H__
-#define __LIBCFS_DARWIN_XNU_UTILS_H__
+#ifndef __LIBCFS_DARWIN_UTILS_H__
+#define __LIBCFS_DARWIN_UTILS_H__
 
 #ifndef __LIBCFS_LIBCFS_H__
 #error Do not #include this file directly. #include <libcfs/libcfs.h> instead
@@ -57,4 +57,11 @@ char * ul2dstr(unsigned long address, char *buf, int len);
 
 #define HIPQUAD NIPQUAD
 
+#ifndef LIST_CIRCLE
+#define LIST_CIRCLE(elm, field)                                 \
+       do {                                                    \
+               (elm)->field.le_prev = &(elm)->field.le_next;   \
+       } while (0)
+#endif
+
 #endif /* __XNU_UTILS_H__ */
index 5c1acc4..f9e94b1 100644 (file)
 
 #include <libcfs/darwin/darwin-lock.h>
 #include <libcfs/darwin/darwin-prim.h>
-#include <portals/p30.h>
+#include <lnet/lnet.h>
 
-#define our_cond_resched()              schedule_timeout(1);
+#define our_cond_resched() cfs_schedule_timeout(CFS_TASK_INTERRUPTIBLE, 1)
 
 #ifdef CONFIG_SMP
 #define LASSERT_SPIN_LOCKED(lock) do {} while(0) /* XXX */
 #else
 #define LASSERT_SPIN_LOCKED(lock) do {} while(0)
 #endif
+#define LASSERT_SEM_LOCKED(sem) do {} while(0) /* XXX */
 
-#define LBUG_WITH_LOC(file, func, line)         portals_catastrophe = 1
+#define LIBCFS_PANIC(msg) panic(msg)
+#error libcfs_register_panic_notifier() missing
+#error libcfs_unregister_panic_notifier() missing
 
 /* --------------------------------------------------------------------- */
 
 #define PORTAL_MODULE_USE                       do{int i = 0; i++;}while(0)
 #define PORTAL_MODULE_UNUSE                     do{int i = 0; i--;}while(0)
 
-#define printk(format, args...)                 printf(format, ## args)
+#define num_online_cpus()                       cfs_online_cpus()
+
+/******************************************************************************/
+/* XXX Liang: There is no module parameter supporting in OSX */
+#define CFS_MODULE_PARM(name, t, type, perm, desc)
+
+#define CFS_SYSFS_MODULE_PARM    0 /* no sysfs access to module parameters */
+/******************************************************************************/
 
 #else  /* !__KERNEL__ */
 # include <stdio.h>
 # include <sys/types.h>
 #endif
 
+#define BITS_PER_LONG   LONG_BIT
 /******************************************************************************/
 /* Light-weight trace
  * Support for temporary event tracing with minimal Heisenberg effect. */
 #define LWT_SUPPORT  0
 
-typedef struct { 
-        long long   lwte_when; 
-        char       *lwte_where; 
-        void       *lwte_task; 
-        long        lwte_p1; 
-        long        lwte_p2; 
-        long        lwte_p3; 
-        long        lwte_p4; 
+typedef struct {
+        long long   lwte_when;
+        char       *lwte_where;
+        void       *lwte_task;
+        long        lwte_p1;
+        long        lwte_p2;
+        long        lwte_p3;
+        long        lwte_p4;
 } lwt_event_t;
 
 # define LWT_EVENT(p1,p2,p3,p4)     /* no lwt implementation yet */
 
 /* -------------------------------------------------------------------------- */
 
-#define IOCTL_PORTAL_TYPE struct portal_ioctl_data
+#define IOCTL_LIBCFS_TYPE struct libcfs_ioctl_data
 
 #define LPU64 "%llu"
 #define LPD64 "%lld"
-#define LPX64 "%llx"
+#define LPX64 "%#llx"
 #define LPSZ  "%lu"
 #define LPSSZ "%ld"
 # define LI_POISON ((int)0x5a5a5a5a)
index 8e4eb89..eb4d8f3 100644 (file)
@@ -9,6 +9,7 @@
 #endif
 
 #include <mach/mach_types.h>
+#include <sys/errno.h>
 #include <string.h>
 #include <libcfs/darwin/darwin-types.h>
 #include <libcfs/darwin/darwin-time.h>
@@ -16,6 +17,7 @@
 #include <libcfs/darwin/darwin-mem.h>
 #include <libcfs/darwin/darwin-lock.h>
 #include <libcfs/darwin/darwin-fs.h>
+#include <libcfs/darwin/darwin-tcpip.h>
 
 #ifdef __KERNEL__
 # include <sys/types.h>
@@ -75,8 +77,8 @@ struct ptldebug_header {
 #endif
 #define LUSTRE_TRACE_SIZE (THREAD_SIZE >> 5)
 
-#define CHECK_STACK(stack) do { } while(0)
-#define CDEBUG_STACK (0L)
+#define CHECK_STACK() do { } while(0)
+#define CDEBUG_STACK() (0L)
 
 /* Darwin has defined RETURN, so we have to undef it in lustre */
 #ifdef RETURN
@@ -155,7 +157,7 @@ __entry_nesting(&__cdd);
 /* ENTRY_NESTING_SUPPORT */
 #endif
 
-#define LUSTRE_PTL_PID          12345
+#define LUSTRE_LNET_PID          12345
 
 #define _XNU_LIBCFS_H
 
@@ -164,10 +166,28 @@ __entry_nesting(&__cdd);
  *
  * Implementation is in darwin-curproc.c
  */
-#define CFS_CURPROC_COMM_MAX (sizeof ((struct proc *)0)->p_comm)
+#define CFS_CURPROC_COMM_MAX    MAXCOMLEN
 /*
  * XNU has no capabilities
  */
 typedef int cfs_kernel_cap_t;
 
+#ifdef __KERNEL__
+enum {
+        /* if you change this, update darwin-util.c:cfs_stack_trace_fill() */
+        CFS_STACK_TRACE_DEPTH = 16
+};
+
+struct cfs_stack_trace {
+        void *frame[CFS_STACK_TRACE_DEPTH];
+};
+
+#define printk(format, args...)                 printf(format, ## args)
+
+#ifdef WITH_WATCHDOG
+#undef WITH_WATCHDOG
+#endif
+
+#endif /* __KERNEL__ */
+
 #endif /* _XNU_LIBCFS_H */
index bb0dc91..31d6e17 100644 (file)
@@ -18,9 +18,9 @@
 #include <fcntl.h>
 #include <unistd.h>
 #include <sys/time.h>
-#include <portals/types.h>
+#include <lnet/types.h>
 #include <libcfs/kp30.h>
 #include <mach/vm_param.h>
-#include <portals/ptlctl.h>
+#include <lnet/lnetctl.h>
 
 #endif
index 971df1b..19ee200 100644 (file)
 #ifndef __LIBCFS_KP30_H__
 #define __LIBCFS_KP30_H__
 
-#define PORTAL_DEBUG
+#define LIBCFS_DEBUG
 #include <libcfs/libcfs.h>
-#include <portals/types.h>
+#include <lnet/types.h>
 
 #if defined(__linux__)
 #include <libcfs/linux/kp30.h>
 #elif defined(__APPLE__)
 #include <libcfs/darwin/kp30.h>
+#elif defined(__WINNT__)
+#include <libcfs/winnt/kp30.h>
 #else
 #error Unsupported operating system
 #endif
 
-#include <portals/types.h>
+#ifndef DEBUG_SUBSYSTEM
+# define DEBUG_SUBSYSTEM S_UNDEFINED
+#endif
 
 #ifdef __KERNEL__
 
-# ifndef DEBUG_SUBSYSTEM
-#  define DEBUG_SUBSYSTEM S_UNDEFINED
-# endif
+#ifdef LIBCFS_DEBUG
 
-#ifdef PORTAL_DEBUG
-extern void kportal_assertion_failed(char *expr, char *file, const char *func,
-                                     const int line);
-#define LASSERT(e) ((e) ? 0 : kportal_assertion_failed( #e , __FILE__,  \
-                                                        __FUNCTION__, __LINE__))
-#define LASSERTF(cond, fmt...)                                                \
-        do {                                                                  \
-                if (unlikely(!(cond))) {                                      \
-                        portals_debug_msg(DEBUG_SUBSYSTEM, D_EMERG,  __FILE__,\
-                                          __FUNCTION__,__LINE__, CDEBUG_STACK,\
-                                          "ASSERTION(" #cond ") failed:" fmt);\
-                        LBUG();                                               \
-                }                                                             \
-        } while (0)
+/*
+ * When this is on, LASSERT macro includes check for assignment used instead
+ * of equality check, but doesn't have unlikely(). Turn this on from time to
+ * time to make test-builds. This shouldn't be on for production release.
+ */
+#define LASSERT_CHECKED (0)
 
+#if LASSERT_CHECKED
+/*
+ * Assertion.
+ *
+ * Strange construction with empty "then" clause is used to trigger compiler
+ * warnings on the assertions of the form LASSERT(a = b);
+ *
+ * "warning: suggest parentheses around assignment used as truth value"
+ *
+ * requires -Wall. Unfortunately this rules out use of likely/unlikely.
+ */
+#define LASSERT(cond)                                           \
+({                                                              \
+        if (cond)                                               \
+                ;                                               \
+        else                                                    \
+                libcfs_assertion_failed( #cond , __FILE__,      \
+                        __FUNCTION__, __LINE__);                \
+})
+
+#define LASSERTF(cond, fmt, a...)                                       \
+({                                                                      \
+         if (cond)                                                      \
+                 ;                                                      \
+         else {                                                         \
+                 libcfs_debug_msg(NULL, DEBUG_SUBSYSTEM, D_EMERG,       \
+                                  __FILE__, __FUNCTION__,__LINE__,      \
+                                  "ASSERTION(" #cond ") failed:" fmt,   \
+                                  ## a);                                \
+                 LBUG();                                                \
+         }                                                              \
+})
+
+/* LASSERT_CHECKED */
 #else
-#define LASSERT(e)
-#define LASSERTF(cond, fmt...) do { } while (0)
+
+#define LASSERT(cond)                                           \
+({                                                              \
+        if (unlikely(!(cond)))                                  \
+                libcfs_assertion_failed(#cond , __FILE__,       \
+                        __FUNCTION__, __LINE__);                \
+})
+
+#define LASSERTF(cond, fmt, a...)                                       \
+({                                                                      \
+        if (unlikely(!(cond))) {                                        \
+                libcfs_debug_msg(NULL, DEBUG_SUBSYSTEM, D_EMERG,        \
+                                 __FILE__, __FUNCTION__,__LINE__,       \
+                                 "ASSERTION(" #cond ") failed:" fmt,    \
+                                 ## a);                                 \
+                LBUG();                                                 \
+        }                                                               \
+})
+
+/* LASSERT_CHECKED */
 #endif
 
-/* LBUG_WITH_LOC defined in portals/<os>/kp30.h */
-#define LBUG() LBUG_WITH_LOC(__FILE__, __FUNCTION__, __LINE__)
+/* LIBCFS_DEBUG */
+#else
+#define LASSERT(e) ((void)(0))
+#define LASSERTF(cond, fmt...) ((void)(0))
+#endif /* LIBCFS_DEBUG */
+
+void lbug_with_loc(char *file, const char *func, const int line)
+        __attribute__((noreturn));
+
+#define LBUG() lbug_with_loc(__FILE__, __FUNCTION__, __LINE__)
 
+extern atomic_t libcfs_kmemory;
 /*
  * Memory
  */
-#ifdef PORTAL_DEBUG
-extern atomic_t portal_kmemory;
+#ifdef LIBCFS_DEBUG
 
-# define portal_kmem_inc(ptr, size)                                           \
-do {                                                                          \
-        atomic_add(size, &portal_kmemory);                                    \
+# define libcfs_kmem_inc(ptr, size)             \
+do {                                            \
+        atomic_add(size, &libcfs_kmemory);      \
 } while (0)
 
-# define portal_kmem_dec(ptr, size) do {                                      \
-        atomic_sub(size, &portal_kmemory);                                    \
+# define libcfs_kmem_dec(ptr, size) do {        \
+        atomic_sub(size, &libcfs_kmemory);      \
 } while (0)
 
 #else
-# define portal_kmem_inc(ptr, size) do {} while (0)
-# define portal_kmem_dec(ptr, size) do {} while (0)
-#endif /* PORTAL_DEBUG */
+# define libcfs_kmem_inc(ptr, size) do {} while (0)
+# define libcfs_kmem_dec(ptr, size) do {} while (0)
+#endif /* LIBCFS_DEBUG */
 
-#define PORTAL_VMALLOC_SIZE        16384
+#define LIBCFS_VMALLOC_SIZE        16384
 
-#define PORTAL_ALLOC_GFP(ptr, size, mask)                                 \
+#define LIBCFS_ALLOC_GFP(ptr, size, mask)                                 \
 do {                                                                      \
         LASSERT(!in_interrupt() ||                                        \
-               (size <= PORTAL_VMALLOC_SIZE && mask == CFS_ALLOC_ATOMIC));\
-        if ((size) > PORTAL_VMALLOC_SIZE)                                 \
+               (size <= LIBCFS_VMALLOC_SIZE && mask == CFS_ALLOC_ATOMIC));\
+        if (unlikely((size) > LIBCFS_VMALLOC_SIZE))                     \
                 (ptr) = cfs_alloc_large(size);                            \
         else                                                              \
                 (ptr) = cfs_alloc((size), (mask));                        \
-        if ((ptr) == NULL) {                                              \
-                CERROR("PORTALS: out of memory at %s:%d (tried to alloc '"\
+        if (unlikely((ptr) == NULL)) {                                  \
+                CERROR("LNET: out of memory at %s:%d (tried to alloc '"   \
                        #ptr "' = %d)\n", __FILE__, __LINE__, (int)(size));\
-                CERROR("PORTALS: %d total bytes allocated by portals\n",  \
-                       atomic_read(&portal_kmemory));                     \
+                CERROR("LNET: %d total bytes allocated by lnet\n",        \
+                       atomic_read(&libcfs_kmemory));                     \
         } else {                                                          \
-                portal_kmem_inc((ptr), (size));                           \
+                libcfs_kmem_inc((ptr), (size));                           \
                 if (!((mask) & CFS_ALLOC_ZERO))                           \
                        memset((ptr), 0, (size));                          \
         }                                                                 \
         CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %d at %p (tot %d).\n",    \
-               (int)(size), (ptr), atomic_read (&portal_kmemory));        \
+               (int)(size), (ptr), atomic_read (&libcfs_kmemory));        \
 } while (0)
 
-#define PORTAL_ALLOC(ptr, size) \
-        PORTAL_ALLOC_GFP(ptr, size, CFS_ALLOC_IO)
+#define LIBCFS_ALLOC(ptr, size) \
+        LIBCFS_ALLOC_GFP(ptr, size, CFS_ALLOC_IO)
 
-#define PORTAL_ALLOC_ATOMIC(ptr, size) \
-        PORTAL_ALLOC_GFP(ptr, size, CFS_ALLOC_ATOMIC)
+#define LIBCFS_ALLOC_ATOMIC(ptr, size) \
+        LIBCFS_ALLOC_GFP(ptr, size, CFS_ALLOC_ATOMIC)
 
-#define PORTAL_FREE(ptr, size)                                          \
+#define LIBCFS_FREE(ptr, size)                                          \
 do {                                                                    \
         int s = (size);                                                 \
-        if ((ptr) == NULL) {                                            \
-                CERROR("PORTALS: free NULL '" #ptr "' (%d bytes) at "   \
+        if (unlikely((ptr) == NULL)) {                                  \
+                CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at "    \
                        "%s:%d\n", s, __FILE__, __LINE__);               \
                 break;                                                  \
         }                                                               \
-        if (s > PORTAL_VMALLOC_SIZE)                                    \
+        if (unlikely(s > LIBCFS_VMALLOC_SIZE))                          \
                 cfs_free_large(ptr);                                    \
         else                                                            \
                 cfs_free(ptr);                                          \
-        portal_kmem_dec((ptr), s);                                      \
+        libcfs_kmem_dec((ptr), s);                                      \
         CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n",     \
-               s, (ptr), atomic_read(&portal_kmemory));                 \
+               s, (ptr), atomic_read(&libcfs_kmemory));                 \
 } while (0)
 
 /******************************************************************************/
 
-#ifdef PORTALS_PROFILING
-#define prof_enum(FOO) PROF__##FOO
-enum {
-        prof_enum(our_recvmsg),
-        prof_enum(our_sendmsg),
-        prof_enum(socknal_recv),
-        prof_enum(lib_parse),
-        prof_enum(conn_list_walk),
-        prof_enum(memcpy),
-        prof_enum(lib_finalize),
-        prof_enum(pingcli_time),
-        prof_enum(gmnal_send),
-        prof_enum(gmnal_recv),
-        MAX_PROFS
-};
-
-struct prof_ent {
-        char *str;
-        /* hrmph.  wrap-tastic. */
-        u32       starts;
-        u32       finishes;
-        cycles_t  total_cycles;
-        cycles_t  start;
-        cycles_t  end;
-};
+/* htonl hack - either this, or compile with -O2. Stupid byteorder/generic.h */
+#if defined(__GNUC__) && (__GNUC__ >= 2) && !defined(__OPTIMIZE__)
+#define ___htonl(x) __cpu_to_be32(x)
+#define ___htons(x) __cpu_to_be16(x)
+#define ___ntohl(x) __be32_to_cpu(x)
+#define ___ntohs(x) __be16_to_cpu(x)
+#define htonl(x) ___htonl(x)
+#define ntohl(x) ___ntohl(x)
+#define htons(x) ___htons(x)
+#define ntohs(x) ___ntohs(x)
+#endif
 
-extern struct prof_ent prof_ents[MAX_PROFS];
-
-#define PROF_START(FOO)                                         \
-        do {                                                    \
-                struct prof_ent *pe = &prof_ents[PROF__##FOO];  \
-                pe->starts++;                                   \
-                pe->start = get_cycles();                       \
-        } while (0)
-
-#define PROF_FINISH(FOO)                                        \
-        do {                                                    \
-                struct prof_ent *pe = &prof_ents[PROF__##FOO];  \
-                pe->finishes++;                                 \
-                pe->end = get_cycles();                         \
-                pe->total_cycles += (pe->end - pe->start);      \
-        } while (0)
-#else /* !PORTALS_PROFILING */
-#define PROF_START(FOO) do {} while(0)
-#define PROF_FINISH(FOO) do {} while(0)
-#endif /* PORTALS_PROFILING */
-
-/* debug.c */
-extern spinlock_t stack_backtrace_lock;
-
-void portals_debug_dumpstack(cfs_task_t *tsk);
-void portals_run_upcall(char **argv);
-void portals_run_lbug_upcall(char * file, const char *fn, const int line);
-void portals_debug_dumplog(void);
-int portals_debug_init(unsigned long bufsize);
-int portals_debug_cleanup(void);
-int portals_debug_clear_buffer(void);
-int portals_debug_mark_buffer(char *text);
-int portals_debug_set_daemon(unsigned int cmd, unsigned int length,
-                             char *file, unsigned int size);
-__s32 portals_debug_copy_to_user(char *buf, unsigned long len);
-/* Use the special GNU C __attribute__ hack to have the compiler check the
- * printf style argument string against the actual argument count and
- * types.
- */
-void portals_debug_msg(int subsys, int mask, char *file, const char *fn,
-                       const int line, unsigned long stack,
-                       char *format, ...)
-        __attribute__ ((format (printf, 7, 8)));
-void portals_debug_set_level(unsigned int debug_level);
+void libcfs_debug_dumpstack(cfs_task_t *tsk);
+void libcfs_run_upcall(char **argv);
+void libcfs_run_lbug_upcall(char * file, const char *fn, const int line);
+void libcfs_debug_dumplog(void);
+int libcfs_debug_init(unsigned long bufsize);
+int libcfs_debug_cleanup(void);
+int libcfs_debug_clear_buffer(void);
+int libcfs_debug_mark_buffer(char *text);
 
-extern void kportal_daemonize (char *name);
-extern void kportal_blockallsigs (void);
+void libcfs_debug_set_level(unsigned int debug_level);
 
 #else  /* !__KERNEL__ */
-# ifndef DEBUG_SUBSYSTEM
-#  define DEBUG_SUBSYSTEM S_UNDEFINED
-# endif
-# ifdef PORTAL_DEBUG
+# ifdef LIBCFS_DEBUG
 #  undef NDEBUG
 #  include <assert.h>
 #  define LASSERT(e)     assert(e)
@@ -204,23 +204,38 @@ do {                                                                           \
                 CERROR(args);                                                  \
           assert(cond);                                                        \
 } while (0)
+#  define LBUG()   assert(0)
 # else
-#  define LASSERT(e)
+#  define LASSERT(e) ((void)(0))
 #  define LASSERTF(cond, args...) do { } while (0)
-# endif
+#  define LBUG()   ((void)(0))
+# endif /* LIBCFS_DEBUG */
 # define printk(format, args...) printf (format, ## args)
-# define PORTAL_ALLOC(ptr, size) do { (ptr) = malloc(size); } while (0);
-# define PORTAL_FREE(a, b) do { free(a); } while (0);
-void portals_debug_dumplog(void);
-# define portals_debug_msg(subsys, mask, file, fn, line, stack, format, a...) \
-    printf("%02x:%06x (@%lu %s:%s,l. %d %d %lu): " format,                    \
-           (subsys), (mask), (long)time(0), file, fn, line,                   \
-           getpid(), (unsigned long)stack, ## a);
-
-#undef CWARN
-#undef CERROR
-#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a)
-#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a)
+# ifdef CRAY_XT3                                /* buggy calloc! */
+#  define LIBCFS_ALLOC(ptr, size)               \
+   do {                                         \
+        (ptr) = malloc(size);                   \
+        memset(ptr, 0, size);                   \
+   } while (0);
+# else
+#  define LIBCFS_ALLOC(ptr, size) do { (ptr) = calloc(1,size); } while (0);
+# endif
+# define LIBCFS_FREE(a, b) do { free(a); } while (0);
+
+void libcfs_debug_dumplog(void);
+int libcfs_debug_init(unsigned long bufsize);
+int libcfs_debug_cleanup(void);
+
+/*
+ * Generic compiler-dependent macros required for kernel
+ * build go below this comment. Actual compiler/compiler version
+ * specific implementations come from the above header files
+ */
+
+#define likely(x)      __builtin_expect(!!(x), 1)
+#define unlikely(x)    __builtin_expect(!!(x), 0)
+
+/* !__KERNEL__ */
 #endif
 
 /*
@@ -240,8 +255,31 @@ void portals_debug_dumplog(void);
 #define CLASSERT(cond) ({ switch(42) { case (cond): case 0: break; } })
 
 /* support decl needed both by kernel and liblustre */
-char *portals_nid2str(int nal, ptl_nid_t nid, char *str);
-char *portals_id2str(int nal, ptl_process_id_t nid, char *str);
+int         libcfs_isknown_lnd(int type);
+char       *libcfs_lnd2modname(int type);
+char       *libcfs_lnd2str(int type);
+int         libcfs_str2lnd(char *str);
+char       *libcfs_net2str(__u32 net);
+char       *libcfs_nid2str(lnet_nid_t nid);
+__u32       libcfs_str2net(char *str);
+lnet_nid_t  libcfs_str2nid(char *str);
+int         libcfs_str2anynid(lnet_nid_t *nid, char *str);
+char       *libcfs_id2str(lnet_process_id_t id);
+void        libcfs_setnet0alias(int type);
+
+/* how an LNET NID encodes net:address */
+#define LNET_NIDADDR(nid)      ((__u32)((nid) & 0xffffffff))
+#define LNET_NIDNET(nid)       ((__u32)(((nid) >> 32)) & 0xffffffff)
+#define LNET_MKNID(net,addr)   ((((__u64)(net))<<32)|((__u64)(addr)))
+/* how net encodes type:number */
+#define LNET_NETNUM(net)       ((net) & 0xffff)
+#define LNET_NETTYP(net)       (((net) >> 16) & 0xffff)
+#define LNET_MKNET(typ,num)    ((((__u32)(typ))<<16)|((__u32)(num)))
+
+/* implication */
+#define ergo(a, b) (!(a) || (b))
+/* logical equivalence */
+#define equi(a, b) (!!(a) == !!(b))
 
 #ifndef CURRENT_TIME
 # define CURRENT_TIME time(0)
@@ -253,45 +291,132 @@ char *portals_id2str(int nal, ptl_process_id_t nid, char *str);
  * All stuff about lwt are put in arch/kp30.h
  * -------------------------------------------------------------------- */
 
-struct portals_device_userstate
+struct libcfs_device_userstate
 {
-        int          pdu_memhog_pages;
-        cfs_page_t   *pdu_memhog_root_page;
+        int           ldu_memhog_pages;
+        cfs_page_t   *ldu_memhog_root_page;
 };
 
-#include <libcfs/portals_lib.h>
+/* what used to be in portals_lib.h */
+#ifndef MIN
+# define MIN(a,b) (((a)<(b)) ? (a): (b))
+#endif
+#ifndef MAX
+# define MAX(a,b) (((a)>(b)) ? (a): (b))
+#endif
+
+#define MKSTR(ptr) ((ptr))? (ptr) : ""
+
+static inline int size_round4 (int val)
+{
+        return (val + 3) & (~0x3);
+}
+
+static inline int size_round (int val)
+{
+        return (val + 7) & (~0x7);
+}
+
+static inline int size_round16(int val)
+{
+        return (val + 0xf) & (~0xf);
+}
+
+static inline int size_round32(int val)
+{
+        return (val + 0x1f) & (~0x1f);
+}
+
+static inline int size_round0(int val)
+{
+        if (!val)
+                return 0;
+        return (val + 1 + 7) & (~0x7);
+}
+
+static inline size_t round_strlen(char *fset)
+{
+        return (size_t)size_round((int)strlen(fset) + 1);
+}
+
+#define LOGL(var,len,ptr)                                       \
+do {                                                            \
+        if (var)                                                \
+                memcpy((char *)ptr, (const char *)var, len);    \
+        ptr += size_round(len);                                 \
+} while (0)
+
+#define LOGU(var,len,ptr)                                       \
+do {                                                            \
+        if (var)                                                \
+                memcpy((char *)var, (const char *)ptr, len);    \
+        ptr += size_round(len);                                 \
+} while (0)
+
+#define LOGL0(var,len,ptr)                              \
+do {                                                    \
+        if (!len)                                       \
+                break;                                  \
+        memcpy((char *)ptr, (const char *)var, len);    \
+        *((char *)(ptr) + len) = 0;                     \
+        ptr += size_round(len + 1);                     \
+} while (0)
 
 /*
  * USER LEVEL STUFF BELOW
  */
 
-#define PORTAL_IOCTL_VERSION 0x00010008
-#define PING_SYNC       0
-#define PING_ASYNC      1
+#define LIBCFS_IOCTL_VERSION 0x0001000a
+
+struct libcfs_ioctl_data {
+        __u32 ioc_len;
+        __u32 ioc_version;
+
+        __u64 ioc_nid;
+        __u64 ioc_u64[1];
+
+        __u32 ioc_flags;
+        __u32 ioc_count;
+        __u32 ioc_net;
+        __u32 ioc_u32[7];
+
+        __u32 ioc_inllen1;
+        char *ioc_inlbuf1;
+        __u32 ioc_inllen2;
+        char *ioc_inlbuf2;
 
-struct portal_ioctl_hdr {
+        __u32 ioc_plen1; /* buffers in userspace */
+        char *ioc_pbuf1;
+        __u32 ioc_plen2; /* buffers in userspace */
+        char *ioc_pbuf2;
+
+        char ioc_bulk[0];
+};
+
+
+struct libcfs_ioctl_hdr {
         __u32 ioc_len;
         __u32 ioc_version;
 };
 
-struct portals_debug_ioctl_data
+struct libcfs_debug_ioctl_data
 {
-        struct portal_ioctl_hdr hdr;
+        struct libcfs_ioctl_hdr hdr;
         unsigned int subs;
         unsigned int debug;
 };
 
-#define PORTAL_IOC_INIT(data)                           \
+#define LIBCFS_IOC_INIT(data)                           \
 do {                                                    \
         memset(&data, 0, sizeof(data));                 \
-        data.ioc_version = PORTAL_IOCTL_VERSION;        \
+        data.ioc_version = LIBCFS_IOCTL_VERSION;        \
         data.ioc_len = sizeof(data);                    \
 } while (0)
 
 /* FIXME check conflict with lustre_lib.h */
-#define PTL_IOC_DEBUG_MASK             _IOWR('f', 250, long)
+#define LIBCFS_IOC_DEBUG_MASK             _IOWR('f', 250, long)
 
-static inline int portal_ioctl_packlen(struct portal_ioctl_data *data)
+static inline int libcfs_ioctl_packlen(struct libcfs_ioctl_data *data)
 {
         int len = sizeof(*data);
         len += size_round(data->ioc_inllen1);
@@ -299,79 +424,79 @@ static inline int portal_ioctl_packlen(struct portal_ioctl_data *data)
         return len;
 }
 
-static inline int portal_ioctl_is_invalid(struct portal_ioctl_data *data)
+static inline int libcfs_ioctl_is_invalid(struct libcfs_ioctl_data *data)
 {
         if (data->ioc_len > (1<<30)) {
-                CERROR ("PORTALS ioctl: ioc_len larger than 1<<30\n");
+                CERROR ("LIBCFS ioctl: ioc_len larger than 1<<30\n");
                 return 1;
         }
         if (data->ioc_inllen1 > (1<<30)) {
-                CERROR ("PORTALS ioctl: ioc_inllen1 larger than 1<<30\n");
+                CERROR ("LIBCFS ioctl: ioc_inllen1 larger than 1<<30\n");
                 return 1;
         }
         if (data->ioc_inllen2 > (1<<30)) {
-                CERROR ("PORTALS ioctl: ioc_inllen2 larger than 1<<30\n");
+                CERROR ("LIBCFS ioctl: ioc_inllen2 larger than 1<<30\n");
                 return 1;
         }
         if (data->ioc_inlbuf1 && !data->ioc_inllen1) {
-                CERROR ("PORTALS ioctl: inlbuf1 pointer but 0 length\n");
+                CERROR ("LIBCFS ioctl: inlbuf1 pointer but 0 length\n");
                 return 1;
         }
         if (data->ioc_inlbuf2 && !data->ioc_inllen2) {
-                CERROR ("PORTALS ioctl: inlbuf2 pointer but 0 length\n");
+                CERROR ("LIBCFS ioctl: inlbuf2 pointer but 0 length\n");
                 return 1;
         }
         if (data->ioc_pbuf1 && !data->ioc_plen1) {
-                CERROR ("PORTALS ioctl: pbuf1 pointer but 0 length\n");
+                CERROR ("LIBCFS ioctl: pbuf1 pointer but 0 length\n");
                 return 1;
         }
         if (data->ioc_pbuf2 && !data->ioc_plen2) {
-                CERROR ("PORTALS ioctl: pbuf2 pointer but 0 length\n");
+                CERROR ("LIBCFS ioctl: pbuf2 pointer but 0 length\n");
                 return 1;
         }
         if (data->ioc_plen1 && !data->ioc_pbuf1) {
-                CERROR ("PORTALS ioctl: plen1 nonzero but no pbuf1 pointer\n");
+                CERROR ("LIBCFS ioctl: plen1 nonzero but no pbuf1 pointer\n");
                 return 1;
         }
         if (data->ioc_plen2 && !data->ioc_pbuf2) {
-                CERROR ("PORTALS ioctl: plen2 nonzero but no pbuf2 pointer\n");
+                CERROR ("LIBCFS ioctl: plen2 nonzero but no pbuf2 pointer\n");
                 return 1;
         }
-        if (portal_ioctl_packlen(data) != data->ioc_len ) {
-                CERROR ("PORTALS ioctl: packlen != ioc_len\n");
+        if ((__u32)libcfs_ioctl_packlen(data) != data->ioc_len ) {
+                CERROR ("LIBCFS ioctl: packlen != ioc_len\n");
                 return 1;
         }
         if (data->ioc_inllen1 &&
             data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') {
-                CERROR ("PORTALS ioctl: inlbuf1 not 0 terminated\n");
+                CERROR ("LIBCFS ioctl: inlbuf1 not 0 terminated\n");
                 return 1;
         }
         if (data->ioc_inllen2 &&
             data->ioc_bulk[size_round(data->ioc_inllen1) +
                            data->ioc_inllen2 - 1] != '\0') {
-                CERROR ("PORTALS ioctl: inlbuf2 not 0 terminated\n");
+                CERROR ("LIBCFS ioctl: inlbuf2 not 0 terminated\n");
                 return 1;
         }
         return 0;
 }
 
 #ifndef __KERNEL__
-static inline int portal_ioctl_pack(struct portal_ioctl_data *data, char **pbuf,
+static inline int libcfs_ioctl_pack(struct libcfs_ioctl_data *data, char **pbuf,
                                     int max)
 {
         char *ptr;
-        struct portal_ioctl_data *overlay;
-        data->ioc_len = portal_ioctl_packlen(data);
-        data->ioc_version = PORTAL_IOCTL_VERSION;
+        struct libcfs_ioctl_data *overlay;
+        data->ioc_len = libcfs_ioctl_packlen(data);
+        data->ioc_version = LIBCFS_IOCTL_VERSION;
 
-        if (*pbuf && portal_ioctl_packlen(data) > max)
+        if (*pbuf && libcfs_ioctl_packlen(data) > max)
                 return 1;
         if (*pbuf == NULL) {
                 *pbuf = malloc(data->ioc_len);
         }
         if (!*pbuf)
                 return 1;
-        overlay = (struct portal_ioctl_data *)*pbuf;
+        overlay = (struct libcfs_ioctl_data *)*pbuf;
         memcpy(*pbuf, data, sizeof(*data));
 
         ptr = overlay->ioc_bulk;
@@ -379,7 +504,7 @@ static inline int portal_ioctl_pack(struct portal_ioctl_data *data, char **pbuf,
                 LOGL(data->ioc_inlbuf1, data->ioc_inllen1, ptr);
         if (data->ioc_inlbuf2)
                 LOGL(data->ioc_inlbuf2, data->ioc_inllen2, ptr);
-        if (portal_ioctl_is_invalid(overlay))
+        if (libcfs_ioctl_is_invalid(overlay))
                 return 1;
 
         return 0;
@@ -387,70 +512,71 @@ static inline int portal_ioctl_pack(struct portal_ioctl_data *data, char **pbuf,
 
 #else
 
-extern int portal_ioctl_getdata(char *buf, char *end, void *arg);
+extern int libcfs_ioctl_getdata(char *buf, char *end, void *arg);
+extern int libcfs_ioctl_popdata(void *arg, void *buf, int size);
 
 #endif
 
 /* ioctls for manipulating snapshots 30- */
-#define IOC_PORTAL_TYPE                   'e'
-#define IOC_PORTAL_MIN_NR                 30
-
-#define IOC_PORTAL_PING                    _IOWR('e', 30, IOCTL_PORTAL_TYPE)
-
-#define IOC_PORTAL_CLEAR_DEBUG             _IOWR('e', 32, IOCTL_PORTAL_TYPE)
-#define IOC_PORTAL_MARK_DEBUG              _IOWR('e', 33, IOCTL_PORTAL_TYPE)
-#define IOC_PORTAL_PANIC                   _IOWR('e', 34, IOCTL_PORTAL_TYPE)
-#define IOC_PORTAL_NAL_CMD                 _IOWR('e', 35, IOCTL_PORTAL_TYPE)
-#define IOC_PORTAL_GET_NID                 _IOWR('e', 36, IOCTL_PORTAL_TYPE)
-#define IOC_PORTAL_FAIL_NID                _IOWR('e', 37, IOCTL_PORTAL_TYPE)
-#define IOC_PORTAL_LOOPBACK                _IOWR('e', 38, IOCTL_PORTAL_TYPE)
-#define IOC_PORTAL_LWT_CONTROL             _IOWR('e', 39, IOCTL_PORTAL_TYPE)
-#define IOC_PORTAL_LWT_SNAPSHOT            _IOWR('e', 40, IOCTL_PORTAL_TYPE)
-#define IOC_PORTAL_LWT_LOOKUP_STRING       _IOWR('e', 41, IOCTL_PORTAL_TYPE)
-#define IOC_PORTAL_MEMHOG                  _IOWR('e', 42, IOCTL_PORTAL_TYPE)
-#define IOC_PORTAL_DMSG                    _IOWR('e', 43, IOCTL_PORTAL_TYPE)
-#define IOC_PORTAL_MAX_NR                             43
+#define IOC_LIBCFS_TYPE                   'e'
+#define IOC_LIBCFS_MIN_NR                 30
+/* libcfs ioctls */
+#define IOC_LIBCFS_PANIC                   _IOWR('e', 30, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_CLEAR_DEBUG             _IOWR('e', 31, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_MARK_DEBUG              _IOWR('e', 32, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LWT_CONTROL             _IOWR('e', 33, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LWT_SNAPSHOT            _IOWR('e', 34, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LWT_LOOKUP_STRING       _IOWR('e', 35, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_MEMHOG                  _IOWR('e', 36, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PING_TEST               _IOWR('e', 37, IOCTL_LIBCFS_TYPE)
+/* lnet ioctls */
+#define IOC_LIBCFS_GET_NI                  _IOWR('e', 50, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_FAIL_NID                _IOWR('e', 51, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_ADD_ROUTE               _IOWR('e', 52, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEL_ROUTE               _IOWR('e', 53, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_ROUTE               _IOWR('e', 54, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_NOTIFY_ROUTER           _IOWR('e', 55, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_UNCONFIGURE             _IOWR('e', 56, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PORTALS_COMPATIBILITY   _IOWR('e', 57, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_LNET_DIST               _IOWR('e', 58, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_CONFIGURE               _IOWR('e', 59, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_TESTPROTOCOMPAT         _IOWR('e', 60, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PING                    _IOWR('e', 61, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEBUG_PEER              _IOWR('e', 62, IOCTL_LIBCFS_TYPE)
+/* lnd ioctls */
+#define IOC_LIBCFS_REGISTER_MYNID          _IOWR('e', 70, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_CLOSE_CONNECTION        _IOWR('e', 71, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_PUSH_CONNECTION         _IOWR('e', 72, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_CONN                _IOWR('e', 73, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEL_PEER                _IOWR('e', 74, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_ADD_PEER                _IOWR('e', 75, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_PEER                _IOWR('e', 76, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_TXDESC              _IOWR('e', 77, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_ADD_INTERFACE           _IOWR('e', 78, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_DEL_INTERFACE           _IOWR('e', 79, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_INTERFACE           _IOWR('e', 80, IOCTL_LIBCFS_TYPE)
+#define IOC_LIBCFS_GET_GMID                _IOWR('e', 81, IOCTL_LIBCFS_TYPE)
+
+#define IOC_LIBCFS_MAX_NR                             81
+
 
 enum {
-        QSWNAL    = 1,
-        SOCKNAL   = 2,
-        GMNAL     = 3,
-        /*          4 unused */
-        TCPNAL    = 5,
-        ROUTER    = 6,
-        OPENIBNAL = 7,
-        IIBNAL    = 8,
-        LONAL     = 9,
-        RANAL     = 10,
-        VIBNAL    = 11,
-        NAL_ENUM_END_MARKER
+        /* Only add to these values (i.e. don't ever change or redefine them):
+         * network addresses depend on them... */
+        QSWLND    = 1,
+        SOCKLND   = 2,
+        GMLND     = 3,
+        PTLLND    = 4,
+        O2IBLND   = 5,
+        CIBLND    = 6,
+        OPENIBLND = 7,
+        IIBLND    = 8,
+        LOLND     = 9,
+        RALND     = 10,
+        VIBLND    = 11,
+        MXLND     = 12,
 };
 
-#define PTL_NALFMT_SIZE             32 /* %u:%u.%u.%u.%u,%u (10+4+4+4+3+5+1) */
-#ifndef CRAY_PORTALS
-#define NALID_FROM_IFACE(nal) (nal)
-#endif
-
-#define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1)
-
-#define NAL_CMD_REGISTER_PEER_FD     100
-#define NAL_CMD_CLOSE_CONNECTION     101
-#define NAL_CMD_REGISTER_MYNID       102
-#define NAL_CMD_PUSH_CONNECTION      103
-#define NAL_CMD_GET_CONN             104
-#define NAL_CMD_DEL_PEER             105
-#define NAL_CMD_ADD_PEER             106
-#define NAL_CMD_GET_PEER             107
-#define NAL_CMD_GET_TXDESC           108
-#define NAL_CMD_ADD_ROUTE            109
-#define NAL_CMD_DEL_ROUTE            110
-#define NAL_CMD_GET_ROUTE            111
-#define NAL_CMD_NOTIFY_ROUTER        112
-#define NAL_CMD_ADD_INTERFACE        113
-#define NAL_CMD_DEL_INTERFACE        114
-#define NAL_CMD_GET_INTERFACE        115
-
-
 enum {
         DEBUG_DAEMON_START       =  1,
         DEBUG_DAEMON_STOP        =  2,
index 9e7ea85..2e11e7c 100644 (file)
@@ -12,6 +12,8 @@
 #include <libcfs/linux/libcfs.h>
 #elif defined(__APPLE__)
 #include <libcfs/darwin/libcfs.h>
+#elif defined(__WINNT__)
+#include <libcfs/winnt/libcfs.h>
 #else
 #error Unsupported operating system.
 #endif
 #include <stdio.h>
 #endif
 
-#define PORTAL_DEBUG
+#define LIBCFS_DEBUG
 
 #ifndef offsetof
 # define offsetof(typ,memb)     ((unsigned long)((char *)&(((typ *)0)->memb)))
 #endif
 
+/* cardinality of array */
+#define sizeof_array(a) ((sizeof (a)) / (sizeof ((a)[0])))
+
+#if !defined(container_of)
+/* given a pointer @ptr to the field @member embedded into type (usually
+ * struct) @type, return pointer to the embedding instance of @type. */
+#define container_of(ptr, type, member) \
+        ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+#endif
+
+#define container_of0(ptr, type, member)                        \
+({                                                              \
+        typeof(ptr) __ptr = (ptr);                              \
+        __ptr ? container_of(__ptr, type, member) : NULL;       \
+})
+
+/*
+ * true iff @i is power-of-2
+ */
+#define IS_PO2(i)                               \
+({                                              \
+        typeof(i) __i;                          \
+                                                \
+        __i = (i);                              \
+        !(__i & (__i - 1));                     \
+})
+
 #define LOWEST_BIT_SET(x)       ((x) & ~((x) - 1))
 
 /*
  *  Debugging
  */
-extern unsigned int portal_subsystem_debug;
-extern unsigned int portal_stack;
-extern unsigned int portal_debug;
-extern unsigned int portal_printk;
+extern unsigned int libcfs_subsystem_debug;
+extern unsigned int libcfs_stack;
+extern unsigned int libcfs_debug;
+extern unsigned int libcfs_printk;
+extern unsigned int libcfs_console_ratelimit;
+extern unsigned int libcfs_debug_binary;
+extern char debug_file_path[1024];
+
+int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys);
+int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys);
 
 /* Has there been an LBUG? */
-extern unsigned int portals_catastrophe;
+extern unsigned int libcfs_catastrophe;
 
 /*
  * struct ptldebug_header is defined in libcfs/<os>/libcfs.h
@@ -48,6 +83,7 @@ extern unsigned int portals_catastrophe;
 #define PH_FLAG_FIRST_RECORD 1
 
 /* Debugging subsystems (32 bits, non-overlapping) */
+/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */
 #define S_UNDEFINED   0x00000001
 #define S_MDC         0x00000002
 #define S_MDS         0x00000004
@@ -58,30 +94,33 @@ extern unsigned int portals_catastrophe;
 #define S_LLITE       0x00000080
 #define S_RPC         0x00000100
 #define S_MGMT        0x00000200
-#define S_PORTALS     0x00000400
-#define S_NAL         0x00000800 /* ALL NALs */
+/* unused */
+#define S_LNET        0x00000400
+#define S_LND         0x00000800 /* ALL LNDs */
 #define S_PINGER      0x00001000
 #define S_FILTER      0x00002000
-#define S_PTLBD       0x00004000
+/* unused */
 #define S_ECHO        0x00008000
 #define S_LDLM        0x00010000
 #define S_LOV         0x00020000
-#define S_PTLROUTER   0x00040000
-#define S_COBD        0x00080000
-#define S_SM          0x00100000
-#define S_ASOBD       0x00200000
-#define S_CONFOBD     0x00400000
-#define S_LMV         0x00800000
-#define S_CMOBD       0x01000000
-#define S_SEC         0x02000000
-#define S_GSS         0x04000000
-#define S_GKS         0x08000000
-/* If you change these values, please keep these files up to date...
- *    portals/utils/debug.c
- *    utils/lconf
- */
+/* unused */
+/* unused */
+/* unused */
+/* unused */
+/* unused */
+#define S_LMV         0x00800000 /* b_new_cmd */
+/* unused */
+#define S_SEC         0x02000000 /* upcall cache */
+#define S_GSS         0x04000000 /* b_new_cmd */
+/* unused */
+#define S_MGC         0x10000000
+#define S_MGS         0x20000000
+#define S_FID         0x40000000 /* b_new_cmd */
+#define S_FLD         0x80000000 /* b_new_cmd */
+/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */
 
 /* Debugging masks (32 bits, non-overlapping) */
+/* keep these in sync with lnet/utils/debug.c */
 #define D_TRACE       0x00000001 /* ENTRY/EXIT markers */
 #define D_INODE       0x00000002
 #define D_SUPER       0x00000004
@@ -90,13 +129,13 @@ extern unsigned int portals_catastrophe;
 #define D_CACHE       0x00000020 /* cache-related items */
 #define D_INFO        0x00000040 /* general information */
 #define D_IOCTL       0x00000080 /* ioctl related information */
-#define D_BLOCKS      0x00000100 /* ext2 block allocation */
+#define D_NETERROR    0x00000100 /* network errors */
 #define D_NET         0x00000200 /* network communications */
 #define D_WARNING     0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */
 #define D_BUFFS       0x00000800
 #define D_OTHER       0x00001000
 #define D_DENTRY      0x00002000
-#define D_PORTALS     0x00004000 /* ENTRY/EXIT markers */
+/* unused: keep in sync with lnet/utils/debug.c */
 #define D_PAGE        0x00008000 /* bulk page handling */
 #define D_DLMTRACE    0x00010000
 #define D_ERROR       0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */
@@ -110,86 +149,75 @@ extern unsigned int portals_catastrophe;
 #define D_CONSOLE     0x02000000
 #define D_QUOTA       0x04000000
 #define D_SEC         0x08000000
-/* If you change these values, please keep these files up to date...
- *    portals/utils/debug.c
- *    utils/lconf
- */
+/* keep these in sync with lnet/utils/debug.c */
+
+#define D_CANTMASK   (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE)
 
 #ifndef DEBUG_SUBSYSTEM
 # define DEBUG_SUBSYSTEM S_UNDEFINED
 #endif
 
+#define CDEBUG_MAX_LIMIT 600
+typedef struct {
+        cfs_time_t      cdls_next;
+        int             cdls_count;
+        cfs_duration_t  cdls_delay;
+} cfs_debug_limit_state_t;
+
+#define CDEBUG_ENABLED (1)
+
 #ifdef __KERNEL__
-#define CDEBUG(mask, format, a...)                                            \
-do {                                                                          \
-        CHECK_STACK(CDEBUG_STACK);                                            \
-        if (((mask) & (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE)) ||         \
-            (portal_debug & (mask) &&                                         \
-             portal_subsystem_debug & DEBUG_SUBSYSTEM))                       \
-                portals_debug_msg(DEBUG_SUBSYSTEM, mask,                      \
-                                  __FILE__, __FUNCTION__, __LINE__,           \
-                                  CDEBUG_STACK, format, ## a);                \
+
+#if CDEBUG_ENABLED
+#define __CDEBUG(cdls, mask, format, a...)                              \
+do {                                                                    \
+        CHECK_STACK();                                                  \
+                                                                        \
+        if (((mask) & D_CANTMASK) != 0 ||                               \
+            ((libcfs_debug & (mask)) != 0 &&                            \
+             (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))          \
+                libcfs_debug_msg(cdls, DEBUG_SUBSYSTEM, mask,           \
+                                 __FILE__, __FUNCTION__, __LINE__,      \
+                                 format, ## a);                         \
 } while (0)
 
-#define CDEBUG_MAX_LIMIT 600
-#define CDEBUG_LIMIT(cdebug_mask, cdebug_format, a...)                        \
-do {                                                                          \
-        static cfs_time_t cdebug_next = 0;                                    \
-        static int cdebug_count = 0;                                          \
-        static cfs_duration_t cdebug_delay = CFS_MIN_DELAY;                   \
-                                                                              \
-        CHECK_STACK(CDEBUG_STACK);                                            \
-        if (cfs_time_after(cfs_time_current(), cdebug_next)) {                \
-                portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask, __FILE__,     \
-                                  __FUNCTION__, __LINE__, CDEBUG_STACK,       \
-                                  cdebug_format, ## a);                       \
-                if (cdebug_count) {                                           \
-                        portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask,       \
-                                          __FILE__, __FUNCTION__, __LINE__,0, \
-                                          "previously skipped %d similar "    \
-                                          "messages\n", cdebug_count);        \
-                        cdebug_count = 0;                                     \
-                }                                                             \
-                if (cfs_time_after(cfs_time_current(),                        \
-                                   cdebug_next +                              \
-                                   cfs_time_seconds(CDEBUG_MAX_LIMIT+10)))    \
-                        cdebug_delay = cdebug_delay > (8 * CFS_MIN_DELAY)?    \
-                                       cdebug_delay/8 : CFS_MIN_DELAY;        \
-                else                                                          \
-                        cdebug_delay = cdebug_delay*2 >= cfs_time_seconds(CDEBUG_MAX_LIMIT)?\
-                                       cfs_time_seconds(CDEBUG_MAX_LIMIT) :   \
-                                       cdebug_delay*2;                        \
-                cdebug_next = cfs_time_current() + cdebug_delay;              \
-        } else {                                                              \
-                portals_debug_msg(DEBUG_SUBSYSTEM,                            \
-                                  portal_debug &                              \
-                                  ~(D_EMERG|D_ERROR|D_WARNING|D_CONSOLE),     \
-                                  __FILE__, __FUNCTION__, __LINE__,           \
-                                  CDEBUG_STACK, cdebug_format, ## a);         \
-                cdebug_count++;                                               \
-        }                                                                     \
+#define CDEBUG(mask, format, a...) __CDEBUG(NULL, mask, format, ## a)
+
+#define CDEBUG_LIMIT(mask, format, a...)        \
+do {                                            \
+        static cfs_debug_limit_state_t cdls;    \
+                                                \
+        __CDEBUG(&cdls, mask, format, ## a);    \
 } while (0)
 
-#elif defined(LUSTRE_UTILS)
+#else /* CDEBUG_ENABLED */
+#define CDEBUG(mask, format, a...) (void)(0)
+#define CDEBUG_LIMIT(mask, format, a...) (void)(0)
+#endif
+
+#elif defined(__arch_lib__) && !defined(LUSTRE_UTILS)
 
 #define CDEBUG(mask, format, a...)                                      \
 do {                                                                    \
-        if ((mask) & (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE))       \
-                fprintf(stderr, "(%s:%d:%s()) " format,                 \
-                        __FILE__, __LINE__, __FUNCTION__, ## a);        \
+        if (((mask) & D_CANTMASK) != 0 ||                               \
+            ((libcfs_debug & (mask)) != 0 &&                            \
+             (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))          \
+                libcfs_debug_msg(NULL, DEBUG_SUBSYSTEM, mask,           \
+                                 __FILE__, __FUNCTION__, __LINE__,      \
+                                 format, ## a);                         \
 } while (0)
+
 #define CDEBUG_LIMIT CDEBUG
 
-#else  /* !__KERNEL__ && !LUSTRE_UTILS*/
+#else
 
 #define CDEBUG(mask, format, a...)                                      \
 do {                                                                    \
-        if (((mask) & (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE)) ||   \
-            (portal_debug & (mask) &&                                   \
-             portal_subsystem_debug & DEBUG_SUBSYSTEM))                 \
+        if (((mask) & D_CANTMASK) != 0)                                 \
                 fprintf(stderr, "(%s:%d:%s()) " format,                 \
                         __FILE__, __LINE__, __FUNCTION__, ## a);        \
 } while (0)
+
 #define CDEBUG_LIMIT CDEBUG
 
 #endif /* !__KERNEL__ */
@@ -204,6 +232,8 @@ do {                                                                    \
 #define LCONSOLE_ERROR(format, a...) CDEBUG_LIMIT(D_CONSOLE | D_ERROR, format, ## a)
 #define LCONSOLE_EMERG(format, a...) CDEBUG(D_CONSOLE | D_EMERG, format, ## a)
 
+#if CDEBUG_ENABLED
+
 #define GOTO(label, rc)                                                 \
 do {                                                                    \
         long GOTO__ret = (long)(rc);                                    \
@@ -212,8 +242,11 @@ do {                                                                    \
                (signed long)GOTO__ret);                                 \
         goto label;                                                     \
 } while (0)
+#else
+#define GOTO(label, rc) do { ((void)(rc)); goto label; } while (0)
+#endif
 
-#define CDEBUG_ENTRY_EXIT 1
+#define CDEBUG_ENTRY_EXIT (1)
 #if CDEBUG_ENTRY_EXIT
 
 /*
@@ -248,143 +281,65 @@ do {                                                                    \
 
 #endif /* !CDEBUG_ENTRY_EXIT */
 
-
-#define LUSTRE_SRV_PTL_PID      LUSTRE_PTL_PID
-
 /*
- * eeb cfg
- * ecf6
- * ecfG
+ * Some (nomina odiosa sunt) platforms define NULL as naked 0. This confuses
+ * Lustre RETURN(NULL) macro.
  */
-#define PORTALS_CFG_VERSION 0xecf60001
-
-struct portals_cfg {
-        __u32 pcfg_version;
-        __u32 pcfg_command;
-
-        __u32 pcfg_nal;
-        __u32 pcfg_flags;
-
-        __u32 pcfg_gw_nal;
-        __u32 pcfg_padding1;
-
-        __u64 pcfg_nid;
-        __u64 pcfg_nid2;
-        __u64 pcfg_nid3;
-        __u32 pcfg_id;
-        __u32 pcfg_misc;
-        __u32 pcfg_fd;
-        __u32 pcfg_count;
-        __u32 pcfg_size;
-        __u32 pcfg_wait;
-
-        __u32 pcfg_plen1; /* buffers in userspace */
-        __u32 pcfg_plen2; /* buffers in userspace */
-        __u32 pcfg_alloc_size;  /* size of this allocated portals_cfg */
-        char  pcfg_pbuf[0];
-};
-
-#define PCFG_INIT(pcfg, cmd)                            \
-do {                                                    \
-        memset(&(pcfg), 0, sizeof((pcfg)));             \
-        (pcfg).pcfg_version = PORTALS_CFG_VERSION;      \
-        (pcfg).pcfg_command = (cmd);                    \
-                                                        \
-} while (0)
+#if defined(NULL)
+#undef NULL
+#endif
 
-#define PCFG_INIT_PBUF(pcfg, cmd, plen1, plen2)                         \
-        do {                                                            \
-                int bufsize = size_round(sizeof(*(pcfg)));              \
-                bufsize += size_round(plen1) + size_round(plen2);       \
-                PORTAL_ALLOC((pcfg), bufsize);                          \
-                if ((pcfg)) {                                           \
-                        memset((pcfg), 0, bufsize);                     \
-                        (pcfg)->pcfg_version = PORTALS_CFG_VERSION;     \
-                        (pcfg)->pcfg_command = (cmd);                   \
-                        (pcfg)->pcfg_plen1 = (plen1);                   \
-                        (pcfg)->pcfg_plen2 = (plen2);                   \
-                        (pcfg)->pcfg_alloc_size = bufsize;              \
-                }                                                       \
-        } while (0)
-
-#define PCFG_FREE_PBUF(pcfg) PORTAL_FREE((pcfg), (pcfg)->pcfg_alloc_size)
-
-#define PCFG_PBUF(pcfg, idx)                                            \
-        (0 == (idx)                                                     \
-         ? ((char *)(pcfg) + size_round(sizeof(*(pcfg))))               \
-         : (1 == (idx)                                                  \
-            ? ((char *)(pcfg) + size_round(sizeof(*(pcfg))) + size_round(pcfg->pcfg_plen1)) \
-            : (NULL)))
-
-typedef int (nal_cmd_handler_fn)(struct portals_cfg *, void *);
-int libcfs_nal_cmd_register(int nal, nal_cmd_handler_fn *handler, void *arg);
-int libcfs_nal_cmd(struct portals_cfg *pcfg);
-void libcfs_nal_cmd_unregister(int nal);
-
-struct portal_ioctl_data {
-        __u32 ioc_len;
-        __u32 ioc_version;
-        __u64 ioc_nid;
-        __u64 ioc_nid2;
-        __u64 ioc_nid3;
-        __u32 ioc_count;
-        __u32 ioc_nal;
-        __u32 ioc_nal_cmd;
-        __u32 ioc_fd;
-        __u32 ioc_id;
-
-        __u32 ioc_flags;
-        __u32 ioc_size;
-
-        __u32 ioc_wait;
-        __u32 ioc_timeout;
-        __u32 ioc_misc;
-
-        __u32 ioc_inllen1;
-        char *ioc_inlbuf1;
-        __u32 ioc_inllen2;
-        char *ioc_inlbuf2;
-
-        __u32 ioc_plen1; /* buffers in userspace */
-        char *ioc_pbuf1;
-        __u32 ioc_plen2; /* buffers in userspace */
-        char *ioc_pbuf2;
-
-        char ioc_bulk[0];
-};
+#define NULL ((void *)0)
 
+#define LUSTRE_SRV_LNET_PID      LUSTRE_LNET_PID
 
 #ifdef __KERNEL__
 
 #include <libcfs/list.h>
 
+struct libcfs_ioctl_data;                       /* forward ref */
+
 struct libcfs_ioctl_handler {
         struct list_head item;
-        int (*handle_ioctl)(struct portal_ioctl_data *data,
-                            unsigned int cmd, unsigned long args);
+        int (*handle_ioctl)(unsigned int cmd, struct libcfs_ioctl_data *data);
 };
 
-#define DECLARE_IOCTL_HANDLER(ident, func)              \
-        struct libcfs_ioctl_handler ident = {           \
-                .item = CFS_LIST_HEAD_INIT(ident.item),     \
-                .handle_ioctl = func                    \
+#define DECLARE_IOCTL_HANDLER(ident, func)                      \
+        struct libcfs_ioctl_handler ident = {                   \
+                /* .item = */ CFS_LIST_HEAD_INIT(ident.item),   \
+                /* .handle_ioctl = */ func                      \
         }
 
 int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand);
 int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand);
 
+/* libcfs tcpip */
+#define LNET_ACCEPTOR_MIN_RESERVED_PORT    512
+#define LNET_ACCEPTOR_MAX_RESERVED_PORT    1023
+
+int libcfs_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask);
+int libcfs_ipif_enumerate(char ***names);
+void libcfs_ipif_free_enumeration(char **names, int n);
+int libcfs_sock_listen(cfs_socket_t **sockp, __u32 ip, int port, int backlog);
+int libcfs_sock_accept(cfs_socket_t **newsockp, cfs_socket_t *sock);
+void libcfs_sock_abort_accept(cfs_socket_t *sock);
+int libcfs_sock_connect(cfs_socket_t **sockp, int *fatal,
+                        __u32 local_ip, int local_port,
+                        __u32 peer_ip, int peer_port);
+int libcfs_sock_setbuf(cfs_socket_t *socket, int txbufsize, int rxbufsize);
+int libcfs_sock_getbuf(cfs_socket_t *socket, int *txbufsize, int *rxbufsize);
+int libcfs_sock_getaddr(cfs_socket_t *socket, int remote, __u32 *ip, int *port);
+int libcfs_sock_write(cfs_socket_t *sock, void *buffer, int nob, int timeout);
+int libcfs_sock_read(cfs_socket_t *sock, void *buffer, int nob, int timeout);
+void libcfs_sock_release(cfs_socket_t *sock);
+
 /* libcfs watchdogs */
 struct lc_watchdog;
 
-/* Just use the default handler (dumplog)  */
-#define LC_WATCHDOG_DEFAULT_CB NULL
-
 /* Add a watchdog which fires after "time" milliseconds of delay.  You have to
  * touch it once to enable it. */
 struct lc_watchdog *lc_watchdog_add(int time,
-                                    void (*cb)(struct lc_watchdog *,
-                                               struct task_struct *,
-                                               void *),
+                                    void (*cb)(pid_t pid, void *),
                                     void *data);
 
 /* Enables a watchdog and resets its timer. */
@@ -397,9 +352,7 @@ void lc_watchdog_disable(struct lc_watchdog *lcw);
 void lc_watchdog_delete(struct lc_watchdog *lcw);
 
 /* Dump a debug log */
-void lc_watchdog_dumplog(struct lc_watchdog *lcw,
-                         struct task_struct *tsk,
-                         void *data);
+void lc_watchdog_dumplog(pid_t pid, void *data);
 
 /* __KERNEL__ */
 #endif
@@ -452,7 +405,25 @@ static inline time_t cfs_unix_seconds(void)
         cfs_fs_time_t t;
 
         cfs_fs_time_current(&t);
-        return cfs_fs_time_sec(&t);
+        return (time_t)cfs_fs_time_sec(&t);
+}
+
+static inline cfs_time_t cfs_time_shift(int seconds)
+{
+        return cfs_time_add(cfs_time_current(), cfs_time_seconds(seconds));
+}
+
+static inline long cfs_timeval_sub(struct timeval *large, struct timeval *small,
+                                   struct timeval *result)
+{
+        long r = (long) (
+                (large->tv_sec - small->tv_sec) * ONE_MILLION +
+                (large->tv_usec - small->tv_usec));
+        if (result != NULL) {
+                result->tv_usec = r / ONE_MILLION;
+                result->tv_sec = r;
+        }
+        return r;
 }
 
 #define CFS_RATELIMIT(seconds)                                  \
@@ -472,10 +443,57 @@ static inline time_t cfs_unix_seconds(void)
         result;                                                 \
 })
 
-extern void portals_debug_msg(int subsys, int mask, char *file, const char *fn,
-                              const int line, unsigned long stack,
-                              char *format, ...)
-            __attribute__ ((format (printf, 7, 8)));
+struct libcfs_debug_msg_data {
+        cfs_debug_limit_state_t *msg_cdls;
+        int                      msg_subsys;
+        const char              *msg_file;
+        const char              *msg_fn;
+        int                      msg_line;
+};
+
+#define DEBUG_MSG_DATA_INIT(cdls, subsystem, file, func, ln ) { \
+        .msg_cdls           = (cdls),       \
+        .msg_subsys         = (subsystem),  \
+        .msg_file           = (file),       \
+        .msg_fn             = (func),       \
+        .msg_line           = (ln)          \
+    }
+
+
+extern int libcfs_debug_vmsg2(cfs_debug_limit_state_t *cdls,
+                              int subsys, int mask,
+                              const char *file, const char *fn, const int line, 
+                              const char *format1, va_list args, 
+                              const char *format2, ...);
+
+#define libcfs_debug_vmsg(cdls, subsys, mask, file, fn, line, format, args) \
+    libcfs_debug_vmsg2(cdls, subsys, mask, file, fn, line, format, args, NULL, NULL)
+
+#define libcfs_debug_msg(cdls, subsys, mask, file, fn, line, format, a...) \
+    libcfs_debug_vmsg2(cdls, subsys, mask, file, fn, line, NULL, NULL, format, ##a) 
+
+#define cdebug_va(cdls, mask, file, func, line, fmt, args)      do {            \
+        CHECK_STACK();                                                          \
+                                                                                \
+        if (((mask) & D_CANTMASK) != 0 ||                                       \
+            ((libcfs_debug & (mask)) != 0 &&                                    \
+             (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))                  \
+                libcfs_debug_vmsg(cdls, DEBUG_SUBSYSTEM, (mask),                \
+                                  (file), (func), (line), fmt, args);           \
+} while(0);
+
+#define cdebug(cdls, mask, file, func, line, fmt, a...) do {                    \
+        CHECK_STACK();                                                          \
+                                                                                \
+        if (((mask) & D_CANTMASK) != 0 ||                                       \
+            ((libcfs_debug & (mask)) != 0 &&                                    \
+             (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0))                  \
+                libcfs_debug_msg(cdls, DEBUG_SUBSYSTEM, (mask),                 \
+                                 (file), (func), (line), fmt, ## a);            \
+} while(0);
+
+extern void libcfs_assertion_failed(const char *expr, const char *file,
+                                    const char *fn, const int line);
 
 static inline void cfs_slow_warning(cfs_time_t now, int seconds, char *msg)
 {
@@ -490,10 +508,10 @@ static inline void cfs_slow_warning(cfs_time_t now, int seconds, char *msg)
  */
 static inline void cfs_fs_timeval(struct timeval *tv)
 {
-       cfs_fs_time_t time;
+        cfs_fs_time_t time;
 
-       cfs_fs_time_current(&time);
-       cfs_fs_time_usec(&time, tv);
+        cfs_fs_time_current(&time);
+        cfs_fs_time_usec(&time, tv);
 }
 
 /*
@@ -502,13 +520,13 @@ static inline void cfs_fs_timeval(struct timeval *tv)
  */
 static inline cfs_duration_t cfs_timeout_cap(cfs_duration_t timeout)
 {
-       if (timeout < cfs_time_minimal_timeout())
-               timeout = cfs_time_minimal_timeout();
-       return timeout;
+        if (timeout < CFS_TICK)
+                timeout = CFS_TICK;
+        return timeout;
 }
 
 /*
- * Portable memory allocator API (draft)
+ * Universal memory allocator API
  */
 enum cfs_alloc_flags {
         /* allocation is not allowed to block */
@@ -522,27 +540,124 @@ enum cfs_alloc_flags {
         CFS_ALLOC_FS     = (1 << 3),
         /* allocation is allowed to do io to free/clean memory */
         CFS_ALLOC_IO     = (1 << 4),
+        /* don't report allocation failure to the console */
+        CFS_ALLOC_NOWARN = (1 << 5),
         /* standard allocator flag combination */
         CFS_ALLOC_STD    = CFS_ALLOC_FS | CFS_ALLOC_IO,
         CFS_ALLOC_USER   = CFS_ALLOC_WAIT | CFS_ALLOC_FS | CFS_ALLOC_IO,
 };
 
-#define CFS_SLAB_ATOMIC         CFS_ALLOC_ATOMIC
-#define CFS_SLAB_WAIT           CFS_ALLOC_WAIT
-#define CFS_SLAB_ZERO           CFS_ALLOC_ZERO
-#define CFS_SLAB_FS             CFS_ALLOC_FS
-#define CFS_SLAB_IO             CFS_ALLOC_IO
-#define CFS_SLAB_STD            CFS_ALLOC_STD
-#define CFS_SLAB_USER           CFS_ALLOC_USER
-
 /* flags for cfs_page_alloc() in addition to enum cfs_alloc_flags */
-enum cfs_page_alloc_flags {
+enum cfs_alloc_page_flags {
         /* allow to return page beyond KVM. It has to be mapped into KVM by
          * cfs_page_map(); */
         CFS_ALLOC_HIGH   = (1 << 5),
         CFS_ALLOC_HIGHUSER = CFS_ALLOC_WAIT | CFS_ALLOC_FS | CFS_ALLOC_IO | CFS_ALLOC_HIGH,
 };
 
+/*
+ * portable UNIX device file identification. (This is not _very_
+ * portable. Probably makes no sense for Windows.)
+ */
+/*
+ * Platform defines
+ *
+ * cfs_rdev_t
+ */
+
+typedef unsigned int cfs_major_nr_t;
+typedef unsigned int cfs_minor_nr_t;
+
+/*
+ * Defined by platform.
+ */
+cfs_rdev_t     cfs_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor);
+cfs_major_nr_t cfs_rdev_major(cfs_rdev_t rdev);
+cfs_minor_nr_t cfs_rdev_minor(cfs_rdev_t rdev);
+
+/*
+ * Generic on-wire rdev format.
+ */
+
+typedef __u32 cfs_wire_rdev_t;
+
+cfs_wire_rdev_t cfs_wire_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor);
+cfs_major_nr_t  cfs_wire_rdev_major(cfs_wire_rdev_t rdev);
+cfs_minor_nr_t  cfs_wire_rdev_minor(cfs_wire_rdev_t rdev);
+
+/*
+ * Drop into debugger, if possible. Implementation is provided by platform.
+ */
+
+void cfs_enter_debugger(void);
+
+/*
+ * Defined by platform
+ */
+void cfs_daemonize(char *str);
+int cfs_daemonize_ctxt(char *str);
+cfs_sigset_t cfs_get_blocked_sigs(void);
+cfs_sigset_t cfs_block_allsigs(void);
+cfs_sigset_t cfs_block_sigs(cfs_sigset_t bits);
+void cfs_restore_sigs(cfs_sigset_t);
+int cfs_signal_pending(void);
+void cfs_clear_sigpending(void);
+/*
+ * XXX Liang:
+ * these macros should be removed in the future,
+ * we keep them just for keeping libcfs compatible
+ * with other branches.
+ */
+#define libcfs_daemonize(s)     cfs_daemonize(s)
+#define cfs_sigmask_lock(f)     do { f= 0; } while (0)
+#define cfs_sigmask_unlock(f)   do { f= 0; } while (0)
+
+int convert_server_error(__u64 ecode);
+int convert_client_oflag(int cflag, int *result);
+
+/*
+ * Stack-tracing filling.
+ */
+
+/*
+ * Platform-dependent data-type to hold stack frames.
+ */
+struct cfs_stack_trace;
+
+/*
+ * Fill @trace with current back-trace.
+ */
+void cfs_stack_trace_fill(struct cfs_stack_trace *trace);
+
+/*
+ * Return instruction pointer for frame @frame_no. NULL if @frame_no is
+ * invalid.
+ */
+void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no);
+
+/*
+ * Universal open flags.
+ */
+#define CFS_O_ACCMODE           0003
+#define CFS_O_CREAT             0100
+#define CFS_O_EXCL              0200
+#define CFS_O_NOCTTY            0400
+#define CFS_O_TRUNC             01000
+#define CFS_O_APPEND            02000
+#define CFS_O_NONBLOCK          04000
+#define CFS_O_NDELAY            CFS_O_NONBLOCK
+#define CFS_O_SYNC              010000
+#define CFS_O_ASYNC             020000
+#define CFS_O_DIRECT            040000
+#define CFS_O_LARGEFILE         0100000
+#define CFS_O_DIRECTORY         0200000
+#define CFS_O_NOFOLLOW          0400000
+#define CFS_O_NOATIME           01000000
+
+/* convert local open flags to universal open flags */
+int cfs_oflags2univ(int flags);
+/* convert universal open flags to local open flags */
+int cfs_univ2oflags(int flags);
 
 #define _LIBCFS_H
 
index 159cf57..072a7ad 100644 (file)
@@ -1,3 +1,3 @@
 EXTRA_DIST := kp30.h libcfs.h linux-fs.h linux-lock.h linux-mem.h      \
-       linux-prim.h linux-time.h lltrace.h portals_compat25.h          \
-       portals_lib.h portals_utils.h
+       linux-prim.h linux-time.h linux-tcpip.h lltrace.h               \
+       portals_compat25.h portals_utils.h
index d2329ba..be2cd34 100644 (file)
@@ -9,7 +9,9 @@
 #endif
 
 #ifdef __KERNEL__
+#ifdef HAVE_KERNEL_CONFIG_H
 # include <linux/config.h>
+#endif
 # include <linux/kernel.h>
 # include <linux/mm.h>
 # include <linux/string.h>
@@ -30,7 +32,7 @@
 # include <linux/highmem.h>
 # include <linux/module.h>
 # include <linux/version.h>
-# include <portals/p30.h>
+# include <lnet/lnet.h>
 # include <linux/smp_lock.h>
 # include <asm/atomic.h>
 # include <asm/uaccess.h>
 # include <linux/file.h>
 # include <linux/smp.h>
 # include <linux/ctype.h>
+# include <linux/compiler.h>
 # ifdef HAVE_MM_INLINE
 #  include <linux/mm_inline.h>
 # endif
 # if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
 #  include <linux/kallsyms.h>
+#  include <linux/moduleparam.h>
 # endif
 
 #include <libcfs/linux/portals_compat25.h>
@@ -88,28 +92,9 @@ static inline void our_cond_resched(void)
 #else
 #define LASSERT_SPIN_LOCKED(lock) do {} while(0)
 #endif
+#define LASSERT_SEM_LOCKED(sem) LASSERT(down_trylock(sem) != 0)
 
-#ifdef __arch_um__
-#define LBUG_WITH_LOC(file, func, line)                                 \
-do {                                                                    \
-        CEMERG("LBUG - trying to dump log to /tmp/lustre-log\n");       \
-        portals_catastrophe = 1;                                        \
-        portals_debug_dumplog();                                        \
-        portals_run_lbug_upcall(file, func, line);                      \
-        panic("LBUG");                                                  \
-} while (0)
-#else
-#define LBUG_WITH_LOC(file, func, line)                                 \
-do {                                                                    \
-        CEMERG("LBUG\n");                                               \
-        portals_catastrophe = 1;                                        \
-        portals_debug_dumpstack(NULL);                                  \
-        portals_debug_dumplog();                                        \
-        portals_run_lbug_upcall(file, func, line);                      \
-        set_task_state(current, TASK_UNINTERRUPTIBLE);                  \
-        schedule();                                                     \
-} while (0)
-#endif /* __arch_um__ */
+#define LIBCFS_PANIC(msg)            panic(msg)
 
 /* ------------------------------------------------------------------- */
 
@@ -137,6 +122,24 @@ do {                                                                    \
 #endif
 
 /******************************************************************************/
+/* Module parameter support */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+# define CFS_MODULE_PARM(name, t, type, perm, desc) \
+        MODULE_PARM(name, t);\
+        MODULE_PARM_DESC(name, desc)
+
+#else
+# define CFS_MODULE_PARM(name, t, type, perm, desc) \
+        module_param(name, type, perm);\
+        MODULE_PARM_DESC(name, desc)
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9))
+# define CFS_SYSFS_MODULE_PARM  0 /* no sysfs module parameters */
+#else
+# define CFS_SYSFS_MODULE_PARM  1 /* module parameters accessible via sysfs */
+#endif
+/******************************************************************************/
 
 #if (__GNUC__)
 /* Use the special GNU C __attribute__ hack to have the compiler check the
@@ -157,14 +160,22 @@ do {                                                                    \
 #else  /* !__KERNEL__ */
 # include <stdio.h>
 # include <stdlib.h>
-#ifndef __CYGWIN__
-# include <stdint.h>
-#else
+#ifdef CRAY_XT3
+# include <ioctl.h>
+#elif defined(__CYGWIN__)
 # include <cygwin-ioctl.h>
+#else
+# include <stdint.h>
 #endif
 # include <unistd.h>
 # include <time.h>
 # include <limits.h>
+# include <errno.h>
+# include <sys/ioctl.h>                         /* for _IOWR */
+
+# define CFS_MODULE_PARM(name, t, type, perm, desc)
+#define PORTAL_SYMBOL_GET(x) inter_module_get(#x)
+#define PORTAL_SYMBOL_PUT(x) inter_module_put(#x)
 
 #endif /* End of !__KERNEL__ */
 
@@ -175,7 +186,7 @@ do {                                                                    \
 
 #define LWT_MEMORY   (16<<20)
 
-#if !KLWT_SUPPORT
+#ifndef KLWT_SUPPORT
 # if defined(__KERNEL__)
 #  if !defined(BITS_PER_LONG)
 #   error "BITS_PER_LONG not defined"
@@ -225,7 +236,7 @@ extern lwt_cpu_t lwt_cpus[];
 
 #define LWTSTR(n)       #n
 #define LWTWHERE(f,l)   f ":" LWTSTR(l)
-#define LWT_EVENTS_PER_PAGE (PAGE_SIZE / sizeof (lwt_event_t))
+#define LWT_EVENTS_PER_PAGE (CFS_PAGE_SIZE / sizeof (lwt_event_t))
 
 #define LWT_EVENT(p1, p2, p3, p4)                                       \
 do {                                                                    \
@@ -276,7 +287,7 @@ extern int  lwt_snapshot (cycles_t *now, int *ncpu, int *total_size,
 
 /* ------------------------------------------------------------------ */
 
-#define IOCTL_PORTAL_TYPE long
+#define IOCTL_LIBCFS_TYPE long
 
 #ifdef __CYGWIN__
 # ifndef BITS_PER_LONG
@@ -298,23 +309,26 @@ extern int  lwt_snapshot (cycles_t *now, int *ncpu, int *total_size,
 # define LP_POISON ((void *)(long)0x5a5a5a5a)
 #endif
 
-#if defined(__x86_64__) && defined(__KERNEL__)
+#if (defined(__x86_64__) && defined(__KERNEL__))
 /* x86_64 defines __u64 as "long" in userspace, but "long long" in the kernel */
 # define LPU64 "%Lu"
 # define LPD64 "%Ld"
 # define LPX64 "%#Lx"
+# define LPF64 "L"
 # define LPSZ  "%lu"
 # define LPSSZ "%ld"
 #elif (BITS_PER_LONG == 32 || __WORDSIZE == 32)
 # define LPU64 "%Lu"
 # define LPD64 "%Ld"
 # define LPX64 "%#Lx"
+# define LPF64 "L"
 # define LPSZ  "%u"
 # define LPSSZ "%d"
 #elif (BITS_PER_LONG == 64 || __WORDSIZE == 64)
 # define LPU64 "%lu"
 # define LPD64 "%ld"
 # define LPX64 "%#lx"
+# define LPF64 "l"
 # define LPSZ  "%lu"
 # define LPSSZ "%ld"
 #endif
index 1e82343..0aac919 100644 (file)
@@ -8,28 +8,18 @@
 #error Do not #include this file directly. #include <libcfs/libcfs.h> instead
 #endif
 
+#include <stdarg.h>
 #include <libcfs/linux/linux-mem.h>
 #include <libcfs/linux/linux-time.h>
 #include <libcfs/linux/linux-prim.h>
 #include <libcfs/linux/linux-lock.h>
 #include <libcfs/linux/linux-fs.h>
+#include <libcfs/linux/linux-tcpip.h>
 
 #ifdef HAVE_ASM_TYPES_H
 #include <asm/types.h>
 #else
-/* this is actually coming from within lustre, a layering violation.
- * we may not even need it, as libuptlctl (the dependency for which it
- * is needed in liblustre building on catamount, bug 6923) shows no
- * apparent need to be included in liblustre AFAICS.  The change of
- * include to lustre/types.h only makes this explicit instead of implicit.
- * To be resolved.  For now, make it CRAY_PORTALS only, to avoid breaking
- * non-b1_4 branches that don't have this file.
- */
-# if CRAY_PORTALS
-#  include <lustre/types.h>
-# else
-#  include "types.h"
-# endif
+#include <libcfs/types.h>
 #endif
 
 
@@ -99,49 +89,40 @@ struct ptldebug_header {
 
 #define LUSTRE_TRACE_SIZE (THREAD_SIZE >> 5)
 
-#ifdef __KERNEL__
+#if defined(__KERNEL__) && !defined(__x86_64__)
 # ifdef  __ia64__
-#  define CDEBUG_STACK (THREAD_SIZE -                                      \
-                        ((unsigned long)__builtin_dwarf_cfa() &            \
-                         (THREAD_SIZE - 1)))
+#  define CDEBUG_STACK() (THREAD_SIZE -                                 \
+                          ((unsigned long)__builtin_dwarf_cfa() &       \
+                           (THREAD_SIZE - 1)))
 # else
-#  define CDEBUG_STACK (THREAD_SIZE -                                      \
-                        ((unsigned long)__builtin_frame_address(0) &       \
-                         (THREAD_SIZE - 1)))
+#  define CDEBUG_STACK() (THREAD_SIZE -                                 \
+                          ((unsigned long)__builtin_frame_address(0) &  \
+                           (THREAD_SIZE - 1)))
 # endif /* __ia64__ */
 
-#define CHECK_STACK(stack)                                                    \
-        do {                                                                  \
-                if ((stack) > 3*THREAD_SIZE/4 && (stack) > portal_stack) {    \
-                        portals_debug_msg(DEBUG_SUBSYSTEM, D_WARNING,         \
-                                          __FILE__, __FUNCTION__, __LINE__,   \
-                                          (stack),"maximum lustre stack %u\n",\
-                                          portal_stack = (stack));            \
-                      /*panic("LBUG");*/                                      \
-                }                                                             \
-        } while (0)
+#define __CHECK_STACK(file, func, line)                                 \
+do {                                                                    \
+        unsigned long _stack = CDEBUG_STACK();                          \
+                                                                        \
+        if (_stack > 3*THREAD_SIZE/4 && _stack > libcfs_stack) {        \
+                libcfs_stack = _stack;                                  \
+                libcfs_debug_msg(NULL, DEBUG_SUBSYSTEM, D_WARNING,      \
+                                 file, func, line,                      \
+                                 "maximum lustre stack %lu\n", _stack); \
+              /*panic("LBUG");*/                                        \
+        }                                                               \
+} while (0)
+#define CHECK_STACK()     __CHECK_STACK(__FILE__, __func__, __LINE__)
 #else /* !__KERNEL__ */
-#define CHECK_STACK(stack) do { } while(0)
-#define CDEBUG_STACK (0L)
+#define __CHECK_STACK(X, Y, Z) do { } while(0)
+#define CHECK_STACK() do { } while(0)
+#define CDEBUG_STACK() (0L)
 #endif /* __KERNEL__ */
 
 /* initial pid  */
-# if CRAY_PORTALS
-/*
- * 1) ptl_pid_t in cray portals is only 16 bits, not 32 bits, therefore this
- *    is too big.
- *
- * 2) the implementation of ernal in cray portals further restricts the pid
- *    space that may be used to 0 <= pid <= 255 (an 8 bit value).  Returns
- *    an error at nal init time for any pid outside this range.  Other nals
- *    in cray portals don't have this restriction.
- * */
-#define LUSTRE_PTL_PID          9
-# else
-#define LUSTRE_PTL_PID          12345
-# endif
+#define LUSTRE_LNET_PID          12345
 
-#define ENTRY_NESTING_SUPPORT (0)
+#define ENTRY_NESTING_SUPPORT (1)
 #define ENTRY_NESTING   do {;} while (0)
 #define EXIT_NESTING   do {;} while (0)
 #define __current_nesting_level() (0)
@@ -160,4 +141,17 @@ typedef kernel_cap_t cfs_kernel_cap_t;
 typedef __u32 cfs_kernel_cap_t;
 #endif
 
+#if defined(__KERNEL__)
+/*
+ * No stack-back-tracing in Linux for now.
+ */
+struct cfs_stack_trace {
+};
+
+#ifndef WITH_WATCHDOG
+#define WITH_WATCHDOG
+#endif
+
+#endif
+
 #endif /* _LINUX_LIBCFS_H */
index 9530360..3ba5461 100644 (file)
 #include <linux/fs.h>
 #include <linux/stat.h>
 #include <linux/mount.h>
-#endif
+#else /* !__KERNEL__ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <mntent.h>
+#endif  /* __KERNEL__ */
 
 typedef struct file cfs_file_t;
 typedef struct dentry cfs_dentry_t;
@@ -55,15 +64,23 @@ cfs_file_t *cfs_filp_open (const char *name, int flags, int mode, int *err);
 #define cfs_put_file(f)                     fput(f)
 #define cfs_file_count(f)                   file_count(f)
 
-typedef struct file_lock cfs_flock_t; 
-#define CFS_FLOCK_TYPE(fl)                  ((fl)->fl_type)
-#define CFS_FLOCK_SET_TYPE(fl, type)        do { (fl)->fl_type = (type); } while(0)
-#define CFS_FLOCK_PID(fl)                   ((fl)->fl_pid)
-#define CFS_FLOCK_SET_PID(fl, pid)          do { (fl)->fl_pid = (pid); } while(0)
-#define CFS_FLOCK_START(fl)                 ((fl)->fl_start)
-#define CFS_FLOCK_SET_START(fl, start)      do { (fl)->fl_start = (start); } while(0)
-#define CFS_FLOCK_END(fl)                   ((fl)->fl_end)
-#define CFS_FLOCK_SET_END(fl, end)          do { (fl)->fl_end = (end); } while(0)
+typedef struct file_lock cfs_flock_t;
+#define cfs_flock_type(fl)                  ((fl)->fl_type)
+#define cfs_flock_set_type(fl, type)        do { (fl)->fl_type = (type); } while(0)
+#define cfs_flock_pid(fl)                   ((fl)->fl_pid)
+#define cfs_flock_set_pid(fl, pid)          do { (fl)->fl_pid = (pid); } while(0)
+#define cfs_flock_start(fl)                 ((fl)->fl_start)
+#define cfs_flock_set_start(fl, start)      do { (fl)->fl_start = (start); } while(0)
+#define cfs_flock_end(fl)                   ((fl)->fl_end)
+#define cfs_flock_set_end(fl, end)          do { (fl)->fl_end = (end); } while(0)
+
+ssize_t cfs_user_write (cfs_file_t *filp, const char *buf, size_t count, loff_t *offset);
+
+/*
+ * portable UNIX device file identification.
+ */
+
+typedef dev_t cfs_rdev_t;
 
 #endif
 
index ce097e9..f419c9b 100644 (file)
@@ -74,7 +74,7 @@
  */
 
 /*
- * mutex_t:
+ * mutex:
  *
  * - init_mutex(x)
  * - init_mutex_locked(x)
  * - wait_for_completion(c)
  */
 
-/*
- * OSX funnels:
- *
- * No funnels needed in Linux
- */
-#define CFS_DECL_FUNNEL_DATA
-#define CFS_DECL_CONE_DATA             DECLARE_FUNNEL_DATA
-#define CFS_DECL_NET_DATA               DECLARE_FUNNEL_DATA
-#define CFS_CONE_IN                    do {} while(0)
-#define CFS_CONE_EX                    do {} while(0)
-
-#define CFS_NET_IN                      do {} while(0)
-#define CFS_NET_EX                      do {} while(0)
-
 /* __KERNEL__ */
 #else
 
-//#include "../user-lock.h"
+#include "../user-lock.h"
 
 /* __KERNEL__ */
 #endif
index 94b764f..7591213 100644 (file)
 typedef struct page                     cfs_page_t;
 #define CFS_PAGE_SIZE                   PAGE_CACHE_SIZE
 #define CFS_PAGE_SHIFT                  PAGE_CACHE_SHIFT
-#define CFS_PAGE_MASK                   PAGE_CACHE_MASK
+#define CFS_PAGE_MASK                   (~((__u64)CFS_PAGE_SIZE-1))
 
-cfs_page_t *cfs_alloc_pages(unsigned int flags, unsigned int order);
-#define cfs_alloc_page(f)              cfs_alloc_pages(f, 0)
-#define cfs_free_pages(p, o)           __free_pages(p, o)
+cfs_page_t *cfs_alloc_page(unsigned int flags);
 #define cfs_free_page(p)               __free_pages(p, 0)
 
 static inline void *cfs_page_address(cfs_page_t *page)
 {
+        /*
+         * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+         * from here: this will lead to infinite recursion.
+         */
         return page_address(page);
 }
 
@@ -73,13 +75,11 @@ static inline int cfs_page_count(cfs_page_t *page)
         return page_count(page);
 }
 
-static inline void cfs_set_page_count(cfs_page_t *page, int v)
-{
-        set_page_count(page, v);
-}
+#define cfs_page_index(p)       ((p)->index)
 
 /*
  * Memory allocator
+ * XXX Liang: move these declare to public file
  */
 extern void *cfs_alloc(size_t nr_bytes, u_int32_t flags);
 extern void  cfs_free(void *addr);
@@ -88,12 +88,17 @@ extern void *cfs_alloc_large(size_t nr_bytes);
 extern void  cfs_free_large(void *addr);
 
 /*
+ * In Linux there is no way to determine whether current execution context is
+ * blockable.
+ */
+#define CFS_ALLOC_ATOMIC_TRY   CFS_ALLOC_ATOMIC
+
+/*
  * SLAB allocator
+ * XXX Liang: move these declare to public file
  */
 typedef kmem_cache_t    cfs_mem_cache_t;
-extern cfs_mem_cache_t * cfs_mem_cache_create (const char *, size_t, size_t, unsigned long,
-                                               void (*)(void *, cfs_mem_cache_t *, unsigned long),
-                                               void (*)(void *, cfs_mem_cache_t *, unsigned long));
+extern cfs_mem_cache_t * cfs_mem_cache_create (const char *, size_t, size_t, unsigned long);
 extern int cfs_mem_cache_destroy ( cfs_mem_cache_t * );
 extern void *cfs_mem_cache_alloc ( cfs_mem_cache_t *, int);
 extern void cfs_mem_cache_free ( cfs_mem_cache_t *, void *);
@@ -104,6 +109,12 @@ extern void cfs_mem_cache_free ( cfs_mem_cache_t *, void *);
 #define CFS_MMSPACE_OPEN                do { __oldfs = get_fs(); set_fs(get_ds());} while(0)
 #define CFS_MMSPACE_CLOSE               set_fs(__oldfs)
 
+#else   /* !__KERNEL__ */
+#ifdef HAVE_ASM_PAGE_H
+#include <asm/page.h>           /* needed for PAGE_SIZE - rread */
+#endif
+
+#include <libcfs/user-prim.h>
 /* __KERNEL__ */
 #endif
 
index 69bda36..41eeb8a 100644 (file)
@@ -30,7 +30,9 @@
 #endif
 
 #ifdef __KERNEL__
+#ifdef HAVE_KERNEL_CONFIG_H
 #include <linux/config.h>
+#endif
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/version.h>
@@ -57,8 +59,27 @@ typedef struct miscdevice            cfs_psdev_t;
 typedef struct ctl_table               cfs_sysctl_table_t;
 typedef struct ctl_table_header                cfs_sysctl_table_header_t;
 
-#define register_cfs_sysctl_table(t, a)        register_sysctl_table(t, a)
-#define unregister_cfs_sysctl_table(t) unregister_sysctl_table(t, a)
+#define cfs_register_sysctl_table(t, a)        register_sysctl_table(t, a)
+#define cfs_unregister_sysctl_table(t) unregister_sysctl_table(t, a)
+
+/*
+ * Symbol register
+ */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#define cfs_symbol_register(s, p)       inter_module_register(s, THIS_MODULE, p)
+#define cfs_symbol_unregister(s)        inter_module_unregister(s)
+#define cfs_symbol_get(s)               inter_module_get(s)
+#define cfs_symbol_put(s)               inter_module_put(s)
+#define cfs_module_get()                MOD_INC_USE_COUNT
+#define cfs_module_put()                MOD_DEC_USE_COUNT
+#else
+#define cfs_symbol_register(s, p)       do {} while(0)
+#define cfs_symbol_unregister(s)        do {} while(0)
+#define cfs_symbol_get(s)               symbol_get(s)
+#define cfs_symbol_put(s)               symbol_put(s)
+#define cfs_module_get()                try_module_get(THIS_MODULE)
+#define cfs_module_put()                module_put(THIS_MODULE)
+#endif
 
 /*
  * Proc file system APIs
@@ -73,21 +94,28 @@ typedef struct proc_dir_entry           cfs_proc_dir_entry_t;
 /*
  * Wait Queue
  */
+#define CFS_TASK_INTERRUPTIBLE          TASK_INTERRUPTIBLE
+#define CFS_TASK_UNINT                  TASK_UNINTERRUPTIBLE
+
 typedef wait_queue_t                   cfs_waitlink_t;
 typedef wait_queue_head_t              cfs_waitq_t;
 
-#define cfs_waitq_init(w)              init_waitqueue_head(w)
-#define cfs_waitlink_init(l)           init_waitqueue_entry(l, current)
-#define cfs_waitq_add(w, l)            add_wait_queue(w, l)
-#define cfs_waitq_add_exclusive(w, l)  add_wait_queue_exclusive(w, l)
+typedef long                            cfs_task_state_t;
+
+#define cfs_waitq_init(w)               init_waitqueue_head(w)
+#define cfs_waitlink_init(l)            init_waitqueue_entry(l, current)
+#define cfs_waitq_add(w, l)             add_wait_queue(w, l)
+#define cfs_waitq_add_exclusive(w, l)   add_wait_queue_exclusive(w, l)
 #define cfs_waitq_forward(l, w)         do {} while(0)
-#define cfs_waitq_del(w, l)            remove_wait_queue(w, l)
-#define cfs_waitq_active(w)            waitqueue_active(w)
-#define cfs_waitq_signal(w)            wake_up(w)
-#define cfs_waitq_signal_nr(w,n)       wake_up_nr(w, n)
-#define cfs_waitq_broadcast(w)         wake_up_all(w)
-#define cfs_waitq_wait(l)              schedule()
-#define cfs_waitq_timedwait(l, t)      schedule_timeout(t)
+#define cfs_waitq_del(w, l)             remove_wait_queue(w, l)
+#define cfs_waitq_active(w)             waitqueue_active(w)
+#define cfs_waitq_signal(w)             wake_up(w)
+#define cfs_waitq_signal_nr(w,n)        wake_up_nr(w, n)
+#define cfs_waitq_broadcast(w)          wake_up_all(w)
+#define cfs_waitq_wait(l, s)            schedule()
+#define cfs_waitq_timedwait(l, s, t)    schedule_timeout(t)
+#define cfs_schedule_timeout(s, t)      schedule_timeout(t)
+#define cfs_schedule()                  schedule()
 
 /* Kernel thread */
 typedef int (*cfs_thread_t)(void *);
@@ -98,6 +126,8 @@ typedef int (*cfs_thread_t)(void *);
  */
 typedef struct task_struct              cfs_task_t;
 #define cfs_current()                   current
+#define cfs_task_lock(t)                task_lock(t)
+#define cfs_task_unlock(t)              task_unlock(t)
 #define CFS_DECL_JOURNAL_DATA           void *journal_info
 #define CFS_PUSH_JOURNAL                do {    \
         journal_info = current->journal_info;   \
@@ -115,14 +145,7 @@ module_exit(fini)
 /*
  * Signal
  */
-#define cfs_sigmask_lock(t, f)          SIGNAL_MASK_LOCK(t, f)
-#define cfs_sigmask_unlock(t, f)        SIGNAL_MASK_UNLOCK(t, f)
-#define cfs_recalc_sigpending(t)        RECALC_SIGPENDING
-#define cfs_signal_pending(t)           signal_pending(t)
-#define cfs_sigfillset(s)               sigfillset(s)
-
-#define cfs_set_sig_blocked(t, b)       do { (t)->blocked = b; } while(0)
-#define cfs_get_sig_blocked(t)          (&(t)->blocked)
+typedef sigset_t                        cfs_sigset_t;
 
 /*
  * Timer
@@ -164,8 +187,17 @@ static inline cfs_time_t cfs_timer_deadline(cfs_timer_t *t)
         return t->expires;
 }
 
+
+/* deschedule for a bit... */
+static inline void cfs_pause(cfs_duration_t ticks)
+{
+        set_current_state(TASK_UNINTERRUPTIBLE);
+        schedule_timeout(ticks);
+}
+
 #else   /* !__KERNEL__ */
 
+typedef struct proc_dir_entry           cfs_proc_dir_entry_t;
 #include "../user-prim.h"
 
 #endif /* __KERNEL__ */
diff --git a/lnet/include/libcfs/linux/linux-tcpip.h b/lnet/include/libcfs/linux/linux-tcpip.h
new file mode 100644 (file)
index 0000000..2d14904
--- /dev/null
@@ -0,0 +1,62 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines. 
+ *
+ */
+
+#ifndef __LIBCFS_LINUX_CFS_TCP_H__
+#define __LIBCFS_LINUX_CFS_TCP_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <libcfs/libcfs.h> instead
+#endif
+
+#ifdef __KERNEL__
+#include <net/sock.h>
+
+typedef struct socket   cfs_socket_t;
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72))
+# define sk_allocation  allocation
+# define sk_data_ready  data_ready
+# define sk_write_space write_space
+# define sk_user_data   user_data
+# define sk_prot        prot
+# define sk_sndbuf      sndbuf
+# define sk_rcvbuf      rcvbuf
+# define sk_socket      socket
+# define sk_sleep       sleep
+#endif
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
+# define sk_wmem_queued wmem_queued
+# define sk_err         err
+# define sk_route_caps  route_caps
+#endif
+
+#define SOCK_SNDBUF(so)         ((so)->sk->sk_sndbuf)
+#define SOCK_WMEM_QUEUED(so)    ((so)->sk->sk_wmem_queued)
+#define SOCK_ERROR(so)          ((so)->sk->sk_err)
+#define SOCK_TEST_NOSPACE(so)   test_bit(SOCK_NOSPACE, &(so)->flags)
+
+#endif
+
+#endif
index f18e7d9..7135218 100644 (file)
  *  int            cfs_fs_time_before (cfs_fs_time_t *, cfs_fs_time_t *);
  *  int            cfs_fs_time_beforeq(cfs_fs_time_t *, cfs_fs_time_t *);
  *
- *  cfs_duration_t cfs_time_minimal_timeout(void)
- *
  *  CFS_TIME_FORMAT
  *  CFS_DURATION_FORMAT
  *
  */
 
 #define ONE_BILLION ((u_int64_t)1000000000)
-#define ONE_MILLION ((u_int64_t)   1000000)
+#define ONE_MILLION 1000000
 
 #ifdef __KERNEL__
-
+#ifdef HAVE_KERNEL_CONFIG_H
 #include <linux/config.h>
+#endif
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/version.h>
@@ -106,15 +105,15 @@ static inline void cfs_fs_time_nsec(cfs_fs_time_t *t, struct timespec *s)
 /*
  * internal helper function used by cfs_fs_time_before*()
  */
-static inline unsigned long __cfs_fs_time_flat(cfs_fs_time_t *t)
+static inline unsigned long long __cfs_fs_time_flat(cfs_fs_time_t *t)
 {
-        return ((unsigned long)t->tv_sec) * ONE_MILLION + t->tv_usec * 1000;
+        return (unsigned long long)t->tv_sec * ONE_MILLION + t->tv_usec;
 }
 
 #define CURRENT_KERN_TIME        xtime
 
-/* (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) */
 #else
+/* (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) */
 
 /*
  * post 2.5 kernels.
@@ -138,9 +137,9 @@ static inline void cfs_fs_time_nsec(cfs_fs_time_t *t, struct timespec *s)
 /*
  * internal helper function used by cfs_fs_time_before*()
  */
-static inline unsigned long __cfs_fs_time_flat(cfs_fs_time_t *t)
+static inline unsigned long long __cfs_fs_time_flat(cfs_fs_time_t *t)
 {
-        return ((unsigned long)t->tv_sec) * ONE_BILLION + t->tv_nsec;
+        return (unsigned long long)t->tv_sec * ONE_BILLION + t->tv_nsec;
 }
 
 #define CURRENT_KERN_TIME        CURRENT_TIME
@@ -198,12 +197,12 @@ static inline time_t cfs_fs_time_sec(cfs_fs_time_t *t)
 
 static inline int cfs_fs_time_before(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
 {
-        return time_before(__cfs_fs_time_flat(t1), __cfs_fs_time_flat(t2));
+        return __cfs_fs_time_flat(t1) <  __cfs_fs_time_flat(t2);
 }
 
 static inline int cfs_fs_time_beforeq(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
 {
-        return time_before_eq(__cfs_fs_time_flat(t1), __cfs_fs_time_flat(t2));
+        return __cfs_fs_time_flat(t1) <= __cfs_fs_time_flat(t2);
 }
 
 #if 0
@@ -224,12 +223,7 @@ static inline cfs_duration_t cfs_duration_build(int64_t nano)
 
 static inline cfs_duration_t cfs_time_seconds(int seconds)
 {
-        return seconds * HZ;
-}
-
-static inline cfs_time_t cfs_time_shift(int seconds)
-{
-        return jiffies + seconds * HZ;
+        return ((cfs_duration_t)seconds) * HZ;
 }
 
 static inline time_t cfs_duration_sec(cfs_duration_t d)
@@ -239,34 +233,64 @@ static inline time_t cfs_duration_sec(cfs_duration_t d)
 
 static inline void cfs_duration_usec(cfs_duration_t d, struct timeval *s)
 {
-#if (BITS_PER_LONG == 32)
-        uint64_t t = (d - s->tv_sec * HZ) * ONE_MILLION;
+#if (BITS_PER_LONG == 32) && (HZ > 4096)
+        uint64_t t;
+
+        s->tv_sec = d / HZ;
+        t = (d - (cfs_duration_t)s->tv_sec * HZ) * ONE_MILLION;
         s->tv_usec = do_div (t, HZ);
 #else
-        s->tv_usec = (d - s->tv_sec * HZ) * ONE_MILLION / HZ;
-#endif
         s->tv_sec = d / HZ;
+        s->tv_usec = ((d - (cfs_duration_t)s->tv_sec * HZ) * ONE_MILLION) / HZ;
+#endif
 }
 
 static inline void cfs_duration_nsec(cfs_duration_t d, struct timespec *s)
 {
 #if (BITS_PER_LONG == 32)
-        uint64_t t = (d - s->tv_sec * HZ) * ONE_BILLION;
+        uint64_t t;
+
+        s->tv_sec = d / HZ;
+        t = (d - s->tv_sec * HZ) * ONE_BILLION;
         s->tv_nsec = do_div (t, HZ);
 #else
-        s->tv_nsec = (d - s->tv_sec * HZ) * ONE_BILLION / HZ;
-#endif
         s->tv_sec = d / HZ;
+        s->tv_nsec = ((d - s->tv_sec * HZ) * ONE_BILLION) / HZ;
+#endif
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
+
+#define cfs_time_current_64 get_jiffies_64
+
+static inline __u64 cfs_time_add_64(__u64 t, __u64 d)
+{
+        return t + d;
+}
+
+static inline __u64 cfs_time_shift_64(int seconds)
+{
+        return cfs_time_add_64(cfs_time_current_64(),
+                               cfs_time_seconds(seconds));
 }
 
-static inline cfs_duration_t cfs_time_minimal_timeout(void)
+static inline int cfs_time_before_64(__u64 t1, __u64 t2)
 {
-        return 1;
+        return (__s64)t2 - (__s64)t1 > 0;
 }
 
-/* inline function cfs_time_minimal_timeout() can not be used
- * to initiallize static variable */
-#define CFS_MIN_DELAY           (1)
+#else
+#define cfs_time_current_64 cfs_time_current
+#define cfs_time_add_64     cfs_time_add
+#define cfs_time_shift_64   cfs_time_shift
+#define cfs_time_before_64  cfs_time_before
+
+#endif
+
+/*
+ * One jiffy
+ */
+#define CFS_TICK                (1)
 
 #define CFS_TIME_T              "%lu"
 #define CFS_DURATION_T          "%ld"
index 5050abc..1ddd03d 100644 (file)
@@ -18,9 +18,9 @@
 #include <fcntl.h>
 #include <unistd.h>
 #include <sys/time.h>
-#include <portals/types.h>
+#include <lnet/types.h>
 #include <libcfs/kp30.h>
-#include <portals/ptlctl.h>
+#include <lnet/lnetctl.h>
 #include <linux/limits.h>
 #include <asm/page.h>
 #include <linux/version.h>
index 31658d5..657c011 100644 (file)
@@ -5,7 +5,7 @@
 #define __LIBCFS_LINUX_PORTALS_COMPAT_H__
 
 // XXX BUG 1511 -- remove this stanza and all callers when bug 1511 is resolved
-#if SPINLOCK_DEBUG
+#if defined(SPINLOCK_DEBUG) && SPINLOCK_DEBUG
 # if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) || defined(CONFIG_RH_2_4_20)
 #  define SIGNAL_MASK_ASSERT() \
    LASSERT(current->sighand->siglock.magic == SPINLOCK_MAGIC)
@@ -44,6 +44,8 @@
 # define RECALC_SIGPENDING         recalc_sigpending()
 # define CLEAR_SIGPENDING          (current->sigpending = 0)
 # define CURRENT_SECONDS           CURRENT_TIME
+# define wait_event_interruptible_exclusive(wq, condition)              \
+        wait_event_interruptible(wq, condition)
 
 #else /* 2.4.x */
 
@@ -56,6 +58,8 @@
 # define RECALC_SIGPENDING         recalc_sigpending(current)
 # define CLEAR_SIGPENDING          (current->sigpending = 0)
 # define CURRENT_SECONDS           CURRENT_TIME
+# define wait_event_interruptible_exclusive(wq, condition)              \
+        wait_event_interruptible(wq, condition)
 
 #endif
 
 #endif
 
 #ifndef HAVE_CPU_ONLINE
-#define cpu_online(cpu) test_bit(cpu, &(cpu_online_map))
+#define cpu_online(cpu) ((1<<cpu) & (cpu_online_map))
 #endif
 #ifndef HAVE_CPUMASK_T
-#define cpu_set(cpu, map) set_bit(cpu, &(map))
 typedef unsigned long cpumask_t;
+#define cpu_set(cpu, map) set_bit(cpu, &(map))
+#define cpus_clear(map) memset(&(map), 0, sizeof(cpumask_t))
+#endif
+
+#ifndef __user
+#define __user
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8)
+#define ll_proc_dointvec(table, write, filp, buffer, lenp, ppos)        \
+        proc_dointvec(table, write, filp, buffer, lenp)
+#define ll_proc_dostring(table, write, filp, buffer, lenp, ppos)        \
+        proc_dostring(table, write, filp, buffer, lenp)
+#define LL_PROC_PROTO(name)                                             \
+        name(ctl_table *table, int write, struct file *filp,            \
+             void __user *buffer, size_t *lenp)
+#else
+#define ll_proc_dointvec(table, write, filp, buffer, lenp, ppos)        \
+        proc_dointvec(table, write, filp, buffer, lenp, ppos);
+#define ll_proc_dostring(table, write, filp, buffer, lenp, ppos)        \
+        proc_dostring(table, write, filp, buffer, lenp, ppos);
+#define LL_PROC_PROTO(name)                                             \
+        name(ctl_table *table, int write, struct file *filp,            \
+             void __user *buffer, size_t *lenp, loff_t *ppos)
 #endif
 
 #endif /* _PORTALS_COMPAT_H */
index 0dd6c7e..ae319af 100644 (file)
@@ -14,7 +14,7 @@
 #include <linux/smp_lock.h>
 #include <linux/poll.h>
 #include <linux/random.h>
-                                                                                                                                                                      
+
 #include <asm/unistd.h>
 #include <asm/semaphore.h>
 
 
 #include <endian.h>
 #include <libcfs/list.h>
-                                                                                                                                                                      
+
 #ifdef HAVE_LINUX_VERSION_H
 # include <linux/version.h>
-                                                                                                                                                                      
+
 # if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 #  define BUG()                            /* workaround for module.h includes */
 #  include <linux/module.h>
 # endif
 #endif /* !HAVE_LINUX_VERSION_H */
-                                                                                                                                                                      
+
 #ifndef __CYGWIN__
 # include <syscall.h>
 #else /* __CYGWIN__ */
@@ -47,5 +47,5 @@
 # include <netinet/in.h>
 #endif /* __CYGWIN__ */
 
-#endif /* !__KERNEL__ */
+#endif /* !__KERNEL__ */
 #endif
index 5520f75..5c27071 100644 (file)
@@ -9,6 +9,13 @@
 #define CFS_LIST_HEAD(n)               LIST_HEAD(n)
 #define CFS_INIT_LIST_HEAD(p)          INIT_LIST_HEAD(p)
 
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#define CFS_HLIST_HEAD_INIT            HLIST_HEAD_INIT
+#define CFS_HLIST_HEAD(n)              HLIST_HEAD(n)
+#define CFS_INIT_HLIST_HEAD(p)         INIT_HLIST_HEAD(p)
+#define CFS_INIT_HLIST_NODE(p)         INIT_HLIST_NODE(p)
+#endif
+
 #else /* !defined (__linux__) || !defined(__KERNEL__) */
 
 /*
  * using the generic single-entry routines.
  */
 
+#ifndef __WINNT__
 #define prefetch(a) ((void)a)
+#else
+#define prefetch(a) ((void *)a)
+#endif
 
 struct list_head {
        struct list_head *next, *prev;
@@ -124,6 +135,8 @@ static inline void list_del_init(struct list_head *entry)
  * list_move - delete from one list and add as another's head
  * @list: the entry to move
  * @head: the head that will precede our entry
+ *
+ * This is not safe to use if @list is already on the same list as @head.
  */
 static inline void list_move(struct list_head *list, struct list_head *head)
 {
@@ -135,6 +148,8 @@ static inline void list_move(struct list_head *list, struct list_head *head)
  * list_move_tail - delete from one list and add as another's tail
  * @list: the entry to move
  * @head: the head that will follow our entry
+ *
+ * This is not safe to use if @list is already on the same list as @head.
  */
 static inline void list_move_tail(struct list_head *list,
                                  struct list_head *head)
@@ -221,7 +236,164 @@ static inline void list_splice_init(struct list_head *list,
        for (pos = (head)->next, n = pos->next; pos != (head); \
                pos = n, n = pos->next)
 
-#endif /* __linux__*/
+/*
+ * Double linked lists with a single pointer list head.
+ * Mostly useful for hash tables where the two pointer list head is
+ * too wasteful.
+ * You lose the ability to access the tail in O(1).
+ */
+
+struct hlist_head {
+       struct hlist_node *first;
+};
+
+struct hlist_node {
+       struct hlist_node *next, **pprev;
+};
+
+/*
+ * "NULL" might not be defined at this point
+ */
+#ifdef NULL
+#define NULL_P NULL
+#else
+#define NULL_P ((void *)0)
+#endif
+
+#define CFS_HLIST_HEAD_INIT { .first = NULL_P }
+#define CFS_HLIST_HEAD(name) struct hlist_head name = {  .first = NULL_P }
+#define CFS_INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL_P)
+#define CFS_INIT_HLIST_NODE(ptr) ((ptr)->next = NULL_P, (ptr)->pprev = NULL_P)
+
+#define HLIST_HEAD_INIT                CFS_HLIST_HEAD_INIT
+#define HLIST_HEAD(n)          CFS_HLIST_HEAD(n)
+#define INIT_HLIST_HEAD(p)     CFS_INIT_HLIST_HEAD(p)
+#define INIT_HLIST_NODE(p)     CFS_INIT_HLIST_NODE(p)
+
+static inline int hlist_unhashed(const struct hlist_node *h)
+{
+       return !h->pprev;
+}
+
+static inline int hlist_empty(const struct hlist_head *h)
+{
+       return !h->first;
+}
+
+static inline void __hlist_del(struct hlist_node *n)
+{
+       struct hlist_node *next = n->next;
+       struct hlist_node **pprev = n->pprev;
+       *pprev = next;
+       if (next)
+               next->pprev = pprev;
+}
+
+static inline void hlist_del(struct hlist_node *n)
+{
+       __hlist_del(n);
+}
+
+static inline void hlist_del_init(struct hlist_node *n)
+{
+       if (n->pprev)  {
+               __hlist_del(n);
+               INIT_HLIST_NODE(n);
+       }
+}
+
+static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
+{
+       struct hlist_node *first = h->first;
+       n->next = first;
+       if (first)
+               first->pprev = &n->next;
+       h->first = n;
+       n->pprev = &h->first;
+}
+
+/* next must be != NULL */
+static inline void hlist_add_before(struct hlist_node *n,
+                                       struct hlist_node *next)
+{
+       n->pprev = next->pprev;
+       n->next = next;
+       next->pprev = &n->next;
+       *(n->pprev) = n;
+}
+
+static inline void hlist_add_after(struct hlist_node *n,
+                                       struct hlist_node *next)
+{
+       next->next = n->next;
+       n->next = next;
+       next->pprev = &n->next;
+
+       if(next->next)
+               next->next->pprev  = &next->next;
+}
+
+#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_for_each(pos, head) \
+       for (pos = (head)->first; pos && ({ prefetch(pos->next); 1; }); \
+            pos = pos->next)
+
+#define hlist_for_each_safe(pos, n, head) \
+       for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \
+            pos = n)
+
+/**
+ * hlist_for_each_entry        - iterate over list of given type
+ * @tpos:      the type * to use as a loop counter.
+ * @pos:       the &struct hlist_node to use as a loop counter.
+ * @head:      the head for your list.
+ * @member:    the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry(tpos, pos, head, member)                   \
+       for (pos = (head)->first;                                        \
+            pos && ({ prefetch(pos->next); 1;}) &&                      \
+               ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+            pos = pos->next)
+
+/**
+ * hlist_for_each_entry_continue - iterate over a hlist continuing after existing point
+ * @tpos:      the type * to use as a loop counter.
+ * @pos:       the &struct hlist_node to use as a loop counter.
+ * @member:    the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_continue(tpos, pos, member)                \
+       for (pos = (pos)->next;                                          \
+            pos && ({ prefetch(pos->next); 1;}) &&                      \
+               ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+            pos = pos->next)
+
+/**
+ * hlist_for_each_entry_from - iterate over a hlist continuing from existing point
+ * @tpos:      the type * to use as a loop counter.
+ * @pos:       the &struct hlist_node to use as a loop counter.
+ * @member:    the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_from(tpos, pos, member)                    \
+       for (; pos && ({ prefetch(pos->next); 1;}) &&                    \
+               ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+            pos = pos->next)
+
+/**
+ * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @tpos:      the type * to use as a loop counter.
+ * @pos:       the &struct hlist_node to use as a loop counter.
+ * @n:         another &struct hlist_node to use as temporary storage
+ * @head:      the head for your list.
+ * @member:    the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_safe(tpos, pos, n, head, member)           \
+       for (pos = (head)->first;                                        \
+            pos && ({ n = pos->next; 1; }) &&                           \
+               ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+            pos = n)
+
+#endif /* __linux__ && __KERNEL__ */
 
 #ifndef list_for_each_prev
 /**
@@ -250,6 +422,19 @@ static inline void list_splice_init(struct list_head *list,
             prefetch(pos->member.next))
 #endif /* list_for_each_entry */
 
+#ifndef list_for_each_entry_reverse
+/**
+ * list_for_each_entry_reverse - iterate backwards over list of given type.
+ * @pos:        the type * to use as a loop counter.
+ * @head:       the head for your list.
+ * @member:     the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_reverse(pos, head, member)                  \
+       for (pos = list_entry((head)->prev, typeof(*pos), member);      \
+            prefetch(pos->member.prev), &pos->member != (head);        \
+            pos = list_entry(pos->member.prev, typeof(*pos), member))
+#endif /* list_for_each_entry_reverse */
+
 #ifndef list_for_each_entry_safe
 /**
  * list_for_each_entry_safe  -       iterate over list of given type safe against removal of list entry
@@ -265,138 +450,4 @@ static inline void list_splice_init(struct list_head *list,
             pos = n, n = list_entry(n->member.next, typeof(*n), member))
 #endif /* list_for_each_entry_safe */
 
-#ifndef list_for_each_entry_reverse
-/**
- * list_for_each_entry_reverse - iterate backwards over list of given type.
- * @pos:        the type * to use as a loop counter.
- * @head:       the head for your list.
- * @member:     the name of the list_struct within the struct.
- */
-#define list_for_each_entry_reverse(pos, head, member)                  \
-        for (pos = list_entry((head)->prev, typeof(*pos), member),      \
-                     prefetch(pos->member.prev);                        \
-             &pos->member != (head);                                    \
-             pos = list_entry(pos->member.prev, typeof(*pos), member),  \
-                     prefetch(pos->member.prev))
-#endif
-
-#ifndef NULL
-#define NULL ((void *)0)
-#endif
-
-/* hlist stuff */
-#ifndef __KERNEL__
-#define HLIST_HEAD_INIT { .first = NULL }
-#define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL }
-#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
-#define INIT_HLIST_NODE(ptr) ((ptr)->next = NULL, (ptr)->pprev = NULL)
-
-#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
-
-#ifndef hlist_for_each
-#define hlist_for_each(pos, head) \
-        for (pos = (head)->first; pos && ({ prefetch(pos->next); 1; }); \
-             pos = pos->next)
-#endif
-
-#ifndef hlist_for_each_entry_safe
-/**
- * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
- * @tpos:       the type * to use as a loop counter.
- * @pos:        the &struct hlist_node to use as a loop counter.
- * @n:          another &struct hlist_node to use as temporary storage
- * @head:       the head for your list.
- * @member:     the name of the hlist_node within the struct.
- */
-#define hlist_for_each_entry_safe(tpos, pos, n, head, member)            \
-        for (pos = (head)->first;                                        \
-             pos && ({ n = pos->next; 1; }) &&                           \
-                ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
-             pos = n)
-#endif
-
-#ifndef hlist_for_each_safe
-#define hlist_for_each_safe(pos, n, head) \
-        for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \
-             pos = n)
-#endif
-
-#ifndef hlist_for_each_entry
-/**
- * hlist_for_each_entry - iterate over list of given type
- * @tpos:       the type * to use as a loop counter.
- * @pos:        the &struct hlist_node to use as a loop counter.
- * @head:       the head for your list.
- * @member:     the name of the hlist_node within the struct.
- */
-#define hlist_for_each_entry(tpos, pos, head, member)                    \
-        for (pos = (head)->first;                                        \
-             pos && ({ prefetch(pos->next); 1;}) &&                      \
-                ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
-             pos = pos->next)
-#endif
-
-/*
- * These are non-NULL pointers that will result in page faults
- * under normal circumstances, used to verify that nobody uses
- * non-initialized list entries.
- */
-#define LIST_POISON1  ((void *) 0x00100100)
-#define LIST_POISON2  ((void *) 0x00200200)
-
-#ifndef __KERNEL__
-struct hlist_head {
-        struct hlist_node *first;
-};
-
-struct hlist_node {
-        struct hlist_node *next, **pprev;
-};
-
-static inline int hlist_unhashed(const struct hlist_node *h)
-{
-        return !h->pprev;
-}
-
-static inline int hlist_empty(const struct hlist_head *h)
-{
-        return !h->first;
-}
-
-static inline void __hlist_del(struct hlist_node *n)
-{
-        struct hlist_node *next = n->next;
-        struct hlist_node **pprev = n->pprev;
-        *pprev = next;
-        if (next)
-                next->pprev = pprev;
-}
-
-static inline void hlist_del(struct hlist_node *n)
-{
-        __hlist_del(n);
-        n->next = LIST_POISON1;
-        n->pprev = LIST_POISON2;
-}
-
-static inline void hlist_del_init(struct hlist_node *n)
-{
-        if (n->pprev)  {
-                __hlist_del(n);
-                INIT_HLIST_NODE(n);
-        }
-}
-
-static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
-{
-        struct hlist_node *first = h->first;
-        n->next = first;
-        if (first)
-                first->pprev = &n->next;
-        h->first = n;
-        n->pprev = &h->first;
-}
-#endif /* __KERNEL__ */
-#endif /* HLIST_HEAD */
-
 #endif /* __LIBCFS_LUSTRE_LIST_H__ */
index 4f386c5..dbeae91 100644 (file)
@@ -11,6 +11,8 @@
 #include <libcfs/linux/lltrace.h>
 #elif defined(__APPLE__)
 #include <libcfs/darwin/lltrace.h>
+#elif defined(__WINNT__)
+#include <libcfs/winnt/lltrace.h>
 #else
 #error Unsupported Operating System
 #endif
@@ -83,8 +85,9 @@ static inline int ltrace_start()
 {
         int rc = 0;
         dbg_initialize(0, NULL);
-#ifdef PORTALS_DEV_ID
-        rc = register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH);
+#ifdef LNET_DEV_ID
+        rc = register_ioc_dev(LNET_DEV_ID, LNET_DEV_PATH,
+                              LNET_DEV_MAJOR, LNET_DEV_MINOR);
 #endif
         ltrace_filter("class");
         ltrace_filter("nal");
@@ -105,8 +108,8 @@ static inline int ltrace_start()
 
 static inline void ltrace_stop()
 {
-#ifdef PORTALS_DEV_ID
-        unregister_ioc_dev(PORTALS_DEV_ID);
+#ifdef LNET_DEV_ID
+        unregister_ioc_dev(LNET_DEV_ID);
 #endif
 }
 
@@ -117,14 +120,14 @@ static inline int not_uml()
    *   1 when run on host
    *  <0 when lookup failed
    */
-       struct stat buf;
-       int rc = stat("/dev/ubd", &buf);
-       rc = ((rc<0) && (errno == ENOENT)) ? 1 : rc;
-       if (rc<0) {
-         fprintf(stderr, "Cannot stat /dev/ubd: %s\n", strerror(errno));
-         rc = 1; /* Assume host */
-       }
-       return rc;
+        struct stat buf;
+        int rc = stat("/dev/ubd", &buf);
+        rc = ((rc<0) && (errno == ENOENT)) ? 1 : rc;
+        if (rc<0) {
+          fprintf(stderr, "Cannot stat /dev/ubd: %s\n", strerror(errno));
+          rc = 1; /* Assume host */
+        }
+        return rc;
 }
 
 #define LTRACE_MAX_NOB   256
diff --git a/lnet/include/libcfs/portals_lib.h b/lnet/include/libcfs/portals_lib.h
deleted file mode 100644 (file)
index 8be849b..0000000
+++ /dev/null
@@ -1,97 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Basic library routines. 
- *
- */
-
-#ifndef __LIBCFS_PORTALS_LIB_H__
-#define __LIBCFS_PORTALS_LIB_H__
-
-#if defined(__linux__)
-#include <libcfs/linux/portals_lib.h>
-#elif defined(__APPLE__)
-#include <libcfs/darwin/portals_lib.h>
-#else
-#error Unsupported Operating System
-#endif
-
-#undef MIN
-#define MIN(a,b) (((a)<(b)) ? (a): (b))
-#undef MAX
-#define MAX(a,b) (((a)>(b)) ? (a): (b))
-#define MKSTR(ptr) ((ptr))? (ptr) : ""
-
-static inline int size_round4 (int val)
-{
-        return (val + 3) & (~0x3);
-}
-
-static inline int size_round (int val)
-{
-        return (val + 7) & (~0x7);
-}
-
-static inline int size_round16(int val)
-{
-        return (val + 0xf) & (~0xf);
-}
-
-static inline int size_round32(int val)
-{
-        return (val + 0x1f) & (~0x1f);
-}
-
-static inline int size_round0(int val)
-{
-        if (!val)
-                return 0;
-        return (val + 1 + 7) & (~0x7);
-}
-
-static inline size_t round_strlen(char *fset)
-{
-        return size_round(strlen(fset) + 1);
-}
-
-#define LOGL(var,len,ptr)                                       \
-do {                                                            \
-        if (var)                                                \
-                memcpy((char *)ptr, (const char *)var, len);    \
-        ptr += size_round(len);                                 \
-} while (0)
-
-#define LOGU(var,len,ptr)                                       \
-do {                                                            \
-        if (var)                                                \
-                memcpy((char *)var, (const char *)ptr, len);    \
-        ptr += size_round(len);                                 \
-} while (0)
-
-#define LOGL0(var,len,ptr)                              \
-do {                                                    \
-        if (!len)                                       \
-                break;                                  \
-        memcpy((char *)ptr, (const char *)var, len);    \
-        *((char *)(ptr) + len) = 0;                     \
-        ptr += size_round(len + 1);                     \
-} while (0)
-
-#endif /* _PORTALS_LIB_H */
index 932caaf..b79eb7e 100644 (file)
@@ -12,6 +12,8 @@
 #include <libcfs/linux/portals_utils.h>
 #elif defined(__APPLE__)
 #include <libcfs/darwin/portals_utils.h>
+#elif defined(__WINNT__)
+#include <libcfs/winnt/portals_utils.h>
 #else
 #error Unsupported Operating System
 #endif
diff --git a/lnet/include/libcfs/types.h b/lnet/include/libcfs/types.h
new file mode 100755 (executable)
index 0000000..71dd7fb
--- /dev/null
@@ -0,0 +1,17 @@
+#ifndef _LIBCFS_TYPES_H
+#define _LIBCFS_TYPES_H
+
+/*
+ * This file was inttroduced to resolve XT3 (Catamount) build issues.
+ * The orignal idea was to move <lustre/types.h> here however at
+ * the time of this writing
+ * it's unclear what external dependencies are tied
+ * to that file (It's not just some source file #including it)
+ * there is some build/packaging infrastructure that includes it.
+ * Hopefully that will be resolved shortly, that file will
+ * be removed, its contents copied here and this comment can be deleted.
+ */
+
+#include <lustre/types.h>
+
+#endif
index e57200f..cea7a6d 100644 (file)
 
 /*
  * liblustre is single-threaded, so most "synchronization" APIs are trivial.
+ *
+ * XXX Liang: There are several branches share lnet with b_hd_newconfig,
+ * if we define lock APIs at here, there will be conflict with liblustre
+ * in other branches.
  */
 
 #ifndef __KERNEL__
+#include <stdio.h>
+#include <stdlib.h>
 
+#if 0
 /*
  * Optional debugging (magic stamping and checking ownership) can be added.
  */
  *
  * No-op implementation.
  */
-struct spin_lock {};
+struct spin_lock {int foo;};
 
 typedef struct spin_lock spinlock_t;
 
+#define SPIN_LOCK_UNLOCKED (spinlock_t) { }
+#define LASSERT_SPIN_LOCKED(lock) do {} while(0)
+
 void spin_lock_init(spinlock_t *lock);
 void spin_lock(spinlock_t *lock);
 void spin_unlock(spinlock_t *lock);
@@ -66,9 +76,10 @@ int spin_trylock(spinlock_t *lock);
 void spin_lock_bh_init(spinlock_t *lock);
 void spin_lock_bh(spinlock_t *lock);
 void spin_unlock_bh(spinlock_t *lock);
+static inline int spin_is_locked(spinlock_t *l) {return 1;}
 
-#define spin_lock_irqsave(l, flags) ({ spin_lock(l); (void)flags; })
-#define spin_unlock_irqrestore(l, flags)  ({ spin_unlock(l); (void)flags; })
+static inline void spin_lock_irqsave(spinlock_t *l, unsigned long f){}
+static inline void spin_unlock_irqrestore(spinlock_t *l, unsigned long f){}
 
 /*
  * Semaphore
@@ -77,7 +88,9 @@ void spin_unlock_bh(spinlock_t *lock);
  * - __down(x)
  * - __up(x)
  */
-struct semaphore {};
+typedef struct semaphore {
+    int foo;
+} mutex_t;
 
 void sema_init(struct semaphore *s, int val);
 void __down(struct semaphore *s);
@@ -104,11 +117,13 @@ void __up(struct semaphore *s);
  * - complete(c)
  * - wait_for_completion(c)
  */
+#if 0
 struct completion {};
 
 void init_completion(struct completion *c);
 void complete(struct completion *c);
 void wait_for_completion(struct completion *c);
+#endif
 
 /*
  * rw_semaphore:
@@ -149,11 +164,32 @@ typedef struct rw_semaphore rwlock_t;
 #define write_lock(l)          down_write(l)
 #define write_unlock(l)                up_write(l)
 
-#define write_lock_irqsave(l, f)       write_lock(l)
-#define write_unlock_irqrestore(l, f)  write_unlock(l)
+static inline void
+write_lock_irqsave(rwlock_t *l, unsigned long f) { write_lock(l); }
+static inline void
+write_unlock_irqrestore(rwlock_t *l, unsigned long f) { write_unlock(l); }
 
-#define read_lock_irqsave(l, f)                read_lock(l)
-#define read_unlock_irqrestore(l, f)   read_unlock(l)
+static inline void 
+read_lock_irqsave(rwlock_t *l, unsigned long f) { read_lock(l); }
+static inline void
+read_unlock_irqrestore(rwlock_t *l, unsigned long f) { read_unlock(l); }
+
+/*
+ * Atomic for user-space
+ * Copied from liblustre
+ */
+typedef struct { volatile int counter; } atomic_t;
+
+#define ATOMIC_INIT(i) { (i) }
+#define atomic_read(a) ((a)->counter)
+#define atomic_set(a,b) do {(a)->counter = b; } while (0)
+#define atomic_dec_and_test(a) ((--((a)->counter)) == 0)
+#define atomic_inc(a)  (((a)->counter)++)
+#define atomic_dec(a)  do { (a)->counter--; } while (0)
+#define atomic_add(b,a)  do {(a)->counter += b;} while (0)
+#define atomic_sub(b,a)  do {(a)->counter -= b;} while (0)
+
+#endif
 
 /* !__KERNEL__ */
 #endif
index 6c3410b..54f7832 100644 (file)
 
 #ifndef __KERNEL__
 
+#include <stdlib.h>
+#include <string.h>
+#include <sys/signal.h>
+#include <sys/mman.h>
 #include <libcfs/list.h>
+#include <libcfs/user-time.h>
+#include <signal.h>
+#include <stdlib.h>
 
 /*
  * Wait Queue. No-op implementation.
  */
 
-typedef struct cfs_waitlink {} cfs_waitlink_t;
-typedef struct cfs_waitq {} cfs_waitq_t;
+typedef struct cfs_waitlink {
+        struct list_head sleeping;
+        void *process;
+} cfs_waitlink_t;
+
+typedef struct cfs_waitq {
+        struct list_head sleepers;
+} cfs_waitq_t;
 
 void cfs_waitq_init(struct cfs_waitq *waitq);
 void cfs_waitlink_init(struct cfs_waitlink *link);
@@ -57,13 +70,17 @@ void cfs_waitq_del(struct cfs_waitq *waitq, struct cfs_waitlink *link);
 int  cfs_waitq_active(struct cfs_waitq *waitq);
 void cfs_waitq_signal(struct cfs_waitq *waitq);
 void cfs_waitq_signal_nr(struct cfs_waitq *waitq, int nr);
-void cfs_waitq_broadcast(struct cfs_waitq *waitq);
+void cfs_waitq_broadcast(struct cfs_waitq *waitq, int state);
 void cfs_waitq_wait(struct cfs_waitlink *link);
-int64_t cfs_waitq_timedwait(struct cfs_waitlink *link, int64_t timeout);
+int64_t cfs_waitq_timedwait(struct cfs_waitlink *link, int state, int64_t timeout);
+#define cfs_schedule_timeout(s, t)              \
+        do {                                    \
+                cfs_waitlink_t    l;            \
+                cfs_waitq_timedwait(&l, s, t);  \
+        } while (0)
 
-/*
- * Allocator
- */
+#define CFS_TASK_INTERRUPTIBLE  (0)
+#define CFS_TASK_UNINT          (0)
 
 /* 2.4 defines */
 
@@ -88,31 +105,40 @@ struct page {
 
 typedef struct page cfs_page_t;
 
-#define CFS_PAGE_SIZE                   PAGE_CACHE_SIZE
-#define CFS_PAGE_SHIFT                  PAGE_CACHE_SHIFT
-#define CFS_PAGE_MASK                   PAGE_CACHE_MASK
-
-cfs_page_t *cfs_alloc_pages(unsigned int flags, unsigned int order);
-void cfs_free_pages(struct page *pg, int what);
+#define CFS_PAGE_SIZE                   PAGE_SIZE
+#define CFS_PAGE_SHIFT                  PAGE_SHIFT
+#define CFS_PAGE_MASK                   (~((__u64)CFS_PAGE_SIZE-1))
 
 cfs_page_t *cfs_alloc_page(unsigned int flags);
-void cfs_free_page(cfs_page_t *pg, int what);
+void cfs_free_page(cfs_page_t *pg);
 void *cfs_page_address(cfs_page_t *pg);
 void *cfs_kmap(cfs_page_t *pg);
 void cfs_kunmap(cfs_page_t *pg);
 
 #define cfs_get_page(p)                        __I_should_not_be_called__(at_all)
 #define cfs_page_count(p)              __I_should_not_be_called__(at_all)
-#define cfs_set_page_count(p, v)       __I_should_not_be_called__(at_all)
+#define cfs_page_index(p)               ((p)->index)
 
 /*
  * Memory allocator
+ * Inline function, so utils can use them without linking of libcfs
  */
-void *cfs_alloc(size_t nr_bytes, u_int32_t flags);
-void cfs_free(void *addr);
-void *cfs_alloc_large(size_t nr_bytes);
-void  cfs_free_large(void *addr);
+#define __ALLOC_ZERO    (1 << 2)
+static inline void *cfs_alloc(size_t nr_bytes, u_int32_t flags)
+{
+        void *result;
+
+        result = malloc(nr_bytes);
+        if (result != NULL && (flags & __ALLOC_ZERO))
+                memset(result, 0, nr_bytes);
+        return result;
+}
+
+#define cfs_free(addr)  free(addr)
+#define cfs_alloc_large(nr_bytes) cfs_alloc(nr_bytes, 0)
+#define cfs_free_large(addr) cfs_free(addr)
 
+#define CFS_ALLOC_ATOMIC_TRY   (0)
 /*
  * SLAB allocator
  */
@@ -121,11 +147,11 @@ typedef struct {
 } cfs_mem_cache_t;
 
 #define SLAB_HWCACHE_ALIGN 0
+#define SLAB_KERNEL 0
+#define SLAB_NOFS 0
 
 cfs_mem_cache_t *
-cfs_mem_cache_create(const char *, size_t, size_t, unsigned long,
-                     void (*)(void *, cfs_mem_cache_t *, unsigned long),
-                     void (*)(void *, cfs_mem_cache_t *, unsigned long));
+cfs_mem_cache_create(const char *, size_t, size_t, unsigned long);
 int cfs_mem_cache_destroy(cfs_mem_cache_t *c);
 void *cfs_mem_cache_alloc(cfs_mem_cache_t *c, int gfp);
 void cfs_mem_cache_free(cfs_mem_cache_t *c, void *addr);
@@ -138,10 +164,61 @@ typedef int (cfs_write_proc_t)(struct file *file, const char *buffer,
                                unsigned long count, void *data);
 
 /*
+ * Signal
+ */
+typedef sigset_t                        cfs_sigset_t;
+
+/*
  * Timer
  */
+#include <sys/time.h>
+
+typedef struct {
+        struct list_head tl_list;
+        void (*function)(unsigned long unused);
+        unsigned long data;
+        long expires;
+} cfs_timer_t;
+
+#define cfs_init_timer(t)       do {} while(0)
+#define cfs_jiffies                             \
+({                                              \
+        unsigned long _ret = 0;                 \
+        struct timeval tv;                      \
+        if (gettimeofday(&tv, NULL) == 0)       \
+                _ret = tv.tv_sec;               \
+        _ret;                                   \
+})
+
+static inline int cfs_timer_init(cfs_timer_t *l, void (* func)(unsigned long), void *arg)
+{
+        CFS_INIT_LIST_HEAD(&l->tl_list);
+        l->function = func;
+        l->data = (unsigned long)arg;
+        return 0;
+}
+
+static inline int cfs_timer_is_armed(cfs_timer_t *l)
+{
+        if (cfs_time_before(cfs_jiffies, l->expires))
+                return 1;
+        else
+                return 0;
+}
+
+static inline void cfs_timer_arm(cfs_timer_t *l, int thetime)
+{
+        l->expires = thetime;
+}
+
+static inline void cfs_timer_disarm(cfs_timer_t *l)
+{
+}
 
-typedef struct cfs_timer {} cfs_timer_t;
+static inline long cfs_timer_deadline(cfs_timer_t *l)
+{
+        return l->expires;
+}
 
 #if 0
 #define cfs_init_timer(t)      do {} while(0)
@@ -154,6 +231,16 @@ int  cfs_timer_is_armed(struct cfs_timer *t);
 cfs_time_t cfs_timer_deadline(struct cfs_timer *t);
 #endif
 
+#define in_interrupt()    (0)
+
+static inline void cfs_pause(cfs_duration_t d)
+{
+        struct timespec s;
+        
+        cfs_duration_nsec(d, &s);
+        nanosleep(&s, NULL);
+}
+
 typedef void cfs_psdev_t;
 
 static inline int cfs_psdev_register(cfs_psdev_t *foo)
@@ -166,6 +253,42 @@ static inline int cfs_psdev_deregister(cfs_psdev_t *foo)
         return 0;
 }
 
+/*
+ * portable UNIX device file identification.
+ */
+
+typedef unsigned int cfs_rdev_t;
+// typedef unsigned long long kdev_t;
+/*
+ */
+#define cfs_lock_kernel()               do {} while (0)
+#define cfs_sigfillset(l) do {}         while (0)
+#define cfs_recalc_sigpending(l)        do {} while (0)
+#define cfs_kernel_thread(l,m,n)        LBUG()
+
+// static inline void local_irq_save(unsigned long flag) {return;}
+// static inline void local_irq_restore(unsigned long flag) {return;}
+
+enum {
+        CFS_STACK_TRACE_DEPTH = 16
+};
+
+struct cfs_stack_trace {
+        void *frame[CFS_STACK_TRACE_DEPTH];
+};
+
+/*
+ * arithmetic
+ */
+#define do_div(a,b)                     \
+        ({                              \
+                unsigned long remainder;\
+                remainder = (a) % (b);  \
+                (a) = (a) / (b);        \
+                (remainder);            \
+        })
+
+
 /* !__KERNEL__ */
 #endif
 
index 7abc9e8..86cbc2d 100644 (file)
  *  int            cfs_fs_time_before (cfs_fs_time_t *, cfs_fs_time_t *);
  *  int            cfs_fs_time_beforeq(cfs_fs_time_t *, cfs_fs_time_t *);
  *
- *  cfs_duration_t cfs_time_minimal_timeout(void)
- *
  *  CFS_TIME_FORMAT
  *  CFS_DURATION_FORMAT
  *
  */
 
-#define ONE_BILLION ((u_int64_t)1000000000)
-#define ONE_MILLION ((u_int64_t)   1000000)
-
 #ifndef __KERNEL__
 
+#define ONE_BILLION ((u_int64_t)1000000000)
+#define ONE_MILLION ((u_int64_t)   1000000) /* keep 64-bit, matching ONE_BILLION */
+
 /*
  * Liblustre. time(2) based implementation.
  */
@@ -98,6 +96,11 @@ static inline cfs_duration_t cfs_time_seconds(int seconds)
         return seconds;
 }
 
+/* Current time in whole seconds; cfs_time_seconds() is an identity
+ * mapping in this seconds-granularity implementation. */
+static inline time_t cfs_time_current_sec(void)
+{
+        return cfs_time_seconds(cfs_time_current());
+}
+
 static inline int cfs_time_before(cfs_time_t t1, cfs_time_t t2)
 {
         return t1 < t2;
@@ -110,7 +113,7 @@ static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2)
 
 static inline cfs_duration_t cfs_duration_build(int64_t nano)
 {
-        return nano / ONE_BILLION;
+        return (cfs_duration_t) (nano / ONE_BILLION);
 }
 
 static inline time_t cfs_duration_sec(cfs_duration_t d)
@@ -162,12 +165,7 @@ static inline int cfs_fs_time_beforeq(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
         return *t1 <= *t2;
 }
 
-static inline cfs_duration_t cfs_time_minimal_timeout(void)
-{
-        return 1;
-}
-
-#define CFS_MIN_DELAY           (1)
+#define CFS_TICK                (1)
 
 static inline cfs_time_t cfs_time_add(cfs_time_t t, cfs_duration_t d)
 {
@@ -179,6 +177,11 @@ static inline cfs_duration_t cfs_time_sub(cfs_time_t t1, cfs_time_t t2)
         return t1 - t2;
 }
 
+#define cfs_time_current_64 cfs_time_current
+#define cfs_time_add_64     cfs_time_add
+#define cfs_time_shift_64   cfs_time_shift
+#define cfs_time_before_64  cfs_time_before
+
 #define CFS_TIME_T              "%lu"
 #define CFS_DURATION_T          "%ld"
 
diff --git a/lnet/include/libcfs/winnt/kp30.h b/lnet/include/libcfs/winnt/kp30.h
new file mode 100644 (file)
index 0000000..e494a9f
--- /dev/null
@@ -0,0 +1,156 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or modify it under the
+ * terms of version 2 of the GNU General Public License as published by the
+ * Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Lustre; if not, write to the Free Software Foundation, Inc., 675 Mass
+ * Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_KP30_H__
+#define __LIBCFS_WINNT_KP30_H__
+
+#ifndef __LIBCFS_KP30_H__
+#error Do not #include this file directly. #include <libcfs/kp30.h> instead
+#endif
+
+#include <libcfs/winnt/portals_compat25.h>
+#include <lnet/types.h>
+
+#ifdef __KERNEL__
+
+/* Module parameter support */
+#define CFS_MODULE_PARM(name, t, type, perm, desc)
+
+#define CFS_SYSFS_MODULE_PARM    0 /* no sysfs access to module parameters */
+
+
+/* Yield the CPU for one tick.  '(void)' gives a proper prototype --
+ * an empty parameter list declares unspecified parameters in C. */
+static inline void our_cond_resched(void)
+{
+    schedule_timeout(1i64);
+}
+
+#ifdef CONFIG_SMP
+#define LASSERT_SPIN_LOCKED(lock) do {} while(0) /* XXX */
+#else
+#define LASSERT_SPIN_LOCKED(lock) do {} while(0)
+#endif
+
+#error Need a winnt version of panic()
+#define LIBCFS_PANIC(msg) KeBugCheckEx(msg, (ULONG_PTR)NULL, (ULONG_PTR)NULL, (ULONG_PTR)NULL, (ULONG_PTR)NULL)
+#error libcfs_register_panic_notifier() missing
+#error libcfs_unregister_panic_notifier() missing
+
+#define cfs_work_struct_t WORK_QUEUE_ITEM
+#define cfs_prepare_work(tq, routine, contex)
+#define cfs_schedule_work(tq)
+
+/* ------------------------------------------------------------------- */
+
+#define PORTAL_SYMBOL_REGISTER(x)               cfs_symbol_register(#x, &x)
+#define PORTAL_SYMBOL_UNREGISTER(x)             cfs_symbol_unregister(#x)
+
+#define PORTAL_SYMBOL_GET(x)                    (cfs_symbol_get(#x))
+#define PORTAL_SYMBOL_PUT(x)                    cfs_symbol_put(#x)
+
+#define PORTAL_MODULE_USE                       do{}while(0)
+#define PORTAL_MODULE_UNUSE                     do{}while(0)
+
+#define printk                                  DbgPrint
+#define ptintf                                  DbgPrint /* historical typo, kept for compat */
+#define printf                                  DbgPrint /* the name 'ptintf' was meant to be */
+
+#else  /* !__KERNEL__ */
+
+# include <stdio.h>
+# include <stdlib.h>
+#ifdef __CYGWIN__
+# include <cygwin-ioctl.h>
+#endif
+# include <time.h>
+
+#endif /* End of !__KERNEL__ */
+
+/******************************************************************************/
+/* Light-weight trace
+ * Support for temporary event tracing with minimal Heisenberg effect. */
+#define LWT_SUPPORT  0
+
+/* kernel hasn't defined this? */
+typedef struct {
+        __s64      lwte_when;
+        char       *lwte_where;
+        void       *lwte_task;
+        long_ptr        lwte_p1;
+        long_ptr        lwte_p2;
+        long_ptr        lwte_p3;
+        long_ptr        lwte_p4;
+# if BITS_PER_LONG > 32
+        long_ptr        lwte_pad;
+# endif
+} lwt_event_t;
+
+
+# define LWT_EVENT(p1,p2,p3,p4)
+
+
+/* ------------------------------------------------------------------ */
+
+#define IOCTL_LIBCFS_TYPE long_ptr
+
+#ifdef __CYGWIN__
+# ifndef BITS_PER_LONG
+#  if (~0UL) == 0xffffffffUL
+#   define BITS_PER_LONG 32
+#  else
+#   define BITS_PER_LONG 64
+#  endif
+# endif
+#endif
+
+#if BITS_PER_LONG > 32
+/* int stays 32-bit on 64-bit hosts: the old 64-bit literal was
+ * silently truncated by the (int) cast.  Only the long-sized poisons
+ * widen with the word size. */
+# define LI_POISON ((int)0x5a5a5a5a)
+# define LL_POISON ((long_ptr)0x5a5a5a5a5a5a5a5a)
+# define LP_POISON ((char *)(long_ptr)0x5a5a5a5a5a5a5a5a)
+#else
+# define LI_POISON ((int)0x5a5a5a5a)
+# define LL_POISON ((long_ptr)0x5a5a5a5a)
+# define LP_POISON ((char *)(long_ptr)0x5a5a5a5a)
+#endif
+
+#if defined(__x86_64__)
+# define LPU64 "%I64u"
+# define LPD64 "%I64d"
+# define LPX64 "%I64x"
+# define LPSZ  "%lu"
+# define LPSSZ "%ld"
+#elif (BITS_PER_LONG == 32 || __WORDSIZE == 32)
+# define LPU64 "%I64u"
+# define LPD64 "%I64d"
+# define LPX64 "%I64x"
+# define LPSZ  "%u"
+# define LPSSZ "%d"
+#elif (BITS_PER_LONG == 64 || __WORDSIZE == 64)
+# define LPU64 "%I64u"
+# define LPD64 "%I64d"
+# define LPX64 "%I64x"
+# define LPSZ  "%u"
+# define LPSSZ "%d"
+#endif
+#ifndef LPU64
+# error "No word size defined"
+#endif
+
+#endif
diff --git a/lnet/include/libcfs/winnt/libcfs.h b/lnet/include/libcfs/winnt/libcfs.h
new file mode 100644 (file)
index 0000000..386eb5f
--- /dev/null
@@ -0,0 +1,126 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or modify it under the
+ * terms of version 2 of the GNU General Public License as published by the
+ * Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Lustre; if not, write to the Free Software Foundation, Inc., 675 Mass
+ * Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_LIBCFS_H__
+#define __LIBCFS_WINNT_LIBCFS_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <libcfs/libcfs.h> instead
+#endif
+
+/* workaround for the VC compiler */
+#ifndef __FUNCTION__
+#define __FUNCTION__ "generic"
+#endif
+
+#include <libcfs/winnt/winnt-types.h>
+#include <libcfs/portals_utils.h>
+#include <libcfs/winnt/winnt-time.h>
+#include <libcfs/winnt/winnt-lock.h>
+#include <libcfs/winnt/winnt-mem.h>
+#include <libcfs/winnt/winnt-prim.h>
+#include <libcfs/winnt/winnt-fs.h>
+#include <libcfs/winnt/winnt-tcpip.h>
+
+struct ptldebug_header {
+        __u32 ph_len;
+        __u32 ph_flags;
+        __u32 ph_subsys;
+        __u32 ph_mask;
+        __u32 ph_cpu_id;
+        __u32 ph_sec;
+        __u64 ph_usec;
+        __u32 ph_stack;
+        __u32 ph_pid;
+        __u32 ph_extern_pid;
+        __u32 ph_line_num;
+} __attribute__((packed));
+
+#ifdef __KERNEL__
+
+enum {
+       /* if you change this, update darwin-util.c:cfs_stack_trace_fill() */
+       CFS_STACK_TRACE_DEPTH = 16
+};
+
+struct cfs_stack_trace {
+       void *frame[CFS_STACK_TRACE_DEPTH];
+};
+
+static inline __u32 query_stack_size()
+{
+    ULONG   LowLimit, HighLimit;
+
+    IoGetStackLimits(&LowLimit, &HighLimit);
+    ASSERT(HighLimit > LowLimit);
+
+    return (__u32) (HighLimit - LowLimit);
+}
+#else
+static inline __u32 query_stack_size()
+{
+   return 4096;
+}
+#endif
+
+
+#ifndef THREAD_SIZE
+# define THREAD_SIZE query_stack_size()
+#endif
+
+#define LUSTRE_TRACE_SIZE (THREAD_SIZE >> 5)
+
+#ifdef __KERNEL__
+# ifdef  __ia64__
+#  define CDEBUG_STACK() (THREAD_SIZE -                         \
+                          ((ulong_ptr)__builtin_dwarf_cfa() &   \
+                           (THREAD_SIZE - 1)))
+# else
+#  define CDEBUG_STACK (IoGetRemainingStackSize())
+#  error "This doesn't seem right; CDEBUG_STACK should grow with the stack"
+# endif /* __ia64__ */
+
+#define CHECK_STACK()                                                   \
+do {                                                                    \
+        unsigned long _stack = CDEBUG_STACK();                          \
+                                                                        \
+        if (_stack > 3*THREAD_SIZE/4 && _stack > libcfs_stack) {        \
+                libcfs_stack = _stack;                                  \
+                libcfs_debug_msg(NULL, DEBUG_SUBSYSTEM, D_WARNING,      \
+                                 __FILE__, NULL, __LINE__,              \
+                                 "maximum lustre stack %lu\n", _stack); \
+        }                                                               \
+} while (0)
+#else /* !__KERNEL__ */
+#define CHECK_STACK() do { } while(0)
+#define CDEBUG_STACK() (0L)
+#endif /* __KERNEL__ */
+
+/* initial pid  */
+#define LUSTRE_LNET_PID          12345
+
+#define ENTRY_NESTING_SUPPORT (0)
+#define ENTRY_NESTING   do {;} while (0)
+#define EXIT_NESTING   do {;} while (0)
+#define __current_nesting_level() (0)
+
+#endif /* __LIBCFS_WINNT_LIBCFS_H__ */
similarity index 69%
rename from lnet/include/libcfs/darwin/portals_lib.h
rename to lnet/include/libcfs/winnt/lltrace.h
index dde962a..9615e94 100644 (file)
@@ -1,5 +1,5 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
  *
  *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
  *
  *   along with Lustre; if not, write to the Free Software
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
- * Basic library routines. 
+ * Basic library routines.
  *
  */
 
-#ifndef __LIBCFS_DARWIN_PORTALS_LIB_H__
-#define __LIBCFS_DARWIN_PORTALS_LIB_H__
+#ifndef __LIBCFS_WINNT_LLTRACE_H__
+#define __LIBCFS_WINNT_LLTRACE_H__
 
-#ifndef __LIBCFS_PORTALS_LIB_H__
-#error Do not #include this file directly. #include <libcfs/portals_lib.h> instead
+#ifndef __LIBCFS_LLTRACE_H__
+#error Do not #include this file directly. #include <libcfs/lltrace.h> instead
 #endif
 
-#include <string.h>
 
 #endif
similarity index 65%
rename from lnet/include/libcfs/linux/portals_lib.h
rename to lnet/include/libcfs/winnt/portals_compat25.h
index 99fd1bd..579b795 100644 (file)
@@ -1,5 +1,5 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
  *
  *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
  *
  *   along with Lustre; if not, write to the Free Software
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
- * Basic library routines. 
- *
  */
 
-#ifndef __LIBCFS_LINUX_PORTALS_LIB_H__
-#define __LIBCFS_LINUX_PORTALS_LIB_H__
+#ifndef __LIBCFS_WINNT_PORTALS_COMPAT_H__
+#define __LIBCFS_WINNT_PORTALS_COMPAT_H__
 
-#ifndef __LIBCFS_PORTALS_LIB_H__
-#error Do not #include this file directly. #include <libcfs/portals_lib.h> instead
-#endif
 
-#ifndef __KERNEL__
-# include <string.h>
-#else
-# include <asm/types.h>
-#endif
 
-#endif
+#endif /* __LIBCFS_WINNT_PORTALS_COMPAT_H__ */
diff --git a/lnet/include/libcfs/winnt/portals_utils.h b/lnet/include/libcfs/winnt/portals_utils.h
new file mode 100644 (file)
index 0000000..ec80692
--- /dev/null
@@ -0,0 +1,168 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines.
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_PORTALS_UTILS_H__
+#define __LIBCFS_WINNT_PORTALS_UTILS_H__
+
+#ifndef __LIBCFS_PORTALS_UTILS_H__
+#error Do not #include this file directly. #include <libcfs/portals_utils.h> instead
+#endif
+
+#ifndef cfs_is_flag_set
+#define cfs_is_flag_set(x,f) (((x)&(f))==(f))
+#endif
+
+#ifndef cfs_set_flag
+#define cfs_set_flag(x,f)    ((x) |= (f))
+#endif
+
+#ifndef cfs_clear_flag
+#define cfs_clear_flag(x,f)  ((x) &= ~(f))
+#endif
+
+
+/*
+ * Divide *n by b in place and return the remainder (32-bit arithmetic).
+ * NOTE(review): the do_div() macro below casts &(n) to (__u32 *), so a
+ * 64-bit dividend is silently truncated to one 32-bit word -- unlike
+ * the Linux do_div(), whose contract takes a 64-bit n.  Confirm no
+ * caller passes a __u64 here.
+ */
+static inline __u32 __do_div(__u32 * n, __u32 b) 
+{
+    __u32   mod;
+
+    mod = *n % b;
+    *n  = *n / b;
+    return mod;
+} 
+
+#define do_div(n,base)  __do_div((__u32 *)&(n), (__u32) (base))
+
+#ifdef __KERNEL__
+
+#include <stdlib.h>
+#include <libcfs/winnt/winnt-types.h>
+
+char * strsep(char **s, const char *ct);
+/* Length of 's' capped at 'count'; the NUL terminator is not counted,
+ * matching the POSIX strnlen() contract.  The old 's[len++]' form
+ * incremented past the NUL and over-counted by one. */
+static inline size_t strnlen(const char * s, size_t count) {
+    size_t len = 0;
+    while (len < count && s[len])
+        len++;
+    return len;
+}
+char * ul2dstr(ulong_ptr address, char *buf, int len);
+
+#define simple_strtol(a1, a2, a3)               strtol(a1, a2, a3)
+#define simple_strtoll(a1, a2, a3)              (__s64)strtoull(a1, a2, a3)
+#define simple_strtoull(a1, a2, a3)             strtoull(a1, a2, a3)
+
+unsigned long simple_strtoul(const char *cp,char **endp, unsigned int base);
+
+static inline int test_bit(int nr, void * addr)
+{
+    return ((1UL << (nr & 31)) & (((volatile ULONG *) addr)[nr >> 5])) != 0;
+}
+
+static inline void clear_bit(int nr, void * addr)
+{
+    (((volatile ULONG *) addr)[nr >> 5]) &= (~(1UL << (nr & 31)));
+}
+
+
+static inline void set_bit(int nr, void * addr)
+{
+    (((volatile ULONG *) addr)[nr >> 5]) |= (1UL << (nr & 31));
+}
+
+/* Fill buf[0..len) with pseudo-random bytes from RtlRandom().
+ * A fresh 32-bit value is drawn for every word copied -- the original
+ * drew one value and replicated the same 4 bytes across the buffer.
+ * The double cast avoids Win64 pointer-truncation warnings when
+ * seeding from the buffer address. */
+static inline void read_random(char *buf, int len)
+{
+    ULONG   Seed = (ULONG)(ULONG_PTR)buf;
+    while (len > 0) {
+        ULONG   Val = RtlRandom(&Seed);
+        if (len > sizeof(ULONG)) {
+            memcpy(buf, &Val, sizeof(ULONG));
+            len -= sizeof(ULONG);
+            buf += sizeof(ULONG);
+        } else {
+            memcpy(buf, &Val, len);
+            len = 0;
+            break;
+        }
+    }
+}
+
+/* do NOT use function or expression as parameters ... */
+
+#ifndef min_t
+/* outer parens are required: without them 'a + min_t(...)' binds the
+ * addition into the comparison; the result is also cast so both arms
+ * have the requested type */
+#define min_t(type,x,y) ((type)(x) < (type)(y) ? (type)(x) : (type)(y))
+#endif
+
+#ifndef max_t
+#define max_t(type,x,y) ((type)(x) < (type)(y) ? (type)(y) : (type)(x))
+#endif
+
+
+#define NIPQUAD(addr)                      \
+       ((unsigned char *)&addr)[0],    \
+       ((unsigned char *)&addr)[1],    \
+       ((unsigned char *)&addr)[2],    \
+       ((unsigned char *)&addr)[3]
+
+#define HIPQUAD(addr)                      \
+       ((unsigned char *)&addr)[3],    \
+       ((unsigned char *)&addr)[2],    \
+       ((unsigned char *)&addr)[1],    \
+       ((unsigned char *)&addr)[0]
+
+/* Flat address space on this port: user<->kernel copies are plain
+ * memcpy() and always report success (0).  'static inline' avoids
+ * duplicate-definition/unused-function fallout when this header is
+ * included from several translation units; the source side is const. */
+static inline int copy_from_user(void *to, const void *from, int c)
+{
+    memcpy(to, from, c);
+    return 0;
+}
+
+static inline int copy_to_user(void *to, const void *from, int c)
+{
+    memcpy(to, from, c);
+    return 0;
+}
+
+
+/* Direct-assignment user accessors; always "succeed" (evaluate to 0).
+ * 'x' is parenthesized so expression arguments keep their grouping. */
+#define put_user(x, ptr)        \
+(                               \
+    *(ptr) = (x),               \
+    0                           \
+)
+
+
+#define get_user(x,ptr)         \
+(                               \
+    (x) = *(ptr),               \
+    0                           \
+)
+
+#define num_physpages                  (64 * 1024)
+
+#define snprintf  _snprintf
+#define vsnprintf _vsnprintf
+
+
+#endif /* __KERNEL__ */
+
+int cfs_error_code(NTSTATUS);
+
+#endif
diff --git a/lnet/include/libcfs/winnt/winnt-fs.h b/lnet/include/libcfs/winnt/winnt-fs.h
new file mode 100644 (file)
index 0000000..6280b93
--- /dev/null
@@ -0,0 +1,280 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * File operations & routines.
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_CFS_FS_H__
+#define __LIBCFS_WINNT_CFS_FS_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <libcfs/libcfs.h> instead
+#endif
+
+
+/*
+ * Platform defines
+ *
+ * cfs_rdev_t
+ */
+
+typedef unsigned short cfs_rdev_t;
+
+typedef unsigned int cfs_major_nr_t;
+typedef unsigned int cfs_minor_nr_t;
+
+
+#define MINORBITS      8
+#define MINORMASK      ((1U << MINORBITS) - 1)
+
+#define MAJOR(dev)     ((unsigned int) ((dev) >> MINORBITS))
+#define MINOR(dev)     ((unsigned int) ((dev) & MINORMASK))
+#define NODEV          0
+#define MKDEV(ma,mi)   (((ma) << MINORBITS) | (mi))
+
+
+static inline cfs_rdev_t cfs_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor)
+{
+    return MKDEV(major, minor);
+}
+
+static inline cfs_major_nr_t cfs_rdev_major(cfs_rdev_t rdev)
+{
+    return MAJOR(rdev);
+}
+
+static inline cfs_minor_nr_t cfs_rdev_minor(cfs_rdev_t rdev)
+{
+    return MINOR(rdev);
+}
+
+
+#ifdef __KERNEL__
+
+struct file_operations
+{
+    loff_t (*lseek)(struct file * file, loff_t offset, int origin);
+    ssize_t (*read) (struct file * file, char * buf, size_t nbytes, loff_t *ppos);
+    ssize_t (*write)(struct file * file, const char * buffer,
+        size_t count, loff_t *ppos);
+    int (*ioctl) (struct file *, unsigned int, ulong_ptr);
+    int (*open) (struct file *);
+    int (*release) (struct file *);
+};
+
+struct file {
+
+    cfs_handle_t            f_handle;
+    unsigned int            f_flags;
+    mode_t                  f_mode;
+    ulong_ptr           f_count;
+
+    //struct list_head      f_list;
+    //struct dentry *       f_dentry;
+
+    cfs_proc_entry_t *      proc_dentry;
+    cfs_file_operations_t * f_op;
+
+    size_t                  f_size;
+    loff_t                  f_pos;
+    unsigned int            f_uid, f_gid;
+    int                     f_error;
+
+    ulong_ptr           f_version;
+
+    void *                  private_data;
+
+    char                    f_name[1];
+
+};
+
+#define cfs_filp_size(f)               ((f)->f_size)
+#define cfs_filp_poff(f)                (&(f)->f_pos)
+
+cfs_file_t *cfs_filp_open(const char *name, int flags, int mode, int *err);
+int cfs_filp_close(cfs_file_t *fp);
+int cfs_filp_read(cfs_file_t *fp, void *buf, size_t nbytes, loff_t *pos);
+int cfs_filp_write(cfs_file_t *fp, void *buf, size_t nbytes, loff_t *pos);
+int cfs_filp_fsync(cfs_file_t *fp);
+int cfs_get_file(cfs_file_t *fp);
+int cfs_put_file(cfs_file_t *fp);
+int cfs_file_count(cfs_file_t *fp);
+
+
+
+/*
+ * CFS_FLOCK routines
+ */
+
+typedef struct file_lock{
+    int         fl_type;
+    pid_t       fl_pid;
+    size_t      fl_len;
+    off_t       fl_start;
+    off_t       fl_end;
+} cfs_flock_t; 
+
+#define CFS_INT_LIMIT(x)               (~((x)1 << (sizeof(x)*8 - 1)))
+#define CFS_OFFSET_MAX                 CFS_INT_LIMIT(loff_t)
+
+#define cfs_flock_type(fl)                  ((fl)->fl_type)
+#define cfs_flock_set_type(fl, type)        do { (fl)->fl_type = (type); } while(0)
+#define cfs_flock_pid(fl)                   ((fl)->fl_pid)
+#define cfs_flock_set_pid(fl, pid)          do { (fl)->fl_pid = (pid); } while(0)
+#define cfs_flock_start(fl)                 ((fl)->fl_start)
+#define cfs_flock_set_start(fl, start)      do { (fl)->fl_start = (start); } while(0)
+#define cfs_flock_end(fl)                   ((fl)->fl_end)
+#define cfs_flock_set_end(fl, end)          do { (fl)->fl_end = (end); } while(0)
+
+#define ATTR_MODE       0x0001
+#define ATTR_UID        0x0002
+#define ATTR_GID        0x0004
+#define ATTR_SIZE       0x0008
+#define ATTR_ATIME      0x0010
+#define ATTR_MTIME      0x0020
+#define ATTR_CTIME      0x0040
+#define ATTR_ATIME_SET  0x0080
+#define ATTR_MTIME_SET  0x0100
+#define ATTR_FORCE      0x0200  /* Not a change, but a change it */
+#define ATTR_ATTR_FLAG  0x0400
+#define ATTR_RAW        0x0800  /* file system, not vfs will massage attrs */
+#define ATTR_FROM_OPEN  0x1000  /* called from open path, ie O_TRUNC */
+//#define ATTR_CTIME_SET  0x2000
+#define ATTR_BLOCKS     0x4000
+
+#define in_group_p(x)  (0)
+
+/*
+ * proc fs routines
+ */
+
+int proc_init_fs();
+void proc_destroy_fs();
+
+
+/*
+ *  misc
+ */
+
+/* Linux-style error pointers: a negative errno is smuggled into the
+ * top of the pointer range. */
+static inline void *ERR_PTR(long_ptr error)
+{
+       return (void *) error;
+}
+
+static inline long_ptr PTR_ERR(const void *ptr)
+{
+       return (long_ptr) ptr;
+}
+
+/* NOTE(review): uses a 1000-value window with '>' -- Linux uses
+ * >= (unsigned long)-MAX_ERRNO (4095); confirm ERR_PTR(-1000) is
+ * never produced. */
+static inline long_ptr IS_ERR(const void *ptr)
+{
+       return (ulong_ptr)ptr > (ulong_ptr)-1000L;
+}
+
+#else  /* !__KERNEL__ */
+
+#define CREATE_NEW          1
+#define CREATE_ALWAYS       2
+#define OPEN_EXISTING       3
+#define OPEN_ALWAYS         4
+#define TRUNCATE_EXISTING   5
+
+#define SECTION_QUERY       0x0001
+#define SECTION_MAP_WRITE   0x0002
+#define SECTION_MAP_READ    0x0004
+#define SECTION_MAP_EXECUTE 0x0008
+#define SECTION_EXTEND_SIZE 0x0010
+
+#define FILE_MAP_COPY       SECTION_QUERY
+#define FILE_MAP_WRITE      SECTION_MAP_WRITE
+#define FILE_MAP_READ       SECTION_MAP_READ
+#define FILE_MAP_ALL_ACCESS SECTION_ALL_ACCESS
+
+
+NTSYSAPI
+HANDLE
+NTAPI
+CreateFileA(
+    IN LPCSTR lpFileName,
+    IN DWORD dwDesiredAccess,
+    IN DWORD dwShareMode,
+    IN PVOID lpSecurityAttributes,
+    IN DWORD dwCreationDisposition,
+    IN DWORD dwFlagsAndAttributes,
+    IN HANDLE hTemplateFile
+    );
+
+#define CreateFile  CreateFileA
+
+NTSYSAPI
+BOOL
+NTAPI
+CloseHandle(
+    IN OUT HANDLE hObject
+    );
+
+NTSYSAPI
+HANDLE
+NTAPI
+CreateFileMappingA(
+    IN HANDLE hFile,
+    IN PVOID lpFileMappingAttributes,
+    IN DWORD flProtect,
+    IN DWORD dwMaximumSizeHigh,
+    IN DWORD dwMaximumSizeLow,
+    IN LPCSTR lpName
+    );
+#define CreateFileMapping  CreateFileMappingA
+
+NTSYSAPI
+DWORD
+NTAPI
+GetFileSize(
+    IN HANDLE hFile,
+    OUT DWORD * lpFileSizeHigh
+    );
+
+NTSYSAPI
+PVOID
+NTAPI
+MapViewOfFile(
+    IN HANDLE hFileMappingObject,
+    IN DWORD dwDesiredAccess,
+    IN DWORD dwFileOffsetHigh,
+    IN DWORD dwFileOffsetLow,
+    IN SIZE_T dwNumberOfBytesToMap
+    );
+
+NTSYSAPI
+BOOL
+NTAPI
+UnmapViewOfFile(
+    IN PVOID lpBaseAddress
+    );
+
+#endif /* __KERNEL__ */
+
+typedef struct {
+       void    *d;
+} cfs_dentry_t;
+
+
+#endif /* __LIBCFS_WINNT_CFS_FS_H__*/
diff --git a/lnet/include/libcfs/winnt/winnt-lock.h b/lnet/include/libcfs/winnt/winnt-lock.h
new file mode 100644 (file)
index 0000000..e0b9393
--- /dev/null
@@ -0,0 +1,686 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines.
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_CFS_LOCK_H__
+#define __LIBCFS_WINNT_CFS_LOCK_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <libcfs/libcfs.h> instead
+#endif
+
+#ifdef __KERNEL__
+
+
+/*
+ *  nt specific part ...
+ */
+
+
+/* atomic */
+
+typedef struct { volatile int counter; } atomic_t;
+
+#define ATOMIC_INIT(i) { i }
+
+#define atomic_read(v) ((v)->counter)
+#define atomic_set(v,i)                (((v)->counter) = (i))
+
+void FASTCALL atomic_add(int i, atomic_t *v);
+void FASTCALL atomic_sub(int i, atomic_t *v);
+
+int FASTCALL atomic_sub_and_test(int i, atomic_t *v);
+
+void FASTCALL atomic_inc(atomic_t *v);
+void FASTCALL atomic_dec(atomic_t *v);
+
+int FASTCALL atomic_dec_and_test(atomic_t *v);
+int FASTCALL atomic_inc_and_test(atomic_t *v);
+
+
+/* event */
+
+typedef KEVENT          event_t;
+
+/*
+ * cfs_init_event
+ *   To initialize the event object
+ *
+ * Arguments:
+ *   event:  pointer to the event object
+ *   type:   Non Zero: SynchronizationEvent
+ *           Zero: NotificationEvent
+ *   status: the initial stats of the event
+ *           Non Zero: signaled
+ *           Zero: un-signaled
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+static inline void
+    cfs_init_event(event_t *event, int type, int status)
+{
+    KeInitializeEvent(
+            event,
+            (type) ? SynchronizationEvent: NotificationEvent,
+            (status) ? TRUE : FALSE
+            );
+}
+
+/*
+ * cfs_wait_event
+ *   To wait on an event to syncrhonize the process
+ *
+ * Arguments:
+ *   event:  pointer to the event object
+ *   timeout: the timeout for waiting, or 0 meaning an infinite wait.
+ *
+ * Return Value:
+ *   Zero:   waiting timeouts
+ *   Non Zero: event signaled ...
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline int64_t
+cfs_wait_event(event_t * event, int64_t timeout)
+{
+    NTSTATUS        Status;
+    LARGE_INTEGER   TimeOut;
+
+    TimeOut.QuadPart = -1 * (10000000/HZ) * timeout;
+
+    Status = KeWaitForSingleObject(
+                event,
+                Executive,
+                KernelMode,
+                FALSE,
+                (timeout != 0) ? (&TimeOut) : (NULL)
+                );
+
+    if (Status == STATUS_TIMEOUT)  {
+        return 0;
+    }
+
+    return TRUE; // signaled case
+}
+
+/*
+ * cfs_wake_event
+ *   To signal the event object
+ *
+ * Arguments:
+ *   event:  pointer to the event object
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline int
+cfs_wake_event(event_t * event)
+{
+    return (KeSetEvent(event, 0, FALSE) != 0);
+}
+
+/*
+ * cfs_clear_event
+ *   To clear/reset the status of the event object
+ *
+ * Arguments:
+ *   event:  pointer to the event object
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void
+cfs_clear_event(event_t * event)
+{
+    KeResetEvent(event);
+}
+
+
+/*
+ * IMPORTANT !!!!!!!!
+ *
+ * Lock declarations are not guaranteed to be initialized here,
+ * although some of them are initialized in Linux.  All locks
+ * declared by CFS_DECL_* should be initialized explicitly.
+ */
+
+
+/*
+ * spin lock defintions / routines
+ */
+
+/*
+ * Warning:
+ *
+ * for spinlock operations, try to grab nesting acquisition of
+ * spinlock will cause dead-lock in MP system and current irql 
+ * overwritten for UP system. (UP system could allow nesting spin
+ * acqisition, because it's not spin at all just raising the irql.)
+ *
+ */
+
+typedef struct spin_lock {
+
+    KSPIN_LOCK lock;
+    KIRQL      irql;
+
+} spinlock_t;
+
+
+#define CFS_DECL_SPIN(name)  spinlock_t name;
+#define CFS_DECL_SPIN_EXTERN(name)  extern spinlock_t name;
+
+
+static inline void spin_lock_init(spinlock_t *lock)
+{
+    KeInitializeSpinLock(&(lock->lock));
+}
+
+
+static inline void spin_lock(spinlock_t *lock)
+{
+    KeAcquireSpinLock(&(lock->lock), &(lock->irql));
+}
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+    KIRQL       irql = lock->irql;
+    KeReleaseSpinLock(&(lock->lock), irql);
+}
+
+
+#define spin_lock_irqsave(lock, flags)         do {(flags) = 0; spin_lock(lock);} while(0)
+#define spin_unlock_irqrestore(lock, flags)    do {spin_unlock(lock);} while(0)
+
+
+/* There's no  corresponding routine in windows kernel.
+   We must realize a light one of our own.  But there's
+   no way to identify the system is MP build or UP build
+   on the runtime. We just uses a workaround for it. */
+
+extern int MPSystem;
+
+/*
+ * Try to take 'lock' without spinning.  IRQL is raised to
+ * DISPATCH_LEVEL first; on success the previous IRQL is remembered in
+ * lock->irql for the matching spin_unlock(), on failure it is restored.
+ * Returns non-zero on success, 0 when the lock is already held.
+ *
+ * NOTE(review): deliberately NOT 'inline' -- the x86 __asm below
+ * addresses the argument as [ebp + 8], which is only valid in a normal
+ * stack frame; inlining (or /Oy frame-pointer omission) would break it.
+ */
+static int spin_trylock(spinlock_t *lock)
+{
+    KIRQL   Irql;
+    int     rc = 0;
+
+    ASSERT(lock != NULL);
+
+    KeRaiseIrql(DISPATCH_LEVEL, &Irql);
+
+    if (MPSystem) {
+        if (0 == (ulong_ptr)lock->lock) {
+#if _X86_
+            /* atomic bit-test-and-set on the lock word; CF set => lost race */
+            __asm {
+                mov  edx, dword ptr [ebp + 8]
+                lock bts dword ptr[edx], 0
+                jb   lock_failed
+                mov  rc, TRUE
+            lock_failed:
+            }
+#else
+        KdBreakPoint();
+#endif
+
+        }
+    } else {
+        /* UP build: raising IRQL alone is sufficient mutual exclusion */
+        rc = TRUE;
+    }
+
+    if (rc) {
+        lock->irql = Irql;
+    } else {
+        KeLowerIrql(Irql);
+    }
+
+    return rc;
+}
+
+/* synchronization between cpus: it will disable all DPCs
+   kernel task scheduler on the CPU */
+#define spin_lock_bh(x)                    spin_lock(x)
+#define spin_unlock_bh(x)          spin_unlock(x)
+#define spin_lock_bh_init(x)   spin_lock_init(x)
+
+/*
+ * rw_semaphore (using ERESOURCE)
+ */
+
+
+typedef struct rw_semaphore {
+    ERESOURCE   rwsem;
+} rw_semaphore_t;
+
+
+#define CFS_DECL_RWSEM(name) rw_semaphore_t name
+#define CFS_DECL_RWSEM_EXTERN(name) extern rw_semaphore_t name
+
+
+/*
+ * init_rwsem
+ *   To initialize the the rw_semaphore_t structure
+ *
+ * Arguments:
+ *   rwsem:  pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void init_rwsem(rw_semaphore_t *s)
+{
+       ExInitializeResourceLite(&s->rwsem);
+}
+
+
+/*
+ * fini_rwsem
+ *   To finilize/destroy the the rw_semaphore_t structure
+ *
+ * Arguments:
+ *   rwsem:  pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   For winnt system, we need this routine to delete the ERESOURCE.
+ *   Just define it NULL for other systems.
+ */
+
+static inline void fini_rwsem(rw_semaphore_t *s)
+{
+    ExDeleteResourceLite(&s->rwsem);
+}
+
+/*
+ * down_read
+ *   To acquire read-lock of the rw_semahore
+ *
+ * Arguments:
+ *   rwsem:  pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void down_read(struct rw_semaphore *s)
+{
+       ExAcquireResourceSharedLite(&s->rwsem, TRUE);
+}
+
+
+/*
+ * down_read_trylock
+ *   To acquire read-lock of the rw_semahore without blocking
+ *
+ * Arguments:
+ *   rwsem:  pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   Zero: failed to acquire the read lock
+ *   Non-Zero: succeeded to acquire the read lock
+ *
+ * Notes: 
+ *   This routine will return immediately without waiting.
+ */
+
+static inline int down_read_trylock(struct rw_semaphore *s)
+{
+       return ExAcquireResourceSharedLite(&s->rwsem, FALSE);
+}
+
+
+/*
+ * down_write
+ *   To acquire write-lock of the rw_semahore
+ *
+ * Arguments:
+ *   rwsem:  pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void down_write(struct rw_semaphore *s)
+{
+       ExAcquireResourceExclusiveLite(&(s->rwsem), TRUE);
+}
+
+
+/*
+ * down_write_trylock
+ *   To acquire the write-lock of the rw_semaphore without blocking
+ *
+ * Arguments:
+ *   rwsem:  pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   Zero: failed to acquire the write lock
+ *   Non-Zero: succeeded to acquire the write lock
+ *
+ * Notes: 
+ *   This routine will return immediately without waiting.
+ */
+
+static inline int down_write_trylock(struct rw_semaphore *s)
+{
+    return ExAcquireResourceExclusiveLite(&(s->rwsem), FALSE);
+}
+
+
+/*
+ * up_read
+ *   To release read-lock of the rw_semahore
+ *
+ * Arguments:
+ *   rwsem:  pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void up_read(struct rw_semaphore *s)
+{
+    ExReleaseResourceForThreadLite(
+            &(s->rwsem),
+            ExGetCurrentResourceThread());
+}
+
+
+/*
+ * up_write
+ *   To release write-lock of the rw_semahore
+ *
+ * Arguments:
+ *   rwsem:  pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void up_write(struct rw_semaphore *s)
+{
+    ExReleaseResourceForThreadLite(
+                &(s->rwsem),
+                ExGetCurrentResourceThread());
+}
+
+/*
+ * rwlock_t (using a spinlock guard plus a reader count)
+ *
+ * - rwlock_init(x)
+ * - read_lock(x)
+ * - read_unlock(x)
+ * - write_lock(x)
+ * - write_unlock(x)
+ */
+
+typedef struct {
+    spinlock_t guard;
+    int        count;
+} rwlock_t;
+
+void rwlock_init(rwlock_t * rwlock);
+void rwlock_fini(rwlock_t * rwlock);
+
+void read_lock(rwlock_t * rwlock);
+void read_unlock(rwlock_t * rwlock);
+void write_lock(rwlock_t * rwlock);
+void write_unlock(rwlock_t * rwlock);
+
+#define write_lock_irqsave(l, f)        do {f = 0; write_lock(l);} while(0)
+#define write_unlock_irqrestore(l, f)   do {write_unlock(l);} while(0)
+#define read_lock_irqsave(l, f)                do {f=0; read_lock(l);} while(0)
+#define read_unlock_irqrestore(l, f)    do {read_unlock(l);} while(0)
+
+
+/*
+ * Semaphore
+ *
+ * - sema_init(x, v)
+ * - __down(x)
+ * - __up(x)
+ *
+ * NOTE(review): struct semaphore is typedef'ed to mutex_t -- the
+ * mutex wrappers further below are built on this counting semaphore.
+ */
+
+typedef struct semaphore {
+	KSEMAPHORE sem;
+} mutex_t;
+
+/* Initialize with both initial count and limit set to 'val'.
+ * NOTE(review): KeInitializeSemaphore requires Limit >= 1, so
+ * sema_init(s, 0) would be invalid -- confirm no caller does that
+ * (init_mutex() passes 1). */
+static inline void sema_init(struct semaphore *s, int val)
+{
+	KeInitializeSemaphore(&s->sem, val, val);
+}
+
+/* Block (kernel mode, non-alertable, no timeout) until signaled. */
+static inline void __down(struct semaphore *s)
+{
+   KeWaitForSingleObject( &(s->sem), Executive,
+                          KernelMode, FALSE, NULL );
+
+}
+
+/* Release one count; priority increment 0 = no boost for waiters. */
+static inline void __up(struct semaphore *s)
+{
+	KeReleaseSemaphore(&s->sem, 0, 1, FALSE);
+}
+
+/*
+ * mutex_t:
+ *
+ * - init_mutex(x)
+ * - init_mutex_locked(x)
+ * - mutex_up(x)
+ * - mutex_down(x)
+ */
+
+
+/*
+ * init_mutex
+ *   To initialize a mutex_t structure
+ *
+ * Arguments:
+ *   mutex:  pointer to the mutex_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void init_mutex(mutex_t *mutex)
+{
+    sema_init(mutex, 1);
+}
+
+
+/*
+ * mutex_down
+ *   To acquire the mutex lock
+ *
+ * Arguments:
+ *   mutex:  pointer to the mutex_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void mutex_down(mutex_t *mutex)
+{
+    __down(mutex);
+}
+
+
+/*
+ * mutex_up
+ *   To release the mutex lock (acquired already)
+ *
+ * Arguments:
+ *   mutex:  pointer to the mutex_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void mutex_up(mutex_t *mutex)
+{
+    __up(mutex);
+}
+
+
+/*
+ * init_mutex_locked
+ *   To initialize the mutex in the acquired (locked) state
+ *
+ * Arguments:
+ *   mutex:  pointer to the mutex_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   The caller owns the mutex on return and must release it with
+ *   mutex_up().  Fixed: the declaration was missing its 'void'
+ *   return type (implicit int is invalid in C99, and the function
+ *   never returned a value).
+ */
+
+static inline void init_mutex_locked(mutex_t *mutex)
+{
+    init_mutex(mutex);
+    mutex_down(mutex);
+}
+
+/*
+ * completion
+ *
+ * - init_complition(c)
+ * - complete(c)
+ * - wait_for_completion(c)
+ */
+
+struct completion {
+       event_t  event;
+};
+
+
+/*
+ * init_completion
+ *   To initialize the completion object
+ *
+ * Arguments:
+ *   c:  pointer to the completion structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void init_completion(struct completion *c)
+{
+       cfs_init_event(&(c->event), 1, FALSE);
+}
+
+
+/*
+ * complete
+ *   To complete/signal the completion object
+ *
+ * Arguments:
+ *   c:  pointer to the completion structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+static inline void complete(struct completion *c)
+{
+       cfs_wake_event(&(c->event));
+}
+
+/*
+ * wait_for_completion
+ *   To wait on the completion object. If the event is signaled,
+ *   this function will return to the caller with the event
+ *   un-signaled again.
+ *
+ * Arguments:
+ *   c:  pointer to the completion structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   NOTE(review): the second argument to cfs_wait_event is 0 --
+ *   presumably "no timeout / wait forever"; confirm against the
+ *   cfs_wait_event implementation.
+ */
+
+static inline void wait_for_completion(struct completion *c)
+{
+    cfs_wait_event(&(c->event), 0);
+}
+
+/* __KERNEL__ */
+#else
+
+#include "../user-lock.h"
+
+/* __KERNEL__ */
+#endif
+#endif
diff --git a/lnet/include/libcfs/winnt/winnt-mem.h b/lnet/include/libcfs/winnt/winnt-mem.h
new file mode 100644 (file)
index 0000000..b7f00a4
--- /dev/null
@@ -0,0 +1,133 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines of memory manipulation routines .
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_CFS_MEM_H__
+#define __LIBCFS_WINNT_CFS_MEM_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <libcfs/libcfs.h> instead
+#endif
+
+#ifdef __KERNEL__
+
+#define CFS_PAGE_SIZE                   PAGE_SIZE
+#define CFS_PAGE_SHIFT                  PAGE_SHIFT
+#define CFS_PAGE_MASK                   (~(PAGE_SIZE - 1))
+
+typedef struct cfs_page {
+    void *      addr;
+    atomic_t    count;
+} cfs_page_t;
+
+
+cfs_page_t *cfs_alloc_page(int flags);
+void cfs_free_page(cfs_page_t *pg);
+
+static inline void *cfs_page_address(cfs_page_t *page)
+{
+    return page->addr;
+}
+
+static inline void *cfs_kmap(cfs_page_t *page)
+{
+    return page->addr;
+}
+
+static inline void cfs_kunmap(cfs_page_t *page)
+{
+    return;
+}
+
+static inline void cfs_get_page(cfs_page_t *page)
+{
+    atomic_inc(&page->count);
+}
+
+static inline void cfs_put_page(cfs_page_t *page)
+{
+    atomic_dec(&page->count);
+}
+
+static inline int cfs_page_count(cfs_page_t *page)
+{
+    return atomic_read(&page->count);
+}
+
+/*
+ * Memory allocator
+ */
+
+#define CFS_ALLOC_ATOMIC_TRY   (0)
+
+extern void *cfs_alloc(size_t nr_bytes, u_int32_t flags);
+extern void  cfs_free(void *addr);
+
+extern void *cfs_alloc_large(size_t nr_bytes);
+extern void  cfs_free_large(void *addr);
+
+/*
+ * SLAB allocator
+ */
+
+#define SLAB_HWCACHE_ALIGN             0
+
+/* The cache name is limited to 20 chars */
+
+typedef struct cfs_mem_cache {
+
+    char                    name[20];
+    ulong_ptr           flags;
+    NPAGED_LOOKASIDE_LIST   npll;
+
+} cfs_mem_cache_t;
+
+
+extern cfs_mem_cache_t * cfs_mem_cache_create (const char *, size_t, size_t, ulong_ptr);
+extern int cfs_mem_cache_destroy ( cfs_mem_cache_t * );
+extern void *cfs_mem_cache_alloc ( cfs_mem_cache_t *, int);
+extern void cfs_mem_cache_free ( cfs_mem_cache_t *, void *);
+
+
+/*
+ * Page allocator slabs 
+ */
+
+extern cfs_mem_cache_t *cfs_page_t_slab;
+extern cfs_mem_cache_t *cfs_page_p_slab;
+
+
+#define CFS_DECL_MMSPACE
+#define CFS_MMSPACE_OPEN    do {} while(0)
+#define CFS_MMSPACE_CLOSE   do {} while(0)
+
+
+#define mb()    do {} while(0)
+#define rmb()   mb()
+#define wmb()   mb()
+
+
+/* __KERNEL__ */
+#endif
+
+#endif /* __WINNT_CFS_MEM_H__ */
diff --git a/lnet/include/libcfs/winnt/winnt-prim.h b/lnet/include/libcfs/winnt/winnt-prim.h
new file mode 100644 (file)
index 0000000..3c8560b
--- /dev/null
@@ -0,0 +1,1082 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines.
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_CFS_PRIM_H__
+#define __LIBCFS_WINNT_CFS_PRIM_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <libcfs/libcfs.h> instead
+#endif
+
+
+/*
+ * libcfs proc device object
+ */
+
+
+#define LUSTRE_PROC_DEVICE  L"\\Device\\lproc"      /* proc fs emulator device object */
+#define LUSTRE_PROC_SYMLNK  L"\\DosDevices\\lproc"  /* proc fs user-visible device */
+
+
+/*
+ * Device IO Control Code Definitions
+ */
+
+#define FILE_DEVICE_LIBCFS      ('LC')  /* device type for libcfs ioctls; accidental duplicate definition removed */
+
+#define FUNC_LIBCFS_VERSION     0x101  // get version of current libcfs
+#define FUNC_LIBCFS_IOCTL       0x102  // Device i/o control to proc fs
+
+
+#define IOCTL_LIBCFS_VERSION \
+     CTL_CODE (FILE_DEVICE_LIBCFS, FUNC_LIBCFS_VERSION, METHOD_BUFFERED, FILE_ANY_ACCESS)
+#define IOCTL_LIBCFS_ENTRY   \
+     CTL_CODE(FILE_DEVICE_LIBCFS, FUNC_LIBCFS_IOCTL,   METHOD_BUFFERED, FILE_ANY_ACCESS)
+
+#pragma pack(4)
+
+typedef struct _CFS_PROC_IOCTL {
+
+    ULONG           cmd;    // ioctl command identifier
+    ULONG           len;    // length of data
+
+    // UCHAR        data[]; // content of the real ioctl
+
+} CFS_PROC_IOCTL, *PCFS_PROC_IOCTL;
+
+#pragma pack()
+
+#ifdef __KERNEL__
+
+#include <libcfs/list.h>
+
+/*
+ * Symbol functions for libcfs
+ *
+ * OSX has no facility for use to register symbol.
+ * So we have to implement it.
+ */
+#define CFS_SYMBOL_LEN     64
+
+struct  cfs_symbol {
+       char    name[CFS_SYMBOL_LEN];
+       void    *value;
+       int     ref;
+       struct  list_head sym_list;
+};
+
+extern int      cfs_symbol_register(const char *, const void *);
+extern void     cfs_symbol_unregister(const char *);
+extern void *   cfs_symbol_get(const char *);
+extern void     cfs_symbol_put(const char *);
+extern void     cfs_symbol_clean();
+
+
+
+typedef struct file_operations cfs_file_operations_t;
+typedef struct file cfs_file_t;
+
+/*
+ * Pseudo device register
+ */
+
+typedef struct
+{
+    int                     minor;
+    const char *            name;
+    cfs_file_operations_t * fops;
+} cfs_psdev_t;
+
+int cfs_psdev_register(cfs_psdev_t * psdev);
+int cfs_psdev_deregister(cfs_psdev_t * psdev);
+
+
+/*
+ * Proc emulator file system APIs
+ */
+
+typedef int cfs_read_proc_t(char *page, char **start, off_t off,
+                         int count, int *eof, void *data);
+typedef int cfs_write_proc_t(struct file *file, const char *buffer,
+                          ulong_ptr count, void *data);
+
+#define CFS_PROC_ENTRY_MAGIC 'CPEM'
+
+#define CFS_PROC_FLAG_DIRECTORY    0x00000001 // directory node
+#define CFS_PROC_FLAG_ATTACHED     0x00000002 // node is attached to proc
+#define CFS_PROC_FLAG_MISCDEV      0x00000004 // miscellaneous device
+
+typedef struct cfs_proc_entry
+{
+    ULONG                   magic;      // Magic
+    ULONG                   flags;      // Flags
+
+    struct _dir_entry {                 // proc directory entry
+        PRTL_SPLAY_LINKS    root;
+    };
+
+    struct _file_entry {                // proc file / leaf entry
+           cfs_read_proc_t  *  read_proc;
+           cfs_write_proc_t *  write_proc;
+    };
+
+    mode_t                  mode;
+    unsigned short          nlink;
+
+       
+    struct file_operations * proc_fops;
+       void * data;
+
+    // proc_dir_entry ended.
+
+    RTL_SPLAY_LINKS         s_link;       // splay link
+
+    //
+    // Maximum length of proc entry name is 0x20
+    //
+
+    char                    name[0x20];
+
+} cfs_proc_entry_t, cfs_proc_dir_entry_t;
+
+/* NOTE: cfs_proc_dir_entry_t is already introduced by the struct
+ * typedef above; the redundant 'typedef cfs_proc_entry_t
+ * cfs_proc_dir_entry_t;' was removed -- typedef redefinition is a
+ * compile error in C before C11. */
+
+#define PROC_BLOCK_SIZE    PAGE_SIZE
+
+/*
+ * Sysctl register
+ */
+
+typedef struct ctl_table                   cfs_sysctl_table_t;
+typedef struct ctl_table_header                cfs_sysctl_table_header_t;
+
+
+typedef int ctl_handler (
+            cfs_sysctl_table_t *table,
+            int *name,    int nlen,
+                       void *oldval, size_t *oldlenp,
+                       void *newval, size_t newlen, 
+                       void **context );
+
+typedef int proc_handler (
+            cfs_sysctl_table_t *ctl,
+            int write, struct file * filp,
+                       void *buffer, size_t *lenp );
+
+
+int proc_dointvec(cfs_sysctl_table_t *table, int write, struct file *filp,
+                    void *buffer, size_t *lenp);
+
+int proc_dostring(cfs_sysctl_table_t *table, int write, struct file *filp,
+                 void *buffer, size_t *lenp);
+
+int sysctl_string(cfs_sysctl_table_t *table, int *name, int nlen,
+                 void *oldval, size_t *oldlenp,
+                 void *newval, size_t newlen, void **context);
+
+
+/*
+ *  System io control definitions
+ */
+
+#define CTL_MAXNAME 10
+
+#define CTL_ANY     -1  /* Matches any name */
+#define CTL_NONE    0
+
+enum
+{
+    CTL_KERN=1,     /* General kernel info and control */
+    CTL_VM=2,       /* VM management */
+    CTL_NET=3,      /* Networking */
+    CTL_PROC=4,     /* Process info */
+    CTL_FS=5,       /* Filesystems */
+    CTL_DEBUG=6,        /* Debugging */
+    CTL_DEV=7,      /* Devices */
+    CTL_BUS=8,      /* Busses */
+    CTL_ABI=9,      /* Binary emulation */
+    CTL_CPU=10      /* CPU stuff (speed scaling, etc) */
+};
+
+/* sysctl table definitions */
+struct ctl_table 
+{
+       int ctl_name;
+       char *procname;
+       void *data;
+       int maxlen;
+       mode_t mode;
+       cfs_sysctl_table_t *child;
+       proc_handler *proc_handler;     /* text formatting callback */
+       ctl_handler *strategy;          /* read / write callback functions */
+       cfs_proc_entry_t *de;   /* proc entry block */
+       void *extra1;
+       void *extra2;
+};
+
+
+/* the maintainer of the cfs_sysctl_table trees */
+struct ctl_table_header
+{
+       cfs_sysctl_table_t *    ctl_table;
+       struct list_head        ctl_entry;
+};
+
+
+cfs_proc_entry_t * create_proc_entry(char *name, mode_t mod,
+                                         cfs_proc_entry_t *parent);
+void proc_free_entry(cfs_proc_entry_t *de);
+void remove_proc_entry(char *name, cfs_proc_entry_t *entry);
+cfs_proc_entry_t * search_proc_entry(char * name,
+                        cfs_proc_entry_t *  root );
+
+#define cfs_create_proc_entry create_proc_entry
+#define cfs_free_proc_entry   proc_free_entry
+#define cfs_remove_proc_entry remove_proc_entry
+
+#define register_cfs_sysctl_table(t, a)        register_sysctl_table(t, a)
+/* fixed: the unregister wrapper expanded to unregister_sysctl_table(t, a)
+ * with 'a' unbound (not a macro parameter) -- any use would fail to
+ * compile; unregister_sysctl_table() takes only the table header */
+#define unregister_cfs_sysctl_table(t) unregister_sysctl_table(t)
+
+
+/*
+ *  declaration of proc kernel process routines
+ */
+
+cfs_file_t *
+lustre_open_file(char * filename);
+
+int
+lustre_close_file(cfs_file_t * fh);
+
+int
+lustre_do_ioctl( cfs_file_t * fh,
+                 unsigned long cmd,
+                 ulong_ptr arg );
+
+int
+lustre_ioctl_file( cfs_file_t * fh,
+                   PCFS_PROC_IOCTL devctl);
+
+size_t
+lustre_read_file( cfs_file_t *    fh,
+                  loff_t          off,
+                  size_t          size,
+                  char *          buf
+                  );
+
+size_t
+lustre_write_file( cfs_file_t *    fh,
+                   loff_t          off,
+                   size_t          size,
+                   char *          buf
+                   );
+
+/*
+ * Wait Queue
+ */
+
+
+typedef int cfs_task_state_t;
+
+#define CFS_TASK_INTERRUPTIBLE 0x00000001
+#define CFS_TASK_UNINT         0x00000002
+
+
+
+#define CFS_WAITQ_MAGIC     'CWQM'
+#define CFS_WAITLINK_MAGIC  'CWLM'
+
+typedef struct cfs_waitq {
+
+    unsigned int        magic;
+    unsigned int        flags;
+    
+    spinlock_t          guard;
+    struct list_head    waiters;
+
+} cfs_waitq_t;
+
+
+typedef struct cfs_waitlink cfs_waitlink_t;
+
+#define CFS_WAITQ_CHANNELS     (2)
+
+#define CFS_WAITQ_CHAN_NORMAL  (0)
+#define CFS_WAITQ_CHAN_FORWARD (1)
+
+
+
+typedef struct cfs_waitlink_channel {
+    struct list_head        link;
+    cfs_waitq_t *           waitq;
+    cfs_waitlink_t *        waitl;
+} cfs_waitlink_channel_t;
+
+struct cfs_waitlink {
+
+    unsigned int            magic;
+    int                     flags;
+    event_t  *              event;
+    atomic_t *              hits;
+
+    cfs_waitlink_channel_t  waitq[CFS_WAITQ_CHANNELS];
+};
+
+enum {
+       CFS_WAITQ_EXCLUSIVE = 1
+};
+
+#define CFS_DECL_WAITQ(name) cfs_waitq_t name
+
+
+void cfs_waitq_init(struct cfs_waitq *waitq);
+void cfs_waitlink_init(struct cfs_waitlink *link);
+
+void cfs_waitq_add(struct cfs_waitq *waitq, struct cfs_waitlink *link);
+void cfs_waitq_add_exclusive(struct cfs_waitq *waitq, 
+                            struct cfs_waitlink *link);
+void cfs_waitq_forward(struct cfs_waitlink *link, struct cfs_waitq *waitq);
+void cfs_waitq_del(struct cfs_waitq *waitq, struct cfs_waitlink *link);
+int  cfs_waitq_active(struct cfs_waitq *waitq);
+
+void cfs_waitq_signal(struct cfs_waitq *waitq);
+void cfs_waitq_signal_nr(struct cfs_waitq *waitq, int nr);
+void cfs_waitq_broadcast(struct cfs_waitq *waitq);
+
+void cfs_waitq_wait(struct cfs_waitlink *link, cfs_task_state_t state);
+cfs_duration_t cfs_waitq_timedwait(struct cfs_waitlink *link, 
+                                  cfs_task_state_t state, cfs_duration_t timeout);
+
+
+
+/* Kernel thread */
+
+typedef int (*cfs_thread_t) (void *arg);
+
+typedef struct _cfs_thread_context {
+    cfs_thread_t        func;
+    void *              arg;
+} cfs_thread_context_t;
+
+int cfs_kernel_thread(int (*func)(void *), void *arg, int flag);
+
+/*
+ * thread creation flags from Linux, not used in winnt
+ */
+#define CSIGNAL         0x000000ff      /* signal mask to be sent at exit */
+#define CLONE_VM        0x00000100      /* set if VM shared between processes */
+#define CLONE_FS        0x00000200      /* set if fs info shared between processes */
+#define CLONE_FILES     0x00000400      /* set if open files shared between processes */
+#define CLONE_SIGHAND   0x00000800      /* set if signal handlers and blocked signals shared */
+#define CLONE_PID       0x00001000      /* set if pid shared */
+#define CLONE_PTRACE    0x00002000      /* set if we want to let tracing continue on the child too */
+#define CLONE_VFORK     0x00004000      /* set if the parent wants the child to wake it up on mm_release */
+#define CLONE_PARENT    0x00008000      /* set if we want to have the same parent as the cloner */
+#define CLONE_THREAD    0x00010000      /* Same thread group? */
+#define CLONE_NEWNS     0x00020000      /* New namespace group? */
+
+#define CLONE_SIGNAL    (CLONE_SIGHAND | CLONE_THREAD)
+
+
+/*
+ * sigset ...
+ */
+
+typedef sigset_t cfs_sigset_t;
+
+/*
+ * Task struct
+ */
+
+#define MAX_SCHEDULE_TIMEOUT    ((long_ptr)(~0UL>>12))
+
+
+#define NGROUPS 1
+#define CFS_CURPROC_COMM_MAX (16)
+typedef struct task_sruct{
+    mode_t umask;
+
+       pid_t pid;
+       pid_t pgrp;
+
+       uid_t uid,euid,suid,fsuid;
+       gid_t gid,egid,sgid,fsgid;
+
+       int ngroups;
+       gid_t   groups[NGROUPS];
+       cfs_kernel_cap_t   cap_effective,
+                       cap_inheritable,
+                       cap_permitted;
+
+       char comm[CFS_CURPROC_COMM_MAX];
+    void * journal_info;
+}  cfs_task_t;
+
+
+/*
+ *  linux task struct emulator ...
+ */
+
+#define TASKMAN_MAGIC  'TMAN'   /* Task Manager */
+#define TASKSLT_MAGIC  'TSLT'   /* Task Slot */
+
+typedef struct _TASK_MAN {
+
+    ULONG       Magic;      /* Magic and Flags */
+    ULONG       Flags;
+
+    spinlock_t  Lock;       /* Protection lock */
+
+    cfs_mem_cache_t * slab; /* Memory slab for task slot */
+
+    ULONG       NumOfTasks; /* Total tasks (threads) */
+    LIST_ENTRY  TaskList;   /* List of task slots */
+
+} TASK_MAN, *PTASK_MAN;
+
+typedef struct _TASK_SLOT {
+
+    ULONG       Magic;      /* Magic and Flags */
+    ULONG       Flags;
+
+    LIST_ENTRY  Link;       /* To be linked to TaskMan */
+
+    event_t     Event;      /* Schedule event */
+
+    HANDLE      Pid;        /* Process id */
+    HANDLE      Tid;        /* Thread id */
+    PETHREAD    Tet;        /* Pointer to ethread */
+
+    atomic_t    count;      /* refer count */
+    atomic_t    hits;       /* times of waken event singaled */
+
+    KIRQL       irql;       /* irql for rwlock ... */
+
+    cfs_task_t  task;       /* linux task part */
+
+} TASK_SLOT, *PTASK_SLOT;
+
+
+#define current                 cfs_current()
+#define set_current_state(s)   do {;} while (0)
+
+#define wait_event(wq, condition)                           \
+do {                                                        \
+    cfs_waitlink_t __wait;                                     \
+                                                            \
+    cfs_waitlink_init(&__wait);                                    \
+       while (TRUE) {                                          \
+               cfs_waitq_add(&wq, &__wait);                        \
+               if (condition)  {                                           \
+                       break;                                                  \
+        }                                                   \
+               cfs_waitq_wait(&__wait, CFS_TASK_INTERRUPTIBLE);        \
+               cfs_waitq_del(&wq, &__wait);                        \
+       }                                                                           \
+       cfs_waitq_del(&wq, &__wait);                                \
+} while(0)
+
+#define wait_event_interruptible(wq, condition, __ret)      \
+do {                                                        \
+    cfs_waitlink_t __wait;                                     \
+                                                            \
+    __ret = 0;                                              \
+    cfs_waitlink_init(&__wait);                                    \
+       while (TRUE) {                                          \
+               cfs_waitq_add(&wq, &__wait);                        \
+               if (condition)  {                                           \
+                       break;                                                  \
+        }                                                   \
+               cfs_waitq_wait(&__wait, CFS_TASK_INTERRUPTIBLE);    \
+               cfs_waitq_del(&wq, &__wait);                        \
+       }                                                                           \
+       cfs_waitq_del(&wq, &__wait);                                \
+} while(0)
+
+
+int     init_task_manager();
+void    cleanup_task_manager();
+cfs_task_t * cfs_current();
+int     schedule_timeout(int64_t time);
+int     schedule();
+int     wake_up_process(cfs_task_t * task);
+#define cfs_schedule_timeout(state, time)  schedule_timeout(time)
+void sleep_on(cfs_waitq_t *waitq);
+
+#define CFS_DECL_JOURNAL_DATA  
+#define CFS_PUSH_JOURNAL           do {;} while(0)
+#define CFS_POP_JOURNAL                    do {;} while(0)
+
+
+/* module related definitions */
+
+#ifndef __exit
+#define __exit
+#endif
+#ifndef __init
+#define __init
+#endif
+
+#define request_module(x) (0)
+
+#define EXPORT_SYMBOL(s)
+#define MODULE_AUTHOR(s)
+#define MODULE_DESCRIPTION(s)
+#define MODULE_LICENSE(s)
+#define MODULE_PARM(a, b)
+#define MODULE_PARM_DESC(a, b)
+
+#define module_init(X) int  __init module_##X() {return X();}
+#define module_exit(X) void __exit module_##X() {X();}
+
+#define DECLARE_INIT(X) extern int  __init  module_##X(void)
+#define DECLARE_EXIT(X) extern void __exit  module_##X(void)
+
+#define MODULE_INIT(X) do { int rc = module_##X(); \
+                            if (rc) goto errorout; \
+                          } while(0)
+
+#define MODULE_EXIT(X) do { module_##X(); } while(0)
+
+
+/* Module interfaces */
+#define cfs_module(name, version, init, fini) \
+module_init(init);                            \
+module_exit(fini)
+
+
+/*
+ *  Linux kernel version definition
+ */
+
+#define KERNEL_VERSION(a,b,c) ((a)*100+(b)*10+c)
+#define LINUX_VERSION_CODE (2*100+6*10+7)
+
+
+/*
+ * Signal
+ */
+#define SIGNAL_MASK_ASSERT()
+
+/*
+ * Timer
+ */
+
+#define CFS_TIMER_FLAG_INITED   0x00000001  // Initialized already
+#define CFS_TIMER_FLAG_TIMERED  0x00000002  // KeSetTimer is called
+
+typedef struct cfs_timer {
+
+    KSPIN_LOCK      Lock;
+
+    ULONG           Flags;
+
+    KDPC            Dpc;
+    KTIMER          Timer;
+
+    cfs_time_t      deadline;
+
+    void (*proc)(ulong_ptr);
+    void *          arg;
+
+} cfs_timer_t;
+
+
+typedef  void (*timer_func_t)(ulong_ptr);
+
+#define cfs_init_timer(t)
+
+void cfs_timer_init(cfs_timer_t *timer, void (*func)(ulong_ptr), void *arg);
+void cfs_timer_done(cfs_timer_t *t);
+void cfs_timer_arm(cfs_timer_t *t, cfs_time_t deadline);
+void cfs_timer_disarm(cfs_timer_t *t);
+int  cfs_timer_is_armed(cfs_timer_t *t);
+cfs_time_t cfs_timer_deadline(cfs_timer_t *t);
+
+
+/* deschedule for a bit...
+ * NOTE(review): TASK_UNINTERRUPTIBLE is not defined anywhere in this
+ * port (only CFS_TASK_UNINT is); this compiles only because the
+ * cfs_schedule_timeout() macro above discards its 'state' argument. */
+static inline void cfs_pause(cfs_duration_t ticks)
+{
+    cfs_schedule_timeout(TASK_UNINTERRUPTIBLE, ticks);
+}
+
+
+/* trap into the kernel debugger: 'int 3' on x86, KdBreakPoint() otherwise */
+static inline void cfs_enter_debugger(void)
+{
+#if _X86_
+    __asm int 3;
+#else
+    KdBreakPoint();
+#endif
+}
+
+/*
+ *  libcfs globals initialization/cleanup
+ */
+
+int
+libcfs_arch_init(void);
+
+void
+libcfs_arch_cleanup(void);
+
+/*
+ * SMP ...
+ */
+
+#define SMP_CACHE_BYTES             128
+#define __cacheline_aligned
+#define NR_CPUS                                            (2)
+#define smp_processor_id()                 KeGetCurrentProcessorNumber()
+#define smp_num_cpus                NR_CPUS
+#define num_online_cpus() smp_num_cpus
+#define smp_call_function(f, a, n, w)          do {} while(0)
+
+/*
+ *  Irp related
+ */
+
+#define NR_IRQS                                    512
+#define in_interrupt()                 (0)
+
+/*
+ *  printk flags
+ */
+
+#define KERN_EMERG      "<0>"   /* system is unusable                   */
+#define KERN_ALERT      "<1>"   /* action must be taken immediately     */
+#define KERN_CRIT       "<2>"   /* critical conditions                  */
+#define KERN_ERR        "<3>"   /* error conditions                     */
+#define KERN_WARNING    "<4>"   /* warning conditions                   */
+#define KERN_NOTICE     "<5>"   /* normal but significant condition     */
+#define KERN_INFO       "<6>"   /* informational                        */
+#define KERN_DEBUG      "<7>"   /* debug-level messages                 */
+
+/*
+ * Misc
+ */
+
+
+#define inter_module_get(n)                    cfs_symbol_get(n)
+#define inter_module_put(n)                    cfs_symbol_put(n)
+
+#ifndef likely
+#define likely(exp) (exp)
+#endif
+#ifndef unlikely
+#define unlikely(exp) (exp)
+#endif
+
+#define lock_kernel()               do {} while(0)
+#define unlock_kernel()             do {} while(0)
+
+#define CAP_SYS_ADMIN                    0
+#define CAP_SYS_ROOT                     1
+
+#define capable(a)                             (TRUE)
+
+#define USERMODEHELPER(path, argv, envp)       (0)
+
+
+#define local_irq_save(x)
+#define local_irq_restore(x)
+
+#define cfs_assert                      ASSERT
+
+#define THREAD_NAME
+
+#else   /* !__KERNEL__ */
+
+#define PAGE_CACHE_SIZE PAGE_SIZE
+#define PAGE_CACHE_MASK PAGE_MASK
+
+#define getpagesize()   (PAGE_SIZE)
+
+
+typedef struct {
+    int foo;
+} pthread_mutex_t;
+
+typedef struct {
+    int foo;
+} pthread_cond_t;
+
+#define pthread_mutex_init(x, y)    do {} while(0)
+#define pthread_cond_init(x, y)     do {} while(0)
+
+#define pthread_mutex_lock(x)       do {} while(0)
+#define pthread_mutex_unlock(x)     do {} while(0)
+
+#define pthread_cond_wait(x,y)      do {} while(0)
+#define pthread_cond_broadcast(x)   do {} while(0)
+
+typedef struct file {
+    int foo;
+} cfs_file_t;
+
+typedef struct cfs_proc_dir_entry{
+       void            *data;
+}cfs_proc_dir_entry_t;
+
+
+
+#include "../user-prim.h"
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#define strcasecmp  strcmp
+#define strncasecmp strncmp
+#define snprintf   _snprintf
+#define getpid()   (0)
+
+
+#define getpwuid(x) (NULL)
+#define getgrgid(x) (NULL)
+
+int cfs_proc_mknod(const char *path, mode_t mode, dev_t dev);
+
+int gethostname(char * name, int namelen);
+
+#define setlinebuf(x) do {} while(0)
+
+
+NTSYSAPI VOID NTAPI DebugBreak();
+
+
+/*
+ * Break into the attached debugger from user mode: an inline "int 3"
+ * software breakpoint on x86 builds, the Win32 DebugBreak() API
+ * (declared NTSYSAPI above) on other architectures.
+ */
+static inline void cfs_enter_debugger(void)
+{
+#if _X86_
+    __asm int 3;        /* MSVC inline-asm breakpoint instruction */
+#else
+    DebugBreak();
+#endif
+}
+
+/* Maximum EA Information Length */
+#define EA_MAX_LENGTH  (sizeof(FILE_FULL_EA_INFORMATION) + 15)
+
+
+/*
+ *  proc user mode routines
+ */
+
+HANDLE cfs_proc_open (char * filename, int oflag);
+int cfs_proc_close(HANDLE handle);
+int cfs_proc_read(HANDLE handle, void *buffer, unsigned int count);
+int cfs_proc_write(HANDLE handle, void *buffer, unsigned int count);
+int cfs_proc_ioctl(HANDLE handle, int cmd, void *buffer);
+
+
+/*
+ * Native API definitions
+ */
+
+//
+//  Disk I/O Routines
+//
+
+NTSYSAPI
+NTSTATUS
+NTAPI
+NtReadFile(HANDLE FileHandle,
+    HANDLE Event OPTIONAL,
+    PIO_APC_ROUTINE ApcRoutine OPTIONAL,
+    PVOID ApcContext OPTIONAL,
+    PIO_STATUS_BLOCK IoStatusBlock,
+    PVOID Buffer,
+    ULONG Length,
+    PLARGE_INTEGER ByteOffset OPTIONAL,
+    PULONG Key OPTIONAL);
+
+NTSYSAPI
+NTSTATUS
+NTAPI
+NtWriteFile(HANDLE FileHandle,
+    HANDLE Event OPTIONAL,
+    PIO_APC_ROUTINE ApcRoutine OPTIONAL,
+    PVOID ApcContext OPTIONAL,
+    PIO_STATUS_BLOCK IoStatusBlock,
+    PVOID Buffer,
+    ULONG Length,
+    PLARGE_INTEGER ByteOffset OPTIONAL,
+    PULONG Key OPTIONAL);
+
+NTSYSAPI
+NTSTATUS
+NTAPI
+NtClose(HANDLE Handle);
+
+NTSYSAPI
+NTSTATUS
+NTAPI
+NtCreateFile(PHANDLE FileHandle,
+    ACCESS_MASK DesiredAccess,
+    POBJECT_ATTRIBUTES ObjectAttributes,
+    PIO_STATUS_BLOCK IoStatusBlock,
+    PLARGE_INTEGER AllocationSize OPTIONAL,
+    ULONG FileAttributes,
+    ULONG ShareAccess,
+    ULONG CreateDisposition,
+    ULONG CreateOptions,
+    PVOID EaBuffer OPTIONAL,
+    ULONG EaLength);
+
+
+NTSYSAPI
+NTSTATUS
+NTAPI
+NtDeviceIoControlFile(
+    IN HANDLE  FileHandle,
+    IN HANDLE  Event,
+    IN PIO_APC_ROUTINE  ApcRoutine,
+    IN PVOID  ApcContext,
+    OUT PIO_STATUS_BLOCK  IoStatusBlock,
+    IN ULONG  IoControlCode,
+    IN PVOID  InputBuffer,
+    IN ULONG  InputBufferLength,
+    OUT PVOID  OutputBuffer,
+    OUT ULONG  OutputBufferLength
+    ); 
+
+NTSYSAPI
+NTSTATUS
+NTAPI
+NtFsControlFile(
+    IN HANDLE FileHandle,
+    IN HANDLE Event OPTIONAL,
+    IN PIO_APC_ROUTINE ApcRoutine OPTIONAL,
+    IN PVOID ApcContext OPTIONAL,
+    OUT PIO_STATUS_BLOCK IoStatusBlock,
+    IN ULONG FsControlCode,
+    IN PVOID InputBuffer OPTIONAL,
+    IN ULONG InputBufferLength,
+    OUT PVOID OutputBuffer OPTIONAL,
+    IN ULONG OutputBufferLength
+);
+
+
+NTSYSAPI
+NTSTATUS
+NTAPI
+NtQueryInformationFile(
+    IN HANDLE  FileHandle,
+    OUT PIO_STATUS_BLOCK  IoStatusBlock,
+    OUT PVOID  FileInformation,
+    IN ULONG  Length,
+    IN FILE_INFORMATION_CLASS  FileInformationClass
+    );
+
+//
+// Random routines ...
+//
+
+NTSYSAPI
+ULONG
+NTAPI
+RtlRandom(
+    IN OUT PULONG  Seed
+    ); 
+
+#endif /* __KERNEL__ */
+
+
+//
+// Inode mode flags (Linux defines these in octal; hexadecimal is used here)
+//
+
+#undef S_IFMT
+#undef S_IFDIR
+#undef S_IFCHR
+#undef S_IFREG
+#undef S_IREAD
+#undef S_IWRITE
+#undef S_IEXEC
+
+#define S_IFMT   0x0F000            /* 017 0000 */
+#define S_IFSOCK 0x0C000            /* 014 0000 */
+#define S_IFLNK  0x0A000            /* 012 0000 */
+#define S_IFREG  0x08000            /* 010 0000 */
+#define S_IFBLK  0x06000            /* 006 0000 */
+#define S_IFDIR  0x04000            /* 004 0000 */
+#define S_IFCHR  0x02000            /* 002 0000 */
+#define S_IFIFO  0x01000            /* 001 0000 */
+#define S_ISUID  0x00800            /* 000 4000 */
+#define S_ISGID  0x00400            /* 000 2000 */
+#define S_ISVTX  0x00200            /* 000 1000 */
+
+#define S_ISREG(m)      (((m) & S_IFMT) == S_IFREG)
+#define S_ISSOCK(m)     (((m) & S_IFMT) == S_IFSOCK)
+#define S_ISLNK(m)      (((m) & S_IFMT) == S_IFLNK)
+#define S_ISFIL(m)      (((m) & S_IFMT) == S_IFFIL)
+#define S_ISBLK(m)      (((m) & S_IFMT) == S_IFBLK)
+#define S_ISDIR(m)      (((m) & S_IFMT) == S_IFDIR)
+#define S_ISCHR(m)      (((m) & S_IFMT) == S_IFCHR)
+#define S_ISFIFO(m)     (((m) & S_IFMT) == S_IFIFO)
+
+#define S_IPERMISSION_MASK 0x1FF /*  */
+
+#define S_IRWXU  0x1C0              /* 0 0700 */
+#define S_IRUSR  0x100              /* 0 0400 */
+#define S_IWUSR  0x080              /* 0 0200 */
+#define S_IXUSR  0x040              /* 0 0100 */
+
+#define S_IRWXG  0x038              /* 0 0070 */
+#define S_IRGRP  0x020              /* 0 0040 */
+#define S_IWGRP  0x010              /* 0 0020 */
+#define S_IXGRP  0x008              /* 0 0010 */
+
+#define S_IRWXO  0x007              /* 0 0007 */
+#define S_IROTH  0x004              /* 0 0004 */
+#define S_IWOTH  0x002              /* 0 0002 */
+#define S_IXOTH  0x001              /* 0 0001 */
+
+#define S_IRWXUGO   (S_IRWXU|S_IRWXG|S_IRWXO)
+#define S_IALLUGO   (S_ISUID|S_ISGID|S_ISVTX|S_IRWXUGO)
+#define S_IRUGO     (S_IRUSR|S_IRGRP|S_IROTH)
+#define S_IWUGO     (S_IWUSR|S_IWGRP|S_IWOTH)
+#define S_IXUGO     (S_IXUSR|S_IXGRP|S_IXOTH)
+
+/*
+ *  linux ioctl coding definitions
+ */
+#define _IOC_NRBITS 8
+#define _IOC_TYPEBITS   8
+#define _IOC_SIZEBITS   14
+#define _IOC_DIRBITS    2
+
+#define _IOC_NRMASK ((1 << _IOC_NRBITS)-1)
+#define _IOC_TYPEMASK   ((1 << _IOC_TYPEBITS)-1)
+#define _IOC_SIZEMASK   ((1 << _IOC_SIZEBITS)-1)
+#define _IOC_DIRMASK    ((1 << _IOC_DIRBITS)-1)
+
+#define _IOC_NRSHIFT    0
+#define _IOC_TYPESHIFT  (_IOC_NRSHIFT+_IOC_NRBITS)
+#define _IOC_SIZESHIFT  (_IOC_TYPESHIFT+_IOC_TYPEBITS)
+#define _IOC_DIRSHIFT   (_IOC_SIZESHIFT+_IOC_SIZEBITS)
+
+/*
+ * Direction bits.
+ */
+#define _IOC_NONE   0U
+#define _IOC_WRITE  1U
+#define _IOC_READ   2U
+
+#define _IOC(dir,type,nr,size) \
+    (((dir)  << _IOC_DIRSHIFT) | \
+     ((type) << _IOC_TYPESHIFT) | \
+     ((nr)   << _IOC_NRSHIFT) | \
+     ((size) << _IOC_SIZESHIFT))
+
+/* used to create numbers */
+#define _IO(type,nr)      _IOC(_IOC_NONE,(type),(nr),0)
+#define _IOR(type,nr,size)    _IOC(_IOC_READ,(type),(nr),sizeof(size))
+#define _IOW(type,nr,size)    _IOC(_IOC_WRITE,(type),(nr),sizeof(size))
+#define _IOWR(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size))
+
+/* used to decode ioctl numbers.. */
+#define _IOC_DIR(nr)        (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK)
+#define _IOC_TYPE(nr)       (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK)
+#define _IOC_NR(nr)         (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK)
+#define _IOC_SIZE(nr)       (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK)
+
+/*
+ * Io vector ...  
+ */
+
+/* POSIX-style scatter/gather I/O vector (WinNT has no native iovec). */
+struct iovec
+{
+    void *iov_base;     /* base address of the buffer */
+    size_t iov_len;     /* length of the buffer in bytes */
+};
+
+
+#define ULONG_LONG_MAX ((__u64)(0xFFFFFFFFFFFFFFFF))
+/*
+ * Convert a string to an unsigned long long integer.
+ *
+ * Ignores `locale' stuff.  Assumes that the upper and lower case
+ * alphabets and digits are each contiguous.
+ */
+/*
+ * Convert a string to an unsigned 64-bit integer (strtoull work-alike).
+ *
+ * Accepts optional leading whitespace, an optional '+'/'-' sign (a '-'
+ * negates the unsigned result, as the C library does), and an optional
+ * "0x"/"0X" prefix when base is 0 or 16.  base == 0 selects octal for a
+ * leading '0' and decimal otherwise.  On overflow the result saturates
+ * to ULONG_LONG_MAX (errno is not set in this port).  If endptr is
+ * non-NULL it receives a pointer to the first unconsumed character.
+ *
+ * Ignores `locale' stuff.  Assumes that the upper and lower case
+ * alphabets and digits are each contiguous.
+ */
+static inline __u64
+strtoull(
+       char *nptr,
+       char **endptr,
+       int base)
+{
+       char *s = nptr;
+       __u64 acc, cutoff;
+       int c, neg = 0, any, cutlim;
+
+       /*
+        * See strtol for comments as to the logic used.  Bytes are read
+        * through an (unsigned char) cast so that characters >= 0x80 do
+        * not reach the <ctype.h> classifiers as negative ints, which
+        * is undefined behaviour (CERT STR37-C).
+        */
+       do {
+               c = (unsigned char)*s++;
+       } while (isspace(c));
+       if (c == '-') {
+               neg = 1;
+               c = (unsigned char)*s++;
+       } else if (c == '+')
+               c = (unsigned char)*s++;
+       if ((base == 0 || base == 16) &&
+           c == '0' && (*s == 'x' || *s == 'X')) {
+               c = (unsigned char)s[1];
+               s += 2;
+               base = 16;
+       }
+       if (base == 0)
+               base = c == '0' ? 8 : 10;
+       /* largest value that can absorb one more digit without overflow */
+       cutoff = (__u64)ULONG_LONG_MAX / (__u64)base;
+       cutlim = (int)((__u64)ULONG_LONG_MAX % (__u64)base);
+       for (acc = 0, any = 0;; c = (unsigned char)*s++) {
+               if (isdigit(c))
+                       c -= '0';
+               else if (isalpha(c))
+                       c -= isupper(c) ? 'A' - 10 : 'a' - 10;
+               else
+                       break;
+               if (c >= base)
+                       break;
+               if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim))
+                       any = -1;       /* overflow: keep scanning, saturate */
+               else {
+                       any = 1;
+                       acc *= base;
+                       acc += c;
+               }
+       }
+       if (any < 0) {
+               acc = ULONG_LONG_MAX;
+       } else if (neg)
+               acc = 0 - acc;          /* unsigned negation, as libc does */
+       if (endptr != 0)
+               *endptr = (char *) (any ? s - 1 : nptr);
+       return (acc);
+}
+
+#endif
diff --git a/lnet/include/libcfs/winnt/winnt-tcpip.h b/lnet/include/libcfs/winnt/winnt-tcpip.h
new file mode 100644 (file)
index 0000000..a988247
--- /dev/null
@@ -0,0 +1,660 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or modify it under the
+ * terms of version 2 of the GNU General Public License as published by the
+ * Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Lustre; if not, write to the Free Software Foundation, Inc., 675 Mass
+ * Ave, Cambridge, MA 02139, USA.
+ *
+ * Implementation of portable time API for Winnt (kernel and user-level).
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_TCPIP_H__
+#define __LIBCFS_WINNT_TCPIP_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <libcfs/libcfs.h> instead
+#endif
+
+
+#ifdef __KERNEL__
+
+//
+//  ks definitions
+//
+
+// iovec is defined in libcfs: winnt_prim.h 
+// lnetkiov_t is defined in lnet/types.h
+
+typedef struct socket ksock_tconn_t;
+typedef struct socket cfs_socket_t;
+
+// completion notification callback routine
+
+typedef VOID (*ksock_schedule_cb)(struct socket*, int, void *, ulong_ptr);
+
+/* completion routine to update tx structure for async sending */
+typedef PVOID (*ksock_update_tx)(struct socket*, PVOID tx, ulong_ptr);
+
+//
+// tdinal definitions
+//
+
+
+#if TDI_LIBCFS_DBG
+#define KsPrint(X)     KsPrintf X
+#else
+#define KsPrint(X)
+#endif
+
+
+//
+// Socket Addresses Related ...
+//
+
+#define            INADDR_ANY              (ULONG)0x00000000
+#define     INADDR_LOOPBACK     (ULONG)0x7f000001
+#define            INADDR_BROADCAST    (ULONG)0xffffffff
+#define            INADDR_NONE             (ULONG)0xffffffff
+
+/*
+ *  TCP / IP options
+ */
+
+#define     SOL_TCP             6
+#define     SOL_UDP                    17
+
+
+#define TL_INSTANCE             0
+
+#define TCP_SOCKET_NODELAY      1 //  disabling "Nagle"
+#define TCP_SOCKET_KEEPALIVE    2
+#define TCP_SOCKET_OOBINLINE    3
+#define TCP_SOCKET_BSDURGENT    4
+#define TCP_SOCKET_ATMARK       5
+#define TCP_SOCKET_WINDOW       6
+
+
+/* Flags we can use with send/ and recv. 
+   Added those for 1003.1g not all are supported yet
+ */
+#define MSG_OOB            1
+#define MSG_PEEK        2
+#define MSG_DONTROUTE   4
+#define MSG_TRYHARD     4       /* Synonym for MSG_DONTROUTE for DECnet */
+#define MSG_CTRUNC      8
+#define MSG_PROBE       0x10   /* Do not send. Only probe path f.e. for MTU */
+#define MSG_TRUNC       0x20
+#define MSG_DONTWAIT    0x40   /* Nonblocking io                */
+#define MSG_EOR         0x80   /* End of record */
+#define MSG_WAITALL     0x100  /* Wait for a full request */
+#define MSG_FIN         0x200
+#define MSG_SYN                0x400
+#define MSG_CONFIRM     0x800  /* Confirm path validity */
+#define MSG_RST         0x1000
+#define MSG_ERRQUEUE    0x2000 /* Fetch message from error queue */
+#define MSG_NOSIGNAL    0x4000 /* Do not generate SIGPIPE */
+#define MSG_MORE        0x8000 /* Sender will send more */
+
+#define MSG_EOF         MSG_FIN
+
+
+//
+// Maximum TRANSPORT_ADDRESS Length
+//
+// it must >= FIELD_OFFSET(TRANSPORT_ADDRESS, Address->Address)
+//            + TDI_ADDRESS_LENGTH_IP
+//
+// I define it a little large and 16 bytes aligned to avoid possible overflow.
+//
+
+#define MAX_ADDRESS_LENGTH              (0x30)
+
+
+//
+// Maximum number of child sockets per listener
+//
+
+#define MAX_CHILD_LISTENERS             (4)
+
+//
+// Maximum EA Information Length
+//
+
+#define EA_MAX_LENGTH                   ( sizeof(FILE_FULL_EA_INFORMATION) - 1 + \
+                                          TDI_TRANSPORT_ADDRESS_LENGTH + 1 + \
+                                          MAX_ADDRESS_LENGTH )
+
+
+#define UDP_DEVICE_NAME L"\\Device\\Udp"
+#define TCP_DEVICE_NAME L"\\Device\\Tcp"
+
+
+/*
+ * TSDU definitions
+ */
+
+#define TDINAL_TSDU_DEFAULT_SIZE  (0x10000)
+
+#define KS_TSDU_MAGIC       'KSTD'
+
+#define KS_TSDU_ATTACHED    0x00000001  // Attached to the socket receive tsdu list
+
+/*
+ * Header of one TSDU buffer; the actual payload units (KS_TSDU_DAT /
+ * KS_TSDU_BUF / KS_TSDU_MDL records) follow it in the same allocation,
+ * between StartOffset and LastOffset.
+ */
+typedef struct _KS_TSDU {
+
+    ULONG               Magic;          // KS_TSDU_MAGIC
+    ULONG               Flags;          // e.g. KS_TSDU_ATTACHED
+
+    struct list_head    Link;           // linkage in KS_TSDUMGR::TsduList
+
+    ULONG               TotalLength;    // Total size of KS_TSDU
+
+    ULONG               StartOffset;    // Start offset of the first Tsdu unit
+    ULONG               LastOffset;     // End offset of the last Tsdu unit
+
+/*
+    union {
+        KS_TSDU_DAT[];
+        KS_TSDU_BUF[];
+        KS_TSDU_MDL[];
+    }
+*/
+
+} KS_TSDU, *PKS_TSDU;
+
+#define TSDU_TYPE_BUF   ((USHORT)0x5401)
+#define TSDU_TYPE_DAT   ((USHORT)0x5402)
+#define TSDU_TYPE_MDL   ((USHORT)0x5403)
+
+#define KS_TSDU_BUF_RECEIVING       0x0001
+typedef struct _KS_TSDU_BUF {
+
+    USHORT              TsduType;
+    USHORT              TsduFlags;
+
+    ULONG               DataLength;
+    ULONG               StartOffset;
+
+    PVOID               UserBuffer;
+
+} KS_TSDU_BUF, *PKS_TSDU_BUF;
+
+#define KS_TSDU_DAT_RECEIVING       0x0001
+
+typedef struct _KS_TSDU_DAT {
+
+    USHORT              TsduType;
+    USHORT              TsduFlags;
+
+    ULONG               DataLength;
+    ULONG               StartOffset;
+
+    ULONG               TotalLength;
+
+    UCHAR               Data[1];
+
+} KS_TSDU_DAT, *PKS_TSDU_DAT;
+
+#define KS_DWORD_ALIGN(x)      (((x) + 0x03) & (~(0x03)))
+#define KS_TSDU_STRU_SIZE(Len) (KS_DWORD_ALIGN((Len) + FIELD_OFFSET(KS_TSDU_DAT, Data)))
+
+typedef struct _KS_TSDU_MDL {
+
+    USHORT              TsduType;
+    USHORT              TsduFlags;
+
+    ULONG               DataLength;
+    ULONG               StartOffset;    
+
+    PMDL                Mdl;
+    PVOID               Descriptor;
+
+} KS_TSDU_MDL, *PKS_TSDU_MDL;
+
+
+typedef struct _KS_TSDUMGR {
+
+    struct list_head    TsduList;
+    ULONG               NumOfTsdu;
+    ULONG               TotalBytes;
+    KEVENT              Event;
+
+} KS_TSDUMGR, *PKS_TSDUMGR;
+
+
+typedef struct _KS_CHAIN {
+
+    KS_TSDUMGR          Normal;
+    KS_TSDUMGR          Expedited;
+
+} KS_CHAIN, *PKS_CHAIN;
+
+
+#define TDINAL_SCHED_FACTOR (1)
+#define CAN_BE_SCHED(Len, Limit) (Len >= ((Limit) >> TDINAL_SCHED_FACTOR))
+
+//
+// Handler settings indicator
+//
+
+#define TDI_EVENT_MAXIMUM_HANDLER (TDI_EVENT_ERROR_EX + 1)
+
+
+typedef struct _KS_EVENT_HANDLERS {
+    BOOLEAN     IsActive[TDI_EVENT_MAXIMUM_HANDLER];
+    PVOID       Handler [TDI_EVENT_MAXIMUM_HANDLER];
+} KS_EVENT_HANDLERS, *PKS_EVENT_HANDLERS;
+
+#define SetEventHandler(ha, ht, hr) do {        \
+            ha.IsActive[ht] = TRUE;             \
+            ha.Handler[ht] = (PVOID) (hr);      \
+        } while(0)
+
+//
+// KSock Internal Structures
+//
+
+typedef struct _KS_ADDRESS {
+
+    union {
+        TRANSPORT_ADDRESS   Tdi;
+        UCHAR               Pading[MAX_ADDRESS_LENGTH];
+    };
+
+    HANDLE                  Handle;
+    PFILE_OBJECT            FileObject;
+
+} KS_ADDRESS, *PKS_ADDRESS;
+
+//
+// Structures for Disconnect Workitem
+//
+
+/*
+ * Context for a queued disconnect request, executed from a system
+ * worker thread (see the KS_TCONN_DISCONNECT_BUSY flag).
+ */
+typedef struct _KS_DISCONNECT_WORKITEM {
+
+    WORK_QUEUE_ITEM         WorkItem;       // Workitem to perform disconnection
+    ksock_tconn_t *         tconn;          // tdi connection
+    ULONG                   Flags;          // connection broken/disconnection flags
+    KEVENT                  Event;          // sync event
+
+} KS_DISCONNECT_WORKITEM, *PKS_DISCONNECT_WORKITEM;
+
+
+typedef struct _KS_CONNECTION {
+
+    HANDLE                      Handle;     // Handle of the tdi connection
+    PFILE_OBJECT                FileObject; // FileObject if the conn object
+
+    PTRANSPORT_ADDRESS          Remote;     // the ConnectionInfo of this connection
+    PTDI_CONNECTION_INFORMATION ConnectionInfo;
+
+    ULONG                       nagle;      // Tcp options 
+
+} KS_CONNECTION, *PKS_CONNECTION;
+
+
+//
+// type definitions
+//
+
+typedef MDL                         ksock_mdl_t;
+typedef UNICODE_STRING              ksock_unicode_name_t;
+typedef WORK_QUEUE_ITEM             ksock_workitem_t;
+
+
+typedef KS_CHAIN                    ksock_chain_t;
+typedef KS_ADDRESS                  ksock_tdi_addr_t;
+typedef KS_CONNECTION               ksock_tconn_info_t;
+typedef KS_DISCONNECT_WORKITEM      ksock_disconnect_workitem_t;
+
+
+//
+// Structures for transmission done Workitem
+//
+
+typedef struct _KS_TCPX_FINILIZE {
+    ksock_workitem_t        item;
+    void *                  tx;
+} ksock_tcpx_fini_t;
+
+
+typedef struct ksock_backlogs {
+
+        struct list_head    list;   /* list to link the backlog connections */
+        int                 num;    /* number of backlogs in the list */
+
+} ksock_backlogs_t;
+
+
+/* Per-port listening daemon: one listener tconn plus its backlog conns. */
+typedef struct ksock_daemon {
+
+    ksock_tconn_t *         tconn;         /* the listener connection object */
+    unsigned short          nbacklogs;     /* number of listening backlog conns */
+    unsigned short          port;          /* listening port number */ 
+    int                     shutdown;      /* set when the daemon thread is to exit */
+    struct list_head        list;          /* to be attached into ksock_nal_data_t*/
+
+} ksock_daemon_t ;
+
+
+typedef enum {
+
+    kstt_sender = 0,    // normal sending connection type: it is an active connection,
+                        // while a child tconn is for a passive connection.
+
+    kstt_listener,      // listener daemon type: it just acts as a daemon and does
+                        // not have a real connection. It manages child tconns to accept
+                        // or refuse connecting requests from remote peers.
+
+    kstt_child,         // accepted child connection type; its parent must be a listener
+    kstt_lasttype       // total number of tconn types (not a valid type itself)
+} ksock_tconn_type;
+
+typedef enum {
+
+    ksts_uninited = 0, // tconn is just allocated (zero values), not initialized yet
+
+    ksts_inited,        // tconn structure initialized: so it now can be identified as
+                        // a sender, listener or a child
+
+    ksts_bind,          // tconn is bound: the local address object (ip/port) is created.
+                        // after being bound, we must call ksocknal_put_tconn to release
+                        // the tconn objects; it is not safe just to free the memory of tconn.
+
+    ksts_associated,    // the connection object is created and associated with the address
+                        // object, so it is ready for connection. only for child and sender.
+
+    ksts_connecting,    // only used by child tconn: in the ConnectEvent handler routine,
+                        // it indicates the child tconn is busy being connected to the peer.
+
+    ksts_connected,     // the connection is built already: for sender and child
+
+    ksts_listening,     // listener daemon is working, only for listener tconn
+
+    ksts_disconnected,  // disconnected by user
+    ksts_aborted,       // unexpected broken status
+
+    ksts_last           // total number of tconn statuses
+} ksock_tconn_state;
+
+#define KS_TCONN_MAGIC              'KSTM'
+
+#define KS_TCONN_HANDLERS_SET       0x00000001  // Conection handlers are set.
+#define KS_TCONN_DISCONNECT_BUSY    0x00010000  // Disconnect Workitem is queued ...
+#define KS_TCONN_DESTROY_BUSY       0x00020000  // Destory Workitem is queued ...
+
+#define KS_TCONN_DAEMON_STARTED     0x00100000  // indict the daemon is started,
+                                                // only valid for listener
+
+/*
+ * TDI connection object (aliased as ksock_tconn_t / cfs_socket_t above).
+ * The union member in use is selected by kstc_type: listener, child or
+ * sender (see ksock_tconn_type).
+ */
+struct socket {
+
+        ulong_ptr                   kstc_magic;      /* Magic & Flags */
+        ulong_ptr                   kstc_flags;
+
+        spinlock_t                  kstc_lock;       /* serialise lock */
+        void *                      kstc_conn;       /* ksock_conn_t */
+
+        ksock_tconn_type            kstc_type;          /* tdi connection type */
+        ksock_tconn_state           kstc_state;      /* tdi connection state flag */
+
+        ksock_unicode_name_t        kstc_dev;        /* tcp transport device name */
+
+        ksock_tdi_addr_t            kstc_addr;       /* local address handlers / Objects */
+
+        atomic_t                    kstc_refcount;   /* reference count of ksock_tconn */
+
+        struct list_head            kstc_list;       /* linked to global ksocknal_data */
+
+        union {
+
+            struct {
+                int                 nbacklog;         /* total number of backlog tdi connections */
+                ksock_backlogs_t    kstc_listening;   /* listening backlog child connections */
+                ksock_backlogs_t    kstc_accepted;    /* connected backlog child connections */
+                event_t             kstc_accept_event;   /* Signaled by the accepted handler, 
+                                                            ksocknal_wait_accpeted_conns waits on */
+                event_t             kstc_destroy_event;  /* Signaled when accepted child is released */
+            } listener; 
+
+            struct  {
+                ksock_tconn_info_t  kstc_info;      /* Connection Info if Connected */
+                ksock_chain_t       kstc_recv;      /* tsdu engine for data receiving */
+                ksock_chain_t       kstc_send;      /* tsdu engine for data sending */
+
+                int                 kstc_queued;    /* Attached to Parent->ChildList ... */
+                int                 kstc_queueno;   /* 0: Attached to Listening list 
+                                                       1: Attached to Accepted list */
+
+                int                 kstc_busy;      /* referred by ConnectEventCallback ? */
+                int                 kstc_accepted;  /* the connection is built ready ? */
+
+                struct list_head    kstc_link;      /* linked to parent tdi connection */
+                ksock_tconn_t   *   kstc_parent;    /* pointer to its listener parent */
+            } child;
+
+            struct {
+                ksock_tconn_info_t  kstc_info;      /* Connection Info if Connected */
+                ksock_chain_t       kstc_recv;      /* tsdu engine for data receiving */
+                ksock_chain_t       kstc_send;      /* tsdu engine for data sending */
+            } sender; 
+        };
+
+        ulong_ptr                   kstc_snd_wnd;   /* Sending window size */
+        ulong_ptr                   kstc_rcv_wnd;   /* Receiving window size */
+
+        ksock_workitem_t            kstc_destroy;    /* tconn destruction workitem */
+        ksock_disconnect_workitem_t kstc_disconnect; /* connection disconnect workitem */
+
+        ksock_schedule_cb           kstc_sched_cb;   /* notification callback routine of completion */
+        ksock_update_tx             kstc_update_tx;  /* async sending callback to update tx */
+};
+
+#define SOCK_WMEM_QUEUED(sock) (0)
+
+#define TDINAL_WINDOW_DEFAULT_SIZE  (0x100000)
+
+
+struct _KS_UDP_COMPLETION_CONTEXT;
+struct _KS_TCP_COMPLETION_CONTEXT;
+
+
+typedef
+NTSTATUS
+(*PKS_UDP_COMPLETION_ROUTINE) (
+    IN PIRP     Irp,
+    IN struct _KS_UDP_COMPLETION_CONTEXT
+                *UdpContext
+    );
+
+
+typedef
+NTSTATUS
+(*PKS_TCP_COMPLETION_ROUTINE) (
+    IN PIRP     Irp,
+    IN struct _KS_TCP_COMPLETION_CONTEXT
+                *TcpContext
+    );
+
+//
+// Udp Irp Completion Context
+//
+
+typedef struct _KS_UDP_COMPLETION_CONTEXT {
+
+    PKEVENT                             Event;
+    union {
+        PFILE_OBJECT                    AddressObject;
+        ksock_tconn_t *                 tconn;
+    };
+
+    PKS_UDP_COMPLETION_ROUTINE          CompletionRoutine;
+    PVOID                               CompletionContext;
+
+} KS_UDP_COMPLETION_CONTEXT, *PKS_UDP_COMPLETION_CONTEXT;
+
+
+//
+// Tcp Irp Completion Context (used by tcp data recv/send)
+//
+
+typedef struct _KS_TCP_COMPLETION_CONTEXT {
+
+    PKEVENT                             Event;      // Event to be waited on by Irp caller ...
+
+    ksock_tconn_t *                     tconn;      // the tdi connection
+
+    PKS_TCP_COMPLETION_ROUTINE          CompletionRoutine;  // called when the Irp completes
+    PVOID                               CompletionContext;
+    PVOID                               CompletionContext2;
+
+    PKS_TSDUMGR                         KsTsduMgr;  // Tsdu buffer manager
+
+    //
+    // These two members are for NON_BLOCKING transmission
+    //
+
+    BOOLEAN                                                        bCounted;    // To indicate needing refcount to
+                                                     // execute CompletionRoutine
+    ULONG                               ReferCount;  // Reference count of this structure
+
+} KS_TCP_COMPLETION_CONTEXT, *PKS_TCP_COMPLETION_CONTEXT;
+
+typedef KS_TCP_COMPLETION_CONTEXT  ksock_tdi_tx_t, ksock_tdi_rx_t;
+
+
+/*
+ * tdi extensions
+ */
+
+#define IOCTL_TCP_QUERY_INFORMATION_EX        \
+                        CTL_CODE(FILE_DEVICE_NETWORK, 0, METHOD_NEITHER, FILE_ANY_ACCESS)
+#define IOCTL_TCP_SET_INFORMATION_EX        \
+                        CTL_CODE(FILE_DEVICE_NETWORK, 1, METHOD_BUFFERED, FILE_WRITE_ACCESS)
+
+
+#define TcpBuildSetInformationEx(Irp, DevObj, FileObj, CompRoutine, Contxt, Buffer, BufferLen)\
+    {                                                                        \
+        PIO_STACK_LOCATION _IRPSP;                                           \
+        if ( CompRoutine != NULL) {                                          \
+            IoSetCompletionRoutine( Irp, CompRoutine, Contxt, TRUE, TRUE, TRUE);\
+        } else {                                                             \
+            IoSetCompletionRoutine( Irp, NULL, NULL, FALSE, FALSE, FALSE);   \
+        }                                                                    \
+        _IRPSP = IoGetNextIrpStackLocation (Irp);                            \
+        _IRPSP->MajorFunction = IRP_MJ_DEVICE_CONTROL;                       \
+        _IRPSP->DeviceObject = DevObj;                                       \
+        _IRPSP->FileObject = FileObj;                                        \
+        _IRPSP->Parameters.DeviceIoControl.OutputBufferLength = 0;           \
+        _IRPSP->Parameters.DeviceIoControl.InputBufferLength = BufferLen;    \
+        _IRPSP->Parameters.DeviceIoControl.IoControlCode = IOCTL_TCP_SET_INFORMATION_EX;  \
+        Irp->AssociatedIrp.SystemBuffer = Buffer;                            \
+    }
+
+
+#define TcpBuildQueryInformationEx(Irp, DevObj, FileObj, CompRoutine, Contxt, InBuffer, InLength, OutBuffer, OutLength)\
+    {                                                                        \
+        PIO_STACK_LOCATION _IRPSP;                                           \
+        if ( CompRoutine != NULL) {                                          \
+            IoSetCompletionRoutine( Irp, CompRoutine, Contxt, TRUE, TRUE, TRUE);\
+        } else {                                                             \
+            IoSetCompletionRoutine( Irp, NULL, NULL, FALSE, FALSE, FALSE);   \
+        }                                                                    \
+        _IRPSP = IoGetNextIrpStackLocation (Irp);                            \
+        _IRPSP->MajorFunction = IRP_MJ_DEVICE_CONTROL;                       \
+        _IRPSP->DeviceObject = DevObj;                                       \
+        _IRPSP->FileObject = FileObj;                                        \
+        _IRPSP->Parameters.DeviceIoControl.OutputBufferLength = OutLength;           \
+        _IRPSP->Parameters.DeviceIoControl.InputBufferLength = InLength;    \
+        _IRPSP->Parameters.DeviceIoControl.IoControlCode = IOCTL_TCP_QUERY_INFORMATION_EX;  \
+        _IRPSP->Parameters.DeviceIoControl.Type3InputBuffer = InBuffer;    \
+        Irp->UserBuffer = OutBuffer;                            \
+    }
+
+
+/* One locally-configured IP interface tracked by the tdi client. */
+typedef struct ks_addr_slot {
+    LIST_ENTRY      link;       /* linkage (presumably ks_data_t::ksnd_addrs_list) */
+    int             up;         /* non-zero when the interface is up -- TODO confirm */
+    char            iface[40];  /* interface name */
+    __u32           ip_addr;    /* ip address -- NOTE(review): byte order unverified */
+    __u32           netmask;    /* network mask */
+    UNICODE_STRING  devname;    /* device name, backed by buffer[] below */
+    WCHAR           buffer[1];  /* variable-length storage for devname */
+} ks_addr_slot_t;
+
+/* Global state of the WinNT tdi socket layer. */
+typedef struct {
+
+    /*
+     * Tdi client information
+     */
+
+    UNICODE_STRING    ksnd_client_name; /* tdi client module name */
+    HANDLE            ksnd_pnp_handle;  /* the handle for pnp changes */
+
+    spinlock_t        ksnd_addrs_lock;  /* serialize ip address list access */
+    LIST_ENTRY        ksnd_addrs_list;  /* list of the ip addresses */
+    int               ksnd_naddrs;      /* number of the ip addresses */
+
+    /*
+     *  Tdilnd internal definitions
+     */
+
+    int               ksnd_init;            /* initialisation state */
+
+    TDI_PROVIDER_INFO ksnd_provider;    /* tdi tcp/ip provider's information */
+
+    spinlock_t        ksnd_tconn_lock;      /* tdi connections access serialise */
+
+    int               ksnd_ntconns;         /* number of tconns attached in list */
+    struct list_head  ksnd_tconns;          /* tdi connections list */
+    cfs_mem_cache_t * ksnd_tconn_slab;      /* slabs for ksock_tconn_t allocations */
+    event_t           ksnd_tconn_exit;      /* exit event to be signaled by the last tconn */
+
+    spinlock_t        ksnd_tsdu_lock;       /* tsdu access serialise */
+        
+    int               ksnd_ntsdus;          /* number of tsdu buffers allocated */
+    ulong_ptr     ksnd_tsdu_size;       /* the size of a single tsdu buffer */
+    cfs_mem_cache_t * ksnd_tsdu_slab;       /* slab cache for tsdu buffer allocation */
+
+    int               ksnd_nfreetsdus;      /* number of tsdu buffers in the freed list */
+    struct list_head  ksnd_freetsdus;          /* List of the freed Tsdu buffer. */
+
+    spinlock_t        ksnd_daemon_lock;     /* stabilize daemon ops */
+    int               ksnd_ndaemons;        /* number of listening daemons */
+    struct list_head  ksnd_daemons;         /* listening daemon list */
+    event_t           ksnd_daemon_exit;     /* the last daemon quitting should signal it */
+
+} ks_data_t;
+
+int
+ks_init_tdi_data();
+
+void
+ks_fini_tdi_data();
+
+
+#endif /* __KERNEL__ */
+#endif /* __LIBCFS_WINNT_TCPIP_H__ */
+
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
diff --git a/lnet/include/libcfs/winnt/winnt-time.h b/lnet/include/libcfs/winnt/winnt-time.h
new file mode 100644 (file)
index 0000000..d31f854
--- /dev/null
@@ -0,0 +1,315 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or modify it under the
+ * terms of version 2 of the GNU General Public License as published by the
+ * Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+ * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with Lustre; if not, write to the Free Software Foundation, Inc., 675 Mass
+ * Ave, Cambridge, MA 02139, USA.
+ *
+ * Implementation of portable time API for Winnt (kernel and user-level).
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_LINUX_TIME_H__
+#define __LIBCFS_WINNT_LINUX_TIME_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <libcfs/libcfs.h> instead
+#endif
+
+/* Portable time API */
+
+/*
+ * Platform provides three opaque data-types:
+ *
+ *  cfs_time_t        represents point in time. This is internal kernel
+ *                    time rather than "wall clock". This time bears no
+ *                    relation to gettimeofday().
+ *
+ *  cfs_duration_t    represents time interval with resolution of internal
+ *                    platform clock
+ *
+ *  cfs_fs_time_t     represents instance in world-visible time. This is
+ *                    used in file-system time-stamps
+ *
+ *  cfs_time_t     cfs_time_current(void);
+ *  cfs_time_t     cfs_time_add    (cfs_time_t, cfs_duration_t);
+ *  cfs_duration_t cfs_time_sub    (cfs_time_t, cfs_time_t);
+ *  int            cfs_time_before (cfs_time_t, cfs_time_t);
+ *  int            cfs_time_beforeq(cfs_time_t, cfs_time_t);
+ *
+ *  cfs_duration_t cfs_duration_build(int64_t);
+ *
+ *  time_t         cfs_duration_sec (cfs_duration_t);
+ *  void           cfs_duration_usec(cfs_duration_t, struct timeval *);
+ *  void           cfs_duration_nsec(cfs_duration_t, struct timespec *);
+ *
+ *  void           cfs_fs_time_current(cfs_fs_time_t *);
+ *  time_t         cfs_fs_time_sec    (cfs_fs_time_t *);
+ *  void           cfs_fs_time_usec   (cfs_fs_time_t *, struct timeval *);
+ *  void           cfs_fs_time_nsec   (cfs_fs_time_t *, struct timespec *);
+ *  int            cfs_fs_time_before (cfs_fs_time_t *, cfs_fs_time_t *);
+ *  int            cfs_fs_time_beforeq(cfs_fs_time_t *, cfs_fs_time_t *);
+ *
+ *  CFS_TIME_FORMAT
+ *  CFS_DURATION_FORMAT
+ *
+ */
+
+#define ONE_BILLION ((u_int64_t)1000000000)
+#define ONE_MILLION ((u_int64_t)   1000000)
+
+#define HZ (100)
+
+struct timeval {
+       time_t          tv_sec;         /* seconds */
+       suseconds_t     tv_usec;        /* microseconds */
+};
+
+struct timespec {
+    ulong_ptr tv_sec;
+    ulong_ptr tv_nsec;
+};
+
+#ifdef __KERNEL__
+
+#include <libcfs/winnt/portals_compat25.h>
+
+/*
+ * Generic kernel stuff
+ */
+
+typedef struct timeval cfs_fs_time_t;
+
+typedef u_int64_t cfs_time_t;
+typedef int64_t cfs_duration_t;
+
+static inline void do_gettimeofday(struct timeval *tv)
+{
+    LARGE_INTEGER Time;
+
+    KeQuerySystemTime(&Time);
+
+    tv->tv_sec  = (long_ptr) (Time.QuadPart / 10000000);
+    tv->tv_usec = (long_ptr) (Time.QuadPart % 10000000) / 10;
+}
+
+static inline cfs_time_t JIFFIES()
+{
+    LARGE_INTEGER Tick;
+    LARGE_INTEGER Elapse;
+
+    KeQueryTickCount(&Tick);
+
+    Elapse.QuadPart  = Tick.QuadPart * KeQueryTimeIncrement();
+    Elapse.QuadPart /= (10000000 / HZ);
+
+    return Elapse.QuadPart;
+}
+
+static inline cfs_time_t cfs_time_current(void)
+{
+    return JIFFIES();
+}
+
+static inline cfs_time_t cfs_time_current_sec(void)
+{
+    return (JIFFIES() / HZ);
+}
+
+static inline cfs_time_t cfs_time_add(cfs_time_t t, cfs_duration_t d)
+{
+    return (t + d);
+}
+
+static inline cfs_duration_t cfs_time_sub(cfs_time_t t1, cfs_time_t t2)
+{
+    return (t1 - t2);
+}
+
+static inline int cfs_time_before(cfs_time_t t1, cfs_time_t t2)
+{
+    return ((int64_t)t1 - (int64_t)t2) < 0; 
+}
+
+static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2)
+{
+    return ((int64_t)t1 - (int64_t)t2) <= 0;
+}
+
+/* wall-clock time: seconds since 1970 plus microsecond remainder */
+static inline void cfs_fs_time_current(cfs_fs_time_t *t)
+{
+    ULONG         Linux;
+    LARGE_INTEGER Sys;
+
+    KeQuerySystemTime(&Sys);
+
+    RtlTimeToSecondsSince1970(&Sys, &Linux);
+
+    t->tv_sec  = Linux;
+    t->tv_usec = (suseconds_t)((Sys.QuadPart % 10000000) / 10); /* 64-bit remainder: 2^32 is not a multiple of 10^7, so LowPart %% 10^7 is wrong */
+}
+
+static inline cfs_time_t cfs_fs_time_sec(cfs_fs_time_t *t)
+{
+    return t->tv_sec;
+}
+
+static inline u_int64_t __cfs_fs_time_flat(cfs_fs_time_t *t)
+{
+    return ((u_int64_t)t->tv_sec) * ONE_MILLION + t->tv_usec;
+}
+
+static inline int cfs_fs_time_before(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
+{
+    return (__cfs_fs_time_flat(t1) < __cfs_fs_time_flat(t2));
+}
+
+static inline int cfs_fs_time_beforeq(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
+{
+    return (__cfs_fs_time_flat(t1) <= __cfs_fs_time_flat(t2));
+}
+
+static inline cfs_duration_t cfs_time_seconds(int seconds)
+{
+    return (cfs_duration_t)seconds * HZ;
+}
+
+static inline cfs_time_t cfs_duration_sec(cfs_duration_t d)
+{
+        return d / HZ;
+}
+
+static inline void cfs_duration_usec(cfs_duration_t d, struct timeval *s)
+{
+        s->tv_sec = (time_t) (d / HZ);                  /* cast matches field type (was suseconds_t) */
+        s->tv_usec = (suseconds_t)((d - (cfs_duration_t)s->tv_sec * HZ) *
+                              ONE_MILLION / HZ);        /* cast matches field type (was time_t) */
+}
+
+static inline void cfs_duration_nsec(cfs_duration_t d, struct timespec *s)
+{
+        s->tv_sec = (ulong_ptr) (d / HZ);               /* timespec fields are ulong_ptr here, not suseconds_t */
+        s->tv_nsec = (ulong_ptr)((d - (cfs_duration_t)s->tv_sec * HZ) *
+                              ONE_BILLION / HZ);        /* was mis-cast to time_t */
+}
+
+static inline void cfs_fs_time_usec(cfs_fs_time_t *t, struct timeval *v)
+{
+        *v = *t;
+}
+
+static inline void cfs_fs_time_nsec(cfs_fs_time_t *t, struct timespec *s)
+{
+        s->tv_sec  = t->tv_sec;
+        s->tv_nsec = t->tv_usec * 1000;
+}
+
+#define cfs_time_current_64 cfs_time_current
+#define cfs_time_add_64     cfs_time_add
+#define cfs_time_shift_64   cfs_time_shift
+#define cfs_time_before_64  cfs_time_before
+
+/*
+ * One jiffy
+ */
+#define CFS_TICK                (1)
+
+#define LTIME_S(t)                     (t)
+
+#define CFS_TIME_T              "%I64u"
+#define CFS_DURATION_T          "%I64d"
+
+#else   /* !__KERNEL__ */
+
+/*
+ * Liblustre. time(2) based implementation.
+ */
+#include <libcfs/user-time.h>
+
+
+//
+// Time routines ...
+//
+
+NTSYSAPI
+CCHAR
+NTAPI
+NtQuerySystemTime(
+    OUT PLARGE_INTEGER  CurrentTime
+    );
+
+
+NTSYSAPI
+BOOLEAN
+NTAPI
+RtlTimeToSecondsSince1970(
+    IN PLARGE_INTEGER  Time,
+    OUT PULONG  ElapsedSeconds
+    );
+
+
+NTSYSAPI
+VOID
+NTAPI
+RtlSecondsSince1970ToTime(
+    IN ULONG  ElapsedSeconds,
+    OUT PLARGE_INTEGER  Time
+    );
+
+NTSYSAPI
+VOID
+NTAPI
+Sleep(
+  DWORD dwMilliseconds   // sleep time in milliseconds
+);
+
+
+static inline void sleep(int time)
+{
+    DWORD Time = 1000 * time;
+    Sleep(Time);
+}
+
+
+static inline void do_gettimeofday(struct timeval *tv)
+{
+    LARGE_INTEGER Time;
+
+    NtQuerySystemTime(&Time);
+
+    tv->tv_sec  = (long_ptr) (Time.QuadPart / 10000000);
+    tv->tv_usec = (long_ptr) (Time.QuadPart % 10000000) / 10;
+}
+
+static inline int gettimeofday(struct timeval *tv, void * tz)
+{
+    do_gettimeofday(tv);
+    return 0;
+}
+
+#endif /* __KERNEL__ */
+
+/* __LIBCFS_WINNT_LINUX_TIME_H__ */
+#endif
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
diff --git a/lnet/include/libcfs/winnt/winnt-types.h b/lnet/include/libcfs/winnt/winnt-types.h
new file mode 100644 (file)
index 0000000..6478730
--- /dev/null
@@ -0,0 +1,647 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic types definitions
+ *
+ */
+
+#ifndef _WINNT_TYPE_H
+#define _WINNT_TYPE_H
+
+#ifdef __KERNEL__
+
+#include <ntifs.h>
+#include <windef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+#include <tdi.h>
+#include <tdikrnl.h>
+#include <tdiinfo.h>
+
+#else
+
+#include <ntddk.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <time.h>
+#include <io.h>
+#include <string.h>
+#include <assert.h>
+
+#endif
+
+
+#define __LITTLE_ENDIAN
+
+#define inline     __inline
+#define __inline__ __inline
+
+typedef unsigned __int8     __u8;
+typedef signed   __int8     __s8;
+
+typedef signed   __int64    __s64;
+typedef unsigned __int64    __u64;
+
+typedef        signed   __int16        __s16;
+typedef        unsigned __int16        __u16;
+
+typedef        signed   __int32        __s32;
+typedef        unsigned __int32        __u32;
+
+typedef        signed   __int64        __s64;
+typedef        unsigned __int64        __u64;
+
+typedef unsigned long       ULONG;
+
+
+#if defined(_WIN64)
+    #define long_ptr        __int64
+    #define ulong_ptr       unsigned __int64
+    #define BITS_PER_LONG   (64)
+#else
+    #define long_ptr        long
+    #define ulong_ptr       unsigned long
+    #define BITS_PER_LONG   (32)
+
+#endif
+
+/* bsd */
+typedef unsigned char          u_char;
+typedef unsigned short         u_short;
+typedef unsigned int           u_int;
+typedef unsigned long          u_long;
+
+/* sysv */
+typedef unsigned char          unchar;
+typedef unsigned short         ushort;
+typedef unsigned int           uint;
+typedef unsigned long          ulong;
+
+#ifndef __BIT_TYPES_DEFINED__
+#define __BIT_TYPES_DEFINED__
+
+typedef                __u8            u_int8_t;
+typedef                __s8            int8_t;
+typedef                __u16           u_int16_t;
+typedef                __s16           int16_t;
+typedef                __u32           u_int32_t;
+typedef                __s32           int32_t;
+
+#endif /* !(__BIT_TYPES_DEFINED__) */
+
+typedef                __u8            uint8_t;
+typedef                __u16           uint16_t;
+typedef                __u32           uint32_t;
+
+typedef                __u64           uint64_t;
+typedef                __u64           u_int64_t;
+typedef                __s64           int64_t;
+
+typedef long            ssize_t;
+
+typedef __u32           suseconds_t;
+
+typedef __u32           pid_t, tid_t;
+
+typedef __u16           uid_t, gid_t;
+
+typedef __u16           mode_t;
+typedef __u16           umode_t;
+
+typedef ulong_ptr       sigset_t;
+
+typedef uint64_t        loff_t;
+typedef HANDLE          cfs_handle_t;
+typedef uint64_t        cycles_t;
+
+#ifndef INVALID_HANDLE_VALUE
+#define INVALID_HANDLE_VALUE ((HANDLE)-1)
+#endif
+
+
+#ifdef __KERNEL__ /* kernel */
+
+typedef __u32           off_t;
+typedef __u32           time_t;
+
+typedef unsigned short  kdev_t;
+
+#else  /* !__KERNEL__ */
+
+typedef int             BOOL;
+typedef __u8            BYTE;
+typedef __u16           WORD;
+typedef __u32           DWORD;
+
+#endif /* __KERNEL__ */
+
+/*
+ * Constants suffix
+ */
+
+#define ULL i64
+#define ull i64
+
+/*
+ * Winnt kernel has no capabilities.
+ */
+
+typedef __u32 cfs_kernel_cap_t;
+
+#define INT_MAX         ((int)(~0U>>1))
+#define INT_MIN         (-INT_MAX - 1)
+#define UINT_MAX        (~0U)
+
+#endif /* _WINNT_TYPE_H */
+
+
+/*
+ *  Byte order
+ */
+
+//
+// Byte order swapping routines
+//
+
+
+#define ___swab16(x) RtlUshortByteSwap(x)
+#define ___swab32(x) RtlUlongByteSwap(x)
+#define ___swab64(x) RtlUlonglongByteSwap(x)
+
+#define ___constant_swab16(x) \
+       ((__u16)( \
+               (((__u16)(x) & (__u16)0x00ffU) << 8) | \
+               (((__u16)(x) & (__u16)0xff00U) >> 8) ))
+
+#define ___constant_swab32(x) \
+       ((__u32)( \
+               (((__u32)(x) & (__u32)0x000000ffUL) << 24) | \
+               (((__u32)(x) & (__u32)0x0000ff00UL) <<  8) | \
+               (((__u32)(x) & (__u32)0x00ff0000UL) >>  8) | \
+               (((__u32)(x) & (__u32)0xff000000UL) >> 24) ))
+
+#define ___constant_swab64(x) \
+       ((__u64)( \
+               (__u64)(((__u64)(x) & (__u64)0x00000000000000ffUL) << 56) | \
+               (__u64)(((__u64)(x) & (__u64)0x000000000000ff00UL) << 40) | \
+               (__u64)(((__u64)(x) & (__u64)0x0000000000ff0000UL) << 24) | \
+               (__u64)(((__u64)(x) & (__u64)0x00000000ff000000UL) <<  8) | \
+               (__u64)(((__u64)(x) & (__u64)0x000000ff00000000UL) >>  8) | \
+               (__u64)(((__u64)(x) & (__u64)0x0000ff0000000000UL) >> 24) | \
+               (__u64)(((__u64)(x) & (__u64)0x00ff000000000000UL) >> 40) | \
+               (__u64)(((__u64)(x) & (__u64)0xff00000000000000UL) >> 56) ))
+
+
+#define __swab16(x)  ___constant_swab16(x)
+#define __swab32(x)  ___constant_swab32(x)
+#define __swab64(x)  ___constant_swab64(x)
+
+#define __swab16s(x) do { *(x) = __swab16((USHORT)(*(x)));} while(0)
+#define __swab32s(x) do { *(x) = __swab32((ULONG)(*(x)));} while(0)
+#define __swab64s(x) do { *(x) = __swab64((ULONGLONG)(*(x)));} while(0)
+
+#define __constant_htonl(x) ___constant_swab32((x))
+#define __constant_ntohl(x) ___constant_swab32((x))
+#define __constant_htons(x) ___constant_swab16((x))
+#define __constant_ntohs(x) ___constant_swab16((x))
+#define __constant_cpu_to_le64(x) ((__u64)(x))
+#define __constant_le64_to_cpu(x) ((__u64)(x))
+#define __constant_cpu_to_le32(x) ((__u32)(x))
+#define __constant_le32_to_cpu(x) ((__u32)(x))
+#define __constant_cpu_to_le16(x) ((__u16)(x))
+#define __constant_le16_to_cpu(x) ((__u16)(x))
+#define __constant_cpu_to_be64(x) ___constant_swab64((x))
+#define __constant_be64_to_cpu(x) ___constant_swab64((x))
+#define __constant_cpu_to_be32(x) ___constant_swab32((x))
+#define __constant_be32_to_cpu(x) ___constant_swab32((x))
+#define __constant_cpu_to_be16(x) ___constant_swab16((x))
+#define __constant_be16_to_cpu(x) ___constant_swab16((x))
+#define __cpu_to_le64(x) ((__u64)(x))
+#define __le64_to_cpu(x) ((__u64)(x))
+#define __cpu_to_le32(x) ((__u32)(x))
+#define __le32_to_cpu(x) ((__u32)(x))
+#define __cpu_to_le16(x) ((__u16)(x))
+#define __le16_to_cpu(x) ((__u16)(x))
+#define __cpu_to_be64(x) __swab64((x))
+#define __be64_to_cpu(x) __swab64((x))
+#define __cpu_to_be32(x) __swab32((x))
+#define __be32_to_cpu(x) __swab32((x))
+#define __cpu_to_be16(x) __swab16((x))
+#define __be16_to_cpu(x) __swab16((x))
+#define __cpu_to_le64p(x) (*(__u64*)(x))
+#define __le64_to_cpup(x) (*(__u64*)(x))
+#define __cpu_to_le32p(x) (*(__u32*)(x))
+#define __le32_to_cpup(x) (*(__u32*)(x))
+#define __cpu_to_le16p(x) (*(__u16*)(x))
+#define __le16_to_cpup(x) (*(__u16*)(x))
+#define __cpu_to_be64p(x) __swab64p((x))
+#define __be64_to_cpup(x) __swab64p((x))
+#define __cpu_to_be32p(x) __swab32p((x))
+#define __be32_to_cpup(x) __swab32p((x))
+#define __cpu_to_be16p(x) __swab16p((x))
+#define __be16_to_cpup(x) __swab16p((x))
+#define __cpu_to_le64s(x) do {} while (0)
+#define __le64_to_cpus(x) do {} while (0)
+#define __cpu_to_le32s(x) do {} while (0)
+#define __le32_to_cpus(x) do {} while (0)
+#define __cpu_to_le16s(x) do {} while (0)
+#define __le16_to_cpus(x) do {} while (0)
+#define __cpu_to_be64s(x) __swab64s((x))
+#define __be64_to_cpus(x) __swab64s((x))
+#define __cpu_to_be32s(x) __swab32s((x))
+#define __be32_to_cpus(x) __swab32s((x))
+#define __cpu_to_be16s(x) __swab16s((x))
+#define __be16_to_cpus(x) __swab16s((x))
+
+#ifndef cpu_to_le64
+#define cpu_to_le64 __cpu_to_le64
+#define le64_to_cpu __le64_to_cpu
+#define cpu_to_le32 __cpu_to_le32
+#define le32_to_cpu __le32_to_cpu
+#define cpu_to_le16 __cpu_to_le16
+#define le16_to_cpu __le16_to_cpu
+#endif
+
+#define cpu_to_be64 __cpu_to_be64
+#define be64_to_cpu __be64_to_cpu
+#define cpu_to_be32 __cpu_to_be32
+#define be32_to_cpu __be32_to_cpu
+#define cpu_to_be16 __cpu_to_be16
+#define be16_to_cpu __be16_to_cpu
+#define cpu_to_le64p __cpu_to_le64p
+#define le64_to_cpup __le64_to_cpup
+#define cpu_to_le32p __cpu_to_le32p
+#define le32_to_cpup __le32_to_cpup
+#define cpu_to_le16p __cpu_to_le16p
+#define le16_to_cpup __le16_to_cpup
+#define cpu_to_be64p __cpu_to_be64p
+#define be64_to_cpup __be64_to_cpup
+#define cpu_to_be32p __cpu_to_be32p
+#define be32_to_cpup __be32_to_cpup
+#define cpu_to_be16p __cpu_to_be16p
+#define be16_to_cpup __be16_to_cpup
+#define cpu_to_le64s __cpu_to_le64s
+#define le64_to_cpus __le64_to_cpus
+#define cpu_to_le32s __cpu_to_le32s
+#define le32_to_cpus __le32_to_cpus
+#define cpu_to_le16s __cpu_to_le16s
+#define le16_to_cpus __le16_to_cpus
+#define cpu_to_be64s __cpu_to_be64s
+#define be64_to_cpus __be64_to_cpus
+#define cpu_to_be32s __cpu_to_be32s
+#define be32_to_cpus __be32_to_cpus
+#define cpu_to_be16s __cpu_to_be16s
+#define be16_to_cpus __be16_to_cpus
+
+
+//
+// Network to host byte swap functions
+//
+
+#define ntohl(x)           ( ( ( ( x ) & 0x000000ff ) << 24 ) | \
+                             ( ( ( x ) & 0x0000ff00 ) << 8 ) | \
+                             ( ( ( x ) & 0x00ff0000 ) >> 8 ) | \
+                             ( ( ( x ) & 0xff000000 ) >> 24 )   )
+
+#define ntohs(x)           ( ( ( ( x ) & 0xff00 ) >> 8 ) | \
+                             ( ( ( x ) & 0x00ff ) << 8 ) )
+
+
+#define htonl(x)           ntohl(x)
+#define htons(x)           ntohs(x)
+
+
+
+#ifndef _I386_ERRNO_H
+#define _I386_ERRNO_H
+
+#define        EPERM            1      /* Operation not permitted */
+#define        ENOENT           2      /* No such file or directory */
+#define        ESRCH            3      /* No such process */
+#define        EINTR            4      /* Interrupted system call */
+#define        EIO                  5  /* I/O error */
+#define        ENXIO            6      /* No such device or address */
+#define        E2BIG            7      /* Arg list too long */
+#define        ENOEXEC          8      /* Exec format error */
+#define        EBADF            9      /* Bad file number */
+#define        ECHILD          10      /* No child processes */
+#define        EAGAIN          11      /* Try again */
+#define        ENOMEM          12      /* Out of memory */
+#define        EACCES          13      /* Permission denied */
+#define        EFAULT          14      /* Bad address */
+#define        ENOTBLK         15      /* Block device required */
+#define        EBUSY           16      /* Device or resource busy */
+#define        EEXIST          17      /* File exists */
+#define        EXDEV           18      /* Cross-device link */
+#define        ENODEV          19      /* No such device */
+#define        ENOTDIR         20      /* Not a directory */
+#define        EISDIR          21      /* Is a directory */
+#define        EINVAL          22      /* Invalid argument */
+#define        ENFILE          23      /* File table overflow */
+#define        EMFILE          24      /* Too many open files */
+#define        ENOTTY          25      /* Not a typewriter */
+#define        ETXTBSY         26      /* Text file busy */
+#define        EFBIG           27      /* File too large */
+#define        ENOSPC          28      /* No space left on device */
+#define        ESPIPE          29      /* Illegal seek */
+#define        EROFS           30      /* Read-only file system */
+#define        EMLINK          31      /* Too many links */
+#define        EPIPE           32      /* Broken pipe */
+#define        EDOM            33      /* Math argument out of domain of func */
+#define        ERANGE          34      /* Math result not representable */
+#undef EDEADLK
+#define        EDEADLK         35      /* Resource deadlock would occur */
+#undef ENAMETOOLONG
+#define        ENAMETOOLONG    36      /* File name too long */
+#undef ENOLCK
+#define        ENOLCK          37      /* No record locks available */
+#undef ENOSYS
+#define        ENOSYS          38      /* Function not implemented */
+#undef ENOTEMPTY
+#define        ENOTEMPTY       39      /* Directory not empty */
+#define        ELOOP           40      /* Too many symbolic links encountered */
+#define        EWOULDBLOCK     EAGAIN  /* Operation would block */
+#define        ENOMSG          42      /* No message of desired type */
+#define        EIDRM           43      /* Identifier removed */
+#define        ECHRNG          44      /* Channel number out of range */
+#define        EL2NSYNC        45      /* Level 2 not synchronized */
+#define        EL3HLT          46      /* Level 3 halted */
+#define        EL3RST          47      /* Level 3 reset */
+#define        ELNRNG          48      /* Link number out of range */
+#define        EUNATCH         49      /* Protocol driver not attached */
+#define        ENOCSI          50      /* No CSI structure available */
+#define        EL2HLT          51      /* Level 2 halted */
+#define        EBADE           52      /* Invalid exchange */
+#define        EBADR           53      /* Invalid request descriptor */
+#define        EXFULL          54      /* Exchange full */
+#define        ENOANO          55      /* No anode */
+#define        EBADRQC         56      /* Invalid request code */
+#define        EBADSLT         57      /* Invalid slot */
+
+#define        EDEADLOCK       EDEADLK
+
+#define        EBFONT          59      /* Bad font file format */
+#define        ENOSTR          60      /* Device not a stream */
+#define        ENODATA         61      /* No data available */
+#define        ETIME           62      /* Timer expired */
+#define        ENOSR           63      /* Out of streams resources */
+#define        ENONET          64      /* Machine is not on the network */
+#define        ENOPKG          65      /* Package not installed */
+#define        EREMOTE         66      /* Object is remote */
+#define        ENOLINK         67      /* Link has been severed */
+#define        EADV            68      /* Advertise error */
+#define        ESRMNT          69      /* Srmount error */
+#define        ECOMM           70      /* Communication error on send */
+#define        EPROTO          71      /* Protocol error */
+#define        EMULTIHOP       72      /* Multihop attempted */
+#define        EDOTDOT         73      /* RFS specific error */
+#define        EBADMSG         74      /* Not a data message */
+#define        EOVERFLOW       75      /* Value too large for defined data type */
+#define        ENOTUNIQ        76      /* Name not unique on network */
+#define        EBADFD          77      /* File descriptor in bad state */
+#define        EREMCHG         78      /* Remote address changed */
+#define        ELIBACC         79      /* Can not access a needed shared library */
+#define        ELIBBAD         80      /* Accessing a corrupted shared library */
+#define        ELIBSCN         81      /* .lib section in a.out corrupted */
+#define        ELIBMAX         82      /* Attempting to link in too many shared libraries */
+#define        ELIBEXEC        83      /* Cannot exec a shared library directly */
+#undef EILSEQ
+#define        EILSEQ          84      /* Illegal byte sequence */
+#define        ERESTART        85      /* Interrupted system call should be restarted */
+#define        ESTRPIPE        86      /* Streams pipe error */
+#define        EUSERS          87      /* Too many users */
+#define        ENOTSOCK        88      /* Socket operation on non-socket */
+#define        EDESTADDRREQ    89      /* Destination address required */
+#define        EMSGSIZE        90      /* Message too long */
+#define        EPROTOTYPE      91      /* Protocol wrong type for socket */
+#define        ENOPROTOOPT     92      /* Protocol not available */
+#define        EPROTONOSUPPORT 93      /* Protocol not supported */
+#define        ESOCKTNOSUPPORT 94      /* Socket type not supported */
+#define        EOPNOTSUPP      95      /* Operation not supported on transport endpoint */
+#define        EPFNOSUPPORT    96      /* Protocol family not supported */
+#define        EAFNOSUPPORT    97      /* Address family not supported by protocol */
+#define        EADDRINUSE      98      /* Address already in use */
+#define        EADDRNOTAVAIL   99      /* Cannot assign requested address */
+#define        ENETDOWN        100     /* Network is down */
+#define        ENETUNREACH     101     /* Network is unreachable */
+#define        ENETRESET       102     /* Network dropped connection because of reset */
+#define        ECONNABORTED    103     /* Software caused connection abort */
+#define        ECONNRESET      104     /* Connection reset by peer */
+#define        ENOBUFS         105     /* No buffer space available */
+#define        EISCONN         106     /* Transport endpoint is already connected */
+#define        ENOTCONN        107     /* Transport endpoint is not connected */
+#define        ESHUTDOWN       108     /* Cannot send after transport endpoint shutdown */
+#define        ETOOMANYREFS    109     /* Too many references: cannot splice */
+#define        ETIMEDOUT       110     /* Connection timed out */
+#define        ECONNREFUSED    111     /* Connection refused */
+#define        EHOSTDOWN       112     /* Host is down */
+#define        EHOSTUNREACH    113     /* No route to host */
+#define        EALREADY        114     /* Operation already in progress */
+#define        EINPROGRESS     115     /* Operation now in progress */
+#define        ESTALE          116     /* Stale NFS file handle */
+#define        EUCLEAN         117     /* Structure needs cleaning */
+#define        ENOTNAM         118     /* Not a XENIX named type file */
+#define        ENAVAIL         119     /* No XENIX semaphores available */
+#define        EISNAM          120     /* Is a named type file */
+#define        EREMOTEIO       121     /* Remote I/O error */
+#define        EDQUOT          122     /* Quota exceeded */
+
+#define        ENOMEDIUM       123     /* No medium found */
+#define        EMEDIUMTYPE     124     /* Wrong medium type */
+
+/* Should never be seen by user programs */
+#define ERESTARTSYS    512
+#define ERESTARTNOINTR 513
+#define ERESTARTNOHAND 514     /* restart if no handler.. */
+#define ENOIOCTLCMD    515     /* No ioctl command */
+
+/* Defined for the NFSv3 protocol */
+#define EBADHANDLE     521     /* Illegal NFS file handle */
+#define ENOTSYNC       522     /* Update synchronization mismatch */
+#define EBADCOOKIE     523     /* Cookie is stale */
+#define ENOTSUPP       524     /* Operation is not supported */
+#define ETOOSMALL      525     /* Buffer or request is too small */
+#define ESERVERFAULT   526     /* An untranslatable error occurred */
+#define EBADTYPE       527     /* Type not supported by server */
+#define EJUKEBOX       528     /* Request initiated, but will not complete before timeout */
+
+
+
+/* open/fcntl - O_SYNC is only implemented on blocks devices and on files
+   located on an ext2 file system */
+#define O_ACCMODE         0003
+#define O_RDONLY            00
+#define O_WRONLY            01
+#define O_RDWR              02
+#define O_CREAT                   0100 /* not fcntl */
+#define O_EXCL            0200 /* not fcntl */
+#define O_NOCTTY          0400 /* not fcntl */
+#define O_TRUNC                  01000 /* not fcntl */
+#define O_APPEND         02000
+#define O_NONBLOCK       04000
+#define O_NDELAY       O_NONBLOCK
+#define O_SYNC          010000
+#define FASYNC          020000 /* fcntl, for BSD compatibility */
+#define O_DIRECT        040000 /* direct disk access hint */
+#define O_LARGEFILE    0100000
+#define O_DIRECTORY    0200000 /* must be a directory */
+#define O_NOFOLLOW     0400000 /* don't follow links */
+
+#define F_DUPFD                0       /* dup */
+#define F_GETFD                1       /* get close_on_exec */
+#define F_SETFD                2       /* set/clear close_on_exec */
+#define F_GETFL                3       /* get file->f_flags */
+#define F_SETFL                4       /* set file->f_flags */
+#define F_GETLK                5
+#define F_SETLK                6
+#define F_SETLKW       7
+
+#define F_SETOWN       8       /*  for sockets. */
+#define F_GETOWN       9       /*  for sockets. */
+#define F_SETSIG       10      /*  for sockets. */
+#define F_GETSIG       11      /*  for sockets. */
+
+#define F_GETLK64      12      /*  using 'struct flock64' */
+#define F_SETLK64      13
+#define F_SETLKW64     14
+
+/* for F_[GET|SET]FL */
+#define FD_CLOEXEC     1       /* actually anything with low bit set goes */
+
+/* for posix fcntl() and lockf() */
+#define F_RDLCK                0
+#define F_WRLCK                1
+#define F_UNLCK                2
+
+/* for old implementation of bsd flock () */
+#define F_EXLCK                4       /* or 3 */
+#define F_SHLCK                8       /* or 4 */
+
+/* for leases */
+#define F_INPROGRESS   16
+
+/* operations for bsd flock(), also used by the kernel implementation */
+#define LOCK_SH                1       /* shared lock */
+#define LOCK_EX                2       /* exclusive lock */
+#define LOCK_NB                4       /* or'd with one of the above to prevent
+                                  blocking */
+#define LOCK_UN                8       /* remove lock */
+
+#define LOCK_MAND      32      /* This is a mandatory flock */
+#define LOCK_READ      64      /* ... Which allows concurrent read operations */
+#define LOCK_WRITE     128     /* ... Which allows concurrent write operations */
+#define LOCK_RW                192     /* ... Which allows concurrent read & write ops */
+
+#endif
+
+
+#ifndef LIBCFS_SIGNAL_H
+#define LIBCFS_SIGNAL_H
+
+/*
+ *  signal values ...
+ */
+
+#define SIGHUP          1
+#define SIGINT          2
+#define SIGQUIT                 3
+#define SIGILL          4
+#define SIGTRAP                 5
+#define SIGABRT                 6
+#define SIGIOT          6
+#define SIGBUS          7
+#define SIGFPE          8
+#define SIGKILL                 9
+#define SIGUSR1                10
+#define SIGSEGV                11
+#define SIGUSR2                12
+#define SIGPIPE                13
+#define SIGALRM                14
+#define SIGTERM                15
+#define SIGSTKFLT      16
+#define SIGCHLD                17
+#define SIGCONT                18
+#define SIGSTOP                19
+#define SIGTSTP                20
+#define SIGTTIN                21
+#define SIGTTOU                22
+#define SIGURG         23
+#define SIGXCPU                24
+#define SIGXFSZ                25
+#define SIGVTALRM      26
+#define SIGPROF                27
+#define SIGWINCH       28
+#define SIGIO          29
+#define SIGPOLL                SIGIO
+/*
+#define SIGLOST                29
+*/
+#define SIGPWR         30
+#define SIGSYS         31
+#define        SIGUNUSED       31
+
+/* These should not be considered constants from userland.  */
+#define SIGRTMIN       32
+#define SIGRTMAX       (_NSIG-1)
+
+/*
+ * SA_FLAGS values:
+ *
+ * SA_ONSTACK indicates that a registered stack_t will be used.
+ * SA_INTERRUPT is a no-op, but left due to historical reasons. Use the
+ * SA_RESTART flag to get restarting signals (which were the default long ago)
+ * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop.
+ * SA_RESETHAND clears the handler when the signal is delivered.
+ * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies.
+ * SA_NODEFER prevents the current signal from being masked in the handler.
+ *
+ * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single
+ * Unix names RESETHAND and NODEFER respectively.
+ */
+#define SA_NOCLDSTOP   0x00000001
+#define SA_NOCLDWAIT   0x00000002 /* not supported yet */
+#define SA_SIGINFO     0x00000004
+#define SA_ONSTACK     0x08000000
+#define SA_RESTART     0x10000000
+#define SA_NODEFER     0x40000000
+#define SA_RESETHAND   0x80000000
+
+#define SA_NOMASK      SA_NODEFER
+#define SA_ONESHOT     SA_RESETHAND
+#define SA_INTERRUPT   0x20000000 /* dummy -- ignored */
+
+#define SA_RESTORER    0x04000000
+
+/* 
+ * sigaltstack controls
+ */
+#define SS_ONSTACK     1
+#define SS_DISABLE     2
+
+#define MINSIGSTKSZ    2048
+#define SIGSTKSZ       8192
+
+
+#define sigmask(sig)   ((__u32)1 << ((sig) - 1))
+
+#endif // LIBCFS_SIGNAL_H
\ No newline at end of file
index 3df0f2b..a6e5159 100644 (file)
@@ -1,4 +1,4 @@
-portalsdir=$(includedir)/portals
+lnetdir=$(includedir)/lnet
 
 SUBDIRS := linux
 if DARWIN
@@ -6,7 +6,6 @@ SUBDIRS += darwin
 endif
 DIST_SUBDIRS := $(SUBDIRS)
 
-EXTRA_DIST = api.h api-support.h build_check.h errno.h         \
-       internal.h kpr.h lib-p30.h lib-types.h                  \
-       myrnal.h nal.h nalids.h p30.h ptlctl.h  \
-       socknal.h stringtab.h types.h
+EXTRA_DIST = api.h api-support.h \
+       lib-lnet.h lib-types.h lnet.h lnetctl.h types.h \
+        socklnd.h ptllnd.h ptllnd_wire.h
index 848cf40..717559f 100644 (file)
@@ -1,24 +1,18 @@
-#ifndef __API_SUPPORT_H__
-#define __API_SUPPORT_H__
-#include "build_check.h"
+#ifndef __LNET_API_SUPPORT_H__
+#define __LNET_API_SUPPORT_H__
 
-#ifndef __KERNEL__
-# include <stdio.h>
-# include <stdlib.h>
-# include <unistd.h>
-# include <time.h>
-
-/* Lots of POSIX dependencies to support PtlEQWait_timeout */
-# include <signal.h>
-# include <setjmp.h>
-# include <time.h>
+#if defined(__linux__)
+#include <lnet/linux/api-support.h>
+#elif defined(__APPLE__)
+#include <lnet/darwin/api-support.h>
+#elif defined(__WINNT__)
+#include <lnet/winnt/api-support.h>
+#else
+#error Unsupported Operating System
 #endif
 
-#include <portals/types.h>
+#include <lnet/types.h>
 #include <libcfs/kp30.h>
-#include <portals/p30.h>
-
-#include <portals/internal.h>
-#include <portals/nal.h>
+#include <lnet/lnet.h>
 
 #endif
index 2d3a8f6..481a0fd 100644 (file)
-#ifndef P30_API_H
-#define P30_API_H
+#ifndef __LNET_API_H__
+#define __LNET_API_H__
 
-#include "build_check.h"
+#include <lnet/types.h>
 
-#include <portals/types.h>
+int LNetInit(void);
+void LNetFini(void);
 
-int PtlInit(int *);
-void PtlFini(void);
+int LNetNIInit(lnet_pid_t requested_pid);
+int LNetNIFini(void);
 
-int PtlNIInit(ptl_interface_t interface, ptl_pid_t requested_pid,
-             ptl_ni_limits_t *desired_limits, ptl_ni_limits_t *actual_limits,
-              ptl_handle_ni_t *interface_out);
-
-int PtlNIInitialized(ptl_interface_t);
-
-int PtlNIFini(ptl_handle_ni_t interface_in);
-
-int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id);
-
-int PtlGetUid(ptl_handle_ni_t ni_handle, ptl_uid_t *uid);
-
-
-/*
- * Network interfaces
- */
-
-int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in,
-                ptl_sr_value_t * status_out);
-
-int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in,
-              unsigned long *distance_out);
-
-int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * interface_out);
-
-
-/* 
- * PtlFailNid
- *
- * Not an official Portals 3 API call.  It provides a way of simulating
- * communications failures to all (nid == PTL_NID_ANY), or specific peers
- * (via multiple calls), either until further notice (threshold == -1), or
- * for a specific number of messages.  Passing a threshold of zero, "heals"
- * the given peer.
- */
-int PtlFailNid (ptl_handle_ni_t ni, ptl_nid_t nid, unsigned int threshold);
-
-/* 
- * PtlLoopback
- *
- * Not an official Portals 3 API call.  It provides a way of enabling or
- * disabling loopback optimisation, or getting its current state.
- */
-int PtlLoopback (ptl_handle_ni_t ni, int set, int *enabled);
+int LNetGetId(unsigned int index, lnet_process_id_t *id);
+int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, int *order);
+int LNetCtl(unsigned int cmd, void *arg);
+void LNetSnprintHandle (char *str, int str_len, lnet_handle_any_t handle);
 
 /*
- * PtlSnprintHandle: 
- *
- * This is not an official Portals 3 API call.  It is provided
- * so that an application can print an opaque handle.
+ * Portals
  */
-void PtlSnprintHandle (char *str, int str_len, ptl_handle_any_t handle);
+int LNetSetLazyPortal(int portal);
+int LNetClearLazyPortal(int portal);
 
 /*
  * Match entries
  */
-
-int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in,
-                ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in,
-                ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in,
-                ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out);
-
-int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in,
-                ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in,
-                ptl_unlink_t unlink_in, ptl_ins_pos_t position_in,
-                ptl_handle_me_t * handle_out);
-
-int PtlMEUnlink(ptl_handle_me_t current_in);
-
-int PtlMEUnlinkList(ptl_handle_me_t current_in);
-
-
+int LNetMEAttach(unsigned int      portal,
+                lnet_process_id_t match_id_in, 
+                __u64             match_bits_in,
+                __u64             ignore_bits_in, 
+                lnet_unlink_t     unlink_in,
+                lnet_ins_pos_t    pos_in, 
+                lnet_handle_me_t *handle_out);
+
+int LNetMEInsert(lnet_handle_me_t  current_in, 
+                lnet_process_id_t match_id_in,
+                __u64             match_bits_in, 
+                __u64             ignore_bits_in,
+                lnet_unlink_t     unlink_in, 
+                lnet_ins_pos_t    position_in,
+                lnet_handle_me_t *handle_out);
+
+int LNetMEUnlink(lnet_handle_me_t current_in);
 
 /*
  * Memory descriptors
  */
+int LNetMDAttach(lnet_handle_me_t  current_in, 
+                lnet_md_t         md_in,
+                lnet_unlink_t     unlink_in, 
+                lnet_handle_md_t *handle_out);
 
-int PtlMDAttach(ptl_handle_me_t current_in, ptl_md_t md_in,
-                ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out);
-
-int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in,
-             ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out);
-
-int PtlMDUnlink(ptl_handle_md_t md_in);
-
-int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t * old_inout,
-                ptl_md_t * new_inout, ptl_handle_eq_t testq_in);
-
-
-/* These should not be called by users */
-int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout,
-                         ptl_md_t * new_inout, ptl_handle_eq_t testq_in,
-                         ptl_seq_t sequence_in);
-
-
+int LNetMDBind(lnet_md_t         md_in,
+              lnet_unlink_t     unlink_in, 
+              lnet_handle_md_t *handle_out);
 
+int LNetMDUnlink(lnet_handle_md_t md_in);
 
 /*
  * Event queues
  */
-int PtlEQAlloc(ptl_handle_ni_t ni_in, ptl_size_t count_in,
-               ptl_eq_handler_t handler,
-               ptl_handle_eq_t *handle_out);
-int PtlEQFree(ptl_handle_eq_t eventq_in);
-
-int PtlEQGet(ptl_handle_eq_t eventq_in, ptl_event_t * event_out);
+int LNetEQAlloc(unsigned int       count_in,
+               lnet_eq_handler_t  handler,
+               lnet_handle_eq_t  *handle_out);
 
+int LNetEQFree(lnet_handle_eq_t eventq_in);
 
-int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t * event_out);
+int LNetEQGet(lnet_handle_eq_t  eventq_in, 
+             lnet_event_t     *event_out);
 
-int PtlEQPoll(ptl_handle_eq_t *eventqs_in, int neq_in, int timeout,
-             ptl_event_t *event_out, int *which_out);
 
-/*
- * Access Control Table
- */
-int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in,
-               ptl_process_id_t match_id_in, ptl_pt_index_t portal_in);
+int LNetEQWait(lnet_handle_eq_t  eventq_in, 
+              lnet_event_t     *event_out);
 
+int LNetEQPoll(lnet_handle_eq_t *eventqs_in, 
+              int               neq_in, 
+              int               timeout_ms,
+              lnet_event_t     *event_out, 
+              int              *which_eq_out);
 
 /*
  * Data movement
  */
-
-int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in,
-           ptl_process_id_t target_in, ptl_pt_index_t portal_in,
-           ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in,
-           ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in);
-
-int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in,
-           ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in,
-           ptl_match_bits_t match_bits_in, ptl_size_t offset_in);
-
-
+int LNetPut(lnet_nid_t        self,
+           lnet_handle_md_t  md_in, 
+           lnet_ack_req_t    ack_req_in,
+           lnet_process_id_t target_in, 
+           unsigned int      portal_in,
+           __u64             match_bits_in,
+           unsigned int      offset_in, 
+           __u64             hdr_data_in);
+
+int LNetGet(lnet_nid_t        self,
+           lnet_handle_md_t  md_in, 
+           lnet_process_id_t target_in,
+           unsigned int      portal_in, 
+           __u64             match_bits_in, 
+           unsigned int      offset_in);
 
 #endif
diff --git a/lnet/include/lnet/build_check.h b/lnet/include/lnet/build_check.h
deleted file mode 100644 (file)
index c219d2a..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef _BUILD_CHECK_H
-#define _BUILD_CHECK_H
-
-#if CRAY_PORTALS
-#error "an application got to me instead of cray's includes"
-#endif
-
-#endif
index b6e7daf..409e159 100644 (file)
@@ -1 +1 @@
-EXTRA_DIST := lib-p30.h  lib-types.h  p30.h
+EXTRA_DIST := lib-lnet.h  lib-types.h  lnet.h api-support.h
diff --git a/lnet/include/lnet/darwin/api-support.h b/lnet/include/lnet/darwin/api-support.h
new file mode 100644 (file)
index 0000000..c411f17
--- /dev/null
@@ -0,0 +1,27 @@
+#ifndef __DARWIN_API_SUPPORT_H__
+#define __DARWIN_API_SUPPORT_H__
+
+#ifndef __LNET_API_SUPPORT_H__
+#error Do not #include this file directly. #include <portals/api-support.h> instead
+#endif
+
+#ifndef __KERNEL__
+# include <stdio.h>
+# include <stdlib.h>
+# include <unistd.h>
+# include <time.h>
+
+/* Lots of POSIX dependencies to support PtlEQWait_timeout */
+# include <signal.h>
+# include <setjmp.h>
+# include <time.h>
+
+# ifdef HAVE_LIBREADLINE
+#  include <readline/readline.h>
+typedef VFunction      rl_vintfunc_t;
+typedef VFunction      rl_voidfunc_t;
+# endif
+#endif
+
+
+#endif
index d3b1ba9..af4bc5d 100644 (file)
@@ -1,14 +1,16 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  */
-#ifndef __PORTALS_DARWIN_LIB_P30_H__
-#define __PORTALS_DARWIN_LIB_P30_H__
+#ifndef __LNET_DARWIN_LIB_LNET_H__
+#define __LNET_DARWIN_LIB_LNET_H__
 
-#ifndef __PORTALS_LIB_P30_H__
-#error Do not #include this file directly. #include <portals/lib-p30.h> instead
+#ifndef __LNET_LIB_LNET_H__
+#error Do not #include this file directly. #include <lnet/lib-lnet.h> instead
 #endif
 
 #include <string.h>
 #include <libcfs/libcfs.h>
 
+#undef LNET_ROUTER
+
 #endif
diff --git a/lnet/include/lnet/darwin/lib-p30.h b/lnet/include/lnet/darwin/lib-p30.h
deleted file mode 100644 (file)
index d3b1ba9..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- */
-#ifndef __PORTALS_DARWIN_LIB_P30_H__
-#define __PORTALS_DARWIN_LIB_P30_H__
-
-#ifndef __PORTALS_LIB_P30_H__
-#error Do not #include this file directly. #include <portals/lib-p30.h> instead
-#endif
-
-#include <string.h>
-#include <libcfs/libcfs.h>
-
-#endif
index 744e566..f1552fb 100644 (file)
@@ -1,15 +1,27 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  */
-#ifndef __PORTALS_DARWIN_LIB_TYPES_H__
-#define __PORTALS_DARWIN_LIB_TYPES_H__
+#ifndef __LNET_DARWIN_LIB_TYPES_H__
+#define __LNET_DARWIN_LIB_TYPES_H__
 
-#ifndef __PORTALS_LIB_TYPES_H__
-#error Do not #include this file directly. #include <portals/lib-types.h> instead
+#ifndef __LNET_LIB_TYPES_H__
+#error Do not #include this file directly. #include <lnet/lib-types.h> instead
 #endif
 
 #include <sys/types.h>
 #include <libcfs/libcfs.h>
 #include <libcfs/list.h>
 
+/*
+ * XXX Liang:
+ *
+ * Temporary fix, because lnet_me_free()->cfs_free->FREE() can be blocked in xnu,
+ * at then same time we've taken LNET_LOCK(), which is a spinlock.
+ * by using LNET_USE_LIB_FREELIST, we can avoid calling of FREE().
+ *
+ * A better solution is moving lnet_me_free() out from LNET_LOCK, it's not hard
+ * but need to be very careful and take some time.
+ */
+#define LNET_USE_LIB_FREELIST
+
 #endif
index e619fa7..82a6127 100644 (file)
@@ -1,15 +1,15 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  */
-#ifndef _PORTALS_DARWIN_P30_H_
-#define _PORTALS_DARWIN_P30_H_
+#ifndef __LNET_DARWIN_LNET_H__
+#define __LNET_DARWIN_LNET_H__
 
-#ifndef __PORTALS_P30_H__
-#error Do not #include this file directly. #include <portals/p30.h> instead
+#ifndef __LNET_H__
+#error Do not #include this file directly. #include <lnet/lnet.h> instead
 #endif
 
 /*
- * p30.h
+ * lnet.h
  *
  * User application interface file
  */
diff --git a/lnet/include/lnet/darwin/p30.h b/lnet/include/lnet/darwin/p30.h
deleted file mode 100644 (file)
index e619fa7..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- */
-#ifndef _PORTALS_DARWIN_P30_H_
-#define _PORTALS_DARWIN_P30_H_
-
-#ifndef __PORTALS_P30_H__
-#error Do not #include this file directly. #include <portals/p30.h> instead
-#endif
-
-/*
- * p30.h
- *
- * User application interface file
- */
-
-#include <sys/types.h>
-#include <sys/uio.h>
-
-#endif
diff --git a/lnet/include/lnet/errno.h b/lnet/include/lnet/errno.h
deleted file mode 100644 (file)
index 42f2626..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef _P30_ERRNO_H_
-#define _P30_ERRNO_H_
-
-#include "build_check.h"
-/*
- * include/portals/errno.h
- *
- * Shared error number lists
- */
-
-/* If you change these, you must update the string table in api-errno.c */
-typedef enum {
-        PTL_OK                 = 0,
-        PTL_SEGV               = 1,
-
-        PTL_NO_SPACE           = 2,
-        PTL_ME_IN_USE          = 3,
-        PTL_VAL_FAILED         = 4,
-
-        PTL_NAL_FAILED         = 5,
-        PTL_NO_INIT            = 6,
-        PTL_IFACE_DUP          = 7,
-        PTL_IFACE_INVALID      = 8,
-
-        PTL_HANDLE_INVALID     = 9,
-        PTL_MD_INVALID         = 10,
-        PTL_ME_INVALID         = 11,
-/* If you change these, you must update the string table in api-errno.c */
-        PTL_PROCESS_INVALID    = 12,
-        PTL_PT_INDEX_INVALID   = 13,
-
-        PTL_SR_INDEX_INVALID   = 14,
-        PTL_EQ_INVALID         = 15,
-        PTL_EQ_DROPPED         = 16,
-
-        PTL_EQ_EMPTY           = 17,
-        PTL_MD_NO_UPDATE       = 18,
-        PTL_FAIL               = 19,
-
-        PTL_IOV_INVALID        = 20,
-
-       PTL_EQ_IN_USE           = 21,
-
-       PTL_NI_INVALID          = 22,
-       PTL_MD_ILLEGAL          = 23,
-       
-        PTL_MAX_ERRNO          = 24
-} ptl_err_t;
-/* If you change these, you must update the string table in api-errno.c */
-
-extern const char *ptl_err_str[];
-
-#endif
diff --git a/lnet/include/lnet/internal.h b/lnet/include/lnet/internal.h
deleted file mode 100644 (file)
index eae00a0..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _P30_INTERNAL_H_
-#define _P30_INTERNAL_H_
-
-#include "build_check.h"
-/*
- * p30/internal.h
- *
- * Internals for the API level library that are not needed
- * by the user application
- */
-
-#include <portals/p30.h>
-
-extern int ptl_init;           /* Has the library been initialized */
-
-#endif
diff --git a/lnet/include/lnet/kpr.h b/lnet/include/lnet/kpr.h
deleted file mode 100644 (file)
index 23d6e7c..0000000
+++ /dev/null
@@ -1,176 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- */
-#ifndef __PORTALS_KPR_H__
-#define __PORTALS_KPR_H__
-
-# include <portals/lib-types.h> /* for ptl_hdr_t */
-
-/******************************************************************************/
-/* Kernel Portals Router interface */
-
-typedef void (*kpr_fwd_callback_t)(void *arg, int error); // completion callback
-
-/* space for routing targets to stash "stuff" in a forwarded packet */
-typedef union {
-        long long        _alignment;
-        void            *_space[16];            /* scale with CPU arch */
-} kprfd_scratch_t;
-
-/* Kernel Portals Routing Forwarded message Descriptor */
-typedef struct {
-        struct list_head     kprfd_list;        /* stash in queues (routing target can use) */
-        ptl_nid_t            kprfd_target_nid;  /* final destination NID */
-        ptl_nid_t            kprfd_gateway_nid; /* gateway NID */
-        ptl_hdr_t           *kprfd_hdr;         /* header in wire byte order */
-        int                  kprfd_nob;         /* # payload bytes */
-        int                  kprfd_niov;        /* # payload frags */
-        ptl_kiov_t          *kprfd_kiov;        /* payload fragments */
-        void                *kprfd_router_arg;  /* originating NAL's router arg */
-        kpr_fwd_callback_t   kprfd_callback;    /* completion callback */
-        void                *kprfd_callback_arg; /* completion callback arg */
-        kprfd_scratch_t      kprfd_scratch;     /* scratchpad for routing targets */
-} kpr_fwd_desc_t;
-
-typedef void  (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd);
-typedef void  (*kpr_notify_t)(void *arg, ptl_nid_t peer, int alive);
-
-/* NAL's routing interface (Kernel Portals Routing Nal Interface) */
-typedef const struct {
-        int             kprni_nalid;    /* NAL's id */
-        void           *kprni_arg;      /* Arg to pass when calling into NAL */
-        kpr_fwd_t       kprni_fwd;      /* NAL's forwarding entrypoint */
-        kpr_notify_t    kprni_notify;   /* NAL's notification entrypoint */
-} kpr_nal_interface_t;
-
-/* Router's routing interface (Kernel Portals Routing Router Interface) */
-typedef const struct {
-        /* register the calling NAL with the router and get back the handle for
-         * subsequent calls */
-        int     (*kprri_register) (kpr_nal_interface_t *nal_interface,
-                                   void **router_arg);
-
-        /* ask the router to find a gateway that forwards to 'nid' and is a
-         * peer of the calling NAL; assume caller will send 'nob' bytes of
-         * payload there */
-        int     (*kprri_lookup) (void *router_arg, ptl_nid_t nid, int nob,
-                                 ptl_nid_t *gateway_nid);
-
-        /* hand a packet over to the router for forwarding */
-        kpr_fwd_t kprri_fwd_start;
-
-        /* hand a packet back to the router for completion */
-        void    (*kprri_fwd_done) (void *router_arg, kpr_fwd_desc_t *fwd,
-                                   int error);
-
-        /* notify the router about peer state */
-        void    (*kprri_notify) (void *router_arg, ptl_nid_t peer,
-                                 int alive, time_t when);
-
-        /* the calling NAL is shutting down */
-        void    (*kprri_shutdown) (void *router_arg);
-
-        /* deregister the calling NAL with the router */
-        void    (*kprri_deregister) (void *router_arg);
-
-} kpr_router_interface_t;
-
-/* Convenient struct for NAL to stash router interface/args */
-typedef struct {
-        kpr_router_interface_t  *kpr_interface;
-        void                    *kpr_arg;
-} kpr_router_t;
-
-extern kpr_router_interface_t   kpr_router_interface;
-
-static inline int
-kpr_register (kpr_router_t *router, kpr_nal_interface_t *nalif)
-{
-        int    rc;
-
-        router->kpr_interface = PORTAL_SYMBOL_GET (kpr_router_interface);
-        if (router->kpr_interface == NULL)
-                return (-ENOENT);
-
-        rc = (router->kpr_interface)->kprri_register (nalif, &router->kpr_arg);
-        if (rc != 0)
-                router->kpr_interface = NULL;
-
-        PORTAL_SYMBOL_PUT (kpr_router_interface);
-        return (rc);
-}
-
-static inline int
-kpr_routing (kpr_router_t *router)
-{
-        return (router->kpr_interface != NULL);
-}
-
-static inline int
-kpr_lookup (kpr_router_t *router, ptl_nid_t nid, int nob, ptl_nid_t *gateway_nid)
-{
-        if (!kpr_routing (router))
-                return (-ENETUNREACH);
-
-        return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid, nob,
-                                                    gateway_nid));
-}
-
-static inline void
-kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t nid, ptl_hdr_t *hdr,
-              int nob, int niov, ptl_kiov_t *kiov,
-              kpr_fwd_callback_t callback, void *callback_arg)
-{
-        fwd->kprfd_target_nid   = nid;
-        fwd->kprfd_gateway_nid  = nid;
-        fwd->kprfd_hdr          = hdr;
-        fwd->kprfd_nob          = nob;
-        fwd->kprfd_niov         = niov;
-        fwd->kprfd_kiov         = kiov;
-        fwd->kprfd_callback     = callback;
-        fwd->kprfd_callback_arg = callback_arg;
-}
-
-static inline void
-kpr_fwd_start (kpr_router_t *router, kpr_fwd_desc_t *fwd)
-{
-        if (!kpr_routing (router))
-                fwd->kprfd_callback (fwd->kprfd_callback_arg, -ENETUNREACH);
-        else
-                router->kpr_interface->kprri_fwd_start (router->kpr_arg, fwd);
-}
-
-static inline void
-kpr_fwd_done (kpr_router_t *router, kpr_fwd_desc_t *fwd, int error)
-{
-        LASSERT (kpr_routing (router));
-        router->kpr_interface->kprri_fwd_done (router->kpr_arg, fwd, error);
-}
-
-static inline void
-kpr_notify (kpr_router_t *router,
-            ptl_nid_t peer, int alive, time_t when)
-{
-        if (!kpr_routing (router))
-                return;
-
-        router->kpr_interface->kprri_notify(router->kpr_arg, peer, alive, when);
-}
-
-static inline void
-kpr_shutdown (kpr_router_t *router)
-{
-        if (kpr_routing (router))
-                router->kpr_interface->kprri_shutdown (router->kpr_arg);
-}
-
-static inline void
-kpr_deregister (kpr_router_t *router)
-{
-        if (!kpr_routing (router))
-                return;
-        router->kpr_interface->kprri_deregister (router->kpr_arg);
-        router->kpr_interface = NULL;
-}
-
-#endif /* _KPR_H */
index f56206b..25ab308 100644 (file)
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- * lib-p30.h
+ * lib-lnet.h
  *
  * Top level include for library side routines
  */
 
-#ifndef __PORTALS_LIB_P30_H__
-#define __PORTALS_LIB_P30_H__
-
-#include "build_check.h"
+#ifndef __LNET_LIB_LNET_H__
+#define __LNET_LIB_LNET_H__
 
 #if defined(__linux__)
-#include <portals/linux/lib-p30.h>
+#include <lnet/linux/lib-lnet.h>
 #elif defined(__APPLE__)
-#include <portals/darwin/lib-p30.h>
+#include <lnet/darwin/lib-lnet.h>
+#elif defined(__WINNT__)
+#include <lnet/winnt/lib-lnet.h>
 #else
 #error Unsupported Operating System
 #endif
 
-#include <portals/types.h>
+#include <lnet/types.h>
 #include <libcfs/kp30.h>
-#include <portals/p30.h>
-#include <portals/nal.h>
-#include <portals/lib-types.h>
+#include <lnet/lnet.h>
+#include <lnet/lib-types.h>
+
+extern lnet_t  the_lnet;                        /* THE network */
+
+static inline int lnet_is_wire_handle_none (lnet_handle_wire_t *wh)
+{
+        return (wh->wh_interface_cookie == LNET_WIRE_HANDLE_NONE.wh_interface_cookie &&
+                wh->wh_object_cookie == LNET_WIRE_HANDLE_NONE.wh_object_cookie);
+}
+
+static inline int lnet_md_exhausted (lnet_libmd_t *md) 
+{
+        return (md->md_threshold == 0 ||
+                ((md->md_options & LNET_MD_MAX_SIZE) != 0 &&
+                 md->md_offset + md->md_max_size > md->md_length));
+}
 
-static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh)
+static inline int lnet_md_unlinkable (lnet_libmd_t *md)
 {
-        return (wh->wh_interface_cookie == PTL_WIRE_HANDLE_NONE.wh_interface_cookie &&
-                wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie);
+        /* Should unlink md when its refcount is 0 and either:
+         *  - md has been flagged for deletion (by auto unlink or LNetM[DE]Unlink,
+         *    in the latter case md may not be exhausted).
+         *  - auto unlink is on and md is exhausted.
+         */
+        if (md->md_refcount != 0)
+                return 0;
+
+        if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) != 0)
+                return 1;
+
+        return ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0 &&
+                lnet_md_exhausted(md));
 }
 
 #ifdef __KERNEL__
-#define LIB_LOCK(nal,flags)                                     \
-        spin_lock_irqsave(&(nal)->libnal_ni.ni_lock, flags)
-#define LIB_UNLOCK(nal,flags)                                   \
-        spin_unlock_irqrestore(&(nal)->libnal_ni.ni_lock, flags)
+#define LNET_LOCK()        spin_lock(&the_lnet.ln_lock)                 
+#define LNET_UNLOCK()      spin_unlock(&the_lnet.ln_lock)               
+#define LNET_MUTEX_DOWN(m) mutex_down(m)
+#define LNET_MUTEX_UP(m)   mutex_up(m)
 #else
-#define LIB_LOCK(nal,flags)                                             \
-        (pthread_mutex_lock(&(nal)->libnal_ni.ni_mutex), (flags) = 0)
-#define LIB_UNLOCK(nal,flags)                                   \
-        pthread_mutex_unlock(&(nal)->libnal_ni.ni_mutex)
+# if !HAVE_LIBPTHREAD
+#define LNET_SINGLE_THREADED_LOCK(l)            \
+do {                                            \
+        LASSERT ((l) == 0);                     \
+        (l) = 1;                                \
+} while (0)
+
+#define LNET_SINGLE_THREADED_UNLOCK(l)          \
+do {                                            \
+        LASSERT ((l) == 1);                     \
+        (l) = 0;                                \
+} while (0)
+
+#define LNET_LOCK()        LNET_SINGLE_THREADED_LOCK(the_lnet.ln_lock)
+#define LNET_UNLOCK()      LNET_SINGLE_THREADED_UNLOCK(the_lnet.ln_lock)
+#define LNET_MUTEX_DOWN(m) LNET_SINGLE_THREADED_LOCK(*(m))
+#define LNET_MUTEX_UP(m)   LNET_SINGLE_THREADED_UNLOCK(*(m))
+# else
+#define LNET_LOCK()        pthread_mutex_lock(&the_lnet.ln_lock)
+#define LNET_UNLOCK()      pthread_mutex_unlock(&the_lnet.ln_lock)
+#define LNET_MUTEX_DOWN(m) pthread_mutex_lock(m)
+#define LNET_MUTEX_UP(m)   pthread_mutex_unlock(m)
+# endif
 #endif
 
+#define MAX_PORTALS     64
 
-#ifdef PTL_USE_LIB_FREELIST
+#ifdef LNET_USE_LIB_FREELIST
 
 #define MAX_MES         2048
 #define MAX_MDS         2048
 #define MAX_MSGS        2048    /* Outstanding messages */
 #define MAX_EQS         512
 
-extern int lib_freelist_init (lib_nal_t *nal, lib_freelist_t *fl, int nobj, int objsize);
-extern void lib_freelist_fini (lib_nal_t *nal, lib_freelist_t *fl);
-
 static inline void *
-lib_freelist_alloc (lib_freelist_t *fl)
+lnet_freelist_alloc (lnet_freelist_t *fl)
 {
         /* ALWAYS called with liblock held */
-        lib_freeobj_t *o;
+        lnet_freeobj_t *o;
 
         if (list_empty (&fl->fl_list))
                 return (NULL);
         
-        o = list_entry (fl->fl_list.next, lib_freeobj_t, fo_list);
+        o = list_entry (fl->fl_list.next, lnet_freeobj_t, fo_list);
         list_del (&o->fo_list);
         return ((void *)&o->fo_contents);
 }
 
 static inline void
-lib_freelist_free (lib_freelist_t *fl, void *obj)
+lnet_freelist_free (lnet_freelist_t *fl, void *obj)
 {
         /* ALWAYS called with liblock held */
-        lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents);
+        lnet_freeobj_t *o = list_entry (obj, lnet_freeobj_t, fo_contents);
         
         list_add (&o->fo_list, &fl->fl_list);
 }
 
 
-static inline lib_eq_t *
-lib_eq_alloc (lib_nal_t *nal)
+static inline lnet_eq_t *
+lnet_eq_alloc (void)
 {
         /* NEVER called with liblock held */
-        unsigned long  flags;
-        lib_eq_t      *eq;
+        lnet_eq_t     *eq;
         
-        LIB_LOCK (nal, flags);
-        eq = (lib_eq_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_eqs);
-        LIB_UNLOCK (nal, flags);
+        LNET_LOCK();
+        eq = (lnet_eq_t *)lnet_freelist_alloc(&the_lnet.ln_free_eqs);
+        LNET_UNLOCK();
 
         return (eq);
 }
 
 static inline void
-lib_eq_free (lib_nal_t *nal, lib_eq_t *eq)
+lnet_eq_free (lnet_eq_t *eq)
 {
         /* ALWAYS called with liblock held */
-        lib_freelist_free (&nal->libnal_ni.ni_free_eqs, eq);
+        lnet_freelist_free(&the_lnet.ln_free_eqs, eq);
 }
 
-static inline lib_md_t *
-lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd)
+static inline lnet_libmd_t *
+lnet_md_alloc (lnet_md_t *umd)
 {
         /* NEVER called with liblock held */
-        unsigned long  flags;
-        lib_md_t      *md;
+        lnet_libmd_t  *md;
         
-        LIB_LOCK (nal, flags);
-        md = (lib_md_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mds);
-        LIB_UNLOCK (nal, flags);
+        LNET_LOCK();
+        md = (lnet_libmd_t *)lnet_freelist_alloc(&the_lnet.ln_free_mds);
+        LNET_UNLOCK();
 
         return (md);
 }
 
 static inline void
-lib_md_free (lib_nal_t *nal, lib_md_t *md)
+lnet_md_free (lnet_libmd_t *md)
 {
         /* ALWAYS called with liblock held */
-        lib_freelist_free (&nal->libnal_ni.ni_free_mds, md);
+        lnet_freelist_free (&the_lnet.ln_free_mds, md);
 }
 
-static inline lib_me_t *
-lib_me_alloc (lib_nal_t *nal)
+static inline lnet_me_t *
+lnet_me_alloc (void)
 {
         /* NEVER called with liblock held */
-        unsigned long  flags;
-        lib_me_t      *me;
+        lnet_me_t     *me;
         
-        LIB_LOCK (nal, flags);
-        me = (lib_me_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mes);
-        LIB_UNLOCK (nal, flags);
+        LNET_LOCK();
+        me = (lnet_me_t *)lnet_freelist_alloc(&the_lnet.ln_free_mes);
+        LNET_UNLOCK();
         
         return (me);
 }
 
 static inline void
-lib_me_free (lib_nal_t *nal, lib_me_t *me)
+lnet_me_free (lnet_me_t *me)
 {
         /* ALWAYS called with liblock held */
-        lib_freelist_free (&nal->libnal_ni.ni_free_mes, me);
+        lnet_freelist_free (&the_lnet.ln_free_mes, me);
 }
 
-static inline lib_msg_t *
-lib_msg_alloc (lib_nal_t *nal)
+static inline lnet_msg_t *
+lnet_msg_alloc (void)
 {
         /* NEVER called with liblock held */
-        unsigned long  flags;
-        lib_msg_t     *msg;
+        lnet_msg_t    *msg;
         
-        LIB_LOCK (nal, flags);
-        msg = (lib_msg_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_msgs);
-        LIB_UNLOCK (nal, flags);
+        LNET_LOCK();
+        msg = (lnet_msg_t *)lnet_freelist_alloc(&the_lnet.ln_free_msgs);
+        LNET_UNLOCK();
 
         if (msg != NULL) {
                 /* NULL pointers, clear flags etc */
                 memset (msg, 0, sizeof (*msg));
-                msg->ack_wmd = PTL_WIRE_HANDLE_NONE;
+#ifdef CRAY_XT3
+                msg->msg_ev.uid = LNET_UID_ANY;
+#endif
         }
         return(msg);
 }
 
 static inline void
-lib_msg_free (lib_nal_t *nal, lib_msg_t *msg)
+lnet_msg_free (lnet_msg_t *msg)
 {
         /* ALWAYS called with liblock held */
-        lib_freelist_free (&nal->libnal_ni.ni_free_msgs, msg);
+        LASSERT (!msg->msg_onactivelist);
+        lnet_freelist_free(&the_lnet.ln_free_msgs, msg);
 }
 
 #else
 
-static inline lib_eq_t *
-lib_eq_alloc (lib_nal_t *nal)
+static inline lnet_eq_t *
+lnet_eq_alloc (void)
 {
         /* NEVER called with liblock held */
-        lib_eq_t *eq;
+        lnet_eq_t *eq;
 
-        PORTAL_ALLOC(eq, sizeof(*eq));
+        LIBCFS_ALLOC(eq, sizeof(*eq));
         return (eq);
 }
 
 static inline void
-lib_eq_free (lib_nal_t *nal, lib_eq_t *eq)
+lnet_eq_free (lnet_eq_t *eq)
 {
         /* ALWAYS called with liblock held */
-        PORTAL_FREE(eq, sizeof(*eq));
+        LIBCFS_FREE(eq, sizeof(*eq));
 }
 
-static inline lib_md_t *
-lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd)
+static inline lnet_libmd_t *
+lnet_md_alloc (lnet_md_t *umd)
 {
         /* NEVER called with liblock held */
-        lib_md_t *md;
-        int       size;
-        int       niov;
+        lnet_libmd_t *md;
+        int           size;
+        unsigned int  niov;
 
-        if ((umd->options & PTL_MD_KIOV) != 0) {
+        if ((umd->options & LNET_MD_KIOV) != 0) {
                 niov = umd->length;
-                size = offsetof(lib_md_t, md_iov.kiov[niov]);
+                size = offsetof(lnet_libmd_t, md_iov.kiov[niov]);
         } else {
-                niov = ((umd->options & PTL_MD_IOVEC) != 0) ?
+                niov = ((umd->options & LNET_MD_IOVEC) != 0) ?
                        umd->length : 1;
-                size = offsetof(lib_md_t, md_iov.iov[niov]);
+                size = offsetof(lnet_libmd_t, md_iov.iov[niov]);
         }
 
-        PORTAL_ALLOC(md, size);
+        LIBCFS_ALLOC(md, size);
 
         if (md != NULL) {
                 /* Set here in case of early free */
-                md->options = umd->options;
+                md->md_options = umd->options;
                 md->md_niov = niov;
         }
         
@@ -215,252 +256,424 @@ lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd)
 }
 
 static inline void 
-lib_md_free (lib_nal_t *nal, lib_md_t *md)
+lnet_md_free (lnet_libmd_t *md)
 {
         /* ALWAYS called with liblock held */
         int       size;
 
-        if ((md->options & PTL_MD_KIOV) != 0)
-                size = offsetof(lib_md_t, md_iov.kiov[md->md_niov]);
+        if ((md->md_options & LNET_MD_KIOV) != 0)
+                size = offsetof(lnet_libmd_t, md_iov.kiov[md->md_niov]);
         else
-                size = offsetof(lib_md_t, md_iov.iov[md->md_niov]);
+                size = offsetof(lnet_libmd_t, md_iov.iov[md->md_niov]);
 
-        PORTAL_FREE(md, size);
+        LIBCFS_FREE(md, size);
 }
 
-static inline lib_me_t *
-lib_me_alloc (lib_nal_t *nal)
+static inline lnet_me_t *
+lnet_me_alloc (void)
 {
         /* NEVER called with liblock held */
-        lib_me_t *me;
+        lnet_me_t *me;
 
-        PORTAL_ALLOC(me, sizeof(*me));
+        LIBCFS_ALLOC(me, sizeof(*me));
         return (me);
 }
 
 static inline void 
-lib_me_free(lib_nal_t *nal, lib_me_t *me)
+lnet_me_free(lnet_me_t *me)
 {
         /* ALWAYS called with liblock held */
-        PORTAL_FREE(me, sizeof(*me));
+        LIBCFS_FREE(me, sizeof(*me));
 }
 
-static inline lib_msg_t *
-lib_msg_alloc(lib_nal_t *nal)
+static inline lnet_msg_t *
+lnet_msg_alloc(void)
 {
-        /* NEVER called with liblock held; may be in interrupt... */
-        lib_msg_t *msg;
+        /* NEVER called with liblock held */
+        lnet_msg_t *msg;
 
-        if (in_interrupt())
-                PORTAL_ALLOC_ATOMIC(msg, sizeof(*msg));
-        else
-                PORTAL_ALLOC(msg, sizeof(*msg));
+        LIBCFS_ALLOC(msg, sizeof(*msg));
 
         if (msg != NULL) {
                 /* NULL pointers, clear flags etc */
                 memset (msg, 0, sizeof (*msg));
-                msg->ack_wmd = PTL_WIRE_HANDLE_NONE;
+#ifdef CRAY_XT3
+                msg->msg_ev.uid = LNET_UID_ANY;
+#endif
         }
         return (msg);
 }
 
 static inline void 
-lib_msg_free(lib_nal_t *nal, lib_msg_t *msg)
+lnet_msg_free(lnet_msg_t *msg)
 {
         /* ALWAYS called with liblock held */
-        PORTAL_FREE(msg, sizeof(*msg));
+        LASSERT (!msg->msg_onactivelist);
+        LIBCFS_FREE(msg, sizeof(*msg));
 }
 #endif
 
-extern lib_handle_t *lib_lookup_cookie (lib_nal_t *nal, __u64 cookie, int type);
-extern void lib_initialise_handle (lib_nal_t *nal, lib_handle_t *lh, int type);
-extern void lib_invalidate_handle (lib_nal_t *nal, lib_handle_t *lh);
+extern lnet_libhandle_t *lnet_lookup_cookie (__u64 cookie, int type);
+extern void lnet_initialise_handle (lnet_libhandle_t *lh, int type);
+extern void lnet_invalidate_handle (lnet_libhandle_t *lh);
 
 static inline void
-ptl_eq2handle (ptl_handle_eq_t *handle, lib_nal_t *nal, lib_eq_t *eq)
+lnet_eq2handle (lnet_handle_eq_t *handle, lnet_eq_t *eq)
 {
         if (eq == NULL) {
-                *handle = PTL_EQ_NONE;
+                *handle = LNET_EQ_NONE;
                 return;
         }
 
-        handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx;
         handle->cookie = eq->eq_lh.lh_cookie;
 }
 
-static inline lib_eq_t *
-ptl_handle2eq (ptl_handle_eq_t *handle, lib_nal_t *nal)
+static inline lnet_eq_t *
+lnet_handle2eq (lnet_handle_eq_t *handle)
 {
         /* ALWAYS called with liblock held */
-        lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, 
-                                              PTL_COOKIE_TYPE_EQ);
+        lnet_libhandle_t *lh = lnet_lookup_cookie(handle->cookie, 
+                                                  LNET_COOKIE_TYPE_EQ);
         if (lh == NULL)
                 return (NULL);
 
-        return (lh_entry (lh, lib_eq_t, eq_lh));
+        return (lh_entry (lh, lnet_eq_t, eq_lh));
 }
 
 static inline void
-ptl_md2handle (ptl_handle_md_t *handle, lib_nal_t *nal, lib_md_t *md)
+lnet_md2handle (lnet_handle_md_t *handle, lnet_libmd_t *md)
 {
-        handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx;
         handle->cookie = md->md_lh.lh_cookie;
 }
 
-static inline lib_md_t *
-ptl_handle2md (ptl_handle_md_t *handle, lib_nal_t *nal)
+static inline lnet_libmd_t *
+lnet_handle2md (lnet_handle_md_t *handle)
 {
         /* ALWAYS called with liblock held */
-        lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie,
-                                              PTL_COOKIE_TYPE_MD);
+        lnet_libhandle_t *lh = lnet_lookup_cookie(handle->cookie,
+                                                  LNET_COOKIE_TYPE_MD);
         if (lh == NULL)
                 return (NULL);
 
-        return (lh_entry (lh, lib_md_t, md_lh));
+        return (lh_entry (lh, lnet_libmd_t, md_lh));
 }
 
-static inline lib_md_t *
-ptl_wire_handle2md (ptl_handle_wire_t *wh, lib_nal_t *nal)
+static inline lnet_libmd_t *
+lnet_wire_handle2md (lnet_handle_wire_t *wh)
 {
         /* ALWAYS called with liblock held */
-        lib_handle_t *lh;
+        lnet_libhandle_t *lh;
         
-        if (wh->wh_interface_cookie != nal->libnal_ni.ni_interface_cookie)
+        if (wh->wh_interface_cookie != the_lnet.ln_interface_cookie)
                 return (NULL);
         
-        lh = lib_lookup_cookie (nal, wh->wh_object_cookie,
-                                PTL_COOKIE_TYPE_MD);
+        lh = lnet_lookup_cookie(wh->wh_object_cookie,
+                                LNET_COOKIE_TYPE_MD);
         if (lh == NULL)
                 return (NULL);
 
-        return (lh_entry (lh, lib_md_t, md_lh));
+        return (lh_entry (lh, lnet_libmd_t, md_lh));
 }
 
 static inline void
-ptl_me2handle (ptl_handle_me_t *handle, lib_nal_t *nal, lib_me_t *me)
+lnet_me2handle (lnet_handle_me_t *handle, lnet_me_t *me)
 {
-        handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx;
         handle->cookie = me->me_lh.lh_cookie;
 }
 
-static inline lib_me_t *
-ptl_handle2me (ptl_handle_me_t *handle, lib_nal_t *nal)
+static inline lnet_me_t *
+lnet_handle2me (lnet_handle_me_t *handle)
 {
         /* ALWAYS called with liblock held */
-        lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie,
-                                              PTL_COOKIE_TYPE_ME);
+        lnet_libhandle_t *lh = lnet_lookup_cookie(handle->cookie,
+                                                  LNET_COOKIE_TYPE_ME);
         if (lh == NULL)
                 return (NULL);
 
-        return (lh_entry (lh, lib_me_t, me_lh));
+        return (lh_entry (lh, lnet_me_t, me_lh));
 }
 
-extern int lib_init(lib_nal_t *libnal, nal_t *apinal,
-                    ptl_process_id_t pid,
-                    ptl_ni_limits_t *desired_limits, 
-                    ptl_ni_limits_t *actual_limits);
-extern int lib_fini(lib_nal_t *libnal);
+static inline void
+lnet_peer_addref_locked(lnet_peer_t *lp)
+{
+        LASSERT (lp->lp_refcount > 0);
+        lp->lp_refcount++;
+}
 
-/*
- * When the NAL detects an incoming message header, it should call
- * lib_parse() decode it.  If the message header is garbage, lib_parse()
- * returns immediately with failure, otherwise the NAL callbacks will be
- * called to receive the message body.  They are handed the private cookie
- * as a way for the NAL to maintain state about which transaction is being
- * processed.  An extra parameter, lib_msg contains the lib-level message
- * state for passing to lib_finalize() when the message body has been
- * received.
- */
-extern void lib_enq_event_locked (lib_nal_t *nal, void *private,
-                                  lib_eq_t *eq, ptl_event_t *ev);
-extern void lib_finalize (lib_nal_t *nal, void *private, lib_msg_t *msg, 
-                          ptl_ni_fail_t ni_fail_type);
-extern ptl_err_t lib_parse (lib_nal_t *nal, ptl_hdr_t *hdr, void *private);
-extern lib_msg_t *lib_create_reply_msg (lib_nal_t *nal, ptl_nid_t peer_nid, 
-                                        lib_msg_t *get_msg);
-extern void print_hdr (lib_nal_t * nal, ptl_hdr_t * hdr);
-
-
-extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov);
-extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, 
-                              ptl_size_t offset, ptl_size_t len);
-extern void lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset, 
-                              char *src, ptl_size_t len);
-extern int lib_extract_iov (int dst_niov, struct iovec *dst,
-                            int src_niov, struct iovec *src,
-                            ptl_size_t offset, ptl_size_t len);
-
-extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov);
-extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, 
-                               ptl_size_t offset, ptl_size_t len);
-extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
-                               char *src, ptl_size_t len);
-extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, 
-                             int src_niov, ptl_kiov_t *src,
-                             ptl_size_t offset, ptl_size_t len);
-
-extern void lib_assert_wire_constants (void);
-
-extern ptl_err_t lib_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
-                           ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
-extern ptl_err_t lib_send (lib_nal_t *nal, void *private, lib_msg_t *msg,
-                           ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
-                           lib_md_t *md, ptl_size_t offset, ptl_size_t len);
-
-extern int lib_api_ni_status (nal_t *nal, ptl_sr_index_t sr_idx,
-                              ptl_sr_value_t *status);
-extern int lib_api_ni_dist (nal_t *nal, ptl_process_id_t *pid, 
-                            unsigned long *dist);
-
-extern int lib_api_eq_alloc (nal_t *nal, ptl_size_t count,
-                             ptl_eq_handler_t callback, 
-                             ptl_handle_eq_t *handle);
-extern int lib_api_eq_free(nal_t *nal, ptl_handle_eq_t *eqh);
-extern int lib_api_eq_poll (nal_t *nal, 
-                            ptl_handle_eq_t *eventqs, int neq, int timeout_ms,
-                            ptl_event_t *event, int *which);
-
-extern int lib_api_me_attach(nal_t *nal,
-                             ptl_pt_index_t portal,
-                             ptl_process_id_t match_id, 
-                             ptl_match_bits_t match_bits, 
-                             ptl_match_bits_t ignore_bits,
-                             ptl_unlink_t unlink, ptl_ins_pos_t pos,
-                             ptl_handle_me_t *handle);
-extern int lib_api_me_insert(nal_t *nal,
-                             ptl_handle_me_t *current_meh,
-                             ptl_process_id_t match_id, 
-                             ptl_match_bits_t match_bits, 
-                             ptl_match_bits_t ignore_bits,
-                             ptl_unlink_t unlink, ptl_ins_pos_t pos,
-                             ptl_handle_me_t *handle);
-extern int lib_api_me_unlink (nal_t *nal, ptl_handle_me_t *meh);
-extern void lib_me_unlink(lib_nal_t *nal, lib_me_t *me);
-
-extern int lib_api_get_id(nal_t *nal, ptl_process_id_t *pid);
-
-extern void lib_md_unlink(lib_nal_t *nal, lib_md_t *md);
-extern void lib_md_deconstruct(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd);
-extern int lib_api_md_attach(nal_t *nal, ptl_handle_me_t *meh,
-                             ptl_md_t *umd, ptl_unlink_t unlink, 
-                             ptl_handle_md_t *handle);
-extern int lib_api_md_bind(nal_t *nal, ptl_md_t *umd, ptl_unlink_t unlink,
-                           ptl_handle_md_t *handle);
-extern int lib_api_md_unlink (nal_t *nal, ptl_handle_md_t *mdh);
-extern int lib_api_md_update (nal_t *nal, ptl_handle_md_t *mdh,
-                              ptl_md_t *oldumd, ptl_md_t *newumd,
-                              ptl_handle_eq_t *testqh);
-
-extern int lib_api_get(nal_t *apinal, ptl_handle_md_t *mdh, 
-                       ptl_process_id_t *id,
-                       ptl_pt_index_t portal, ptl_ac_index_t ac,
-                       ptl_match_bits_t match_bits, ptl_size_t offset);
-extern int lib_api_put(nal_t *apinal, ptl_handle_md_t *mdh, 
-                       ptl_ack_req_t ack, ptl_process_id_t *id,
-                       ptl_pt_index_t portal, ptl_ac_index_t ac,
-                       ptl_match_bits_t match_bits, 
-                       ptl_size_t offset, ptl_hdr_data_t hdr_data);
-extern int lib_api_fail_nid(nal_t *apinal, ptl_nid_t nid, unsigned int threshold);
-extern int lib_api_loopback(nal_t *apinal, int set, int *enabled);
+extern void lnet_destroy_peer_locked(lnet_peer_t *lp);
+
+static inline void
+lnet_peer_decref_locked(lnet_peer_t *lp)
+{
+        LASSERT (lp->lp_refcount > 0);
+        lp->lp_refcount--;
+        if (lp->lp_refcount == 0)
+                lnet_destroy_peer_locked(lp);
+}
+
+static inline int
+lnet_isrouter(lnet_peer_t *lp)
+{
+        return lp->lp_rtr_refcount != 0;
+}
+
+static inline void
+lnet_ni_addref_locked(lnet_ni_t *ni) 
+{
+        LASSERT (ni->ni_refcount > 0);
+        ni->ni_refcount++;
+}
+
+static inline void
+lnet_ni_addref(lnet_ni_t *ni) 
+{
+        LNET_LOCK();
+        lnet_ni_addref_locked(ni);
+        LNET_UNLOCK();
+}
+
+static inline void
+lnet_ni_decref_locked(lnet_ni_t *ni)
+{
+        LASSERT (ni->ni_refcount > 0);
+        ni->ni_refcount--;
+        if (ni->ni_refcount == 0)
+                list_add_tail(&ni->ni_list, &the_lnet.ln_zombie_nis);
+}
+
+static inline void
+lnet_ni_decref(lnet_ni_t *ni)
+{
+        LNET_LOCK();
+        lnet_ni_decref_locked(ni);
+        LNET_UNLOCK();
+}
+
+static inline lnet_nid_t
+lnet_ptlcompat_srcnid(lnet_nid_t src, lnet_nid_t dst)
+{
+        /* Give myself a portals srcnid if I'm sending to portals */
+        if (the_lnet.ln_ptlcompat > 0 &&   
+            LNET_NIDNET(dst) == 0)
+                return LNET_MKNID(0, LNET_NIDADDR(src));
+        
+        return src;
+}
+
+static inline int
+lnet_ptlcompat_matchnid(lnet_nid_t lnet_nid, lnet_nid_t ptl_nid) 
+{
+        return ((ptl_nid == lnet_nid) ||
+                (the_lnet.ln_ptlcompat > 0 &&
+                 LNET_NIDNET(ptl_nid) == 0 &&
+                 LNET_NETTYP(LNET_NIDNET(lnet_nid)) != LOLND &&
+                 LNET_NIDADDR(ptl_nid) == LNET_NIDADDR(lnet_nid)));
+}
+
+static inline int
+lnet_ptlcompat_matchnet(__u32 lnet_net, __u32 ptl_net) 
+{
+        return ((ptl_net == lnet_net) ||
+                (the_lnet.ln_ptlcompat > 0 &&
+                 ptl_net == 0 &&
+                 LNET_NETTYP(lnet_net) != LOLND));
+}
+
+static inline struct list_head *
+lnet_nid2peerhash (lnet_nid_t nid)
+{
+       unsigned int idx = LNET_NIDADDR(nid) % LNET_PEER_HASHSIZE;
+
+        return &the_lnet.ln_peer_hash[idx];
+}
+
+extern lnd_t the_lolnd;
+
+#ifndef __KERNEL__
+/* unconditional registration */
+#define LNET_REGISTER_ULND(lnd)                 \
+do {                                            \
+        extern lnd_t lnd;                       \
+                                                \
+        lnet_register_lnd(&(lnd));              \
+} while (0)
+
+/* conditional registration */
+#define LNET_REGISTER_ULND_IF_PRESENT(lnd)                              \
+do {                                                                    \
+        extern lnd_t lnd __attribute__ ((weak, alias("the_lolnd")));    \
+                                                                        \
+        if (&(lnd) != &the_lolnd)                                       \
+                lnet_register_lnd(&(lnd));                              \
+} while (0)
+#endif
+
+#ifdef CRAY_XT3
+inline static void
+lnet_set_msg_uid(lnet_ni_t *ni, lnet_msg_t *msg, lnet_uid_t uid)
+{
+        LASSERT (msg->msg_ev.uid == LNET_UID_ANY);
+        msg->msg_ev.uid = uid;
+}
+#endif
+
+extern lnet_ni_t *lnet_nid2ni_locked (lnet_nid_t nid);
+extern lnet_ni_t *lnet_net2ni_locked (__u32 net);
+static inline lnet_ni_t *
+lnet_net2ni (__u32 net) 
+{
+        lnet_ni_t *ni;
+
+        LNET_LOCK();
+        ni = lnet_net2ni_locked(net);
+        LNET_UNLOCK();
+
+        return ni;
+}
+
+int lnet_notify(lnet_ni_t *ni, lnet_nid_t peer, int alive, time_t when);
+int lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway_nid);
+int lnet_check_routes(void);
+int lnet_del_route(__u32 net, lnet_nid_t gw_nid);
+void lnet_destroy_routes(void);
+int lnet_get_route(int idx, __u32 *net, __u32 *hops, 
+                   lnet_nid_t *gateway, __u32 *alive);
+void lnet_proc_init(void);
+void lnet_proc_fini(void);
+void lnet_init_rtrpools(void);
+int  lnet_alloc_rtrpools(int im_a_router);
+void lnet_free_rtrpools(void);
+lnet_remotenet_t *lnet_find_net_locked (__u32 net);
+
+int lnet_islocalnid(lnet_nid_t nid);
+int lnet_islocalnet(__u32 net);
+
+void lnet_enq_event_locked(lnet_eq_t *eq, lnet_event_t *ev);
+void lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target,
+                    unsigned int offset, unsigned int len);
+int lnet_send(lnet_nid_t nid, lnet_msg_t *msg);
+void lnet_return_credits_locked (lnet_msg_t *msg);
+void lnet_match_blocked_msg(lnet_libmd_t *md);
+int lnet_parse (lnet_ni_t *ni, lnet_hdr_t *hdr,
+                lnet_nid_t fromnid, void *private, int rdma_req);
+void lnet_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+               unsigned int offset, unsigned int mlen, unsigned int rlen);
+lnet_msg_t *lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *get_msg);
+void lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *msg, unsigned int len);
+void lnet_finalize(lnet_ni_t *ni, lnet_msg_t *msg, int rc);
+
+char *lnet_msgtyp2str (int type);
+void lnet_print_hdr (lnet_hdr_t * hdr);
+int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold);
+
+unsigned int lnet_iov_nob (unsigned int niov, struct iovec *iov);
+int lnet_extract_iov (int dst_niov, struct iovec *dst,
+                      int src_niov, struct iovec *src,
+                      unsigned int offset, unsigned int len);
+
+unsigned int lnet_kiov_nob (unsigned int niov, lnet_kiov_t *iov);
+int lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst, 
+                      int src_niov, lnet_kiov_t *src,
+                      unsigned int offset, unsigned int len);
+
+void lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, 
+                        unsigned int doffset, 
+                        unsigned int nsiov, struct iovec *siov, 
+                        unsigned int soffset, unsigned int nob);
+void lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, 
+                         unsigned int iovoffset,
+                         unsigned int nkiov, lnet_kiov_t *kiov, 
+                         unsigned int kiovoffset, unsigned int nob);
+void lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, 
+                         unsigned int kiovoffset,
+                         unsigned int niov, struct iovec *iov, 
+                         unsigned int iovoffset, unsigned int nob);
+void lnet_copy_kiov2kiov (unsigned int ndkiov, lnet_kiov_t *dkiov, 
+                          unsigned int doffset, 
+                          unsigned int nskiov, lnet_kiov_t *skiov, 
+                          unsigned int soffset, unsigned int nob);
+
+static inline void
+lnet_copy_iov2flat(int dlen, void *dest, unsigned int doffset,
+                   unsigned int nsiov, struct iovec *siov, unsigned int soffset,
+                   unsigned int nob)
+{
+        struct iovec diov = {/*.iov_base = */ dest, /*.iov_len = */ dlen};
+
+        lnet_copy_iov2iov(1, &diov, doffset,
+                          nsiov, siov, soffset, nob);
+}
+
+static inline void
+lnet_copy_kiov2flat(int dlen, void *dest, unsigned int doffset,
+                    unsigned int nsiov, lnet_kiov_t *skiov, unsigned int soffset,
+                    unsigned int nob)
+{
+        struct iovec diov = {/* .iov_base = */ dest, /* .iov_len = */ dlen};
+
+        lnet_copy_kiov2iov(1, &diov, doffset,
+                           nsiov, skiov, soffset, nob);
+}
+
+static inline void
+lnet_copy_flat2iov(unsigned int ndiov, struct iovec *diov, unsigned int doffset,
+                   int slen, void *src, unsigned int soffset, unsigned int nob)
+{
+        struct iovec siov = {/*.iov_base = */ src, /*.iov_len = */slen};
+        lnet_copy_iov2iov(ndiov, diov, doffset,
+                          1, &siov, soffset, nob);
+}
+
+static inline void
+lnet_copy_flat2kiov(unsigned int ndiov, lnet_kiov_t *dkiov, unsigned int doffset,
+                    int slen, void *src, unsigned int soffset, unsigned int nob)
+{
+        struct iovec siov = {/* .iov_base = */ src, /* .iov_len = */ slen};
+        lnet_copy_iov2kiov(ndiov, dkiov, doffset,
+                           1, &siov, soffset, nob);
+}
+
+void lnet_me_unlink(lnet_me_t *me);
+
+void lnet_md_unlink(lnet_libmd_t *md);
+void lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd);
+
+void lnet_register_lnd(lnd_t *lnd);
+void lnet_unregister_lnd(lnd_t *lnd);
+int lnet_set_ip_niaddr (lnet_ni_t *ni);
+
+#ifdef __KERNEL__
+int lnet_connect(cfs_socket_t **sockp, lnet_nid_t peer_nid,
+                 __u32 local_ip, __u32 peer_ip, int peer_port);
+void lnet_connect_console_error(int rc, lnet_nid_t peer_nid,
+                                __u32 peer_ip, int port);
+int lnet_count_acceptor_nis(lnet_ni_t **first_ni);
+int lnet_accept(lnet_ni_t *blind_ni, cfs_socket_t *sock, __u32 magic);
+int lnet_acceptor_timeout(void);
+int lnet_acceptor_port(void);
+#endif
+
+int lnet_acceptor_start(void);
+void lnet_acceptor_stop(void);
+
+int lnet_peers_start_down(void);
+int lnet_router_checker_start(void);
+void lnet_router_checker_stop(void);
+
+int lnet_ping_target_init(void);
+void lnet_ping_target_fini(void);
+int lnet_ping(lnet_process_id_t id, int timeout_ms,
+              lnet_process_id_t *ids, int n_ids);
+
+int lnet_parse_ip2nets (char **networksp, char *ip2nets);
+int lnet_parse_routes (char *route_str, int *im_a_router);
+int lnet_parse_networks (struct list_head *nilist, char *networks);
+
+int lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid);
+lnet_peer_t *lnet_find_peer_locked (lnet_nid_t nid);
+void lnet_clear_peer_table(void);
+void lnet_destroy_peer_table(void);
+int lnet_create_peer_table(void);
+void lnet_debug_peer(lnet_nid_t nid);
 
 #endif
diff --git a/lnet/include/lnet/lib-p30.h b/lnet/include/lnet/lib-p30.h
deleted file mode 100644 (file)
index f56206b..0000000
+++ /dev/null
@@ -1,466 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * lib-p30.h
- *
- * Top level include for library side routines
- */
-
-#ifndef __PORTALS_LIB_P30_H__
-#define __PORTALS_LIB_P30_H__
-
-#include "build_check.h"
-
-#if defined(__linux__)
-#include <portals/linux/lib-p30.h>
-#elif defined(__APPLE__)
-#include <portals/darwin/lib-p30.h>
-#else
-#error Unsupported Operating System
-#endif
-
-#include <portals/types.h>
-#include <libcfs/kp30.h>
-#include <portals/p30.h>
-#include <portals/nal.h>
-#include <portals/lib-types.h>
-
-static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh)
-{
-        return (wh->wh_interface_cookie == PTL_WIRE_HANDLE_NONE.wh_interface_cookie &&
-                wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie);
-}
-
-#ifdef __KERNEL__
-#define LIB_LOCK(nal,flags)                                     \
-        spin_lock_irqsave(&(nal)->libnal_ni.ni_lock, flags)
-#define LIB_UNLOCK(nal,flags)                                   \
-        spin_unlock_irqrestore(&(nal)->libnal_ni.ni_lock, flags)
-#else
-#define LIB_LOCK(nal,flags)                                             \
-        (pthread_mutex_lock(&(nal)->libnal_ni.ni_mutex), (flags) = 0)
-#define LIB_UNLOCK(nal,flags)                                   \
-        pthread_mutex_unlock(&(nal)->libnal_ni.ni_mutex)
-#endif
-
-
-#ifdef PTL_USE_LIB_FREELIST
-
-#define MAX_MES         2048
-#define MAX_MDS         2048
-#define MAX_MSGS        2048    /* Outstanding messages */
-#define MAX_EQS         512
-
-extern int lib_freelist_init (lib_nal_t *nal, lib_freelist_t *fl, int nobj, int objsize);
-extern void lib_freelist_fini (lib_nal_t *nal, lib_freelist_t *fl);
-
-static inline void *
-lib_freelist_alloc (lib_freelist_t *fl)
-{
-        /* ALWAYS called with liblock held */
-        lib_freeobj_t *o;
-
-        if (list_empty (&fl->fl_list))
-                return (NULL);
-        
-        o = list_entry (fl->fl_list.next, lib_freeobj_t, fo_list);
-        list_del (&o->fo_list);
-        return ((void *)&o->fo_contents);
-}
-
-static inline void
-lib_freelist_free (lib_freelist_t *fl, void *obj)
-{
-        /* ALWAYS called with liblock held */
-        lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents);
-        
-        list_add (&o->fo_list, &fl->fl_list);
-}
-
-
-static inline lib_eq_t *
-lib_eq_alloc (lib_nal_t *nal)
-{
-        /* NEVER called with liblock held */
-        unsigned long  flags;
-        lib_eq_t      *eq;
-        
-        LIB_LOCK (nal, flags);
-        eq = (lib_eq_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_eqs);
-        LIB_UNLOCK (nal, flags);
-
-        return (eq);
-}
-
-static inline void
-lib_eq_free (lib_nal_t *nal, lib_eq_t *eq)
-{
-        /* ALWAYS called with liblock held */
-        lib_freelist_free (&nal->libnal_ni.ni_free_eqs, eq);
-}
-
-static inline lib_md_t *
-lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd)
-{
-        /* NEVER called with liblock held */
-        unsigned long  flags;
-        lib_md_t      *md;
-        
-        LIB_LOCK (nal, flags);
-        md = (lib_md_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mds);
-        LIB_UNLOCK (nal, flags);
-
-        return (md);
-}
-
-static inline void
-lib_md_free (lib_nal_t *nal, lib_md_t *md)
-{
-        /* ALWAYS called with liblock held */
-        lib_freelist_free (&nal->libnal_ni.ni_free_mds, md);
-}
-
-static inline lib_me_t *
-lib_me_alloc (lib_nal_t *nal)
-{
-        /* NEVER called with liblock held */
-        unsigned long  flags;
-        lib_me_t      *me;
-        
-        LIB_LOCK (nal, flags);
-        me = (lib_me_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mes);
-        LIB_UNLOCK (nal, flags);
-        
-        return (me);
-}
-
-static inline void
-lib_me_free (lib_nal_t *nal, lib_me_t *me)
-{
-        /* ALWAYS called with liblock held */
-        lib_freelist_free (&nal->libnal_ni.ni_free_mes, me);
-}
-
-static inline lib_msg_t *
-lib_msg_alloc (lib_nal_t *nal)
-{
-        /* NEVER called with liblock held */
-        unsigned long  flags;
-        lib_msg_t     *msg;
-        
-        LIB_LOCK (nal, flags);
-        msg = (lib_msg_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_msgs);
-        LIB_UNLOCK (nal, flags);
-
-        if (msg != NULL) {
-                /* NULL pointers, clear flags etc */
-                memset (msg, 0, sizeof (*msg));
-                msg->ack_wmd = PTL_WIRE_HANDLE_NONE;
-        }
-        return(msg);
-}
-
-static inline void
-lib_msg_free (lib_nal_t *nal, lib_msg_t *msg)
-{
-        /* ALWAYS called with liblock held */
-        lib_freelist_free (&nal->libnal_ni.ni_free_msgs, msg);
-}
-
-#else
-
-static inline lib_eq_t *
-lib_eq_alloc (lib_nal_t *nal)
-{
-        /* NEVER called with liblock held */
-        lib_eq_t *eq;
-
-        PORTAL_ALLOC(eq, sizeof(*eq));
-        return (eq);
-}
-
-static inline void
-lib_eq_free (lib_nal_t *nal, lib_eq_t *eq)
-{
-        /* ALWAYS called with liblock held */
-        PORTAL_FREE(eq, sizeof(*eq));
-}
-
-static inline lib_md_t *
-lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd)
-{
-        /* NEVER called with liblock held */
-        lib_md_t *md;
-        int       size;
-        int       niov;
-
-        if ((umd->options & PTL_MD_KIOV) != 0) {
-                niov = umd->length;
-                size = offsetof(lib_md_t, md_iov.kiov[niov]);
-        } else {
-                niov = ((umd->options & PTL_MD_IOVEC) != 0) ?
-                       umd->length : 1;
-                size = offsetof(lib_md_t, md_iov.iov[niov]);
-        }
-
-        PORTAL_ALLOC(md, size);
-
-        if (md != NULL) {
-                /* Set here in case of early free */
-                md->options = umd->options;
-                md->md_niov = niov;
-        }
-        
-        return (md);
-}
-
-static inline void 
-lib_md_free (lib_nal_t *nal, lib_md_t *md)
-{
-        /* ALWAYS called with liblock held */
-        int       size;
-
-        if ((md->options & PTL_MD_KIOV) != 0)
-                size = offsetof(lib_md_t, md_iov.kiov[md->md_niov]);
-        else
-                size = offsetof(lib_md_t, md_iov.iov[md->md_niov]);
-
-        PORTAL_FREE(md, size);
-}
-
-static inline lib_me_t *
-lib_me_alloc (lib_nal_t *nal)
-{
-        /* NEVER called with liblock held */
-        lib_me_t *me;
-
-        PORTAL_ALLOC(me, sizeof(*me));
-        return (me);
-}
-
-static inline void 
-lib_me_free(lib_nal_t *nal, lib_me_t *me)
-{
-        /* ALWAYS called with liblock held */
-        PORTAL_FREE(me, sizeof(*me));
-}
-
-static inline lib_msg_t *
-lib_msg_alloc(lib_nal_t *nal)
-{
-        /* NEVER called with liblock held; may be in interrupt... */
-        lib_msg_t *msg;
-
-        if (in_interrupt())
-                PORTAL_ALLOC_ATOMIC(msg, sizeof(*msg));
-        else
-                PORTAL_ALLOC(msg, sizeof(*msg));
-
-        if (msg != NULL) {
-                /* NULL pointers, clear flags etc */
-                memset (msg, 0, sizeof (*msg));
-                msg->ack_wmd = PTL_WIRE_HANDLE_NONE;
-        }
-        return (msg);
-}
-
-static inline void 
-lib_msg_free(lib_nal_t *nal, lib_msg_t *msg)
-{
-        /* ALWAYS called with liblock held */
-        PORTAL_FREE(msg, sizeof(*msg));
-}
-#endif
-
-extern lib_handle_t *lib_lookup_cookie (lib_nal_t *nal, __u64 cookie, int type);
-extern void lib_initialise_handle (lib_nal_t *nal, lib_handle_t *lh, int type);
-extern void lib_invalidate_handle (lib_nal_t *nal, lib_handle_t *lh);
-
-static inline void
-ptl_eq2handle (ptl_handle_eq_t *handle, lib_nal_t *nal, lib_eq_t *eq)
-{
-        if (eq == NULL) {
-                *handle = PTL_EQ_NONE;
-                return;
-        }
-
-        handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx;
-        handle->cookie = eq->eq_lh.lh_cookie;
-}
-
-static inline lib_eq_t *
-ptl_handle2eq (ptl_handle_eq_t *handle, lib_nal_t *nal)
-{
-        /* ALWAYS called with liblock held */
-        lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, 
-                                              PTL_COOKIE_TYPE_EQ);
-        if (lh == NULL)
-                return (NULL);
-
-        return (lh_entry (lh, lib_eq_t, eq_lh));
-}
-
-static inline void
-ptl_md2handle (ptl_handle_md_t *handle, lib_nal_t *nal, lib_md_t *md)
-{
-        handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx;
-        handle->cookie = md->md_lh.lh_cookie;
-}
-
-static inline lib_md_t *
-ptl_handle2md (ptl_handle_md_t *handle, lib_nal_t *nal)
-{
-        /* ALWAYS called with liblock held */
-        lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie,
-                                              PTL_COOKIE_TYPE_MD);
-        if (lh == NULL)
-                return (NULL);
-
-        return (lh_entry (lh, lib_md_t, md_lh));
-}
-
-static inline lib_md_t *
-ptl_wire_handle2md (ptl_handle_wire_t *wh, lib_nal_t *nal)
-{
-        /* ALWAYS called with liblock held */
-        lib_handle_t *lh;
-        
-        if (wh->wh_interface_cookie != nal->libnal_ni.ni_interface_cookie)
-                return (NULL);
-        
-        lh = lib_lookup_cookie (nal, wh->wh_object_cookie,
-                                PTL_COOKIE_TYPE_MD);
-        if (lh == NULL)
-                return (NULL);
-
-        return (lh_entry (lh, lib_md_t, md_lh));
-}
-
-static inline void
-ptl_me2handle (ptl_handle_me_t *handle, lib_nal_t *nal, lib_me_t *me)
-{
-        handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx;
-        handle->cookie = me->me_lh.lh_cookie;
-}
-
-static inline lib_me_t *
-ptl_handle2me (ptl_handle_me_t *handle, lib_nal_t *nal)
-{
-        /* ALWAYS called with liblock held */
-        lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie,
-                                              PTL_COOKIE_TYPE_ME);
-        if (lh == NULL)
-                return (NULL);
-
-        return (lh_entry (lh, lib_me_t, me_lh));
-}
-
-extern int lib_init(lib_nal_t *libnal, nal_t *apinal,
-                    ptl_process_id_t pid,
-                    ptl_ni_limits_t *desired_limits, 
-                    ptl_ni_limits_t *actual_limits);
-extern int lib_fini(lib_nal_t *libnal);
-
-/*
- * When the NAL detects an incoming message header, it should call
- * lib_parse() decode it.  If the message header is garbage, lib_parse()
- * returns immediately with failure, otherwise the NAL callbacks will be
- * called to receive the message body.  They are handed the private cookie
- * as a way for the NAL to maintain state about which transaction is being
- * processed.  An extra parameter, lib_msg contains the lib-level message
- * state for passing to lib_finalize() when the message body has been
- * received.
- */
-extern void lib_enq_event_locked (lib_nal_t *nal, void *private,
-                                  lib_eq_t *eq, ptl_event_t *ev);
-extern void lib_finalize (lib_nal_t *nal, void *private, lib_msg_t *msg, 
-                          ptl_ni_fail_t ni_fail_type);
-extern ptl_err_t lib_parse (lib_nal_t *nal, ptl_hdr_t *hdr, void *private);
-extern lib_msg_t *lib_create_reply_msg (lib_nal_t *nal, ptl_nid_t peer_nid, 
-                                        lib_msg_t *get_msg);
-extern void print_hdr (lib_nal_t * nal, ptl_hdr_t * hdr);
-
-
-extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov);
-extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, 
-                              ptl_size_t offset, ptl_size_t len);
-extern void lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset, 
-                              char *src, ptl_size_t len);
-extern int lib_extract_iov (int dst_niov, struct iovec *dst,
-                            int src_niov, struct iovec *src,
-                            ptl_size_t offset, ptl_size_t len);
-
-extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov);
-extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, 
-                               ptl_size_t offset, ptl_size_t len);
-extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
-                               char *src, ptl_size_t len);
-extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, 
-                             int src_niov, ptl_kiov_t *src,
-                             ptl_size_t offset, ptl_size_t len);
-
-extern void lib_assert_wire_constants (void);
-
-extern ptl_err_t lib_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
-                           ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen);
-extern ptl_err_t lib_send (lib_nal_t *nal, void *private, lib_msg_t *msg,
-                           ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
-                           lib_md_t *md, ptl_size_t offset, ptl_size_t len);
-
-extern int lib_api_ni_status (nal_t *nal, ptl_sr_index_t sr_idx,
-                              ptl_sr_value_t *status);
-extern int lib_api_ni_dist (nal_t *nal, ptl_process_id_t *pid, 
-                            unsigned long *dist);
-
-extern int lib_api_eq_alloc (nal_t *nal, ptl_size_t count,
-                             ptl_eq_handler_t callback, 
-                             ptl_handle_eq_t *handle);
-extern int lib_api_eq_free(nal_t *nal, ptl_handle_eq_t *eqh);
-extern int lib_api_eq_poll (nal_t *nal, 
-                            ptl_handle_eq_t *eventqs, int neq, int timeout_ms,
-                            ptl_event_t *event, int *which);
-
-extern int lib_api_me_attach(nal_t *nal,
-                             ptl_pt_index_t portal,
-                             ptl_process_id_t match_id, 
-                             ptl_match_bits_t match_bits, 
-                             ptl_match_bits_t ignore_bits,
-                             ptl_unlink_t unlink, ptl_ins_pos_t pos,
-                             ptl_handle_me_t *handle);
-extern int lib_api_me_insert(nal_t *nal,
-                             ptl_handle_me_t *current_meh,
-                             ptl_process_id_t match_id, 
-                             ptl_match_bits_t match_bits, 
-                             ptl_match_bits_t ignore_bits,
-                             ptl_unlink_t unlink, ptl_ins_pos_t pos,
-                             ptl_handle_me_t *handle);
-extern int lib_api_me_unlink (nal_t *nal, ptl_handle_me_t *meh);
-extern void lib_me_unlink(lib_nal_t *nal, lib_me_t *me);
-
-extern int lib_api_get_id(nal_t *nal, ptl_process_id_t *pid);
-
-extern void lib_md_unlink(lib_nal_t *nal, lib_md_t *md);
-extern void lib_md_deconstruct(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd);
-extern int lib_api_md_attach(nal_t *nal, ptl_handle_me_t *meh,
-                             ptl_md_t *umd, ptl_unlink_t unlink, 
-                             ptl_handle_md_t *handle);
-extern int lib_api_md_bind(nal_t *nal, ptl_md_t *umd, ptl_unlink_t unlink,
-                           ptl_handle_md_t *handle);
-extern int lib_api_md_unlink (nal_t *nal, ptl_handle_md_t *mdh);
-extern int lib_api_md_update (nal_t *nal, ptl_handle_md_t *mdh,
-                              ptl_md_t *oldumd, ptl_md_t *newumd,
-                              ptl_handle_eq_t *testqh);
-
-extern int lib_api_get(nal_t *apinal, ptl_handle_md_t *mdh, 
-                       ptl_process_id_t *id,
-                       ptl_pt_index_t portal, ptl_ac_index_t ac,
-                       ptl_match_bits_t match_bits, ptl_size_t offset);
-extern int lib_api_put(nal_t *apinal, ptl_handle_md_t *mdh, 
-                       ptl_ack_req_t ack, ptl_process_id_t *id,
-                       ptl_pt_index_t portal, ptl_ac_index_t ac,
-                       ptl_match_bits_t match_bits, 
-                       ptl_size_t offset, ptl_hdr_data_t hdr_data);
-extern int lib_api_fail_nid(nal_t *apinal, ptl_nid_t nid, unsigned int threshold);
-extern int lib_api_loopback(nal_t *apinal, int set, int *enabled);
-
-#endif
index 608b1e2..2227c6a 100644 (file)
@@ -7,31 +7,22 @@
  * exposed to the user application
  */
 
-#ifndef __PORTALS_LIB_TYPES_H__
-#define __PORTALS_LIB_TYPES_H__
-
-#include "build_check.h"
+#ifndef __LNET_LIB_TYPES_H__
+#define __LNET_LIB_TYPES_H__
 
 #if defined(__linux__)
-#include <portals/linux/lib-types.h>
+#include <lnet/linux/lib-types.h>
 #elif defined(__APPLE__)
-#include <portals/darwin/lib-types.h>
+#include <lnet/darwin/lib-types.h>
+#elif defined(__WINNT__)
+#include <lnet/winnt/lib-types.h>
 #else
 #error Unsupported Operating System
 #endif
 
 #include <libcfs/libcfs.h>
 #include <libcfs/list.h>
-#include <portals/types.h>
-#include <portals/nal.h>
-
-typedef char *user_ptr;
-typedef struct lib_msg_t lib_msg_t;
-typedef struct lib_ptl_t lib_ptl_t;
-typedef struct lib_ac_t lib_ac_t;
-typedef struct lib_me_t lib_me_t;
-typedef struct lib_md_t lib_md_t;
-typedef struct lib_eq_t lib_eq_t;
+#include <lnet/types.h>
 
 #define WIRE_ATTR      __attribute__((packed))
 
@@ -42,334 +33,517 @@ typedef struct lib_eq_t lib_eq_t;
 typedef struct {
         __u64 wh_interface_cookie;
         __u64 wh_object_cookie;
-} WIRE_ATTR ptl_handle_wire_t;
+} WIRE_ATTR lnet_handle_wire_t;
 
 /* byte-flip insensitive! */
-#define PTL_WIRE_HANDLE_NONE \
-((const ptl_handle_wire_t) {.wh_interface_cookie = -1, .wh_object_cookie = -1})
+#define LNET_WIRE_HANDLE_NONE \
+((const lnet_handle_wire_t) {.wh_interface_cookie = -1, .wh_object_cookie = -1})
 
 typedef enum {
-        PTL_MSG_ACK = 0,
-        PTL_MSG_PUT,
-        PTL_MSG_GET,
-        PTL_MSG_REPLY,
-        PTL_MSG_HELLO,
-} ptl_msg_type_t;
+        LNET_MSG_ACK = 0,
+        LNET_MSG_PUT,
+        LNET_MSG_GET,
+        LNET_MSG_REPLY,
+        LNET_MSG_HELLO,
+} lnet_msg_type_t;
 
 /* The variant fields of the portals message header are aligned on an 8
  * byte boundary in the message header.  Note that all types used in these
  * wire structs MUST be fixed size and the smaller types are placed at the
  * end. */
-typedef struct ptl_ack {
-        ptl_handle_wire_t  dst_wmd;
-        ptl_match_bits_t   match_bits;
-        ptl_size_t         mlength;
-} WIRE_ATTR ptl_ack_t;
-
-typedef struct ptl_put {
-        ptl_handle_wire_t  ack_wmd;
-        ptl_match_bits_t   match_bits;
-        ptl_hdr_data_t     hdr_data;
-        ptl_pt_index_t     ptl_index;
-        ptl_size_t         offset;
-} WIRE_ATTR ptl_put_t;
-
-typedef struct ptl_get {
-        ptl_handle_wire_t  return_wmd;
-        ptl_match_bits_t   match_bits;
-        ptl_pt_index_t     ptl_index;
-        ptl_size_t         src_offset;
-        ptl_size_t         sink_length;
-} WIRE_ATTR ptl_get_t;
-
-typedef struct ptl_reply {
-        ptl_handle_wire_t  dst_wmd;
-} WIRE_ATTR ptl_reply_t;
-
-typedef struct ptl_hello {
+typedef struct lnet_ack {
+        lnet_handle_wire_t  dst_wmd;
+        __u64               match_bits;
+        __u32               mlength;
+} WIRE_ATTR lnet_ack_t;
+
+typedef struct lnet_put {
+        lnet_handle_wire_t  ack_wmd;
+        __u64               match_bits;
+        __u64               hdr_data;
+        __u32               ptl_index;
+        __u32               offset;
+} WIRE_ATTR lnet_put_t;
+
+typedef struct lnet_get {
+        lnet_handle_wire_t  return_wmd;
+        __u64               match_bits;
+        __u32               ptl_index;
+        __u32               src_offset;
+        __u32               sink_length;
+} WIRE_ATTR lnet_get_t;
+
+typedef struct lnet_reply {
+        lnet_handle_wire_t  dst_wmd;
+} WIRE_ATTR lnet_reply_t;
+
+typedef struct lnet_hello {
         __u64              incarnation;
         __u32              type;
-} WIRE_ATTR ptl_hello_t;
+} WIRE_ATTR lnet_hello_t;
 
 typedef struct {
-        ptl_nid_t           dest_nid;
-        ptl_nid_t           src_nid;
-        ptl_pid_t           dest_pid;
-        ptl_pid_t           src_pid;
-        __u32               type;               /* ptl_msg_type_t */
+        lnet_nid_t          dest_nid;
+        lnet_nid_t          src_nid;
+        lnet_pid_t          dest_pid;
+        lnet_pid_t          src_pid;
+        __u32               type;               /* lnet_msg_type_t */
         __u32               payload_length;     /* payload data to follow */
         /*<------__u64 aligned------->*/
         union {
-                ptl_ack_t   ack;
-                ptl_put_t   put;
-                ptl_get_t   get;
-                ptl_reply_t reply;
-                ptl_hello_t hello;
+                lnet_ack_t   ack;
+                lnet_put_t   put;
+                lnet_get_t   get;
+                lnet_reply_t reply;
+                lnet_hello_t hello;
         } msg;
-} WIRE_ATTR ptl_hdr_t;
+} WIRE_ATTR lnet_hdr_t;
 
-/* A HELLO message contains the portals magic number and protocol version
+/* A HELLO message contains a magic number and protocol version
  * code in the header's dest_nid, the peer's NID in the src_nid, and
- * PTL_MSG_HELLO in the type field.  All other common fields are zero
+ * LNET_MSG_HELLO in the type field.  All other common fields are zero
  * (including payload_size; i.e. no payload).  
- * This is for use by byte-stream NALs (e.g. TCP/IP) to check the peer is
- * running the same protocol and to find out its NID, so that hosts with
- * multiple IP interfaces can have a single NID. These NALs should exchange
- * HELLO messages when a connection is first established. 
- * Individual NALs can put whatever else they fancy in ptl_hdr_t::msg. 
+ * This is for use by byte-stream LNDs (e.g. TCP/IP) to check the peer is
+ * running the same protocol and to find out its NID. These LNDs should
+ * exchange HELLO messages when a connection is first established.  Individual
+ * LNDs can put whatever else they fancy in lnet_hdr_t::msg.
  */
 typedef struct {
-        __u32  magic;                          /* PORTALS_PROTO_MAGIC */
+        __u32  magic;                          /* LNET_PROTO_TCP_MAGIC */
         __u16   version_major;                  /* increment on incompatible change */
         __u16   version_minor;                  /* increment on compatible change */
-} WIRE_ATTR ptl_magicversion_t;
-
-#define PORTALS_PROTO_MAGIC                0xeebc0ded
-
-#define PORTALS_PROTO_VERSION_MAJOR        1
-#define PORTALS_PROTO_VERSION_MINOR        0
-
-typedef struct {
-        long recv_count, recv_length, send_count, send_length, drop_count,
-            drop_length, msgs_alloc, msgs_max;
-} lib_counters_t;
-
-/* temporary expedient: limit number of entries in discontiguous MDs */
-#define PTL_MTU        (1<<20)
-#define PTL_MD_MAX_IOV 256
-
-struct lib_msg_t {
-        struct list_head  msg_list;
-        lib_md_t         *md;
-        ptl_handle_wire_t ack_wmd;
-        ptl_event_t       ev;
-};
-
-struct lib_ptl_t {
-        ptl_pt_index_t size;
-        struct list_head *tbl;
-};
-
-struct lib_ac_t {
-        int next_free;
-};
-
+} WIRE_ATTR lnet_magicversion_t;
+
+/* PROTO MAGIC for LNDs */
+#define LNET_PROTO_IB_MAGIC                 0x0be91b91
+#define LNET_PROTO_OPENIB_MAGIC             LNET_PROTO_IB_MAGIC
+#define LNET_PROTO_IIB_MAGIC                LNET_PROTO_IB_MAGIC
+#define LNET_PROTO_VIB_MAGIC                LNET_PROTO_IB_MAGIC
+#define LNET_PROTO_RA_MAGIC                 0x0be91b92
+#define LNET_PROTO_QSW_MAGIC                0x0be91b93
+#define LNET_PROTO_TCP_MAGIC                0xeebc0ded
+#define LNET_PROTO_PTL_MAGIC                0x50746C4E /* 'PtlN' unique magic */
+#define LNET_PROTO_GM_MAGIC                 0x6d797269 /* 'myri'! */
+#define LNET_PROTO_MX_MAGIC                 0x4d583130 /* 'MX10'! */
+#define LNET_PROTO_ACCEPTOR_MAGIC           0xacce7100
+#define LNET_PROTO_PING_MAGIC               0x70696E67 /* 'ping' */
+
+/* Placeholder for a future "unified" protocol across all LNDs */
+/* Current LNDs that receive a request with this magic will respond with a
+ * "stub" reply using their current protocol */
+#define LNET_PROTO_MAGIC                    0x45726963 /* ! */
+
+
+#define LNET_PROTO_TCP_VERSION_MAJOR        1
+#define LNET_PROTO_TCP_VERSION_MINOR        0
+
+/* Acceptor connection request */
 typedef struct {
+        __u32       acr_magic;                  /* LNET_PROTO_ACCEPTOR_MAGIC */
+        __u32       acr_version;                /* protocol version */
+        __u64       acr_nid;                    /* target NID */
+} WIRE_ATTR lnet_acceptor_connreq_t;
+
+#define LNET_PROTO_ACCEPTOR_VERSION       1
+
+/* forward refs */
+struct lnet_libmd;
+
+typedef struct lnet_msg {
+        struct list_head    msg_activelist;
+        struct list_head    msg_list;           /* Q for credits/MD */
+
+        lnet_process_id_t   msg_target;
+        __u32               msg_type;
+
+        unsigned int        msg_target_is_router:1; /* sending to a router */
+        unsigned int        msg_routing:1;      /* being forwarded */
+        unsigned int        msg_ack:1;          /* ack on finalize (PUT) */
+        unsigned int        msg_sending:1;      /* outgoing message */
+        unsigned int        msg_receiving:1;    /* being received */
+        unsigned int        msg_delayed:1;      /* had to Q for buffer or tx credit */
+        unsigned int        msg_txcredit:1;     /* taken an NI send credit */
+        unsigned int        msg_peertxcredit:1; /* taken a peer send credit */
+        unsigned int        msg_rtrcredit:1;    /* taken a global router credit */
+        unsigned int        msg_peerrtrcredit:1; /* taken a peer router credit */
+        unsigned int        msg_onactivelist:1; /* on the activelist */
+
+        struct lnet_peer   *msg_txpeer;         /* peer I'm sending to */
+        struct lnet_peer   *msg_rxpeer;         /* peer I received from */
+
+        void               *msg_private;
+        struct lnet_libmd  *msg_md;
+
+        unsigned int        msg_len;
+        unsigned int        msg_wanted;
+        unsigned int        msg_offset;
+        unsigned int        msg_niov;
+        struct iovec       *msg_iov;
+        lnet_kiov_t        *msg_kiov;
+
+        lnet_event_t        msg_ev;
+        lnet_hdr_t          msg_hdr;
+} lnet_msg_t;
+
+
+typedef struct lnet_libhandle {
         struct list_head  lh_hash_chain;
         __u64             lh_cookie;
-} lib_handle_t;
+} lnet_libhandle_t;
 
 #define lh_entry(ptr, type, member) \
        ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
 
-struct lib_eq_t {
+typedef struct lnet_eq {
         struct list_head  eq_list;
-        lib_handle_t      eq_lh;
-        ptl_seq_t         eq_enq_seq;
-        ptl_seq_t         eq_deq_seq;
-        ptl_size_t        eq_size;
-        ptl_event_t      *eq_events;
+        lnet_libhandle_t  eq_lh;
+        lnet_seq_t        eq_enq_seq;
+        lnet_seq_t        eq_deq_seq;
+        unsigned int      eq_size;
+        lnet_event_t     *eq_events;
         int               eq_refcount;
-        ptl_eq_handler_t  eq_callback;
-        void             *eq_addrkey;
-};
-
-struct lib_me_t {
-        struct list_head  me_list;
-        lib_handle_t      me_lh;
-        ptl_process_id_t  match_id;
-        ptl_match_bits_t  match_bits, ignore_bits;
-        ptl_unlink_t      unlink;
-        lib_md_t         *md;
-};
-
-struct lib_md_t {
+        lnet_eq_handler_t eq_callback;
+} lnet_eq_t;
+
+typedef struct lnet_me {
+        struct list_head   me_list;
+        lnet_libhandle_t   me_lh;
+        lnet_process_id_t  me_match_id;
+        unsigned int       me_portal;
+        __u64              me_match_bits;
+        __u64              me_ignore_bits;
+        lnet_unlink_t      me_unlink;
+        struct lnet_libmd *me_md;
+} lnet_me_t;
+
+typedef struct lnet_libmd {
         struct list_head  md_list;
-        lib_handle_t      md_lh;
-        lib_me_t         *me;
-        user_ptr          start;
-        ptl_size_t        offset;
-        ptl_size_t        length;
-        ptl_size_t        max_size;
-        int               threshold;
-        int               pending;
-        unsigned int      options;
+        lnet_libhandle_t  md_lh;
+        lnet_me_t        *md_me;
+        char             *md_start;
+        unsigned int      md_offset;
+        unsigned int      md_length;
+        unsigned int      md_max_size;
+        int               md_threshold;
+        int               md_refcount;
+        unsigned int      md_options;
         unsigned int      md_flags;
-        void             *user_ptr;
-        lib_eq_t         *eq;
+        void             *md_user_ptr;
+        lnet_eq_t        *md_eq;
         void             *md_addrkey;
         unsigned int      md_niov;                /* # frags */
         union {
-                struct iovec  iov[PTL_MD_MAX_IOV];
-                ptl_kiov_t    kiov[PTL_MD_MAX_IOV];
+                struct iovec  iov[LNET_MAX_IOV];
+                lnet_kiov_t   kiov[LNET_MAX_IOV];
         } md_iov;
-};
+} lnet_libmd_t;
 
-#define PTL_MD_FLAG_ZOMBIE            (1 << 0)
-#define PTL_MD_FLAG_AUTO_UNLINK       (1 << 1)
+#define LNET_MD_FLAG_ZOMBIE           (1 << 0)
+#define LNET_MD_FLAG_AUTO_UNLINK      (1 << 1)
 
-static inline int lib_md_exhausted (lib_md_t *md) 
-{
-        return (md->threshold == 0 ||
-                ((md->options & PTL_MD_MAX_SIZE) != 0 &&
-                 md->offset + md->max_size > md->length));
-}
-
-#ifdef PTL_USE_LIB_FREELIST
+#ifdef LNET_USE_LIB_FREELIST
 typedef struct
 {
         void             *fl_objs;             /* single contiguous array of objects */
         int                fl_nobjs;            /* the number of them */
         int                fl_objsize;          /* the size (including overhead) of each of them */
         struct list_head   fl_list;             /* where they are enqueued */
-} lib_freelist_t;
+} lnet_freelist_t;
 
 typedef struct
 {
         struct list_head   fo_list;             /* enqueue on fl_list */
         void              *fo_contents;         /* aligned contents */
-} lib_freeobj_t;
+} lnet_freeobj_t;
 #endif
 
 typedef struct {
         /* info about peers we are trying to fail */
-        struct list_head  tp_list;             /* stash in ni.ni_test_peers */
-        ptl_nid_t         tp_nid;              /* matching nid */
-        unsigned int      tp_threshold;        /* # failures to simulate */
-} lib_test_peer_t;
-
-#define PTL_COOKIE_TYPE_MD    1
-#define PTL_COOKIE_TYPE_ME    2
-#define PTL_COOKIE_TYPE_EQ    3
-#define PTL_COOKIE_TYPES      4
-/* PTL_COOKIE_TYPES must be a power of 2, so the cookie type can be
- * extracted by masking with (PTL_COOKIE_TYPES - 1) */
-
-typedef struct lib_ni 
+        struct list_head   tp_list;             /* ln_test_peers */
+        lnet_nid_t         tp_nid;              /* matching nid */
+        unsigned int       tp_threshold;        /* # failures to simulate */
+} lnet_test_peer_t;
+
+#define LNET_COOKIE_TYPE_MD    1
+#define LNET_COOKIE_TYPE_ME    2
+#define LNET_COOKIE_TYPE_EQ    3
+#define LNET_COOKIE_TYPES      4
+/* LNET_COOKIE_TYPES must be a power of 2, so the cookie type can be
+ * extracted by masking with (LNET_COOKIE_TYPES - 1) */
+
+struct lnet_ni;                                  /* forward ref */
+
+typedef struct lnet_lnd
 {
-        nal_t            *ni_api;
-        ptl_process_id_t  ni_pid;
-        lib_ptl_t         ni_portals;
-        lib_counters_t    ni_counters;
-        ptl_ni_limits_t   ni_actual_limits;
-
-        int               ni_lh_hash_size;      /* size of lib handle hash table */
-        struct list_head *ni_lh_hash_table;     /* all extant lib handles, this interface */
-        __u64             ni_next_object_cookie; /* cookie generator */
-        __u64             ni_interface_cookie;  /* uniquely identifies this ni in this epoch */
+        /* fields managed by portals */
+        struct list_head  lnd_list;             /* stash in the LND table */
+        int               lnd_refcount;         /* # active instances */
+
+        /* fields initialised by the LND */
+        unsigned int      lnd_type;
         
-        struct list_head  ni_test_peers;
-        int               ni_loopback;          /* loopback shortcircuits NAL */
+        int  (*lnd_startup) (struct lnet_ni *ni);
+        void (*lnd_shutdown) (struct lnet_ni *ni);
+        int  (*lnd_ctl)(struct lnet_ni *ni, unsigned int cmd, void *arg);
+
+        /* In data movement APIs below, payload buffers are described as a set
+         * of 'niov' fragments which are...
+         * EITHER 
+         *    in virtual memory (struct iovec *iov != NULL)
+         * OR
+         *    in pages (kernel only: lnet_kiov_t *kiov != NULL).
+         * The LND may NOT overwrite these fragment descriptors.
+         * An 'offset' may specify a byte offset within the set of
+         * fragments to start from.
+         */
+
+        /* Start sending a preformatted message.  'private' is NULL for PUT and
+        * GET messages; otherwise this is a response to an incoming message
+        * and 'private' is the 'private' passed to lnet_parse().  Return
+        * non-zero for immediate failure, otherwise complete later with
+        * lnet_finalize() */
+       int (*lnd_send)(struct lnet_ni *ni, void *private, lnet_msg_t *msg);
+
+        /* Start receiving 'mlen' bytes of payload data, skipping the following
+         * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to
+         * lnet_parse().  Return non-zero for immediate failure, otherwise
+         * complete later with lnet_finalize().  This also gives back a receive
+         * credit if the LND does flow control. */
+       int (*lnd_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg,
+                        int delayed, unsigned int niov, 
+                        struct iovec *iov, lnet_kiov_t *kiov,
+                        unsigned int offset, unsigned int mlen, unsigned int rlen);
+
+        /* lnet_parse() has had to delay processing of this message
+         * (e.g. waiting for a forwarding buffer or send credits).  Give the
+         * LND a chance to free urgently needed resources.  If called, return 0
+         * for success and do NOT give back a receive credit; that has to wait
+         * until lnd_recv() gets called.  On failure return < 0 and
+         * release resources; lnd_recv() will not be called. */
+       int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg,
+                              void **new_privatep);
+
+        /* notification of peer health */
+        void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive);
+
+#ifdef __KERNEL__
+        /* accept a new connection */
+        int (*lnd_accept)(struct lnet_ni *ni, cfs_socket_t *sock);
+#else
+        /* wait for something to happen */
+        void (*lnd_wait)(struct lnet_ni *ni, int milliseconds);
+#endif
+} lnd_t;
+
+#define LNET_MAX_INTERFACES   16
+
+typedef struct lnet_ni {
+        struct list_head  ni_list;              /* chain on ln_nis */
+        struct list_head  ni_txq;               /* messages waiting for tx credits */
+        int               ni_maxtxcredits;      /* # tx credits  */
+        int               ni_txcredits;         /* # tx credits free */
+        int               ni_mintxcredits;      /* lowest it's been */
+        int               ni_peertxcredits;     /* # per-peer send credits */
+        lnet_nid_t        ni_nid;               /* interface's NID */
+        void             *ni_data;              /* instance-specific data */
+        lnd_t            *ni_lnd;               /* procedural interface */
+        int               ni_refcount;          /* reference count */
+        char             *ni_interfaces[LNET_MAX_INTERFACES]; /* equivalent interfaces to use */
+} lnet_ni_t;
+
+typedef struct lnet_peer {
+        struct list_head  lp_hashlist;          /* chain on peer hash */
+        struct list_head  lp_txq;               /* messages blocking for tx credits */
+        struct list_head  lp_rtrq;              /* messages blocking for router credits */
+        struct list_head  lp_rtr_list;          /* chain on router list */
+        int               lp_txcredits;         /* # tx credits available */
+        int               lp_mintxcredits;      /* low water mark */
+        int               lp_rtrcredits;        /* # router credits */
+        int               lp_minrtrcredits;     /* low water mark */
+        unsigned int      lp_alive:1;           /* alive/dead? */
+        unsigned int      lp_notify:1;          /* notification outstanding? */
+        unsigned int      lp_notifylnd:1;       /* outstanding notification for LND? */
+        unsigned int      lp_notifying:1;       /* some thread is handling notification */
+        unsigned int      lp_ping_notsent;      /* SEND event outstanding from ping */
+        int               lp_alive_count;       /* # times router went dead<->alive */
+        long              lp_txqnob;            /* bytes queued for sending */
+        time_t            lp_timestamp;         /* time of last aliveness news */
+        time_t            lp_ping_timestamp;    /* time of last ping attempt */
+        time_t            lp_ping_deadline;     /* != 0 if ping reply expected */
+        lnet_ni_t        *lp_ni;                /* interface peer is on */
+        lnet_nid_t        lp_nid;               /* peer's NID */
+        int               lp_refcount;          /* # refs */
+        int               lp_rtr_refcount;      /* # refs from lnet_route_t::lr_gateway */
+} lnet_peer_t;
+
+typedef struct {
+       struct list_head  lr_list;              /* chain on net */
+        lnet_peer_t      *lr_gateway;           /* router node */
+} lnet_route_t;
+
+typedef struct {
+        struct list_head        lrn_list;       /* chain on ln_remote_nets */
+        struct list_head        lrn_routes;     /* routes to me */
+        __u32                   lrn_net;        /* my net number */
+        unsigned int            lrn_hops;       /* how far I am */
+} lnet_remotenet_t;
+
+typedef struct {
+        struct list_head  rbp_bufs;             /* my free buffer pool */
+        struct list_head  rbp_msgs;             /* messages blocking for a buffer */
+        int               rbp_npages;           /* # pages in each buffer */
+        int               rbp_nbuffers;         /* # buffers */
+        int               rbp_credits;          /* # free buffers / blocked messages */
+        int               rbp_mincredits;       /* low water mark */
+} lnet_rtrbufpool_t;
+
+typedef struct {
+        struct list_head   rb_list;             /* chain on rbp_bufs */
+        lnet_rtrbufpool_t *rb_pool;             /* owning pool */
+        lnet_kiov_t        rb_kiov[0];          /* the buffer space */
+} lnet_rtrbuf_t;
+
+typedef struct {
+        __u32        msgs_alloc;
+        __u32        msgs_max;
+        __u32        errors;
+        __u32        send_count;
+        __u32        recv_count;
+        __u32        route_count;
+        __u32        drop_count;
+        __u64        send_length;
+        __u64        recv_length;
+        __u64        route_length;
+        __u64        drop_length;
+} lnet_counters_t;
+
+#define LNET_PEER_HASHSIZE   503                /* prime! */
+
+#define LNET_NRBPOOLS         3                 /* # different router buffer pools */
+
+#define LNET_PROTO_PING_MATCHBITS     0x8000000000000000LL
+#define LNET_PROTO_PING_VERSION       1
+typedef struct {
+        __u32          pi_magic;
+        __u32          pi_version;
+        lnet_pid_t     pi_pid;
+        __u32          pi_nnids;
+        lnet_nid_t     pi_nid[0];
+} WIRE_ATTR lnet_ping_info_t;
+
+/* Options for lnet_portal_t::ptl_options */
+#define LNET_PTL_LAZY               (1 << 0)
+typedef struct {
+        struct list_head ptl_ml;  /* match list */
+        struct list_head ptl_msgq; /* messages blocking for MD */
+        __u64            ptl_msgq_version;  /* validity stamp */
+        unsigned int     ptl_options;
+} lnet_portal_t;
+
+/* Router Checker */
+/*                               < 0 == startup error */
+#define LNET_RC_STATE_SHUTDOWN     0            /* not started */
+#define LNET_RC_STATE_RUNNING      1            /* started up OK */
+#define LNET_RC_STATE_STOPTHREAD   2            /* telling thread to stop */
+#define LNET_RC_STATE_UNLINKING    3            /* unlinking RC MD */
+#define LNET_RC_STATE_UNLINKED     4            /* RC's MD has been unlinked */
+
+typedef struct
+{
+        /* Stuff initialised at LNetInit() */
+        int                ln_init;             /* LNetInit() called? */
+        int                ln_refcount;         /* LNetNIInit/LNetNIFini counter */
+        int                ln_niinit_self;      /* Have I called LNetNIInit myself? */
+
+        int                ln_ptlcompat;        /* do I support talking to portals? */
         
-#ifdef PTL_USE_LIB_FREELIST
-        lib_freelist_t    ni_free_mes;
-        lib_freelist_t    ni_free_msgs;
-        lib_freelist_t    ni_free_mds;
-        lib_freelist_t    ni_free_eqs;
+        struct list_head   ln_lnds;             /* registered LNDs */
+
+#ifdef __KERNEL__
+        spinlock_t         ln_lock;
+        cfs_waitq_t        ln_waitq;
+        struct semaphore   ln_api_mutex;
+        struct semaphore   ln_lnd_mutex;
+#else
+# if !HAVE_LIBPTHREAD
+        int                ln_lock;
+        int                ln_api_mutex;
+        int                ln_lnd_mutex;
+# else
+        pthread_cond_t     ln_cond;
+        pthread_mutex_t    ln_lock;
+        pthread_mutex_t    ln_api_mutex;
+        pthread_mutex_t    ln_lnd_mutex;
+# endif
 #endif
 
-        struct list_head  ni_active_msgs;
-        struct list_head  ni_active_mds;
-        struct list_head  ni_active_eqs;
+        /* Stuff initialised at LNetNIInit() */
+
+        int                ln_shutdown;         /* shutdown in progress */
+        int                ln_nportals;         /* # portals */
+        lnet_portal_t     *ln_portals;          /* the vector of portals */
+
+        lnet_pid_t         ln_pid;              /* requested pid */
+
+        struct list_head   ln_nis;              /* LND instances */
+        lnet_ni_t         *ln_loni;             /* the loopback NI */
+        lnet_ni_t         *ln_eqwaitni;         /* NI to wait for events in */
+        struct list_head   ln_zombie_nis;       /* dying LND instances */
+        int                ln_nzombie_nis;      /* # of NIs to wait for */
 
+        struct list_head   ln_remote_nets;      /* remote networks with routes to them */
+        __u64              ln_remote_nets_version; /* validity stamp */
+
+        struct list_head   ln_routers;          /* list of all known routers */
+        __u64              ln_routers_version;  /* validity stamp */
+
+        struct list_head  *ln_peer_hash;        /* NID->peer hash */
+        int                ln_npeers;           /* # peers extant */
+        int                ln_peertable_version; /* /proc validity stamp */
+        
+        int                ln_routing;          /* am I a router? */
+        lnet_rtrbufpool_t  ln_rtrpools[LNET_NRBPOOLS]; /* router buffer pools */
+        
+        int                ln_lh_hash_size;     /* size of lib handle hash table */
+        struct list_head  *ln_lh_hash_table;    /* all extant lib handles, this interface */
+        __u64              ln_next_object_cookie; /* cookie generator */
+        __u64              ln_interface_cookie; /* uniquely identifies this ni in this epoch */
+
+        char              *ln_network_tokens;   /* space for network names */
+        int                ln_network_tokens_nob;
+
+        int                ln_testprotocompat;  /* test protocol compatibility flags */
+
+        struct list_head   ln_finalizeq;        /* msgs waiting to complete finalizing */
 #ifdef __KERNEL__
-        spinlock_t        ni_lock;
-        cfs_waitq_t       ni_waitq;
+        void             **ln_finalizers;       /* threads doing finalization */
+        int                ln_nfinalizers;      /* max # threads finalizing */
 #else
-        pthread_mutex_t   ni_mutex;
-        pthread_cond_t    ni_cond;
+        int                ln_finalizing;
 #endif
-} lib_ni_t;
+        struct list_head   ln_test_peers;       /* failure simulation */
 
+        lnet_handle_md_t   ln_ping_target_md;
+        lnet_handle_eq_t   ln_ping_target_eq;
+        lnet_ping_info_t  *ln_ping_info;
 
-typedef struct lib_nal
-{
-       /* lib-level interface state */
-       lib_ni_t libnal_ni;
-
-       /* NAL-private data */
-       void *libnal_data;
-
-       /*
-        * send: Sends a preformatted header and payload data to a
-        * specified remote process. The payload is scattered over 'niov'
-        * fragments described by iov, starting at 'offset' for 'mlen'
-        * bytes.  
-        * NB the NAL may NOT overwrite iov.  
-        * PTL_OK on success => NAL has committed to send and will call
-        * lib_finalize on completion
-        */
-       ptl_err_t (*libnal_send) 
-                (struct lib_nal *nal, void *private, lib_msg_t *cookie, 
-                 ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
-                 unsigned int niov, struct iovec *iov, 
-                 size_t offset, size_t mlen);
+#ifdef __KERNEL__
+       int                ln_rc_state;         /* router checker startup/shutdown state */
+       struct semaphore   ln_rc_signal;        /* serialise startup/shutdown */
+        lnet_handle_eq_t   ln_rc_eqh;           /* router checker's event queue */
+#endif
         
-       /* as send, but with a set of page fragments (NULL if not supported) */
-       ptl_err_t (*libnal_send_pages)
-                (struct lib_nal *nal, void *private, lib_msg_t * cookie, 
-                 ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
-                 unsigned int niov, ptl_kiov_t *iov, 
-                 size_t offset, size_t mlen);
-       /*
-        * recv: Receives an incoming message from a remote process.  The
-        * payload is to be received into the scattered buffer of 'niov'
-        * fragments described by iov, starting at 'offset' for 'mlen'
-        * bytes.  Payload bytes after 'mlen' up to 'rlen' are to be
-        * discarded.  
-        * NB the NAL may NOT overwrite iov.
-        * PTL_OK on success => NAL has committed to receive and will call
-        * lib_finalize on completion
-        */
-       ptl_err_t (*libnal_recv) 
-                (struct lib_nal *nal, void *private, lib_msg_t * cookie,
-                 unsigned int niov, struct iovec *iov, 
-                 size_t offset, size_t mlen, size_t rlen);
-
-       /* as recv, but with a set of page fragments (NULL if not supported) */
-       ptl_err_t (*libnal_recv_pages) 
-                (struct lib_nal *nal, void *private, lib_msg_t * cookie,
-                 unsigned int niov, ptl_kiov_t *iov, 
-                 size_t offset, size_t mlen, size_t rlen);
-
-       /*
-        * (un)map: Tell the NAL about some memory it will access.
-        * *addrkey passed to libnal_unmap() is what libnal_map() set it to.
-        * type of *iov depends on options.
-        * Set to NULL if not required.
-        */
-       ptl_err_t (*libnal_map)
-                (struct lib_nal *nal, unsigned int niov, struct iovec *iov, 
-                 void **addrkey);
-       void (*libnal_unmap)
-                (struct lib_nal *nal, unsigned int niov, struct iovec *iov, 
-                 void **addrkey);
-
-       /* as (un)map, but with a set of page fragments */
-       ptl_err_t (*libnal_map_pages)
-                (struct lib_nal *nal, unsigned int niov, ptl_kiov_t *iov, 
-                 void **addrkey);
-       void (*libnal_unmap_pages)
-                (struct lib_nal *nal, unsigned int niov, ptl_kiov_t *iov, 
-                 void **addrkey);
-
-       /* Calculate a network "distance" to given node */
-       int (*libnal_dist) (struct lib_nal *nal, ptl_nid_t nid, unsigned long *dist);
-} lib_nal_t;
-
-typedef struct                                  /* loopback descriptor */
-{
-        unsigned int     lod_type;
-        unsigned int     lod_niov;
-        size_t           lod_offset;
-        size_t           lod_nob;
-        union {
-                struct iovec  *iov;
-                ptl_kiov_t    *kiov;
-        }                lod_iov;
-} lo_desc_t;
+#ifdef LNET_USE_LIB_FREELIST
+        lnet_freelist_t    ln_free_mes;
+        lnet_freelist_t    ln_free_msgs;
+        lnet_freelist_t    ln_free_mds;
+        lnet_freelist_t    ln_free_eqs;
+#endif
+        struct list_head   ln_active_msgs;
+        struct list_head   ln_active_mds;
+        struct list_head   ln_active_eqs;
 
-#define LOD_IOV     0xeb105
-#define LOD_KIOV    0xeb106
+        lnet_counters_t    ln_counters;
+} lnet_t;
 
 #endif
index b6e7daf..409e159 100644 (file)
@@ -1 +1 @@
-EXTRA_DIST := lib-p30.h  lib-types.h  p30.h
+EXTRA_DIST := lib-lnet.h  lib-types.h  lnet.h api-support.h
diff --git a/lnet/include/lnet/linux/api-support.h b/lnet/include/lnet/linux/api-support.h
new file mode 100644 (file)
index 0000000..bec6e34
--- /dev/null
@@ -0,0 +1,39 @@
+#ifndef __LINUX_API_SUPPORT_H__
+#define __LINUX_API_SUPPORT_H__
+
+#ifndef __LNET_API_SUPPORT_H__
+#error Do not #include this file directly. #include <lnet/api-support.h> instead
+#endif
+
+#ifndef __KERNEL__
+# include <stdio.h>
+# include <stdlib.h>
+# include <unistd.h>
+# include <time.h>
+
+/* Lots of POSIX dependencies to support PtlEQWait_timeout */
+# include <signal.h>
+# include <setjmp.h>
+# include <time.h>
+
+#ifdef HAVE_LIBREADLINE
+#define READLINE_LIBRARY
+#include <readline/readline.h>
+
+/* readline.h pulls in a #define that conflicts with one in libcfs.h */
+#undef RETURN
+
+/* completion_matches() is #if 0-ed out in modern glibc */
+#ifndef completion_matches
+#  define completion_matches rl_completion_matches
+#endif
+
+#endif /* HAVE_LIBREADLINE */
+
+extern void using_history(void);
+extern void stifle_history(int);
+extern void add_history(char *);
+
+#endif /* !__KERNEL__ */
+
+#endif
index 1c88080..9c38fd3 100644 (file)
@@ -1,20 +1,49 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  */
-#ifndef __PORTALS_LINUX_LIB_P30_H__
-#define __PORTALS_LINUX_LIB_P30_H__
+#ifndef __LNET_LINUX_LIB_LNET_H__
+#define __LNET_LINUX_LIB_LNET_H__
 
-#ifndef __PORTALS_LIB_P30_H__
-#error Do not #include this file directly. #include <portals/lib-p30.h> instead
+#ifndef __LNET_LIB_LNET_H__
+#error Do not #include this file directly. #include <lnet/lib-lnet.h> instead
 #endif
 
 #ifdef __KERNEL__
 # include <asm/page.h>
 # include <linux/string.h>
-#else
+# include <asm/io.h>
+# include <libcfs/kp30.h>
+
+static inline __u64
+lnet_page2phys (struct page *p)
+{
+        /* compiler optimizer will elide unused branches */
+
+        switch (sizeof(typeof(page_to_phys(p)))) {
+        case 4:
+                /* page_to_phys returns a 32 bit physical address.  This must
+                 * be a 32 bit machine with <= 4G memory and we must ensure we
+                 * don't sign extend when converting to 64 bits. */
+                return (unsigned long)page_to_phys(p);
+
+        case 8:
+                /* page_to_phys returns a 64 bit physical address :) */
+                return page_to_phys(p);
+                
+        default:
+                LBUG();
+                return 0;
+        }
+}
+
+#else  /* __KERNEL__ */
 # include <libcfs/list.h>
 # include <string.h>
-# include <pthread.h>
+# ifdef HAVE_LIBPTHREAD
+#  include <pthread.h>
+# endif
 #endif
 
-#endif
+#define LNET_ROUTER
+
+#endif /* __LNET_LINUX_LIB_LNET_H__ */
diff --git a/lnet/include/lnet/linux/lib-p30.h b/lnet/include/lnet/linux/lib-p30.h
deleted file mode 100644 (file)
index 1c88080..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- */
-#ifndef __PORTALS_LINUX_LIB_P30_H__
-#define __PORTALS_LINUX_LIB_P30_H__
-
-#ifndef __PORTALS_LIB_P30_H__
-#error Do not #include this file directly. #include <portals/lib-p30.h> instead
-#endif
-
-#ifdef __KERNEL__
-# include <asm/page.h>
-# include <linux/string.h>
-#else
-# include <libcfs/list.h>
-# include <string.h>
-# include <pthread.h>
-#endif
-
-#endif
index f896b4b..7d28839 100644 (file)
@@ -1,11 +1,11 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  */
-#ifndef __PORTALS_LINUX_LIB_TYPES_H__
-#define __PORTALS_LINUX_LIB_TYPES_H__
+#ifndef __LNET_LINUX_LIB_TYPES_H__
+#define __LNET_LINUX_LIB_TYPES_H__
 
-#ifndef __PORTALS_LIB_TYPES_H__
-#error Do not #include this file directly. #include <portals/lib-types.h> instead
+#ifndef __LNET_LIB_TYPES_H__
+#error Do not #include this file directly. #include <lnet/lib-types.h> instead
 #endif
 
 #ifdef __KERNEL__
@@ -13,7 +13,7 @@
 # include <linux/smp_lock.h>
 # include <linux/types.h>
 #else
-# define PTL_USE_LIB_FREELIST
+# define LNET_USE_LIB_FREELIST
 # include <sys/types.h>
 #endif
 
index b074837..b1aab84 100644 (file)
@@ -1,15 +1,15 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  */
-#ifndef __PORTALS_LINUX_P30_H__
-#define __PORTALS_LINUX_P30_H__
+#ifndef __LNET_LINUX_LNET_H__
+#define __LNET_LINUX_LNET_H__
 
-#ifndef __PORTALS_P30_H__
-#error Do not #include this file directly. #include <portals/p30.h> instead
+#ifndef __LNET_H__
+#error Do not #include this file directly. #include <lnet/lnet.h> instead
 #endif
 
 /*
- * p30.h
+ * lnet.h
  *
  * User application interface file
  */
diff --git a/lnet/include/lnet/linux/p30.h b/lnet/include/lnet/linux/p30.h
deleted file mode 100644 (file)
index b074837..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- */
-#ifndef __PORTALS_LINUX_P30_H__
-#define __PORTALS_LINUX_P30_H__
-
-#ifndef __PORTALS_P30_H__
-#error Do not #include this file directly. #include <portals/p30.h> instead
-#endif
-
-/*
- * p30.h
- *
- * User application interface file
- */
-
-#if defined (__KERNEL__)
-#include <linux/uio.h>
-#include <linux/types.h>
-#else
-#include <sys/types.h>
-#include <sys/uio.h>
-#endif
-
-#endif
index 9be79b8..819c524 100644 (file)
@@ -1,25 +1,25 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  */
-#ifndef __PORTALS_P30_H__
-#define __PORTALS_P30_H__
-
-#include "build_check.h"
+#ifndef __LNET_H__
+#define __LNET_H__
 
 /*
- * p30.h
+ * lnet.h
  *
  * User application interface file
  */
 #if defined(__linux__)
-#include <portals/linux/p30.h>
+#include <lnet/linux/lnet.h>
 #elif defined(__APPLE__)
-#include <portals/darwin/p30.h>
+#include <lnet/darwin/lnet.h>
+#elif defined(__WINNT__)
+#include <lnet/winnt/lnet.h>
 #else
 #error Unsupported Operating System
 #endif
 
-#include <portals/types.h>
-#include <portals/api.h>
+#include <lnet/types.h>
+#include <lnet/api.h>
 
 #endif
index cce160e..cb66b9d 100644 (file)
 #ifndef _PTLCTL_H_
 #define _PTLCTL_H_
 
-#include <portals/types.h>
+#include <lnet/types.h>
 #include <libcfs/kp30.h>
 #include <libcfs/libcfs.h>
 
-#define PORTALS_DEV_ID 0
-#define PORTALS_DEV_PATH "/dev/portals"
+#define LNET_DEV_ID 0
+#define LNET_DEV_PATH "/dev/lnet"
+#define LNET_DEV_MAJOR 10
+#define LNET_DEV_MINOR 240
 #define OBD_DEV_ID 1
 #define OBD_DEV_PATH "/dev/obd"
+#define OBD_DEV_MAJOR 10
+#define OBD_DEV_MINOR 241
 #define SMFS_DEV_ID  2
 #define SMFS_DEV_PATH "/dev/snapdev"
-
-int ptl_name2nal(char *str);
-int ptl_parse_ipaddr (__u32 *ipaddrp, char *str);
-int ptl_parse_anynid (ptl_nid_t *nidp, char *str);
-int ptl_parse_nid (ptl_nid_t *nidp, char *str);
-char * ptl_nid2str (char *buffer, ptl_nid_t nid);
+#define SMFS_DEV_MAJOR 10
+#define SMFS_DEV_MINOR 242
 
 int ptl_initialize(int argc, char **argv);
 int jt_ptl_network(int argc, char **argv);
+int jt_ptl_list_nids(int argc, char **argv);
+int jt_ptl_which_nid(int argc, char **argv);
 int jt_ptl_print_interfaces(int argc, char **argv);
 int jt_ptl_add_interface(int argc, char **argv);
 int jt_ptl_del_interface(int argc, char **argv);
@@ -47,12 +49,11 @@ int jt_ptl_print_peers (int argc, char **argv);
 int jt_ptl_add_peer (int argc, char **argv);
 int jt_ptl_del_peer (int argc, char **argv);
 int jt_ptl_print_connections (int argc, char **argv);
-int jt_ptl_connect(int argc, char **argv);
 int jt_ptl_disconnect(int argc, char **argv);
 int jt_ptl_push_connection(int argc, char **argv);
 int jt_ptl_print_active_txs(int argc, char **argv);
 int jt_ptl_ping(int argc, char **argv);
-int jt_ptl_shownid(int argc, char **argv);
+int jt_ptl_ping_test(int argc, char **argv);
 int jt_ptl_mynid(int argc, char **argv);
 int jt_ptl_add_uuid(int argc, char **argv);
 int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility  */
@@ -63,8 +64,8 @@ int jt_ptl_del_route (int argc, char **argv);
 int jt_ptl_notify_router (int argc, char **argv);
 int jt_ptl_print_routes (int argc, char **argv);
 int jt_ptl_fail_nid (int argc, char **argv);
-int jt_ptl_loopback (int argc, char **argv);
 int jt_ptl_lwt(int argc, char **argv);
+int jt_ptl_testprotocompat(int argc, char **argv);
 int jt_ptl_memhog(int argc, char **argv);
 
 int dbg_initialize(int argc, char **argv);
@@ -79,12 +80,10 @@ int jt_dbg_mark_debug_buf(int argc, char **argv);
 int jt_dbg_modules(int argc, char **argv);
 int jt_dbg_panic(int argc, char **argv);
 
-int ptl_set_cfg_record_cb(cfg_record_cb_t cb);
-
 /* l_ioctl.c */
 typedef int (ioc_handler_t)(int dev_id, unsigned int opc, void *buf);
 void set_ioc_handler(ioc_handler_t *handler);
-int register_ioc_dev(int dev_id, const char * dev_name);
+int register_ioc_dev(int dev_id, const char * dev_name, int major, int minor);
 void unregister_ioc_dev(int dev_id);
 int set_ioctl_dump(char * file);
 int l_ioctl(int dev_id, unsigned int opc, void *buf);
diff --git a/lnet/include/lnet/myrnal.h b/lnet/include/lnet/myrnal.h
deleted file mode 100644 (file)
index 13790f7..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef MYRNAL_H
-#define MYRNAL_H
-
-#define MAX_ARGS_LEN            (256)
-#define MAX_RET_LEN             (128)
-#define MYRNAL_MAX_ACL_SIZE     (64)
-#define MYRNAL_MAX_PTL_SIZE     (64)
-
-#define P3CMD                   (100)
-#define P3SYSCALL               (200)
-#define P3REGISTER              (300)
-
-enum { PTL_MLOCKALL };
-
-typedef struct {
-       void *args;
-       size_t args_len;
-       void *ret;
-       size_t ret_len;
-       int p3cmd;
-} myrnal_forward_t;
-
-#endif                         /* MYRNAL_H */
diff --git a/lnet/include/lnet/nal.h b/lnet/include/lnet/nal.h
deleted file mode 100644 (file)
index aad611d..0000000
+++ /dev/null
@@ -1,88 +0,0 @@
-#ifndef _NAL_H_
-#define _NAL_H_
-
-#include "build_check.h"
-
-/*
- * p30/nal.h
- *
- * The API side NAL declarations
- */
-
-#include <portals/types.h>
-
-typedef struct nal_t nal_t;
-
-struct nal_t {
-       /* common interface state */
-       int              nal_refct;
-        ptl_handle_ni_t  nal_handle;
-
-       /* NAL-private data */
-       void            *nal_data;
-
-       /* NAL API implementation 
-        * NB only nal_ni_init needs to be set when the NAL registers itself */
-       int (*nal_ni_init) (nal_t *nal, ptl_pid_t requested_pid,
-                           ptl_ni_limits_t *req, ptl_ni_limits_t *actual);
-       
-       void (*nal_ni_fini) (nal_t *nal);
-
-       int (*nal_get_id) (nal_t *nal, ptl_process_id_t *id);
-       int (*nal_ni_status) (nal_t *nal, ptl_sr_index_t register, ptl_sr_value_t *status);
-       int (*nal_ni_dist) (nal_t *nal, ptl_process_id_t *id, unsigned long *distance);
-       int (*nal_fail_nid) (nal_t *nal, ptl_nid_t nid, unsigned int threshold);
-       int (*nal_loopback) (nal_t *nal, int set, int *enabled);
-
-       int (*nal_me_attach) (nal_t *nal, ptl_pt_index_t portal,
-                             ptl_process_id_t match_id, 
-                             ptl_match_bits_t match_bits, ptl_match_bits_t ignore_bits,
-                             ptl_unlink_t unlink, ptl_ins_pos_t pos, 
-                             ptl_handle_me_t *handle);
-       int (*nal_me_insert) (nal_t *nal, ptl_handle_me_t *me,
-                             ptl_process_id_t match_id, 
-                             ptl_match_bits_t match_bits, ptl_match_bits_t ignore_bits,
-                             ptl_unlink_t unlink, ptl_ins_pos_t pos, 
-                             ptl_handle_me_t *handle);
-       int (*nal_me_unlink) (nal_t *nal, ptl_handle_me_t *me);
-       
-       int (*nal_md_attach) (nal_t *nal, ptl_handle_me_t *me,
-                             ptl_md_t *md, ptl_unlink_t unlink, 
-                             ptl_handle_md_t *handle);
-       int (*nal_md_bind) (nal_t *nal, 
-                           ptl_md_t *md, ptl_unlink_t unlink, 
-                           ptl_handle_md_t *handle);
-       int (*nal_md_unlink) (nal_t *nal, ptl_handle_md_t *md);
-       int (*nal_md_update) (nal_t *nal, ptl_handle_md_t *md,
-                             ptl_md_t *old_md, ptl_md_t *new_md,
-                             ptl_handle_eq_t *testq);
-
-       int (*nal_eq_alloc) (nal_t *nal, ptl_size_t count,
-                            ptl_eq_handler_t handler,
-                            ptl_handle_eq_t *handle);
-       int (*nal_eq_free) (nal_t *nal, ptl_handle_eq_t *eq);
-       int (*nal_eq_poll) (nal_t *nal, 
-                           ptl_handle_eq_t *eqs, int neqs, int timeout,
-                           ptl_event_t *event, int *which);
-
-       int (*nal_ace_entry) (nal_t *nal, ptl_ac_index_t index,
-                             ptl_process_id_t match_id, ptl_pt_index_t portal);
-       
-       int (*nal_put) (nal_t *nal, ptl_handle_md_t *md, ptl_ack_req_t ack,
-                       ptl_process_id_t *target, ptl_pt_index_t portal,
-                       ptl_ac_index_t ac, ptl_match_bits_t match,
-                       ptl_size_t offset, ptl_hdr_data_t hdr_data);
-       int (*nal_get) (nal_t *nal, ptl_handle_md_t *md,
-                       ptl_process_id_t *target, ptl_pt_index_t portal,
-                       ptl_ac_index_t ac, ptl_match_bits_t match,
-                       ptl_size_t offset);
-};
-
-extern nal_t *ptl_hndl2nal(ptl_handle_any_t *any);
-
-#ifdef __KERNEL__
-extern int ptl_register_nal(ptl_interface_t interface, nal_t *nal);
-extern void ptl_unregister_nal(ptl_interface_t interface);
-#endif
-
-#endif
diff --git a/lnet/include/lnet/nalids.h b/lnet/include/lnet/nalids.h
deleted file mode 100644 (file)
index 55a991b..0000000
+++ /dev/null
@@ -1,2 +0,0 @@
-#include "build_check.h"
-
diff --git a/lnet/include/lnet/p30.h b/lnet/include/lnet/p30.h
deleted file mode 100644 (file)
index 9be79b8..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- */
-#ifndef __PORTALS_P30_H__
-#define __PORTALS_P30_H__
-
-#include "build_check.h"
-
-/*
- * p30.h
- *
- * User application interface file
- */
-#if defined(__linux__)
-#include <portals/linux/p30.h>
-#elif defined(__APPLE__)
-#include <portals/darwin/p30.h>
-#else
-#error Unsupported Operating System
-#endif
-
-#include <portals/types.h>
-#include <portals/api.h>
-
-#endif
diff --git a/lnet/include/lnet/ptlctl.h b/lnet/include/lnet/ptlctl.h
deleted file mode 100644 (file)
index cce160e..0000000
+++ /dev/null
@@ -1,96 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *   This file is part of Portals, http://www.sf.net/projects/lustre/
- *
- *   Portals is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Portals is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Portals; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * header for libptlctl.a
- */
-#ifndef _PTLCTL_H_
-#define _PTLCTL_H_
-
-#include <portals/types.h>
-#include <libcfs/kp30.h>
-#include <libcfs/libcfs.h>
-
-#define PORTALS_DEV_ID 0
-#define PORTALS_DEV_PATH "/dev/portals"
-#define OBD_DEV_ID 1
-#define OBD_DEV_PATH "/dev/obd"
-#define SMFS_DEV_ID  2
-#define SMFS_DEV_PATH "/dev/snapdev"
-
-int ptl_name2nal(char *str);
-int ptl_parse_ipaddr (__u32 *ipaddrp, char *str);
-int ptl_parse_anynid (ptl_nid_t *nidp, char *str);
-int ptl_parse_nid (ptl_nid_t *nidp, char *str);
-char * ptl_nid2str (char *buffer, ptl_nid_t nid);
-
-int ptl_initialize(int argc, char **argv);
-int jt_ptl_network(int argc, char **argv);
-int jt_ptl_print_interfaces(int argc, char **argv);
-int jt_ptl_add_interface(int argc, char **argv);
-int jt_ptl_del_interface(int argc, char **argv);
-int jt_ptl_print_peers (int argc, char **argv);
-int jt_ptl_add_peer (int argc, char **argv);
-int jt_ptl_del_peer (int argc, char **argv);
-int jt_ptl_print_connections (int argc, char **argv);
-int jt_ptl_connect(int argc, char **argv);
-int jt_ptl_disconnect(int argc, char **argv);
-int jt_ptl_push_connection(int argc, char **argv);
-int jt_ptl_print_active_txs(int argc, char **argv);
-int jt_ptl_ping(int argc, char **argv);
-int jt_ptl_shownid(int argc, char **argv);
-int jt_ptl_mynid(int argc, char **argv);
-int jt_ptl_add_uuid(int argc, char **argv);
-int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility  */
-int jt_ptl_close_uuid(int argc, char **argv);
-int jt_ptl_del_uuid(int argc, char **argv);
-int jt_ptl_add_route (int argc, char **argv);
-int jt_ptl_del_route (int argc, char **argv);
-int jt_ptl_notify_router (int argc, char **argv);
-int jt_ptl_print_routes (int argc, char **argv);
-int jt_ptl_fail_nid (int argc, char **argv);
-int jt_ptl_loopback (int argc, char **argv);
-int jt_ptl_lwt(int argc, char **argv);
-int jt_ptl_memhog(int argc, char **argv);
-
-int dbg_initialize(int argc, char **argv);
-int jt_dbg_filter(int argc, char **argv);
-int jt_dbg_show(int argc, char **argv);
-int jt_dbg_list(int argc, char **argv);
-int jt_dbg_debug_kernel(int argc, char **argv);
-int jt_dbg_debug_daemon(int argc, char **argv);
-int jt_dbg_debug_file(int argc, char **argv);
-int jt_dbg_clear_debug_buf(int argc, char **argv);
-int jt_dbg_mark_debug_buf(int argc, char **argv);
-int jt_dbg_modules(int argc, char **argv);
-int jt_dbg_panic(int argc, char **argv);
-
-int ptl_set_cfg_record_cb(cfg_record_cb_t cb);
-
-/* l_ioctl.c */
-typedef int (ioc_handler_t)(int dev_id, unsigned int opc, void *buf);
-void set_ioc_handler(ioc_handler_t *handler);
-int register_ioc_dev(int dev_id, const char * dev_name);
-void unregister_ioc_dev(int dev_id);
-int set_ioctl_dump(char * file);
-int l_ioctl(int dev_id, unsigned int opc, void *buf);
-int parse_dump(char * dump_file, ioc_handler_t ioc_func);
-int jt_ioc_dump(int argc, char **argv);
-extern char *dump_filename;
-int dump(int dev_id, unsigned int opc, void *buf);
-
-#endif
diff --git a/lnet/include/lnet/ptllnd.h b/lnet/include/lnet/ptllnd.h
new file mode 100755 (executable)
index 0000000..c52480c
--- /dev/null
@@ -0,0 +1,77 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
+ *   Author: PJ Kirner <pjkirner@clusterfs.com>
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   This file is confidential source code owned by Cluster File Systems.
+ *   No viewing, modification, compilation, redistribution, or any other
+ *   form of use is permitted except through a signed license agreement.
+ *
+ *   If you have not signed such an agreement, then you have no rights to
+ *   this file.  Please destroy it immediately and contact CFS.
+ *
+ */
+/*
+ * The PTLLND was designed to support Portals with
+ * Lustre and non-Lustre UNLINK semantics.
+ * However, for now the two targets — Cray Portals
+ * on the XT3 and Lustre Portals (for testing) — both
+ * have Lustre UNLINK semantics, so this is defined
+ * by default.
+ */
+#define LUSTRE_PORTALS_UNLINK_SEMANTICS
+#ifdef _USING_LUSTRE_PORTALS_
+
+/* NIDs are 64-bits on Lustre Portals */
+#define FMT_NID LPU64
+#define FMT_PID "%d"
+
+/* When using Lustre Portals, Lustre completion semantics are implicit */
+#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS      0
+
+#else /* _USING_CRAY_PORTALS_ */
+
+/* Explicit NULL function pointer for EQ handler */
+#define PTL_EQ_HANDLER_NONE                     0
+
+/* NIDs are integers on Cray Portals */
+#define FMT_NID "%u"
+#define FMT_PID "%d"
+
+/* When using Cray Portals this is defined in the Cray Portals Header*/
+/*#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS */
+
+/* Can compare handles directly on Cray Portals */
+#define PtlHandleIsEqual(a,b) ((a) == (b))
+
+/* Different error types on Cray Portals */
+#define ptl_err_t ptl_ni_fail_t
+
+/*
+ * Cray Portals has no maximum number of IOVs.  The
+ * maximum is limited only by memory and the size of the
+ * int parameters (2^31-1).
+ * Lustre only really requires that the underlying
+ * implementation support at least LNET_MAX_IOV,
+ * so for Cray Portals we can safely just use that
+ * value here.
+ *
+ */
+#define PTL_MD_MAX_IOV          LNET_MAX_IOV
+
+#endif
+
+#define FMT_PTLID "ptlid:"FMT_PID"-"FMT_NID
+
+/* Align incoming small request messages to an 8 byte boundary if this is
+ * supported to avoid alignment issues on some architectures */
+#ifndef PTL_MD_LOCAL_ALIGN8
+# define PTL_MD_LOCAL_ALIGN8 0
+#endif
diff --git a/lnet/include/lnet/ptllnd_wire.h b/lnet/include/lnet/ptllnd_wire.h
new file mode 100644 (file)
index 0000000..e5b5410
--- /dev/null
@@ -0,0 +1,93 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
+ *   Author: PJ Kirner <pjkirner@clusterfs.com>
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   This file is confidential source code owned by Cluster File Systems.
+ *   No viewing, modification, compilation, redistribution, or any other
+ *   form of use is permitted except through a signed license agreement.
+ *
+ *   If you have not signed such an agreement, then you have no rights to
+ *   this file.  Please destroy it immediately and contact CFS.
+ *
+ */
+/************************************************************************
+ * Tunable defaults that {u,k}lnds/ptllnd should have in common.
+ */
+
+#define PTLLND_PORTAL           9          /* The same portal as PTLRPC uses when talking to Cray portals */
+#define PTLLND_PID              9          /* The Portals PID */
+#define PTLLND_PEERCREDITS      8          /* concurrent sends to 1 peer */
+#define PTLLND_MAX_MSG_SIZE     512        /* Maximum message size */
+
+
+/************************************************************************
+ * Portals LNS Wire message format.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ */
+
+#define PTL_RESERVED_MATCHBITS  0x100  /* below this value is reserved
+                                         * above is for bulk data transfer */
+#define LNET_MSG_MATCHBITS       0      /* the value for the message channel */
+
+typedef struct
+{
+        lnet_hdr_t        kptlim_hdr;             /* portals header */
+        char              kptlim_payload[0];      /* piggy-backed payload */
+} WIRE_ATTR kptl_immediate_msg_t;
+
+typedef struct
+{
+        lnet_hdr_t        kptlrm_hdr;             /* portals header */
+        __u64             kptlrm_matchbits;       /* matchbits */
+} WIRE_ATTR kptl_rdma_msg_t;
+
+typedef struct
+{
+        __u64             kptlhm_matchbits;       /* matchbits */
+        __u32             kptlhm_max_msg_size;    /* max message size */
+} WIRE_ATTR kptl_hello_msg_t;
+
+typedef struct
+{
+        /* First 2 fields fixed FOR ALL TIME */
+        __u32           ptlm_magic;     /* I'm a Portals LND message */
+        __u16           ptlm_version;   /* this is my version number */
+        __u8            ptlm_type;      /* the message type */
+        __u8            ptlm_credits;   /* returned credits */
+        __u32           ptlm_nob;       /* # bytes in whole message */
+        __u32           ptlm_cksum;     /* checksum (0 == no checksum) */
+        __u64           ptlm_srcnid;    /* sender's NID */
+        __u64           ptlm_srcstamp;  /* sender's incarnation */
+        __u64           ptlm_dstnid;    /* destination's NID */
+        __u64           ptlm_dststamp;  /* destination's incarnation */
+        __u32           ptlm_srcpid;    /* sender's PID */
+        __u32           ptlm_dstpid;    /* destination's PID */
+
+         union {
+                kptl_immediate_msg_t    immediate;
+                kptl_rdma_msg_t         rdma;
+                kptl_hello_msg_t        hello;
+        } WIRE_ATTR ptlm_u;
+
+} kptl_msg_t;
+
+#define PTLLND_MSG_MAGIC                LNET_PROTO_PTL_MAGIC
+#define PTLLND_MSG_VERSION              0x04
+
+#define PTLLND_RDMA_OK                  0x00
+#define PTLLND_RDMA_FAIL                0x01
+
+#define PTLLND_MSG_TYPE_INVALID         0x00
+#define PTLLND_MSG_TYPE_PUT             0x01
+#define PTLLND_MSG_TYPE_GET             0x02
+#define PTLLND_MSG_TYPE_IMMEDIATE       0x03    /* No bulk data xfer*/
+#define PTLLND_MSG_TYPE_NOOP            0x04
+#define PTLLND_MSG_TYPE_HELLO           0x05
+#define PTLLND_MSG_TYPE_NAK             0x06
+
index 27e6f8e..301f8a8 100644 (file)
@@ -1,14 +1,53 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- * <portals/socknal.h>
+ * <lnet/socklnd.h>
  *
  * #defines shared between socknal implementation and utilities
  */
+#ifndef __LNET_LNET_SOCKLND_H__
+#define __LNET_LNET_SOCKLND_H__
 
-#define SOCKNAL_CONN_NONE     (-1)
-#define SOCKNAL_CONN_ANY        0
-#define SOCKNAL_CONN_CONTROL    1
-#define SOCKNAL_CONN_BULK_IN    2
-#define SOCKNAL_CONN_BULK_OUT   3
-#define SOCKNAL_CONN_NTYPES     4
+#include <lnet/types.h>
+#include <lnet/lib-types.h>
+
+#define SOCKLND_CONN_NONE     (-1)
+#define SOCKLND_CONN_ANY        0
+#define SOCKLND_CONN_CONTROL    1
+#define SOCKLND_CONN_BULK_IN    2
+#define SOCKLND_CONN_BULK_OUT   3
+#define SOCKLND_CONN_NTYPES     4
+
+typedef struct {
+        __u32                   kshm_magic;     /* magic number of socklnd message */
+        __u32                   kshm_version;   /* version of socklnd message */
+        lnet_nid_t              kshm_src_nid;   /* sender's nid */
+        lnet_nid_t              kshm_dst_nid;   /* destination nid */
+        lnet_pid_t              kshm_src_pid;   /* sender's pid */
+        lnet_pid_t              kshm_dst_pid;   /* destination pid */
+        __u64                   kshm_src_incarnation; /* sender's incarnation */
+        __u64                   kshm_dst_incarnation; /* destination's incarnation */
+        __u32                   kshm_ctype;     /* connection type */
+        __u32                   kshm_nips;      /* # IP addrs */
+        __u32                   kshm_ips[0];    /* IP addrs */
+} WIRE_ATTR ksock_hello_msg_t;
+
+typedef struct {
+        lnet_hdr_t              ksnm_hdr;       /* lnet hdr */
+        char                    ksnm_payload[0];/* lnet payload */
+} WIRE_ATTR ksock_lnet_msg_t;
+
+typedef struct {
+        __u32                   ksm_type;       /* type of socklnd message */
+        __u32                   ksm_csum;       /* checksum if != 0 */
+        __u64                   ksm_zc_req_cookie; /* ack required if != 0 */
+        __u64                   ksm_zc_ack_cookie; /* ack if != 0 */
+        union {
+                ksock_lnet_msg_t lnetmsg;       /* lnet message, it's empty if it's NOOP */
+        } WIRE_ATTR ksm_u;
+} WIRE_ATTR ksock_msg_t;
+
+#define KSOCK_MSG_NOOP          0xc0            /* ksm_u empty */ 
+#define KSOCK_MSG_LNET          0xc1            /* lnet msg */
+
+#endif
diff --git a/lnet/include/lnet/stringtab.h b/lnet/include/lnet/stringtab.h
deleted file mode 100644 (file)
index 33e4375..0000000
+++ /dev/null
@@ -1,3 +0,0 @@
-/*
- * stringtab.h
- */
index f07534b..11ea5de 100644 (file)
-#ifndef _P30_TYPES_H_
-#define _P30_TYPES_H_
-
-#include "build_check.h"
+#ifndef __LNET_TYPES_H__
+#define __LNET_TYPES_H__
 
 #include <libcfs/libcfs.h>
-#include <portals/errno.h>
-
-/* This implementation uses the same type for API function return codes and
- * the completion status in an event  */
-#define PTL_NI_OK  PTL_OK
-typedef ptl_err_t ptl_ni_fail_t;
-
-typedef __u32 ptl_uid_t;
-typedef __u32 ptl_jid_t;
-typedef __u64 ptl_nid_t;
-typedef __u32 ptl_netid_t;
-typedef __u32 ptl_pid_t;
-typedef __u32 ptl_pt_index_t;
-typedef __u32 ptl_ac_index_t;
-typedef __u64 ptl_match_bits_t;
-typedef __u64 ptl_hdr_data_t;
-typedef __u32 ptl_size_t;
-
-#define PTL_TIME_FOREVER    (-1)
+
+#define LNET_RESERVED_PORTAL      0            /* portals reserved for lnet's own use */
+
+typedef __u64 lnet_nid_t;
+typedef __u32 lnet_pid_t;
+
+#define LNET_NID_ANY      ((lnet_nid_t) -1)
+#define LNET_PID_ANY      ((lnet_pid_t) -1)
+
+#ifdef CRAY_XT3
+typedef __u32 lnet_uid_t;
+#define LNET_UID_ANY      ((lnet_uid_t) -1)
+#endif
+
+#define LNET_PID_RESERVED 0xf0000000 /* reserved bits in PID */
+#define LNET_PID_USERFLAG 0x80000000 /* set in userspace peers */
+
+#define LNET_TIME_FOREVER    (-1)
 
 typedef struct {
-        unsigned long nal_idx;                 /* which network interface */
-        __u64         cookie;                  /* which thing on that interface */
-} ptl_handle_any_t;
+        __u64         cookie;
+} lnet_handle_any_t;
 
-typedef ptl_handle_any_t ptl_handle_ni_t;
-typedef ptl_handle_any_t ptl_handle_eq_t;
-typedef ptl_handle_any_t ptl_handle_md_t;
-typedef ptl_handle_any_t ptl_handle_me_t;
+typedef lnet_handle_any_t lnet_handle_eq_t;
+typedef lnet_handle_any_t lnet_handle_md_t;
+typedef lnet_handle_any_t lnet_handle_me_t;
 
-#define PTL_INVALID_HANDLE \
-    ((const ptl_handle_any_t){.nal_idx = -1, .cookie = -1})
-#define PTL_EQ_NONE PTL_INVALID_HANDLE
+#define LNET_INVALID_HANDLE \
+    ((const lnet_handle_any_t){.cookie = -1})
+#define LNET_EQ_NONE LNET_INVALID_HANDLE
 
-static inline int PtlHandleIsEqual (ptl_handle_any_t h1, ptl_handle_any_t h2)
+static inline int LNetHandleIsEqual (lnet_handle_any_t h1, lnet_handle_any_t h2)
 {
-       return (h1.nal_idx == h2.nal_idx && h1.cookie == h2.cookie);
+       return (h1.cookie == h2.cookie);
 }
 
-#define PTL_UID_ANY      ((ptl_uid_t) -1)
-#define PTL_JID_ANY      ((ptl_jid_t) -1)
-#define PTL_NID_ANY      ((ptl_nid_t) -1)
-#define PTL_PID_ANY      ((ptl_pid_t) -1)
-
 typedef struct {
-        ptl_nid_t nid;
-        ptl_pid_t pid;   /* node id / process id */
-} ptl_process_id_t;
+        lnet_nid_t nid;
+        lnet_pid_t pid;   /* node id / process id */
+} lnet_process_id_t;
 
 typedef enum {
-        PTL_RETAIN = 0,
-        PTL_UNLINK
-} ptl_unlink_t;
+        LNET_RETAIN = 0,
+        LNET_UNLINK
+} lnet_unlink_t;
 
 typedef enum {
-        PTL_INS_BEFORE,
-        PTL_INS_AFTER
-} ptl_ins_pos_t;
+        LNET_INS_BEFORE,
+        LNET_INS_AFTER
+} lnet_ins_pos_t;
 
 typedef struct {
         void            *start;
-        ptl_size_t       length;
+        unsigned int     length;
         int              threshold;
         int              max_size;
         unsigned int     options;
         void            *user_ptr;
-        ptl_handle_eq_t  eq_handle;
-} ptl_md_t;
+        lnet_handle_eq_t eq_handle;
+} lnet_md_t;
+
+/* Max Transfer Unit (minimum supported everywhere) */
+#define LNET_MTU_BITS   20
+#define LNET_MTU        (1<<LNET_MTU_BITS)
+
+/* limit on the number of entries in discontiguous MDs */
+#define LNET_MAX_IOV    256
+
+/* Max payload size */
+#ifndef LNET_MAX_PAYLOAD
+# error "LNET_MAX_PAYLOAD must be defined in config.h"
+#else
+# if (LNET_MAX_PAYLOAD < LNET_MTU)
+#  error "LNET_MAX_PAYLOAD too small - error in configure --with-max-payload-mb"
+# elif defined(__KERNEL__)
+#  if (LNET_MAX_PAYLOAD > (PAGE_SIZE * LNET_MAX_IOV))
+/*  PAGE_SIZE is a constant: check with cpp! */
+#   error "LNET_MAX_PAYLOAD too large - error in configure --with-max-payload-mb"
+#  endif
+# endif
+#endif
 
 /* Options for the MD structure */
-#define PTL_MD_OP_PUT               (1 << 0)
-#define PTL_MD_OP_GET               (1 << 1)
-#define PTL_MD_MANAGE_REMOTE        (1 << 2)
-/* unused                           (1 << 3) */
-#define PTL_MD_TRUNCATE             (1 << 4)
-#define PTL_MD_ACK_DISABLE          (1 << 5)
-#define PTL_MD_IOVEC               (1 << 6)
-#define PTL_MD_MAX_SIZE                    (1 << 7)
-#define PTL_MD_KIOV                 (1 << 8)
-#define PTL_MD_EVENT_START_DISABLE  (1 << 9)
-#define PTL_MD_EVENT_END_DISABLE    (1 << 10)
+#define LNET_MD_OP_PUT               (1 << 0)
+#define LNET_MD_OP_GET               (1 << 1)
+#define LNET_MD_MANAGE_REMOTE        (1 << 2)
+/* unused                            (1 << 3) */
+#define LNET_MD_TRUNCATE             (1 << 4)
+#define LNET_MD_ACK_DISABLE          (1 << 5)
+#define LNET_MD_IOVEC                (1 << 6)
+#define LNET_MD_MAX_SIZE            (1 << 7)
+#define LNET_MD_KIOV                 (1 << 8)
 
 /* For compatibility with Cray Portals */
-#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS  0
-#define PTL_MD_PHYS                         0
+#define LNET_MD_PHYS                         0
 
-#define PTL_MD_THRESH_INF       (-1)
+#define LNET_MD_THRESH_INF       (-1)
 
 /* NB lustre portals uses struct iovec internally! */
-typedef struct iovec ptl_md_iovec_t;
+typedef struct iovec lnet_md_iovec_t;
 
 typedef struct {
        cfs_page_t      *kiov_page;
        unsigned int     kiov_len;
        unsigned int     kiov_offset;
-} ptl_kiov_t;
+} lnet_kiov_t;
 
 typedef enum {
-        PTL_EVENT_GET_START,
-        PTL_EVENT_GET_END,
-
-        PTL_EVENT_PUT_START,
-        PTL_EVENT_PUT_END,
-
-        PTL_EVENT_REPLY_START,
-        PTL_EVENT_REPLY_END,
-
-        PTL_EVENT_ACK,
-
-        PTL_EVENT_SEND_START,
-       PTL_EVENT_SEND_END,
-
-       PTL_EVENT_UNLINK,
-} ptl_event_kind_t;
-
-#define PTL_SEQ_BASETYPE       long
-typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t;
-#define PTL_SEQ_GT(a,b)        (((signed PTL_SEQ_BASETYPE)((a) - (b))) > 0)
+        LNET_EVENT_GET,
+        LNET_EVENT_PUT,
+        LNET_EVENT_REPLY,
+        LNET_EVENT_ACK,
+       LNET_EVENT_SEND,
+       LNET_EVENT_UNLINK,
+} lnet_event_kind_t;
+
+#define LNET_SEQ_BASETYPE      long
+typedef unsigned LNET_SEQ_BASETYPE lnet_seq_t;
+#define LNET_SEQ_GT(a,b)       (((signed LNET_SEQ_BASETYPE)((a) - (b))) > 0)
 
 /* XXX
  * cygwin need the pragma line, not clear if it's needed in other places.
@@ -131,64 +130,35 @@ typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t;
 #pragma pack(push, 4)
 #endif
 typedef struct {
-        ptl_event_kind_t   type;
-        ptl_process_id_t   initiator;
-        ptl_uid_t          uid;
-        ptl_jid_t          jid;
-        ptl_pt_index_t     pt_index;
-        ptl_match_bits_t   match_bits;
-        ptl_size_t         rlength;
-        ptl_size_t         mlength;
-        ptl_size_t         offset;
-        ptl_handle_md_t    md_handle;
-        ptl_md_t           md;
-        ptl_hdr_data_t     hdr_data;
-        ptl_seq_t          link;
-        ptl_ni_fail_t      ni_fail_type;
-
-        int                unlinked;
-
-        volatile ptl_seq_t sequence;
-} ptl_event_t;
+        lnet_event_kind_t   type;
+       lnet_process_id_t   target;
+        lnet_process_id_t   initiator;
+#ifdef CRAY_XT3
+       lnet_uid_t          uid;
+#endif
+        unsigned int        pt_index;
+        __u64               match_bits;
+        unsigned int        rlength;
+        unsigned int        mlength;
+        unsigned int        offset;
+        lnet_handle_md_t    md_handle;
+        lnet_md_t           md;
+        __u64               hdr_data;
+        int                 status;
+        int                 unlinked;
+
+        volatile lnet_seq_t sequence;
+} lnet_event_t;
 #ifdef __CYGWIN__
 #pragma pop
 #endif
 
 typedef enum {
-        PTL_ACK_REQ,
-        PTL_NOACK_REQ
-} ptl_ack_req_t;
-
-typedef void (*ptl_eq_handler_t)(ptl_event_t *event);
-#define PTL_EQ_HANDLER_NONE NULL
+        LNET_ACK_REQ,
+        LNET_NOACK_REQ
+} lnet_ack_req_t;
 
-typedef struct {
-       int max_mes;
-       int max_mds;
-       int max_eqs;
-       int max_ac_index;
-       int max_pt_index;
-       int max_md_iovecs;
-       int max_me_list;
-       int max_getput_md;
-} ptl_ni_limits_t;
-
-/*
- * Status registers
- */
-typedef enum {
-        PTL_SR_DROP_COUNT,
-        PTL_SR_DROP_LENGTH,
-        PTL_SR_RECV_COUNT,
-        PTL_SR_RECV_LENGTH,
-        PTL_SR_SEND_COUNT,
-        PTL_SR_SEND_LENGTH,
-        PTL_SR_MSGS_MAX,
-} ptl_sr_index_t;
-
-typedef int ptl_sr_value_t;
-
-typedef int ptl_interface_t;
-#define PTL_IFACE_DEFAULT    (-1)
+typedef void (*lnet_eq_handler_t)(lnet_event_t *event);
+#define LNET_EQ_HANDLER_NONE NULL
 
 #endif
diff --git a/lnet/include/lnet/winnt/api-support.h b/lnet/include/lnet/winnt/api-support.h
new file mode 100644 (file)
index 0000000..8806981
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef __WINNT_API_SUPPORT_H__
+#define __WINNT_API_SUPPORT_H__
+
+#ifndef __LNET_API_SUPPORT_H__
+#error Do not #include this file directly. #include <lnet/api-support.h> instead
+#endif
+
+
+#endif
diff --git a/lnet/include/lnet/winnt/lib-lnet.h b/lnet/include/lnet/winnt/lib-lnet.h
new file mode 100644 (file)
index 0000000..bb3e5af
--- /dev/null
@@ -0,0 +1,25 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef __LNET_WINNT_LIB_LNET_H__
+#define __LNET_WINNT_LIB_LNET_H__
+
+#ifndef __LNET_LIB_LNET_H__
+#error Do not #include this file directly. #include <lnet/lib-lnet.h> instead
+#endif
+
+#ifdef __KERNEL__
+# include <libcfs/libcfs.h>
+# include <libcfs/kp30.h>
+
+static inline __u64
+lnet_page2phys (struct page *p)
+{
+    return 0;
+}
+
+#else  /* __KERNEL__ */
+
+#endif
+
+#endif /* __LNET_WINNT_LIB_LNET_H__ */
diff --git a/lnet/include/lnet/winnt/lib-types.h b/lnet/include/lnet/winnt/lib-types.h
new file mode 100644 (file)
index 0000000..33a3134
--- /dev/null
@@ -0,0 +1,55 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef __LNET_WINNT_LIB_TYPES_H__
+#define __LNET_WINNT_LIB_TYPES_H__
+
+#ifndef __LNET_LIB_TYPES_H__
+#error Do not #include this file directly. #include <lnet/lib-types.h> instead
+#endif
+
+#include <libcfs/libcfs.h>
+
+typedef struct {
+    spinlock_t lock;
+} lib_ni_lock_t;
+
+static inline void lib_ni_lock_init(lib_ni_lock_t *l)
+{
+        spin_lock_init(&l->lock);
+}
+
+static inline void lib_ni_lock_fini(lib_ni_lock_t *l)
+{}
+
+static inline void lib_ni_lock(lib_ni_lock_t *l)
+{
+        int     flags;
+        spin_lock_irqsave(&l->lock, flags);
+}
+
+static inline void lib_ni_unlock(lib_ni_lock_t *l)
+{
+        spin_unlock_irqrestore(&l->lock, 0);
+}
+
+#endif
diff --git a/lnet/include/lnet/winnt/lnet.h b/lnet/include/lnet/winnt/lnet.h
new file mode 100644 (file)
index 0000000..7a3d24d
--- /dev/null
@@ -0,0 +1,511 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#ifndef __LNET_LINUX_LNET_H__
+#define __LNET_LINUX_LNET_H__
+
+#ifndef __LNET_H__
+#error Do not #include this file directly. #include <lnet/lnet.h> instead
+#endif
+
+#ifdef __KERNEL__
+
+#include <libcfs/libcfs.h>
+#include <lnet/lib-lnet.h>
+
+/*
+ * tdilnd routines
+ */
+
+
+PUCHAR
+KsNtStatusToString (IN NTSTATUS Status);
+
+
+VOID
+KsPrintf(
+    IN LONG  DebugPrintLevel,
+    IN PCHAR DebugMessage,
+    IN ...
+    );
+
+
+ksock_mdl_t *
+ks_lock_iovs(
+    IN struct iovec  *iov,
+    IN int            niov,
+    IN int            recv,
+    IN int *          len
+    );
+
+ksock_mdl_t *
+ks_lock_kiovs(
+    IN lnet_kiov_t *   kiov,
+    IN int            nkiov,
+    IN int            recv,
+    IN int *          len
+    );
+
+int
+ks_send_mdl(
+    ksock_tconn_t * tconn,
+    void *          tx,
+    ksock_mdl_t *   mdl,
+    int             len,
+    int             flags
+    );
+
+int
+ks_query_data(
+    ksock_tconn_t * tconn,
+    size_t *        size,
+    int             bIsExpedited);
+
+int
+ks_recv_mdl(
+    ksock_tconn_t * tconn,
+    ksock_mdl_t *   mdl,
+    int             size,
+    int             flags
+    );
+
+int
+ks_get_tcp_option (
+    ksock_tconn_t *     tconn,
+    ULONG               ID,
+    PVOID               OptionValue,
+    PULONG              Length
+    );
+
+NTSTATUS
+ks_set_tcp_option (
+    ksock_tconn_t * tconn,
+    ULONG           ID,
+    PVOID           OptionValue,
+    ULONG           Length
+    );
+
+int
+ks_bind_tconn (
+    ksock_tconn_t * tconn,
+    ksock_tconn_t * parent,
+    ulong_ptr   addr,
+    unsigned short  port
+    );
+
+int
+ks_build_tconn(
+    ksock_tconn_t *                 tconn,
+    ulong_ptr                   addr,
+    unsigned short                  port
+    );
+
+int
+ks_disconnect_tconn(
+    ksock_tconn_t *     tconn,
+    ulong_ptr       flags
+    );
+
+void
+ks_abort_tconn(
+    ksock_tconn_t *     tconn
+    );
+
+int
+ks_query_local_ipaddr(
+    ksock_tconn_t *     tconn
+    );
+
+int
+ks_tconn_write (ksock_tconn_t *tconn, void *buffer, int nob);
+
+int
+ks_tconn_read (ksock_tconn_t * tconn, void *buffer, int nob);
+
+NTSTATUS
+KsTcpCompletionRoutine(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    );
+
+NTSTATUS
+KsDisconectCompletionRoutine (
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    );
+
+NTSTATUS
+KsTcpReceiveCompletionRoutine(
+    IN PIRP                         Irp,
+    IN PKS_TCP_COMPLETION_CONTEXT   Context
+    );
+
+NTSTATUS
+KsTcpSendCompletionRoutine(
+    IN PIRP                         Irp,
+    IN PKS_TCP_COMPLETION_CONTEXT   Context
+    );
+
+NTSTATUS
+KsAcceptCompletionRoutine(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    );
+
+
+NTSTATUS
+KsConnectEventHandler(
+    IN PVOID                    TdiEventContext,
+    IN LONG                     RemoteAddressLength,
+    IN PVOID                    RemoteAddress,
+    IN LONG                     UserDataLength,
+    IN PVOID                    UserData,
+    IN LONG                     OptionsLength,
+    IN PVOID                    Options,
+    OUT CONNECTION_CONTEXT *    ConnectionContext,
+    OUT PIRP *                  AcceptIrp
+    );
+
+NTSTATUS 
+KsDisconnectEventHandler(
+    IN PVOID                TdiEventContext,
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN LONG                 DisconnectDataLength,
+    IN PVOID                DisconnectData,
+    IN LONG                 DisconnectInformationLength,
+    IN PVOID                DisconnectInformation,
+    IN ULONG                DisconnectFlags
+    );
+
+NTSTATUS
+KsTcpReceiveEventHandler(
+    IN PVOID                TdiEventContext, 
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN ULONG                ReceiveFlags,
+    IN ULONG                BytesIndicated,
+    IN ULONG                BytesAvailable,
+    OUT ULONG *             BytesTaken,
+    IN PVOID                Tsdu,
+    OUT PIRP *              IoRequestPacket
+   );
+
+NTSTATUS
+KsTcpReceiveExpeditedEventHandler(
+    IN PVOID                TdiEventContext, 
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN ULONG                ReceiveFlags,
+    IN ULONG                BytesIndicated,
+    IN ULONG                BytesAvailable,
+    OUT ULONG *             BytesTaken,
+    IN PVOID                Tsdu,
+    OUT PIRP *              IoRequestPacket
+    );
+
+NTSTATUS
+KsTcpChainedReceiveEventHandler (
+    IN PVOID TdiEventContext,       // the event context
+    IN CONNECTION_CONTEXT ConnectionContext,
+    IN ULONG ReceiveFlags, 
+    IN ULONG ReceiveLength,
+    IN ULONG StartingOffset,        // offset of start of client data in TSDU
+    IN PMDL  Tsdu,                  // TSDU data chain
+    IN PVOID TsduDescriptor         // for call to TdiReturnChainedReceives
+    );
+
+NTSTATUS
+KsTcpChainedReceiveExpeditedEventHandler (
+    IN PVOID                TdiEventContext,       // the event context
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN ULONG                ReceiveFlags, 
+    IN ULONG                ReceiveLength,
+    IN ULONG                StartingOffset,        // offset of start of client data in TSDU
+    IN PMDL                 Tsdu,                  // TSDU data chain
+    IN PVOID                TsduDescriptor         // for call to TdiReturnChainedReceives
+    );
+
+
+
+VOID
+KsDisconnectHelper(PKS_DISCONNECT_WORKITEM WorkItem);
+
+
+ULONG
+ks_tdi_send_flags(ULONG SockFlags);
+
+PIRP
+KsBuildTdiIrp(
+    IN PDEVICE_OBJECT    DeviceObject
+    );
+
+NTSTATUS
+KsSubmitTdiIrp(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN BOOLEAN          bSynchronous,
+    OUT PULONG          Information
+    );
+
+NTSTATUS
+KsOpenControl(
+    IN PUNICODE_STRING      DeviceName,
+    OUT HANDLE *            Handle,
+    OUT PFILE_OBJECT *      FileObject
+   );
+
+NTSTATUS
+KsCloseControl(
+    IN HANDLE             Handle,
+    IN PFILE_OBJECT       FileObject
+   );
+
+NTSTATUS
+KsOpenAddress(
+    IN PUNICODE_STRING      DeviceName,
+    IN PTRANSPORT_ADDRESS   pAddress,
+    IN ULONG                AddressLength,
+    OUT HANDLE *            Handle,
+    OUT PFILE_OBJECT *      FileObject
+   );
+
+NTSTATUS
+KsCloseAddress(
+    IN HANDLE             Handle,
+    IN PFILE_OBJECT       FileObject
+    );
+
+NTSTATUS
+KsOpenConnection(
+    IN PUNICODE_STRING      DeviceName,
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    OUT HANDLE *            Handle,
+    OUT PFILE_OBJECT *      FileObject
+   );
+
+NTSTATUS
+KsCloseConnection(
+    IN HANDLE             Handle,
+    IN PFILE_OBJECT       FileObject
+    );
+
+NTSTATUS
+KsAssociateAddress(
+    IN HANDLE           AddressHandle,
+    IN PFILE_OBJECT     ConnectionObject
+    );
+
+
+NTSTATUS
+KsDisassociateAddress(
+    IN PFILE_OBJECT     ConnectionObject
+    );
+
+
+NTSTATUS
+KsSetEventHandlers(
+    IN PFILE_OBJECT         AddressObject,
+    IN PVOID                EventContext,
+    IN PKS_EVENT_HANDLERS   Handlers
+   );
+
+
+NTSTATUS
+KsQueryProviderInfo(
+    PWSTR               TdiDeviceName,
+    PTDI_PROVIDER_INFO  ProviderInfo
+   );
+
+NTSTATUS
+KsQueryAddressInfo(
+    IN PFILE_OBJECT         FileObject,
+    OUT PTDI_ADDRESS_INFO   AddressInfo,
+    OUT PULONG              AddressSize
+   );
+
+NTSTATUS
+KsQueryConnectionInfo(
+    IN PFILE_OBJECT            ConnectionObject,
+    OUT PTDI_CONNECTION_INFO   ConnectionInfo,
+    OUT PULONG                 ConnectionSize
+   );
+
+ULONG
+KsInitializeTdiAddress(
+    IN OUT PTA_IP_ADDRESS   pTransportAddress,
+    IN ULONG                IpAddress,
+    IN USHORT               IpPort
+    );
+
+ULONG
+KsQueryMdlsSize (IN PMDL Mdl);
+
+
+ULONG
+KsQueryTdiAddressLength(
+    OUT PTRANSPORT_ADDRESS   pTransportAddress
+    );
+
+NTSTATUS
+KsQueryIpAddress(
+    IN PFILE_OBJECT     FileObject,
+    OUT PVOID           TdiAddress,
+    OUT ULONG*          AddressLength
+    );
+
+
+NTSTATUS
+KsErrorEventHandler(
+    IN PVOID            TdiEventContext,
+    IN NTSTATUS         Status
+   );
+
+int
+ks_set_handlers(
+    ksock_tconn_t *     tconn
+    );
+
+
+VOID
+KsPrintProviderInfo(
+   PWSTR DeviceName,
+   PTDI_PROVIDER_INFO ProviderInfo
+   );
+
+ksock_tconn_t *
+ks_create_tconn();
+
+void
+ks_free_tconn(
+    ksock_tconn_t * tconn
+    );
+
+void
+ks_init_listener(
+    ksock_tconn_t * tconn
+    );
+
+void
+ks_init_sender(
+    ksock_tconn_t * tconn
+    );
+
+void
+ks_init_child(
+    ksock_tconn_t * tconn
+    );
+
+void
+ks_get_tconn(
+    ksock_tconn_t * tconn
+    );
+
+void
+ks_put_tconn(
+    ksock_tconn_t * tconn
+    );
+
+int
+ks_reset_handlers(
+    ksock_tconn_t *     tconn
+    );
+
+void
+ks_destroy_tconn(
+    ksock_tconn_t *     tconn
+    );
+
+
+PKS_TSDU
+KsAllocateKsTsdu();
+
+VOID
+KsPutKsTsdu(
+    PKS_TSDU  KsTsdu
+    );
+
+VOID
+KsFreeKsTsdu(
+    PKS_TSDU  KsTsdu
+    );
+
+VOID
+KsInitializeKsTsdu(
+    PKS_TSDU    KsTsdu,
+    ULONG       Length
+    );
+
+
+VOID
+KsInitializeKsTsduMgr(
+    PKS_TSDUMGR     TsduMgr
+    );
+
+VOID
+KsInitializeKsChain(
+    PKS_CHAIN       KsChain
+    );
+
+NTSTATUS
+KsCleanupTsduMgr(
+    PKS_TSDUMGR     KsTsduMgr
+    );
+
+NTSTATUS
+KsCleanupKsChain(
+    PKS_CHAIN   KsChain
+    );
+
+NTSTATUS
+KsCleanupTsdu(
+    ksock_tconn_t * tconn
+    );
+
+NTSTATUS
+KsCopyMdlChainToMdlChain(
+    IN PMDL     SourceMdlChain,
+    IN ULONG    SourceOffset,
+    IN PMDL     DestinationMdlChain,
+    IN ULONG    DestinationOffset,
+    IN ULONG    BytesTobecopied,
+    OUT PULONG  BytesCopied
+    );
+
+ULONG
+KsQueryMdlsSize (PMDL Mdl);
+
+NTSTATUS
+KsLockUserBuffer (
+    IN PVOID            UserBuffer,
+    IN BOOLEAN          bPaged,
+    IN ULONG            Length,
+    IN LOCK_OPERATION   Operation,
+    OUT PMDL *          pMdl
+    );
+
+PVOID
+KsMapMdlBuffer (PMDL    Mdl);
+
+VOID
+KsReleaseMdl ( IN PMDL   Mdl,
+               IN int    Paged );
+
+int
+ks_lock_buffer (
+    void *            buffer,
+    int               paged,
+    int               length,
+    LOCK_OPERATION    access,
+    ksock_mdl_t **    kmdl
+    );
+
+void *
+ks_map_mdl (ksock_mdl_t * mdl);
+
+void
+ks_release_mdl (ksock_mdl_t *mdl, int paged);
+
+#endif /* __KERNEL__ */
+
+#endif
index f494a30..d4e034c 100644 (file)
@@ -1,10 +1,13 @@
-@BUILD_GMNAL_TRUE@subdir-m += gmnal
-@BUILD_RANAL_TRUE@subdir-m += ranal
-@BUILD_OPENIBNAL_TRUE@subdir-m += openibnal
-@BUILD_IIBNAL_TRUE@subdir-m += iibnal
-@BUILD_VIBNAL_TRUE@subdir-m += vibnal
-@BUILD_QSWNAL_TRUE@subdir-m += qswnal
-subdir-m += socknal
-subdir-m += lonal
+@BUILD_GMLND_TRUE@subdir-m += gmlnd
+@BUILD_MXLND_TRUE@subdir-m += mxlnd
+@BUILD_RALND_TRUE@subdir-m += ralnd
+@BUILD_O2IBLND_TRUE@subdir-m += o2iblnd
+@BUILD_OPENIBLND_TRUE@subdir-m += openiblnd
+@BUILD_CIBLND_TRUE@subdir-m += ciblnd
+@BUILD_IIBLND_TRUE@subdir-m += iiblnd
+@BUILD_VIBLND_TRUE@subdir-m += viblnd
+@BUILD_QSWLND_TRUE@subdir-m += qswlnd
+@BUILD_PTLLND_TRUE@subdir-m += ptllnd
+subdir-m += socklnd
 
 @INCLUDE_RULES@
index d28e365..e6d0146 100644 (file)
@@ -3,4 +3,4 @@
 # This code is issued under the GNU General Public License.
 # See the file COPYING in this distribution
 
-SUBDIRS = lonal socknal qswnal gmnal openibnal iibnal vibnal ranal
+SUBDIRS = socklnd qswlnd gmlnd mxlnd openiblnd iiblnd viblnd ralnd ptllnd ciblnd o2iblnd
diff --git a/lnet/klnds/ciblnd/Makefile.in b/lnet/klnds/ciblnd/Makefile.in
new file mode 100644 (file)
index 0000000..55311ad
--- /dev/null
@@ -0,0 +1,8 @@
+MODULES := kciblnd
+kciblnd-objs := ciblnd.o ciblnd_cb.o ciblnd_modparams.o
+
+default: all
+
+EXTRA_POST_CFLAGS := @CIBCPPFLAGS@ -I@LUSTRE@/../lnet/klnds/openiblnd
+
+@INCLUDE_RULES@
similarity index 54%
rename from lnet/klnds/lolnd/autoMakefile.am
rename to lnet/klnds/ciblnd/autoMakefile.am
index f7d04f7..cae5cfc 100644 (file)
@@ -4,12 +4,11 @@
 # See the file COPYING in this distribution
 
 if MODULES
-if !CRAY_PORTALS
-if LINUX
-modulenet_DATA = klonal$(KMODEXT)
-endif
+if BUILD_CIBLND
+modulenet_DATA = kciblnd$(KMODEXT)
 endif
 endif
 
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES = $(klonal-objs:%.o=%.c) lonal.h
+MOSTLYCLEANFILES = @MOSTLYCLEANFILES@
+DIST_SOURCES = $(kciblnd-objs:%.o=%.c)
+
diff --git a/lnet/klnds/ciblnd/ciblnd.c b/lnet/klnds/ciblnd/ciblnd.c
new file mode 100644 (file)
index 0000000..e139484
--- /dev/null
@@ -0,0 +1 @@
+#include "openiblnd.c"
diff --git a/lnet/klnds/ciblnd/ciblnd_cb.c b/lnet/klnds/ciblnd/ciblnd_cb.c
new file mode 100644 (file)
index 0000000..893e16d
--- /dev/null
@@ -0,0 +1 @@
+#include "openiblnd_cb.c"
diff --git a/lnet/klnds/ciblnd/ciblnd_modparams.c b/lnet/klnds/ciblnd/ciblnd_modparams.c
new file mode 100644 (file)
index 0000000..a0c6b1f
--- /dev/null
@@ -0,0 +1 @@
+#include "openiblnd_modparams.c"
index 2efbea7..1aec50d 100644 (file)
@@ -1,5 +1,5 @@
-MODULES := kgmnal
-kgmnal-objs := gmnal_api.o gmnal_cb.o gmnal_comm.o gmnal_utils.o gmnal_module.o
+MODULES := kgmlnd
+kgmlnd-objs := gmlnd_api.o gmlnd_cb.o gmlnd_comm.o gmlnd_utils.o gmlnd_module.o
 
 EXTRA_PRE_CFLAGS := @GMCPPFLAGS@ -DGM_KERNEL
 
diff --git a/lnet/klnds/gmlnd/README b/lnet/klnds/gmlnd/README
new file mode 100644 (file)
index 0000000..ac2e23d
--- /dev/null
@@ -0,0 +1,73 @@
+1. This version of the GM nal requires an unreleased extension to the GM API to
+   map physical memory: gm_register_memory_ex_phys(). This allows it to avoid
+   ENOMEM problems associated with large contiguous buffer allocation.
+
+2. ./configure --with-gm=<path-to-gm-source-tree> \
+   [--with-gm-install=<path-to-gm-installation>]
+
+   If the sources do not support gm_register_memory_ex_phys(), configure flags
+   an error. In this case you should apply the patch and rebuild and re-install
+   GM as directed in the error message.
+
+   By default GM is installed in /opt/gm. If an alternate path was specified to
+   <GM-sources>/binary/GM_INSTALL, you should also specify --with-gm-install
+   with the same path.
+
+3. The GM timeout is 300 seconds; i.e. the network may not release resources
+   claimed by communications stalled with a crashing node for this time.
+   Default gmnal buffer tuning parameters (see (4) below) have been chosen to
+   minimize this problem and prevent lustre having to block for resources.
+   However in some situations, where all network buffers are busy, the default
+   lustre timeout (various, scaled from the base timeout of 100 seconds) may be
+   too small and the only solution may be to increase the lustre timeout
+   dramatically.
+
+4. The gmnal has the following module parameters...
+
+   gmnal_port              The GM port that the NAL will use (default 4) 
+                           Change this if it conflicts with site usage.
+
+   gmnal_ntx               The number of "normal" transmit descriptors (default
+                           32). When this pool is exhausted, threads sending
+                           and receiving on the network block until in-progress
+                           transmits have completed. Each descriptor consumes 1
+                           GM_MTU sized buffer. 
+
+   gmnal_ntx_nblk          The number of "reserved" transmit descriptors
+                           (default 256). This pool is reserved for responses to
+                           incoming communications that may not block. Increase
+                           only if console error messages indicates the pool
+                           has been exhausted (LustreError: Can't get tx for
+                           msg type...) Each descriptor consumes 1 GM_MTU sized
+                           buffer.
+
+   gmnal_nlarge_tx_bufs    The number of 1MByte transmit buffers to reserve at
+                           startup (default 32). This controls the number of
+                           concurrent sends larger that GM_MTU. It can be
+                           reduced to conserve memory, or increased to increase
+                           large message sending concurrency. 
+
+   gmnal_nrx_small         The number of GM_MTU sized receive buffers posted to
+                           receive from the network (default 128). Increase if
+                           congestion is suspected, however note that the total
+                           number of receives that can be posted at any time is
+                           limited by the number of GM receive tokens
+                           available. If there are too few, this, and
+                           gmnal_nrx_large are scaled back accordingly. 
+
+   gmnal_nrx_large         The number of 1MByte receive buffers posted to
+                           receive from the network (default 64). Increase if
+                           the number of OST threads is increased. But note
+                           that the total number of receives that can be posted
+                           at any time is limited by the number of GM receive
+                           tokens available. If there are too few, this, and
+                           gmnal_nrx_small are scaled back accordingly. 
+
+5. Network configuration for GM is done in an lmc script as follows...
+
+   GM2NID=${path-to-lustre-tree}/portals/utils/gmnalnid
+
+   ${LMC} --node some_server --add net --nettype gm --nid `$GM2NID -n some_server`
+
+   ${LMC} --node client --add net --nettype gm --nid '*'
+
index 8c3b7c0..6ff7933 100644 (file)
@@ -4,12 +4,10 @@
 # See the file COPYING in this distribution
 
 if MODULES
-if BUILD_GMNAL
-if !CRAY_PORTALS
-modulenet_DATA = kgmnal$(KMODEXT)
-endif
+if BUILD_GMLND
+modulenet_DATA = kgmlnd$(KMODEXT)
 endif
 endif
 
 MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES = $(kgmnal-objs:%.o=%.c) gmnal.h
+DIST_SOURCES = $(kgmlnd-objs:%.o=%.c) gmlnd.h
diff --git a/lnet/klnds/gmlnd/gm-reg-phys.patch b/lnet/klnds/gmlnd/gm-reg-phys.patch
new file mode 100644 (file)
index 0000000..df32a21
--- /dev/null
@@ -0,0 +1,107 @@
+Index: libgm/gm_register.c
+===================================================================
+RCS file: /repository/gm/libgm/gm_register.c,v
+retrieving revision 1.9.16.3
+diff -u -r1.9.16.3 gm_register.c
+--- libgm/gm_register.c        9 Aug 2005 14:37:02 -0000       1.9.16.3
++++ libgm/gm_register.c        25 Aug 2005 21:35:58 -0000
+@@ -77,20 +77,14 @@
+  
+ */
+-GM_ENTRY_POINT
+-gm_status_t
+-gm_register_memory_ex (gm_port_t *p, void *_ptr, gm_size_t length, void *_pvma)
++static gm_status_t
++_gm_register_memory (gm_port_t *p, int is_physical, gm_u64_t ptr, gm_size_t length, gm_up_t pvma)
+ {
+   gm_status_t status;
+-  gm_up_t ptr;
+-  gm_up_t pvma;
+   GM_CALLED_WITH_ARGS (("%p,%p,"GM_U64_TMPL",%p",
+                       p, _ptr, GM_U64_ARG (length), _pvma));
+     
+-  ptr = GM_PTR_TO_UP (_ptr);
+-  pvma = GM_PTR_TO_UP (_pvma);
+-  
+ #if !GM_KERNEL && !GM_CAN_REGISTER_MEMORY
+   GM_PARAMETER_MAY_BE_UNUSED (p);
+   GM_PARAMETER_MAY_BE_UNUSED (ptr);
+@@ -160,7 +154,7 @@
+       status = gm_add_mapping_to_page_table (ps,
+                                              ptr + offset,
+                                              pvma + offset,
+-                                             GM_INVALID_DMA_PAGE);
++                                             is_physical ? ptr + offset : GM_INVALID_DMA_PAGE);
+       if (status != GM_SUCCESS)
+         {
+           status = GM_INVALID_PARAMETER;
+@@ -317,13 +311,31 @@
+  
+ */
++#if GM_KERNEL && (GM_CPU_x86 || GM_CPU_x86_64 || GM_CPU_ia64)
++/* only architecture where pci bus addr == physical address can use
++   such a simple scheme */
++GM_ENTRY_POINT gm_status_t
++gm_register_memory_ex_phys (struct gm_port *p,
++                          gm_u64_t phys, gm_size_t length,
++                          gm_up_t pvma)
++{
++  return _gm_register_memory(p, 1, phys, length, (gm_size_t)pvma);
++}
++#endif
++
++GM_ENTRY_POINT gm_status_t
++gm_register_memory_ex (gm_port_t *p, void *ptr, gm_size_t length, void *pvma)
++{
++  return _gm_register_memory(p, 0, (gm_size_t)ptr, length, (gm_size_t)pvma);
++}
++
+ GM_ENTRY_POINT gm_status_t
+ gm_register_memory (gm_port_t *p, void *ptr, gm_size_t length)
+ {
+   gm_status_t status;
+   
+   GM_CALLED_WITH_ARGS (("%p,%p,"GM_U64_TMPL, p, ptr, GM_U64_ARG (length)));
+-  status = gm_register_memory_ex (p, ptr, length, ptr);
++  status = _gm_register_memory(p, 0, (gm_size_t)ptr, length, (gm_size_t)ptr);
+   GM_RETURN_STATUS (status);
+ }
+Index: include/gm.h
+===================================================================
+RCS file: /repository/gm/include/gm.h,v
+retrieving revision 1.25.10.11
+diff -u -r1.25.10.11 gm.h
+--- include/gm.h       14 Mar 2005 21:42:41 -0000      1.25.10.11
++++ include/gm.h       25 Aug 2005 21:35:58 -0000
+@@ -2676,6 +2676,10 @@
+ GM_ENTRY_POINT gm_status_t gm_register_memory_ex (struct gm_port *p,
+                                                 void *ptr, gm_size_t length,
+                                                 void *pvma);
++
++GM_ENTRY_POINT gm_status_t gm_register_memory_ex_phys (struct gm_port *p,
++                                                     gm_u64_t phys, gm_size_t length,
++                                                     gm_up_t pvma);
+ #endif /* GM_API_VERSION >= GM_API_VERSION_2_0_6 */
+ #if GM_API_VERSION >= GM_API_VERSION_2_1_0
+Index: libgm/gm_reference_api.c
+===================================================================
+RCS file: /repository/gm/libgm/gm_reference_api.c,v
+retrieving revision 1.3.14.1
+diff -u -r1.3.14.1 gm_reference_api.c
+--- libgm/gm_reference_api.c   23 Apr 2004 20:27:29 -0000      1.3.14.1
++++ libgm/gm_reference_api.c   25 Aug 2005 22:39:20 -0000
+@@ -154,6 +154,9 @@
+ GM_REF (gm_register_buffer);
+ GM_REF (gm_register_memory);
+ GM_REF (gm_register_memory_ex);
++#if GM_KERNEL && (GM_CPU_x86 || GM_CPU_x86_64 || GM_CPU_ia64)
++GM_REF (gm_register_memory_ex_phys);
++#endif
+ GM_REF (gm_resume_sending);
+ GM_REF (gm_send);
+ GM_REF (gm_send_to_peer);
index 47d71eb..6936737 100644 (file)
 #include "linux/vmalloc.h"
 #include "linux/sysctl.h"
 
-#define DEBUG_SUBSYSTEM S_NAL
+#define DEBUG_SUBSYSTEM S_LND
 
-#include "portals/nal.h"
-#include "portals/api.h"
-#include "portals/errno.h"
 #include "libcfs/kp30.h"
-#include "portals/p30.h"
-
-#include "portals/nal.h"
-#include "portals/lib-p30.h"
+#include "lnet/lnet.h"
+#include "lnet/lib-lnet.h"
 
 /* undefine these before including the GM headers which clash */
 #undef PACKAGE_BUGREPORT
 #include "gm.h"
 #include "gm_internal.h"
 
-/*
- *      Defines for the API NAL
- */
+/* Fixed tunables */
+#define GMNAL_RESCHED              100          /* # busy loops to force scheduler to yield */
+#define GMNAL_NETADDR_BASE         0x10000000   /* where we start in network VM */
+#define GMNAL_LARGE_PRIORITY       GM_LOW_PRIORITY /* large message GM priority */
+#define GMNAL_SMALL_PRIORITY       GM_LOW_PRIORITY /* small message GM priority */
 
 /* Wire protocol */
-
 typedef struct {
-        ptl_hdr_t       gmim_hdr;               /* portals header */
+        lnet_hdr_t      gmim_hdr;               /* portals header */
         char            gmim_payload[0];        /* payload */
 } gmnal_immediate_msg_t;
 
@@ -109,129 +105,141 @@ typedef struct {
         }               gmm_u;
 } WIRE_ATTR gmnal_msg_t;
 
-#define GMNAL_MSG_MAGIC                 0x6d797269 /* 'myri'! */
+#define GMNAL_MSG_MAGIC                 LNET_PROTO_GM_MAGIC
 #define GMNAL_MSG_VERSION               1
 #define GMNAL_MSG_IMMEDIATE             1
 
+typedef struct netbuf {
+        __u64                    nb_netaddr;    /* network VM address */
+        lnet_kiov_t              nb_kiov[1];    /* the pages (at least 1) */
+} gmnal_netbuf_t;
+
+#define GMNAL_NETBUF_MSG(nb)            ((gmnal_msg_t *)page_address((nb)->nb_kiov[0].kiov_page))
+#define GMNAL_NETBUF_LOCAL_NETADDR(nb)  ((void *)((unsigned long)(nb)->nb_netaddr))
+
+typedef struct gmnal_txbuf {
+        struct list_head         txb_list;      /* queue on gmni_idle_ltxbs */
+        struct gmnal_txbuf      *txb_next;      /* stash on gmni_ltxs */
+        gmnal_netbuf_t           txb_buf;       /* space */
+} gmnal_txbuf_t;
+
 typedef struct gmnal_tx {
-        struct gmnal_tx         *tx_next;
-        gmnal_msg_t             *tx_msg;
-        int                      tx_buffer_size;
-        gm_size_t                tx_gm_size;
-        int                      tx_msg_size;
-        int                      tx_gmlid;
-        int                      tx_gm_priority;
-        ptl_nid_t                tx_nid;
-        struct gmnal_ni         *tx_gmni;
-        lib_msg_t               *tx_libmsg;
-        int                      tx_rxt; 
+        struct list_head         tx_list;       /* queue */
+        int                      tx_credit:1;   /* consumed a credit? */
+        int                      tx_large_iskiov:1; /* large is in kiovs? */
+        struct gmnal_ni         *tx_gmni;       /* owning NI */
+        lnet_nid_t               tx_nid;        /* destination NID */
+        int                      tx_gmlid;      /* destination GM local ID */
+        lnet_msg_t              *tx_lntmsg;     /* lntmsg to finalize on completion */
+
+        gmnal_netbuf_t           tx_buf;        /* small tx buffer */
+        gmnal_txbuf_t           *tx_ltxb;       /* large buffer (to free on completion) */
+        int                      tx_msgnob;     /* message size (so far) */
+
+        int                      tx_large_nob;  /* # bytes large buffer payload */
+        int                      tx_large_offset; /* offset within frags */
+        int                      tx_large_niov; /* # VM frags */
+        union {
+                struct iovec    *iov;           /* mapped frags */
+                lnet_kiov_t     *kiov;          /* page frags */
+        }                        tx_large_frags;
+        unsigned long            tx_launchtime; /* when (in jiffies) the transmit was launched */
+        struct gmnal_tx         *tx_next;       /* stash on gmni_txs */
 } gmnal_tx_t;
 
-/*
- *      as for gmnal_tx_t 
- *      a hash table in nal_data find rxs from
- *      the rx buffer address. hash table populated at init time
- */
 typedef struct gmnal_rx {
-        struct list_head         rx_list;
-        gmnal_msg_t             *rx_msg;
-        int                      rx_size;
-        gm_size_t                rx_gmsize;
-        unsigned int             rx_recv_nob;
-        __u16                    rx_recv_gmid;
-        __u8                     rx_recv_port;
-        __u8                     rx_recv_type;
-        struct gmnal_rx         *rx_next;
+        struct list_head         rx_list;      /* enqueue on gmni_rxq for handling */
+        int                      rx_islarge:1;  /* large receive buffer? */
+        unsigned int             rx_recv_nob;  /* bytes received */
+        __u16                    rx_recv_gmid; /* sender */
+        __u8                     rx_recv_port; /* sender's port */
+        __u8                     rx_recv_type; /* ?? */
+        struct gmnal_rx         *rx_next;      /* stash on gmni_rxs */
+        gmnal_netbuf_t           rx_buf;        /* the buffer */
 } gmnal_rx_t;
 
-
-/*
- *      1 receive thread started on each CPU
- */
-#define NRXTHREADS 10 /* max number of receiver threads */
-
 typedef struct gmnal_ni {
-        spinlock_t       gmni_tx_lock;
-        struct semaphore gmni_tx_token;
-        gmnal_tx_t      *gmni_tx;
-        spinlock_t       gmni_rxt_tx_lock;
-        struct semaphore gmni_rxt_tx_token;
-        gmnal_tx_t      *gmni_rxt_tx;
-        gmnal_rx_t      *gmni_rx;
-        struct gm_hash  *gmni_rx_hash;
-        lib_nal_t       *gmni_libnal;
-        struct gm_port  *gmni_port;
-        spinlock_t       gmni_gm_lock;          /* serialise GM calls */
-        atomic_t         gmni_nthreads;
-        int              gmni_nrxthreads;
-        long             gmni_rxthread_pid[NRXTHREADS];
-        gm_alarm_t       gmni_ctthread_alarm;
-        int              gmni_thread_shutdown;
-        int              gmni_msg_size;
-        struct list_head gmni_rxq;
-        spinlock_t       gmni_rxq_lock;
-        struct semaphore gmni_rxq_wait;
+        lnet_ni_t        *gmni_ni;              /* generic NI */
+        struct gm_port   *gmni_port;            /* GM port */
+        spinlock_t        gmni_gm_lock;         /* serialise GM calls */
+        int               gmni_large_pages;     /* # pages in a large message buffer */
+        int               gmni_large_msgsize;   /* nob in large message buffers */
+        int               gmni_large_gmsize;    /* large message GM bucket */
+        int               gmni_small_msgsize;   /* nob in small message buffers */
+        int               gmni_small_gmsize;    /* small message GM bucket */
+        __u64             gmni_netaddr_base;    /* base of mapped network VM */
+        int               gmni_netaddr_size;    /* # bytes of mapped network VM */
+
+        gmnal_tx_t       *gmni_txs;             /* all txs */
+        gmnal_rx_t       *gmni_rxs;            /* all rx descs */
+        gmnal_txbuf_t    *gmni_ltxbs;           /* all large tx bufs */
+        
+        atomic_t          gmni_nthreads;        /* total # threads */
+        gm_alarm_t        gmni_alarm;           /* alarm to wake caretaker */
+        int               gmni_shutdown;       /* tell all threads to exit */
+
+        struct list_head  gmni_idle_txs;        /* idle tx's */
+        int               gmni_tx_credits;      /* # transmits still possible */
+        struct list_head  gmni_idle_ltxbs;      /* idle large tx buffers */
+        struct list_head  gmni_buf_txq;         /* tx's waiting for buffers */
+        struct list_head  gmni_cred_txq;        /* tx's waiting for credits */
+        spinlock_t        gmni_tx_lock;         /* serialise */
+
+        struct gm_hash   *gmni_rx_hash;                /* buffer->rx lookup */
+        struct semaphore  gmni_rx_mutex;        /* serialise blocking on GM */
 } gmnal_ni_t;
 
-
-/*
- * for ioctl get pid
- */
-#define GMNAL_IOC_GET_GNID 1    
+typedef struct {
+        int              *gm_port;
+        int              *gm_ntx;
+        int              *gm_credits;
+        int              *gm_peer_credits;
+        int              *gm_nlarge_tx_bufs;
+        int              *gm_nrx_small;
+        int              *gm_nrx_large;
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+        struct ctl_table_header *gm_sysctl;    /* sysctl interface */
+#endif
+} gmnal_tunables_t;
 
 
 /* gmnal_api.c */
 int gmnal_init(void);
-void  gmnal_fini(void);
+void gmnal_fini(void);
+int gmnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int gmnal_startup(lnet_ni_t *ni);
+void gmnal_shutdown(lnet_ni_t *ni);
 
 /* gmnal_cb.c */
-ptl_err_t gmnal_cb_recv(lib_nal_t *libnal, void *private, 
-                        lib_msg_t *libmsg,
-                        unsigned int niov, struct iovec *iov, 
-                        size_t offset, size_t mlen, size_t rlen);
-ptl_err_t gmnal_cb_recv_pages(lib_nal_t *libnal, void *private, 
-                              lib_msg_t *libmsg, 
-                              unsigned int nkiov, ptl_kiov_t *kiov, 
-                              size_t offset, size_t mlen, size_t rlen);
-ptl_err_t gmnal_cb_send(lib_nal_t *libnal, void *private, 
-                        lib_msg_t *libmsg, ptl_hdr_t *hdr, int type, 
-                        ptl_nid_t nid, ptl_pid_t pid,
-                        unsigned int niov, struct iovec *iov, 
-                        size_t offset, size_t len);
-ptl_err_t gmnal_cb_send_pages(lib_nal_t *libnal, void *private,
-                              lib_msg_t *libmsg, ptl_hdr_t *hdr, int type,
-                              ptl_nid_t nid, ptl_pid_t pid, 
-                              unsigned int nkiov, ptl_kiov_t *kiov, 
-                              size_t offset, size_t len);
-int gmnal_cb_dist(lib_nal_t *libnal, ptl_nid_t nid, unsigned long *dist);
+int gmnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+               int delayed, unsigned int niov, 
+               struct iovec *iov, lnet_kiov_t *kiov,
+               unsigned int offset, unsigned int mlen, unsigned int rlen);
+int gmnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
 
 /* gmnal_util.c */
-int gmnal_is_rxthread(gmnal_ni_t *gmnalni);
-int gmnal_alloc_txs(gmnal_ni_t *gmnalni);
-void gmnal_free_txs(gmnal_ni_t *gmnalni);
-gmnal_tx_t *gmnal_get_tx(gmnal_ni_t *gmnalni, int block);
-void gmnal_return_tx(gmnal_ni_t *gmnalni, gmnal_tx_t *tx);
-int gmnal_alloc_rxs(gmnal_ni_t *gmnalni);
-void gmnal_free_rxs(gmnal_ni_t *gmnalni);
+void gmnal_free_ltxbufs(gmnal_ni_t *gmni);
+int gmnal_alloc_ltxbufs(gmnal_ni_t *gmni);
+void gmnal_free_txs(gmnal_ni_t *gmni);
+int gmnal_alloc_txs(gmnal_ni_t *gmni);
+void gmnal_free_rxs(gmnal_ni_t *gmni);
+int gmnal_alloc_rxs(gmnal_ni_t *gmni);
 char *gmnal_gmstatus2str(gm_status_t status);
 char *gmnal_rxevent2str(gm_recv_event_t *ev);
 void gmnal_yield(int delay);
-int gmnal_enqueue_rx(gmnal_ni_t *gmnalni, gm_recv_t *recv);
-gmnal_rx_t *gmnal_dequeue_rx(gmnal_ni_t *gmnalni);
-void gmnal_stop_threads(gmnal_ni_t *gmnalni);
-int gmnal_start_threads(gmnal_ni_t *gmnalni);
 
 /* gmnal_comm.c */
-void gmnal_pack_msg(gmnal_ni_t *gmnalni, gmnal_tx_t *tx,
-                    ptl_nid_t dstnid, int type);
-int gmnal_ct_thread(void *arg);
-int gmnal_rx_thread(void *arg);
-void gmnal_post_rx(gmnal_ni_t *gmnalni, gmnal_rx_t *rx);
-ptl_err_t gmnal_post_tx(gmnal_ni_t *gmnalni, gmnal_tx_t *tx, 
-                        lib_msg_t *libmsg, ptl_nid_t nid, int nob);
+void gmnal_post_rx(gmnal_ni_t *gmni, gmnal_rx_t *rx);
+gmnal_tx_t *gmnal_get_tx(gmnal_ni_t *gmni);
+void gmnal_tx_done(gmnal_tx_t *tx, int rc);
+void gmnal_pack_msg(gmnal_ni_t *gmni, gmnal_msg_t *msg,
+                    lnet_nid_t dstnid, int type);
+void gmnal_stop_threads(gmnal_ni_t *gmni);
+int gmnal_start_threads(gmnal_ni_t *gmni);
+void gmnal_check_txqueues_locked (gmnal_ni_t *gmni);
 
 /* Module Parameters */
-extern  int num_txds;
-extern  int gm_port_id;
+extern gmnal_tunables_t gmnal_tunables;
 
 #endif /*__INCLUDE_GMNAL_H__*/
index 6597cb5..a5c426f 100644 (file)
  *     Implements the API NAL functions
  */
 
-#include "gmnal.h"
+#include "gmlnd.h"
 
-int
-gmnal_cmd(struct portals_cfg *pcfg, void *private)
+lnd_t the_gmlnd =
 {
-       gmnal_ni_t      *gmnalni = private;
-       char            *name;
-       int              nid;
-       int              gmid;
-       gm_status_t      gm_status;
-
-       CDEBUG(D_TRACE, "gmnal_cmd [%d] private [%p]\n",
-              pcfg->pcfg_command, private);
-       gmnalni = (gmnal_ni_t*)private;
-
-       switch(pcfg->pcfg_command) {
-       case GMNAL_IOC_GET_GNID:
-
-               PORTAL_ALLOC(name, pcfg->pcfg_plen1);
-               copy_from_user(name, PCFG_PBUF(pcfg, 1), pcfg->pcfg_plen1);
-
-                gm_status = gm_host_name_to_node_id_ex(gmnalni->gmni_port, 0,
-                                                       name, &nid);
-                if (gm_status != GM_SUCCESS) {
-                        CDEBUG(D_NET, "gm_host_name_to_node_id_ex(...host %s) "
-                               "failed[%d]\n", name, gm_status);
-                        return -ENOENT;
-                }
-
-                CDEBUG(D_NET, "Local node %s id is [%d]\n", name, nid);
-               gm_status = gm_node_id_to_global_id(gmnalni->gmni_port,
-                                                   nid, &gmid);
-               if (gm_status != GM_SUCCESS) {
-                       CDEBUG(D_NET, "gm_node_id_to_global_id failed[%d]\n",
-                              gm_status);
-                       return -ENOENT;
-               }
-
-               CDEBUG(D_NET, "Global node is is [%u][%x]\n", gmid, gmid);
-               copy_to_user(PCFG_PBUF(pcfg, 2), &gmid, pcfg->pcfg_plen2);
-                return 0;
-
-       case NAL_CMD_REGISTER_MYNID:
-                /* Same NID OK */
-                if (pcfg->pcfg_nid == gmnalni->gmni_libnal->libnal_ni.ni_pid.nid)
-                        return 0;
-
-                CERROR("Can't change NID from "LPD64" to "LPD64"\n",
-                       gmnalni->gmni_libnal->libnal_ni.ni_pid.nid,
-                       pcfg->pcfg_nid);
-                return -EINVAL;
+        .lnd_type            = GMLND,
+        .lnd_startup         = gmnal_startup,
+        .lnd_shutdown        = gmnal_shutdown,
+        .lnd_ctl             = gmnal_ctl,
+        .lnd_send            = gmnal_send,
+        .lnd_recv            = gmnal_recv,
+};
 
+gmnal_ni_t *the_gmni = NULL;
+
+int
+gmnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+       struct libcfs_ioctl_data *data = arg;
+
+       switch (cmd) {
+       case IOC_LIBCFS_REGISTER_MYNID:
+               if (data->ioc_nid == ni->ni_nid)
+                       return 0;
+               
+               LASSERT (LNET_NIDNET(data->ioc_nid) == LNET_NIDNET(ni->ni_nid));
+
+               CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID for %s(%s)\n",
+                      libcfs_nid2str(data->ioc_nid),
+                      libcfs_nid2str(ni->ni_nid));
+               return 0;
+               
        default:
-               CERROR ("gmnal_cmd UNKNOWN[%d]\n", pcfg->pcfg_command);
-               return -EINVAL;
+               return (-EINVAL);
        }
-        /* not reached */
 }
 
-ptl_nid_t
-gmnal_get_local_nid (gmnal_ni_t *gmnalni)
+int
+gmnal_set_local_nid (gmnal_ni_t *gmni)
 {
-       unsigned int     local_gmid;
-        unsigned int     global_gmid;
-        ptl_nid_t        nid;
+        lnet_ni_t       *ni = gmni->gmni_ni;
+       __u32            local_gmid;
+        __u32            global_gmid;
         gm_status_t      gm_status;
 
         /* Called before anything initialised: no need to lock */
-       gm_status = gm_get_node_id(gmnalni->gmni_port, &local_gmid);
+       gm_status = gm_get_node_id(gmni->gmni_port, &local_gmid);
        if (gm_status != GM_SUCCESS)
-               return PTL_NID_ANY;
+               return 0;
 
        CDEBUG(D_NET, "Local node id is [%u]\n", local_gmid);
         
-       gm_status = gm_node_id_to_global_id(gmnalni->gmni_port, 
+       gm_status = gm_node_id_to_global_id(gmni->gmni_port, 
                                             local_gmid, 
                                            &global_gmid);
        if (gm_status != GM_SUCCESS)
-               return PTL_NID_ANY;
+               return 0;
         
        CDEBUG(D_NET, "Global node id is [%u]\n", global_gmid);
 
-        nid = (__u64)global_gmid;
-        LASSERT (nid != PTL_NID_ANY);
-        
-        return global_gmid;
+        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), global_gmid);
+        return 1;
 }
 
-
 void
-gmnal_api_shutdown(nal_t *nal)
+gmnal_shutdown(lnet_ni_t *ni)
 {
-       lib_nal_t       *libnal = nal->nal_data;
-       gmnal_ni_t      *gmnalni = libnal->libnal_data;
-
-        if (nal->nal_refct != 0) {
-                /* This module got the first ref */
-                PORTAL_MODULE_UNUSE;
-                return;
-        }
+       gmnal_ni_t      *gmni = ni->ni_data;
 
-       CDEBUG(D_TRACE, "gmnal_api_shutdown: gmnalni [%p]\n", gmnalni);
+       CDEBUG(D_TRACE, "gmnal_api_shutdown: gmni [%p]\n", gmni);
 
-        /* Stop portals calling our ioctl handler */
-        libcfs_nal_cmd_unregister(GMNAL);
+        LASSERT (gmni == the_gmni);
 
         /* stop processing messages */
-        gmnal_stop_threads(gmnalni);
+        gmnal_stop_threads(gmni);
+
+        /* stop all network callbacks */
+       gm_close(gmni->gmni_port);
+        gmni->gmni_port = NULL;
 
-       gm_close(gmnalni->gmni_port);
        gm_finalize();
 
-        lib_fini(libnal);
+        gmnal_free_ltxbufs(gmni);
+       gmnal_free_txs(gmni);
+       gmnal_free_rxs(gmni);
 
-       gmnal_free_txs(gmnalni);
-       gmnal_free_rxs(gmnalni);
+       LIBCFS_FREE(gmni, sizeof(*gmni));
 
-       PORTAL_FREE(gmnalni, sizeof(*gmnalni));
-       PORTAL_FREE(libnal, sizeof(*libnal));
+        the_gmni = NULL;
 }
 
 int
-gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid,
-                  ptl_ni_limits_t *requested_limits,
-                  ptl_ni_limits_t *actual_limits)
+gmnal_startup(lnet_ni_t *ni)
 {
-
-       lib_nal_t       *libnal = NULL;
-       gmnal_ni_t      *gmnalni = NULL;
+       gmnal_ni_t      *gmni = NULL;
        gmnal_rx_t      *rx = NULL;
        gm_status_t      gm_status;
-        ptl_process_id_t process_id;
         int              rc;
 
-        if (nal->nal_refct != 0) {
-                if (actual_limits != NULL) {
-                        libnal = (lib_nal_t *)nal->nal_data;
-                        *actual_limits = libnal->libnal_ni.ni_actual_limits;
-                }
-                PORTAL_MODULE_USE;
-                return PTL_OK;
-        }
-
-        /* Called on first PtlNIInit() */
-       CDEBUG(D_TRACE, "startup\n");
+        LASSERT (ni->ni_lnd == &the_gmlnd);
 
-       PORTAL_ALLOC(gmnalni, sizeof(*gmnalni));
-       if (gmnalni == NULL) {
-               CERROR("can't allocate gmnalni\n");
-                return PTL_FAIL;
-        }
+        ni->ni_maxtxcredits = *gmnal_tunables.gm_credits;
+        ni->ni_peertxcredits = *gmnal_tunables.gm_peer_credits;
         
-       PORTAL_ALLOC(libnal, sizeof(*libnal));
-       if (libnal == NULL) {
-               CERROR("can't allocate lib_nal\n");
-                goto failed_0;
-       }       
-
-       memset(gmnalni, 0, sizeof(*gmnalni));
-       gmnalni->gmni_libnal = libnal;
-       spin_lock_init(&gmnalni->gmni_gm_lock);
+        if (the_gmni != NULL) {
+                CERROR("Only 1 instance supported\n");
+                return -EINVAL;
+        }
 
-        *libnal = (lib_nal_t) {
-                .libnal_send       = gmnal_cb_send,
-                .libnal_send_pages = gmnal_cb_send_pages,
-                .libnal_recv       = gmnal_cb_recv,
-                .libnal_recv_pages = gmnal_cb_recv_pages,
-                .libnal_dist       = gmnal_cb_dist,
-                .libnal_data       = gmnalni,
-        };
+       LIBCFS_ALLOC(gmni, sizeof(*gmni));
+       if (gmni == NULL) {
+               CERROR("can't allocate gmni\n");
+                return -ENOMEM;
+        }
 
+        ni->ni_data = gmni;
+
+       memset(gmni, 0, sizeof(*gmni));
+       gmni->gmni_ni = ni;
+        spin_lock_init(&gmni->gmni_tx_lock);
+       spin_lock_init(&gmni->gmni_gm_lock);
+        INIT_LIST_HEAD(&gmni->gmni_idle_txs);
+        INIT_LIST_HEAD(&gmni->gmni_idle_ltxbs);
+        INIT_LIST_HEAD(&gmni->gmni_buf_txq);
+        INIT_LIST_HEAD(&gmni->gmni_cred_txq);
+        sema_init(&gmni->gmni_rx_mutex, 1);
+        
        /*
         *      initialise the interface,
         */
        CDEBUG(D_NET, "Calling gm_init\n");
        if (gm_init() != GM_SUCCESS) {
                CERROR("call to gm_init failed\n");
-                goto failed_1;
+                goto failed_0;
        }
 
-       CDEBUG(D_NET, "Calling gm_open with port [%d], "
-              "name [%s], version [%d]\n", gm_port_id,
-              "gmnal", GM_API_VERSION);
+       CDEBUG(D_NET, "Calling gm_open with port [%d], version [%d]\n",
+               *gmnal_tunables.gm_port, GM_API_VERSION);
 
-       gm_status = gm_open(&gmnalni->gmni_port, 0, gm_port_id, "gmnal",
-                           GM_API_VERSION);
+       gm_status = gm_open(&gmni->gmni_port, 0, *gmnal_tunables.gm_port, 
+                            "gmnal", GM_API_VERSION);
 
         if (gm_status != GM_SUCCESS) {
                 CERROR("Can't open GM port %d: %d (%s)\n",
-                       gm_port_id, gm_status, gmnal_gmstatus2str(gm_status));
-                goto failed_2;
+                       *gmnal_tunables.gm_port, gm_status, 
+                       gmnal_gmstatus2str(gm_status));
+                goto failed_1;
        }
 
-        CDEBUG(D_NET,"gm_open succeeded port[%p]\n",gmnalni->gmni_port);
+        CDEBUG(D_NET,"gm_open succeeded port[%p]\n",gmni->gmni_port);
 
-       gmnalni->gmni_msg_size = offsetof(gmnal_msg_t,
-                                          gmm_u.immediate.gmim_payload[PTL_MTU]);
-        CWARN("Msg size %08x\n", gmnalni->gmni_msg_size);
+        if (!gmnal_set_local_nid(gmni))
+                goto failed_2;
 
-       if (gmnal_alloc_rxs(gmnalni) != 0) {
-               CERROR("Failed to allocate rx descriptors\n");
-                goto failed_3;
-       }
+       CDEBUG(D_NET, "portals_nid is %s\n", libcfs_nid2str(ni->ni_nid));
 
-       if (gmnal_alloc_txs(gmnalni) != 0) {
-               CERROR("Failed to allocate tx descriptors\n");
-                goto failed_3;
-       }
+        gmni->gmni_large_msgsize = 
+                offsetof(gmnal_msg_t, gmm_u.immediate.gmim_payload[LNET_MAX_PAYLOAD]);
+        gmni->gmni_large_gmsize = 
+                gm_min_size_for_length(gmni->gmni_large_msgsize);
+        gmni->gmni_large_pages =
+                (gmni->gmni_large_msgsize + PAGE_SIZE - 1)/PAGE_SIZE;
+        
+        gmni->gmni_small_msgsize = MIN(GM_MTU, PAGE_SIZE);
+        gmni->gmni_small_gmsize = 
+                gm_min_size_for_length(gmni->gmni_small_msgsize);
 
-        process_id.pid = requested_pid;
-        process_id.nid = gmnal_get_local_nid(gmnalni);
-        if (process_id.nid == PTL_NID_ANY)
-                goto failed_3;
+        gmni->gmni_netaddr_base = GMNAL_NETADDR_BASE;
+        gmni->gmni_netaddr_size = 0;
 
-       CDEBUG(D_NET, "portals_pid is [%u]\n", process_id.pid);
-       CDEBUG(D_NET, "portals_nid is ["LPU64"]\n", process_id.nid);
+        CDEBUG(D_NET, "Msg size %08x/%08x [%d/%d]\n", 
+               gmni->gmni_large_msgsize, gmni->gmni_small_msgsize,
+               gmni->gmni_large_gmsize, gmni->gmni_small_gmsize);
 
-       /*      Hang out a bunch of small receive buffers
-        *      In fact hang them all out */
-        for (rx = gmnalni->gmni_rx; rx != NULL; rx = rx->rx_next)
-                gmnal_post_rx(gmnalni, rx);
+       if (gmnal_alloc_rxs(gmni) != 0) {
+               CERROR("Failed to allocate rx descriptors\n");
+                goto failed_2;
+       }
 
-       if (lib_init(libnal, nal, process_id,
-                     requested_limits, actual_limits) != PTL_OK) {
-               CERROR("lib_init failed\n");
-                goto failed_3;
+       if (gmnal_alloc_txs(gmni) != 0) {
+               CERROR("Failed to allocate tx descriptors\n");
+                goto failed_2;
        }
 
-       /* Now that we have initialised the portals library, start receive
-        * threads, we do this to avoid processing messages before we can parse
-        * them */
-       rc = gmnal_start_threads(gmnalni);
-        if (rc != 0) {
-                CERROR("Can't start threads: %d\n", rc);
-                goto failed_3;
+        if (gmnal_alloc_ltxbufs(gmni) != 0) {
+                CERROR("Failed to allocate large tx buffers\n");
+                goto failed_2;
         }
 
-        rc = libcfs_nal_cmd_register(GMNAL, &gmnal_cmd, libnal->libnal_data);
-       if (rc != 0) {
-               CDEBUG(D_NET, "libcfs_nal_cmd_register failed: %d\n", rc);
-                goto failed_4;
+       rc = gmnal_start_threads(gmni);
+        if (rc != 0) {
+                CERROR("Can't start threads: %d\n", rc);
+                goto failed_2;
         }
 
-       CDEBUG(D_NET, "gmnal_init finished\n");
-       return PTL_OK;
+        /* Start listening */
+        for (rx = gmni->gmni_rxs; rx != NULL; rx = rx->rx_next)
+                gmnal_post_rx(gmni, rx);
 
- failed_4:
-       gmnal_stop_threads(gmnalni);
+        the_gmni = gmni;
 
- failed_3:
-        gm_close(gmnalni->gmni_port);
+       CDEBUG(D_NET, "gmnal_init finished\n");
+       return 0;
 
  failed_2:
-        gm_finalize();
-
-        /* safe to free buffers after network has been shut down */
-        gmnal_free_txs(gmnalni);
-        gmnal_free_rxs(gmnalni);
+        gm_close(gmni->gmni_port);
+        gmni->gmni_port = NULL;
 
  failed_1:
-        PORTAL_FREE(libnal, sizeof(*libnal));
+        gm_finalize();
 
  failed_0:
-        PORTAL_FREE(gmnalni, sizeof(*gmnalni));
+        /* safe to free descriptors after network has been shut down */
+        gmnal_free_ltxbufs(gmni);
+        gmnal_free_txs(gmni);
+        gmnal_free_rxs(gmni);
 
-        return PTL_FAIL;
-}
+        LIBCFS_FREE(gmni, sizeof(*gmni));
 
-ptl_handle_ni_t kgmnal_ni;
-nal_t           the_gm_nal;
+        return -EIO;
+}
 
 /* 
  *        Called when module loaded
  */
 int gmnal_init(void)
 {
-        int    rc;
-
-       CDEBUG(D_NET, "reset nal[%p]\n", &the_gm_nal);
-
-        the_gm_nal = (nal_t) {
-                .nal_ni_init = gmnal_api_startup,
-                .nal_ni_fini = gmnal_api_shutdown,
-                .nal_data = NULL,
-        };
-
-        rc = ptl_register_nal(GMNAL, &the_gm_nal);
-        if (rc != PTL_OK)
-                CERROR("Can't register GMNAL: %d\n", rc);
-        rc = PtlNIInit(GMNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kgmnal_ni);
-        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
-                ptl_unregister_nal(GMNAL);
-                return (-ENODEV);
-        }
-
-        return (rc);
+        lnet_register_lnd(&the_gmlnd);
+        return 0;
 }
 
-
 /*
  *     Called when module removed
  */
 void gmnal_fini()
 {
-       CDEBUG(D_TRACE, "gmnal_fini\n");
-
-        PtlNIFini(kgmnal_ni);
-
-        ptl_unregister_nal(GMNAL);
+        lnet_unregister_lnd(&the_gmlnd);
 }
index d7e7f5b..503bedf 100644 (file)
  */
 
 
-#include "gmnal.h"
+#include "gmlnd.h"
 
-ptl_err_t 
-gmnal_cb_recv(lib_nal_t *libnal, void *private, 
-              lib_msg_t *libmsg,
-              unsigned int niov, struct iovec *iov, 
-              size_t offset, size_t mlen, size_t rlen)
+int
+gmnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+           int delayed, unsigned int niov, 
+           struct iovec *iov, lnet_kiov_t *kiov,
+           unsigned int offset, unsigned int mlen, unsigned int rlen)
 {
+        gmnal_ni_t      *gmni = ni->ni_data;
        gmnal_rx_t      *rx = (gmnal_rx_t*)private;
-        gmnal_msg_t     *msg = rx->rx_msg;
-        size_t           nobleft = mlen;
-        int              rxnob;
-        char            *buffer;
-        size_t           nob;
-
-       CDEBUG(D_TRACE, "gmnal_cb_recv libnal [%p], private[%p], libmsg[%p], "
-              "niov[%d], iov [%p], offset["LPSZ"], mlen["LPSZ"], rlen["LPSZ"]\n",
-              libnal, private, libmsg, niov, iov, offset, mlen, rlen);
+        gmnal_msg_t     *msg = GMNAL_NETBUF_MSG(&rx->rx_buf);
+        int              npages = rx->rx_islarge ? gmni->gmni_large_pages : 1;
+        int              payload_offset = offsetof(gmnal_msg_t, 
+                                              gmm_u.immediate.gmim_payload[0]);
+        int              nob = payload_offset + mlen;
 
        LASSERT (msg->gmm_type == GMNAL_MSG_IMMEDIATE);
+        LASSERT (iov == NULL || kiov == NULL);
         
-        buffer = &msg->gmm_u.immediate.gmim_payload[0];
-        rxnob = offsetof(gmnal_msg_t, gmm_u.immediate.gmim_payload[nobleft]);
-        
-        if (rx->rx_recv_nob < rxnob) {
-                CERROR("Short message from nid "LPD64": got %d, need %d\n",
-                       msg->gmm_srcnid, rx->rx_recv_nob, rxnob);
-                return PTL_FAIL;
+        if (rx->rx_recv_nob < nob) {
+                CERROR("Short message from nid %s: got %d, need %d\n",
+                       libcfs_nid2str(msg->gmm_srcnid), rx->rx_recv_nob, nob);
+                gmnal_post_rx(gmni, rx);
+                return -EIO;
         }
-        
-        while (nobleft > 0) {
-                LASSERT (niov > 0);
 
-                if (offset >= iov->iov_len) {
-                        offset -= iov->iov_len;
-                } else {
-                        nob = MIN (iov->iov_len - offset, nobleft);
-
-                        gm_bcopy(buffer, iov->iov_base + offset, nob);
-
-                        buffer += nob;
-                        nobleft -= nob;
-                        offset = 0;
-                }
-                niov--;
-                iov++;
-        }
-
-        lib_finalize(libnal, private, libmsg, PTL_OK);
-       return PTL_OK;
+        if (kiov != NULL)
+                lnet_copy_kiov2kiov(niov, kiov, offset,
+                                    npages, rx->rx_buf.nb_kiov, payload_offset, 
+                                    mlen);
+        else
+                lnet_copy_kiov2iov(niov, iov, offset,
+                                   npages, rx->rx_buf.nb_kiov, payload_offset,
+                                   mlen);
+
+        lnet_finalize(ni, lntmsg, 0);
+        gmnal_post_rx(gmni, rx);
+       return 0;
 }
 
-ptl_err_t 
-gmnal_cb_recv_pages(lib_nal_t *libnal, void *private, 
-                    lib_msg_t *libmsg, 
-                    unsigned int nkiov, ptl_kiov_t *kiov, 
-                    size_t offset, size_t mlen, size_t rlen)
+int
+gmnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
 {
-       gmnal_rx_t      *rx = (gmnal_rx_t*)private;
-        gmnal_msg_t     *msg = rx->rx_msg;
-        size_t           nobleft = mlen;
-        int              rxnob;
-        size_t           nob;
-       char            *ptr;
-       void            *buffer;
-
-       CDEBUG(D_TRACE, "gmnal_cb_recv_pages libnal [%p],private[%p], "
-              "libmsg[%p], kniov[%d], kiov [%p], "
-               "offset["LPSZ"], mlen["LPSZ"], rlen["LPSZ"]\n",
-              libnal, private, libmsg, nkiov, kiov, offset, mlen, rlen);
-
-       LASSERT (msg->gmm_type == GMNAL_MSG_IMMEDIATE);
-
-        buffer = &msg->gmm_u.immediate.gmim_payload[0];
-        rxnob = offsetof(gmnal_msg_t, gmm_u.immediate.gmim_payload[nobleft]);
-
-        if (rx->rx_recv_nob < rxnob) {
-                CERROR("Short message from nid "LPD64": got %d, need %d\n",
-                       msg->gmm_srcnid, rx->rx_recv_nob, rxnob);
-                return PTL_FAIL;
+        lnet_hdr_t       *hdr= &lntmsg->msg_hdr;
+        int               type = lntmsg->msg_type;
+        lnet_process_id_t target = lntmsg->msg_target;
+        unsigned int      niov = lntmsg->msg_niov;
+        struct iovec     *iov = lntmsg->msg_iov;
+        lnet_kiov_t      *kiov = lntmsg->msg_kiov;
+        unsigned int      offset = lntmsg->msg_offset;
+        unsigned int      len = lntmsg->msg_len;
+       gmnal_ni_t       *gmni = ni->ni_data;
+        gm_status_t       gmrc;
+       gmnal_tx_t       *tx;
+
+        LASSERT (iov == NULL || kiov == NULL);
+
+        /* I may not block for a tx if I'm responding to an incoming message */
+        tx = gmnal_get_tx(gmni);
+        if (tx == NULL) {
+                if (!gmni->gmni_shutdown)
+                        CERROR ("Can't get tx for msg type %d for %s\n",
+                                type, libcfs_nid2str(target.nid));
+                return -EIO;
         }
-        
-        while (nobleft > 0) {
-                LASSERT (nkiov > 0);
-
-                if (offset >= kiov->kiov_len) {
-                        offset -= kiov->kiov_len;
-                } else {
-                        nob = MIN (kiov->kiov_len - offset, nobleft);
-
-                        ptr = ((char *)kmap(kiov->kiov_page)) +
-                              kiov->kiov_offset;
-
-                        gm_bcopy(buffer, ptr + offset, nob);
-
-                        kunmap(kiov->kiov_page);
-
-                        buffer += nob;
-                        nobleft -= nob;
-                        offset = 0;
-               }
-                kiov++;
-                nkiov--;
-       }
-
-        lib_finalize(libnal, private, libmsg, PTL_OK);
-       return PTL_OK;
-}
-
-ptl_err_t
-gmnal_cb_send(lib_nal_t *libnal, void *private, 
-              lib_msg_t *libmsg, ptl_hdr_t *hdr, int type, 
-              ptl_nid_t nid, ptl_pid_t pid,
-              unsigned int niov, struct iovec *iov, 
-              size_t offset, size_t len)
-{
-
-       gmnal_ni_t      *gmnalni = libnal->libnal_data;
-        size_t           nobleft = len;
-       void            *buffer;
-       gmnal_tx_t      *tx;
-        size_t           nob;
 
-       CDEBUG(D_TRACE, "gmnal_cb_send niov[%d] offset["LPSZ"] "
-               "len["LPSZ"] nid["LPU64"]\n", niov, offset, len, nid);
+        tx->tx_nid = target.nid;
 
-        if ((nid >> 32) != 0) {
-                CERROR("Illegal nid: "LPU64"\n", nid);
-                return PTL_FAIL;
+        gmrc = gm_global_id_to_node_id(gmni->gmni_port, LNET_NIDADDR(target.nid),
+                                       &tx->tx_gmlid);
+        if (gmrc != GM_SUCCESS) {
+                CERROR("Can't map Nid %s to a GM local ID: %d\n", 
+                       libcfs_nid2str(target.nid), gmrc);
+                /* NB tx_lntmsg not set => doesn't finalize */
+                gmnal_tx_done(tx, -EIO);
+                return -EIO;
         }
 
-        tx = gmnal_get_tx(gmnalni, 1);
-
-        gmnal_pack_msg(gmnalni, tx, nid, GMNAL_MSG_IMMEDIATE);
-        gm_bcopy(hdr, &tx->tx_msg->gmm_u.immediate.gmim_hdr, sizeof(*hdr));
-
-        buffer = &tx->tx_msg->gmm_u.immediate.gmim_payload[0];
-        while (nobleft > 0) {
-                LASSERT (niov > 0);
-                
-                if (offset >= iov->iov_len) {
-                        offset -= iov->iov_len;
-                } else {
-                        nob = MIN (iov->iov_len - offset, nobleft);
-
-                        gm_bcopy(iov->iov_base + offset, buffer, nob);
-
-                        buffer += nob;
-                        nobleft -= nob;
-                        offset = 0;
+        gmnal_pack_msg(gmni, GMNAL_NETBUF_MSG(&tx->tx_buf), 
+                       target.nid, GMNAL_MSG_IMMEDIATE);
+        GMNAL_NETBUF_MSG(&tx->tx_buf)->gmm_u.immediate.gmim_hdr = *hdr;
+        tx->tx_msgnob = offsetof(gmnal_msg_t, gmm_u.immediate.gmim_payload[0]);
+
+        if (the_lnet.ln_testprotocompat != 0) {
+                /* single-shot proto test */
+                LNET_LOCK();
+                if ((the_lnet.ln_testprotocompat & 1) != 0) {
+                        GMNAL_NETBUF_MSG(&tx->tx_buf)->gmm_version++;
+                        the_lnet.ln_testprotocompat &= ~1;
                 }
-                niov--;
-                iov++;
-        }
-        
-        nob = offsetof(gmnal_msg_t, gmm_u.immediate.gmim_payload[len]);
-        return gmnal_post_tx(gmnalni, tx, libmsg, nid, nob);
-}
-
-ptl_err_t
-gmnal_cb_send_pages(lib_nal_t *libnal, void *private,
-                    lib_msg_t *libmsg, ptl_hdr_t *hdr, int type,
-                    ptl_nid_t nid, ptl_pid_t pid, 
-                    unsigned int nkiov, ptl_kiov_t *kiov, 
-                    size_t offset, size_t len)
-{
-
-       gmnal_ni_t      *gmnalni = libnal->libnal_data;
-        size_t           nobleft = len;
-       void            *buffer;
-       gmnal_tx_t      *tx;
-       char            *ptr;
-        size_t           nob;
-
-       CDEBUG(D_TRACE, "gmnal_cb_send_pages nid ["LPU64"] niov[%d] offset["
-               LPSZ"] len["LPSZ"]\n", nid, nkiov, offset, len);
-
-        if ((nid >> 32) != 0) {
-                CERROR("Illegal nid: "LPU64"\n", nid);
-                return PTL_FAIL;
+                if ((the_lnet.ln_testprotocompat & 2) != 0) {
+                        GMNAL_NETBUF_MSG(&tx->tx_buf)->gmm_magic =
+                                LNET_PROTO_MAGIC;
+                        the_lnet.ln_testprotocompat &= ~2;
+                }
+                LNET_UNLOCK();
         }
 
-       tx = gmnal_get_tx(gmnalni, 1);
-
-        gmnal_pack_msg(gmnalni, tx, nid, GMNAL_MSG_IMMEDIATE);
-        gm_bcopy(hdr, &tx->tx_msg->gmm_u.immediate.gmim_hdr, sizeof(*hdr));
-
-       buffer = &tx->tx_msg->gmm_u.immediate.gmim_payload[0];
-        while (nobleft > 0) {
-                LASSERT (nkiov > 0);
-
-                if (offset >= kiov->kiov_len) {
-                        offset -= kiov->kiov_len;
-                } else {
-                        nob = MIN (kiov->kiov_len - offset, nobleft);
-
-                        ptr = ((char *)kmap(kiov->kiov_page)) +
-                              kiov->kiov_offset;
-
-                        gm_bcopy(ptr + offset, buffer, nob);
-
-                        kunmap(kiov->kiov_page);
+        if (tx->tx_msgnob + len <= gmni->gmni_small_msgsize) {
+                /* whole message fits in tx_buf */
+                char *buffer = &(GMNAL_NETBUF_MSG(&tx->tx_buf)->gmm_u.immediate.gmim_payload[0]);
 
-                        buffer += nob;
-                        nobleft -= nob;
-                        offset = 0;
-                }
-                nkiov--;
-                kiov++;
+                if (iov != NULL)
+                        lnet_copy_iov2flat(len, buffer, 0,
+                                           niov, iov, offset, len);
+                else
+                        lnet_copy_kiov2flat(len, buffer, 0,
+                                            niov, kiov, offset, len);
+                
+                tx->tx_msgnob += len;
+                tx->tx_large_nob = 0;
+        } else {
+                /* stash payload pts to copy later */
+                tx->tx_large_nob = len;
+                tx->tx_large_iskiov = (kiov != NULL);
+                tx->tx_large_niov = niov;
+                if (tx->tx_large_iskiov)
+                        tx->tx_large_frags.kiov = kiov;
+                else
+                        tx->tx_large_frags.iov = iov;
         }
 
-        nob = offsetof(gmnal_msg_t, gmm_u.immediate.gmim_payload[len]);
-        return gmnal_post_tx(gmnalni, tx, libmsg, nid, nob);
-}
+        LASSERT(tx->tx_lntmsg == NULL);
+        tx->tx_lntmsg = lntmsg;
+        
+        spin_lock(&gmni->gmni_tx_lock);
 
-int
-gmnal_cb_dist(lib_nal_t *libnal, ptl_nid_t nid, unsigned long *dist)
-{
-       CDEBUG(D_TRACE, "gmnal_cb_dist\n");
+        list_add_tail(&tx->tx_list, &gmni->gmni_buf_txq);
+        gmnal_check_txqueues_locked(gmni);
 
-       if (dist != NULL)
-               *dist = 1;
+        spin_unlock(&gmni->gmni_tx_lock);
 
-       return PTL_OK;
+        return 0;
 }
index 3b4baa0..ea6a8d1 100644 (file)
  *     This file contains all gmnal send and receive functions
  */
 
-#include "gmnal.h"
+#include "gmlnd.h"
 
 void
-gmnal_pack_msg(gmnal_ni_t *gmnalni, gmnal_tx_t *tx,
-               ptl_nid_t dstnid, int type)
+gmnal_notify_peer_down(gmnal_tx_t *tx)
 {
-        gmnal_msg_t *msg = tx->tx_msg;
+        struct timeval     now;
+        time_t             then;
 
+        do_gettimeofday (&now);
+        then = now.tv_sec - (jiffies - tx->tx_launchtime)/HZ;
+
+        lnet_notify(tx->tx_gmni->gmni_ni, tx->tx_nid, 0, then);
+}
+
+void
+gmnal_pack_msg(gmnal_ni_t *gmni, gmnal_msg_t *msg,
+               lnet_nid_t dstnid, int type)
+{
         /* CAVEAT EMPTOR! this only sets the common message fields. */
         msg->gmm_magic    = GMNAL_MSG_MAGIC;
         msg->gmm_version  = GMNAL_MSG_VERSION;
         msg->gmm_type     = type;
-        msg->gmm_srcnid   = gmnalni->gmni_libnal->libnal_ni.ni_pid.nid;
+        msg->gmm_srcnid   = lnet_ptlcompat_srcnid(gmni->gmni_ni->ni_nid,
+                                                  dstnid);
         msg->gmm_dstnid   = dstnid;
 }
 
 int
-gmnal_unpack_msg(gmnal_ni_t *gmnalni, gmnal_rx_t *rx)
+gmnal_unpack_msg(gmnal_ni_t *gmni, gmnal_rx_t *rx)
 {
-        gmnal_msg_t *msg = rx->rx_msg;
+        gmnal_msg_t *msg = GMNAL_NETBUF_MSG(&rx->rx_buf);
         const int    hdr_size = offsetof(gmnal_msg_t, gmm_u);
+        int          buffnob = rx->rx_islarge ? gmni->gmni_large_msgsize :
+                                                gmni->gmni_small_msgsize;
         int          flip;
 
+        /* rc = 0:SUCCESS -ve:failure +ve:version mismatch */
+
+        /* GM may not overflow our buffer */
+        LASSERT (rx->rx_recv_nob <= buffnob);
+
         /* 6 bytes are enough to have received magic + version */
         if (rx->rx_recv_nob < 6) {
                 CERROR("Short message from gmid %u: %d\n", 
@@ -57,6 +75,9 @@ gmnal_unpack_msg(gmnal_ni_t *gmnalni, gmnal_rx_t *rx)
                 flip = 0;
         } else if (msg->gmm_magic == __swab32(GMNAL_MSG_MAGIC)) {
                 flip = 1;
+        } else if (msg->gmm_magic == LNET_PROTO_MAGIC ||
+                   msg->gmm_magic == __swab32(LNET_PROTO_MAGIC)) {
+                return EPROTO;
         } else {
                 CERROR("Bad magic from gmid %u: %08x\n", 
                        rx->rx_recv_gmid, msg->gmm_magic);
@@ -65,9 +86,7 @@ gmnal_unpack_msg(gmnal_ni_t *gmnalni, gmnal_rx_t *rx)
 
         if (msg->gmm_version != 
             (flip ? __swab16(GMNAL_MSG_VERSION) : GMNAL_MSG_VERSION)) {
-                CERROR("Bad version from gmid %u: %d\n", 
-                       rx->rx_recv_gmid, msg->gmm_version);
-                return -EPROTO;
+                return EPROTO;
         }
 
         if (rx->rx_recv_nob < hdr_size) {
@@ -84,15 +103,16 @@ gmnal_unpack_msg(gmnal_ni_t *gmnalni, gmnal_rx_t *rx)
                 __swab64s(&msg->gmm_dstnid);
         }
         
-        if (msg->gmm_srcnid == PTL_NID_ANY) {
-                CERROR("Bad src nid from %u: "LPX64"\n", 
-                       rx->rx_recv_gmid, msg->gmm_srcnid);
+        if (msg->gmm_srcnid == LNET_NID_ANY) {
+                CERROR("Bad src nid from %u: %s\n", 
+                       rx->rx_recv_gmid, libcfs_nid2str(msg->gmm_srcnid));
                 return -EPROTO;
         }
 
-        if (msg->gmm_dstnid != gmnalni->gmni_libnal->libnal_ni.ni_pid.nid) {
-                CERROR("Bad dst nid from %u: "LPX64"\n",
-                       rx->rx_recv_gmid, msg->gmm_dstnid);
+        if (!lnet_ptlcompat_matchnid(gmni->gmni_ni->ni_nid, 
+                                     msg->gmm_dstnid)) {
+                CERROR("Bad dst nid from %u: %s\n",
+                       rx->rx_recv_gmid, libcfs_nid2str(msg->gmm_dstnid));
                 return -EPROTO;
         }
         
@@ -114,211 +134,430 @@ gmnal_unpack_msg(gmnal_ni_t *gmnalni, gmnal_rx_t *rx)
         return 0;
 }
 
-
-/*
- *     The caretaker thread
- *     This is main thread of execution for the NAL side
- *     This guy waits in gm_blocking_recvive and gets
- *     woken up when the myrinet adaptor gets an interrupt.
- *     Hands off receive operations to the receive thread 
- *     This thread Looks after gm_callbacks etc inline.
- */
-int
-gmnal_ct_thread(void *arg)
+gmnal_tx_t *
+gmnal_get_tx(gmnal_ni_t *gmni)
 {
-       gmnal_ni_t              *gmnalni = arg;
-       gm_recv_event_t         *rxevent = NULL;
-       gm_recv_t               *recv = NULL;
+       gmnal_tx_t       *tx = NULL;
+
+        spin_lock(&gmni->gmni_tx_lock);
+
+        if (gmni->gmni_shutdown ||
+            list_empty(&gmni->gmni_idle_txs)) {
+                spin_unlock(&gmni->gmni_tx_lock);
+                return NULL;
+        }
+        
+        tx = list_entry(gmni->gmni_idle_txs.next, gmnal_tx_t, tx_list);
+        list_del(&tx->tx_list);
 
-       sprintf(current->comm, "gmnal_ct");
-       kportal_daemonize("gmnalctd");
+        spin_unlock(&gmni->gmni_tx_lock);
 
-       while(!gmnalni->gmni_thread_shutdown) {
+        LASSERT (tx->tx_lntmsg == NULL);
+        LASSERT (tx->tx_ltxb == NULL);
+        LASSERT (!tx->tx_credit);
+        
+        return tx;
+}
 
-                spin_lock(&gmnalni->gmni_gm_lock);
-               rxevent = gm_blocking_receive_no_spin(gmnalni->gmni_port);
-                spin_unlock(&gmnalni->gmni_gm_lock);
+void
+gmnal_tx_done(gmnal_tx_t *tx, int rc)
+{
+       gmnal_ni_t *gmni = tx->tx_gmni;
+        int         wake_sched = 0;
+        lnet_msg_t *lnetmsg = tx->tx_lntmsg;
+        
+        tx->tx_lntmsg = NULL;
 
-               CDEBUG(D_NET, "got [%s]\n", gmnal_rxevent2str(rxevent));
+        spin_lock(&gmni->gmni_tx_lock);
+        
+        if (tx->tx_ltxb != NULL) {
+                wake_sched = 1;
+                list_add_tail(&tx->tx_ltxb->txb_list, &gmni->gmni_idle_ltxbs);
+                tx->tx_ltxb = NULL;
+        }
+        
+        if (tx->tx_credit) {
+                wake_sched = 1;
+                gmni->gmni_tx_credits++;
+                tx->tx_credit = 0;
+        }
+        
+        list_add_tail(&tx->tx_list, &gmni->gmni_idle_txs);
 
-               if (GM_RECV_EVENT_TYPE(rxevent) == GM_RECV_EVENT) {
-                        recv = (gm_recv_t*)&rxevent->recv;
-                        gmnal_enqueue_rx(gmnalni, recv);
-                        continue;
-                }
+        if (wake_sched)
+                gmnal_check_txqueues_locked(gmni);
 
-                gm_unknown(gmnalni->gmni_port, rxevent);
-       }
+        spin_unlock(&gmni->gmni_tx_lock);
 
-       CDEBUG(D_NET, "exiting\n");
-        atomic_dec(&gmnalni->gmni_nthreads);
-       return 0;
+        /* Delay finalize until tx is free */
+        if (lnetmsg != NULL)
+                lnet_finalize(gmni->gmni_ni, lnetmsg, 0);
 }
 
+void 
+gmnal_drop_sends_callback(struct gm_port *gm_port, void *context, 
+                          gm_status_t status)
+{
+       gmnal_tx_t      *tx = (gmnal_tx_t*)context;
 
-/*
- *     process a receive event
- */
-int 
-gmnal_rx_thread(void *arg)
+        LASSERT(!in_interrupt());
+         
+        CDEBUG(D_NET, "status for tx [%p] is [%d][%s], nid %s\n", 
+               tx, status, gmnal_gmstatus2str(status),
+               libcfs_nid2str(tx->tx_nid));
+
+        gmnal_tx_done(tx, -EIO);
+}
+
+void 
+gmnal_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status)
 {
-       gmnal_ni_t    *gmnalni = arg;
-        char           name[16];
-       gmnal_rx_t    *rx;
-       int            rank;
+       gmnal_tx_t      *tx = (gmnal_tx_t*)context;
+       gmnal_ni_t      *gmni = tx->tx_gmni;
 
-       for (rank = 0; rank < gmnalni->gmni_nrxthreads; rank++)
-               if (gmnalni->gmni_rxthread_pid[rank] == current->pid)
-                       break;
+        LASSERT(!in_interrupt());
 
-       snprintf(name, sizeof(name), "gmnal_rx_%d", rank);
-       kportal_daemonize(name);
+       switch(status) {
+        case GM_SUCCESS:
+                gmnal_tx_done(tx, 0);
+                return;
+
+        case GM_SEND_DROPPED:
+                CDEBUG(D_NETERROR, "Dropped tx %p to %s\n", 
+                       tx, libcfs_nid2str(tx->tx_nid));
+                /* Another tx failed and called gm_drop_sends() which made this
+                 * one complete immediately */
+                gmnal_tx_done(tx, -EIO);
+                return;
+                        
+        default:
+                /* Some error; NB don't complete tx yet; we need its credit for
+                 * gm_drop_sends() */
+                CDEBUG(D_NETERROR, "tx %p error %d(%s), nid %s\n",
+                       tx, status, gmnal_gmstatus2str(status), 
+                       libcfs_nid2str(tx->tx_nid));
+
+                gmnal_notify_peer_down(tx);
+
+                spin_lock(&gmni->gmni_gm_lock);
+                gm_drop_sends(gmni->gmni_port, 
+                              tx->tx_ltxb != NULL ?
+                              GMNAL_LARGE_PRIORITY : GMNAL_SMALL_PRIORITY,
+                              tx->tx_gmlid, *gmnal_tunables.gm_port, 
+                              gmnal_drop_sends_callback, tx);
+                spin_unlock(&gmni->gmni_gm_lock);
+               return;
+       }
+
+        /* not reached */
+        LBUG();
+}
 
-       while(!gmnalni->gmni_thread_shutdown) {
+void
+gmnal_check_txqueues_locked (gmnal_ni_t *gmni)
+{
+        gmnal_tx_t    *tx;
+        gmnal_txbuf_t *ltxb;
+        int            gmsize;
+        int            pri;
+        void          *netaddr;
+        
+        tx = list_empty(&gmni->gmni_buf_txq) ? NULL :
+             list_entry(gmni->gmni_buf_txq.next, gmnal_tx_t, tx_list);
 
-               rx = gmnal_dequeue_rx(gmnalni);
-               if (rx == NULL)
-                       break;
+        if (tx != NULL &&
+            (tx->tx_large_nob == 0 || 
+             !list_empty(&gmni->gmni_idle_ltxbs))) {
 
-                /* We're connectionless: simply ignore packets on error */
+                /* consume tx */
+                list_del(&tx->tx_list);
                 
-                if (gmnal_unpack_msg(gmnalni, rx) == 0) {
-                        
-                        LASSERT (rx->rx_msg->gmm_type == GMNAL_MSG_IMMEDIATE);
-                        (void)lib_parse(gmnalni->gmni_libnal, 
-                                        &rx->rx_msg->gmm_u.immediate.gmim_hdr,
-                                        rx);
+                LASSERT (tx->tx_ltxb == NULL);
+
+                if (tx->tx_large_nob != 0) {
+                        ltxb = list_entry(gmni->gmni_idle_ltxbs.next,
+                                          gmnal_txbuf_t, txb_list);
+
+                        /* consume large buffer */
+                        list_del(&ltxb->txb_list);
+
+                        spin_unlock(&gmni->gmni_tx_lock);
+
+                        /* Unlocking here allows sends to get re-ordered,
+                         * but we want to allow other CPUs to progress... */
+
+                        tx->tx_ltxb = ltxb;
+
+                        /* marshall message in tx_ltxb...
+                         * 1. Copy what was marshalled so far (in tx_buf) */
+                        memcpy(GMNAL_NETBUF_MSG(&ltxb->txb_buf),
+                               GMNAL_NETBUF_MSG(&tx->tx_buf), tx->tx_msgnob);
+
+                        /* 2. Copy the payload */
+                        if (tx->tx_large_iskiov)
+                                lnet_copy_kiov2kiov(
+                                        gmni->gmni_large_pages,
+                                        ltxb->txb_buf.nb_kiov,
+                                        tx->tx_msgnob,
+                                        tx->tx_large_niov,
+                                        tx->tx_large_frags.kiov,
+                                        tx->tx_large_offset,
+                                        tx->tx_large_nob);
+                        else
+                                lnet_copy_iov2kiov(
+                                        gmni->gmni_large_pages,
+                                        ltxb->txb_buf.nb_kiov,
+                                        tx->tx_msgnob,
+                                        tx->tx_large_niov,
+                                        tx->tx_large_frags.iov,
+                                        tx->tx_large_offset,
+                                        tx->tx_large_nob);
+
+                        tx->tx_msgnob += tx->tx_large_nob;
+
+                        spin_lock(&gmni->gmni_tx_lock);
                 }
 
-                gmnal_post_rx(gmnalni, rx);
-       }
+                list_add_tail(&tx->tx_list, &gmni->gmni_cred_txq);
+        }
 
-       CDEBUG(D_NET, "exiting\n");
-        atomic_dec(&gmnalni->gmni_nthreads);
-       return 0;
+        if (!list_empty(&gmni->gmni_cred_txq) &&
+            gmni->gmni_tx_credits != 0) {
+
+                tx = list_entry(gmni->gmni_cred_txq.next, gmnal_tx_t, tx_list);
+
+                /* consume tx and 1 credit */
+                list_del(&tx->tx_list);
+                gmni->gmni_tx_credits--;
+
+                spin_unlock(&gmni->gmni_tx_lock);
+
+                /* Unlocking here allows sends to get re-ordered, but we want
+                 * to allow other CPUs to progress... */
+
+                LASSERT(!tx->tx_credit);
+                tx->tx_credit = 1;
+
+                tx->tx_launchtime = jiffies;
+
+                if (tx->tx_msgnob <= gmni->gmni_small_msgsize) {
+                        LASSERT (tx->tx_ltxb == NULL);
+                        netaddr = GMNAL_NETBUF_LOCAL_NETADDR(&tx->tx_buf);
+                        gmsize = gmni->gmni_small_gmsize;
+                        pri = GMNAL_SMALL_PRIORITY;
+                } else {
+                        LASSERT (tx->tx_ltxb != NULL);
+                        netaddr = GMNAL_NETBUF_LOCAL_NETADDR(&tx->tx_ltxb->txb_buf);
+                        gmsize = gmni->gmni_large_gmsize;
+                        pri = GMNAL_LARGE_PRIORITY;
+                }
+
+                spin_lock(&gmni->gmni_gm_lock);
+
+                gm_send_to_peer_with_callback(gmni->gmni_port, 
+                                              netaddr, gmsize, 
+                                              tx->tx_msgnob,
+                                              pri, 
+                                              tx->tx_gmlid,
+                                              gmnal_tx_callback, 
+                                              (void*)tx);
+
+                spin_unlock(&gmni->gmni_gm_lock);
+                spin_lock(&gmni->gmni_tx_lock);
+        }
 }
 
 void
-gmnal_post_rx(gmnal_ni_t *gmnalni, gmnal_rx_t *rx)
+gmnal_post_rx(gmnal_ni_t *gmni, gmnal_rx_t *rx)
 {
-       CDEBUG(D_NET, "requeueing rx[%p] gmnalni[%p]\n", rx, gmnalni);
-
-       spin_lock(&gmnalni->gmni_gm_lock);
-       gm_provide_receive_buffer_with_tag(gmnalni->gmni_port, rx->rx_msg,
-                                           rx->rx_gmsize, GM_LOW_PRIORITY, 0 );
-       spin_unlock(&gmnalni->gmni_gm_lock);
+        int   gmsize = rx->rx_islarge ? gmni->gmni_large_gmsize :
+                                        gmni->gmni_small_gmsize;
+        int   pri    = rx->rx_islarge ? GMNAL_LARGE_PRIORITY :
+                                        GMNAL_SMALL_PRIORITY;
+        void *buffer = GMNAL_NETBUF_LOCAL_NETADDR(&rx->rx_buf);
+
+       CDEBUG(D_NET, "posting rx %p buf %p\n", rx, buffer);
+
+       spin_lock(&gmni->gmni_gm_lock);
+        gm_provide_receive_buffer_with_tag(gmni->gmni_port, 
+                                           buffer, gmsize, pri, 0);
+       spin_unlock(&gmni->gmni_gm_lock);
 }
 
-void 
-gmnal_resume_sending_callback(struct gm_port *gm_port, void *context,
-                              gm_status_t status)
+void
+gmnal_version_reply (gmnal_ni_t *gmni, gmnal_rx_t *rx)
 {
-       gmnal_tx_t      *tx = (gmnal_tx_t*)context;
-       gmnal_ni_t      *gmnalni = tx->tx_gmni;
-       lib_msg_t       *libmsg = tx->tx_libmsg;
+        /* Future protocol version compatibility support!
+         * The next gmlnd-specific protocol rev will first send a message to
+         * check version; I reply with a stub message containing my current
+         * magic+version... */
+        gmnal_msg_t *msg;
+        gmnal_tx_t  *tx = gmnal_get_tx(gmni);
+
+        if (tx == NULL) {
+                CERROR("Can't allocate tx to send version info to %u\n",
+                       rx->rx_recv_gmid);
+                return;
+        }
 
-        CWARN("status for tx [%p] is [%d][%s]\n", 
-              tx, status, gmnal_gmstatus2str(status));
+        LASSERT (tx->tx_lntmsg == NULL);        /* no finalize */
 
-        gmnal_return_tx(gmnalni, tx);
-        lib_finalize(gmnalni->gmni_libnal, NULL, libmsg, PTL_FAIL);
-}
+        tx->tx_nid = LNET_NID_ANY;
+        tx->tx_gmlid = rx->rx_recv_gmid;
 
-void 
-gmnal_drop_sends_callback(struct gm_port *gm_port, void *context, 
-                          gm_status_t status)
-{
-       gmnal_tx_t      *tx = (gmnal_tx_t*)context;
-       gmnal_ni_t      *gmnalni = tx->tx_gmni;
+        msg = GMNAL_NETBUF_MSG(&tx->tx_buf);
+        msg->gmm_magic   = GMNAL_MSG_MAGIC;
+        msg->gmm_version = GMNAL_MSG_VERSION;
+
+        /* just send magic + version */
+        tx->tx_msgnob = offsetof(gmnal_msg_t, gmm_type);
+        tx->tx_large_nob = 0;
 
-        CERROR("status for tx [%p] is [%d][%s]\n", 
-               tx, status, gmnal_gmstatus2str(status));
+        spin_lock(&gmni->gmni_tx_lock);
 
-        spin_lock(&gmnalni->gmni_gm_lock);
-        gm_resume_sending(gmnalni->gmni_port, tx->tx_gm_priority,
-                          tx->tx_gmlid, gm_port_id,
-                          gmnal_resume_sending_callback, tx);
-        spin_unlock(&gmnalni->gmni_gm_lock);
+        list_add_tail(&tx->tx_list, &gmni->gmni_buf_txq);
+        gmnal_check_txqueues_locked(gmni);
+
+        spin_unlock(&gmni->gmni_tx_lock);
 }
 
-void 
-gmnal_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status)
+int
+gmnal_rx_thread(void *arg)
 {
-       gmnal_tx_t      *tx = (gmnal_tx_t*)context;
-       gmnal_ni_t      *gmnalni = tx->tx_gmni;
-       lib_nal_t       *libnal = gmnalni->gmni_libnal;
-       lib_msg_t       *libmsg = tx->tx_libmsg;
-        ptl_err_t        rc;
+       gmnal_ni_t      *gmni = arg;
+       gm_recv_event_t *rxevent = NULL;
+       gm_recv_t       *recv = NULL;
+        gmnal_rx_t      *rx;
+        int              rc;
 
-       if (!tx) {
-               CERROR("send completion event for unknown tx\n");
-               return;
-       }
+       cfs_daemonize("gmnal_rxd");
 
-       switch(status) {
-        case(GM_SUCCESS):
-                rc = PTL_OK;
-                break;
+        down(&gmni->gmni_rx_mutex);
 
-        case(GM_SEND_DROPPED):
-                rc = PTL_FAIL;
-                break;
+       while (!gmni->gmni_shutdown) {
+
+                spin_lock(&gmni->gmni_gm_lock);
+               rxevent = gm_blocking_receive_no_spin(gmni->gmni_port);
+                spin_unlock(&gmni->gmni_gm_lock);
+
+                switch (GM_RECV_EVENT_TYPE(rxevent)) {
+                default:
+                        gm_unknown(gmni->gmni_port, rxevent);
+                        continue;
+
+                case GM_FAST_RECV_EVENT:
+                case GM_FAST_PEER_RECV_EVENT:
+                case GM_PEER_RECV_EVENT:
+                case GM_FAST_HIGH_RECV_EVENT:
+                case GM_FAST_HIGH_PEER_RECV_EVENT:
+                case GM_HIGH_PEER_RECV_EVENT:
+                case GM_RECV_EVENT:
+                case GM_HIGH_RECV_EVENT:
+                        break;
+                }
+                
+                recv = &rxevent->recv;
+                rx = gm_hash_find(gmni->gmni_rx_hash, 
+                                  gm_ntohp(recv->buffer));
+                LASSERT (rx != NULL);
+
+                rx->rx_recv_nob  = gm_ntoh_u32(recv->length);
+                rx->rx_recv_gmid = gm_ntoh_u16(recv->sender_node_id);
+                rx->rx_recv_port = gm_ntoh_u8(recv->sender_port_id);
+                rx->rx_recv_type = gm_ntoh_u8(recv->type);
+
+                switch (GM_RECV_EVENT_TYPE(rxevent)) {
+                case GM_FAST_RECV_EVENT:
+                case GM_FAST_PEER_RECV_EVENT:
+                case GM_FAST_HIGH_RECV_EVENT:
+                case GM_FAST_HIGH_PEER_RECV_EVENT:
+                        LASSERT (rx->rx_recv_nob <= PAGE_SIZE);
+
+                        memcpy(GMNAL_NETBUF_MSG(&rx->rx_buf),
+                               gm_ntohp(recv->message), rx->rx_recv_nob);
+                        break;
+                }
+
+                up(&gmni->gmni_rx_mutex);
+
+                CDEBUG (D_NET, "rx %p: buf %p(%p) nob %d\n", rx, 
+                        GMNAL_NETBUF_LOCAL_NETADDR(&rx->rx_buf),
+                        gm_ntohp(recv->buffer), rx->rx_recv_nob);
+
+                /* We're connectionless: simply drop packets with
+                 * errors */
+                rc = gmnal_unpack_msg(gmni, rx);
+
+                if (rc == 0) {
+                        gmnal_msg_t *msg = GMNAL_NETBUF_MSG(&rx->rx_buf);
                         
-        default:
-                CERROR("Error %d(%s), nid "LPD64"\n",
-                       status, gmnal_gmstatus2str(status), tx->tx_nid);
+                        LASSERT (msg->gmm_type == GMNAL_MSG_IMMEDIATE);
+                        rc =  lnet_parse(gmni->gmni_ni, 
+                                         &msg->gmm_u.immediate.gmim_hdr,
+                                         msg->gmm_srcnid,
+                                         rx, 0);
+                } else if (rc > 0) {
+                        gmnal_version_reply(gmni, rx);
+                        rc = -EPROTO;           /* repost rx */
+                }
 
-                spin_lock(&gmnalni->gmni_gm_lock);
-                gm_drop_sends(gmnalni->gmni_port, tx->tx_gm_priority, 
-                              tx->tx_gmlid, gm_port_id, 
-                              gmnal_drop_sends_callback, tx);
-                spin_unlock(&gmnalni->gmni_gm_lock);
-               return;
+                if (rc < 0)                     /* parse failure */
+                        gmnal_post_rx(gmni, rx);
+
+                down(&gmni->gmni_rx_mutex);
        }
 
-       gmnal_return_tx(gmnalni, tx);
-       lib_finalize(libnal, NULL, libmsg, rc);
-       return;
+        up(&gmni->gmni_rx_mutex);
+
+       CDEBUG(D_NET, "exiting\n");
+        atomic_dec(&gmni->gmni_nthreads);
+       return 0;
 }
 
-ptl_err_t
-gmnal_post_tx (gmnal_ni_t *gmnalni, gmnal_tx_t *tx, 
-               lib_msg_t *libmsg, ptl_nid_t nid, int nob)
+void
+gmnal_stop_threads(gmnal_ni_t *gmni)
 {
-        gm_status_t  gm_status;
+        int count = 2;
 
-       CDEBUG(D_NET, "send %d bytes to "LPU64"\n", nob, nid);
+        gmni->gmni_shutdown = 1;
+        mb();
+        
+        /* wake rxthread owning gmni_rx_mutex with an alarm. */
+       spin_lock(&gmni->gmni_gm_lock);
+       gm_set_alarm(gmni->gmni_port, &gmni->gmni_alarm, 0, NULL, NULL);
+       spin_unlock(&gmni->gmni_gm_lock);
+
+       while (atomic_read(&gmni->gmni_nthreads) != 0) {
+                count++;
+                if ((count & (count - 1)) == 0)
+                        CWARN("Waiting for %d threads to stop\n",
+                              atomic_read(&gmni->gmni_nthreads));
+                gmnal_yield(1);
+       }
+}
 
-        LASSERT ((nid >> 32) == 0);
+int
+gmnal_start_threads(gmnal_ni_t *gmni)
+{
+        int     i;
+        int     pid;
 
-       gm_status = gm_global_id_to_node_id(gmnalni->gmni_port, (__u32)nid, 
-                                            &tx->tx_gmlid);
-       if (gm_status != GM_SUCCESS) {
-               CERROR("Failed to obtain local id\n");
-                gmnal_return_tx(gmnalni, tx);
-               return PTL_FAIL;
-       }
+        LASSERT (!gmni->gmni_shutdown);
+        LASSERT (atomic_read(&gmni->gmni_nthreads) == 0);
 
-       CDEBUG(D_NET, "Local Node_id is [%u][%x]\n", 
-               tx->tx_gmlid, tx->tx_gmlid);
+       gm_initialize_alarm(&gmni->gmni_alarm);
 
-        tx->tx_nid = nid;
-       tx->tx_libmsg = libmsg;
-       tx->tx_gm_priority = GM_LOW_PRIORITY;
-       tx->tx_msg_size = nob;
+       for (i = 0; i < num_online_cpus(); i++) {
 
-       CDEBUG(D_NET, "Calling gm_send_to_peer port [%p] buffer [%p] "
-              "gmsize [%lu] msize [%d] nid ["LPU64"] local_gmid[%d] "
-              "tx [%p]\n", gmnalni->gmni_port, tx->tx_msg, 
-               tx->tx_gm_size, tx->tx_msg_size, 
-               tx->tx_nid, tx->tx_gmlid, tx);
+                pid = kernel_thread(gmnal_rx_thread, (void*)gmni, 0);
+                if (pid < 0) {
+                        CERROR("rx thread failed to start: %d\n", pid);
+                        gmnal_stop_threads(gmni);
+                        return pid;
+                }
 
-       spin_lock(&gmnalni->gmni_gm_lock);
-       gm_send_to_peer_with_callback(gmnalni->gmni_port, tx->tx_msg,
-                                     tx->tx_gm_size, tx->tx_msg_size,
-                                      tx->tx_gm_priority, tx->tx_gmlid,
-                                     gmnal_tx_callback, (void*)tx);
-       spin_unlock(&gmnalni->gmni_gm_lock);
+                atomic_inc(&gmni->gmni_nthreads);
+       }
 
-       return PTL_OK;
+       return 0;
 }
index 449c331..114a286 100644 (file)
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#include "gmnal.h"
-
-
-int num_txds = 5;
-int gm_port_id = 4;
+#include "gmlnd.h"
+
+
+static int port = 4;
+CFS_MODULE_PARM(port, "i", int, 0444,
+                "GM port to use for communications");
+
+static int ntx = 256;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+                "# tx descriptors");
+
+static int credits = 128;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+                "# concurrent sends");
+
+static int peer_credits = 8;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+                "# concurrent sends per peer");
+
+static int nlarge_tx_bufs = 32;
+CFS_MODULE_PARM(nlarge_tx_bufs, "i", int, 0444,
+                "# large tx message buffers");
+
+static int nrx_small = 128;
+CFS_MODULE_PARM(nrx_small, "i", int, 0444,
+                "# small rx message buffers");
+
+static int nrx_large = 64;
+CFS_MODULE_PARM(nrx_large, "i", int, 0444,
+                "# large rx message buffers");
+
+gmnal_tunables_t gmnal_tunables = {
+        .gm_port            = &port,
+        .gm_ntx             = &ntx,
+        .gm_credits         = &credits,
+        .gm_peer_credits    = &peer_credits,
+        .gm_nlarge_tx_bufs  = &nlarge_tx_bufs,
+        .gm_nrx_small       = &nrx_small,
+        .gm_nrx_large       = &nrx_large,
+};
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+static ctl_table gmnal_ctl_table[] = {
+       {1, "port", &port,
+        sizeof (int), 0444, NULL, &proc_dointvec},
+       {2, "ntx", &ntx, 
+        sizeof (int), 0444, NULL, &proc_dointvec},
+       {3, "credits", &credits,
+        sizeof (int), 0444, NULL, &proc_dointvec},
+       {4, "peer_credits", &peer_credits,
+        sizeof (int), 0444, NULL, &proc_dointvec},
+       {5, "nlarge_tx_bufs", &nlarge_tx_bufs,
+        sizeof (int), 0444, NULL, &proc_dointvec},
+       {6, "nrx_small", &nrx_small,
+        sizeof (int), 0444, NULL, &proc_dointvec},
+       {7, "nrx_large", &nrx_large,
+        sizeof (int), 0444, NULL, &proc_dointvec},
+       {0}
+};
+
+static ctl_table gmnal_top_ctl_table[] = {
+       {207, "gmnal", NULL, 0, 0555, gmnal_ctl_table},
+       {0}
+};
+#endif
 
 static int __init
 gmnal_load(void)
@@ -31,10 +91,16 @@ gmnal_load(void)
        int     status;
        CDEBUG(D_TRACE, "This is the gmnal module initialisation routine\n");
 
-
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+        gmnal_tunables.gm_sysctl =
+                register_sysctl_table(gmnal_top_ctl_table, 0);
+        
+        if (gmnal_tunables.gm_sysctl == NULL)
+                CWARN("Can't setup /proc tunables\n");
+#endif
        CDEBUG(D_NET, "Calling gmnal_init\n");
         status = gmnal_init();
-       if (status == PTL_OK) {
+       if (status == 0) {
                CDEBUG(D_NET, "Portals GMNAL initialised ok\n");
        } else {
                CDEBUG(D_NET, "Portals GMNAL Failed to initialise\n");
@@ -46,24 +112,19 @@ gmnal_load(void)
        return(0);
 }
 
-
 static void __exit
 gmnal_unload(void)
 {
        gmnal_fini();
-       return;
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+        if (gmnal_tunables.gm_sysctl != NULL)
+                unregister_sysctl_table(gmnal_tunables.gm_sysctl);
+#endif
 }
 
-
 module_init(gmnal_load);
 module_exit(gmnal_unload);
 
-MODULE_PARM(num_rx_threads, "i");
-MODULE_PARM(num_txds, "i");
-MODULE_PARM(gm_port_id, "i");
-
-MODULE_AUTHOR("Morgan Doyle");
-
-MODULE_DESCRIPTION("A Portals kernel NAL for Myrinet GM.");
-
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel GM LND v1.01");
 MODULE_LICENSE("GPL");
index 00bedf5..9810731 100644 (file)
  *   along with Lustre; if not, write to the Free Software
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
-/*
- *     All utilities required by lgmanl
- */
 
-#include "gmnal.h"
+#include "gmlnd.h"
+
+void
+gmnal_free_netbuf_pages (gmnal_netbuf_t *nb, int npages) 
+{
+        int     i;
+        
+        for (i = 0; i < npages; i++)
+                __free_page(nb->nb_kiov[i].kiov_page);
+}
 
-/*
- *     Am I one of the gmnal rxthreads ?
- */
 int
-gmnal_is_rxthread(gmnal_ni_t *gmnalni)
+gmnal_alloc_netbuf_pages (gmnal_ni_t *gmni, gmnal_netbuf_t *nb, int npages)
 {
-       int i;
+        int          i;
+        gm_status_t  gmrc;
 
-       for (i = 0; i < gmnalni->gmni_nrxthreads; i++)
-               if (gmnalni->gmni_rxthread_pid[i] == current->pid)
-                       return 1;
-       return 0;
+        LASSERT (npages > 0);
+
+        for (i = 0; i < npages; i++) {
+                
+                nb->nb_kiov[i].kiov_page = alloc_page(GFP_KERNEL);
+                nb->nb_kiov[i].kiov_offset = 0;
+                nb->nb_kiov[i].kiov_len = PAGE_SIZE;
+
+                if (nb->nb_kiov[i].kiov_page == NULL) {
+                        CERROR("Can't allocate page\n");
+                        gmnal_free_netbuf_pages(nb, i);
+                        return -ENOMEM;
+                }
+
+                CDEBUG(D_NET,"[%3d] page %p, phys "LPX64", @ "LPX64"\n",
+                       i, nb->nb_kiov[i].kiov_page, 
+                       lnet_page2phys(nb->nb_kiov[i].kiov_page),
+                       gmni->gmni_netaddr_base);
+
+                gmrc = gm_register_memory_ex_phys(
+                        gmni->gmni_port,
+                        lnet_page2phys(nb->nb_kiov[i].kiov_page),
+                        PAGE_SIZE,
+                        gmni->gmni_netaddr_base);
+                CDEBUG(D_NET,"[%3d] page %p: %d\n", 
+                       i, nb->nb_kiov[i].kiov_page, gmrc);
+
+                if (gmrc != GM_SUCCESS) {
+                        CERROR("Can't map page: %d(%s)\n", gmrc,
+                               gmnal_gmstatus2str(gmrc));
+                        gmnal_free_netbuf_pages(nb, i+1);
+                        return -ENOMEM;
+                }
+                
+                if (i == 0) 
+                        nb->nb_netaddr = gmni->gmni_netaddr_base;
+                
+                gmni->gmni_netaddr_base += PAGE_SIZE;
+        }
+        
+        return 0;
 }
 
-gmnal_tx_t *
-gmnal_alloc_tx (gmnal_ni_t *gmnalni) 
+void
+gmnal_free_ltxbuf (gmnal_ni_t *gmni, gmnal_txbuf_t *txb)
 {
-        gmnal_tx_t  *tx;
-        void        *buffer;
+        int            npages = gmni->gmni_large_pages;
+
+        LASSERT (gmni->gmni_port == NULL);
+        /* No unmapping; the port has been closed */
+
+        gmnal_free_netbuf_pages(&txb->txb_buf, gmni->gmni_large_pages);
+        LIBCFS_FREE(txb, offsetof(gmnal_txbuf_t, txb_buf.nb_kiov[npages]));
+}
+
+int
+gmnal_alloc_ltxbuf (gmnal_ni_t *gmni)
+{
+        int            npages = gmni->gmni_large_pages;
+        int            sz = offsetof(gmnal_txbuf_t, txb_buf.nb_kiov[npages]);
+        gmnal_txbuf_t *txb;
+        int            rc;
         
-        PORTAL_ALLOC(tx, sizeof(*tx));
-        if (tx == NULL) {
-                CERROR ("Failed to allocate tx\n");
-                return NULL;
+        LIBCFS_ALLOC(txb, sz);
+        if (txb == NULL) {
+                CERROR("Can't allocate large txbuffer\n");
+                return -ENOMEM;
         }
-        
-        buffer = gm_dma_malloc(gmnalni->gmni_port, gmnalni->gmni_msg_size);
-        if (buffer == NULL) {
-                CERROR("Failed to gm_dma_malloc tx buffer size [%d]\n", 
-                       gmnalni->gmni_msg_size);
-                PORTAL_FREE(tx, sizeof(*tx));
-                return NULL;
+
+        rc = gmnal_alloc_netbuf_pages(gmni, &txb->txb_buf, npages);
+        if (rc != 0) {
+                LIBCFS_FREE(txb, sz);
+                return rc;
         }
 
-        memset(tx, 0, sizeof(*tx));
-        tx->tx_msg = (gmnal_msg_t *)buffer;
-        tx->tx_buffer_size = gmnalni->gmni_msg_size;
-        tx->tx_gm_size = gm_min_size_for_length(tx->tx_buffer_size);
-        tx->tx_gmni = gmnalni;
+        list_add_tail(&txb->txb_list, &gmni->gmni_idle_ltxbs);
 
-        CDEBUG(D_NET, "Created tx [%p] with buffer [%p], size [%d]\n", 
-               tx, tx->tx_msg, tx->tx_buffer_size);
+        txb->txb_next = gmni->gmni_ltxbs;
+        gmni->gmni_ltxbs = txb;
 
-        return tx;
+        return 0;
 }
 
 void
 gmnal_free_tx (gmnal_tx_t *tx)
 {
-        gmnal_ni_t *gmnalni = tx->tx_gmni;
-        
-        CDEBUG(D_NET, "Freeing tx [%p] with buffer [%p], size [%d]\n", 
-               tx, tx->tx_msg, tx->tx_buffer_size);
-#if 0
-        /* We free buffers after we've closed the GM port */
-        gm_dma_free(gmnalni->gmni_port, tx->tx_msg);
-#endif
-        PORTAL_FREE(tx, sizeof(*tx));
+        LASSERT (tx->tx_gmni->gmni_port == NULL);
+
+        gmnal_free_netbuf_pages(&tx->tx_buf, 1);
+        LIBCFS_FREE(tx, sizeof(*tx));
 }
 
 int
-gmnal_alloc_txs(gmnal_ni_t *gmnalni)
+gmnal_alloc_tx (gmnal_ni_t *gmni) 
 {
-       int           ntxcred = gm_num_send_tokens(gmnalni->gmni_port);
-       int           ntx;
-        int           nrxt_tx;
-        int           i;
-       gmnal_tx_t   *tx;
-
-        CWARN("ntxcred: %d\n", ntxcred);
-
-       ntx = num_txds;
-        nrxt_tx = num_txds + 1;
-
-        if (ntx + nrxt_tx > ntxcred) {
-                CERROR ("Asked for %d + %d tx credits, but only %d available\n",
-                        ntx, nrxt_tx, ntxcred);
+        gmnal_tx_t  *tx;
+        int          rc;
+        
+        LIBCFS_ALLOC(tx, sizeof(*tx));
+        if (tx == NULL) {
+                CERROR("Failed to allocate tx\n");
                 return -ENOMEM;
         }
         
-       /* A semaphore is initialised with the number of transmit tokens
-        * available.  To get a stxd, acquire the token semaphore.  this
-        * decrements the available token count (if no tokens you block here,
-        * someone returning a stxd will release the semaphore and wake you)
-        * When token is obtained acquire the spinlock to manipulate the
-        * list */
-       sema_init(&gmnalni->gmni_tx_token, ntx);
-       spin_lock_init(&gmnalni->gmni_tx_lock);
-        LASSERT (gmnalni->gmni_tx == NULL);
-
-       for (i = 0; i <= ntx; i++) {
-                tx = gmnal_alloc_tx(gmnalni);
-               if (tx == NULL) {
-                        CERROR("Failed to create tx %d\n", i);
-                        return -ENOMEM;
-                }
-                
-                tx->tx_rxt = 0;
-               tx->tx_next = gmnalni->gmni_tx;
-               gmnalni->gmni_tx = tx;
-       }
-
-       sema_init(&gmnalni->gmni_rxt_tx_token, nrxt_tx);
-       spin_lock_init(&gmnalni->gmni_rxt_tx_lock);
-        LASSERT (gmnalni->gmni_rxt_tx == NULL);
+        memset(tx, 0, sizeof(*tx));
 
-       for (i = 0; i <= nrxt_tx; i++) {
-                tx = gmnal_alloc_tx(gmnalni);
-               if (tx == NULL) {
-                        CERROR("Failed to create tx %d + %d\n", ntx, i);
-                        return -ENOMEM;
-                }
+        rc = gmnal_alloc_netbuf_pages(gmni, &tx->tx_buf, 1);
+        if (rc != 0) {
+                LIBCFS_FREE(tx, sizeof(*tx));
+                return -ENOMEM;
+        }
 
-                tx->tx_rxt = 1;
-               tx->tx_next = gmnalni->gmni_rxt_tx;
-               gmnalni->gmni_rxt_tx = tx;
-       }
+        tx->tx_gmni = gmni;
+        
+        list_add_tail(&tx->tx_list, &gmni->gmni_idle_txs);
 
-       return 0;
+        tx->tx_next = gmni->gmni_txs;
+        gmni->gmni_txs = tx;
+                
+        return 0;
 }
 
 void
-gmnal_free_txs(gmnal_ni_t *gmnalni)
+gmnal_free_rx(gmnal_ni_t *gmni, gmnal_rx_t *rx)
 {
-       gmnal_tx_t *tx;
-
-        while ((tx = gmnalni->gmni_tx) != NULL) {
-                gmnalni->gmni_tx = tx->tx_next;
-                gmnal_free_tx (tx);
-       }
+        int   npages = rx->rx_islarge ? gmni->gmni_large_pages : 1;
+        
+        LASSERT (gmni->gmni_port == NULL);
 
-        while ((tx = gmnalni->gmni_rxt_tx) != NULL) {
-                gmnalni->gmni_rxt_tx = tx->tx_next;
-                gmnal_free_tx (tx);
-       }
+        gmnal_free_netbuf_pages(&rx->rx_buf, npages);
+        LIBCFS_FREE(rx, offsetof(gmnal_rx_t, rx_buf.nb_kiov[npages]));
 }
 
-
-/*
- *     Get a tx from the list
- *     This get us a wired and gm_registered small tx buffer.
- *     This implicitly gets us a send token also.
- */
-gmnal_tx_t *
-gmnal_get_tx(gmnal_ni_t *gmnalni, int block)
+int
+gmnal_alloc_rx (gmnal_ni_t *gmni, int islarge)
 {
+        int         npages = islarge ? gmni->gmni_large_pages : 1;
+        int         sz = offsetof(gmnal_rx_t, rx_buf.nb_kiov[npages]);
+        int         rc;
+        gmnal_rx_t *rx;
+        gm_status_t gmrc;
+        
+        LIBCFS_ALLOC(rx, sz);
+        if (rx == NULL) {
+                CERROR("Failed to allocate rx\n");
+                return -ENOMEM;
+        }
+        
+        memset(rx, 0, sizeof(*rx));
 
-       gmnal_tx_t      *tx = NULL;
-       pid_t           pid = current->pid;
-
-
-       CDEBUG(D_TRACE, "gmnal_get_tx gmnalni [%p] block[%d] pid [%d]\n", 
-              gmnalni, block, pid);
-
-       if (gmnal_is_rxthread(gmnalni)) {
-                CDEBUG(D_NET, "RXTHREAD Attempting to get token\n");
-               down(&gmnalni->gmni_rxt_tx_token);
-               spin_lock(&gmnalni->gmni_rxt_tx_lock);
-               tx = gmnalni->gmni_rxt_tx;
-               gmnalni->gmni_rxt_tx = tx->tx_next;
-               spin_unlock(&gmnalni->gmni_rxt_tx_lock);
-               CDEBUG(D_NET, "RXTHREAD got [%p], head is [%p]\n", 
-                      tx, gmnalni->gmni_rxt_tx);
-                tx->tx_rxt = 1;
-        } else {
-               if (block) {
-                        CDEBUG(D_NET, "Attempting to get token\n");
-                       down(&gmnalni->gmni_tx_token);
-                        CDEBUG(D_PORTALS, "Got token\n");
-               } else {
-                       if (down_trylock(&gmnalni->gmni_tx_token)) {
-                               CERROR("can't get token\n");
-                               return(NULL);
-                       }
-               }
-               spin_lock(&gmnalni->gmni_tx_lock);
-               tx = gmnalni->gmni_tx;
-               gmnalni->gmni_tx = tx->tx_next;
-               spin_unlock(&gmnalni->gmni_tx_lock);
-               CDEBUG(D_NET, "got [%p], head is [%p]\n", tx,
-                      gmnalni->gmni_tx);
-        }       /* general tx get */
-
-       return tx;
+        rc = gmnal_alloc_netbuf_pages(gmni, &rx->rx_buf, npages);
+        if (rc != 0) {
+                LIBCFS_FREE(rx, sz);
+                return rc;
+        }
+        
+        rx->rx_islarge = islarge;
+        rx->rx_next = gmni->gmni_rxs;
+        gmni->gmni_rxs = rx;
+
+        gmrc = gm_hash_insert(gmni->gmni_rx_hash, 
+                              GMNAL_NETBUF_LOCAL_NETADDR(&rx->rx_buf), rx);
+        if (gmrc != GM_SUCCESS) {
+                CERROR("Couldn't add rx to hash table: %d\n", gmrc);
+                return -ENOMEM;
+        }
+        
+        return 0;
 }
 
-/*
- *     Return a tx to the list
- */
 void
-gmnal_return_tx(gmnal_ni_t *gmnalni, gmnal_tx_t *tx)
+gmnal_free_ltxbufs (gmnal_ni_t *gmni)
 {
-       CDEBUG(D_TRACE, "gmnalni [%p], tx[%p] rxt[%d]\n", gmnalni,
-              tx, tx->tx_rxt);
-
-        /*
-         *      this transmit descriptor is 
-         *      for the rxthread
-         */
-        if (tx->tx_rxt) {
-               spin_lock(&gmnalni->gmni_rxt_tx_lock);
-               tx->tx_next = gmnalni->gmni_rxt_tx;
-               gmnalni->gmni_rxt_tx = tx;
-               spin_unlock(&gmnalni->gmni_rxt_tx_lock);
-               up(&gmnalni->gmni_rxt_tx_token);
-                CDEBUG(D_NET, "Returned tx to rxthread list\n");
-        } else {
-               spin_lock(&gmnalni->gmni_tx_lock);
-               tx->tx_next = gmnalni->gmni_tx;
-               gmnalni->gmni_tx = tx;
-               spin_unlock(&gmnalni->gmni_tx_lock);
-               up(&gmnalni->gmni_tx_token);
-                CDEBUG(D_NET, "Returned tx to general list\n");
+        gmnal_txbuf_t *txb;
+        
+        while ((txb = gmni->gmni_ltxbs) != NULL) {
+                gmni->gmni_ltxbs = txb->txb_next;
+                gmnal_free_ltxbuf(gmni, txb);
         }
-       return;
 }
 
-
-/*
- *     allocate a number of small rx buffers and register with GM
- *     so they are wired and set up for DMA. This is a costly operation.
- *     Also allocate a corrosponding descriptor to keep track of 
- *     the buffer.
- *     Put all descriptors on singly linked list to be available to 
- *     receive thread.
- */
 int
-gmnal_alloc_rxs (gmnal_ni_t *gmnalni)
+gmnal_alloc_ltxbufs (gmnal_ni_t *gmni)
 {
-        int          nrxcred = gm_num_receive_tokens(gmnalni->gmni_port);
-        int          nrx;
-        int          i;
-       gmnal_rx_t  *rxd;
-       void        *rxbuffer;
-
-        CWARN("nrxcred: %d\n", nrxcred);
+        int     nlarge_tx_bufs = *gmnal_tunables.gm_nlarge_tx_bufs;
+        int     i;
+        int     rc;
 
-       nrx = num_txds*2 + 2;
-        if (nrx > nrxcred) {
-                CERROR("Can't allocate %d rx credits: (%d available)\n",
-                       nrx, nrxcred);
-                return -ENOMEM;
+        for (i = 0; i < nlarge_tx_bufs; i++) {
+                rc = gmnal_alloc_ltxbuf(gmni);
+                
+                if (rc != 0)
+                        return rc;
         }
 
-       CDEBUG(D_NET, "Allocated [%d] receive tokens to small messages\n", nrx);
+        return 0;
+}
 
-       gmnalni->gmni_rx_hash = gm_create_hash(gm_hash_compare_ptrs, 
-                                               gm_hash_hash_ptr, 0, 0, nrx, 0);
-       if (gmnalni->gmni_rx_hash == NULL) {
-                CERROR("Failed to create hash table\n");
-                return -ENOMEM;
-       }
+void
+gmnal_free_txs(gmnal_ni_t *gmni)
+{
+       gmnal_tx_t *tx;
 
-        LASSERT (gmnalni->gmni_rx == NULL);
-
-       for (i=0; i <= nrx; i++) {
-
-               PORTAL_ALLOC(rxd, sizeof(*rxd));
-               if (rxd == NULL) {
-                       CERROR("Failed to malloc rxd [%d]\n", i);
-                       return -ENOMEM;
-               }
-
-               rxbuffer = gm_dma_malloc(gmnalni->gmni_port, 
-                                        gmnalni->gmni_msg_size);
-               if (rxbuffer == NULL) {
-                       CERROR("Failed to gm_dma_malloc rxbuffer [%d], "
-                              "size [%d]\n",i ,gmnalni->gmni_msg_size);
-                       PORTAL_FREE(rxd, sizeof(*rxd));
-                       return -ENOMEM;
-               }
-
-               rxd->rx_msg = (gmnal_msg_t *)rxbuffer;
-               rxd->rx_size = gmnalni->gmni_msg_size;
-               rxd->rx_gmsize = gm_min_size_for_length(rxd->rx_size);
-
-               rxd->rx_next = gmnalni->gmni_rx;
-               gmnalni->gmni_rx = rxd;
-
-               if (gm_hash_insert(gmnalni->gmni_rx_hash,
-                                  (void*)rxbuffer, (void*)rxd)) {
-                       CERROR("failed to create hash entry rxd[%p] "
-                              "for rxbuffer[%p]\n", rxd, rxbuffer);
-                       return -ENOMEM;
-               }
-
-               CDEBUG(D_NET, "Registered rxd [%p] with buffer [%p], "
-                      "size [%d]\n", rxd, rxd->rx_msg, rxd->rx_size);
+        while ((tx = gmni->gmni_txs) != NULL) {
+                gmni->gmni_txs = tx->tx_next;
+                gmnal_free_tx (tx);
        }
-
-       return 0;
 }
 
-void
-gmnal_free_rxs(gmnal_ni_t *gmnalni)
+int
+gmnal_alloc_txs(gmnal_ni_t *gmni)
 {
-       gmnal_rx_t *rx;
-
-       CDEBUG(D_TRACE, "gmnal_free_small rx\n");
+        int           ntxcred = gm_num_send_tokens(gmni->gmni_port);
+        int           ntx = *gmnal_tunables.gm_ntx;
+        int           i;
+        int           rc;
 
-       while ((rx = gmnalni->gmni_rx) != NULL) {
-                gmnalni->gmni_rx = rx->rx_next;
+        CDEBUG(D_NET, "ntxcred: %d\n", ntxcred);
+        gmni->gmni_tx_credits = ntxcred;
 
-               CDEBUG(D_NET, "Freeing rxd [%p] buffer [%p], size [%d]\n",
-                      rx, rx->rx_msg, rx->rx_size);
-#if 0
-                /* We free buffers after we've shutdown the GM port */
-               gm_dma_free(gmnalni->gmni_port, _rxd->rx_msg);
-#endif
-               PORTAL_FREE(rx, sizeof(*rx));
-       }
+        for (i = 0; i < ntx; i++) {
+                rc = gmnal_alloc_tx(gmni);
+                if (rc != 0)
+                        return rc;
+        }
 
-#if 0
-        /* see above */
-        if (gmnalni->gmni_rx_hash != NULL)
-                gm_destroy_hash(gmnalni->gmni_rx_hash);
-#endif
+        return 0;
 }
 
 void
-gmnal_stop_threads(gmnal_ni_t *gmnalni)
+gmnal_free_rxs(gmnal_ni_t *gmni)
 {
-        int count = 2;
-        int i;
+       gmnal_rx_t *rx;
 
-        gmnalni->gmni_thread_shutdown = 1;
+       while ((rx = gmni->gmni_rxs) != NULL) {
+                gmni->gmni_rxs = rx->rx_next;
 
-        /* wake ctthread with an alarm */
-       spin_lock(&gmnalni->gmni_gm_lock);
-       gm_set_alarm(gmnalni->gmni_port, &gmnalni->gmni_ctthread_alarm, 
-                     0, NULL, NULL);
-       spin_unlock(&gmnalni->gmni_gm_lock);
+                gmnal_free_rx(gmni, rx);
+        }
 
-        /* wake each rxthread */
-        for (i = 0; i < num_online_cpus(); i++)
-                up(&gmnalni->gmni_rxq_wait);
-        
-       while (atomic_read(&gmnalni->gmni_nthreads) != 0) {
-                count++;
-                if ((count & (count - 1)) == 0)
-                        CWARN("Waiting for %d threads to stop\n",
-                              atomic_read(&gmnalni->gmni_nthreads));
-                gmnal_yield(1);
-       }
+        LASSERT (gmni->gmni_port == NULL);
+#if 0
+        /* GM releases all resources allocated to a port when it closes */
+        if (gmni->gmni_rx_hash != NULL)
+                gm_destroy_hash(gmni->gmni_rx_hash);
+#endif
 }
 
-/*
- *     Start the caretaker thread and a number of receiver threads
- *     The caretaker thread gets events from the gm library.
- *     It passes receive events to the receiver threads via a work list.
- *     It processes other events itself in gm_unknown. These will be
- *     callback events or sleeps.
- */
 int
-gmnal_start_threads(gmnal_ni_t *gmnalni)
+gmnal_alloc_rxs (gmnal_ni_t *gmni)
 {
-        int     i;
-        int     pid;
-
-        gmnalni->gmni_thread_shutdown = 0;
-        gmnalni->gmni_nrxthreads = 0;
-        atomic_set(&gmnalni->gmni_nthreads, 0);
-
-        INIT_LIST_HEAD(&gmnalni->gmni_rxq);
-       spin_lock_init(&gmnalni->gmni_rxq_lock);
-       sema_init(&gmnalni->gmni_rxq_wait, 0);
-
-       /*
-        *      the alarm is used to wake the caretaker thread from 
-        *      gm_unknown call (sleeping) to exit it.
-        */
-       CDEBUG(D_NET, "Initializing caretaker thread alarm and flag\n");
-       gm_initialize_alarm(&gmnalni->gmni_ctthread_alarm);
-
-        pid = kernel_thread(gmnal_ct_thread, (void*)gmnalni, 0);
-       if (pid < 0) {
-               CERROR("Caretaker thread failed to start: %d\n", pid);
-               return pid;
-       }
-        atomic_inc(&gmnalni->gmni_nthreads);
-
-       for (i = 0; i < num_online_cpus(); i++) {
+        int          nrxcred = gm_num_receive_tokens(gmni->gmni_port);
+        int          nrx_small = *gmnal_tunables.gm_nrx_small;
+        int          nrx_large = *gmnal_tunables.gm_nrx_large;
+        int          nrx = nrx_large + nrx_small;
+        int          rc;
+        int          i;
 
-                pid = kernel_thread(gmnal_rx_thread, (void*)gmnalni, 0);
-                if (pid < 0) {
-                        CERROR("rx thread failed to start: %d\n", pid);
-                        gmnal_stop_threads(gmnalni);
-                        return pid;
-                }
+        CDEBUG(D_NET, "nrxcred: %d(%dL+%dS)\n", nrxcred, nrx_large, nrx_small);
 
-                atomic_inc(&gmnalni->gmni_nthreads);
-               gmnalni->gmni_rxthread_pid[i] = pid;
-                gmnalni->gmni_nrxthreads++;
+        if (nrx > nrxcred) {
+                int nlarge = (nrx_large * nrxcred)/nrx;
+                int nsmall = nrxcred - nlarge;
+                
+                CWARN("Only %d rx credits: "
+                      "reducing large %d->%d, small %d->%d\n", nrxcred,
+                      nrx_large, nlarge, nrx_small, nsmall);
+                
+                *gmnal_tunables.gm_nrx_large = nrx_large = nlarge;
+                *gmnal_tunables.gm_nrx_small = nrx_small = nsmall;
+                nrx = nlarge + nsmall;
+        }
+        
+       gmni->gmni_rx_hash = gm_create_hash(gm_hash_compare_ptrs, 
+                                            gm_hash_hash_ptr, 0, 0, nrx, 0);
+       if (gmni->gmni_rx_hash == NULL) {
+                CERROR("Failed to create hash table\n");
+                return -ENOMEM;
        }
 
+        for (i = 0; i < nrx; i++ ) {
+                rc = gmnal_alloc_rx(gmni, i < nrx_large);
+                if (rc != 0)
+                        return rc;
+        }
+
        return 0;
 }
 
@@ -674,62 +577,3 @@ gmnal_yield(int delay)
        set_current_state(TASK_INTERRUPTIBLE);
        schedule_timeout(delay);
 }
-
-int
-gmnal_enqueue_rx(gmnal_ni_t *gmnalni, gm_recv_t *recv)
-{
-        void       *ptr = gm_ntohp(recv->buffer);
-        gmnal_rx_t *rx = gm_hash_find(gmnalni->gmni_rx_hash, ptr);
-
-        /* No locking; hash is read-only */
-
-       LASSERT (rx != NULL);
-        LASSERT (rx->rx_msg == (gmnal_msg_t *)ptr);
-
-        rx->rx_recv_nob = gm_ntohl(recv->length);
-        rx->rx_recv_gmid = gm_ntoh_u16(recv->sender_node_id);
-        rx->rx_recv_port = gm_ntoh_u8(recv->sender_port_id);
-        rx->rx_recv_type = gm_ntoh_u8(recv->type);
-        
-       spin_lock(&gmnalni->gmni_rxq_lock);
-        list_add_tail (&rx->rx_list, &gmnalni->gmni_rxq);
-       spin_unlock(&gmnalni->gmni_rxq_lock);
-
-       up(&gmnalni->gmni_rxq_wait);
-       return 0;
-}
-
-gmnal_rx_t *
-gmnal_dequeue_rx(gmnal_ni_t *gmnalni)
-{
-       gmnal_rx_t      *rx;
-
-       CDEBUG(D_NET, "Getting entry to list\n");
-
-        for (;;) {
-               while(down_interruptible(&gmnalni->gmni_rxq_wait) != 0)
-                        /* do nothing */;
-
-               if (gmnalni->gmni_thread_shutdown)
-                       return NULL;
-
-               spin_lock(&gmnalni->gmni_rxq_lock);
-
-                if (list_empty(&gmnalni->gmni_rxq)) {
-                        rx = NULL;
-                } else {
-                        rx = list_entry(gmnalni->gmni_rxq.next,
-                                        gmnal_rx_t, rx_list);
-                        list_del(&rx->rx_list);
-                }
-
-               spin_unlock(&gmnalni->gmni_rxq_lock);
-
-                if (rx != NULL)
-                        return rx;
-                
-                CWARN("woken but no work\n");
-       }
-}
-
-
index e7934e2..7ee9b64 100644 (file)
@@ -1,5 +1,5 @@
-MODULES := kiibnal
-kiibnal-objs := iibnal.o iibnal_cb.o
+MODULES := kiiblnd
+kiiblnd-objs := iiblnd.o iiblnd_cb.o iiblnd_modparams.o
 
 EXTRA_POST_CFLAGS := @IIBCPPFLAGS@
 
index d61ffe7..d08d079 100644 (file)
@@ -4,12 +4,10 @@
 # See the file COPYING in this distribution
 
 if MODULES
-if !CRAY_PORTALS
-if BUILD_IIBNAL
-modulenet_DATA = kiibnal$(KMODEXT)
-endif
+if BUILD_IIBLND
+modulenet_DATA = kiiblnd$(KMODEXT)
 endif
 endif
 
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES = $(kiibnal-objs:%.o=%.c) iibnal.h
+MOSTLYCLEANFILES = @MOSTLYCLEANFILES@
+DIST_SOURCES = $(kiiblnd-objs:%.o=%.c) iiblnd.h
index 1ecd32d..27b31a5 100644 (file)
  *
  */
 
-#include "iibnal.h"
-
-nal_t                   kibnal_api;
-ptl_handle_ni_t         kibnal_ni;
-kib_tunables_t          kibnal_tunables;
-
-kib_data_t              kibnal_data = {
-        .kib_service_id = IBNAL_SERVICE_NUMBER,
+#include "iiblnd.h"
+
+lnd_t the_kiblnd = {
+        .lnd_type          = IIBLND,
+        .lnd_startup       = kibnal_startup,
+        .lnd_shutdown      = kibnal_shutdown,
+        .lnd_ctl           = kibnal_ctl,
+        .lnd_send          = kibnal_send,
+        .lnd_recv          = kibnal_recv,
+        .lnd_eager_recv    = kibnal_eager_recv,
 };
 
-#ifdef CONFIG_SYSCTL
-#define IBNAL_SYSCTL             202
+kib_data_t              kibnal_data;
 
-#define IBNAL_SYSCTL_TIMEOUT     1
+__u32 
+kibnal_cksum (void *ptr, int nob)
+{
+        char  *c  = ptr;
+        __u32  sum = 0;
 
-static ctl_table kibnal_ctl_table[] = {
-        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
-         &kibnal_tunables.kib_io_timeout, sizeof (int),
-         0644, NULL, &proc_dointvec},
-        { 0 }
-};
+        while (nob-- > 0)
+                sum = ((sum << 1) | (sum >> 31)) + *c++;
+        
+        /* ensure I don't return 0 (== no checksum) */
+        return (sum == 0) ? 1 : sum;
+}
 
-static ctl_table kibnal_top_ctl_table[] = {
-        {IBNAL_SYSCTL, "iibnal", NULL, 0, 0555, kibnal_ctl_table},
-        { 0 }
-};
-#endif
+void
+kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
+{
+        msg->ibm_type = type;
+        msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
+}
 
-#ifdef unused
 void
-print_service(IB_SERVICE_RECORD *service, char *tag, int rc)
+kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits, 
+                lnet_nid_t dstnid, __u64 dststamp, __u64 seq)
 {
-        char name[32];
+        /* CAVEAT EMPTOR! all message fields not set here should have been
+         * initialised previously. */
+        msg->ibm_magic    = IBNAL_MSG_MAGIC;
+        msg->ibm_version  = version;
+        /*   ibm_type */
+        msg->ibm_credits  = credits;
+        /*   ibm_nob */
+        msg->ibm_cksum    = 0;
+        msg->ibm_srcnid   = lnet_ptlcompat_srcnid(kibnal_data.kib_ni->ni_nid,
+                                                  dstnid);
+        msg->ibm_srcstamp = kibnal_data.kib_incarnation;
+        msg->ibm_dstnid   = dstnid;
+        msg->ibm_dststamp = dststamp;
+        msg->ibm_seq      = seq;
+
+        if (*kibnal_tunables.kib_cksum) {
+                /* NB ibm_cksum zero while computing cksum */
+                msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
+        }
+}
 
-        if (service == NULL) 
-        {
-                CWARN("tag       : %s\n"
-                      "status    : %d (NULL)\n", tag, rc);
-                return;
+void
+kibnal_pack_connmsg(kib_msg_t *msg, __u32 version, int nob, 
+                    int type, lnet_nid_t dstnid, __u64 dststamp)
+{
+        LASSERT (nob >= offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t));
+
+        memset(msg, 0, nob);
+        kibnal_init_msg(msg, type, sizeof(kib_connparams_t));
+
+        msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
+        msg->ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
+        msg->ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
+
+        kibnal_pack_msg(msg, version, 0, dstnid, dststamp, 0);
+}
+
+int
+kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob)
+{
+        const int hdr_size = offsetof(kib_msg_t, ibm_u);
+        __u32     msg_cksum;
+        __u32     msg_version;
+        int       flip;
+        int       msg_nob;
+#if !IBNAL_USE_FMR
+        int       i;
+        int       n;
+#endif
+        /* 6 bytes are enough to have received magic + version */
+        if (nob < 6) {
+                CERROR("Short message: %d\n", nob);
+                return -EPROTO;
+        }
+
+        /* Future protocol version compatibility support!
+         * If the iiblnd-specific protocol changes, or when LNET unifies
+         * protocols over all LNDs, the initial connection will negotiate a
+         * protocol version.  If I find this, I avoid any console errors.  If
+         * my is doing connection establishment, the reject will tell the peer
+         * which version I'm running. */
+
+        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
+                flip = 0;
+        } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
+                flip = 1;
+        } else {
+                if (msg->ibm_magic == LNET_PROTO_MAGIC ||
+                    msg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
+                        return -EPROTO;
+
+                /* Completely out to lunch */
+                CERROR("Bad magic: %08x\n", msg->ibm_magic);
+                return -EPROTO;
         }
-        strncpy (name, service->ServiceName, sizeof(name)-1);
-        name[sizeof(name)-1] = 0;
+
+        msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
+        if (expected_version == 0) {
+                if (msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
+                    msg_version != IBNAL_MSG_VERSION)
+                        return -EPROTO;
+        } else if (msg_version != expected_version) {
+                CERROR("Bad version: %x(%x expected)\n", 
+                       msg_version, expected_version);
+                return -EPROTO;
+        }
+
+        if (nob < hdr_size) {
+                CERROR("Short message: %d\n", nob);
+                return -EPROTO;
+        }
+
+        msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
+        if (msg_nob > nob) {
+                CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
+                return -EPROTO;
+        }
+
+        /* checksum must be computed with ibm_cksum zero and BEFORE anything
+         * gets flipped */
+        msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
+        msg->ibm_cksum = 0;
+        if (msg_cksum != 0 &&
+            msg_cksum != kibnal_cksum(msg, msg_nob)) {
+                CERROR("Bad checksum\n");
+                return -EPROTO;
+        }
+        msg->ibm_cksum = msg_cksum;
         
-        CWARN("tag       : %s\n"
-              "status    : %d\n"
-              "service id: "LPX64"\n"
-              "name      : %s\n"
-              "NID       : "LPX64"\n", tag, rc,
-              service->RID.ServiceID, name,
-              *kibnal_service_nid_field(service));
-}
+        if (flip) {
+                /* leave magic unflipped as a clue to peer endianness */
+                msg->ibm_version = msg_version;
+                CLASSERT (sizeof(msg->ibm_type) == 1);
+                CLASSERT (sizeof(msg->ibm_credits) == 1);
+                msg->ibm_nob = msg_nob;
+                __swab64s(&msg->ibm_srcnid);
+                __swab64s(&msg->ibm_srcstamp);
+                __swab64s(&msg->ibm_dstnid);
+                __swab64s(&msg->ibm_dststamp);
+                __swab64s(&msg->ibm_seq);
+        }
+        
+        if (msg->ibm_srcnid == LNET_NID_ANY) {
+                CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
+                return -EPROTO;
+        }
+
+        switch (msg->ibm_type) {
+        default:
+                CERROR("Unknown message type %x\n", msg->ibm_type);
+                return -EPROTO;
+                
+        case IBNAL_MSG_NOOP:
+                break;
+
+        case IBNAL_MSG_IMMEDIATE:
+                if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
+                        CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
+                               (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
+                        return -EPROTO;
+                }
+                break;
+
+        case IBNAL_MSG_PUT_REQ:
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) {
+                        CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
+                        return -EPROTO;
+                }
+                break;
+
+        case IBNAL_MSG_PUT_ACK:
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) {
+                        CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.putack)));
+                        return -EPROTO;
+                }
+#if IBNAL_USE_FMR
+                if (flip) {
+                        __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr);
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob);
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
+                }
+#else
+                if (flip) {
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
+                }
+                
+                n = msg->ibm_u.putack.ibpam_rd.rd_nfrag;
+                if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
+                        CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", 
+                               n, IBNAL_MAX_RDMA_FRAGS);
+                        return -EPROTO;
+                }
+                
+                if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
+                        CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
+                               (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
+                        return -EPROTO;
+                }
+
+                if (flip) {
+                        for (i = 0; i < n; i++) {
+                                __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
+                                __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr);
+                        }
+                }
 #endif
+                break;
 
-static void
-kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
-                              FSTATUS frc, uint32 madrc)
+        case IBNAL_MSG_GET_REQ:
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) {
+                        CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.get)));
+                        return -EPROTO;
+                }
+#if IBNAL_USE_FMR
+                if (flip) {
+                        __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr);
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob);
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
+                }
+#else                
+                if (flip) {
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
+                }
+
+                n = msg->ibm_u.get.ibgm_rd.rd_nfrag;
+                if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
+                        CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", 
+                               n, IBNAL_MAX_RDMA_FRAGS);
+                        return -EPROTO;
+                }
+                
+                if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
+                        CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
+                               (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
+                        return -EPROTO;
+                }
+                
+                if (flip)
+                        for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) {
+                                __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
+                                __swab64s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr);
+                        }
+#endif
+                break;
+
+        case IBNAL_MSG_PUT_NAK:
+        case IBNAL_MSG_PUT_DONE:
+        case IBNAL_MSG_GET_DONE:
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
+                        CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.completion)));
+                        return -EPROTO;
+                }
+                if (flip)
+                        __swab32s(&msg->ibm_u.completion.ibcm_status);
+                break;
+
+        case IBNAL_MSG_CONNREQ:
+        case IBNAL_MSG_CONNACK:
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
+                        CERROR("Short connreq/ack: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
+                        return -EPROTO;
+                }
+                if (flip) {
+                        __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
+                        __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
+                        __swab32s(&msg->ibm_u.connparams.ibcp_max_frags);
+                }
+                break;
+        }
+        return 0;
+}
+
+IB_HANDLE
+kibnal_create_cep(lnet_nid_t nid)
 {
-        *(FSTATUS *)arg = frc;
-        up (&kibnal_data.kib_nid_signal);
+        FSTATUS        frc;
+        __u32          u32val;
+        IB_HANDLE      cep;
+
+        cep = iba_cm_create_cep(CM_RC_TYPE);
+        if (cep == NULL) {
+                CERROR ("Can't create CEP for %s\n",
+                        (nid == LNET_NID_ANY) ? "listener" :
+                        libcfs_nid2str(nid));
+                return NULL;
+        }
+
+        if (nid == LNET_NID_ANY) {
+                u32val = 1;
+                frc = iba_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT,
+                                        (char *)&u32val, sizeof(u32val), 0);
+                if (frc != FSUCCESS) {
+                        CERROR("Can't set async_accept: %d\n", frc);
+                        goto failed;
+                }
+
+                u32val = 0;                     /* sets system max */
+                frc = iba_cm_modify_cep(cep, CM_FLAG_LISTEN_BACKLOG,
+                                        (char *)&u32val, sizeof(u32val), 0);
+                if (frc != FSUCCESS) {
+                        CERROR("Can't set listen backlog: %d\n", frc);
+                        goto failed;
+                }
+        }
+        
+        u32val = 1;
+        frc = iba_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
+                                (char *)&u32val, sizeof(u32val), 0);
+        if (frc != FSUCCESS) {
+                CERROR("Can't set timewait_callback for %s: %d\n", 
+                        (nid == LNET_NID_ANY) ? "listener" :
+                        libcfs_nid2str(nid), frc);
+                goto failed;
+        }
+
+        return cep;
+        
+ failed:
+        iba_cm_destroy_cep(cep);
+        return NULL;
 }
 
+#define IBNAL_CHECK_ADVERT 1
 #if IBNAL_CHECK_ADVERT
-static void
+void
 kibnal_service_query_done (void *arg, QUERY *qry, 
                            QUERY_RESULT_VALUES *qry_result)
 {
-        FSTATUS frc = qry_result->Status;
+        int                    *rcp = arg;
+        FSTATUS                 frc = qry_result->Status;
+        SERVICE_RECORD_RESULTS *svc_rslt;
+        IB_SERVICE_RECORD      *svc;
+        lnet_nid_t              nid;
+
+        if (frc != FSUCCESS || qry_result->ResultDataSize == 0) {
+                CERROR("Error checking advert: status %d data size %d\n",
+                       frc, qry_result->ResultDataSize);
+                *rcp = -EIO;
+                goto out;
+        }
+
+        svc_rslt = (SERVICE_RECORD_RESULTS *)qry_result->QueryResult;
+
+        if (svc_rslt->NumServiceRecords < 1) {
+                CERROR("Check advert: %d records\n",
+                       svc_rslt->NumServiceRecords);
+                *rcp = -ENOENT;
+                goto out;
+        }
 
-        if (frc != FSUCCESS &&
-            qry_result->ResultDataSize == 0)
-                frc = FERROR;
+        svc = &svc_rslt->ServiceRecords[0];
+        nid = le64_to_cpu(*kibnal_service_nid_field(svc));
         
-        *(FSTATUS *)arg = frc;
-        up (&kibnal_data.kib_nid_signal);
+        CDEBUG(D_NET, "Check advert: %s "LPX64" "LPX64":%04x\n",
+               libcfs_nid2str(nid), svc->RID.ServiceID, 
+               svc->RID.ServiceGID.Type.Global.InterfaceID, 
+               svc->RID.ServiceP_Key);
+
+        if (nid != kibnal_data.kib_ni->ni_nid) {
+                CERROR("Check advert: Bad NID %s (%s expected)\n",
+                       libcfs_nid2str(nid),
+                       libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
+                *rcp = -EINVAL;
+                goto out;
+        }
+
+        if (svc->RID.ServiceID != *kibnal_tunables.kib_service_number) {
+                CERROR("Check advert: Bad ServiceID "LPX64" (%x expected)\n",
+                       svc->RID.ServiceID,
+                       *kibnal_tunables.kib_service_number);
+                *rcp = -EINVAL;
+                goto out;
+        }
+
+        if (svc->RID.ServiceGID.Type.Global.InterfaceID != 
+            kibnal_data.kib_port_guid) {
+                CERROR("Check advert: Bad GUID "LPX64" ("LPX64" expected)\n",
+                       svc->RID.ServiceGID.Type.Global.InterfaceID,
+                       kibnal_data.kib_port_guid);
+                *rcp = -EINVAL;
+                goto out;
+        }
+
+        if (svc->RID.ServiceP_Key != kibnal_data.kib_port_pkey) {
+                CERROR("Check advert: Bad PKEY %04x (%04x expected)\n",
+                       svc->RID.ServiceP_Key, kibnal_data.kib_port_pkey);
+                *rcp = -EINVAL;
+                goto out;
+        }
+
+        CDEBUG(D_NET, "Check advert OK\n");
+        *rcp = 0;
+                
+ out:
+        up (&kibnal_data.kib_listener_signal);                
 }
 
-static void
+int
 kibnal_check_advert (void)
 {
-        QUERY                  *qry;
-        IB_SERVICE_RECORD      *svc;
-        FSTATUS                 frc;
-        FSTATUS                 frc2;
+        /* single-threaded */
+        static QUERY               qry;
 
-        PORTAL_ALLOC(qry, sizeof(*qry));
-        if (qry == NULL)
-                return;
+        FSTATUS                    frc;
+        int                        rc;
 
-        memset (qry, 0, sizeof(*qry));
-        qry->InputType = InputTypeServiceRecord;
-        qry->OutputType = OutputTypeServiceRecord;
-        qry->InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
-        svc = &qry->InputValue.ServiceRecordValue.ServiceRecord;
-        kibnal_set_service_keys(svc, kibnal_data.kib_nid);
-
-        frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
-                                                    kibnal_data.kib_port_guid,
-                                                    qry,
-                                                    kibnal_service_query_done,
-                                                    NULL, &frc2);
-        if (frc != FSUCCESS && frc != FPENDING) {
-                CERROR ("Immediate error %d checking SM service\n", frc);
-        } else {
-                down (&kibnal_data.kib_nid_signal);
-                frc = frc2;
+        memset (&qry, 0, sizeof(qry));
+        qry.InputType = InputTypeServiceRecord;
+        qry.OutputType = OutputTypeServiceRecord;
+        kibnal_set_service_keys(&qry.InputValue.ServiceRecordValue.ServiceRecord,
+                                kibnal_data.kib_ni->ni_nid);
+        qry.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
 
-                if (frc != 0)
-                        CERROR ("Error %d checking SM service\n", rc);
+        frc = iba_sd_query_port_fabric_info(kibnal_data.kib_sd, 
+                                            kibnal_data.kib_port_guid,
+                                            &qry, 
+                                            kibnal_service_query_done,
+                                            &kibnal_data.kib_sdretry, 
+                                            &rc);
+        if (frc != FPENDING) {
+                CERROR ("Immediate error %d checking SM service\n", frc);
+                return -EIO;
         }
-
-        return (rc);
+        
+        down (&kibnal_data.kib_listener_signal);
+        
+        if (rc != 0)
+                CERROR ("Error %d checking SM service\n", rc);
+        return rc;
+}
+#else
+int
+kibnal_check_advert(void)
+{
+        return 0;
 }
 #endif
 
-static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
+void 
+kibnal_fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
 {
         IB_SERVICE_RECORD     *svc;
 
@@ -143,211 +493,208 @@ static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
         fod->Type = type;
 
         svc = &fod->Value.ServiceRecordValue.ServiceRecord;
-        svc->RID.ServiceID = kibnal_data.kib_service_id;
+        svc->RID.ServiceID = *kibnal_tunables.kib_service_number;
         svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
         svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
         svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
         svc->ServiceLease = 0xffffffff;
 
-        kibnal_set_service_keys(svc, kibnal_data.kib_nid);
+        kibnal_set_service_keys(svc, kibnal_data.kib_ni->ni_nid);
 }
 
-static int
-kibnal_advertise (void)
+void
+kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
+                              FSTATUS frc, uint32 madrc)
 {
-        FABRIC_OPERATION_DATA *fod;
-        IB_SERVICE_RECORD     *svc;
-        FSTATUS                frc;
-        FSTATUS                frc2;
-
-        LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+        *(FSTATUS *)arg = frc;
+        up (&kibnal_data.kib_listener_signal);
+}
 
-        PORTAL_ALLOC(fod, sizeof(*fod));
-        if (fod == NULL)
-                return (-ENOMEM);
+int
+kibnal_advertise (void)
+{
+        /* Single threaded here */
+        static FABRIC_OPERATION_DATA fod;
+
+        IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
+        FSTATUS            frc;
+        FSTATUS            frc2;
+
+        if (strlen(*kibnal_tunables.kib_service_name) >=
+            sizeof(svc->ServiceName)) {
+                CERROR("Service name '%s' too long (%d chars max)\n",
+                       *kibnal_tunables.kib_service_name,
+                       (int)sizeof(svc->ServiceName) - 1);
+                return -EINVAL;
+        }
 
-        fill_fod(fod, FabOpSetServiceRecord);
-        svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+        kibnal_fill_fod(&fod, FabOpSetServiceRecord);
 
-        CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", 
-               svc->RID.ServiceID, 
-               svc->ServiceName, *kibnal_service_nid_field(svc));
+        CDEBUG(D_NET, "Advertising service id "LPX64" %s:%s\n", 
+               svc->RID.ServiceID, svc->ServiceName, 
+               libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
 
-        frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
-                                            kibnal_data.kib_port_guid,
-                                            fod, kibnal_service_setunset_done, 
-                                            NULL, &frc2);
+        frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd,
+                                           kibnal_data.kib_port_guid,
+                                           &fod, 
+                                           kibnal_service_setunset_done, 
+                                           &kibnal_data.kib_sdretry,
+                                           &frc2);
 
         if (frc != FSUCCESS && frc != FPENDING) {
-                CERROR ("Immediate error %d advertising NID "LPX64"\n",
-                        frc, kibnal_data.kib_nid);
-                goto out;
+                CERROR ("Immediate error %d advertising NID %s\n",
+                        frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
+                return -EIO;
         }
 
-        down (&kibnal_data.kib_nid_signal);
+        down (&kibnal_data.kib_listener_signal);
 
         frc = frc2;
-        if (frc != FSUCCESS)
-                CERROR ("Error %d advertising BUD "LPX64"\n",
-                        frc, kibnal_data.kib_nid);
-out:
-        PORTAL_FREE(fod, sizeof(*fod));
-        return (frc == FSUCCESS) ? 0 : -EINVAL;
+        if (frc == FSUCCESS)
+                return 0;
+        
+        CERROR ("Error %d advertising %s\n",
+                frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
+        return -EIO;
 }
 
-static void
+void
 kibnal_unadvertise (int expect_success)
 {
-        FABRIC_OPERATION_DATA *fod;
-        IB_SERVICE_RECORD     *svc;
-        FSTATUS                frc;
-        FSTATUS                frc2;
+        /* single threaded */
+        static FABRIC_OPERATION_DATA fod;
 
-        LASSERT (kibnal_data.kib_nid != PTL_NID_ANY);
+        IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
+        FSTATUS            frc;
+        FSTATUS            frc2;
 
-        PORTAL_ALLOC(fod, sizeof(*fod));
-        if (fod == NULL)
-                return;
+        LASSERT (kibnal_data.kib_ni->ni_nid != LNET_NID_ANY);
 
-        fill_fod(fod, FabOpDeleteServiceRecord);
-        svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+        kibnal_fill_fod(&fod, FabOpDeleteServiceRecord);
 
-        CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n",
-               svc->ServiceName, *kibnal_service_nid_field(svc));
+        CDEBUG(D_NET, "Unadvertising service %s:%s\n",
+               svc->ServiceName, 
+               libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
         
-        frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
-                                            kibnal_data.kib_port_guid,
-                                            fod, kibnal_service_setunset_done, 
-                                            NULL, &frc2);
-
+        frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd,
+                                           kibnal_data.kib_port_guid,
+                                           &fod, 
+                                           kibnal_service_setunset_done, 
+                                           &kibnal_data.kib_sdretry, 
+                                           &frc2);
         if (frc != FSUCCESS && frc != FPENDING) {
-                CERROR ("Immediate error %d unadvertising NID "LPX64"\n",
-                        frc, kibnal_data.kib_nid);
-                goto out;
+                CERROR ("Immediate error %d unadvertising NID %s\n",
+                        frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
+                return;
         }
 
-        down (&kibnal_data.kib_nid_signal);
+        down (&kibnal_data.kib_listener_signal);
+
+        CDEBUG(D_NET, "Unadvertise rc: %d\n", frc2);
 
         if ((frc2 == FSUCCESS) == !!expect_success)
-                goto out;
+                return;
 
         if (expect_success)
-                CERROR("Error %d unadvertising NID "LPX64"\n",
-                       frc2, kibnal_data.kib_nid);
+                CERROR("Error %d unadvertising NID %s\n",
+                       frc2, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
         else
-                CWARN("Removed conflicting NID "LPX64"\n",
-                      kibnal_data.kib_nid);
- out:
-        PORTAL_FREE(fod, sizeof(*fod));
+                CWARN("Removed conflicting NID %s\n",
+                      libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
 }
 
-static int
-kibnal_set_mynid(ptl_nid_t nid)
+void
+kibnal_stop_listener(int normal_shutdown)
 {
-        struct timeval tv;
-        lib_ni_t      *ni = &kibnal_lib.libnal_ni;
-        int            rc;
+        /* NB this also disables peer creation and destroys all existing
+         * peers */
+        IB_HANDLE      cep = kibnal_data.kib_listener_cep;
+        unsigned long  flags;
         FSTATUS        frc;
 
-        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
-               nid, ni->ni_pid.nid);
+        LASSERT (cep != NULL);
 
-        do_gettimeofday(&tv);
+        kibnal_unadvertise(normal_shutdown);
 
-        down (&kibnal_data.kib_nid_mutex);
+        frc = iba_cm_cancel(cep);
+        if (frc != FSUCCESS && frc != FPENDING)
+                CERROR ("Error %d stopping listener\n", frc);
 
-        if (nid == kibnal_data.kib_nid) {
-                /* no change of NID */
-                up (&kibnal_data.kib_nid_mutex);
-                return (0);
-        }
+        down(&kibnal_data.kib_listener_signal);
 
-        CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
-               kibnal_data.kib_nid, nid);
-        
-        if (kibnal_data.kib_nid != PTL_NID_ANY) {
+        frc = iba_cm_destroy_cep(cep);
+        if (frc != FSUCCESS)
+                CERROR ("Error %d destroying listener CEP\n", frc);
 
-                kibnal_unadvertise (1);
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+        /* This assignment disables peer creation */
+        kibnal_data.kib_listener_cep = NULL;
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 
-                frc = iibt_cm_cancel(kibnal_data.kib_cep);
-                if (frc != FSUCCESS && frc != FPENDING)
-                        CERROR ("Error %d stopping listener\n", frc);
+        /* Start to tear down any peers created while the listener was
+         * running */
+        kibnal_del_peer(LNET_NID_ANY);
+}
 
-                frc = iibt_cm_destroy_cep(kibnal_data.kib_cep);
-                if (frc != FSUCCESS)
-                        CERROR ("Error %d destroying CEP\n", frc);
+int
+kibnal_start_listener(void)
+{
+        /* NB this also enables peer creation */
 
-                kibnal_data.kib_cep = NULL;
-        }
-        
-        kibnal_data.kib_nid = ni->ni_pid.nid = nid;
-        kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
-        
-        /* Delete all existing peers and their connections after new
-         * NID/incarnation set to ensure no old connections in our brave
-         * new world. */
-        kibnal_del_peer (PTL_NID_ANY, 0);
-
-        if (kibnal_data.kib_nid == PTL_NID_ANY) {
-                /* No new NID to install */
-                up (&kibnal_data.kib_nid_mutex);
-                return (0);
-        }
+        IB_HANDLE      cep;
+        CM_LISTEN_INFO info;
+        unsigned long  flags;
+        int            rc;
+        FSTATUS        frc;
 
-        /* remove any previous advert (crashed node etc) */
-        kibnal_unadvertise(0);
+        LASSERT (kibnal_data.kib_listener_cep == NULL);
+        init_MUTEX_LOCKED (&kibnal_data.kib_listener_signal);
 
-        kibnal_data.kib_cep = iibt_cm_create_cep(CM_RC_TYPE);
-        if (kibnal_data.kib_cep == NULL) {
-                CERROR ("Can't create CEP\n");
-                rc = -ENOMEM;
-        } else {
-                CM_LISTEN_INFO info;
-                memset (&info, 0, sizeof(info));
-                info.ListenAddr.EndPt.SID = kibnal_data.kib_service_id;
-
-                frc = iibt_cm_listen(kibnal_data.kib_cep, &info,
-                                     kibnal_listen_callback, NULL);
-                if (frc != FSUCCESS && frc != FPENDING) {
-                        CERROR ("iibt_cm_listen error: %d\n", frc);
-                        rc = -EINVAL;
-                } else {
-                        rc = 0;
-                }
-        }
-        
-        if (rc == 0) {
-                rc = kibnal_advertise();
-                if (rc == 0) {
-#if IBNAL_CHECK_ADVERT
-                        kibnal_check_advert();
-#endif
-                        up (&kibnal_data.kib_nid_mutex);
-                        return (0);
-                }
-                
-                iibt_cm_cancel (kibnal_data.kib_cep);
-                iibt_cm_destroy_cep (kibnal_data.kib_cep);
-                /* remove any peers that sprung up while I failed to
-                 * advertise myself */
-                kibnal_del_peer (PTL_NID_ANY, 0);
+        cep = kibnal_create_cep(LNET_NID_ANY);
+        if (cep == NULL)
+                return -ENOMEM;
+
+        memset (&info, 0, sizeof(info));
+        info.ListenAddr.EndPt.SID = *kibnal_tunables.kib_service_number;
+
+        frc = iba_cm_listen(cep, &info, kibnal_listen_callback, NULL);
+        if (frc != FSUCCESS && frc != FPENDING) {
+                CERROR ("iba_cm_listen error: %d\n", frc);
+
+                iba_cm_destroy_cep(cep);
+                return -EIO;
         }
 
-        kibnal_data.kib_nid = PTL_NID_ANY;
-        up (&kibnal_data.kib_nid_mutex);
-        return (rc);
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+        /* This assignment enables peer creation */
+        kibnal_data.kib_listener_cep = cep;
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+        rc = kibnal_advertise();
+        if (rc == 0)
+                rc = kibnal_check_advert();
+
+        if (rc == 0)
+                return 0;
+
+        kibnal_stop_listener(0);
+        return rc;
 }
 
-kib_peer_t *
-kibnal_create_peer (ptl_nid_t nid)
+int
+kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid)
 {
-        kib_peer_t *peer;
+        kib_peer_t    *peer;
+        unsigned long  flags;
+        int            rc;
 
-        LASSERT (nid != PTL_NID_ANY);
+        LASSERT (nid != LNET_NID_ANY);
 
-        PORTAL_ALLOC (peer, sizeof (*peer));
-        if (peer == NULL)
-                return (NULL);
+        LIBCFS_ALLOC (peer, sizeof (*peer));
+        if (peer == NULL) {
+                CERROR("Cannot allocate peer\n");
+                return -ENOMEM;
+        }
 
         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
 
@@ -358,11 +705,35 @@ kibnal_create_peer (ptl_nid_t nid)
         INIT_LIST_HEAD (&peer->ibp_conns);
         INIT_LIST_HEAD (&peer->ibp_tx_queue);
 
-        peer->ibp_reconnect_time = jiffies;
-        peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+        peer->ibp_error = 0;
+        peer->ibp_last_alive = cfs_time_current();
+        peer->ibp_reconnect_interval = 0;       /* OK to connect at any time */
+
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+        
+        if (atomic_read(&kibnal_data.kib_npeers) >=
+            *kibnal_tunables.kib_concurrent_peers) {
+                rc = -EOVERFLOW;        /* !! but at least it distinguishes */
+        } else if (kibnal_data.kib_listener_cep == NULL) {
+                rc = -ESHUTDOWN;        /* shutdown has started */
+        } else {
+                rc = 0;
+                /* npeers only grows with the global lock held */
+                atomic_inc(&kibnal_data.kib_npeers);
+        }
+        
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 
-        atomic_inc (&kibnal_data.kib_npeers);
-        return (peer);
+        if (rc != 0) {
+                CERROR("Can't create peer: %s\n", 
+                       (rc == -ESHUTDOWN) ? "shutting down" : 
+                       "too many peers");
+                LIBCFS_FREE(peer, sizeof(*peer));
+        } else {
+                *peerp = peer;
+        }
+        
+        return rc;
 }
 
 void
@@ -372,11 +743,11 @@ kibnal_destroy_peer (kib_peer_t *peer)
         LASSERT (atomic_read (&peer->ibp_refcount) == 0);
         LASSERT (peer->ibp_persistence == 0);
         LASSERT (!kibnal_peer_active(peer));
-        LASSERT (peer->ibp_connecting == 0);
+        LASSERT (!kibnal_peer_connecting(peer));
         LASSERT (list_empty (&peer->ibp_conns));
         LASSERT (list_empty (&peer->ibp_tx_queue));
 
-        PORTAL_FREE (peer, sizeof (*peer));
+        LIBCFS_FREE (peer, sizeof (*peer));
 
         /* NB a peer's connections keep a reference on their peer until
          * they are destroyed, so we can be assured that _all_ state to do
@@ -388,7 +759,7 @@ kibnal_destroy_peer (kib_peer_t *peer)
 /* the caller is responsible for accounting for the additional reference
  * that this creates */
 kib_peer_t *
-kibnal_find_peer_locked (ptl_nid_t nid)
+kibnal_find_peer_locked (lnet_nid_t nid)
 {
         struct list_head *peer_list = kibnal_nid2peerlist (nid);
         struct list_head *tmp;
@@ -398,35 +769,20 @@ kibnal_find_peer_locked (ptl_nid_t nid)
 
                 peer = list_entry (tmp, kib_peer_t, ibp_list);
 
-                LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
-                         peer->ibp_connecting != 0 || /* creating conns */
-                         !list_empty (&peer->ibp_conns));  /* active conn */
+                LASSERT (peer->ibp_persistence != 0 ||
+                         kibnal_peer_connecting(peer) ||
+                         !list_empty (&peer->ibp_conns));
 
                 if (peer->ibp_nid != nid)
                         continue;
 
-                CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
-                       peer, nid, atomic_read (&peer->ibp_refcount));
+                CDEBUG(D_NET, "got peer %s (%d)\n",
+                       libcfs_nid2str(nid), atomic_read (&peer->ibp_refcount));
                 return (peer);
         }
         return (NULL);
 }
 
-kib_peer_t *
-kibnal_get_peer (ptl_nid_t nid)
-{
-        kib_peer_t     *peer;
-        unsigned long   flags;
-
-        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
-        peer = kibnal_find_peer_locked (nid);
-        if (peer != NULL)                       /* +1 ref for caller? */
-                kib_peer_addref(peer);
-        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
-
-        return (peer);
-}
-
 void
 kibnal_unlink_peer_locked (kib_peer_t *peer)
 {
@@ -436,11 +792,11 @@ kibnal_unlink_peer_locked (kib_peer_t *peer)
         LASSERT (kibnal_peer_active(peer));
         list_del_init (&peer->ibp_list);
         /* lose peerlist's ref */
-        kib_peer_decref(peer);
+        kibnal_peer_decref(peer);
 }
 
-static int
-kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
+int
+kibnal_get_peer_info (int index, lnet_nid_t *nidp, int *persistencep)
 {
         kib_peer_t        *peer;
         struct list_head  *ptmp;
@@ -455,7 +811,7 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
 
                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
-                                 peer->ibp_connecting != 0 ||
+                                 kibnal_peer_connecting(peer) ||
                                  !list_empty (&peer->ibp_conns));
 
                         if (index-- > 0)
@@ -474,25 +830,26 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep)
         return (-ENOENT);
 }
 
-static int
-kibnal_add_persistent_peer (ptl_nid_t nid)
+int
+kibnal_add_persistent_peer (lnet_nid_t nid)
 {
         unsigned long      flags;
         kib_peer_t        *peer;
         kib_peer_t        *peer2;
+        int                rc;
         
-        if (nid == PTL_NID_ANY)
+        if (nid == LNET_NID_ANY)
                 return (-EINVAL);
 
-        peer = kibnal_create_peer (nid);
-        if (peer == NULL)
-                return (-ENOMEM);
+        rc = kibnal_create_peer(&peer, nid);
+        if (rc != 0)
+                return rc;
 
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
         peer2 = kibnal_find_peer_locked (nid);
         if (peer2 != NULL) {
-                kib_peer_decref (peer);
+                kibnal_peer_decref (peer);
                 peer = peer2;
         } else {
                 /* peer table takes existing ref on peer */
@@ -506,20 +863,14 @@ kibnal_add_persistent_peer (ptl_nid_t nid)
         return (0);
 }
 
-static void
-kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
+void
+kibnal_del_peer_locked (kib_peer_t *peer)
 {
         struct list_head *ctmp;
         struct list_head *cnxt;
         kib_conn_t       *conn;
 
-        if (!single_share)
-                peer->ibp_persistence = 0;
-        else if (peer->ibp_persistence > 0)
-                peer->ibp_persistence--;
-
-        if (peer->ibp_persistence != 0)
-                return;
+        peer->ibp_persistence = 0;
 
         if (list_empty(&peer->ibp_conns)) {
                 kibnal_unlink_peer_locked(peer);
@@ -537,9 +888,10 @@ kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
 }
 
 int
-kibnal_del_peer (ptl_nid_t nid, int single_share)
+kibnal_del_peer (lnet_nid_t nid)
 {
         unsigned long      flags;
+        CFS_LIST_HEAD     (zombies);
         struct list_head  *ptmp;
         struct list_head  *pnxt;
         kib_peer_t        *peer;
@@ -550,7 +902,7 @@ kibnal_del_peer (ptl_nid_t nid, int single_share)
 
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
-        if (nid != PTL_NID_ANY)
+        if (nid != LNET_NID_ANY)
                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
         else {
                 lo = 0;
@@ -561,26 +913,31 @@ kibnal_del_peer (ptl_nid_t nid, int single_share)
                 list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) {
                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
-                                 peer->ibp_connecting != 0 ||
+                                 kibnal_peer_connecting(peer) ||
                                  !list_empty (&peer->ibp_conns));
 
-                        if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
+                        if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
                                 continue;
 
-                        kibnal_del_peer_locked (peer, single_share);
-                        rc = 0;         /* matched something */
+                        if (!list_empty(&peer->ibp_tx_queue)) {
+                                LASSERT (list_empty(&peer->ibp_conns));
 
-                        if (single_share)
-                                goto out;
+                                list_splice_init(&peer->ibp_tx_queue, &zombies);
+                        }
+
+                        kibnal_del_peer_locked (peer);
+                        rc = 0;         /* matched something */
                 }
         }
- out:
+
         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
+        kibnal_txlist_done(&zombies, -EIO);
+
         return (rc);
 }
 
-static kib_conn_t *
+kib_conn_t *
 kibnal_get_conn_by_idx (int index)
 {
         kib_peer_t        *peer;
@@ -596,37 +953,111 @@ kibnal_get_conn_by_idx (int index)
                 list_for_each (ptmp, &kibnal_data.kib_peers[i]) {
 
                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
-                        LASSERT (peer->ibp_persistence > 0 ||
-                                 peer->ibp_connecting != 0 ||
+                        LASSERT (peer->ibp_persistence != 0 ||
+                                 kibnal_peer_connecting(peer) ||
                                  !list_empty (&peer->ibp_conns));
 
                         list_for_each (ctmp, &peer->ibp_conns) {
                                 if (index-- > 0)
                                         continue;
 
-                                conn = list_entry (ctmp, kib_conn_t, ibc_list);
-                                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                                       atomic_read (&conn->ibc_refcount));
-                                atomic_inc (&conn->ibc_refcount);
-                                read_unlock_irqrestore(&kibnal_data.kib_global_lock,
-                                                       flags);
-                                return (conn);
-                        }
-                }
+                                conn = list_entry (ctmp, kib_conn_t, ibc_list);
+                                kibnal_conn_addref(conn);
+                                read_unlock_irqrestore(&kibnal_data.kib_global_lock,
+                                                       flags);
+                                return (conn);
+                        }
+                }
+        }
+
+        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+        return (NULL);
+}
+
+int
+kibnal_conn_rts(kib_conn_t *conn, 
+                __u32 qpn, __u8 resp_res, __u8 init_depth, __u32 psn)
+{
+        IB_PATH_RECORD         *path = &conn->ibc_cvars->cv_path;
+        IB_HANDLE               qp = conn->ibc_qp;
+        IB_QP_ATTRIBUTES_MODIFY modify_attr;
+        FSTATUS                 frc;
+        int                     rc;
+
+        if (resp_res > kibnal_data.kib_hca_attrs.MaxQPResponderResources)
+                resp_res = kibnal_data.kib_hca_attrs.MaxQPResponderResources;
+
+        if (init_depth > kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth)
+                init_depth = kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth;
+
+        modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+                .RequestState       = QPStateReadyToRecv,
+                .RecvPSN            = IBNAL_STARTING_PSN,
+                .DestQPNumber       = qpn,
+                .ResponderResources = resp_res,
+                .MinRnrTimer        = UsecToRnrNakTimer(2000), /* 20 ms */
+                .Attrs              = (IB_QP_ATTR_RECVPSN |
+                                       IB_QP_ATTR_DESTQPNUMBER | 
+                                       IB_QP_ATTR_RESPONDERRESOURCES | 
+                                       IB_QP_ATTR_DESTAV | 
+                                       IB_QP_ATTR_PATHMTU | 
+                                       IB_QP_ATTR_MINRNRTIMER),
+        };
+        GetAVFromPath(0, path, &modify_attr.PathMTU, NULL, 
+                      &modify_attr.DestAV);
+
+        frc = iba_modify_qp(qp, &modify_attr, NULL);
+        if (frc != FSUCCESS) {
+                CERROR("Can't set QP %s ready to receive: %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+                return -EIO;
+        }
+
+        rc = kibnal_post_receives(conn);
+        if (rc != 0) {
+                CERROR("Can't post receives for %s: %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+                return rc;
+        }
+
+        modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+                .RequestState           = QPStateReadyToSend,
+                .FlowControl            = TRUE,
+                .InitiatorDepth         = init_depth,
+                .SendPSN                = psn,
+                .LocalAckTimeout        = path->PktLifeTime + 2, /* 2 or 1? */
+                .RetryCount             = IBNAL_RETRY,
+                .RnrRetryCount          = IBNAL_RNR_RETRY,
+                .Attrs                  = (IB_QP_ATTR_FLOWCONTROL | 
+                                           IB_QP_ATTR_INITIATORDEPTH | 
+                                           IB_QP_ATTR_SENDPSN | 
+                                           IB_QP_ATTR_LOCALACKTIMEOUT | 
+                                           IB_QP_ATTR_RETRYCOUNT | 
+                                           IB_QP_ATTR_RNRRETRYCOUNT),
+        };
+
+        frc = iba_modify_qp(qp, &modify_attr, NULL);
+        if (frc != FSUCCESS) {
+                CERROR("Can't set QP %s ready to send: %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+                return -EIO;
         }
 
-        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
-        return (NULL);
+        frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't query QP %s attributes: %d\n",
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+                return -EIO;
+        }
+        
+        return 0;
 }
 
 kib_conn_t *
-kibnal_create_conn (void)
+kibnal_create_conn (lnet_nid_t nid, int proto_version)
 {
         kib_conn_t  *conn;
         int          i;
-        __u64        vaddr = 0;
-        __u64        vaddr_base;
         int          page_offset;
         int          ipage;
         int          rc;
@@ -636,50 +1067,61 @@ kibnal_create_conn (void)
                 IB_QP_ATTRIBUTES_MODIFY    qp_attr;
         } params;
         
-        PORTAL_ALLOC (conn, sizeof (*conn));
+        LIBCFS_ALLOC (conn, sizeof (*conn));
         if (conn == NULL) {
-                CERROR ("Can't allocate connection\n");
+                CERROR ("Can't allocate connection for %s\n",
+                        libcfs_nid2str(nid));
                 return (NULL);
         }
 
         /* zero flags, NULL pointers etc... */
         memset (conn, 0, sizeof (*conn));
+        conn->ibc_state = IBNAL_CONN_INIT_NOTHING;
+        conn->ibc_version = proto_version;
 
+        INIT_LIST_HEAD (&conn->ibc_early_rxs);
+        INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred);
         INIT_LIST_HEAD (&conn->ibc_tx_queue);
+        INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd);
         INIT_LIST_HEAD (&conn->ibc_active_txs);
         spin_lock_init (&conn->ibc_lock);
         
         atomic_inc (&kibnal_data.kib_nconns);
         /* well not really, but I call destroy() on failure, which decrements */
 
-        PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
-        if (conn->ibc_rxs == NULL)
+        LIBCFS_ALLOC(conn->ibc_cvars, sizeof (*conn->ibc_cvars));
+        if (conn->ibc_cvars == NULL) {
+                CERROR ("Can't allocate connvars for %s\n", 
+                        libcfs_nid2str(nid));
                 goto failed;
-        memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
+        }
+        memset(conn->ibc_cvars, 0, sizeof (*conn->ibc_cvars));
 
-        rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
-        if (rc != 0)
+        LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
+        if (conn->ibc_rxs == NULL) {
+                CERROR("Cannot allocate RX descriptors for %s\n",
+                       libcfs_nid2str(nid));
                 goto failed;
+        }
+        memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
 
-        vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
-
+        rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES);
+        if (rc != 0) {
+                CERROR("Can't allocate RX buffers for %s\n",
+                       libcfs_nid2str(nid));
+                goto failed;
+        }
+        
         for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
                 struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
-                kib_rx_t   *rx = &conn->ibc_rxs[i];
+                kib_rx_t    *rx = &conn->ibc_rxs[i];
 
                 rx->rx_conn = conn;
                 rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
                              page_offset);
 
-                if (kibnal_whole_mem()) 
-                        rx->rx_vaddr = kibnal_page2phys(page) + 
-                                       page_offset + 
-                                       kibnal_data.kib_md.md_addr;
-                else
-                        rx->rx_vaddr = vaddr;
-                
-                vaddr += IBNAL_MSG_SIZE;
-                LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
+                rx->rx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
+                                 lnet_page2phys(page) + page_offset;
                 
                 page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
@@ -693,9 +1135,9 @@ kibnal_create_conn (void)
 
         params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
                 .Type                    = QPTypeReliableConnected,
-                .SendQDepth              = IBNAL_TX_MAX_SG * 
-                                           IBNAL_MSG_QUEUE_SIZE,
-                .RecvQDepth              = IBNAL_MSG_QUEUE_SIZE,
+                .SendQDepth              = (1 + IBNAL_MAX_RDMA_FRAGS) *
+                                           (*kibnal_tunables.kib_concurrent_sends),
+                .RecvQDepth              = IBNAL_RX_MSGS,
                 .SendDSListDepth         = 1,
                 .RecvDSListDepth         = 1,
                 .SendCQHandle            = kibnal_data.kib_cq,
@@ -703,15 +1145,15 @@ kibnal_create_conn (void)
                 .PDHandle                = kibnal_data.kib_pd,
                 .SendSignaledCompletions = TRUE,
         };
-        frc = iibt_qp_create(kibnal_data.kib_hca, &params.qp_create, NULL,
-                             &conn->ibc_qp, &conn->ibc_qp_attrs);
-        if (rc != 0) {
-                CERROR ("Failed to create queue pair: %d\n", rc);
+        frc = iba_create_qp(kibnal_data.kib_hca, &params.qp_create, NULL,
+                            &conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs);
+        if (frc != 0) {
+                CERROR ("Can't create QP %s: %d\n", libcfs_nid2str(nid), frc);
                 goto failed;
         }
 
         /* Mark QP created */
-        conn->ibc_state = IBNAL_CONN_INIT_QP;
+        kibnal_set_conn_state(conn, IBNAL_CONN_INIT_QP);
 
         params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
                 .RequestState             = QPStateInit,
@@ -720,21 +1162,30 @@ kibnal_create_conn (void)
                                              IB_QP_ATTR_ACCESSCONTROL),
                 .PortGUID                 = kibnal_data.kib_port_guid,
                 .PkeyIndex                = 0,
-                .AccessControl = {
+                .AccessControl = { 
                         .s = {
                                 .RdmaWrite = 1,
                                 .RdmaRead  = 1,
                         },
                 },
         };
-        rc = iibt_qp_modify(conn->ibc_qp, &params.qp_attr, NULL);
-        if (rc != 0) {
-                CERROR ("Failed to modify queue pair: %d\n", rc);
+        frc = iba_modify_qp(conn->ibc_qp, &params.qp_attr, NULL);
+        if (frc != 0) {
+                CERROR ("Can't set QP %s state to INIT: %d\n",
+                        libcfs_nid2str(nid), frc);
+                goto failed;
+        }
+
+        frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't query QP %s attributes: %d\n",
+                        libcfs_nid2str(nid), frc);
                 goto failed;
         }
 
         /* 1 ref for caller */
         atomic_set (&conn->ibc_refcount, 1);
+        CDEBUG(D_NET, "New conn %p\n", conn);
         return (conn);
         
  failed:
@@ -745,92 +1196,70 @@ kibnal_create_conn (void)
 void
 kibnal_destroy_conn (kib_conn_t *conn)
 {
-        int    rc;
         FSTATUS frc;
+
+        LASSERT (!in_interrupt());
         
-        CDEBUG (D_NET, "connection %p\n", conn);
+        CDEBUG (D_NET, "connection %s\n", 
+                (conn->ibc_peer) == NULL ? "<ANON>" :
+                libcfs_nid2str(conn->ibc_peer->ibp_nid));
 
         LASSERT (atomic_read (&conn->ibc_refcount) == 0);
+        LASSERT (list_empty(&conn->ibc_early_rxs));
         LASSERT (list_empty(&conn->ibc_tx_queue));
+        LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
+        LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
         LASSERT (list_empty(&conn->ibc_active_txs));
         LASSERT (conn->ibc_nsends_posted == 0);
-        LASSERT (conn->ibc_connreq == NULL);
 
         switch (conn->ibc_state) {
-        case IBNAL_CONN_DISCONNECTED:
-                /* called after connection sequence initiated */
-                /* fall through */
-
-        case IBNAL_CONN_INIT_QP:
-                /* _destroy includes an implicit Reset of the QP which 
-                 * discards posted work */
-                rc = iibt_qp_destroy(conn->ibc_qp);
-                if (rc != 0)
-                        CERROR("Can't destroy QP: %d\n", rc);
-                /* fall through */
-                
         case IBNAL_CONN_INIT_NOTHING:
+        case IBNAL_CONN_INIT_QP:
+        case IBNAL_CONN_DISCONNECTED:
                 break;
 
         default:
-                LASSERT (0);
+                /* conn must either have never engaged with the CM, or have
+                 * completely disengaged from it */
+                CERROR("Bad conn %s state %d\n",
+                       (conn->ibc_peer) == NULL ? "<anon>" :
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), conn->ibc_state);
+                LBUG();
         }
 
         if (conn->ibc_cep != NULL) {
-                frc = iibt_cm_destroy_cep(conn->ibc_cep);
-                if (frc != 0)
-                        CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, 
-                               frc);
+                frc = iba_cm_destroy_cep(conn->ibc_cep);
+                if (frc != FSUCCESS)
+                        CERROR("Error destroying CEP %p: %d\n",
+                               conn->ibc_cep, frc);
+        }
+
+        if (conn->ibc_qp != NULL) {
+                frc = iba_destroy_qp(conn->ibc_qp);
+                if (frc != FSUCCESS)
+                        CERROR("Error destroying QP %p: %d\n",
+                               conn->ibc_qp, frc);
         }
 
         if (conn->ibc_rx_pages != NULL) 
                 kibnal_free_pages(conn->ibc_rx_pages);
         
         if (conn->ibc_rxs != NULL)
-                PORTAL_FREE(conn->ibc_rxs, 
+                LIBCFS_FREE(conn->ibc_rxs, 
                             IBNAL_RX_MSGS * sizeof(kib_rx_t));
 
+        if (conn->ibc_cvars != NULL)
+                LIBCFS_FREE(conn->ibc_cvars, sizeof(*conn->ibc_cvars));
+
         if (conn->ibc_peer != NULL)
-                kib_peer_decref(conn->ibc_peer);
+                kibnal_peer_decref(conn->ibc_peer);
 
-        PORTAL_FREE(conn, sizeof (*conn));
+        LIBCFS_FREE(conn, sizeof (*conn));
 
         atomic_dec(&kibnal_data.kib_nconns);
-        
-        if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
-            kibnal_data.kib_shutdown) {
-                /* I just nuked the last connection on shutdown; wake up
-                 * everyone so they can exit. */
-                wake_up_all(&kibnal_data.kib_sched_waitq);
-                wake_up_all(&kibnal_data.kib_connd_waitq);
-        }
-}
-
-void
-kibnal_put_conn (kib_conn_t *conn)
-{
-        unsigned long flags;
-
-        CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
-                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                atomic_read (&conn->ibc_refcount));
-
-        LASSERT (atomic_read (&conn->ibc_refcount) > 0);
-        if (!atomic_dec_and_test (&conn->ibc_refcount))
-                return;
-
-        /* must disconnect before dropping the final ref */
-        LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED);
-
-        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
-
-        list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
-        wake_up (&kibnal_data.kib_connd_waitq);
-
-        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 }
 
-static int
+int
 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
 {
         kib_conn_t         *conn;
@@ -862,8 +1291,9 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
                 if (conn->ibc_incarnation == incarnation)
                         continue;
 
-                CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
-                       peer->ibp_nid, conn->ibc_incarnation, incarnation);
+                CDEBUG(D_NET, "Closing stale conn nid:%s incarnation:"LPX64"("LPX64")\n",
+                       libcfs_nid2str(peer->ibp_nid), 
+                       conn->ibc_incarnation, incarnation);
                 
                 count++;
                 kibnal_close_conn_locked (conn, -ESTALE);
@@ -872,8 +1302,8 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
         return (count);
 }
 
-static int
-kibnal_close_matching_conns (ptl_nid_t nid)
+int
+kibnal_close_matching_conns (lnet_nid_t nid)
 {
         unsigned long       flags;
         kib_peer_t         *peer;
@@ -886,7 +1316,7 @@ kibnal_close_matching_conns (ptl_nid_t nid)
 
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
-        if (nid != PTL_NID_ANY)
+        if (nid != LNET_NID_ANY)
                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
         else {
                 lo = 0;
@@ -898,10 +1328,10 @@ kibnal_close_matching_conns (ptl_nid_t nid)
 
                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
-                                 peer->ibp_connecting != 0 ||
+                                 kibnal_peer_connecting(peer) ||
                                  !list_empty (&peer->ibp_conns));
 
-                        if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
+                        if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
                                 continue;
 
                         count += kibnal_close_peer_conns_locked (peer, 0);
@@ -911,69 +1341,65 @@ kibnal_close_matching_conns (ptl_nid_t nid)
         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
         /* wildcards always succeed */
-        if (nid == PTL_NID_ANY)
+        if (nid == LNET_NID_ANY)
                 return (0);
         
         return (count == 0 ? -ENOENT : 0);
 }
 
-static int
-kibnal_cmd(struct portals_cfg *pcfg, void * private)
+int
+kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
 {
-        int rc = -EINVAL;
+        struct libcfs_ioctl_data *data = arg;
+        int                       rc = -EINVAL;
         ENTRY;
 
-        LASSERT (pcfg != NULL);
+        LASSERT (ni == kibnal_data.kib_ni);
 
-        switch(pcfg->pcfg_command) {
-        case NAL_CMD_GET_PEER: {
-                ptl_nid_t   nid = 0;
-                int         share_count = 0;
+        switch(cmd) {
+        case IOC_LIBCFS_GET_PEER: {
+                lnet_nid_t   nid = 0;
+                int          share_count = 0;
 
-                rc = kibnal_get_peer_info(pcfg->pcfg_count,
+                rc = kibnal_get_peer_info(data->ioc_count,
                                           &nid, &share_count);
-                pcfg->pcfg_nid   = nid;
-                pcfg->pcfg_size  = 0;
-                pcfg->pcfg_id    = 0;
-                pcfg->pcfg_misc  = 0;
-                pcfg->pcfg_count = 0;
-                pcfg->pcfg_wait  = share_count;
+                data->ioc_nid   = nid;
+                data->ioc_count = share_count;
                 break;
         }
-        case NAL_CMD_ADD_PEER: {
-                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid);
+        case IOC_LIBCFS_ADD_PEER: {
+                rc = kibnal_add_persistent_peer (data->ioc_nid);
                 break;
         }
-        case NAL_CMD_DEL_PEER: {
-                rc = kibnal_del_peer (pcfg->pcfg_nid, 
-                                       /* flags == single_share */
-                                       pcfg->pcfg_flags != 0);
+        case IOC_LIBCFS_DEL_PEER: {
+                rc = kibnal_del_peer (data->ioc_nid);
                 break;
         }
-        case NAL_CMD_GET_CONN: {
-                kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
+        case IOC_LIBCFS_GET_CONN: {
+                kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count);
 
                 if (conn == NULL)
                         rc = -ENOENT;
                 else {
                         rc = 0;
-                        pcfg->pcfg_nid   = conn->ibc_peer->ibp_nid;
-                        pcfg->pcfg_id    = 0;
-                        pcfg->pcfg_misc  = 0;
-                        pcfg->pcfg_flags = 0;
-                        kibnal_put_conn (conn);
+                        data->ioc_nid = conn->ibc_peer->ibp_nid;
+                        kibnal_conn_decref(conn);
                 }
                 break;
         }
-        case NAL_CMD_CLOSE_CONNECTION: {
-                rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
+        case IOC_LIBCFS_CLOSE_CONNECTION: {
+                rc = kibnal_close_matching_conns (data->ioc_nid);
                 break;
         }
-        case NAL_CMD_REGISTER_MYNID: {
-                if (pcfg->pcfg_nid == PTL_NID_ANY)
+        case IOC_LIBCFS_REGISTER_MYNID: {
+                if (ni->ni_nid == data->ioc_nid) {
+                        rc = 0;
+                } else {
+                        CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+                               libcfs_nid2str(data->ioc_nid),
+                               libcfs_nid2str(ni->ni_nid));
                         rc = -EINVAL;
-                else
-                        rc = kibnal_set_mynid (pcfg->pcfg_nid);
+                }
                 break;
         }
         }
@@ -985,38 +1411,22 @@ void
 kibnal_free_pages (kib_pages_t *p)
 {
         int     npages = p->ibp_npages;
-        int     rc;
         int     i;
         
-        if (p->ibp_mapped) {
-                rc = iibt_deregister_memory(p->ibp_handle);
-                if (rc != 0)
-                        CERROR ("Deregister error: %d\n", rc);
-        }
-        
         for (i = 0; i < npages; i++)
                 if (p->ibp_pages[i] != NULL)
                         __free_page(p->ibp_pages[i]);
         
-        PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
+        LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
 }
 
 int
-kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
+kibnal_alloc_pages (kib_pages_t **pp, int npages)
 {
-        kib_pages_t                *p;
-        __u64                      *phys_pages;
-        int                         i;
-        FSTATUS                     frc;
-        IB_ACCESS_CONTROL           access;
-
-        memset(&access, 0, sizeof(access));
-        access.s.MWBindable = 1;
-        access.s.LocalWrite = 1;
-        access.s.RdmaRead = 1;
-        access.s.RdmaWrite = 1;
+        kib_pages_t   *p;
+        int            i;
 
-        PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
+        LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
         if (p == NULL) {
                 CERROR ("Can't allocate buffer %d\n", npages);
                 return (-ENOMEM);
@@ -1034,107 +1444,131 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
                 }
         }
 
-        if (kibnal_whole_mem())
-                goto out;
-
-        PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
-        if (phys_pages == NULL) {
-                CERROR ("Can't allocate physarray for %d pages\n", npages);
-                /* XXX free ibp_pages? */
-                kibnal_free_pages(p);
-                return (-ENOMEM);
-        }
+        *pp = p;
+        return (0);
+}
 
-        /* if we were using the _contig_ registration variant we would have
-         * an array of PhysAddr/Length pairs, but the discontiguous variant
-         * just takes the PhysAddr */
-        for (i = 0; i < npages; i++)
-                phys_pages[i] = kibnal_page2phys(p->ibp_pages[i]);
-
-        frc = iibt_register_physical_memory(kibnal_data.kib_hca,
-                                            0,          /* requested vaddr */
-                                            phys_pages, npages,
-                                            0,          /* offset */
-                                            kibnal_data.kib_pd,
-                                            access,
-                                            &p->ibp_handle, &p->ibp_vaddr,
-                                            &p->ibp_lkey, &p->ibp_rkey);
+int
+kibnal_alloc_tx_descs (void) 
+{
+        int    i;
         
-        PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
+        LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
+                      IBNAL_TX_MSGS() * sizeof(kib_tx_t));
+        if (kibnal_data.kib_tx_descs == NULL)
+                return -ENOMEM;
         
-        if (frc != FSUCCESS) {
-                CERROR ("Error %d mapping %d pages\n", frc, npages);
-                kibnal_free_pages(p);
-                return (-ENOMEM);
+        memset(kibnal_data.kib_tx_descs, 0,
+               IBNAL_TX_MSGS() * sizeof(kib_tx_t));
+
+        for (i = 0; i < IBNAL_TX_MSGS(); i++) {
+                kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
+
+#if IBNAL_USE_FMR
+                LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV *
+                             sizeof(*tx->tx_pages));
+                if (tx->tx_pages == NULL)
+                        return -ENOMEM;
+#else
+                LIBCFS_ALLOC(tx->tx_wrq, 
+                             (1 + IBNAL_MAX_RDMA_FRAGS) * 
+                             sizeof(*tx->tx_wrq));
+                if (tx->tx_wrq == NULL)
+                        return -ENOMEM;
+                
+                LIBCFS_ALLOC(tx->tx_gl, 
+                             (1 + IBNAL_MAX_RDMA_FRAGS) * 
+                             sizeof(*tx->tx_gl));
+                if (tx->tx_gl == NULL)
+                        return -ENOMEM;
+                
+                LIBCFS_ALLOC(tx->tx_rd, 
+                             offsetof(kib_rdma_desc_t, 
+                                      rd_frags[IBNAL_MAX_RDMA_FRAGS]));
+                if (tx->tx_rd == NULL)
+                        return -ENOMEM;
+#endif
         }
 
-        CDEBUG(D_NET, "registered %d pages; handle: %p vaddr "LPX64" "
-                      "lkey %x rkey %x\n", npages, p->ibp_handle,
-                      p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
-        
-        p->ibp_mapped = 1;
-out:
-        *pp = p;
-        return (0);
+        return 0;
+}
+
+void
+kibnal_free_tx_descs (void) 
+{
+        int    i;
+
+        if (kibnal_data.kib_tx_descs == NULL)
+                return;
+
+        for (i = 0; i < IBNAL_TX_MSGS(); i++) {
+                kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
+
+#if IBNAL_USE_FMR
+                if (tx->tx_pages != NULL)
+                        LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV *
+                                    sizeof(*tx->tx_pages));
+#else
+                if (tx->tx_wrq != NULL)
+                        LIBCFS_FREE(tx->tx_wrq, 
+                                    (1 + IBNAL_MAX_RDMA_FRAGS) * 
+                                    sizeof(*tx->tx_wrq));
+
+                if (tx->tx_gl != NULL)
+                        LIBCFS_FREE(tx->tx_gl, 
+                                    (1 + IBNAL_MAX_RDMA_FRAGS) * 
+                                    sizeof(*tx->tx_gl));
+
+                if (tx->tx_rd != NULL)
+                        LIBCFS_FREE(tx->tx_rd, 
+                                    offsetof(kib_rdma_desc_t, 
+                                             rd_frags[IBNAL_MAX_RDMA_FRAGS]));
+#endif
+        }
+
+        LIBCFS_FREE(kibnal_data.kib_tx_descs,
+                    IBNAL_TX_MSGS() * sizeof(kib_tx_t));
 }
 
-static int
+int
 kibnal_setup_tx_descs (void)
 {
         int           ipage = 0;
         int           page_offset = 0;
-        __u64         vaddr;
-        __u64         vaddr_base;
         struct page  *page;
         kib_tx_t     *tx;
         int           i;
         int           rc;
 
         /* pre-mapped messages are not bigger than 1 page */
-        LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
+        CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
 
         /* No fancy arithmetic when we do the buffer calculations */
-        LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
+        CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
 
-        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, 
-                                0);
+        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
+                                IBNAL_TX_MSG_PAGES());
         if (rc != 0)
                 return (rc);
 
-        /* ignored for the whole_mem case */
-        vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
-
-        for (i = 0; i < IBNAL_TX_MSGS; i++) {
+        for (i = 0; i < IBNAL_TX_MSGS(); i++) {
                 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
                 tx = &kibnal_data.kib_tx_descs[i];
 
-                memset (tx, 0, sizeof(*tx));    /* zero flags etc */
-                
+#if IBNAL_USE_FMR
+                /* Allocate an FMR for this TX so it can map src/sink buffers
+                 * for large transfers */
+#endif
                 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
                                             page_offset);
 
-                if (kibnal_whole_mem()) 
-                        tx->tx_vaddr = kibnal_page2phys(page) + 
-                                       page_offset + 
-                                       kibnal_data.kib_md.md_addr;
-                else
-                        tx->tx_vaddr = vaddr;
-
-                tx->tx_isnblk = (i >= IBNAL_NTX);
-                tx->tx_mapped = KIB_TX_UNMAPPED;
+                tx->tx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
+                                 lnet_page2phys(page) + page_offset;
 
                 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
-                       i, tx, tx->tx_msg, tx->tx_vaddr);
+                       i, tx, tx->tx_msg, tx->tx_hca_msg);
 
-                if (tx->tx_isnblk)
-                        list_add (&tx->tx_list, 
-                                  &kibnal_data.kib_idle_nblk_txs);
-                else
-                        list_add (&tx->tx_list, 
-                                  &kibnal_data.kib_idle_txs);
-
-                vaddr += IBNAL_MSG_SIZE;
-                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
+                list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
 
                 page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
@@ -1142,29 +1576,89 @@ kibnal_setup_tx_descs (void)
                 if (page_offset == PAGE_SIZE) {
                         page_offset = 0;
                         ipage++;
-                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
+                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
                 }
         }
         
         return (0);
 }
 
-static void
-kibnal_api_shutdown (nal_t *nal)
+int
+kibnal_register_all_memory(void)
 {
-        int   i;
-        int   rc;
+        /* CAVEAT EMPTOR: this assumes all physical memory is in 1 contiguous
+         * chunk starting at 0 */
+        struct sysinfo     si;
+        __u64              total;
+        __u64              total2;
+        __u64              roundup = (128<<20);     /* round up in big chunks */
+        IB_MR_PHYS_BUFFER  phys;
+        IB_ACCESS_CONTROL  access;
+        FSTATUS            frc;
 
-        if (nal->nal_refct != 0) {
-                /* This module got the first ref */
-                PORTAL_MODULE_UNUSE;
-                return;
+        memset(&access, 0, sizeof(access));
+        access.s.MWBindable = 1;
+        access.s.LocalWrite = 1;
+        access.s.RdmaRead = 1;
+        access.s.RdmaWrite = 1;
+
+        /* XXX we don't bother with first-gen cards */
+        if (kibnal_data.kib_hca_attrs.VendorId == 0xd0b7 && 
+            kibnal_data.kib_hca_attrs.DeviceId == 0x3101) {
+                CERROR("Can't register all memory on first generation HCAs\n");
+                return -EINVAL;
         }
 
-        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
-               atomic_read (&portal_kmemory));
+        si_meminfo(&si);
+
+        CDEBUG(D_NET, "si_meminfo: %lu/%u, num_physpages %lu/%lu\n",
+               si.totalram, si.mem_unit, num_physpages, PAGE_SIZE);
+
+        total = ((__u64)si.totalram) * si.mem_unit;
+        total2 = num_physpages * PAGE_SIZE;
+        if (total < total2)
+                total = total2;
+
+        if (total == 0) {
+                CERROR("Can't determine memory size\n");
+                return -ENOMEM;
+        }
+                 
+        roundup = (128<<20);
+        total = (total + (roundup - 1)) & ~(roundup - 1);
+
+        phys.PhysAddr = 0;
+        phys.Length = total;
 
-        LASSERT(nal == &kibnal_api);
+        frc = iba_register_contig_pmr(kibnal_data.kib_hca, 0, &phys, 1, 0,
+                                      kibnal_data.kib_pd, access,
+                                      &kibnal_data.kib_whole_mem.md_handle,
+                                      &kibnal_data.kib_whole_mem.md_addr,
+                                      &kibnal_data.kib_whole_mem.md_lkey,
+                                      &kibnal_data.kib_whole_mem.md_rkey);
+
+        if (frc != FSUCCESS) {
+                CERROR("registering physical memory failed: %d\n", frc);
+                return -EIO;
+        }
+
+        CDEBUG(D_WARNING, "registered phys mem from 0("LPX64") for "LPU64"("LPU64") -> "LPX64"\n",
+               phys.PhysAddr, total, phys.Length, kibnal_data.kib_whole_mem.md_addr);
+
+        return 0;
+}
+
+void
+kibnal_shutdown (lnet_ni_t *ni)
+{
+        int   i;
+        int   rc;
+
+        LASSERT (ni == kibnal_data.kib_ni);
+        LASSERT (ni->ni_data == &kibnal_data);
+       
+        CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
+               atomic_read (&libcfs_kmemory));
 
         switch (kibnal_data.kib_init) {
         default:
@@ -1172,20 +1666,16 @@ kibnal_api_shutdown (nal_t *nal)
                 LBUG();
 
         case IBNAL_INIT_ALL:
-                /* stop calls to nal_cmd */
-                libcfs_nal_cmd_unregister(IIBNAL);
-                /* No new peers */
+                /* stop accepting connections, prevent new peers and start to
+                 * tear down all existing ones... */
+                kibnal_stop_listener(1);
 
-                /* resetting my NID to unadvertises me, removes my
-                 * listener and nukes all current peers */
-                kibnal_set_mynid (PTL_NID_ANY);
-
-                /* Wait for all peer state to clean up (crazy) */
+                /* Wait for all peer state to clean up */
                 i = 2;
                 while (atomic_read (&kibnal_data.kib_npeers) != 0) {
                         i++;
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
-                               "waiting for %d peers to disconnect (can take a few seconds)\n",
+                               "waiting for %d peers to disconnect\n",
                                atomic_read (&kibnal_data.kib_npeers));
                         set_current_state (TASK_UNINTERRUPTIBLE);
                         schedule_timeout (HZ);
@@ -1193,7 +1683,7 @@ kibnal_api_shutdown (nal_t *nal)
                 /* fall through */
 
         case IBNAL_INIT_CQ:
-                rc = iibt_cq_destroy(kibnal_data.kib_cq);
+                rc = iba_destroy_cq(kibnal_data.kib_cq);
                 if (rc != 0)
                         CERROR ("Destroy CQ error: %d\n", rc);
                 /* fall through */
@@ -1202,63 +1692,43 @@ kibnal_api_shutdown (nal_t *nal)
                 kibnal_free_pages (kibnal_data.kib_tx_pages);
                 /* fall through */
 
-        case IBNAL_INIT_MR:
-                if (kibnal_data.kib_md.md_handle != NULL) {
-                        rc = iibt_deregister_memory(kibnal_data.kib_md.md_handle);
-                        if (rc != FSUCCESS)
-                                CERROR ("Deregister memory: %d\n", rc);
-                }
+        case IBNAL_INIT_MD:
+                rc = iba_deregister_mr(kibnal_data.kib_whole_mem.md_handle);
+                if (rc != FSUCCESS)
+                        CERROR ("Deregister memory: %d\n", rc);
                 /* fall through */
 
-#if IBNAL_FMR
-        case IBNAL_INIT_FMR:
-                rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
-                if (rc != 0)
-                        CERROR ("Destroy FMR pool error: %d\n", rc);
-                /* fall through */
-#endif
         case IBNAL_INIT_PD:
-                rc = iibt_pd_free(kibnal_data.kib_pd);
+                rc = iba_free_pd(kibnal_data.kib_pd);
                 if (rc != 0)
                         CERROR ("Destroy PD error: %d\n", rc);
                 /* fall through */
 
         case IBNAL_INIT_SD:
-                rc = iibt_sd_deregister(kibnal_data.kib_sd);
+                rc = iba_sd_deregister(kibnal_data.kib_sd);
                 if (rc != 0)
                         CERROR ("Deregister SD error: %d\n", rc);
                 /* fall through */
 
-        case IBNAL_INIT_PORT:
-                /* XXX ??? */
-                /* fall through */
-
         case IBNAL_INIT_PORTATTRS:
-                PORTAL_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
+                LIBCFS_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
                             kibnal_data.kib_hca_attrs.PortAttributesListSize);
                 /* fall through */
 
         case IBNAL_INIT_HCA:
-                rc = iibt_close_hca(kibnal_data.kib_hca);
+                rc = iba_close_ca(kibnal_data.kib_hca);
                 if (rc != 0)
                         CERROR ("Close HCA  error: %d\n", rc);
                 /* fall through */
 
-        case IBNAL_INIT_LIB:
-                lib_fini(&kibnal_lib);
-                /* fall through */
-
         case IBNAL_INIT_DATA:
-                /* Module refcount only gets to zero when all peers
-                 * have been closed so all lists must be empty */
                 LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
                 LASSERT (kibnal_data.kib_peers != NULL);
                 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
                         LASSERT (list_empty (&kibnal_data.kib_peers[i]));
                 }
                 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
-                LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
-                LASSERT (list_empty (&kibnal_data.kib_sched_txq));
+                LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
                 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
                 LASSERT (list_empty (&kibnal_data.kib_connd_peers));
 
@@ -1282,83 +1752,143 @@ kibnal_api_shutdown (nal_t *nal)
                 break;
         }
 
-        if (kibnal_data.kib_tx_descs != NULL)
-                PORTAL_FREE (kibnal_data.kib_tx_descs,
-                             IBNAL_TX_MSGS * sizeof(kib_tx_t));
+        kibnal_free_tx_descs();
 
         if (kibnal_data.kib_peers != NULL)
-                PORTAL_FREE (kibnal_data.kib_peers,
+                LIBCFS_FREE (kibnal_data.kib_peers,
                              sizeof (struct list_head) * 
                              kibnal_data.kib_peer_hash_size);
 
         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
-               atomic_read (&portal_kmemory));
-        printk(KERN_INFO "Lustre: Infinicon IB NAL unloaded (final mem %d)\n",
-               atomic_read(&portal_kmemory));
+               atomic_read (&libcfs_kmemory));
 
         kibnal_data.kib_init = IBNAL_INIT_NOTHING;
+        PORTAL_MODULE_UNUSE;
 }
 
-#define roundup_power(val, power) \
-        ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) )
-
-/* this isn't very portable or sturdy in the face of funny mem/bus configs */
-static __u64 max_phys_mem(IB_CA_ATTRIBUTES *ca_attr)
+int 
+kibnal_get_ipif_name(char *ifname, int ifname_size, int idx)
 {
-        struct sysinfo si;
-        __u64 ret;
+        char  *basename = *kibnal_tunables.kib_ipif_basename;
+        int    n = strlen(basename);
+        int    baseidx;
+        int    m;
 
-        /* XXX we don't bother with first-gen cards */
-        if (ca_attr->VendorId == 0xd0b7 && ca_attr->DeviceId == 0x3101)
-                return 0ULL;
+        if (n == 0) {                           /* empty string */
+                CERROR("Empty IP interface basename specified\n");
+                return -EINVAL;
+        }
 
-        si_meminfo(&si);
-        ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit;
-        return roundup_power(ret, 128 * 1024 * 1024);
-} 
-#undef roundup_power
-
-static int
-kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
-                     ptl_ni_limits_t *requested_limits,
-                     ptl_ni_limits_t *actual_limits)
+        for (m = n; m > 0; m--)                 /* find max numeric postfix */
+                if (sscanf(basename + m - 1, "%d", &baseidx) != 1)
+                        break;
+
+        if (m == 0)                             /* just a number */
+                m = n;
+
+        if (m == n)                             /* no postfix */
+                baseidx = 1;                    /* default to 1 */
+
+        if (m >= ifname_size)
+                m = ifname_size - 1;
+
+        memcpy(ifname, basename, m);            /* copy prefix name */
+        
+        snprintf(ifname + m, ifname_size - m, "%d", baseidx + idx);
+        
+        if (strlen(ifname) == ifname_size - 1) {
+                CERROR("IP interface basename %s too long\n", basename);
+                return -EINVAL;
+        }
+        
+        return 0;
+}
+
+int
+kibnal_startup (lnet_ni_t *ni)
 {
-        ptl_process_id_t    process_id;
-        int                 pkmem = atomic_read(&portal_kmemory);
+        char                ipif_name[32];
+        __u32               ip;
+        __u32               netmask;
+        int                 up;
+        int                 nob;
+        struct timeval      tv;
         IB_PORT_ATTRIBUTES *pattr;
         FSTATUS             frc;
         int                 rc;
-        int                 n;
+        __u32               n;
         int                 i;
 
-        LASSERT (nal == &kibnal_api);
+        LASSERT (ni->ni_lnd == &the_kiblnd);
 
-        if (nal->nal_refct != 0) {
-                if (actual_limits != NULL)
-                        *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
-                /* This module got the first ref */
-                PORTAL_MODULE_USE;
-                return (PTL_OK);
+        /* Only 1 instance supported */
+        if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) {
+                CERROR ("Only 1 instance supported\n");
+                return -EPERM;
         }
 
-        LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
+        if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) {
+                CERROR ("Can't set credits(%d) > ntx(%d)\n",
+                        *kibnal_tunables.kib_credits,
+                        *kibnal_tunables.kib_ntx);
+                return -EINVAL;
+        }
 
-        frc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, 
-                                       &kibnal_data.kib_interfaces);
-        if (frc != FSUCCESS) {
-                CERROR("IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2) = %d\n",
-                        frc);
-                return -ENOSYS;
+        ni->ni_maxtxcredits = *kibnal_tunables.kib_credits;
+        ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits;
+
+        CLASSERT (LNET_MAX_INTERFACES > 1);
+
+        if (ni->ni_interfaces[0] == NULL) {
+                kibnal_data.kib_hca_idx = 0;
+        } else {
+                /* Use the HCA specified in 'networks=' */
+                if (ni->ni_interfaces[1] != NULL) {
+                        CERROR("Multiple interfaces not supported\n");
+                        return -EPERM;
+                }
+                
+                /* Parse <number> into kib_hca_idx */
+                nob = strlen(ni->ni_interfaces[0]);
+                if (sscanf(ni->ni_interfaces[0], "%d%n", 
+                           &kibnal_data.kib_hca_idx, &nob) < 1 ||
+                    nob != strlen(ni->ni_interfaces[0])) {
+                        CERROR("Can't parse interface '%s'\n",
+                               ni->ni_interfaces[0]);
+                        return -EINVAL;
+                }
+        }
+
+        rc = kibnal_get_ipif_name(ipif_name, sizeof(ipif_name),
+                                  kibnal_data.kib_hca_idx);
+        if (rc != 0)
+                return rc;
+        
+        rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask);
+        if (rc != 0) {
+                CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc);
+                return -ENETDOWN;
+        }
+        
+        if (!up) {
+                CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name);
+                return -ENETDOWN;
         }
+        
+        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
+
+        ni->ni_data = &kibnal_data;
+        kibnal_data.kib_ni = ni;
+
+        do_gettimeofday(&tv);
+        kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
 
-        init_MUTEX (&kibnal_data.kib_nid_mutex);
-        init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
-        kibnal_data.kib_nid = PTL_NID_ANY;
+        PORTAL_MODULE_USE;
 
         rwlock_init(&kibnal_data.kib_global_lock);
 
         kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
-        PORTAL_ALLOC (kibnal_data.kib_peers,
+        LIBCFS_ALLOC (kibnal_data.kib_peers,
                       sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
         if (kibnal_data.kib_peers == NULL) {
                 goto failed;
@@ -1369,22 +1899,18 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         spin_lock_init (&kibnal_data.kib_connd_lock);
         INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
         INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
+        INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies);
         init_waitqueue_head (&kibnal_data.kib_connd_waitq);
 
         spin_lock_init (&kibnal_data.kib_sched_lock);
-        INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
-        INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
         init_waitqueue_head (&kibnal_data.kib_sched_waitq);
 
         spin_lock_init (&kibnal_data.kib_tx_lock);
         INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
-        INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
-        init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
 
-        PORTAL_ALLOC (kibnal_data.kib_tx_descs,
-                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
-        if (kibnal_data.kib_tx_descs == NULL) {
-                CERROR ("Can't allocate tx descs\n");
+        rc = kibnal_alloc_tx_descs();
+        if (rc != 0) {
+                CERROR("Can't allocate tx descs\n");
                 goto failed;
         }
 
@@ -1392,24 +1918,15 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         kibnal_data.kib_init = IBNAL_INIT_DATA;
         /*****************************************************/
 
-        process_id.pid = requested_pid;
-        process_id.nid = kibnal_data.kib_nid;
-        
-        rc = lib_init(&kibnal_lib, nal, process_id,
-                      requested_limits, actual_limits);
-        if (rc != PTL_OK) {
-                CERROR("lib_init failed: error %d\n", rc);
-                goto failed;
-        }
-
-        /* lib interface initialised */
-        kibnal_data.kib_init = IBNAL_INIT_LIB;
-        /*****************************************************/
+        kibnal_data.kib_sdretry.RetryCount = *kibnal_tunables.kib_sd_retries;
+        kibnal_data.kib_sdretry.Timeout = (*kibnal_tunables.kib_timeout * 1000)/
+                                          *kibnal_tunables.kib_sd_retries;
 
         for (i = 0; i < IBNAL_N_SCHED; i++) {
-                rc = kibnal_thread_start (kibnal_scheduler, (void *)i);
+                rc = kibnal_thread_start (kibnal_scheduler,
+                                          (void *)(unsigned long)i);
                 if (rc != 0) {
-                        CERROR("Can't spawn iibnal scheduler[%d]: %d\n",
+                        CERROR("Can't spawn iib scheduler[%d]: %d\n",
                                i, rc);
                         goto failed;
                 }
@@ -1417,30 +1934,38 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
 
         rc = kibnal_thread_start (kibnal_connd, NULL);
         if (rc != 0) {
-                CERROR ("Can't spawn iibnal connd: %d\n", rc);
+                CERROR ("Can't spawn iib connd: %d\n", rc);
                 goto failed;
         }
 
         n = sizeof(kibnal_data.kib_hca_guids) /
             sizeof(kibnal_data.kib_hca_guids[0]);
-        frc = iibt_get_hca_guids(&n, kibnal_data.kib_hca_guids);
+        frc = iba_get_caguids(&n, kibnal_data.kib_hca_guids);
         if (frc != FSUCCESS) {
-                CERROR ("Can't get channel adapter guids: %d\n", frc);
+                CERROR ("Can't get HCA guids: %d\n", frc);
                 goto failed;
         }
+
         if (n == 0) {
-                CERROR ("No channel adapters found\n");
+                CERROR ("No HCAs found\n");
                 goto failed;
         }
 
-        /* Infinicon has per-HCA rather than per CQ completion handlers */
-        frc = iibt_open_hca(kibnal_data.kib_hca_guids[0],
-                            kibnal_ca_callback,
-                            kibnal_ca_async_callback,
-                            &kibnal_data.kib_hca,
+        if (n <= kibnal_data.kib_hca_idx) {
+                CERROR("Invalid HCA %d requested: (must be 0 - %d inclusive)\n",
+                       kibnal_data.kib_hca_idx, n - 1);
+                goto failed;
+        }
+        
+        /* Infinicon has per-HCA notification callbacks */
+        frc = iba_open_ca(kibnal_data.kib_hca_guids[kibnal_data.kib_hca_idx],
+                            kibnal_hca_callback,
+                            kibnal_hca_async_callback,
+                            NULL,
                             &kibnal_data.kib_hca);
         if (frc != FSUCCESS) {
-                CERROR ("Can't open CA[0]: %d\n", frc);
+                CERROR ("Can't open HCA[%d]: %d\n", 
+                        kibnal_data.kib_hca_idx, frc);
                 goto failed;
         }
         
@@ -1450,14 +1975,14 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
 
         kibnal_data.kib_hca_attrs.PortAttributesList = NULL;
         kibnal_data.kib_hca_attrs.PortAttributesListSize = 0;
-        frc = iibt_query_hca(kibnal_data.kib_hca,
-                             &kibnal_data.kib_hca_attrs, NULL);
+        frc = iba_query_ca(kibnal_data.kib_hca,
+                           &kibnal_data.kib_hca_attrs, NULL);
         if (frc != FSUCCESS) {
                 CERROR ("Can't size port attrs: %d\n", frc);
                 goto failed;
         }
         
-        PORTAL_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList,
+        LIBCFS_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList,
                      kibnal_data.kib_hca_attrs.PortAttributesListSize);
         if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL)
                 goto failed;
@@ -1466,10 +1991,11 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         kibnal_data.kib_init = IBNAL_INIT_PORTATTRS;
         /*****************************************************/
         
-        frc = iibt_query_hca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
-                             NULL);
+        frc = iba_query_ca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
+                           NULL);
         if (frc != FSUCCESS) {
-                CERROR ("Can't get port attrs for CA 0: %d\n", frc);
+                CERROR ("Can't get port attrs for HCA %d: %d\n",
+                        kibnal_data.kib_hca_idx, frc);
                 goto failed;
         }
 
@@ -1508,11 +2034,7 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
 
         CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
         
-        /* Active port found */
-        kibnal_data.kib_init = IBNAL_INIT_PORT;
-        /*****************************************************/
-
-        frc = iibt_sd_register(&kibnal_data.kib_sd, NULL);
+        frc = iba_sd_register(&kibnal_data.kib_sd, NULL);
         if (frc != FSUCCESS) {
                 CERROR ("Can't register with SD: %d\n", frc);
                 goto failed;
@@ -1522,7 +2044,7 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         kibnal_data.kib_init = IBNAL_INIT_SD;
         /*****************************************************/
 
-        frc = iibt_pd_allocate(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd);
+        frc = iba_alloc_pd(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd);
         if (frc != FSUCCESS) {
                 CERROR ("Can't create PD: %d\n", rc);
                 goto failed;
@@ -1532,73 +2054,14 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         kibnal_data.kib_init = IBNAL_INIT_PD;
         /*****************************************************/
 
-#if IBNAL_FMR
-        {
-                const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
-                struct ib_fmr_pool_param params = {
-                        .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
-                        .access            = (IB_ACCESS_LOCAL_WRITE |
-                                              IB_ACCESS_REMOTE_WRITE |
-                                              IB_ACCESS_REMOTE_READ),
-                        .pool_size         = pool_size,
-                        .dirty_watermark   = (pool_size * 3)/4,
-                        .flush_function    = NULL,
-                        .flush_arg         = NULL,
-                        .cache             = 1,
-                };
-                rc = ib_fmr_pool_create(kibnal_data.kib_pd, &params,
-                                        &kibnal_data.kib_fmr_pool);
-                if (rc != 0) {
-                        CERROR ("Can't create FMR pool size %d: %d\n", 
-                                pool_size, rc);
-                        goto failed;
-                }
-        }
-
-        /* flag FMR pool initialised */
-        kibnal_data.kib_init = IBNAL_INIT_FMR;
-#endif
-        /*****************************************************/
-        if (IBNAL_WHOLE_MEM) {
-                IB_MR_PHYS_BUFFER phys;
-                IB_ACCESS_CONTROL access;
-                kib_md_t *md = &kibnal_data.kib_md;
-
-                memset(&access, 0, sizeof(access));
-                access.s.MWBindable = 1;
-                access.s.LocalWrite = 1;
-                access.s.RdmaRead = 1;
-                access.s.RdmaWrite = 1;
-
-                phys.PhysAddr = 0;
-                phys.Length = max_phys_mem(&kibnal_data.kib_hca_attrs);
-                if (phys.Length == 0) {
-                        CERROR ("couldn't determine the end of phys mem\n");
-                        goto failed;
-                }
-       
-                rc = iibt_register_contig_physical_memory(kibnal_data.kib_hca,
-                                                          0,
-                                                          &phys, 1,
-                                                          0,
-                                                          kibnal_data.kib_pd,
-                                                          access,
-                                                          &md->md_handle,
-                                                          &md->md_addr,
-                                                          &md->md_lkey,
-                                                          &md->md_rkey);
-                if (rc != FSUCCESS) {
-                        CERROR("registering physical memory failed: %d\n", 
-                               rc);
-                        CERROR("falling back to registration per-rdma\n");
-                        md->md_handle = NULL;
-                } else {
-                        CDEBUG(D_NET, "registered "LPU64" bytes of mem\n",
-                               phys.Length);
-                        kibnal_data.kib_init = IBNAL_INIT_MR;
-                }
+        rc = kibnal_register_all_memory();
+        if (rc != 0) {
+                CERROR ("Can't register all memory\n");
+                goto failed;
         }
-
+        
+        /* flag whole memory MD initialised */
+        kibnal_data.kib_init = IBNAL_INIT_MD;
         /*****************************************************/
 
         rc = kibnal_setup_tx_descs();
@@ -1611,38 +2074,33 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         kibnal_data.kib_init = IBNAL_INIT_TXD;
         /*****************************************************/
         
-        {
-                uint32 nentries;
-
-                frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
-                                     &kibnal_data.kib_cq, &kibnal_data.kib_cq,
-                                     &nentries);
-                if (frc != FSUCCESS) {
-                        CERROR ("Can't create RX CQ: %d\n", frc);
-                        goto failed;
-                }
-
-                /* flag CQ initialised */
-                kibnal_data.kib_init = IBNAL_INIT_CQ;
+        frc = iba_create_cq(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(),
+                            &kibnal_data.kib_cq, &kibnal_data.kib_cq,
+                            &n);
+        if (frc != FSUCCESS) {
+                CERROR ("Can't create RX CQ: %d\n", frc);
+                goto failed;
+        }
 
-                if (nentries < IBNAL_CQ_ENTRIES) {
-                        CERROR ("CQ only has %d entries, need %d\n", 
-                                nentries, IBNAL_CQ_ENTRIES);
-                        goto failed;
-                }
+        /* flag CQ initialised */
+        kibnal_data.kib_init = IBNAL_INIT_CQ;
+        /*****************************************************/
+        
+        if (n < IBNAL_CQ_ENTRIES()) {
+                CERROR ("CQ only has %d entries: %d needed\n", 
+                        n, IBNAL_CQ_ENTRIES());
+                goto failed;
+        }
 
-                rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC);
-                if (rc != 0) {
-                        CERROR ("Failed to re-arm completion queue: %d\n", rc);
-                        goto failed;
-                }
+        rc = iba_rearm_cq(kibnal_data.kib_cq, CQEventSelNextWC);
+        if (rc != 0) {
+                CERROR ("Failed to re-arm completion queue: %d\n", rc);
+                goto failed;
         }
         
-        /*****************************************************/
-
-        rc = libcfs_nal_cmd_register(IIBNAL, &kibnal_cmd, NULL);
+        rc = kibnal_start_listener();
         if (rc != 0) {
-                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+                CERROR("Can't start listener: %d\n", rc);
                 goto failed;
         }
 
@@ -1650,26 +2108,18 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         kibnal_data.kib_init = IBNAL_INIT_ALL;
         /*****************************************************/
 
-        printk(KERN_INFO "Lustre: Infinicon IB NAL loaded "
-               "(initial mem %d)\n", pkmem);
-
-        return (PTL_OK);
+        return (0);
 
  failed:
-        kibnal_api_shutdown (&kibnal_api);    
-        return (PTL_FAIL);
+        kibnal_shutdown (ni);    
+        return (-ENETDOWN);
 }
 
 void __exit
 kibnal_module_fini (void)
 {
-#ifdef CONFIG_SYSCTL
-        if (kibnal_tunables.kib_sysctl != NULL)
-                unregister_sysctl_table (kibnal_tunables.kib_sysctl);
-#endif
-        PtlNIFini(kibnal_ni);
-
-        ptl_unregister_nal(IIBNAL);
+        lnet_unregister_lnd(&the_kiblnd);
+        kibnal_tunables_fini();
 }
 
 int __init
@@ -1677,46 +2127,22 @@ kibnal_module_init (void)
 {
         int    rc;
 
-        if (sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN) {
-                CERROR("sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN\n");
-                return -EINVAL;
-        }
-
-        /* the following must be sizeof(int) for proc_dointvec() */
-        if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) {
-                CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n");
-                return -EINVAL;
+        if (the_lnet.ln_ptlcompat != 0) {
+                LCONSOLE_ERROR("IIB does not support portals compatibility mode\n");
+                return -ENODEV;
         }
+        
+        rc = kibnal_tunables_init();
+        if (rc != 0)
+                return rc;
 
-        kibnal_api.nal_ni_init = kibnal_api_startup;
-        kibnal_api.nal_ni_fini = kibnal_api_shutdown;
-
-        /* Initialise dynamic tunables to defaults once only */
-        kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
-
-        rc = ptl_register_nal(IIBNAL, &kibnal_api);
-        if (rc != PTL_OK) {
-                CERROR("Can't register IBNAL: %d\n", rc);
-                return (-ENOMEM);               /* or something... */
-        }
+        lnet_register_lnd(&the_kiblnd);
 
-        /* Pure gateways want the NAL started up at module load time... */
-        rc = PtlNIInit(IIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
-        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
-                ptl_unregister_nal(IIBNAL);
-                return (-ENODEV);
-        }
-        
-#ifdef CONFIG_SYSCTL
-        /* Press on regardless even if registering sysctl doesn't work */
-        kibnal_tunables.kib_sysctl = 
-                register_sysctl_table (kibnal_top_ctl_table, 0);
-#endif
-        return (0);
+        return 0;
 }
 
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel Infinicon IB NAL v0.01");
+MODULE_DESCRIPTION("Kernel Infinicon IB LND v1.00");
 MODULE_LICENSE("GPL");
 
 module_init(kibnal_module_init);
index e16bd4c..0a2fa94 100644 (file)
 #include <linux/kmod.h>
 #include <linux/sysctl.h>
 
-#define DEBUG_SUBSYSTEM S_NAL
+#define DEBUG_SUBSYSTEM S_LND
 
 #include <libcfs/kp30.h>
-#include <portals/p30.h>
-#include <portals/lib-p30.h>
-#include <portals/nal.h>
+#include <lnet/lnet.h>
+#include <lnet/lib-lnet.h>
 
 #include <linux/iba/ibt.h>
 
 #error Invalid GCC version. Must use GCC >= 3.2.3
 #endif
 
-#define IBNAL_SERVICE_NAME   "iibnal"
-#define IBNAL_SERVICE_NUMBER 0x11b9a1
-
 #if CONFIG_SMP
 # define IBNAL_N_SCHED      num_online_cpus()   /* # schedulers */
 #else
 # define IBNAL_N_SCHED      1                   /* # schedulers */
 #endif
 
-#define IBNAL_MIN_RECONNECT_INTERVAL HZ         /* first failed connection retry... */
-#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ)    /* ...exponentially increasing to this */
+#define IBNAL_FMR                    0          /* map on demand v. use whole mem mapping */
 
-#define IBNAL_MSG_SIZE       (4<<10)            /* max size of queued messages (inc hdr) */
+/* tunables fixed at compile time */
+#define IBNAL_PEER_HASH_SIZE         101        /* # peer lists */
+#define IBNAL_RESCHED                100        /* # scheduler loops before reschedule */
+#define IBNAL_MSG_QUEUE_SIZE         8          /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER       7          /* when to eagerly return credits */
+#define IBNAL_MSG_SIZE              (4<<10)     /* max size of queued messages (inc hdr) */
+#define IBNAL_RDMA_BASE              0x0eeb0000
+#define IBNAL_STARTING_PSN           1
 
-#define IBNAL_MSG_QUEUE_SIZE   8                /* # messages/RDMAs in-flight */
-#define IBNAL_CREDIT_HIGHWATER 7                /* when to eagerly return credits */
+/* QP tunables */
 /* 7 indicates infinite retry attempts, Infinicon recommended 5 */
-#define IBNAL_RETRY            5                /* # times to retry */
-#define IBNAL_RNR_RETRY        5                /*  */
-#define IBNAL_CM_RETRY         5                /* # times to retry connection */
-#define IBNAL_FLOW_CONTROL     1
-#define IBNAL_ACK_TIMEOUT       20              /* supposedly 4 secs */
-
-#define IBNAL_NTX             64                /* # tx descs */
-/* this had to be dropped down so that we only register < 255 pages per
- * region.  this will change if we register all memory. */
-#define IBNAL_NTX_NBLK        128               /* # reserved tx descs */
-
-#define IBNAL_PEER_HASH_SIZE  101               /* # peer lists */
-
-#define IBNAL_RESCHED         100               /* # scheduler loops before reschedule */
-
-#define IBNAL_CONCURRENT_PEERS 1000             /* # nodes all talking at once to me */
-
-/* default vals for runtime tunables */
-#define IBNAL_IO_TIMEOUT      50                /* default comms timeout (seconds) */
+#define IBNAL_RETRY                  5          /* # times to retry */
+#define IBNAL_RNR_RETRY              5          /*  */
+#define IBNAL_CM_RETRY               5          /* # times to retry connection */
+#define IBNAL_FLOW_CONTROL           1
+#define IBNAL_ACK_TIMEOUT            20         /* supposedly 4 secs */
+#define IBNAL_EE_FLOW                1
+#define IBNAL_LOCAL_SUB              1
+#define IBNAL_FAILOVER_ACCEPTED      0
 
 /************************/
 /* derived constants... */
 
 /* TX messages (shared by all connections) */
-#define IBNAL_TX_MSGS       (IBNAL_NTX + IBNAL_NTX_NBLK)
-#define IBNAL_TX_MSG_BYTES  (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
-#define IBNAL_TX_MSG_PAGES  ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+#define IBNAL_TX_MSGS()       (*kibnal_tunables.kib_ntx)
+#define IBNAL_TX_MSG_BYTES()  (IBNAL_TX_MSGS() * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES()  ((IBNAL_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE)
 
-#define IBNAL_TX_MAX_SG (PTL_MD_MAX_IOV + 1)
+#if IBNAL_USE_FMR
+# define IBNAL_MAX_RDMA_FRAGS 1
+# define IBNAL_CONCURRENT_SENDS IBNAL_RX_MSGS
+#else
+# define IBNAL_MAX_RDMA_FRAGS LNET_MAX_IOV
+# define IBNAL_CONCURRENT_SENDS IBNAL_MSG_QUEUE_SIZE
+#endif
 
 /* RX messages (per connection) */
-#define IBNAL_RX_MSGS       IBNAL_MSG_QUEUE_SIZE
-#define IBNAL_RX_MSG_BYTES  (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
-#define IBNAL_RX_MSG_PAGES  ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
-
+#define IBNAL_RX_MSGS         (IBNAL_MSG_QUEUE_SIZE * 2)
+#define IBNAL_RX_MSG_BYTES    (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES    ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
 
-/* we may have up to 2 completions per transmit +
-   1 completion per receive, per connection */
-#define IBNAL_CQ_ENTRIES  ((2*IBNAL_TX_MSGS) +                          \
-                           (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
-
-#define IBNAL_RDMA_BASE  0x0eeb0000
-#define IBNAL_FMR        0
-#define IBNAL_WHOLE_MEM  1
-#define IBNAL_CKSUM      0
-//#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_PROCESS
-#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_INTERRUPT
-
-/* XXX I have no idea. */
-#define IBNAL_STARTING_PSN 1
+#define IBNAL_CQ_ENTRIES()  (IBNAL_TX_MSGS() * (1 + IBNAL_MAX_RDMA_FRAGS) +             \
+                             (IBNAL_RX_MSGS * *kibnal_tunables.kib_concurrent_peers))
 
 typedef struct
 {
-        int               kib_io_timeout;       /* comms timeout (seconds) */
+        char            **kib_hca_basename;     /* HCA base name */
+        char            **kib_ipif_basename;    /* IPoIB interface base name */
+        char            **kib_service_name;     /* global service name */
+        unsigned int     *kib_service_number;   /* global service number */
+        int              *kib_min_reconnect_interval; /* min connect retry seconds... */
+        int              *kib_max_reconnect_interval; /* max connect retry seconds */
+        int              *kib_concurrent_peers; /* max # peers */
+        int              *kib_cksum;            /* checksum kib_msg_t? */
+        int              *kib_timeout;          /* comms timeout (seconds) */
+        int              *kib_keepalive;        /* keepalive timeout (seconds) */
+        int              *kib_ntx;              /* # tx descs */
+        int              *kib_credits;          /* # concurrent sends */
+        int              *kib_peercredits;      /* # concurrent sends to 1 peer */
+        int              *kib_sd_retries;       /* # concurrent sends to 1 peer */
+        int              *kib_concurrent_sends; /* send work queue sizing */
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
         struct ctl_table_header *kib_sysctl;    /* sysctl interface */
+#endif
 } kib_tunables_t;
 
-/* some of these have specific types in the stack that just map back
- * to the uFOO types, like IB_{L,R}_KEY. */
+/* NB The Infinicon stack has specific typedefs for some things
+ * (e.g. IB_{L,R}_KEY), that just map back to __u32 etc */
 typedef struct
 {
         int               ibp_npages;           /* # pages */
-        int               ibp_mapped;           /* mapped? */
-        __u64             ibp_vaddr;            /* mapped region vaddr */
-        __u32             ibp_lkey;             /* mapped region lkey */
-        __u32             ibp_rkey;             /* mapped region rkey */
-        IB_HANDLE         ibp_handle;           /* mapped region handle */
         struct page      *ibp_pages[0];
 } kib_pages_t;
 
@@ -170,39 +164,35 @@ typedef struct
         __u64             kib_incarnation;      /* which one am I */
         int               kib_shutdown;         /* shut down? */
         atomic_t          kib_nthreads;         /* # live threads */
+        lnet_ni_t        *kib_ni;               /* _the_ iib instance */
 
-        __u64             kib_service_id;       /* service number I listen on */
         __u64             kib_port_guid;        /* my GUID (lo 64 of GID)*/
         __u16             kib_port_pkey;        /* my pkey, whatever that is */
-        ptl_nid_t         kib_nid;              /* my NID */
-        struct semaphore  kib_nid_mutex;        /* serialise NID ops */
-        struct semaphore  kib_nid_signal;       /* signal completion */
-        IB_HANDLE         kib_cep;              /* connection end point */
+        struct semaphore  kib_listener_signal;  /* signal completion */
+        IB_HANDLE         kib_listener_cep;     /* connection end point */
 
         rwlock_t          kib_global_lock;      /* stabilize peer/conn ops */
+        int               kib_ready;            /* CQ callback fired */
+        int               kib_checking_cq;      /* a scheduler is checking the CQ */
 
         struct list_head *kib_peers;            /* hash table of all my known peers */
         int               kib_peer_hash_size;   /* size of kib_peers */
         atomic_t          kib_npeers;           /* # peers extant */
         atomic_t          kib_nconns;           /* # connections extant */
 
+        struct list_head  kib_connd_zombies;    /* connections to free */
         struct list_head  kib_connd_conns;      /* connections to progress */
         struct list_head  kib_connd_peers;      /* peers waiting for a connection */
-        wait_queue_head_t kib_connd_waitq;      /* connection daemons sleep here */
-        unsigned long     kib_connd_waketime;   /* when connd will wake */
+        wait_queue_head_t kib_connd_waitq;      /* connection daemon sleep here */
         spinlock_t        kib_connd_lock;       /* serialise */
 
         wait_queue_head_t kib_sched_waitq;      /* schedulers sleep here */
-        struct list_head  kib_sched_txq;        /* tx requiring attention */
-        struct list_head  kib_sched_rxq;        /* rx requiring attention */
         spinlock_t        kib_sched_lock;       /* serialise */
 
         struct kib_tx    *kib_tx_descs;         /* all the tx descriptors */
         kib_pages_t      *kib_tx_pages;         /* premapped tx msg pages */
 
         struct list_head  kib_idle_txs;         /* idle tx descriptors */
-        struct list_head  kib_idle_nblk_txs;    /* idle reserved tx descriptors */
-        wait_queue_head_t kib_idle_tx_waitq;    /* block here for tx descriptor */
         __u64             kib_next_tx_cookie;   /* RDMA completion cookie */
         spinlock_t        kib_tx_lock;          /* serialise */
 
@@ -211,15 +201,13 @@ typedef struct
         IB_HANDLE         kib_pd;               /* protection domain */
         IB_HANDLE         kib_sd;               /* SD handle */
         IB_HANDLE         kib_cq;               /* completion queue */
-        kib_md_t          kib_md;               /* full-mem registration */
+        kib_md_t          kib_whole_mem;        /* whole-mem registration */
 
-        void             *kib_listen_handle;    /* where I listen for connections */
+        int               kib_hca_idx;          /* my HCA number */
+        uint64            kib_hca_guids[8];     /* all the HCA guids */
+        IB_CA_ATTRIBUTES  kib_hca_attrs;        /* where to get HCA attrs */
 
-        IBT_INTERFACE_UNION kib_interfaces;     /* The Infinicon IBT interface */
-
-        uint64              kib_hca_guids[8];   /* all the HCA guids */
-        IB_CA_ATTRIBUTES    kib_hca_attrs;      /* where to get HCA attrs */
-        FABRIC_OPERATION_DATA kib_fabopdata;    /* (un)advertise service record */
+        COMMAND_CONTROL_PARAMETERS kib_sdretry; /* control SD query retries */
 } kib_data_t;
 
 #define IBNAL_INIT_NOTHING         0
@@ -227,14 +215,12 @@ typedef struct
 #define IBNAL_INIT_LIB             2
 #define IBNAL_INIT_HCA             3
 #define IBNAL_INIT_PORTATTRS       4
-#define IBNAL_INIT_PORT            5
-#define IBNAL_INIT_SD              6
-#define IBNAL_INIT_PD              7
-#define IBNAL_INIT_FMR             8
-#define IBNAL_INIT_MR              9
-#define IBNAL_INIT_TXD             10
-#define IBNAL_INIT_CQ              11
-#define IBNAL_INIT_ALL             12
+#define IBNAL_INIT_SD              5
+#define IBNAL_INIT_PD              6
+#define IBNAL_INIT_MD              7
+#define IBNAL_INIT_TXD             8
+#define IBNAL_INIT_CQ              9
+#define IBNAL_INIT_ALL             10
 
 /************************************************************************
  * Wire message structs.
@@ -243,35 +229,60 @@ typedef struct
  * private data and SM service info), is LE on the wire.
  */
 
-/* also kib_md_t above */
+typedef struct kib_connparams
+{
+        __u32             ibcp_queue_depth;
+        __u32             ibcp_max_msg_size;
+        __u32             ibcp_max_frags;
+} WIRE_ATTR kib_connparams_t;
+
+typedef struct
+{
+        lnet_hdr_t        ibim_hdr;             /* portals header */
+        char              ibim_payload[0];      /* piggy-backed payload */
+} WIRE_ATTR kib_immediate_msg_t;
 
+#if IBNAL_USE_FMR
 typedef struct
 {
-        __u32                 rd_nob;           /* # of bytes */
-        __u64                 rd_addr;          /* remote io vaddr */
+       __u64             rd_addr;              /* IO VMA address */
+       __u32             rd_nob;               /* # of bytes */
+       __u32             rd_key;               /* remote key */
 } WIRE_ATTR kib_rdma_desc_t;
+#else
+typedef struct
+{
+        __u32             rf_nob;               /* # of bytes */
+        __u64             rf_addr;              /* remote io vaddr */
+} WIRE_ATTR kib_rdma_frag_t;
 
 typedef struct
 {
-        ptl_hdr_t         ibim_hdr;             /* portals header */
-        char              ibim_payload[0];      /* piggy-backed payload */
-} WIRE_ATTR kib_immediate_msg_t;
+        __u32             rd_key;               /* local/remote key */
+        __u32             rd_nfrag;             /* # fragments */
+        kib_rdma_frag_t   rd_frags[0];          /* buffer frags */
+} WIRE_ATTR kib_rdma_desc_t;
+#endif
+
+typedef struct
+{
+        lnet_hdr_t        ibprm_hdr;            /* LNET header */
+        __u64             ibprm_cookie;         /* opaque completion cookie */
+} WIRE_ATTR kib_putreq_msg_t;
 
-/* these arrays serve two purposes during rdma.  they are built on the passive
- * side and sent to the active side as remote arguments.  On the active side
- * the descs are used as a data structure on the way to local gather items.
- * the different roles result in split local/remote meaning of desc->rd_key */
 typedef struct
 {
-        ptl_hdr_t         ibrm_hdr;             /* portals header */
-        __u64             ibrm_cookie;          /* opaque completion cookie */
-        __u32             ibrm_num_descs;       /* how many descs */
-        __u32             rd_key;               /* remote key */
-        kib_rdma_desc_t   ibrm_desc[0];         /* where to suck/blow */
-} WIRE_ATTR kib_rdma_msg_t;
+        __u64             ibpam_src_cookie;     /* reflected completion cookie */
+        __u64             ibpam_dst_cookie;     /* opaque completion cookie */
+        kib_rdma_desc_t   ibpam_rd;             /* sender's sink buffer */
+} WIRE_ATTR kib_putack_msg_t;
 
-#define kib_rdma_msg_len(num_descs) \
-        offsetof(kib_msg_t, ibm_u.rdma.ibrm_desc[num_descs])
+typedef struct
+{
+        lnet_hdr_t        ibgm_hdr;             /* LNET header */
+        __u64             ibgm_cookie;          /* opaque completion cookie */
+        kib_rdma_desc_t   ibgm_rd;              /* sender's sink buffer */
+} WIRE_ATTR kib_get_msg_t;
 
 typedef struct
 {
@@ -281,30 +292,49 @@ typedef struct
 
 typedef struct
 {
-        __u32              ibm_magic;           /* I'm an openibnal message */
-        __u16              ibm_version;         /* this is my version number */
-        __u8               ibm_type;            /* msg type */
-        __u8               ibm_credits;         /* returned credits */
-#if IBNAL_CKSUM
-        __u32              ibm_nob;
-        __u32              ibm_cksum;
-#endif
+        /* First 2 fields fixed FOR ALL TIME */
+        __u32             ibm_magic;            /* I'm an openibnal message */
+        __u16             ibm_version;          /* this is my version number */
+
+        __u8              ibm_type;             /* msg type */
+        __u8              ibm_credits;          /* returned credits */
+        __u32             ibm_nob;              /* # bytes in whole message */
+        __u32             ibm_cksum;            /* checksum (0 == no checksum) */
+        __u64             ibm_srcnid;           /* sender's NID */
+        __u64             ibm_srcstamp;         /* sender's incarnation */
+        __u64             ibm_dstnid;           /* destination's NID */
+        __u64             ibm_dststamp;         /* destination's incarnation */
+        __u64             ibm_seq;              /* sequence number */
+
         union {
+                kib_connparams_t      connparams;
                 kib_immediate_msg_t   immediate;
-                kib_rdma_msg_t        rdma;
+                kib_putreq_msg_t      putreq;
+                kib_putack_msg_t      putack;
+                kib_get_msg_t         get;
                 kib_completion_msg_t  completion;
         } WIRE_ATTR ibm_u;
 } WIRE_ATTR kib_msg_t;
 
-#define IBNAL_MSG_MAGIC       0x0be91b91        /* unique magic */
-#define IBNAL_MSG_VERSION              1        /* current protocol version */
+#define IBNAL_MSG_MAGIC LNET_PROTO_IIB_MAGIC    /* unique magic */
+#define IBNAL_MSG_VERSION              2        /* current protocol version */
+#define IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD 1   /* previous version */
 
+#define IBNAL_MSG_CONNREQ           0xc0        /* connection request */
+#define IBNAL_MSG_CONNACK           0xc1        /* connection acknowledge */
 #define IBNAL_MSG_NOOP              0xd0        /* nothing (just credits) */
-#define IBNAL_MSG_IMMEDIATE         0xd1        /* portals hdr + payload */
-#define IBNAL_MSG_PUT_RDMA          0xd2        /* portals PUT hdr + source rdma desc */
-#define IBNAL_MSG_PUT_DONE          0xd3        /* signal PUT rdma completion */
-#define IBNAL_MSG_GET_RDMA          0xd4        /* portals GET hdr + sink rdma desc */
-#define IBNAL_MSG_GET_DONE          0xd5        /* signal GET rdma completion */
+#define IBNAL_MSG_IMMEDIATE         0xd1        /* immediate */
+#define IBNAL_MSG_PUT_REQ           0xd2        /* putreq (src->sink) */
+#define IBNAL_MSG_PUT_NAK           0xd3        /* completion (sink->src) */
+#define IBNAL_MSG_PUT_ACK           0xd4        /* putack (sink->src) */
+#define IBNAL_MSG_PUT_DONE          0xd5        /* completion (src->sink) */
+#define IBNAL_MSG_GET_REQ           0xd6        /* getreq (sink->src) */
+#define IBNAL_MSG_GET_DONE          0xd7        /* completion (src->sink: all OK) */
+
+/* connection rejection reasons */
+#define IBNAL_REJECT_CONN_RACE       0          /* You lost connection race */
+#define IBNAL_REJECT_NO_RESOURCES    1          /* Out of memory/conns etc */
+#define IBNAL_REJECT_FATAL           2          /* Anything else */
 
 /***********************************************************************/
 
@@ -312,431 +342,167 @@ typedef struct kib_rx                           /* receive message */
 {
         struct list_head          rx_list;      /* queue for attention */
         struct kib_conn          *rx_conn;      /* owning conn */
-        int                       rx_rdma;      /* RDMA completion posted? */
-        int                       rx_posted;    /* posted? */
-        __u64                     rx_vaddr;     /* pre-mapped buffer (hca vaddr) */
+        int                       rx_nob;       /* # bytes received (-1 while posted) */
+        __u64                     rx_hca_msg;   /* pre-mapped buffer (hca vaddr) */
         kib_msg_t                *rx_msg;       /* pre-mapped buffer (host vaddr) */
-        IB_WORK_REQ               rx_wrq;
+        IB_WORK_REQ2              rx_wrq;
         IB_LOCAL_DATASEGMENT      rx_gl;        /* and its memory */
 } kib_rx_t;
 
 typedef struct kib_tx                           /* transmit message */
 {
         struct list_head          tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
-        int                       tx_isnblk;    /* I'm reserved for non-blocking sends */
         struct kib_conn          *tx_conn;      /* owning conn */
         int                       tx_mapped;    /* mapped for RDMA? */
         int                       tx_sending;   /* # tx callbacks outstanding */
+        int                       tx_queued;    /* queued for sending */
+        int                       tx_waiting;   /* waiting for peer */
         int                       tx_status;    /* completion status */
         unsigned long             tx_deadline;  /* completion deadline */
-        int                       tx_passive_rdma; /* peer sucks/blows */
-        int                       tx_passive_rdma_wait; /* waiting for peer to complete */
-        __u64                     tx_passive_rdma_cookie; /* completion cookie */
-        lib_msg_t                *tx_libmsg[2]; /* lib msgs to finalize on completion */
-        kib_md_t                  tx_md;        /* RDMA mapping (active/passive) */
-        __u64                     tx_vaddr;     /* pre-mapped buffer (hca vaddr) */
+        __u64                     tx_cookie;    /* completion cookie */
+        lnet_msg_t               *tx_lntmsg[2]; /* lnet msgs to finalize on completion */
         kib_msg_t                *tx_msg;       /* pre-mapped buffer (host vaddr) */
-        int                       tx_nsp;       /* # send work items */
-        IB_WORK_REQ               tx_wrq[IBNAL_TX_MAX_SG];    /* send work items... */
-        IB_LOCAL_DATASEGMENT      tx_gl[IBNAL_TX_MAX_SG];     /* ...and their memory */
+        __u64                     tx_hca_msg;   /* pre-mapped buffer (HCA vaddr) */
+        int                       tx_nwrq;      /* # send work items */
+#if IBNAL_USE_FMR
+        IB_WORK_REQ2              tx_wrq[2];    /* send work items... */
+        IB_LOCAL_DATASEGMENT      tx_gl[2];     /* ...and their memory */
+        kib_rdma_desc_t           tx_rd[1];     /* rdma descriptor */
+        kib_md_t                  tx_md;        /* mapping */
+        __u64                    *tx_pages;     /* page phys addrs */
+#else
+        IB_WORK_REQ2             *tx_wrq;       /* send work items... */
+        IB_LOCAL_DATASEGMENT     *tx_gl;        /* ...and their memory */
+        kib_rdma_desc_t          *tx_rd;        /* rdma descriptor (src buffers) */
+#endif
 } kib_tx_t;
 
-#define KIB_TX_UNMAPPED       0
-#define KIB_TX_MAPPED         1
-#define KIB_TX_MAPPED_FMR     2
-
-typedef struct kib_wire_connreq
-{
-        __u32        wcr_magic;                 /* I'm an openibnal connreq */
-        __u16        wcr_version;               /* this is my version number */
-        __u16        wcr_queue_depth;           /* this is my receive queue size */
-        __u64        wcr_nid;                   /* peer's NID */
-        __u64        wcr_incarnation;           /* peer's incarnation */
-} kib_wire_connreq_t;
-
-typedef struct kib_gid
-{
-        __u64   hi, lo;
-} kib_gid_t;
-
-typedef struct kib_connreq
+typedef struct
 {
-        /* connection-in-progress */
-        struct kib_conn                    *cr_conn;
-        kib_wire_connreq_t                  cr_wcr;
-        __u64                               cr_tid;
-        IB_SERVICE_RECORD                   cr_service;
-        kib_gid_t                           cr_gid;
-        IB_PATH_RECORD                      cr_path;
-        CM_REQUEST_INFO                     cr_cmreq;
-        CM_CONN_INFO                        cr_discarded;
-} kib_connreq_t;
+        /* scratchpad during connection establishment */
+        IB_QP_ATTRIBUTES_QUERY cv_qpattrs;
+        QUERY                  cv_query;
+        IB_SERVICE_RECORD      cv_svcrec;
+        IB_PATH_RECORD         cv_path;
+        CM_CONN_INFO           cv_cmci;
+} kib_connvars_t;
 
 typedef struct kib_conn
 {
         struct kib_peer    *ibc_peer;           /* owning peer */
         struct list_head    ibc_list;           /* stash on peer's conn list */
         __u64               ibc_incarnation;    /* which instance of the peer */
+        __u64               ibc_txseq;          /* tx sequence number */
+        __u64               ibc_rxseq;          /* rx sequence number */
+        __u32               ibc_version;        /* peer protocol version */
         atomic_t            ibc_refcount;       /* # users */
         int                 ibc_state;          /* what's happening */
-        atomic_t            ibc_nob;            /* # bytes buffered */
         int                 ibc_nsends_posted;  /* # uncompleted sends */
         int                 ibc_credits;        /* # credits I have */
         int                 ibc_outstanding_credits; /* # credits to return */
-        int                 ibc_rcvd_disconnect;/* received discon request */
-        int                 ibc_sent_disconnect;/* sent discon request */
+        int                 ibc_reserved_credits; /* # credits for ACK/DONE msgs */
+        unsigned long       ibc_last_send;      /* time of last send */
+        struct list_head    ibc_early_rxs;      /* rxs completed before ESTABLISHED */
+        struct list_head    ibc_tx_queue_nocred; /* sends that don't need a cred */
+        struct list_head    ibc_tx_queue_rsrvd; /* sends that need a reserved cred */
         struct list_head    ibc_tx_queue;       /* send queue */
         struct list_head    ibc_active_txs;     /* active tx awaiting completion */
         spinlock_t          ibc_lock;           /* serialise */
         kib_rx_t           *ibc_rxs;            /* the rx descs */
         kib_pages_t        *ibc_rx_pages;       /* premapped rx msg pages */
         IB_HANDLE           ibc_qp;             /* queue pair */
-        IB_HANDLE           ibc_cep;            /* connection ID? */
-        IB_QP_ATTRIBUTES_QUERY ibc_qp_attrs;    /* QP attrs */
-        kib_connreq_t      *ibc_connreq;        /* connection request state */
+        IB_HANDLE           ibc_cep;            /* CM endpoint */
+        kib_connvars_t     *ibc_cvars;          /* connection scratchpad */
 } kib_conn_t;
 
 #define IBNAL_CONN_INIT_NOTHING      0          /* initial state */
 #define IBNAL_CONN_INIT_QP           1          /* ibc_qp set up */
 #define IBNAL_CONN_CONNECTING        2          /* started to connect */
 #define IBNAL_CONN_ESTABLISHED       3          /* connection established */
-#define IBNAL_CONN_SEND_DREQ         4          /* to send disconnect req */
-#define IBNAL_CONN_DREQ              5          /* sent disconnect req */
-#define IBNAL_CONN_DREP              6          /* sent disconnect rep */
-#define IBNAL_CONN_DISCONNECTED      7          /* no more QP or CM traffic */
+#define IBNAL_CONN_DISCONNECTING     4          /* to send disconnect req */
+#define IBNAL_CONN_DISCONNECTED      5          /* no more QP or CM traffic */
 
-#define KIB_ASSERT_CONN_STATE(conn, state) do {                         \
-        LASSERTF((conn)->ibc_state == state, "%d\n", conn->ibc_state);  \
-} while (0)
-
-#define KIB_ASSERT_CONN_STATE_RANGE(conn, low, high) do {               \
-        LASSERTF(low <= high, "%d %d\n", low, high);                    \
-        LASSERTF((conn)->ibc_state >= low && (conn)->ibc_state <= high, \
-                 "%d\n", conn->ibc_state);                              \
-} while (0)
+/* types of connection */
+#define IBNAL_CONN_ACTIVE            0          /* active connect */
+#define IBNAL_CONN_PASSIVE           1          /* passive connect */
+#define IBNAL_CONN_WAITING           2          /* waiting for connect */
 
 typedef struct kib_peer
 {
         struct list_head    ibp_list;           /* stash on global peer list */
         struct list_head    ibp_connd_list;     /* schedule on kib_connd_peers */
-        ptl_nid_t           ibp_nid;            /* who's on the other end(s) */
+        lnet_nid_t          ibp_nid;            /* who's on the other end(s) */
         atomic_t            ibp_refcount;       /* # users */
         int                 ibp_persistence;    /* "known" peer refs */
+        int                 ibp_version;        /* protocol version */
         struct list_head    ibp_conns;          /* all active connections */
         struct list_head    ibp_tx_queue;       /* msgs waiting for a conn */
-        int                 ibp_connecting;     /* connecting+accepting */
+        int                 ibp_connecting;     /* active connects in progress */
+        int                 ibp_accepting;      /* passive connects in progress */
+        int                 ibp_passivewait;    /* waiting for peer to connect */
+        unsigned long       ibp_passivewait_deadline; /* when passive wait must complete */
         unsigned long       ibp_reconnect_time; /* when reconnect may be attempted */
         unsigned long       ibp_reconnect_interval; /* exponential backoff */
+        int                 ibp_error;          /* errno on closing this peer */
+        cfs_time_t          ibp_last_alive;     /* when (in jiffies) I was last alive */
 } kib_peer_t;
 
 
-extern lib_nal_t       kibnal_lib;
 extern kib_data_t      kibnal_data;
 extern kib_tunables_t  kibnal_tunables;
 
 /******************************************************************************/
-/* Infinicon IBT interface wrappers */
-#define IIBT_IF (kibnal_data.kib_interfaces.ver2)
-
-static inline FSTATUS
-iibt_get_hca_guids(uint32 *hca_count, EUI64 *hca_guid_list)
-{
-        return IIBT_IF.GetCaGuids(hca_count, hca_guid_list);
-}
-
-static inline FSTATUS
-iibt_open_hca(EUI64                    hca_guid,
-             IB_COMPLETION_CALLBACK   completion_callback,
-             IB_ASYNC_EVENT_CALLBACK  async_event_callback,
-             void                    *arg,
-             IB_HANDLE               *handle)
-{
-        return IIBT_IF.Vpi.OpenCA(hca_guid, completion_callback,
-                                  async_event_callback, arg, handle);
-}
-
-static inline FSTATUS
-iibt_query_hca(IB_HANDLE hca_handle, IB_CA_ATTRIBUTES *attrs, void **argp)
-{
-        return IIBT_IF.Vpi.QueryCA(hca_handle, attrs, argp);
-}
-
-static inline FSTATUS
-iibt_close_hca(IB_HANDLE hca_handle)
-{
-        return IIBT_IF.Vpi.CloseCA(hca_handle);
-}
-
-static inline FSTATUS
-iibt_pd_allocate(IB_HANDLE hca_handle, __u32 max_avs, IB_HANDLE *pd_handle)
-{
-        return IIBT_IF.Vpi.AllocatePD(hca_handle, max_avs, pd_handle);
-}
-
-static inline FSTATUS
-iibt_pd_free(IB_HANDLE pd_handle)
-{
-        return IIBT_IF.Vpi.FreePD(pd_handle);
-}
-
-static inline FSTATUS
-iibt_register_physical_memory(IB_HANDLE hca_handle,
-                              IB_VIRT_ADDR requested_io_va,
-                              void *phys_buffers, uint64 nphys_buffers,
-                              uint32 io_va_offset, IB_HANDLE pd_handle,
-                              IB_ACCESS_CONTROL access,
-                              IB_HANDLE *mem_handle,
-                              IB_VIRT_ADDR *actual_io_va,
-                              IB_L_KEY *lkey, IB_R_KEY *rkey)
-{
-        return IIBT_IF.Vpi.RegisterPhysMemRegion(hca_handle, requested_io_va,
-                                                 phys_buffers, nphys_buffers,
-                                                 io_va_offset, pd_handle,
-                                                 access,
-                                                 mem_handle, actual_io_va,
-                                                 lkey, rkey);
-}
-
-static inline FSTATUS
-iibt_register_contig_physical_memory(IB_HANDLE hca_handle,
-                                     IB_VIRT_ADDR requested_io_va,
-                                     IB_MR_PHYS_BUFFER *phys_buffers,
-                                     uint64 nphys_buffers,
-                                     uint32 io_va_offset, IB_HANDLE pd_handle,
-                                     IB_ACCESS_CONTROL access,
-                                     IB_HANDLE *mem_handle,
-                                     IB_VIRT_ADDR *actual_io_va,
-                                     IB_L_KEY *lkey, IB_R_KEY *rkey)
-{
-        return IIBT_IF.Vpi.RegisterContigPhysMemRegion(hca_handle,
-                                                       requested_io_va,
-                                                       phys_buffers,
-                                                       nphys_buffers,
-                                                       io_va_offset, pd_handle,
-                                                       access,
-                                                       mem_handle, actual_io_va,
-                                                       lkey, rkey);
-}
-
-static inline FSTATUS
-iibt_register_memory(IB_HANDLE hca_handle,
-                     void *virt_addr, unsigned int length,
-                     IB_HANDLE pd_handle,
-                     IB_ACCESS_CONTROL access,
-                     IB_HANDLE *mem_handle,
-                     IB_L_KEY *lkey, IB_R_KEY *rkey)
-{
-        return IIBT_IF.Vpi.RegisterMemRegion(hca_handle,
-                                             virt_addr, length,
-                                             pd_handle,
-                                             access,
-                                             mem_handle,
-                                             lkey, rkey);
-}
-
-static inline FSTATUS
-iibt_deregister_memory(IB_HANDLE mem_handle)
-{
-        return IIBT_IF.Vpi.DeregisterMemRegion(mem_handle);
-}
-
-static inline FSTATUS
-iibt_cq_create(IB_HANDLE hca_handle, uint32 requested_size,
-              void *arg, IB_HANDLE *cq_handle, uint32 *actual_size)
-{
-        return IIBT_IF.Vpi.CreateCQ(hca_handle, requested_size,
-                                   arg, cq_handle, actual_size);
-}
-
-static inline FSTATUS
-iibt_cq_poll(IB_HANDLE cq_handle, IB_WORK_COMPLETION *wc)
-{
-        return IIBT_IF.Vpi.PollCQ(cq_handle, wc);
-}
-
-static inline FSTATUS
-iibt_cq_rearm(IB_HANDLE cq_handle, IB_CQ_EVENT_SELECT select)
-{
-        return IIBT_IF.Vpi.RearmCQ(cq_handle, select);
-}
-
-static inline FSTATUS
-iibt_cq_destroy(IB_HANDLE cq_handle)
-{
-        return IIBT_IF.Vpi.DestroyCQ(cq_handle);
-}
-
-static inline FSTATUS
-iibt_qp_create(IB_HANDLE hca_handle, IB_QP_ATTRIBUTES_CREATE *create_attr,
-              void *arg, IB_HANDLE *cq_handle,
-              IB_QP_ATTRIBUTES_QUERY *query_attr)
-{
-        return IIBT_IF.Vpi.CreateQP(hca_handle, create_attr, arg, cq_handle,
-                                    query_attr);
-}
-
-static inline FSTATUS
-iibt_qp_query(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_QUERY *query_attr,
-              void **arg_ptr)
-{
-        return IIBT_IF.Vpi.QueryQP(qp_handle, query_attr, arg_ptr);
-}
-
-static inline FSTATUS
-iibt_qp_modify(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_MODIFY *modify_attr,
-               IB_QP_ATTRIBUTES_QUERY *query_attr)
-{
-        return IIBT_IF.Vpi.ModifyQP(qp_handle, modify_attr, query_attr);
-}
-
-static inline FSTATUS
-iibt_qp_destroy(IB_HANDLE qp_handle)
-{
-        return IIBT_IF.Vpi.DestroyQP(qp_handle);
-}
-
-static inline FSTATUS
-iibt_postrecv(IB_HANDLE qp_handle, IB_WORK_REQ *work_req)
-{
-        return IIBT_IF.Vpi.PostRecv(qp_handle, work_req);
-}
-
-static inline FSTATUS
-iibt_postsend(IB_HANDLE qp_handle, IB_WORK_REQ *work_req)
-{
-        return IIBT_IF.Vpi.PostSend(qp_handle, work_req);
-}
-
-static inline FSTATUS
-iibt_sd_register(IB_HANDLE *sd_handle, CLIENT_CONTROL_PARAMETERS *p)
-{
-        return IIBT_IF.Sdi.Register(sd_handle, p);
-}
-
-static inline FSTATUS
-iibt_sd_deregister(IB_HANDLE sd_handle)
-{
-        return IIBT_IF.Sdi.Deregister(sd_handle);
-}
-
-static inline FSTATUS
-iibt_sd_port_fabric_operation(IB_HANDLE sd_handle, EUI64 port_guid,
-                              FABRIC_OPERATION_DATA *fod,
-                              PFABRIC_OPERATION_CALLBACK callback,
-                              COMMAND_CONTROL_PARAMETERS *p, void *arg)
-{
-        return IIBT_IF.Sdi.PortFabricOperation(sd_handle, port_guid,
-                                               fod, callback, p, arg);
-}
-
-static inline FSTATUS
-iibt_sd_query_port_fabric_information(IB_HANDLE sd_handle, EUI64 port_guid,
-                                      QUERY *qry,
-                                      PQUERY_CALLBACK callback,
-                                      COMMAND_CONTROL_PARAMETERS *p, void *arg)
-{
-        return IIBT_IF.Sdi.QueryPortFabricInformation(sd_handle, port_guid,
-                                                      qry, callback, p, arg);
-}
-
-static inline IB_HANDLE
-iibt_cm_create_cep(CM_CEP_TYPE type)
-{
-        return IIBT_IF.Cmi.CmCreateCEP(type);
-}
-
-static inline FSTATUS
-iibt_cm_modify_cep(IB_HANDLE cep, uint32 attr, char* value, uint32 len,
-                   uint32 offset)
-{
-        return IIBT_IF.Cmi.CmModifyCEP(cep, attr, value, len, offset);
-}
-
-static inline FSTATUS
-iibt_cm_destroy_cep(IB_HANDLE cep_handle)
-{
-        return IIBT_IF.Cmi.CmDestroyCEP(cep_handle);
-}
-
-static inline FSTATUS
-iibt_cm_listen(IB_HANDLE cep, CM_LISTEN_INFO *info,
-               PFN_CM_CALLBACK callback, void *arg)
-{
-        return IIBT_IF.Cmi.CmListen(cep, info, callback, arg);
-}
-
-static inline FSTATUS
-iibt_cm_cancel(IB_HANDLE cep)
-{
-        return IIBT_IF.Cmi.CmCancel(cep);
-}
-
-static inline FSTATUS
-iibt_cm_accept(IB_HANDLE cep,
-               CM_CONN_INFO *send_info, CM_CONN_INFO *recv_info,
-               PFN_CM_CALLBACK callback, void *arg,
-               IB_HANDLE *new_cep)
-{
-        return IIBT_IF.Cmi.CmAccept(cep,
-                                    send_info, recv_info,
-                                    callback, arg, new_cep);
-}
-
-static inline FSTATUS
-iibt_cm_reject(IB_HANDLE cep, CM_REJECT_INFO *rej)
-{
-        return IIBT_IF.Cmi.CmReject(cep, rej);
-}
-
-static inline FSTATUS
-iibt_cm_disconnect(IB_HANDLE cep, CM_DREQUEST_INFO *req,
-                   CM_DREPLY_INFO *reply)
-{
-        return IIBT_IF.Cmi.CmDisconnect(cep, req, reply);
-}
-
-static inline FSTATUS
-iibt_cm_connect (IB_HANDLE cep, CM_REQUEST_INFO *req,
-                 PFN_CM_CALLBACK callback, void *arg)
-{
-        return IIBT_IF.Cmi.CmConnect (cep, req, callback, arg);
-}
-
-static inline int wrq_signals_completion(IB_WORK_REQ *wrq)
-{
-        return wrq->Req.SendRC.Options.s.SignaledCompletion == 1;
-}
-
-
-/******************************************************************************/
 
 /* these are purposely avoiding using local vars so they don't increase
  * stack consumption. */
 
-#define kib_peer_addref(peer) do {                                      \
-        LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n",          \
-                 atomic_read(&peer->ibp_refcount));                     \
-        CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n",                   \
-               peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \
-        atomic_inc(&peer->ibp_refcount);                                \
+#define kibnal_conn_addref(conn)                                \
+do {                                                            \
+        CDEBUG(D_NET, "conn[%p] (%d)++\n",                      \
+               (conn), atomic_read(&(conn)->ibc_refcount));     \
+        LASSERT(atomic_read(&(conn)->ibc_refcount) > 0);        \
+        atomic_inc(&(conn)->ibc_refcount);                      \
+} while (0)
+
+#define kibnal_conn_decref(conn)                                              \
+do {                                                                          \
+        unsigned long   flags;                                                \
+                                                                              \
+        CDEBUG(D_NET, "conn[%p] (%d)--\n",                                    \
+               (conn), atomic_read(&(conn)->ibc_refcount));                   \
+        LASSERT(atomic_read(&(conn)->ibc_refcount) > 0);                      \
+        if (atomic_dec_and_test(&(conn)->ibc_refcount)) {                     \
+                spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);        \
+                list_add_tail(&(conn)->ibc_list,                              \
+                              &kibnal_data.kib_connd_zombies);                \
+                wake_up(&kibnal_data.kib_connd_waitq);                        \
+                spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);   \
+        }                                                                     \
 } while (0)
 
-#define kib_peer_decref(peer) do {                                      \
-        LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n",          \
-                 atomic_read(&peer->ibp_refcount));                     \
-        CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n",                   \
-               peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \
-        if (atomic_dec_and_test (&peer->ibp_refcount)) {                \
-                CDEBUG (D_NET, "destroying peer "LPX64" %p\n",          \
-                        peer->ibp_nid, peer);                           \
-                kibnal_destroy_peer (peer);                             \
-        }                                                               \
+#define kibnal_peer_addref(peer)                                \
+do {                                                            \
+        CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n",                \
+               (peer), libcfs_nid2str((peer)->ibp_nid),         \
+               atomic_read (&(peer)->ibp_refcount));            \
+        LASSERT(atomic_read(&(peer)->ibp_refcount) > 0);        \
+        atomic_inc(&(peer)->ibp_refcount);                      \
+} while (0)
+
+#define kibnal_peer_decref(peer)                                \
+do {                                                            \
+        CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n",                \
+               (peer), libcfs_nid2str((peer)->ibp_nid),         \
+               atomic_read (&(peer)->ibp_refcount));            \
+        LASSERT(atomic_read(&(peer)->ibp_refcount) > 0);        \
+        if (atomic_dec_and_test(&(peer)->ibp_refcount))         \
+                kibnal_destroy_peer(peer);                      \
 } while (0)
 
 /******************************************************************************/
 
 static inline struct list_head *
-kibnal_nid2peerlist (ptl_nid_t nid)
+kibnal_nid2peerlist (lnet_nid_t nid)
 {
         unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size;
 
@@ -750,17 +516,79 @@ kibnal_peer_active(kib_peer_t *peer)
         return (!list_empty(&peer->ibp_list));
 }
 
+static inline int
+kibnal_peer_connecting(kib_peer_t *peer)
+{
+        /* Am I expecting a connection to materialise? */
+        return (peer->ibp_connecting != 0 ||
+                peer->ibp_accepting != 0 ||
+                peer->ibp_passivewait);
+}
+
 static inline void
 kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
 {
-        /* CAVEAT EMPTOR: tx takes caller's ref on conn */
+        struct list_head  *q;
+        
+        LASSERT (tx->tx_nwrq > 0);              /* work items set up */
+        LASSERT (!tx->tx_queued);               /* not queued for sending already */
+
+        tx->tx_queued = 1;
+        tx->tx_deadline = jiffies + (*kibnal_tunables.kib_timeout * HZ);
+
+        if (tx->tx_conn == NULL) {
+                kibnal_conn_addref(conn);
+                tx->tx_conn = conn;
+                LASSERT (tx->tx_msg->ibm_type != IBNAL_MSG_PUT_DONE);
+        } else {
+                LASSERT (tx->tx_conn == conn);
+                LASSERT (tx->tx_msg->ibm_type == IBNAL_MSG_PUT_DONE);
+        }
 
-        LASSERT (tx->tx_nsp > 0);               /* work items set up */
-        LASSERT (tx->tx_conn == NULL);          /* only set here */
+        if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
+                /* All messages have simple credit control */
+                q = &conn->ibc_tx_queue;
+        } else {
+                LASSERT (conn->ibc_version == IBNAL_MSG_VERSION);
+                
+                switch (tx->tx_msg->ibm_type) {
+                case IBNAL_MSG_PUT_REQ:
+                case IBNAL_MSG_GET_REQ:
+                        /* RDMA request: reserve a buffer for the RDMA reply
+                         * before sending */
+                        q = &conn->ibc_tx_queue_rsrvd;
+                        break;
+
+                case IBNAL_MSG_PUT_NAK:
+                case IBNAL_MSG_PUT_ACK:
+                case IBNAL_MSG_PUT_DONE:
+                case IBNAL_MSG_GET_DONE:
+                        /* RDMA reply/completion: no credits; peer has reserved
+                         * a reply buffer */
+                        q = &conn->ibc_tx_queue_nocred;
+                        break;
+                
+                case IBNAL_MSG_NOOP:
+                case IBNAL_MSG_IMMEDIATE:
+                        /* Otherwise: consume a credit before sending */
+                        q = &conn->ibc_tx_queue;
+                        break;
+                
+                default:
+                        LBUG();
+                        q = NULL;
+                }
+        }
+        
+        list_add_tail(&tx->tx_list, q);
+}
 
-        tx->tx_conn = conn;
-        tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
-        list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+static inline int
+kibnal_send_keepalive(kib_conn_t *conn) 
+{
+        return (*kibnal_tunables.kib_keepalive > 0) &&
+                time_after(jiffies, conn->ibc_last_send +
+                           *kibnal_tunables.kib_keepalive*HZ);
 }
 
 #define KIBNAL_SERVICE_KEY_MASK  (IB_SERVICE_RECORD_COMP_SERVICENAME |          \
@@ -780,112 +608,130 @@ kibnal_service_nid_field(IB_SERVICE_RECORD *srv)
         return (__u64 *)srv->ServiceData8;
 }
 
-
 static inline void
-kibnal_set_service_keys(IB_SERVICE_RECORD *srv, ptl_nid_t nid)
+kibnal_set_service_keys(IB_SERVICE_RECORD *srv, lnet_nid_t nid)
 {
-        LASSERT (strlen(IBNAL_SERVICE_NAME) < sizeof(srv->ServiceName));
+        char *svc_name = *kibnal_tunables.kib_service_name;
+
+        LASSERT (strlen(svc_name) < sizeof(srv->ServiceName));
         memset (srv->ServiceName, 0, sizeof(srv->ServiceName));
-        strcpy (srv->ServiceName, IBNAL_SERVICE_NAME);
+        strcpy (srv->ServiceName, svc_name);
 
         *kibnal_service_nid_field(srv) = cpu_to_le64(nid);
 }
 
-#if 0
-static inline void
-kibnal_show_rdma_attr (kib_conn_t *conn)
-{
-        struct ib_qp_attribute qp_attr;
-        int                    rc;
-
-        memset (&qp_attr, 0, sizeof(qp_attr));
-        rc = ib_qp_query(conn->ibc_qp, &qp_attr);
-        if (rc != 0) {
-                CERROR ("Can't get qp attrs: %d\n", rc);
-                return;
-        }
+/* CAVEAT EMPTOR: We rely on tx/rx descriptor alignment to allow us to use the
+ * lowest 2 bits of the work request id to stash the work item type (the op
+ * field is not valid when the wc completes in error). */
 
-        CWARN ("RDMA CAPABILITY: write %s read %s\n",
-               (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
-               (qp_attr.enable_rdma_write ? "enabled" : "disabled") : "invalid",
-               (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
-               (qp_attr.enable_rdma_read ? "enabled" : "disabled") : "invalid");
-}
-#endif
+#define IBNAL_WID_TX    0
+#define IBNAL_WID_RX    1
+#define IBNAL_WID_RDMA  2
+#define IBNAL_WID_MASK  3UL
 
-#if CONFIG_X86
 static inline __u64
-kibnal_page2phys (struct page *p)
+kibnal_ptr2wreqid (void *ptr, int type)
 {
-        __u64 page_number = p - mem_map;
+        unsigned long lptr = (unsigned long)ptr;
 
-        return (page_number << PAGE_SHIFT);
+        LASSERT ((lptr & IBNAL_WID_MASK) == 0);
+        LASSERT ((type & ~IBNAL_WID_MASK) == 0);
+        return (__u64)(lptr | type);
 }
-#else
-# error "no page->phys"
-#endif
-
-/* CAVEAT EMPTOR:
- * We rely on tx/rx descriptor alignment to allow us to use the lowest bit
- * of the work request id as a flag to determine if the completion is for a
- * transmit or a receive.  It seems that that the CQ entry's 'op' field
- * isn't always set correctly on completions that occur after QP teardown. */
 
-static inline __u64
-kibnal_ptr2wreqid (void *ptr, int isrx)
+static inline void *
+kibnal_wreqid2ptr (__u64 wreqid)
 {
-        unsigned long lptr = (unsigned long)ptr;
+        return (void *)(((unsigned long)wreqid) & ~IBNAL_WID_MASK);
+}
 
-        LASSERT ((lptr & 1) == 0);
-        return (__u64)(lptr | (isrx ? 1 : 0));
+static inline int
+kibnal_wreqid2type (__u64 wreqid)
+{
+        return (wreqid & IBNAL_WID_MASK);
 }
 
-static inline void *
-kibnal_wreqid2ptr (__u64 wreqid)
+static inline void
+kibnal_set_conn_state (kib_conn_t *conn, int state)
 {
-        return (void *)(((unsigned long)wreqid) & ~1UL);
+        CDEBUG(D_NET,"%p state %d\n", conn, state);
+        conn->ibc_state = state;
+        mb();
 }
 
+#if IBNAL_USE_FMR
+
 static inline int
-kibnal_wreqid_is_rx (__u64 wreqid)
+kibnal_rd_size (kib_rdma_desc_t *rd) 
 {
-        return (wreqid & 1) != 0;
+        return rd->rd_nob;
 }
 
+#else
 static inline int
-kibnal_whole_mem(void)
+kibnal_rd_size (kib_rdma_desc_t *rd)
 {
-        return kibnal_data.kib_md.md_handle != NULL;
+        int   i;
+        int   size;
+        
+        for (i = size = 0; i < rd->rd_nfrag; i++)
+                size += rd->rd_frags[i].rf_nob;
+        
+        return size;
 }
+#endif
 
-extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid);
-extern void kibnal_destroy_peer (kib_peer_t *peer);
-extern int kibnal_del_peer (ptl_nid_t nid, int single_share);
-extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid);
-extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
-extern int  kibnal_close_stale_conns_locked (kib_peer_t *peer,
-                                              __u64 incarnation);
-extern kib_conn_t *kibnal_create_conn (void);
-extern void kibnal_put_conn (kib_conn_t *conn);
-extern void kibnal_destroy_conn (kib_conn_t *conn);
+int  kibnal_startup (lnet_ni_t *ni);
+void kibnal_shutdown (lnet_ni_t *ni);
+int  kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int  kibnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int  kibnal_eager_recv (lnet_ni_t *ni, void *private, 
+                        lnet_msg_t *lntmsg, void **new_private);
+int  kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg,
+                  int delayed, unsigned int niov,
+                  struct iovec *iov, lnet_kiov_t *kiov,
+                  unsigned int offset, unsigned int mlen, unsigned int rlen);
+void kibnal_init_msg(kib_msg_t *msg, int type, int body_nob);
+void kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits, 
+                     lnet_nid_t dstnid, __u64 dststamp, __u64 seq);
+void kibnal_pack_connmsg(kib_msg_t *msg, __u32 version, int nob, int type,
+                         lnet_nid_t dstnid, __u64 dststamp);
+int  kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob);
+IB_HANDLE kibnal_create_cep(lnet_nid_t nid);
+int  kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid);
+void kibnal_destroy_peer (kib_peer_t *peer);
+kib_peer_t *kibnal_find_peer_locked (lnet_nid_t nid);
+int  kibnal_del_peer (lnet_nid_t nid);
+void kibnal_peer_alive (kib_peer_t *peer);
+void kibnal_unlink_peer_locked (kib_peer_t *peer);
+int  kibnal_add_persistent_peer (lnet_nid_t nid);
+int  kibnal_close_stale_conns_locked (kib_peer_t *peer,
+                                      __u64 incarnation);
+int  kibnal_conn_rts(kib_conn_t *conn,
+                     __u32 qpn, __u8 resp_res, __u8 init_depth, __u32 psn);
+kib_conn_t *kibnal_create_conn (lnet_nid_t nid, int proto_version);
+void kibnal_destroy_conn (kib_conn_t *conn);
 void kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg);
-
-extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access);
-extern void kibnal_free_pages (kib_pages_t *p);
-
-extern void kibnal_check_sends (kib_conn_t *conn);
-extern void kibnal_close_conn_locked (kib_conn_t *conn, int error);
-extern void kibnal_destroy_conn (kib_conn_t *conn);
-extern int  kibnal_thread_start (int (*fn)(void *arg), void *arg);
-extern int  kibnal_scheduler(void *arg);
-extern int  kibnal_connd (void *arg);
-extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
-extern void kibnal_close_conn (kib_conn_t *conn, int why);
-extern void kibnal_start_active_rdma (int type, int status,
-                                      kib_rx_t *rx, lib_msg_t *libmsg,
-                                      unsigned int niov,
-                                      struct iovec *iov, ptl_kiov_t *kiov,
-                                      size_t offset, size_t nob);
-
-void kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev);
-void kibnal_ca_callback (void *ca_arg, void *cq_arg);
+int  kibnal_alloc_pages (kib_pages_t **pp, int npages);
+void kibnal_free_pages (kib_pages_t *p);
+void kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn);
+void kibnal_txlist_done (struct list_head *txlist, int status);
+int  kibnal_post_receives (kib_conn_t *conn);
+int  kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
+                       kib_rdma_desc_t *dstrd, __u64 dstcookie);
+void kibnal_check_sends (kib_conn_t *conn);
+void kibnal_close_conn_locked (kib_conn_t *conn, int error);
+int  kibnal_thread_start (int (*fn)(void *arg), void *arg);
+int  kibnal_scheduler(void *arg);
+int  kibnal_connd (void *arg);
+void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
+void kibnal_close_conn (kib_conn_t *conn, int why);
+void kibnal_start_active_rdma (int type, int status,
+                               kib_rx_t *rx, lnet_msg_t *lntmsg,
+                               unsigned int niov,
+                               struct iovec *iov, lnet_kiov_t *kiov,
+                               unsigned int offset, unsigned int nob);
+void kibnal_hca_async_callback (void *hca_arg, IB_EVENT_RECORD *ev);
+void kibnal_hca_callback (void *hca_arg, void *cq_arg);
+int  kibnal_tunables_init (void);
+void kibnal_tunables_fini (void);
index eb9e6fa..fb4bba0 100644 (file)
  *
  */
 
-#include "iibnal.h"
+#include "iiblnd.h"
 
-/*
- *  LIB functions follow
- *
- */
-static void
-kibnal_schedule_tx_done (kib_tx_t *tx)
+void
+hexdump(char *string, void *ptr, int len)
 {
-        unsigned long flags;
+        unsigned char *c = ptr;
+        int i;
+
+        return;
 
-        spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
+        if (len < 0 || len > 2048)  {
+                printk("XXX what the hell? %d\n",len);
+                return;
+        }
 
-        list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
-        wake_up (&kibnal_data.kib_sched_waitq);
+        printk("%d bytes of '%s' from 0x%p\n", len, string, ptr);
 
-        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+        for (i = 0; i < len;) {
+                printk("%02x",*(c++));
+                i++;
+                if (!(i & 15)) {
+                        printk("\n");
+                } else if (!(i&1)) {
+                        printk(" ");
+                }
+        }
+
+        if(len & 15) {
+                printk("\n");
+        }
 }
 
-static void
+void
 kibnal_tx_done (kib_tx_t *tx)
 {
-        ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
-        unsigned long    flags;
-        int              i;
-        FSTATUS          frc;
-
-        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting callback */
-        LASSERT (!tx->tx_passive_rdma_wait);    /* mustn't be awaiting RDMA */
+        lnet_msg_t *lntmsg[2];
+        int         rc = tx->tx_status;
+        int         i;
 
-        switch (tx->tx_mapped) {
-        default:
-                LBUG();
+        LASSERT (!in_interrupt());
+        LASSERT (!tx->tx_queued);               /* mustn't be queued for sending */
+        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting sent callback */
+        LASSERT (!tx->tx_waiting);              /* mustn't be awaiting peer response */
 
-        case KIB_TX_UNMAPPED:
-                break;
+#if IBNAL_USE_FMR
+        /* Handle unmapping if required */
+#endif
+        /* tx may have up to 2 lnet msgs to finalise */
+        lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
+        lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
+        
+        if (tx->tx_conn != NULL) {
+                kibnal_conn_decref(tx->tx_conn);
+                tx->tx_conn = NULL;
+        }
 
-        case KIB_TX_MAPPED:
-                if (in_interrupt()) {
-                        /* can't deregister memory in IRQ context... */
-                        kibnal_schedule_tx_done(tx);
-                        return;
-                }
-                frc = iibt_deregister_memory(tx->tx_md.md_handle);
-                LASSERT (frc == FSUCCESS);
-                tx->tx_mapped = KIB_TX_UNMAPPED;
-                break;
+        tx->tx_nwrq = 0;
+        tx->tx_status = 0;
 
-#if IBNAL_FMR
-        case KIB_TX_MAPPED_FMR:
-                if (in_interrupt() && tx->tx_status != 0) {
-                        /* can't flush FMRs in IRQ context... */
-                        kibnal_schedule_tx_done(tx);
-                        return;
-                }              
+        spin_lock(&kibnal_data.kib_tx_lock);
 
-                rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr);
-                LASSERT (rc == 0);
+        list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
 
-                if (tx->tx_status != 0)
-                        ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
-                tx->tx_mapped = KIB_TX_UNMAPPED;
-                break;
-#endif
-        }
+        spin_unlock(&kibnal_data.kib_tx_lock);
 
+        /* delay finalize until my descs have been freed */
         for (i = 0; i < 2; i++) {
-                /* tx may have up to 2 libmsgs to finalise */
-                if (tx->tx_libmsg[i] == NULL)
+                if (lntmsg[i] == NULL)
                         continue;
 
-                lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
-                tx->tx_libmsg[i] = NULL;
+                lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc);
         }
+}
+
+kib_tx_t *
+kibnal_get_idle_tx (void) 
+{
+        kib_tx_t      *tx;
         
-        if (tx->tx_conn != NULL) {
-                kibnal_put_conn (tx->tx_conn);
-                tx->tx_conn = NULL;
+        spin_lock(&kibnal_data.kib_tx_lock);
+
+        if (list_empty (&kibnal_data.kib_idle_txs)) {
+                spin_unlock(&kibnal_data.kib_tx_lock);
+                return NULL;
         }
 
-        tx->tx_nsp = 0;
-        tx->tx_passive_rdma = 0;
-        tx->tx_status = 0;
+        tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list);
+        list_del (&tx->tx_list);
 
-        spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+        /* Allocate a new completion cookie.  It might not be needed,
+         * but we've got a lock right now and we're unlikely to
+         * wrap... */
+        tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;
 
-        if (tx->tx_isnblk) {
-                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
-        } else {
-                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
-                wake_up (&kibnal_data.kib_idle_tx_waitq);
-        }
+        spin_unlock(&kibnal_data.kib_tx_lock);
 
-        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+        LASSERT (tx->tx_nwrq == 0);
+        LASSERT (!tx->tx_queued);
+        LASSERT (tx->tx_sending == 0);
+        LASSERT (!tx->tx_waiting);
+        LASSERT (tx->tx_status == 0);
+        LASSERT (tx->tx_conn == NULL);
+        LASSERT (tx->tx_lntmsg[0] == NULL);
+        LASSERT (tx->tx_lntmsg[1] == NULL);
+        
+        return tx;
 }
 
-static kib_tx_t *
-kibnal_get_idle_tx (int may_block) 
+int
+kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit)
 {
-        unsigned long  flags;
-        kib_tx_t      *tx = NULL;
-        ENTRY;
-        
-        for (;;) {
-                spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+        kib_conn_t   *conn = rx->rx_conn;
+        int           rc = 0;
+        FSTATUS       frc;
 
-                /* "normal" descriptor is free */
-                if (!list_empty (&kibnal_data.kib_idle_txs)) {
-                        tx = list_entry (kibnal_data.kib_idle_txs.next,
-                                         kib_tx_t, tx_list);
-                        break;
-                }
+        LASSERT (!in_interrupt());
+        /* old peers don't reserve rxs for RDMA replies */
+        LASSERT (!rsrvd_credit ||
+                 conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
+        
+        rx->rx_gl = (IB_LOCAL_DATASEGMENT) {
+                .Address = rx->rx_hca_msg,
+                .Lkey    = kibnal_data.kib_whole_mem.md_lkey,
+                .Length  = IBNAL_MSG_SIZE,
+        };
 
-                if (!may_block) {
-                        /* may dip into reserve pool */
-                        if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
-                                CERROR ("reserved tx desc pool exhausted\n");
-                                break;
-                        }
+        rx->rx_wrq = (IB_WORK_REQ2) {
+                .Next          = NULL,
+                .WorkReqId     = kibnal_ptr2wreqid(rx, IBNAL_WID_RX),
+                .MessageLen    = IBNAL_MSG_SIZE,
+                .DSList        = &rx->rx_gl,
+                .DSListDepth   = 1,
+                .Operation     = WROpRecv,
+        };
 
-                        tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
-                                         kib_tx_t, tx_list);
-                        break;
-                }
+        LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING);
+        LASSERT (rx->rx_nob >= 0);              /* not posted */
 
-                /* block for idle tx */
-                spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+        CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n", 
+               rx->rx_wrq.DSList->Length,
+               rx->rx_wrq.DSList->Lkey,
+               rx->rx_wrq.DSList->Address);
 
-                wait_event (kibnal_data.kib_idle_tx_waitq,
-                            !list_empty (&kibnal_data.kib_idle_txs) ||
-                            kibnal_data.kib_shutdown);
+        if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) {
+                /* No more posts for this rx; so lose its ref */
+                kibnal_conn_decref(conn);
+                return 0;
         }
+        
+        rx->rx_nob = -1;                        /* flag posted */
+        mb();
 
-        if (tx != NULL) {
-                list_del (&tx->tx_list);
+        frc = iba_post_recv2(conn->ibc_qp, &rx->rx_wrq, NULL);
+        if (frc == FSUCCESS) {
+                if (credit || rsrvd_credit) {
+                        spin_lock(&conn->ibc_lock);
 
-                /* Allocate a new passive RDMA completion cookie.  It might
-                 * not be needed, but we've got a lock right now and we're
-                 * unlikely to wrap... */
-                tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
+                        if (credit)
+                                conn->ibc_outstanding_credits++;
+                        if (rsrvd_credit)
+                                conn->ibc_reserved_credits++;
 
-                LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
-                LASSERT (tx->tx_nsp == 0);
-                LASSERT (tx->tx_sending == 0);
-                LASSERT (tx->tx_status == 0);
-                LASSERT (tx->tx_conn == NULL);
-                LASSERT (!tx->tx_passive_rdma);
-                LASSERT (!tx->tx_passive_rdma_wait);
-                LASSERT (tx->tx_libmsg[0] == NULL);
-                LASSERT (tx->tx_libmsg[1] == NULL);
-        }
+                        spin_unlock(&conn->ibc_lock);
 
-        spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+                        kibnal_check_sends(conn);
+                }
+                return 0;
+        }
         
-        RETURN(tx);
+        CERROR ("post rx -> %s failed %d\n", 
+                libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+        rc = -EIO;
+        kibnal_close_conn(rx->rx_conn, rc);
+        /* No more posts for this rx; so lose its ref */
+        kibnal_conn_decref(conn);
+        return rc;
 }
 
-static int
-kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
+int
+kibnal_post_receives (kib_conn_t *conn)
 {
-        /* I would guess that if kibnal_get_peer (nid) == NULL,
-           and we're not routing, then 'nid' is very distant :) */
-        if ( nal->libnal_ni.ni_pid.nid == nid ) {
-                *dist = 0;
-        } else {
-                *dist = 1;
+        int    i;
+        int    rc;
+
+        LASSERT (conn->ibc_state == IBNAL_CONN_CONNECTING);
+
+        for (i = 0; i < IBNAL_RX_MSGS; i++) {
+                /* +1 ref for rx desc.  This ref remains until kibnal_post_rx
+                 * fails (i.e. actual failure or we're disconnecting) */
+                kibnal_conn_addref(conn);
+                rc = kibnal_post_rx (&conn->ibc_rxs[i], 0, 0);
+                if (rc != 0)
+                        return rc;
         }
 
         return 0;
 }
 
-static void
-kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
+kib_tx_t *
+kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
 {
-        struct list_head *ttmp;
-        unsigned long     flags;
-        int               idle;
-
-        spin_lock_irqsave (&conn->ibc_lock, flags);
-
-        list_for_each (ttmp, &conn->ibc_active_txs) {
-                kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
-
-                LASSERT (tx->tx_passive_rdma ||
-                         !tx->tx_passive_rdma_wait);
-
-                LASSERT (tx->tx_passive_rdma_wait ||
-                         tx->tx_sending != 0);
+        struct list_head   *tmp;
+        
+        list_for_each(tmp, &conn->ibc_active_txs) {
+                kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
+                
+                LASSERT (!tx->tx_queued);
+                LASSERT (tx->tx_sending != 0 || tx->tx_waiting);
 
-                if (!tx->tx_passive_rdma_wait ||
-                    tx->tx_passive_rdma_cookie != cookie)
+                if (tx->tx_cookie != cookie)
                         continue;
 
-                CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
+                if (tx->tx_waiting &&
+                    tx->tx_msg->ibm_type == txtype)
+                        return tx;
 
-                tx->tx_status = status;
-                tx->tx_passive_rdma_wait = 0;
-                idle = (tx->tx_sending == 0);
+                CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
+                      tx->tx_waiting ? "" : "NOT ",
+                      tx->tx_msg->ibm_type, txtype);
+        }
+        return NULL;
+}
 
-                if (idle)
-                        list_del (&tx->tx_list);
+void
+kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
+{
+        kib_tx_t    *tx;
+        int          idle;
 
-                spin_unlock_irqrestore (&conn->ibc_lock, flags);
+        spin_lock(&conn->ibc_lock);
 
-                /* I could be racing with tx callbacks.  It's whoever
-                 * _makes_ tx idle that frees it */
-                if (idle)
-                        kibnal_tx_done (tx);
+        tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie);
+        if (tx == NULL) {
+                spin_unlock(&conn->ibc_lock);
+
+                CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
+                      txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                kibnal_close_conn (conn, -EPROTO);
                 return;
         }
-                
-        spin_unlock_irqrestore (&conn->ibc_lock, flags);
 
-        CERROR ("Unmatched (late?) RDMA completion "LPX64" from "LPX64"\n",
-                cookie, conn->ibc_peer->ibp_nid);
+        if (tx->tx_status == 0) {               /* success so far */
+                if (status < 0) {               /* failed? */
+                        tx->tx_status = status;
+                } else if (txtype == IBNAL_MSG_GET_REQ) {
+                        lnet_set_reply_msg_len(kibnal_data.kib_ni,
+                                               tx->tx_lntmsg[1], status);
+                }
+        }
+        
+        tx->tx_waiting = 0;
+
+        idle = !tx->tx_queued && (tx->tx_sending == 0);
+        if (idle)
+                list_del(&tx->tx_list);
+
+        spin_unlock(&conn->ibc_lock);
+        
+        if (idle)
+                kibnal_tx_done(tx);
 }
 
-static __u32
-kibnal_lkey(kib_pages_t *ibp)
+void
+kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie) 
 {
-        if (kibnal_whole_mem())
-                return kibnal_data.kib_md.md_lkey;
-
-        return ibp->ibp_lkey;
+        kib_tx_t    *tx = kibnal_get_idle_tx();
+        
+        if (tx == NULL) {
+                CERROR("Can't get tx for completion %x for %s\n",
+                       type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                return;
+        }
+        
+        tx->tx_msg->ibm_u.completion.ibcm_status = status;
+        tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
+        kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t));
+        
+        kibnal_queue_tx(tx, conn);
 }
 
-static void
-kibnal_post_rx (kib_rx_t *rx, int do_credits)
+void
+kibnal_handle_rx (kib_rx_t *rx)
 {
+        kib_msg_t    *msg = rx->rx_msg;
         kib_conn_t   *conn = rx->rx_conn;
+        int           credits = msg->ibm_credits;
+        kib_tx_t     *tx;
         int           rc = 0;
-        unsigned long flags;
-        FSTATUS       frc;
-        ENTRY;
-
-        rx->rx_gl = (IB_LOCAL_DATASEGMENT) {
-                .Address = rx->rx_vaddr,
-                .Length  = IBNAL_MSG_SIZE,
-                .Lkey    = kibnal_lkey(conn->ibc_rx_pages),
-        };
+        int           repost = 1;
+        int           rsrvd_credit = 0;
+        int           rc2;
 
-        rx->rx_wrq = (IB_WORK_REQ) {
-                .Operation              = WROpRecv,
-                .DSListDepth            = 1,
-                .MessageLen             = IBNAL_MSG_SIZE,
-                .WorkReqId              = kibnal_ptr2wreqid(rx, 1),
-                .DSList                 = &rx->rx_gl,
-        };
+        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
 
-        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED,
-                                    IBNAL_CONN_DREP);
-        LASSERT (!rx->rx_posted);
-        rx->rx_posted = 1;
-        mb();
+        CDEBUG (D_NET, "Received %x[%d] from %s\n",
+                msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+        
+        if (credits != 0) {
+                /* Have I received credits that will let me send? */
+                spin_lock(&conn->ibc_lock);
+                conn->ibc_credits += credits;
+                spin_unlock(&conn->ibc_lock);
 
-        if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
-                rc = -ECONNABORTED;
-        else {
-                frc = iibt_postrecv(conn->ibc_qp, &rx->rx_wrq);
-                if (frc != FSUCCESS) {
-                        CDEBUG(D_NET, "post failed %d\n", frc);
-                        rc = -EINVAL;
-                }
-                CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq);
+                kibnal_check_sends(conn);
         }
 
-        if (rc == 0) {
-                if (do_credits) {
-                        spin_lock_irqsave(&conn->ibc_lock, flags);
-                        conn->ibc_outstanding_credits++;
-                        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+        switch (msg->ibm_type) {
+        default:
+                CERROR("Bad IBNAL message type %x from %s\n",
+                       msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                rc = -EPROTO;
+                break;
 
-                        kibnal_check_sends(conn);
-                }
-                EXIT;
-                return;
-        }
+        case IBNAL_MSG_NOOP:
+                break;
 
-        if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
-                CERROR ("Error posting receive -> "LPX64": %d\n",
-                        conn->ibc_peer->ibp_nid, rc);
-                kibnal_close_conn (rx->rx_conn, rc);
-        } else {
-                CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
-                        conn->ibc_peer->ibp_nid, rc);
-        }
+        case IBNAL_MSG_IMMEDIATE:
+                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr,
+                                msg->ibm_srcnid, rx, 0);
+                repost = rc < 0;                /* repost on error */
+                break;
+                
+        case IBNAL_MSG_PUT_REQ:
+                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr,
+                                msg->ibm_srcnid, rx, 1);
+                repost = rc < 0;                /* repost on error */
+                break;
 
-        /* Drop rx's ref */
-        kibnal_put_conn (conn);
-        EXIT;
-}
+        case IBNAL_MSG_PUT_NAK:
+                rsrvd_credit = 1;               /* rdma reply (was pre-reserved) */
 
-#if IBNAL_CKSUM
-static inline __u32 kibnal_cksum (void *ptr, int nob)
-{
-        char  *c  = ptr;
-        __u32  sum = 0;
+                CWARN ("PUT_NACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ, 
+                                         msg->ibm_u.completion.ibcm_status,
+                                         msg->ibm_u.completion.ibcm_cookie);
+                break;
 
-        while (nob-- > 0)
-                sum = ((sum << 1) | (sum >> 31)) + *c++;
-        
-        return (sum);
-}
-#endif
+        case IBNAL_MSG_PUT_ACK:
+                rsrvd_credit = 1;               /* rdma reply (was pre-reserved) */
 
-static void hexdump(char *string, void *ptr, int len)
-{
-        unsigned char *c = ptr;
-        int i;
+                spin_lock(&conn->ibc_lock);
+                tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ,
+                                                   msg->ibm_u.putack.ibpam_src_cookie);
+                if (tx != NULL)
+                        list_del(&tx->tx_list);
+                spin_unlock(&conn->ibc_lock);
 
-        return;
+                if (tx == NULL) {
+                        CERROR("Unmatched PUT_ACK from %s\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        rc = -EPROTO;
+                        break;
+                }
 
-        if (len < 0 || len > 2048)  {
-                printk("XXX what the hell? %d\n",len);
-                return;
-        }
+                LASSERT (tx->tx_waiting);
+                /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
+                 * (a) I can overwrite tx_msg since my peer has received it!
+                 * (b) tx_waiting set tells tx_complete() it's not done. */
+
+                tx->tx_nwrq = 0;                /* overwrite PUT_REQ */
+
+                rc2 = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE, 
+                                       kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
+                                       &msg->ibm_u.putack.ibpam_rd,
+                                       msg->ibm_u.putack.ibpam_dst_cookie);
+                if (rc2 < 0)
+                        CERROR("Can't setup rdma for PUT to %s: %d\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
+
+                spin_lock(&conn->ibc_lock);
+                if (tx->tx_status == 0 && rc2 < 0)
+                        tx->tx_status = rc2;
+                tx->tx_waiting = 0;             /* clear waiting and queue atomically */
+                kibnal_queue_tx_locked(tx, conn);
+                spin_unlock(&conn->ibc_lock);
+                break;
+                
+        case IBNAL_MSG_PUT_DONE:
+                /* This buffer was pre-reserved by not returning the credit
+                 * when the PUT_REQ's buffer was reposted, so I just return it
+                 * now */
+                kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK,
+                                         msg->ibm_u.completion.ibcm_status,
+                                         msg->ibm_u.completion.ibcm_cookie);
+                break;
 
-        printk("%d bytes of '%s' from 0x%p\n", len, string, ptr);
+        case IBNAL_MSG_GET_REQ:
+                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr,
+                                msg->ibm_srcnid, rx, 1);
+                repost = rc < 0;                /* repost on error */
+                break;
 
-        for (i = 0; i < len;) {
-                printk("%02x",*(c++));
-                i++;
-                if (!(i & 15)) {
-                        printk("\n");
-                } else if (!(i&1)) {
-                        printk(" ");
-                }
+        case IBNAL_MSG_GET_DONE:
+                rsrvd_credit = 1;               /* rdma reply (was pre-reserved) */
+
+                kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ,
+                                         msg->ibm_u.completion.ibcm_status,
+                                         msg->ibm_u.completion.ibcm_cookie);
+                break;
         }
 
-        if(len & 15) {
-                printk("\n");
+        if (rc < 0)                             /* protocol error */
+                kibnal_close_conn(conn, rc);
+
+        if (repost) {
+                if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD)
+                        rsrvd_credit = 0;       /* peer isn't pre-reserving */
+
+                kibnal_post_rx(rx, !rsrvd_credit, rsrvd_credit);
         }
 }
 
-static void
-kibnal_rx_callback (IB_WORK_COMPLETION *wc)
+void
+kibnal_rx_complete (IB_WORK_COMPLETION *wc, __u64 rxseq)
 {
         kib_rx_t     *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
+        int           nob = wc->Length;
         kib_msg_t    *msg = rx->rx_msg;
         kib_conn_t   *conn = rx->rx_conn;
-        int           nob = wc->Length;
-        const int     base_nob = offsetof(kib_msg_t, ibm_u);
-        int           credits;
-        int           flipped;
         unsigned long flags;
-        __u32         i;
-#if IBNAL_CKSUM
-        __u32         msg_cksum;
-        __u32         computed_cksum;
-#endif
-
-        /* we set the QP to erroring after we've finished disconnecting, 
-         * maybe we should do so sooner. */
-        KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, 
-                                    IBNAL_CONN_DISCONNECTED);
+        int           rc;
+        int           err = -EIO;
 
-        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
-        LASSERT (rx->rx_posted);
-        rx->rx_posted = 0;
+        LASSERT (rx->rx_nob < 0);               /* was posted */
+        rx->rx_nob = 0;                         /* isn't now */
         mb();
 
         /* receives complete with error in any case after we've started
          * disconnecting */
         if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
-                goto failed;
+                goto ignore;
 
         if (wc->Status != WRStatusSuccess) {
-                CERROR("Rx from "LPX64" failed: %d\n", 
-                       conn->ibc_peer->ibp_nid, wc->Status);
+                CERROR("Rx from %s failed: %d\n", 
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), wc->Status);
                 goto failed;
         }
 
-        if (nob < base_nob) {
-                CERROR ("Short rx from "LPX64": %d < expected %d\n",
-                        conn->ibc_peer->ibp_nid, nob, base_nob);
+        rc = kibnal_unpack_msg(msg, conn->ibc_version, nob);
+        if (rc != 0) {
+                CERROR ("Error %d unpacking rx from %s\n",
+                        rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                 goto failed;
         }
 
-        hexdump("rx", rx->rx_msg, sizeof(kib_msg_t));
-
-        /* Receiver does any byte flipping if necessary... */
-
-        if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
-                flipped = 0;
-        } else {
-                if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
-                        CERROR ("Unrecognised magic: %08x from "LPX64"\n", 
-                                msg->ibm_magic, conn->ibc_peer->ibp_nid);
-                        goto failed;
-                }
-                flipped = 1;
-                __swab16s (&msg->ibm_version);
-                LASSERT (sizeof(msg->ibm_type) == 1);
-                LASSERT (sizeof(msg->ibm_credits) == 1);
-        }
+        rx->rx_nob = nob;                       /* Now I know nob > 0 */
+        mb();
 
-        if (msg->ibm_version != IBNAL_MSG_VERSION) {
-                CERROR ("Incompatible msg version %d (%d expected)\n",
-                        msg->ibm_version, IBNAL_MSG_VERSION);
+        if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+            msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid ||
+            msg->ibm_srcstamp != conn->ibc_incarnation ||
+            msg->ibm_dststamp != kibnal_data.kib_incarnation) {
+                CERROR ("Stale rx from %s\n",
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                err = -ESTALE;
                 goto failed;
         }
 
-#if IBNAL_CKSUM
-        if (nob != msg->ibm_nob) {
-                CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
+        if (msg->ibm_seq != rxseq) {
+                CERROR ("Out-of-sequence rx from %s"
+                        ": got "LPD64" but expected "LPD64"\n",
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                        msg->ibm_seq, rxseq);
                 goto failed;
         }
 
-        msg_cksum = le32_to_cpu(msg->ibm_cksum);
-        msg->ibm_cksum = 0;
-        computed_cksum = kibnal_cksum (msg, nob);
-        
-        if (msg_cksum != computed_cksum) {
-                CERROR ("Checksum failure %d: (%d expected)\n",
-                        computed_cksum, msg_cksum);
-//                goto failed;
-        }
-        CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob);
-#endif
-
-        /* Have I received credits that will let me send? */
-        credits = msg->ibm_credits;
-        if (credits != 0) {
-                spin_lock_irqsave(&conn->ibc_lock, flags);
-                conn->ibc_credits += credits;
-                spin_unlock_irqrestore(&conn->ibc_lock, flags);
-                
-                kibnal_check_sends(conn);
-        }
-
-        switch (msg->ibm_type) {
-        case IBNAL_MSG_NOOP:
-                kibnal_post_rx (rx, 1);
-                return;
-
-        case IBNAL_MSG_IMMEDIATE:
-                if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
-                        CERROR ("Short IMMEDIATE from "LPX64": %d\n",
-                                conn->ibc_peer->ibp_nid, nob);
-                        goto failed;
-                }
-                break;
-                
-        case IBNAL_MSG_PUT_RDMA:
-        case IBNAL_MSG_GET_RDMA:
-                if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
-                        CERROR ("Short RDMA msg from "LPX64": %d\n",
-                                conn->ibc_peer->ibp_nid, nob);
-                        goto failed;
-                }
-                if (flipped) 
-                        __swab32(msg->ibm_u.rdma.ibrm_num_descs);
-
-                CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n",
-                       msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie);
-
-                if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) ||
-                    (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) > 
-                     min(nob, IBNAL_MSG_SIZE))) {
-                        CERROR ("num_descs %d too large\n", 
-                                msg->ibm_u.rdma.ibrm_num_descs);
-                        goto failed;
-                }
-
-                if (flipped) {
-                        __swab32(msg->ibm_u.rdma.rd_key);
-                }
-
-                for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) {
-                        kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i];
+        /* set time last known alive */
+        kibnal_peer_alive(conn->ibc_peer);
 
-                        if (flipped) {
-                                __swab32(desc->rd_nob);
-                                __swab64(desc->rd_addr);
-                        }
+        /* racing with connection establishment/teardown! */
 
-                        CDEBUG(D_NET, "  key %x, " "addr "LPX64", nob %u\n",
-                               msg->ibm_u.rdma.rd_key, desc->rd_addr, desc->rd_nob);
-                }
-                break;
-                        
-        case IBNAL_MSG_PUT_DONE:
-        case IBNAL_MSG_GET_DONE:
-                if (nob < base_nob + sizeof (kib_completion_msg_t)) {
-                        CERROR ("Short COMPLETION msg from "LPX64": %d\n",
-                                conn->ibc_peer->ibp_nid, nob);
-                        goto failed;
+        if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
+                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+                /* must check holding global lock to eliminate race */
+                if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
+                        list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
+                        write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
+                                                flags);
+                        return;
                 }
-                if (flipped)
-                        __swab32s(&msg->ibm_u.completion.ibcm_status);
-                
-                CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
-                       msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
-                       msg->ibm_u.completion.ibcm_status);
-
-                kibnal_complete_passive_rdma (conn, 
-                                              msg->ibm_u.completion.ibcm_cookie,
-                                              msg->ibm_u.completion.ibcm_status);
-                kibnal_post_rx (rx, 1);
-                return;
-                        
-        default:
-                CERROR ("Can't parse type from "LPX64": %d\n",
-                        conn->ibc_peer->ibp_nid, msg->ibm_type);
-                goto failed;
+                write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
+                                        flags);
         }
-
-        /* schedule for kibnal_rx() in thread context */
-        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
-        
-        list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
-        wake_up (&kibnal_data.kib_sched_waitq);
-        
-        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+        kibnal_handle_rx(rx);
         return;
         
  failed:
-        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
-        kibnal_close_conn(conn, -ECONNABORTED);
-
+        kibnal_close_conn(conn, err);
+ ignore:
         /* Don't re-post rx & drop its ref on conn */
-        kibnal_put_conn(conn);
+        kibnal_conn_decref(conn);
 }
 
-void
-kibnal_rx (kib_rx_t *rx)
+struct page *
+kibnal_kvaddr_to_page (unsigned long vaddr)
 {
-        kib_msg_t   *msg = rx->rx_msg;
-
-        /* Clear flag so I can detect if I've sent an RDMA completion */
-        rx->rx_rdma = 0;
+        struct page *page;
 
-        switch (msg->ibm_type) {
-        case IBNAL_MSG_GET_RDMA:
-                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
-                /* If the incoming get was matched, I'll have initiated the
-                 * RDMA and the completion message... */
-                if (rx->rx_rdma)
-                        break;
-
-                /* Otherwise, I'll send a failed completion now to prevent
-                 * the peer's GET blocking for the full timeout. */
-                CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
-                        rx->rx_conn->ibc_peer->ibp_nid);
-                kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
-                                          rx, NULL, 0, NULL, NULL, 0, 0);
-                break;
-                
-        case IBNAL_MSG_PUT_RDMA:
-                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
-                if (rx->rx_rdma)
-                        break;
-                /* This is most unusual, since even if lib_parse() didn't
-                 * match anything, it should have asked us to read (and
-                 * discard) the payload.  The portals header must be
-                 * inconsistent with this message type, so it's the
-                 * sender's fault for sending garbage and she can time
-                 * herself out... */
-                CERROR ("Uncompleted RMDA PUT from "LPX64"\n",
-                        rx->rx_conn->ibc_peer->ibp_nid);
-                break;
-
-        case IBNAL_MSG_IMMEDIATE:
-                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
-                LASSERT (!rx->rx_rdma);
-                break;
-                
-        default:
+        if (vaddr >= VMALLOC_START &&
+            vaddr < VMALLOC_END) {
+                page = vmalloc_to_page ((void *)vaddr);
+                LASSERT (page != NULL);
+                return page;
+        }
+#if CONFIG_HIGHMEM
+        if (vaddr >= PKMAP_BASE &&
+            vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
+                /* No highmem pages only used for bulk (kiov) I/O */
+                CERROR("find page for address in highmem\n");
                 LBUG();
-                break;
         }
-
-        kibnal_post_rx (rx, 1);
+#endif
+        page = virt_to_page (vaddr);
+        LASSERT (page != NULL);
+        return page;
 }
 
-static struct page *
-kibnal_kvaddr_to_page (unsigned long vaddr)
+#if !IBNAL_USE_FMR
+int
+kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, 
+                     unsigned long page_offset, unsigned long len)
 {
-        struct page *page;
+        kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag];
 
-        if (vaddr >= VMALLOC_START &&
-            vaddr < VMALLOC_END)
-                page = vmalloc_to_page ((void *)vaddr);
-#if CONFIG_HIGHMEM
-        else if (vaddr >= PKMAP_BASE &&
-                 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
-                page = vmalloc_to_page ((void *)vaddr);
-        /* in 2.4 ^ just walks the page tables */
-#endif
-        else
-                page = virt_to_page (vaddr);
+        if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) {
+                CERROR ("Too many RDMA fragments\n");
+                return -EMSGSIZE;
+        }
+
+        if (active) {
+                if (rd->rd_nfrag == 0)
+                        rd->rd_key = kibnal_data.kib_whole_mem.md_lkey;
+        } else {
+                if (rd->rd_nfrag == 0)
+                        rd->rd_key = kibnal_data.kib_whole_mem.md_rkey;
+        }
 
-        if (!VALID_PAGE (page))
-                page = NULL;
+        frag->rf_nob  = len;
+        frag->rf_addr = kibnal_data.kib_whole_mem.md_addr +
+                        lnet_page2phys(page) + page_offset;
 
-        return page;
+        CDEBUG(D_NET,"map key %x frag [%d]["LPX64" for %d]\n", 
+               rd->rd_key, rd->rd_nfrag, frag->rf_addr, frag->rf_nob);
+
+        rd->rd_nfrag++;
+        return 0;
 }
 
-static void
-kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset,
-                 unsigned long len, int active)
+int
+kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+                    unsigned int niov, struct iovec *iov, int offset, int nob)
+                 
 {
-        kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma;
-        kib_rdma_desc_t *desc;
+        int           fragnob;
+        int           rc;
+        unsigned long vaddr;
+        struct page  *page;
+        int           page_offset;
 
-        LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n", 
-                 ibrm->ibrm_num_descs);
+        LASSERT (nob > 0);
+        LASSERT (niov > 0);
+        LASSERT ((rd != tx->tx_rd) == !active);
 
-        desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs];
-        if (active)
-                ibrm->rd_key = kibnal_data.kib_md.md_lkey;
-        else
-                ibrm->rd_key = kibnal_data.kib_md.md_rkey;
-        desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */
-        desc->rd_addr = kibnal_page2phys(page) + page_offset +
-                        kibnal_data.kib_md.md_addr;
+        while (offset >= iov->iov_len) {
+                offset -= iov->iov_len;
+                niov--;
+                iov++;
+                LASSERT (niov > 0);
+        }
+
+        rd->rd_nfrag = 0;
+        do {
+                LASSERT (niov > 0);
+
+                vaddr = ((unsigned long)iov->iov_base) + offset;
+                page_offset = vaddr & (PAGE_SIZE - 1);
+                page = kibnal_kvaddr_to_page(vaddr);
+                if (page == NULL) {
+                        CERROR ("Can't find page\n");
+                        return -EFAULT;
+                }
 
-        ibrm->ibrm_num_descs++;
+                fragnob = min((int)(iov->iov_len - offset), nob);
+                fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
+
+                rc = kibnal_append_rdfrag(rd, active, page, 
+                                          page_offset, fragnob);
+                if (rc != 0)
+                        return rc;
+
+                if (offset + fragnob < iov->iov_len) {
+                        offset += fragnob;
+                } else {
+                        offset = 0;
+                        iov++;
+                        niov--;
+                }
+                nob -= fragnob;
+        } while (nob > 0);
+        
+        return 0;
 }
 
-static int
-kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active)
+int
+kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+                      int nkiov, lnet_kiov_t *kiov, int offset, int nob)
 {
-        struct page *page;
-        int page_offset, len;
+        int            fragnob;
+        int            rc;
 
-        while (nob > 0) {
-                page = kibnal_kvaddr_to_page(vaddr);
-                if (page == NULL)
-                        return -EFAULT;
+        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
 
-                page_offset = vaddr & (PAGE_SIZE - 1);
-                len = min(nob, (int)PAGE_SIZE - page_offset);
-                
-                kibnal_fill_ibrm(tx, page, page_offset, len, active);
-                nob -= len;
-                vaddr += len;
+        LASSERT (nob > 0);
+        LASSERT (nkiov > 0);
+        LASSERT ((rd != tx->tx_rd) == !active);
+
+        while (offset >= kiov->kiov_len) {
+                offset -= kiov->kiov_len;
+                nkiov--;
+                kiov++;
+                LASSERT (nkiov > 0);
         }
+
+        rd->rd_nfrag = 0;
+        do {
+                LASSERT (nkiov > 0);
+                fragnob = min((int)(kiov->kiov_len - offset), nob);
+                
+                rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page,
+                                          kiov->kiov_offset + offset,
+                                          fragnob);
+                if (rc != 0)
+                        return rc;
+
+                offset = 0;
+                kiov++;
+                nkiov--;
+                nob -= fragnob;
+        } while (nob > 0);
+
         return 0;
 }
+#else
+int
+kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+               int npages, unsigned long page_offset, int nob)
+{
+        IB_ACCESS_CONTROL access = {0,};
+        FSTATUS           frc;
+
+        LASSERT ((rd != tx->tx_rd) == !active);
+        LASSERT (!tx->tx_md.md_active);
+        LASSERT (tx->tx_md.md_fmrcount > 0);
+        LASSERT (page_offset < PAGE_SIZE);
+        LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT)));
+        LASSERT (npages <= LNET_MAX_IOV);
+
+        if (!active) {
+                // access.s.MWBindable = 1;
+                access.s.LocalWrite = 1;
+                access.s.RdmaWrite = 1;
+        }
+
+        /* Map the memory described by tx->tx_pages
+        frc = iibt_register_physical_memory(kibnal_data.kib_hca,
+                                            IBNAL_RDMA_BASE,
+                                            tx->tx_pages, npages,
+                                            page_offset,
+                                            kibnal_data.kib_pd,
+                                            access,
+                                            &tx->tx_md.md_handle,
+                                            &tx->tx_md.md_addr,
+                                            &tx->tx_md.md_lkey,
+                                            &tx->tx_md.md_rkey);
+        */
+        return -EINVAL;
+}
 
-static int
-kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
-                 int niov, struct iovec *iov, int offset, int nob, int active)
+int
+kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+                     unsigned int niov, struct iovec *iov, int offset, int nob)
                  
 {
-        void   *vaddr;
-        FSTATUS frc;
+        int           resid;
+        int           fragnob;
+        struct page  *page;
+        int           npages;
+        unsigned long page_offset;
+        unsigned long vaddr;
 
         LASSERT (nob > 0);
         LASSERT (niov > 0);
-        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
 
         while (offset >= iov->iov_len) {
                 offset -= iov->iov_len;
@@ -686,54 +726,47 @@ kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
                 return (-EMSGSIZE);
         }
 
-        /* our large contiguous iov could be backed by multiple physical
-         * pages. */
-        if (kibnal_whole_mem()) {
-                int rc;
-                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
-                rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base + 
-                                         offset, nob, active);
-                if (rc != 0) {
-                        CERROR ("Can't map iov: %d\n", rc);
-                        return rc;
+        vaddr = ((unsigned long)iov->iov_base) + offset;
+        
+        page_offset = vaddr & (PAGE_SIZE - 1);
+        resid = nob;
+        npages = 0;
+
+        do {
+                LASSERT (npages < LNET_MAX_IOV);
+
+                page = kibnal_kvaddr_to_page(vaddr);
+                if (page == NULL) {
+                        CERROR("Can't find page for %lu\n", vaddr);
+                        return -EFAULT;
                 }
-                return 0;
-        }
 
-        vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
-        tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
+                tx->tx_pages[npages++] = lnet_page2phys(page);
 
-        frc = iibt_register_memory(kibnal_data.kib_hca, vaddr, nob,
-                                   kibnal_data.kib_pd, access,
-                                   &tx->tx_md.md_handle, &tx->tx_md.md_lkey,
-                                   &tx->tx_md.md_rkey);
-        if (frc != 0) {
-                CERROR ("Can't map vaddr %p: %d\n", vaddr, frc);
-                return -EINVAL;
-        }
+                fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1));
+                vaddr += fragnob;
+                resid -= fragnob;
 
-        tx->tx_mapped = KIB_TX_MAPPED;
-        return (0);
+        } while (resid > 0);
+
+        return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
 }
 
-static int
-kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
-                  int nkiov, ptl_kiov_t *kiov,
-                  int offset, int nob, int active)
+int
+kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+                      int nkiov, lnet_kiov_t *kiov, int offset, int nob)
 {
-        __u64                      *phys = NULL;
-        int                         page_offset;
-        int                         nphys;
-        int                         resid;
-        int                         phys_size = 0;
-        FSTATUS                     frc;
-        int                         i, rc = 0;
-
+        int            resid;
+        int            npages;
+        unsigned long  page_offset;
+        
         CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
 
         LASSERT (nob > 0);
         LASSERT (nkiov > 0);
-        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+        LASSERT (nkiov <= LNET_MAX_IOV);
+        LASSERT (!tx->tx_md.md_active);
+        LASSERT ((rd != tx->tx_rd) == !active);
 
         while (offset >= kiov->kiov_len) {
                 offset -= kiov->kiov_len;
@@ -743,122 +776,36 @@ kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
         }
 
         page_offset = kiov->kiov_offset + offset;
-        nphys = 1;
-
-        if (!kibnal_whole_mem()) {
-                phys_size = nkiov * sizeof (*phys);
-                PORTAL_ALLOC(phys, phys_size);
-                if (phys == NULL) {
-                        CERROR ("Can't allocate tmp phys\n");
-                        return (-ENOMEM);
-                }
-
-                phys[0] = kibnal_page2phys(kiov->kiov_page);
-        } else {
-                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
-                kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset, 
-                                 kiov->kiov_len, active);
-        }
-
-        resid = nob - (kiov->kiov_len - offset);
+        
+        resid = offset + nob;
+        npages = 0;
 
-        while (resid > 0) {
-                kiov++;
-                nkiov--;
+        do {
+                LASSERT (npages < LNET_MAX_IOV);
                 LASSERT (nkiov > 0);
 
-                if (kiov->kiov_offset != 0 ||
-                    ((resid > PAGE_SIZE) && 
-                     kiov->kiov_len < PAGE_SIZE)) {
+                if ((npages > 0 && kiov->kiov_offset != 0) ||
+                    (resid > kiov->kiov_len && 
+                     (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) {
                         /* Can't have gaps */
                         CERROR ("Can't make payload contiguous in I/O VM:"
-                                "page %d, offset %d, len %d \n", nphys, 
-                                kiov->kiov_offset, kiov->kiov_len);
-
-                        for (i = -nphys; i < nkiov; i++) 
-                        {
-                                CERROR("kiov[%d] %p +%d for %d\n",
-                                       i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len);
-                        }
+                                "page %d, offset %d, len %d \n",
+                                npages, kiov->kiov_offset, kiov->kiov_len);
                         
-                        rc = -EINVAL;
-                        goto out;
-                }
-
-                if (nphys == PTL_MD_MAX_IOV) {
-                        CERROR ("payload too big (%d)\n", nphys);
-                        rc = -EMSGSIZE;
-                        goto out;
-                }
-
-                if (!kibnal_whole_mem()) {
-                        LASSERT (nphys * sizeof (*phys) < phys_size);
-                        phys[nphys] = kibnal_page2phys(kiov->kiov_page);
-                } else {
-                        if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) {
-                                CERROR ("payload too big (%d)\n", nphys);
-                                rc = -EMSGSIZE;
-                                goto out;
-                        }
-                        kibnal_fill_ibrm(tx, kiov->kiov_page, 
-                                         kiov->kiov_offset, kiov->kiov_len,
-                                         active);
+                        return -EINVAL;
                 }
 
-                nphys ++;
-                resid -= PAGE_SIZE;
-        }
-
-        if (kibnal_whole_mem())
-                goto out;
-
-#if 0
-        CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
-        for (i = 0; i < nphys; i++)
-                CWARN ("   [%d] "LPX64"\n", i, phys[i]);
-#endif
-
-#if IBNAL_FMR
-#error "iibnal hasn't learned about FMR yet"
-        rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
-                                       phys, nphys,
-                                       &tx->tx_md.md_addr,
-                                       page_offset,
-                                       &tx->tx_md.md_handle.fmr,
-                                       &tx->tx_md.md_lkey,
-                                       &tx->tx_md.md_rkey);
-#else
-        frc = iibt_register_physical_memory(kibnal_data.kib_hca,
-                                            IBNAL_RDMA_BASE,
-                                            phys, nphys,
-                                            0,          /* offset */
-                                            kibnal_data.kib_pd,
-                                            access,
-                                            &tx->tx_md.md_handle,
-                                            &tx->tx_md.md_addr,
-                                            &tx->tx_md.md_lkey,
-                                            &tx->tx_md.md_rkey);
-#endif
-        if (frc == FSUCCESS) {
-                CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n",
-                       nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey);
-#if IBNAL_FMR
-                tx->tx_mapped = KIB_TX_MAPPED_FMR;
-#else
-                tx->tx_mapped = KIB_TX_MAPPED;
-#endif
-        } else {
-                CERROR ("Can't map phys: %d\n", frc);
-                rc = -EFAULT;
-        }
+                tx->tx_pages[npages++] = lnet_page2phys(kiov->kiov_page);
+                resid -= kiov->kiov_len;
+                kiov++;
+                nkiov--;
+        } while (resid > 0);
 
- out:
-        if (phys != NULL)
-                PORTAL_FREE(phys, phys_size);
-        return (rc);
+        return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
 }
+#endif
 
-static kib_conn_t *
+kib_conn_t *
 kibnal_find_conn_locked (kib_peer_t *peer)
 {
         struct list_head *tmp;
@@ -874,134 +821,173 @@ kibnal_find_conn_locked (kib_peer_t *peer)
 void
 kibnal_check_sends (kib_conn_t *conn)
 {
-        unsigned long   flags;
         kib_tx_t       *tx;
+        FSTATUS         frc;
         int             rc;
-        int             i;
+        int             consume_cred;
         int             done;
-        int             nwork;
-        ENTRY;
 
-        spin_lock_irqsave (&conn->ibc_lock, flags);
+        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
+        
+        spin_lock(&conn->ibc_lock);
 
-        LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
+        LASSERT (conn->ibc_nsends_posted <=
+                *kibnal_tunables.kib_concurrent_sends);
+        LASSERT (conn->ibc_reserved_credits >= 0);
+        
+        while (conn->ibc_reserved_credits > 0 &&
+               !list_empty(&conn->ibc_tx_queue_rsrvd)) {
+                LASSERT (conn->ibc_version != 
+                         IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
+                tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
+                                kib_tx_t, tx_list);
+                list_del(&tx->tx_list);
+                list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+                conn->ibc_reserved_credits--;
+        }
 
         if (list_empty(&conn->ibc_tx_queue) &&
-            conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
-                spin_unlock_irqrestore(&conn->ibc_lock, flags);
+            list_empty(&conn->ibc_tx_queue_nocred) &&
+            (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER ||
+             kibnal_send_keepalive(conn))) {
+                spin_unlock(&conn->ibc_lock);
                 
-                tx = kibnal_get_idle_tx(0);     /* don't block */
+                tx = kibnal_get_idle_tx();
                 if (tx != NULL)
                         kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
 
-                spin_lock_irqsave(&conn->ibc_lock, flags);
+                spin_lock(&conn->ibc_lock);
                 
-                if (tx != NULL) {
-                        atomic_inc(&conn->ibc_refcount);
+                if (tx != NULL)
                         kibnal_queue_tx_locked(tx, conn);
-                }
         }
 
-        while (!list_empty (&conn->ibc_tx_queue)) {
-                tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
+        for (;;) {
+                if (!list_empty(&conn->ibc_tx_queue_nocred)) {
+                        LASSERT (conn->ibc_version != 
+                                 IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
+                        tx = list_entry (conn->ibc_tx_queue_nocred.next, 
+                                         kib_tx_t, tx_list);
+                        consume_cred = 0;
+                } else if (!list_empty (&conn->ibc_tx_queue)) {
+                        tx = list_entry (conn->ibc_tx_queue.next, 
+                                         kib_tx_t, tx_list);
+                        consume_cred = 1;
+                } else {
+                        /* nothing waiting */
+                        break;
+                }
 
+                LASSERT (tx->tx_queued);
                 /* We rely on this for QP sizing */
-                LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG);
+                LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS);
 
                 LASSERT (conn->ibc_outstanding_credits >= 0);
                 LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
                 LASSERT (conn->ibc_credits >= 0);
                 LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
 
-                /* Not on ibc_rdma_queue */
-                LASSERT (!tx->tx_passive_rdma_wait);
-
-                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
-                        GOTO(out, 0);
+                if (conn->ibc_nsends_posted ==
+                    *kibnal_tunables.kib_concurrent_sends) {
+                        /* We've got some tx completions outstanding... */
+                        CDEBUG(D_NET, "%s: posted enough\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        break;
+                }
 
-                if (conn->ibc_credits == 0)     /* no credits */
-                        GOTO(out, 1);
+                if (consume_cred) {
+                        if (conn->ibc_credits == 0) {   /* no credits */
+                                CDEBUG(D_NET, "%s: no credits\n",
+                                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                                break;
+                        }
+                        
+                        if (conn->ibc_credits == 1 &&   /* last credit reserved for */
+                            conn->ibc_outstanding_credits == 0) { /* giving back credits */
+                                CDEBUG(D_NET, "%s: not using last credit\n",
+                                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                                break;
+                        }
+                }
                 
-                if (conn->ibc_credits == 1 &&   /* last credit reserved for */
-                    conn->ibc_outstanding_credits == 0) /* giving back credits */
-                        GOTO(out, 2);
-
                 list_del (&tx->tx_list);
+                tx->tx_queued = 0;
+
+                /* NB don't drop ibc_lock before bumping tx_sending */
 
                 if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
                     (!list_empty(&conn->ibc_tx_queue) ||
-                     conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
+                     !list_empty(&conn->ibc_tx_queue_nocred) ||
+                     (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER &&
+                      !kibnal_send_keepalive(conn)))) {
                         /* redundant NOOP */
-                        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+                        spin_unlock(&conn->ibc_lock);
                         kibnal_tx_done(tx);
-                        spin_lock_irqsave(&conn->ibc_lock, flags);
+                        spin_lock(&conn->ibc_lock);
+                        CDEBUG(D_NET, "%s: redundant noop\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
                         continue;
                 }
 
-                tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
-                conn->ibc_outstanding_credits = 0;
+                kibnal_pack_msg(tx->tx_msg, conn->ibc_version,
+                                conn->ibc_outstanding_credits,
+                                conn->ibc_peer->ibp_nid, conn->ibc_incarnation,
+                                conn->ibc_txseq);
 
+                conn->ibc_txseq++;
+                conn->ibc_outstanding_credits = 0;
                 conn->ibc_nsends_posted++;
-                conn->ibc_credits--;
+                if (consume_cred)
+                        conn->ibc_credits--;
+
+                /* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
+                 * PUT.  If so, it was first queued here as a PUT_REQ, sent and
+                 * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
+                 * and then re-queued here.  It's (just) possible that
+                 * tx_sending is non-zero if we've not done the tx_complete() from
+                 * the first send; hence the ++ rather than = below. */
+                tx->tx_sending++;
 
-                /* we only get a tx completion for the final rdma op */ 
-                tx->tx_sending = min(tx->tx_nsp, 2);
-                tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
                 list_add (&tx->tx_list, &conn->ibc_active_txs);
-#if IBNAL_CKSUM
-                tx->tx_msg->ibm_cksum = 0;
-                tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
-                CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
-#endif
-                spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
-                /* NB the gap between removing tx from the queue and sending it
-                 * allows message re-ordering to occur */
-
-                LASSERT (tx->tx_nsp > 0);
-
-                rc = -ECONNABORTED;
-                nwork = 0;
-                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
-                        tx->tx_status = 0;
-                        /* Driver only accepts 1 item at a time */
-                        for (i = 0; i < tx->tx_nsp; i++) {
-                                hexdump("tx", tx->tx_msg, sizeof(kib_msg_t));
-                                rc = iibt_postsend(conn->ibc_qp, 
-                                                   &tx->tx_wrq[i]);
-                                if (rc != 0)
-                                        break;
-                                if (wrq_signals_completion(&tx->tx_wrq[i]))
-                                        nwork++;
-                                CDEBUG(D_NET, "posted tx wrq %p\n", 
-                                       &tx->tx_wrq[i]);
-                        }
+
+                LASSERT (tx->tx_nwrq > 0);
+
+                rc = 0;
+                frc = FSUCCESS;
+                if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) {
+                        rc = -ECONNABORTED;
+                } else {
+                        frc = iba_post_send2(conn->ibc_qp, tx->tx_wrq, NULL);
+                        if (frc != FSUCCESS)
+                                rc = -EIO;
                 }
 
-                spin_lock_irqsave (&conn->ibc_lock, flags);
+                conn->ibc_last_send = jiffies;
+
                 if (rc != 0) {
                         /* NB credits are transferred in the actual
                          * message, which can only be the last work item */
                         conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
-                        conn->ibc_credits++;
+                        if (consume_cred)
+                                conn->ibc_credits++;
                         conn->ibc_nsends_posted--;
 
                         tx->tx_status = rc;
-                        tx->tx_passive_rdma_wait = 0;
-                        tx->tx_sending -= tx->tx_nsp - nwork;
-
+                        tx->tx_waiting = 0;
+                        tx->tx_sending--;
+                        
                         done = (tx->tx_sending == 0);
                         if (done)
                                 list_del (&tx->tx_list);
                         
-                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+                        spin_unlock(&conn->ibc_lock);
                         
                         if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
-                                CERROR ("Error %d posting transmit to "LPX64"\n", 
-                                        rc, conn->ibc_peer->ibp_nid);
+                                CERROR ("Error %d posting transmit to %s\n", 
+                                        frc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                         else
-                                CDEBUG (D_NET, "Error %d posting transmit to "
-                                        LPX64"\n", rc, conn->ibc_peer->ibp_nid);
+                                CDEBUG (D_NET, "Error %d posting transmit to %s\n",
+                                        rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 
                         kibnal_close_conn (conn, rc);
 
@@ -1009,138 +995,172 @@ kibnal_check_sends (kib_conn_t *conn)
                                 kibnal_tx_done (tx);
                         return;
                 }
-                
         }
 
-        EXIT;
-out:
-        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+        spin_unlock(&conn->ibc_lock);
 }
 
-static void
-kibnal_tx_callback (IB_WORK_COMPLETION *wc)
+void
+kibnal_tx_complete (IB_WORK_COMPLETION *wc)
 {
         kib_tx_t     *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
-        kib_conn_t   *conn;
-        unsigned long flags;
+        kib_conn_t   *conn = tx->tx_conn;
+        int           failed = wc->Status != WRStatusSuccess;
         int           idle;
 
-        conn = tx->tx_conn;
-        LASSERT (conn != NULL);
-        LASSERT (tx->tx_sending != 0);
+        CDEBUG(D_NET, "%s: sending %d nwrq %d status %d\n", 
+               libcfs_nid2str(conn->ibc_peer->ibp_nid),
+               tx->tx_sending, tx->tx_nwrq, wc->Status);
+
+        LASSERT (tx->tx_sending > 0);
 
-        spin_lock_irqsave(&conn->ibc_lock, flags);
+        if (failed &&
+            tx->tx_status == 0 &&
+            conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
+#if KIBLND_DETAILED_DEBUG
+                int                   i;
+                IB_WORK_REQ2         *wrq = &tx->tx_wrq[0];
+                IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[0];
+                lnet_msg_t           *lntmsg = tx->tx_lntmsg[0];
+#endif
+                CDEBUG(D_NETERROR, "tx -> %s type %x cookie "LPX64
+                       " sending %d waiting %d failed %d nwrk %d\n", 
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                       tx->tx_msg->ibm_type, tx->tx_cookie,
+                       tx->tx_sending, tx->tx_waiting, wc->Status,
+                       tx->tx_nwrq);
+#if KIBLND_DETAILED_DEBUG
+                for (i = 0; i < tx->tx_nwrq; i++, wrq++, gl++) {
+                        switch (wrq->Operation) {
+                        default:
+                                CDEBUG(D_NETERROR, "    [%3d] Addr %p Next %p OP %d "
+                                       "DSList %p(%p)/%d: "LPX64"/%d K %x\n",
+                                       i, wrq, wrq->Next, wrq->Operation,
+                                       wrq->DSList, gl, wrq->DSListDepth,
+                                       gl->Address, gl->Length, gl->Lkey);
+                                break;
+                        case WROpSend:
+                                CDEBUG(D_NETERROR, "    [%3d] Addr %p Next %p SEND "
+                                       "DSList %p(%p)/%d: "LPX64"/%d K %x\n",
+                                       i, wrq, wrq->Next, 
+                                       wrq->DSList, gl, wrq->DSListDepth,
+                                       gl->Address, gl->Length, gl->Lkey);
+                                break;
+                        case WROpRdmaWrite:
+                                CDEBUG(D_NETERROR, "    [%3d] Addr %p Next %p DMA "
+                                       "DSList: %p(%p)/%d "LPX64"/%d K %x -> "
+                                       LPX64" K %x\n",
+                                       i, wrq, wrq->Next, 
+                                       wrq->DSList, gl, wrq->DSListDepth,
+                                       gl->Address, gl->Length, gl->Lkey,
+                                       wrq->Req.SendRC.RemoteDS.Address,
+                                       wrq->Req.SendRC.RemoteDS.Rkey);
+                                break;
+                        }
+                }
+                
+                switch (tx->tx_msg->ibm_type) {
+                default:
+                        CDEBUG(D_NETERROR, "  msg type %x %p/%d, No RDMA\n", 
+                               tx->tx_msg->ibm_type, 
+                               tx->tx_msg, tx->tx_msg->ibm_nob);
+                        break;
 
-        CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
-               tx->tx_sending, tx->tx_nsp, wc->Status);
+                case IBNAL_MSG_PUT_DONE:
+                case IBNAL_MSG_GET_DONE:
+                        CDEBUG(D_NETERROR, "  msg type %x %p/%d, RDMA key %x frags %d...\n", 
+                               tx->tx_msg->ibm_type, 
+                               tx->tx_msg, tx->tx_msg->ibm_nob,
+                               tx->tx_rd->rd_key, tx->tx_rd->rd_nfrag);
+                        for (i = 0; i < tx->tx_rd->rd_nfrag; i++)
+                                CDEBUG(D_NETERROR, "    [%d] "LPX64"/%d\n", i,
+                                       tx->tx_rd->rd_frags[i].rf_addr,
+                                       tx->tx_rd->rd_frags[i].rf_nob);
+                        if (lntmsg == NULL) {
+                                CDEBUG(D_NETERROR, "  No lntmsg\n");
+                        } else if (lntmsg->msg_iov != NULL) {
+                                CDEBUG(D_NETERROR, "  lntmsg in %d VIRT frags...\n", 
+                                       lntmsg->msg_niov);
+                                for (i = 0; i < lntmsg->msg_niov; i++)
+                                        CDEBUG(D_NETERROR, "    [%d] %p/%d\n", i,
+                                               lntmsg->msg_iov[i].iov_base,
+                                               lntmsg->msg_iov[i].iov_len);
+                        } else if (lntmsg->msg_kiov != NULL) {
+                                CDEBUG(D_NETERROR, "  lntmsg in %d PAGE frags...\n", 
+                                       lntmsg->msg_niov);
+                                for (i = 0; i < lntmsg->msg_niov; i++)
+                                        CDEBUG(D_NETERROR, "    [%d] %p+%d/%d\n", i,
+                                               lntmsg->msg_kiov[i].kiov_page,
+                                               lntmsg->msg_kiov[i].kiov_offset,
+                                               lntmsg->msg_kiov[i].kiov_len);
+                        } else {
+                                CDEBUG(D_NETERROR, "  lntmsg in %d frags\n", 
+                                       lntmsg->msg_niov);
+                        }
+                        
+                        break;
+                }
+#endif
+        }
+        
+        spin_lock(&conn->ibc_lock);
 
         /* I could be racing with rdma completion.  Whoever makes 'tx' idle
-         * gets to free it, which also drops its ref on 'conn'.  If it's
-         * not me, then I take an extra ref on conn so it can't disappear
-         * under me. */
+         * gets to free it, which also drops its ref on 'conn'. */
 
         tx->tx_sending--;
+        conn->ibc_nsends_posted--;
+
+        if (failed) {
+                tx->tx_waiting = 0;
+                tx->tx_status = -EIO;
+        }
+        
         idle = (tx->tx_sending == 0) &&         /* This is the final callback */
-               (!tx->tx_passive_rdma_wait);     /* Not waiting for RDMA completion */
+               !tx->tx_waiting &&               /* Not waiting for peer */
+               !tx->tx_queued;                  /* Not re-queued (PUT_DONE) */
         if (idle)
                 list_del(&tx->tx_list);
 
-        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-               atomic_read (&conn->ibc_refcount));
-        atomic_inc (&conn->ibc_refcount);
+        kibnal_conn_addref(conn);               /* 1 ref for me.... */
 
-        if (tx->tx_sending == 0)
-                conn->ibc_nsends_posted--;
-
-        if (wc->Status != WRStatusSuccess &&
-            tx->tx_status == 0)
-                tx->tx_status = -ECONNABORTED;
-                
-        spin_unlock_irqrestore(&conn->ibc_lock, flags);
+        spin_unlock(&conn->ibc_lock);
 
         if (idle)
                 kibnal_tx_done (tx);
 
-        if (wc->Status != WRStatusSuccess) {
-                CERROR ("Tx completion to "LPX64" failed: %d\n", 
-                        conn->ibc_peer->ibp_nid, wc->Status);
-                kibnal_close_conn (conn, -ENETDOWN);
+        if (failed) {
+                kibnal_close_conn (conn, -EIO);
         } else {
-                /* can I shovel some more sends out the door? */
+                kibnal_peer_alive(conn->ibc_peer);
                 kibnal_check_sends(conn);
         }
 
-        kibnal_put_conn (conn);
-}
-
-void 
-kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev)
-{
-        /* XXX flesh out.  this seems largely for async errors */
-        CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode);
-}
-
-void
-kibnal_ca_callback (void *ca_arg, void *cq_arg)
-{
-        IB_HANDLE cq = *(IB_HANDLE *)cq_arg;
-        IB_HANDLE ca = *(IB_HANDLE *)ca_arg;
-        IB_WORK_COMPLETION wc;
-        int armed = 0;
-
-        CDEBUG(D_NET, "ca %p cq %p\n", ca, cq);
-
-        for(;;) {
-                while (iibt_cq_poll(cq, &wc) == FSUCCESS) {
-
-                        /* We will need to rearm the CQ to avoid a potential race. */
-                        armed = 0;
-                        
-                        if (kibnal_wreqid_is_rx(wc.WorkReqId))
-                                kibnal_rx_callback(&wc);
-                        else
-                                kibnal_tx_callback(&wc);
-                }
-                if (armed)
-                        return;
-                if (iibt_cq_rearm(cq, CQEventSelNextWC) != FSUCCESS) {
-                        CERROR("rearm failed?\n");
-                        return;
-                }
-                armed = 1;
-        }
+        kibnal_conn_decref(conn);               /* ...until here */
 }
 
 void
 kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
 {
-        IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nsp];
-        IB_WORK_REQ         *wrq = &tx->tx_wrq[tx->tx_nsp];
-        int                       fence;
-        int                       nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+        IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nwrq];
+        IB_WORK_REQ2         *wrq = &tx->tx_wrq[tx->tx_nwrq];
+        int                   nob = offsetof (kib_msg_t, ibm_u) + body_nob;
 
-        LASSERT (tx->tx_nsp >= 0 && 
-                 tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0]));
+        LASSERT (tx->tx_nwrq >= 0 && 
+                 tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
         LASSERT (nob <= IBNAL_MSG_SIZE);
-        
-        tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
-        tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
-        tx->tx_msg->ibm_type = type;
-#if IBNAL_CKSUM
-        tx->tx_msg->ibm_nob = nob;
-#endif
-        /* Fence the message if it's bundled with an RDMA read */
-        fence = (tx->tx_nsp > 0) &&
-                (type == IBNAL_MSG_PUT_DONE);
+
+        kibnal_init_msg(tx->tx_msg, type, body_nob);
 
         *gl = (IB_LOCAL_DATASEGMENT) {
-                .Address = tx->tx_vaddr,
+                .Address = tx->tx_hca_msg,
                 .Length  = IBNAL_MSG_SIZE,
-                .Lkey    = kibnal_lkey(kibnal_data.kib_tx_pages),
+                .Lkey    = kibnal_data.kib_whole_mem.md_lkey,
         };
 
-        wrq->WorkReqId      = kibnal_ptr2wreqid(tx, 0);
+        wrq->Next           = NULL;             /* This is the last one */
+
+        wrq->WorkReqId      = kibnal_ptr2wreqid(tx, IBNAL_WID_TX);
         wrq->Operation      = WROpSend;
         wrq->DSList         = gl;
         wrq->DSListDepth    = 1;
@@ -1149,869 +1169,1339 @@ kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
         wrq->Req.SendRC.Options.s.SolicitedEvent         = 1;
         wrq->Req.SendRC.Options.s.SignaledCompletion     = 1;
         wrq->Req.SendRC.Options.s.ImmediateData          = 0;
-        wrq->Req.SendRC.Options.s.Fence                  = fence;
-
-        tx->tx_nsp++;
+        wrq->Req.SendRC.Options.s.Fence                  = 0; 
+        /* fence only needed on RDMA reads */
+        
+        tx->tx_nwrq++;
 }
 
-static void
-kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+int
+kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
+                  kib_rdma_desc_t *dstrd, __u64 dstcookie)
 {
-        unsigned long         flags;
+        kib_msg_t            *ibmsg = tx->tx_msg;
+        kib_rdma_desc_t      *srcrd = tx->tx_rd;
+        IB_LOCAL_DATASEGMENT *gl;
+        IB_WORK_REQ2         *wrq;
+        int                   rc;
 
-        spin_lock_irqsave(&conn->ibc_lock, flags);
+#if IBNAL_USE_FMR
+        LASSERT (tx->tx_nwrq == 0);
 
-        kibnal_queue_tx_locked (tx, conn);
-        
-        spin_unlock_irqrestore(&conn->ibc_lock, flags);
-        
+        gl = &tx->tx_gl[0];
+        gl->Length  = nob;
+        gl->Address = srcrd->rd_addr;
+        gl->Lkey    = srcrd->rd_key;
+
+        wrq = &tx->tx_wrq[0];
+
+        wrq->Next           = wrq + 1;
+        wrq->WorkReqId      = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
+        wrq->Operation      = WROpRdmaWrite;
+        wrq->DSList         = gl;
+        wrq->DSListDepth    = 1;
+        wrq->MessageLen     = nob;
+
+        wrq->Req.SendRC.ImmediateData                = 0;
+        wrq->Req.SendRC.Options.s.SolicitedEvent     = 0;
+        wrq->Req.SendRC.Options.s.SignaledCompletion = 0;
+        wrq->Req.SendRC.Options.s.ImmediateData      = 0;
+        wrq->Req.SendRC.Options.s.Fence              = 0; 
+
+        wrq->Req.SendRC.RemoteDS.Address = dstrd->rd_addr;
+        wrq->Req.SendRC.RemoteDS.Rkey    = dstrd->rd_key;
+
+        tx->tx_nwrq = 1;
+        rc = nob;
+#else
+        /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
+        int              resid = nob;
+        kib_rdma_frag_t *srcfrag;
+        int              srcidx;
+        kib_rdma_frag_t *dstfrag;
+        int              dstidx;
+        int              wrknob;
+
+        /* Called by scheduler */
+        LASSERT (!in_interrupt());
+
+        LASSERT (type == IBNAL_MSG_GET_DONE ||
+                 type == IBNAL_MSG_PUT_DONE);
+
+        srcidx = dstidx = 0;
+        srcfrag = &srcrd->rd_frags[0];
+        dstfrag = &dstrd->rd_frags[0];
+        rc = resid;
+
+        while (resid > 0) {
+                if (srcidx >= srcrd->rd_nfrag) {
+                        CERROR("Src buffer exhausted: %d frags\n", srcidx);
+                        rc = -EPROTO;
+                        break;
+                }
+                
+                if (dstidx == dstrd->rd_nfrag) {
+                        CERROR("Dst buffer exhausted: %d frags\n", dstidx);
+                        rc = -EPROTO;
+                        break;
+                }
+
+                if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) {
+                        CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n",
+                               srcidx, srcrd->rd_nfrag,
+                               dstidx, dstrd->rd_nfrag);
+                        rc = -EMSGSIZE;
+                        break;
+                }
+
+                wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid);
+
+                gl = &tx->tx_gl[tx->tx_nwrq];
+                gl->Length  = wrknob;
+                gl->Address = srcfrag->rf_addr;
+                gl->Lkey    = srcrd->rd_key;
+
+                wrq = &tx->tx_wrq[tx->tx_nwrq];
+
+                wrq->Next           = wrq + 1;
+                wrq->WorkReqId      = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
+                wrq->Operation      = WROpRdmaWrite;
+                wrq->DSList         = gl;
+                wrq->DSListDepth    = 1;
+                wrq->MessageLen     = nob;
+
+                wrq->Req.SendRC.ImmediateData                = 0;
+                wrq->Req.SendRC.Options.s.SolicitedEvent     = 0;
+                wrq->Req.SendRC.Options.s.SignaledCompletion = 0;
+                wrq->Req.SendRC.Options.s.ImmediateData      = 0;
+                wrq->Req.SendRC.Options.s.Fence              = 0; 
+
+                wrq->Req.SendRC.RemoteDS.Address = dstfrag->rf_addr;
+                wrq->Req.SendRC.RemoteDS.Rkey    = dstrd->rd_key;
+
+                resid -= wrknob;
+                if (wrknob < srcfrag->rf_nob) {
+                        srcfrag->rf_addr += wrknob;
+                        srcfrag->rf_nob -= wrknob;
+                } else {
+                        srcfrag++;
+                        srcidx++;
+                }
+                
+                if (wrknob < dstfrag->rf_nob) {
+                        dstfrag->rf_addr += wrknob;
+                        dstfrag->rf_nob -= wrknob;
+                } else {
+                        dstfrag++;
+                        dstidx++;
+                }
+                
+                tx->tx_nwrq++;
+        }
+
+        if (rc < 0)                             /* no RDMA if completing with failure */
+                tx->tx_nwrq = 0;
+#endif
+        
+        ibmsg->ibm_u.completion.ibcm_status = rc;
+        ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
+        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
+
+        return rc;
+}
+
+void
+kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+{
+        spin_lock(&conn->ibc_lock);
+        kibnal_queue_tx_locked (tx, conn);
+        spin_unlock(&conn->ibc_lock);
+        
         kibnal_check_sends(conn);
 }
 
-static void
-kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
+void
+kibnal_schedule_active_connect_locked (kib_peer_t *peer, int proto_version)
+{
+        /* Called holding kib_global_lock exclusive with IRQs disabled */
+
+        peer->ibp_version = proto_version;      /* proto version for new conn */
+        peer->ibp_connecting++;                 /* I'm connecting */
+        kibnal_peer_addref(peer);               /* extra ref for connd */
+
+        spin_lock(&kibnal_data.kib_connd_lock);
+
+        list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers);
+        wake_up (&kibnal_data.kib_connd_waitq);
+
+        spin_unlock(&kibnal_data.kib_connd_lock);
+}
+
+void
+kibnal_schedule_active_connect (kib_peer_t *peer, int proto_version)
+{
+        unsigned long flags;
+
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+
+        kibnal_schedule_active_connect_locked(peer, proto_version);
+
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+}
+
+void
+kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid)
 {
-        unsigned long    flags;
         kib_peer_t      *peer;
         kib_conn_t      *conn;
+        unsigned long    flags;
         rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
+        int              retry;
+        int              rc;
 
         /* If I get here, I've committed to send, so I complete the tx with
          * failure on any problems */
         
         LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
-        LASSERT (tx->tx_nsp > 0);               /* work items have been set up */
+        LASSERT (tx->tx_nwrq > 0);              /* work items have been set up */
 
-        read_lock_irqsave(g_lock, flags);
+        for (retry = 0; ; retry = 1) {
+                read_lock_irqsave(g_lock, flags);
         
-        peer = kibnal_find_peer_locked (nid);
-        if (peer == NULL) {
-                read_unlock_irqrestore(g_lock, flags);
-                tx->tx_status = -EHOSTUNREACH;
-                kibnal_tx_done (tx);
-                return;
-        }
-
-        conn = kibnal_find_conn_locked (peer);
-        if (conn != NULL) {
-                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                       atomic_read (&conn->ibc_refcount));
-                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
-                read_unlock_irqrestore(g_lock, flags);
+                peer = kibnal_find_peer_locked (nid);
+                if (peer != NULL) {
+                        conn = kibnal_find_conn_locked (peer);
+                        if (conn != NULL) {
+                                kibnal_conn_addref(conn); /* 1 ref for me... */
+                                read_unlock_irqrestore(g_lock, flags);
+
+                                kibnal_queue_tx (tx, conn);
+                                kibnal_conn_decref(conn); /* ...to here */
+                                return;
+                        }
+                }
                 
-                kibnal_queue_tx (tx, conn);
-                return;
-        }
-        
-        /* Making one or more connections; I'll need a write lock... */
-        read_unlock(g_lock);
-        write_lock(g_lock);
+                /* Making one or more connections; I'll need a write lock... */
+                read_unlock(g_lock);
+                write_lock(g_lock);
 
-        peer = kibnal_find_peer_locked (nid);
-        if (peer == NULL) {
-                write_unlock_irqrestore (g_lock, flags);
-                tx->tx_status = -EHOSTUNREACH;
-                kibnal_tx_done (tx);
-                return;
+                peer = kibnal_find_peer_locked (nid);
+                if (peer != NULL)
+                        break;
+
+                write_unlock_irqrestore(g_lock, flags);
+
+                if (retry) {
+                        CERROR("Can't find peer %s\n", libcfs_nid2str(nid));
+
+                        tx->tx_status = -EHOSTUNREACH;
+                        tx->tx_waiting = 0;
+                        kibnal_tx_done (tx);
+                        return;
+                }
+
+                rc = kibnal_add_persistent_peer(nid);
+                if (rc != 0) {
+                        CERROR("Can't add peer %s: %d\n",
+                               libcfs_nid2str(nid), rc);
+                        
+                        tx->tx_status = -EHOSTUNREACH;
+                        tx->tx_waiting = 0;
+                        kibnal_tx_done (tx);
+                        return;
+                }
         }
 
         conn = kibnal_find_conn_locked (peer);
         if (conn != NULL) {
                 /* Connection exists; queue message on it */
-                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                       atomic_read (&conn->ibc_refcount));
-                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
-                write_unlock_irqrestore (g_lock, flags);
+                kibnal_conn_addref(conn);       /* 1 ref for me... */
+                write_unlock_irqrestore(g_lock, flags);
                 
                 kibnal_queue_tx (tx, conn);
+                kibnal_conn_decref(conn);       /* ...until here */
                 return;
         }
 
-        if (peer->ibp_connecting == 0) {
-                if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
-                        write_unlock_irqrestore (g_lock, flags);
+        if (!kibnal_peer_connecting(peer)) {
+                if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */
+                      time_after_eq(jiffies, peer->ibp_reconnect_time))) {
+                        write_unlock_irqrestore(g_lock, flags);
                         tx->tx_status = -EHOSTUNREACH;
+                        tx->tx_waiting = 0;
                         kibnal_tx_done (tx);
                         return;
                 }
-        
-                peer->ibp_connecting = 1;
-                kib_peer_addref(peer); /* extra ref for connd */
-        
-                spin_lock (&kibnal_data.kib_connd_lock);
-        
-                list_add_tail (&peer->ibp_connd_list,
-                               &kibnal_data.kib_connd_peers);
-                wake_up (&kibnal_data.kib_connd_waitq);
-        
-                spin_unlock (&kibnal_data.kib_connd_lock);
+
+                kibnal_schedule_active_connect_locked(peer, IBNAL_MSG_VERSION);
         }
         
         /* A connection is being established; queue the message... */
         list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
 
-        write_unlock_irqrestore (g_lock, flags);
+        write_unlock_irqrestore(g_lock, flags);
 }
 
-static ptl_err_t
-kibnal_start_passive_rdma (int type, ptl_nid_t nid,
-                            lib_msg_t *libmsg, ptl_hdr_t *hdr)
+void
+kibnal_txlist_done (struct list_head *txlist, int status)
 {
-        int         nob = libmsg->md->length;
-        kib_tx_t   *tx;
-        kib_msg_t  *ibmsg;
-        int         rc;
-        IB_ACCESS_CONTROL         access = {0,};
-        
-        LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA);
-        LASSERT (nob > 0);
-        LASSERT (!in_interrupt());              /* Mapping could block */
-
-        access.s.MWBindable = 1;
-        access.s.LocalWrite = 1;
-        access.s.RdmaRead = 1;
-        access.s.RdmaWrite = 1;
+        kib_tx_t *tx;
 
-        tx = kibnal_get_idle_tx (1);           /* May block; caller is an app thread */
-        LASSERT (tx != NULL);
+        while (!list_empty (txlist)) {
+                tx = list_entry (txlist->next, kib_tx_t, tx_list);
 
-        if ((libmsg->md->options & PTL_MD_KIOV) == 0) 
-                rc = kibnal_map_iov (tx, access,
-                                     libmsg->md->md_niov,
-                                     libmsg->md->md_iov.iov,
-                                     0, nob, 0);
-        else
-                rc = kibnal_map_kiov (tx, access,
-                                      libmsg->md->md_niov, 
-                                      libmsg->md->md_iov.kiov,
-                                      0, nob, 0);
-
-        if (rc != 0) {
-                CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
-                goto failed;
-        }
-        
-        if (type == IBNAL_MSG_GET_RDMA) {
-                /* reply gets finalized when tx completes */
-                tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, 
-                                                        nid, libmsg);
-                if (tx->tx_libmsg[1] == NULL) {
-                        CERROR ("Can't create reply for GET -> "LPX64"\n",
-                                nid);
-                        rc = -ENOMEM;
-                        goto failed;
-                }
+                list_del (&tx->tx_list);
+                /* complete now */
+                tx->tx_waiting = 0;
+                tx->tx_status = status;
+                kibnal_tx_done (tx);
         }
-        
-        tx->tx_passive_rdma = 1;
+}
 
-        ibmsg = tx->tx_msg;
+int
+kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+        lnet_hdr_t       *hdr = &lntmsg->msg_hdr; 
+        int               type = lntmsg->msg_type; 
+        lnet_process_id_t target = lntmsg->msg_target;
+        int               target_is_router = lntmsg->msg_target_is_router;
+        int               routing = lntmsg->msg_routing;
+        unsigned int      payload_niov = lntmsg->msg_niov; 
+        struct iovec     *payload_iov = lntmsg->msg_iov; 
+        lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
+        unsigned int      payload_offset = lntmsg->msg_offset;
+        unsigned int      payload_nob = lntmsg->msg_len;
+        kib_msg_t        *ibmsg;
+        kib_tx_t         *tx;
+        int               nob;
+        int               rc;
 
-        ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
-        ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
-        /* map_kiov alrady filled the rdma descs for the whole_mem case */
-        if (!kibnal_whole_mem()) {
-                ibmsg->ibm_u.rdma.rd_key = tx->tx_md.md_rkey;
-                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
-                ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
-                ibmsg->ibm_u.rdma.ibrm_num_descs = 1;
-        }
+        /* NB 'private' is different depending on what we're sending.... */
 
-        kibnal_init_tx_msg (tx, type, 
-                            kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs));
+        CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+               payload_nob, payload_niov, libcfs_id2str(target));
 
-        CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
-               LPX64", nob %d\n",
-               tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
-               tx->tx_md.md_addr, nob);
-        
-        /* libmsg gets finalized when tx completes. */
-        tx->tx_libmsg[0] = libmsg;
+        LASSERT (payload_nob == 0 || payload_niov > 0);
+        LASSERT (payload_niov <= LNET_MAX_IOV);
 
-        kibnal_launch_tx(tx, nid);
-        return (PTL_OK);
+        /* Thread context */
+        LASSERT (!in_interrupt());
+        /* payload is either all vaddrs or all pages */
+        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
 
- failed:
-        tx->tx_status = rc;
-        kibnal_tx_done (tx);
-        return (PTL_FAIL);
-}
+        switch (type) {
+        default:
+                LBUG();
+                return (-EIO);
+                
+        case LNET_MSG_ACK:
+                LASSERT (payload_nob == 0);
+                break;
 
-void
-kibnal_start_active_rdma (int type, int status,
-                           kib_rx_t *rx, lib_msg_t *libmsg, 
-                           unsigned int niov,
-                           struct iovec *iov, ptl_kiov_t *kiov,
-                           size_t offset, size_t nob)
-{
-        kib_msg_t    *rxmsg = rx->rx_msg;
-        kib_msg_t    *txmsg;
-        kib_tx_t     *tx;
-        IB_ACCESS_CONTROL access = {0,};
-        IB_WR_OP      rdma_op;
-        int           rc;
-        __u32         i;
+        case LNET_MSG_GET:
+                if (routing || target_is_router)
+                        break;                  /* send IMMEDIATE */
+                
+                /* is the REPLY message too small for RDMA? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
+                if (nob <= IBNAL_MSG_SIZE)
+                        break;                  /* send IMMEDIATE */
+
+                tx = kibnal_get_idle_tx();
+                if (tx == NULL) {
+                        CERROR("Can't allocate txd for GET to %s\n",
+                               libcfs_nid2str(target.nid));
+                        return -ENOMEM;
+                }
+                
+                ibmsg = tx->tx_msg;
+                ibmsg->ibm_u.get.ibgm_hdr = *hdr;
+                ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
+
+                if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+                        rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
+                                                 0,
+                                                 lntmsg->msg_md->md_niov,
+                                                 lntmsg->msg_md->md_iov.iov,
+                                                 0, lntmsg->msg_md->md_length);
+                else
+                        rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
+                                                  0,
+                                                  lntmsg->msg_md->md_niov,
+                                                  lntmsg->msg_md->md_iov.kiov,
+                                                  0, lntmsg->msg_md->md_length);
+                if (rc != 0) {
+                        CERROR("Can't setup GET sink for %s: %d\n",
+                               libcfs_nid2str(target.nid), rc);
+                        kibnal_tx_done(tx);
+                        return -EIO;
+                }
 
-        CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
-               type, status, niov, offset, nob);
+#if IBNAL_USE_FMR
+                nob = sizeof(kib_get_msg_t);
+#else
+                {
+                        int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
+                        
+                        nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
+                }
+#endif
+                kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
 
-        /* Called by scheduler */
-        LASSERT (!in_interrupt ());
+                tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni,
+                                                         lntmsg);
+                if (tx->tx_lntmsg[1] == NULL) {
+                        CERROR("Can't create reply for GET -> %s\n",
+                               libcfs_nid2str(target.nid));
+                        kibnal_tx_done(tx);
+                        return -EIO;
+                }
 
-        /* Either all pages or all vaddrs */
-        LASSERT (!(kiov != NULL && iov != NULL));
+                tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg[0,1] on completion */
+                tx->tx_waiting = 1;             /* waiting for GET_DONE */
+                kibnal_launch_tx(tx, target.nid);
+                return 0;
 
-        /* No data if we're completing with failure */
-        LASSERT (status == 0 || nob == 0);
+        case LNET_MSG_REPLY: 
+        case LNET_MSG_PUT:
+                /* Is the payload small enough not to need RDMA? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                if (nob <= IBNAL_MSG_SIZE)
+                        break;                  /* send IMMEDIATE */
+
+                tx = kibnal_get_idle_tx();
+                if (tx == NULL) {
+                        CERROR("Can't allocate %s txd for %s\n",
+                               type == LNET_MSG_PUT ? "PUT" : "REPLY",
+                               libcfs_nid2str(target.nid));
+                        return -ENOMEM;
+                }
 
-        LASSERT (type == IBNAL_MSG_GET_DONE ||
-                 type == IBNAL_MSG_PUT_DONE);
+                if (payload_kiov == NULL)
+                        rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 1,
+                                                 payload_niov, payload_iov,
+                                                 payload_offset, payload_nob);
+                else
+                        rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 1,
+                                                  payload_niov, payload_kiov,
+                                                  payload_offset, payload_nob);
+                if (rc != 0) {
+                        CERROR("Can't setup PUT src for %s: %d\n",
+                               libcfs_nid2str(target.nid), rc);
+                        kibnal_tx_done(tx);
+                        return -EIO;
+                }
 
-        /* Flag I'm completing the RDMA.  Even if I fail to send the
-         * completion message, I will have tried my best so further
-         * attempts shouldn't be tried. */
-        LASSERT (!rx->rx_rdma);
-        rx->rx_rdma = 1;
+                ibmsg = tx->tx_msg;
+                ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
+                ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
+                kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
 
-        if (type == IBNAL_MSG_GET_DONE) {
-                rdma_op  = WROpRdmaWrite;
-                LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
-        } else {
-                access.s.LocalWrite = 1;
-                rdma_op  = WROpRdmaRead;
-                LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
+                tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
+                tx->tx_waiting = 1;             /* waiting for PUT_{ACK,NAK} */
+                kibnal_launch_tx(tx, target.nid);
+                return 0;
         }
 
-        tx = kibnal_get_idle_tx (0);           /* Mustn't block */
+        /* send IMMEDIATE */
+
+        LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
+                 <= IBNAL_MSG_SIZE);
+
+        tx = kibnal_get_idle_tx();
         if (tx == NULL) {
-                CERROR ("tx descs exhausted on RDMA from "LPX64
-                        " completing locally with failure\n",
-                        rx->rx_conn->ibc_peer->ibp_nid);
-                lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
-                return;
+                CERROR ("Can't send %d to %s: tx descs exhausted\n",
+                        type, libcfs_nid2str(target.nid));
+                return -ENOMEM;
         }
-        LASSERT (tx->tx_nsp == 0);
-                        
-        if (nob == 0) 
-                GOTO(init_tx, 0);
-
-        /* We actually need to transfer some data (the transfer
-         * size could get truncated to zero when the incoming
-         * message is matched) */
-        if (kiov != NULL)
-                rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1);
+
+        ibmsg = tx->tx_msg;
+        ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
+
+        if (payload_kiov != NULL)
+                lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg,
+                                    offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                    payload_niov, payload_kiov,
+                                    payload_offset, payload_nob);
         else
-                rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1);
-        
-        if (rc != 0) {
-                CERROR ("Can't map RDMA -> "LPX64": %d\n", 
-                        rx->rx_conn->ibc_peer->ibp_nid, rc);
-                /* We'll skip the RDMA and complete with failure. */
-                status = rc;
-                nob = 0;
-                GOTO(init_tx, rc);
-        } 
-
-        if (!kibnal_whole_mem()) {
-                tx->tx_msg->ibm_u.rdma.rd_key = tx->tx_md.md_lkey;
-                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
-                tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
-                tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1;
-        }
-
-        /* XXX ugh.  different page-sized hosts. */ 
-        if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs !=
-            rxmsg->ibm_u.rdma.ibrm_num_descs) {
-                CERROR("tx descs (%u) != rx descs (%u)\n", 
-                       tx->tx_msg->ibm_u.rdma.ibrm_num_descs,
-                       rxmsg->ibm_u.rdma.ibrm_num_descs);
-                /* We'll skip the RDMA and complete with failure. */
-                status = rc;
-                nob = 0;
-                GOTO(init_tx, rc);
-        }
-
-        /* map_kiov filled in the rdma descs which describe our side of the
-         * rdma transfer. */
-        /* ibrm_num_descs was verified in rx_callback */
-        for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) {
-                kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */
-                IB_LOCAL_DATASEGMENT *ds = &tx->tx_gl[i];
-                IB_WORK_REQ  *wrq = &tx->tx_wrq[i];
-
-                ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i];
-                rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i];
-
-                ds->Address = ldesc->rd_addr;
-                ds->Length  = ldesc->rd_nob;
-                ds->Lkey    = tx->tx_msg->ibm_u.rdma.rd_key;
-
-                memset(wrq, 0, sizeof(*wrq));
-                wrq->WorkReqId      = kibnal_ptr2wreqid(tx, 0);
-                wrq->Operation      = rdma_op;
-                wrq->DSList         = ds;
-                wrq->DSListDepth    = 1;
-                wrq->MessageLen     = ds->Length;
-                wrq->Req.SendRC.ImmediateData  = 0;
-                wrq->Req.SendRC.Options.s.SolicitedEvent         = 0;
-                wrq->Req.SendRC.Options.s.SignaledCompletion     = 0;
-                wrq->Req.SendRC.Options.s.ImmediateData          = 0;
-                wrq->Req.SendRC.Options.s.Fence                  = 0;
-                wrq->Req.SendRC.RemoteDS.Address = rdesc->rd_addr;
-                wrq->Req.SendRC.RemoteDS.Rkey = rxmsg->ibm_u.rdma.rd_key;
+                lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg,
+                                   offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                   payload_niov, payload_iov,
+                                   payload_offset, payload_nob);
 
-                /* only the last rdma post triggers tx completion */
-                if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1)
-                        wrq->Req.SendRC.Options.s.SignaledCompletion = 1;
+        nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
+        kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob);
 
-                tx->tx_nsp++;
+        tx->tx_lntmsg[0] = lntmsg;              /* finalise lntmsg on completion */
+        kibnal_launch_tx(tx, target.nid);
+        return 0;
+}
+
+void
+kibnal_reply(lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg)
+{
+        lnet_process_id_t target = lntmsg->msg_target;
+        unsigned int      niov = lntmsg->msg_niov; 
+        struct iovec     *iov = lntmsg->msg_iov; 
+        lnet_kiov_t      *kiov = lntmsg->msg_kiov;
+        unsigned int      offset = lntmsg->msg_offset;
+        unsigned int      nob = lntmsg->msg_len;
+        kib_tx_t         *tx;
+        int               rc;
+        
+        tx = kibnal_get_idle_tx();
+        if (tx == NULL) {
+                CERROR("Can't get tx for REPLY to %s\n",
+                       libcfs_nid2str(target.nid));
+                goto failed_0;
         }
 
-init_tx:
-        txmsg = tx->tx_msg;
+        if (nob == 0)
+                rc = 0;
+        else if (kiov == NULL)
+                rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 1, 
+                                         niov, iov, offset, nob);
+        else
+                rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 1, 
+                                          niov, kiov, offset, nob);
 
-        txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
-        txmsg->ibm_u.completion.ibcm_status = status;
+        if (rc != 0) {
+                CERROR("Can't setup GET src for %s: %d\n",
+                       libcfs_nid2str(target.nid), rc);
+                goto failed_1;
+        }
         
-        kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
-
-        if (status == 0 && nob != 0) {
-                LASSERT (tx->tx_nsp > 1);
-                /* RDMA: libmsg gets finalized when the tx completes.  This
-                 * is after the completion message has been sent, which in
-                 * turn is after the RDMA has finished. */
-                tx->tx_libmsg[0] = libmsg;
+        rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, nob,
+                              &rx->rx_msg->ibm_u.get.ibgm_rd,
+                              rx->rx_msg->ibm_u.get.ibgm_cookie);
+        if (rc < 0) {
+                CERROR("Can't setup rdma for GET from %s: %d\n", 
+                       libcfs_nid2str(target.nid), rc);
+                goto failed_1;
+        }
+        
+        if (rc == 0) {
+                /* No RDMA: local completion may happen now! */
+                lnet_finalize(ni, lntmsg, 0);
         } else {
-                LASSERT (tx->tx_nsp == 1);
-                /* No RDMA: local completion happens now! */
-                CWARN("No data: immediate completion\n");
-                lib_finalize (&kibnal_lib, NULL, libmsg,
-                              status == 0 ? PTL_OK : PTL_FAIL);
-        }
-
-        /* +1 ref for this tx... */
-        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-               rx->rx_conn, rx->rx_conn->ibc_state, 
-               rx->rx_conn->ibc_peer->ibp_nid,
-               atomic_read (&rx->rx_conn->ibc_refcount));
-        atomic_inc (&rx->rx_conn->ibc_refcount);
-        /* ...and queue it up */
+                /* RDMA: lnet_finalize(lntmsg) when it
+                 * completes */
+                tx->tx_lntmsg[0] = lntmsg;
+        }
+        
         kibnal_queue_tx(tx, rx->rx_conn);
+        return;
+        
+ failed_1:
+        kibnal_tx_done(tx);
+ failed_0:
+        lnet_finalize(ni, lntmsg, -EIO);
 }
 
-static ptl_err_t
-kibnal_sendmsg(lib_nal_t    *nal, 
-                void         *private,
-                lib_msg_t    *libmsg,
-                ptl_hdr_t    *hdr, 
-                int           type, 
-                ptl_nid_t     nid, 
-                ptl_pid_t     pid,
-                unsigned int  payload_niov, 
-                struct iovec *payload_iov, 
-                ptl_kiov_t   *payload_kiov,
-                size_t        payload_offset,
-                size_t        payload_nob)
-{
-        kib_msg_t  *ibmsg;
-        kib_tx_t   *tx;
-        int         nob;
-
-        /* NB 'private' is different depending on what we're sending.... */
-
-        CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64
-               " pid %d\n", payload_nob, payload_niov, nid , pid);
+int
+kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+                   void **new_private)
+{
+        kib_rx_t    *rx = private;
+        kib_conn_t  *conn = rx->rx_conn;
 
-        LASSERT (payload_nob == 0 || payload_niov > 0);
-        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+        if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
+                /* Can't block if RDMA completions need normal credits */
+                LCONSOLE_ERROR("Dropping message from %s: no buffers free. "
+                               "%s is running an old version of LNET that may "
+                               "deadlock if messages wait for buffers\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                return -EDEADLK;
+        }
+        
+        *new_private = private;
+        return 0;
+}
 
-        /* Thread context if we're sending payload */
-        LASSERT (!in_interrupt() || payload_niov == 0);
-        /* payload is either all vaddrs or all pages */
-        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+int
+kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+             unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+             unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+        kib_rx_t    *rx = private;
+        kib_msg_t   *rxmsg = rx->rx_msg;
+        kib_conn_t  *conn = rx->rx_conn;
+        kib_tx_t    *tx;
+        kib_msg_t   *txmsg;
+        int          nob;
+        int          post_cred = 1;
+        int          rc = 0;
+        
+        LASSERT (mlen <= rlen);
+        LASSERT (!in_interrupt());
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
 
-        switch (type) {
+        switch (rxmsg->ibm_type) {
         default:
                 LBUG();
-                return (PTL_FAIL);
                 
-        case PTL_MSG_REPLY: {
-                /* reply's 'private' is the incoming receive */
-                kib_rx_t *rx = private;
-
-                /* RDMA reply expected? */
-                if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
-                        kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
-                                                 rx, libmsg, payload_niov, 
-                                                 payload_iov, payload_kiov,
-                                                 payload_offset, payload_nob);
-                        return (PTL_OK);
+        case IBNAL_MSG_IMMEDIATE:
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+                if (nob > rx->rx_nob) {
+                        CERROR ("Immediate message from %s too big: %d(%d)\n",
+                                libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
+                                nob, rx->rx_nob);
+                        rc = -EPROTO;
+                        break;
+                }
+
+                if (kiov != NULL)
+                        lnet_copy_flat2kiov(niov, kiov, offset,
+                                            IBNAL_MSG_SIZE, rxmsg,
+                                            offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                            mlen);
+                else
+                        lnet_copy_flat2iov(niov, iov, offset,
+                                           IBNAL_MSG_SIZE, rxmsg,
+                                           offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                           mlen);
+                lnet_finalize (ni, lntmsg, 0);
+                break;
+
+        case IBNAL_MSG_PUT_REQ:
+                if (mlen == 0) {
+                        lnet_finalize(ni, lntmsg, 0);
+                        kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0,
+                                               rxmsg->ibm_u.putreq.ibprm_cookie);
+                        break;
                 }
                 
-                /* Incoming message consistent with immediate reply? */
-                if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
-                        CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n",
-                                nid, rx->rx_msg->ibm_type);
-                        return (PTL_FAIL);
+                tx = kibnal_get_idle_tx();
+                if (tx == NULL) {
+                        CERROR("Can't allocate tx for %s\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        /* Not replying will break the connection */
+                        rc = -ENOMEM;
+                        break;
                 }
 
-                /* Will it fit in a message? */
-                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
-                if (nob >= IBNAL_MSG_SIZE) {
-                        CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", 
-                               nid, payload_nob);
-                        return (PTL_FAIL);
+                txmsg = tx->tx_msg;
+                if (kiov == NULL)
+                        rc = kibnal_setup_rd_iov(tx, 
+                                                 &txmsg->ibm_u.putack.ibpam_rd,
+                                                 0,
+                                                 niov, iov, offset, mlen);
+                else
+                        rc = kibnal_setup_rd_kiov(tx,
+                                                  &txmsg->ibm_u.putack.ibpam_rd,
+                                                  0,
+                                                  niov, kiov, offset, mlen);
+                if (rc != 0) {
+                        CERROR("Can't setup PUT sink for %s: %d\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+                        kibnal_tx_done(tx);
+                        /* tell peer it's over */
+                        kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, rc,
+                                               rxmsg->ibm_u.putreq.ibprm_cookie);
+                        break;
                 }
-                break;
-        }
 
-        case PTL_MSG_GET:
-                /* might the REPLY message be big enough to need RDMA? */
-                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
-                if (nob > IBNAL_MSG_SIZE)
-                        return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 
-                                                          nid, libmsg, hdr));
-                break;
+                txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+                txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
+#if IBNAL_USE_FMR
+                nob = sizeof(kib_putack_msg_t);
+#else
+                {
+                        int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
 
-        case PTL_MSG_ACK:
-                LASSERT (payload_nob == 0);
+                        nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+                }
+#endif
+                kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
+
+                tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
+                tx->tx_waiting = 1;             /* waiting for PUT_DONE */
+                kibnal_queue_tx(tx, conn);
+
+                if (conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD)
+                        post_cred = 0; /* peer still owns 'rx' for sending PUT_DONE */
                 break;
 
-        case PTL_MSG_PUT:
-                /* Is the payload big enough to need RDMA? */
-                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
-                if (nob > IBNAL_MSG_SIZE)
-                        return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
-                                                          nid, libmsg, hdr));
-                
+        case IBNAL_MSG_GET_REQ:
+                if (lntmsg != NULL) {
+                        /* Optimized GET; RDMA lntmsg's payload */
+                        kibnal_reply(ni, rx, lntmsg);
+                } else {
+                        /* GET didn't match anything */
+                        kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, 
+                                               -ENODATA,
+                                               rxmsg->ibm_u.get.ibgm_cookie);
+                }
                 break;
         }
 
-        tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
-                                  type == PTL_MSG_REPLY ||
-                                  in_interrupt()));
-        if (tx == NULL) {
-                CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", 
-                        type, nid, in_interrupt() ? " (intr)" : "");
-                return (PTL_NO_SPACE);
+        kibnal_post_rx(rx, post_cred, 0);
+        return rc;
+}
+
+int
+kibnal_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long    pid = kernel_thread (fn, arg, 0);
+
+        if (pid < 0)
+                return ((int)pid);
+
+        atomic_inc (&kibnal_data.kib_nthreads);
+        return (0);
+}
+
void
kibnal_thread_fini (void)
{
        /* Per-thread exit hook: balances the atomic_inc done in
         * kibnal_thread_start.  (kib_nthreads is presumably polled at
         * module shutdown — not visible in this file section.) */
        atomic_dec (&kibnal_data.kib_nthreads);
}
+
void
kibnal_peer_alive (kib_peer_t *peer)
{
        /* Record that 'peer' has just shown signs of life.
         * This is racy, but everyone's only writing cfs_time_current() */
        peer->ibp_last_alive = cfs_time_current();
        mb();   /* make the new timestamp visible to other CPUs promptly */
}
+
void
kibnal_peer_notify (kib_peer_t *peer)
{
        /* Tell LNet a peer has died, but only once it has no connections
         * and no connection attempts in flight.  The pending error is
         * snapshotted and consumed so LNet is notified at most once per
         * failure; lnet_notify() is called outside the lock. */
        time_t        last_alive = 0;
        int           error = 0;
        unsigned long flags;
        
        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        if (list_empty(&peer->ibp_conns) &&
            peer->ibp_accepting == 0 &&
            peer->ibp_connecting == 0 &&
            peer->ibp_error != 0) {
                /* NOTE(review): ibp_error is cleared here under a READ lock,
                 * not the write lock — looks racy if two notifiers run
                 * concurrently; confirm intended. */
                error = peer->ibp_error;
                peer->ibp_error = 0;
                /* convert "jiffies since last alive" to wall-clock seconds */
                last_alive = cfs_time_current_sec() -
                             cfs_duration_sec(cfs_time_current() -
                                              peer->ibp_last_alive);
        }
        
        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
        
        if (error != 0)
                lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive);
}
+
void
kibnal_schedule_conn (kib_conn_t *conn)
{
        /* Queue 'conn' for the connd thread and wake it.  Takes an extra
         * ref on conn on the connd's behalf (dropped when the connd is
         * done with it). */
        unsigned long flags;

        kibnal_conn_addref(conn);               /* ++ref for connd */
        
        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);

        list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
        wake_up (&kibnal_data.kib_connd_waitq);
                
        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
}
+
void
kibnal_close_conn_locked (kib_conn_t *conn, int error)
{
        /* This just does the immediate housekeeping to start shutdown of an
         * established connection.  'error' is zero for a normal shutdown.
         * Caller holds kib_global_lock exclusively in irq context */
        kib_peer_t       *peer = conn->ibc_peer;
        
        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);

        if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
                return; /* already being handled  */
        
        /* NB Can't take ibc_lock here (could be in IRQ context), without
         * risking deadlock, so access to ibc_{tx_queue,active_txs} is racey */

        if (error == 0 &&
            list_empty(&conn->ibc_tx_queue) &&
            list_empty(&conn->ibc_tx_queue_rsrvd) &&
            list_empty(&conn->ibc_tx_queue_nocred) &&
            list_empty(&conn->ibc_active_txs)) {
                /* clean shutdown with nothing outstanding: quiet debug only */
                CDEBUG(D_NET, "closing conn to %s"
                       " rx# "LPD64" tx# "LPD64"\n", 
                       libcfs_nid2str(peer->ibp_nid),
                       conn->ibc_txseq, conn->ibc_rxseq);
        } else {
                /* abnormal close (or txs still pending): log which queues
                 * were non-empty at the time */
                CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s"
                       " rx# "LPD64" tx# "LPD64"\n",
                       libcfs_nid2str(peer->ibp_nid), error,
                       list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
                       list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
                       list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
                       list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
                       conn->ibc_txseq, conn->ibc_rxseq);
#if 0
                /* can't skip down the queue without holding ibc_lock (see above) */
                list_for_each(tmp, &conn->ibc_tx_queue) {
                        kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
                        
                        CERROR("   queued tx type %x cookie "LPX64
                               " sending %d waiting %d ticks %ld/%d\n", 
                               tx->tx_msg->ibm_type, tx->tx_cookie, 
                               tx->tx_sending, tx->tx_waiting,
                               (long)(tx->tx_deadline - jiffies), HZ);
                }

                list_for_each(tmp, &conn->ibc_active_txs) {
                        kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
                        
                        CERROR("   active tx type %x cookie "LPX64
                               " sending %d waiting %d ticks %ld/%d\n", 
                               tx->tx_msg->ibm_type, tx->tx_cookie, 
                               tx->tx_sending, tx->tx_waiting,
                               (long)(tx->tx_deadline - jiffies), HZ);
                }
#endif
        }

        /* unhook from the peer's connection list */
        list_del (&conn->ibc_list);

        if (list_empty (&peer->ibp_conns)) {   /* no more conns */
                if (peer->ibp_persistence == 0 && /* non-persistent peer */
                    kibnal_peer_active(peer))     /* still in peer table */
                        kibnal_unlink_peer_locked (peer);

                peer->ibp_error = error; /* set/clear error on last conn */
        }

        kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTING);

        /* hand off to connd (takes its own ref), then drop the ref that
         * the peer's ibc_list held */
        kibnal_schedule_conn(conn);
        kibnal_conn_decref(conn);               /* lose ibc_list's ref */
}
 
-static ptl_err_t
-kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
-               ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
-               unsigned int payload_niov, struct iovec *payload_iov,
-               size_t payload_offset, size_t payload_len)
void
kibnal_close_conn (kib_conn_t *conn, int error)
{
        /* Locking wrapper: take the global lock exclusively (as
         * kibnal_close_conn_locked requires) and start shutdown of 'conn'.
         * 'error' is zero for a normal shutdown. */
        unsigned long flags;
        
        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        kibnal_close_conn_locked (conn, error);
        
        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
}
 
-static ptl_err_t
-kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
-                 unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
-                 size_t offset, size_t mlen, size_t rlen)
void
kibnal_handle_early_rxs(kib_conn_t *conn)
{
        /* Drain RXs that arrived before the connection was fully
         * established.  The global lock is dropped around each
         * kibnal_handle_rx() call (presumably because handling may sleep —
         * note the !in_interrupt() assert), so the list is re-examined from
         * its head on every iteration. */
        unsigned long    flags;
        kib_rx_t        *rx;

        LASSERT (!in_interrupt());
        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
        
        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
        while (!list_empty(&conn->ibc_early_rxs)) {
                rx = list_entry(conn->ibc_early_rxs.next,
                                kib_rx_t, rx_list);
                list_del(&rx->rx_list);
                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
                
                kibnal_handle_rx(rx);
                
                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
        }
        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
}
 
-                if (kiov != NULL)
-                        lib_copy_buf2kiov(niov, kiov, offset,
-                                          rxmsg->ibm_u.immediate.ibim_payload,
-                                          mlen);
-                else
-                        lib_copy_buf2iov(niov, iov, offset,
-                                         rxmsg->ibm_u.immediate.ibim_payload,
-                                         mlen);
void
kibnal_abort_txs(kib_conn_t *conn, struct list_head *txs)
{
        /* Fail every tx on 'txs' (one of conn's tx lists) with
         * -ECONNABORTED.  Txs whose send is still in flight
         * (tx_sending != 0) are marked failed but left on the list —
         * presumably finalised later by the send-completion path; TODO
         * confirm against the completion handler.  The rest are moved to a
         * private zombie list and completed after ibc_lock is dropped. */
        LIST_HEAD           (zombies); 
        struct list_head    *tmp;
        struct list_head    *nxt;
        kib_tx_t            *tx;

        spin_lock(&conn->ibc_lock);

        list_for_each_safe (tmp, nxt, txs) {
                tx = list_entry (tmp, kib_tx_t, tx_list);

                if (txs == &conn->ibc_active_txs) {
                        /* active txs have been posted: no longer queued, but
                         * waiting for a reply and/or a send completion */
                        LASSERT (!tx->tx_queued);
                        LASSERT (tx->tx_waiting || tx->tx_sending != 0);
                } else {
                        LASSERT (tx->tx_queued);
                }
                
                tx->tx_status = -ECONNABORTED;
                tx->tx_queued = 0;
                tx->tx_waiting = 0;
                
                if (tx->tx_sending == 0) {
                        list_del (&tx->tx_list);
                        list_add (&tx->tx_list, &zombies);
                }
        }

        spin_unlock(&conn->ibc_lock);

        /* complete the zombies now ibc_lock is dropped */
        kibnal_txlist_done(&zombies, -ECONNABORTED);
}
 
-static ptl_err_t
-kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
-                     unsigned int niov, ptl_kiov_t *kiov, 
-                     size_t offset, size_t mlen, size_t rlen)
+void
+kibnal_conn_disconnected(kib_conn_t *conn)
 {
-        return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
-                                offset, mlen, rlen));
-}
+        static IB_QP_ATTRIBUTES_MODIFY qpam = {.RequestState = QPStateError};
 
-/*****************************************************************************
- * the rest of this file concerns connection management.  active connetions
- * start with connect_peer, passive connections start with passive_callback.
- * active disconnects start with conn_close, cm_callback starts passive
- * disconnects and contains the guts of how the disconnect state machine
- * progresses. 
- *****************************************************************************/
+        FSTATUS           frc;
 
-int
-kibnal_thread_start (int (*fn)(void *arg), void *arg)
-{
-        long    pid = kernel_thread (fn, arg, 0);
+        LASSERT (conn->ibc_state >= IBNAL_CONN_INIT_QP);
 
-        if (pid < 0)
-                return ((int)pid);
+        kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
 
-        atomic_inc (&kibnal_data.kib_nthreads);
-        return (0);
-}
+        /* move QP to error state to make posted work items complete */
+        frc = iba_modify_qp(conn->ibc_qp, &qpam, NULL);
+        if (frc != FSUCCESS)
+                CERROR("can't move qp state to error: %d\n", frc);
 
-static void
-kibnal_thread_fini (void)
-{
-        atomic_dec (&kibnal_data.kib_nthreads);
+        /* Complete all tx descs not waiting for sends to complete.
+         * NB we should be safe from RDMA now that the QP has changed state */
+
+        kibnal_abort_txs(conn, &conn->ibc_tx_queue);
+        kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
+        kibnal_abort_txs(conn, &conn->ibc_tx_queue);
+        kibnal_abort_txs(conn, &conn->ibc_active_txs);
+
+        kibnal_handle_early_rxs(conn);
 }
 
-/* this can be called by anyone at any time to close a connection.  if
- * the connection is still established it heads to the connd to start
- * the disconnection in a safe context.  It has no effect if called
- * on a connection that is already disconnecting */
void
kibnal_peer_connect_failed (kib_peer_t *peer, int type, int error)
{
        /* Account for a failed connection attempt of the given 'type'
         * (active / passive / waiting).  If this was the peer's last attempt
         * in flight and it has no connections, apply exponential reconnect
         * backoff, fail all of its blocked txs with -EHOSTUNREACH, and
         * notify LNet via kibnal_peer_notify(). */
        LIST_HEAD        (zombies);
        unsigned long     flags;

        LASSERT (error != 0);
        LASSERT (!in_interrupt());

        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);

        LASSERT (kibnal_peer_connecting(peer));

        switch (type) {
        case IBNAL_CONN_ACTIVE:
                LASSERT (peer->ibp_connecting > 0);
                peer->ibp_connecting--;
                break;
                
        case IBNAL_CONN_PASSIVE:
                LASSERT (peer->ibp_accepting > 0);
                peer->ibp_accepting--;
                break;
                
        case IBNAL_CONN_WAITING:
                /* Can't assert; I might be racing with a successful connection
                 * which clears passivewait */
                peer->ibp_passivewait = 0;
                break;
        default:
                LBUG();
        }

        if (kibnal_peer_connecting(peer) ||     /* another attempt underway */
            !list_empty(&peer->ibp_conns)) {    /* got connected */
                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
                return;
        }

        /* Say when active connection can be re-attempted: double the
         * interval, clamped to [min, max] tunables */
        peer->ibp_reconnect_interval *= 2;
        peer->ibp_reconnect_interval =
                MAX(peer->ibp_reconnect_interval,
                    *kibnal_tunables.kib_min_reconnect_interval);
        peer->ibp_reconnect_interval =
                MIN(peer->ibp_reconnect_interval,
                    *kibnal_tunables.kib_max_reconnect_interval);
        
        peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval * HZ;

        /* Take peer's blocked transmits to complete with error.
         * (list-splice idiom: insert 'zombies' into the queue's ring, then
         * unhook the old head, leaving the elements threaded on 'zombies') */
        list_add(&zombies, &peer->ibp_tx_queue);
        list_del_init(&peer->ibp_tx_queue);
                
        if (kibnal_peer_active(peer) &&
            peer->ibp_persistence == 0) {
                /* failed connection attempt on non-persistent peer */
                kibnal_unlink_peer_locked (peer);
        }

        peer->ibp_error = error;
        
        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);

        kibnal_peer_notify(peer);

        if (list_empty (&zombies))
                return;
        
        CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n",
                libcfs_nid2str(peer->ibp_nid));

        kibnal_txlist_done (&zombies, -EHOSTUNREACH);
}
 
void
kibnal_connreq_done (kib_conn_t *conn, int type, int status)
{
        /* Complete a connection attempt ('type' is active or passive).
         * status != 0: record the failure and tear the conn down.
         * status == 0: promote conn to ESTABLISHED, attach it to the peer,
         * and move the peer's blocked txs onto the new connection. */
        kib_peer_t       *peer = conn->ibc_peer;
        struct list_head  txs;
        kib_tx_t         *tx;
        unsigned long     flags;

        LASSERT (!in_interrupt());
        LASSERT (type == IBNAL_CONN_ACTIVE || type == IBNAL_CONN_PASSIVE);
        LASSERT (conn->ibc_state >= IBNAL_CONN_INIT_QP);
        LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
        LASSERT (kibnal_peer_connecting(peer));

        /* connection-establishment scratch state is no longer needed */
        LIBCFS_FREE(conn->ibc_cvars, sizeof(*conn->ibc_cvars));
        conn->ibc_cvars = NULL;

        if (status != 0) {
                /* failed to establish connection */
                kibnal_peer_connect_failed(conn->ibc_peer, type, status);
                kibnal_conn_disconnected(conn);
                kibnal_conn_decref(conn);       /* Lose CM's ref */
                return;
        }

        /* connection established */
        LASSERT(conn->ibc_state == IBNAL_CONN_CONNECTING);

        conn->ibc_last_send = jiffies;
        kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED);
        kibnal_peer_alive(peer);

        CDEBUG(D_NET, "Connection %s ESTABLISHED\n",
               libcfs_nid2str(conn->ibc_peer->ibp_nid));

        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
        
        peer->ibp_passivewait = 0;              /* not waiting (got conn now) */
        kibnal_conn_addref(conn);               /* +1 ref for ibc_list */
        list_add_tail(&conn->ibc_list, &peer->ibp_conns);
        
        if (!kibnal_peer_active(peer)) {
                /* peer has been deleted */
                kibnal_close_conn_locked(conn, -ECONNABORTED);
                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);

                kibnal_peer_connect_failed(conn->ibc_peer, type, -ECONNABORTED);
                kibnal_conn_decref(conn);       /* lose CM's ref */
                return;
        }
        
        /* drop the attempt-in-flight count now the conn is attached */
        switch (type) {
        case IBNAL_CONN_ACTIVE:
                LASSERT (peer->ibp_connecting > 0);
                peer->ibp_connecting--;
                break;

        case IBNAL_CONN_PASSIVE:
                LASSERT (peer->ibp_accepting > 0);
                peer->ibp_accepting--;
                break;
        default:
                LBUG();
        }
        
        peer->ibp_reconnect_interval = 0;       /* OK to reconnect at any time */

        /* Nuke any dangling conns from a different peer instance... */
        kibnal_close_stale_conns_locked(peer, conn->ibc_incarnation);

        /* grab txs blocking for a conn (list-splice idiom: 'txs' becomes the
         * new head of the queued elements) */
        list_add(&txs, &peer->ibp_tx_queue);
        list_del_init(&peer->ibp_tx_queue);

        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
        
        /* Schedule blocked txs */
        spin_lock (&conn->ibc_lock);
        while (!list_empty (&txs)) {
                tx = list_entry (txs.next, kib_tx_t, tx_list);
                list_del (&tx->tx_list);

                kibnal_queue_tx_locked (tx, conn);
        }
        spin_unlock (&conn->ibc_lock);
        kibnal_check_sends (conn);
}
 
-static void
-kibnal_connreq_done (kib_conn_t *conn, int active, int status)
+void
+kibnal_reject (lnet_nid_t nid, IB_HANDLE cep, int why)
 {
-        int               state = conn->ibc_state;
-        kib_peer_t       *peer = conn->ibc_peer;
-        kib_tx_t         *tx;
-        unsigned long     flags;
-        int               i;
-
-        /* passive connection has no connreq & vice versa */
-        LASSERTF(!active == !(conn->ibc_connreq != NULL),
-                 "%d %p\n", active, conn->ibc_connreq);
-        if (active) {
-                PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
-                conn->ibc_connreq = NULL;
-        }
-
-        write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+        static CM_REJECT_INFO  msgs[3];
+        CM_REJECT_INFO        *msg = &msgs[why];
+        FSTATUS                frc;
+
+        LASSERT (why >= 0 && why < sizeof(msgs)/sizeof(msgs[0]));
+
+        /* If I wasn't so lazy, I'd initialise this only once; it's effectively
+         * read-only... */
+        msg->Reason         = RC_USER_REJ;
+        msg->PrivateData[0] = (IBNAL_MSG_MAGIC) & 0xff;
+        msg->PrivateData[1] = (IBNAL_MSG_MAGIC >> 8) & 0xff;
+        msg->PrivateData[2] = (IBNAL_MSG_MAGIC >> 16) & 0xff;
+        msg->PrivateData[3] = (IBNAL_MSG_MAGIC >> 24) & 0xff;
+        msg->PrivateData[4] = (IBNAL_MSG_VERSION) & 0xff;
+        msg->PrivateData[5] = (IBNAL_MSG_VERSION >> 8) & 0xff;
+        msg->PrivateData[6] = why;
+
+        frc = iba_cm_reject(cep, msg);
+        if (frc != FSUCCESS)
+                CERROR("Error %d rejecting %s\n", frc, libcfs_nid2str(nid));
+}
 
-        LASSERT (peer->ibp_connecting != 0);
-        
-        if (status == 0) {                         
-                /* connection established... */
-                KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING);
-                conn->ibc_state = IBNAL_CONN_ESTABLISHED;
+void
+kibnal_check_connreject(kib_conn_t *conn, int type, CM_REJECT_INFO *rej)
+{
+        kib_peer_t    *peer = conn->ibc_peer;
+        unsigned long  flags;
+        int            magic;
+        int            version;
+        int            why;
+
+        LASSERT (type == IBNAL_CONN_ACTIVE ||
+                 type == IBNAL_CONN_PASSIVE);
+
+        CDEBUG(D_NET, "%s connection with %s rejected: %d\n",
+               (type == IBNAL_CONN_ACTIVE) ? "Active" : "Passive",
+               libcfs_nid2str(peer->ibp_nid), rej->Reason);
+
+        switch (rej->Reason) {
+        case RC_STALE_CONN:
+                if (type == IBNAL_CONN_PASSIVE) {
+                        CERROR("Connection to %s rejected (stale QP)\n",
+                               libcfs_nid2str(peer->ibp_nid));
+                } else {
+                        CWARN("Connection from %s rejected (stale QP): "
+                              "retrying...\n", libcfs_nid2str(peer->ibp_nid));
 
-                if (!kibnal_peer_active(peer)) {
-                        /* ...but peer deleted meantime */
-                        status = -ECONNABORTED;
+                        /* retry from scratch to allocate a new conn 
+                         * which will use a different QP */
+                        kibnal_schedule_active_connect(peer, peer->ibp_version);
                 }
-        } else {
-                KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP,
-                                            IBNAL_CONN_CONNECTING);
-        }
 
-        if (status == 0) {
-                /* Everything worked! */
-
-                peer->ibp_connecting--;
+                /* An FCM_DISCONNECTED callback is still outstanding: give it a
+                 * ref since kibnal_connreq_done() drops the CM's ref on conn
+                 * on failure */
+                kibnal_conn_addref(conn);
+                break;
 
-                /* +1 ref for ibc_list; caller(== CM)'s ref remains until
-                 * the IB_CM_IDLE callback */
-                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                       atomic_read (&conn->ibc_refcount));
-                atomic_inc (&conn->ibc_refcount);
-                list_add (&conn->ibc_list, &peer->ibp_conns);
-                
-                /* reset reconnect interval for next attempt */
-                peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+        case RC_USER_REJ:
+                magic   = (rej->PrivateData[0]) |
+                          (rej->PrivateData[1] << 8) |
+                          (rej->PrivateData[2] << 16) |
+                          (rej->PrivateData[3] << 24);
+                version = (rej->PrivateData[4]) |
+                          (rej->PrivateData[5] << 8);
+                why     = (rej->PrivateData[6]);
+
+                /* retry with old proto version */
+                if (magic == IBNAL_MSG_MAGIC &&
+                    version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
+                    conn->ibc_version == IBNAL_MSG_VERSION &&
+                    type != IBNAL_CONN_PASSIVE) {
+                        /* retry with a new conn */
+                        CWARN ("Connection to %s refused: "
+                               "retrying with old protocol version 0x%x\n", 
+                               libcfs_nid2str(peer->ibp_nid), version);
+                        kibnal_schedule_active_connect(peer, version);
+                        break;
+                }
 
-                /* post blocked sends to the new connection */
-                spin_lock (&conn->ibc_lock);
-                
-                while (!list_empty (&peer->ibp_tx_queue)) {
-                        tx = list_entry (peer->ibp_tx_queue.next, 
-                                         kib_tx_t, tx_list);
-                        
-                        list_del (&tx->tx_list);
+                if (magic != IBNAL_MSG_MAGIC ||
+                    version != IBNAL_MSG_VERSION) {
+                        CERROR("%s connection with %s rejected "
+                               "(magic/ver %08x/%d why %d): "
+                               "incompatible protocol\n",
+                               (type == IBNAL_CONN_ACTIVE) ?
+                               "Active" : "Passive",
+                               libcfs_nid2str(peer->ibp_nid),
+                               magic, version, why);
+                        break;
+                }
 
-                        /* +1 ref for each tx */
-                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                               atomic_read (&conn->ibc_refcount));
-                        atomic_inc (&conn->ibc_refcount);
-                        kibnal_queue_tx_locked (tx, conn);
+                if (type == IBNAL_CONN_ACTIVE && 
+                    why == IBNAL_REJECT_CONN_RACE) {
+                        /* lost connection race */
+                        CWARN("Connection to %s rejected: "
+                              "lost connection race\n",
+                              libcfs_nid2str(peer->ibp_nid));
+
+                        write_lock_irqsave(&kibnal_data.kib_global_lock, 
+                                           flags);
+
+                        if (list_empty(&peer->ibp_conns)) {
+                                peer->ibp_passivewait = 1;
+                                peer->ibp_passivewait_deadline =
+                                        jiffies + 
+                                        (*kibnal_tunables.kib_timeout * HZ);
+                        }
+                        write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
+                                                flags);
+                        break;
                 }
-                
-                spin_unlock (&conn->ibc_lock);
 
-                /* Nuke any dangling conns from a different peer instance... */
-                kibnal_close_stale_conns_locked (conn->ibc_peer,
-                                                 conn->ibc_incarnation);
+                CERROR("%s connection with %s rejected: %d\n",
+                       (type == IBNAL_CONN_ACTIVE) ? "Active" : "Passive",
+                       libcfs_nid2str(peer->ibp_nid), why);
+                break;
 
-                write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+        default:
+                CERROR("%s connection with %s rejected: %d\n",
+                       (type == IBNAL_CONN_ACTIVE) ? "Active" : "Passive",
+                       libcfs_nid2str(peer->ibp_nid), rej->Reason);
+        }
+        
+        kibnal_connreq_done(conn, type, -ECONNREFUSED);
+}
 
-                /* queue up all the receives */
-                for (i = 0; i < IBNAL_RX_MSGS; i++) {
-                        /* +1 ref for rx desc */
-                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                               atomic_read (&conn->ibc_refcount));
-                        atomic_inc (&conn->ibc_refcount);
+void
+kibnal_cm_disconnect_callback(kib_conn_t *conn, CM_CONN_INFO *info)
+{
+        CDEBUG(D_NET, "%s: state %d, status 0x%x\n", 
+               libcfs_nid2str(conn->ibc_peer->ibp_nid),
+               conn->ibc_state, info->Status);
+        
+        LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
 
-                        CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n",
-                               i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
-                               conn->ibc_rxs[i].rx_vaddr);
+        switch (info->Status) {
+        default:
+                LBUG();
+                break;
 
-                        kibnal_post_rx (&conn->ibc_rxs[i], 0);
-                }
+        case FCM_DISCONNECT_REQUEST:
+                /* Schedule conn to iba_cm_disconnect() if it wasn't already */
+                kibnal_close_conn (conn, 0);
+                break;
 
-                kibnal_check_sends (conn);
-                return;
+        case FCM_DISCONNECT_REPLY:              /* peer acks my disconnect req */
+        case FCM_DISCONNECTED:                  /* end of TIME_WAIT */
+                CDEBUG(D_NET, "Connection %s disconnected.\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                kibnal_conn_decref(conn);       /* Lose CM's ref */
+                break;
         }
+}
 
-        /* connection failed */
-        if (state == IBNAL_CONN_CONNECTING) {
-                /* schedule for connd to close */
-                kibnal_close_conn_locked (conn, status);
-        } else {
-                /* Don't have a CM comm_id; just wait for refs to drain */
-                conn->ibc_state = IBNAL_CONN_DISCONNECTED;
-        } 
+void
+kibnal_cm_passive_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+        kib_conn_t       *conn = arg;
 
-        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+        CDEBUG(D_NET, "status 0x%x\n", info->Status);
+
+        /* Established Connection Notifier */
+        switch (info->Status) {
+        default:
+                CERROR("Unexpected status %d on Connection %s\n",
+                       info->Status, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                LBUG();
+                break;
+
+        case FCM_CONNECT_TIMEOUT:
+                kibnal_connreq_done(conn, IBNAL_CONN_PASSIVE, -ETIMEDOUT);
+                break;
+                
+        case FCM_CONNECT_REJECT:
+                kibnal_check_connreject(conn, IBNAL_CONN_PASSIVE, 
+                                        &info->Info.Reject);
+                break;
 
-        kibnal_peer_connect_failed (conn->ibc_peer, active, status);
+        case FCM_CONNECT_ESTABLISHED:
+                kibnal_connreq_done(conn, IBNAL_CONN_PASSIVE, 0);
+                break;
 
-        /* If we didn't establish the connection we don't have to pass
-         * through the disconnect protocol before dropping the CM ref */
-        if (state < IBNAL_CONN_CONNECTING) 
-                kibnal_put_conn (conn);
+        case FCM_DISCONNECT_REQUEST:
+        case FCM_DISCONNECT_REPLY:
+        case FCM_DISCONNECTED:
+                kibnal_cm_disconnect_callback(conn, info);
+                break;
+        }
 }
 
-static int
-kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep,
-                ptl_nid_t nid, __u64 incarnation, int queue_depth)
+int
+kibnal_accept (kib_conn_t **connp, IB_HANDLE cep, kib_msg_t *msg, int nob)
 {
-        kib_conn_t    *conn = kibnal_create_conn();
+        lnet_nid_t     nid;
+        kib_conn_t    *conn;
         kib_peer_t    *peer;
         kib_peer_t    *peer2;
         unsigned long  flags;
+        int            rc;
+
+        rc = kibnal_unpack_msg(msg, 0, nob);
+        if (rc != 0) {
+                /* SILENT! kibnal_unpack_msg() complains if required */
+                kibnal_reject(LNET_NID_ANY, cep, IBNAL_REJECT_FATAL);
+                return -EPROTO;
+        }
+
+        nid = msg->ibm_srcnid;
 
-        if (conn == NULL)
-                return (-ENOMEM);
+        if (msg->ibm_version != IBNAL_MSG_VERSION)
+                CWARN("Connection from %s: old protocol version 0x%x\n",
+                      libcfs_nid2str(nid), msg->ibm_version);
 
-        if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
-                CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
-                       nid, queue_depth, IBNAL_MSG_QUEUE_SIZE);
-                atomic_dec (&conn->ibc_refcount);
-                kibnal_destroy_conn(conn);
-                return (-EPROTO);
+        if (msg->ibm_type != IBNAL_MSG_CONNREQ) {
+                CERROR("Can't accept %s: bad request type %d (%d expected)\n",
+                       libcfs_nid2str(nid), msg->ibm_type, IBNAL_MSG_CONNREQ);
+                kibnal_reject(nid, cep, IBNAL_REJECT_FATAL);
+                return -EPROTO;
+        }
+        
+        if (msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid) {
+                CERROR("Can't accept %s: bad dst NID %s (%s expected)\n",
+                       libcfs_nid2str(nid), 
+                       libcfs_nid2str(msg->ibm_dstnid), 
+                       libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
+                kibnal_reject(nid, cep, IBNAL_REJECT_FATAL);
+                return -EPROTO;
+        }
+        
+        if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE ||
+            msg->ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE ||
+            msg->ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
+                CERROR("Reject %s: q %d sz %d frag %d, (%d %d %d expected)\n",
+                       libcfs_nid2str(nid), 
+                       msg->ibm_u.connparams.ibcp_queue_depth,
+                       msg->ibm_u.connparams.ibcp_max_msg_size,
+                       msg->ibm_u.connparams.ibcp_max_frags,
+                       IBNAL_MSG_QUEUE_SIZE,
+                       IBNAL_MSG_SIZE,
+                       IBNAL_MAX_RDMA_FRAGS);
+                kibnal_reject(nid, cep, IBNAL_REJECT_FATAL);
+                return -EPROTO;
+        }
+
+        conn = kibnal_create_conn(nid, msg->ibm_version);
+        if (conn == NULL) {
+                kibnal_reject(nid, cep, IBNAL_REJECT_NO_RESOURCES);
+                return -ENOMEM;
         }
         
         /* assume 'nid' is a new peer */
-        peer = kibnal_create_peer (nid);
-        if (peer == NULL) {
-                CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
-                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                       atomic_read (&conn->ibc_refcount));
-                atomic_dec (&conn->ibc_refcount);
-                kibnal_destroy_conn(conn);
-                return (-ENOMEM);
+        rc = kibnal_create_peer(&peer, nid);
+        if (rc != 0) {
+                kibnal_conn_decref(conn);
+                kibnal_reject(nid, cep, IBNAL_REJECT_NO_RESOURCES);
+                return -ENOMEM;
         }
         
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
@@ -2020,456 +2510,253 @@ kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep,
         if (peer2 == NULL) {
                 /* peer table takes my ref on peer */
                 list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid));
+                LASSERT (peer->ibp_connecting == 0);
         } else {
-                kib_peer_decref (peer);
+                kibnal_peer_decref(peer);
                 peer = peer2;
-        }
 
-        kib_peer_addref(peer); /* +1 ref for conn */
-        peer->ibp_connecting++;
+                if (peer->ibp_connecting != 0 &&
+                    peer->ibp_nid < kibnal_data.kib_ni->ni_nid) {
+                        /* Resolve concurrent connection attempts in favour of
+                         * the higher NID */
+                        write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
+                                                flags);
+                        kibnal_conn_decref(conn);
+                        kibnal_reject(nid, cep, IBNAL_REJECT_CONN_RACE);
+                        return -EALREADY;
+                }
+        }
 
-        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+        kibnal_peer_addref(peer); /* +1 ref for conn */
+        peer->ibp_accepting++;
 
+        kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING);
         conn->ibc_peer = peer;
-        conn->ibc_state = IBNAL_CONN_CONNECTING;
-        /* conn->ibc_cep is set when cm_accept is called */
-        conn->ibc_incarnation = incarnation;
+        conn->ibc_incarnation = msg->ibm_srcstamp;
         conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+        conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
+        LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
+                 <= IBNAL_RX_MSGS);
 
-        *connp = conn;
-        return (0);
-}
-
-static void kibnal_set_qp_state(IB_HANDLE *qp, IB_QP_STATE state)
-{
-        IB_QP_ATTRIBUTES_MODIFY modify_attr = {0,};
-        FSTATUS frc;
-
-        modify_attr.RequestState = state;
+        write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
-        frc = iibt_qp_modify(qp, &modify_attr, NULL);
-        if (frc != FSUCCESS)
-                CERROR("couldn't set qp state to %d, error %d\n", state, frc);
+        *connp = conn;
+        return 0;
 }
 
-static void kibnal_flush_pending(kib_conn_t *conn)
+void
+kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
 {
-        LIST_HEAD        (zombies); 
-        struct list_head *tmp;
-        struct list_head *nxt;
-        kib_tx_t         *tx;
-        unsigned long     flags;
-        int               done;
-
-        /* NB we wait until the connection has closed before completing
-         * outstanding passive RDMAs so we can be sure the network can't 
-         * touch the mapped memory any more. */
-        KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED);
-
-        /* set the QP to the error state so that we get flush callbacks
-         * on our posted receives which can then drop their conn refs */
-        kibnal_set_qp_state(conn->ibc_qp, QPStateError);
-
-        spin_lock_irqsave (&conn->ibc_lock, flags);
-
-        /* grab passive RDMAs not waiting for the tx callback */
-        list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
-                tx = list_entry (tmp, kib_tx_t, tx_list);
-
-                LASSERT (tx->tx_passive_rdma ||
-                         !tx->tx_passive_rdma_wait);
-
-                LASSERT (tx->tx_passive_rdma_wait ||
-                         tx->tx_sending != 0);
 
-                /* still waiting for tx callback? */
-                if (!tx->tx_passive_rdma_wait)
-                        continue;
-
-                tx->tx_status = -ECONNABORTED;
-                tx->tx_passive_rdma_wait = 0;
-                done = (tx->tx_sending == 0);
-
-                if (!done)
-                        continue;
-
-                list_del (&tx->tx_list);
-                list_add (&tx->tx_list, &zombies);
-        }
+        CM_REQUEST_INFO  *req = &info->Info.Request;
+        CM_REPLY_INFO    *rep;
+        kib_conn_t       *conn;
+        FSTATUS           frc;
+        int               rc;
+        
+        LASSERT(arg == NULL); /* no conn yet for passive */
 
-        /* grab all blocked transmits */
-        list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
-                tx = list_entry (tmp, kib_tx_t, tx_list);
-                
-                list_del (&tx->tx_list);
-                list_add (&tx->tx_list, &zombies);
+        CDEBUG(D_NET, "%x\n", info->Status);
+        
+        if (info->Status == FCM_CONNECT_CANCEL) {
+                up(&kibnal_data.kib_listener_signal);
+                return;
         }
         
-        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+        LASSERT (info->Status == FCM_CONNECT_REQUEST);
 
-        while (!list_empty(&zombies)) {
-                tx = list_entry (zombies.next, kib_tx_t, tx_list);
+        rc = kibnal_accept(&conn, cep, (kib_msg_t *)req->PrivateData, 
+                           CM_REQUEST_INFO_USER_LEN);
+        if (rc != 0)                   /* kibnal_accept has rejected */
+                return;
 
-                list_del(&tx->tx_list);
-                kibnal_tx_done (tx);
+        conn->ibc_cvars->cv_path = req->PathInfo.Path;
+        
+        rc = kibnal_conn_rts(conn, 
+                             req->CEPInfo.QPN, 
+                             req->CEPInfo.OfferedInitiatorDepth,
+                             req->CEPInfo.OfferedResponderResources,
+                             req->CEPInfo.StartingPSN);
+        if (rc != 0) {
+                kibnal_reject(conn->ibc_peer->ibp_nid, cep, 
+                              IBNAL_REJECT_NO_RESOURCES);
+                kibnal_connreq_done(conn, IBNAL_CONN_PASSIVE, -ECONNABORTED);
+                return;
         }
-}
 
-static void
-kibnal_reject (IB_HANDLE cep, uint16_t reason)
-{
-        CM_REJECT_INFO *rej;
+        memset(&conn->ibc_cvars->cv_cmci, 0, sizeof(conn->ibc_cvars->cv_cmci));
+        rep = &conn->ibc_cvars->cv_cmci.Info.Reply;
 
-        PORTAL_ALLOC(rej, sizeof(*rej));
-        if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */
-                return;  
+        rep->QPN                   = conn->ibc_cvars->cv_qpattrs.QPNumber;
+        rep->QKey                  = conn->ibc_cvars->cv_qpattrs.Qkey;
+        rep->StartingPSN           = conn->ibc_cvars->cv_qpattrs.RecvPSN;
+        rep->EndToEndFlowControl   = conn->ibc_cvars->cv_qpattrs.FlowControl;
+        rep->ArbInitiatorDepth     = conn->ibc_cvars->cv_qpattrs.InitiatorDepth;
+        rep->ArbResponderResources = conn->ibc_cvars->cv_qpattrs.ResponderResources;
+        rep->TargetAckDelay        = kibnal_data.kib_hca_attrs.LocalCaAckDelay;
+        rep->FailoverAccepted      = IBNAL_FAILOVER_ACCEPTED;
+        rep->RnRRetryCount         = req->CEPInfo.RnrRetryCount;
+        
+        CLASSERT (CM_REPLY_INFO_USER_LEN >=
+                  offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t));
 
-        rej->Reason = reason;
-        iibt_cm_reject(cep, rej);
-        PORTAL_FREE(rej, sizeof(*rej));
-}
+        kibnal_pack_connmsg((kib_msg_t *)rep->PrivateData,
+                            conn->ibc_version,
+                            CM_REPLY_INFO_USER_LEN,
+                            IBNAL_MSG_CONNACK,
+                            conn->ibc_peer->ibp_nid, conn->ibc_incarnation);
 
-static FSTATUS
-kibnal_qp_rts(IB_HANDLE qp_handle, __u32 qpn, __u8 resp_res, 
-              IB_PATH_RECORD *path, __u8 init_depth, __u32 send_psn)
-{
-        IB_QP_ATTRIBUTES_MODIFY modify_attr;
-        FSTATUS frc;
-        ENTRY;
+        LASSERT (conn->ibc_cep == NULL);
+        kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING);
 
-        modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
-                .RequestState           = QPStateReadyToRecv,
-                .RecvPSN                = IBNAL_STARTING_PSN,
-                .DestQPNumber           = qpn,
-                .ResponderResources     = resp_res,
-                .MinRnrTimer            = UsecToRnrNakTimer(2000), /* 20 ms */
-                .Attrs                  = (IB_QP_ATTR_RECVPSN |
-                                           IB_QP_ATTR_DESTQPNUMBER | 
-                                           IB_QP_ATTR_RESPONDERRESOURCES | 
-                                           IB_QP_ATTR_DESTAV | 
-                                           IB_QP_ATTR_PATHMTU | 
-                                           IB_QP_ATTR_MINRNRTIMER),
-        };
-        GetAVFromPath(0, path, &modify_attr.PathMTU, NULL, 
-                      &modify_attr.DestAV);
-
-        frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
-        if (frc != FSUCCESS) 
-                RETURN(frc);
-
-        modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
-                .RequestState           = QPStateReadyToSend,
-                .FlowControl            = TRUE,
-                .InitiatorDepth         = init_depth,
-                .SendPSN                = send_psn,
-                .LocalAckTimeout        = path->PktLifeTime + 2, /* 2 or 1? */
-                .RetryCount             = IBNAL_RETRY,
-                .RnrRetryCount          = IBNAL_RNR_RETRY,
-                .Attrs                  = (IB_QP_ATTR_FLOWCONTROL | 
-                                           IB_QP_ATTR_INITIATORDEPTH | 
-                                           IB_QP_ATTR_SENDPSN | 
-                                           IB_QP_ATTR_LOCALACKTIMEOUT | 
-                                           IB_QP_ATTR_RETRYCOUNT | 
-                                           IB_QP_ATTR_RNRRETRYCOUNT),
-        };
+        frc = iba_cm_accept(cep, 
+                            &conn->ibc_cvars->cv_cmci,
+                            NULL,
+                            kibnal_cm_passive_callback, conn, 
+                            &conn->ibc_cep);
 
-        frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
-        RETURN(frc);
+        if (frc == FSUCCESS || frc == FPENDING)
+                return;
+        
+        CERROR("iba_cm_accept(%s) failed: %d\n", 
+               libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+        kibnal_connreq_done(conn, IBNAL_CONN_PASSIVE, -ECONNABORTED);
 }
 
-static void
-kibnal_connect_reply (IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+void
+kibnal_check_connreply(kib_conn_t *conn, CM_REPLY_INFO *rep)
 {
-        IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
-        kib_conn_t *conn = arg;
-        kib_wire_connreq_t *wcr;
-        CM_REPLY_INFO *rep = &info->Info.Reply;
-        uint16_t reason;
-        FSTATUS frc;
+        kib_msg_t   *msg = (kib_msg_t *)rep->PrivateData;
+        lnet_nid_t   nid = conn->ibc_peer->ibp_nid;
+        FSTATUS      frc;
+        int          rc;
 
-        wcr = (kib_wire_connreq_t *)info->Info.Reply.PrivateData;
+        rc = kibnal_unpack_msg(msg, conn->ibc_version, CM_REPLY_INFO_USER_LEN);
+        if (rc != 0) {
+                CERROR ("Error %d unpacking connack from %s\n",
+                        rc, libcfs_nid2str(nid));
+                kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_FATAL);
+                kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EPROTO);
+                return;
+        }
+                        
+        if (msg->ibm_type != IBNAL_MSG_CONNACK) {
+                CERROR("Bad connack request type %d (%d expected) from %s\n",
+                       msg->ibm_type, IBNAL_MSG_CONNREQ,
+                       libcfs_nid2str(msg->ibm_srcnid));
+                kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_FATAL);
+                kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EPROTO);
+                return;
+        }
 
-        if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
-                CERROR ("Can't connect "LPX64": bad magic %08x\n",
-                        conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic));
-                GOTO(reject, reason = RC_USER_REJ);
+        if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+            msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid ||
+            msg->ibm_dststamp != kibnal_data.kib_incarnation) {
+                CERROR("Stale connack from %s(%s): %s(%s), "LPX64"("LPX64")\n",
+                       libcfs_nid2str(msg->ibm_srcnid), 
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                       libcfs_nid2str(msg->ibm_dstnid),
+                       libcfs_nid2str(kibnal_data.kib_ni->ni_nid),
+                       msg->ibm_dststamp, kibnal_data.kib_incarnation);
+                kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_FATAL);
+                kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -ESTALE);
+                return;
         }
         
-        if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
-                CERROR ("Can't connect "LPX64": bad version %d\n",
-                        conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_magic));
-                GOTO(reject, reason = RC_USER_REJ);
-        }
-                        
-        if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
-                CERROR ("Can't connect "LPX64": bad queue depth %d\n",
-                        conn->ibc_peer->ibp_nid, 
-                        le16_to_cpu(wcr->wcr_queue_depth));
-                GOTO(reject, reason = RC_USER_REJ);
+        if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE ||
+            msg->ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE ||
+            msg->ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
+                CERROR("Reject %s: q %d sz %d frag %d, (%d %d %d expected)\n",
+                       libcfs_nid2str(msg->ibm_srcnid), 
+                       msg->ibm_u.connparams.ibcp_queue_depth,
+                       msg->ibm_u.connparams.ibcp_max_msg_size,
+                       msg->ibm_u.connparams.ibcp_max_frags,
+                       IBNAL_MSG_QUEUE_SIZE,
+                       IBNAL_MSG_SIZE,
+                       IBNAL_MAX_RDMA_FRAGS);
+                kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_FATAL);
+                kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EPROTO);
+                return;
         }
                         
-        if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
-                CERROR ("Unexpected NID "LPX64" from "LPX64"\n",
-                        le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid);
-                GOTO(reject, reason = RC_USER_REJ);
-        }
-
-        CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n",
-               conn, conn->ibc_peer->ibp_nid);
+        CDEBUG(D_NET, "Connection %s REP_RECEIVED.\n",
+               libcfs_nid2str(conn->ibc_peer->ibp_nid));
 
-        conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
+        conn->ibc_incarnation = msg->ibm_srcstamp;
         conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
-
-        frc = kibnal_qp_rts(conn->ibc_qp, rep->QPN, 
-                            min_t(__u8, rep->ArbInitiatorDepth,
-                                  ca_attr->MaxQPResponderResources),
-                            &conn->ibc_connreq->cr_path, 
-                            min_t(__u8, rep->ArbResponderResources,
-                                  ca_attr->MaxQPInitiatorDepth),
-                            rep->StartingPSN);
-        if (frc != FSUCCESS) {
-                CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n",
-                       conn, conn->ibc_peer->ibp_nid, frc);
-                GOTO(reject, reason = RC_NO_QP);
-        }
-
-        /* the callback arguments are ignored for an active accept */
-        conn->ibc_connreq->cr_discarded.Status = FSUCCESS;
-        frc = iibt_cm_accept(cep, &conn->ibc_connreq->cr_discarded, 
-                             NULL, NULL, NULL, NULL);
-        if (frc != FCM_CONNECT_ESTABLISHED) {
-                CERROR("Connection %p -> "LPX64" CMAccept failed: %d\n",
-                       conn, conn->ibc_peer->ibp_nid, frc);
-                kibnal_connreq_done (conn, 1, -ECONNABORTED);
-                /* XXX don't call reject after accept fails? */
+        conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
+        LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
+                 <= IBNAL_RX_MSGS);
+
+        rc = kibnal_conn_rts(conn, 
+                             rep->QPN,
+                             rep->ArbInitiatorDepth,
+                             rep->ArbResponderResources,
+                             rep->StartingPSN);
+        if (rc != 0) {
+                kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_NO_RESOURCES);
+                kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EIO);
                 return;
         }
 
-        CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n",
-               conn, conn->ibc_peer->ibp_nid);
-
-        kibnal_connreq_done (conn, 1, 0);
-        return;
+        memset(&conn->ibc_cvars->cv_cmci, 0, sizeof(conn->ibc_cvars->cv_cmci));
+        
+        frc = iba_cm_accept(conn->ibc_cep, 
+                            &conn->ibc_cvars->cv_cmci, 
+                            NULL, NULL, NULL, NULL);
 
-reject:
-        kibnal_reject(cep, reason);
-        kibnal_connreq_done (conn, 1, -EPROTO);
+        if (frc == FCM_CONNECT_ESTABLISHED) {
+                kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, 0);
+                return;
+        }
+        
+        CERROR("Connection %s CMAccept failed: %d\n",
+               libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+        kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -ECONNABORTED);
 }
 
-/* ib_cm.h has a wealth of information on the CM procedures */
-static void
-kibnal_cm_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+void
+kibnal_cm_active_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
 {
         kib_conn_t       *conn = arg;
 
         CDEBUG(D_NET, "status 0x%x\n", info->Status);
 
-        /* Established Connection Notifier */
         switch (info->Status) {
         default:
-                CERROR("unknown status %d on Connection %p -> "LPX64"\n",
-                       info->Status, conn, conn->ibc_peer->ibp_nid);
+                CERROR("unknown status %d on Connection %s\n", 
+                       info->Status, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                 LBUG();
                 break;
 
-        case FCM_CONNECT_REPLY:
-                kibnal_connect_reply(cep, info, arg);
+        case FCM_CONNECT_TIMEOUT:
+                kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -ETIMEDOUT);
+                break;
+                
+        case FCM_CONNECT_REJECT:
+                kibnal_check_connreject(conn, IBNAL_CONN_ACTIVE,
+                                        &info->Info.Reject);
                 break;
 
-        case FCM_DISCONNECT_REQUEST:
-                /* XXX lock around these state management bits? */
-                if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
-                        kibnal_close_conn (conn, 0);
-                conn->ibc_state = IBNAL_CONN_DREP;
-                iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
+        case FCM_CONNECT_REPLY:
+                kibnal_check_connreply(conn, &info->Info.Reply);
                 break;
 
-        /* these both guarantee that no more cm callbacks will occur */
-        case FCM_DISCONNECTED: /* aka FCM_DISCONNECT_TIMEOUT */
+        case FCM_DISCONNECT_REQUEST:
         case FCM_DISCONNECT_REPLY:
-                CDEBUG(D_NET, "Connection %p -> "LPX64" disconnect done.\n",
-                       conn, conn->ibc_peer->ibp_nid);
-
-                conn->ibc_state = IBNAL_CONN_DISCONNECTED;
-                kibnal_flush_pending(conn);
-                kibnal_put_conn(conn);        /* Lose CM's ref */
+        case FCM_DISCONNECTED:
+                kibnal_cm_disconnect_callback(conn, info);
                 break;
         }
-
-        return;
-}
-
-static int
-kibnal_set_cm_flags(IB_HANDLE cep)
-{
-        FSTATUS frc;
-        uint32 value = 1;
-
-        frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
-                                 (char *)&value, sizeof(value), 0);
-        if (frc != FSUCCESS) {
-                CERROR("error setting timeout callback: %d\n", frc);
-                return -1;
-        }
-
-#if 0
-        frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value,
-                                 sizeof(value), 0);
-        if (frc != FSUCCESS) {
-                CERROR("error setting async accept: %d\n", frc);
-                return -1;
-        }
-#endif
-
-        return 0;
 }
 
 void
-kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
-{
-        IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
-        IB_QP_ATTRIBUTES_QUERY *query;
-        CM_REQUEST_INFO    *req;
-        CM_CONN_INFO       *rep = NULL, *rcv = NULL;
-        kib_wire_connreq_t *wcr;
-        kib_conn_t         *conn = NULL;
-        uint16_t            reason = 0;
-        FSTATUS             frc;
-        int                 rc = 0;
-        
-        LASSERT(cep);
-        LASSERT(info);
-        LASSERT(arg == NULL); /* no conn yet for passive */
-
-        CDEBUG(D_NET, "status 0x%x\n", info->Status);
-
-        req = &info->Info.Request;
-        wcr = (kib_wire_connreq_t *)req->PrivateData;
-
-        CDEBUG(D_NET, "%d from "LPX64"\n", info->Status, 
-               le64_to_cpu(wcr->wcr_nid));
-        
-        if (info->Status == FCM_CONNECT_CANCEL)
-                return;
-        
-        LASSERT (info->Status == FCM_CONNECT_REQUEST);
-        
-        if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
-                CERROR ("Can't accept: bad magic %08x\n",
-                        le32_to_cpu(wcr->wcr_magic));
-                GOTO(out, reason = RC_USER_REJ);
-        }
-
-        if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
-                CERROR ("Can't accept: bad version %d\n",
-                        le16_to_cpu(wcr->wcr_magic));
-                GOTO(out, reason = RC_USER_REJ);
-        }
-
-        rc = kibnal_accept(&conn, cep,
-                           le64_to_cpu(wcr->wcr_nid),
-                           le64_to_cpu(wcr->wcr_incarnation),
-                           le16_to_cpu(wcr->wcr_queue_depth));
-        if (rc != 0) {
-                CERROR ("Can't accept "LPX64": %d\n",
-                        le64_to_cpu(wcr->wcr_nid), rc);
-                GOTO(out, reason = RC_NO_RESOURCES);
-        }
-
-        frc = kibnal_qp_rts(conn->ibc_qp, req->CEPInfo.QPN,
-                            min_t(__u8, req->CEPInfo.OfferedInitiatorDepth, 
-                                  ca_attr->MaxQPResponderResources),
-                            &req->PathInfo.Path,
-                            min_t(__u8, req->CEPInfo.OfferedResponderResources, 
-                                  ca_attr->MaxQPInitiatorDepth),
-                            req->CEPInfo.StartingPSN);
-
-        if (frc != FSUCCESS) {
-                CERROR ("Can't mark QP RTS/RTR  "LPX64": %d\n",
-                        le64_to_cpu(wcr->wcr_nid), frc);
-                GOTO(out, reason = RC_NO_QP);
-        }
-
-        frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_qp_attrs, NULL);
-        if (frc != FSUCCESS) {
-                CERROR ("Couldn't query qp attributes "LPX64": %d\n",
-                        le64_to_cpu(wcr->wcr_nid), frc);
-                GOTO(out, reason = RC_NO_QP);
-        }
-        query = &conn->ibc_qp_attrs;
-
-        PORTAL_ALLOC(rep, sizeof(*rep));
-        PORTAL_ALLOC(rcv, sizeof(*rcv));
-        if (rep == NULL || rcv == NULL) {
-                if (rep) PORTAL_FREE(rep, sizeof(*rep));
-                if (rcv) PORTAL_FREE(rcv, sizeof(*rcv));
-                CERROR ("can't allocate reply and receive buffers\n");
-                GOTO(out, reason = RC_INSUFFICIENT_RESP_RES);
-        }
-
-        /* don't try to deref this into the incoming wcr :) */
-        wcr = (kib_wire_connreq_t *)rep->Info.Reply.PrivateData;
-
-        rep->Info.Reply = (CM_REPLY_INFO) {
-                .QPN = query->QPNumber,
-                .QKey = query->Qkey,
-                .StartingPSN = query->RecvPSN,
-                .EndToEndFlowControl = query->FlowControl,
-                /* XXX Hmm. */
-                .ArbInitiatorDepth = query->InitiatorDepth,
-                .ArbResponderResources = query->ResponderResources,
-                .TargetAckDelay = 0,
-                .FailoverAccepted = 0,
-                .RnRRetryCount = req->CEPInfo.RnrRetryCount,
-        };
-                
-        *wcr = (kib_wire_connreq_t) {
-                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
-                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
-                .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE),
-                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
-                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
-        };
-
-        frc = iibt_cm_accept(cep, rep, rcv, kibnal_cm_callback, conn, 
-                             &conn->ibc_cep);
-
-        PORTAL_FREE(rep, sizeof(*rep));
-        PORTAL_FREE(rcv, sizeof(*rcv));
-
-        if (frc != FCM_CONNECT_ESTABLISHED) {
-                /* XXX it seems we don't call reject after this point? */
-                CERROR("iibt_cm_accept() failed: %d, aborting\n", frc);
-                rc = -ECONNABORTED;
-                goto out;
-        }
-
-        if (kibnal_set_cm_flags(conn->ibc_cep)) {
-                rc = -ECONNABORTED;
-                goto out;
-        }
-
-        CWARN("Connection %p -> "LPX64" ESTABLISHED.\n",
-               conn, conn->ibc_peer->ibp_nid);
-
-out:
-        if (reason) {
-                kibnal_reject(cep, reason);
-                rc = -ECONNABORTED;
-        }
-        if (conn != NULL) 
-                kibnal_connreq_done(conn, 0, rc);
-
-        return;
-}
-
-static void
 dump_path_records(PATH_RESULTS *results)
 {
         IB_PATH_RECORD *path;
         int i;
 
-        for(i = 0; i < results->NumPathRecords; i++) {
+        for (i = 0; i < results->NumPathRecords; i++) {
                 path = &results->PathRecords[i];
                 CDEBUG(D_NET, "%d: sgid "LPX64":"LPX64" dgid "
                        LPX64":"LPX64" pkey %x\n",
@@ -2482,110 +2769,104 @@ dump_path_records(PATH_RESULTS *results)
         }
 }
 
-static void
-kibnal_pathreq_callback (void *arg, QUERY *query, 
-                         QUERY_RESULT_VALUES *query_res)
+void
+kibnal_pathreq_callback (void *arg, QUERY *qry, 
+                         QUERY_RESULT_VALUES *qrslt)
 {
-        IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
-        kib_conn_t *conn = arg;
-        PATH_RESULTS *path;
-        FSTATUS frc;
+        IB_CA_ATTRIBUTES  *ca_attr = &kibnal_data.kib_hca_attrs;
+        kib_conn_t        *conn = arg;
+        CM_REQUEST_INFO   *req = &conn->ibc_cvars->cv_cmci.Info.Request;
+        PATH_RESULTS      *path = (PATH_RESULTS *)qrslt->QueryResult;
+        FSTATUS            frc;
         
-        if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
-                CERROR ("status %d data size %d\n", query_res->Status,
-                        query_res->ResultDataSize);
-                kibnal_connreq_done (conn, 1, -EINVAL);
+        if (qrslt->Status != FSUCCESS || 
+            qrslt->ResultDataSize < sizeof(*path)) {
+                CDEBUG (D_NETERROR, "pathreq %s failed: status %d data size %d\n", 
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                        qrslt->Status, qrslt->ResultDataSize);
+                kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH);
                 return;
         }
 
-        path = (PATH_RESULTS *)query_res->QueryResult;
-
         if (path->NumPathRecords < 1) {
-                CERROR ("expected path records: %d\n", path->NumPathRecords);
-                kibnal_connreq_done (conn, 1, -EINVAL);
+                CDEBUG (D_NETERROR, "pathreq %s failed: no path records\n",
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH);
                 return;
         }
 
-        dump_path_records(path);
+        //dump_path_records(path);
+        conn->ibc_cvars->cv_path = path->PathRecords[0];
 
-        /* just using the first.  this is probably a horrible idea. */
-        conn->ibc_connreq->cr_path = path->PathRecords[0];
+        LASSERT (conn->ibc_cep == NULL);
 
-        conn->ibc_cep = iibt_cm_create_cep(CM_RC_TYPE);
+        conn->ibc_cep = kibnal_create_cep(conn->ibc_peer->ibp_nid);
         if (conn->ibc_cep == NULL) {
-                CERROR ("Can't create CEP\n");
-                kibnal_connreq_done (conn, 1, -EINVAL);
+                kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -ENOMEM);
                 return;
         }
 
-        if (kibnal_set_cm_flags(conn->ibc_cep)) {
-                kibnal_connreq_done (conn, 1, -EINVAL);
-                return;
+        memset(req, 0, sizeof(*req));
+        req->SID                               = conn->ibc_cvars->cv_svcrec.RID.ServiceID;
+        req->CEPInfo.CaGUID                    = kibnal_data.kib_hca_guids[kibnal_data.kib_hca_idx];
+        req->CEPInfo.EndToEndFlowControl       = IBNAL_EE_FLOW;
+        req->CEPInfo.PortGUID                  = conn->ibc_cvars->cv_path.SGID.Type.Global.InterfaceID;
+        req->CEPInfo.RetryCount                = IBNAL_RETRY;
+        req->CEPInfo.RnrRetryCount             = IBNAL_RNR_RETRY;
+        req->CEPInfo.AckTimeout                = IBNAL_ACK_TIMEOUT;
+        req->CEPInfo.StartingPSN               = IBNAL_STARTING_PSN;
+        req->CEPInfo.QPN                       = conn->ibc_cvars->cv_qpattrs.QPNumber;
+        req->CEPInfo.QKey                      = conn->ibc_cvars->cv_qpattrs.Qkey;
+        req->CEPInfo.OfferedResponderResources = ca_attr->MaxQPResponderResources;
+        req->CEPInfo.OfferedInitiatorDepth     = ca_attr->MaxQPInitiatorDepth;
+        req->PathInfo.bSubnetLocal             = IBNAL_LOCAL_SUB;
+        req->PathInfo.Path                     = conn->ibc_cvars->cv_path;
+
+        CLASSERT (CM_REQUEST_INFO_USER_LEN >=
+                  offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t));
+
+        kibnal_pack_connmsg((kib_msg_t *)req->PrivateData, 
+                            conn->ibc_version,
+                            CM_REQUEST_INFO_USER_LEN,
+                            IBNAL_MSG_CONNREQ, 
+                            conn->ibc_peer->ibp_nid, 0);
+
+        if (the_lnet.ln_testprotocompat != 0) {
+                /* single-shot proto test */
+                LNET_LOCK();
+                if ((the_lnet.ln_testprotocompat & 1) != 0) {
+                        ((kib_msg_t *)req->PrivateData)->ibm_version++;
+                        the_lnet.ln_testprotocompat &= ~1;
+                }
+                if ((the_lnet.ln_testprotocompat & 2) != 0) {
+                        ((kib_msg_t *)req->PrivateData)->ibm_magic =
+                                LNET_PROTO_MAGIC;
+                        the_lnet.ln_testprotocompat &= ~2;
+                }
+                LNET_UNLOCK();
         }
 
-        conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
-                .wcr_magic       = cpu_to_le32(IBNAL_MSG_MAGIC),
-                .wcr_version     = cpu_to_le16(IBNAL_MSG_VERSION),
-                .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
-                .wcr_nid         = cpu_to_le64(kibnal_data.kib_nid),
-                .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
-        };
-
-        conn->ibc_connreq->cr_cmreq = (CM_REQUEST_INFO) {
-                .SID = conn->ibc_connreq->cr_service.RID.ServiceID,
-                .CEPInfo = (CM_CEP_INFO) { 
-                        .CaGUID = kibnal_data.kib_hca_guids[0],
-                        .EndToEndFlowControl = FALSE,
-                        .PortGUID = conn->ibc_connreq->cr_path.SGID.Type.Global.InterfaceID,
-                        .RetryCount = IBNAL_RETRY,
-                        .RnrRetryCount = IBNAL_RNR_RETRY,
-                        .AckTimeout = IBNAL_ACK_TIMEOUT,
-                        .StartingPSN = IBNAL_STARTING_PSN,
-                        .QPN = conn->ibc_qp_attrs.QPNumber,
-                        .QKey = conn->ibc_qp_attrs.Qkey,
-                        .OfferedResponderResources = ca_attr->MaxQPResponderResources,
-                        .OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth,
-                },
-                .PathInfo = (CM_CEP_PATHINFO) {
-                        .bSubnetLocal = TRUE,
-                        .Path = conn->ibc_connreq->cr_path,
-                },
-        };
-
-#if 0
-        /* XXX set timeout just like SDP!!!*/
-        conn->ibc_connreq->cr_path.packet_life = 13;
-#endif
         /* Flag I'm getting involved with the CM... */
-        conn->ibc_state = IBNAL_CONN_CONNECTING;
-
-        CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n",
-               conn->ibc_connreq->cr_service.RID.ServiceID, 
-               *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
-
-        memset(conn->ibc_connreq->cr_cmreq.PrivateData, 0, 
-               CM_REQUEST_INFO_USER_LEN);
-        memcpy(conn->ibc_connreq->cr_cmreq.PrivateData, 
-               &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr));
+        kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING);
 
-        /* kibnal_cm_callback gets my conn ref */
-        frc = iibt_cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cmreq,
-                              kibnal_cm_callback, conn);
-        if (frc != FPENDING && frc != FSUCCESS) {
-                CERROR ("Connect: %d\n", frc);
-                /* Back out state change as connect failed */
-                conn->ibc_state = IBNAL_CONN_INIT_QP;
-                kibnal_connreq_done (conn, 1, -EINVAL);
-        }
+        /* cm callback gets my conn ref */
+        frc = iba_cm_connect(conn->ibc_cep, req, 
+                             kibnal_cm_active_callback, conn);
+        if (frc == FPENDING || frc == FSUCCESS)
+                return;
+        
+        CERROR ("Connect %s failed: %d\n", 
+                libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+        kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH);
 }
 
-static void
-dump_service_records(SERVICE_RECORD_RESULTS *results)
+void
+kibnal_dump_service_records(SERVICE_RECORD_RESULTS *results)
 {
         IB_SERVICE_RECORD *svc;
         int i;
 
-        for(i = 0; i < results->NumServiceRecords; i++) {
+        for (i = 0; i < results->NumServiceRecords; i++) {
                 svc = &results->ServiceRecords[i];
                 CDEBUG(D_NET, "%d: sid "LPX64" gid "LPX64":"LPX64" pkey %x\n",
                        i,
@@ -2596,161 +2877,147 @@ dump_service_records(SERVICE_RECORD_RESULTS *results)
         }
 }
 
-
-static void
-kibnal_service_get_callback (void *arg, QUERY *query, 
-                             QUERY_RESULT_VALUES *query_res)
+void
+kibnal_service_get_callback (void *arg, QUERY *qry, 
+                             QUERY_RESULT_VALUES *qrslt)
 {
-        kib_conn_t *conn = arg;
-        SERVICE_RECORD_RESULTS *svc;
-        COMMAND_CONTROL_PARAMETERS sd_params;
-        QUERY   path_query;
-        FSTATUS frc;
-        
-        if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
-                CERROR ("status %d data size %d\n", query_res->Status,
-                        query_res->ResultDataSize);
-                kibnal_connreq_done (conn, 1, -EINVAL);
+        kib_conn_t              *conn = arg;
+        SERVICE_RECORD_RESULTS  *svc;
+        FSTATUS                  frc;
+
+        if (qrslt->Status != FSUCCESS || 
+            qrslt->ResultDataSize < sizeof(*svc)) {
+                CDEBUG (D_NETERROR, "Lookup %s failed: status %d data size %d\n", 
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                        qrslt->Status, qrslt->ResultDataSize);
+                kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH);
                 return;
         }
 
-        svc = (SERVICE_RECORD_RESULTS *)query_res->QueryResult;
-
+        svc = (SERVICE_RECORD_RESULTS *)qrslt->QueryResult;
         if (svc->NumServiceRecords < 1) {
-                CERROR ("%d service records\n", svc->NumServiceRecords);
-                kibnal_connreq_done (conn, 1, -EINVAL);
+                CDEBUG (D_NETERROR, "lookup %s failed: no service records\n",
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH);
                 return;
         }
 
-        dump_service_records(svc);
+        //kibnal_dump_service_records(svc);
+        conn->ibc_cvars->cv_svcrec = svc->ServiceRecords[0];
 
-        conn->ibc_connreq->cr_service = svc->ServiceRecords[0];
+        qry = &conn->ibc_cvars->cv_query;
+        memset(qry, 0, sizeof(*qry));
 
-        CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n",
-               query_res->Status , conn->ibc_connreq->cr_service.RID.ServiceID, 
-               *kibnal_service_nid_field(&conn->ibc_connreq->cr_service));
+        qry->OutputType = OutputTypePathRecord;
+        qry->InputType = InputTypePortGuidPair;
 
-        memset(&path_query, 0, sizeof(path_query));
-        path_query.InputType = InputTypePortGuidPair;
-        path_query.OutputType = OutputTypePathRecord;
-        path_query.InputValue.PortGuidPair.SourcePortGuid = kibnal_data.kib_port_guid;
-        path_query.InputValue.PortGuidPair.DestPortGuid  = conn->ibc_connreq->cr_service.RID.ServiceGID.Type.Global.InterfaceID;
+        qry->InputValue.PortGuidPair.SourcePortGuid = 
+                kibnal_data.kib_port_guid;
+        qry->InputValue.PortGuidPair.DestPortGuid  = 
+                conn->ibc_cvars->cv_svcrec.RID.ServiceGID.Type.Global.InterfaceID;
 
-        memset(&sd_params, 0, sizeof(sd_params));
-        sd_params.RetryCount = IBNAL_RETRY;
-        sd_params.Timeout = 10 * 1000;   /* wait 10 seconds */
-
-        /* kibnal_service_get_callback gets my conn ref */
-
-        frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
-                                                    kibnal_data.kib_port_guid,
-                                                    &path_query, 
-                                                    kibnal_pathreq_callback,
-                                                    &sd_params, conn);
+        /* kibnal_pathreq_callback gets my conn ref */
+        frc = iba_sd_query_port_fabric_info(kibnal_data.kib_sd,
+                                            kibnal_data.kib_port_guid,
+                                            qry, 
+                                            kibnal_pathreq_callback,
+                                            &kibnal_data.kib_sdretry,
+                                            conn);
         if (frc == FPENDING)
                 return;
 
-        CERROR ("Path record request failed: %d\n", frc);
-        kibnal_connreq_done (conn, 1, -EINVAL);
+        CERROR ("pathreq %s failed: %d\n", 
+                libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+        kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH);
 }
 
-static void
+void
 kibnal_connect_peer (kib_peer_t *peer)
 {
-        COMMAND_CONTROL_PARAMETERS sd_params;
-        QUERY   query;
-        FSTATUS frc;
-        kib_conn_t  *conn = kibnal_create_conn();
+        QUERY                     *qry;
+        FSTATUS                    frc;
+        kib_conn_t                *conn;
 
         LASSERT (peer->ibp_connecting != 0);
 
+        conn = kibnal_create_conn(peer->ibp_nid, peer->ibp_version);
         if (conn == NULL) {
                 CERROR ("Can't allocate conn\n");
-                kibnal_peer_connect_failed (peer, 1, -ENOMEM);
+                kibnal_peer_connect_failed(peer, IBNAL_CONN_ACTIVE, -ENOMEM);
                 return;
         }
 
         conn->ibc_peer = peer;
-        kib_peer_addref(peer);
-
-        PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
-        if (conn->ibc_connreq == NULL) {
-                CERROR ("Can't allocate connreq\n");
-                kibnal_connreq_done (conn, 1, -ENOMEM);
-                return;
-        }
-
-        memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
+        kibnal_peer_addref(peer);
 
-        kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
+        qry = &conn->ibc_cvars->cv_query;
+        memset(qry, 0, sizeof(*qry));
 
-        memset(&query, 0, sizeof(query));
-        query.InputType = InputTypeServiceRecord;
-        query.OutputType = OutputTypeServiceRecord;
-        query.InputValue.ServiceRecordValue.ServiceRecord = conn->ibc_connreq->cr_service;
-        query.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
+        qry->OutputType = OutputTypeServiceRecord;
+        qry->InputType = InputTypeServiceRecord;
 
-        memset(&sd_params, 0, sizeof(sd_params));
-        sd_params.RetryCount = IBNAL_RETRY;
-        sd_params.Timeout = 10 * 1000;   /* wait 10 seconds */
+        qry->InputValue.ServiceRecordValue.ComponentMask = 
+                KIBNAL_SERVICE_KEY_MASK;
+        kibnal_set_service_keys(
+                &qry->InputValue.ServiceRecordValue.ServiceRecord, 
+                peer->ibp_nid);
 
         /* kibnal_service_get_callback gets my conn ref */
-        frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
-                                                    kibnal_data.kib_port_guid,
-                                                    &query, 
-                                                kibnal_service_get_callback, 
-                                                    &sd_params, conn);
+        frc = iba_sd_query_port_fabric_info(kibnal_data.kib_sd,
+                                            kibnal_data.kib_port_guid,
+                                            qry,
+                                            kibnal_service_get_callback,
+                                            &kibnal_data.kib_sdretry, 
+                                            conn);
         if (frc == FPENDING)
                 return;
 
-        CERROR ("iibt_sd_query_port_fabric_information(): %d\n", frc);
-        kibnal_connreq_done (conn, 1, frc);
+        CERROR("Lookup %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), frc);
+        kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH);
 }
 
-static int
-kibnal_conn_timed_out (kib_conn_t *conn)
+int
+kibnal_check_txs (kib_conn_t *conn, struct list_head *txs)
 {
         kib_tx_t          *tx;
         struct list_head  *ttmp;
-        unsigned long      flags;
+        int                timed_out = 0;
 
-        spin_lock_irqsave (&conn->ibc_lock, flags);
+        spin_lock(&conn->ibc_lock);
 
-        list_for_each (ttmp, &conn->ibc_tx_queue) {
+        list_for_each (ttmp, txs) {
                 tx = list_entry (ttmp, kib_tx_t, tx_list);
 
-                LASSERT (!tx->tx_passive_rdma_wait);
-                LASSERT (tx->tx_sending == 0);
-
-                if (time_after_eq (jiffies, tx->tx_deadline)) {
-                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
-                        return 1;
+                if (txs == &conn->ibc_active_txs) {
+                        LASSERT (!tx->tx_queued);
+                        LASSERT (tx->tx_waiting || tx->tx_sending != 0);
+                } else {
+                        LASSERT (tx->tx_queued);
                 }
-        }
-
-        list_for_each (ttmp, &conn->ibc_active_txs) {
-                tx = list_entry (ttmp, kib_tx_t, tx_list);
-
-                LASSERT (tx->tx_passive_rdma ||
-                         !tx->tx_passive_rdma_wait);
-
-                LASSERT (tx->tx_passive_rdma_wait ||
-                         tx->tx_sending != 0);
 
                 if (time_after_eq (jiffies, tx->tx_deadline)) {
-                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
-                        return 1;
+                        timed_out = 1;
+                        break;
                 }
         }
 
-        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+        spin_unlock(&conn->ibc_lock);
+        return timed_out;
+}
 
-        return 0;
+int
+kibnal_conn_timed_out (kib_conn_t *conn)
+{
+        return  kibnal_check_txs(conn, &conn->ibc_tx_queue) ||
+                kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) ||
+                kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) ||
+                kibnal_check_txs(conn, &conn->ibc_active_txs);
 }
 
-static void
-kibnal_check_conns (int idx)
+void
+kibnal_check_peers (int idx)
 {
+        rwlock_t          *rwlock = &kibnal_data.kib_global_lock;
         struct list_head  *peers = &kibnal_data.kib_peers[idx];
         struct list_head  *ptmp;
         kib_peer_t        *peer;
@@ -2762,15 +3029,33 @@ kibnal_check_conns (int idx)
         /* NB. We expect to have a look at all the peers and not find any
          * rdmas to time out, so we just use a shared lock while we
          * take a look... */
-        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+        read_lock_irqsave(rwlock, flags);
 
         list_for_each (ptmp, peers) {
                 peer = list_entry (ptmp, kib_peer_t, ibp_list);
 
+                if (peer->ibp_passivewait) {
+                        LASSERT (list_empty(&peer->ibp_conns));
+                        
+                        if (!time_after_eq(jiffies, 
+                                           peer->ibp_passivewait_deadline))
+                                continue;
+                        
+                        kibnal_peer_addref(peer); /* ++ ref for me... */
+                        read_unlock_irqrestore(rwlock, flags);
+
+                        kibnal_peer_connect_failed(peer, IBNAL_CONN_WAITING,
+                                                   -ETIMEDOUT);
+                        kibnal_peer_decref(peer); /* ...until here */
+                        
+                        /* start again now I've dropped the lock */
+                        goto again;
+                }
+
                 list_for_each (ctmp, &peer->ibp_conns) {
                         conn = list_entry (ctmp, kib_conn_t, ibc_list);
 
-                        KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED);
+                        LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
 
                         /* In case we have enough credits to return via a
                          * NOOP, but there were no non-blocking tx descs
@@ -2779,60 +3064,57 @@ kibnal_check_conns (int idx)
 
                         if (!kibnal_conn_timed_out(conn))
                                 continue;
+
+                        /* Handle timeout by closing the whole connection.  We
+                         * can only be sure RDMA activity has ceased once the
+                         * QP has been modified. */
                         
-                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                               conn, conn->ibc_state, peer->ibp_nid,
-                               atomic_read (&conn->ibc_refcount));
+                        kibnal_conn_addref(conn); /* 1 ref for me... */
 
-                        atomic_inc (&conn->ibc_refcount);
-                        read_unlock_irqrestore(&kibnal_data.kib_global_lock,
-                                               flags);
+                        read_unlock_irqrestore(rwlock, flags);
 
-                        CERROR("Timed out RDMA with "LPX64"\n",
-                               peer->ibp_nid);
+                        CERROR("Timed out RDMA with %s\n",
+                               libcfs_nid2str(peer->ibp_nid));
 
                         kibnal_close_conn (conn, -ETIMEDOUT);
-                        kibnal_put_conn (conn);
+                        kibnal_conn_decref(conn); /* ...until here */
 
                         /* start again now I've dropped the lock */
                         goto again;
                 }
         }
 
-        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+        read_unlock_irqrestore(rwlock, flags);
 }
 
-static void
-kib_connd_handle_state(kib_conn_t *conn)
+void
+kibnal_disconnect_conn (kib_conn_t *conn)
 {
-        FSTATUS frc;
-
-        switch (conn->ibc_state) {
-                /* all refs have gone, free and be done with it */ 
-                case IBNAL_CONN_DISCONNECTED:
-                        kibnal_destroy_conn (conn);
-                        return; /* avoid put_conn */
+        FSTATUS       frc;
 
-                case IBNAL_CONN_SEND_DREQ:
-                        frc = iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
-                        if (frc != FSUCCESS) /* XXX do real things */
-                                CERROR("disconnect failed: %d\n", frc);
-                        conn->ibc_state = IBNAL_CONN_DREQ;
-                        break;
+        LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTING);
 
-                /* a callback got to the conn before we did */ 
-                case IBNAL_CONN_DREP:
-                        break;
-                                
-                default:
-                        CERROR ("Bad conn %p state: %d\n", conn, 
-                                conn->ibc_state);
-                        LBUG();
-                        break;
+        kibnal_conn_disconnected(conn);
+                
+        frc = iba_cm_disconnect(conn->ibc_cep, NULL, NULL);
+        switch (frc) {
+        case FSUCCESS:
+                break;
+                
+        case FINSUFFICIENT_RESOURCES:
+                CERROR("ENOMEM disconnecting %s\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                /* This might cause the module to become unloadable since the
+                 * FCM_DISCONNECTED callback is still outstanding */
+                break;
+                
+        default:
+                CERROR("Unexpected error disconnecting %s: %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+                LBUG();
         }
 
-        /* drop ref from close_conn */
-        kibnal_put_conn(conn);
+        kibnal_peer_notify(conn->ibc_peer);
 }
 
 int
@@ -2844,27 +3126,43 @@ kibnal_connd (void *arg)
         kib_peer_t        *peer;
         int                timeout;
         int                i;
+        int                did_something;
         int                peer_index = 0;
         unsigned long      deadline = jiffies;
         
-        kportal_daemonize ("kibnal_connd");
-        kportal_blockallsigs ();
+        cfs_daemonize ("kibnal_connd");
+        cfs_block_allsigs ();
 
         init_waitqueue_entry (&wait, current);
 
-        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
+
+        while (!kibnal_data.kib_shutdown) {
+                did_something = 0;
+
+                if (!list_empty (&kibnal_data.kib_connd_zombies)) {
+                        conn = list_entry (kibnal_data.kib_connd_zombies.next,
+                                           kib_conn_t, ibc_list);
+                        list_del (&conn->ibc_list);
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+                        did_something = 1;
+
+                        kibnal_destroy_conn(conn);
+
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+                }
 
-        for (;;) {
                 if (!list_empty (&kibnal_data.kib_connd_conns)) {
                         conn = list_entry (kibnal_data.kib_connd_conns.next,
                                            kib_conn_t, ibc_list);
                         list_del (&conn->ibc_list);
-                        
                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
-                        kib_connd_handle_state(conn);
+                        did_something = 1;
 
+                        kibnal_disconnect_conn(conn);
+                        kibnal_conn_decref(conn);
+                        
                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
-                        continue;
                 }
 
                 if (!list_empty (&kibnal_data.kib_connd_peers)) {
@@ -2873,26 +3171,22 @@ kibnal_connd (void *arg)
                         
                         list_del_init (&peer->ibp_connd_list);
                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+                        did_something = 1;
 
                         kibnal_connect_peer (peer);
-                        kib_peer_decref (peer);
+                        kibnal_peer_decref (peer);
 
                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
                 }
 
-                /* shut down and nobody left to reap... */
-                if (kibnal_data.kib_shutdown &&
-                    atomic_read(&kibnal_data.kib_nconns) == 0)
-                        break;
-
-                spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
-
                 /* careful with the jiffy wrap... */
                 while ((timeout = (int)(deadline - jiffies)) <= 0) {
                         const int n = 4;
                         const int p = 1;
                         int       chunk = kibnal_data.kib_peer_hash_size;
                         
+                        spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
                         /* Time to check for RDMA timeouts on a few more
                          * peers: I do checks every 'p' seconds on a
                          * proportion of the peer table and I need to check
@@ -2901,22 +3195,27 @@ kibnal_connd (void *arg)
                          * connection within (n+1)/n times the timeout
                          * interval. */
 
-                        if (kibnal_tunables.kib_io_timeout > n * p)
+                        if (*kibnal_tunables.kib_timeout > n * p)
                                 chunk = (chunk * n * p) / 
-                                        kibnal_tunables.kib_io_timeout;
+                                        *kibnal_tunables.kib_timeout;
                         if (chunk == 0)
                                 chunk = 1;
 
                         for (i = 0; i < chunk; i++) {
-                                kibnal_check_conns (peer_index);
+                                kibnal_check_peers (peer_index);
                                 peer_index = (peer_index + 1) % 
                                              kibnal_data.kib_peer_hash_size;
                         }
 
                         deadline += p * HZ;
+                        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+                        did_something = 1;
                 }
 
-                kibnal_data.kib_connd_waketime = jiffies + timeout;
+                if (did_something)
+                        continue;
+
+                spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
                 set_current_state (TASK_INTERRUPTIBLE);
                 add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
@@ -2938,78 +3237,149 @@ kibnal_connd (void *arg)
         return (0);
 }
 
+
+void 
+kibnal_hca_async_callback (void *hca_arg, IB_EVENT_RECORD *ev)
+{
+        /* XXX flesh out.  this seems largely for async errors */
+        CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode);
+}
+
+void
+kibnal_hca_callback (void *hca_arg, void *cq_arg)
+{
+        unsigned long flags;
+
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+        kibnal_data.kib_ready = 1;
+        wake_up(&kibnal_data.kib_sched_waitq);
+        spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+}
+
 int
 kibnal_scheduler(void *arg)
 {
-        long            id = (long)arg;
-        char            name[16];
-        kib_rx_t       *rx;
-        kib_tx_t       *tx;
-        unsigned long   flags;
-        int             rc;
-        int             counter = 0;
-        int             did_something;
+        long               id = (long)arg;
+        wait_queue_t       wait;
+        char               name[16];
+        FSTATUS            frc;
+        FSTATUS            frc2;
+        IB_WORK_COMPLETION wc;
+        kib_rx_t          *rx;
+        unsigned long      flags;
+        __u64              rxseq = 0;
+        int                busy_loops = 0;
 
         snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
-        kportal_daemonize(name);
-        kportal_blockallsigs();
+        cfs_daemonize(name);
+        cfs_block_allsigs();
 
-        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+        init_waitqueue_entry(&wait, current);
 
-        for (;;) {
-                did_something = 0;
+        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
 
-                while (!list_empty(&kibnal_data.kib_sched_txq)) {
-                        tx = list_entry(kibnal_data.kib_sched_txq.next,
-                                        kib_tx_t, tx_list);
-                        list_del(&tx->tx_list);
+        while (!kibnal_data.kib_shutdown) {
+                if (busy_loops++ >= IBNAL_RESCHED) {
                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                                flags);
-                        kibnal_tx_done(tx);
 
-                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
-                                          flags);
+                        our_cond_resched();
+                        busy_loops = 0;
+                        
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
                 }
 
-                if (!list_empty(&kibnal_data.kib_sched_rxq)) {
-                        rx = list_entry(kibnal_data.kib_sched_rxq.next,
-                                        kib_rx_t, rx_list);
-                        list_del(&rx->rx_list);
+                if (kibnal_data.kib_ready &&
+                    !kibnal_data.kib_checking_cq) {
+                        /* take ownership of completion polling */
+                        kibnal_data.kib_checking_cq = 1;
+                        /* Assume I'll exhaust the CQ */
+                        kibnal_data.kib_ready = 0;
                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                                flags);
+                        
+                        frc = iba_poll_cq(kibnal_data.kib_cq, &wc);
+                        if (frc == FNOT_DONE) {
+                                /* CQ empty */
+                                frc2 = iba_rearm_cq(kibnal_data.kib_cq,
+                                                    CQEventSelNextWC);
+                                LASSERT (frc2 == FSUCCESS);
+                        }
+                        
+                        if (frc == FSUCCESS &&
+                            kibnal_wreqid2type(wc.WorkReqId) == IBNAL_WID_RX) {
+                                rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.WorkReqId);
+                                
+                                /* Grab the RX sequence number NOW before
+                                 * anyone else can get an RX completion */
+                                rxseq = rx->rx_conn->ibc_rxseq++;
+                        }
+                                
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+                        /* give up ownership of completion polling */
+                        kibnal_data.kib_checking_cq = 0;
 
-                        kibnal_rx(rx);
+                        if (frc == FNOT_DONE)
+                                continue;
 
-                        did_something = 1;
-                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
-                                          flags);
-                }
+                        LASSERT (frc == FSUCCESS);
+                        /* Assume there's more: get another scheduler to check
+                         * while I handle this completion... */
 
-                /* shut down and no receives to complete... */
-                if (kibnal_data.kib_shutdown &&
-                    atomic_read(&kibnal_data.kib_nconns) == 0)
-                        break;
+                        kibnal_data.kib_ready = 1;
+                        wake_up(&kibnal_data.kib_sched_waitq);
 
-                /* nothing to do or hogging CPU */
-                if (!did_something || counter++ == IBNAL_RESCHED) {
                         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                                flags);
-                        counter = 0;
-
-                        if (!did_something) {
-                                rc = wait_event_interruptible(
-                                        kibnal_data.kib_sched_waitq,
-                                        !list_empty(&kibnal_data.kib_sched_txq) || 
-                                        !list_empty(&kibnal_data.kib_sched_rxq) || 
-                                        (kibnal_data.kib_shutdown &&
-                                         atomic_read (&kibnal_data.kib_nconns) == 0));
-                        } else {
-                                our_cond_resched();
-                        }
 
-                        spin_lock_irqsave(&kibnal_data.kib_sched_lock,
-                                          flags);
+                        switch (kibnal_wreqid2type(wc.WorkReqId)) {
+                        case IBNAL_WID_RX:
+                                kibnal_rx_complete(&wc, rxseq);
+                                break;
+                                
+                        case IBNAL_WID_TX:
+                                kibnal_tx_complete(&wc);
+                                break;
+                                
+                        case IBNAL_WID_RDMA:
+                                /* We only get RDMA completion notification if
+                                 * it fails.  So we just ignore them completely
+                                 * because...
+                                 *
+                                 * 1) If an RDMA fails, all subsequent work
+                                 * items, including the final SEND will fail
+                                 * too, so I'm still guaranteed to notice that
+                                 * this connection is hosed.
+                                 *
+                                 * 2) It's positively dangerous to look inside
+                                 * the tx descriptor obtained from an RDMA work
+                                 * item.  As soon as I drop the kib_sched_lock,
+                                 * I give a scheduler on another CPU a chance
+                                 * to get the final SEND completion, so the tx
+                                 * descriptor can get freed as I inspect it. */
+                                CERROR ("RDMA failed: %d\n", wc.Status);
+                                break;
+
+                        default:
+                                LBUG();
+                        }
+                        
+                        spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+                        continue;
                 }
+
+                /* Nothing to do; sleep... */
+
+                set_current_state(TASK_INTERRUPTIBLE);
+                add_wait_queue_exclusive(&kibnal_data.kib_sched_waitq, &wait);
+                spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+                                       flags);
+
+                schedule();
+
+                remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
+                set_current_state(TASK_RUNNING);
+                spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
         }
 
         spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
@@ -3017,13 +3387,3 @@ kibnal_scheduler(void *arg)
         kibnal_thread_fini();
         return (0);
 }
-
-
-lib_nal_t kibnal_lib = {
-        libnal_data:        &kibnal_data,      /* NAL private data */
-        libnal_send:         kibnal_send,
-        libnal_send_pages:   kibnal_send_pages,
-        libnal_recv:         kibnal_recv,
-        libnal_recv_pages:   kibnal_recv_pages,
-        libnal_dist:         kibnal_dist
-};
diff --git a/lnet/klnds/iiblnd/iiblnd_modparams.c b/lnet/klnds/iiblnd/iiblnd_modparams.c
new file mode 100644 (file)
index 0000000..ceb6e5d
--- /dev/null
@@ -0,0 +1,179 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "iiblnd.h"
+
+static char *ipif_basename = "ib";
+CFS_MODULE_PARM(ipif_basename, "s", charp, 0444,
+                "IPoIB interface base name");
+
+static char *service_name = "iiblnd";
+CFS_MODULE_PARM(service_name, "s", charp, 0444,
+                "IB service name");
+
+static int service_number = 0x11b9a2;
+CFS_MODULE_PARM(service_number, "i", int, 0444,
+                "IB service number");
+
+static int min_reconnect_interval = 1;
+CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644,
+               "minimum connection retry interval (seconds)");
+
+static int max_reconnect_interval = 60;
+CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644,
+               "maximum connection retry interval (seconds)");
+
+static int concurrent_peers = 1152;
+CFS_MODULE_PARM(concurrent_peers, "i", int, 0444,
+               "maximum number of peers that may connect");
+
+static int cksum = 0;
+CFS_MODULE_PARM(cksum, "i", int, 0644,
+               "set non-zero to enable message (not RDMA) checksums");
+
+static int timeout = 50;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+               "timeout (seconds)");
+
+static int ntx = 256;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+               "# of message descriptors");
+
+static int credits = 128;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+               "# concurrent sends");
+
+static int peer_credits = 8;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+               "# concurrent sends to 1 peer");
+
+static int sd_retries = 8;
+CFS_MODULE_PARM(sd_retries, "i", int, 0444,
+               "# times to retry SD queries");
+
+static int keepalive = 100;
+CFS_MODULE_PARM(keepalive, "i", int, 0644,
+                "Idle time in seconds before sending a keepalive");
+
+static int concurrent_sends = IBNAL_RX_MSGS;
+CFS_MODULE_PARM(concurrent_sends, "i", int, 0644,
+                "Send work queue sizing");
+
+kib_tunables_t kibnal_tunables = {
+        .kib_ipif_basename          = &ipif_basename,
+        .kib_service_name           = &service_name,
+        .kib_service_number         = &service_number,
+        .kib_min_reconnect_interval = &min_reconnect_interval,
+        .kib_max_reconnect_interval = &max_reconnect_interval,
+        .kib_concurrent_peers       = &concurrent_peers,
+       .kib_cksum                  = &cksum,
+        .kib_timeout                = &timeout,
+        .kib_keepalive              = &keepalive,
+        .kib_ntx                    = &ntx,
+        .kib_credits                = &credits,
+        .kib_peercredits            = &peer_credits,
+        .kib_sd_retries             = &sd_retries,
+        .kib_concurrent_sends       = &concurrent_sends,
+};
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+
+/* NB max_size specified for proc_dostring entries only needs to be big enough
+ * not to truncate the printout; it only needs to be the actual size of the
+ * string buffer if we allow writes (and we don't) */
+
+static ctl_table kibnal_ctl_table[] = {
+       {1, "ipif_basename", &ipif_basename, 
+         1024, 0444, NULL, &proc_dostring},
+       {2, "service_name", &service_name, 
+         1024, 0444, NULL, &proc_dostring},
+       {3, "service_number", &service_number, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {4, "min_reconnect_interval", &min_reconnect_interval, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {5, "max_reconnect_interval", &max_reconnect_interval, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {6, "concurrent_peers", &concurrent_peers, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {7, "cksum", &cksum, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {8, "timeout", &timeout, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {9, "ntx", &ntx, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {10, "credits", &credits, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {11, "peer_credits", &peer_credits, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {12, "sd_retries", &sd_retries, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {13, "keepalive", &keepalive, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {14, "concurrent_sends", &concurrent_sends, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {0}
+};
+
+static ctl_table kibnal_top_ctl_table[] = {
+       {203, "openibnal", NULL, 0, 0555, kibnal_ctl_table},
+       {0}
+};
+
+int
+kibnal_tunables_init ()
+{
+       kibnal_tunables.kib_sysctl =
+               register_sysctl_table(kibnal_top_ctl_table, 0);
+       
+       if (kibnal_tunables.kib_sysctl == NULL)
+               CWARN("Can't setup /proc tunables\n");
+
+        if (*kibnal_tunables.kib_concurrent_sends > IBNAL_RX_MSGS)
+                *kibnal_tunables.kib_concurrent_sends = IBNAL_RX_MSGS;
+        if (*kibnal_tunables.kib_concurrent_sends < IBNAL_MSG_QUEUE_SIZE)
+                *kibnal_tunables.kib_concurrent_sends = IBNAL_MSG_QUEUE_SIZE;
+
+       return 0;
+}
+
+void
+kibnal_tunables_fini ()
+{
+       if (kibnal_tunables.kib_sysctl != NULL)
+               unregister_sysctl_table(kibnal_tunables.kib_sysctl);
+}
+
+#else
+
+int
+kibnal_tunables_init ()
+{
+       return 0;
+}
+
+void
+kibnal_tunables_fini ()
+{
+}
+
+#endif
diff --git a/lnet/klnds/lolnd/Makefile.in b/lnet/klnds/lolnd/Makefile.in
deleted file mode 100644 (file)
index 222e861..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-MODULES := klonal
-klonal-objs := lonal.o lonal_cb.o
-
-@INCLUDE_RULES@
diff --git a/lnet/klnds/lolnd/lolnd.c b/lnet/klnds/lolnd/lolnd.c
deleted file mode 100644 (file)
index 03c2742..0000000
+++ /dev/null
@@ -1,164 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (C) 2004 Cluster File Systems, Inc.
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include "lonal.h"
-
-nal_t                  klonal_api;
-klonal_data_t          klonal_data;
-ptl_handle_ni_t         klonal_ni;
-
-
-int
-klonal_cmd (struct portals_cfg *pcfg, void *private)
-{
-       LASSERT (pcfg != NULL);
-       
-       switch (pcfg->pcfg_command) {
-       case NAL_CMD_REGISTER_MYNID:
-               CDEBUG (D_IOCTL, "setting NID to "LPX64" (was "LPX64")\n",
-                       pcfg->pcfg_nid, klonal_lib.libnal_ni.ni_pid.nid);
-               klonal_lib.libnal_ni.ni_pid.nid = pcfg->pcfg_nid;
-               return (0);
-               
-       default:
-               return (-EINVAL);
-       }
-}
-
-static void
-klonal_shutdown(nal_t *nal)
-{
-       /* NB The first ref was this module! */
-       if (nal->nal_refct != 0)
-               return;
-
-       CDEBUG (D_NET, "shutdown\n");
-       LASSERT (nal == &klonal_api);
-
-       switch (klonal_data.klo_init)
-       {
-       default:
-               LASSERT (0);
-
-       case KLO_INIT_ALL:
-                libcfs_nal_cmd_unregister(LONAL);
-               /* fall through */
-
-       case KLO_INIT_LIB:
-               lib_fini (&klonal_lib);
-               break;
-
-       case KLO_INIT_NOTHING:
-               return;
-       }
-
-       memset(&klonal_data, 0, sizeof (klonal_data));
-
-       CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory));
-
-       printk (KERN_INFO "Lustre: LO NAL unloaded (final mem %d)\n",
-                atomic_read(&portal_kmemory));
-       PORTAL_MODULE_UNUSE;
-}
-
-static int
-klonal_startup (nal_t *nal, ptl_pid_t requested_pid,
-               ptl_ni_limits_t *requested_limits, 
-               ptl_ni_limits_t *actual_limits)
-{
-       int               rc;
-       ptl_process_id_t  my_process_id;
-       int               pkmem = atomic_read(&portal_kmemory);
-
-       LASSERT (nal == &klonal_api);
-
-       if (nal->nal_refct != 0) {
-               if (actual_limits != NULL)
-                       *actual_limits = klonal_lib.libnal_ni.ni_actual_limits;
-               return (PTL_OK);
-       }
-
-       LASSERT (klonal_data.klo_init == KLO_INIT_NOTHING);
-
-       CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory));
-
-       /* ensure all pointers NULL etc */
-       memset (&klonal_data, 0, sizeof (klonal_data));
-
-       my_process_id.nid = 0;
-       my_process_id.pid = requested_pid;
-
-       rc = lib_init(&klonal_lib, nal, my_process_id,
-                     requested_limits, actual_limits);
-        if (rc != PTL_OK) {
-               CERROR ("lib_init failed %d\n", rc);
-               klonal_shutdown (nal);
-               return (rc);
-       }
-
-       klonal_data.klo_init = KLO_INIT_LIB;
-
-       rc = libcfs_nal_cmd_register (LONAL, &klonal_cmd, NULL);
-       if (rc != 0) {
-               CERROR ("Can't initialise command interface (rc = %d)\n", rc);
-               klonal_shutdown (nal);
-               return (PTL_FAIL);
-       }
-
-       klonal_data.klo_init = KLO_INIT_ALL;
-
-       printk(KERN_INFO "Lustre: LO NAL (initial mem %d)\n", pkmem);
-       PORTAL_MODULE_USE;
-
-       return (PTL_OK);
-}
-
-void __exit
-klonal_finalise (void)
-{
-       PtlNIFini(klonal_ni);
-
-       ptl_unregister_nal(LONAL);
-}
-
-static int __init
-klonal_initialise (void)
-{
-       int   rc;
-
-       klonal_api.nal_ni_init = klonal_startup;
-       klonal_api.nal_ni_fini = klonal_shutdown;
-
-       rc = ptl_register_nal(LONAL, &klonal_api);
-       if (rc != PTL_OK) {
-               CERROR("Can't register LONAL: %d\n", rc);
-               return (-ENOMEM);               /* or something... */
-       }
-
-       return (0);
-}
-
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Loopback NAL v0.01");
-MODULE_LICENSE("GPL");
-
-module_init (klonal_initialise);
-module_exit (klonal_finalise);
diff --git a/lnet/klnds/lolnd/lolnd.h b/lnet/klnds/lolnd/lolnd.h
deleted file mode 100644 (file)
index 6d8d77d..0000000
+++ /dev/null
@@ -1,72 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#ifndef _LONAL_H
-#define _LONAL_H
-#ifndef EXPORT_SYMTAB
-# define EXPORT_SYMTAB
-#endif
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/uio.h>
-#include <linux/init.h>
-
-#define DEBUG_SUBSYSTEM S_NAL
-
-#include <libcfs/kp30.h>
-#include <portals/p30.h>
-#include <portals/lib-p30.h>
-#include <portals/nal.h>
-
-#define KLOD_IOV        153401
-#define KLOD_KIOV       153402
-
-typedef struct
-{
-        unsigned int     klod_type;
-        unsigned int     klod_niov;
-        size_t           klod_offset;
-        size_t           klod_nob;
-        union {
-                struct iovec  *iov;
-                ptl_kiov_t    *kiov;
-        }                klod_iov;
-} klo_desc_t;
-
-typedef struct
-{
-        char               klo_init;            /* what's been initialised */
-}  klonal_data_t;
-
-/* kqn_init state */
-#define KLO_INIT_NOTHING        0               /* MUST BE ZERO so zeroed state is initialised OK */
-#define KLO_INIT_LIB            1
-#define KLO_INIT_ALL            2
-
-extern lib_nal_t           klonal_lib;
-extern nal_t               klonal_api;
-extern klonal_data_t       klonal_data;
-
-#endif /* _LONAL_H */
diff --git a/lnet/klnds/lolnd/lolnd_cb.c b/lnet/klnds/lolnd/lolnd_cb.c
deleted file mode 100644 (file)
index cf5df0d..0000000
+++ /dev/null
@@ -1,267 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (C) 2004 Cluster File Systems, Inc.
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include "lonal.h"
-
-/*
- *  LIB functions follow
- *
- */
-static int
-klonal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
-{
-        *dist = 0;                      /* it's me */
-        return (0);
-}
-
-static ptl_err_t
-klonal_send (lib_nal_t    *nal,
-             void         *private,
-             lib_msg_t    *libmsg,
-             ptl_hdr_t    *hdr,
-             int           type,
-             ptl_nid_t     nid,
-             ptl_pid_t     pid,
-             unsigned int  payload_niov,
-             struct iovec *payload_iov,
-             size_t        payload_offset,
-             size_t        payload_nob)
-{
-        klo_desc_t klod = {
-                .klod_type    = KLOD_IOV,
-                .klod_niov    = payload_niov,
-                .klod_offset  = payload_offset,
-                .klod_nob     = payload_nob,
-                .klod_iov     = { .iov = payload_iov } };
-        ptl_err_t rc;
-
-        LASSERT(nid == klonal_lib.libnal_ni.ni_pid.nid);
-
-        rc = lib_parse(&klonal_lib, hdr, &klod);
-        if (rc == PTL_OK)
-                lib_finalize(&klonal_lib, private, libmsg, PTL_OK);
-        
-        return rc;
-}
-
-static ptl_err_t
-klonal_send_pages (lib_nal_t    *nal,
-                   void         *private,
-                   lib_msg_t    *libmsg,
-                   ptl_hdr_t    *hdr,
-                   int           type,
-                   ptl_nid_t     nid,
-                   ptl_pid_t     pid,
-                   unsigned int  payload_niov,
-                   ptl_kiov_t   *payload_kiov,
-                   size_t        payload_offset,
-                   size_t        payload_nob)
-{
-        klo_desc_t klod = {
-                .klod_type     = KLOD_KIOV,
-                .klod_niov     = payload_niov,
-                .klod_offset   = payload_offset,
-                .klod_nob      = payload_nob,
-                .klod_iov      = { .kiov = payload_kiov } };
-        ptl_err_t   rc;
-
-        LASSERT(nid == klonal_lib.libnal_ni.ni_pid.nid);
-        
-        rc = lib_parse(&klonal_lib, hdr, &klod);
-        if (rc == PTL_OK)
-                lib_finalize(&klonal_lib, private, libmsg, PTL_OK);
-        
-        return rc;
-}
-
-static ptl_err_t
-klonal_recv(lib_nal_t    *nal,
-            void         *private,
-            lib_msg_t    *libmsg,
-            unsigned int  niov,
-            struct iovec *iov,
-            size_t        offset,
-            size_t        mlen,
-            size_t        rlen)
-{
-        klo_desc_t *klod = (klo_desc_t *)private;
-
-        /* I only handle mapped->mapped matches */
-        LASSERT(klod->klod_type == KLOD_IOV);
-
-        if (mlen == 0)
-                return PTL_OK;
-
-        while (offset >= iov->iov_len) {
-                offset -= iov->iov_len;
-                iov++;
-                niov--;
-                LASSERT(niov > 0);
-        }
-        
-        while (klod->klod_offset >= klod->klod_iov.iov->iov_len) {
-                klod->klod_offset -= klod->klod_iov.iov->iov_len;
-                klod->klod_iov.iov++;
-                klod->klod_niov--;
-                LASSERT(klod->klod_niov > 0);
-        }
-        
-        do {
-                int fraglen = MIN(iov->iov_len - offset,
-                                  klod->klod_iov.iov->iov_len - klod->klod_offset);
-
-                LASSERT(niov > 0);
-                LASSERT(klod->klod_niov > 0);
-
-                if (fraglen > mlen)
-                        fraglen = mlen;
-                
-                memcpy((void *)((unsigned long)iov->iov_base + offset),
-                       (void *)((unsigned long)klod->klod_iov.iov->iov_base +
-                                klod->klod_offset),
-                       fraglen);
-
-                if (offset + fraglen < iov->iov_len) {
-                        offset += fraglen;
-                } else {
-                        offset = 0;
-                        iov++;
-                        niov--;
-                }
-
-                if (klod->klod_offset + fraglen < klod->klod_iov.iov->iov_len ) {
-                        klod->klod_offset += fraglen;
-                } else {
-                        klod->klod_offset = 0;
-                        klod->klod_iov.iov++;
-                        klod->klod_niov--;
-                }
-
-                mlen -= fraglen;
-        } while (mlen > 0);
-        
-        lib_finalize(&klonal_lib, private, libmsg, PTL_OK);
-        return PTL_OK;
-}
-
-static ptl_err_t
-klonal_recv_pages(lib_nal_t    *nal,
-                  void         *private,
-                  lib_msg_t    *libmsg,
-                  unsigned int  niov,
-                  ptl_kiov_t   *kiov,
-                  size_t        offset,
-                  size_t        mlen,
-                  size_t        rlen)
-{
-        void          *srcaddr = NULL;
-        void          *dstaddr = NULL;
-        unsigned long  srcfrag = 0;
-        unsigned long  dstfrag = 0;
-        unsigned long  fraglen;
-        klo_desc_t    *klod = (klo_desc_t *)private;
-
-        /* I only handle unmapped->unmapped matches */
-        LASSERT(klod->klod_type == KLOD_KIOV);
-
-        if (mlen == 0)
-                return PTL_OK;
-
-        while (offset >= kiov->kiov_len) {
-                offset -= kiov->kiov_len;
-                kiov++;
-                niov--;
-                LASSERT(niov > 0);
-        }
-
-        while (klod->klod_offset >= klod->klod_iov.kiov->kiov_len) {
-                klod->klod_offset -= klod->klod_iov.kiov->kiov_len;
-                klod->klod_iov.kiov++;
-                klod->klod_niov--;
-                LASSERT(klod->klod_niov > 0);
-        }
-
-        do {
-        /* CAVEAT EMPTOR: I kmap 2 pages at once == slight risk of deadlock */
-                LASSERT(niov > 0);
-                if (dstaddr == NULL) {
-                        dstaddr = (void *)((unsigned long)kmap(kiov->kiov_page) +
-                                           kiov->kiov_offset + offset);
-                        dstfrag = kiov->kiov_len -  offset;
-                }
-
-                LASSERT(klod->klod_niov > 0);
-                if (srcaddr == NULL) {
-                        srcaddr = (void *)((unsigned long)kmap(klod->klod_iov.kiov->kiov_page) +
-                                           klod->klod_iov.kiov->kiov_offset + klod->klod_offset);
-                        srcfrag = klod->klod_iov.kiov->kiov_len - klod->klod_offset;
-                }
-                
-                fraglen = MIN(srcfrag, dstfrag);
-                if (fraglen > mlen)
-                        fraglen = mlen;
-                
-                memcpy(dstaddr, srcaddr, fraglen);
-                
-                if (fraglen < dstfrag) {
-                        dstfrag -= fraglen;
-                        dstaddr = (void *)((unsigned long)dstaddr + fraglen);
-                } else {
-                        kunmap(kiov->kiov_page);
-                        dstaddr = NULL;
-                        offset = 0;
-                        kiov++;
-                        niov--;
-                }
-
-                if (fraglen < srcfrag) {
-                        srcfrag -= fraglen;
-                        srcaddr = (void *)((unsigned long)srcaddr + fraglen);
-                } else {
-                        kunmap(klod->klod_iov.kiov->kiov_page);
-                        srcaddr = NULL;
-                        klod->klod_offset = 0;
-                        klod->klod_iov.kiov++;
-                        klod->klod_niov--;
-                }
-
-                mlen -= fraglen;
-        } while (mlen > 0);
-
-        if (dstaddr != NULL)
-                kunmap(kiov->kiov_page);
-
-        if (srcaddr != NULL)
-                kunmap(klod->klod_iov.kiov->kiov_page);
-
-        lib_finalize(&klonal_lib, private, libmsg, PTL_OK);
-        return PTL_OK;
-}
-
-lib_nal_t klonal_lib =
-{
-        libnal_data:       &klonal_data,         /* NAL private data */
-        libnal_send:        klonal_send,
-        libnal_send_pages:  klonal_send_pages,
-        libnal_recv:        klonal_recv,
-        libnal_recv_pages:  klonal_recv_pages,
-        libnal_dist:        klonal_dist
-};
diff --git a/lnet/klnds/mxlnd/.cvsignore b/lnet/klnds/mxlnd/.cvsignore
new file mode 100644 (file)
index 0000000..26bf56c
--- /dev/null
@@ -0,0 +1,11 @@
+.deps
+Makefile
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.flags
+.tmp_versions
+.depend
+
diff --git a/lnet/klnds/mxlnd/Makefile.in b/lnet/klnds/mxlnd/Makefile.in
new file mode 100644 (file)
index 0000000..378dbdd
--- /dev/null
@@ -0,0 +1,6 @@
+MODULES := kmxlnd
+kmxlnd-objs := mxlnd.o mxlnd_cb.o mxlnd_modparams.o
+
+EXTRA_POST_CFLAGS := @MXCPPFLAGS@
+
+@INCLUDE_RULES@
diff --git a/lnet/klnds/mxlnd/README b/lnet/klnds/mxlnd/README
new file mode 100644 (file)
index 0000000..cc87e7a
--- /dev/null
@@ -0,0 +1,190 @@
+*************************************************************************
+*                                                                       *
+*    Myrinet Express Lustre Networking Driver (MXLND) documentation     *
+*                                                                       *
+*************************************************************************
+
+README of MXLND
+
+MXLND provides support for Myricom's Myrinet Express (MX) communication
+layer in Lustre.
+
+MXLND may be used with either MX-10G or MX-2G. See MX's README for
+supported NICs.
+
+Table of Contents:
+    I. Installation
+       1. Configuring and compiling
+       2. Module Parameters
+   II. MXLND Performance
+  III. Caveats
+       1. Systems with different page sizes
+       2. Multi-homing
+       3. MX endpoint collision
+   IV. License
+    V. Support
+
+================
+I. Installation
+================
+
+MXLND is supported on Linux 2.6. It may be possible to run it on 2.4,
+but it has not been tested. MXLND requires Myricom's MX version 1.2.1
+or higher. See MX's README for the supported list of processors.
+
+1. Configuring and compiling
+
+MXLND should already be integrated into the Lustre build process. To 
+build MXLND, you will need to set the path to your MX installation
+in Lustre's ./configure:
+
+    --with-mx=/opt/mx
+
+replacing /opt with the actual path. Configure will check to ensure that
+the MX version has the required functions. If not, it will fail to build.
+To check if MXLND built, look for:
+
+    checking whether to enable Myrinet MX support... yes
+
+in configure's output or the presence of Makefile in
+$LUSTRE/lnet/klnds/mxlnd.
+
+2. Module Parameters
+
+MXLND supports a number of load-time parameters using Linux's module
+parameter system. On our test systems, we created the following file:
+
+    /etc/modprobe.d/kmxlnd
+
+On some (older?) systems, you may need to modify /etc/modprobe.conf.
+
+The available options are:
+
+    n_waitd     # of completion daemons
+    max_peers   maximum number of peers that may connect
+    cksum       set non-zero to enable small message (< 4KB) checksums
+    ntx         # of total tx message descriptors
+    credits     # concurrent sends to a single peer
+    board       index value of the Myrinet board (NIC)
+    ep_id       MX endpoint ID
+    polling     Use 0 to block (wait). A value > 0 will poll that many times before blocking
+    hosts       IP-to-hostname resolution file
+
+Of these, only hosts is required. It must be the absolute path to the
+MXLND hosts file. For example:
+
+    options kmxlnd hosts=/etc/hosts.mxlnd
+
+The file format for the hosts file is as follows:
+
+IP  HOST  BOARD   EP_ID
+
+The values must be space and/or tab separated where:
+
+    IP is a valid IPv4 address
+    HOST is the name returned by `hostname` on that machine
+    BOARD is the index of the Myricom NIC (0 for the first card, etc.)
+    EP_ID is the MX endpoint ID
+
+You may want to vary the remaining options to obtain the optimal performance
+for your platform.
+
+    n_waitd sets the number of threads that process completed MX requests
+(sends and receives). In our testing, the default of 1 performed best.
+
+    max_peers tells MXLND the upper limit of machines that it will need to 
+communicate with. This affects how many receives it will pre-post and each
+receive will use one page of memory. Ideally, on clients, this value will
+be equal to the total number of Lustre servers (MDS and OSS). On servers,
+it needs to equal the total number of machines in the storage system.
+
+    cksum turns on small message checksums. It can be used to aid in trouble-
+shooting. MX also provides an optional checksumming feature which can check 
+all messages (large and small). See the MX README for details.
+
+    ntx is the number of total sends in flight from this machine. In actuality,
+MXLND reserves half of them for connect messages so make this value twice as large
+as you want for the total number of sends in flight.
+
+    credits is the number of in-flight messages for a specific peer. This is part
+of the flow-control system in Lustre. Increasing this value may improve performance
+but it requires more memory since each message requires at least one page.
+
+    board is the index of the Myricom NIC. Hosts can have multiple Myricom NICs
+and this identifies which one MXLND should use. This value must match the board
+value in your MXLND hosts file for this host.
+
+    ep_id is the MX endpoint ID. Each process that uses MX is required to have at
+least one MX endpoint to access the MX library and NIC. The ID is a simple index
+starting at 0. This value must match the endpoint ID value in your MXLND hosts 
+file for this host.
+
+    polling determines whether this host will poll or block for MX request com-
+pletions. A value of 0 blocks and any positive value will poll that many times
+before blocking. Since polling increases CPU usage, we suggest you set this to
+0 on the client and experiment with different values for servers.
+
+=====================
+II. MXLND Performance
+=====================
+
+On MX-2G systems, MXLND should easily saturate the link and use minimal CPU 
+(5-10% for read and write operations). On MX-10G systems, MXLND can saturate 
+the link and use moderate CPU resources (20-30% for read and write operations).
+MX-10G relies on PCI-Express which is relatively new and performance varies
+considerably by processor, motherboard and PCI-E chipset. Refer to Myricom's
+website for the latest DMA read/write performance results by motherboard. The
+DMA results will place an upper-bound on MXLND performance.
+
+============
+III. Caveats
+============
+
+1. Systems with different page sizes
+
+MXLND will set the maximum small message size equal to the kernel's page size.
+This means that machines running MXLND that have different page sizes are not
+able to communicate with each other. If you wish to run MXLND in this case,
+send email to help@myri.com.
+
+2. Multi-homing
+
+At this time, MXLND cannot drive more than one interface at a time.  Thus,
+a single Lustre router cannot route between two MX-10G, between two MX-2G, or
+between MX-10G and MX-2G fabrics.
+
+3. MX endpoint collision
+
+Each process that uses MX is required to have at least one MX endpoint to
+access the MX library and NIC. Other processes may need to use MX and no two
+processes can use the same endpoint ID.  MPICH-MX dynamically chooses one at
+MPI startup and should not interfere with MXLND. Sockets-MX, on the other hand,
+is hard coded to use 0 for its ID. If it is possible that anyone will want to
+run Sockets-MX on this system, use a non-0 value for MXLND's endpoint ID.
+
+
+===========
+IV. License
+===========
+
+MXLND is copyright (C) 2006 of Myricom, Inc. 
+
+MXLND is part of Lustre, http://www.lustre.org.
+
+MXLND is free software; you can redistribute it and/or modify it under the
+terms of version 2 of the GNU General Public License as published by the Free
+Software Foundation.
+
+MXLND is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+Lustre; if not, write to the Free Software Foundation, Inc., 675 Mass Ave,
+Cambridge, MA 02139, USA.
+
+==========
+V. Support
+==========
+
+If you have questions about MXLND, please contact help@myri.com.
similarity index 52%
rename from lnet/router/autoMakefile.am
rename to lnet/klnds/mxlnd/autoMakefile.am
index 070b008..1d94f86 100644 (file)
@@ -4,14 +4,10 @@
 # See the file COPYING in this distribution
 
 if MODULES
-if !CRAY_PORTALS
-
-if LINUX
-modulenet_DATA = kptlrouter$(KMODEXT)
-endif
-
+if BUILD_MXLND
+modulenet_DATA = kmxlnd$(KMODEXT)
 endif
 endif
 
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES = $(kptlrouter-objs:%.o=%.c) router.h
+MOSTLYCLEANFILES = @MOSTLYCLEANFILES@
+DIST_SOURCES = $(kmxlnd-objs:%.o=%.c) mxlnd.h
diff --git a/lnet/klnds/mxlnd/mxlnd.c b/lnet/klnds/mxlnd/mxlnd.c
new file mode 100644 (file)
index 0000000..bb6991d
--- /dev/null
@@ -0,0 +1,920 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ * Copyright (C) 2006 Myricom, Inc.
+ *   Author: Scott Atchley <atchley at myri.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "mxlnd.h"
+
+lnd_t the_kmxlnd = {
+        .lnd_type       = MXLND,
+        .lnd_startup    = mxlnd_startup,
+        .lnd_shutdown   = mxlnd_shutdown,
+        .lnd_ctl        = mxlnd_ctl,
+        .lnd_send       = mxlnd_send,
+        .lnd_recv       = mxlnd_recv,
+};
+
+kmx_data_t               kmxlnd_data;
+
+/**
+ * mxlnd_ctx_free - free ctx struct
+ * @ctx - a kmx_peer pointer
+ *
+ * The calling function should remove the ctx from the ctx list first
+ * then free it.
+ */
+void
+mxlnd_ctx_free(struct kmx_ctx *ctx)
+{
+        if (ctx == NULL) return;
+
+        if (ctx->mxc_page != NULL) {
+                __free_page(ctx->mxc_page);
+                spin_lock(&kmxlnd_data.kmx_global_lock);
+                kmxlnd_data.kmx_mem_used -= MXLND_EAGER_SIZE;
+                spin_unlock(&kmxlnd_data.kmx_global_lock);
+        }
+
+        if (ctx->mxc_seg_list != NULL) {
+                LASSERT(ctx->mxc_nseg > 0);
+                MXLND_FREE(ctx->mxc_seg_list, ctx->mxc_nseg * sizeof(mx_ksegment_t));
+        }
+
+        MXLND_FREE (ctx, sizeof (*ctx));
+        return;
+}
+
+/**
+ * mxlnd_ctx_alloc - allocate and initialize a new ctx struct
+ * @ctxp - address of a kmx_ctx pointer
+ *
+ * Returns 0 on success and -EINVAL, -ENOMEM on failure
+ */
+int
+mxlnd_ctx_alloc(struct kmx_ctx **ctxp, enum kmx_req_type type)
+{
+        int             ret     = 0;
+        struct kmx_ctx  *ctx    = NULL;
+
+        if (ctxp == NULL) return -EINVAL;
+
+        MXLND_ALLOC(ctx, sizeof (*ctx));
+        if (ctx == NULL) {
+                CDEBUG(D_NETERROR, "Cannot allocate ctx\n");
+                return -ENOMEM;
+        }
+        memset(ctx, 0, sizeof(*ctx));
+        spin_lock_init(&ctx->mxc_lock);
+
+        ctx->mxc_type = type;
+        ctx->mxc_page = alloc_page (GFP_KERNEL);
+        if (ctx->mxc_page == NULL) {
+                CDEBUG(D_NETERROR, "Can't allocate page\n");
+                ret = -ENOMEM;
+                goto failed;
+        }
+        spin_lock(&kmxlnd_data.kmx_global_lock);
+        kmxlnd_data.kmx_mem_used += MXLND_EAGER_SIZE;
+        spin_unlock(&kmxlnd_data.kmx_global_lock);
+        ctx->mxc_msg = (struct kmx_msg *)((char *)page_address(ctx->mxc_page));
+        ctx->mxc_seg.segment_ptr = MX_PA_TO_U64(lnet_page2phys(ctx->mxc_page));
+        ctx->mxc_state = MXLND_CTX_IDLE;
+
+        *ctxp = ctx;
+        return 0;
+
+failed:
+        mxlnd_ctx_free(ctx);
+        return ret;
+}
+
+/**
+ * mxlnd_ctx_init - reset ctx struct to the default values
+ * @ctx - a kmx_ctx pointer
+ */
+void
+mxlnd_ctx_init(struct kmx_ctx *ctx)
+{
+        if (ctx == NULL) return;
+
+        /* do not change mxc_type */
+        ctx->mxc_incarnation = 0;
+        ctx->mxc_deadline = 0;
+        ctx->mxc_state = MXLND_CTX_IDLE;
+        /* ignore mxc_global_list */
+        if (ctx->mxc_list.next != NULL && !list_empty(&ctx->mxc_list)) {
+                if (ctx->mxc_peer != NULL)
+                        spin_lock(&ctx->mxc_lock);
+                list_del_init(&ctx->mxc_list);
+                if (ctx->mxc_peer != NULL)
+                        spin_unlock(&ctx->mxc_lock);
+        }
+        /* ignore mxc_rx_list */
+        /* ignore mxc_lock */
+        ctx->mxc_nid = 0;
+        ctx->mxc_peer = NULL;
+        ctx->mxc_conn = NULL;
+        /* ignore mxc_msg */
+        /* ignore mxc_page */
+        ctx->mxc_lntmsg[0] = NULL;
+        ctx->mxc_lntmsg[1] = NULL;
+        ctx->mxc_msg_type = 0;
+        ctx->mxc_cookie = 0LL;
+        ctx->mxc_match = 0LL;
+        /* ctx->mxc_seg.segment_ptr points to mxc_page */
+        ctx->mxc_seg.segment_length = 0;
+        if (ctx->mxc_seg_list != NULL) {
+                LASSERT(ctx->mxc_nseg > 0);
+                MXLND_FREE(ctx->mxc_seg_list, ctx->mxc_nseg * sizeof(mx_ksegment_t));
+        }
+        ctx->mxc_seg_list = NULL;
+        ctx->mxc_nseg = 0;
+        ctx->mxc_nob = 0;
+        ctx->mxc_mxreq = NULL;
+        memset(&ctx->mxc_status, 0, sizeof(mx_status_t));
+        /* ctx->mxc_get */
+        /* ctx->mxc_put */
+
+        ctx->mxc_msg->mxm_type = 0;
+        ctx->mxc_msg->mxm_credits = 0;
+        ctx->mxc_msg->mxm_nob = 0;
+        ctx->mxc_msg->mxm_seq = 0;
+
+        return;
+}
+
+/**
+ * mxlnd_free_txs - free kmx_txs and associated pages
+ *
+ * Called from mxlnd_shutdown()
+ */
+void
+mxlnd_free_txs(void)
+{
+        struct kmx_ctx          *tx     = NULL;
+        struct kmx_ctx          *next   = NULL;
+
+        list_for_each_entry_safe(tx, next, &kmxlnd_data.kmx_txs, mxc_global_list) {
+                list_del_init(&tx->mxc_global_list);
+                mxlnd_ctx_free(tx);
+        }
+        return;
+}
+
+/**
+ * mxlnd_init_txs - allocate tx descriptors then stash on txs and idle tx lists
+ *
+ * Called from mxlnd_startup()
+ * returns 0 on success, else -ENOMEM
+ */
+int
+mxlnd_init_txs(void)
+{
+        int             ret     = 0;
+        int             i       = 0;
+        struct kmx_ctx  *tx      = NULL;
+
+        for (i = 0; i < *kmxlnd_tunables.kmx_ntx; i++) {
+                ret = mxlnd_ctx_alloc(&tx, MXLND_REQ_TX);
+                if (ret != 0) {
+                        mxlnd_free_txs();
+                        return ret;
+                }
+                mxlnd_ctx_init(tx);
+                /* in startup(), no locks required */
+                list_add_tail(&tx->mxc_global_list, &kmxlnd_data.kmx_txs);
+                list_add_tail(&tx->mxc_list, &kmxlnd_data.kmx_tx_idle);
+        }
+        return 0;
+}
+
+/**
+ * mxlnd_free_rxs - free initial kmx_rx descriptors and associated pages
+ *
+ * Called from mxlnd_shutdown()
+ */
+void
+mxlnd_free_rxs(void)
+{
+        struct kmx_ctx          *rx     = NULL;
+        struct kmx_ctx          *next   = NULL;
+
+        list_for_each_entry_safe(rx, next, &kmxlnd_data.kmx_rxs, mxc_global_list) {
+                list_del_init(&rx->mxc_global_list);
+                mxlnd_ctx_free(rx);
+        }
+        return;
+}
+
+/**
+ * mxlnd_init_rxs - allocate initial rx descriptors 
+ *
+ * Called from startup(). We create MXLND_MAX_PEERS plus MXLND_NTX
+ * rx descriptors. We create one for each potential peer to handle 
+ * the initial connect request. We create one for each tx in case the 
+ * send requires a non-eager receive.
+ *
+ * Returns 0 on success, else -ENOMEM
+ */
+int
+mxlnd_init_rxs(void)
+{
+        int             ret     = 0;
+        int             i       = 0;
+        struct kmx_ctx  *rx      = NULL;
+
+        for (i = 0; i < (*kmxlnd_tunables.kmx_ntx + *kmxlnd_tunables.kmx_max_peers); i++) {
+                ret = mxlnd_ctx_alloc(&rx, MXLND_REQ_RX);
+                if (ret != 0) {
+                        mxlnd_free_rxs();
+                        return ret;
+                }
+                mxlnd_ctx_init(rx);
+                /* in startup(), no locks required */
+                list_add_tail(&rx->mxc_global_list, &kmxlnd_data.kmx_rxs);
+                list_add_tail(&rx->mxc_list, &kmxlnd_data.kmx_rx_idle);
+        }
+        return 0;
+}
+
+/**
+ * mxlnd_free_peers - free peers
+ *
+ * Called from mxlnd_shutdown()
+ */
+void
+mxlnd_free_peers(void)
+{
+        int                      i      = 0;
+        struct kmx_peer         *peer   = NULL;
+        struct kmx_peer         *next   = NULL;
+
+        for (i = 0; i < MXLND_HASH_SIZE; i++) {
+                list_for_each_entry_safe(peer, next, &kmxlnd_data.kmx_peers[i], mxp_peers) {
+                        list_del_init(&peer->mxp_peers);
+                        if (peer->mxp_conn) mxlnd_conn_decref(peer->mxp_conn);
+                        mxlnd_peer_decref(peer);
+                }
+        }
+}
+
+int
+mxlnd_host_alloc(struct kmx_host **hostp)
+{
+        struct kmx_host *host   = NULL;
+
+        MXLND_ALLOC(host, sizeof (*host));
+        if (host == NULL) {
+                CDEBUG(D_NETERROR, "Cannot allocate host\n");
+                return -1;
+        }
+        memset(host, 0, sizeof(*host));
+        spin_lock_init(&host->mxh_lock);
+
+        *hostp = host;
+
+        return 0;
+}
+
+void
+mxlnd_host_free(struct kmx_host *host)
+{
+        if (host == NULL) return;
+
+        if (host->mxh_hostname != NULL)
+                MXLND_FREE(host->mxh_hostname, strlen(host->mxh_hostname) + 1);
+
+        MXLND_FREE(host, sizeof(*host));
+        return;
+}
+
+/**
+ * mxlnd_free_hosts - free kmx_hosts
+ *
+ * Called from mxlnd_shutdown()
+ */
+void
+mxlnd_free_hosts(void)
+{
+        struct kmx_host         *host   = NULL;
+        struct kmx_host         *next   = NULL;
+
+        list_for_each_entry_safe(host, next, &kmxlnd_data.kmx_hosts, mxh_list) {
+                list_del_init(&host->mxh_list);
+                mxlnd_host_free(host);
+        }
+        return;
+}
+
+#define xstr(s) #s
+#define str(s) xstr(s)
+#define MXLND_MAX_BOARD 4       /* we expect hosts to have fewer NICs than this */
+#define MXLND_MAX_EP_ID 16      /* we expect hosts to have fewer endpoints than this */
+
+/* this parses a line that consists of:
+ * 
+ * IP              HOSTNAME           BOARD        ENDPOINT ID
+ * 169.192.0.113   mds01              0            3
+ * 
+ * By default MX uses the alias (short hostname). If you override
+ * it using mx_hostname to use the FQDN or some other name, the hostname
+ * here must match exactly.
+ */
+
+/* MX_MAX_HOSTNAME_LEN = 80. See myriexpress.h */
+int
+mxlnd_parse_line(char *line)
+{
+        int             i               = 0;
+        int             ret             = 0;
+        int             len             = 0;
+        u32             ip[4]           = { 0, 0, 0, 0 };
+        char            hostname[MX_MAX_HOSTNAME_LEN];
+        u32             board           = -1;
+        u32             ep_id           = -1;
+        struct kmx_host *host           = NULL;
+
+        if (line == NULL) return -1;
+
+        len = strlen(line);
+
+        if (len == 0) return -1;
+
+        /* convert tabs to spaces */
+        for (i = 0; i < len; i++) {
+                if (line[i] == '\t') line[i] = ' ';
+        }
+
+        memset(&hostname, 0 , sizeof(hostname));
+        ret = sscanf(line, "%d.%d.%d.%d %" str(MX_MAX_HOSTNAME_LEN) "s %d %d", 
+                     &ip[0], &ip[1], &ip[2], &ip[3], hostname, &board, &ep_id);
+
+        if (ret != 7) {
+                return -1;
+        }
+
+        /* check for valid values */
+        /* we assume a valid IP address (all <= 255), number of NICs,
+         * and number of endpoint IDs */
+        if (ip[0] > 255 || ip [1] > 255 || ip[2] > 255 || ip[3] > 255 ||
+            board > MXLND_MAX_BOARD || ep_id > MXLND_MAX_EP_ID) {
+                CDEBUG(D_NETERROR, "Illegal value in \"%s\". Ignoring "
+                                   "this host.\n", line);
+                return -1;
+        }
+
+        ret = mxlnd_host_alloc(&host);
+        if (ret != 0) return -1;
+
+        host->mxh_addr = ((ip[0]<<24)|(ip[1]<<16)|(ip[2]<<8)|ip[3]);
+        len = strlen(hostname);
+        MXLND_ALLOC(host->mxh_hostname, len + 1);
+        memset(host->mxh_hostname, 0, len + 1);
+        strncpy(host->mxh_hostname, hostname, len);
+        host->mxh_board = board;
+        host->mxh_ep_id = ep_id;
+
+        spin_lock(&kmxlnd_data.kmx_hosts_lock);
+        list_add_tail(&host->mxh_list, &kmxlnd_data.kmx_hosts);
+        spin_unlock(&kmxlnd_data.kmx_hosts_lock);
+
+        return 0;
+}
+
+void
+mxlnd_print_hosts(void)
+{
+#if MXLND_DEBUG
+        struct kmx_host         *host   = NULL;
+
+        list_for_each_entry(host, &kmxlnd_data.kmx_hosts, mxh_list) {
+                int             ip[4];
+                u32             addr    = host->mxh_addr;
+
+                ip[0] = (addr >> 24) & 0xff;
+                ip[1] = (addr >> 16) & 0xff;
+                ip[2] = (addr >>  8) & 0xff;
+                ip[3] = addr & 0xff;
+                CDEBUG(D_NET, "\tip= %d.%d.%d.%d\n\thost= %s\n\tboard= %d\n\tep_id= %d\n\n",
+                            ip[0], ip[1], ip[2], ip[3],
+                            host->mxh_hostname, host->mxh_board, host->mxh_ep_id);
+        }
+#endif
+        return;
+}
+
+#define MXLND_BUFSIZE (PAGE_SIZE - 1)
+
+int
+mxlnd_parse_hosts(char *filename)
+{
+        int             ret             = 0;
+        s32             size            = 0;
+        s32             bufsize         = MXLND_BUFSIZE;
+        s32             allocd          = 0;
+        loff_t          offset          = 0;
+        struct file     *filp           = NULL;
+        char            *buf            = NULL;
+        s32             buf_off         = 0;
+        char            *sep            = NULL;
+        char            *line           = NULL;
+
+        if (filename == NULL) return -1;
+
+        filp = filp_open(filename, O_RDONLY, 0);
+        if (IS_ERR(filp)) {
+                CERROR("filp_open() failed for %s\n", filename);
+                return -1;
+        }
+
+        size = (s32) filp->f_dentry->d_inode->i_size;
+        if (size < MXLND_BUFSIZE) bufsize = size;
+        allocd = bufsize;
+        MXLND_ALLOC(buf, allocd + 1);
+        if (buf == NULL) {
+                CERROR("Cannot allocate buf\n");
+                filp_close(filp, current->files);
+                return -1;
+        }
+
+        while (offset < size) {
+                memset(buf, 0, bufsize + 1);
+                ret = kernel_read(filp, (unsigned long) offset, buf, (unsigned long) bufsize);
+                if (ret < 0) {
+                        CDEBUG(D_NETERROR, "kernel_read() returned %d - closing %s\n", ret, filename);
+                        filp_close(filp, current->files);
+                        MXLND_FREE(buf, allocd + 1);
+                        return -1;
+                }
+
+                if (ret < bufsize) bufsize = ret;
+                buf_off = 0;
+                while (buf_off < bufsize) {
+                        sep = strchr(buf + buf_off, '\n');
+                        if (sep != NULL) {
+                                /* we have a line */
+                                line = buf + buf_off;
+                                *sep = '\0';
+                                ret = mxlnd_parse_line(line);
+                                if (ret != 0 && strlen(line) != 0) {
+                                        CDEBUG(D_NETERROR, "Failed to parse \"%s\". Ignoring this host.\n", line);
+                                }
+                                buf_off += strlen(line) + 1;
+                        } else {
+                                /* last line or we need to read more */
+                                line = buf + buf_off;
+                                ret = mxlnd_parse_line(line);
+                                if (ret != 0) {
+                                        bufsize -= strlen(line) + 1;
+                                }
+                                buf_off += strlen(line) + 1;
+                        }
+                }
+                offset += bufsize;
+                bufsize = MXLND_BUFSIZE;
+        }
+
+        MXLND_FREE(buf, allocd + 1);
+        filp_close(filp, current->files);
+        mxlnd_print_hosts();
+
+        return 0;
+}
+
+/**
+ * mxlnd_init_mx - open the endpoint, set our ID, register the EAGER callback
+ * @ni - the network interface
+ *
+ * Returns 0 on success, -1 on failure
+ */
+int
+mxlnd_init_mx(lnet_ni_t *ni)
+{
+        int                     ret     = 0;
+        int                     found   = 0;
+        mx_return_t             mxret;
+        mx_endpoint_addr_t      addr;
+        u32                     board   = *kmxlnd_tunables.kmx_board;
+        u32                     ep_id   = *kmxlnd_tunables.kmx_ep_id;
+        u64                     nic_id  = 0LL;
+        struct kmx_host         *host   = NULL;
+
+        mxret = mx_init();
+        if (mxret != MX_SUCCESS) {
+                CERROR("mx_init() failed with %s (%d)\n", mx_strerror(mxret), mxret);
+                return -1;
+        }
+
+        ret = mxlnd_parse_hosts(*kmxlnd_tunables.kmx_hosts);
+        if (ret != 0) {
+                if (*kmxlnd_tunables.kmx_hosts != NULL) {
+                        CERROR("mxlnd_parse_hosts(%s) failed\n", *kmxlnd_tunables.kmx_hosts);
+                }
+                mx_finalize();
+                return -1;
+        }
+
+        list_for_each_entry(host, &kmxlnd_data.kmx_hosts, mxh_list) {
+                if (strcmp(host->mxh_hostname, system_utsname.nodename) == 0) {
+                        /* override the defaults and module parameters with 
+                         * the info from the hosts file */
+                        board = host->mxh_board;
+                        ep_id = host->mxh_ep_id;
+                        kmxlnd_data.kmx_localhost = host;
+                        CDEBUG(D_NET, "my hostname is %s board %d ep_id %d\n", kmxlnd_data.kmx_localhost->mxh_hostname, kmxlnd_data.kmx_localhost->mxh_board, kmxlnd_data.kmx_localhost->mxh_ep_id);
+                        found = 1;
+                        break;
+                }
+        }
+
+        if (found == 0) {
+                CERROR("no host entry found for localhost\n");
+                mx_finalize();
+                return -1;
+        }
+
+        mxret = mx_open_endpoint(board, ep_id, MXLND_MSG_MAGIC, 
+                                 NULL, 0, &kmxlnd_data.kmx_endpt);
+        if (mxret != MX_SUCCESS) {
+                CERROR("mx_open_endpoint() failed with %d\n", mxret);
+                mx_finalize();
+                return -1;
+        }
+
+        mx_get_endpoint_addr(kmxlnd_data.kmx_endpt, &addr);
+        mx_decompose_endpoint_addr(addr, &nic_id, &ep_id);
+
+        LASSERT(host != NULL);
+        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), host->mxh_addr);
+
+        CDEBUG(D_NET, "My NID is 0x%llx\n", ni->ni_nid);
+
+        /* this will catch all unexpected receives. */
+        mxret = mx_register_unexp_handler(kmxlnd_data.kmx_endpt,
+                                          (mx_unexp_handler_t) mxlnd_unexpected_recv,
+                                          NULL);
+        if (mxret != MX_SUCCESS) {
+                CERROR("mx_register_unexp_callback() failed with %s\n", 
+                         mx_strerror(mxret));
+                mx_close_endpoint(kmxlnd_data.kmx_endpt);
+                mx_finalize();
+                return -1;
+        }
+        mxret = mx_set_request_timeout(kmxlnd_data.kmx_endpt, NULL, MXLND_COMM_TIMEOUT/HZ);
+        if (mxret != MX_SUCCESS) {
+                CERROR("mx_set_request_timeout() failed with %s\n", 
+                        mx_strerror(mxret));
+                mx_close_endpoint(kmxlnd_data.kmx_endpt);
+                mx_finalize();
+                return -1;
+        }
+        return 0;
+}
+
+
+/**
+ * mxlnd_thread_start - spawn a kernel thread with this function
+ * @fn - function pointer
+ * @arg - pointer to the parameter data
+ *
+ * Returns 0 on success and a negative value on failure
+ */
+int
+mxlnd_thread_start(int (*fn)(void *arg), void *arg)
+{
+        int     pid = 0;
+        int     i   = (int) ((long) arg);
+
+        atomic_inc(&kmxlnd_data.kmx_nthreads);
+        init_completion(&kmxlnd_data.kmx_completions[i]);
+
+        pid = kernel_thread (fn, arg, 0);
+        if (pid <= 0) {
+                CERROR("mx_thread_start() failed with %d\n", pid);
+                atomic_dec(&kmxlnd_data.kmx_nthreads);
+        }
+        return pid;
+}
+
+/**
+ * mxlnd_thread_stop - decrement thread counter
+ *
+ * The thread returns 0 when it detects shutdown.
+ * We are simply decrementing the thread counter.
+ */
+void
+mxlnd_thread_stop(long id)
+{
+        int     i       = (int) id;
+        atomic_dec (&kmxlnd_data.kmx_nthreads);
+        complete(&kmxlnd_data.kmx_completions[i]);
+}
+
+/**
+ * mxlnd_shutdown - stop IO, clean up state
+ * @ni - LNET interface handle
+ *
+ * No calls to the LND should be made after calling this function.
+ */
+void
+mxlnd_shutdown (lnet_ni_t *ni)
+{
+        int             i               = 0;
+
+        LASSERT (ni == kmxlnd_data.kmx_ni);
+        LASSERT (ni->ni_data == &kmxlnd_data);
+        CDEBUG(D_NET, "in shutdown()\n");
+
+        CDEBUG(D_MALLOC, "before MXLND cleanup: libcfs_kmemory %d "
+                         "kmx_mem_used %ld\n", atomic_read (&libcfs_kmemory), 
+                         kmxlnd_data.kmx_mem_used);
+
+        switch (kmxlnd_data.kmx_init) {
+
+        case MXLND_INIT_ALL:
+
+                CDEBUG(D_NET, "setting shutdown = 1\n");
+                /* set shutdown and wakeup request_waitds */
+                kmxlnd_data.kmx_shutdown = 1;
+                mb();
+                mx_wakeup(kmxlnd_data.kmx_endpt);
+                up(&kmxlnd_data.kmx_tx_queue_sem);
+                mxlnd_sleep(2 * HZ);
+
+                /* fall through */
+
+        case MXLND_INIT_THREADS:
+
+                CDEBUG(D_NET, "waiting on threads\n");
+                /* wait for threads to complete */
+                for (i = 0; i < MXLND_NCOMPLETIONS; i++) {
+                        wait_for_completion(&kmxlnd_data.kmx_completions[i]);
+                }
+                LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0);
+
+                CDEBUG(D_NET, "freeing completions\n");
+                MXLND_FREE(kmxlnd_data.kmx_completions, 
+                            MXLND_NCOMPLETIONS * sizeof(struct completion));
+
+                /* fall through */
+
+        case MXLND_INIT_MX:
+
+                CDEBUG(D_NET, "stopping mx\n");
+
+                /* wakeup waiters if they missed the above.
+                 * close endpoint to stop all traffic.
+                 * this will cancel and cleanup all requests, etc. */
+
+                mx_wakeup(kmxlnd_data.kmx_endpt);
+                mx_close_endpoint(kmxlnd_data.kmx_endpt);
+                mx_finalize();
+
+                CDEBUG(D_NET, "mxlnd_free_hosts();\n");
+                mxlnd_free_hosts();
+
+                /* fall through */
+
+        case MXLND_INIT_RXS:
+
+                CDEBUG(D_NET, "freeing rxs\n");
+
+                /* free all rxs and associated pages */
+                mxlnd_free_rxs();
+
+                /* fall through */
+
+        case MXLND_INIT_TXS:
+
+                CDEBUG(D_NET, "freeing txs\n");
+
+                /* free all txs and associated pages */
+                mxlnd_free_txs();
+
+                /* fall through */
+
+        case MXLND_INIT_DATA:
+
+                CDEBUG(D_NET, "freeing peers\n");
+
+                /* free peer list */
+                mxlnd_free_peers();
+
+                /* fall through */
+
+        case MXLND_INIT_NOTHING:
+                break;
+        }
+        CDEBUG(D_NET, "shutdown complete\n");
+
+        CDEBUG(D_MALLOC, "after MXLND cleanup: libcfs_kmemory %d "
+                         "kmx_mem_used %ld\n", atomic_read (&libcfs_kmemory), 
+                         kmxlnd_data.kmx_mem_used);
+
+        kmxlnd_data.kmx_init = MXLND_INIT_NOTHING;
+        PORTAL_MODULE_UNUSE;
+        return;
+}
+
+/**
+ * mxlnd_startup - initialize state, open an endpoint, start IO
+ * @ni - LNET interface handle
+ *
+ * Initialize state, open an endpoint, start monitoring threads.
+ * Should only be called once.
+ */
+int
+mxlnd_startup (lnet_ni_t *ni)
+{
+        int                     i       = 0;
+        int                     ret     = 0;
+        struct timeval          tv;
+
+        LASSERT (ni->ni_lnd == &the_kmxlnd);
+
+        /* only one MXLND instance may be active at a time */
+        if (kmxlnd_data.kmx_init != MXLND_INIT_NOTHING) {
+                CERROR("Only 1 instance supported\n");
+                return -EPERM;
+        }
+        CDEBUG(D_MALLOC, "before MXLND startup: libcfs_kmemory %d "
+                         "kmx_mem_used %ld\n", atomic_read (&libcfs_kmemory),
+                         kmxlnd_data.kmx_mem_used);
+
+        /* reserve 1/2 of tx for connect request messages */
+        ni->ni_maxtxcredits = *kmxlnd_tunables.kmx_ntx / 2;
+        ni->ni_peertxcredits = *kmxlnd_tunables.kmx_credits;
+
+        PORTAL_MODULE_USE;
+        memset (&kmxlnd_data, 0, sizeof (kmxlnd_data));
+
+        kmxlnd_data.kmx_ni = ni;
+        ni->ni_data = &kmxlnd_data;
+
+        /* incarnation (usecs since epoch) distinguishes this instance
+         * from any earlier one on the wire */
+        do_gettimeofday(&tv);
+        kmxlnd_data.kmx_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+        CDEBUG(D_NET, "my incarnation is %lld\n", kmxlnd_data.kmx_incarnation);
+
+        /* init all locks and lists before anything can use them */
+        spin_lock_init (&kmxlnd_data.kmx_global_lock);
+
+        INIT_LIST_HEAD (&kmxlnd_data.kmx_conn_req);
+        spin_lock_init (&kmxlnd_data.kmx_conn_lock);
+        sema_init(&kmxlnd_data.kmx_conn_sem, 0);
+
+        INIT_LIST_HEAD (&kmxlnd_data.kmx_hosts);
+        spin_lock_init (&kmxlnd_data.kmx_hosts_lock);
+
+        for (i = 0; i < MXLND_HASH_SIZE; i++) {
+                INIT_LIST_HEAD (&kmxlnd_data.kmx_peers[i]);
+        }
+        rwlock_init (&kmxlnd_data.kmx_peers_lock);
+
+        INIT_LIST_HEAD (&kmxlnd_data.kmx_txs);
+        INIT_LIST_HEAD (&kmxlnd_data.kmx_tx_idle);
+        spin_lock_init (&kmxlnd_data.kmx_tx_idle_lock);
+        kmxlnd_data.kmx_tx_next_cookie = 1;
+        INIT_LIST_HEAD (&kmxlnd_data.kmx_tx_queue);
+        spin_lock_init (&kmxlnd_data.kmx_tx_queue_lock);
+        sema_init(&kmxlnd_data.kmx_tx_queue_sem, 0);
+
+        INIT_LIST_HEAD (&kmxlnd_data.kmx_rxs);
+        spin_lock_init (&kmxlnd_data.kmx_rxs_lock);
+        INIT_LIST_HEAD (&kmxlnd_data.kmx_rx_idle);
+        spin_lock_init (&kmxlnd_data.kmx_rx_idle_lock);
+
+        /* from here on mxlnd_shutdown() can unwind whatever completed */
+        kmxlnd_data.kmx_init = MXLND_INIT_DATA;
+        /*****************************************************/
+
+        ret = mxlnd_init_txs();
+        if (ret != 0) {
+                CERROR("Can't alloc tx descs: %d\n", ret);
+                goto failed;
+        }
+        kmxlnd_data.kmx_init = MXLND_INIT_TXS;
+        /*****************************************************/
+
+        ret = mxlnd_init_rxs();
+        if (ret != 0) {
+                CERROR("Can't alloc rx descs: %d\n", ret);
+                goto failed;
+        }
+        kmxlnd_data.kmx_init = MXLND_INIT_RXS;
+        /*****************************************************/
+
+        ret = mxlnd_init_mx(ni);
+        if (ret != 0) {
+                CERROR("Can't init mx\n");
+                goto failed;
+        }
+
+        kmxlnd_data.kmx_init = MXLND_INIT_MX;
+        /*****************************************************/
+
+        /* start threads */
+
+        MXLND_ALLOC (kmxlnd_data.kmx_completions,
+                      MXLND_NCOMPLETIONS * sizeof(struct completion));
+        if (kmxlnd_data.kmx_completions == NULL) {
+                CERROR("failed to alloc kmxlnd_data.kmx_completions\n");
+                goto failed;
+        }
+        memset(kmxlnd_data.kmx_completions, 0,
+               MXLND_NCOMPLETIONS * sizeof(struct completion));
+
+        {
+                if (MXLND_N_SCHED > *kmxlnd_tunables.kmx_n_waitd) {
+                        *kmxlnd_tunables.kmx_n_waitd = MXLND_N_SCHED;
+                }
+                CDEBUG(D_NET, "using %d %s in mx_wait_any()\n",
+                        *kmxlnd_tunables.kmx_n_waitd,
+                        *kmxlnd_tunables.kmx_n_waitd == 1 ? "thread" : "threads");
+
+                for (i = 0; i < *kmxlnd_tunables.kmx_n_waitd; i++) {
+                        ret = mxlnd_thread_start(mxlnd_request_waitd, (void*)((long)i));
+                        if (ret < 0) {
+                                CERROR("Starting mxlnd_request_waitd[%d] failed with %d\n", i, ret);
+                                /* tell the threads already started to exit and
+                                 * wake them, otherwise wait_for_completion()
+                                 * below would block forever */
+                                kmxlnd_data.kmx_shutdown = 1;
+                                mb();
+                                mx_wakeup(kmxlnd_data.kmx_endpt);
+                                for (--i; i >= 0; i--) {
+                                        wait_for_completion(&kmxlnd_data.kmx_completions[i]);
+                                }
+                                LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0);
+                                MXLND_FREE(kmxlnd_data.kmx_completions,
+                                        MXLND_NCOMPLETIONS * sizeof(struct completion));
+
+                                goto failed;
+                        }
+                }
+                ret = mxlnd_thread_start(mxlnd_tx_queued, (void*)((long)i++));
+                if (ret < 0) {
+                        CERROR("Starting mxlnd_tx_queued failed with %d\n", ret);
+                        /* stop and reap the waitd threads (see note above) */
+                        kmxlnd_data.kmx_shutdown = 1;
+                        mb();
+                        mx_wakeup(kmxlnd_data.kmx_endpt);
+                        for (--i; i >= 0; i--) {
+                                wait_for_completion(&kmxlnd_data.kmx_completions[i]);
+                        }
+                        LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0);
+                        MXLND_FREE(kmxlnd_data.kmx_completions,
+                                MXLND_NCOMPLETIONS * sizeof(struct completion));
+                        goto failed;
+                }
+                ret = mxlnd_thread_start(mxlnd_timeoutd, (void*)((long)i++));
+                if (ret < 0) {
+                        CERROR("Starting mxlnd_timeoutd failed with %d\n", ret);
+                        /* stop the waitd threads and wake mxlnd_tx_queued,
+                         * which may be blocked on the tx queue semaphore */
+                        kmxlnd_data.kmx_shutdown = 1;
+                        mb();
+                        mx_wakeup(kmxlnd_data.kmx_endpt);
+                        up(&kmxlnd_data.kmx_tx_queue_sem);
+                        for (--i; i >= 0; i--) {
+                                wait_for_completion(&kmxlnd_data.kmx_completions[i]);
+                        }
+                        LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0);
+                        MXLND_FREE(kmxlnd_data.kmx_completions,
+                                MXLND_NCOMPLETIONS * sizeof(struct completion));
+                        goto failed;
+                }
+        }
+
+        kmxlnd_data.kmx_init = MXLND_INIT_THREADS;
+        /*****************************************************/
+
+        kmxlnd_data.kmx_init = MXLND_INIT_ALL;
+        CDEBUG(D_MALLOC, "startup complete (kmx_mem_used %ld)\n", kmxlnd_data.kmx_mem_used);
+
+        return 0;
+failed:
+        CERROR("mxlnd_startup failed\n");
+        mxlnd_shutdown (ni);
+        return (-ENETDOWN);
+}
+
+/* module entry point: register this LND with LNET */
+static int mxlnd_init(void)
+{
+        lnet_register_lnd(&the_kmxlnd);
+        return 0;
+}
+
+/* module exit point: unhook this LND from LNET */
+static void mxlnd_exit(void)
+{
+        lnet_unregister_lnd(&the_kmxlnd);
+}
+
+module_init(mxlnd_init);
+module_exit(mxlnd_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Myricom, Inc. - help@myri.com");
+MODULE_DESCRIPTION("Kernel MyrinetExpress LND");
+MODULE_VERSION("0.5.0");
diff --git a/lnet/klnds/mxlnd/mxlnd.h b/lnet/klnds/mxlnd/mxlnd.h
new file mode 100644 (file)
index 0000000..28e58ca
--- /dev/null
@@ -0,0 +1,415 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ * Copyright (C) 2006 Myricom, Inc.
+ *   Author: Scott Atchley <atchley at myri.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+#define EXPORT_SYMTAB
+#endif
+
+#include <linux/config.h>
+#include <linux/module.h>       /* module */
+#include <linux/kernel.h>       /* module */
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/init.h>         /* module */
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+#include <linux/utsname.h>
+
+#include <net/sock.h>
+#include <linux/in.h>
+
+#include <linux/netdevice.h>    /* these are needed for ARP */
+#include <linux/if_arp.h>
+#include <net/arp.h>
+#include <linux/inetdevice.h>
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include "libcfs/kp30.h"
+#include "lnet/lnet.h"
+#include "lnet/lib-lnet.h"
+
+#define MX_KERNEL 1
+#include "mx_extensions.h"
+#include "myriexpress.h"
+
+#if LNET_MAX_IOV > MX_MAX_SEGMENTS
+    #error LNET_MAX_IOV is greater then MX_MAX_SEGMENTS
+#endif
+
+/* Using MX's 64 match bits
+ * We are using the match bits to specify message type and the cookie.  The
+ * highest four bits (60-63) are reserved for message type. Below we specify
+ * the types. MXLND_MASK_ICON_REQ and MXLND_MASK_ICON_ACK are used for
+ * mx_iconnect().  We reserve the remaining combinations for future use.  The
+ * next 8 bits (52-59) are reserved for returning a status code for failed
+ * GET_DATA (payload) messages. The last 52 bits are used for cookies. That
+ * should allow unique cookies for 4 KB messages at 10 Gbps line rate without
+ * rollover for about 8 years. That should be enough. */
+
+/* constants */
+#define MXLND_MASK_ICON_REQ (0xBLL << 60) /* it is a mx_iconnect() completion */
+#define MXLND_MASK_CONN_REQ (0xCLL << 60) /* CONN_REQ msg */
+#define MXLND_MASK_ICON_ACK (0x9LL << 60) /* it is a mx_iconnect() completion */
+#define MXLND_MASK_CONN_ACK (0xALL << 60) /* CONN_ACK msg*/
+#define MXLND_MASK_EAGER    (0xELL << 60) /* EAGER msg */
+#define MXLND_MASK_NOOP     (0x1LL << 60) /* NOOP msg */
+#define MXLND_MASK_PUT_REQ  (0x2LL << 60) /* PUT_REQ msg */
+#define MXLND_MASK_PUT_ACK  (0x3LL << 60) /* PUT_ACK msg */
+#define MXLND_MASK_PUT_DATA (0x4LL << 60) /* PUT_DATA msg */
+#define MXLND_MASK_GET_REQ  (0x5LL << 60) /* GET_REQ msg */
+#define MXLND_MASK_GET_DATA (0x6LL << 60) /* GET_DATA msg */
+//#define MXLND_MASK_NAK      (0x7LL << 60) /* NAK msg */
+
+#define MXLND_MAX_COOKIE    ((1LL << 52) - 1)         /* when to roll-over the cookie value */
+#define MXLND_NCOMPLETIONS  (MXLND_N_SCHED + 2)   /* max threads for completion array */
+
+/* defaults for configurable parameters */
+#define MXLND_N_SCHED           1               /* # schedulers (mx_wait_any() threads) */
+#define MXLND_MX_BOARD          0               /* Use the first MX NIC if more than 1 avail */
+#define MXLND_MX_EP_ID          3               /* MX endpoint ID */
+#define MXLND_COMM_TIMEOUT      (20 * HZ)       /* timeout for send/recv (jiffies) */
+#define MXLND_WAIT_TIMEOUT      HZ              /* timeout for wait (jiffies) */
+#define MXLND_POLLING           0               /* poll iterations before blocking */
+#define MXLND_MAX_PEERS         1024            /* number of nodes talking to me */
+#define MXLND_EAGER_NUM         MXLND_MAX_PEERS /* number of pre-posted receives */
+#define MXLND_EAGER_SIZE        PAGE_SIZE       /* pre-posted eager message size */
+#define MXLND_MSG_QUEUE_DEPTH   8               /* msg queue depth */
+#define MXLND_CREDIT_HIGHWATER  (MXLND_MSG_QUEUE_DEPTH - 2)
+                                                /* when to send a noop to return credits */
+#define MXLND_NTX               256             /* # of kmx_tx - total sends in flight 
+                                                   1/2 are reserved for connect messages */
+
+#define MXLND_HASH_BITS         6               /* the number of bits to hash over */
+#define MXLND_HASH_SIZE         (1<<MXLND_HASH_BITS)
+                                                /* number of peer lists for lookup.
+                                                   we hash over the last N bits of
+                                                   the IP address converted to an int. */
+#define MXLND_HASH_MASK         (MXLND_HASH_SIZE - 1)
+                                                /* ensure we use only the last N bits */
+
+/* debugging features */
+#define MXLND_CKSUM             0               /* checksum kmx_msg_t */
+#define MXLND_DEBUG             0               /* turn on printk()s */
+
+extern inline void mxlnd_noop(char *s, ...);
+#if MXLND_DEBUG
+        #define MXLND_PRINT printk
+#else
+        #define MXLND_PRINT mxlnd_noop
+#endif
+
+/* provide wrappers around LIBCFS_ALLOC/FREE to keep MXLND specific
+ * memory usage stats that include pages */
+
+#define MXLND_ALLOC(x, size) \
+        do { \
+                spin_lock(&kmxlnd_data.kmx_global_lock); \
+                kmxlnd_data.kmx_mem_used += size; \
+                spin_unlock(&kmxlnd_data.kmx_global_lock); \
+                LIBCFS_ALLOC(x, size); \
+                if (x == NULL) { \
+                        spin_lock(&kmxlnd_data.kmx_global_lock); \
+                        kmxlnd_data.kmx_mem_used -= size; \
+                        spin_unlock(&kmxlnd_data.kmx_global_lock); \
+                } \
+        } while (0)
+
+#define MXLND_FREE(x, size) \
+        do { \
+                spin_lock(&kmxlnd_data.kmx_global_lock); \
+                kmxlnd_data.kmx_mem_used -= size; \
+                spin_unlock(&kmxlnd_data.kmx_global_lock); \
+                LIBCFS_FREE(x, size); \
+        } while (0)
+
+
+/* runtime-tunable parameters; fields are pointers, presumably into the
+ * module-parameter storage set up elsewhere — confirm in modparams code */
+typedef struct kmx_tunables {
+        int     *kmx_n_waitd;           /* # completion threads */
+        int     *kmx_max_peers;         /* max # of potential peers */
+        int     *kmx_cksum;             /* checksum small msgs? */
+        int     *kmx_ntx;               /* total # of tx (1/2 for LNET, 1/2 for CONN_REQ) */
+        int     *kmx_credits;           /* concurrent sends to 1 peer */
+        int     *kmx_board;             /* MX board (NIC) number */
+        int     *kmx_ep_id;             /* MX endpoint number */
+        int     *kmx_polling;           /* if 0, block. if > 0, poll this many
+                                           iterations before blocking */
+        char    **kmx_hosts;            /* Location of hosts file, if used */
+} kmx_tunables_t;
+
+/* structure to hold IP-to-hostname resolution data
+ * (one entry per peer; presumably populated from the kmx_hosts file —
+ * confirm against mxlnd_free_hosts()/host parsing code) */
+struct kmx_host {
+        struct kmx_peer    *mxh_peer;           /* pointer to matching peer */
+        u32                 mxh_addr;           /* IP address as int */
+        char               *mxh_hostname;       /* peer's hostname */
+        u32                 mxh_board;          /* peer's board rank */
+        u32                 mxh_ep_id;          /* peer's MX endpoint ID */
+        struct list_head    mxh_list;           /* position on kmx_hosts */
+        spinlock_t          mxh_lock;           /* lock */
+};
+
+/* global interface state (single instance: kmxlnd_data) */
+typedef struct kmx_data
+{
+        int                 kmx_init;           /* initialization state (MXLND_INIT_*) */
+        int                 kmx_shutdown;       /* shutting down? set once in shutdown,
+                                                   read by worker threads */
+        atomic_t            kmx_nthreads;       /* number of threads */
+        struct completion  *kmx_completions;    /* array of completion structs */
+        lnet_ni_t          *kmx_ni;             /* the LND instance */
+        u64                 kmx_incarnation;    /* my incarnation value - unused */
+        long                kmx_mem_used;       /* memory used */
+        struct kmx_host    *kmx_localhost;      /* pointer to my kmx_host info */
+        mx_endpoint_t       kmx_endpt;          /* the MX endpoint */
+
+        spinlock_t          kmx_global_lock;    /* global lock */
+
+        struct list_head    kmx_conn_req;       /* list of connection requests */
+        spinlock_t          kmx_conn_lock;      /* connection list lock */
+        struct semaphore    kmx_conn_sem;       /* semaphore for connection request list */
+
+        struct list_head    kmx_hosts;          /* host lookup info */
+        spinlock_t          kmx_hosts_lock;     /* hosts list lock */
+
+        struct list_head    kmx_peers[MXLND_HASH_SIZE];
+                                                /* list of all known peers */
+        rwlock_t            kmx_peers_lock;     /* peer list rw lock */
+        atomic_t            kmx_npeers;         /* number of peers */
+
+        struct list_head    kmx_txs;            /* all tx descriptors */
+        struct list_head    kmx_tx_idle;        /* list of idle tx */
+        spinlock_t          kmx_tx_idle_lock;   /* lock for idle tx list */
+        s32                 kmx_tx_used;        /* txs in use */
+        u64                 kmx_tx_next_cookie; /* unique id for tx */
+        struct list_head    kmx_tx_queue;       /* generic send queue */
+        spinlock_t          kmx_tx_queue_lock;  /* lock for generic sends */
+        struct semaphore    kmx_tx_queue_sem;   /* semaphore for tx queue */
+
+        struct list_head    kmx_rxs;            /* all rx descriptors */
+        spinlock_t          kmx_rxs_lock;       /* lock for rxs list */
+        struct list_head    kmx_rx_idle;        /* list of idle rx */
+        spinlock_t          kmx_rx_idle_lock;   /* lock for idle rx list */
+} kmx_data_t;
+
+/* kmx_init states; mxlnd_shutdown() switches on the current value and
+ * falls through to tear down completed stages in reverse order */
+#define MXLND_INIT_NOTHING      0       /* in the beginning, there was nothing... */
+#define MXLND_INIT_DATA         1       /* main data structures created */
+#define MXLND_INIT_TXS          2       /* tx descriptors created */
+#define MXLND_INIT_RXS          3       /* initial rx descriptors created */
+#define MXLND_INIT_MX           4       /* initiate MX library, open endpoint, get NIC id */
+#define MXLND_INIT_THREADS      5       /* waitd, timeoutd, tx_queued threads */
+#define MXLND_INIT_ALL          6       /* startup completed */
+
+#include "mxlnd_wire.h"
+
+/* direction of a kmx_ctx descriptor: send or receive */
+enum kmx_req_type {
+        MXLND_REQ_TX    = 0,
+        MXLND_REQ_RX    = 1,
+};
+
+/* The life cycle of a request */
+enum kmx_req_state {
+        MXLND_CTX_INIT       = 0,               /* just created */
+        MXLND_CTX_IDLE       = 1,               /* available for use */
+        MXLND_CTX_PREP       = 2,               /* getting ready for send/recv */
+        MXLND_CTX_PENDING    = 3,               /* mx_isend() or mx_irecv() called */
+        MXLND_CTX_COMPLETED  = 4,               /* cleaning up after completion or timeout */
+        MXLND_CTX_CANCELED   = 5,               /* timed out but still in ctx list */
+};
+
+/* Context Structure - generic tx/rx descriptor
+ * It represents the context (or state) of each send or receive request.
+ * In other LNDs, they have separate TX and RX descriptors and this replaces both.
+ *
+ * We will keep the these on the global kmx_rxs and kmx_txs lists for cleanup
+ * during shutdown(). We will move them between the rx/tx idle lists and the
+ * pending list which is monitored by mxlnd_timeoutd().
+ */
+struct kmx_ctx {
+        enum kmx_req_type   mxc_type;           /* TX or RX */
+        u64                 mxc_incarnation;    /* store the peer's incarnation here
+                                                   to verify before changing flow
+                                                   control credits after completion */
+        unsigned long       mxc_deadline;       /* request time out in absolute jiffies */
+        enum kmx_req_state  mxc_state;          /* what is the state of the request? */
+        struct list_head    mxc_global_list;    /* place on kmx_rxs or kmx_txs */
+        struct list_head    mxc_list;           /* place on rx/tx idle list, tx q, peer tx */
+        struct list_head    mxc_rx_list;        /* place on mxp_rx_posted list */
+        spinlock_t          mxc_lock;           /* lock */
+
+        lnet_nid_t          mxc_nid;            /* dst's NID if peer is not known */
+        struct kmx_peer    *mxc_peer;           /* owning peer */
+        struct kmx_conn    *mxc_conn;           /* owning conn */
+        struct kmx_msg     *mxc_msg;            /* msg hdr mapped to mxc_page */
+        struct page        *mxc_page;           /* buffer for eager msgs */
+        lnet_msg_t         *mxc_lntmsg[2];      /* lnet msgs to finalize */
+
+        u8                  mxc_msg_type;       /* what type of message is this? */
+        u64                 mxc_cookie;         /* completion cookie */
+        u64                 mxc_match;          /* MX match info (type bits + cookie,
+                                                   see MXLND_MASK_* defines above) */
+        mx_ksegment_t       mxc_seg;            /* local MX ksegment for non-DATA */
+        mx_ksegment_t      *mxc_seg_list;       /* MX ksegment array for DATA */
+        int                 mxc_nseg;           /* number of segments */
+        unsigned long       mxc_pin_type;       /* MX_PIN_KERNEL or MX_PIN_PHYSICAL */
+        u32                 mxc_nob;            /* number of bytes sent/received */
+        mx_request_t        mxc_mxreq;          /* MX request */
+        mx_status_t         mxc_status;         /* MX status */
+        s64                 mxc_get;            /* # of times returned from idle list */
+        s64                 mxc_put;            /* # of times returned from idle list */
+};
+
+/* connection status values for kmx_conn.mxk_status */
+#define MXLND_CONN_DISCONNECT  -2       /* conn is being destroyed - do not add txs */
+#define MXLND_CONN_FAIL        -1       /* connect failed (bad handshake, unavail, etc.) */
+#define MXLND_CONN_INIT         0       /* in the beginning, there was nothing... */
+#define MXLND_CONN_REQ          1       /* a connection request message is needed */
+#define MXLND_CONN_ACK          2       /* a connection ack is needed */
+#define MXLND_CONN_WAIT         3       /* waiting for req or ack to complete */
+#define MXLND_CONN_READY        4       /* ready to send */
+
+/* connection state - queues for queued and pending msgs */
+struct kmx_conn
+{
+        u64                 mxk_incarnation;    /* connections's incarnation value */
+        atomic_t            mxk_refcount;       /* reference counting */
+
+        struct kmx_peer    *mxk_peer;           /* owning peer */
+        mx_endpoint_addr_t  mxk_epa;            /* peer's endpoint address */
+
+        struct list_head    mxk_list;           /* for placing on mxp_conns */
+        spinlock_t          mxk_lock;           /* lock */
+        unsigned long       mxk_timeout;        /* expiration of oldest pending tx/rx */
+        unsigned long       mxk_last_tx;        /* when last tx completed with success */
+        unsigned long       mxk_last_rx;        /* when last rx completed */
+
+        int                 mxk_credits;        /* # of my credits for sending to peer */
+        int                 mxk_outstanding;    /* # of credits to return */
+
+        int                 mxk_status;         /* can we send messages? MXLND_CONN_* */
+        struct list_head    mxk_tx_credit_queue;   /* send queue for peer */
+        struct list_head    mxk_tx_free_queue;  /* send queue for peer */
+        int                 mxk_ntx_msgs;       /* # of msgs on tx queues */
+        int                 mxk_ntx_data;       /* # of DATA on tx queues */
+        int                 mxk_ntx_posted;     /* # of tx msgs in flight */
+        int                 mxk_data_posted;    /* # of tx data payloads in flight */
+
+        struct list_head    mxk_pending;        /* in flight rxs and txs */
+};
+
+/* peer state (hashed into kmx_peers[] via mxp_peers) */
+struct kmx_peer
+{
+        lnet_nid_t          mxp_nid;            /* peer's LNET NID */
+        u64                 mxp_incarnation;    /* peer's incarnation value */
+        atomic_t            mxp_refcount;       /* reference counts */
+
+        struct kmx_host    *mxp_host;           /* peer lookup info */
+        u64                 mxp_nic_id;         /* remote's MX nic_id for mx_connect() */
+
+        struct list_head    mxp_peers;          /* for placing on kmx_peers */
+        spinlock_t          mxp_lock;           /* lock */
+
+        struct list_head    mxp_conns;          /* list of connections */
+        struct kmx_conn    *mxp_conn;           /* current connection */
+
+        unsigned long       mxp_reconnect_time;  /* when to retry connect
+                                                    (absolute jiffies, presumably) */
+        int                 mxp_incompatible;   /* incorrect conn_req values */
+};
+
+extern kmx_data_t       kmxlnd_data;
+extern kmx_tunables_t   kmxlnd_tunables;
+
+/* required for the LNET API */
+int  mxlnd_startup(lnet_ni_t *ni);
+void mxlnd_shutdown(lnet_ni_t *ni);
+int  mxlnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int  mxlnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int  mxlnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+                unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, 
+                unsigned int offset, unsigned int mlen, unsigned int rlen);
+
+/* in mxlnd.c */
+extern void mxlnd_thread_stop(long id);
+extern int  mxlnd_ctx_alloc(struct kmx_ctx **ctxp, enum kmx_req_type type);
+extern void mxlnd_ctx_free(struct kmx_ctx *ctx);
+extern void mxlnd_ctx_init(struct kmx_ctx *ctx);
+extern lnet_nid_t mxlnd_nic_id2nid(lnet_ni_t *ni, u64 nic_id);
+extern u64 mxlnd_nid2nic_id(lnet_nid_t nid);
+
+/* in mxlnd_cb.c */
+void mxlnd_eager_recv(void *context, uint64_t match_value, uint32_t length);
+extern mx_unexp_handler_action_t mxlnd_unexpected_recv(void *context,
+                mx_endpoint_addr_t source, uint64_t match_value, uint32_t length, 
+                void *data_if_available);
+extern void mxlnd_peer_free(struct kmx_peer *peer);
+extern void mxlnd_conn_free(struct kmx_conn *conn);
+extern void mxlnd_sleep(unsigned long timeout);
+extern int  mxlnd_tx_queued(void *arg);
+extern void mxlnd_handle_rx_completion(struct kmx_ctx *rx);
+extern int  mxlnd_check_sends(struct kmx_peer *peer);
+extern int  mxlnd_tx_peer_queued(void *arg);
+extern int  mxlnd_request_waitd(void *arg);
+extern int  mxlnd_unex_recvd(void *arg);
+extern int  mxlnd_timeoutd(void *arg);
+extern int  mxlnd_connd(void *arg);
+
+/* take a reference on peer; caller must already hold one (LASSERT) */
+#define mxlnd_peer_addref(peer)                                 \
+do {                                                            \
+        LASSERT(atomic_read(&(peer)->mxp_refcount) > 0);        \
+        atomic_inc(&(peer)->mxp_refcount);                      \
+} while (0)
+
+
+/* drop a reference on peer; frees it when the last ref goes away */
+#define mxlnd_peer_decref(peer)                                 \
+do {                                                            \
+        LASSERT(atomic_read(&(peer)->mxp_refcount) > 0);        \
+        if (atomic_dec_and_test(&(peer)->mxp_refcount))         \
+                mxlnd_peer_free(peer);                          \
+} while (0)
+
+/* take a reference on conn; caller must already hold one (LASSERT) */
+#define mxlnd_conn_addref(conn)                                 \
+do {                                                            \
+        LASSERT(atomic_read(&(conn)->mxk_refcount) > 0);        \
+        atomic_inc(&(conn)->mxk_refcount);                      \
+} while (0)
+
+
+/* drop a reference on conn; frees it when the last ref goes away */
+#define mxlnd_conn_decref(conn)                                 \
+do {                                                            \
+        LASSERT(atomic_read(&(conn)->mxk_refcount) > 0);        \
+        if (atomic_dec_and_test(&(conn)->mxk_refcount))         \
+                mxlnd_conn_free(conn);                          \
+} while (0)
diff --git a/lnet/klnds/mxlnd/mxlnd_cb.c b/lnet/klnds/mxlnd/mxlnd_cb.c
new file mode 100644 (file)
index 0000000..09d0c0b
--- /dev/null
@@ -0,0 +1,3437 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ * Copyright (C) 2006 Myricom, Inc.
+ *   Author: Myricom, Inc. <help at myri.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "mxlnd.h"
+
/* Sink for compiled-out debug output: accepts printf-style arguments
 * and deliberately discards them. */
inline void mxlnd_noop(char *s, ...)
{
        (void) s;
}
+
+char *
+mxlnd_ctxstate_to_str(int mxc_state)
+{
+        switch (mxc_state) {
+        case MXLND_CTX_INIT:
+                return "MXLND_CTX_INIT";
+        case MXLND_CTX_IDLE:
+                return "MXLND_CTX_IDLE";
+        case MXLND_CTX_PREP:
+                return "MXLND_CTX_PREP";
+        case MXLND_CTX_PENDING:
+                return "MXLND_CTX_PENDING";
+        case MXLND_CTX_COMPLETED:
+                return "MXLND_CTX_COMPLETED";
+        case MXLND_CTX_CANCELED:
+                return "MXLND_CTX_CANCELED";
+        default:
+                return "*unknown*";
+        }
+}
+
+char *
+mxlnd_connstatus_to_str(int mxk_status)
+{
+        switch (mxk_status) {
+        case MXLND_CONN_READY:
+                return "MXLND_CONN_READY";
+        case MXLND_CONN_INIT:
+                return "MXLND_CONN_INIT";
+        case MXLND_CONN_REQ:
+                return "MXLND_CONN_REQ";
+        case MXLND_CONN_ACK:
+                return "MXLND_CONN_ACK";
+        case MXLND_CONN_WAIT:
+                return "MXLND_CONN_WAIT";
+        case MXLND_CONN_DISCONNECT:
+                return "MXLND_CONN_DISCONNECT";
+        case MXLND_CONN_FAIL:
+                return "MXLND_CONN_FAIL";
+        default:
+                return "unknown";
+        }
+}
+
+char *
+mxlnd_msgtype_to_str(int type) {
+        switch (type) {
+        case MXLND_MSG_EAGER:
+                return "MXLND_MSG_EAGER";
+        case MXLND_MSG_CONN_REQ:
+                return "MXLND_MSG_CONN_REQ";
+        case MXLND_MSG_CONN_ACK:
+                return "MXLND_MSG_CONN_ACK";
+        case MXLND_MSG_NOOP:
+                return "MXLND_MSG_NOOP";
+        case MXLND_MSG_PUT_REQ:
+                return "MXLND_MSG_PUT_REQ";
+        case MXLND_MSG_PUT_ACK:
+                return "MXLND_MSG_PUT_ACK";
+        case MXLND_MSG_PUT_DATA:
+                return "MXLND_MSG_PUT_DATA";
+        case MXLND_MSG_GET_REQ:
+                return "MXLND_MSG_GET_REQ";
+        case MXLND_MSG_GET_DATA:
+                return "MXLND_MSG_GET_DATA";
+        default:
+                return "unknown";
+        }
+}
+
+char *
+mxlnd_lnetmsg_to_str(int type)
+{
+        switch (type) {
+        case LNET_MSG_ACK:
+                return "LNET_MSG_ACK";
+        case LNET_MSG_PUT:
+                return "LNET_MSG_PUT";
+        case LNET_MSG_GET:
+                return "LNET_MSG_GET";
+        case LNET_MSG_REPLY:
+                return "LNET_MSG_REPLY";
+        case LNET_MSG_HELLO:
+                return "LNET_MSG_HELLO";
+        default:
+                return "*unknown*";
+        }
+}
+
+static inline u64
+//mxlnd_create_match(u8 msg_type, u8 error, u64 cookie)
+mxlnd_create_match(struct kmx_ctx *ctx, u8 error)
+{
+        u64 type        = (u64) ctx->mxc_msg_type;
+        u64 err         = (u64) error;
+        u64 match       = 0LL;
+
+        LASSERT(ctx->mxc_msg_type != 0);
+        LASSERT(ctx->mxc_cookie >> 52 == 0);
+        match = (type << 60) | (err << 52) | ctx->mxc_cookie;
+        return match;
+}
+
/**
 * mxlnd_parse_match - split a 64-bit MX match value into its fields
 * @match - match bits from a completed MX request
 * @msg_type - set to bits 60-63 (message type)
 * @error - set to bits 52-59 (error code)
 * @cookie - set to bits 0-51 (completion cookie)
 *
 * Inverse of mxlnd_create_match().
 */
static inline void
mxlnd_parse_match(u64 match, u8 *msg_type, u8 *error, u64 *cookie)
{
        *msg_type = (u8) (match >> 60);
        *error    = (u8) ((match >> 52) & 0xFF);
        *cookie   = match & 0xFFFFFFFFFFFFFLL;
        /* NOTE(review): the first two comparisons AND the ICON masks with the
         * top type nibble before comparing to the whole match — presumably to
         * accept connect request/ack completions; confirm against the
         * MXLND_MASK_ICON_* definitions. */
        LASSERT(match == (MXLND_MASK_ICON_REQ & 0xF000000000000000LL) ||
                match == (MXLND_MASK_ICON_ACK & 0xF000000000000000LL) ||
                *msg_type == MXLND_MSG_EAGER    ||
                *msg_type == MXLND_MSG_CONN_REQ ||
                *msg_type == MXLND_MSG_CONN_ACK ||
                *msg_type == MXLND_MSG_NOOP     ||
                *msg_type == MXLND_MSG_PUT_REQ  ||
                *msg_type == MXLND_MSG_PUT_ACK  ||
                *msg_type == MXLND_MSG_PUT_DATA ||
                *msg_type == MXLND_MSG_GET_REQ  ||
                *msg_type == MXLND_MSG_GET_DATA);
        return;
}
+
/**
 * mxlnd_get_idle_rx - take an rx context off the idle list
 *
 * Returns the rx in MXLND_CTX_PREP state with its get counter bumped,
 * or NULL if no idle rx is available.  Takes kmx_rx_idle_lock.
 */
struct kmx_ctx *
mxlnd_get_idle_rx(void)
{
        struct list_head        *tmp    = NULL;
        struct kmx_ctx          *rx     = NULL;

        spin_lock(&kmxlnd_data.kmx_rx_idle_lock);

        if (list_empty (&kmxlnd_data.kmx_rx_idle)) {
                spin_unlock(&kmxlnd_data.kmx_rx_idle_lock);
                return NULL;
        }

        /* pop the first idle rx */
        tmp = &kmxlnd_data.kmx_rx_idle;
        rx = list_entry (tmp->next, struct kmx_ctx, mxc_list);
        list_del_init(&rx->mxc_list);
        spin_unlock(&kmxlnd_data.kmx_rx_idle_lock);

#if MXLND_DEBUG
        /* an idle rx should have balanced get/put counters; dump its state
         * if not, to help track down leaked/double-used contexts */
        if (rx->mxc_get != rx->mxc_put) {
                CDEBUG(D_NETERROR, "*** RX get (%lld) != put (%lld) ***\n", rx->mxc_get, rx->mxc_put);
                CDEBUG(D_NETERROR, "*** incarnation= %lld ***\n", rx->mxc_incarnation);
                CDEBUG(D_NETERROR, "*** deadline= %ld ***\n", rx->mxc_deadline);
                CDEBUG(D_NETERROR, "*** state= %s ***\n", mxlnd_ctxstate_to_str(rx->mxc_state));
                CDEBUG(D_NETERROR, "*** listed?= %d ***\n", !list_empty(&rx->mxc_list));
                CDEBUG(D_NETERROR, "*** nid= 0x%llx ***\n", rx->mxc_nid);
                CDEBUG(D_NETERROR, "*** peer= 0x%p ***\n", rx->mxc_peer);
                CDEBUG(D_NETERROR, "*** msg_type= %s ***\n", mxlnd_msgtype_to_str(rx->mxc_msg_type));
                CDEBUG(D_NETERROR, "*** cookie= 0x%llx ***\n", rx->mxc_cookie);
                CDEBUG(D_NETERROR, "*** nob= %d ***\n", rx->mxc_nob);
        }
#endif
        LASSERT (rx->mxc_get == rx->mxc_put);

        rx->mxc_get++;

        LASSERT (rx->mxc_state == MXLND_CTX_IDLE);
        rx->mxc_state = MXLND_CTX_PREP;

        return rx;
}
+
/**
 * mxlnd_put_idle_rx - reset an rx context and return it to the idle list
 * @rx - an rx context previously taken via mxlnd_get_idle_rx()
 *
 * Returns 0 on success, -EINVAL if @rx is NULL or is actually a tx.
 */
int
mxlnd_put_idle_rx(struct kmx_ctx *rx)
{
        if (rx == NULL) {
                CDEBUG(D_NETERROR, "called with NULL pointer\n");
                return -EINVAL;
        } else if (rx->mxc_type != MXLND_REQ_RX) {
                CDEBUG(D_NETERROR, "called with tx\n");
                return -EINVAL;
        }
        /* get must be exactly one ahead of put for an in-use rx */
        LASSERT(rx->mxc_get == rx->mxc_put + 1);
        mxlnd_ctx_init(rx);
        rx->mxc_put++;
        spin_lock(&kmxlnd_data.kmx_rx_idle_lock);
        list_add_tail(&rx->mxc_list, &kmxlnd_data.kmx_rx_idle);
        spin_unlock(&kmxlnd_data.kmx_rx_idle_lock);
        return 0;
}
+
/**
 * mxlnd_reduce_idle_rxs - free up to @count rx contexts from the idle pool
 * @count - number of rxs to remove and free
 *
 * Stops early (with a warning) if the idle pool runs dry.  Always returns 0.
 * Note: holds kmx_rxs_lock while mxlnd_get_idle_rx() takes
 * kmx_rx_idle_lock, so this nests the two locks.
 */
int
mxlnd_reduce_idle_rxs(__u32 count)
{
        __u32                   i       = 0;
        struct kmx_ctx          *rx     = NULL;

        spin_lock(&kmxlnd_data.kmx_rxs_lock);
        for (i = 0; i < count; i++) {
                rx = mxlnd_get_idle_rx();
                if (rx != NULL) {
                        /* unlink from the global rx list before freeing */
                        struct list_head *tmp = &rx->mxc_global_list;
                        list_del_init(tmp);
                        mxlnd_ctx_free(rx);
                } else {
                        CDEBUG(D_NETERROR, "only reduced %d out of %d rxs\n", i, count);
                        break;
                }
        }
        spin_unlock(&kmxlnd_data.kmx_rxs_lock);
        return 0;
}
+
/**
 * mxlnd_get_idle_tx - take a tx context off the idle list
 *
 * Assigns the tx a fresh completion cookie and returns it in
 * MXLND_CTX_PREP state, or NULL if no idle tx is available.
 * Takes kmx_tx_idle_lock.
 */
struct kmx_ctx *
mxlnd_get_idle_tx(void)
{
        struct list_head        *tmp    = NULL;
        struct kmx_ctx          *tx     = NULL;

        spin_lock(&kmxlnd_data.kmx_tx_idle_lock);

        if (list_empty (&kmxlnd_data.kmx_tx_idle)) {
                CDEBUG(D_NETERROR, "%d txs in use\n", kmxlnd_data.kmx_tx_used);
                spin_unlock(&kmxlnd_data.kmx_tx_idle_lock);
                return NULL;
        }

        /* pop the first idle tx */
        tmp = &kmxlnd_data.kmx_tx_idle;
        tx = list_entry (tmp->next, struct kmx_ctx, mxc_list);
        list_del_init(&tx->mxc_list);

        /* Allocate a new completion cookie.  It might not be needed,
         * but we've got a lock right now and we're unlikely to
         * wrap... */
        tx->mxc_cookie = kmxlnd_data.kmx_tx_next_cookie++;
        if (kmxlnd_data.kmx_tx_next_cookie > MXLND_MAX_COOKIE) {
                tx->mxc_cookie = 1;
        }
        kmxlnd_data.kmx_tx_used++;
        spin_unlock(&kmxlnd_data.kmx_tx_idle_lock);

        /* an idle tx must have balanced get/put and no attached lnet msgs */
        LASSERT (tx->mxc_get == tx->mxc_put);

        tx->mxc_get++;

        LASSERT (tx->mxc_state == MXLND_CTX_IDLE);
        LASSERT (tx->mxc_lntmsg[0] == NULL);
        LASSERT (tx->mxc_lntmsg[1] == NULL);

        tx->mxc_state = MXLND_CTX_PREP;

        return tx;
}
+
+int
+mxlnd_put_idle_tx(struct kmx_ctx *tx)
+{
+        int             failed  = (tx->mxc_status.code != MX_STATUS_SUCCESS && tx->mxc_status.code != MX_STATUS_TRUNCATED);
+        int             result  = failed ? -EIO : 0;
+        lnet_msg_t      *lntmsg[2];
+
+        if (tx == NULL) {
+                CDEBUG(D_NETERROR, "called with NULL pointer\n");
+                return -EINVAL;
+        } else if (tx->mxc_type != MXLND_REQ_TX) {
+                CDEBUG(D_NETERROR, "called with rx\n");
+                return -EINVAL;
+        }
+
+        lntmsg[0] = tx->mxc_lntmsg[0];
+        lntmsg[1] = tx->mxc_lntmsg[1];
+
+        LASSERT(tx->mxc_get == tx->mxc_put + 1);
+        mxlnd_ctx_init(tx);
+        tx->mxc_put++;
+        spin_lock(&kmxlnd_data.kmx_tx_idle_lock);
+        list_add_tail(&tx->mxc_list, &kmxlnd_data.kmx_tx_idle);
+        kmxlnd_data.kmx_tx_used--;
+        spin_unlock(&kmxlnd_data.kmx_tx_idle_lock);
+        if (lntmsg[0] != NULL) lnet_finalize(kmxlnd_data.kmx_ni, lntmsg[0], result);
+        if (lntmsg[1] != NULL) lnet_finalize(kmxlnd_data.kmx_ni, lntmsg[1], result);
+        return 0;
+}
+
/**
 * mxlnd_conn_free - free the conn
 * @conn - a kmx_conn pointer
 *
 * The calling function should remove the conn from the conns list first
 * then destroy it.  Called when the conn's refcount reaches zero
 * (see mxlnd_conn_decref()); all queues must already be empty.
 */
void
mxlnd_conn_free(struct kmx_conn *conn)
{
        struct kmx_peer *peer   = conn->mxk_peer;

        CDEBUG(D_NET, "freeing conn 0x%p *****\n", conn);
        LASSERT (list_empty (&conn->mxk_tx_credit_queue) &&
                 list_empty (&conn->mxk_tx_free_queue) &&
                 list_empty (&conn->mxk_pending));
        if (!list_empty(&conn->mxk_list)) {
                /* still linked on the peer: unlink and clear the peer's
                 * current-conn pointer if it is us */
                spin_lock(&peer->mxp_lock);
                list_del_init(&conn->mxk_list);
                if (peer->mxp_conn == conn) {
                        peer->mxp_conn = NULL;
                        /* clear our back-pointer from the MX endpoint addr,
                         * but only if the addr was ever set */
                        if (!(conn->mxk_epa.stuff[0] == 0 && conn->mxk_epa.stuff[1] == 0)) {
                                mx_set_endpoint_addr_context(conn->mxk_epa,
                                                             (void *) NULL);
                        }
                }
                spin_unlock(&peer->mxp_lock);
        }
        mxlnd_peer_decref(conn->mxk_peer); /* drop conn's ref to peer */
        MXLND_FREE (conn, sizeof (*conn));
        return;
}
+
+
/**
 * mxlnd_conn_cancel_pending_rxs - cancel all pending rxs on a conn
 * @conn - the conn being torn down
 *
 * Repeatedly scans conn->mxk_pending, mx_cancel()ing one rx per pass.
 * The outer loop restarts after each cancellation because the conn lock
 * must be dropped around mxlnd_handle_rx_completion(), which may call
 * lnet_finalize() and mxlnd_conn_decref().  Txs found on the pending
 * list are simply unlinked.
 */
void
mxlnd_conn_cancel_pending_rxs(struct kmx_conn *conn)
{
        int                     found   = 0;
        struct kmx_ctx          *ctx    = NULL;
        struct kmx_ctx          *next   = NULL;
        mx_return_t             mxret   = MX_SUCCESS;
        u32                     result  = 0;

        do {
                found = 0;
                spin_lock(&conn->mxk_lock);
                list_for_each_entry_safe(ctx, next, &conn->mxk_pending, mxc_list) {
                        /* we will delete all including txs */
                        list_del_init(&ctx->mxc_list);
                        if (ctx->mxc_type == MXLND_REQ_RX) {
                                found = 1;
                                mxret = mx_cancel(kmxlnd_data.kmx_endpt,
                                                  &ctx->mxc_mxreq,
                                                  &result);
                                if (mxret != MX_SUCCESS) {
                                        CDEBUG(D_NETERROR, "mx_cancel() returned %s (%d)\n", mx_strerror(mxret), mxret);
                                }
                                /* result == 1 means the request really was
                                 * cancelled (not already complete) */
                                if (result == 1) {
                                        ctx->mxc_status.code = -ECONNABORTED;
                                        ctx->mxc_state = MXLND_CTX_CANCELED;
                                        /* NOTE this calls lnet_finalize() and
                                         * we cannot hold any locks when calling it.
                                         * It also calls mxlnd_conn_decref(conn) */
                                        spin_unlock(&conn->mxk_lock);
                                        mxlnd_handle_rx_completion(ctx);
                                        spin_lock(&conn->mxk_lock);
                                }
                                /* restart the scan: the list may have changed
                                 * while the lock was dropped */
                                break;
                        }
                }
                spin_unlock(&conn->mxk_lock);
        }
        while (found);

        return;
}
+
/**
 * mxlnd_conn_disconnect - shutdown a connection
 * @conn - a kmx_conn pointer
 * @mx_dis - if non-zero, also call mx_disconnect() on the endpoint addr
 * @notify - if non-zero, tell LNET the peer is dead via lnet_notify()
 *
 * This function sets the status to DISCONNECT, completes queued
 * txs with failure, calls mx_disconnect, which will complete
 * pending txs and matched rxs with failure.  Drops the owning peer's
 * conn reference on the way out; idempotent once status is DISCONNECT.
 */
void
mxlnd_conn_disconnect(struct kmx_conn *conn, int mx_dis, int notify)
{
        struct list_head        *tmp    = NULL;

        spin_lock(&conn->mxk_lock);
        if (conn->mxk_status == MXLND_CONN_DISCONNECT) {
                /* already being torn down by someone else */
                spin_unlock(&conn->mxk_lock);
                return;
        }
        conn->mxk_status = MXLND_CONN_DISCONNECT;
        conn->mxk_timeout = 0;

        /* fail every queued tx; the lock is dropped around
         * mxlnd_put_idle_tx() because it may call lnet_finalize() */
        while (!list_empty(&conn->mxk_tx_free_queue) ||
               !list_empty(&conn->mxk_tx_credit_queue)) {

                struct kmx_ctx          *tx     = NULL;

                if (!list_empty(&conn->mxk_tx_free_queue)) {
                        tmp = &conn->mxk_tx_free_queue;
                } else {
                        tmp = &conn->mxk_tx_credit_queue;
                }

                tx = list_entry(tmp->next, struct kmx_ctx, mxc_list);
                list_del_init(&tx->mxc_list);
                tx->mxc_status.code = -ECONNABORTED;
                spin_unlock(&conn->mxk_lock);
                mxlnd_put_idle_tx(tx);
                mxlnd_conn_decref(conn); /* for this tx */
                spin_lock(&conn->mxk_lock);
        }

        spin_unlock(&conn->mxk_lock);

        /* cancel pending rxs */
        mxlnd_conn_cancel_pending_rxs(conn);

        if (kmxlnd_data.kmx_shutdown != 1) {

                if (mx_dis) mx_disconnect(kmxlnd_data.kmx_endpt, conn->mxk_epa);

                if (notify) {
                        time_t          last_alive      = 0;
                        unsigned long   last_msg        = 0;

                        /* notify LNET that we are giving up on this peer */
                        if (time_after(conn->mxk_last_rx, conn->mxk_last_tx)) {
                                last_msg = conn->mxk_last_rx;
                        } else {
                                last_msg = conn->mxk_last_tx;
                        }
                        /* convert "jiffies since last traffic" to wall time */
                        last_alive = cfs_time_current_sec() -
                                     cfs_duration_sec(cfs_time_current() - last_msg);
                        lnet_notify(kmxlnd_data.kmx_ni, conn->mxk_peer->mxp_nid, 0, last_alive);
                }
        }
        mxlnd_conn_decref(conn); /* drop the owning peer's reference */

        return;
}
+
/**
 * mxlnd_conn_alloc - allocate and initialize a new conn struct
 * @connp - address of a kmx_conn pointer
 * @peer - owning kmx_peer
 *
 * The new conn starts with one reference (the peer's), takes a ref on
 * @peer, and becomes the peer's current conn (peer->mxp_conn).
 *
 * Returns 0 on success and -ENOMEM on failure
 */
int
mxlnd_conn_alloc(struct kmx_conn **connp, struct kmx_peer *peer)
{
        struct kmx_conn *conn    = NULL;

        LASSERT(peer != NULL);

        MXLND_ALLOC(conn, sizeof (*conn));
        if (conn == NULL) {
                CDEBUG(D_NETERROR, "Cannot allocate conn\n");
                return -ENOMEM;
        }
        CDEBUG(D_NET, "allocated conn 0x%p for peer 0x%p\n", conn, peer);

        memset(conn, 0, sizeof(*conn));

        /* conn->mxk_incarnation = 0 - will be set by peer */
        atomic_set(&conn->mxk_refcount, 1);     /* ref for owning peer */
        conn->mxk_peer = peer;
        /* mxk_epa - to be set after mx_iconnect() */
        INIT_LIST_HEAD(&conn->mxk_list);
        spin_lock_init(&conn->mxk_lock);
        /* conn->mxk_timeout = 0 */
        conn->mxk_last_tx = jiffies;
        conn->mxk_last_rx = conn->mxk_last_tx;
        conn->mxk_credits = *kmxlnd_tunables.kmx_credits;
        /* mxk_outstanding = 0 */
        conn->mxk_status = MXLND_CONN_INIT;
        INIT_LIST_HEAD(&conn->mxk_tx_credit_queue);
        INIT_LIST_HEAD(&conn->mxk_tx_free_queue);
        /* conn->mxk_ntx_msgs = 0 */
        /* conn->mxk_ntx_data = 0 */
        /* conn->mxk_ntx_posted = 0 */
        /* conn->mxk_data_posted = 0 */
        INIT_LIST_HEAD(&conn->mxk_pending);

        *connp = conn;

        mxlnd_peer_addref(peer);        /* add a ref for this conn */

        /* add to front of peer's conns list */
        spin_lock(&peer->mxp_lock);
        list_add(&conn->mxk_list, &peer->mxp_conns);
        peer->mxp_conn = conn;
        spin_unlock(&peer->mxp_lock);
        return 0;
}
+
+
/**
 * mxlnd_q_pending_ctx - add a ctx to its conn's pending list
 * @ctx - the tx or rx being posted
 *
 * Marks the ctx PENDING and, if the conn is usable, queues it and
 * advances the conn timeout to the earliest pending deadline.
 * Returns 0 on success, -1 if the conn is shutting down (ctx is then
 * marked COMPLETED and not queued).
 */
int
mxlnd_q_pending_ctx(struct kmx_ctx *ctx)
{
        int             ret     = 0;
        struct kmx_conn *conn   = ctx->mxc_conn;

        ctx->mxc_state = MXLND_CTX_PENDING;
        if (conn != NULL) {
                spin_lock(&conn->mxk_lock);
                if (conn->mxk_status >= MXLND_CONN_INIT) {
                        list_add_tail(&ctx->mxc_list, &conn->mxk_pending);
                        /* conn timeout tracks the soonest pending deadline */
                        if (conn->mxk_timeout == 0 || ctx->mxc_deadline < conn->mxk_timeout) {
                                conn->mxk_timeout = ctx->mxc_deadline;
                        }
                } else {
                        /* conn is disconnecting/failed: refuse to queue */
                        ctx->mxc_state = MXLND_CTX_COMPLETED;
                        ret = -1;
                }
                spin_unlock(&conn->mxk_lock);
        }
        return ret;
}
+
/**
 * mxlnd_deq_pending_ctx - remove a completed ctx from its conn's pending list
 * @ctx - the tx or rx that completed
 *
 * Marks the ctx COMPLETED, unlinks it, and recomputes the conn timeout
 * from the new head of the pending list (0 if the list is now empty,
 * which relies on the list being kept in deadline order).  Always
 * returns 0.
 */
int
mxlnd_deq_pending_ctx(struct kmx_ctx *ctx)
{
        LASSERT(ctx->mxc_state == MXLND_CTX_PENDING ||
                ctx->mxc_state == MXLND_CTX_COMPLETED);
        if (ctx->mxc_state != MXLND_CTX_PENDING &&
            ctx->mxc_state != MXLND_CTX_COMPLETED) {
                CDEBUG(D_NETERROR, "deq ctx->mxc_state = %s\n", 
                       mxlnd_ctxstate_to_str(ctx->mxc_state));
        }
        ctx->mxc_state = MXLND_CTX_COMPLETED;
        if (!list_empty(&ctx->mxc_list)) {
                struct kmx_conn *conn = ctx->mxc_conn;
                struct kmx_ctx *next = NULL;
                LASSERT(conn != NULL);
                spin_lock(&conn->mxk_lock);
                list_del_init(&ctx->mxc_list);
                conn->mxk_timeout = 0;
                if (!list_empty(&conn->mxk_pending)) {
                        next = list_entry(conn->mxk_pending.next, struct kmx_ctx, mxc_list);
                        conn->mxk_timeout = next->mxc_deadline;
                }
                /* same lock taken above as conn->mxk_lock */
                spin_unlock(&ctx->mxc_conn->mxk_lock);
        }
        return 0;
}
+
/**
 * mxlnd_peer_free - free the peer
 * @peer - a kmx_peer pointer
 *
 * The calling function should decrement the rxs, drain the tx queues and
 * remove the peer from the peers list first then destroy it.
 * Called only when the refcount reaches zero (see mxlnd_peer_decref()).
 */
void
mxlnd_peer_free(struct kmx_peer *peer)
{
        CDEBUG(D_NET, "freeing peer 0x%p\n", peer);

        LASSERT (atomic_read(&peer->mxp_refcount) == 0);

        if (peer->mxp_host != NULL) {
                /* break the host's back-pointer to us */
                spin_lock(&peer->mxp_host->mxh_lock);
                peer->mxp_host->mxh_peer = NULL;
                spin_unlock(&peer->mxp_host->mxh_lock);
        }
        if (!list_empty(&peer->mxp_peers)) {
                /* assume we are locked */
                list_del_init(&peer->mxp_peers);
        }

        MXLND_FREE (peer, sizeof (*peer));
        atomic_dec(&kmxlnd_data.kmx_npeers);
        return;
}
+
+void
+mxlnd_peer_hostname_to_nic_id(struct kmx_peer *peer)
+{
+        u64             nic_id  = 0LL;
+        char            name[MX_MAX_HOSTNAME_LEN + 1];
+        mx_return_t     mxret   = MX_SUCCESS;
+
+        memset(name, 0, sizeof(name));
+        snprintf(name, sizeof(name), "%s:%d", peer->mxp_host->mxh_hostname, peer->mxp_host->mxh_board);
+        mxret = mx_hostname_to_nic_id(name, &nic_id);
+        if (mxret == MX_SUCCESS) {
+                peer->mxp_nic_id = nic_id;
+        } else {
+                CDEBUG(D_NETERROR, "mx_hostname_to_nic_id() failed for %s "
+                                   "with %s\n", mx_strerror(mxret), name);
+                mxret = mx_hostname_to_nic_id(peer->mxp_host->mxh_hostname, &nic_id);
+                if (mxret == MX_SUCCESS) {
+                        peer->mxp_nic_id = nic_id;
+                } else {
+                        CDEBUG(D_NETERROR, "mx_hostname_to_nic_id() failed for %s "
+                                           "with %s\n", mx_strerror(mxret), 
+                                           peer->mxp_host->mxh_hostname);
+                }
+        }
+        return;
+}
+
/**
 * mxlnd_peer_alloc - allocate and initialize a new peer struct
 * @peerp - address of a kmx_peer pointer
 * @nid - LNET node id
 *
 * Looks up the peer's host entry by NID address, resolves its NIC id,
 * creates the peer's initial conn and pre-allocates credits-1 idle rxs.
 * The peer starts with one reference (for the kmx_peers list).
 *
 * Returns 0 on success and -ENOMEM on failure
 */
int
mxlnd_peer_alloc(struct kmx_peer **peerp, lnet_nid_t nid)
{
        int                     i       = 0;
        int                     ret     = 0;
        u32                     addr    = LNET_NIDADDR(nid);
        struct kmx_peer        *peer    = NULL;
        struct kmx_host        *host    = NULL;

        LASSERT (nid != LNET_NID_ANY && nid != 0LL);

        MXLND_ALLOC(peer, sizeof (*peer));
        if (peer == NULL) {
                CDEBUG(D_NETERROR, "Cannot allocate peer for NID 0x%llx\n", nid);
                return -ENOMEM;
        }
        CDEBUG(D_NET, "allocated peer 0x%p for NID 0x%llx\n", peer, nid);

        memset(peer, 0, sizeof(*peer));

        /* find the host entry matching the NID's address and link it
         * bidirectionally with this peer */
        list_for_each_entry(host, &kmxlnd_data.kmx_hosts, mxh_list) {
                if (addr == host->mxh_addr) {
                        peer->mxp_host = host;
                        spin_lock(&host->mxh_lock);
                        host->mxh_peer = peer;
                        spin_unlock(&host->mxh_lock);
                        break;
                }
        }
        LASSERT(peer->mxp_host != NULL);

        peer->mxp_nid = nid;
        /* peer->mxp_incarnation */
        atomic_set(&peer->mxp_refcount, 1);     /* ref for kmx_peers list */
        mxlnd_peer_hostname_to_nic_id(peer);

        INIT_LIST_HEAD(&peer->mxp_peers);
        spin_lock_init(&peer->mxp_lock);
        INIT_LIST_HEAD(&peer->mxp_conns);
        ret = mxlnd_conn_alloc(&peer->mxp_conn, peer);
        if (ret != 0) {
                mxlnd_peer_decref(peer);
                return ret;
        }

        /* grow the idle rx pool by credits-1 for this peer; on failure,
         * roll back whatever was added so far */
        for (i = 0; i < *kmxlnd_tunables.kmx_credits - 1; i++) {
                struct kmx_ctx   *rx     = NULL;
                ret = mxlnd_ctx_alloc(&rx, MXLND_REQ_RX);
                if (ret != 0) {
                        mxlnd_reduce_idle_rxs(i);
                        mxlnd_peer_decref(peer);
                        return ret;
                }
                spin_lock(&kmxlnd_data.kmx_rxs_lock);
                list_add_tail(&rx->mxc_global_list, &kmxlnd_data.kmx_rxs);
                spin_unlock(&kmxlnd_data.kmx_rxs_lock);
                /* make get == put + 1 so mxlnd_put_idle_rx() accepts it */
                rx->mxc_put = -1;
                mxlnd_put_idle_rx(rx);
        }
        /* peer->mxp_reconnect_time = 0 */
        /* peer->mxp_incompatible = 0 */

        *peerp = peer;
        return 0;
}
+
+/**
+ * mxlnd_nid_to_hash - hash the nid
+ * @nid - msg pointer
+ *
+ * Takes the u64 nid and XORs the lowest N bits by the next lowest N bits.
+ */
+static inline int
+mxlnd_nid_to_hash(lnet_nid_t nid)
+{
+        return (nid & MXLND_HASH_MASK) ^
+               ((nid & (MXLND_HASH_MASK << MXLND_HASH_BITS)) >> MXLND_HASH_BITS);
+}
+
+static inline struct kmx_peer *
+mxlnd_find_peer_by_nid(lnet_nid_t nid)
+{
+        int                     found   = 0;
+        int                     hash    = 0;
+        struct kmx_peer         *peer   = NULL;
+
+        hash = mxlnd_nid_to_hash(nid);
+
+        read_lock(&kmxlnd_data.kmx_peers_lock);
+        list_for_each_entry(peer, &kmxlnd_data.kmx_peers[hash], mxp_peers) {
+                if (peer->mxp_nid == nid) {
+                        found = 1;
+                        break;
+                }
+        }
+        read_unlock(&kmxlnd_data.kmx_peers_lock);
+        return (found ? peer : NULL);
+}
+
+static inline int
+mxlnd_tx_requires_credit(struct kmx_ctx *tx)
+{
+        return (tx->mxc_msg_type == MXLND_MSG_EAGER ||
+                tx->mxc_msg_type == MXLND_MSG_GET_REQ ||
+                tx->mxc_msg_type == MXLND_MSG_PUT_REQ ||
+                tx->mxc_msg_type == MXLND_MSG_NOOP);
+}
+
+/**
+ * mxlnd_init_msg - set type and number of bytes
+ * @msg - msg pointer
+ * @type - of message
+ * @body_nob - bytes in msg body
+ */
+static inline void
+mxlnd_init_msg(kmx_msg_t *msg, u8 type, int body_nob)
+{
+        msg->mxm_type = type;
+        msg->mxm_nob  = offsetof(kmx_msg_t, mxm_u) + body_nob;
+}
+
+static inline void
+mxlnd_init_tx_msg (struct kmx_ctx *tx, u8 type, int body_nob, lnet_nid_t nid)
+{
+        int             nob     = offsetof (kmx_msg_t, mxm_u) + body_nob;
+        struct kmx_msg  *msg    = NULL;
+        
+        LASSERT (tx != NULL);
+        LASSERT (nob <= MXLND_EAGER_SIZE);
+
+        tx->mxc_nid = nid;
+        /* tx->mxc_peer should have already been set if we know it */
+        tx->mxc_msg_type = type;
+        tx->mxc_nseg = 1;
+        /* tx->mxc_seg.segment_ptr is already pointing to mxc_page */
+        tx->mxc_seg.segment_length = nob;
+        tx->mxc_pin_type = MX_PIN_PHYSICAL;
+        //tx->mxc_state = MXLND_CTX_PENDING;
+
+        msg = tx->mxc_msg;
+        msg->mxm_type = type;
+        msg->mxm_nob  = nob;
+
+        return;
+}
+
+static inline __u32 
+mxlnd_cksum (void *ptr, int nob)
+{
+        char  *c  = ptr;
+        __u32  sum = 0;
+
+        while (nob-- > 0)
+                sum = ((sum << 1) | (sum >> 31)) + *c++;
+
+        /* ensure I don't return 0 (== no checksum) */
+        return (sum == 0) ? 1 : sum;
+}
+
/**
 * mxlnd_pack_msg - complete msg info
 * @tx - msg to send
 *
 * Fills in the wire header (magic, version, credits, nids, stamps, seq)
 * and optionally the checksum.  mxm_type and mxm_nob must already have
 * been set by mxlnd_init_msg()/mxlnd_init_tx_msg().
 */
static inline void
mxlnd_pack_msg(struct kmx_ctx *tx)
{
        struct kmx_msg  *msg    = tx->mxc_msg;

        /* type and nob should already be set in init_msg() */
        msg->mxm_magic    = MXLND_MSG_MAGIC;
        msg->mxm_version  = MXLND_MSG_VERSION;
        /*   mxm_type */
        /* don't use mxlnd_tx_requires_credit() since we want PUT_ACK to
         * return credits as well */
        if (tx->mxc_msg_type != MXLND_MSG_CONN_REQ &&
            tx->mxc_msg_type != MXLND_MSG_CONN_ACK) {
                /* piggy-back returned credits; zero the counter under the
                 * conn lock so they are returned exactly once */
                spin_lock(&tx->mxc_conn->mxk_lock);
                msg->mxm_credits  = tx->mxc_conn->mxk_outstanding;
                tx->mxc_conn->mxk_outstanding = 0;
                spin_unlock(&tx->mxc_conn->mxk_lock);
        } else {
                msg->mxm_credits  = 0;
        }
        /*   mxm_nob */
        msg->mxm_cksum    = 0;
        msg->mxm_srcnid   = lnet_ptlcompat_srcnid(kmxlnd_data.kmx_ni->ni_nid, tx->mxc_nid);
        msg->mxm_srcstamp = kmxlnd_data.kmx_incarnation;
        msg->mxm_dstnid   = tx->mxc_nid;
        /* if it is a new peer, the dststamp will be 0 */
        msg->mxm_dststamp = tx->mxc_conn->mxk_incarnation;
        msg->mxm_seq      = tx->mxc_cookie;

        /* checksum is computed with mxm_cksum == 0 (zeroed above) */
        if (*kmxlnd_tunables.kmx_cksum) {
                msg->mxm_cksum = mxlnd_cksum(msg, msg->mxm_nob);
        }
}
+
/**
 * mxlnd_unpack_msg - validate and byte-swap a received message
 * @msg - the message as received off the wire
 * @nob - number of bytes actually received
 *
 * Checks magic, version, length and checksum, then converts multi-byte
 * fields to host order if the sender's endianness differs (detected by
 * the byte-swapped magic).  Returns 0 on success or -EPROTO for any
 * malformed or corrupt message.
 */
int
mxlnd_unpack_msg(kmx_msg_t *msg, int nob)
{
        const int hdr_size      = offsetof(kmx_msg_t, mxm_u);
        __u32     msg_cksum     = 0;
        int       flip          = 0;
        int       msg_nob       = 0;

        /* 6 bytes are enough to have received magic + version */
        if (nob < 6) {
                CDEBUG(D_NETERROR, "not enough bytes for magic + hdr: %d\n", nob);
                return -EPROTO;
        }

        if (msg->mxm_magic == MXLND_MSG_MAGIC) {
                flip = 0;
        } else if (msg->mxm_magic == __swab32(MXLND_MSG_MAGIC)) {
                /* sender has opposite endianness */
                flip = 1;
        } else {
                CDEBUG(D_NETERROR, "Bad magic: %08x\n", msg->mxm_magic);
                return -EPROTO;
        }

        if (msg->mxm_version !=
            (flip ? __swab16(MXLND_MSG_VERSION) : MXLND_MSG_VERSION)) {
                CDEBUG(D_NETERROR, "Bad version: %d\n", msg->mxm_version);
                return -EPROTO;
        }

        if (nob < hdr_size) {
                CDEBUG(D_NETERROR, "not enough for a header: %d\n", nob);
                return -EPROTO;
        }

        msg_nob = flip ? __swab32(msg->mxm_nob) : msg->mxm_nob;
        if (msg_nob > nob) {
                CDEBUG(D_NETERROR, "Short message: got %d, wanted %d\n", nob, msg_nob);
                return -EPROTO;
        }

        /* checksum must be computed with mxm_cksum zero and BEFORE anything
         * gets flipped */
        msg_cksum = flip ? __swab32(msg->mxm_cksum) : msg->mxm_cksum;
        msg->mxm_cksum = 0;
        if (msg_cksum != 0 && msg_cksum != mxlnd_cksum(msg, msg_nob)) {
                CDEBUG(D_NETERROR, "Bad checksum\n");
                return -EPROTO;
        }
        msg->mxm_cksum = msg_cksum;

        if (flip) {
                /* leave magic unflipped as a clue to peer endianness */
                __swab16s(&msg->mxm_version);
                CLASSERT (sizeof(msg->mxm_type) == 1);
                CLASSERT (sizeof(msg->mxm_credits) == 1);
                msg->mxm_nob = msg_nob;
                __swab64s(&msg->mxm_srcnid);
                __swab64s(&msg->mxm_srcstamp);
                __swab64s(&msg->mxm_dstnid);
                __swab64s(&msg->mxm_dststamp);
                __swab64s(&msg->mxm_seq);
        }

        if (msg->mxm_srcnid == LNET_NID_ANY) {
                CDEBUG(D_NETERROR, "Bad src nid: %s\n", libcfs_nid2str(msg->mxm_srcnid));
                return -EPROTO;
        }

        /* per-type payload length checks and payload byte-swapping */
        switch (msg->mxm_type) {
        default:
                CDEBUG(D_NETERROR, "Unknown message type %x\n", msg->mxm_type);
                return -EPROTO;

        case MXLND_MSG_NOOP:
                break;

        case MXLND_MSG_EAGER:
                if (msg_nob < offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[0])) {
                        CDEBUG(D_NETERROR, "Short EAGER: %d(%d)\n", msg_nob,
                               (int)offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[0]));
                        return -EPROTO;
                }
                break;

        case MXLND_MSG_PUT_REQ:
                if (msg_nob < hdr_size + sizeof(msg->mxm_u.put_req)) {
                        CDEBUG(D_NETERROR, "Short PUT_REQ: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->mxm_u.put_req)));
                        return -EPROTO;
                }
                if (flip)
                        __swab64s(&msg->mxm_u.put_req.mxprm_cookie);
                break;

        case MXLND_MSG_PUT_ACK:
                if (msg_nob < hdr_size + sizeof(msg->mxm_u.put_ack)) {
                        CDEBUG(D_NETERROR, "Short PUT_ACK: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->mxm_u.put_ack)));
                        return -EPROTO;
                }
                if (flip) {
                        __swab64s(&msg->mxm_u.put_ack.mxpam_src_cookie);
                        __swab64s(&msg->mxm_u.put_ack.mxpam_dst_cookie);
                }
                break;

        case MXLND_MSG_GET_REQ:
                if (msg_nob < hdr_size + sizeof(msg->mxm_u.get_req)) {
                        CDEBUG(D_NETERROR, "Short GET_REQ: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->mxm_u.get_req)));
                        return -EPROTO;
                }
                if (flip) {
                        __swab64s(&msg->mxm_u.get_req.mxgrm_cookie);
                }
                break;

        case MXLND_MSG_CONN_REQ:
        case MXLND_MSG_CONN_ACK:
                if (msg_nob < hdr_size + sizeof(msg->mxm_u.conn_req)) {
                        CDEBUG(D_NETERROR, "Short connreq/ack: %d(%d)\n", msg_nob,
                               (int)(hdr_size + sizeof(msg->mxm_u.conn_req)));
                        return -EPROTO;
                }
                if (flip) {
                        __swab32s(&msg->mxm_u.conn_req.mxcrm_queue_depth);
                        __swab32s(&msg->mxm_u.conn_req.mxcrm_eager_size);
                }
                break;
        }
        return 0;
}
+
+/**
+ * mxlnd_recv_msg - post a message-style (eager) receive
+ * @lntmsg - the LNET msg that this is continuing. If EAGER, then NULL.
+ * @rx - the rx context to post; caller sets nid, peer and conn if known
+ * @msg_type - MXLND message type expected
+ * @cookie - match cookie for this exchange
+ * @length - length of incoming message
+ *
+ * Queues the rx on the pending list and hands it to MX via mx_kirecv().
+ *
+ * Returns 0 on success and -1 on failure
+ */
+int
+mxlnd_recv_msg(lnet_msg_t *lntmsg, struct kmx_ctx *rx, u8 msg_type, u64 cookie, u32 length)
+{
+        mx_return_t     mxret   = MX_SUCCESS;
+        uint64_t        mask    = 0xF00FFFFFFFFFFFFFLL;
+        int             rc      = 0;
+
+        rx->mxc_msg_type = msg_type;
+        rx->mxc_lntmsg[0] = lntmsg; /* may be NULL if EAGER */
+        rx->mxc_cookie = cookie;
+        /* rx->mxc_match may already be set */
+        /* rx->mxc_seg.segment_ptr is already set */
+        rx->mxc_seg.segment_length = length;
+        rx->mxc_deadline = jiffies + MXLND_COMM_TIMEOUT;
+
+        rc = mxlnd_q_pending_ctx(rx);
+        if (rc == -1) {
+                /* FIXME the conn is disconnected, now what? */
+                return -1;
+        }
+
+        mxret = mx_kirecv(kmxlnd_data.kmx_endpt, &rx->mxc_seg, 1, MX_PIN_PHYSICAL,
+                          cookie, mask, (void *) rx, &rx->mxc_mxreq);
+        if (mxret == MX_SUCCESS)
+                return 0;
+
+        /* undo the pending-queue step done above before failing */
+        mxlnd_deq_pending_ctx(rx);
+        CDEBUG(D_NETERROR, "mx_kirecv() failed with %s (%d)\n",
+                           mx_strerror(mxret), (int) mxret);
+        return -1;
+}
+
+
+/**
+ * mxlnd_unexpected_recv - this is the callback function that will handle 
+ *                         unexpected receives
+ * @context - NULL, ignore
+ * @source - the peer's mx_endpoint_addr_t
+ * @match_value - the msg's bit, should be MXLND_MASK_EAGER
+ * @length - length of incoming message
+ * @data_if_available - ignore
+ *
+ * If it is an eager-sized msg, we will call recv_msg() with the actual
+ * length. If it is a large message, we will call recv_msg() with a
+ * length of 0 bytes to drop it because we should never have a large,
+ * unexpected message.
+ *
+ * NOTE - The MX library blocks until this function completes. Make it as fast as
+ * possible. DO NOT allocate memory which can block!
+ *
+ * If we cannot get a rx or the conn is closed, drop the message on the floor
+ * (i.e. recv 0 bytes and ignore).
+ *
+ * Always returns MX_RECV_CONTINUE so MX keeps delivering.
+ */
+mx_unexp_handler_action_t
+mxlnd_unexpected_recv(void *context, mx_endpoint_addr_t source,
+                 uint64_t match_value, uint32_t length, void *data_if_available)
+{
+        int             ret             = 0;
+        struct kmx_ctx  *rx             = NULL;
+        mx_ksegment_t   seg;
+        u8              msg_type        = 0;
+        u8              error           = 0;
+        u64             cookie          = 0LL;
+
+        /* we do not register a context, so a non-NULL one is unexpected */
+        if (context != NULL) {
+                CDEBUG(D_NETERROR, "unexpected receive with non-NULL context\n");
+        }
+
+#if MXLND_DEBUG
+        CDEBUG(D_NET, "unexpected_recv() bits=0x%llx length=%d\n", match_value, length);
+#endif
+
+        rx = mxlnd_get_idle_rx();
+        if (rx != NULL) {
+                /* decode msg_type/error/cookie from the 64-bit match bits */
+                mxlnd_parse_match(match_value, &msg_type, &error, &cookie);
+                if (length <= MXLND_EAGER_SIZE) {
+                        ret = mxlnd_recv_msg(NULL, rx, msg_type, match_value, length);
+                } else {
+                        /* large unexpected messages should not happen; post a
+                         * 0-byte recv to consume and drop it */
+                        CDEBUG(D_NETERROR, "unexpected large receive with "
+                                           "match_value=0x%llx length=%d\n", 
+                                           match_value, length);
+                        ret = mxlnd_recv_msg(NULL, rx, msg_type, match_value, 0);
+                }
+                if (ret == 0) {
+                        /* the conn (if any) is stashed in the MX endpoint
+                         * address context */
+                        struct kmx_conn *conn   = NULL;
+                        mx_get_endpoint_addr_context(source, (void **) &conn);
+                        if (conn != NULL) {
+                                mxlnd_conn_addref(conn);
+                                rx->mxc_conn = conn;
+                                rx->mxc_peer = conn->mxk_peer;
+                                if (conn->mxk_peer != NULL) {
+                                        rx->mxc_nid = conn->mxk_peer->mxp_nid;
+                                } else {
+                                        CDEBUG(D_NETERROR, "conn is 0x%p and peer "
+                                                           "is NULL\n", conn);
+                                }
+                        }
+                } else {
+                        CDEBUG(D_NETERROR, "could not post receive\n");
+                        mxlnd_put_idle_rx(rx);
+                }
+        }
+
+        /* no rx available or the post failed: consume the message with a
+         * 0-byte receive so MX can make progress, then drop it */
+        if (rx == NULL || ret != 0) {
+                if (rx == NULL) {
+                        CDEBUG(D_NETERROR, "no idle rxs available - dropping rx\n");
+                } else {
+                        /* ret != 0 */
+                        CDEBUG(D_NETERROR, "disconnected peer - dropping rx\n");
+                }
+                seg.segment_ptr = 0LL;
+                seg.segment_length = 0;
+                mx_kirecv(kmxlnd_data.kmx_endpt, &seg, 1, MX_PIN_PHYSICAL,
+                          match_value, 0xFFFFFFFFFFFFFFFFLL, NULL, NULL);
+        }
+
+        return MX_RECV_CONTINUE;
+}
+
+
+/**
+ * mxlnd_get_peer_info - look up the index'th peer
+ * @index - zero-based position in the (hashed) global peer list
+ * @nidp - set to the peer's nid on success
+ * @count - set to the peer's reference count on success
+ *
+ * Returns 0 when the index'th peer is found and -ENOENT otherwise.
+ *
+ * BUGFIX: the previous version never returned 0 - it kept iterating after
+ * a match (overwriting the out-params with every later peer) and always
+ * fell through to return -ENOENT.  Now we stop and report success as soon
+ * as the requested peer is reached.
+ */
+int
+mxlnd_get_peer_info(int index, lnet_nid_t *nidp, int *count)
+{
+        int                      i      = 0;
+        struct kmx_peer         *peer   = NULL;
+
+        read_lock(&kmxlnd_data.kmx_peers_lock);
+        for (i = 0; i < MXLND_HASH_SIZE; i++) {
+                list_for_each_entry(peer, &kmxlnd_data.kmx_peers[i], mxp_peers) {
+                        if (index-- > 0)
+                                continue;
+
+                        *nidp = peer->mxp_nid;
+                        *count = atomic_read(&peer->mxp_refcount);
+                        read_unlock(&kmxlnd_data.kmx_peers_lock);
+                        return 0;
+                }
+        }
+        read_unlock(&kmxlnd_data.kmx_peers_lock);
+
+        return -ENOENT;
+}
+
+/* Unlink @peer from the global peer list, disconnect its connection (if
+ * any), and drop the reference that the list held.  The caller holds
+ * kmx_peers_lock for writing. */
+void
+mxlnd_del_peer_locked(struct kmx_peer *peer)
+{
+        /* take the peer off the global list */
+        list_del_init(&peer->mxp_peers);
+        if (peer->mxp_conn != NULL)
+                mxlnd_conn_disconnect(peer->mxp_conn, 0, 0);
+        /* drop the global list's ref */
+        mxlnd_peer_decref(peer);
+}
+
+/* Delete the peer with @nid, or every peer when nid == LNET_NID_ANY.
+ * Returns 0 on success and -ENOENT if a specific nid is unknown. */
+int
+mxlnd_del_peer(lnet_nid_t nid)
+{
+        struct kmx_peer *peer   = NULL;
+        struct kmx_peer *tmp    = NULL;
+        int             rc      = 0;
+        int             hash    = 0;
+
+        /* the lookup does its own locking, so do it before taking the
+         * peers lock for writing */
+        if (nid != LNET_NID_ANY)
+                peer = mxlnd_find_peer_by_nid(nid);
+
+        write_lock(&kmxlnd_data.kmx_peers_lock);
+        if (nid == LNET_NID_ANY) {
+                /* remove every peer in every hash bucket */
+                for (hash = 0; hash < MXLND_HASH_SIZE; hash++) {
+                        list_for_each_entry_safe(peer, tmp,
+                                                 &kmxlnd_data.kmx_peers[hash],
+                                                 mxp_peers)
+                                mxlnd_del_peer_locked(peer);
+                }
+        } else if (peer != NULL) {
+                mxlnd_del_peer_locked(peer);
+        } else {
+                rc = -ENOENT;
+        }
+        write_unlock(&kmxlnd_data.kmx_peers_lock);
+
+        return rc;
+}
+
+/* Return the index'th connection over all peers, with a reference the
+ * caller (mxlnd_ctl) must drop, or NULL if index is out of range. */
+struct kmx_conn *
+mxlnd_get_conn_by_idx(int index)
+{
+        struct kmx_peer *peer   = NULL;
+        struct kmx_conn *conn   = NULL;
+        int             hash    = 0;
+
+        read_lock(&kmxlnd_data.kmx_peers_lock);
+        for (hash = 0; hash < MXLND_HASH_SIZE; hash++) {
+                list_for_each_entry(peer, &kmxlnd_data.kmx_peers[hash], mxp_peers) {
+                        list_for_each_entry(conn, &peer->mxp_conns, mxk_list) {
+                                if (index-- > 0)
+                                        continue;
+
+                                /* add ref here, dec in ctl() */
+                                mxlnd_conn_addref(conn);
+                                read_unlock(&kmxlnd_data.kmx_peers_lock);
+                                return conn;
+                        }
+                }
+        }
+        read_unlock(&kmxlnd_data.kmx_peers_lock);
+
+        return NULL;
+}
+
+/* Disconnect every connection on @peer.  The caller holds the peers
+ * lock (see mxlnd_close_matching_conns). */
+void
+mxlnd_close_matching_conns_locked(struct kmx_peer *peer)
+{
+        struct kmx_conn *conn   = NULL;
+        struct kmx_conn *tmp    = NULL;
+
+        /* _safe iteration: disconnect may remove the conn from the list */
+        list_for_each_entry_safe(conn, tmp, &peer->mxp_conns, mxk_list)
+                mxlnd_conn_disconnect(conn, 0, 0);
+}
+
+/* Close the connections of the peer with @nid, or of every peer when
+ * nid == LNET_NID_ANY.  Returns 0 or -ENOENT for an unknown nid. */
+int
+mxlnd_close_matching_conns(lnet_nid_t nid)
+{
+        struct kmx_peer *peer   = NULL;
+        int             rc      = 0;
+        int             hash    = 0;
+
+        read_lock(&kmxlnd_data.kmx_peers_lock);
+        if (nid == LNET_NID_ANY) {
+                for (hash = 0; hash < MXLND_HASH_SIZE; hash++) {
+                        list_for_each_entry(peer, &kmxlnd_data.kmx_peers[hash],
+                                            mxp_peers)
+                                mxlnd_close_matching_conns_locked(peer);
+                }
+        } else {
+                peer = mxlnd_find_peer_by_nid(nid);
+                if (peer != NULL)
+                        mxlnd_close_matching_conns_locked(peer);
+                else
+                        rc = -ENOENT;
+        }
+        read_unlock(&kmxlnd_data.kmx_peers_lock);
+
+        return rc;
+}
+
+/**
+ * mxlnd_ctl - handle an LNET ioctl aimed at MXLND
+ * @ni - LNET interface handle (must be our interface)
+ * @cmd - the ioctl command
+ * @arg - the ioctl data (a struct libcfs_ioctl_data)
+ *
+ * Implements GET_PEER, DEL_PEER, GET_CONN and CLOSE_CONNECTION.
+ * Returns 0 on success, the handler's error, or -EINVAL for an
+ * unknown command.
+ */
+int
+mxlnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+        struct libcfs_ioctl_data *data  = arg;
+        int                       ret   = -EINVAL;
+
+        LASSERT (ni == kmxlnd_data.kmx_ni);
+
+        switch (cmd) {
+        case IOC_LIBCFS_GET_PEER: {
+                lnet_nid_t      nid     = 0;
+                int             count   = 0;
+
+                /* look up the data->ioc_count'th peer */
+                ret = mxlnd_get_peer_info(data->ioc_count, &nid, &count);
+                data->ioc_nid    = nid;
+                data->ioc_count  = count;
+                break;
+        }
+        case IOC_LIBCFS_DEL_PEER: {
+                ret = mxlnd_del_peer(data->ioc_nid);
+                break;
+        }
+        case IOC_LIBCFS_GET_CONN: {
+                struct kmx_conn *conn = NULL;
+
+                /* get_conn_by_idx() takes a conn ref on success */
+                conn = mxlnd_get_conn_by_idx(data->ioc_count);
+                if (conn == NULL) {
+                        ret = -ENOENT;
+                } else {
+                        ret = 0;
+                        data->ioc_nid = conn->mxk_peer->mxp_nid;
+                        mxlnd_conn_decref(conn); /* dec ref taken in get_conn_by_idx() */
+                }
+                break;
+        }
+        case IOC_LIBCFS_CLOSE_CONNECTION: {
+                ret = mxlnd_close_matching_conns(data->ioc_nid);
+                break;
+        }
+        default:
+                CDEBUG(D_NETERROR, "unknown ctl(%d)\n", cmd);
+                break;
+        }
+        
+        return ret;
+}
+
+/**
+ * mxlnd_peer_queue_tx_locked - queue the tx on its connection
+ * @tx
+ *
+ * Add the tx to the conn's credit, msg or data queue.  The caller holds
+ * the conn's mxk_lock (see mxlnd_peer_queue_tx).
+ */
+void
+mxlnd_peer_queue_tx_locked(struct kmx_ctx *tx)
+{
+        u8                      msg_type        = tx->mxc_msg_type;
+        struct kmx_conn         *conn           = tx->mxc_conn;
+
+        LASSERT (msg_type != 0);
+        LASSERT (tx->mxc_nid != 0);
+        LASSERT (tx->mxc_peer != NULL);
+        LASSERT (tx->mxc_conn != NULL);
+
+        /* tag the tx with the conn's current incarnation */
+        tx->mxc_incarnation = conn->mxk_incarnation;
+
+        if (msg_type != MXLND_MSG_PUT_DATA &&
+            msg_type != MXLND_MSG_GET_DATA) {
+                /* msg style tx */
+                if (mxlnd_tx_requires_credit(tx)) {
+                        /* credit-consuming msgs wait on the credit queue */
+                        list_add_tail(&tx->mxc_list, &conn->mxk_tx_credit_queue);
+                        conn->mxk_ntx_msgs++;
+                } else if (msg_type == MXLND_MSG_CONN_REQ ||
+                           msg_type == MXLND_MSG_CONN_ACK) {
+                        /* put conn msgs at the front of the queue */
+                        list_add(&tx->mxc_list, &conn->mxk_tx_free_queue);
+                } else {
+                        /* PUT_ACK, PUT_NAK */
+                        list_add_tail(&tx->mxc_list, &conn->mxk_tx_free_queue);
+                        conn->mxk_ntx_msgs++;
+                }
+        } else {
+                /* data style tx */
+                list_add_tail(&tx->mxc_list, &conn->mxk_tx_free_queue);
+                conn->mxk_ntx_data++;
+        }
+
+        return;
+}
+
+/**
+ * mxlnd_peer_queue_tx - lock the conn and queue the tx on it
+ * @tx
+ *
+ * Takes the conn's mxk_lock and delegates the actual queueing to
+ * mxlnd_peer_queue_tx_locked().
+ */
+static inline void
+mxlnd_peer_queue_tx(struct kmx_ctx *tx)
+{
+        struct kmx_conn *conn = tx->mxc_conn;
+
+        LASSERT(tx->mxc_peer != NULL);
+        LASSERT(conn != NULL);
+
+        spin_lock(&conn->mxk_lock);
+        mxlnd_peer_queue_tx_locked(tx);
+        spin_unlock(&conn->mxk_lock);
+}
+
+/**
+ * mxlnd_queue_tx - queue the tx on its peer's conn or the global tx queue
+ * @tx
+ *
+ * If the peer is known, queue the tx on its conn and kick the sender;
+ * otherwise add the tx to the global queue and up the tx_queue_sem so
+ * the tx_queued thread will resolve the peer.
+ */
+void
+mxlnd_queue_tx(struct kmx_ctx *tx)
+{
+        int             ret     = 0;
+        struct kmx_peer *peer   = tx->mxc_peer;
+        LASSERT (tx->mxc_nid != 0);
+
+        if (peer != NULL) {
+                if (peer->mxp_incompatible &&
+                    tx->mxc_msg_type != MXLND_MSG_CONN_ACK) {
+                        /* let this fail now */
+                        tx->mxc_status.code = -ECONNABORTED;
+                        mxlnd_put_idle_tx(tx);
+                        return;
+                }
+                if (tx->mxc_conn == NULL) {
+                        /* NOTE(review): mxlnd_conn_alloc()'s return value is
+                         * not checked; if it can fail and leave mxc_conn NULL,
+                         * the LASSERT below fires - confirm it cannot fail */
+                        mxlnd_conn_alloc(&tx->mxc_conn, peer);
+                }
+                LASSERT(tx->mxc_conn != NULL);
+                mxlnd_peer_queue_tx(tx);
+                ret = mxlnd_check_sends(peer);
+        } else {
+                /* unknown peer: hand off to the tx_queued thread */
+                spin_lock(&kmxlnd_data.kmx_tx_queue_lock);
+                list_add_tail(&tx->mxc_list, &kmxlnd_data.kmx_tx_queue);
+                spin_unlock(&kmxlnd_data.kmx_tx_queue_lock);
+                up(&kmxlnd_data.kmx_tx_queue_sem);
+        }
+        return;
+}
+
+/**
+ * mxlnd_setup_iov - map a kernel iovec span into MX segments
+ * @ctx - the tx or rx context whose segment list is being built
+ * @niov - number of entries in @iov
+ * @iov - the vector of kernel virtual buffers
+ * @offset - byte offset into the vector where the payload starts
+ * @nob - number of bytes of payload
+ *
+ * Locates the iov entries covering [offset, offset+nob), allocates a
+ * matching mx_ksegment_t array, and stores it in ctx->mxc_seg_list
+ * (ctx->mxc_nseg entries).  Caller owns (frees) the segment list.
+ *
+ * Returns 0 on success (or trivially if niov == 0) and -1 if the
+ * segment array cannot be allocated.
+ */
+int
+mxlnd_setup_iov(struct kmx_ctx *ctx, u32 niov, struct iovec *iov, u32 offset, u32 nob)
+{
+        int             i                       = 0;
+        int             sum                     = 0;
+        int             old_sum                 = 0;
+        int             nseg                    = 0;
+        int             first_iov               = -1;
+        int             first_iov_offset        = 0;
+        int             first_found             = 0;
+        int             last_iov                = -1;
+        int             last_iov_length         = 0;
+        mx_ksegment_t  *seg                     = NULL;
+
+        if (niov == 0) return 0;
+        LASSERT(iov != NULL);
+
+        /* accumulate iov lengths to find the entries holding the first and
+         * last payload bytes, plus the offset/length within those entries */
+        for (i = 0; i < niov; i++) {
+                sum = old_sum + (u32) iov[i].iov_len;
+                if (!first_found && (sum > offset)) {
+                        first_iov = i;
+                        first_iov_offset = offset - old_sum;
+                        first_found = 1;
+                        /* restart the running total at the usable bytes of
+                         * this first entry */
+                        sum = (u32) iov[i].iov_len - first_iov_offset;
+                        old_sum = 0;
+                }
+                if (sum >= nob) {
+                        last_iov = i;
+                        last_iov_length = (u32) iov[i].iov_len - (sum - nob);
+                        if (first_iov == last_iov) last_iov_length -= first_iov_offset;
+                        break;
+                }
+                old_sum = sum;
+        }
+        LASSERT(first_iov >= 0 && last_iov >= first_iov);
+        nseg = last_iov - first_iov + 1;
+        LASSERT(nseg > 0);
+        
+        MXLND_ALLOC (seg, nseg * sizeof(*seg));
+        if (seg == NULL) {
+                CDEBUG(D_NETERROR, "MXLND_ALLOC() failed\n");
+                return -1;
+        }
+        memset(seg, 0, nseg * sizeof(*seg));
+        ctx->mxc_nseg = nseg;
+        sum = 0;
+        /* fill a segment per iov entry; trim the first and last to the
+         * computed offset/length */
+        for (i = 0; i < nseg; i++) {
+                seg[i].segment_ptr = MX_KVA_TO_U64(iov[first_iov + i].iov_base);
+                seg[i].segment_length = (u32) iov[first_iov + i].iov_len;
+                if (i == 0) {
+                        seg[i].segment_ptr += (u64) first_iov_offset;
+                        seg[i].segment_length -= (u32) first_iov_offset;
+                }
+                if (i == (nseg - 1)) {
+                        seg[i].segment_length = (u32) last_iov_length;
+                }
+                sum += seg[i].segment_length;
+        }
+        ctx->mxc_seg_list = seg;
+        ctx->mxc_pin_type = MX_PIN_KERNEL;
+#ifdef MX_PIN_FULLPAGES
+        ctx->mxc_pin_type |= MX_PIN_FULLPAGES;
+#endif
+        LASSERT(nob == sum);
+        return 0;
+}
+
+int
+mxlnd_setup_kiov(struct kmx_ctx *ctx, u32 niov, lnet_kiov_t *kiov, u32 offset, u32 nob)
+{
+        int             i                       = 0;
+        int             sum                     = 0;
+        int             old_sum                 = 0;
+        int             nseg                    = 0;
+        int             first_kiov              = -1;
+        int             first_kiov_offset       = 0;
+        int             first_found             = 0;
+        int             last_kiov               = -1;
+        int             last_kiov_length        = 0;
+        mx_ksegment_t  *seg                     = NULL;
+
+        if (niov == 0) return 0;
+        LASSERT(kiov != NULL);
+
+        for (i = 0; i < niov; i++) {
+                sum = old_sum + kiov[i].kiov_len;
+                if (i == 0) sum -= kiov[i].kiov_offset;
+                if (!first_found && (sum > offset)) {
+                        first_kiov = i;
+                        first_kiov_offset = offset - old_sum;
+                        //if (i == 0) first_kiov_offset + kiov[i].kiov_offset;
+                        if (i == 0) first_kiov_offset = kiov[i].kiov_offset;
+                        first_found = 1;
+                        sum = kiov[i].kiov_len - first_kiov_offset;
+                        old_sum = 0;
+                }
+                if (sum >= nob) {
+                        last_kiov = i;
+                        last_kiov_length = kiov[i].kiov_len - (sum - nob);
+                        if (first_kiov == last_kiov) last_kiov_length -= first_kiov_offset;
+                        break;
+                }
+                old_sum = sum;
+        }
+        LASSERT(first_kiov >= 0 && last_kiov >= first_kiov);
+        nseg = last_kiov - first_kiov + 1;
+        LASSERT(nseg > 0);
+        
+        MXLND_ALLOC (seg, nseg * sizeof(*seg));
+        if (seg == NULL) {
+                CDEBUG(D_NETERROR, "MXLND_ALLOC() failed\n");
+                return -1;
+        }
+        memset(seg, 0, niov * sizeof(*seg));
+        ctx->mxc_nseg = niov;
+        sum = 0;
+        for (i = 0; i < niov; i++) {
+                seg[i].segment_ptr = lnet_page2phys(kiov[first_kiov + i].kiov_page);
+                seg[i].segment_length = kiov[first_kiov + i].kiov_len;
+                if (i == 0) {
+                        seg[i].segment_ptr += (u64) first_kiov_offset;
+                        /* we have to add back the original kiov_offset */
+                        seg[i].segment_length -= first_kiov_offset +
+                                                 kiov[first_kiov].kiov_offset;
+                }
+                if (i == (nseg - 1)) {
+                        seg[i].segment_length = last_kiov_length;
+                }
+                sum += seg[i].segment_length;
+        }
+        ctx->mxc_seg_list = seg;
+        ctx->mxc_pin_type = MX_PIN_PHYSICAL;
+#ifdef MX_PIN_FULLPAGES
+        ctx->mxc_pin_type |= MX_PIN_FULLPAGES;
+#endif
+        LASSERT(nob == sum);
+        return 0;
+}
+
+/**
+ * mxlnd_send_nak - send a PUT_ACK carrying an error status (a "NAK")
+ * @tx - the tx to use; its mxc_nid must already be set
+ * @nid - destination nid.  NOTE(review): unused - the message is
+ *        addressed with tx->mxc_nid instead; confirm callers always
+ *        set tx->mxc_nid == nid
+ * @type - must be MXLND_MSG_PUT_ACK (the NAK is expressed via @status)
+ * @status - error code packed into the top 12 bits of the dst cookie
+ * @cookie - the PUT cookie being answered
+ */
+void
+mxlnd_send_nak(struct kmx_ctx *tx, lnet_nid_t nid, int type, int status, __u64 cookie)
+{
+        LASSERT(type == MXLND_MSG_PUT_ACK);
+        mxlnd_init_tx_msg(tx, type, sizeof(kmx_putack_msg_t), tx->mxc_nid);
+        tx->mxc_cookie = cookie;
+        tx->mxc_msg->mxm_u.put_ack.mxpam_src_cookie = cookie;
+        tx->mxc_msg->mxm_u.put_ack.mxpam_dst_cookie = ((u64) status << 52); /* error code */
+        tx->mxc_match = mxlnd_create_match(tx, status);
+
+        mxlnd_queue_tx(tx);
+}
+
+
+/**
+ * mxlnd_send_data - get tx, map [k]iov, queue tx
+ * @ni
+ * @lntmsg
+ * @peer
+ * @msg_type - MXLND_MSG_PUT_DATA or MXLND_MSG_GET_DATA
+ * @cookie - match cookie; its top 12 bits must be clear
+ *
+ * This setups the DATA send for PUT or GET.
+ *
+ * On success, it queues the tx, on failure it calls lnet_finalize()
+ */
+void
+mxlnd_send_data(lnet_ni_t *ni, lnet_msg_t *lntmsg, struct kmx_peer *peer, u8 msg_type, u64 cookie)
+{
+        int                     ret             = 0;
+        lnet_process_id_t       target          = lntmsg->msg_target;
+        unsigned int            niov            = lntmsg->msg_niov;
+        struct iovec           *iov             = lntmsg->msg_iov;
+        lnet_kiov_t            *kiov            = lntmsg->msg_kiov;
+        unsigned int            offset          = lntmsg->msg_offset;
+        unsigned int            nob             = lntmsg->msg_len;
+        struct kmx_ctx         *tx              = NULL;
+
+        LASSERT(lntmsg != NULL);
+        LASSERT(peer != NULL);
+        LASSERT(msg_type == MXLND_MSG_PUT_DATA || msg_type == MXLND_MSG_GET_DATA);
+        LASSERT((cookie>>52) == 0);
+
+        tx = mxlnd_get_idle_tx();
+        if (tx == NULL) {
+                CDEBUG(D_NETERROR, "Can't allocate %s tx for %s\n",
+                        msg_type == MXLND_MSG_PUT_DATA ? "PUT_DATA" : "GET_DATA",
+                        libcfs_nid2str(target.nid));
+                goto failed_0;
+        }
+        tx->mxc_nid = target.nid;
+        /* NOTE(review): assumes peer->mxp_conn != NULL here - confirm
+         * callers only reach this with an established conn */
+        mxlnd_conn_addref(peer->mxp_conn);
+        tx->mxc_peer = peer;
+        tx->mxc_conn = peer->mxp_conn;
+        tx->mxc_msg_type = msg_type;
+        tx->mxc_deadline = jiffies + MXLND_COMM_TIMEOUT;
+        tx->mxc_state = MXLND_CTX_PENDING;
+        tx->mxc_lntmsg[0] = lntmsg;
+        tx->mxc_cookie = cookie;
+        tx->mxc_match = mxlnd_create_match(tx, 0);
+
+        /* This setups up the mx_ksegment_t to send the DATA payload  */
+        if (nob == 0) {
+                /* do not setup the segments */
+                CDEBUG(D_NETERROR, "nob = 0; why didn't we use an EAGER reply "
+                                   "to %s?\n", libcfs_nid2str(target.nid));
+                ret = 0;
+        } else if (kiov == NULL) {
+                ret = mxlnd_setup_iov(tx, niov, iov, offset, nob);
+        } else {
+                ret = mxlnd_setup_kiov(tx, niov, kiov, offset, nob);
+        }
+        if (ret != 0) {
+                CDEBUG(D_NETERROR, "Can't setup send DATA for %s\n", 
+                                   libcfs_nid2str(target.nid));
+                tx->mxc_status.code = -EIO;
+                goto failed_1;
+        }
+        mxlnd_queue_tx(tx);
+        return;
+
+failed_1:
+        /* drop the conn ref taken above, then recycle the tx */
+        mxlnd_conn_decref(peer->mxp_conn);
+        mxlnd_put_idle_tx(tx);
+        return;
+
+failed_0:
+        CDEBUG(D_NETERROR, "no tx avail\n");
+        lnet_finalize(ni, lntmsg, -EIO);
+        return;
+}
+
+/**
+ * mxlnd_recv_data - map [k]iov, post rx
+ * @ni
+ * @lntmsg
+ * @rx - caller supplies the rx; mxc_peer may be NULL for a GET
+ * @msg_type - MXLND_MSG_PUT_DATA or MXLND_MSG_GET_DATA
+ * @cookie - match cookie; its top 12 bits must be clear
+ *
+ * This setups the DATA receive for PUT or GET.
+ *
+ * On success, it returns 0, on failure it returns -1
+ */
+int
+mxlnd_recv_data(lnet_ni_t *ni, lnet_msg_t *lntmsg, struct kmx_ctx *rx, u8 msg_type, u64 cookie)
+{
+        int                     ret             = 0;
+        lnet_process_id_t       target          = lntmsg->msg_target;
+        unsigned int            niov            = lntmsg->msg_niov;
+        struct iovec           *iov             = lntmsg->msg_iov;
+        lnet_kiov_t            *kiov            = lntmsg->msg_kiov;
+        unsigned int            offset          = lntmsg->msg_offset;
+        unsigned int            nob             = lntmsg->msg_len;
+        mx_return_t             mxret           = MX_SUCCESS;
+
+        /* above assumes MXLND_MSG_PUT_DATA; a GET reply lands in the
+         * memory descriptor instead */
+        if (msg_type == MXLND_MSG_GET_DATA) {
+                niov = lntmsg->msg_md->md_niov;
+                iov = lntmsg->msg_md->md_iov.iov;
+                kiov = lntmsg->msg_md->md_iov.kiov;
+                offset = 0;
+                nob = lntmsg->msg_md->md_length;
+        }
+
+        LASSERT(lntmsg != NULL);
+        LASSERT(rx != NULL);
+        LASSERT(msg_type == MXLND_MSG_PUT_DATA || msg_type == MXLND_MSG_GET_DATA);
+        LASSERT((cookie>>52) == 0); /* ensure top 12 bits are 0 */
+
+        rx->mxc_msg_type = msg_type;
+        rx->mxc_deadline = jiffies + MXLND_COMM_TIMEOUT;
+        rx->mxc_state = MXLND_CTX_PENDING;
+        rx->mxc_nid = target.nid;
+        /* if posting a GET_DATA, we may not yet know the peer */
+        if (rx->mxc_peer != NULL) {
+                rx->mxc_conn = rx->mxc_peer->mxp_conn;
+        }
+        rx->mxc_lntmsg[0] = lntmsg;
+        rx->mxc_cookie = cookie;
+        rx->mxc_match = mxlnd_create_match(rx, 0);
+        /* This setups up the mx_ksegment_t to receive the DATA payload  */
+        if (kiov == NULL) {
+                ret = mxlnd_setup_iov(rx, niov, iov, offset, nob);
+        } else {
+                ret = mxlnd_setup_kiov(rx, niov, kiov, offset, nob);
+        }
+        if (msg_type == MXLND_MSG_GET_DATA) {
+                /* the REPLY msg is finalized when the GET data arrives */
+                rx->mxc_lntmsg[1] = lnet_create_reply_msg(kmxlnd_data.kmx_ni, lntmsg);
+                if (rx->mxc_lntmsg[1] == NULL) {
+                        CDEBUG(D_NETERROR, "Can't create reply for GET -> %s\n",
+                                           libcfs_nid2str(target.nid));
+                        ret = -1;
+                }
+        }
+        if (ret != 0) {
+                /* NOTE(review): the segment list / reply msg set up above are
+                 * not torn down here - confirm the caller's failure path
+                 * releases them */
+                CDEBUG(D_NETERROR, "Can't setup %s rx for %s\n",
+                       msg_type == MXLND_MSG_PUT_DATA ? "PUT_DATA" : "GET_DATA",
+                       libcfs_nid2str(target.nid));
+                return -1;
+        }
+        ret = mxlnd_q_pending_ctx(rx);
+        if (ret == -1) {
+                return -1;
+        }
+        CDEBUG(D_NET, "receiving %s 0x%llx\n", mxlnd_msgtype_to_str(msg_type), rx->mxc_cookie);
+        mxret = mx_kirecv(kmxlnd_data.kmx_endpt, 
+                          rx->mxc_seg_list, rx->mxc_nseg,
+                          rx->mxc_pin_type, rx->mxc_match,
+                          0xF00FFFFFFFFFFFFFLL, (void *) rx, 
+                          &rx->mxc_mxreq);
+        if (mxret != MX_SUCCESS) {
+                if (rx->mxc_conn != NULL) {
+                        mxlnd_deq_pending_ctx(rx);
+                }
+                CDEBUG(D_NETERROR, "mx_kirecv() failed with %d for %s\n", 
+                                   (int) mxret, libcfs_nid2str(target.nid));
+                return -1;
+        }
+
+        return 0;
+}
+
+/**
+ * mxlnd_send - the LND required send function
+ * @ni
+ * @private
+ * @lntmsg
+ *
+ * This must not block. Since we may not have a peer struct for the receiver,
+ * it will append send messages on a global tx list. We will then up the
+ * tx_queued's semaphore to notify it of the new send. 
+ */
+int
+mxlnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+        int                     ret             = 0;
+        int                     type            = lntmsg->msg_type;
+        lnet_hdr_t             *hdr             = &lntmsg->msg_hdr;
+        lnet_process_id_t       target          = lntmsg->msg_target;
+        lnet_nid_t              nid             = target.nid;
+        int                     target_is_router = lntmsg->msg_target_is_router;
+        int                     routing         = lntmsg->msg_routing;
+        unsigned int            payload_niov    = lntmsg->msg_niov;
+        struct iovec           *payload_iov     = lntmsg->msg_iov;
+        lnet_kiov_t            *payload_kiov    = lntmsg->msg_kiov;
+        unsigned int            payload_offset  = lntmsg->msg_offset;
+        unsigned int            payload_nob     = lntmsg->msg_len;
+        struct kmx_ctx         *tx              = NULL;
+        struct kmx_msg         *txmsg           = NULL;
+        struct kmx_ctx         *rx              = (struct kmx_ctx *) private; /* for REPLY */
+        struct kmx_ctx         *rx_data         = NULL;
+        struct kmx_conn        *conn            = NULL;
+        int                     nob             = 0;
+        uint32_t                length          = 0;
+        struct kmx_peer         *peer           = NULL;
+
+        CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+                       payload_nob, payload_niov, libcfs_id2str(target));
+
+        LASSERT (payload_nob == 0 || payload_niov > 0);
+        LASSERT (payload_niov <= LNET_MAX_IOV);
+        /* payload is either all vaddrs or all pages */
+        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+
+        /* private is used on LNET_GET_REPLY only, NULL for all other cases */
+
+        /* NOTE we may not know the peer if it is the very first PUT_REQ or GET_REQ
+         * to a new peer, use the nid */
+        peer = mxlnd_find_peer_by_nid(nid);
+        if (peer != NULL) {
+                conn = peer->mxp_conn;
+                if (conn) mxlnd_conn_addref(conn);
+        }
+        if (conn == NULL && peer != NULL) {
+                CDEBUG(D_NETERROR, "conn==NULL peer=0x%p nid=0x%llx payload_nob=%d type=%s\n", 
+                       peer, nid, payload_nob, ((type==LNET_MSG_PUT) ? "PUT" : 
+                       ((type==LNET_MSG_GET) ? "GET" : "Other")));
+        }
+
+        switch (type) {
+        case LNET_MSG_ACK:
+                LASSERT (payload_nob == 0);
+                break;
+
+        case LNET_MSG_REPLY:
+        case LNET_MSG_PUT:
+                /* Is the payload small enough not to need DATA? */
+                nob = offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[payload_nob]);
+                if (nob <= MXLND_EAGER_SIZE)
+                        break;                  /* send EAGER */
+
+                tx = mxlnd_get_idle_tx();
+                if (unlikely(tx == NULL)) {
+                        CDEBUG(D_NETERROR, "Can't allocate %s tx for %s\n",
+                               type == LNET_MSG_PUT ? "PUT" : "REPLY",
+                               libcfs_nid2str(nid));
+                        if (conn) mxlnd_conn_decref(conn);
+                        return -ENOMEM;
+                }
+
+                /* the peer may be NULL */
+                tx->mxc_peer = peer;
+                tx->mxc_conn = conn; /* may be NULL */
+                /* we added a conn ref above */
+                mxlnd_init_tx_msg (tx, MXLND_MSG_PUT_REQ, sizeof(kmx_putreq_msg_t), nid);
+                txmsg = tx->mxc_msg;
+                txmsg->mxm_u.put_req.mxprm_hdr = *hdr;
+                txmsg->mxm_u.put_req.mxprm_cookie = tx->mxc_cookie;
+                tx->mxc_match = mxlnd_create_match(tx, 0);
+
+                /* we must post a receive _before_ sending the request.
+                 * we need to determine how much to receive, it will be either
+                 * a put_ack or a put_nak. The put_ack is larger, so use it. */
+
+                rx = mxlnd_get_idle_rx();
+                if (unlikely(rx == NULL)) {
+                        CDEBUG(D_NETERROR, "Can't allocate rx for PUT_ACK for %s\n",
+                                           libcfs_nid2str(nid));
+                        mxlnd_put_idle_tx(tx);
+                        if (conn) mxlnd_conn_decref(conn); /* for the ref taken above */
+                        return -ENOMEM;
+                }
+                rx->mxc_nid = nid;
+                rx->mxc_peer = peer;
+                /* conn may be NULL but unlikely since the first msg is always small */
+                if (conn) mxlnd_conn_addref(conn); /* for this rx */
+                rx->mxc_conn = conn;
+                rx->mxc_msg_type = MXLND_MSG_PUT_ACK;
+                rx->mxc_cookie = tx->mxc_cookie;
+                rx->mxc_match = mxlnd_create_match(rx, 0);
+
+                length = offsetof(kmx_msg_t, mxm_u) + sizeof(kmx_putack_msg_t);
+                ret = mxlnd_recv_msg(lntmsg, rx, MXLND_MSG_PUT_ACK, rx->mxc_match, length);
+                if (unlikely(ret != 0)) {
+                        CDEBUG(D_NETERROR, "recv_msg() failed for PUT_ACK for %s\n",
+                                           libcfs_nid2str(nid));
+                        rx->mxc_lntmsg[0] = NULL;
+                        mxlnd_put_idle_rx(rx);
+                        mxlnd_put_idle_tx(tx);
+                        if (conn) {
+                                mxlnd_conn_decref(conn); /* for the rx... */
+                                mxlnd_conn_decref(conn); /* and for the tx */
+                        }
+                        return -ENOMEM;
+                }
+
+                mxlnd_queue_tx(tx);
+                return 0;
+
+        case LNET_MSG_GET:
+                if (routing || target_is_router)
+                        break;                  /* send EAGER */
+
+                /* is the REPLY message too small for DATA? */
+                nob = offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[lntmsg->msg_md->md_length]);
+                if (nob <= MXLND_EAGER_SIZE)
+                        break;                  /* send EAGER */
+
+                /* get tx (we need the cookie) , post rx for incoming DATA, 
+                 * then post GET_REQ tx */
+                tx = mxlnd_get_idle_tx();
+                if (unlikely(tx == NULL)) {
+                        CDEBUG(D_NETERROR, "Can't allocate GET tx for %s\n",
+                                           libcfs_nid2str(nid));
+                        if (conn) mxlnd_conn_decref(conn); /* for the ref taken above */
+                        return -ENOMEM;
+                }
+                rx_data = mxlnd_get_idle_rx();
+                if (unlikely(rx_data == NULL)) {
+                        CDEBUG(D_NETERROR, "Can't allocate DATA rx for %s\n",
+                                           libcfs_nid2str(nid));
+                        mxlnd_put_idle_tx(tx);
+                        if (conn) mxlnd_conn_decref(conn); /* for the ref taken above */
+                        return -ENOMEM;
+                }
+                rx_data->mxc_peer = peer;
+                if (conn) mxlnd_conn_addref(conn); /* for the rx_data */
+                rx_data->mxc_conn = conn; /* may be NULL */
+
+                ret = mxlnd_recv_data(ni, lntmsg, rx_data, MXLND_MSG_GET_DATA, tx->mxc_cookie);
+                if (unlikely(ret != 0)) {
+                        CDEBUG(D_NETERROR, "Can't setup GET sink for %s\n",
+                                           libcfs_nid2str(nid));
+                        mxlnd_put_idle_rx(rx_data);
+                        mxlnd_put_idle_tx(tx);
+                        if (conn) {
+                                mxlnd_conn_decref(conn); /* for the rx_data... */
+                                mxlnd_conn_decref(conn); /* and for the tx */
+                        }
+                        return -EIO;
+                }
+
+                tx->mxc_peer = peer;
+                tx->mxc_conn = conn; /* may be NULL */
+                /* conn ref taken above */
+                mxlnd_init_tx_msg(tx, MXLND_MSG_GET_REQ, sizeof(kmx_getreq_msg_t), nid);
+                txmsg = tx->mxc_msg;
+                txmsg->mxm_u.get_req.mxgrm_hdr = *hdr;
+                txmsg->mxm_u.get_req.mxgrm_cookie = tx->mxc_cookie;
+                tx->mxc_match = mxlnd_create_match(tx, 0);
+
+                mxlnd_queue_tx(tx);
+                return 0;
+
+        default:
+                LBUG();
+                if (conn) mxlnd_conn_decref(conn); /* drop ref taken above */
+                return -EIO;
+        }
+
+        /* send EAGER */
+
+        LASSERT (offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[payload_nob])
+                <= MXLND_EAGER_SIZE);
+
+        tx = mxlnd_get_idle_tx();
+        if (unlikely(tx == NULL)) {
+                CDEBUG(D_NETERROR, "Can't send %s to %s: tx descs exhausted\n",
+                                   mxlnd_lnetmsg_to_str(type), libcfs_nid2str(nid));
+                if (conn) mxlnd_conn_decref(conn); /* drop ref taken above */
+                return -ENOMEM;
+        }
+
+        tx->mxc_peer = peer;
+        tx->mxc_conn = conn; /* may be NULL */
+        /* conn ref taken above */
+        nob = offsetof(kmx_eager_msg_t, mxem_payload[payload_nob]);
+        mxlnd_init_tx_msg (tx, MXLND_MSG_EAGER, nob, nid);
+        tx->mxc_match = mxlnd_create_match(tx, 0);
+
+        txmsg = tx->mxc_msg;
+        txmsg->mxm_u.eager.mxem_hdr = *hdr;
+
+        if (payload_kiov != NULL)
+                lnet_copy_kiov2flat(MXLND_EAGER_SIZE, txmsg,
+                            offsetof(kmx_msg_t, mxm_u.eager.mxem_payload),
+                            payload_niov, payload_kiov, payload_offset, payload_nob);
+        else
+                lnet_copy_iov2flat(MXLND_EAGER_SIZE, txmsg,
+                            offsetof(kmx_msg_t, mxm_u.eager.mxem_payload),
+                            payload_niov, payload_iov, payload_offset, payload_nob);
+
+        tx->mxc_lntmsg[0] = lntmsg;              /* finalise lntmsg on completion */
+        mxlnd_queue_tx(tx);
+        return 0;
+}
+
+/**
+ * mxlnd_recv - the LND required recv function
+ * @ni - the LNET network interface
+ * @private - the struct kmx_ctx rx that matched this incoming message
+ * @lntmsg - the LNET message to finalize on completion (NULL when a
+ *           GET_REQ matched nothing locally)
+ * @delayed - set by LNET if the receive was delayed (unused here)
+ * @niov - number of fragments in iov/kiov
+ * @iov - virtual-address fragments (NULL if kiov is used)
+ * @kiov - page-based fragments (NULL if iov is used)
+ * @offset - offset into the destination fragments
+ * @mlen - number of payload bytes to deliver (mlen <= rlen)
+ * @rlen - total payload length sent by the peer
+ *
+ * Dispatches on the rx message type: copies EAGER payloads out of the
+ * rx buffer, turns a PUT_REQ into a PUT_ACK plus a posted DATA rx, and
+ * answers a GET_REQ with DATA (or an error encoded in the match bits).
+ *
+ * This must not block.
+ */
+int
+mxlnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+             unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+             unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+        int                     ret             = 0;
+        int                     nob             = 0;
+        int                     len             = 0;
+        struct kmx_ctx          *rx             = private;
+        struct kmx_msg          *rxmsg          = rx->mxc_msg;
+        lnet_nid_t               nid            = rx->mxc_nid;
+        struct kmx_ctx          *tx             = NULL;
+        struct kmx_msg          *txmsg          = NULL;
+        struct kmx_peer         *peer           = rx->mxc_peer;
+        /* NOTE(review): this initializer dereferences peer before the
+         * LASSERT(peer != NULL) below runs; a NULL peer would oops here
+         * first rather than trip the assertion. */
+        struct kmx_conn         *conn           = peer->mxp_conn;
+        u64                      cookie         = 0LL;
+        int                      msg_type       = rxmsg->mxm_type;
+        int                      repost         = 1;  /* return rx to idle pool at exit? */
+        int                      credit         = 0;  /* did this msg consume a peer credit? */
+        int                      finalize       = 0;  /* call lnet_finalize() at exit? */
+
+        LASSERT (mlen <= rlen);
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
+        LASSERT (peer != NULL);
+
+        /* conn_addref(conn) already taken for the primary rx */
+
+        switch (msg_type) {
+        case MXLND_MSG_EAGER:
+                /* payload arrived inline with the message; copy it straight
+                 * into the caller's fragments */
+                nob = offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[rlen]);
+                len = rx->mxc_status.xfer_length;
+                if (unlikely(nob > len)) {
+                        CDEBUG(D_NETERROR, "Eager message from %s too big: %d(%d)\n",
+                                           libcfs_nid2str(nid), nob, len);
+                        ret = -EPROTO;
+                        break;
+                }
+
+                if (kiov != NULL)
+                        lnet_copy_flat2kiov(niov, kiov, offset,
+                                MXLND_EAGER_SIZE, rxmsg,
+                                offsetof(kmx_msg_t, mxm_u.eager.mxem_payload),
+                                mlen);
+                else
+                        lnet_copy_flat2iov(niov, iov, offset,
+                                MXLND_EAGER_SIZE, rxmsg,
+                                offsetof(kmx_msg_t, mxm_u.eager.mxem_payload),
+                                mlen);
+                finalize = 1;
+                credit = 1;
+                break;
+
+        case MXLND_MSG_PUT_REQ:
+                /* we are going to reuse the rx, store the needed info */
+                cookie = rxmsg->mxm_u.put_req.mxprm_cookie;
+
+                /* get tx, post rx, send PUT_ACK */
+
+                tx = mxlnd_get_idle_tx();
+                if (unlikely(tx == NULL)) {
+                        CDEBUG(D_NETERROR, "Can't allocate tx for %s\n", libcfs_nid2str(nid));
+                        /* Not replying will break the connection */
+                        ret = -ENOMEM;
+                        break;
+                }
+                if (unlikely(mlen == 0)) {
+                        /* caller wants none of the payload: NAK so the
+                         * sender drops the data and completes */
+                        finalize = 1;
+                        tx->mxc_peer = peer;
+                        tx->mxc_conn = conn;
+                        mxlnd_send_nak(tx, nid, MXLND_MSG_PUT_ACK, 0, cookie);
+                        /* repost = 1 */
+                        break;
+                }
+
+                mxlnd_init_tx_msg(tx, MXLND_MSG_PUT_ACK, sizeof(kmx_putack_msg_t), nid);
+                tx->mxc_peer = peer;
+                tx->mxc_conn = conn;
+                mxlnd_conn_addref(conn); /* for the tx */
+                txmsg = tx->mxc_msg;
+                /* echo the sender's cookie back and supply ours so both
+                 * sides can match the upcoming PUT_DATA transfer */
+                txmsg->mxm_u.put_ack.mxpam_src_cookie = cookie;
+                txmsg->mxm_u.put_ack.mxpam_dst_cookie = tx->mxc_cookie;
+                tx->mxc_cookie = cookie;
+                tx->mxc_match = mxlnd_create_match(tx, 0);
+
+                /* we must post a receive _before_ sending the PUT_ACK */
+                mxlnd_ctx_init(rx);
+                rx->mxc_state = MXLND_CTX_PREP;
+                rx->mxc_peer = peer;
+                rx->mxc_conn = conn;
+                /* do not take another ref for this rx, it is already taken */
+                rx->mxc_nid = peer->mxp_nid;
+                ret = mxlnd_recv_data(ni, lntmsg, rx, MXLND_MSG_PUT_DATA, 
+                                      txmsg->mxm_u.put_ack.mxpam_dst_cookie);
+
+                if (unlikely(ret != 0)) {
+                        /* Notify peer that it's over */
+                        CDEBUG(D_NETERROR, "Can't setup PUT_DATA rx for %s: %d\n", 
+                                           libcfs_nid2str(nid), ret);
+                        mxlnd_ctx_init(tx);
+                        tx->mxc_state = MXLND_CTX_PREP;
+                        tx->mxc_peer = peer;
+                        tx->mxc_conn = conn;
+                        /* finalize = 0, let the PUT_ACK tx finalize this */
+                        tx->mxc_lntmsg[0] = rx->mxc_lntmsg[0];
+                        tx->mxc_lntmsg[1] = rx->mxc_lntmsg[1];
+                        /* conn ref already taken above */
+                        mxlnd_send_nak(tx, nid, MXLND_MSG_PUT_ACK, ret, cookie);
+                        /* repost = 1 */
+                        break;
+                }
+
+                mxlnd_queue_tx(tx);
+                /* do not return a credit until after PUT_DATA returns */
+                repost = 0;
+                break;
+
+        case MXLND_MSG_GET_REQ:
+                if (likely(lntmsg != NULL)) {
+                        mxlnd_send_data(ni, lntmsg, rx->mxc_peer, MXLND_MSG_GET_DATA,
+                                        rx->mxc_msg->mxm_u.get_req.mxgrm_cookie);
+                } else {
+                        /* GET didn't match anything */
+                        /* The initiator has a rx mapped to [k]iov. We cannot send a nak.
+                         * We have to embed the error code in the match bits.
+                         * Send the error in bits 52-59 and the cookie in bits 0-51 */
+                        u64             cookie  = rxmsg->mxm_u.get_req.mxgrm_cookie;
+
+                        tx = mxlnd_get_idle_tx();
+                        if (unlikely(tx == NULL)) {
+                                CDEBUG(D_NETERROR, "Can't get tx for GET NAK for %s\n",
+                                                   libcfs_nid2str(nid));
+                                ret = -ENOMEM;
+                                break;
+                        }
+                        tx->mxc_msg_type = MXLND_MSG_GET_DATA;
+                        tx->mxc_state = MXLND_CTX_PENDING;
+                        tx->mxc_nid = nid;
+                        tx->mxc_peer = peer;
+                        tx->mxc_conn = conn;
+                        mxlnd_conn_addref(conn); /* for this tx */
+                        tx->mxc_cookie = cookie;
+                        /* ENODATA rides in the match bits in place of a NAK */
+                        tx->mxc_match = mxlnd_create_match(tx, ENODATA);
+                        tx->mxc_pin_type = MX_PIN_PHYSICAL;
+                        mxlnd_queue_tx(tx);
+                }
+                /* finalize lntmsg after tx completes */
+                break;
+
+        default:
+                LBUG();
+        }
+
+        if (repost) {
+                /* we received a message, increment peer's outstanding credits */
+                if (credit == 1) {
+                        spin_lock(&conn->mxk_lock);
+                        conn->mxk_outstanding++;
+                        spin_unlock(&conn->mxk_lock);
+                }
+                /* we are done with the rx */
+                mxlnd_put_idle_rx(rx);
+                mxlnd_conn_decref(conn);
+        }
+
+        if (finalize == 1) lnet_finalize(kmxlnd_data.kmx_ni, lntmsg, 0); 
+
+        /* we received a credit, see if we can use it to send a msg */
+        if (credit) mxlnd_check_sends(peer);
+        
+        return ret;
+}
+
+/**
+ * mxlnd_sleep - sleep for approximately @timeout jiffies
+ * @timeout - sleep duration, in jiffies
+ *
+ * Marks the task TASK_INTERRUPTIBLE before calling schedule_timeout(),
+ * so the sleep can be cut short by a delivered signal.
+ */
+void
+mxlnd_sleep(unsigned long timeout)
+{
+        set_current_state(TASK_INTERRUPTIBLE);
+        schedule_timeout(timeout);
+        return;
+}
+
+/**
+ * mxlnd_tx_queued - the generic send queue thread
+ * @arg - thread id (as a void *)
+ *
+ * This thread moves send messages from the global tx_queue to the owning
+ * peer's tx_[msg|data]_queue. If the peer does not exist, it creates one and adds
+ * it to the global peer list (re-checking under the write lock in case
+ * another thread created the same peer concurrently).
+ *
+ * Returns 0; runs until kmx_shutdown is set.
+ */
+int
+mxlnd_tx_queued(void *arg)
+{
+        long                    id      = (long) arg;
+        int                     ret     = 0;
+        int                     found   = 0;
+        struct kmx_ctx         *tx      = NULL;
+        struct kmx_peer        *peer    = NULL;
+        struct list_head       *tmp_tx  = NULL;
+
+        cfs_daemonize("mxlnd_tx_queued");
+        //cfs_block_allsigs();
+
+        while (!kmxlnd_data.kmx_shutdown) {
+                /* wait for work; the semaphore is upped once per queued tx */
+                ret = down_interruptible(&kmxlnd_data.kmx_tx_queue_sem);
+                if (kmxlnd_data.kmx_shutdown)
+                        break;
+                if (ret != 0) // Should we check for -EINTR?
+                        continue;
+                /* pop the oldest tx off the global queue */
+                spin_lock(&kmxlnd_data.kmx_tx_queue_lock);
+                if (list_empty (&kmxlnd_data.kmx_tx_queue)) {
+                        spin_unlock(&kmxlnd_data.kmx_tx_queue_lock);
+                        continue;
+                }
+                tmp_tx = &kmxlnd_data.kmx_tx_queue;
+                tx = list_entry (tmp_tx->next, struct kmx_ctx, mxc_list);
+                list_del_init(&tx->mxc_list);
+                spin_unlock(&kmxlnd_data.kmx_tx_queue_lock);
+
+                found = 0;
+                peer = mxlnd_find_peer_by_nid(tx->mxc_nid);
+                if (peer != NULL) {
+                        /* known peer: hand the tx straight to it */
+                        tx->mxc_peer = peer;
+                        tx->mxc_conn = peer->mxp_conn;
+                        mxlnd_conn_addref(tx->mxc_conn); /* for this tx */
+                        mxlnd_queue_tx(tx);
+                        found = 1;
+                }
+                if (found == 0) {
+                        int              hash   = 0;
+                        /* NOTE(review): this inner peer shadows the outer
+                         * function-scope peer; confusing but intentional --
+                         * it only holds the newly allocated candidate */
+                        struct kmx_peer *peer = NULL;
+                        struct kmx_peer *old = NULL;
+
+                        hash = mxlnd_nid_to_hash(tx->mxc_nid);
+
+                        LASSERT(tx->mxc_msg_type != MXLND_MSG_PUT_DATA &&
+                                tx->mxc_msg_type != MXLND_MSG_GET_DATA);
+                        /* create peer */
+                        ret = mxlnd_peer_alloc(&peer, tx->mxc_nid);
+                        if (ret != 0) {
+                                /* finalize message */
+                                tx->mxc_status.code = -ECONNABORTED;
+                                mxlnd_put_idle_tx(tx);
+                                continue;
+                        }
+                        tx->mxc_peer = peer;
+                        tx->mxc_conn = peer->mxp_conn;
+
+                        /* add peer to global peer list, but look to see
+                         * if someone already created it after we released
+                         * the read lock */
+                        write_lock(&kmxlnd_data.kmx_peers_lock);
+                        list_for_each_entry(old, &kmxlnd_data.kmx_peers[hash], mxp_peers) {
+                                if (old->mxp_nid == peer->mxp_nid) {
+                                        /* somebody beat us here, we created a duplicate */
+                                        found = 1;
+                                        break;
+                                }
+                        }
+
+                        if (found == 0) {
+                                list_add_tail(&peer->mxp_peers, &kmxlnd_data.kmx_peers[hash]);
+                                atomic_inc(&kmxlnd_data.kmx_npeers);
+                        } else {
+                                /* lost the race: retarget the tx at the
+                                 * existing peer and discard our duplicate */
+                                tx->mxc_peer = old;
+                                tx->mxc_conn = old->mxp_conn;
+                                mxlnd_reduce_idle_rxs(*kmxlnd_tunables.kmx_credits - 1);
+                                mxlnd_peer_decref(peer);
+                        }
+                        mxlnd_conn_addref(tx->mxc_conn); /* for this tx */
+                        write_unlock(&kmxlnd_data.kmx_peers_lock);
+
+                        mxlnd_queue_tx(tx);
+                }
+        }
+        mxlnd_thread_stop(id);
+        return 0;
+}
+
+/**
+ * mxlnd_iconnect - initiate an MX connect to a peer
+ * @peer - the peer to connect to
+ * @mask - MXLND_MASK_ICON_REQ or MXLND_MASK_ICON_ACK, selecting which
+ *         handshake message this connect carries
+ *
+ * Resolves the peer's NIC id if not yet known (giving up and notifying
+ * LNET after MXLND_WAIT_TIMEOUT), then issues mx_iconnect().  Takes a
+ * conn ref that is held until the CONN_REQ/CONN_ACK completes, and drops
+ * it itself on any failure path.
+ *
+ * When calling this, we must not have the peer lock.
+ */
+void
+mxlnd_iconnect(struct kmx_peer *peer, u64 mask)
+{
+        mx_return_t             mxret   = MX_SUCCESS;
+        mx_request_t            request;
+        struct kmx_conn         *conn   = peer->mxp_conn;
+
+        mxlnd_conn_addref(conn); /* hold until CONN_REQ or CONN_ACK completes */
+
+        LASSERT(mask == MXLND_MASK_ICON_REQ ||
+                mask == MXLND_MASK_ICON_ACK);
+
+        if (peer->mxp_reconnect_time == 0) {
+                peer->mxp_reconnect_time = jiffies;
+        }
+
+        if (peer->mxp_nic_id == 0LL) {
+                /* NIC id unknown: try to resolve the hostname now */
+                mxlnd_peer_hostname_to_nic_id(peer);
+                if (peer->mxp_nic_id == 0LL) {
+                        /* not mapped yet, return */
+                        spin_lock(&conn->mxk_lock);
+                        conn->mxk_status = MXLND_CONN_INIT;
+                        spin_unlock(&conn->mxk_lock);
+                        if (time_after(jiffies, peer->mxp_reconnect_time + MXLND_WAIT_TIMEOUT)) {
+                                /* give up and notify LNET */
+                                mxlnd_conn_disconnect(conn, 0, 1);
+                                mxlnd_conn_alloc(&peer->mxp_conn, peer);
+                        }
+                        mxlnd_conn_decref(conn);
+                        return;
+                }
+        }
+
+        mxret = mx_iconnect(kmxlnd_data.kmx_endpt, peer->mxp_nic_id, 
+                            peer->mxp_host->mxh_ep_id, MXLND_MSG_MAGIC, mask, 
+                            (void *) peer, &request);
+        if (unlikely(mxret != MX_SUCCESS)) {
+                spin_lock(&conn->mxk_lock);
+                conn->mxk_status = MXLND_CONN_FAIL;
+                spin_unlock(&conn->mxk_lock);
+                CDEBUG(D_NETERROR, "mx_iconnect() failed with %s (%d) to %s\n",
+                       mx_strerror(mxret), mxret, libcfs_nid2str(peer->mxp_nid));
+                mxlnd_conn_decref(conn);
+        }
+        return;
+}
+
+#define MXLND_STATS 0
+
+int
+mxlnd_check_sends(struct kmx_peer *peer)
+{
+        int                     ret             = 0;
+        int                     found           = 0;
+        mx_return_t             mxret           = MX_SUCCESS;
+        struct kmx_ctx          *tx             = NULL;
+        struct kmx_conn         *conn           = NULL;
+        u8                      msg_type        = 0;
+        int                     credit          = 0;
+        int                     status          = 0;
+        int                     ntx_posted      = 0;
+        int                     credits         = 0;
+#if MXLND_STATS
+        static unsigned long    last            = 0;
+#endif
+
+        if (unlikely(peer == NULL)) {
+                LASSERT(peer != NULL);
+                return -1;
+        }
+        conn = peer->mxp_conn;
+        /* do not add another ref for this tx */
+
+        if (conn == NULL) {
+                /* we do not have any conns */
+                return -1;
+        }
+
+#if MXLND_STATS
+        if (time_after(jiffies, last)) {
+                last = jiffies + HZ;
+                CDEBUG(D_NET, "status= %s credits= %d outstanding= %d ntx_msgs= %d "
+                              "ntx_posted= %d ntx_data= %d data_posted= %d\n", 
+                              mxlnd_connstatus_to_str(conn->mxk_status), conn->mxk_credits, 
+                              conn->mxk_outstanding, conn->mxk_ntx_msgs, conn->mxk_ntx_posted,
+                              conn->mxk_ntx_data, conn->mxk_data_posted);
+        }
+#endif
+
+        /* cache peer state for asserts */
+        spin_lock(&conn->mxk_lock);
+        ntx_posted = conn->mxk_ntx_posted;
+        credits = conn->mxk_credits;
+        spin_unlock(&conn->mxk_lock);
+
+        LASSERT(ntx_posted <= *kmxlnd_tunables.kmx_credits);
+        LASSERT(ntx_posted >= 0);
+
+        LASSERT(credits <= *kmxlnd_tunables.kmx_credits);
+        LASSERT(credits >= 0);
+
+        /* check number of queued msgs, ignore data */
+        spin_lock(&conn->mxk_lock);
+        if (conn->mxk_outstanding >= MXLND_CREDIT_HIGHWATER) {
+                /* check if any txs queued that could return credits... */
+                if (list_empty(&conn->mxk_tx_credit_queue) || conn->mxk_ntx_msgs == 0) {
+                        /* if not, send a NOOP */
+                        tx = mxlnd_get_idle_tx();
+                        if (likely(tx != NULL)) {
+                                tx->mxc_peer = peer;
+                                tx->mxc_conn = peer->mxp_conn;
+                                mxlnd_conn_addref(conn); /* for this tx */
+                                mxlnd_init_tx_msg (tx, MXLND_MSG_NOOP, 0, peer->mxp_nid);
+                                tx->mxc_match = mxlnd_create_match(tx, 0);
+                                mxlnd_peer_queue_tx_locked(tx);
+                                found = 1;
+                                goto done_locked;
+                        }
+                }
+        }
+        spin_unlock(&conn->mxk_lock);
+
+        /* if the peer is not ready, try to connect */
+        spin_lock(&conn->mxk_lock);
+        if (unlikely(conn->mxk_status == MXLND_CONN_INIT ||
+            conn->mxk_status == MXLND_CONN_FAIL ||
+            conn->mxk_status == MXLND_CONN_REQ)) {
+                CDEBUG(D_NET, "status=%s\n", mxlnd_connstatus_to_str(conn->mxk_status));
+                conn->mxk_status = MXLND_CONN_WAIT;
+                spin_unlock(&conn->mxk_lock);
+                mxlnd_iconnect(peer, MXLND_MASK_ICON_REQ);
+                goto done;
+        }
+        spin_unlock(&conn->mxk_lock);
+
+        spin_lock(&conn->mxk_lock);
+        while (!list_empty(&conn->mxk_tx_free_queue) ||
+               !list_empty(&conn->mxk_tx_credit_queue)) {
+                /* We have something to send. If we have a queued tx that does not
+                 * require a credit (free), choose it since its completion will 
+                 * return a credit (here or at the peer), complete a DATA or 
+                 * CONN_REQ or CONN_ACK. */
+                struct list_head *tmp_tx = NULL;
+                if (!list_empty(&conn->mxk_tx_free_queue)) {
+                        tmp_tx = &conn->mxk_tx_free_queue;
+                } else {
+                        tmp_tx = &conn->mxk_tx_credit_queue;
+                }
+                tx = list_entry(tmp_tx->next, struct kmx_ctx, mxc_list);
+
+                msg_type = tx->mxc_msg_type;
+
+                /* don't try to send a rx */
+                LASSERT(tx->mxc_type == MXLND_REQ_TX);
+
+                /* ensure that it is a valid msg type */
+                LASSERT(msg_type == MXLND_MSG_CONN_REQ ||
+                        msg_type == MXLND_MSG_CONN_ACK ||
+                        msg_type == MXLND_MSG_NOOP     ||
+                        msg_type == MXLND_MSG_EAGER    ||
+                        msg_type == MXLND_MSG_PUT_REQ  ||
+                        msg_type == MXLND_MSG_PUT_ACK  ||
+                        msg_type == MXLND_MSG_PUT_DATA ||
+                        msg_type == MXLND_MSG_GET_REQ  ||
+                        msg_type == MXLND_MSG_GET_DATA);
+                LASSERT(tx->mxc_peer == peer);
+                LASSERT(tx->mxc_nid == peer->mxp_nid);
+
+                credit = mxlnd_tx_requires_credit(tx);
+                if (credit) {
+
+                        if (conn->mxk_ntx_posted == *kmxlnd_tunables.kmx_credits) {
+                                CDEBUG(D_NET, "%s: posted enough\n", 
+                                              libcfs_nid2str(peer->mxp_nid));
+                                goto done_locked;
+                        }
+        
+                        if (conn->mxk_credits == 0) {
+                                CDEBUG(D_NET, "%s: no credits\n", 
+                                              libcfs_nid2str(peer->mxp_nid));
+                                goto done_locked;
+                        }
+
+                        if (conn->mxk_credits == 1 &&      /* last credit reserved for */
+                            conn->mxk_outstanding == 0) {  /* giving back credits */
+                                CDEBUG(D_NET, "%s: not using last credit\n",
+                                              libcfs_nid2str(peer->mxp_nid));
+                                goto done_locked;
+                        }
+                }
+
+                if (unlikely(conn->mxk_status != MXLND_CONN_READY)) {
+                        if ( ! (msg_type == MXLND_MSG_CONN_REQ ||
+                                msg_type == MXLND_MSG_CONN_ACK)) {
+                                CDEBUG(D_NET, "peer status is %s for tx 0x%llx (%s)\n",
+                                             mxlnd_connstatus_to_str(conn->mxk_status), 
+                                             tx->mxc_cookie, 
+                                             mxlnd_msgtype_to_str(tx->mxc_msg_type));
+                                if (conn->mxk_status == MXLND_CONN_DISCONNECT) {
+                                        list_del_init(&tx->mxc_list);
+                                        tx->mxc_status.code = -ECONNABORTED;
+                                        mxlnd_put_idle_tx(tx);
+                                        mxlnd_conn_decref(conn);
+                                }
+                                goto done_locked;
+                        }
+                }
+
+                list_del_init(&tx->mxc_list);
+
+                /* handle credits, etc now while we have the lock to avoid races */
+                if (credit) {
+                        conn->mxk_credits--;
+                        conn->mxk_ntx_posted++;
+                }
+                if (msg_type != MXLND_MSG_PUT_DATA &&
+                    msg_type != MXLND_MSG_GET_DATA) {
+                        if (msg_type != MXLND_MSG_CONN_REQ &&
+                            msg_type != MXLND_MSG_CONN_ACK) {
+                                conn->mxk_ntx_msgs--;
+                        }
+                }
+                if (tx->mxc_incarnation == 0 &&
+                    conn->mxk_incarnation != 0) {
+                        tx->mxc_incarnation = conn->mxk_incarnation;
+                }
+                spin_unlock(&conn->mxk_lock);
+
+                /* if this is a NOOP and (1) mxp_conn->mxk_outstanding < CREDIT_HIGHWATER 
+                 * or (2) there is a non-DATA msg that can return credits in the 
+                 * queue, then drop this duplicate NOOP */
+                if (unlikely(msg_type == MXLND_MSG_NOOP)) {
+                        spin_lock(&conn->mxk_lock);
+                        if ((conn->mxk_outstanding < MXLND_CREDIT_HIGHWATER) ||
+                            (conn->mxk_ntx_msgs >= 1)) {
+                                conn->mxk_credits++;
+                                conn->mxk_ntx_posted--;
+                                spin_unlock(&conn->mxk_lock);
+                                /* redundant NOOP */
+                                mxlnd_put_idle_tx(tx);
+                                mxlnd_conn_decref(conn);
+                                CDEBUG(D_NET, "%s: redundant noop\n",
+                                              libcfs_nid2str(peer->mxp_nid));
+                                found = 1;
+                                goto done;
+                        }
+                        spin_unlock(&conn->mxk_lock);
+                }
+
+                found = 1;
+                if (likely((msg_type != MXLND_MSG_PUT_DATA) &&
+                    (msg_type != MXLND_MSG_GET_DATA))) {
+                        mxlnd_pack_msg(tx);
+                }
+
+                //ret = -ECONNABORTED;
+                mxret = MX_SUCCESS;
+
+                spin_lock(&conn->mxk_lock);
+                status = conn->mxk_status;
+                spin_unlock(&conn->mxk_lock);
+
+                if (likely((status == MXLND_CONN_READY) ||
+                    (msg_type == MXLND_MSG_CONN_REQ) ||
+                    (msg_type == MXLND_MSG_CONN_ACK))) {
+                        ret = 0;
+                        if (msg_type != MXLND_MSG_CONN_REQ &&
+                            msg_type != MXLND_MSG_CONN_ACK) {
+                                /* add to the pending list */
+                                ret = mxlnd_q_pending_ctx(tx);
+                                if (ret == -1) {
+                                        /* FIXME the conn is disconnected, now what? */
+                                }
+                        } else {
+                                /* CONN_REQ/ACK */
+                                tx->mxc_state = MXLND_CTX_PENDING;
+                        }
+
+                        if (ret == 0) {
+                                if (likely(msg_type != MXLND_MSG_PUT_DATA &&
+                                    msg_type != MXLND_MSG_GET_DATA)) {
+                                        /* send a msg style tx */
+                                        LASSERT(tx->mxc_nseg == 1);
+                                        LASSERT(tx->mxc_pin_type == MX_PIN_PHYSICAL);
+                                        CDEBUG(D_NET, "sending %s 0x%llx\n", 
+                                               mxlnd_msgtype_to_str(msg_type),
+                                               tx->mxc_cookie);
+                                        mxret = mx_kisend(kmxlnd_data.kmx_endpt, 
+                                                          &tx->mxc_seg, 
+                                                          tx->mxc_nseg,
+                                                          tx->mxc_pin_type,
+                                                          conn->mxk_epa, 
+                                                          tx->mxc_match, 
+                                                          (void *) tx,
+                                                          &tx->mxc_mxreq);
+                                } else {
+                                        /* send a DATA tx */
+                                        spin_lock(&conn->mxk_lock);
+                                        conn->mxk_ntx_data--;
+                                        conn->mxk_data_posted++;
+                                        spin_unlock(&conn->mxk_lock);
+                                        CDEBUG(D_NET, "sending %s 0x%llx\n", 
+                                               mxlnd_msgtype_to_str(msg_type), 
+                                               tx->mxc_cookie);
+                                        mxret = mx_kisend(kmxlnd_data.kmx_endpt, 
+                                                          tx->mxc_seg_list, 
+                                                          tx->mxc_nseg,
+                                                          tx->mxc_pin_type,
+                                                          conn->mxk_epa, 
+                                                          tx->mxc_match, 
+                                                          (void *) tx,
+                                                          &tx->mxc_mxreq);
+                                }
+                        } else {
+                                mxret = MX_CONNECTION_FAILED;
+                        }
+                        if (likely(mxret == MX_SUCCESS)) {
+                                ret = 0;
+                        } else {
+                                CDEBUG(D_NETERROR, "mx_kisend() failed with %s (%d) "
+                                       "sending to %s\n", mx_strerror(mxret), (int) mxret, 
+                                       libcfs_nid2str(peer->mxp_nid));
+                                /* NOTE mx_kisend() only fails if there are not enough 
+                                * resources. Do not change the connection status. */
+                                if (mxret == MX_NO_RESOURCES) {
+                                        tx->mxc_status.code = -ENOMEM;
+                                } else {
+                                        tx->mxc_status.code = -ECONNABORTED;
+                                }
+                                if (credit) {
+                                        spin_lock(&conn->mxk_lock);
+                                        conn->mxk_ntx_posted--;
+                                        conn->mxk_credits++;
+                                        spin_unlock(&conn->mxk_lock);
+                                } else if (msg_type == MXLND_MSG_PUT_DATA || 
+                                        msg_type == MXLND_MSG_GET_DATA) {
+                                        spin_lock(&conn->mxk_lock);
+                                        conn->mxk_data_posted--;
+                                        spin_unlock(&conn->mxk_lock);
+                                }
+                                if (msg_type != MXLND_MSG_PUT_DATA &&
+                                    msg_type != MXLND_MSG_GET_DATA &&
+                                    msg_type != MXLND_MSG_CONN_REQ &&
+                                    msg_type != MXLND_MSG_CONN_ACK) {
+                                        spin_lock(&conn->mxk_lock);
+                                        conn->mxk_outstanding += tx->mxc_msg->mxm_credits;
+                                        spin_unlock(&conn->mxk_lock);
+                                }
+                                if (msg_type != MXLND_MSG_CONN_REQ &&
+                                    msg_type != MXLND_MSG_CONN_ACK) {
+                                        /* remove from the pending list */
+                                        mxlnd_deq_pending_ctx(tx);
+                                }
+                                mxlnd_put_idle_tx(tx);
+                                mxlnd_conn_decref(conn);
+                        }
+                }
+                spin_lock(&conn->mxk_lock);
+        }
+done_locked:
+        spin_unlock(&conn->mxk_lock);
+done:
+        return found;
+}
+
+
+/**
+ * mxlnd_handle_tx_completion - a tx completed, progress or complete the msg
+ * @tx - the tx descriptor
+ *
+ * Determine which type of send request it was and start the next step, if needed,
+ * or, if done, signal completion to LNET. After we are done, put back on the
+ * idle tx list.
+ */
+void
+mxlnd_handle_tx_completion(struct kmx_ctx *tx)
+{
+        int             failed  = (tx->mxc_status.code != MX_STATUS_SUCCESS);
+        struct kmx_msg  *msg    = tx->mxc_msg;
+        struct kmx_peer *peer   = tx->mxc_peer;
+        struct kmx_conn *conn   = tx->mxc_conn;
+        u8              type    = tx->mxc_msg_type;
+        int             credit  = mxlnd_tx_requires_credit(tx);
+        u64             cookie  = tx->mxc_cookie;
+
+        CDEBUG(D_NET, "entering %s (0x%llx):\n", 
+                      mxlnd_msgtype_to_str(tx->mxc_msg_type), cookie);
+
+        if (unlikely(conn == NULL)) {
+                /* recover the conn from the MX endpoint address context */
+                mx_get_endpoint_addr_context(tx->mxc_status.source, (void **) &conn);
+                if (conn != NULL) {
+                        /* do not add a ref for the tx, it was set before sending */
+                        tx->mxc_conn = conn;
+                        tx->mxc_peer = conn->mxk_peer;
+                        /* refresh the stale local copy too - it is used below
+                         * by the LASSERT, the mxp_incompatible checks and
+                         * mxlnd_check_sends() */
+                        peer = conn->mxk_peer;
+                }
+        }
+        LASSERT (peer != NULL);
+        LASSERT (conn != NULL);
+
+        /* DATA txs carry no kmx_msg, so only msg-style txs can be checked */
+        if (type != MXLND_MSG_PUT_DATA && type != MXLND_MSG_GET_DATA) {
+                LASSERT (type == msg->mxm_type);
+        }
+
+        if (failed) {
+                tx->mxc_status.code = -EIO;
+        } else {
+                /* remember when we last successfully sent on this conn */
+                spin_lock(&conn->mxk_lock);
+                conn->mxk_last_tx = jiffies;
+                spin_unlock(&conn->mxk_lock);
+        }
+
+        switch (type) {
+
+        case MXLND_MSG_GET_DATA:
+                spin_lock(&conn->mxk_lock);
+                /* only adjust the counters if the conn has not been torn
+                 * down and replaced since this tx was posted */
+                if (conn->mxk_incarnation == tx->mxc_incarnation) {
+                        conn->mxk_outstanding++;
+                        conn->mxk_data_posted--;
+                }
+                spin_unlock(&conn->mxk_lock);
+                break;
+
+        case MXLND_MSG_PUT_DATA:
+                spin_lock(&conn->mxk_lock);
+                if (conn->mxk_incarnation == tx->mxc_incarnation) {
+                        conn->mxk_data_posted--;
+                }
+                spin_unlock(&conn->mxk_lock);
+                break;
+
+        case MXLND_MSG_NOOP:
+        case MXLND_MSG_PUT_REQ:
+        case MXLND_MSG_PUT_ACK:
+        case MXLND_MSG_GET_REQ:
+        case MXLND_MSG_EAGER:
+        //case MXLND_MSG_NAK:
+                break;
+
+        case MXLND_MSG_CONN_ACK:
+                if (peer->mxp_incompatible) {
+                        /* we sent our params, now close this conn */
+                        mxlnd_conn_disconnect(conn, 0, 1);
+                }
+                /* fallthrough - CONN_ACK shares the failure handling below */
+        case MXLND_MSG_CONN_REQ:
+                if (failed) {
+                        CDEBUG(D_NETERROR, "handle_tx_completion(): %s "
+                               "failed with %s (%d) to %s\n",
+                               type == MXLND_MSG_CONN_REQ ? "CONN_REQ" : "CONN_ACK",
+                               mx_strstatus(tx->mxc_status.code), 
+                               tx->mxc_status.code,
+                               libcfs_nid2str(tx->mxc_nid));
+                        if (!peer->mxp_incompatible) {
+                                spin_lock(&conn->mxk_lock);
+                                conn->mxk_status = MXLND_CONN_FAIL;
+                                spin_unlock(&conn->mxk_lock);
+                        }
+                }
+                break;
+
+        default:
+                CDEBUG(D_NETERROR, "Unknown msg type of %d\n", type);
+                LBUG();
+        }
+
+        if (credit) {
+                /* return the send credit, again only if this tx belongs to
+                 * the conn's current incarnation */
+                spin_lock(&conn->mxk_lock);
+                if (conn->mxk_incarnation == tx->mxc_incarnation) {
+                        conn->mxk_ntx_posted--;
+                }
+                spin_unlock(&conn->mxk_lock);
+        }
+
+        CDEBUG(D_NET, "leaving mxlnd_handle_tx_completion()\n");
+        mxlnd_put_idle_tx(tx);
+        mxlnd_conn_decref(conn);
+
+        mxlnd_check_sends(peer);
+
+        return;
+}
+
+/**
+ * mxlnd_handle_rx_completion - an rx completed, handle the incoming message
+ * @rx - the rx descriptor
+ *
+ * Dispatches on the message type: completes PUT/GET data transfers, hands
+ * incoming message headers to lnet_parse(), and processes CONN_REQ/CONN_ACK
+ * handshake messages. On the way out the rx is reposted when appropriate,
+ * finished LNET messages are finalized, and the peer is prodded to send.
+ */
+void
+mxlnd_handle_rx_completion(struct kmx_ctx *rx)
+{
+        int                     ret             = 0;
+        int                     repost          = 1;
+        int                     credit          = 1;
+        u32                     nob             = rx->mxc_status.xfer_length;
+        u64                     bits            = rx->mxc_status.match_info;
+        struct kmx_msg         *msg             = rx->mxc_msg;
+        struct kmx_peer        *peer            = rx->mxc_peer;
+        struct kmx_conn        *conn            = rx->mxc_conn;
+        u8                      type            = rx->mxc_msg_type;
+        u64                     seq             = 0LL;
+        lnet_msg_t             *lntmsg[2];
+        int                     result          = 0;
+        u64                     nic_id          = 0LL;
+        u32                     ep_id           = 0;
+        int                     decref          = 1;
+        int                     incompatible    = 0;
+
+        /* NOTE We may only know the peer's nid if it is a PUT_REQ, GET_REQ, 
+         * failed GET reply, CONN_REQ, or a CONN_ACK */
+
+        /* NOTE peer may still be NULL if it is a new peer */
+        if (peer == NULL || conn == NULL) {
+                /* if the peer was disconnected, the peer may exist but
+                 * not have any valid conns */
+                decref = 0; /* no peer means no ref was taken for this rx */
+        }
+
+        if (conn == NULL && peer != NULL) {
+                conn = peer->mxp_conn;
+                rx->mxc_conn = conn;
+        }
+
+#if MXLND_DEBUG
+        CDEBUG(D_NET, "receiving msg bits=0x%llx nob=%d peer=0x%p\n", bits, nob, peer);
+#endif
+
+        lntmsg[0] = NULL;
+        lntmsg[1] = NULL;
+
+        if (rx->mxc_status.code != MX_STATUS_SUCCESS) {
+                CDEBUG(D_NETERROR, "rx from %s failed with %s (%d)\n",
+                                   libcfs_nid2str(rx->mxc_nid),
+                                   mx_strstatus(rx->mxc_status.code),
+                                   (int) rx->mxc_status.code);
+                credit = 0;
+                goto cleanup;
+        }
+
+        if (nob == 0) {
+                /* this may be a failed GET reply */
+                if (type == MXLND_MSG_GET_DATA) {
+                        /* the sender encodes the error status in bits 52..59
+                         * of the match bits */
+                        bits = rx->mxc_status.match_info & 0x0FF0000000000000LL; 
+                        ret = (u32) (bits>>52);
+                        lntmsg[0] = rx->mxc_lntmsg[0];
+                        result = -ret;
+                        goto cleanup;
+                } else {
+                        /* we had a rx complete with 0 bytes (no hdr, nothing) */
+                        CDEBUG(D_NETERROR, "rx from %s returned with 0 bytes\n",
+                                           libcfs_nid2str(rx->mxc_nid));
+                        goto cleanup;
+                }
+        }
+
+        /* NOTE PUT_DATA and GET_DATA do not have mxc_msg, do not call unpack() */
+        if (type == MXLND_MSG_PUT_DATA) {
+                result = rx->mxc_status.code;
+                lntmsg[0] = rx->mxc_lntmsg[0];
+                goto cleanup;
+        } else if (type == MXLND_MSG_GET_DATA) {
+                result = rx->mxc_status.code;
+                lntmsg[0] = rx->mxc_lntmsg[0];
+                lntmsg[1] = rx->mxc_lntmsg[1];
+                goto cleanup;
+        }
+
+        ret = mxlnd_unpack_msg(msg, nob);
+        if (ret != 0) {
+                CDEBUG(D_NETERROR, "Error %d unpacking rx from %s\n",
+                                   ret, libcfs_nid2str(rx->mxc_nid));
+                goto cleanup;
+        }
+        rx->mxc_nob = nob;
+        /* from here on, type/seq come from the unpacked wire message */
+        type = msg->mxm_type;
+        seq = msg->mxm_seq;
+
+        /* CONN_REQ is exempt: we may not yet know the peer's nid */
+        if (type != MXLND_MSG_CONN_REQ &&
+            (!lnet_ptlcompat_matchnid(rx->mxc_nid, msg->mxm_srcnid) ||
+             !lnet_ptlcompat_matchnid(kmxlnd_data.kmx_ni->ni_nid, msg->mxm_dstnid))) {
+                CDEBUG(D_NETERROR, "rx with mismatched NID (type %s) (my nid is "
+                       "0x%llx and rx msg dst is 0x%llx)\n", 
+                       mxlnd_msgtype_to_str(type), kmxlnd_data.kmx_ni->ni_nid, 
+                       msg->mxm_dstnid);
+                goto cleanup;
+        }
+
+        /* drop stale messages from a previous conn/peer incarnation */
+        if (type != MXLND_MSG_CONN_REQ && type != MXLND_MSG_CONN_ACK) {
+                if ((conn != NULL && msg->mxm_srcstamp != conn->mxk_incarnation) ||
+                    msg->mxm_dststamp != kmxlnd_data.kmx_incarnation) {
+                        if (conn != NULL) {
+                                CDEBUG(D_NETERROR, "Stale rx from %s with type %s "
+                                       "(mxm_srcstamp (%lld) != mxk_incarnation (%lld) "
+                                       "|| mxm_dststamp (%lld) != kmx_incarnation (%lld))\n", 
+                                       libcfs_nid2str(rx->mxc_nid), mxlnd_msgtype_to_str(type),
+                                       msg->mxm_srcstamp, conn->mxk_incarnation, 
+                                       msg->mxm_dststamp, kmxlnd_data.kmx_incarnation);
+                        } else {
+                                CDEBUG(D_NETERROR, "Stale rx from %s with type %s "
+                                       "mxm_dststamp (%lld) != kmx_incarnation (%lld))\n", 
+                                       libcfs_nid2str(rx->mxc_nid), mxlnd_msgtype_to_str(type),
+                                       msg->mxm_dststamp, kmxlnd_data.kmx_incarnation);
+                        }
+                        credit = 0;
+                        goto cleanup;
+                }
+        }
+
+        CDEBUG(D_NET, "Received %s with %d credits\n", 
+                      mxlnd_msgtype_to_str(type), msg->mxm_credits);
+
+        if (msg->mxm_type != MXLND_MSG_CONN_REQ &&
+            msg->mxm_type != MXLND_MSG_CONN_ACK) {
+                LASSERT(peer != NULL);
+                LASSERT(conn != NULL);
+                if (msg->mxm_credits != 0) {
+                        /* return the send credits carried by this msg, but
+                         * only for the conn's current incarnation */
+                        spin_lock(&conn->mxk_lock);
+                        if (msg->mxm_srcstamp == conn->mxk_incarnation) {
+                                if ((conn->mxk_credits + msg->mxm_credits) > 
+                                     *kmxlnd_tunables.kmx_credits) {
+                                        CDEBUG(D_NETERROR, "mxk_credits %d  mxm_credits %d\n",
+                                               conn->mxk_credits, msg->mxm_credits);
+                                }
+                                conn->mxk_credits += msg->mxm_credits;
+                                LASSERT(conn->mxk_credits >= 0);
+                                LASSERT(conn->mxk_credits <= *kmxlnd_tunables.kmx_credits);
+                        }
+                        spin_unlock(&conn->mxk_lock);
+                }
+        }
+
+        CDEBUG(D_NET, "switch %s for rx (0x%llx)\n", mxlnd_msgtype_to_str(type), seq);
+        switch (type) {
+        case MXLND_MSG_NOOP:
+                break;
+
+        case MXLND_MSG_EAGER:
+                ret = lnet_parse(kmxlnd_data.kmx_ni, &msg->mxm_u.eager.mxem_hdr,
+                                        msg->mxm_srcnid, rx, 0);
+                /* on success lnet_parse() owns the rx; repost only on error */
+                repost = ret < 0;
+                break;
+
+        case MXLND_MSG_PUT_REQ:
+                ret = lnet_parse(kmxlnd_data.kmx_ni, &msg->mxm_u.put_req.mxprm_hdr,
+                                        msg->mxm_srcnid, rx, 1);
+                repost = ret < 0;
+                break;
+
+        case MXLND_MSG_PUT_ACK: {
+                u64  cookie = (u64) msg->mxm_u.put_ack.mxpam_dst_cookie;
+                /* a cookie above MXLND_MAX_COOKIE is a NAK carrying an error
+                 * status in bits 52..59 */
+                if (cookie > MXLND_MAX_COOKIE) {
+                        CDEBUG(D_NETERROR, "NAK for msg_type %d from %s\n", rx->mxc_msg_type, 
+                                           libcfs_nid2str(rx->mxc_nid));
+                        result = -((cookie >> 52) & 0xff);
+                        lntmsg[0] = rx->mxc_lntmsg[0];
+                } else {
+                        mxlnd_send_data(kmxlnd_data.kmx_ni, rx->mxc_lntmsg[0], 
+                                        rx->mxc_peer, MXLND_MSG_PUT_DATA, 
+                                        rx->mxc_msg->mxm_u.put_ack.mxpam_dst_cookie);
+                }
+                /* repost == 1 */
+                break;
+        }
+        case MXLND_MSG_GET_REQ:
+                ret = lnet_parse(kmxlnd_data.kmx_ni, &msg->mxm_u.get_req.mxgrm_hdr,
+                                        msg->mxm_srcnid, rx, 1);
+                repost = ret < 0;
+                break;
+
+        case MXLND_MSG_CONN_REQ:
+                if (!lnet_ptlcompat_matchnid(kmxlnd_data.kmx_ni->ni_nid, msg->mxm_dstnid)) {
+                        CDEBUG(D_NETERROR, "Can't accept %s: bad dst nid %s\n",
+                                        libcfs_nid2str(msg->mxm_srcnid),
+                                        libcfs_nid2str(msg->mxm_dstnid));
+                        goto cleanup;
+                }
+                /* incompatible params do not abort the handshake here - we
+                 * still CONN_ACK so the peer learns our params */
+                if (msg->mxm_u.conn_req.mxcrm_queue_depth != *kmxlnd_tunables.kmx_credits) {
+                        CDEBUG(D_NETERROR, "Can't accept %s: incompatible queue depth "
+                                    "%d (%d wanted)\n",
+                                        libcfs_nid2str(msg->mxm_srcnid),
+                                        msg->mxm_u.conn_req.mxcrm_queue_depth,
+                                        *kmxlnd_tunables.kmx_credits);
+                        incompatible = 1;
+                }
+                if (msg->mxm_u.conn_req.mxcrm_eager_size != MXLND_EAGER_SIZE) {
+                        CDEBUG(D_NETERROR, "Can't accept %s: incompatible EAGER size "
+                                    "%d (%d wanted)\n",
+                                        libcfs_nid2str(msg->mxm_srcnid),
+                                        msg->mxm_u.conn_req.mxcrm_eager_size,
+                                        (int) MXLND_EAGER_SIZE);
+                        incompatible = 1;
+                }
+                if (peer == NULL) {
+                        peer = mxlnd_find_peer_by_nid(msg->mxm_srcnid);
+                        if (peer == NULL) {
+                                /* brand new peer: allocate it and hash it in */
+                                int hash        = 0;
+                                hash = mxlnd_nid_to_hash(msg->mxm_srcnid);
+        
+                                mx_decompose_endpoint_addr(rx->mxc_status.source,
+                                                           &nic_id, &ep_id);
+                                rx->mxc_nid = msg->mxm_srcnid;
+        
+                                ret = mxlnd_peer_alloc(&peer, msg->mxm_srcnid);
+                                if (ret != 0) {
+                                        goto cleanup;
+                                }
+                                LASSERT(peer->mxp_host->mxh_ep_id == ep_id);
+                                write_lock(&kmxlnd_data.kmx_peers_lock);
+                                list_add_tail(&peer->mxp_peers,
+                                              &kmxlnd_data.kmx_peers[hash]);
+                                write_unlock(&kmxlnd_data.kmx_peers_lock);
+                                atomic_inc(&kmxlnd_data.kmx_npeers);
+                        } else {
+                                ret = mxlnd_conn_alloc(&conn, peer);
+                                if (ret != 0) {
+                                        CDEBUG(D_NETERROR, "Cannot allocate mxp_conn\n");
+                                        goto cleanup;
+                                }
+                        }
+                        conn = peer->mxp_conn;
+                } else {
+                        /* reconnect: the peer restarted, replace its conn */
+                        struct kmx_conn *old_conn       = conn;
+
+                        /* do not call mx_disconnect() */
+                        mxlnd_conn_disconnect(old_conn, 0, 0);
+
+                        /* the ref for this rx was taken on the old_conn */
+                        mxlnd_conn_decref(old_conn);
+
+                        /* do not decref this conn below */
+                        decref = 0;
+
+                        /* This allocs a conn, points peer->mxp_conn to this one.
+                         * The old conn is still on the peer->mxp_conns list.
+                         * As the pending requests complete, they will call
+                         * conn_decref() which will eventually free it. */
+                        ret = mxlnd_conn_alloc(&conn, peer);
+                        if (ret != 0) {
+                                CDEBUG(D_NETERROR, "Cannot allocate peer->mxp_conn\n");
+                                goto cleanup;
+                        }
+                }
+                spin_lock(&peer->mxp_lock);
+                peer->mxp_incarnation = msg->mxm_srcstamp;
+                peer->mxp_incompatible = incompatible;
+                spin_unlock(&peer->mxp_lock);
+                spin_lock(&conn->mxk_lock);
+                conn->mxk_incarnation = msg->mxm_srcstamp;
+                conn->mxk_status = MXLND_CONN_WAIT;
+                spin_unlock(&conn->mxk_lock);
+
+                /* handle_conn_ack() will create the CONN_ACK msg */
+                mxlnd_iconnect(peer, MXLND_MASK_ICON_ACK);
+
+                break;
+
+        case MXLND_MSG_CONN_ACK:
+                if (!lnet_ptlcompat_matchnid(kmxlnd_data.kmx_ni->ni_nid, msg->mxm_dstnid)) {
+                        CDEBUG(D_NETERROR, "Can't accept CONN_ACK from %s: "
+                               "bad dst nid %s\n", libcfs_nid2str(msg->mxm_srcnid),
+                                libcfs_nid2str(msg->mxm_dstnid));
+                        ret = -1;
+                        goto failed;
+                }
+                if (msg->mxm_u.conn_req.mxcrm_queue_depth != *kmxlnd_tunables.kmx_credits) {
+                        CDEBUG(D_NETERROR, "Can't accept CONN_ACK from %s: "
+                               "incompatible queue depth %d (%d wanted)\n",
+                                libcfs_nid2str(msg->mxm_srcnid),
+                                msg->mxm_u.conn_req.mxcrm_queue_depth,
+                                *kmxlnd_tunables.kmx_credits);
+                        spin_lock(&conn->mxk_lock);
+                        conn->mxk_status = MXLND_CONN_FAIL;
+                        spin_unlock(&conn->mxk_lock);
+                        incompatible = 1;
+                        ret = -1;
+                }
+                if (msg->mxm_u.conn_req.mxcrm_eager_size != MXLND_EAGER_SIZE) {
+                        CDEBUG(D_NETERROR, "Can't accept CONN_ACK from %s: "
+                               "incompatible EAGER size %d (%d wanted)\n",
+                                libcfs_nid2str(msg->mxm_srcnid),
+                                msg->mxm_u.conn_req.mxcrm_eager_size,
+                                (int) MXLND_EAGER_SIZE);
+                        spin_lock(&conn->mxk_lock);
+                        conn->mxk_status = MXLND_CONN_FAIL;
+                        spin_unlock(&conn->mxk_lock);
+                        incompatible = 1;
+                        ret = -1;
+                }
+                spin_lock(&peer->mxp_lock);
+                peer->mxp_incarnation = msg->mxm_srcstamp;
+                peer->mxp_incompatible = incompatible;
+                spin_unlock(&peer->mxp_lock);
+                spin_lock(&conn->mxk_lock);
+                conn->mxk_credits = *kmxlnd_tunables.kmx_credits;
+                conn->mxk_outstanding = 0;
+                conn->mxk_incarnation = msg->mxm_srcstamp;
+                conn->mxk_timeout = 0;
+                if (!incompatible) {
+                        conn->mxk_status = MXLND_CONN_READY;
+                }
+                spin_unlock(&conn->mxk_lock);
+                if (incompatible) mxlnd_conn_disconnect(conn, 0, 1);
+                break;
+
+        default:
+                CDEBUG(D_NETERROR, "Bad MXLND message type %x from %s\n", msg->mxm_type,
+                                libcfs_nid2str(rx->mxc_nid));
+                ret = -EPROTO;
+                break;
+        }
+
+failed:
+        if (ret < 0) {
+                MXLND_PRINT("setting PEER_CONN_FAILED\n");
+                spin_lock(&conn->mxk_lock);
+                conn->mxk_status = MXLND_CONN_FAIL;
+                spin_unlock(&conn->mxk_lock);
+        }
+
+cleanup:
+        if (conn != NULL) {
+                spin_lock(&conn->mxk_lock);
+                conn->mxk_last_rx = cfs_time_current(); /* jiffies */
+                spin_unlock(&conn->mxk_lock);
+        }
+
+        if (repost) {
+                /* lnet_parse() failed, etc., repost now */
+                mxlnd_put_idle_rx(rx);
+                /* NOTE(review): in the else-if below, the
+                 * type != MXLND_MSG_GET_DATA test is redundant since type is
+                 * then compared against an explicit list of other types */
+                if (conn != NULL && credit == 1) {
+                        if (type == MXLND_MSG_PUT_DATA) {
+                                spin_lock(&conn->mxk_lock);
+                                conn->mxk_outstanding++;
+                                spin_unlock(&conn->mxk_lock);
+                        } else if (type != MXLND_MSG_GET_DATA &&
+                                  (type == MXLND_MSG_EAGER ||
+                                   type == MXLND_MSG_PUT_REQ ||
+                                   type == MXLND_MSG_NOOP)) {
+                                spin_lock(&conn->mxk_lock);
+                                conn->mxk_outstanding++;
+                                spin_unlock(&conn->mxk_lock);
+                        }
+                }
+                if (decref) mxlnd_conn_decref(conn);
+        }
+
+        if (type == MXLND_MSG_PUT_DATA || type == MXLND_MSG_GET_DATA) {
+                CDEBUG(D_NET, "leaving for rx (0x%llx)\n", bits);
+        } else {
+                CDEBUG(D_NET, "leaving for rx (0x%llx)\n", seq);
+        }
+
+        if (lntmsg[0] != NULL) lnet_finalize(kmxlnd_data.kmx_ni, lntmsg[0], result); 
+        if (lntmsg[1] != NULL) lnet_finalize(kmxlnd_data.kmx_ni, lntmsg[1], result); 
+
+        if (conn != NULL && credit == 1) mxlnd_check_sends(peer);
+
+        return;
+}
+
+
+
+/**
+ * mxlnd_handle_conn_req - our mx_iconnect() for a CONN_REQ completed
+ * @peer - the peer we tried to connect to
+ * @status - the MX completion status of the connect attempt
+ *
+ * On failure, mark the conn MXLND_CONN_FAIL and, if we have been retrying
+ * longer than MXLND_WAIT_TIMEOUT, tear the conn down and allocate a fresh
+ * one. On success, stash the conn on the endpoint address and queue a
+ * CONN_REQ message carrying our queue depth and eager size.
+ */
+void
+mxlnd_handle_conn_req(struct kmx_peer *peer, mx_status_t status)
+{
+        struct kmx_ctx  *tx     = NULL;
+        struct kmx_msg  *txmsg   = NULL;
+        struct kmx_conn *conn   = peer->mxp_conn;
+
+        /* a conn ref was taken when calling mx_iconnect(), 
+         * hold it until CONN_REQ or CONN_ACK completes */
+
+        CDEBUG(D_NET, "entering\n");
+        if (status.code != MX_STATUS_SUCCESS) {
+                CDEBUG(D_NETERROR, "mx_iconnect() failed with %s (%d) to %s\n", 
+                        mx_strstatus(status.code), status.code, 
+                        libcfs_nid2str(peer->mxp_nid));
+                spin_lock(&conn->mxk_lock);
+                conn->mxk_status = MXLND_CONN_FAIL;
+                spin_unlock(&conn->mxk_lock);
+
+                /* give up on this conn after MXLND_WAIT_TIMEOUT and start
+                 * over with a fresh one */
+                if (time_after(jiffies, peer->mxp_reconnect_time + MXLND_WAIT_TIMEOUT)) {
+                        struct kmx_conn *new_conn       = NULL;
+                        CDEBUG(D_NETERROR, "timeout, calling conn_disconnect()\n");
+                        mxlnd_conn_disconnect(conn, 0, 1);
+                        /* NOTE(review): the return value of mxlnd_conn_alloc()
+                         * is ignored; on allocation failure the peer is
+                         * presumably left without a usable conn - confirm */
+                        mxlnd_conn_alloc(&new_conn, peer);
+                        spin_lock(&peer->mxp_lock);
+                        peer->mxp_reconnect_time = 0;
+                        spin_unlock(&peer->mxp_lock);
+                }
+
+                mxlnd_conn_decref(conn);
+                return;
+        }
+
+        /* stash the conn on the endpoint address so that later completions,
+         * which only carry the source address, can find it */
+        spin_lock(&conn->mxk_lock);
+        conn->mxk_epa = status.source;
+        spin_unlock(&conn->mxk_lock);
+        mx_set_endpoint_addr_context(conn->mxk_epa, (void *) conn);
+
+        /* mx_iconnect() succeeded, reset delay to 0 */
+        spin_lock(&peer->mxp_lock);
+        peer->mxp_reconnect_time = 0;
+        spin_unlock(&peer->mxp_lock);
+
+        /* marshal CONN_REQ msg */
+        /* we are still using the conn ref from iconnect() - do not take another */
+        tx = mxlnd_get_idle_tx();
+        if (tx == NULL) {
+                CDEBUG(D_NETERROR, "Can't allocate CONN_REQ tx for %s\n", 
+                                   libcfs_nid2str(peer->mxp_nid));
+                spin_lock(&conn->mxk_lock);
+                conn->mxk_status = MXLND_CONN_FAIL;
+                spin_unlock(&conn->mxk_lock);
+                mxlnd_conn_decref(conn);
+                return;
+        }
+
+        tx->mxc_peer = peer;
+        tx->mxc_conn = conn;
+        mxlnd_init_tx_msg (tx, MXLND_MSG_CONN_REQ, sizeof(kmx_connreq_msg_t), peer->mxp_nid);
+        txmsg = tx->mxc_msg;
+        txmsg->mxm_u.conn_req.mxcrm_queue_depth = *kmxlnd_tunables.kmx_credits;
+        txmsg->mxm_u.conn_req.mxcrm_eager_size = MXLND_EAGER_SIZE;
+        tx->mxc_match = mxlnd_create_match(tx, 0);
+
+        CDEBUG(D_NET, "sending MXLND_MSG_CONN_REQ\n");
+        mxlnd_queue_tx(tx);
+        return;
+}
+
+/* mxlnd_handle_conn_ack - handle mx_iconnect() completion for a CONN_ACK
+ * @peer - the peer being connected
+ * @status - MX completion status of the iconnect() request
+ *
+ * On failure, mark the conn MXLND_CONN_FAIL and, once the reconnect window
+ * (MXLND_WAIT_TIMEOUT) has expired, disconnect and allocate a fresh conn.
+ * On success, record the peer's endpoint address, mark the conn ready
+ * (unless the peer is incompatible) and queue a CONN_ACK message back.
+ *
+ * NB: the original code re-declared (shadowed) 'conn' inside the failure
+ * branch with the identical initializer; the outer variable is used instead.
+ */
+void
+mxlnd_handle_conn_ack(struct kmx_peer *peer, mx_status_t status)
+{
+        struct kmx_ctx  *tx     = NULL;
+        struct kmx_msg  *txmsg   = NULL;
+        struct kmx_conn *conn   = peer->mxp_conn;
+
+        /* a conn ref was taken when calling mx_iconnect(), 
+         * hold it until CONN_REQ or CONN_ACK completes */
+
+        CDEBUG(D_NET, "entering\n");
+        if (status.code != MX_STATUS_SUCCESS) {
+                CDEBUG(D_NETERROR, "mx_iconnect() failed for CONN_ACK with %s (%d) "
+                       "to %s mxp_nid = 0x%llx mxp_nic_id = 0x%0llx mxh_ep_id = %d\n", 
+                        mx_strstatus(status.code), status.code, 
+                        libcfs_nid2str(peer->mxp_nid),
+                        peer->mxp_nid,
+                        peer->mxp_nic_id,
+                        peer->mxp_host->mxh_ep_id);
+                spin_lock(&conn->mxk_lock);
+                conn->mxk_status = MXLND_CONN_FAIL;
+                spin_unlock(&conn->mxk_lock);
+
+                if (time_after(jiffies, peer->mxp_reconnect_time + MXLND_WAIT_TIMEOUT)) {
+                        struct kmx_conn *new_conn       = NULL;
+                        CDEBUG(D_NETERROR, "timeout, calling conn_disconnect()\n");
+                        mxlnd_conn_disconnect(conn, 0, 1);
+                        mxlnd_conn_alloc(&new_conn, peer);
+                        spin_lock(&peer->mxp_lock);
+                        peer->mxp_reconnect_time = 0;
+                        spin_unlock(&peer->mxp_lock);
+                }
+
+                mxlnd_conn_decref(conn);
+                return;
+        }
+        spin_lock(&conn->mxk_lock);
+        conn->mxk_epa = status.source;
+        if (likely(!peer->mxp_incompatible)) {
+                conn->mxk_status = MXLND_CONN_READY;
+        }
+        spin_unlock(&conn->mxk_lock);
+        mx_set_endpoint_addr_context(conn->mxk_epa, (void *) conn);
+
+        /* mx_iconnect() succeeded, reset delay to 0 */
+        spin_lock(&peer->mxp_lock);
+        peer->mxp_reconnect_time = 0;
+        spin_unlock(&peer->mxp_lock);
+
+        /* marshal CONN_ACK msg */
+        tx = mxlnd_get_idle_tx();
+        if (tx == NULL) {
+                CDEBUG(D_NETERROR, "Can't allocate CONN_ACK tx for %s\n", 
+                                   libcfs_nid2str(peer->mxp_nid));
+                spin_lock(&conn->mxk_lock);
+                conn->mxk_status = MXLND_CONN_FAIL;
+                spin_unlock(&conn->mxk_lock);
+                mxlnd_conn_decref(conn);
+                return;
+        }
+
+        tx->mxc_peer = peer;
+        tx->mxc_conn = conn;
+        CDEBUG(D_NET, "sending MXLND_MSG_CONN_ACK\n");
+        mxlnd_init_tx_msg (tx, MXLND_MSG_CONN_ACK, sizeof(kmx_connreq_msg_t), peer->mxp_nid);
+        txmsg = tx->mxc_msg;
+        txmsg->mxm_u.conn_req.mxcrm_queue_depth = *kmxlnd_tunables.kmx_credits;
+        txmsg->mxm_u.conn_req.mxcrm_eager_size = MXLND_EAGER_SIZE;
+        tx->mxc_match = mxlnd_create_match(tx, 0);
+
+        mxlnd_queue_tx(tx);
+        return;
+}
+
+/**
+ * mxlnd_request_waitd - the MX request completion thread(s)
+ * @arg - thread id (as a void *)
+ *
+ * This thread waits for a MX completion and then completes the request.
+ * We will create one thread per CPU.
+ */
+int
+mxlnd_request_waitd(void *arg)
+{
+        long                    id              = (long) arg;
+        char                    name[24];
+        __u32                   result          = 0;
+        mx_return_t             mxret           = MX_SUCCESS;
+        mx_status_t             status;
+        struct kmx_ctx         *ctx             = NULL;
+        enum kmx_req_state      req_type        = MXLND_REQ_TX;
+        struct kmx_peer        *peer            = NULL;
+        struct kmx_conn        *conn            = NULL;
+#if MXLND_POLLING
+        int                     count           = 0;
+#endif
+
+        memset(name, 0, sizeof(name));
+        snprintf(name, sizeof(name), "mxlnd_request_waitd_%02ld", id);
+        cfs_daemonize(name);
+        //cfs_block_allsigs();
+
+        memset(&status, 0, sizeof(status));
+
+        CDEBUG(D_NET, "%s starting\n", name);
+
+        while (!kmxlnd_data.kmx_shutdown) {
+                mxret = MX_SUCCESS;
+                result = 0;
+#if MXLND_POLLING
+                /* thread 0 polls (mx_test_any) up to kmx_polling times before
+                 * falling back to a blocking wait; other threads always block */
+                if (id == 0 && count++ < *kmxlnd_tunables.kmx_polling) {
+                        mxret = mx_test_any(kmxlnd_data.kmx_endpt, 0LL, 0LL, 
+                                            &status, &result);
+                } else {
+                        count = 0;
+                        mxret = mx_wait_any(kmxlnd_data.kmx_endpt, MXLND_WAIT_TIMEOUT, 
+                                            0LL, 0LL, &status, &result);
+                }
+#else
+                mxret = mx_wait_any(kmxlnd_data.kmx_endpt, MXLND_WAIT_TIMEOUT, 
+                                    0LL, 0LL, &status, &result);
+#endif
+                if (unlikely(kmxlnd_data.kmx_shutdown))
+                        break;
+
+                if (result != 1) {
+                        /* nothing completed... */
+                        continue;
+                }
+
+                /* log but fall through: the failed request still needs its
+                 * completion handled below */
+                if (status.code != MX_STATUS_SUCCESS) {
+                        CDEBUG(D_NETERROR, "wait_any() failed with %s (%d) with "
+                               "match_info 0x%llx and length %d\n", 
+                               mx_strstatus(status.code), status.code, 
+                               (u64) status.match_info, status.msg_length);
+                }
+
+                /* This may be a mx_iconnect() request completing,
+                 * check the bit mask for CONN_REQ and CONN_ACK */
+                if (status.match_info == MXLND_MASK_ICON_REQ ||
+                    status.match_info == MXLND_MASK_ICON_ACK) {
+                        peer = (struct kmx_peer*) status.context;
+                        if (status.match_info == MXLND_MASK_ICON_REQ) {
+                                mxlnd_handle_conn_req(peer, status);
+                        } else {
+                                mxlnd_handle_conn_ack(peer, status);
+                        }
+                        continue;
+                }
+
+                /* This must be a tx or rx */
+
+                /* NOTE: if this is a RX from the unexpected callback, it may
+                 * have very little info. If we dropped it in unexpected_recv(),
+                 * it will not have a context. If so, ignore it. */
+                ctx = (struct kmx_ctx *) status.context;
+                if (ctx != NULL) {
+
+                        req_type = ctx->mxc_type;
+                        conn = ctx->mxc_conn; /* this may be NULL */
+                        mxlnd_deq_pending_ctx(ctx);
+        
+                        /* copy status to ctx->mxc_status */
+                        memcpy(&ctx->mxc_status, &status, sizeof(status));
+        
+                        switch (req_type) {
+                        case MXLND_REQ_TX:
+                                mxlnd_handle_tx_completion(ctx);
+                                break;
+                        case MXLND_REQ_RX:
+                                mxlnd_handle_rx_completion(ctx);
+                                break;
+                        default:
+                                CDEBUG(D_NETERROR, "Unknown ctx type %d\n", req_type);
+                                LBUG();
+                                break;
+                        }
+        
+                        /* conn is always set except for the first CONN_REQ rx
+                         * from a new peer */
+                        if (!(status.code == MX_STATUS_SUCCESS || 
+                              status.code == MX_STATUS_TRUNCATED) &&
+                              conn != NULL) {
+                                mxlnd_conn_disconnect(conn, 1, 1);
+                        }
+                }
+                CDEBUG(D_NET, "waitd() completed task\n");
+        }
+        CDEBUG(D_NET, "%s stopping\n", name);
+        mxlnd_thread_stop(id);
+        return 0;
+}
+
+
+/* mxlnd_check_timeouts - scan every peer's conn for an expired timeout
+ * @now - current time in jiffies
+ *
+ * Disconnects any conn whose mxk_timeout has passed.  Returns the earliest
+ * pending timeout found, or now + MXLND_COMM_TIMEOUT when nothing is pending,
+ * so the caller knows when to check again. */
+unsigned long
+mxlnd_check_timeouts(unsigned long now)
+{
+        int                     i               = 0;
+        int                     disconnect      = 0;
+        unsigned long           next            = 0;
+        struct  kmx_peer        *peer           = NULL;
+        struct  kmx_conn        *conn           = NULL;
+
+        read_lock(&kmxlnd_data.kmx_peers_lock);
+        for (i = 0; i < MXLND_HASH_SIZE; i++) {
+                list_for_each_entry(peer, &kmxlnd_data.kmx_peers[i], mxp_peers) {
+
+                        /* NOTE(review): this early return leaves
+                         * kmx_peers_lock read-held -- confirm intended */
+                        if (unlikely(kmxlnd_data.kmx_shutdown))
+                                return next;
+        
+                        conn = peer->mxp_conn;
+                        if (conn == NULL)
+                                continue;
+
+                        mxlnd_conn_addref(conn);
+                        spin_lock(&conn->mxk_lock);
+        
+                        /* if nothing pending (timeout == 0) or
+                         * if conn is already disconnected,
+                         * skip this conn */
+                        if (conn->mxk_timeout == 0 ||
+                            conn->mxk_status == MXLND_CONN_DISCONNECT) {
+                                spin_unlock(&conn->mxk_lock);
+                                mxlnd_conn_decref(conn);
+                                continue;
+                        }
+
+                        /* we want to find the timeout that will occur first.
+                         * if it is in the future, we will sleep until then.
+                         * if it is in the past, then we will sleep one
+                         * second and repeat the process. */
+                        if ((next == 0) || (conn->mxk_timeout < next)) {
+                                next = conn->mxk_timeout;
+                        }
+        
+                        disconnect = 0;
+
+                        if (time_after_eq(now, conn->mxk_timeout))  {
+                                disconnect = 1;
+                        }
+                        /* drop the spinlock before disconnecting */
+                        spin_unlock(&conn->mxk_lock);
+
+                        if (disconnect) {
+                                mxlnd_conn_disconnect(conn, 1, 1);
+                        }
+                        mxlnd_conn_decref(conn);
+                }
+        }
+        read_unlock(&kmxlnd_data.kmx_peers_lock);
+        if (next == 0) next = now + MXLND_COMM_TIMEOUT;
+
+        return next;
+}
+
+/**
+ * mxlnd_timeoutd - enforces timeouts on messages
+ * @arg - thread id (as a void *)
+ *
+ * This thread queries each peer for its earliest timeout. If a peer has timed out,
+ * it calls mxlnd_conn_disconnect().
+ *
+ * After checking for timeouts, try progressing sends (call check_sends()).
+ */
+int
+mxlnd_timeoutd(void *arg)
+{
+        int                     i       = 0;
+        long                    id      = (long) arg;
+        unsigned long           now     = 0;
+        unsigned long           next    = 0;
+        unsigned long           delay   = HZ;
+        struct kmx_peer        *peer    = NULL;
+        struct kmx_conn        *conn    = NULL;
+
+        cfs_daemonize("mxlnd_timeoutd");
+        //cfs_block_allsigs();
+
+        CDEBUG(D_NET, "timeoutd starting\n");
+
+        while (!kmxlnd_data.kmx_shutdown) {
+
+                now = jiffies;
+                /* if the next timeout has arrived, scan for expired conns
+                 * (otherwise just progress sends and go back to sleep) */
+                if (time_after(now, next)) {
+                        next = mxlnd_check_timeouts(now);
+                }
+
+                /* try to progress sends on any conn idle for over a second */
+                read_lock(&kmxlnd_data.kmx_peers_lock);
+                for (i = 0; i < MXLND_HASH_SIZE; i++) {
+                        list_for_each_entry(peer, &kmxlnd_data.kmx_peers[i], mxp_peers) {
+                                conn = peer->mxp_conn;
+                                if (conn == NULL)
+                                        continue;
+
+                                if (conn->mxk_status != MXLND_CONN_DISCONNECT &&
+                                    time_after(now, conn->mxk_last_tx + HZ)) {
+                                        mxlnd_check_sends(peer);
+                                }
+                        }
+                }
+                read_unlock(&kmxlnd_data.kmx_peers_lock);
+
+                mxlnd_sleep(delay);
+        }
+        CDEBUG(D_NET, "timeoutd stopping\n");
+        mxlnd_thread_stop(id);
+        return 0;
+}
diff --git a/lnet/klnds/mxlnd/mxlnd_modparams.c b/lnet/klnds/mxlnd/mxlnd_modparams.c
new file mode 100644 (file)
index 0000000..37d77f1
--- /dev/null
@@ -0,0 +1,73 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ * Copyright (C) 2006 Myricom, Inc.
+ *   Author: Scott Atchley <atchley at myri.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "mxlnd.h"
+
+/* Module parameters for the MX LND.  Each is exported through
+ * kmxlnd_tunables below so the rest of the LND reads them via pointers. */
+static int n_waitd = MXLND_N_SCHED;
+CFS_MODULE_PARM(n_waitd, "i", int, 0444,
+                "# of completion daemons");
+
+static int max_peers = MXLND_MAX_PEERS;
+CFS_MODULE_PARM(max_peers, "i", int, 0444,
+               "maximum number of peers that may connect");
+
+static int cksum = MXLND_CKSUM;
+CFS_MODULE_PARM(cksum, "i", int, 0644,
+                "set non-zero to enable message (not data payload) checksums");
+
+static int ntx = MXLND_NTX;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+               "# of total tx message descriptors");
+
+static int credits = MXLND_MSG_QUEUE_DEPTH;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+               "# concurrent sends");
+
+static int board = MXLND_MX_BOARD;
+CFS_MODULE_PARM(board, "i", int, 0444,
+               "index value of the Myrinet board (NIC)");
+
+static int ep_id = MXLND_MX_EP_ID;
+CFS_MODULE_PARM(ep_id, "i", int, 0444,
+               "MX endpoint ID");
+
+static int polling = MXLND_POLLING;
+CFS_MODULE_PARM(polling, "i", int, 0444,
+               "Use 0 to block (wait). A value > 0 will poll that many times before blocking");
+
+static char *hosts = NULL;
+CFS_MODULE_PARM(hosts, "s", charp, 0444,
+               "IP-to-hostname resolution file");
+
+/* Aggregates pointers to all of the above for use by the rest of mxlnd */
+kmx_tunables_t kmxlnd_tunables = {
+        .kmx_n_waitd            = &n_waitd,
+        .kmx_max_peers          = &max_peers,
+        .kmx_cksum              = &cksum,
+        .kmx_ntx                = &ntx,
+        .kmx_credits            = &credits,
+        .kmx_board              = &board,
+        .kmx_ep_id              = &ep_id,
+        .kmx_polling            = &polling,
+        .kmx_hosts              = &hosts
+};
diff --git a/lnet/klnds/mxlnd/mxlnd_wire.h b/lnet/klnds/mxlnd/mxlnd_wire.h
new file mode 100644 (file)
index 0000000..a929608
--- /dev/null
@@ -0,0 +1,95 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ * Copyright (C) 2006 Myricom, Inc.
+ *   Author: Scott Atchley <atchley at myri.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * MXLND wire format - sent in sender's byte order
+ */
+
+/* NOTE(review): WIRE_ATTR presumably expands to a packed/aligned attribute
+ * for on-the-wire layout -- confirm its definition in the libcfs headers. */
+typedef struct kmx_connreq_msg
+{
+        u32             mxcrm_queue_depth;              /* per peer max messages in flight */
+        u32             mxcrm_eager_size;               /* size of preposted eager messages */
+} WIRE_ATTR kmx_connreq_msg_t;
+
+typedef struct kmx_eager_msg
+{
+        lnet_hdr_t      mxem_hdr;                       /* lnet header */
+        char            mxem_payload[0];                /* piggy-backed payload */
+} WIRE_ATTR kmx_eager_msg_t;
+
+typedef struct kmx_putreq_msg
+{
+        lnet_hdr_t      mxprm_hdr;                      /* lnet header */
+        u64             mxprm_cookie;                   /* opaque completion cookie */
+} WIRE_ATTR kmx_putreq_msg_t;
+
+typedef struct kmx_putack_msg
+{
+        u64             mxpam_src_cookie;               /* reflected completion cookie */
+        u64             mxpam_dst_cookie;               /* opaque completion cookie */
+} WIRE_ATTR kmx_putack_msg_t;
+
+typedef struct kmx_getreq_msg
+{
+        lnet_hdr_t      mxgrm_hdr;                      /* lnet header */
+        u64             mxgrm_cookie;                   /* opaque completion cookie */
+} WIRE_ATTR kmx_getreq_msg_t;
+
+/* Common MXLND message header + per-type body, sent in sender's byte order */
+typedef struct kmx_msg
+{
+        /* First two fields fixed for all time */
+        u32             mxm_magic;                      /* MXLND message */
+        u16             mxm_version;                    /* version number */
+
+        u8              mxm_type;                       /* message type */
+        u8              mxm_credits;                    /* returned credits */
+        u32             mxm_nob;                        /* # of bytes in whole message */
+        u32             mxm_cksum;                      /* checksum (0 == no checksum) */
+        u64             mxm_srcnid;                     /* sender's NID */
+        u64             mxm_srcstamp;                   /* sender's incarnation */
+        u64             mxm_dstnid;                     /* destination's NID */
+        u64             mxm_dststamp;                   /* destination's incarnation */
+        u64             mxm_seq;                        /* sequence number */
+
+        union {
+                kmx_connreq_msg_t       conn_req;
+                kmx_eager_msg_t         eager;
+                kmx_putreq_msg_t        put_req;
+                kmx_putack_msg_t        put_ack;
+                kmx_getreq_msg_t        get_req;
+        } WIRE_ATTR mxm_u;
+} WIRE_ATTR kmx_msg_t;
+
+#define MXLND_MSG_MAGIC         0x4d583130              /* unique magic 'MX10' */
+#define MXLND_MSG_VERSION       0x01
+
+#define MXLND_MSG_CONN_REQ      0xc                     /* connection request */
+#define MXLND_MSG_CONN_ACK      0xa                     /* connection request response */
+#define MXLND_MSG_EAGER         0xe                     /* eager message */
+#define MXLND_MSG_NOOP          0x1                     /* no msg, return credits */
+#define MXLND_MSG_PUT_REQ       0x2                     /* put request src->sink */
+#define MXLND_MSG_PUT_ACK       0x3                     /* put ack     src<-sink */
+#define MXLND_MSG_PUT_DATA      0x4                     /* put payload src->sink */
+#define MXLND_MSG_GET_REQ       0x5                     /* get request sink->src */
+#define MXLND_MSG_GET_DATA      0x6                     /* get payload sink<-src */
diff --git a/lnet/klnds/o2iblnd/.cvsignore b/lnet/klnds/o2iblnd/.cvsignore
new file mode 100644 (file)
index 0000000..2e9b6f4
--- /dev/null
@@ -0,0 +1,11 @@
+.deps
+Makefile
+.*.cmd
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.flags
+.tmp_versions
+.depend
+wirecheck
diff --git a/lnet/klnds/o2iblnd/Makefile.in b/lnet/klnds/o2iblnd/Makefile.in
new file mode 100644 (file)
index 0000000..52a194d
--- /dev/null
@@ -0,0 +1,6 @@
+MODULES := ko2iblnd
+ko2iblnd-objs := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o
+
+EXTRA_POST_CFLAGS := @O2IBCPPFLAGS@
+
+@INCLUDE_RULES@
diff --git a/lnet/klnds/o2iblnd/autoMakefile.am b/lnet/klnds/o2iblnd/autoMakefile.am
new file mode 100644 (file)
index 0000000..83788fd
--- /dev/null
@@ -0,0 +1,13 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+if MODULES
+if BUILD_O2IBLND
+modulenet_DATA = ko2iblnd$(KMODEXT)
+endif
+endif
+
+MOSTLYCLEANFILES = @MOSTLYCLEANFILES@
+DIST_SOURCES = $(ko2iblnd-objs:%.o=%.c) o2iblnd.h
diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c
new file mode 100644 (file)
index 0000000..ded32d6
--- /dev/null
@@ -0,0 +1,1710 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2006 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "o2iblnd.h"
+
+/* LND operations vector registered with LNet for the o2ib network type */
+lnd_t the_kiblnd = {
+        .lnd_type       = O2IBLND,
+        .lnd_startup    = kiblnd_startup,
+        .lnd_shutdown   = kiblnd_shutdown,
+        .lnd_ctl        = kiblnd_ctl,
+        .lnd_send       = kiblnd_send,
+        .lnd_recv       = kiblnd_recv,
+};
+
+/* module-wide global state for o2iblnd */
+kib_data_t              kiblnd_data;
+
+/* Simple rotate-and-add checksum over nob bytes at ptr.
+ * Never returns 0, since 0 on the wire means "no checksum". */
+__u32
+kiblnd_cksum (void *ptr, int nob)
+{
+        const char *bytes = ptr;
+        __u32       sum   = 0;
+        int         i;
+
+        for (i = 0; i < nob; i++)
+                sum = ((sum << 1) | (sum >> 31)) + bytes[i];
+
+        /* ensure I don't return 0 (== no checksum) */
+        return (sum == 0) ? 1 : sum;
+}
+
+/* Record the message type and total size (header + body_nob).  All other
+ * header fields are filled in later by kiblnd_pack_msg(). */
+void
+kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob)
+{
+        msg->ibm_type = type;
+        msg->ibm_nob  = offsetof(kib_msg_t, ibm_u) + body_nob;
+}
+
+/* kiblnd_pack_msg - finalize a message header before sending
+ * @ni - the LNet interface sending the message
+ * @msg - message whose ibm_type/ibm_nob were already set (kiblnd_init_msg)
+ * @credits - flow-control credits being returned to the peer
+ * @dstnid - destination NID
+ * @dststamp - destination's incarnation stamp
+ *
+ * Fills in magic, version, credits, NIDs and stamps, then (optionally,
+ * per the kib_cksum tunable) computes the checksum over the whole message. */
+void
+kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg,
+                 int credits, lnet_nid_t dstnid, __u64 dststamp)
+{
+        kib_net_t *net = ni->ni_data;
+
+        /* CAVEAT EMPTOR! all message fields not set here should have been
+         * initialised previously. */
+        msg->ibm_magic    = IBLND_MSG_MAGIC;
+        msg->ibm_version  = IBLND_MSG_VERSION;
+        /*   ibm_type */
+        msg->ibm_credits  = credits;
+        /*   ibm_nob */
+        msg->ibm_cksum    = 0;
+        msg->ibm_srcnid   = lnet_ptlcompat_srcnid(ni->ni_nid, dstnid);
+        msg->ibm_srcstamp = net->ibn_incarnation;
+        msg->ibm_dstnid   = dstnid;
+        msg->ibm_dststamp = dststamp;
+
+        if (*kiblnd_tunables.kib_cksum) {
+                /* NB ibm_cksum zero while computing cksum */
+                msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob);
+        }
+}
+
+/* kiblnd_unpack_msg - validate and byte-swap a received message in place
+ * @msg - the received message buffer
+ * @nob - number of bytes actually received
+ *
+ * Checks magic, version, length and checksum, then flips multi-byte fields
+ * to host order if the sender's endianness differs (detected via the magic).
+ * Returns 0 on success or -EPROTO on any validation failure. */
+int
+kiblnd_unpack_msg(kib_msg_t *msg, int nob)
+{
+        const int hdr_size = offsetof(kib_msg_t, ibm_u);
+        __u32     msg_cksum;
+        int       flip;
+        int       msg_nob;
+#if !IBLND_MAP_ON_DEMAND
+        int       i;
+        int       n;
+#endif
+        /* 6 bytes are enough to have received magic + version */
+        if (nob < 6) {
+                CERROR("Short message: %d\n", nob);
+                return -EPROTO;
+        }
+
+        /* a byte-swapped magic means the peer has opposite endianness */
+        if (msg->ibm_magic == IBLND_MSG_MAGIC) {
+                flip = 0;
+        } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) {
+                flip = 1;
+        } else {
+                CERROR("Bad magic: %08x\n", msg->ibm_magic);
+                return -EPROTO;
+        }
+
+        if (msg->ibm_version !=
+            (flip ? __swab16(IBLND_MSG_VERSION) : IBLND_MSG_VERSION)) {
+                CERROR("Bad version: %d\n", msg->ibm_version);
+                return -EPROTO;
+        }
+
+        if (nob < hdr_size) {
+                CERROR("Short message: %d\n", nob);
+                return -EPROTO;
+        }
+
+        msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
+        if (msg_nob > nob) {
+                CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
+                return -EPROTO;
+        }
+
+        /* checksum must be computed with ibm_cksum zero and BEFORE anything
+         * gets flipped */
+        msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
+        msg->ibm_cksum = 0;
+        if (msg_cksum != 0 &&
+            msg_cksum != kiblnd_cksum(msg, msg_nob)) {
+                CERROR("Bad checksum\n");
+                return -EPROTO;
+        }
+        msg->ibm_cksum = msg_cksum;
+
+        if (flip) {
+                /* leave magic unflipped as a clue to peer endianness */
+                __swab16s(&msg->ibm_version);
+                CLASSERT (sizeof(msg->ibm_type) == 1);
+                CLASSERT (sizeof(msg->ibm_credits) == 1);
+                msg->ibm_nob = msg_nob;
+                __swab64s(&msg->ibm_srcnid);
+                __swab64s(&msg->ibm_srcstamp);
+                __swab64s(&msg->ibm_dstnid);
+                __swab64s(&msg->ibm_dststamp);
+        }
+
+        if (msg->ibm_srcnid == LNET_NID_ANY) {
+                CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
+                return -EPROTO;
+        }
+
+        /* per-type body validation (and flipping of body fields) */
+        switch (msg->ibm_type) {
+        default:
+                CERROR("Unknown message type %x\n", msg->ibm_type);
+                return -EPROTO;
+
+        case IBLND_MSG_NOOP:
+                break;
+
+        case IBLND_MSG_IMMEDIATE:
+                if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
+                        CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
+                               (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
+                        return -EPROTO;
+                }
+                break;
+
+        case IBLND_MSG_PUT_REQ:
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) {
+                        CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
+                        return -EPROTO;
+                }
+                break;
+
+        case IBLND_MSG_PUT_ACK:
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) {
+                        CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.putack)));
+                        return -EPROTO;
+                }
+#if IBLND_MAP_ON_DEMAND
+                if (flip) {
+                        __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr);
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob);
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
+                }
+#else
+                /* NB rd_nfrags must be flipped before it is validated/used */
+                if (flip) {
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
+                        __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrags);
+                }
+                
+                n = msg->ibm_u.putack.ibpam_rd.rd_nfrags;
+                if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) {
+                        CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", 
+                               n, IBLND_MAX_RDMA_FRAGS);
+                        return -EPROTO;
+                }
+                
+                if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
+                        CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
+                               (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
+                        return -EPROTO;
+                }
+
+                if (flip) {
+                        for (i = 0; i < n; i++) {
+                                __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
+                                __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr);
+                        }
+                }
+#endif
+                break;
+
+        case IBLND_MSG_GET_REQ:
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) {
+                        CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.get)));
+                        return -EPROTO;
+                }
+#if IBLND_MAP_ON_DEMAND
+                if (flip) {
+                        __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr);
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob);
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
+                }
+#else
+                if (flip) {
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
+                        __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrags);
+                }
+
+                n = msg->ibm_u.get.ibgm_rd.rd_nfrags;
+                if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) {
+                        CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", 
+                               n, IBLND_MAX_RDMA_FRAGS);
+                        return -EPROTO;
+                }
+                
+                if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
+                        CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
+                               (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
+                        return -EPROTO;
+                }
+                
+                if (flip)
+                        for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrags; i++) {
+                                __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
+                                __swab64s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr);
+                        }
+#endif
+                break;
+
+        case IBLND_MSG_PUT_NAK:
+        case IBLND_MSG_PUT_DONE:
+        case IBLND_MSG_GET_DONE:
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
+                        CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.completion)));
+                        return -EPROTO;
+                }
+                if (flip)
+                        __swab32s(&msg->ibm_u.completion.ibcm_status);
+                break;
+
+        case IBLND_MSG_CONNREQ:
+        case IBLND_MSG_CONNACK:
+                if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
+                        CERROR("Short connreq/ack: %d(%d)\n", msg_nob,
+                               (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
+                        return -EPROTO;
+                }
+                if (flip) {
+                        __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth);
+                        __swab16s(&msg->ibm_u.connparams.ibcp_max_frags);
+                        __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
+                }
+                break;
+        }
+        return 0;
+}
+
+/* kiblnd_create_peer - allocate and initialise a new peer structure
+ * @ni - the LNet interface this peer belongs to
+ * @peerp - out: the new peer (with one ref held for the caller)
+ * @nid - the peer's NID (must not be LNET_NID_ANY)
+ *
+ * The peer is NOT added to the peer table here.  Returns 0 or -ENOMEM. */
+int
+kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
+{
+        kib_peer_t     *peer;
+        kib_net_t      *net = ni->ni_data;
+        unsigned long   flags;
+
+        LASSERT (net != NULL);
+        LASSERT (nid != LNET_NID_ANY);
+
+        LIBCFS_ALLOC(peer, sizeof(*peer));
+        if (peer == NULL) {
+                CERROR("Cannot allocate peer\n");
+                return -ENOMEM;
+        }
+
+        memset(peer, 0, sizeof(*peer));         /* zero flags etc */
+
+        peer->ibp_ni = ni;
+        peer->ibp_nid = nid;
+        peer->ibp_error = 0;
+        peer->ibp_last_alive = cfs_time_current();
+        atomic_set(&peer->ibp_refcount, 1);     /* 1 ref for caller */
+
+        INIT_LIST_HEAD(&peer->ibp_list);       /* not in the peer table yet */
+        INIT_LIST_HEAD(&peer->ibp_conns);
+        INIT_LIST_HEAD(&peer->ibp_tx_queue);
+
+        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+        /* always called with a ref on ni, which prevents ni being shutdown */
+        LASSERT (net->ibn_shutdown == 0);
+        
+        /* npeers only grows with the global lock held */
+        atomic_inc(&net->ibn_npeers);
+
+        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+        *peerp = peer;
+        return 0;
+}
+
+void
+kiblnd_destroy_peer (kib_peer_t *peer)
+{
+        /* Final teardown of a peer whose last reference is gone.  The peer
+         * must already be out of the peer table, with no connection
+         * attempts in flight, no conns and no queued txs. */
+        kib_net_t *net = peer->ibp_ni->ni_data;
+
+        LASSERT (net != NULL);
+        LASSERT (atomic_read(&peer->ibp_refcount) == 0);
+        LASSERT (!kiblnd_peer_active(peer));
+        LASSERT (peer->ibp_connecting == 0);
+        LASSERT (peer->ibp_accepting == 0);
+        LASSERT (list_empty(&peer->ibp_conns));
+        LASSERT (list_empty(&peer->ibp_tx_queue));
+
+        LIBCFS_FREE(peer, sizeof(*peer));
+
+        /* NB a peer's connections keep a reference on their peer until
+         * they are destroyed, so we can be assured that _all_ state to do
+         * with this peer has been cleaned up when its refcount drops to
+         * zero. */
+        atomic_dec(&net->ibn_npeers);
+}
+
+void
+kiblnd_destroy_dev (kib_dev_t *dev)
+{
+        /* Tear down an HCA device descriptor once no networks use it:
+         * unhook it from the global device list and release its IB
+         * resources in dependency order — the MR before the PD it was
+         * registered on, then the CM id — before freeing the struct. */
+        LASSERT (dev->ibd_nnets == 0);
+
+        if (!list_empty(&dev->ibd_list)) /* on kib_devs? */
+                list_del_init(&dev->ibd_list);
+
+        if (dev->ibd_mr != NULL)
+                ib_dereg_mr(dev->ibd_mr);
+
+        if (dev->ibd_pd != NULL)
+                ib_dealloc_pd(dev->ibd_pd);
+
+        if (dev->ibd_cmid != NULL)
+                rdma_destroy_id(dev->ibd_cmid);
+
+        LIBCFS_FREE(dev, sizeof(*dev));
+}
+
+kib_peer_t *
+kiblnd_find_peer_locked (lnet_nid_t nid)
+{
+        /* Scan the hash chain for 'nid' and return the matching peer, or
+         * NULL.  The caller holds the global lock and is responsible for
+         * accounting the additional reference that this creates. */
+        struct list_head *peer_list = kiblnd_nid2peerlist(nid);
+        struct list_head *tmp;
+        kib_peer_t       *peer;
+
+        list_for_each (tmp, peer_list) {
+                peer = list_entry(tmp, kib_peer_t, ibp_list);
+
+                /* a peer stays in the table only while it is creating
+                 * conns or has an active conn */
+                LASSERT (peer->ibp_connecting > 0 ||
+                         peer->ibp_accepting > 0 ||
+                         !list_empty(&peer->ibp_conns));
+
+                if (peer->ibp_nid != nid)
+                        continue;
+
+                CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
+                       peer, libcfs_nid2str(nid),
+                       atomic_read(&peer->ibp_refcount));
+                return peer;
+        }
+        return NULL;
+}
+
+void
+kiblnd_unlink_peer_locked (kib_peer_t *peer)
+{
+        /* Remove 'peer' from the peer table and drop the table's
+         * reference.  Caller holds the global lock in write mode; the
+         * peer must have no remaining conns. */
+        LASSERT (list_empty(&peer->ibp_conns));
+
+        LASSERT (kiblnd_peer_active(peer));
+        list_del_init(&peer->ibp_list);
+        /* lose peerlist's ref */
+        kiblnd_peer_decref(peer);
+}
+
+int
+kiblnd_get_peer_info (lnet_ni_t *ni, int index,
+                      lnet_nid_t *nidp, int *count)
+{
+        /* Report the nid and refcount of the index'th peer on 'ni'.
+         * Returns 0 with *nidp/*count filled in, or -ENOENT when there
+         * are fewer than 'index' + 1 peers. */
+        struct list_head  *ptmp;
+        kib_peer_t        *peer;
+        unsigned long      flags;
+        int                rc = -ENOENT;
+        int                i;
+
+        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+        for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+                list_for_each (ptmp, &kiblnd_data.kib_peers[i]) {
+                        peer = list_entry(ptmp, kib_peer_t, ibp_list);
+                        LASSERT (peer->ibp_connecting > 0 ||
+                                 peer->ibp_accepting > 0 ||
+                                 !list_empty(&peer->ibp_conns));
+
+                        /* NB short-circuit: index only counts down for
+                         * peers on this ni */
+                        if (peer->ibp_ni != ni || index-- > 0)
+                                continue;
+
+                        *nidp  = peer->ibp_nid;
+                        *count = atomic_read(&peer->ibp_refcount);
+                        rc = 0;
+                        goto out;
+                }
+        }
+ out:
+        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+        return rc;
+}
+
+void
+kiblnd_del_peer_locked (kib_peer_t *peer)
+{
+        /* Delete 'peer' from the table: if it has no conns, unlink it
+         * directly; otherwise close every conn — closing the last conn
+         * unlinks the peer as a side effect.  Caller holds the global
+         * lock in write mode. */
+        struct list_head *ctmp;
+        struct list_head *cnxt;
+        kib_conn_t       *conn;
+
+        if (list_empty(&peer->ibp_conns)) {
+                kiblnd_unlink_peer_locked(peer);
+        } else {
+                list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+                        conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+                        kiblnd_close_conn_locked(conn, 0);
+                }
+                /* NB closing peer's last conn unlinked it. */
+        }
+        /* NB peer now unlinked; might even be freed if the peer table had the
+         * last ref on it. */
+}
+
+int
+kiblnd_del_peer (lnet_ni_t *ni, lnet_nid_t nid)
+{
+        /* Delete the peer(s) matching 'nid' on 'ni' (LNET_NID_ANY matches
+         * every peer), closing their conns.  TXs queued on conn-less
+         * peers are collected and completed with -EIO after the lock is
+         * dropped.  Returns 0 if anything matched, else -ENOENT. */
+        CFS_LIST_HEAD     (zombies);
+        struct list_head  *ptmp;
+        struct list_head  *pnxt;
+        kib_peer_t        *peer;
+        int                lo;
+        int                hi;
+        int                i;
+        unsigned long      flags;
+        int                rc = -ENOENT;
+
+        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+        if (nid != LNET_NID_ANY) {
+                /* only the single chain that can hold 'nid' */
+                lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
+        } else {
+                lo = 0;
+                hi = kiblnd_data.kib_peer_hash_size - 1;
+        }
+
+        for (i = lo; i <= hi; i++) {
+                list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
+                        peer = list_entry(ptmp, kib_peer_t, ibp_list);
+                        LASSERT (peer->ibp_connecting > 0 ||
+                                 peer->ibp_accepting > 0 ||
+                                 !list_empty(&peer->ibp_conns));
+
+                        if (peer->ibp_ni != ni)
+                                continue;
+
+                        if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
+                                continue;
+
+                        if (!list_empty(&peer->ibp_tx_queue)) {
+                                /* txs only queue on a peer with no conns */
+                                LASSERT (list_empty(&peer->ibp_conns));
+
+                                list_splice_init(&peer->ibp_tx_queue, &zombies);
+                        }
+
+                        kiblnd_del_peer_locked(peer);
+                        rc = 0;         /* matched something */
+                }
+        }
+
+        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+        /* fail the orphaned txs now that the lock is dropped */
+        kiblnd_txlist_done(ni, &zombies, -EIO);
+
+        return rc;
+}
+
+kib_conn_t *
+kiblnd_get_conn_by_idx (lnet_ni_t *ni, int index)
+{
+        /* Return the index'th conn on 'ni' with a reference added for the
+         * caller (who must drop it), or NULL if there are fewer conns. */
+        kib_conn_t        *conn = NULL;
+        kib_peer_t        *peer;
+        struct list_head  *ptmp;
+        struct list_head  *ctmp;
+        unsigned long      flags;
+        int                i;
+
+        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+        for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+                list_for_each (ptmp, &kiblnd_data.kib_peers[i]) {
+                        peer = list_entry(ptmp, kib_peer_t, ibp_list);
+                        LASSERT (peer->ibp_connecting > 0 ||
+                                 peer->ibp_accepting > 0 ||
+                                 !list_empty(&peer->ibp_conns));
+
+                        if (peer->ibp_ni != ni)
+                                continue;
+
+                        list_for_each (ctmp, &peer->ibp_conns) {
+                                if (index-- > 0)
+                                        continue;
+
+                                conn = list_entry(ctmp, kib_conn_t, ibc_list);
+                                kiblnd_conn_addref(conn);
+                                goto out;
+                        }
+                }
+        }
+ out:
+        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+        return conn;
+}
+
+void
+kiblnd_debug_rx (kib_rx_t *rx)
+{
+        /* Console dump of one rx descriptor's state (debugging aid). */
+        CDEBUG(D_CONSOLE, "      %p status %d msg_type %x cred %d\n",
+               rx, rx->rx_status, rx->rx_msg->ibm_type,
+               rx->rx_msg->ibm_credits);
+}
+
+void
+kiblnd_debug_tx (kib_tx_t *tx)
+{
+        /* Console dump of one tx descriptor's state; "!" marks a slot
+         * with an attached lnet message, "-" an empty one. */
+        CDEBUG(D_CONSOLE, "      %p snd %d q %d w %d rc %d dl %lx "
+               "cookie "LPX64" msg %s%s type %x cred %d\n",
+               tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting,
+               tx->tx_status, tx->tx_deadline, tx->tx_cookie,
+               tx->tx_lntmsg[0] == NULL ? "-" : "!",
+               tx->tx_lntmsg[1] == NULL ? "-" : "!",
+               tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits);
+}
+
+void
+kiblnd_debug_conn (kib_conn_t *conn)
+{
+        /* Dump a conn's state and the contents of all its rx/tx queues to
+         * the console, holding ibc_lock so the lists are stable. */
+        struct list_head *tmp;
+        int               i;
+
+        spin_lock(&conn->ibc_lock);
+
+        CDEBUG(D_CONSOLE, "conn[%d] %p -> %s: \n",
+               atomic_read(&conn->ibc_refcount), conn,
+               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+        CDEBUG(D_CONSOLE, "   state %d nposted %d cred %d o_cred %d r_cred %d\n",
+               conn->ibc_state, conn->ibc_nsends_posted, conn->ibc_credits, 
+               conn->ibc_outstanding_credits, conn->ibc_reserved_credits);
+        CDEBUG(D_CONSOLE, "   comms_err %d\n", conn->ibc_comms_error);
+
+        CDEBUG(D_CONSOLE, "   early_rxs:\n");
+        list_for_each(tmp, &conn->ibc_early_rxs)
+                kiblnd_debug_rx(list_entry(tmp, kib_rx_t, rx_list));
+
+        CDEBUG(D_CONSOLE, "   tx_queue_nocred:\n");
+        list_for_each(tmp, &conn->ibc_tx_queue_nocred)
+                kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+        CDEBUG(D_CONSOLE, "   tx_queue_rsrvd:\n");
+        list_for_each(tmp, &conn->ibc_tx_queue_rsrvd)
+                kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+        CDEBUG(D_CONSOLE, "   tx_queue:\n");
+        list_for_each(tmp, &conn->ibc_tx_queue)
+                kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+        CDEBUG(D_CONSOLE, "   active_txs:\n");
+        list_for_each(tmp, &conn->ibc_active_txs)
+                kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+        CDEBUG(D_CONSOLE, "   rxs:\n");
+        for (i = 0; i < IBLND_RX_MSGS; i++)
+                kiblnd_debug_rx(&conn->ibc_rxs[i]);
+
+        spin_unlock(&conn->ibc_lock);
+}
+
+kib_conn_t *
+kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid, int state)
+{
+        /* Create a connection instance for 'peer' on CM id 'cmid', ending
+         * up in 'state' (ACTIVE_CONNECT or PASSIVE_WAIT): allocate rx
+         * buffers, create the CQ and QP, and post the initial receives.
+         * Returns the new conn holding 1 ref for the caller, or NULL.
+         *
+         * CAVEAT EMPTOR:
+         * If the new conn is created successfully it takes over the caller's
+         * ref on 'peer'.  It also "owns" 'cmid' and destroys it when it itself
+         * is destroyed.  On failure, the caller's ref on 'peer' remains and
+         * she must dispose of 'cmid'.  (Actually I'd block forever if I tried
+         * to destroy 'cmid' here since I'm called from the CM which still has
+         * its ref on 'cmid'). */
+        kib_conn_t             *conn;
+        kib_net_t              *net = peer->ibp_ni->ni_data;
+        int                     i;
+        int                     page_offset;
+        int                     ipage;
+        int                     rc;
+        struct ib_cq           *cq;
+        struct ib_qp_init_attr *init_qp_attr;
+        unsigned long           flags;
+
+        LASSERT (net != NULL);
+        LASSERT (!in_interrupt());
+
+        LIBCFS_ALLOC(init_qp_attr, sizeof(*init_qp_attr));
+        if (init_qp_attr == NULL) {
+                CERROR("Can't allocate qp_attr for %s\n",
+                       libcfs_nid2str(peer->ibp_nid));
+                goto failed_0;
+        }
+
+        LIBCFS_ALLOC(conn, sizeof(*conn));
+        if (conn == NULL) {
+                CERROR("Can't allocate connection for %s\n",
+                       libcfs_nid2str(peer->ibp_nid));
+                goto failed_1;
+        }
+
+        memset(conn, 0, sizeof(*conn)); /* zero flags, NULL pointers etc... */
+
+        conn->ibc_state = IBLND_CONN_INIT;
+        conn->ibc_peer = peer;                  /* I take the caller's ref */
+        cmid->context = conn;                   /* for future CM callbacks */
+        conn->ibc_cmid = cmid;
+
+        INIT_LIST_HEAD(&conn->ibc_early_rxs);
+        INIT_LIST_HEAD(&conn->ibc_tx_queue);
+        INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd);
+        INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred);
+        INIT_LIST_HEAD(&conn->ibc_active_txs);
+        spin_lock_init(&conn->ibc_lock);
+
+        LIBCFS_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+        if (conn->ibc_connvars == NULL) {
+                CERROR("Can't allocate in-progress connection state\n");
+                goto failed_2;
+        }
+        memset(conn->ibc_connvars, 0, sizeof(*conn->ibc_connvars));
+
+        LIBCFS_ALLOC(conn->ibc_rxs, IBLND_RX_MSGS * sizeof(kib_rx_t));
+        if (conn->ibc_rxs == NULL) {
+                CERROR("Cannot allocate RX buffers\n");
+                goto failed_2;
+        }
+        memset(conn->ibc_rxs, 0, IBLND_RX_MSGS * sizeof(kib_rx_t));
+
+        rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, IBLND_RX_MSG_PAGES);
+        if (rc != 0)
+                goto failed_2;
+
+        /* Carve the rx message buffers out of the page array and DMA-map
+         * each one for receive. */
+        for (i = ipage = page_offset = 0; i < IBLND_RX_MSGS; i++) {
+                struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
+                kib_rx_t    *rx = &conn->ibc_rxs[i];
+
+                rx->rx_conn = conn;
+                rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) +
+                                           page_offset);
+                rx->rx_msgaddr = dma_map_single(cmid->device->dma_device,
+                                                rx->rx_msg,
+                                                IBLND_MSG_SIZE,
+                                                DMA_FROM_DEVICE);
+                pci_unmap_addr_set(rx, rx_msgunmap, rx->rx_msgaddr);
+
+                CDEBUG(D_NET,"rx %d: %p "LPX64"("LPX64")\n",
+                       i, rx->rx_msg, rx->rx_msgaddr,
+                       lnet_page2phys(page) + page_offset);
+
+                page_offset += IBLND_MSG_SIZE;
+                LASSERT (page_offset <= PAGE_SIZE);
+
+                if (page_offset == PAGE_SIZE) {
+                        page_offset = 0;
+                        ipage++;
+                        LASSERT (ipage <= IBLND_RX_MSG_PAGES);
+                }
+        }
+
+        cq = ib_create_cq(cmid->device,
+                          kiblnd_cq_completion, kiblnd_cq_event, conn,
+                          IBLND_CQ_ENTRIES());
+        if (!IS_ERR(cq)) {
+                conn->ibc_cq = cq;
+        } else {
+                CERROR("Can't create CQ: %ld\n", PTR_ERR(cq));
+                goto failed_2;
+        }
+
+        rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+        if (rc != 0) {
+                /* NB fixed typo in this message ("notificiation") */
+                CERROR("Can't request completion notification: %d\n", rc);
+                goto failed_2;
+        }
+
+        memset(init_qp_attr, 0, sizeof(*init_qp_attr));
+        init_qp_attr->event_handler = kiblnd_qp_event;
+        init_qp_attr->qp_context = conn;
+        /* room for every tx's RDMA work requests plus its message */
+        init_qp_attr->cap.max_send_wr = (*kiblnd_tunables.kib_concurrent_sends) *
+                                        (1 + IBLND_MAX_RDMA_FRAGS);
+        init_qp_attr->cap.max_recv_wr = IBLND_RX_MSGS;
+        init_qp_attr->cap.max_send_sge = 1;
+        init_qp_attr->cap.max_recv_sge = 1;
+        init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+        init_qp_attr->qp_type = IB_QPT_RC;
+        init_qp_attr->send_cq = cq;
+        init_qp_attr->recv_cq = cq;
+
+        /* Validate the MTU tunable and set the path MTU accordingly; the
+         * global lock serialises readers/writers of the tunable. */
+        rc = 0;
+        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+        switch (*kiblnd_tunables.kib_ib_mtu) {
+        default:
+                rc = *kiblnd_tunables.kib_ib_mtu;
+                /* fall through to... */
+        case 0: /* set tunable to the default
+                 * CAVEAT EMPTOR! this assumes the default is one of the MTUs
+                 * below, otherwise we'll WARN on the next QP create */
+                *kiblnd_tunables.kib_ib_mtu =
+                        ib_mtu_enum_to_int(cmid->route.path_rec->mtu);
+                break;
+        case 256:
+                cmid->route.path_rec->mtu = IB_MTU_256;
+                break;
+        case 512:
+                cmid->route.path_rec->mtu = IB_MTU_512;
+                break;
+        case 1024:
+                cmid->route.path_rec->mtu = IB_MTU_1024;
+                break;
+        case 2048:
+                cmid->route.path_rec->mtu = IB_MTU_2048;
+                break;
+        case 4096:
+                cmid->route.path_rec->mtu = IB_MTU_4096;
+                break;
+        }
+        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+        if (rc != 0)
+                CWARN("Invalid IB MTU value %d, using default value %d\n",
+                      rc, *kiblnd_tunables.kib_ib_mtu);
+
+        rc = rdma_create_qp(cmid, net->ibn_dev->ibd_pd, init_qp_attr);
+        if (rc != 0) {
+                CERROR("Can't create QP: %d\n", rc);
+                goto failed_2;
+        }
+
+        LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
+
+        /* 1 ref for caller and each rxmsg */
+        atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS);
+        conn->ibc_nrx = IBLND_RX_MSGS;
+
+        /* post receives */
+        for (i = 0; i < IBLND_RX_MSGS; i++) {
+                rc = kiblnd_post_rx(&conn->ibc_rxs[i],
+                                    IBLND_POSTRX_NO_CREDIT);
+                if (rc != 0) {
+                        CERROR("Can't post rxmsg: %d\n", rc);
+
+                        /* Make posted receives complete */
+                        kiblnd_abort_receives(conn);
+
+                        /* correct # of posted buffers 
+                         * NB locking needed now I'm racing with completion */
+                        spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags);
+                        conn->ibc_nrx -= IBLND_RX_MSGS - i;
+                        spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock,
+                                               flags);
+
+                        /* Drop my own and unused rxbuffer refcounts: the
+                         * loop body runs (IBLND_RX_MSGS - i + 1) times */
+                        while (i++ <= IBLND_RX_MSGS)
+                                kiblnd_conn_decref(conn);
+
+                        return NULL;
+                }
+        }
+
+        /* Init successful! */
+        LASSERT (state == IBLND_CONN_ACTIVE_CONNECT ||
+                 state == IBLND_CONN_PASSIVE_WAIT);
+        conn->ibc_state = state;
+
+        /* 1 more conn */
+        atomic_inc(&net->ibn_nconns);
+        return conn;
+
+ failed_2:
+        kiblnd_destroy_conn(conn);
+ failed_1:
+        LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr));
+ failed_0:
+        return NULL;
+}
+
+void
+kiblnd_destroy_conn (kib_conn_t *conn)
+{
+        /* Final teardown of a conn whose refcount has hit zero: destroy
+         * the QP and CQ, unmap and free the rx buffers, and — unless the
+         * conn never got past INIT (creation failed early) — drop the
+         * refs it held on its peer, cmid and net conn count. */
+        struct rdma_cm_id *cmid = conn->ibc_cmid;
+        kib_peer_t        *peer = conn->ibc_peer;
+        int                rc;
+        int                i;
+
+        LASSERT (!in_interrupt());
+        LASSERT (atomic_read(&conn->ibc_refcount) == 0);
+        LASSERT (list_empty(&conn->ibc_early_rxs));
+        LASSERT (list_empty(&conn->ibc_tx_queue));
+        LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
+        LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
+        LASSERT (list_empty(&conn->ibc_active_txs));
+        LASSERT (conn->ibc_nsends_posted == 0);
+
+        switch (conn->ibc_state) {
+        default:
+                /* conn must be completely disengaged from the network */
+                LBUG();
+
+        case IBLND_CONN_DISCONNECTED:
+                /* connvars should have been freed already */
+                LASSERT (conn->ibc_connvars == NULL);
+                break;
+
+        case IBLND_CONN_INIT:
+                break;
+        }
+
+        if (conn->ibc_cmid->qp != NULL)
+                rdma_destroy_qp(conn->ibc_cmid);
+
+        if (conn->ibc_cq != NULL) {
+                rc = ib_destroy_cq(conn->ibc_cq);
+                if (rc != 0)
+                        CWARN("Error destroying CQ: %d\n", rc);
+        }
+
+        if (conn->ibc_rx_pages != NULL) {
+                LASSERT (conn->ibc_rxs != NULL);
+
+                for (i = 0; i < IBLND_RX_MSGS; i++) {
+                        kib_rx_t *rx = &conn->ibc_rxs[i];
+
+                        LASSERT (rx->rx_nob >= 0); /* not posted */
+
+                        dma_unmap_single(conn->ibc_cmid->device->dma_device,
+                                         pci_unmap_addr(rx, rx_msgunmap),
+                                         IBLND_MSG_SIZE, DMA_FROM_DEVICE);
+                }
+
+                kiblnd_free_pages(conn->ibc_rx_pages);
+        }
+
+        if (conn->ibc_rxs != NULL) {
+                LIBCFS_FREE(conn->ibc_rxs,
+                            IBLND_RX_MSGS * sizeof(kib_rx_t));
+        }
+
+        if (conn->ibc_connvars != NULL)
+                LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+
+        /* See CAVEAT EMPTOR above in kiblnd_create_conn */
+        if (conn->ibc_state != IBLND_CONN_INIT) {
+                kib_net_t *net = peer->ibp_ni->ni_data;
+
+                kiblnd_peer_decref(peer);
+                rdma_destroy_id(cmid);
+                atomic_dec(&net->ibn_nconns);
+        }
+
+        LIBCFS_FREE(conn, sizeof(*conn));
+}
+
+int
+kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why)
+{
+        /* Close every connection of 'peer' with error 'why'.  Caller
+         * holds the global lock; returns the number of conns closed. */
+        struct list_head   *ctmp;
+        struct list_head   *cnxt;
+        kib_conn_t         *conn;
+        int                 nclosed = 0;
+
+        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+                conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+                kiblnd_close_conn_locked(conn, why);
+                nclosed++;
+        }
+
+        return nclosed;
+}
+
+int
+kiblnd_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
+{
+        /* Close (with -ESTALE) every conn of 'peer' whose incarnation
+         * differs from the current one.  Caller holds the global lock;
+         * returns the number of conns closed. */
+        kib_conn_t         *conn;
+        struct list_head   *ctmp;
+        struct list_head   *cnxt;
+        int                 count = 0;
+
+        list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) {
+                conn = list_entry(ctmp, kib_conn_t, ibc_list);
+
+                if (conn->ibc_incarnation == incarnation)
+                        continue;
+
+                CDEBUG(D_NET, "Closing stale conn -> %s incarnation:"LPX64"("LPX64")\n",
+                       libcfs_nid2str(peer->ibp_nid),
+                       conn->ibc_incarnation, incarnation);
+
+                count++;
+                kiblnd_close_conn_locked(conn, -ESTALE);
+        }
+
+        return count;
+}
+
+int
+kiblnd_close_matching_conns (lnet_ni_t *ni, lnet_nid_t nid)
+{
+        /* Close every conn on 'ni' whose peer matches 'nid'
+         * (LNET_NID_ANY matches all peers).  Returns 0 on success or
+         * -ENOENT when a specific nid matched nothing; a wildcard
+         * always succeeds. */
+        struct list_head   *ptmp;
+        struct list_head   *pnxt;
+        kib_peer_t         *peer;
+        unsigned long       flags;
+        int                 count = 0;
+        int                 lo;
+        int                 hi;
+        int                 i;
+
+        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+        if (nid != LNET_NID_ANY) {
+                /* only the single chain that can hold 'nid' */
+                lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers;
+        } else {
+                /* the whole table */
+                lo = 0;
+                hi = kiblnd_data.kib_peer_hash_size - 1;
+        }
+
+        for (i = lo; i <= hi; i++) {
+                list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) {
+                        peer = list_entry(ptmp, kib_peer_t, ibp_list);
+                        LASSERT (peer->ibp_connecting > 0 ||
+                                 peer->ibp_accepting > 0 ||
+                                 !list_empty(&peer->ibp_conns));
+
+                        if (peer->ibp_ni != ni)
+                                continue;
+
+                        if (nid != LNET_NID_ANY && nid != peer->ibp_nid)
+                                continue;
+
+                        count += kiblnd_close_peer_conns_locked(peer, 0);
+                }
+        }
+
+        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+        /* wildcards always succeed */
+        if (nid == LNET_NID_ANY)
+                return 0;
+
+        return (count == 0) ? -ENOENT : 0;
+}
+
+int
+kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+        /* LND ioctl entry point: peer/conn queries and forced teardown
+         * driven by libcfs ioctls.  Returns 0 or a -ve errno;
+         * unrecognised commands return -EINVAL. */
+        struct libcfs_ioctl_data *data = arg;
+        int                       rc = -EINVAL;
+
+        switch(cmd) {
+        case IOC_LIBCFS_GET_PEER: {
+                /* look up the ioc_count'th peer; report nid + refcount */
+                lnet_nid_t   nid = 0;
+                int          count = 0;
+
+                rc = kiblnd_get_peer_info(ni, data->ioc_count,
+                                          &nid, &count);
+                data->ioc_nid    = nid;
+                data->ioc_count  = count;
+                break;
+        }
+
+        case IOC_LIBCFS_DEL_PEER: {
+                rc = kiblnd_del_peer(ni, data->ioc_nid);
+                break;
+        }
+        case IOC_LIBCFS_GET_CONN: {
+                /* look up the ioc_count'th conn; takes a temporary ref */
+                kib_conn_t *conn = kiblnd_get_conn_by_idx(ni, data->ioc_count);
+
+                if (conn == NULL) {
+                        rc = -ENOENT;
+                } else {
+                        // kiblnd_debug_conn(conn);
+                        rc = 0;
+                        data->ioc_nid = conn->ibc_peer->ibp_nid;
+                        kiblnd_conn_decref(conn);
+                }
+                break;
+        }
+        case IOC_LIBCFS_CLOSE_CONNECTION: {
+                rc = kiblnd_close_matching_conns(ni, data->ioc_nid);
+                break;
+        }
+
+        default:
+                break;
+        }
+
+        return rc;
+}
+
+void
+kiblnd_free_pages (kib_pages_t *p)
+{
+        /* Release every page held by the descriptor (NULL slots are
+         * partially-allocated descriptors), then the descriptor itself. */
+        int npages = p->ibp_npages;
+        int i;
+
+        for (i = 0; i < npages; i++) {
+                if (p->ibp_pages[i] != NULL)
+                        __free_page(p->ibp_pages[i]);
+        }
+
+        LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
+}
+
+int
+kiblnd_alloc_pages (kib_pages_t **pp, int npages)
+{
+        /* Allocate a page descriptor plus 'npages' kernel pages; on any
+         * failure everything allocated so far is released.  Returns 0
+         * with *pp set, or -ENOMEM. */
+        kib_pages_t *p;
+        int          i;
+
+        LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
+        if (p == NULL) {
+                CERROR("Can't allocate descriptor for %d pages\n", npages);
+                return -ENOMEM;
+        }
+
+        memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages]));
+        p->ibp_npages = npages;
+
+        for (i = 0; i < npages; i++) {
+                p->ibp_pages[i] = alloc_page(GFP_KERNEL);
+                if (p->ibp_pages[i] != NULL)
+                        continue;
+
+                CERROR("Can't allocate page %d of %d\n", i, npages);
+                kiblnd_free_pages(p);
+                return -ENOMEM;
+        }
+
+        *pp = p;
+        return 0;
+}
+
+void
+kiblnd_free_tx_descs (lnet_ni_t *ni)
+{
+        /* Free the network's tx descriptor array, each descriptor's
+         * per-tx allocations (which branch varies with the
+         * IBLND_MAP_ON_DEMAND build option), and the tx message pages.
+         * Safe on partially-allocated state: every pointer is checked. */
+        int        i;
+        kib_net_t *net = ni->ni_data;
+
+        LASSERT (net != NULL);
+
+        if (net->ibn_tx_descs != NULL) {
+                for (i = 0; i < IBLND_TX_MSGS(); i++) {
+                        kib_tx_t *tx = &net->ibn_tx_descs[i];
+
+#if IBLND_MAP_ON_DEMAND
+                        if (tx->tx_pages != NULL)
+                                LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV *
+                                            sizeof(*tx->tx_pages));
+#else
+                        if (tx->tx_wrq != NULL)
+                                LIBCFS_FREE(tx->tx_wrq, 
+                                            (1 + IBLND_MAX_RDMA_FRAGS) * 
+                                            sizeof(*tx->tx_wrq));
+
+                        if (tx->tx_sge != NULL)
+                                LIBCFS_FREE(tx->tx_sge, 
+                                            (1 + IBLND_MAX_RDMA_FRAGS) * 
+                                            sizeof(*tx->tx_sge));
+
+                        if (tx->tx_rd != NULL)
+                                LIBCFS_FREE(tx->tx_rd, 
+                                            offsetof(kib_rdma_desc_t, 
+                                               rd_frags[IBLND_MAX_RDMA_FRAGS]));
+
+                        if (tx->tx_frags != NULL)
+                                LIBCFS_FREE(tx->tx_frags, 
+                                            IBLND_MAX_RDMA_FRAGS *
+                                            sizeof(*tx->tx_frags));
+#endif
+                }
+
+                LIBCFS_FREE(net->ibn_tx_descs,
+                            IBLND_TX_MSGS() * sizeof(kib_tx_t));
+        }
+
+        if (net->ibn_tx_pages != NULL)
+                kiblnd_free_pages(net->ibn_tx_pages);
+}
+
+int
+kiblnd_alloc_tx_descs (lnet_ni_t *ni)
+{
+        /* Allocate the network's tx message pages, the tx descriptor
+         * array, and each descriptor's work requests / sges / rdma desc /
+         * frag vector (or the phys page vector when mapping on demand).
+         * Returns 0 or -ENOMEM.
+         * NOTE(review): on failure, allocations made so far are left in
+         * place — presumably the caller cleans up via
+         * kiblnd_free_tx_descs(), which tolerates partial state; confirm
+         * against the caller. */
+        int        i;
+        int        rc;
+        kib_net_t *net = ni->ni_data;
+
+        LASSERT (net != NULL);
+
+        rc = kiblnd_alloc_pages(&net->ibn_tx_pages, IBLND_TX_MSG_PAGES());
+
+        if (rc != 0) {
+                CERROR("Can't allocate tx pages\n");
+                return rc;
+        }
+
+        LIBCFS_ALLOC (net->ibn_tx_descs,
+                      IBLND_TX_MSGS() * sizeof(kib_tx_t));
+        if (net->ibn_tx_descs == NULL) {
+                CERROR("Can't allocate %d tx descriptors\n", IBLND_TX_MSGS());
+                return -ENOMEM;
+        }
+
+        memset(net->ibn_tx_descs, 0,
+               IBLND_TX_MSGS() * sizeof(kib_tx_t));
+
+        for (i = 0; i < IBLND_TX_MSGS(); i++) {
+                kib_tx_t *tx = &net->ibn_tx_descs[i];
+
+#if IBLND_MAP_ON_DEMAND
+                LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV *
+                             sizeof(*tx->tx_pages));
+                if (tx->tx_pages == NULL) {
+                        CERROR("Can't allocate phys page vector[%d]\n",
+                               LNET_MAX_IOV);
+                        return -ENOMEM;
+                }
+#else
+                /* 1 message work request + 1 per RDMA fragment */
+                LIBCFS_ALLOC(tx->tx_wrq, 
+                             (1 + IBLND_MAX_RDMA_FRAGS) * 
+                             sizeof(*tx->tx_wrq));
+                if (tx->tx_wrq == NULL)
+                        return -ENOMEM;
+                
+                LIBCFS_ALLOC(tx->tx_sge, 
+                             (1 + IBLND_MAX_RDMA_FRAGS) * 
+                             sizeof(*tx->tx_sge));
+                if (tx->tx_sge == NULL)
+                        return -ENOMEM;
+                
+                LIBCFS_ALLOC(tx->tx_rd, 
+                             offsetof(kib_rdma_desc_t, 
+                                      rd_frags[IBLND_MAX_RDMA_FRAGS]));
+                if (tx->tx_rd == NULL)
+                        return -ENOMEM;
+
+                LIBCFS_ALLOC(tx->tx_frags,
+                             IBLND_MAX_RDMA_FRAGS * 
+                             sizeof(*tx->tx_frags));
+                if (tx->tx_frags == NULL)
+                        return -ENOMEM;
+#endif
+        }
+
+        return 0;
+}
+
+void
+kiblnd_unmap_tx_descs (lnet_ni_t *ni)
+{
+        /* Undo the per-descriptor DMA mappings set up by
+         * kiblnd_map_tx_descs(). */
+        kib_net_t *net = ni->ni_data;
+        kib_tx_t  *tx;
+        int        idx;
+
+        LASSERT (net != NULL);
+
+        for (idx = 0; idx < IBLND_TX_MSGS(); idx++) {
+                tx = &net->ibn_tx_descs[idx];
+
+                dma_unmap_single(net->ibn_dev->ibd_cmid->device->dma_device,
+                                 pci_unmap_addr(tx, tx_msgunmap),
+                                 IBLND_MSG_SIZE, DMA_TO_DEVICE);
+        }
+}
+
+void
+kiblnd_map_tx_descs (lnet_ni_t *ni)
+{
+        /* Carve the tx message buffers out of the pre-allocated page
+         * array, DMA-map each one for send, and put every descriptor on
+         * the net's idle-tx list. */
+        int             ipage = 0;
+        int             page_offset = 0;
+        int             i;
+        struct page    *page;
+        kib_tx_t       *tx;
+        kib_net_t      *net = ni->ni_data;
+
+        LASSERT (net != NULL);
+
+        /* pre-mapped messages are not bigger than 1 page */
+        CLASSERT (IBLND_MSG_SIZE <= PAGE_SIZE);
+
+        /* No fancy arithmetic when we do the buffer calculations */
+        CLASSERT (PAGE_SIZE % IBLND_MSG_SIZE == 0);
+
+        for (i = 0; i < IBLND_TX_MSGS(); i++) {
+                page = net->ibn_tx_pages->ibp_pages[ipage];
+                tx = &net->ibn_tx_descs[i];
+
+                tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
+                                           page_offset);
+
+                tx->tx_msgaddr = dma_map_single(
+                        net->ibn_dev->ibd_cmid->device->dma_device,
+                        tx->tx_msg, IBLND_MSG_SIZE, DMA_TO_DEVICE);
+                /* remember the mapping so it can be unmapped later */
+                pci_unmap_addr_set(tx, tx_msgunmap, tx->tx_msgaddr);
+
+                list_add(&tx->tx_list, &net->ibn_idle_txs);
+
+                page_offset += IBLND_MSG_SIZE;
+                LASSERT (page_offset <= PAGE_SIZE);
+
+                if (page_offset == PAGE_SIZE) {
+                        page_offset = 0;
+                        ipage++;
+                        LASSERT (ipage <= IBLND_TX_MSG_PAGES());
+                }
+        }
+}
+
+/* Tear down module-global state: flag and reap the scheduler/connd
+ * threads, free the peer hash table and drop the module refcount.
+ * Only reached once every device (and hence every net) is gone --
+ * asserted on entry via the empty kib_devs list. */
+void
+kiblnd_base_shutdown (void)
+{
+        int i;
+
+        LASSERT (list_empty(&kiblnd_data.kib_devs));
+
+        CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n",
+               atomic_read(&libcfs_kmemory));
+
+        /* unwind as far as initialisation got (cases fall through) */
+        switch (kiblnd_data.kib_init) {
+        default:
+                LBUG();
+
+        case IBLND_INIT_ALL:
+        case IBLND_INIT_DATA:
+                LASSERT (kiblnd_data.kib_peers != NULL);
+                for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) {
+                        LASSERT (list_empty(&kiblnd_data.kib_peers[i]));
+                }
+                LASSERT (list_empty(&kiblnd_data.kib_connd_zombies));
+                LASSERT (list_empty(&kiblnd_data.kib_connd_conns));
+
+                /* flag threads to terminate; wake and wait for them to die */
+                kiblnd_data.kib_shutdown = 1;
+                wake_up_all(&kiblnd_data.kib_sched_waitq);
+                wake_up_all(&kiblnd_data.kib_connd_waitq);
+
+                i = 2;
+                while (atomic_read(&kiblnd_data.kib_nthreads) != 0) {
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                               "Waiting for %d threads to terminate\n",
+                               atomic_read(&kiblnd_data.kib_nthreads));
+                        cfs_pause(cfs_time_seconds(1));
+                }
+
+                /* fall through */
+
+        case IBLND_INIT_NOTHING:
+                break;
+        }
+
+        if (kiblnd_data.kib_peers != NULL)
+                LIBCFS_FREE(kiblnd_data.kib_peers,
+                            sizeof(struct list_head) *
+                            kiblnd_data.kib_peer_hash_size);
+
+        CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n",
+               atomic_read(&libcfs_kmemory));
+
+        kiblnd_data.kib_init = IBLND_INIT_NOTHING;
+        PORTAL_MODULE_UNUSE;
+}
+
+/* Tear down one net (NI).  Deliberately safe on a partially
+ * initialised net: kiblnd_startup()'s failure path calls this with
+ * whatever state it managed to build (ni_data may even be NULL).
+ * When the last device disappears, module-global state goes too. */
+void
+kiblnd_shutdown (lnet_ni_t *ni)
+{
+        kib_net_t        *net = ni->ni_data;
+        rwlock_t         *g_lock = &kiblnd_data.kib_global_lock;
+        int               i;
+        unsigned long     flags;
+
+        LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL);
+
+        if (net == NULL)
+                goto out;
+
+        CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n",
+               atomic_read(&libcfs_kmemory));
+
+        /* flag shutdown under the global lock so concurrent ops stop
+         * using this net */
+        write_lock_irqsave(g_lock, flags);
+        net->ibn_shutdown = 1;
+        write_unlock_irqrestore(g_lock, flags);
+
+        /* unwind as far as this net's initialisation got */
+        switch (net->ibn_init) {
+        default:
+                LBUG();
+
+        case IBLND_INIT_ALL:
+                /* nuke all existing peers within this net */
+                kiblnd_del_peer(ni, LNET_NID_ANY);
+
+                /* Wait for all peer state to clean up */
+                i = 2;
+                while (atomic_read(&net->ibn_npeers) != 0) {
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
+                               "%s: waiting for %d peers to disconnect\n",
+                               libcfs_nid2str(ni->ni_nid),
+                               atomic_read(&net->ibn_npeers));
+                        cfs_pause(cfs_time_seconds(1));
+                }
+
+                kiblnd_unmap_tx_descs(ni);
+
+                LASSERT (net->ibn_dev->ibd_nnets > 0);
+                net->ibn_dev->ibd_nnets--;
+
+                /* fall through */
+
+        case IBLND_INIT_NOTHING:
+                LASSERT (atomic_read(&net->ibn_nconns) == 0);
+
+#if IBLND_MAP_ON_DEMAND
+                if (net->ibn_fmrpool != NULL)
+                        ib_destroy_fmr_pool(net->ibn_fmrpool);
+#endif
+                /* last net on the device?  take the device down too */
+                if (net->ibn_dev != NULL &&
+                    net->ibn_dev->ibd_nnets == 0)
+                        kiblnd_destroy_dev(net->ibn_dev);
+
+                break;
+        }
+
+        kiblnd_free_tx_descs(ni);
+
+        CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n",
+               atomic_read(&libcfs_kmemory));
+
+        net->ibn_init = IBLND_INIT_NOTHING;
+        ni->ni_data = NULL;
+        
+        LIBCFS_FREE(net, sizeof(*net));
+
+out:
+        if (list_empty(&kiblnd_data.kib_devs))
+                kiblnd_base_shutdown();
+        return;
+}
+
+/* One-time module-global initialisation: locks, lists, the peer hash
+ * table, the scheduler threads and the connection daemon.  Called by
+ * kiblnd_startup() when the first net comes up.  Returns 0 on success
+ * or -EINVAL/-ENETDOWN; on failure everything built so far is undone
+ * via kiblnd_base_shutdown(). */
+int
+kiblnd_base_startup (void)
+{
+        int               rc;
+        int               i;
+
+        LASSERT (kiblnd_data.kib_init == IBLND_INIT_NOTHING);
+
+        /* sanity-check tunables before committing to anything */
+        if (*kiblnd_tunables.kib_credits > *kiblnd_tunables.kib_ntx) {
+                CERROR("Can't set credits(%d) > ntx(%d)\n",
+                       *kiblnd_tunables.kib_credits,
+                       *kiblnd_tunables.kib_ntx);
+                return -EINVAL;
+        }
+
+        PORTAL_MODULE_USE;
+        memset(&kiblnd_data, 0, sizeof(kiblnd_data)); /* zero pointers, flags etc */
+
+        rwlock_init(&kiblnd_data.kib_global_lock);
+
+        INIT_LIST_HEAD(&kiblnd_data.kib_devs);
+
+        kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE;
+        LIBCFS_ALLOC(kiblnd_data.kib_peers,
+                     sizeof(struct list_head) * kiblnd_data.kib_peer_hash_size);
+        if (kiblnd_data.kib_peers == NULL) {
+                goto failed;
+        }
+        for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++)
+                INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]);
+
+        spin_lock_init(&kiblnd_data.kib_connd_lock);
+        INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns);
+        INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies);
+        init_waitqueue_head(&kiblnd_data.kib_connd_waitq);
+
+        spin_lock_init(&kiblnd_data.kib_sched_lock);
+        INIT_LIST_HEAD(&kiblnd_data.kib_sched_conns);
+        init_waitqueue_head(&kiblnd_data.kib_sched_waitq);
+
+        /* canned attribute used to flip QPs into the error state */
+        kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR;
+
+        /* lists/ptrs/locks initialised */
+        kiblnd_data.kib_init = IBLND_INIT_DATA;
+        /*****************************************************/
+
+        for (i = 0; i < IBLND_N_SCHED; i++) {
+                rc = kiblnd_thread_start(kiblnd_scheduler, (void *)((long)i));
+                if (rc != 0) {
+                        CERROR("Can't spawn o2iblnd scheduler[%d]: %d\n",
+                               i, rc);
+                        goto failed;
+                }
+        }
+
+        rc = kiblnd_thread_start(kiblnd_connd, NULL);
+        if (rc != 0) {
+                CERROR("Can't spawn o2iblnd connd: %d\n", rc);
+                goto failed;
+        }
+
+        /* flag everything initialised */
+        kiblnd_data.kib_init = IBLND_INIT_ALL;
+        /*****************************************************/
+
+        return 0;
+
+ failed:
+        kiblnd_base_shutdown();
+        return -ENETDOWN;
+}
+
+int
+kiblnd_startup (lnet_ni_t *ni)
+{
+        char                     *ifname;
+        kib_net_t                *net;
+        kib_dev_t                *ibdev;
+        struct list_head         *tmp;
+        struct timeval            tv;
+        int                       rc;
+
+        LASSERT (ni->ni_lnd == &the_kiblnd);
+
+        if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) {
+                rc = kiblnd_base_startup();
+                if (rc != 0)
+                        return rc;
+        }
+
+        LIBCFS_ALLOC(net, sizeof(*net));
+        ni->ni_data = net;
+        if (net == NULL)
+                goto failed;
+
+        memset(net, 0, sizeof(*net));
+
+        do_gettimeofday(&tv);
+        net->ibn_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
+        ni->ni_maxtxcredits = *kiblnd_tunables.kib_credits;
+        ni->ni_peertxcredits = *kiblnd_tunables.kib_peercredits;
+
+        spin_lock_init(&net->ibn_tx_lock);
+        INIT_LIST_HEAD(&net->ibn_idle_txs);
+
+        rc = kiblnd_alloc_tx_descs(ni);
+        if (rc != 0) {
+                CERROR("Can't allocate tx descs\n");
+                goto failed;
+        }
+
+        if (ni->ni_interfaces[0] != NULL) {
+                /* Use the IPoIB interface specified in 'networks=' */
+
+                CLASSERT (LNET_MAX_INTERFACES > 1);
+                if (ni->ni_interfaces[1] != NULL) {
+                        CERROR("Multiple interfaces not supported\n");
+                        goto failed;
+                }
+
+                ifname = ni->ni_interfaces[0];
+        } else {
+                ifname = *kiblnd_tunables.kib_default_ipif;
+        }
+
+        if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) {
+                CERROR("IPoIB interface name too long: %s\n", ifname);
+                goto failed;
+        }
+
+        ibdev = NULL;
+        list_for_each (tmp, &kiblnd_data.kib_devs) {
+                ibdev = list_entry(tmp, kib_dev_t, ibd_list);
+
+                if (!strcmp(&ibdev->ibd_ifname[0], ifname))
+                        break;
+
+                ibdev = NULL;
+        }
+
+        if (ibdev == NULL) {
+                __u32                     ip;
+                __u32                     netmask;
+                int                       up;
+                struct rdma_cm_id        *id;
+                struct ib_pd             *pd;
+                struct ib_mr             *mr;
+                struct sockaddr_in       addr;
+
+                rc = libcfs_ipif_query(ifname, &up, &ip, &netmask);
+                if (rc != 0) {
+                        CERROR("Can't query IPoIB interface %s: %d\n",
+                               ifname, rc);
+                        goto failed;
+                }
+
+                if (!up) {
+                        CERROR("Can't query IPoIB interface %s: it's down\n",
+                               ifname);
+                        goto failed;
+                }
+
+                LIBCFS_ALLOC(ibdev, sizeof(*ibdev));
+                if (ibdev == NULL)
+                        goto failed;
+
+                memset(ibdev, 0, sizeof(*ibdev));
+
+                INIT_LIST_HEAD(&ibdev->ibd_list); /* not yet in kib_devs */
+                ibdev->ibd_ifip = ip;
+                strcpy(&ibdev->ibd_ifname[0], ifname);
+
+                id = rdma_create_id(kiblnd_cm_callback, ibdev, RDMA_PS_TCP);
+                if (!IS_ERR(id)) {
+                        ibdev->ibd_cmid = id;
+                } else {
+                        CERROR("Can't create listen ID: %ld\n", PTR_ERR(id));
+                        goto failed;
+                }
+
+                memset(&addr, 0, sizeof(addr));
+                addr.sin_family      = AF_INET;
+                addr.sin_port        = htons(*kiblnd_tunables.kib_service);
+                addr.sin_addr.s_addr = htonl(ip);
+
+                rc = rdma_bind_addr(id, (struct sockaddr *)&addr);
+                if (rc != 0) {
+                        CERROR("Can't bind to %s: %d\n", ifname, rc);
+                        goto failed;
+                }
+
+                /* Binding should have assigned me an IB device */
+                LASSERT (id->device != NULL);
+
+                pd = ib_alloc_pd(id->device);
+                if (!IS_ERR(pd)) {
+                        ibdev->ibd_pd = pd;
+                } else {
+                        CERROR("Can't allocate PD: %ld\n", PTR_ERR(pd));
+                        goto failed;
+                }
+
+#if IBLND_MAP_ON_DEMAND
+                /* MR for sends and receives */
+                mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE);
+#else
+                /* MR for sends, recieves _and_ RDMA...........v */
+                mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE |
+                                       IB_ACCESS_REMOTE_WRITE);
+#endif
+                if (!IS_ERR(mr)) {
+                        ibdev->ibd_mr = mr;
+                } else {
+                        CERROR("Can't get MR: %ld\n", PTR_ERR(pd));
+                        goto failed;
+                }
+
+                rc = rdma_listen(id, 0);
+                if (rc != 0) {
+                        CERROR("Can't start listener: %d\n", rc);
+                        goto failed;
+                }
+
+                list_add_tail(&ibdev->ibd_list, 
+                              &kiblnd_data.kib_devs);
+        }
+
+        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip);
+        net->ibn_dev = ibdev;
+
+#if IBLND_MAP_ON_DEMAND
+        /* FMR pool for RDMA */
+        {
+                struct ib_fmr_pool      *fmrpool;
+                struct ib_fmr_pool_param param = {
+                        .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE,
+                        .page_shift        = PAGE_SHIFT,
+                        .access            = (IB_ACCESS_LOCAL_WRITE |
+                                              IB_ACCESS_REMOTE_WRITE),
+                        .pool_size         = *kiblnd_tunables.kib_fmr_pool_size,
+                        .dirty_watermark   = *kiblnd_tunables.kib_fmr_flush_trigger,
+                        .flush_function    = NULL,
+                        .flush_arg         = NULL,
+                        .cache             = *kiblnd_tunables.kib_fmr_cache};
+
+                if (*kiblnd_tunables.kib_fmr_pool_size < 
+                    *kiblnd_tunables.kib_ntx) {
+                        CERROR("Can't set fmr pool size (%d) < ntx(%d)\n",
+                               *kiblnd_tunables.kib_fmr_pool_size,
+                               *kiblnd_tunables.kib_ntx);
+                        goto failed;
+                }
+
+                fmrpool = ib_create_fmr_pool(ibdev->ibd_pd, &param);
+                if (!IS_ERR(fmrpool)) {
+                        net->ibn_fmrpool = fmrpool;
+                } else {
+                        CERROR("Can't create FMR pool: %ld\n", 
+                               PTR_ERR(fmrpool));
+                        goto failed;
+                }
+        }
+#endif
+
+        kiblnd_map_tx_descs(ni);
+
+        ibdev->ibd_nnets++;
+        net->ibn_init = IBLND_INIT_ALL;
+
+        return 0;
+
+failed:
+        kiblnd_shutdown(ni);
+
+        CDEBUG(D_NET, "kiblnd_startup failed\n");
+        return -ENETDOWN;
+}
+
+/* Module unload hook: unhook the LND from LNet, then release
+ * tunable/sysctl state. */
+void __exit
+kiblnd_module_fini (void)
+{
+        lnet_unregister_lnd(&the_kiblnd);
+        kiblnd_tunables_fini();
+}
+
+/* Module load hook: compile-time wire-format sanity checks, tunables
+ * init, then register the LND with LNet.  Returns 0 or the tunables
+ * init error. */
+int __init
+kiblnd_module_init (void)
+{
+        int    rc;
+
+        /* every message variant must fit in a pre-mapped buffer */
+        CLASSERT (sizeof(kib_msg_t) <= IBLND_MSG_SIZE);
+#if !IBLND_MAP_ON_DEMAND
+        CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
+                  <= IBLND_MSG_SIZE);
+        CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
+                  <= IBLND_MSG_SIZE);
+#endif
+        rc = kiblnd_tunables_init();
+        if (rc != 0)
+                return rc;
+
+        lnet_register_lnd(&the_kiblnd);
+
+        return 0;
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel OpenIB gen2 LND v1.00");
+MODULE_LICENSE("GPL");
+
+module_init(kiblnd_module_init);
+module_exit(kiblnd_module_fini);
diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h
new file mode 100644 (file)
index 0000000..24e4be2
--- /dev/null
@@ -0,0 +1,630 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2006 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+
+#include <net/sock.h>
+#include <linux/in.h>
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include <libcfs/kp30.h>
+#include <lnet/lnet.h>
+#include <lnet/lib-lnet.h>
+
+#if !HAVE_GFP_T
+typedef int gfp_t;
+#endif
+
+#include <rdma/rdma_cm.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_fmr_pool.h>
+
+/* tunables fixed at compile time */
+#if CONFIG_SMP
+# define IBLND_N_SCHED      num_online_cpus()   /* # schedulers */
+#else
+# define IBLND_N_SCHED      1                   /* # schedulers */
+#endif
+
+#define IBLND_PEER_HASH_SIZE         101        /* # peer lists */
+#define IBLND_RESCHED                100        /* # scheduler loops before reschedule */
+#define IBLND_MSG_QUEUE_SIZE         8          /* # messages/RDMAs in-flight */
+#define IBLND_CREDIT_HIGHWATER       7          /* when eagerly to return credits */
+#define IBLND_MSG_SIZE              (4<<10)     /* max size of queued messages (inc hdr) */
+
+#define IBLND_MAP_ON_DEMAND  0
+#if IBLND_MAP_ON_DEMAND
+# define IBLND_MAX_RDMA_FRAGS        1
+#else
+# define IBLND_MAX_RDMA_FRAGS        LNET_MAX_IOV
+#endif
+
+/************************/
+/* derived constants... */
+
+/* TX messages (shared by all connections) */
+#define IBLND_TX_MSGS()       (*kiblnd_tunables.kib_ntx)
+#define IBLND_TX_MSG_BYTES()  (IBLND_TX_MSGS() * IBLND_MSG_SIZE)
+#define IBLND_TX_MSG_PAGES()  ((IBLND_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE)
+
+/* RX messages (per connection) */
+#define IBLND_RX_MSGS         (IBLND_MSG_QUEUE_SIZE*2)
+#define IBLND_RX_MSG_BYTES    (IBLND_RX_MSGS * IBLND_MSG_SIZE)
+#define IBLND_RX_MSG_PAGES    ((IBLND_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+
+#define IBLND_CQ_ENTRIES()    (IBLND_RX_MSGS +                                  \
+                               (*kiblnd_tunables.kib_concurrent_sends) *        \
+                               (1 + IBLND_MAX_RDMA_FRAGS))
+
+/* Module tunables.  Fields are pointers (dereferenced as
+ * *kiblnd_tunables.kib_xxx) -- presumably so module params and sysctl
+ * can share storage; confirm against kiblnd_tunables_init(). */
+typedef struct
+{
+        unsigned int     *kib_service;          /* IB service number */
+        int              *kib_min_reconnect_interval; /* first failed connection retry... */
+        int              *kib_max_reconnect_interval; /* ...exponentially increasing to this */
+        int              *kib_cksum;            /* checksum kib_msg_t? */
+        int              *kib_timeout;          /* comms timeout (seconds) */
+        int              *kib_keepalive;        /* keepalive timeout (seconds) */
+        int              *kib_ntx;              /* # tx descs */
+        int              *kib_credits;          /* # concurrent sends */
+        int              *kib_peercredits;      /* # concurrent sends to 1 peer */
+        char            **kib_default_ipif;     /* default IPoIB interface */
+        int              *kib_retry_count;
+        int              *kib_rnr_retry_count;
+        int              *kib_concurrent_sends; /* send work queue sizing */
+        int             *kib_ib_mtu;           /* IB MTU */
+#if IBLND_MAP_ON_DEMAND
+        int              *kib_fmr_pool_size;    /* # FMRs in pool */
+        int              *kib_fmr_flush_trigger; /* When to trigger FMR flush */
+        int              *kib_fmr_cache;        /* enable FMR pool cache? */
+#endif
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+        struct ctl_table_header *kib_sysctl;    /* sysctl interface */
+#endif
+} kib_tunables_t;
+
+/* A counted array of pages; ibp_pages is an old-style ([0]) flexible
+ * array member, so the struct is allocated with trailing space. */
+typedef struct
+{
+        int               ibp_npages;           /* # pages */
+        struct page      *ibp_pages[0];
+} kib_pages_t;
+
+/* One IPoIB interface and its IB resources; shared by ibd_nnets nets
+ * and destroyed when the last of them shuts down. */
+typedef struct 
+{
+        struct list_head     ibd_list;          /* chain on kib_devs */
+        __u32                ibd_ifip;          /* IPoIB interface IP */
+        char                 ibd_ifname[32];    /* IPoIB interface name */
+        int                  ibd_nnets;         /* # nets extant */
+
+        struct rdma_cm_id   *ibd_cmid;          /* IB listener (bound to 1 device) */
+        struct ib_pd        *ibd_pd;            /* PD for the device */
+        struct ib_mr        *ibd_mr;            /* MR for non RDMA I/O */
+} kib_dev_t;
+
+/* Per-net (per lnet_ni_t, stashed in ni->ni_data) state: tx descriptor
+ * pool, shutdown flag and a reference to the underlying device. */
+typedef struct
+{
+        __u64                ibn_incarnation;   /* my epoch */
+        int                  ibn_init;          /* initialisation state */
+        int                  ibn_shutdown;      /* shutting down? */
+
+        atomic_t             ibn_npeers;        /* # peers extant */
+        atomic_t             ibn_nconns;        /* # connections extant */
+
+        struct kib_tx       *ibn_tx_descs;      /* all the tx descriptors */
+        kib_pages_t         *ibn_tx_pages;      /* premapped tx msg pages */
+        struct list_head     ibn_idle_txs;      /* idle tx descriptors */
+        spinlock_t           ibn_tx_lock;       /* serialise */
+
+#if IBLND_MAP_ON_DEMAND
+        struct ib_fmr_pool  *ibn_fmrpool;       /* FMR pool for RDMA I/O */
+#endif
+
+        kib_dev_t           *ibn_dev;           /* underlying IB device */
+} kib_net_t;
+
+/* Module-global singleton (kiblnd_data): devices, peer hash, the
+ * connd and scheduler thread machinery.  Set up in
+ * kiblnd_base_startup(), torn down in kiblnd_base_shutdown(). */
+typedef struct
+{
+        int                  kib_init;          /* initialisation state */
+        int                  kib_shutdown;      /* shut down? */
+        struct list_head     kib_devs;          /* IB devices extant */
+        atomic_t             kib_nthreads;      /* # live threads */
+        rwlock_t             kib_global_lock;   /* stabilize net/dev/peer/conn ops */
+
+        struct list_head    *kib_peers;         /* hash table of all my known peers */
+        int                  kib_peer_hash_size; /* size of kib_peers */
+
+        void                *kib_connd;         /* the connd task (serialisation assertions) */
+        struct list_head     kib_connd_conns;   /* connections to setup/teardown */
+        struct list_head     kib_connd_zombies; /* connections with zero refcount */
+        wait_queue_head_t    kib_connd_waitq;   /* connection daemon sleeps here */
+        spinlock_t           kib_connd_lock;    /* serialise */
+
+        wait_queue_head_t    kib_sched_waitq;   /* schedulers sleep here */
+        struct list_head     kib_sched_conns;   /* conns to check for rx completions */
+        spinlock_t           kib_sched_lock;    /* serialise */
+
+        __u64                kib_next_tx_cookie; /* RDMA completion cookie */
+        struct ib_qp_attr    kib_error_qpa;      /* QP->ERROR */
+} kib_data_t;
+
+#define IBLND_INIT_NOTHING         0
+#define IBLND_INIT_DATA            1
+#define IBLND_INIT_ALL             2
+
+/************************************************************************
+ * IB Wire message format.
+ * These are sent in sender's byte order (i.e. receiver flips).
+ */
+
+/* NOTE(review): WIRE_ATTR is defined elsewhere -- presumably a packing
+ * attribute; these layouts are on-the-wire and must not change. */
+typedef struct kib_connparams
+{
+        __u16             ibcp_queue_depth;
+        __u16             ibcp_max_frags;
+        __u32             ibcp_max_msg_size;
+} WIRE_ATTR kib_connparams_t;
+
+typedef struct
+{
+        lnet_hdr_t        ibim_hdr;             /* portals header */
+        char              ibim_payload[0];      /* piggy-backed payload */
+} WIRE_ATTR kib_immediate_msg_t;
+
+#if IBLND_MAP_ON_DEMAND
+/* single-fragment RDMA descriptor (FMR maps the whole buffer) */
+typedef struct
+{
+       __u64             rd_addr;              /* IO VMA address */
+       __u32             rd_nob;               /* # of bytes */
+       __u32             rd_key;               /* remote key */
+} WIRE_ATTR kib_rdma_desc_t;
+#else
+/* multi-fragment RDMA descriptor */
+typedef struct
+{
+        __u32             rf_nob;               /* # bytes this frag */
+        __u64             rf_addr;              /* CAVEAT EMPTOR: misaligned!! */
+} WIRE_ATTR kib_rdma_frag_t;
+
+typedef struct
+{
+        __u32             rd_key;               /* local/remote key */
+        __u32             rd_nfrags;            /* # fragments */
+        kib_rdma_frag_t   rd_frags[0];          /* buffer frags */
+} WIRE_ATTR kib_rdma_desc_t;
+#endif
+        
+typedef struct
+{
+        lnet_hdr_t        ibprm_hdr;            /* portals header */
+        __u64             ibprm_cookie;         /* opaque completion cookie */
+} WIRE_ATTR kib_putreq_msg_t;
+
+typedef struct
+{
+        __u64             ibpam_src_cookie;     /* reflected completion cookie */
+        __u64             ibpam_dst_cookie;     /* opaque completion cookie */
+        kib_rdma_desc_t   ibpam_rd;             /* sender's sink buffer */
+} WIRE_ATTR kib_putack_msg_t;
+
+typedef struct
+{
+        lnet_hdr_t        ibgm_hdr;             /* portals header */
+        __u64             ibgm_cookie;          /* opaque completion cookie */
+        kib_rdma_desc_t   ibgm_rd;              /* rdma descriptor */
+} WIRE_ATTR kib_get_msg_t;
+
+typedef struct
+{
+        __u64             ibcm_cookie;          /* opaque completion cookie */
+        __s32             ibcm_status;          /* < 0 failure: >= 0 length */
+} WIRE_ATTR kib_completion_msg_t;
+
+typedef struct
+{
+        /* First 2 fields fixed FOR ALL TIME */
+        __u32             ibm_magic;            /* I'm an openibnal message */
+        __u16             ibm_version;          /* this is my version number */
+
+        __u8              ibm_type;             /* msg type */
+        __u8              ibm_credits;          /* returned credits */
+        __u32             ibm_nob;              /* # bytes in whole message */
+        __u32             ibm_cksum;            /* checksum (0 == no checksum) */
+        __u64             ibm_srcnid;           /* sender's NID */
+        __u64             ibm_srcstamp;         /* sender's incarnation */
+        __u64             ibm_dstnid;           /* destination's NID */
+        __u64             ibm_dststamp;         /* destination's incarnation */
+
+        union {
+                kib_connparams_t      connparams;
+                kib_immediate_msg_t   immediate;
+                kib_putreq_msg_t      putreq;
+                kib_putack_msg_t      putack;
+                kib_get_msg_t         get;
+                kib_completion_msg_t  completion;
+        } WIRE_ATTR ibm_u;
+} WIRE_ATTR kib_msg_t;
+
+#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC    /* unique magic */
+
+#define IBLND_MSG_VERSION           0x11
+
+#define IBLND_MSG_CONNREQ           0xc0        /* connection request */
+#define IBLND_MSG_CONNACK           0xc1        /* connection acknowledge */
+#define IBLND_MSG_NOOP              0xd0        /* nothing (just credits) */
+#define IBLND_MSG_IMMEDIATE         0xd1        /* immediate */
+#define IBLND_MSG_PUT_REQ           0xd2        /* putreq (src->sink) */
+#define IBLND_MSG_PUT_NAK           0xd3        /* completion (sink->src) */
+#define IBLND_MSG_PUT_ACK           0xd4        /* putack (sink->src) */
+#define IBLND_MSG_PUT_DONE          0xd5        /* completion (src->sink) */
+#define IBLND_MSG_GET_REQ           0xd6        /* getreq (sink->src) */
+#define IBLND_MSG_GET_DONE          0xd7        /* completion (src->sink: all OK) */
+
+/* sent to refuse a connection attempt */
+typedef struct {
+        __u32            ibr_magic;             /* sender's magic */
+        __u16            ibr_version;           /* sender's version */
+        __u8             ibr_why;               /* reject reason */
+} WIRE_ATTR kib_rej_t;
+
+
+/* connection rejection reasons */
+#define IBLND_REJECT_CONN_RACE       1          /* You lost connection race */
+#define IBLND_REJECT_NO_RESOURCES    2          /* Out of memory/conns etc */
+#define IBLND_REJECT_FATAL           3          /* Anything else */
+
+/***********************************************************************/
+
+/* Receive buffer descriptor: one pre-posted, DMA-mapped message buffer
+ * belonging to a connection. */
+typedef struct kib_rx                           /* receive message */
+{
+        struct list_head          rx_list;      /* queue for attention */
+        struct kib_conn          *rx_conn;      /* owning conn */
+        int                       rx_nob;       /* # bytes received (-1 while posted) */
+        enum ib_wc_status         rx_status;    /* completion status */
+        kib_msg_t                *rx_msg;       /* message buffer (host vaddr) */
+        __u64                     rx_msgaddr;   /* message buffer (I/O addr) */
+        DECLARE_PCI_UNMAP_ADDR   (rx_msgunmap); /* for dma_unmap_single() */
+        struct ib_recv_wr         rx_wrq;       /* receive work item... */
+        struct ib_sge             rx_sge;       /* ...and its memory */
+} kib_rx_t;
+
+#define IBLND_POSTRX_DONT_POST    0             /* don't post */
+#define IBLND_POSTRX_NO_CREDIT    1             /* post: no credits */
+#define IBLND_POSTRX_PEER_CREDIT  2             /* post: give peer back 1 credit */
+#define IBLND_POSTRX_RSRVD_CREDIT 3             /* post: give myself back 1 reserved credit */
+
+/* Transmit descriptor: a pre-mapped message buffer plus the send work
+ * request(s) and RDMA mapping state for one message in flight.
+ * Allocated in kiblnd_alloc_tx_descs(), pooled on ibn_idle_txs. */
+typedef struct kib_tx                           /* transmit message */
+{
+        struct list_head          tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
+        struct kib_conn          *tx_conn;      /* owning conn */
+        int                       tx_sending;   /* # tx callbacks outstanding */
+        int                       tx_queued;    /* queued for sending */
+        int                       tx_waiting;   /* waiting for peer */
+        int                       tx_status;    /* LNET completion status */
+        unsigned long             tx_deadline;  /* completion deadline */
+        __u64                     tx_cookie;    /* completion cookie */
+        lnet_msg_t               *tx_lntmsg[2]; /* lnet msgs to finalize on completion */
+        kib_msg_t                *tx_msg;       /* message buffer (host vaddr) */
+        __u64                     tx_msgaddr;   /* message buffer (I/O addr) */
+        DECLARE_PCI_UNMAP_ADDR   (tx_msgunmap); /* for dma_unmap_single() */
+        int                       tx_nwrq;      /* # send work items */
+#if IBLND_MAP_ON_DEMAND
+        struct ib_send_wr         tx_wrq[2];    /* send work items... */
+        struct ib_sge             tx_sge[2];    /* ...and their memory */
+        kib_rdma_desc_t           tx_rd[1];     /* rdma descriptor */
+        __u64                    *tx_pages;     /* rdma phys page addrs */
+        struct ib_pool_fmr       *tx_fmr;       /* rdma mapping (mapped if != NULL) */
+#else
+        struct ib_send_wr        *tx_wrq;       /* send work items... */
+        struct ib_sge            *tx_sge;       /* ...and their memory */
+        kib_rdma_desc_t          *tx_rd;        /* rdma descriptor */
+        int                       tx_nfrags;    /* # entries in... */
+        struct scatterlist       *tx_frags;     /* dma_map_sg descriptor */
+        int                       tx_dmadir;    /* dma direction */
+#endif        
+} kib_tx_t;
+
+/* Scratch state used only while a connection is being established
+ * (freed/ignored once the connection reaches ESTABLISHED). */
+typedef struct kib_connvars
+{
+        /* connection-in-progress variables */
+        kib_msg_t                 cv_msg;
+} kib_connvars_t;
+
+/* One RDMA CM connection to a peer.  Credit counts and the tx queues are
+ * protected by ibc_lock; ibc_state takes the IBLND_CONN_* values below
+ * and only moves forward through the life cycle. */
+typedef struct kib_conn
+{
+        struct kib_peer    *ibc_peer;           /* owning peer */
+        struct list_head    ibc_list;           /* stash on peer's conn list */
+        struct list_head    ibc_sched_list;     /* schedule for attention */
+        __u64               ibc_incarnation;    /* which instance of the peer */
+        atomic_t            ibc_refcount;       /* # users */
+        int                 ibc_state;          /* what's happening */
+        int                 ibc_nsends_posted;  /* # uncompleted sends */
+        int                 ibc_credits;        /* # credits I have */
+        int                 ibc_outstanding_credits; /* # credits to return */
+        int                 ibc_reserved_credits;/* # ACK/DONE msg credits */
+        int                 ibc_comms_error;    /* set on comms error */
+        int                 ibc_nrx:8;          /* receive buffers owned */
+        int                 ibc_scheduled:1;    /* scheduled for attention */
+        int                 ibc_ready:1;        /* CQ callback fired */
+        unsigned long       ibc_last_send;      /* time of last send */
+        struct list_head    ibc_early_rxs;      /* rxs completed before ESTABLISHED */
+        struct list_head    ibc_tx_queue;       /* sends that need a credit */
+        struct list_head    ibc_tx_queue_nocred;/* sends that don't need a credit */
+        struct list_head    ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */
+        struct list_head    ibc_active_txs;     /* active tx awaiting completion */
+        spinlock_t          ibc_lock;           /* serialise */
+        kib_rx_t           *ibc_rxs;            /* the rx descs */
+        kib_pages_t        *ibc_rx_pages;       /* premapped rx msg pages */
+
+        struct rdma_cm_id  *ibc_cmid;           /* CM id */
+        struct ib_cq       *ibc_cq;             /* completion queue */
+
+        kib_connvars_t     *ibc_connvars;       /* in-progress connection state */
+} kib_conn_t;
+
+/* ibc_state values, in connection life-cycle order */
+#define IBLND_CONN_INIT               0         /* being intialised */
+#define IBLND_CONN_ACTIVE_CONNECT     1         /* active sending req */
+#define IBLND_CONN_PASSIVE_WAIT       2         /* passive waiting for rtu */
+#define IBLND_CONN_ESTABLISHED        3         /* connection established */
+#define IBLND_CONN_CLOSING            4         /* being closed */
+#define IBLND_CONN_DISCONNECTED      5         /* disconnected */
+
+/* A remote peer: its live connections, plus the txs queued while no
+ * connection is established yet.  Lives in the global peer hash table
+ * (see kiblnd_nid2peerlist()). */
+typedef struct kib_peer
+{
+        struct list_head    ibp_list;           /* stash on global peer list */
+        lnet_nid_t          ibp_nid;            /* who's on the other end(s) */
+        lnet_ni_t          *ibp_ni;             /* LNet interface */
+        atomic_t            ibp_refcount;       /* # users */
+        struct list_head    ibp_conns;          /* all active connections */
+        struct list_head    ibp_tx_queue;       /* msgs waiting for a conn */
+        int                 ibp_connecting;     /* current active connection attempts */
+        int                 ibp_accepting;      /* current passive connection attempts */
+        int                 ibp_error;          /* errno on closing this peer */
+        cfs_time_t          ibp_last_alive;     /* when (in jiffies) I was last alive */
+} kib_peer_t;
+
+
+extern kib_data_t      kiblnd_data;
+extern kib_tunables_t  kiblnd_tunables;
+
+/* Take a reference on @conn.  The caller must already hold one
+ * (asserted: refcount > 0). */
+#define kiblnd_conn_addref(conn)                                \
+do {                                                            \
+        CDEBUG(D_NET, "conn[%p] (%d)++\n",                      \
+               (conn), atomic_read(&(conn)->ibc_refcount));     \
+        LASSERT(atomic_read(&(conn)->ibc_refcount) > 0);        \
+        atomic_inc(&(conn)->ibc_refcount);                      \
+} while (0)
+
+/* Drop a reference on @conn.  The final reference does NOT destroy the
+ * conn in the caller's context: it queues the conn on kib_connd_zombies
+ * and wakes the connd thread to do the teardown. */
+#define kiblnd_conn_decref(conn)                                              \
+do {                                                                          \
+        unsigned long   flags;                                                \
+                                                                              \
+        CDEBUG(D_NET, "conn[%p] (%d)--\n",                                    \
+               (conn), atomic_read(&(conn)->ibc_refcount));                   \
+        LASSERT(atomic_read(&(conn)->ibc_refcount) > 0);                      \
+        if (atomic_dec_and_test(&(conn)->ibc_refcount)) {                     \
+                spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);        \
+                list_add_tail(&(conn)->ibc_list,                              \
+                              &kiblnd_data.kib_connd_zombies);                \
+                wake_up(&kiblnd_data.kib_connd_waitq);                        \
+                spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);   \
+        }                                                                     \
+} while (0)
+
+/* Take a reference on @peer.  The caller must already hold one. */
+#define kiblnd_peer_addref(peer)                                \
+do {                                                            \
+        CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n",                \
+               (peer), libcfs_nid2str((peer)->ibp_nid),         \
+               atomic_read (&(peer)->ibp_refcount));            \
+        LASSERT(atomic_read(&(peer)->ibp_refcount) > 0);        \
+        atomic_inc(&(peer)->ibp_refcount);                      \
+} while (0)
+
+/* Drop a reference on @peer; the final reference destroys it
+ * synchronously via kiblnd_destroy_peer(). */
+#define kiblnd_peer_decref(peer)                                \
+do {                                                            \
+        CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n",                \
+               (peer), libcfs_nid2str((peer)->ibp_nid),         \
+               atomic_read (&(peer)->ibp_refcount));            \
+        LASSERT(atomic_read(&(peer)->ibp_refcount) > 0);        \
+        if (atomic_dec_and_test(&(peer)->ibp_refcount))         \
+                kiblnd_destroy_peer(peer);                      \
+} while (0)
+
+/* Map a NID to its bucket in the global peer hash table. */
+static inline struct list_head *
+kiblnd_nid2peerlist (lnet_nid_t nid)
+{
+        unsigned int bucket;
+
+        bucket = (unsigned int)nid % kiblnd_data.kib_peer_hash_size;
+        return &kiblnd_data.kib_peers[bucket];
+}
+
+/* Non-zero while @peer is linked into the global peer hash table
+ * (ibp_list is emptied when the peer is unlinked). */
+static inline int
+kiblnd_peer_active (kib_peer_t *peer)
+{
+        /* Am I in the peer hash table? */
+        return (!list_empty(&peer->ibp_list));
+}
+
+/* Pick a connection for @peer.  The caller must guarantee (and hold the
+ * lock protecting) a non-empty ibp_conns list; the head entry is used. */
+static inline kib_conn_t *
+kiblnd_get_conn_locked (kib_peer_t *peer)
+{
+        struct list_head *head = peer->ibp_conns.next;
+
+        LASSERT (!list_empty(&peer->ibp_conns));
+
+        /* just return the first connection */
+        return list_entry(head, kib_conn_t, ibc_list);
+}
+
+/* Non-zero when keepalives are configured (tunable > 0) and the
+ * connection has been send-idle for longer than the keepalive interval. */
+static inline int
+kiblnd_send_keepalive(kib_conn_t *conn)
+{
+        if (*kiblnd_tunables.kib_keepalive <= 0)
+                return 0;
+
+        return time_after(jiffies,
+                          conn->ibc_last_send +
+                          *kiblnd_tunables.kib_keepalive * HZ);
+}
+
+/* Move the connection's QP to the error state so outstanding receives
+ * complete (flushed) and nothing further is received on it. */
+static inline void
+kiblnd_abort_receives(kib_conn_t *conn)
+{
+        ib_modify_qp(conn->ibc_cmid->qp,
+                     &kiblnd_data.kib_error_qpa, IB_QP_STATE);
+}
+
+/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the
+ * lowest bits of the work request id to stash the work item type. */
+
+#define IBLND_WID_TX    0
+#define IBLND_WID_RDMA  1
+#define IBLND_WID_RX    2
+#define IBLND_WID_MASK  3UL
+
+/* Pack a descriptor pointer and an IBLND_WID_* type into a work request
+ * id; asserts the pointer is aligned enough for the type to fit in the
+ * low bits and that the type fits in the mask. */
+static inline __u64
+kiblnd_ptr2wreqid (void *ptr, int type)
+{
+        unsigned long lptr = (unsigned long)ptr;
+
+        LASSERT ((lptr & IBLND_WID_MASK) == 0);
+        LASSERT ((type & ~IBLND_WID_MASK) == 0);
+        return (__u64)(lptr | type);
+}
+
+/* Recover the descriptor pointer stashed in a work request id. */
+static inline void *
+kiblnd_wreqid2ptr (__u64 wreqid)
+{
+        unsigned long lptr = (unsigned long)wreqid;
+
+        return (void *)(lptr & ~IBLND_WID_MASK);
+}
+
+/* Recover the IBLND_WID_* work item type from a work request id. */
+static inline int
+kiblnd_wreqid2type (__u64 wreqid)
+{
+        return (int)(wreqid & IBLND_WID_MASK);
+}
+
+/* Update the connection state; the memory barrier orders the store so
+ * other CPUs see the new state before any subsequent stores. */
+static inline void
+kiblnd_set_conn_state (kib_conn_t *conn, int state)
+{
+        conn->ibc_state = state;
+        mb();
+}
+
+#if IBLND_MAP_ON_DEMAND
+/* Total byte count described by an RDMA descriptor (FMR case: recorded
+ * directly in the descriptor). */
+static inline int
+kiblnd_rd_size (kib_rdma_desc_t *rd)
+{
+        return rd->rd_nob;
+}
+#else
+/* Total byte count described by an RDMA descriptor: the sum of its
+ * fragment sizes. */
+static inline int
+kiblnd_rd_size (kib_rdma_desc_t *rd)
+{
+        int total = 0;
+        int frag;
+
+        for (frag = 0; frag < rd->rd_nfrags; frag++)
+                total += rd->rd_frags[frag].rf_nob;
+
+        return total;
+}
+#endif
+
+/* LND interface entry points */
+int  kiblnd_startup (lnet_ni_t *ni);
+void kiblnd_shutdown (lnet_ni_t *ni);
+int  kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg);
+
+/* module tunables */
+int  kiblnd_tunables_init(void);
+void kiblnd_tunables_fini(void);
+
+/* daemon threads */
+int  kiblnd_connd (void *arg);
+int  kiblnd_scheduler(void *arg);
+int  kiblnd_thread_start (int (*fn)(void *arg), void *arg);
+
+/* message page buffers */
+int  kiblnd_alloc_pages (kib_pages_t **pp, int npages);
+void kiblnd_free_pages (kib_pages_t *p);
+
+/* RDMA CM event handling */
+int  kiblnd_cm_callback(struct rdma_cm_id *cmid,
+                        struct rdma_cm_event *event);
+
+/* peer management */
+int  kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid);
+void kiblnd_destroy_peer (kib_peer_t *peer);
+void kiblnd_destroy_dev (kib_dev_t *dev);
+void kiblnd_unlink_peer_locked (kib_peer_t *peer);
+void kiblnd_peer_alive (kib_peer_t *peer);
+kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid);
+void kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error);
+int  kiblnd_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation);
+
+/* connection management */
+void kiblnd_connreq_done(kib_conn_t *conn, int status);
+kib_conn_t *kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid,
+                                int state);
+void kiblnd_destroy_conn (kib_conn_t *conn);
+void kiblnd_close_conn (kib_conn_t *conn, int error);
+void kiblnd_close_conn_locked (kib_conn_t *conn, int error);
+
+/* transmit path */
+int  kiblnd_init_rdma (lnet_ni_t *ni, kib_tx_t *tx, int type,
+                       int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie);
+
+void kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn);
+void kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn);
+void kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob);
+void kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int status);
+void kiblnd_check_sends (kib_conn_t *conn);
+
+/* completion queue / event callbacks */
+void kiblnd_qp_event(struct ib_event *event, void *arg);
+void kiblnd_cq_event(struct ib_event *event, void *arg);
+void kiblnd_cq_completion(struct ib_cq *cq, void *arg);
+
+/* wire message packing and receive posting */
+void kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob);
+void kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg,
+                      int credits, lnet_nid_t dstnid, __u64 dststamp);
+int  kiblnd_unpack_msg(kib_msg_t *msg, int nob);
+int  kiblnd_post_rx (kib_rx_t *rx, int credit);
+
+/* LNet send/receive entry points */
+int  kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int  kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+                 unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+                 unsigned int offset, unsigned int mlen, unsigned int rlen);
+
+
+
diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c
new file mode 100644 (file)
index 0000000..3e5756d
--- /dev/null
@@ -0,0 +1,3159 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2006 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "o2iblnd.h"
+
+/* Human-readable name for an IBLND wire message type (logging only). */
+char *
+kiblnd_msgtype2str(int type)
+{
+        switch (type) {
+        case IBLND_MSG_CONNREQ:   return "CONNREQ";
+        case IBLND_MSG_CONNACK:   return "CONNACK";
+        case IBLND_MSG_NOOP:      return "NOOP";
+        case IBLND_MSG_IMMEDIATE: return "IMMEDIATE";
+        case IBLND_MSG_PUT_REQ:   return "PUT_REQ";
+        case IBLND_MSG_PUT_NAK:   return "PUT_NAK";
+        case IBLND_MSG_PUT_ACK:   return "PUT_ACK";
+        case IBLND_MSG_PUT_DONE:  return "PUT_DONE";
+        case IBLND_MSG_GET_REQ:   return "GET_REQ";
+        case IBLND_MSG_GET_DONE:  return "GET_DONE";
+        default:                  return "???";
+        }
+}
+
+/* Tear down a finished tx: release its rdma mapping, drop its connection
+ * ref, return it to the net's idle-tx pool and finalise any LNet messages
+ * attached to it.  The tx must no longer be queued, sending, or waiting
+ * for a peer reply (asserted below). */
+void
+kiblnd_tx_done (lnet_ni_t *ni, kib_tx_t *tx)
+{
+        lnet_msg_t *lntmsg[2];
+        kib_net_t  *net = ni->ni_data;
+        int         rc;
+        int         i;
+
+        LASSERT (net != NULL);
+        LASSERT (!in_interrupt());
+        LASSERT (!tx->tx_queued);               /* mustn't be queued for sending */
+        LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting sent callback */
+        LASSERT (!tx->tx_waiting);              /* mustn't be awaiting peer response */
+
+#if IBLND_MAP_ON_DEMAND
+        /* unmap the FMR; on error also flush the whole pool */
+        if (tx->tx_fmr != NULL) {
+                rc = ib_fmr_pool_unmap(tx->tx_fmr);
+                LASSERT (rc == 0);
+
+                if (tx->tx_status != 0) {
+                        rc = ib_flush_fmr_pool(net->ibn_fmrpool);
+                        LASSERT (rc == 0);
+                }
+
+                tx->tx_fmr = NULL;
+        }
+#else
+        /* undo the dma_map_sg() done when the rdma was set up */
+        if (tx->tx_nfrags != 0) {
+                dma_unmap_sg(net->ibn_dev->ibd_cmid->device->dma_device,
+                             tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+                tx->tx_nfrags = 0;
+        }
+#endif
+        /* tx may have up to 2 lnet msgs to finalise */
+        lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
+        lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
+        rc = tx->tx_status;
+
+        if (tx->tx_conn != NULL) {
+                LASSERT (ni == tx->tx_conn->ibc_peer->ibp_ni);
+
+                kiblnd_conn_decref(tx->tx_conn);
+                tx->tx_conn = NULL;
+        }
+
+        /* reset for reuse before returning to the idle pool */
+        tx->tx_nwrq = 0;
+        tx->tx_status = 0;
+
+        spin_lock(&net->ibn_tx_lock);
+
+        list_add(&tx->tx_list, &net->ibn_idle_txs);
+
+        spin_unlock(&net->ibn_tx_lock);
+
+        /* delay finalize until my descs have been freed */
+        for (i = 0; i < 2; i++) {
+                if (lntmsg[i] == NULL)
+                        continue;
+
+                lnet_finalize(ni, lntmsg[i], rc);
+        }
+}
+
+/* Complete every tx on @txlist with @status, returning each one to the
+ * idle pool via kiblnd_tx_done(). */
+void
+kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int status)
+{
+        struct list_head *next;
+        kib_tx_t         *tx;
+
+        while (!list_empty(txlist)) {
+                next = txlist->next;
+                tx = list_entry(next, kib_tx_t, tx_list);
+
+                list_del(&tx->tx_list);
+                /* complete immediately with the given status */
+                tx->tx_waiting = 0;
+                tx->tx_status = status;
+                kiblnd_tx_done(ni, tx);
+        }
+}
+
+/* Take a tx off the net's idle list, stamping it with a fresh completion
+ * cookie.  Returns NULL if the pool is empty.  The returned tx must be
+ * fully reset (asserted below). */
+kib_tx_t *
+kiblnd_get_idle_tx (lnet_ni_t *ni)
+{
+        kib_net_t     *net = ni->ni_data;
+        kib_tx_t      *tx;
+
+        LASSERT (net != NULL);
+
+        spin_lock(&net->ibn_tx_lock);
+
+        if (list_empty(&net->ibn_idle_txs)) {
+                spin_unlock(&net->ibn_tx_lock);
+                return NULL;
+        }
+
+        tx = list_entry(net->ibn_idle_txs.next, kib_tx_t, tx_list);
+        list_del(&tx->tx_list);
+
+        /* Allocate a new completion cookie.  It might not be needed,
+         * but we've got a lock right now and we're unlikely to
+         * wrap... */
+        tx->tx_cookie = kiblnd_data.kib_next_tx_cookie++;
+
+        spin_unlock(&net->ibn_tx_lock);
+
+        /* a tx on the idle list must be completely reset */
+        LASSERT (tx->tx_nwrq == 0);
+        LASSERT (!tx->tx_queued);
+        LASSERT (tx->tx_sending == 0);
+        LASSERT (!tx->tx_waiting);
+        LASSERT (tx->tx_status == 0);
+        LASSERT (tx->tx_conn == NULL);
+        LASSERT (tx->tx_lntmsg[0] == NULL);
+        LASSERT (tx->tx_lntmsg[1] == NULL);
+#if IBLND_MAP_ON_DEMAND
+        LASSERT (tx->tx_fmr == NULL);
+#else
+        LASSERT (tx->tx_nfrags == 0);
+#endif
+
+        return tx;
+}
+
+/* Give up an rx buffer that will not be reposted: decrement the
+ * connection's count of owned receive buffers and drop the conn
+ * reference the rx holds. */
+void
+kiblnd_drop_rx (kib_rx_t *rx)
+{
+        kib_conn_t         *conn = rx->rx_conn;
+        unsigned long       flags;
+        
+        spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags);
+        LASSERT (conn->ibc_nrx > 0);
+        conn->ibc_nrx--;
+        spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, flags);
+
+        kiblnd_conn_decref(conn);
+}
+
+/* Post (or repost) a receive buffer on its connection's QP.  @credit
+ * says which flow-control credit to return to the peer once posted.
+ * On an established connection a failed post closes the connection and
+ * drops the rx; during connection setup the error is returned to the
+ * caller instead. */
+int
+kiblnd_post_rx (kib_rx_t *rx, int credit)
+{
+        kib_conn_t         *conn = rx->rx_conn;
+        kib_net_t          *net = conn->ibc_peer->ibp_ni->ni_data;
+        struct ib_recv_wr  *bad_wrq;
+        int                 rc;
+
+        LASSERT (net != NULL);
+        LASSERT (!in_interrupt());
+        LASSERT (credit == IBLND_POSTRX_NO_CREDIT ||
+                 credit == IBLND_POSTRX_PEER_CREDIT ||
+                 credit == IBLND_POSTRX_RSRVD_CREDIT);
+
+        /* (re)build the receive work request */
+        rx->rx_sge.length = IBLND_MSG_SIZE;
+        rx->rx_sge.lkey = net->ibn_dev->ibd_mr->lkey;
+        rx->rx_sge.addr = rx->rx_msgaddr;
+
+        rx->rx_wrq.next = NULL;
+        rx->rx_wrq.sg_list = &rx->rx_sge;
+        rx->rx_wrq.num_sge = 1;
+        rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX);
+
+        LASSERT (conn->ibc_state >= IBLND_CONN_INIT);
+        LASSERT (rx->rx_nob >= 0);              /* not posted */
+
+        if (conn->ibc_state > IBLND_CONN_ESTABLISHED) {
+                kiblnd_drop_rx(rx);             /* No more posts for this rx */
+                return 0;
+        }
+
+        rx->rx_nob = -1;                        /* flag posted */
+
+        rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq);
+
+        if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */
+                return rc;
+
+        if (rc != 0) {
+                CERROR("Can't post rx for %s: %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+                kiblnd_close_conn(conn, rc);
+                kiblnd_drop_rx(rx);             /* No more posts for this rx */
+                return rc;
+        }
+
+        if (credit == IBLND_POSTRX_NO_CREDIT)
+                return 0;
+
+        /* book the credit to return to the peer */
+        spin_lock(&conn->ibc_lock);
+        if (credit == IBLND_POSTRX_PEER_CREDIT)
+                conn->ibc_outstanding_credits++;
+        else
+                conn->ibc_reserved_credits++;
+        spin_unlock(&conn->ibc_lock);
+
+        kiblnd_check_sends(conn);
+        return 0;
+}
+
+/* Search the active-tx list for the tx with cookie @cookie that is
+ * waiting for a peer message of type @txtype.  Caller must hold
+ * conn->ibc_lock.  A cookie match with the wrong state/type is logged
+ * and the scan continues; returns NULL if nothing matches. */
+kib_tx_t *
+kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
+{
+        struct list_head *pos;
+
+        list_for_each(pos, &conn->ibc_active_txs) {
+                kib_tx_t *tx = list_entry(pos, kib_tx_t, tx_list);
+
+                /* an active tx is never queued; it must be in flight
+                 * and/or awaiting a peer reply */
+                LASSERT (!tx->tx_queued);
+                LASSERT (tx->tx_sending != 0 || tx->tx_waiting);
+
+                if (tx->tx_cookie != cookie)
+                        continue;
+
+                if (tx->tx_waiting &&
+                    tx->tx_msg->ibm_type == txtype)
+                        return tx;
+
+                CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
+                      tx->tx_waiting ? "" : "NOT ",
+                      tx->tx_msg->ibm_type, txtype);
+        }
+        return NULL;
+}
+
+/* Handle a completion message from the peer: locate the matching waiting
+ * tx, record its status (for a GET, the REPLY length), and finalise the
+ * tx if nothing else is outstanding on it.  An unmatched cookie is a
+ * protocol error and closes the connection. */
+void
+kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
+{
+        kib_tx_t    *tx;
+        lnet_ni_t   *ni = conn->ibc_peer->ibp_ni;
+        int          idle;
+
+        spin_lock(&conn->ibc_lock);
+
+        tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie);
+        if (tx == NULL) {
+                spin_unlock(&conn->ibc_lock);
+
+                CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
+                      txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                kiblnd_close_conn(conn, -EPROTO);
+                return;
+        }
+
+        if (tx->tx_status == 0) {               /* success so far */
+                if (status < 0) {               /* failed? */
+                        tx->tx_status = status;
+                } else if (txtype == IBLND_MSG_GET_REQ) {
+                        /* positive status is the REPLY length */
+                        lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status);
+                }
+        }
+
+        tx->tx_waiting = 0;
+
+        /* idle iff no send work remains outstanding either */
+        idle = !tx->tx_queued && (tx->tx_sending == 0);
+        if (idle)
+                list_del(&tx->tx_list);
+
+        spin_unlock(&conn->ibc_lock);
+
+        if (idle)
+                kiblnd_tx_done(ni, tx);
+}
+
+/* Queue a completion message of the given @type carrying @status and
+ * @cookie back to the peer on @conn.  Logs and gives up if no idle tx
+ * is available. */
+void
+kiblnd_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie)
+{
+        lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+        kib_tx_t  *tx;
+
+        tx = kiblnd_get_idle_tx(ni);
+        if (tx == NULL) {
+                CERROR("Can't get tx for completion %x for %s\n",
+                       type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                return;
+        }
+
+        tx->tx_msg->ibm_u.completion.ibcm_status = status;
+        tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
+        kiblnd_init_tx_msg(ni, tx, type, sizeof(kib_completion_msg_t));
+
+        kiblnd_queue_tx(tx, conn);
+}
+
+/* Dispatch one received message.  First banks any flow-control credits
+ * the peer returned (closing the connection on credit overflow), then
+ * handles the message by type.  On return the rx is reposted with the
+ * credit chosen by the handler, unless the handler kept the rx
+ * (post_credit == IBLND_POSTRX_DONT_POST, i.e. lnet_parse() owns it). */
+void
+kiblnd_handle_rx (kib_rx_t *rx)
+{
+        kib_msg_t    *msg = rx->rx_msg;
+        kib_conn_t   *conn = rx->rx_conn;
+        lnet_ni_t    *ni = conn->ibc_peer->ibp_ni;
+        int           credits = msg->ibm_credits;
+        kib_tx_t     *tx;
+        int           rc = 0;
+        int           rc2;
+        int           post_credit;
+
+        LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+        CDEBUG (D_NET, "Received %x[%d] from %s\n",
+                msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+
+        if (credits != 0) {
+                /* Have I received credits that will let me send? */
+                spin_lock(&conn->ibc_lock);
+
+                if (conn->ibc_credits + credits > IBLND_MSG_QUEUE_SIZE) {
+                        rc2 = conn->ibc_credits;
+                        spin_unlock(&conn->ibc_lock);
+
+                        CERROR("Bad credits from %s: %d + %d > %d\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                               rc2, credits, IBLND_MSG_QUEUE_SIZE);
+
+                        kiblnd_close_conn(conn, -EPROTO);
+                        kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT);
+                        return;
+                }
+
+                conn->ibc_credits += credits;
+
+                spin_unlock(&conn->ibc_lock);
+                kiblnd_check_sends(conn);
+        }
+
+        /* every branch below must set post_credit */
+        switch (msg->ibm_type) {
+        default:
+                CERROR("Bad IBLND message type %x from %s\n",
+                       msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                post_credit = IBLND_POSTRX_NO_CREDIT;
+                rc = -EPROTO;
+                break;
+
+        case IBLND_MSG_NOOP:
+                post_credit = IBLND_POSTRX_PEER_CREDIT;
+                break;
+
+        case IBLND_MSG_IMMEDIATE:
+                post_credit = IBLND_POSTRX_DONT_POST;
+                rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr,
+                                msg->ibm_srcnid, rx, 0);
+                if (rc < 0)                     /* repost on error */
+                        post_credit = IBLND_POSTRX_PEER_CREDIT;
+                break;
+
+        case IBLND_MSG_PUT_REQ:
+                post_credit = IBLND_POSTRX_DONT_POST;
+                rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr,
+                                msg->ibm_srcnid, rx, 1);
+                if (rc < 0)                     /* repost on error */
+                        post_credit = IBLND_POSTRX_PEER_CREDIT;
+                break;
+
+        case IBLND_MSG_PUT_NAK:
+                CWARN ("PUT_NACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+                kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ,
+                                         msg->ibm_u.completion.ibcm_status,
+                                         msg->ibm_u.completion.ibcm_cookie);
+                break;
+
+        case IBLND_MSG_PUT_ACK:
+                post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+
+                spin_lock(&conn->ibc_lock);
+                tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ,
+                                                   msg->ibm_u.putack.ibpam_src_cookie);
+                if (tx != NULL)
+                        list_del(&tx->tx_list);
+                spin_unlock(&conn->ibc_lock);
+
+                if (tx == NULL) {
+                        CERROR("Unmatched PUT_ACK from %s\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        rc = -EPROTO;
+                        break;
+                }
+
+                LASSERT (tx->tx_waiting);
+                /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
+                 * (a) I can overwrite tx_msg since my peer has received it!
+                 * (b) tx_waiting set tells tx_complete() it's not done. */
+
+                tx->tx_nwrq = 0;                /* overwrite PUT_REQ */
+
+                rc2 = kiblnd_init_rdma(ni, tx, IBLND_MSG_PUT_DONE,
+                                       kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd),
+                                       &msg->ibm_u.putack.ibpam_rd,
+                                       msg->ibm_u.putack.ibpam_dst_cookie);
+                if (rc2 < 0)
+                        CERROR("Can't setup rdma for PUT to %s: %d\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
+
+                spin_lock(&conn->ibc_lock);
+                tx->tx_waiting = 0;             /* clear waiting and queue atomically */
+                kiblnd_queue_tx_locked(tx, conn);
+                spin_unlock(&conn->ibc_lock);
+                break;
+
+        case IBLND_MSG_PUT_DONE:
+                post_credit = IBLND_POSTRX_PEER_CREDIT;
+                kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK,
+                                         msg->ibm_u.completion.ibcm_status,
+                                         msg->ibm_u.completion.ibcm_cookie);
+                break;
+
+        case IBLND_MSG_GET_REQ:
+                post_credit = IBLND_POSTRX_DONT_POST;
+                rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr,
+                                msg->ibm_srcnid, rx, 1);
+                if (rc < 0)                     /* repost on error */
+                        post_credit = IBLND_POSTRX_PEER_CREDIT;
+                break;
+
+        case IBLND_MSG_GET_DONE:
+                post_credit = IBLND_POSTRX_RSRVD_CREDIT;
+                kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ,
+                                         msg->ibm_u.completion.ibcm_status,
+                                         msg->ibm_u.completion.ibcm_cookie);
+                break;
+        }
+
+        if (rc < 0)                             /* protocol error */
+                kiblnd_close_conn(conn, rc);
+
+        if (post_credit != IBLND_POSTRX_DONT_POST)
+                kiblnd_post_rx(rx, post_credit);
+}
+
+/* Completion handler for a receive work request.  Validates and unpacks
+ * the message, updates peer liveness, then either stashes the rx on
+ * ibc_early_rxs (if the connection isn't established yet) or handles it
+ * immediately.  Any failure closes the connection and drops the rx. */
+void
+kiblnd_rx_complete (kib_rx_t *rx, int status, int nob)
+{
+        kib_msg_t    *msg = rx->rx_msg;
+        kib_conn_t   *conn = rx->rx_conn;
+        lnet_ni_t    *ni = conn->ibc_peer->ibp_ni;
+        kib_net_t    *net = ni->ni_data;
+        unsigned long flags;
+        int           rc;
+        int           err = -EIO;
+
+        LASSERT (net != NULL);
+        LASSERT (rx->rx_nob < 0);               /* was posted */
+        rx->rx_nob = 0;                         /* isn't now */
+        
+        if (conn->ibc_state > IBLND_CONN_ESTABLISHED)
+                goto ignore;
+
+        if (status != IB_WC_SUCCESS) {
+                CDEBUG(D_NETERROR, "Rx from %s failed: %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), status);
+                goto failed;
+        }
+
+        LASSERT (nob >= 0);
+        rx->rx_nob = nob;
+
+        rc = kiblnd_unpack_msg(msg, rx->rx_nob);
+        if (rc != 0) {
+                CERROR ("Error %d unpacking rx from %s\n",
+                        rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                goto failed;
+        }
+
+        /* a mismatch in nids or stamps means a stale/foreign message */
+        if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+            msg->ibm_dstnid != ni->ni_nid ||
+            msg->ibm_srcstamp != conn->ibc_incarnation ||
+            msg->ibm_dststamp != net->ibn_incarnation) {
+                CERROR ("Stale rx from %s\n",
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                err = -ESTALE;
+                goto failed;
+        }
+
+        /* set time last known alive */
+        kiblnd_peer_alive(conn->ibc_peer);
+
+        /* racing with connection establishment/teardown! */
+
+        if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+                write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+                /* must check holding global lock to eliminate race */
+                if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+                        list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
+                        write_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+                                                flags);
+                        return;
+                }
+                write_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+                                        flags);
+        }
+        kiblnd_handle_rx(rx);
+        return;
+
+ failed:
+        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+        kiblnd_close_conn(conn, err);
+ ignore:
+        kiblnd_drop_rx(rx);                     /* Don't re-post rx. */
+}
+
+struct page *
+kiblnd_kvaddr_to_page (unsigned long vaddr)
+{
+        struct page *page;
+
+        if (vaddr >= VMALLOC_START &&
+            vaddr < VMALLOC_END) {
+                page = vmalloc_to_page ((void *)vaddr);
+                LASSERT (page != NULL);
+                return page;
+        }
+#if CONFIG_HIGHMEM
+        if (vaddr >= PKMAP_BASE &&
+            vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
+                /* No highmem pages only used for bulk (kiov) I/O */
+                CERROR("find page for address in highmem\n");
+                LBUG();
+        }
+#endif
+        page = virt_to_page (vaddr);
+        LASSERT (page != NULL);
+        return page;
+}
+
+#if !IBLND_MAP_ON_DEMAND /* fixed: was IBNAL_MAP_ON_DEMAND (typo, never defined) */
+int
+kiblnd_setup_rd_iov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, 
+                    unsigned int niov, struct iovec *iov, int offset, int nob)
+                 
+{
+        struct scatterlist *sg;
+        int                 i;
+        int                 fragnob;
+        unsigned long       vaddr;
+        struct page        *page;
+        int                 page_offset;
+        kib_net_t          *net = ni->ni_data;
+
+        LASSERT (nob > 0);
+        LASSERT (niov > 0);
+        LASSERT (net != NULL);
+
+        while (offset >= iov->iov_len) {
+                offset -= iov->iov_len;
+                niov--;
+                iov++;
+                LASSERT (niov > 0);
+        }
+
+        sg = tx->tx_frags;
+        do {
+                LASSERT (niov > 0);
+
+                vaddr = ((unsigned long)iov->iov_base) + offset;
+                page_offset = vaddr & (PAGE_SIZE - 1);
+                page = kiblnd_kvaddr_to_page(vaddr);
+                if (page == NULL) {
+                        CERROR ("Can't find page\n");
+                        return -EFAULT;
+                }
+
+                fragnob = min((int)(iov->iov_len - offset), nob);
+                fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
+
+                sg->page = page;
+                sg->offset = page_offset;
+                sg->length = fragnob;
+                sg++;
+
+                if (offset + fragnob < iov->iov_len) {
+                        offset += fragnob;
+                } else {
+                        offset = 0;
+                        iov++;
+                        niov--;
+                }
+                nob -= fragnob;
+        } while (nob > 0);
+        
+        /* If rd is not tx_rd, it's going to get sent to a peer and I'm the
+         * RDMA sink */
+        tx->tx_nfrags = sg - tx->tx_frags;
+        tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+
+        rd->rd_nfrags = dma_map_sg(net->ibn_dev->ibd_cmid->device->dma_device,
+                                   tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+        rd->rd_key    = (rd != tx->tx_rd) ? 
+                        net->ibn_dev->ibd_mr->rkey : net->ibn_dev->ibd_mr->lkey;
+
+        for (i = 0; i < rd->rd_nfrags; i++) {
+                rd->rd_frags[i].rf_nob  = sg_dma_len(&tx->tx_frags[i]);
+                rd->rd_frags[i].rf_addr = sg_dma_address(&tx->tx_frags[i]);
+        }
+        
+        return 0;
+}
+
+/* Describe the kiov (page) payload at 'offset' for 'nob' bytes in
+ * 'rd': build the scatterlist in tx->tx_frags and DMA-map it.  If 'rd'
+ * is not tx->tx_rd it will be sent to the peer (we are the RDMA sink),
+ * so the rkey is published; otherwise the lkey.  Returns 0. */
+int
+kiblnd_setup_rd_kiov (lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+                      int nkiov, lnet_kiov_t *kiov, int offset, int nob)
+{
+        struct scatterlist *sg;
+        int                 i;
+        int                 fragnob;
+        kib_net_t          *net = ni->ni_data;
+
+        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+        LASSERT (nob > 0);
+        LASSERT (nkiov > 0);
+        LASSERT (net != NULL);
+
+        /* skip kiov fragments wholly consumed by 'offset' */
+        while (offset >= kiov->kiov_len) {
+                offset -= kiov->kiov_len;
+                nkiov--;
+                kiov++;
+                LASSERT (nkiov > 0);
+        }
+
+        sg = tx->tx_frags;
+        do {
+                LASSERT (nkiov > 0);
+
+                fragnob = min((int)(kiov->kiov_len - offset), nob);
+
+                memset(sg, 0, sizeof(*sg));
+                sg->page = kiov->kiov_page;
+                sg->offset = kiov->kiov_offset + offset;
+                sg->length = fragnob;
+                sg++;
+
+                offset = 0;
+                kiov++;
+                nkiov--;
+                nob -= fragnob;
+        } while (nob > 0);
+
+        /* If rd is not tx_rd, it's going to get sent to a peer and I'm the
+         * RDMA sink */
+        tx->tx_nfrags = sg - tx->tx_frags;
+        tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+
+        rd->rd_nfrags = dma_map_sg(net->ibn_dev->ibd_cmid->device->dma_device,
+                                   tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+        rd->rd_key    = (rd != tx->tx_rd) ?
+                        net->ibn_dev->ibd_mr->rkey : net->ibn_dev->ibd_mr->lkey;
+
+        /* NB dma_map_sg() may merge entries, so only the rd_nfrags it
+         * returned are valid - not tx_nfrags (matches the iov variant) */
+        for (i = 0; i < rd->rd_nfrags; i++) {
+                rd->rd_frags[i].rf_nob  = sg_dma_len(&tx->tx_frags[i]);
+                rd->rd_frags[i].rf_addr = sg_dma_address(&tx->tx_frags[i]);
+        }
+
+        return 0;
+}
+#else
+/* FMR-map the 'npages' physical pages already collected in
+ * tx->tx_pages and describe the mapping in 'rd'.  If 'rd' is not
+ * tx->tx_rd it is destined for the peer, so the rkey is published,
+ * otherwise the lkey.  Returns 0, or the PTR_ERR() of a failed
+ * mapping.  The FMR is stashed in tx->tx_fmr for later unmapping. */
+int
+kiblnd_map_tx (lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+               int npages, unsigned long page_offset, int nob)
+{
+        kib_net_t          *net = ni->ni_data;
+        struct ib_pool_fmr *fmr;
+
+        LASSERT (net != NULL);
+        LASSERT (tx->tx_fmr == NULL);
+        LASSERT (page_offset < PAGE_SIZE);
+        LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT)));
+        LASSERT (npages <= LNET_MAX_IOV);
+
+        rd->rd_addr = 0;
+
+        fmr = ib_fmr_pool_map_phys(net->ibn_fmrpool, tx->tx_pages,
+                                   npages, rd->rd_addr);
+        if (IS_ERR(fmr)) {
+                CERROR ("Can't map %d pages: %ld\n", npages, PTR_ERR(fmr));
+                return PTR_ERR(fmr);
+        }
+
+        /* If rd is not tx_rd, it's going to get sent to a peer, who will need
+         * the rkey */
+        if (rd != tx->tx_rd)
+                rd->rd_key = fmr->fmr->rkey;
+        else
+                rd->rd_key = fmr->fmr->lkey;
+        rd->rd_nob = nob;
+
+        tx->tx_fmr = fmr;
+        return 0;
+}
+
+/* Gather the single iov fragment at 'offset' into tx->tx_pages as
+ * physical page addresses and FMR-map it via kiblnd_map_tx().
+ * Multiple virtual fragments cannot be mapped: returns -EMSGSIZE if
+ * 'nob' spills past the current fragment, -EFAULT on page lookup
+ * failure, else kiblnd_map_tx()'s result. */
+int
+kiblnd_setup_rd_iov (lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+                     unsigned int niov, struct iovec *iov, int offset, int nob)
+{
+        unsigned long vaddr;
+        unsigned long page_offset;
+        struct page  *page;
+        int           npages = 0;
+        int           resid;
+        int           fragnob;
+
+        LASSERT (nob > 0);
+        LASSERT (niov > 0);
+
+        /* skip iov fragments wholly consumed by 'offset' */
+        for (; offset >= iov->iov_len; niov--, iov++) {
+                offset -= iov->iov_len;
+                LASSERT (niov > 1);
+        }
+
+        if (nob > iov->iov_len - offset) {
+                CERROR ("Can't map multiple vaddr fragments\n");
+                return (-EMSGSIZE);
+        }
+
+        vaddr = ((unsigned long)iov->iov_base) + offset;
+        page_offset = vaddr & (PAGE_SIZE - 1);
+
+        for (resid = nob; resid > 0; resid -= fragnob) {
+                LASSERT (npages < LNET_MAX_IOV);
+
+                page = kiblnd_kvaddr_to_page(vaddr);
+                if (page == NULL) {
+                        CERROR("Can't find page for %lu\n", vaddr);
+                        return -EFAULT;
+                }
+
+                tx->tx_pages[npages++] = lnet_page2phys(page);
+
+                /* advance to the next page boundary */
+                fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1));
+                vaddr += fragnob;
+        }
+
+        return kiblnd_map_tx(ni, tx, rd, npages, page_offset, nob);
+}
+
+/* Collect the physical pages of a kiov payload run into tx->tx_pages
+ * and FMR-map them via kiblnd_map_tx().  The payload must be virtually
+ * contiguous: every page after the first must start at kiov_offset 0,
+ * and every page that is not the last must be used through PAGE_SIZE,
+ * otherwise -EINVAL is returned. */
+int
+kiblnd_setup_rd_kiov (lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
+                      int nkiov, lnet_kiov_t *kiov, int offset, int nob)
+{
+        int            resid;
+        int            npages;
+        unsigned long  page_offset;
+
+        CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+        LASSERT (nob > 0);
+        LASSERT (nkiov > 0);
+        LASSERT (nkiov <= LNET_MAX_IOV);
+
+        /* skip kiov fragments wholly consumed by 'offset' */
+        while (offset >= kiov->kiov_len) {
+                offset -= kiov->kiov_len;
+                nkiov--;
+                kiov++;
+                LASSERT (nkiov > 0);
+        }
+
+        page_offset = kiov->kiov_offset + offset;
+
+        /* resid counts from the start of the first fragment, so it
+         * includes the leading 'offset' consumed by each kiov_len */
+        resid = offset + nob;
+        npages = 0;
+
+        do {
+                LASSERT (npages < LNET_MAX_IOV);
+                LASSERT (nkiov > 0);
+
+                if ((npages > 0 && kiov->kiov_offset != 0) ||
+                    (resid > kiov->kiov_len &&
+                     (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) {
+                        /* Can't have gaps */
+                        CERROR ("Can't make payload contiguous in I/O VM:"
+                                "page %d, offset %d, len %d \n",
+                                npages, kiov->kiov_offset, kiov->kiov_len);
+
+                        return -EINVAL;
+                }
+
+                tx->tx_pages[npages++] = lnet_page2phys(kiov->kiov_page);
+                resid -= kiov->kiov_len;
+                kiov++;
+                nkiov--;
+        } while (resid > 0);
+
+        return kiblnd_map_tx(ni, tx, rd, npages, page_offset, nob);
+}
+#endif
+
+/* Post as many queued transmits on 'conn' as the peer's credits and
+ * the QP send queue depth allow.  Takes and drops conn->ibc_lock
+ * internally; closes the connection if a post fails. */
+void
+kiblnd_check_sends (kib_conn_t *conn)
+{
+        kib_tx_t          *tx;
+        lnet_ni_t         *ni = conn->ibc_peer->ibp_ni;
+        int                rc;
+        int                consume_cred = 0;
+        struct ib_send_wr *bad_wrq;
+        int                done;
+
+        /* Don't send anything until after the connection is established */
+        if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+                CDEBUG(D_NET, "%s too soon\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                return;
+        }
+
+        spin_lock(&conn->ibc_lock);
+
+        LASSERT (conn->ibc_nsends_posted <=
+                 *kiblnd_tunables.kib_concurrent_sends);
+        LASSERT (conn->ibc_reserved_credits >= 0);
+
+        /* each reserved credit releases one tx blocked on the rsrvd
+         * queue onto the normal send queue */
+        while (conn->ibc_reserved_credits > 0 &&
+               !list_empty(&conn->ibc_tx_queue_rsrvd)) {
+                tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
+                                kib_tx_t, tx_list);
+                list_del(&tx->tx_list);
+                list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+                conn->ibc_reserved_credits--;
+        }
+
+        /* nothing queued but credits to return (or keepalive due)?
+         * queue a NOOP; ibc_lock must be dropped to allocate the tx */
+        if (list_empty(&conn->ibc_tx_queue) &&
+            list_empty(&conn->ibc_tx_queue_nocred) &&
+            (conn->ibc_outstanding_credits >= IBLND_CREDIT_HIGHWATER ||
+             kiblnd_send_keepalive(conn))) {
+                spin_unlock(&conn->ibc_lock);
+
+                tx = kiblnd_get_idle_tx(ni);
+                if (tx != NULL)
+                        kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0);
+
+                spin_lock(&conn->ibc_lock);
+
+                if (tx != NULL)
+                        kiblnd_queue_tx_locked(tx, conn);
+        }
+
+        /* drain the queues while credits and send depth allow;
+         * no-credit traffic (replies) takes priority */
+        for (;;) {
+                if (!list_empty (&conn->ibc_tx_queue_nocred)) {
+                        tx = list_entry (conn->ibc_tx_queue_nocred.next,
+                                         kib_tx_t, tx_list);
+                        consume_cred = 0;
+                } else if (!list_empty (&conn->ibc_tx_queue)) {
+                        tx = list_entry (conn->ibc_tx_queue.next,
+                                         kib_tx_t, tx_list);
+                        consume_cred = 1;
+                } else {
+                        /* nothing to send right now */
+                        break;
+                }
+
+                LASSERT (tx->tx_queued);
+                /* We rely on this for QP sizing */
+                LASSERT (tx->tx_nwrq > 0 &&
+                         tx->tx_nwrq <= 1 + IBLND_MAX_RDMA_FRAGS);
+
+                LASSERT (conn->ibc_outstanding_credits >= 0);
+                LASSERT (conn->ibc_outstanding_credits <= IBLND_MSG_QUEUE_SIZE);
+                LASSERT (conn->ibc_credits >= 0);
+                LASSERT (conn->ibc_credits <= IBLND_MSG_QUEUE_SIZE);
+
+                if (conn->ibc_nsends_posted ==
+                    *kiblnd_tunables.kib_concurrent_sends) {
+                        /* tx completions outstanding... */
+                        CDEBUG(D_NET, "%s: posted enough\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        break;
+                }
+
+                if (consume_cred) {
+                        if (conn->ibc_credits == 0) {   /* no credits */
+                                CDEBUG(D_NET, "%s: no credits\n",
+                                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                                break;
+                        }
+
+                        if (conn->ibc_credits == 1 &&   /* last credit reserved for */
+                            conn->ibc_outstanding_credits == 0) { /* giving back credits */
+                                CDEBUG(D_NET, "%s: not using last credit\n",
+                                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                                break;
+                        }
+                }
+
+                list_del (&tx->tx_list);
+                tx->tx_queued = 0;
+
+                /* NB don't drop ibc_lock before bumping tx_sending */
+
+                /* a NOOP is pointless if other traffic (which also
+                 * carries credits) is ready to go: recycle it */
+                if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP &&
+                    (!list_empty(&conn->ibc_tx_queue) ||
+                     !list_empty(&conn->ibc_tx_queue_nocred) ||
+                     (conn->ibc_outstanding_credits < IBLND_CREDIT_HIGHWATER &&
+                      !kiblnd_send_keepalive(conn)))) {
+                        /* redundant NOOP */
+                        spin_unlock(&conn->ibc_lock);
+                        kiblnd_tx_done(ni, tx);
+                        spin_lock(&conn->ibc_lock);
+                        CDEBUG(D_NET, "%s: redundant noop\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        continue;
+                }
+
+                /* outstanding credits are returned piggybacked on this msg */
+                kiblnd_pack_msg(ni, tx->tx_msg, conn->ibc_outstanding_credits,
+                                conn->ibc_peer->ibp_nid, conn->ibc_incarnation);
+
+                conn->ibc_outstanding_credits = 0;
+                conn->ibc_nsends_posted++;
+                if (consume_cred)
+                        conn->ibc_credits--;
+
+                /* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
+                 * PUT.  If so, it was first queued here as a PUT_REQ, sent and
+                 * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
+                 * and then re-queued here.  It's (just) possible that
+                 * tx_sending is non-zero if we've not done the tx_complete() from
+                 * the first send; hence the ++ rather than = below. */
+                tx->tx_sending++;
+
+                list_add (&tx->tx_list, &conn->ibc_active_txs);
+#if 0
+                {
+                        int i;
+
+                        for (i = 0; i < tx->tx_nwrq - 1; i++) {
+                                LASSERT (tx->tx_wrq[i].opcode == IB_WR_RDMA_WRITE);
+                                LASSERT (tx->tx_wrq[i].next == &tx->tx_wrq[i+1]);
+                                LASSERT (tx->tx_wrq[i].sg_list == &tx->tx_sge[i]);
+
+                                CDEBUG(D_WARNING, "WORK[%d]: RDMA "LPX64
+                                       " for %d k %x -> "LPX64" k %x\n", i,
+                                       tx->tx_wrq[i].sg_list->addr,
+                                       tx->tx_wrq[i].sg_list->length,
+                                       tx->tx_wrq[i].sg_list->lkey,
+                                       tx->tx_wrq[i].wr.rdma.remote_addr,
+                                       tx->tx_wrq[i].wr.rdma.rkey);
+                        }
+
+                        LASSERT (tx->tx_wrq[i].opcode == IB_WR_SEND);
+                        LASSERT (tx->tx_wrq[i].next == NULL);
+                        LASSERT (tx->tx_wrq[i].sg_list == &tx->tx_sge[i]);
+
+                        CDEBUG(D_WARNING, "WORK[%d]: SEND "LPX64" for %d k %x\n", i,
+                               tx->tx_wrq[i].sg_list->addr,
+                               tx->tx_wrq[i].sg_list->length,
+                               tx->tx_wrq[i].sg_list->lkey);
+                }
+#endif
+                /* I'm still holding ibc_lock! */
+                if (conn->ibc_state != IBLND_CONN_ESTABLISHED)
+                        rc = -ECONNABORTED;
+                else
+                        rc = ib_post_send(conn->ibc_cmid->qp, tx->tx_wrq, &bad_wrq);
+
+                conn->ibc_last_send = jiffies;
+
+                if (rc != 0) {
+                        /* NB credits are transferred in the actual
+                         * message, which can only be the last work item */
+                        conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
+                        if (consume_cred)
+                                conn->ibc_credits++;
+                        conn->ibc_nsends_posted--;
+
+                        tx->tx_status = rc;
+                        tx->tx_waiting = 0;
+                        tx->tx_sending--;
+
+                        done = (tx->tx_sending == 0);
+                        if (done)
+                                list_del (&tx->tx_list);
+
+                        spin_unlock(&conn->ibc_lock);
+
+                        if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
+                                CERROR("Error %d posting transmit to %s\n",
+                                       rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        else
+                                CDEBUG(D_NET, "Error %d posting transmit to %s\n",
+                                       rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+
+                        kiblnd_close_conn(conn, rc);
+
+                        if (done)
+                                kiblnd_tx_done(ni, tx);
+                        return;
+                }
+        }
+
+        spin_unlock(&conn->ibc_lock);
+}
+
+/* Send-completion callback for 'tx'.  Decrements its send count and,
+ * if this was the final callback with nothing else pending, finalises
+ * the tx; a failed completion closes the connection.  NB an extra conn
+ * ref is held so 'conn' survives the final kiblnd_check_sends() even
+ * if freeing 'tx' dropped the tx's ref on it. */
+void
+kiblnd_tx_complete (kib_tx_t *tx, int status)
+{
+        int           failed = (status != IB_WC_SUCCESS);
+        kib_conn_t   *conn = tx->tx_conn;
+        int           idle;
+
+        LASSERT (tx->tx_sending > 0);
+
+        if (failed) {
+                if (conn->ibc_state == IBLND_CONN_ESTABLISHED)
+                        /* NB space before "sending": the literals are
+                         * concatenated after the LPX64 format */
+                        CDEBUG(D_NETERROR, "Tx -> %s cookie "LPX64
+                               " sending %d waiting %d: failed %d\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                               tx->tx_cookie, tx->tx_sending, tx->tx_waiting,
+                               status);
+
+                kiblnd_close_conn(conn, -EIO);
+        } else {
+                kiblnd_peer_alive(conn->ibc_peer);
+        }
+
+        spin_lock(&conn->ibc_lock);
+
+        /* I could be racing with rdma completion.  Whoever makes 'tx' idle
+         * gets to free it, which also drops its ref on 'conn'. */
+
+        tx->tx_sending--;
+        conn->ibc_nsends_posted--;
+
+        if (failed) {
+                tx->tx_waiting = 0;             /* don't wait for peer */
+                tx->tx_status = -EIO;
+        }
+
+        idle = (tx->tx_sending == 0) &&         /* This is the final callback */
+               !tx->tx_waiting &&               /* Not waiting for peer */
+               !tx->tx_queued;                  /* Not re-queued (PUT_DONE) */
+        if (idle)
+                list_del(&tx->tx_list);
+
+        kiblnd_conn_addref(conn);               /* 1 ref for me.... */
+
+        spin_unlock(&conn->ibc_lock);
+
+        if (idle)
+                kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx);
+
+        kiblnd_check_sends(conn);
+
+        kiblnd_conn_decref(conn);               /* ...until here */
+}
+
+/* Append a SEND work request for a 'type' message with 'body_nob'
+ * payload bytes to 'tx': initialise the message header, point an SGE
+ * at the pre-mapped message buffer and fill in the work request. */
+void
+kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob)
+{
+        kib_net_t         *net = ni->ni_data;
+        int                msgnob = offsetof (kib_msg_t, ibm_u) + body_nob;
+        struct ib_sge     *sge;
+        struct ib_send_wr *wrq;
+
+        LASSERT (net != NULL);
+        LASSERT (tx->tx_nwrq >= 0);
+        LASSERT (tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1);
+        LASSERT (msgnob <= IBLND_MSG_SIZE);
+
+        kiblnd_init_msg(tx->tx_msg, type, body_nob);
+
+        sge = &tx->tx_sge[tx->tx_nwrq];
+        sge->addr   = tx->tx_msgaddr;
+        sge->length = msgnob;
+        sge->lkey   = net->ibn_dev->ibd_mr->lkey;
+
+        wrq = &tx->tx_wrq[tx->tx_nwrq];
+        memset(wrq, 0, sizeof(*wrq));
+        wrq->wr_id      = kiblnd_ptr2wreqid(tx, IBLND_WID_TX);
+        wrq->next       = NULL;
+        wrq->sg_list    = sge;
+        wrq->num_sge    = 1;
+        wrq->opcode     = IB_WR_SEND;
+        wrq->send_flags = IB_SEND_SIGNALED;
+
+        tx->tx_nwrq++;
+}
+
+/* Build the RDMA WRITE work request(s) that move 'nob' bytes from this
+ * tx's source descriptor (tx->tx_rd) to the peer's descriptor 'dstrd',
+ * then append a 'type' completion message carrying 'dstcookie'.
+ * Returns 'nob' on success or a negative errno; in the frag-based path
+ * a failure discards the RDMA work requests so that only the (failed)
+ * completion message is sent. */
+int
+kiblnd_init_rdma (lnet_ni_t *ni, kib_tx_t *tx, int type,
+                  int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie)
+{
+        kib_msg_t         *ibmsg = tx->tx_msg;
+        kib_rdma_desc_t   *srcrd = tx->tx_rd;
+        struct ib_sge     *sge = &tx->tx_sge[0];
+        struct ib_send_wr *wrq = &tx->tx_wrq[0];
+        int                rc = nob;
+
+#if IBLND_MAP_ON_DEMAND
+        /* FMR path: both sides are one virtually-contiguous mapping,
+         * so a single RDMA WRITE work request suffices */
+        LASSERT (!in_interrupt());
+        LASSERT (tx->tx_nwrq == 0);
+        LASSERT (type == IBLND_MSG_GET_DONE ||
+                 type == IBLND_MSG_PUT_DONE);
+
+        sge->addr = srcrd->rd_addr;
+        sge->lkey = srcrd->rd_key;
+        sge->length = nob;
+
+        wrq = &tx->tx_wrq[0];
+
+        wrq->next       = &tx->tx_wrq[1];
+        wrq->wr_id      = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
+        wrq->sg_list    = sge;
+        wrq->num_sge    = 1;
+        wrq->opcode     = IB_WR_RDMA_WRITE;
+        wrq->send_flags = 0;
+
+        wrq->wr.rdma.remote_addr = dstrd->rd_addr;
+        wrq->wr.rdma.rkey        = dstrd->rd_key;
+
+        tx->tx_nwrq = 1;
+#else
+        /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
+        int              resid = nob;
+        kib_rdma_frag_t *srcfrag;
+        int              srcidx;
+        kib_rdma_frag_t *dstfrag;
+        int              dstidx;
+        int              wrknob;
+
+        LASSERT (!in_interrupt());
+        LASSERT (tx->tx_nwrq == 0);
+        LASSERT (type == IBLND_MSG_GET_DONE ||
+                 type == IBLND_MSG_PUT_DONE);
+
+        srcidx = dstidx = 0;
+        srcfrag = &srcrd->rd_frags[0];
+        dstfrag = &dstrd->rd_frags[0];
+
+        /* emit one work request per (src frag x dst frag) overlap,
+         * advancing whichever side the request exhausts */
+        while (resid > 0) {
+                if (srcidx >= srcrd->rd_nfrags) {
+                        CERROR("Src buffer exhausted: %d frags\n", srcidx);
+                        rc = -EPROTO;
+                        break;
+                }
+
+                if (dstidx == dstrd->rd_nfrags) {
+                        CERROR("Dst buffer exhausted: %d frags\n", dstidx);
+                        rc = -EPROTO;
+                        break;
+                }
+
+                if (tx->tx_nwrq == IBLND_MAX_RDMA_FRAGS) {
+                        CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n",
+                               srcidx, srcrd->rd_nfrags,
+                               dstidx, dstrd->rd_nfrags);
+                        rc = -EMSGSIZE;
+                        break;
+                }
+
+                wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid);
+
+                sge = &tx->tx_sge[tx->tx_nwrq];
+                sge->addr   = srcfrag->rf_addr;
+                sge->length = wrknob;
+                sge->lkey   = srcrd->rd_key;
+
+                wrq = &tx->tx_wrq[tx->tx_nwrq];
+
+                /* 'next' points at the following wrq; the completion
+                 * message appended below terminates the chain */
+                wrq->next       = wrq + 1;
+                wrq->wr_id      = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA);
+                wrq->sg_list    = sge;
+                wrq->num_sge    = 1;
+                wrq->opcode     = IB_WR_RDMA_WRITE;
+                wrq->send_flags = 0;
+
+                wrq->wr.rdma.remote_addr = dstfrag->rf_addr;
+                wrq->wr.rdma.rkey        = dstrd->rd_key;
+
+                wrq++;
+                sge++;
+
+                resid -= wrknob;
+                if (wrknob < srcfrag->rf_nob) {
+                        srcfrag->rf_nob  -= wrknob;
+                        srcfrag->rf_addr += wrknob;
+                } else {
+                        srcfrag++;
+                        srcidx++;
+                }
+
+                if (wrknob < dstfrag->rf_nob) {
+                        dstfrag->rf_nob  -= wrknob;
+                        dstfrag->rf_addr += wrknob;
+                } else {
+                        dstfrag++;
+                        dstidx++;
+                }
+
+                tx->tx_nwrq++;
+        }
+
+        if (rc < 0)                             /* no RDMA if completing with failure */
+                tx->tx_nwrq = 0;
+#endif
+        ibmsg->ibm_u.completion.ibcm_status = rc;
+        ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
+        kiblnd_init_tx_msg(ni, tx, type, sizeof (kib_completion_msg_t));
+
+        return rc;
+}
+
+/* Append 'tx' to the send queue matching its message type and arm its
+ * timeout.  Caller must hold conn->ibc_lock; kiblnd_check_sends() does
+ * the actual posting. */
+void
+kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
+{
+        struct list_head *queue;
+
+        LASSERT (tx->tx_nwrq > 0);              /* work items set up */
+        LASSERT (!tx->tx_queued);               /* not queued for sending already */
+
+        tx->tx_queued = 1;
+        tx->tx_deadline = jiffies + (*kiblnd_tunables.kib_timeout * HZ);
+
+        if (tx->tx_conn != NULL) {
+                /* PUT_DONE first attached to conn as a PUT_REQ */
+                LASSERT (tx->tx_conn == conn);
+                LASSERT (tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE);
+        } else {
+                /* first attachment: take a conn ref for the tx */
+                kiblnd_conn_addref(conn);
+                tx->tx_conn = conn;
+                LASSERT (tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE);
+        }
+
+        switch (tx->tx_msg->ibm_type) {
+        default:
+                LBUG();
+
+        case IBLND_MSG_PUT_REQ:
+        case IBLND_MSG_GET_REQ:
+                /* expects a reply: waits for a reserved credit */
+                queue = &conn->ibc_tx_queue_rsrvd;
+                break;
+
+        case IBLND_MSG_PUT_NAK:
+        case IBLND_MSG_PUT_ACK:
+        case IBLND_MSG_PUT_DONE:
+        case IBLND_MSG_GET_DONE:
+                /* replies consume no credit */
+                queue = &conn->ibc_tx_queue_nocred;
+                break;
+
+        case IBLND_MSG_NOOP:
+        case IBLND_MSG_IMMEDIATE:
+                queue = &conn->ibc_tx_queue;
+                break;
+        }
+
+        list_add_tail(&tx->tx_list, queue);
+}
+
+/* Queue 'tx' on 'conn' and kick the sender. */
+void
+kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+{
+        spin_lock(&conn->ibc_lock);
+        kiblnd_queue_tx_locked(tx, conn);
+        spin_unlock(&conn->ibc_lock);
+
+        /* post anything now sendable (retakes ibc_lock itself) */
+        kiblnd_check_sends(conn);
+}
+
+/* Start an active connection to 'peer': create a CM ID and kick off
+ * address resolution; kiblnd_cm_callback() continues the handshake.
+ * Any failure is reported via kiblnd_peer_connect_failed(). */
+void
+kiblnd_connect_peer (kib_peer_t *peer)
+{
+        struct rdma_cm_id *cmid;
+        struct sockaddr_in sockaddr;
+        int                rc;
+
+        LASSERT (peer->ibp_connecting > 0);
+
+        cmid = rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP);
+        if (IS_ERR(cmid)) {
+                CERROR("Can't create CMID for %s: %ld\n",
+                       libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid));
+                rc = PTR_ERR(cmid);
+                goto failed;
+        }
+
+        memset(&sockaddr, 0, sizeof(sockaddr));
+        sockaddr.sin_family = AF_INET;
+        sockaddr.sin_port = htons(*kiblnd_tunables.kib_service);
+        sockaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid));
+
+        kiblnd_peer_addref(peer);               /* cmid's ref */
+
+        rc = rdma_resolve_addr(cmid, NULL, (struct sockaddr *)&sockaddr,
+                               *kiblnd_tunables.kib_timeout * 1000);
+        if (rc == 0)
+                return;
+
+        /* Can't initiate address resolution:  */
+        CERROR("Can't resolve addr for %s: %d\n",
+               libcfs_nid2str(peer->ibp_nid), rc);
+
+        /* drop the cmid's peer ref before destroying it, then fall
+         * through to tell the peer the connect attempt failed */
+        kiblnd_peer_decref(peer);               /* cmid's ref */
+        rdma_destroy_id(cmid);
+ failed:
+        kiblnd_peer_connect_failed(peer, 1, rc);
+}
+
+/* Launch 'tx' towards 'nid': queue it on an established connection if
+ * one exists, park it on a connecting peer's queue otherwise, or
+ * create a brand-new peer and initiate a connection.  Once here we
+ * are committed to send, so any failure completes the tx with error. */
+void
+kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid)
+{
+        kib_peer_t        *peer;
+        kib_peer_t        *peer2;
+        kib_conn_t        *conn;
+        rwlock_t          *g_lock = &kiblnd_data.kib_global_lock;
+        unsigned long      flags;
+        int                rc;
+
+        /* If I get here, I've committed to send, so I complete the tx with
+         * failure on any problems */
+
+        LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
+        LASSERT (tx->tx_nwrq > 0);              /* work items have been set up */
+
+        /* First time, just use a read lock since I expect to find my peer
+         * connected */
+        read_lock_irqsave(g_lock, flags);
+
+        peer = kiblnd_find_peer_locked(nid);
+        if (peer != NULL && !list_empty(&peer->ibp_conns)) {
+                /* Found a peer with an established connection */
+                conn = kiblnd_get_conn_locked(peer);
+                kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+                read_unlock_irqrestore(g_lock, flags);
+
+                kiblnd_queue_tx(tx, conn);
+                kiblnd_conn_decref(conn); /* ...to here */
+                return;
+        }
+
+        /* NB irqs stay disabled across the read->write transition;
+         * 'flags' saved above is restored by the final irqrestore */
+        read_unlock(g_lock);
+        /* Re-try with a write lock */
+        write_lock(g_lock);
+
+        /* the peer may have appeared/connected while unlocked: recheck */
+        peer = kiblnd_find_peer_locked(nid);
+        if (peer != NULL) {
+                if (list_empty(&peer->ibp_conns)) {
+                        /* found a peer, but it's still connecting... */
+                        LASSERT (peer->ibp_connecting != 0 ||
+                                 peer->ibp_accepting != 0);
+                        list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
+                        write_unlock_irqrestore(g_lock, flags);
+                } else {
+                        conn = kiblnd_get_conn_locked(peer);
+                        kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+                        write_unlock_irqrestore(g_lock, flags);
+
+                        kiblnd_queue_tx(tx, conn);
+                        kiblnd_conn_decref(conn); /* ...to here */
+                }
+                return;
+        }
+
+        write_unlock_irqrestore(g_lock, flags);
+
+        /* Allocate a peer ready to add to the peer table and retry */
+        rc = kiblnd_create_peer(ni, &peer, nid);
+        if (rc != 0) {
+                CERROR("Can't create peer %s\n", libcfs_nid2str(nid));
+                tx->tx_status = -EHOSTUNREACH;
+                tx->tx_waiting = 0;
+                kiblnd_tx_done(ni, tx);
+                return;
+        }
+
+        write_lock_irqsave(g_lock, flags);
+
+        /* someone else may have added the peer while unlocked */
+        peer2 = kiblnd_find_peer_locked(nid);
+        if (peer2 != NULL) {
+                if (list_empty(&peer2->ibp_conns)) {
+                        /* found a peer, but it's still connecting... */
+                        LASSERT (peer2->ibp_connecting != 0 ||
+                                 peer2->ibp_accepting != 0);
+                        list_add_tail (&tx->tx_list, &peer2->ibp_tx_queue);
+                        write_unlock_irqrestore(g_lock, flags);
+                } else {
+                        conn = kiblnd_get_conn_locked(peer2);
+                        kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+                        write_unlock_irqrestore(g_lock, flags);
+
+                        kiblnd_queue_tx(tx, conn);
+                        kiblnd_conn_decref(conn); /* ...to here */
+                }
+
+                /* drop the peer we created but didn't use */
+                kiblnd_peer_decref(peer);
+                return;
+        }
+
+        /* Brand new peer */
+        LASSERT (peer->ibp_connecting == 0);
+        peer->ibp_connecting = 1;
+
+        /* the tx waits on the peer until the connection is up */
+        list_add_tail(&tx->tx_list, &peer->ibp_tx_queue);
+
+        kiblnd_peer_addref(peer);
+        list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+
+        write_unlock_irqrestore(g_lock, flags);
+
+        kiblnd_connect_peer(peer);
+        kiblnd_peer_decref(peer);
+}
+
+int
+kiblnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+        lnet_hdr_t       *hdr = &lntmsg->msg_hdr;
+        int               type = lntmsg->msg_type;
+        lnet_process_id_t target = lntmsg->msg_target;
+        int               target_is_router = lntmsg->msg_target_is_router;
+        int               routing = lntmsg->msg_routing;
+        unsigned int      payload_niov = lntmsg->msg_niov;
+        struct iovec     *payload_iov = lntmsg->msg_iov;
+        lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
+        unsigned int      payload_offset = lntmsg->msg_offset;
+        unsigned int      payload_nob = lntmsg->msg_len;
+        kib_msg_t        *ibmsg;
+        kib_tx_t         *tx;
+        int               nob;
+        int               rc;
+
+        /* NB 'private' is different depending on what we're sending.... */
+
+        CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+               payload_nob, payload_niov, libcfs_id2str(target));
+
+        LASSERT (payload_nob == 0 || payload_niov > 0);
+        LASSERT (payload_niov <= LNET_MAX_IOV);
+
+        /* Thread context */
+        LASSERT (!in_interrupt());
+        /* payload is either all vaddrs or all pages */
+        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+
+        switch (type) {
+        default:
+                LBUG();
+                return (-EIO);
+
+        case LNET_MSG_ACK:
+                LASSERT (payload_nob == 0);
+                break;
+
+        case LNET_MSG_GET:
+                if (routing || target_is_router)
+                        break;                  /* send IMMEDIATE */
+
+                /* is the REPLY message too small for RDMA? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
+                if (nob <= IBLND_MSG_SIZE)
+                        break;                  /* send IMMEDIATE */
+
+                tx = kiblnd_get_idle_tx(ni);
+                if (tx == NULL) {
+                        CERROR("Can allocate txd for GET to %s: \n",
+                               libcfs_nid2str(target.nid));
+                        return -ENOMEM;
+                }
+
+                ibmsg = tx->tx_msg;
+                ibmsg->ibm_u.get.ibgm_hdr = *hdr;
+                ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
+
+                if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+                        rc = kiblnd_setup_rd_iov(ni, tx,
+                                                 &ibmsg->ibm_u.get.ibgm_rd,
+                                                 lntmsg->msg_md->md_niov,
+                                                 lntmsg->msg_md->md_iov.iov,
+                                                 0, lntmsg->msg_md->md_length);
+                else
+                        rc = kiblnd_setup_rd_kiov(ni, tx,
+                                                  &ibmsg->ibm_u.get.ibgm_rd,
+                                                  lntmsg->msg_md->md_niov,
+                                                  lntmsg->msg_md->md_iov.kiov,
+                                                  0, lntmsg->msg_md->md_length);
+                if (rc != 0) {
+                        CERROR("Can't setup GET sink for %s: %d\n",
+                               libcfs_nid2str(target.nid), rc);
+                        kiblnd_tx_done(ni, tx);
+                        return -EIO;
+                }
+#if IBLND_MAP_ON_DEMAND
+                nob = sizeof(kib_get_msg_t);
+#else
+                nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[tx->tx_nfrags]);
+#endif
+                kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
+
+                tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
+                if (tx->tx_lntmsg[1] == NULL) {
+                        CERROR("Can't create reply for GET -> %s\n",
+                               libcfs_nid2str(target.nid));
+                        kiblnd_tx_done(ni, tx);
+                        return -EIO;
+                }
+
+                tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg[0,1] on completion */
+                tx->tx_waiting = 1;             /* waiting for GET_DONE */
+                kiblnd_launch_tx(ni, tx, target.nid);
+                return 0;
+
+        case LNET_MSG_REPLY:
+        case LNET_MSG_PUT:
+                /* Is the payload small enough not to need RDMA? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                if (nob <= IBLND_MSG_SIZE)
+                        break;                  /* send IMMEDIATE */
+
+                tx = kiblnd_get_idle_tx(ni);
+                if (tx == NULL) {
+                        CERROR("Can't allocate %s txd for %s\n",
+                               type == LNET_MSG_PUT ? "PUT" : "REPLY",
+                               libcfs_nid2str(target.nid));
+                        return -ENOMEM;
+                }
+
+                if (payload_kiov == NULL)
+                        rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
+                                                 payload_niov, payload_iov,
+                                                 payload_offset, payload_nob);
+                else
+                        rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+                                                  payload_niov, payload_kiov,
+                                                  payload_offset, payload_nob);
+                if (rc != 0) {
+                        CERROR("Can't setup PUT src for %s: %d\n",
+                               libcfs_nid2str(target.nid), rc);
+                        kiblnd_tx_done(ni, tx);
+                        return -EIO;
+                }
+
+                ibmsg = tx->tx_msg;
+                ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
+                ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
+                kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
+
+                tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
+                tx->tx_waiting = 1;             /* waiting for PUT_{ACK,NAK} */
+                kiblnd_launch_tx(ni, tx, target.nid);
+                return 0;
+        }
+
+        /* send IMMEDIATE */
+
+        LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
+                 <= IBLND_MSG_SIZE);
+
+        tx = kiblnd_get_idle_tx(ni);
+        if (tx == NULL) {
+                CERROR ("Can't send %d to %s: tx descs exhausted\n",
+                        type, libcfs_nid2str(target.nid));
+                return -ENOMEM;
+        }
+
+        ibmsg = tx->tx_msg;
+        ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
+
+        if (payload_kiov != NULL)
+                lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg,
+                                    offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                    payload_niov, payload_kiov,
+                                    payload_offset, payload_nob);
+        else
+                lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg,
+                                   offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                   payload_niov, payload_iov,
+                                   payload_offset, payload_nob);
+
+        nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
+        kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob);
+
+        tx->tx_lntmsg[0] = lntmsg;              /* finalise lntmsg on completion */
+        kiblnd_launch_tx(ni, tx, target.nid);
+        return 0;
+}
+
+/* Reply to the GET request in 'rx' by RDMAing lntmsg's payload back to
+ * the peer as GET_DONE.  On any failure both the tx (if allocated) and
+ * 'lntmsg' are finalised with -EIO. */
+void
+kiblnd_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg)
+{
+        lnet_process_id_t target = lntmsg->msg_target;
+        unsigned int      niov = lntmsg->msg_niov;
+        struct iovec     *iov = lntmsg->msg_iov;
+        lnet_kiov_t      *kiov = lntmsg->msg_kiov;
+        unsigned int      offset = lntmsg->msg_offset;
+        unsigned int      nob = lntmsg->msg_len;
+        kib_tx_t         *tx;
+        int               rc;
+
+        tx = kiblnd_get_idle_tx(ni);
+        if (tx == NULL) {
+                CERROR("Can't get tx for REPLY to %s\n",
+                       libcfs_nid2str(target.nid));
+                goto failed_0;
+        }
+
+        /* zero-length reply needs no source descriptor */
+        if (nob == 0)
+                rc = 0;
+        else if (kiov == NULL)
+                rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd,
+                                         niov, iov, offset, nob);
+        else
+                rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
+                                          niov, kiov, offset, nob);
+
+        if (rc != 0) {
+                CERROR("Can't setup GET src for %s: %d\n",
+                       libcfs_nid2str(target.nid), rc);
+                goto failed_1;
+        }
+
+        rc = kiblnd_init_rdma(ni, tx, IBLND_MSG_GET_DONE, nob,
+                              &rx->rx_msg->ibm_u.get.ibgm_rd,
+                              rx->rx_msg->ibm_u.get.ibgm_cookie);
+        if (rc < 0) {
+                CERROR("Can't setup rdma for GET from %s: %d\n",
+                       libcfs_nid2str(target.nid), rc);
+                goto failed_1;
+        }
+        
+        if (nob == 0) {
+                /* No RDMA: local completion may happen now! */
+                lnet_finalize(ni, lntmsg, 0);
+        } else {
+                /* RDMA: lnet_finalize(lntmsg) when it
+                 * completes */
+                tx->tx_lntmsg[0] = lntmsg;
+        }
+
+        kiblnd_queue_tx(tx, rx->rx_conn);
+        return;
+
+ failed_1:
+        kiblnd_tx_done(ni, tx);
+ failed_0:
+        lnet_finalize(ni, lntmsg, -EIO);
+}
+
+/* Receive payload for 'lntmsg' from the message in 'private' (a kib_rx_t):
+ * copy IMMEDIATE data inline, set up the sink and reply PUT_ACK for a
+ * PUT_REQ, or RDMA the matched payload back for an optimized GET_REQ.
+ * The rx buffer is always reposted before returning. */
+int
+kiblnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+             unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+             unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+        kib_rx_t    *rx = private;
+        kib_msg_t   *rxmsg = rx->rx_msg;
+        kib_conn_t  *conn = rx->rx_conn;
+        kib_tx_t    *tx;
+        kib_msg_t   *txmsg;
+        int          nob;
+        int          post_credit = IBLND_POSTRX_PEER_CREDIT;
+        int          rc = 0;
+
+        LASSERT (mlen <= rlen);
+        LASSERT (!in_interrupt());
+        /* Either all pages or all vaddrs */
+        LASSERT (!(kiov != NULL && iov != NULL));
+
+        switch (rxmsg->ibm_type) {
+        default:
+                LBUG();
+
+        case IBLND_MSG_IMMEDIATE:
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+                if (nob > rx->rx_nob) {
+                        CERROR ("Immediate message from %s too big: %d(%d)\n",
+                                libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
+                                nob, rx->rx_nob);
+                        rc = -EPROTO;
+                        break;
+                }
+
+                /* copy the inline payload into the caller's buffers */
+                if (kiov != NULL)
+                        lnet_copy_flat2kiov(niov, kiov, offset,
+                                            IBLND_MSG_SIZE, rxmsg,
+                                            offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                            mlen);
+                else
+                        lnet_copy_flat2iov(niov, iov, offset,
+                                           IBLND_MSG_SIZE, rxmsg,
+                                           offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                           mlen);
+                lnet_finalize (ni, lntmsg, 0);
+                break;
+
+        case IBLND_MSG_PUT_REQ:
+                if (mlen == 0) {
+                        /* nothing wanted: NAK so the peer skips the RDMA */
+                        lnet_finalize(ni, lntmsg, 0);
+                        kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0,
+                                               rxmsg->ibm_u.putreq.ibprm_cookie);
+                        break;
+                }
+
+                tx = kiblnd_get_idle_tx(ni);
+                if (tx == NULL) {
+                        CERROR("Can't allocate tx for %s\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        /* Not replying will break the connection */
+                        rc = -ENOMEM;
+                        break;
+                }
+
+                txmsg = tx->tx_msg;
+                if (kiov == NULL)
+                        rc = kiblnd_setup_rd_iov(ni, tx,
+                                                 &txmsg->ibm_u.putack.ibpam_rd,
+                                                 niov, iov, offset, mlen);
+                else
+                        rc = kiblnd_setup_rd_kiov(ni, tx, 
+                                                  &txmsg->ibm_u.putack.ibpam_rd,
+                                                  niov, kiov, offset, mlen);
+                if (rc != 0) {
+                        CERROR("Can't setup PUT sink for %s: %d\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+                        kiblnd_tx_done(ni, tx);
+                        /* tell peer it's over */
+                        kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc,
+                                               rxmsg->ibm_u.putreq.ibprm_cookie);
+                        break;
+                }
+
+                txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+                txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
+#if IBLND_MAP_ON_DEMAND
+                nob = sizeof(kib_putack_msg_t);
+#else
+                nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[tx->tx_nfrags]);
+#endif
+                kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob);
+
+                tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
+                tx->tx_waiting = 1;             /* waiting for PUT_DONE */
+                kiblnd_queue_tx(tx, conn);
+
+                /* reposted buffer reserved for PUT_DONE */
+                post_credit = IBLND_POSTRX_NO_CREDIT;
+                break;
+
+        case IBLND_MSG_GET_REQ:
+                if (lntmsg != NULL) {
+                        /* Optimized GET; RDMA lntmsg's payload */
+                        kiblnd_reply(ni, rx, lntmsg);
+                } else {
+                        /* GET didn't match anything */
+                        kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE,
+                                               -ENODATA,
+                                               rxmsg->ibm_u.get.ibgm_cookie);
+                }
+                break;
+        }
+
+        kiblnd_post_rx(rx, post_credit);
+        return rc;
+}
+
+/* Spawn a kernel thread running fn(arg); on success bump the global
+ * thread count.  Returns 0 or the (negative) kernel_thread() error. */
+int
+kiblnd_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long    pid = kernel_thread (fn, arg, 0);
+
+        if (pid < 0)
+                return ((int)pid);
+
+        atomic_inc (&kiblnd_data.kib_nthreads);
+        return (0);
+}
+
+/* Called by each thread as it exits to decrement the global thread count. */
+void
+kiblnd_thread_fini (void)
+{
+        atomic_dec (&kiblnd_data.kib_nthreads);
+}
+
+/* Record that 'peer' has just shown signs of life. */
+void
+kiblnd_peer_alive (kib_peer_t *peer)
+{
+        /* This is racy, but everyone's only writing cfs_time_current() */
+        peer->ibp_last_alive = cfs_time_current();
+        mb();
+}
+
+/* If 'peer' has no connections, no connection attempts in progress and a
+ * pending error, consume that error and tell LNET the peer is down along
+ * with when it was last known alive. */
+void
+kiblnd_peer_notify (kib_peer_t *peer)
+{
+        time_t        last_alive = 0;
+        int           error = 0;
+        unsigned long flags;
+        
+        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+        if (list_empty(&peer->ibp_conns) &&
+            peer->ibp_accepting == 0 &&
+            peer->ibp_connecting == 0 &&
+            peer->ibp_error != 0) {
+                error = peer->ibp_error;
+                peer->ibp_error = 0;
+                
+                /* convert the jiffies timestamp into wall-clock seconds */
+                last_alive = cfs_time_current_sec() -
+                             cfs_duration_sec(cfs_time_current() -
+                                              peer->ibp_last_alive);
+        }
+        
+        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+        
+        /* notify outside the lock */
+        if (error != 0)
+                lnet_notify(peer->ibp_ni,
+                            peer->ibp_nid, 0, last_alive);
+}
+
+void
+kiblnd_close_conn_locked (kib_conn_t *conn, int error)
+{
+        /* This just does the immediate housekeeping.  'error' is zero for a
+         * normal shutdown which can happen only after the connection has been
+         * established.  If the connection is established, schedule the
+         * connection to be finished off by the connd.  Otherwise the connd is
+         * already dealing with it (either to set it up or tear it down).
+         * Caller holds kib_global_lock exclusively in irq context */
+        unsigned long     flags;
+        kib_peer_t       *peer = conn->ibc_peer;
+
+        LASSERT (error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+        /* remember the first error to hit this conn */
+        if (error != 0 && conn->ibc_comms_error == 0)
+                conn->ibc_comms_error = error;
+
+        if (conn->ibc_state != IBLND_CONN_ESTABLISHED)
+                return; /* already being handled  */
+
+        /* quiet close only when nothing is queued or in flight */
+        if (error == 0 &&
+            list_empty(&conn->ibc_tx_queue) &&
+            list_empty(&conn->ibc_tx_queue_rsrvd) &&
+            list_empty(&conn->ibc_tx_queue_nocred) &&
+            list_empty(&conn->ibc_active_txs)) {
+                CDEBUG(D_NET, "closing conn to %s\n", 
+                       libcfs_nid2str(peer->ibp_nid));
+        } else {
+                CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s\n",
+                       libcfs_nid2str(peer->ibp_nid), error,
+                       list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
+                       list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
+                       list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
+                       list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
+        }
+
+        list_del (&conn->ibc_list);
+        /* connd (see below) takes over ibc_list's ref */
+
+        if (list_empty (&peer->ibp_conns) &&    /* no more conns */
+            kiblnd_peer_active(peer)) {         /* still in peer table */
+                kiblnd_unlink_peer_locked(peer);
+
+                /* set/clear error on last conn */
+                peer->ibp_error = conn->ibc_comms_error;
+        }
+
+        kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING);
+
+        /* hand the conn to the connd for the rest of the teardown */
+        spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+
+        list_add_tail (&conn->ibc_list, &kiblnd_data.kib_connd_conns);
+        wake_up (&kiblnd_data.kib_connd_waitq);
+
+        spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+}
+
+/* Close 'conn' with 'error', taking kib_global_lock exclusively around
+ * kiblnd_close_conn_locked(). */
+void
+kiblnd_close_conn (kib_conn_t *conn, int error)
+{
+        unsigned long flags;
+
+        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+        kiblnd_close_conn_locked(conn, error);
+
+        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+}
+
+/* Process rxs that arrived before 'conn' was established.  The global
+ * lock is dropped around each kiblnd_handle_rx() call and retaken to
+ * pick the next rx off the list. */
+void
+kiblnd_handle_early_rxs(kib_conn_t *conn)
+{
+        unsigned long    flags;
+        kib_rx_t        *rx;
+
+        LASSERT (!in_interrupt());
+        LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+
+        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+        while (!list_empty(&conn->ibc_early_rxs)) {
+                rx = list_entry(conn->ibc_early_rxs.next,
+                                kib_rx_t, rx_list);
+                list_del(&rx->rx_list);
+                write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+                kiblnd_handle_rx(rx);
+
+                write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+        }
+        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+}
+
+/* Fail every tx on list 'txs' with -ECONNABORTED.  Txs with sends still
+ * outstanding are marked but left for the send-completion path to finish;
+ * the rest are completed here. */
+void
+kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs)
+{
+        LIST_HEAD           (zombies); 
+        struct list_head    *tmp;
+        struct list_head    *nxt;
+        kib_tx_t            *tx;
+
+        spin_lock(&conn->ibc_lock);
+
+        list_for_each_safe (tmp, nxt, txs) {
+                tx = list_entry (tmp, kib_tx_t, tx_list);
+
+                /* sanity-check the tx's state against the list it's on */
+                if (txs == &conn->ibc_active_txs) {
+                        LASSERT (!tx->tx_queued);
+                        LASSERT (tx->tx_waiting ||
+                                 tx->tx_sending != 0);
+                } else {
+                        LASSERT (tx->tx_queued);
+                }
+                
+                tx->tx_status = -ECONNABORTED;
+                tx->tx_queued = 0;
+                tx->tx_waiting = 0;
+                
+                /* no sends outstanding: safe to complete now */
+                if (tx->tx_sending == 0) {
+                        list_del (&tx->tx_list);
+                        list_add (&tx->tx_list, &zombies);
+                }
+        }
+
+        spin_unlock(&conn->ibc_lock);
+
+        kiblnd_txlist_done(conn->ibc_peer->ibp_ni,
+                           &zombies, -ECONNABORTED);
+}
+
+/* Final teardown of 'conn': abort receives, fail every tx not waiting on
+ * a send completion, then flush any early rxs. */
+void
+kiblnd_finalise_conn (kib_conn_t *conn)
+{
+        LASSERT (!in_interrupt());
+        LASSERT (conn->ibc_state > IBLND_CONN_INIT);
+
+        kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED);
+
+        /* abort_receives moves QP state to IB_QPS_ERR.  This is only required
+         * for connections that didn't get as far as being connected, because
+         * rdma_disconnect() does this for free. */
+        kiblnd_abort_receives(conn);
+
+        /* Complete all tx descs not waiting for sends to complete.
+         * NB we should be safe from RDMA now that the QP has changed state */
+
+        kiblnd_abort_txs(conn, &conn->ibc_tx_queue);
+        kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
+        kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred);
+        kiblnd_abort_txs(conn, &conn->ibc_active_txs);
+
+        kiblnd_handle_early_rxs(conn);
+}
+
+/* A connection attempt to 'peer' failed with 'error' ('active' is set for
+ * an active connect, clear for a passive accept).  If no other attempt is
+ * under way and no conns exist, fail all of the peer's blocked txs with
+ * -EHOSTUNREACH and notify LNET. */
+void
+kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error)
+{
+        LIST_HEAD        (zombies);
+        unsigned long     flags;
+
+        LASSERT (error != 0);
+        LASSERT (!in_interrupt());
+
+        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+        if (active) {
+                LASSERT (peer->ibp_connecting > 0);
+                peer->ibp_connecting--;
+        } else {
+                LASSERT (peer->ibp_accepting > 0);
+                peer->ibp_accepting--;
+        }
+
+        if (peer->ibp_connecting != 0 ||
+            peer->ibp_accepting != 0) {
+                /* another connection attempt under way... */
+                write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+                return;
+        }
+
+        if (list_empty(&peer->ibp_conns)) {
+                /* Take peer's blocked transmits to complete with error */
+                list_add(&zombies, &peer->ibp_tx_queue);
+                list_del_init(&peer->ibp_tx_queue);
+
+                if (kiblnd_peer_active(peer))
+                        kiblnd_unlink_peer_locked(peer);
+
+                peer->ibp_error = error;
+        } else {
+                /* Can't have blocked transmits if there are connections */
+                LASSERT (list_empty(&peer->ibp_tx_queue));
+        }
+
+        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+        kiblnd_peer_notify(peer);
+
+        if (list_empty (&zombies))
+                return;
+
+        CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n",
+                libcfs_nid2str(peer->ibp_nid));
+
+        kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH);
+}
+
+/* The connection handshake on 'conn' has completed with 'status'.  On
+ * success, mark the conn established, retire stale conns from earlier
+ * peer incarnations, queue the peer's blocked txs on it and flush early
+ * rxs; on failure tear the conn down and fail the peer's queued txs. */
+void
+kiblnd_connreq_done(kib_conn_t *conn, int status)
+{
+        struct list_head   txs;
+
+        kib_peer_t        *peer = conn->ibc_peer;
+        int                active;
+        unsigned long      flags;
+        kib_tx_t          *tx;
+
+        active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+
+        CDEBUG(D_NET,"%s: %d, %d\n", libcfs_nid2str(peer->ibp_nid), 
+               active, status);
+
+        LASSERT (!in_interrupt());
+        LASSERT ((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT &&
+                  peer->ibp_connecting > 0) ||
+                 (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT &&
+                  peer->ibp_accepting > 0));
+
+        /* connvars were only needed for the handshake */
+        LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+        conn->ibc_connvars = NULL;
+
+        if (status != 0) {
+                /* failed to establish connection */
+                kiblnd_peer_connect_failed(conn->ibc_peer, active, status);
+                kiblnd_finalise_conn(conn);
+                return;
+        }
+
+        /* connection established */
+        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+        conn->ibc_last_send = jiffies;
+        kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
+        kiblnd_peer_alive(peer);
+
+        /* Add conn to peer's list and nuke any dangling conns from a different
+         * peer instance... */
+        kiblnd_conn_addref(conn);               /* +1 ref for ibc_list */
+        list_add(&conn->ibc_list, &peer->ibp_conns);
+        if (active)
+                peer->ibp_connecting--;
+        else
+                peer->ibp_accepting--;
+
+        kiblnd_close_stale_conns_locked(conn->ibc_peer,
+                                        conn->ibc_incarnation);
+
+        if (!kiblnd_peer_active(peer) ||        /* peer has been deleted */
+            conn->ibc_comms_error != 0) {       /* error has happened already */
+
+                /* start to shut down connection */
+                kiblnd_close_conn_locked(conn, -ECONNABORTED);
+                write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+                return;
+        }
+
+        /* grab pending txs while I have the lock */
+        list_add(&txs, &peer->ibp_tx_queue);
+        list_del_init(&peer->ibp_tx_queue);
+
+        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+        /* Schedule blocked txs */
+        spin_lock (&conn->ibc_lock);
+        while (!list_empty (&txs)) {
+                tx = list_entry (txs.next, kib_tx_t, tx_list);
+                list_del (&tx->tx_list);
+
+                kiblnd_queue_tx_locked(tx, conn);
+        }
+        spin_unlock (&conn->ibc_lock);
+
+        kiblnd_check_sends(conn);
+
+        /* schedule blocked rxs */
+        kiblnd_handle_early_rxs(conn);
+}
+
+/* Reject the connection request on 'cmid', telling the peer 'why' and
+ * which protocol magic/version we speak. */
+void
+kiblnd_reject(struct rdma_cm_id *cmid, int why)
+{
+        int          rc;
+        kib_rej_t    rej = {.ibr_magic   = IBLND_MSG_MAGIC,
+                            .ibr_version = IBLND_MSG_VERSION,
+                            .ibr_why     = why};
+
+        rc = rdma_reject(cmid, &rej, sizeof(rej));
+
+        if (rc != 0)
+                CWARN("Error %d sending reject\n", rc);
+}
+
+/* Handle an incoming connection request on 'cmid' carrying connection
+ * parameters 'priv'/'priv_nob'.  Validates the request, finds or creates
+ * the peer (resolving connection races in favour of the higher NID),
+ * creates a passive conn and calls rdma_accept().  On any failure before
+ * the conn owns 'cmid', the request is rejected and -ECONNREFUSED
+ * returned; once the conn owns 'cmid' this returns 0 so the CM callback
+ * doesn't destroy it. */
+int
+kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob)
+{
+        kib_msg_t             *ackmsg;
+        kib_msg_t             *reqmsg = priv;
+        rwlock_t              *g_lock = &kiblnd_data.kib_global_lock;
+        struct rdma_conn_param cp;
+        unsigned long          flags;
+        lnet_ni_t             *ni = NULL;
+        kib_dev_t             *ibdev;
+        kib_peer_t            *peer;
+        kib_peer_t            *peer2;
+        kib_conn_t            *conn;
+        lnet_nid_t             nid;
+        int                    rc;
+        int                    rej = IBLND_REJECT_FATAL;
+
+        LASSERT (!in_interrupt());
+
+        /* cmid inherits 'context' from the corresponding listener id */
+        ibdev = (kib_dev_t *)cmid->context;
+        LASSERT (ibdev != NULL);
+
+        if (priv_nob < offsetof(kib_msg_t, ibm_type)) {
+                CERROR("Short connection request\n");
+                goto failed;
+        }
+
+        if (reqmsg->ibm_magic == LNET_PROTO_MAGIC ||
+            reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC) ||
+            (reqmsg->ibm_magic == IBLND_MSG_MAGIC &&
+             reqmsg->ibm_version != IBLND_MSG_VERSION) ||
+            (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) &&
+             reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION))) {
+                /* Future protocol version compatibility support!  If the
+                 * o2iblnd-specific protocol changes, or when LNET unifies
+                 * protocols over all LNDs, the initial connection will
+                 * negotiate a protocol version.  I trap this here to avoid
+                 * console errors; the reject tells the peer which protocol I
+                 * speak. */
+                goto failed;
+        }
+
+        rc = kiblnd_unpack_msg(reqmsg, priv_nob);
+        if (rc != 0) {
+                CERROR("Can't parse connection request: %d\n", rc);
+                goto failed;
+        }
+
+        nid = reqmsg->ibm_srcnid;
+
+        if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) {
+                CERROR("Unexpected connreq msg type: %x from %s\n",
+                       reqmsg->ibm_type, libcfs_nid2str(nid));
+                goto failed;
+        }
+
+        if (reqmsg->ibm_u.connparams.ibcp_queue_depth != IBLND_MSG_QUEUE_SIZE) {
+                CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
+                       libcfs_nid2str(nid),
+                       reqmsg->ibm_u.connparams.ibcp_queue_depth,
+                       IBLND_MSG_QUEUE_SIZE);
+                goto failed;
+        }
+
+        if (reqmsg->ibm_u.connparams.ibcp_max_frags != IBLND_MAX_RDMA_FRAGS) {
+                CERROR("Can't accept %s: incompatible max_frags %d (%d wanted)\n",
+                       libcfs_nid2str(nid),
+                       reqmsg->ibm_u.connparams.ibcp_max_frags,
+                       IBLND_MAX_RDMA_FRAGS);
+                goto failed;
+        }
+
+        if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
+                CERROR("Can't accept %s: message size %d too big (%d max)\n",
+                       libcfs_nid2str(nid),
+                       reqmsg->ibm_u.connparams.ibcp_max_msg_size,
+                       IBLND_MSG_SIZE);
+                goto failed;
+        }
+
+        ni = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid));
+        if (ni == NULL ||                               /* no matching net */
+            ni->ni_nid != reqmsg->ibm_dstnid ||   /* right NET, wrong NID! */
+            ((kib_net_t*)ni->ni_data)->ibn_dev != ibdev) { /* wrong device */
+                CERROR("Can't accept %s: bad dst nid %s\n",
+                       libcfs_nid2str(nid),
+                       libcfs_nid2str(reqmsg->ibm_dstnid));
+
+                goto failed;
+        }
+
+        /* assume 'nid' is a new peer; create  */
+        rc = kiblnd_create_peer(ni, &peer, nid);
+        if (rc != 0) {
+                CERROR("Can't create peer for %s\n", libcfs_nid2str(nid));
+                rej = IBLND_REJECT_NO_RESOURCES;
+                goto failed;
+        }
+
+        write_lock_irqsave(g_lock, flags);
+
+        peer2 = kiblnd_find_peer_locked(nid);
+        if (peer2 != NULL) {
+                /* tie-break connection race in favour of the higher NID */
+                if (peer2->ibp_connecting != 0 &&
+                    nid < ni->ni_nid) {
+                        write_unlock_irqrestore(g_lock, flags);
+
+                        CWARN("Conn race %s\n",
+                              libcfs_nid2str(peer2->ibp_nid));
+
+                        kiblnd_peer_decref(peer);
+                        rej = IBLND_REJECT_CONN_RACE;
+                        goto failed;
+                }
+
+                peer2->ibp_accepting++;
+                kiblnd_peer_addref(peer2);
+
+                write_unlock_irqrestore(g_lock, flags);
+                kiblnd_peer_decref(peer);
+                peer = peer2;
+        } else {
+                /* Brand new peer */
+                LASSERT (peer->ibp_accepting == 0);
+                peer->ibp_accepting = 1;
+
+                kiblnd_peer_addref(peer);
+                list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+
+                write_unlock_irqrestore(g_lock, flags);
+        }
+
+        conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT);
+        if (conn == NULL) {
+                kiblnd_peer_connect_failed(peer, 0, -ENOMEM);
+                kiblnd_peer_decref(peer);
+                rej = IBLND_REJECT_NO_RESOURCES;
+                goto failed;
+        }
+
+        /* conn now "owns" cmid, so I return success from here on to ensure the
+         * CM callback doesn't destroy cmid. */
+
+        conn->ibc_incarnation = reqmsg->ibm_srcstamp;
+        conn->ibc_credits = IBLND_MSG_QUEUE_SIZE;
+        conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE;
+        LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
+                 <= IBLND_RX_MSGS);
+
+        ackmsg = &conn->ibc_connvars->cv_msg;
+        memset(ackmsg, 0, sizeof(*ackmsg));
+
+        kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK,
+                        sizeof(ackmsg->ibm_u.connparams));
+        ackmsg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE;
+        ackmsg->ibm_u.connparams.ibcp_max_frags = IBLND_MAX_RDMA_FRAGS;
+        ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
+        kiblnd_pack_msg(ni, ackmsg, 0, nid, reqmsg->ibm_srcstamp);
+
+        memset(&cp, 0, sizeof(cp));
+        cp.private_data        = ackmsg;
+        cp.private_data_len    = ackmsg->ibm_nob;
+        cp.responder_resources = 0;             /* No atomic ops or RDMA reads */
+        cp.initiator_depth     = 0;
+        cp.flow_control        = 1;
+        cp.retry_count         = *kiblnd_tunables.kib_retry_count;
+        cp.rnr_retry_count     = *kiblnd_tunables.kib_rnr_retry_count;
+
+        CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid));
+
+        rc = rdma_accept(cmid, &cp);
+        if (rc != 0) {
+                CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc);
+                kiblnd_reject(cmid, IBLND_REJECT_FATAL);
+                kiblnd_connreq_done(conn, rc);
+                kiblnd_conn_decref(conn);
+        }
+
+        lnet_ni_decref(ni);
+        return 0;
+
+ failed:
+        if (ni != NULL)
+                lnet_ni_decref(ni);
+
+        kiblnd_reject(cmid, rej);
+        return -ECONNREFUSED;
+}
+
+/* Decide whether to retry an active connection attempt to 'conn's peer
+ * after a transient failure; 'why' is a human-readable tag for logging.
+ * A retry is launched only if the peer still has sends queued and no
+ * other connection attempt (active or passive) is in flight.  The extra
+ * ibp_connecting reference is taken under the global write lock so the
+ * retry is claimed atomically before the lock is dropped. */
+void
+kiblnd_reconnect (kib_conn_t *conn, char *why)
+{
+        kib_peer_t    *peer = conn->ibc_peer;
+        int            retry = 0;
+        unsigned long  flags;
+        
+        LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+        LASSERT (peer->ibp_connecting > 0);     /* 'conn' at least */
+
+        write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+        /* retry connection if it's still needed and no other connection
+         * attempts (active or passive) are in progress */
+        if (!list_empty(&peer->ibp_tx_queue) &&
+            peer->ibp_connecting == 1 &&
+            peer->ibp_accepting == 0) {
+                retry = 1;
+                peer->ibp_connecting++;
+        }
+        
+        write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+        if (retry) {
+                CDEBUG(D_NETERROR, "%s: retrying (%s)\n", 
+                       libcfs_nid2str(peer->ibp_nid), why);
+                kiblnd_connect_peer(peer);
+        }
+}
+
+/* Handle rejection of an active connection attempt on 'conn'.  'reason'
+ * is the IB CM reject code; consumer-defined rejections carry a
+ * kib_rej_t in 'priv' ('priv_nob' bytes), possibly byte-swapped if the
+ * peer is opposite-endian.  Stale connections and connection races are
+ * retried via kiblnd_reconnect(); everything else is logged and the
+ * attempt completes with -ECONNREFUSED. */
+void
+kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob)
+{
+        kib_peer_t    *peer = conn->ibc_peer;
+
+        LASSERT (!in_interrupt());
+        LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
+
+        switch (reason) {
+        case IB_CM_REJ_STALE_CONN:
+                kiblnd_reconnect(conn, "stale");
+                break;
+
+        case IB_CM_REJ_CONSUMER_DEFINED:
+                if (priv_nob >= sizeof(kib_rej_t)) {
+                        kib_rej_t *rej = priv;
+
+                        /* opposite-endian peer: fix up magic and version */
+                        if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) ||
+                            rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) {
+                                __swab32s(&rej->ibr_magic);
+                                __swab16s(&rej->ibr_version);
+                        }
+
+                        if (rej->ibr_magic != IBLND_MSG_MAGIC &&
+                            rej->ibr_magic != LNET_PROTO_MAGIC) {
+                                CERROR("%s rejected: consumer defined fatal error\n",
+                                       libcfs_nid2str(peer->ibp_nid));
+                                break;
+                        }
+                        
+                        if (rej->ibr_version != IBLND_MSG_VERSION) {
+                                CERROR("%s rejected: o2iblnd version %d error\n",
+                                       libcfs_nid2str(peer->ibp_nid),
+                                       rej->ibr_version);
+                                break;
+                        }
+                        
+                        switch (rej->ibr_why) {
+                        case IBLND_REJECT_CONN_RACE:
+                                /* lost the tie-break to the higher NID; retry */
+                                kiblnd_reconnect(conn, "conn race");
+                                break;
+                                
+                        case IBLND_REJECT_NO_RESOURCES:
+                                CERROR("%s rejected: o2iblnd no resources\n",
+                                       libcfs_nid2str(peer->ibp_nid));
+                                break;
+                        case IBLND_REJECT_FATAL:
+                                CERROR("%s rejected: o2iblnd fatal error\n",
+                                       libcfs_nid2str(peer->ibp_nid));
+                                break;
+                        default:
+                                CERROR("%s rejected: o2iblnd reason %d\n",
+                                       libcfs_nid2str(peer->ibp_nid),
+                                       rej->ibr_why);
+                                break;
+                        }
+                        break;
+                }
+                /* fall through */
+        default:
+                CDEBUG(D_NETERROR, "%s rejected: reason %d, size %d\n",
+                       libcfs_nid2str(peer->ibp_nid), reason, priv_nob);
+                break;
+        }
+
+        kiblnd_connreq_done(conn, -ECONNREFUSED);
+}
+
+/* Validate the CONNACK ('priv', 'priv_nob' bytes) received on active
+ * connection 'conn'.  Checks message type, queue depth, max RDMA frags,
+ * max message size and incarnation stamps; on success completes the
+ * connection establishment, on failure records the error in
+ * ibc_comms_error since the QP is already up (see note at 'failed'). */
+void
+kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
+{
+        kib_peer_t    *peer = conn->ibc_peer;
+        lnet_ni_t     *ni = peer->ibp_ni;
+        kib_net_t     *net = ni->ni_data;
+        kib_msg_t     *msg = priv;
+        int            rc = kiblnd_unpack_msg(msg, priv_nob);
+        unsigned long  flags;
+
+        LASSERT (net != NULL);
+
+        if (rc != 0) {
+                CERROR("Can't unpack connack from %s: %d\n",
+                       libcfs_nid2str(peer->ibp_nid), rc);
+                goto failed;
+        }
+
+        if (msg->ibm_type != IBLND_MSG_CONNACK) {
+                CERROR("Unexpected message %d from %s\n",
+                       msg->ibm_type, libcfs_nid2str(peer->ibp_nid));
+                rc = -EPROTO;
+                goto failed;
+        }
+
+        if (msg->ibm_u.connparams.ibcp_queue_depth != IBLND_MSG_QUEUE_SIZE) {
+                CERROR("%s has incompatible queue depth %d(%d wanted)\n",
+                       libcfs_nid2str(peer->ibp_nid),
+                       msg->ibm_u.connparams.ibcp_queue_depth,
+                       IBLND_MSG_QUEUE_SIZE);
+                rc = -EPROTO;
+                goto failed;
+        }
+
+        if (msg->ibm_u.connparams.ibcp_max_frags != IBLND_MAX_RDMA_FRAGS) {
+                /* NB fixed: this error used to print the queue depth values
+                 * instead of the mismatching max_frags values */
+                CERROR("%s has incompatible max_frags %d (%d wanted)\n",
+                       libcfs_nid2str(peer->ibp_nid),
+                       msg->ibm_u.connparams.ibcp_max_frags,
+                       IBLND_MAX_RDMA_FRAGS);
+                rc = -EPROTO;
+                goto failed;
+        }
+
+        if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
+                CERROR("%s max message size %d too big (%d max)\n",
+                       libcfs_nid2str(peer->ibp_nid),
+                       msg->ibm_u.connparams.ibcp_max_msg_size,
+                       IBLND_MSG_SIZE);
+                rc = -EPROTO;
+                goto failed;
+        }
+
+        /* stamps must match the NI's current incarnation or the reply is
+         * from a stale connection attempt */
+        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+        if (msg->ibm_dstnid == ni->ni_nid &&
+            msg->ibm_dststamp == net->ibn_incarnation)
+                rc = 0;
+        else
+                rc = -ESTALE;
+        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+        if (rc != 0) {
+                CERROR("Stale connection reply from %s\n",
+                       libcfs_nid2str(peer->ibp_nid));
+                goto failed;
+        }
+
+        conn->ibc_incarnation = msg->ibm_srcstamp;
+        conn->ibc_credits = IBLND_MSG_QUEUE_SIZE;
+        conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE;
+        LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
+                 <= IBLND_RX_MSGS);
+
+        kiblnd_connreq_done(conn, 0);
+        return;
+
+ failed:
+        /* NB My QP has already established itself, so I handle anything going
+         * wrong here by setting ibc_comms_error.
+         * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then
+         * immediately tears it down. */
+
+        LASSERT (rc != 0);
+        conn->ibc_comms_error = rc;
+        kiblnd_connreq_done(conn, 0);
+}
+
+/* Launch the active side of a connection: create a conn for 'cmid'
+ * (whose context holds the target peer), build and pack a CONNREQ
+ * message and call rdma_connect().  Once the conn is created it owns
+ * the cmid (and takes over the cmid's ref on the peer), so this always
+ * returns 0 after that point to stop the CM destroying the cmid. */
+int
+kiblnd_active_connect (struct rdma_cm_id *cmid)
+{
+        kib_peer_t              *peer = (kib_peer_t *)cmid->context;
+        kib_conn_t              *conn;
+        kib_msg_t               *msg;
+        struct rdma_conn_param   cp;
+        int                      rc;
+
+        conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT);
+        if (conn == NULL) {
+                kiblnd_peer_connect_failed(peer, 1, -ENOMEM);
+                kiblnd_peer_decref(peer); /* lose cmid's ref */
+                return -ENOMEM;
+        }
+
+        /* conn "owns" cmid now, so I return success from here on to ensure the
+         * CM callback doesn't destroy cmid. conn also takes over cmid's ref
+         * on peer */
+
+        msg = &conn->ibc_connvars->cv_msg;
+
+        memset(msg, 0, sizeof(*msg));
+        kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
+        msg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE;
+        msg->ibm_u.connparams.ibcp_max_frags = IBLND_MAX_RDMA_FRAGS;
+        msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE;
+        kiblnd_pack_msg(peer->ibp_ni, msg, 0, peer->ibp_nid, 0);
+        
+        memset(&cp, 0, sizeof(cp));
+        cp.private_data        = msg;
+        cp.private_data_len    = msg->ibm_nob;
+        cp.responder_resources = 0;             /* No atomic ops or RDMA reads */
+        cp.initiator_depth     = 0;
+        cp.flow_control        = 1;
+        cp.retry_count         = *kiblnd_tunables.kib_retry_count;
+        cp.rnr_retry_count     = *kiblnd_tunables.kib_rnr_retry_count;
+
+        LASSERT(cmid->context == (void *)conn);
+        LASSERT(conn->ibc_cmid == cmid);
+        
+        rc = rdma_connect(cmid, &cp);
+        if (rc != 0) {
+                CERROR("Can't connect to %s: %d\n",
+                       libcfs_nid2str(peer->ibp_nid), rc);
+                /* conn owns the cmid, so finish the connreq and drop our
+                 * ref rather than returning failure */
+                kiblnd_connreq_done(conn, rc);
+                kiblnd_conn_decref(conn);
+        }
+
+        return 0;
+}
+
+/* RDMA connection manager event dispatcher.  Before a conn exists,
+ * cmid->context points at the peer (address/route resolution stages);
+ * after kiblnd_create_conn() it points at the conn.  Returning non-zero
+ * from this callback makes the CM destroy 'cmid', so any path that has
+ * handed the cmid to a conn must return 0.
+ * NB fixed: indentation normalized to 8-space (the file's
+ * indent-tabs-mode: nil convention); logic unchanged. */
+int
+kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event)
+{
+        kib_peer_t  *peer;
+        kib_conn_t  *conn;
+        int          rc;
+
+        switch (event->event) {
+        default:
+                LBUG();
+
+        case RDMA_CM_EVENT_CONNECT_REQUEST:
+                /* destroy cmid on failure */
+                rc = kiblnd_passive_connect(cmid,
+                                            event->private_data,
+                                            event->private_data_len);
+                CDEBUG(D_NET, "connreq: %d\n", rc);
+                return rc;
+                
+        case RDMA_CM_EVENT_ADDR_ERROR:
+                peer = (kib_peer_t *)cmid->context;
+                CDEBUG(D_NETERROR, "%s: ADDR ERROR %d\n",
+                       libcfs_nid2str(peer->ibp_nid), event->status);
+                kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
+                kiblnd_peer_decref(peer);
+                return -EHOSTUNREACH;      /* rc != 0 destroys cmid */
+
+        case RDMA_CM_EVENT_ADDR_RESOLVED:
+                peer = (kib_peer_t *)cmid->context;
+
+                CDEBUG(D_NET,"%s Addr resolved: %d\n",
+                       libcfs_nid2str(peer->ibp_nid), event->status);
+
+                if (event->status != 0) {
+                        CDEBUG(D_NETERROR, "Can't resolve address for %s: %d\n",
+                               libcfs_nid2str(peer->ibp_nid), event->status);
+                        rc = event->status;
+                } else {
+                        rc = rdma_resolve_route(
+                                cmid, *kiblnd_tunables.kib_timeout * 1000);
+                        if (rc == 0)
+                                return 0;
+                        /* Can't initiate route resolution */
+                        CERROR("Can't resolve route for %s: %d\n",
+                               libcfs_nid2str(peer->ibp_nid), rc);
+                }
+                kiblnd_peer_connect_failed(peer, 1, rc);
+                kiblnd_peer_decref(peer);
+                return rc;                      /* rc != 0 destroys cmid */
+
+        case RDMA_CM_EVENT_ROUTE_ERROR:
+                peer = (kib_peer_t *)cmid->context;
+                CDEBUG(D_NETERROR, "%s: ROUTE ERROR %d\n",
+                       libcfs_nid2str(peer->ibp_nid), event->status);
+                kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH);
+                kiblnd_peer_decref(peer);
+                return -EHOSTUNREACH;           /* rc != 0 destroys cmid */
+
+        case RDMA_CM_EVENT_ROUTE_RESOLVED:
+                peer = (kib_peer_t *)cmid->context;
+                CDEBUG(D_NET,"%s Route resolved: %d\n",
+                       libcfs_nid2str(peer->ibp_nid), event->status);
+
+                if (event->status == 0)
+                        return kiblnd_active_connect(cmid);
+
+                CDEBUG(D_NETERROR, "Can't resolve route for %s: %d\n",
+                       libcfs_nid2str(peer->ibp_nid), event->status);
+                kiblnd_peer_connect_failed(peer, 1, event->status);
+                kiblnd_peer_decref(peer);
+                return event->status;           /* rc != 0 destroys cmid */
+                
+        case RDMA_CM_EVENT_UNREACHABLE:
+                conn = (kib_conn_t *)cmid->context;
+                LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
+                        conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
+                CDEBUG(D_NETERROR, "%s: UNREACHABLE %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
+                kiblnd_connreq_done(conn, -ENETDOWN);
+                kiblnd_conn_decref(conn);
+                return 0;
+
+        case RDMA_CM_EVENT_CONNECT_ERROR:
+                conn = (kib_conn_t *)cmid->context;
+                LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT ||
+                        conn->ibc_state == IBLND_CONN_PASSIVE_WAIT);
+                CDEBUG(D_NETERROR, "%s: CONNECT ERROR %d\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status);
+                kiblnd_connreq_done(conn, -ENOTCONN);
+                kiblnd_conn_decref(conn);
+                return 0;
+
+        case RDMA_CM_EVENT_REJECTED:
+                conn = (kib_conn_t *)cmid->context;
+                switch (conn->ibc_state) {
+                default:
+                        LBUG();
+
+                case IBLND_CONN_PASSIVE_WAIT:
+                        CERROR ("%s: REJECTED %d\n",
+                                libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                                event->status);
+                        kiblnd_connreq_done(conn, -ECONNRESET);
+                        break;
+
+                case IBLND_CONN_ACTIVE_CONNECT:
+                        kiblnd_rejected(conn, event->status,
+                                        event->private_data,
+                                        event->private_data_len);
+                        break;
+                }
+                kiblnd_conn_decref(conn);
+                return 0;
+
+        case RDMA_CM_EVENT_ESTABLISHED:
+                conn = (kib_conn_t *)cmid->context;
+                switch (conn->ibc_state) {
+                default:
+                        LBUG();
+
+                case IBLND_CONN_PASSIVE_WAIT:
+                        CDEBUG(D_NET, "ESTABLISHED (passive): %s\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        kiblnd_connreq_done(conn, 0);
+                        break;
+
+                case IBLND_CONN_ACTIVE_CONNECT:
+                        CDEBUG(D_NET, "ESTABLISHED(active): %s\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        kiblnd_check_connreply(conn,
+                                               event->private_data,
+                                               event->private_data_len);
+                        break;
+                }
+                /* net keeps its ref on conn! */
+                return 0;
+
+        case RDMA_CM_EVENT_DISCONNECTED:
+                conn = (kib_conn_t *)cmid->context;
+                if (conn->ibc_state < IBLND_CONN_ESTABLISHED) {
+                        CERROR("%s DISCONNECTED\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        kiblnd_connreq_done(conn, -ECONNRESET);
+                } else {
+                        kiblnd_close_conn(conn, 0);
+                }
+                kiblnd_conn_decref(conn);
+                return 0;
+
+        case RDMA_CM_EVENT_DEVICE_REMOVAL:
+                LCONSOLE_ERROR("Received notification of device removal\n");
+                LCONSOLE_ERROR("Please shutdown LNET to allow this to proceed\n");
+                /* Can't remove network from underneath LNET for now, so I have
+                 * to ignore this */
+                return 0;
+        }
+}
+
+/* Scan 'txs' (one of conn's tx lists) under ibc_lock for a tx whose
+ * deadline has passed.  Returns non-zero if any tx has timed out. */
+int
+kiblnd_check_txs (kib_conn_t *conn, struct list_head *txs)
+{
+        struct list_head  *pos;
+        int                expired = 0;
+
+        spin_lock(&conn->ibc_lock);
+
+        list_for_each (pos, txs) {
+                kib_tx_t *tx = list_entry (pos, kib_tx_t, tx_list);
+
+                /* active txs are no longer queued; everything on the other
+                 * lists still is */
+                if (txs == &conn->ibc_active_txs) {
+                        LASSERT (!tx->tx_queued);
+                        LASSERT (tx->tx_waiting || tx->tx_sending != 0);
+                } else {
+                        LASSERT (tx->tx_queued);
+                }
+
+                if (time_after_eq (jiffies, tx->tx_deadline)) {
+                        expired = 1;
+                        break;
+                }
+        }
+
+        spin_unlock(&conn->ibc_lock);
+        return expired;
+}
+
+/* A connection has timed out if any of its tx lists (queued, reserved,
+ * no-credit or active) contains a tx past its deadline. */
+int
+kiblnd_conn_timed_out (kib_conn_t *conn)
+{
+        if (kiblnd_check_txs(conn, &conn->ibc_tx_queue))
+                return 1;
+        if (kiblnd_check_txs(conn, &conn->ibc_tx_queue_rsrvd))
+                return 1;
+        if (kiblnd_check_txs(conn, &conn->ibc_tx_queue_nocred))
+                return 1;
+        return kiblnd_check_txs(conn, &conn->ibc_active_txs);
+}
+
+/* Check every established connection of every peer in hash bucket 'idx'
+ * for RDMA timeouts.  Scanning is done under the shared (read) global
+ * lock; when a timed-out conn is found the lock must be dropped to close
+ * it, so the whole bucket scan restarts from the top ('again'). */
+void
+kiblnd_check_conns (int idx)
+{
+        struct list_head  *peers = &kiblnd_data.kib_peers[idx];
+        struct list_head  *ptmp;
+        kib_peer_t        *peer;
+        kib_conn_t        *conn;
+        struct list_head  *ctmp;
+        unsigned long      flags;
+
+ again:
+        /* NB. We expect to have a look at all the peers and not find any
+         * rdmas to time out, so we just use a shared lock while we
+         * take a look... */
+        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+
+        list_for_each (ptmp, peers) {
+                peer = list_entry (ptmp, kib_peer_t, ibp_list);
+
+                list_for_each (ctmp, &peer->ibp_conns) {
+                        conn = list_entry (ctmp, kib_conn_t, ibc_list);
+
+                        LASSERT (conn->ibc_state == IBLND_CONN_ESTABLISHED);
+
+                        /* In case we have enough credits to return via a
+                         * NOOP, but there were no non-blocking tx descs
+                         * free to do it last time... */
+                        kiblnd_check_sends(conn);
+
+                        if (!kiblnd_conn_timed_out(conn))
+                                continue;
+
+                        /* Handle timeout by closing the whole connection.  We
+                         * can only be sure RDMA activity has ceased once the
+                         * QP has been modified. */
+
+                        kiblnd_conn_addref(conn); /* 1 ref for me... */
+
+                        read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
+                                               flags);
+
+                        CERROR("Timed out RDMA with %s\n",
+                               libcfs_nid2str(peer->ibp_nid));
+
+                        kiblnd_close_conn(conn, -ETIMEDOUT);
+                        kiblnd_conn_decref(conn); /* ...until here */
+
+                        /* start again now I've dropped the lock */
+                        goto again;
+                }
+        }
+
+        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+}
+
+/* Run the disconnect of a closing connection.  Only ever called from the
+ * connd thread (asserted below) so the rdma_disconnect/finalise sequence
+ * is single-threaded per conn. */
+void
+kiblnd_disconnect_conn (kib_conn_t *conn)
+{
+        LASSERT (!in_interrupt());
+        LASSERT (current == kiblnd_data.kib_connd);
+        LASSERT (conn->ibc_state == IBLND_CONN_CLOSING);
+
+        rdma_disconnect(conn->ibc_cmid);
+        kiblnd_finalise_conn(conn);
+
+        kiblnd_peer_notify(conn->ibc_peer);
+}
+
+/* Connection daemon thread: reaps zombie conns, runs disconnects for
+ * closing conns, and periodically sweeps a proportion of the peer hash
+ * table for RDMA timeouts.  It holds kib_connd_lock while inspecting the
+ * work lists and drops it ('dropped_lock') around any real work; when an
+ * iteration did no work it sleeps on kib_connd_waitq until the next
+ * timeout deadline. */
+int
+kiblnd_connd (void *arg)
+{
+        wait_queue_t       wait;
+        unsigned long      flags;
+        kib_conn_t        *conn;
+        int                timeout;
+        int                i;
+        int                dropped_lock;
+        int                peer_index = 0;
+        unsigned long      deadline = jiffies;
+
+        cfs_daemonize ("kiblnd_connd");
+        cfs_block_allsigs ();
+
+        init_waitqueue_entry (&wait, current);
+        kiblnd_data.kib_connd = current;
+
+        spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+
+        while (!kiblnd_data.kib_shutdown) {
+
+                dropped_lock = 0;
+
+                /* destroy one zombie conn per iteration */
+                if (!list_empty (&kiblnd_data.kib_connd_zombies)) {
+                        conn = list_entry (kiblnd_data.kib_connd_zombies.next,
+                                           kib_conn_t, ibc_list);
+                        list_del (&conn->ibc_list);
+
+                        spin_unlock_irqrestore (&kiblnd_data.kib_connd_lock, flags);
+                        dropped_lock = 1;
+
+                        kiblnd_destroy_conn(conn);
+
+                        spin_lock_irqsave (&kiblnd_data.kib_connd_lock, flags);
+                }
+
+                /* disconnect one closing conn per iteration */
+                if (!list_empty (&kiblnd_data.kib_connd_conns)) {
+                        conn = list_entry (kiblnd_data.kib_connd_conns.next,
+                                           kib_conn_t, ibc_list);
+                        list_del (&conn->ibc_list);
+
+                        spin_unlock_irqrestore (&kiblnd_data.kib_connd_lock, flags);
+                        dropped_lock = 1;
+
+                        kiblnd_disconnect_conn(conn);
+                        kiblnd_conn_decref(conn);
+
+                        spin_lock_irqsave (&kiblnd_data.kib_connd_lock, flags);
+                }
+
+                /* careful with the jiffy wrap... */
+                timeout = (int)(deadline - jiffies);
+                if (timeout <= 0) {
+                        const int n = 4;
+                        const int p = 1;
+                        int       chunk = kiblnd_data.kib_peer_hash_size;
+
+                        spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);
+                        dropped_lock = 1;
+
+                        /* Time to check for RDMA timeouts on a few more
+                         * peers: I do checks every 'p' seconds on a
+                         * proportion of the peer table and I need to check
+                         * every connection 'n' times within a timeout
+                         * interval, to ensure I detect a timeout on any
+                         * connection within (n+1)/n times the timeout
+                         * interval. */
+
+                        if (*kiblnd_tunables.kib_timeout > n * p)
+                                chunk = (chunk * n * p) /
+                                        *kiblnd_tunables.kib_timeout;
+                        if (chunk == 0)
+                                chunk = 1;
+
+                        for (i = 0; i < chunk; i++) {
+                                kiblnd_check_conns(peer_index);
+                                peer_index = (peer_index + 1) %
+                                             kiblnd_data.kib_peer_hash_size;
+                        }
+
+                        deadline += p * HZ;
+                        spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags);
+                }
+
+                if (dropped_lock)
+                        continue;
+
+                /* Nothing to do for 'timeout'  */
+                set_current_state (TASK_INTERRUPTIBLE);
+                add_wait_queue (&kiblnd_data.kib_connd_waitq, &wait);
+                spin_unlock_irqrestore (&kiblnd_data.kib_connd_lock, flags);
+
+                schedule_timeout (timeout);
+
+                set_current_state (TASK_RUNNING);
+                remove_wait_queue (&kiblnd_data.kib_connd_waitq, &wait);
+                spin_lock_irqsave (&kiblnd_data.kib_connd_lock, flags);
+        }
+
+        spin_unlock_irqrestore (&kiblnd_data.kib_connd_lock, flags);
+
+        kiblnd_thread_fini();
+        return (0);
+}
+
+/* Async QP event handler: COMM_EST just confirms establishment and is
+ * logged at debug level; any other QP event is unexpected and logged as
+ * an error.  'arg' is the conn registered with the QP. */
+void
+kiblnd_qp_event(struct ib_event *event, void *arg)
+{
+        kib_conn_t *conn = arg;
+
+        if (event->event == IB_EVENT_COMM_EST) {
+                CDEBUG(D_NET, "%s established\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                return;
+        }
+
+        CERROR("%s: Async QP event type %d\n",
+               libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+}
+
+/* Dispatch one work completion to the handler for its work-request type
+ * (encoded in wc->wr_id alongside the tx/rx pointer). */
+void
+kiblnd_complete (struct ib_wc *wc)
+{
+        switch (kiblnd_wreqid2type(wc->wr_id)) {
+        case IBLND_WID_RX:
+                kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status,
+                                   wc->byte_len);
+                return;
+
+        case IBLND_WID_TX:
+                kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status);
+                return;
+
+        case IBLND_WID_RDMA:
+                /* We only get RDMA completion notification if it fails.  All
+                 * subsequent work items, including the final SEND will fail
+                 * too.  However we can't print out any more info about the
+                 * failing RDMA because 'tx' might be back on the idle list or
+                 * even reused already if we didn't manage to post all our work
+                 * items */
+                CDEBUG(D_NETERROR, "RDMA (tx: %p) failed: %d\n",
+                       kiblnd_wreqid2ptr(wc->wr_id), wc->status);
+                return;
+
+        default:
+                LBUG();
+        }
+}
+
+/* CQ completion callback: mark the conn ready and, if it isn't already
+ * on the scheduler queue and still has outstanding rx/tx work, queue it
+ * (taking a ref for the queue) and wake a scheduler thread. */
+void
+kiblnd_cq_completion (struct ib_cq *cq, void *arg)
+{
+        /* NB I'm not allowed to schedule this conn once its refcount has
+         * reached 0.  Since fundamentally I'm racing with scheduler threads
+         * consuming my CQ I could be called after all completions have
+         * occurred.  But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0
+         * and this CQ is about to be destroyed so I NOOP. */
+        kib_conn_t     *conn = (kib_conn_t *)arg;
+        unsigned long   flags;
+        
+        LASSERT (cq == conn->ibc_cq);
+
+        spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags);
+
+        conn->ibc_ready = 1;
+
+        if (!conn->ibc_scheduled &&
+            (conn->ibc_nrx > 0 ||
+             conn->ibc_nsends_posted > 0)) {
+                kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
+                conn->ibc_scheduled = 1;
+                list_add_tail(&conn->ibc_sched_list,
+                              &kiblnd_data.kib_sched_conns);
+                wake_up(&kiblnd_data.kib_sched_waitq);
+        }
+
+        spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, flags);
+}
+
+/* Async CQ event handler; such events always indicate trouble, so just
+ * log them against the owning conn's peer. */
+void
+kiblnd_cq_event(struct ib_event *event, void *arg)
+{
+        kib_conn_t *conn = (kib_conn_t *)arg;
+
+        CERROR("%s: async CQ event type %d\n",
+               libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
+}
+
+/* Scheduler thread 'id': pulls conns off kib_sched_conns, polls one CQ
+ * entry at a time (re-arming notification when the CQ drains) and
+ * dispatches completions via kiblnd_complete().  kib_sched_lock guards
+ * the queue and the per-conn scheduled/ready flags; it is dropped around
+ * CQ polling and completion handling. */
+int
+kiblnd_scheduler(void *arg)
+{
+        long            id = (long)arg;
+        wait_queue_t    wait;
+        char            name[16];
+        unsigned long   flags;
+        kib_conn_t     *conn;
+        struct ib_wc    wc;
+        int             rc;
+        int             did_something;
+        int             busy_loops = 0;
+
+        snprintf(name, sizeof(name), "kiblnd_sd_%02ld", id);
+        cfs_daemonize(name);
+        cfs_block_allsigs();
+
+        init_waitqueue_entry(&wait, current);
+
+        spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags);
+
+        while (!kiblnd_data.kib_shutdown) {
+                /* yield the CPU periodically so we don't hog it */
+                if (busy_loops++ >= IBLND_RESCHED) {
+                        spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock,
+                                               flags);
+
+                        our_cond_resched();
+                        busy_loops = 0;
+
+                        spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags);
+                }
+
+                did_something = 0;
+
+                if (!list_empty(&kiblnd_data.kib_sched_conns)) {
+                        conn = list_entry(kiblnd_data.kib_sched_conns.next,
+                                          kib_conn_t, ibc_sched_list);
+                        /* take over kib_sched_conns' ref on conn... */
+                        LASSERT(conn->ibc_scheduled);
+                        list_del(&conn->ibc_sched_list);
+                        conn->ibc_ready = 0;
+                        
+                        spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock,
+                                               flags);
+
+                        /* if the CQ looks empty, re-arm notification and poll
+                         * again to close the race with a new completion */
+                        rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
+                        if (rc == 0) {
+                                rc = ib_req_notify_cq(conn->ibc_cq,
+                                                      IB_CQ_NEXT_COMP);
+                                LASSERT (rc >= 0);
+
+                                rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
+                        }
+
+                        LASSERT (rc >= 0);
+
+                        spin_lock_irqsave(&kiblnd_data.kib_sched_lock,
+                                          flags);
+
+                        if (rc != 0 || conn->ibc_ready) {
+                                /* There may be another completion waiting; get
+                                 * another scheduler to check while I handle
+                                 * this one... */
+                                kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
+                                list_add_tail(&conn->ibc_sched_list,
+                                              &kiblnd_data.kib_sched_conns);
+                                wake_up(&kiblnd_data.kib_sched_waitq);
+                        } else {
+                                conn->ibc_scheduled = 0;
+                        }
+                        
+                        if (rc != 0) {
+                                spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock,
+                                                       flags);
+
+                                kiblnd_complete(&wc);
+
+                                spin_lock_irqsave(&kiblnd_data.kib_sched_lock,
+                                                  flags);
+                        }
+
+                        kiblnd_conn_decref(conn); /* ...drop my ref from above */
+                        did_something = 1;
+                }
+
+                if (did_something)
+                        continue;
+
+                /* idle: sleep until kiblnd_cq_completion() wakes us */
+                set_current_state(TASK_INTERRUPTIBLE);
+                add_wait_queue_exclusive(&kiblnd_data.kib_sched_waitq, &wait);
+                spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, flags);
+
+                schedule();
+                busy_loops = 0;
+
+                remove_wait_queue(&kiblnd_data.kib_sched_waitq, &wait);
+                set_current_state(TASK_RUNNING);
+                spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags);
+        }
+
+        spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, flags);
+
+        kiblnd_thread_fini();
+        return (0);
+}
diff --git a/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/lnet/klnds/o2iblnd/o2iblnd_modparams.c
new file mode 100644 (file)
index 0000000..ef42ffe
--- /dev/null
@@ -0,0 +1,218 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2006 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "o2iblnd.h"
+
+static int service = 987;
+CFS_MODULE_PARM(service, "i", int, 0444,
+                "service number (within RDMA_PS_TCP)");
+
+static int cksum = 0;
+CFS_MODULE_PARM(cksum, "i", int, 0644,
+               "set non-zero to enable message (not RDMA) checksums");
+
+static int timeout = 50;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+               "timeout (seconds)");
+
+static int ntx = 256;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+               "# of message descriptors");
+
+static int credits = 64;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+               "# concurrent sends");
+
+static int peer_credits = 8;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+               "# concurrent sends to 1 peer");
+
+static char *ipif_name = "ib0";
+CFS_MODULE_PARM(ipif_name, "s", charp, 0444,
+                "IPoIB interface name");
+
+static int retry_count = 5;
+CFS_MODULE_PARM(retry_count, "i", int, 0644,
+                "Retransmissions when no ACK received");
+
+static int rnr_retry_count = 6;
+CFS_MODULE_PARM(rnr_retry_count, "i", int, 0644,
+                "RNR retransmissions");
+
+static int keepalive = 100;
+CFS_MODULE_PARM(keepalive, "i", int, 0644,
+                "Idle time in seconds before sending a keepalive");
+
+static int ib_mtu = 0;
+CFS_MODULE_PARM(ib_mtu, "i", int, 0444,
+                "IB MTU 256/512/1024/2048/4096");
+
+#if IBLND_MAP_ON_DEMAND
+static int concurrent_sends = IBLND_RX_MSGS;
+#else
+static int concurrent_sends = IBLND_MSG_QUEUE_SIZE;
+#endif
+CFS_MODULE_PARM(concurrent_sends, "i", int, 0444,
+                "send work-queue sizing");
+
+#if IBLND_MAP_ON_DEMAND
+static int fmr_pool_size = 512;
+CFS_MODULE_PARM(fmr_pool_size, "i", int, 0444,
+                "size of the fmr pool (>= ntx)");
+
+static int fmr_flush_trigger = 384;
+CFS_MODULE_PARM(fmr_flush_trigger, "i", int, 0444,
+                "# dirty FMRs that triggers pool flush");
+
+static int fmr_cache = 1;
+CFS_MODULE_PARM(fmr_cache, "i", int, 0444,
+                "non-zero to enable FMR caching");
+#endif
+
+kib_tunables_t kiblnd_tunables = {
+        .kib_service                = &service,
+        .kib_cksum                  = &cksum,
+        .kib_timeout                = &timeout,
+        .kib_keepalive              = &keepalive,
+        .kib_ntx                    = &ntx,
+        .kib_credits                = &credits,
+        .kib_peercredits            = &peer_credits,
+        .kib_default_ipif           = &ipif_name,
+        .kib_retry_count            = &retry_count,
+        .kib_rnr_retry_count        = &rnr_retry_count,
+        .kib_concurrent_sends       = &concurrent_sends,
+        .kib_ib_mtu                 = &ib_mtu,
+#if IBLND_MAP_ON_DEMAND
+        .kib_fmr_pool_size          = &fmr_pool_size,
+        .kib_fmr_flush_trigger      = &fmr_flush_trigger,
+        .kib_fmr_cache              = &fmr_cache,
+#endif
+};
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+
+static char ipif_basename_space[32];
+
+static ctl_table kiblnd_ctl_table[] = {
+       {1, "service", &service, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {2, "cksum", &cksum, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {3, "timeout", &timeout, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {4, "ntx", &ntx, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {5, "credits", &credits, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {6, "peer_credits", &peer_credits, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {7, "ipif_name", ipif_basename_space, 
+        sizeof(ipif_basename_space), 0444, NULL, &proc_dostring},
+       {8, "retry_count", &retry_count, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {9, "rnr_retry_count", &rnr_retry_count, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {10, "keepalive", &keepalive, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {11, "concurrent_sends", &concurrent_sends, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {12, "ib_mtu", &ib_mtu, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+#if IBLND_MAP_ON_DEMAND
+       {13, "fmr_pool_size", &fmr_pool_size, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {14, "fmr_flush_trigger", &fmr_flush_trigger, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {15, "fmr_cache", &fmr_cache, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+#endif
+       {0}
+};
+
+static ctl_table kiblnd_top_ctl_table[] = {
+       {203, "o2iblnd", NULL, 0, 0555, kiblnd_ctl_table},
+       {0}
+};
+
+void
+kiblnd_initstrtunable(char *space, char *str, int size)
+{
+        strncpy(space, str, size);
+        space[size-1] = 0;
+}
+
+void
+kiblnd_sysctl_init (void)
+{
+        kiblnd_initstrtunable(ipif_basename_space, ipif_name,
+                              sizeof(ipif_basename_space));
+
+       kiblnd_tunables.kib_sysctl =
+               register_sysctl_table(kiblnd_top_ctl_table, 0);
+
+       if (kiblnd_tunables.kib_sysctl == NULL)
+               CWARN("Can't setup /proc tunables\n");
+}
+
+void
+kiblnd_sysctl_fini (void)
+{
+       if (kiblnd_tunables.kib_sysctl != NULL)
+               unregister_sysctl_table(kiblnd_tunables.kib_sysctl);
+}
+
+#else
+
+void
+kiblnd_sysctl_init (void)
+{
+}
+
+void
+kiblnd_sysctl_fini (void)
+{
+}
+
+#endif
+
+int
+kiblnd_tunables_init (void)
+{
+        kiblnd_sysctl_init();
+        
+        if (*kiblnd_tunables.kib_concurrent_sends > IBLND_RX_MSGS)
+                *kiblnd_tunables.kib_concurrent_sends = IBLND_RX_MSGS;
+        if (*kiblnd_tunables.kib_concurrent_sends < IBLND_MSG_QUEUE_SIZE)
+                *kiblnd_tunables.kib_concurrent_sends = IBLND_MSG_QUEUE_SIZE;
+
+       return 0;
+}
+
+void
+kiblnd_tunables_fini (void)
+{
+        kiblnd_sysctl_fini();
+}
+
+
+
index 9b8ed5d..86fa9cd 100644 (file)
@@ -1,5 +1,5 @@
-MODULES := kopenibnal
-kopenibnal-objs := openibnal.o openibnal_cb.o
+MODULES := kopeniblnd
+kopeniblnd-objs := openiblnd.o openiblnd_cb.o openiblnd_modparams.o
 
 EXTRA_POST_CFLAGS := @OPENIBCPPFLAGS@
 
index 6f56421..b4e0fb7 100644 (file)
@@ -4,12 +4,10 @@
 # See the file COPYING in this distribution
 
 if MODULES
-if !CRAY_PORTALS
-if BUILD_OPENIBNAL
-modulenet_DATA = kopenibnal$(KMODEXT)
-endif
+if BUILD_OPENIBLND
+modulenet_DATA = kopeniblnd$(KMODEXT)
 endif
 endif
 
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES = $(kopenibnal-objs:%.o=%.c) openibnal.h
+MOSTLYCLEANFILES = @MOSTLYCLEANFILES@
+DIST_SOURCES = $(kopeniblnd-objs:%.o=%.c) openiblnd.h
index d0385a6..4219005 100644 (file)
  *
  */
 
-#include "openibnal.h"
+#include "openiblnd.h"
 
-nal_t                   kibnal_api;
-ptl_handle_ni_t         kibnal_ni;
-kib_data_t              kibnal_data;
-kib_tunables_t          kibnal_tunables;
-
-#define IBNAL_SYSCTL             202
-
-enum {
-        IBNAL_SYSCTL_TIMEOUT=1,
-        IBNAL_SYSCTL_LISTENER_TIMEOUT,
-        IBNAL_SYSCTL_BACKLOG,
-        IBNAL_SYSCTL_PORT
-};
-
-static ctl_table kibnal_ctl_table[] = {
-        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
-         &kibnal_tunables.kib_io_timeout, sizeof (int),
-         0644, NULL, &proc_dointvec},
-        {IBNAL_SYSCTL_LISTENER_TIMEOUT, "listener_timeout", 
-         &kibnal_tunables.kib_listener_timeout, sizeof(int),
-         0644, NULL, &proc_dointvec},
-        {IBNAL_SYSCTL_BACKLOG, "backlog",
-         &kibnal_tunables.kib_backlog, sizeof(int),
-         0644, NULL, kibnal_listener_procint},
-        {IBNAL_SYSCTL_PORT, "port",
-         &kibnal_tunables.kib_port, sizeof(int),
-         0644, NULL, kibnal_listener_procint},
-        { 0 }
+lnd_t the_kiblnd = {
+#ifdef USING_TSAPI
+        .lnd_type       = CIBLND,
+#else
+        .lnd_type       = OPENIBLND,
+#endif
+        .lnd_startup    = kibnal_startup,
+        .lnd_shutdown   = kibnal_shutdown,
+        .lnd_ctl        = kibnal_ctl,
+        .lnd_send       = kibnal_send,
+        .lnd_recv       = kibnal_recv,
+        .lnd_eager_recv = kibnal_eager_recv,
+        .lnd_accept     = kibnal_accept,
 };
 
-static ctl_table kibnal_top_ctl_table[] = {
-        {IBNAL_SYSCTL, "openibnal", NULL, 0, 0555, kibnal_ctl_table},
-        { 0 }
-};
+kib_data_t              kibnal_data;
 
 __u32 
 kibnal_cksum (void *ptr, int nob)
@@ -79,31 +61,35 @@ kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
 }
 
 void
-kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid, __u64 dststamp)
+kibnal_pack_msg(kib_msg_t *msg, int version, int credits, 
+                lnet_nid_t dstnid, __u64 dststamp)
 {
         /* CAVEAT EMPTOR! all message fields not set here should have been
          * initialised previously. */
         msg->ibm_magic    = IBNAL_MSG_MAGIC;
-        msg->ibm_version  = IBNAL_MSG_VERSION;
+        msg->ibm_version  = version;
         /*   ibm_type */
         msg->ibm_credits  = credits;
         /*   ibm_nob */
         msg->ibm_cksum    = 0;
-        msg->ibm_srcnid   = kibnal_lib.libnal_ni.ni_pid.nid;
+        msg->ibm_srcnid   = lnet_ptlcompat_srcnid(kibnal_data.kib_ni->ni_nid,
+                                                  dstnid);
         msg->ibm_srcstamp = kibnal_data.kib_incarnation;
         msg->ibm_dstnid   = dstnid;
         msg->ibm_dststamp = dststamp;
-#if IBNAL_CKSUM
-        /* NB ibm_cksum zero while computing cksum */
-        msg->ibm_cksum    = kibnal_cksum(msg, msg->ibm_nob);
-#endif
+
+        if (*kibnal_tunables.kib_cksum) {
+                /* NB ibm_cksum zero while computing cksum */
+                msg->ibm_cksum    = kibnal_cksum(msg, msg->ibm_nob);
+        }
 }
 
 int
-kibnal_unpack_msg(kib_msg_t *msg, int nob)
+kibnal_unpack_msg(kib_msg_t *msg, int expected_version, int nob)
 {
         const int hdr_size = offsetof(kib_msg_t, ibm_u);
         __u32     msg_cksum;
+        int       msg_version;
         int       flip;
         int       msg_nob;
 
@@ -121,9 +107,12 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                 return -EPROTO;
         }
 
-        if (msg->ibm_version != 
-            (flip ? __swab16(IBNAL_MSG_VERSION) : IBNAL_MSG_VERSION)) {
-                CERROR("Bad version: %d\n", msg->ibm_version);
+        msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
+        if ((expected_version == 0) ?
+            (msg_version != IBNAL_MSG_VERSION &&
+             msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) :
+            (msg_version != expected_version)) {
+                CERROR("Bad version: %x\n", msg_version);
                 return -EPROTO;
         }
 
@@ -151,7 +140,7 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
         
         if (flip) {
                 /* leave magic unflipped as a clue to peer endianness */
-                __swab16s(&msg->ibm_version);
+                msg->ibm_version = msg_version;
                 LASSERT (sizeof(msg->ibm_type) == 1);
                 LASSERT (sizeof(msg->ibm_credits) == 1);
                 msg->ibm_nob = msg_nob;
@@ -161,8 +150,8 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                 __swab64s(&msg->ibm_dststamp);
         }
         
-        if (msg->ibm_srcnid == PTL_NID_ANY) {
-                CERROR("Bad src nid: "LPX64"\n", msg->ibm_srcnid);
+        if (msg->ibm_srcnid == LNET_NID_ANY) {
+                CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
                 return -EPROTO;
         }
 
@@ -235,223 +224,11 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
 }
 
 int
-kibnal_sock_write (struct socket *sock, void *buffer, int nob)
-{
-        int           rc;
-        mm_segment_t  oldmm = get_fs();
-        struct iovec  iov = {
-                .iov_base = buffer,
-                .iov_len  = nob
-        };
-        struct msghdr msg = {
-                .msg_name       = NULL,
-                .msg_namelen    = 0,
-                .msg_iov        = &iov,
-                .msg_iovlen     = 1,
-                .msg_control    = NULL,
-                .msg_controllen = 0,
-                .msg_flags      = MSG_DONTWAIT
-        };
-
-        /* We've set up the socket's send buffer to be large enough for
-         * everything we send, so a single non-blocking send should
-         * complete without error. */
-
-        set_fs(KERNEL_DS);
-        rc = sock_sendmsg(sock, &msg, iov.iov_len);
-        set_fs(oldmm);
-
-        if (rc == nob)
-                return 0;
-
-        if (rc >= 0)
-                return -EAGAIN;
-
-        return rc;
-}
-
-int
-kibnal_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
-{
-        int            rc;
-        mm_segment_t   oldmm = get_fs();
-        long           ticks = timeout * HZ;
-        unsigned long  then;
-        struct timeval tv;
-
-        LASSERT (nob > 0);
-        LASSERT (ticks > 0);
-
-        for (;;) {
-                struct iovec  iov = {
-                        .iov_base = buffer,
-                        .iov_len  = nob
-                };
-                struct msghdr msg = {
-                        .msg_name       = NULL,
-                        .msg_namelen    = 0,
-                        .msg_iov        = &iov,
-                        .msg_iovlen     = 1,
-                        .msg_control    = NULL,
-                        .msg_controllen = 0,
-                        .msg_flags      = 0
-                };
-
-                /* Set receive timeout to remaining time */
-                tv = (struct timeval) {
-                        .tv_sec = ticks / HZ,
-                        .tv_usec = ((ticks % HZ) * 1000000) / HZ
-                };
-                set_fs(KERNEL_DS);
-                rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
-                                     (char *)&tv, sizeof(tv));
-                set_fs(oldmm);
-                if (rc != 0) {
-                        CERROR("Can't set socket recv timeout %d: %d\n",
-                               timeout, rc);
-                        return rc;
-                }
-
-                set_fs(KERNEL_DS);
-                then = jiffies;
-                rc = sock_recvmsg(sock, &msg, iov.iov_len, 0);
-                ticks -= jiffies - then;
-                set_fs(oldmm);
-
-                if (rc < 0)
-                        return rc;
-
-                if (rc == 0)
-                        return -ECONNABORTED;
-
-                buffer = ((char *)buffer) + rc;
-                nob -= rc;
-
-                if (nob == 0)
-                        return 0;
-
-                if (ticks <= 0)
-                        return -ETIMEDOUT;
-        }
-}
-
-int
-kibnal_create_sock(struct socket **sockp)
-{
-        struct socket       *sock;
-        int                  rc;
-        int                  option;
-        mm_segment_t         oldmm = get_fs();
-
-        rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock);
-        if (rc != 0) {
-                CERROR("Can't create socket: %d\n", rc);
-                return rc;
-        }
-
-        /* Ensure sends will not block */
-        option = 2 * sizeof(kib_msg_t);
-        set_fs(KERNEL_DS);
-        rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
-                             (char *)&option, sizeof(option));
-        set_fs(oldmm);
-        if (rc != 0) {
-                CERROR("Can't set send buffer %d: %d\n", option, rc);
-                goto failed;
-        }
-
-        option = 1;
-        set_fs(KERNEL_DS);
-        rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
-                             (char *)&option, sizeof(option));
-        set_fs(oldmm);
-        if (rc != 0) {
-                CERROR("Can't set SO_REUSEADDR: %d\n", rc);
-                goto failed;
-        }
-
-        *sockp = sock;
-        return 0;
-
- failed:
-        sock_release(sock);
-        return rc;
-}
-
-void
-kibnal_pause(int ticks)
-{
-        set_current_state(TASK_UNINTERRUPTIBLE);
-        schedule_timeout(ticks);
-}
-
-int
-kibnal_connect_sock(kib_peer_t *peer, struct socket **sockp)
-{
-        struct sockaddr_in  locaddr;
-        struct sockaddr_in  srvaddr;
-        struct socket      *sock;
-        unsigned int        port;
-        int                 rc;
-
-        for (port = 1023; port >= 512; port--) {
-
-                memset(&locaddr, 0, sizeof(locaddr)); 
-                locaddr.sin_family      = AF_INET; 
-                locaddr.sin_port        = htons(port);
-                locaddr.sin_addr.s_addr = htonl(INADDR_ANY);
-
-                memset (&srvaddr, 0, sizeof (srvaddr));
-                srvaddr.sin_family      = AF_INET;
-                srvaddr.sin_port        = htons (peer->ibp_port);
-                srvaddr.sin_addr.s_addr = htonl (peer->ibp_ip);
-
-                rc = kibnal_create_sock(&sock);
-                if (rc != 0)
-                        return rc;
-
-                rc = sock->ops->bind(sock,
-                                     (struct sockaddr *)&locaddr, sizeof(locaddr));
-                if (rc != 0) {
-                        sock_release(sock);
-                        
-                        if (rc == -EADDRINUSE) {
-                                CDEBUG(D_NET, "Port %d already in use\n", port);
-                                continue;
-                        }
-
-                        CERROR("Can't bind to reserved port %d: %d\n", port, rc);
-                        return rc;
-                }
-
-                rc = sock->ops->connect(sock,
-                                        (struct sockaddr *)&srvaddr, sizeof(srvaddr),
-                                        0);
-                if (rc == 0) {
-                        *sockp = sock;
-                        return 0;
-                }
-                
-                sock_release(sock);
-
-                if (rc != -EADDRNOTAVAIL) {
-                        CERROR("Can't connect port %d to %u.%u.%u.%u/%d: %d\n",
-                               port, HIPQUAD(peer->ibp_ip), peer->ibp_port, rc);
-                        return rc;
-                }
-                
-                CDEBUG(D_NET, "Port %d not available for %u.%u.%u.%u/%d\n", 
-                       port, HIPQUAD(peer->ibp_ip), peer->ibp_port);
-        }
-
-        /* all ports busy */
-        return -EHOSTUNREACH;
-}
-
-int
 kibnal_make_svcqry (kib_conn_t *conn) 
 {
         kib_peer_t    *peer = conn->ibc_peer;
+        int            version = IBNAL_MSG_VERSION;
+        int            msg_version;
         kib_msg_t     *msg;
         struct socket *sock;
         int            rc;
@@ -460,115 +237,235 @@ kibnal_make_svcqry (kib_conn_t *conn)
         LASSERT (conn->ibc_connreq != NULL);
         msg = &conn->ibc_connreq->cr_msg;
 
+ again:
         kibnal_init_msg(msg, IBNAL_MSG_SVCQRY, 0);
-        kibnal_pack_msg(msg, 0, peer->ibp_nid, 0);
+        kibnal_pack_msg(msg, version, 0, peer->ibp_nid, 0);
 
-        rc = kibnal_connect_sock(peer, &sock);
+        rc = lnet_connect(&sock, peer->ibp_nid,
+                          0, peer->ibp_ip, peer->ibp_port);
         if (rc != 0)
-                return rc;
+                return -ECONNABORTED;
         
-        rc = kibnal_sock_write(sock, msg, msg->ibm_nob);
+        rc = libcfs_sock_write(sock, msg, msg->ibm_nob,
+                               lnet_acceptor_timeout());
+        if (rc != 0) {
+                CERROR("Error %d sending svcqry to %s at %u.%u.%u.%u/%d\n", 
+                       rc, libcfs_nid2str(peer->ibp_nid), 
+                       HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                goto out;
+        }
+
+        /* The first 6 bytes are invariably MAGIC + proto version */
+        rc = libcfs_sock_read(sock, msg, 6, *kibnal_tunables.kib_timeout);
         if (rc != 0) {
-                CERROR("Error %d sending svcqry to "
-                       LPX64"@%u.%u.%u.%u/%d\n", rc, 
-                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                CERROR("Error %d receiving svcrsp from %s at %u.%u.%u.%u/%d\n", 
+                       rc, libcfs_nid2str(peer->ibp_nid), 
+                       HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                goto out;
+        }
+
+        if (msg->ibm_magic != IBNAL_MSG_MAGIC &&
+            msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
+                CERROR("Bad magic: %08x from %s at %u.%u.%u.%u/%d\n",
+                       msg->ibm_magic, libcfs_nid2str(peer->ibp_nid),
+                       HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                rc = -EPROTO;
                 goto out;
         }
 
-        nob = offsetof(kib_msg_t, ibm_u) + sizeof(msg->ibm_u.svcrsp);
-        rc = kibnal_sock_read(sock, msg, nob, kibnal_tunables.kib_io_timeout);
+        msg_version = (msg->ibm_magic == IBNAL_MSG_MAGIC) ? 
+                      msg->ibm_version : __swab16(msg->ibm_version);
+        if (msg_version != version) {
+                if (version == IBNAL_MSG_VERSION) {
+                        /* retry with previous version */
+                        libcfs_sock_release(sock);
+                        version = IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD;
+                        goto again;
+                }
+                
+                CERROR("Bad version %x from %s at %u.%u.%u.%u/%d\n",
+                       msg_version, libcfs_nid2str(peer->ibp_nid),
+                       HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                rc = -EPROTO;
+                goto out;
+        }
+
+        /* Read in the rest of the message now we know the expected format */
+        nob = offsetof(kib_msg_t, ibm_u) + sizeof(kib_svcrsp_t);
+        rc = libcfs_sock_read(sock, ((char *)msg) + 6, nob - 6,
+                              *kibnal_tunables.kib_timeout);
         if (rc != 0) {
-                CERROR("Error %d receiving svcrsp from "
-                       LPX64"@%u.%u.%u.%u/%d\n", rc
-                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                CERROR("Error %d receiving svcrsp from %s at %u.%u.%u.%u/%d\n", 
+                       rc, libcfs_nid2str(peer->ibp_nid), 
+                       HIPQUAD(peer->ibp_ip), peer->ibp_port);
                 goto out;
         }
 
-        rc = kibnal_unpack_msg(msg, nob);
+        rc = kibnal_unpack_msg(msg, version, nob);
         if (rc != 0) {
-                CERROR("Error %d unpacking svcrsp from "
-                       LPX64"@%u.%u.%u.%u/%d\n", rc,
-                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                CERROR("Error %d unpacking svcrsp from %s at %u.%u.%u.%u/%d\n", 
+                       rc, libcfs_nid2str(peer->ibp_nid), 
+                       HIPQUAD(peer->ibp_ip), peer->ibp_port);
                 goto out;
         }
                        
         if (msg->ibm_type != IBNAL_MSG_SVCRSP) {
-                CERROR("Unexpected response type %d from "
-                       LPX64"@%u.%u.%u.%u/%d\n", msg->ibm_type
-                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                CERROR("Unexpected response type %d from %s at %u.%u.%u.%u/%d\n", 
+                       msg->ibm_type, libcfs_nid2str(peer->ibp_nid), 
+                       HIPQUAD(peer->ibp_ip), peer->ibp_port);
                 rc = -EPROTO;
                 goto out;
         }
         
-        if (msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
+        if (!lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
+                                     msg->ibm_dstnid) ||
             msg->ibm_dststamp != kibnal_data.kib_incarnation) {
-                CERROR("Unexpected dst NID/stamp "LPX64"/"LPX64" from "
-                       LPX64"@%u.%u.%u.%u/%d\n", 
-                       msg->ibm_dstnid, msg->ibm_dststamp,
-                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
+                CERROR("Unexpected dst NID/stamp %s/"LPX64" from "
+                       "%s at %u.%u.%u.%u/%d\n", 
+                       libcfs_nid2str(msg->ibm_dstnid), msg->ibm_dststamp,
+                       libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), 
+                       peer->ibp_port);
                 rc = -EPROTO;
                 goto out;
         }
 
-        if (msg->ibm_srcnid != peer->ibp_nid) {
-                CERROR("Unexpected src NID "LPX64" from "
-                       LPX64"@%u.%u.%u.%u/%d\n", msg->ibm_srcnid,
-                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port);
+        if (!lnet_ptlcompat_matchnid(peer->ibp_nid, msg->ibm_srcnid)) {
+                CERROR("Unexpected src NID %s from %s at %u.%u.%u.%u/%d\n", 
+                       libcfs_nid2str(msg->ibm_srcnid),
+                       libcfs_nid2str(peer->ibp_nid), 
+                       HIPQUAD(peer->ibp_ip), peer->ibp_port);
                 rc = -EPROTO;
                 goto out;
         }
 
         conn->ibc_incarnation = msg->ibm_srcstamp;
         conn->ibc_connreq->cr_svcrsp = msg->ibm_u.svcrsp;
+        conn->ibc_version = version;
+        
  out:
-        sock_release(sock);
+        libcfs_sock_release(sock);
         return rc;
 }
 
 void
 kibnal_handle_svcqry (struct socket *sock)
 {
-        struct sockaddr_in   addr;
         __u32                peer_ip;
         unsigned int         peer_port;
         kib_msg_t           *msg;
         __u64                srcnid;
         __u64                srcstamp;
-        int                  len;
+        int                  version;
+        int                  reject = 0;
         int                  rc;
 
-        len = sizeof(addr);
-        rc = sock->ops->getname(sock, (struct sockaddr *)&addr, &len, 2);
+        rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
         if (rc != 0) {
                 CERROR("Can't get peer's IP: %d\n", rc);
                 return;
         }
 
-        peer_ip = ntohl(addr.sin_addr.s_addr);
-        peer_port = ntohs(addr.sin_port);
-
-        if (peer_port >= 1024) {
-                CERROR("Refusing unprivileged connection from %u.%u.%u.%u/%d\n",
+        LIBCFS_ALLOC(msg, sizeof(*msg));
+        if (msg == NULL) {
+                CERROR("Can't allocate msgs for %u.%u.%u.%u/%d\n",
                        HIPQUAD(peer_ip), peer_port);
                 return;
         }
+        
+        rc = libcfs_sock_read(sock, &msg->ibm_magic, sizeof(msg->ibm_magic),
+                              lnet_acceptor_timeout());
+        if (rc != 0) {
+                CERROR("Error %d receiving svcqry(1) from %u.%u.%u.%u/%d\n",
+                       rc, HIPQUAD(peer_ip), peer_port);
+                goto out;
+        }
 
-        PORTAL_ALLOC(msg, sizeof(*msg));
-        if (msg == NULL) {
-                CERROR("Can't allocate msgs for %u.%u.%u.%u/%d\n",
-                       HIPQUAD(peer_ip), peer_port);
+        if (msg->ibm_magic != IBNAL_MSG_MAGIC &&
+            msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
+                /* Unexpected magic! */
+                if (the_lnet.ln_ptlcompat == 0) {
+                        if (msg->ibm_magic == LNET_PROTO_MAGIC ||
+                            msg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) {
+                                /* future protocol version compatibility!
+                                 * When LNET unifies protocols over all LNDs,
+                                 * the first thing sent will be a version
+                                 * query.  I send back a reply in my current
+                                 * protocol to tell her I'm "old" */
+                                kibnal_init_msg(msg, 0, 0);
+                                kibnal_pack_msg(msg, IBNAL_MSG_VERSION, 0, 
+                                                LNET_NID_ANY, 0);
+                                reject = 1;
+                                goto reply;
+                        }
+
+                        CERROR ("Bad magic(1) %#08x (%#08x expected) from "
+                                "%u.%u.%u.%u/%d\n", msg->ibm_magic,
+                                IBNAL_MSG_MAGIC, HIPQUAD(peer_ip), peer_port);
+                        goto out;
+                }
+
+                /* When portals compatibility is set, I may be passed a new
+                 * connection "blindly" by the acceptor, and I have to
+                 * determine if my peer has sent an acceptor connection request
+                 * or not. */
+                rc = lnet_accept(kibnal_data.kib_ni, sock, msg->ibm_magic);
+                if (rc != 0)
+                        goto out;
+
+                /* It was an acceptor connection request!
+                 * Now I should see my magic... */
+                rc = libcfs_sock_read(sock, &msg->ibm_magic,
+                                      sizeof(msg->ibm_magic),
+                                      lnet_acceptor_timeout());
+                if (rc != 0) {
+                        CERROR("Error %d receiving svcqry(2) from %u.%u.%u.%u/%d\n",
+                               rc, HIPQUAD(peer_ip), peer_port);
+                        goto out;
+                }
+
+                if (msg->ibm_magic != IBNAL_MSG_MAGIC &&
+                    msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
+                        CERROR ("Bad magic(2) %#08x (%#08x expected) from "
+                                "%u.%u.%u.%u/%d\n", msg->ibm_magic,
+                                IBNAL_MSG_MAGIC, HIPQUAD(peer_ip), peer_port);
+                        goto out;
+                }
+        }
+
+        /* Now check version */
+
+        rc = libcfs_sock_read(sock, &msg->ibm_version, sizeof(msg->ibm_version),
+                              lnet_acceptor_timeout());
+        if (rc != 0) {
+                CERROR("Error %d receiving svcqry(3) from %u.%u.%u.%u/%d\n",
+                       rc, HIPQUAD(peer_ip), peer_port);
                 goto out;
         }
+
+        version = (msg->ibm_magic == IBNAL_MSG_MAGIC) ?
+                  msg->ibm_version : __swab16(msg->ibm_version);
+        /* Peer is a different protocol version: reply in my current protocol
+         * to tell her I'm "old" */
+        if (version != IBNAL_MSG_VERSION &&
+            version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
+                kibnal_init_msg(msg, 0, 0);
+                kibnal_pack_msg(msg, IBNAL_MSG_VERSION, 0, LNET_NID_ANY, 0);
+                reject = 1;
+                goto reply;
+        }
         
-        rc = kibnal_sock_read(sock, msg, offsetof(kib_msg_t, ibm_u),
-                              kibnal_tunables.kib_listener_timeout);
+        /* Now read in all the rest */
+        rc = libcfs_sock_read(sock, &msg->ibm_type,
+                              offsetof(kib_msg_t, ibm_u) -
+                              offsetof(kib_msg_t, ibm_type),
+                              lnet_acceptor_timeout());
         if (rc != 0) {
-                CERROR("Error %d receiving svcqry from %u.%u.%u.%u/%d\n",
+                CERROR("Error %d receiving svcqry(4) from %u.%u.%u.%u/%d\n",
                        rc, HIPQUAD(peer_ip), peer_port);
                 goto out;
         }
         
-        rc = kibnal_unpack_msg(msg, offsetof(kib_msg_t, ibm_u));
+        rc = kibnal_unpack_msg(msg, version, offsetof(kib_msg_t, ibm_u));
         if (rc != 0) {
                 CERROR("Error %d unpacking svcqry from %u.%u.%u.%u/%d\n",
                        rc, HIPQUAD(peer_ip), peer_port);
@@ -581,10 +478,11 @@ kibnal_handle_svcqry (struct socket *sock)
                 goto out;
         }
         
-        if (msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid) {
-                CERROR("Unexpected dstnid "LPX64"(expected "LPX64" "
-                       "from %u.%u.%u.%u/%d\n", msg->ibm_dstnid,
-                       kibnal_lib.libnal_ni.ni_pid.nid,
+        if (!lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
+                                     msg->ibm_dstnid)) {
+                CERROR("Unexpected dstnid %s: expected %s from %u.%u.%u.%u/%d\n",
+                       libcfs_nid2str(msg->ibm_dstnid),
+                       libcfs_nid2str(kibnal_data.kib_ni->ni_nid),
                        HIPQUAD(peer_ip), peer_port);
                 goto out;
         }
@@ -599,277 +497,50 @@ kibnal_handle_svcqry (struct socket *sock)
                sizeof(kibnal_data.kib_svc_gid));
         msg->ibm_u.svcrsp.ibsr_svc_pkey = kibnal_data.kib_svc_pkey;
 
-        kibnal_pack_msg(msg, 0, srcnid, srcstamp);
-        
-        rc = kibnal_sock_write (sock, msg, msg->ibm_nob);
-        if (rc != 0) {
+        kibnal_pack_msg(msg, version, 0, srcnid, srcstamp);
+
+ reply:
+        rc = libcfs_sock_write (sock, msg, msg->ibm_nob,
+                                lnet_acceptor_timeout());
+        if (!reject && rc != 0) {
+                /* Only complain if we're not rejecting */
                 CERROR("Error %d replying to svcqry from %u.%u.%u.%u/%d\n",
                        rc, HIPQUAD(peer_ip), peer_port);
                 goto out;
         }
         
  out:
-        PORTAL_FREE(msg, sizeof(*msg));
+        LIBCFS_FREE(msg, sizeof(*msg));
 }
 
 void
 kibnal_free_acceptsock (kib_acceptsock_t *as)
 {
-        sock_release(as->ibas_sock);
-        PORTAL_FREE(as, sizeof(*as));
+        libcfs_sock_release(as->ibas_sock);
+        LIBCFS_FREE(as, sizeof(*as));
 }
 
 int
-kibnal_ip_listener(void *arg)
+kibnal_accept(lnet_ni_t *ni, struct socket *sock)
 {
-        struct sockaddr_in addr;
-        wait_queue_t       wait;
-        struct socket     *sock;
         kib_acceptsock_t  *as;
-        int                port;
-        char               name[16];
-        int                rc;
         unsigned long      flags;
 
-        /* Parent thread holds kib_nid_mutex, and is, or is about to
-         * block on kib_listener_signal */
-
-        port = kibnal_tunables.kib_port;
-        snprintf(name, sizeof(name), "kibnal_lstn%03d", port);
-        kportal_daemonize(name);
-        kportal_blockallsigs();
-
-        init_waitqueue_entry(&wait, current);
-
-        rc = kibnal_create_sock(&sock);
-        if (rc != 0)
-                goto out_0;
-
-        memset(&addr, 0, sizeof(addr));
-        addr.sin_family      = AF_INET;
-        addr.sin_port        = htons(port);
-        addr.sin_addr.s_addr = INADDR_ANY;
-
-        rc = sock->ops->bind(sock, (struct sockaddr *)&addr, sizeof(addr));
-        if (rc != 0) {
-                CERROR("Can't bind to port %d\n", port);
-                goto out_1;
+        LIBCFS_ALLOC(as, sizeof(*as));
+        if (as == NULL) {
+                CERROR("Out of Memory\n");
+                return -ENOMEM;
         }
 
-        rc = sock->ops->listen(sock, kibnal_tunables.kib_backlog);
-        if (rc != 0) {
-                CERROR("Can't set listen backlog %d: %d\n", 
-                       kibnal_tunables.kib_backlog, rc);
-                goto out_1;
-        }
-
-        LASSERT (kibnal_data.kib_listener_sock == NULL);
-        kibnal_data.kib_listener_sock = sock;
-
-        /* unblock waiting parent */
-        LASSERT (kibnal_data.kib_listener_shutdown == 0);
-        up(&kibnal_data.kib_listener_signal);
-
-        /* Wake me any time something happens on my socket */
-        add_wait_queue(sock->sk->sk_sleep, &wait);
-        as = NULL;
-
-        while (kibnal_data.kib_listener_shutdown == 0) {
-
-                if (as == NULL) {
-                        PORTAL_ALLOC(as, sizeof(*as));
-                        if (as == NULL) {
-                                CERROR("Out of Memory: pausing...\n");
-                                kibnal_pause(HZ);
-                                continue;
-                        }
-                        as->ibas_sock = NULL;
-                }
-
-                if (as->ibas_sock == NULL) {
-                        as->ibas_sock = sock_alloc();
-                        if (as->ibas_sock == NULL) {
-                                CERROR("Can't allocate socket: pausing...\n");
-                                kibnal_pause(HZ);
-                                continue;
-                        }
-                        /* XXX this should add a ref to sock->ops->owner, if
-                         * TCP could be a module */
-                        as->ibas_sock->type = sock->type;
-                        as->ibas_sock->ops = sock->ops;
-                }
+        as->ibas_sock = sock;
                 
-                set_current_state(TASK_INTERRUPTIBLE);
-
-                rc = sock->ops->accept(sock, as->ibas_sock, O_NONBLOCK);
-
-                /* Sleep for socket activity? */
-                if (rc == -EAGAIN &&
-                    kibnal_data.kib_listener_shutdown == 0)
-                        schedule();
-
-                set_current_state(TASK_RUNNING);
-
-                if (rc == 0) {
-                        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
-                        
-                        list_add_tail(&as->ibas_list, 
-                                      &kibnal_data.kib_connd_acceptq);
-
-                        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
-                        wake_up(&kibnal_data.kib_connd_waitq);
-
-                        as = NULL;
-                        continue;
-                }
-                
-                if (rc != -EAGAIN) {
-                        CERROR("Accept failed: %d, pausing...\n", rc);
-                        kibnal_pause(HZ);
-                }
-        }
-
-        if (as != NULL) {
-                if (as->ibas_sock != NULL)
-                        sock_release(as->ibas_sock);
-                PORTAL_FREE(as, sizeof(*as));
-        }
-
-        rc = 0;
-        remove_wait_queue(sock->sk->sk_sleep, &wait);
- out_1:
-        sock_release(sock);
-        kibnal_data.kib_listener_sock = NULL;
- out_0:
-        /* set completion status and unblock thread waiting for me 
-         * (parent on startup failure, executioner on normal shutdown) */
-        kibnal_data.kib_listener_shutdown = rc;
-        up(&kibnal_data.kib_listener_signal);
-
-        return 0;
-}
-
-int
-kibnal_start_ip_listener (void)
-{
-        long           pid;
-        int            rc;
-
-        CDEBUG(D_NET, "Starting listener\n");
-
-        /* Called holding kib_nid_mutex: listener stopped */
-        LASSERT (kibnal_data.kib_listener_sock == NULL);
-
-        kibnal_data.kib_listener_shutdown = 0;
-        pid = kernel_thread(kibnal_ip_listener, NULL, 0);
-        if (pid < 0) {
-                CERROR("Can't spawn listener: %ld\n", pid);
-                return (int)pid;
-        }
-
-        /* Block until listener has started up. */
-        down(&kibnal_data.kib_listener_signal);
-
-        rc = kibnal_data.kib_listener_shutdown;
-        LASSERT ((rc != 0) == (kibnal_data.kib_listener_sock == NULL));
-
-        CDEBUG((rc == 0) ? D_WARNING : D_ERROR, 
-               "Listener %s: pid:%ld port:%d backlog:%d\n", 
-               (rc == 0) ? "started OK" : "startup failed",
-               pid, kibnal_tunables.kib_port, kibnal_tunables.kib_backlog);
-
-        return rc;
-}
-
-void
-kibnal_stop_ip_listener(int clear_acceptq)
-{
-        struct list_head  zombie_accepts;
-        kib_acceptsock_t *as;
-        unsigned long     flags;
-
-        CDEBUG(D_NET, "Stopping listener\n");
-
-        /* Called holding kib_nid_mutex: listener running */
-        LASSERT (kibnal_data.kib_listener_sock != NULL);
-
-        kibnal_data.kib_listener_shutdown = 1;
-        wake_up_all(kibnal_data.kib_listener_sock->sk->sk_sleep);
-
-        /* Block until listener has torn down. */
-        down(&kibnal_data.kib_listener_signal);
-
-        LASSERT (kibnal_data.kib_listener_sock == NULL);
-        CWARN("Listener stopped\n");
-
-        if (!clear_acceptq)
-                return;
-
-        /* Close any unhandled accepts */
         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
-
-        list_add(&zombie_accepts, &kibnal_data.kib_connd_acceptq);
-        list_del_init(&kibnal_data.kib_connd_acceptq);
+                
+        list_add_tail(&as->ibas_list, &kibnal_data.kib_connd_acceptq);
+        wake_up(&kibnal_data.kib_connd_waitq);
 
         spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
-        
-        while (!list_empty(&zombie_accepts)) {
-                as = list_entry(zombie_accepts.next,
-                                kib_acceptsock_t, ibas_list);
-                list_del(&as->ibas_list);
-                kibnal_free_acceptsock(as);
-        }
-}
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8)
-int 
-kibnal_listener_procint(ctl_table *table, int write, struct file *filp,
-                        void *buffer, size_t *lenp)
-#else
-int 
-kibnal_listener_procint(ctl_table *table, int write, struct file *filp,
-                        void *buffer, size_t *lenp, loff_t *ppos)
-#endif
-{
-        int   *tunable = (int *)table->data;
-        int    old_val;
-        int    rc;
-
-        /* No race with nal initialisation since the nal is setup all the time
-         * it's loaded.  When that changes, change this! */
-        LASSERT (kibnal_data.kib_init == IBNAL_INIT_ALL);
-
-        down(&kibnal_data.kib_nid_mutex);
-
-        LASSERT (tunable == &kibnal_tunables.kib_port ||
-                 tunable == &kibnal_tunables.kib_backlog);
-        old_val = *tunable;
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8)
-        rc = proc_dointvec(table, write, filp, buffer, lenp);
-#else
-        rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
-#endif
-        if (write &&
-            (*tunable != old_val ||
-             kibnal_data.kib_listener_sock == NULL)) {
-
-                if (kibnal_data.kib_listener_sock != NULL)
-                        kibnal_stop_ip_listener(0);
-
-                rc = kibnal_start_ip_listener();
-                if (rc != 0) {
-                        CERROR("Unable to restart listener with new tunable:"
-                               " reverting to old value\n");
-                        *tunable = old_val;
-                        kibnal_start_ip_listener();
-                }
-        }
-
-        up(&kibnal_data.kib_nid_mutex);
-
-        LASSERT (kibnal_data.kib_init == IBNAL_INIT_ALL);
-        return rc;
+        return 0;
 }
 
 int
@@ -929,79 +600,20 @@ kibnal_stop_ib_listener (void)
 }
 
 int
-kibnal_set_mynid (ptl_nid_t nid)
+kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid)
 {
-        lib_ni_t         *ni = &kibnal_lib.libnal_ni;
-        int               rc;
-
-        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
-               nid, ni->ni_pid.nid);
+        kib_peer_t     *peer;
+        unsigned long   flags;
+        int             rc;
 
-        down (&kibnal_data.kib_nid_mutex);
+        LASSERT (nid != LNET_NID_ANY);
 
-        if (nid == kibnal_data.kib_nid) {
-                /* no change of NID */
-                up (&kibnal_data.kib_nid_mutex);
-                return (0);
+        LIBCFS_ALLOC(peer, sizeof (*peer));
+        if (peer == NULL) {
+                CERROR("Cannot allocate peer\n");
+                return -ENOMEM;
         }
 
-        CDEBUG(D_NET, "NID "LPX64"("LPX64")\n",
-               kibnal_data.kib_nid, nid);
-
-        if (kibnal_data.kib_listener_sock != NULL)
-                kibnal_stop_ip_listener(1);
-        
-        if (kibnal_data.kib_listen_handle != NULL)
-                kibnal_stop_ib_listener();
-
-        ni->ni_pid.nid = nid;
-        kibnal_data.kib_incarnation++;
-        mb();
-        /* Delete all existing peers and their connections after new
-         * NID/incarnation set to ensure no old connections in our brave new
-         * world. */
-        kibnal_del_peer (PTL_NID_ANY, 0);
-
-        if (ni->ni_pid.nid != PTL_NID_ANY) {
-                /* got a new NID to install */
-                rc = kibnal_start_ib_listener();
-                if (rc != 0) {
-                        CERROR("Can't start IB listener: %d\n", rc);
-                        goto failed_0;
-                }
-        
-                rc = kibnal_start_ip_listener();
-                if (rc != 0) {
-                        CERROR("Can't start IP listener: %d\n", rc);
-                        goto failed_1;
-                }
-        }
-        
-        up(&kibnal_data.kib_nid_mutex);
-        return 0;
-
- failed_1:
-        kibnal_stop_ib_listener();
- failed_0:
-        ni->ni_pid.nid = PTL_NID_ANY;
-        kibnal_data.kib_incarnation++;
-        mb();
-        kibnal_del_peer (PTL_NID_ANY, 0);
-        up(&kibnal_data.kib_nid_mutex);
-        return rc;
-}
-
-kib_peer_t *
-kibnal_create_peer (ptl_nid_t nid)
-{
-        kib_peer_t *peer;
-
-        LASSERT (nid != PTL_NID_ANY);
-
-        PORTAL_ALLOC (peer, sizeof (*peer));
-        if (peer == NULL)
-                return (NULL);
-
         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
 
         peer->ibp_nid = nid;
@@ -1012,53 +624,63 @@ kibnal_create_peer (ptl_nid_t nid)
         INIT_LIST_HEAD (&peer->ibp_tx_queue);
         INIT_LIST_HEAD (&peer->ibp_connd_list); /* not queued for connecting */
 
-        peer->ibp_reconnect_time = jiffies;
-        peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+        peer->ibp_error = 0;
+        peer->ibp_last_alive = cfs_time_current();
+        peer->ibp_reconnect_interval = 0;       /* OK to connect at any time */
 
-        atomic_inc (&kibnal_data.kib_npeers);
-        CDEBUG(D_NET, "peer %p "LPX64"\n", peer, nid);
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
-        return (peer);
+        if (atomic_read(&kibnal_data.kib_npeers) >=
+            *kibnal_tunables.kib_concurrent_peers) {
+                rc = -EOVERFLOW;        /* !! but at least it distinguishes */
+        } else if (kibnal_data.kib_nonewpeers) {
+                rc = -ESHUTDOWN;        /* shutdown has started */
+        } else {
+                rc = 0;
+                /* npeers only grows with kib_global_lock held */
+                atomic_inc(&kibnal_data.kib_npeers);
+        }
+        
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+        if (rc != 0) {
+                CERROR("Can't create peer: %s\n", 
+                       (rc == -ESHUTDOWN) ? "shutting down" : 
+                       "too many peers");
+                LIBCFS_FREE(peer, sizeof(*peer));
+        } else {
+                *peerp = peer;
+        }
+        
+        return rc;
 }
 
 void
 kibnal_destroy_peer (kib_peer_t *peer)
 {
-        CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer);
+        CDEBUG (D_NET, "peer %s %p deleted\n", 
+                libcfs_nid2str(peer->ibp_nid), peer);
 
         LASSERT (atomic_read (&peer->ibp_refcount) == 0);
         LASSERT (peer->ibp_persistence == 0);
         LASSERT (!kibnal_peer_active(peer));
         LASSERT (peer->ibp_connecting == 0);
+        LASSERT (peer->ibp_accepting == 0);
         LASSERT (list_empty (&peer->ibp_connd_list));
         LASSERT (list_empty (&peer->ibp_conns));
         LASSERT (list_empty (&peer->ibp_tx_queue));
 
-        PORTAL_FREE (peer, sizeof (*peer));
+        LIBCFS_FREE (peer, sizeof (*peer));
 
         /* NB a peer's connections keep a reference on their peer until
          * they are destroyed, so we can be assured that _all_ state to do
          * with this peer has been cleaned up when its refcount drops to
          * zero. */
-        atomic_dec (&kibnal_data.kib_npeers);
-}
-
-void
-kibnal_put_peer (kib_peer_t *peer)
-{
-        CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n",
-                peer, peer->ibp_nid,
-                atomic_read (&peer->ibp_refcount));
-
-        LASSERT (atomic_read (&peer->ibp_refcount) > 0);
-        if (!atomic_dec_and_test (&peer->ibp_refcount))
-                return;
-
-        kibnal_destroy_peer (peer);
+        atomic_dec(&kibnal_data.kib_npeers);
 }
 
 kib_peer_t *
-kibnal_find_peer_locked (ptl_nid_t nid)
+kibnal_find_peer_locked (lnet_nid_t nid)
 {
         struct list_head *peer_list = kibnal_nid2peerlist (nid);
         struct list_head *tmp;
@@ -1070,20 +692,19 @@ kibnal_find_peer_locked (ptl_nid_t nid)
 
                 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
                          peer->ibp_connecting != 0 || /* creating conns */
+                         peer->ibp_accepting != 0 ||
                          !list_empty (&peer->ibp_conns));  /* active conn */
 
                 if (peer->ibp_nid != nid)
                         continue;
 
-                CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
-                       peer, nid, atomic_read (&peer->ibp_refcount));
                 return (peer);
         }
         return (NULL);
 }
 
 kib_peer_t *
-kibnal_get_peer (ptl_nid_t nid)
+kibnal_get_peer (lnet_nid_t nid)
 {
         kib_peer_t     *peer;
         unsigned long   flags;
@@ -1091,7 +712,7 @@ kibnal_get_peer (ptl_nid_t nid)
         read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
         peer = kibnal_find_peer_locked (nid);
         if (peer != NULL)                       /* +1 ref for caller? */
-                atomic_inc (&peer->ibp_refcount);
+                kibnal_peer_addref(peer);
         read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 
         return (peer);
@@ -1106,11 +727,11 @@ kibnal_unlink_peer_locked (kib_peer_t *peer)
         LASSERT (kibnal_peer_active(peer));
         list_del_init (&peer->ibp_list);
         /* lose peerlist's ref */
-        kibnal_put_peer (peer);
+        kibnal_peer_decref(peer);
 }
 
 int
-kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, int *portp,
+kibnal_get_peer_info (int index, lnet_nid_t *nidp, __u32 *ipp, int *portp,
                       int *persistencep)
 {
         kib_peer_t        *peer;
@@ -1127,6 +748,7 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, int *portp,
                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
+                                 peer->ibp_accepting != 0 ||
                                  !list_empty (&peer->ibp_conns));
 
                         if (index-- > 0)
@@ -1148,24 +770,25 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, int *portp,
 }
 
 int
-kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip, int port)
+kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port)
 {
         unsigned long      flags;
         kib_peer_t        *peer;
         kib_peer_t        *peer2;
+        int                rc;
         
-        if (nid == PTL_NID_ANY)
+        if (nid == LNET_NID_ANY)
                 return (-EINVAL);
 
-        peer = kibnal_create_peer (nid);
-        if (peer == NULL)
-                return (-ENOMEM);
+        rc = kibnal_create_peer (&peer, nid);
+        if (rc != 0)
+                return rc;
 
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
         peer2 = kibnal_find_peer_locked (nid);
         if (peer2 != NULL) {
-                kibnal_put_peer (peer);
+                kibnal_peer_decref(peer);
                 peer = peer2;
         } else {
                 /* peer table takes existing ref on peer */
@@ -1182,19 +805,13 @@ kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip, int port)
 }
 
 void
-kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
+kibnal_del_peer_locked (kib_peer_t *peer)
 {
         struct list_head *ctmp;
         struct list_head *cnxt;
         kib_conn_t       *conn;
 
-        if (!single_share)
-                peer->ibp_persistence = 0;
-        else if (peer->ibp_persistence > 0)
-                peer->ibp_persistence--;
-
-        if (peer->ibp_persistence != 0)
-                return;
+        peer->ibp_persistence = 0;
 
         if (list_empty(&peer->ibp_conns)) {
                 kibnal_unlink_peer_locked(peer);
@@ -1212,9 +829,10 @@ kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
 }
 
 int
-kibnal_del_peer (ptl_nid_t nid, int single_share)
+kibnal_del_peer (lnet_nid_t nid)
 {
         unsigned long      flags;
+        CFS_LIST_HEAD     (zombies);
         struct list_head  *ptmp;
         struct list_head  *pnxt;
         kib_peer_t        *peer;
@@ -1225,7 +843,7 @@ kibnal_del_peer (ptl_nid_t nid, int single_share)
 
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
-        if (nid != PTL_NID_ANY)
+        if (nid != LNET_NID_ANY)
                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
         else {
                 lo = 0;
@@ -1237,21 +855,27 @@ kibnal_del_peer (ptl_nid_t nid, int single_share)
                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
+                                 peer->ibp_accepting != 0 ||
                                  !list_empty (&peer->ibp_conns));
 
-                        if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
+                        if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
                                 continue;
 
-                        kibnal_del_peer_locked (peer, single_share);
-                        rc = 0;         /* matched something */
+                        if (!list_empty(&peer->ibp_tx_queue)) {
+                                LASSERT (list_empty(&peer->ibp_conns));
 
-                        if (single_share)
-                                goto out;
+                                list_splice_init(&peer->ibp_tx_queue, &zombies);
+                        }
+
+                        kibnal_del_peer_locked (peer);
+                        rc = 0;         /* matched something */
                 }
         }
- out:
+
         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
+        kibnal_txlist_done(&zombies, -EIO);
+
         return (rc);
 }
 
@@ -1273,6 +897,7 @@ kibnal_get_conn_by_idx (int index)
                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence > 0 ||
                                  peer->ibp_connecting != 0 ||
+                                 peer->ibp_accepting != 0 ||
                                  !list_empty (&peer->ibp_conns));
 
                         list_for_each (ctmp, &peer->ibp_conns) {
@@ -1280,10 +905,7 @@ kibnal_get_conn_by_idx (int index)
                                         continue;
 
                                 conn = list_entry (ctmp, kib_conn_t, ibc_list);
-                                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                                       atomic_read (&conn->ibc_refcount));
-                                atomic_inc (&conn->ibc_refcount);
+                                kibnal_conn_addref(conn);
                                 read_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                                        flags);
                                 return (conn);
@@ -1310,7 +932,7 @@ kibnal_create_conn (void)
                 struct ib_qp_attribute     qp_attr;
         } params;
         
-        PORTAL_ALLOC (conn, sizeof (*conn));
+        LIBCFS_ALLOC (conn, sizeof (*conn));
         if (conn == NULL) {
                 CERROR ("Can't allocate connection\n");
                 return (NULL);
@@ -1319,14 +941,16 @@ kibnal_create_conn (void)
         /* zero flags, NULL pointers etc... */
         memset (conn, 0, sizeof (*conn));
 
+        INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred);
         INIT_LIST_HEAD (&conn->ibc_tx_queue);
+        INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd);
         INIT_LIST_HEAD (&conn->ibc_active_txs);
         spin_lock_init (&conn->ibc_lock);
         
         atomic_inc (&kibnal_data.kib_nconns);
         /* well not really, but I call destroy() on failure, which decrements */
 
-        PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
+        LIBCFS_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
         if (conn->ibc_rxs == NULL)
                 goto failed;
         memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
@@ -1360,11 +984,13 @@ kibnal_create_conn (void)
                 }
         }
 
+        /* We can post up to IBLND_MSG_QUEUE_SIZE immediate/req messages and
+         * the same # of ack/nak/rdma+done messages */
+
         params.qp_create = (struct ib_qp_create_param) {
                 .limit = {
-                        /* Sends have an optional RDMA */
-                        .max_outstanding_send_request    = 2 * IBNAL_MSG_QUEUE_SIZE,
-                        .max_outstanding_receive_request = IBNAL_MSG_QUEUE_SIZE,
+                        .max_outstanding_send_request    = 3 * IBNAL_MSG_QUEUE_SIZE,
+                        .max_outstanding_receive_request = IBNAL_RX_MSGS,
                         .max_send_gather_element         = 1,
                         .max_receive_scatter_element     = 1,
                 },
@@ -1421,6 +1047,8 @@ kibnal_destroy_conn (kib_conn_t *conn)
 
         LASSERT (atomic_read (&conn->ibc_refcount) == 0);
         LASSERT (list_empty(&conn->ibc_tx_queue));
+        LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
+        LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
         LASSERT (list_empty(&conn->ibc_active_txs));
         LASSERT (conn->ibc_nsends_posted == 0);
         LASSERT (conn->ibc_connreq == NULL);
@@ -1446,13 +1074,13 @@ kibnal_destroy_conn (kib_conn_t *conn)
                 kibnal_free_pages(conn->ibc_rx_pages);
         
         if (conn->ibc_rxs != NULL)
-                PORTAL_FREE(conn->ibc_rxs, 
+                LIBCFS_FREE(conn->ibc_rxs, 
                             IBNAL_RX_MSGS * sizeof(kib_rx_t));
 
         if (conn->ibc_peer != NULL)
-                kibnal_put_peer(conn->ibc_peer);
+                kibnal_peer_decref(conn->ibc_peer);
 
-        PORTAL_FREE(conn, sizeof (*conn));
+        LIBCFS_FREE(conn, sizeof (*conn));
 
         atomic_dec(&kibnal_data.kib_nconns);
         
@@ -1465,30 +1093,6 @@ kibnal_destroy_conn (kib_conn_t *conn)
         }
 }
 
-void
-kibnal_put_conn (kib_conn_t *conn)
-{
-        unsigned long flags;
-
-        CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n",
-                conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                atomic_read (&conn->ibc_refcount));
-
-        LASSERT (atomic_read (&conn->ibc_refcount) > 0);
-        if (!atomic_dec_and_test (&conn->ibc_refcount))
-                return;
-
-        /* last ref only goes on zombies */
-        LASSERT (conn->ibc_state == IBNAL_CONN_ZOMBIE);
-
-        spin_lock_irqsave (&kibnal_data.kib_reaper_lock, flags);
-
-        list_add (&conn->ibc_list, &kibnal_data.kib_reaper_conns);
-        wake_up (&kibnal_data.kib_reaper_waitq);
-
-        spin_unlock_irqrestore (&kibnal_data.kib_reaper_lock, flags);
-}
-
 int
 kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
 {
@@ -1521,9 +1125,10 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
                 if (conn->ibc_incarnation == incarnation)
                         continue;
 
-                CDEBUG(D_NET, "Closing stale conn %p nid:"LPX64
+                CDEBUG(D_NET, "Closing stale conn %p nid: %s"
                        " incarnation:"LPX64"("LPX64")\n", conn,
-                       peer->ibp_nid, conn->ibc_incarnation, incarnation);
+                       libcfs_nid2str(peer->ibp_nid), 
+                       conn->ibc_incarnation, incarnation);
                 
                 count++;
                 kibnal_close_conn_locked (conn, -ESTALE);
@@ -1533,7 +1138,7 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
 }
 
 int
-kibnal_close_matching_conns (ptl_nid_t nid)
+kibnal_close_matching_conns (lnet_nid_t nid)
 {
         unsigned long       flags;
         kib_peer_t         *peer;
@@ -1546,7 +1151,7 @@ kibnal_close_matching_conns (ptl_nid_t nid)
 
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
-        if (nid != PTL_NID_ANY)
+        if (nid != LNET_NID_ANY)
                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
         else {
                 lo = 0;
@@ -1559,9 +1164,10 @@ kibnal_close_matching_conns (ptl_nid_t nid)
                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
+                                 peer->ibp_accepting != 0 ||
                                  !list_empty (&peer->ibp_conns));
 
-                        if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
+                        if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
                                 continue;
 
                         count += kibnal_close_peer_conns_locked (peer, 0);
@@ -1571,72 +1177,71 @@ kibnal_close_matching_conns (ptl_nid_t nid)
         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
         /* wildcards always succeed */
-        if (nid == PTL_NID_ANY)
+        if (nid == LNET_NID_ANY)
                 return (0);
         
         return (count == 0 ? -ENOENT : 0);
 }
 
 int
-kibnal_cmd(struct portals_cfg *pcfg, void * private)
+kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
 {
-        int rc = -EINVAL;
+        struct libcfs_ioctl_data *data = arg;
+        int                       rc = -EINVAL;
 
-        LASSERT (pcfg != NULL);
+        LASSERT (ni == kibnal_data.kib_ni);
 
-        switch(pcfg->pcfg_command) {
-        case NAL_CMD_GET_PEER: {
-                ptl_nid_t   nid = 0;
+        switch(cmd) {
+        case IOC_LIBCFS_GET_PEER: {
+                lnet_nid_t   nid = 0;
                 __u32       ip = 0;
                 int         port = 0;
                 int         share_count = 0;
 
-                rc = kibnal_get_peer_info(pcfg->pcfg_count,
+                rc = kibnal_get_peer_info(data->ioc_count,
                                           &nid, &ip, &port, &share_count);
-                pcfg->pcfg_nid   = nid;
-                pcfg->pcfg_size  = 0;
-                pcfg->pcfg_id    = ip;
-                pcfg->pcfg_misc  = port;
-                pcfg->pcfg_count = 0;
-                pcfg->pcfg_wait  = share_count;
+                data->ioc_nid    = nid;
+                data->ioc_count  = share_count;
+                data->ioc_u32[0] = ip;
+                data->ioc_u32[1] = port;
                 break;
         }
-        case NAL_CMD_ADD_PEER: {
-                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid,
-                                                 pcfg->pcfg_id, /* IP */
-                                                 pcfg->pcfg_misc); /* port */
+        case IOC_LIBCFS_ADD_PEER: {
+                rc = kibnal_add_persistent_peer (data->ioc_nid,
+                                                 data->ioc_u32[0], /* IP */
+                                                 data->ioc_u32[1]); /* port */
                 break;
         }
-        case NAL_CMD_DEL_PEER: {
-                rc = kibnal_del_peer (pcfg->pcfg_nid, 
-                                       /* flags == single_share */
-                                       pcfg->pcfg_flags != 0);
+        case IOC_LIBCFS_DEL_PEER: {
+                rc = kibnal_del_peer (data->ioc_nid);
                 break;
         }
-        case NAL_CMD_GET_CONN: {
-                kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
+        case IOC_LIBCFS_GET_CONN: {
+                kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count);
 
                 if (conn == NULL)
                         rc = -ENOENT;
                 else {
                         rc = 0;
-                        pcfg->pcfg_nid   = conn->ibc_peer->ibp_nid;
-                        pcfg->pcfg_id    = 0;
-                        pcfg->pcfg_misc  = 0;
-                        pcfg->pcfg_flags = 0;
-                        kibnal_put_conn (conn);
+                        data->ioc_nid = conn->ibc_peer->ibp_nid;
+                        kibnal_conn_decref(conn);
                 }
                 break;
         }
-        case NAL_CMD_CLOSE_CONNECTION: {
-                rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
+        case IOC_LIBCFS_CLOSE_CONNECTION: {
+                rc = kibnal_close_matching_conns (data->ioc_nid);
                 break;
         }
-        case NAL_CMD_REGISTER_MYNID: {
-                if (pcfg->pcfg_nid == PTL_NID_ANY)
+        case IOC_LIBCFS_REGISTER_MYNID: {
+                /* Ignore if this is a noop */
+                if (data->ioc_nid == ni->ni_nid) {
+                        rc = 0;
+                } else {
+                        CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+                               libcfs_nid2str(data->ioc_nid),
+                               libcfs_nid2str(ni->ni_nid));
                         rc = -EINVAL;
-                else
-                        rc = kibnal_set_mynid (pcfg->pcfg_nid);
+                }
                 break;
         }
         }
@@ -1661,7 +1266,7 @@ kibnal_free_pages (kib_pages_t *p)
                 if (p->ibp_pages[i] != NULL)
                         __free_page(p->ibp_pages[i]);
         
-        PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
+        LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
 }
 
 int
@@ -1672,7 +1277,7 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
         int                         i;
         int                         rc;
 
-        PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
+        LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
         if (p == NULL) {
                 CERROR ("Can't allocate buffer %d\n", npages);
                 return (-ENOMEM);
@@ -1690,7 +1295,7 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
                 }
         }
 
-        PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages));
+        LIBCFS_ALLOC(phys_pages, npages * sizeof(*phys_pages));
         if (phys_pages == NULL) {
                 CERROR ("Can't allocate physarray for %d pages\n", npages);
                 kibnal_free_pages(p);
@@ -1700,7 +1305,7 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
         for (i = 0; i < npages; i++) {
                 phys_pages[i].size = PAGE_SIZE;
                 phys_pages[i].address =
-                        kibnal_page2phys(p->ibp_pages[i]);
+                        lnet_page2phys(p->ibp_pages[i]);
         }
 
         p->ibp_vaddr = 0;
@@ -1713,7 +1318,7 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int access)
                                          &p->ibp_lkey,
                                          &p->ibp_rkey);
         
-        PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages));
+        LIBCFS_FREE(phys_pages, npages * sizeof(*phys_pages));
         
         if (rc != 0) {
                 CERROR ("Error %d mapping %d pages\n", rc, npages);
@@ -1745,14 +1350,14 @@ kibnal_setup_tx_descs (void)
         LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
 
         rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
-                                IBNAL_TX_MSG_PAGES, 
+                                IBNAL_TX_MSG_PAGES(),
                                 0);            /* local read access only */
         if (rc != 0)
                 return (rc);
 
         vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
 
-        for (i = 0; i < IBNAL_TX_MSGS; i++) {
+        for (i = 0; i < IBNAL_TX_MSGS(); i++) {
                 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
                 tx = &kibnal_data.kib_tx_descs[i];
 
@@ -1760,21 +1365,15 @@ kibnal_setup_tx_descs (void)
                 
                 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset);
                 tx->tx_vaddr = vaddr;
-                tx->tx_isnblk = (i >= IBNAL_NTX);
                 tx->tx_mapped = KIB_TX_UNMAPPED;
 
                 CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", 
                        i, tx, tx->tx_msg, tx->tx_vaddr);
 
-                if (tx->tx_isnblk)
-                        list_add (&tx->tx_list, 
-                                  &kibnal_data.kib_idle_nblk_txs);
-                else
-                        list_add (&tx->tx_list, 
-                                  &kibnal_data.kib_idle_txs);
+                list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
 
                 vaddr += IBNAL_MSG_SIZE;
-                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
+                LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES());
 
                 page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
@@ -1782,7 +1381,7 @@ kibnal_setup_tx_descs (void)
                 if (page_offset == PAGE_SIZE) {
                         page_offset = 0;
                         ipage++;
-                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
+                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
                 }
         }
         
@@ -1790,21 +1389,17 @@ kibnal_setup_tx_descs (void)
 }
 
 void
-kibnal_api_shutdown (nal_t *nal)
+kibnal_shutdown (lnet_ni_t *ni)
 {
-        int   i;
-        int   rc;
-
-        if (nal->nal_refct != 0) {
-                /* This module got the first ref */
-                PORTAL_MODULE_UNUSE;
-                return;
-        }
+        int           i;
+        int           rc;
+        unsigned long flags;
 
         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
-               atomic_read (&portal_kmemory));
+               atomic_read (&libcfs_kmemory));
 
-        LASSERT(nal == &kibnal_api);
+        LASSERT(ni == kibnal_data.kib_ni);
+        LASSERT(ni->ni_data == &kibnal_data);
 
         switch (kibnal_data.kib_init) {
         default:
@@ -1812,23 +1407,39 @@ kibnal_api_shutdown (nal_t *nal)
                 LBUG();
 
         case IBNAL_INIT_ALL:
-                /* stop calls to nal_cmd */
-                libcfs_nal_cmd_unregister(OPENIBNAL);
-                /* No new peers */
+                /* Prevent new peers from being created */
+                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+                kibnal_data.kib_nonewpeers = 1;
+                write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 
-                /* resetting my NID unadvertises me, removes my
-                 * listener and nukes all current peers */
-                kibnal_set_mynid (PTL_NID_ANY);
+                kibnal_stop_ib_listener();
+
+                /* Remove all existing peers from the peer table */
+                kibnal_del_peer(LNET_NID_ANY);
+                
+                /* Wait for pending conn reqs to be handled */
+                i = 2;
+                spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
+                while (!list_empty(&kibnal_data.kib_connd_acceptq)) {
+                        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, 
+                                               flags);
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */
+                               "waiting for conn reqs to clean up\n");
+                        cfs_pause(cfs_time_seconds(1));
+                        
+                        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
+                }
+                spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
 
                 /* Wait for all peer state to clean up */
                 i = 2;
-                while (atomic_read (&kibnal_data.kib_npeers) != 0) {
+                while (atomic_read(&kibnal_data.kib_npeers) != 0) {
                         i++;
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                                "waiting for %d peers to close down\n",
-                               atomic_read (&kibnal_data.kib_npeers));
-                        set_current_state (TASK_INTERRUPTIBLE);
-                        schedule_timeout (HZ);
+                               atomic_read(&kibnal_data.kib_npeers));
+                        cfs_pause(cfs_time_seconds(1));
                 }
                 /* fall through */
 
@@ -1854,14 +1465,10 @@ kibnal_api_shutdown (nal_t *nal)
                         CERROR ("Destroy PD error: %d\n", rc);
                 /* fall through */
 
-        case IBNAL_INIT_LIB:
-                lib_fini(&kibnal_lib);
-                /* fall through */
-
         case IBNAL_INIT_DATA:
                 /* Module refcount only gets to zero when all peers
                  * have been closed so all lists must be empty */
-                LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+                LASSERT (atomic_read(&kibnal_data.kib_npeers) == 0);
                 LASSERT (kibnal_data.kib_peers != NULL);
                 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
                         LASSERT (list_empty (&kibnal_data.kib_peers[i]));
@@ -1885,8 +1492,7 @@ kibnal_api_shutdown (nal_t *nal)
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                                "Waiting for %d threads to terminate\n",
                                atomic_read (&kibnal_data.kib_nthreads));
-                        set_current_state (TASK_INTERRUPTIBLE);
-                        schedule_timeout (HZ);
+                        cfs_pause(cfs_time_seconds(1));
                 }
                 /* fall through */
                 
@@ -1895,57 +1501,135 @@ kibnal_api_shutdown (nal_t *nal)
         }
 
         if (kibnal_data.kib_tx_descs != NULL)
-                PORTAL_FREE (kibnal_data.kib_tx_descs,
-                             IBNAL_TX_MSGS * sizeof(kib_tx_t));
+                LIBCFS_FREE (kibnal_data.kib_tx_descs,
+                             IBNAL_TX_MSGS() * sizeof(kib_tx_t));
 
         if (kibnal_data.kib_peers != NULL)
-                PORTAL_FREE (kibnal_data.kib_peers,
+                LIBCFS_FREE (kibnal_data.kib_peers,
                              sizeof (struct list_head) * 
                              kibnal_data.kib_peer_hash_size);
 
         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
-               atomic_read (&portal_kmemory));
-        printk(KERN_INFO "Lustre: OpenIB NAL unloaded (final mem %d)\n",
-               atomic_read(&portal_kmemory));
+               atomic_read (&libcfs_kmemory));
 
         kibnal_data.kib_init = IBNAL_INIT_NOTHING;
+        PORTAL_MODULE_UNUSE;
+}
+
+int
+kibnal_get_ipoibidx(void)
+{
+        /* NB single threaded! */
+        static struct ib_port_properties port_props;
+
+        int               ipoibidx = 0;
+        int               devidx;
+        int               port;
+        int               rc;
+        struct ib_device *device;
+
+        for (devidx = 0; devidx <= kibnal_data.kib_hca_idx; devidx++) {
+                device = ib_device_get_by_index(devidx);
+                
+                if (device == NULL) {
+                        CERROR("Can't get IB device %d\n", devidx);
+                        return -1;
+                }
+                
+                for (port = 1; port <= 2; port++) {
+                        if (devidx == kibnal_data.kib_hca_idx &&
+                            port == kibnal_data.kib_port)
+                                return ipoibidx;
+                        
+                        rc = ib_port_properties_get(device, port,
+                                                    &port_props);
+                        if (rc == 0)
+                                ipoibidx++;
+                }
+        }
+
+        LBUG();
+        return -1;
 }
 
 int
-kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
-                     ptl_ni_limits_t *requested_limits,
-                     ptl_ni_limits_t *actual_limits)
+kibnal_startup (lnet_ni_t *ni)
 {
+        char              ipif_name[32];
+        __u32             ip;
+        __u32             netmask;
+        int               up;
         struct timeval    tv;
-        ptl_process_id_t  process_id;
-        int               pkmem = atomic_read(&portal_kmemory);
         int               rc;
+        int               hca;
+        int               port;
         int               i;
+        int               nob;
 
-        LASSERT (nal == &kibnal_api);
+        LASSERT (ni->ni_lnd == &the_kiblnd);
 
-        if (nal->nal_refct != 0) {
-                if (actual_limits != NULL)
-                        *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
-                /* This module got the first ref */
-                PORTAL_MODULE_USE;
-                return (PTL_OK);
+        /* Only 1 instance supported */
+        if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) {
+                CERROR ("Only 1 instance supported\n");
+                return -EPERM;
         }
 
-        LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
+        if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) {
+                CERROR ("Can't set credits(%d) > ntx(%d)\n",
+                        *kibnal_tunables.kib_credits,
+                        *kibnal_tunables.kib_ntx);
+                return -EINVAL;
+        }
 
         memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
 
+        ni->ni_maxtxcredits = *kibnal_tunables.kib_credits;
+        ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits;
+
+        CLASSERT (LNET_MAX_INTERFACES > 1);
+
+
+        kibnal_data.kib_hca_idx = 0;            /* default: first HCA */
+        kibnal_data.kib_port = 0;               /* any port */
+
+        if (ni->ni_interfaces[0] != NULL) {
+                /* hca.port specified in 'networks=openib(h.p)' */
+                if (ni->ni_interfaces[1] != NULL) {
+                        CERROR("Multiple interfaces not supported\n");
+                        return -EPERM;
+                }
+                
+                nob = strlen(ni->ni_interfaces[0]);
+                i = sscanf(ni->ni_interfaces[0], "%d.%d%n", &hca, &port, &nob);
+                if (i >= 2 && nob == strlen(ni->ni_interfaces[0])) {
+                        kibnal_data.kib_hca_idx = hca;
+                        kibnal_data.kib_port = port;
+                } else {
+                        nob = strlen(ni->ni_interfaces[0]);
+                        i = sscanf(ni->ni_interfaces[0], "%d%n", &hca, &nob);
+
+                        if (i >= 1 && nob == strlen(ni->ni_interfaces[0])) {
+                                kibnal_data.kib_hca_idx = hca;
+                        } else {
+                                CERROR("Can't parse interface '%s'\n",
+                                       ni->ni_interfaces[0]);
+                                return -EINVAL;
+                        }
+                }
+        }
+        
+        kibnal_data.kib_ni = ni;
+        ni->ni_data = &kibnal_data;
+        
         do_gettimeofday(&tv);
         kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
 
-        init_MUTEX (&kibnal_data.kib_nid_mutex);
-        init_MUTEX_LOCKED (&kibnal_data.kib_listener_signal);
+        PORTAL_MODULE_USE;
 
         rwlock_init(&kibnal_data.kib_global_lock);
 
         kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
-        PORTAL_ALLOC (kibnal_data.kib_peers,
+        LIBCFS_ALLOC (kibnal_data.kib_peers,
                       sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
         if (kibnal_data.kib_peers == NULL) {
                 goto failed;
@@ -1969,11 +1653,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
 
         spin_lock_init (&kibnal_data.kib_tx_lock);
         INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
-        INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
-        init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
 
-        PORTAL_ALLOC (kibnal_data.kib_tx_descs,
-                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
+        LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
+                      IBNAL_TX_MSGS() * sizeof(kib_tx_t));
         if (kibnal_data.kib_tx_descs == NULL) {
                 CERROR ("Can't allocate tx descs\n");
                 goto failed;
@@ -1983,21 +1665,6 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         kibnal_data.kib_init = IBNAL_INIT_DATA;
         /*****************************************************/
 
-
-        process_id.pid = requested_pid;
-        process_id.nid = PTL_NID_ANY;           /* don't know my NID yet */
-        
-        rc = lib_init(&kibnal_lib, nal, process_id,
-                      requested_limits, actual_limits);
-        if (rc != PTL_OK) {
-                CERROR("lib_init failed: error %d\n", rc);
-                goto failed;
-        }
-
-        /* lib interface initialised */
-        kibnal_data.kib_init = IBNAL_INIT_LIB;
-        /*****************************************************/
-
         for (i = 0; i < IBNAL_N_SCHED; i++) {
                 rc = kibnal_thread_start (kibnal_scheduler,
                                           (void *)((unsigned long)i));
@@ -2008,7 +1675,13 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                 }
         }
 
-        for (i = 0; i < IBNAL_N_CONND; i++) {
+        /* must have at least 2 connds to remain responsive to svcqry while
+         * connecting */
+        if (*kibnal_tunables.kib_n_connd < 2)
+                *kibnal_tunables.kib_n_connd = 2;
+
+
+        for (i = 0; i < *kibnal_tunables.kib_n_connd; i++) {
                 rc = kibnal_thread_start (kibnal_connd,
                                           (void *)((unsigned long)i));
                 if (rc != 0) {
@@ -2024,9 +1697,10 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                 goto failed;
         }
 
-        kibnal_data.kib_device = ib_device_get_by_index(0);
+        kibnal_data.kib_device = ib_device_get_by_index(kibnal_data.kib_hca_idx);
         if (kibnal_data.kib_device == NULL) {
-                CERROR ("Can't open ib device 0\n");
+                CERROR ("Can't open ib device %d\n",
+                        kibnal_data.kib_hca_idx);
                 goto failed;
         }
         
@@ -2041,19 +1715,54 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                kibnal_data.kib_device_props.max_initiator_per_qp,
                kibnal_data.kib_device_props.max_responder_per_qp);
 
-        kibnal_data.kib_port = 0;
-        for (i = 1; i <= 2; i++) {
-                rc = ib_port_properties_get(kibnal_data.kib_device, i,
+        if (kibnal_data.kib_port != 0) {
+                rc = ib_port_properties_get(kibnal_data.kib_device, 
+                                            kibnal_data.kib_port,
                                             &kibnal_data.kib_port_props);
-                if (rc == 0) {
-                        kibnal_data.kib_port = i;
-                        break;
+                if (rc != 0) {
+                        CERROR("Error %d open port %d on HCA %d\n", rc,
+                               kibnal_data.kib_port,
+                               kibnal_data.kib_hca_idx);
+                        goto failed;
+                }
+        } else {
+                for (i = 1; i <= 2; i++) {
+                        rc = ib_port_properties_get(kibnal_data.kib_device, i,
+                                                    &kibnal_data.kib_port_props);
+                        if (rc == 0) {
+                                kibnal_data.kib_port = i;
+                                break;
+                        }
                 }
+                if (kibnal_data.kib_port == 0) {
+                        CERROR ("Can't find a port\n");
+                        goto failed;
+                }
+        }
+
+        i = kibnal_get_ipoibidx();
+        if (i < 0)
+                goto failed;
+        
+        snprintf(ipif_name, sizeof(ipif_name), "%s%d",
+                 *kibnal_tunables.kib_ipif_basename, i);
+        if (strlen(ipif_name) == sizeof(ipif_name) - 1) {
+                CERROR("IPoIB interface name %s truncated\n", ipif_name);
+                return -EINVAL;
         }
-        if (kibnal_data.kib_port == 0) {
-                CERROR ("Can't find a port\n");
+        
+        rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask);
+        if (rc != 0) {
+                CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc);
+                goto failed;
+        }
+        
+        if (!up) {
+                CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name);
                 goto failed;
         }
+        
+        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
 
         rc = ib_pd_create(kibnal_data.kib_device,
                           NULL, &kibnal_data.kib_pd);
@@ -2067,9 +1776,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         /*****************************************************/
 #if IBNAL_FMR
         {
-                const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK;
+                const int pool_size = *kibnal_tunables.kib_ntx;
                 struct ib_fmr_pool_param params = {
-                        .max_pages_per_fmr = PTL_MTU/PAGE_SIZE,
+                        .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE,
                         .access            = (IB_ACCESS_LOCAL_WRITE |
                                               IB_ACCESS_REMOTE_WRITE |
                                               IB_ACCESS_REMOTE_READ),
@@ -2112,7 +1821,7 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                         },
                         .arg            = NULL,
                 };
-                int  nentries = IBNAL_CQ_ENTRIES;
+                int  nentries = IBNAL_CQ_ENTRIES();
                 
                 rc = ib_cq_create (kibnal_data.kib_device, 
                                    &nentries, &callback, NULL,
@@ -2126,39 +1835,31 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                 rc = ib_cq_request_notification(kibnal_data.kib_cq, 1);
                 LASSERT (rc == 0);
         }
-        
+
         /* flag CQ initialised */
         kibnal_data.kib_init = IBNAL_INIT_CQ;
         /*****************************************************/
-        
-        rc = libcfs_nal_cmd_register(OPENIBNAL, &kibnal_cmd, NULL);
-        if (rc != 0) {
-                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
-                goto failed;
-        }
 
+        rc = kibnal_start_ib_listener();
+        if (rc != 0)
+                goto failed;
+        
         /* flag everything initialised */
         kibnal_data.kib_init = IBNAL_INIT_ALL;
         /*****************************************************/
 
-        printk(KERN_INFO "Lustre: OpenIB NAL loaded "
-               "(initial mem %d)\n", pkmem);
-
-        return (PTL_OK);
+        return 0;
 
  failed:
-        kibnal_api_shutdown (&kibnal_api);    
-        return (PTL_FAIL);
+        kibnal_shutdown(ni);    
+        return -ENETDOWN;
 }
 
 void __exit
 kibnal_module_fini (void)
 {
-        if (kibnal_tunables.kib_sysctl != NULL)
-                unregister_sysctl_table (kibnal_tunables.kib_sysctl);
-        PtlNIFini(kibnal_ni);
-
-        ptl_unregister_nal(OPENIBNAL);
+        lnet_unregister_lnd(&the_kiblnd);
+        kibnal_tunables_fini();
 }
 
 int __init
@@ -2166,48 +1867,21 @@ kibnal_module_init (void)
 {
         int    rc;
 
-        /* the following must be sizeof(int) for proc_dointvec() */
-        LASSERT (sizeof(kibnal_tunables.kib_io_timeout) == sizeof(int));
-        LASSERT (sizeof(kibnal_tunables.kib_listener_timeout) == sizeof(int));
-        LASSERT (sizeof(kibnal_tunables.kib_backlog) == sizeof(int));
-        LASSERT (sizeof(kibnal_tunables.kib_port) == sizeof(int));
-
-        kibnal_api.nal_ni_init = kibnal_api_startup;
-        kibnal_api.nal_ni_fini = kibnal_api_shutdown;
-
-        /* Initialise dynamic tunables to defaults once only */
-        kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
-        kibnal_tunables.kib_listener_timeout = IBNAL_LISTENER_TIMEOUT;
-        kibnal_tunables.kib_backlog = IBNAL_BACKLOG;
-        kibnal_tunables.kib_port = IBNAL_PORT;
-
-        rc = ptl_register_nal(OPENIBNAL, &kibnal_api);
-        if (rc != PTL_OK) {
-                CERROR("Can't register IBNAL: %d\n", rc);
-                return (-ENOMEM);               /* or something... */
-        }
-
-        /* Pure gateways want the NAL started up at module load time... */
-        rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
-        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
-                ptl_unregister_nal(OPENIBNAL);
-                return (-ENODEV);
-        }
+        rc = kibnal_tunables_init();
+        if (rc != 0)
+                return rc;
         
-        kibnal_tunables.kib_sysctl = 
-                register_sysctl_table (kibnal_top_ctl_table, 0);
-        if (kibnal_tunables.kib_sysctl == NULL) {
-                CERROR("Can't register sysctl table\n");
-                PtlNIFini(kibnal_ni);
-                ptl_unregister_nal(OPENIBNAL);
-                return (-ENOMEM);
-        }
+        lnet_register_lnd(&the_kiblnd);
 
         return (0);
 }
 
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01");
+#ifdef USING_TSAPI
+MODULE_DESCRIPTION("Kernel Cisco IB LND v1.00");
+#else
+MODULE_DESCRIPTION("Kernel OpenIB(gen1) LND v1.00");
+#endif
 MODULE_LICENSE("GPL");
 
 module_init(kibnal_module_init);
index 5ba102e..6ed306c 100644 (file)
 #include <net/sock.h>
 #include <linux/in.h>
 
-#define DEBUG_SUBSYSTEM S_NAL
+#define DEBUG_SUBSYSTEM S_LND
 
 #include <libcfs/kp30.h>
-#include <portals/p30.h>
-#include <portals/lib-p30.h>
-#include <portals/nal.h>
+#include <lnet/lnet.h>
+#include <lnet/lib-lnet.h>
 
 #include <ts_ib_core.h>
 #include <ts_ib_cm.h>
 #include <ts_ib_sa_client.h>
 
+#ifndef USING_TSAPI
+
+/* OpenIB Gen1 */
+typedef struct ib_qp       ib_qp_t;
+typedef struct ib_mr       ib_mr_t;
+typedef struct ib_fmr      ib_fmr_t;
+typedef struct ib_pd       ib_pd_t;
+typedef struct ib_cq       ib_cq_t;
+typedef struct ib_fmr_pool ib_fmr_pool_t;
+
+#else
+
+/* Cisco (topspin) */
+typedef void                 ib_qp_t;
+typedef void                 ib_mr_t;
+typedef void                 ib_fmr_t;
+typedef void                 ib_pd_t;
+typedef void                 ib_cq_t;
+typedef void                 ib_fmr_pool_t;
+
+#define IB_ACCESS_LOCAL_WRITE              TS_IB_ACCESS_LOCAL_WRITE
+#define IB_WQ_SIGNAL_SELECTABLE            TS_IB_ACCESS_LOCAL_WRITE
+#define IB_TRANSPORT_RC                    TS_IB_TRANSPORT_RC
+#define IB_QP_STATE_INIT                   TS_IB_QP_STATE_INIT
+#define IB_QP_ATTRIBUTE_STATE              TS_IB_QP_ATTRIBUTE_STATE
+#define IB_QP_ATTRIBUTE_PORT               TS_IB_QP_ATTRIBUTE_PORT
+#define IB_QP_ATTRIBUTE_PKEY_INDEX         TS_IB_QP_ATTRIBUTE_PKEY_INDEX
+#define IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE
+#define IB_ACCESS_LOCAL_WRITE              TS_IB_ACCESS_LOCAL_WRITE
+#define IB_ACCESS_REMOTE_WRITE             TS_IB_ACCESS_REMOTE_WRITE
+#define IB_ACCESS_REMOTE_READ              TS_IB_ACCESS_REMOTE_READ
+#define IB_CQ_CALLBACK_PROCESS             TS_IB_CQ_CALLBACK_PROCESS
+#define IB_CQ_PROVIDER_REARM               TS_IB_CQ_PROVIDER_REARM
+#define IB_CQ_CALLBACK_INTERRUPT           TS_IB_CQ_CALLBACK_INTERRUPT
+#define IB_COMPLETION_STATUS_SUCCESS       TS_IB_COMPLETION_STATUS_SUCCESS
+#define IB_OP_SEND                         TS_IB_OP_SEND
+#define IB_OP_RDMA_WRITE                   TS_IB_OP_RDMA_WRITE
+#define IB_OP_RDMA_READ                    TS_IB_OP_RDMA_READ
+
+#endif
+
 #if CONFIG_SMP
 # define IBNAL_N_SCHED      num_online_cpus()   /* # schedulers */
 #else
 # define IBNAL_N_SCHED      1                   /* # schedulers */
 #endif
-#define IBNAL_N_CONND       4                   /* # connection daemons */
-
-#define IBNAL_MIN_RECONNECT_INTERVAL HZ         /* first failed connection retry... */
-#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ)    /* ...exponentially increasing to this */
-
-#define IBNAL_MSG_SIZE           (4<<10)        /* max size of queued messages (inc hdr) */
 
-#define IBNAL_MSG_QUEUE_SIZE      8             /* # messages/RDMAs in-flight */
-#define IBNAL_CREDIT_HIGHWATER    6             /* when to eagerly return credits */
-#define IBNAL_RETRY               7             /* # times to retry */
-#define IBNAL_RNR_RETRY           7             /*  */
-#define IBNAL_CM_RETRY            7             /* # times to retry connection */
-#define IBNAL_FLOW_CONTROL        1
-#define IBNAL_RESPONDER_RESOURCES 8
-
-#define IBNAL_NTX                 64            /* # tx descs */
-#define IBNAL_NTX_NBLK            256           /* # reserved tx descs */
-
-#define IBNAL_PEER_HASH_SIZE      101           /* # peer lists */
+#define IBNAL_FMR                    1
+//#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_PROCESS
+#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_INTERRUPT
 
-#define IBNAL_RESCHED             100           /* # scheduler loops before reschedule */
 
-#define IBNAL_CONCURRENT_PEERS    1000          /* # nodes all talking at once to me */
+/* tunables fixed at compile time */
+#define IBNAL_PEER_HASH_SIZE         101        /* # peer lists */
+#define IBNAL_RESCHED                100        /* # scheduler loops before reschedule */
+#define IBNAL_MSG_QUEUE_SIZE         8          /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER       6          /* when to eagerly return credits */
+#define IBNAL_MSG_SIZE              (4<<10)     /* max size of queued messages (inc hdr) */
+#define IBNAL_RDMA_BASE              0x0eeb0000
 
-/* default vals for runtime tunables */
-#define IBNAL_IO_TIMEOUT          50            /* default comms timeout (seconds) */
-#define IBNAL_LISTENER_TIMEOUT    5             /* default listener timeout (seconds) */
-#define IBNAL_BACKLOG             127           /* default listener backlog */
-#define IBNAL_PORT                988           /* default listener port */
+/* QP tunables */
+#define IBNAL_RETRY                  7          /* # times to retry */
+#define IBNAL_RNR_RETRY              7          /*  */
+#define IBNAL_CM_RETRY               7          /* # times to retry connection */
+#define IBNAL_FLOW_CONTROL           1
+#define IBNAL_RESPONDER_RESOURCES    8
 
 /************************/
 /* derived constants... */
 
 /* TX messages (shared by all connections) */
-#define IBNAL_TX_MSGS       (IBNAL_NTX + IBNAL_NTX_NBLK)
-#define IBNAL_TX_MSG_BYTES  (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
-#define IBNAL_TX_MSG_PAGES  ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+#define IBNAL_TX_MSGS()       (*kibnal_tunables.kib_ntx)
+#define IBNAL_TX_MSG_BYTES()  (IBNAL_TX_MSGS() * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES()  ((IBNAL_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE)
 
 /* RX messages (per connection) */
-#define IBNAL_RX_MSGS       IBNAL_MSG_QUEUE_SIZE
-#define IBNAL_RX_MSG_BYTES  (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
-#define IBNAL_RX_MSG_PAGES  ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+#define IBNAL_RX_MSGS         (IBNAL_MSG_QUEUE_SIZE * 2)
+#define IBNAL_RX_MSG_BYTES    (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES    ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
 
 /* we may have up to 2 completions per transmit +
    1 completion per receive, per connection */
-#define IBNAL_CQ_ENTRIES  ((2*IBNAL_TX_MSGS) +                          \
-                           (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
-
-#define IBNAL_RDMA_BASE  0x0eeb0000
-#define IBNAL_FMR        1
-#define IBNAL_CKSUM      1
-//#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_PROCESS
-#define IBNAL_CALLBACK_CTXT  IB_CQ_CALLBACK_INTERRUPT
+#define IBNAL_CQ_ENTRIES()  ((2*IBNAL_TX_MSGS()) +                                      \
+                             (IBNAL_RX_MSGS * *kibnal_tunables.kib_concurrent_peers))
 
 typedef struct
 {
-        int               kib_io_timeout;       /* comms timeout (seconds) */
-        int               kib_listener_timeout; /* listener's timeout */
-        int               kib_backlog;          /* listenter's accept backlog */
-        int               kib_port;             /* where the listener listens */
+        char    **kib_ipif_basename;            /* IPoIB interface base name */
+        int      *kib_n_connd;                  /* # connection daemons */
+        int      *kib_min_reconnect_interval;   /* min connect retry seconds... */
+        int      *kib_max_reconnect_interval;   /* max connect retry seconds */
+        int      *kib_concurrent_peers;         /* max # peers */
+        int      *kib_cksum;                    /* checksum kib_msg_t? */
+        int      *kib_timeout;                  /* comms timeout (seconds) */
+        int      *kib_keepalive;                /* keepalive (seconds) */
+        int      *kib_ntx;                      /* # tx descs */
+        int      *kib_credits;                  /* # concurrent sends */
+        int      *kib_peercredits;              /* # concurrent sends to 1 peer */
+
         struct ctl_table_header *kib_sysctl;    /* sysctl interface */
 } kib_tunables_t;
 
@@ -137,7 +170,7 @@ typedef struct
         __u64             ibp_vaddr;            /* mapped region vaddr */
         __u32             ibp_lkey;             /* mapped region lkey */
         __u32             ibp_rkey;             /* mapped region rkey */
-        struct ib_mr     *ibp_handle;           /* mapped region handle */
+        ib_mr_t          *ibp_handle;           /* mapped region handle */
         struct page      *ibp_pages[0];
 } kib_pages_t;
 
@@ -147,22 +180,19 @@ typedef struct
         __u64             kib_incarnation;      /* which one am I */
         int               kib_shutdown;         /* shut down? */
         atomic_t          kib_nthreads;         /* # live threads */
+        lnet_ni_t        *kib_ni;               /* _the_ openib interface */
 
         __u64             kib_svc_id;           /* service number I listen on */
         tTS_IB_GID        kib_svc_gid;          /* device/port GID */
         __u16             kib_svc_pkey;         /* device/port pkey */
         
-        ptl_nid_t         kib_nid;              /* my NID */
-        struct semaphore  kib_nid_mutex;        /* serialise NID ops */
-        struct semaphore  kib_listener_signal;  /* signal IP listener completion */
-        struct socket    *kib_listener_sock;    /* IP listener's socket */
-        int               kib_listener_shutdown; /* ask IP listener to close */
         void             *kib_listen_handle;    /* IB listen handle */
         
         rwlock_t          kib_global_lock;      /* stabilize peer/conn ops */
 
         struct list_head *kib_peers;            /* hash table of all my known peers */
         int               kib_peer_hash_size;   /* size of kib_peers */
+        int               kib_nonewpeers;       /* prevent new peers? */
         atomic_t          kib_npeers;           /* # peers extant */
         atomic_t          kib_nconns;           /* # connections extant */
 
@@ -174,6 +204,7 @@ typedef struct
         struct list_head  kib_connd_peers;      /* peers waiting for a connection */
         struct list_head  kib_connd_acceptq;    /* accepted sockets to handle */
         wait_queue_head_t kib_connd_waitq;      /* connection daemons sleep here */
+        int               kib_connd_connecting; /* # connds connecting */
         spinlock_t        kib_connd_lock;       /* serialise */
 
         wait_queue_head_t kib_sched_waitq;      /* schedulers sleep here */
@@ -185,20 +216,19 @@ typedef struct
         kib_pages_t      *kib_tx_pages;         /* premapped tx msg pages */
 
         struct list_head  kib_idle_txs;         /* idle tx descriptors */
-        struct list_head  kib_idle_nblk_txs;    /* idle reserved tx descriptors */
-        wait_queue_head_t kib_idle_tx_waitq;    /* block here for tx descriptor */
         __u64             kib_next_tx_cookie;   /* RDMA completion cookie */
         spinlock_t        kib_tx_lock;          /* serialise */
 
+        int               kib_hca_idx;          /* my HCA number */
         struct ib_device *kib_device;           /* "the" device */
         struct ib_device_properties kib_device_props; /* its properties */
         int               kib_port;             /* port on the device */
         struct ib_port_properties kib_port_props; /* its properties */
-        struct ib_pd     *kib_pd;               /* protection domain */
+        ib_pd_t          *kib_pd;               /* protection domain */
 #if IBNAL_FMR
-        struct ib_fmr_pool *kib_fmr_pool;       /* fast memory region pool */
+        ib_fmr_pool_t    *kib_fmr_pool;         /* fast memory region pool */
 #endif
-        struct ib_cq     *kib_cq;               /* completion queue */
+        ib_cq_t          *kib_cq;               /* completion queue */
 
 } kib_data_t;
 
@@ -239,8 +269,8 @@ typedef struct kib_connparams
 typedef struct
 {
         union {
-                struct ib_mr    *mr;
-                struct ib_fmr   *fmr;
+                ib_mr_t         *mr;
+                ib_fmr_t        *fmr;
         }                 md_handle;
         __u32             md_lkey;
         __u32             md_rkey;
@@ -256,13 +286,13 @@ typedef struct
 
 typedef struct
 {
-        ptl_hdr_t         ibim_hdr;             /* portals header */
+        lnet_hdr_t        ibim_hdr;             /* portals header */
         char              ibim_payload[0];      /* piggy-backed payload */
 } WIRE_ATTR kib_immediate_msg_t;
 
 typedef struct
 {
-        ptl_hdr_t         ibrm_hdr;             /* portals header */
+        lnet_hdr_t        ibrm_hdr;             /* portals header */
         __u64             ibrm_cookie;          /* opaque completion cookie */
         kib_rdma_desc_t   ibrm_desc;            /* where to suck/blow */
 } WIRE_ATTR kib_rdma_msg_t;
@@ -296,8 +326,9 @@ typedef struct
         } WIRE_ATTR       ibm_u;
 } WIRE_ATTR kib_msg_t;
 
-#define IBNAL_MSG_MAGIC       0x0be91b91        /* unique magic */
-#define IBNAL_MSG_VERSION              2        /* current protocol version */
+#define IBNAL_MSG_MAGIC LNET_PROTO_OPENIB_MAGIC /* unique magic */
+#define IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD 2   /* previous protocol version */
+#define IBNAL_MSG_VERSION              3        /* current protocol version */
 
 #define IBNAL_MSG_SVCQRY            0xb0        /* service query */
 #define IBNAL_MSG_SVCRSP            0xb1        /* service response */
@@ -316,8 +347,7 @@ typedef struct kib_rx                           /* receive message */
 {
         struct list_head          rx_list;      /* queue for attention */
         struct kib_conn          *rx_conn;      /* owning conn */
-        int                       rx_rdma;      /* RDMA completion posted? */
-        int                       rx_posted;    /* posted? */
+        int                       rx_nob;       /* # bytes received (-1 while posted) */
         __u64                     rx_vaddr;     /* pre-mapped buffer (hca vaddr) */
         kib_msg_t                *rx_msg;       /* pre-mapped buffer (host vaddr) */
         struct ib_receive_param   rx_sp;        /* receive work item */
@@ -327,7 +357,6 @@ typedef struct kib_rx                           /* receive message */
 typedef struct kib_tx                           /* transmit message */
 {
         struct list_head          tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
-        int                       tx_isnblk;    /* I'm reserved for non-blocking sends */
         struct kib_conn          *tx_conn;      /* owning conn */
         int                       tx_mapped;    /* mapped for RDMA? */
         int                       tx_sending;   /* # tx callbacks outstanding */
@@ -336,7 +365,7 @@ typedef struct kib_tx                           /* transmit message */
         int                       tx_passive_rdma; /* peer sucks/blows */
         int                       tx_passive_rdma_wait; /* waiting for peer to complete */
         __u64                     tx_passive_rdma_cookie; /* completion cookie */
-        lib_msg_t                *tx_libmsg[2]; /* lib msgs to finalize on completion */
+        lnet_msg_t               *tx_lntmsg[2]; /* ptl msgs to finalize on completion */
         kib_md_t                  tx_md;        /* RDMA mapping (active/passive) */
         __u64                     tx_vaddr;     /* pre-mapped buffer (hca vaddr) */
         kib_msg_t                *tx_msg;       /* pre-mapped buffer (host vaddr) */
@@ -366,18 +395,22 @@ typedef struct kib_conn
         struct kib_peer    *ibc_peer;           /* owning peer */
         struct list_head    ibc_list;           /* stash on peer's conn list */
         __u64               ibc_incarnation;    /* which instance of the peer */
+        int                 ibc_version;        /* peer protocol version */
         atomic_t            ibc_refcount;       /* # users */
         int                 ibc_state;          /* what's happening */
-        atomic_t            ibc_nob;            /* # bytes buffered */
         int                 ibc_nsends_posted;  /* # uncompleted sends */
         int                 ibc_credits;        /* # credits I have */
         int                 ibc_outstanding_credits; /* # credits to return */
+        int                 ibc_reserved_credits; /* # credits for ACK/DONE msgs */
+        unsigned long       ibc_last_send;      /* time of last send */
+        struct list_head    ibc_tx_queue_nocred; /* sends that don't need a credit */
+        struct list_head    ibc_tx_queue_rsrvd; /* sends that need a reserved cred */
         struct list_head    ibc_tx_queue;       /* send queue */
         struct list_head    ibc_active_txs;     /* active tx awaiting completion */
         spinlock_t          ibc_lock;           /* serialise */
         kib_rx_t           *ibc_rxs;            /* the rx descs */
         kib_pages_t        *ibc_rx_pages;       /* premapped rx msg pages */
-        struct ib_qp       *ibc_qp;             /* queue pair */
+        ib_qp_t            *ibc_qp;             /* queue pair */
         __u32               ibc_qpn;            /* queue pair number */
         tTS_IB_CM_COMM_ID   ibc_comm_id;        /* connection ID? */
         kib_connreq_t      *ibc_connreq;        /* connection request state */
@@ -394,7 +427,7 @@ typedef struct kib_peer
 {
         struct list_head    ibp_list;           /* stash on global peer list */
         struct list_head    ibp_connd_list;     /* schedule on kib_connd_peers */
-        ptl_nid_t           ibp_nid;            /* who's on the other end(s) */
+        lnet_nid_t          ibp_nid;            /* who's on the other end(s) */
         __u32               ibp_ip;             /* IP to query for peer conn params */
         int                 ibp_port;           /* port to qery for peer conn params */
         __u64               ibp_incarnation;    /* peer's incarnation */
@@ -402,17 +435,69 @@ typedef struct kib_peer
         int                 ibp_persistence;    /* "known" peer refs */
         struct list_head    ibp_conns;          /* all active connections */
         struct list_head    ibp_tx_queue;       /* msgs waiting for a conn */
-        int                 ibp_connecting;     /* connecting+accepting */
+        int                 ibp_connecting;     /* current active connection attempts */
+        int                 ibp_accepting;      /* current passive connection attempts */
         unsigned long       ibp_reconnect_time; /* when reconnect may be attempted */
         unsigned long       ibp_reconnect_interval; /* exponential backoff */
+        int                 ibp_error;          /* errno on closing this peer */
+        cfs_time_t          ibp_last_alive;     /* when (in jiffies) I was last alive */
 } kib_peer_t;
 
-extern lib_nal_t       kibnal_lib;
 extern kib_data_t      kibnal_data;
 extern kib_tunables_t  kibnal_tunables;
 
+/******************************************************************************/
+
+/* these are purposely avoiding using local vars so they don't increase
+ * stack consumption. */
+
+#define kibnal_conn_addref(conn)                                \
+do {                                                            \
+        CDEBUG(D_NET, "conn[%p] (%d)++\n",                      \
+               (conn), atomic_read(&(conn)->ibc_refcount));     \
+        LASSERT(atomic_read(&(conn)->ibc_refcount) > 0);        \
+        atomic_inc(&(conn)->ibc_refcount);                      \
+} while (0)
+
+#define kibnal_conn_decref(conn)                                              \
+do {                                                                          \
+        unsigned long   flags;                                                \
+                                                                              \
+        CDEBUG(D_NET, "conn[%p] (%d)--\n",                                    \
+               (conn), atomic_read(&(conn)->ibc_refcount));                   \
+        LASSERT(atomic_read(&(conn)->ibc_refcount) > 0);                      \
+        if (atomic_dec_and_test(&(conn)->ibc_refcount)) {                     \
+                spin_lock_irqsave(&kibnal_data.kib_reaper_lock, flags);       \
+                list_add_tail(&(conn)->ibc_list,                              \
+                              &kibnal_data.kib_reaper_conns);                 \
+                wake_up(&kibnal_data.kib_reaper_waitq);                       \
+                spin_unlock_irqrestore(&kibnal_data.kib_reaper_lock, flags);  \
+        }                                                                     \
+} while (0)
+
+#define kibnal_peer_addref(peer)                                \
+do {                                                            \
+        CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n",                \
+               (peer), libcfs_nid2str((peer)->ibp_nid),         \
+               atomic_read (&(peer)->ibp_refcount));            \
+        LASSERT(atomic_read(&(peer)->ibp_refcount) > 0);        \
+        atomic_inc(&(peer)->ibp_refcount);                      \
+} while (0)
+
+#define kibnal_peer_decref(peer)                                \
+do {                                                            \
+        CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n",                \
+               (peer), libcfs_nid2str((peer)->ibp_nid),         \
+               atomic_read (&(peer)->ibp_refcount));            \
+        LASSERT(atomic_read(&(peer)->ibp_refcount) > 0);        \
+        if (atomic_dec_and_test(&(peer)->ibp_refcount))         \
+                kibnal_destroy_peer(peer);                      \
+} while (0)
+
+/******************************************************************************/
+
 static inline struct list_head *
-kibnal_nid2peerlist (ptl_nid_t nid)
+kibnal_nid2peerlist (lnet_nid_t nid)
 {
         unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size;
 
@@ -429,42 +514,57 @@ kibnal_peer_active(kib_peer_t *peer)
 static inline void
 kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
 {
-        /* CAVEAT EMPTOR: tx takes caller's ref on conn */
+        struct list_head      *q;
 
         LASSERT (tx->tx_nsp > 0);               /* work items set up */
         LASSERT (tx->tx_conn == NULL);          /* only set here */
 
+        kibnal_conn_addref(conn);
         tx->tx_conn = conn;
-        tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
-        list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
-}
-
-#if 0
-static inline void
-kibnal_show_rdma_attr (kib_conn_t *conn)
-{
-        struct ib_qp_attribute qp_attr;
-        int                    rc;
-
-        memset (&qp_attr, 0, sizeof(qp_attr));
-        rc = ib_qp_query(conn->ibc_qp, &qp_attr);
-        if (rc != 0) {
-                CERROR ("Can't get qp attrs: %d\n", rc);
-                return;
+        tx->tx_deadline = jiffies + *kibnal_tunables.kib_timeout * HZ;
+
+        if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
+                /* All messages have simple credit control */
+                q = &conn->ibc_tx_queue;
+        } else {
+                LASSERT (conn->ibc_version == IBNAL_MSG_VERSION);
+                
+                switch (tx->tx_msg->ibm_type) {
+                case IBNAL_MSG_PUT_RDMA:
+                case IBNAL_MSG_GET_RDMA:
+                        /* RDMA request: reserve a buffer for the RDMA reply
+                         * before sending */
+                        q = &conn->ibc_tx_queue_rsrvd;
+                        break;
+
+                case IBNAL_MSG_PUT_DONE:
+                case IBNAL_MSG_GET_DONE:
+                        /* RDMA completion: no credits; peer has reserved a
+                         * reply buffer */
+                        q = &conn->ibc_tx_queue_nocred;
+                        break;
+                
+                case IBNAL_MSG_NOOP:
+                case IBNAL_MSG_IMMEDIATE:
+                        /* Otherwise: consume a credit before sending */
+                        q = &conn->ibc_tx_queue;
+                        break;
+                
+                default:
+                        LBUG();
+                        q = NULL;
+                }
         }
 
-        CWARN ("RDMA CAPABILITY: write %s read %s\n",
-               (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
-               (qp_attr.enable_rdma_write ? "enabled" : "disabled") : "invalid",
-               (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ?
-               (qp_attr.enable_rdma_read ? "enabled" : "disabled") : "invalid");
+        list_add_tail(&tx->tx_list, q);
 }
-#endif
 
-static inline __u64
-kibnal_page2phys (struct page *p)
+static inline int
+kibnal_send_keepalive(kib_conn_t *conn) 
 {
-        return page_to_phys(p);
+        return (*kibnal_tunables.kib_keepalive > 0) &&
+                time_after(jiffies, conn->ibc_last_send +
+                           *kibnal_tunables.kib_keepalive*HZ);
 }
 
 /* CAVEAT EMPTOR:
@@ -494,38 +594,63 @@ kibnal_wreqid_is_rx (__u64 wreqid)
         return (wreqid & 1) != 0;
 }
 
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
-# define sk_allocation  allocation
-# define sk_data_ready  data_ready
-# define sk_write_space write_space
-# define sk_user_data   user_data
-# define sk_prot        prot
-# define sk_sndbuf      sndbuf
-# define sk_socket      socket
-# define sk_wmem_queued wmem_queued
-# define sk_err         err
-# define sk_sleep       sleep
+#if (IB_NTXRXPARAMS == 3)
+static inline int
+kibnal_ib_send(ib_qp_t *qp, struct ib_send_param *p)
+{
+        return ib_send(qp, p, 1);
+}
+
+static inline int
+kibnal_ib_receive(ib_qp_t *qp, struct ib_receive_param *p)
+{
+        return ib_receive(qp, p, 1);
+}
+#elif (IB_NTXRXPARAMS == 4)
+static inline int
+kibnal_ib_send(ib_qp_t *qp, struct ib_send_param *p)
+{
+        return ib_send(qp, p, 1, NULL);
+}
+
+static inline int
+kibnal_ib_receive(ib_qp_t *qp, struct ib_receive_param *p)
+{
+        return ib_receive(qp, p, 1, NULL);
+}
+#else
+ #error "IB_NTXRXPARAMS not set correctly"
 #endif
 
+int kibnal_startup (lnet_ni_t *ni);
+void kibnal_shutdown (lnet_ni_t *ni);
+int kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int kibnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int kibnal_eager_recv (lnet_ni_t *ni, void *private, 
+                       lnet_msg_t *lntmsg, void **new_private);
+int kibnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, 
+                int delayed, unsigned int niov, 
+                struct iovec *iov, lnet_kiov_t *kiov,
+                unsigned int offset, unsigned int mlen, unsigned int rlen);
+int kibnal_accept(lnet_ni_t *ni, struct socket *sock);
+
 extern void kibnal_init_msg(kib_msg_t *msg, int type, int body_nob);
-extern void kibnal_pack_msg(kib_msg_t *msg, int credits, 
-                            ptl_nid_t dstnid, __u64 dststamp);
-extern int kibnal_unpack_msg(kib_msg_t *msg, int nob);
+extern void kibnal_pack_msg(kib_msg_t *msg, int version, int credits, 
+                            lnet_nid_t dstnid, __u64 dststamp);
+extern int kibnal_unpack_msg(kib_msg_t *msg, int expected_version, int nob);
 extern void kibnal_handle_svcqry (struct socket *sock);
 extern int kibnal_make_svcqry (kib_conn_t *conn);
 extern void kibnal_free_acceptsock (kib_acceptsock_t *as);
-extern int kibnal_listener_procint(ctl_table *table, int write, 
-                                   struct file *filp, void *buffer, 
-                                   size_t *lenp);
-extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid);
-extern void kibnal_put_peer (kib_peer_t *peer);
-extern int kibnal_del_peer (ptl_nid_t nid, int single_share);
-extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid);
+extern int kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid);
+extern void kibnal_destroy_peer (kib_peer_t *peer);
+extern int kibnal_add_persistent_peer(lnet_nid_t nid, __u32 ip, int port);
+extern int kibnal_del_peer (lnet_nid_t nid);
+extern kib_peer_t *kibnal_find_peer_locked (lnet_nid_t nid);
 extern void kibnal_unlink_peer_locked (kib_peer_t *peer);
+extern void kibnal_peer_alive(kib_peer_t *peer);
 extern int  kibnal_close_stale_conns_locked (kib_peer_t *peer,
                                               __u64 incarnation);
 extern kib_conn_t *kibnal_create_conn (void);
-extern void kibnal_put_conn (kib_conn_t *conn);
 extern void kibnal_destroy_conn (kib_conn_t *conn);
 extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access);
 extern void kibnal_free_pages (kib_pages_t *p);
@@ -548,16 +673,15 @@ extern int  kibnal_thread_start (int (*fn)(void *arg), void *arg);
 extern int  kibnal_scheduler(void *arg);
 extern int  kibnal_connd (void *arg);
 extern int  kibnal_reaper (void *arg);
-extern void kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg);
+extern void kibnal_callback (ib_cq_t *cq, struct ib_cq_entry *e, void *arg);
+extern void kibnal_txlist_done (struct list_head *txlist, int status);
 extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob);
 extern int  kibnal_close_conn (kib_conn_t *conn, int why);
 extern void kibnal_start_active_rdma (int type, int status,
-                                      kib_rx_t *rx, lib_msg_t *libmsg,
+                                      kib_rx_t *rx, lnet_msg_t *lntmsg,
                                       unsigned int niov,
-                                      struct iovec *iov, ptl_kiov_t *kiov,
+                                      struct iovec *iov, lnet_kiov_t *kiov,
                                       int offset, int nob);
 
-
-
-
-
+extern int  kibnal_tunables_init(void);
+extern void kibnal_tunables_fini(void);
index a356eaf..75f3e23 100644 (file)
@@ -21,7 +21,7 @@
  *
  */
 
-#include "openibnal.h"
+#include "openiblnd.h"
 
 /*
  *  LIB functions follow
@@ -43,7 +43,7 @@ kibnal_schedule_tx_done (kib_tx_t *tx)
 void
 kibnal_tx_done (kib_tx_t *tx)
 {
-        ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
+        lnet_msg_t      *lntmsg[2];
         unsigned long    flags;
         int              i;
         int              rc;
@@ -51,6 +51,12 @@ kibnal_tx_done (kib_tx_t *tx)
         LASSERT (tx->tx_sending == 0);          /* mustn't be awaiting callback */
         LASSERT (!tx->tx_passive_rdma_wait);    /* mustn't be awaiting RDMA */
 
+        if (in_interrupt()) {
+                /* can't deregister memory/flush FMAs/finalize in IRQ context... */
+                kibnal_schedule_tx_done(tx);
+                return;
+        }
+
         switch (tx->tx_mapped) {
         default:
                 LBUG();
@@ -59,11 +65,6 @@ kibnal_tx_done (kib_tx_t *tx)
                 break;
                 
         case KIB_TX_MAPPED:
-                if (in_interrupt()) {
-                        /* can't deregister memory in IRQ context... */
-                        kibnal_schedule_tx_done(tx);
-                        return;
-                }
                 rc = ib_memory_deregister(tx->tx_md.md_handle.mr);
                 LASSERT (rc == 0);
                 tx->tx_mapped = KIB_TX_UNMAPPED;
@@ -71,33 +72,27 @@ kibnal_tx_done (kib_tx_t *tx)
 
 #if IBNAL_FMR
         case KIB_TX_MAPPED_FMR:
-                if (in_interrupt() && tx->tx_status != 0) {
-                        /* can't flush FMRs in IRQ context... */
-                        kibnal_schedule_tx_done(tx);
-                        return;
-                }              
-
                 rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr);
                 LASSERT (rc == 0);
 
+#ifndef USING_TSAPI
+                /* Somewhat belt-and-braces since the tx's conn has closed if
+                 * this was a passive RDMA waiting to complete... */
                 if (tx->tx_status != 0)
                         ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
+#endif
                 tx->tx_mapped = KIB_TX_UNMAPPED;
                 break;
 #endif
         }
 
-        for (i = 0; i < 2; i++) {
-                /* tx may have up to 2 libmsgs to finalise */
-                if (tx->tx_libmsg[i] == NULL)
-                        continue;
+        /* tx may have up to 2 ptlmsgs to finalise */
+        lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
+        lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
+        rc = tx->tx_status;
 
-                lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
-                tx->tx_libmsg[i] = NULL;
-        }
-        
         if (tx->tx_conn != NULL) {
-                kibnal_put_conn (tx->tx_conn);
+                kibnal_conn_decref(tx->tx_conn);
                 tx->tx_conn = NULL;
         }
 
@@ -107,88 +102,53 @@ kibnal_tx_done (kib_tx_t *tx)
 
         spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
 
-        if (tx->tx_isnblk) {
-                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
-        } else {
-                list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
-                wake_up (&kibnal_data.kib_idle_tx_waitq);
-        }
+        list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
 
         spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+
+        /* delay finalize until my descs have been freed */
+        for (i = 0; i < 2; i++) {
+                if (lntmsg[i] == NULL)
+                        continue;
+
+                lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc);
+        }
 }
 
 kib_tx_t *
-kibnal_get_idle_tx (int may_block
+kibnal_get_idle_tx (void
 {
         unsigned long  flags;
-        kib_tx_t      *tx = NULL;
+        kib_tx_t      *tx;
         
-        for (;;) {
-                spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
-
-                /* "normal" descriptor is free */
-                if (!list_empty (&kibnal_data.kib_idle_txs)) {
-                        tx = list_entry (kibnal_data.kib_idle_txs.next,
-                                         kib_tx_t, tx_list);
-                        break;
-                }
-
-                if (!may_block) {
-                        /* may dip into reserve pool */
-                        if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
-                                CERROR ("reserved tx desc pool exhausted\n");
-                                break;
-                        }
-
-                        tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
-                                         kib_tx_t, tx_list);
-                        break;
-                }
+        spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
 
-                /* block for idle tx */
+        if (list_empty (&kibnal_data.kib_idle_txs)) {
                 spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
-
-                wait_event (kibnal_data.kib_idle_tx_waitq,
-                            !list_empty (&kibnal_data.kib_idle_txs) ||
-                            kibnal_data.kib_shutdown);
+                return NULL;
         }
 
-        if (tx != NULL) {
-                list_del (&tx->tx_list);
+        tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list);
+        list_del (&tx->tx_list);
 
-                /* Allocate a new passive RDMA completion cookie.  It might
-                 * not be needed, but we've got a lock right now and we're
-                 * unlikely to wrap... */
-                tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
-
-                LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
-                LASSERT (tx->tx_nsp == 0);
-                LASSERT (tx->tx_sending == 0);
-                LASSERT (tx->tx_status == 0);
-                LASSERT (tx->tx_conn == NULL);
-                LASSERT (!tx->tx_passive_rdma);
-                LASSERT (!tx->tx_passive_rdma_wait);
-                LASSERT (tx->tx_libmsg[0] == NULL);
-                LASSERT (tx->tx_libmsg[1] == NULL);
-        }
+        /* Allocate a new passive RDMA completion cookie.  It might not be
+         * needed, but we've got a lock right now and we're unlikely to
+         * wrap... */
+        tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
 
         spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
-        
-        return (tx);
-}
-
-int
-kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
-{
-        /* I would guess that if kibnal_get_peer (nid) == NULL,
-           and we're not routing, then 'nid' is very distant :) */
-        if ( nal->libnal_ni.ni_pid.nid == nid ) {
-                *dist = 0;
-        } else {
-                *dist = 1;
-        }
 
-        return 0;
+        LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+        LASSERT (tx->tx_nsp == 0);
+        LASSERT (tx->tx_sending == 0);
+        LASSERT (tx->tx_status == 0);
+        LASSERT (tx->tx_conn == NULL);
+        LASSERT (!tx->tx_passive_rdma);
+        LASSERT (!tx->tx_passive_rdma_wait);
+        LASSERT (tx->tx_lntmsg[0] == NULL);
+        LASSERT (tx->tx_lntmsg[1] == NULL);
+
+        return tx;
 }
 
 void
@@ -215,6 +175,8 @@ kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
 
                 CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
 
+                /* XXX Set mlength of reply here */
+
                 tx->tx_status = status;
                 tx->tx_passive_rdma_wait = 0;
                 idle = (tx->tx_sending == 0);
@@ -233,17 +195,20 @@ kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
                 
         spin_unlock_irqrestore (&conn->ibc_lock, flags);
 
-        CERROR ("Unmatched (late?) RDMA completion "LPX64" from "LPX64"\n",
-                cookie, conn->ibc_peer->ibp_nid);
+        CERROR ("Unmatched (late?) RDMA completion "LPX64" from %s\n",
+                cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 }
 
 void
-kibnal_post_rx (kib_rx_t *rx, int do_credits)
+kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit)
 {
         kib_conn_t   *conn = rx->rx_conn;
         int           rc;
         unsigned long flags;
 
+        LASSERT(!rsrvd_credit ||
+                conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
+
         rx->rx_gl = (struct ib_gather_scatter) {
                 .address = rx->rx_vaddr,
                 .length  = IBNAL_MSG_SIZE,
@@ -259,19 +224,24 @@ kibnal_post_rx (kib_rx_t *rx, int do_credits)
         };
 
         LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
-        LASSERT (!rx->rx_posted);
-        rx->rx_posted = 1;
+        LASSERT (rx->rx_nob >= 0);              /* not posted */
+        rx->rx_nob = -1;                        /* is now */
         mb();
 
         if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
                 rc = -ECONNABORTED;
         else
-                rc = ib_receive (conn->ibc_qp, &rx->rx_sp, 1);
+                rc = kibnal_ib_receive(conn->ibc_qp, &rx->rx_sp);
 
         if (rc == 0) {
-                if (do_credits) {
+                if (credit || rsrvd_credit) {
                         spin_lock_irqsave(&conn->ibc_lock, flags);
-                        conn->ibc_outstanding_credits++;
+
+                        if (credit)
+                                conn->ibc_outstanding_credits++;
+                        if (rsrvd_credit)
+                                conn->ibc_reserved_credits++;
+                        
                         spin_unlock_irqrestore(&conn->ibc_lock, flags);
 
                         kibnal_check_sends(conn);
@@ -280,16 +250,16 @@ kibnal_post_rx (kib_rx_t *rx, int do_credits)
         }
 
         if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
-                CERROR ("Error posting receive -> "LPX64": %d\n",
-                        conn->ibc_peer->ibp_nid, rc);
+                CERROR ("Error posting receive -> %s: %d\n",
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
                 kibnal_close_conn (rx->rx_conn, rc);
         } else {
-                CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n",
-                        conn->ibc_peer->ibp_nid, rc);
+                CDEBUG (D_NET, "Error posting receive -> %s: %d\n",
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
         }
 
         /* Drop rx's ref */
-        kibnal_put_conn (conn);
+        kibnal_conn_decref(conn);
 }
 
 void
@@ -301,10 +271,11 @@ kibnal_rx_callback (struct ib_cq_entry *e)
         int           credits;
         unsigned long flags;
         int           rc;
+        int           err = -ECONNABORTED;
 
         CDEBUG (D_NET, "rx %p conn %p\n", rx, conn);
-        LASSERT (rx->rx_posted);
-        rx->rx_posted = 0;
+        LASSERT (rx->rx_nob < 0);               /* was posted */
+        rx->rx_nob = 0;                         /* isn't now */
         mb();
 
         /* receives complete with error in any case after we've started
@@ -316,24 +287,31 @@ kibnal_rx_callback (struct ib_cq_entry *e)
         LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
 
         if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
-                CERROR("Rx from "LPX64" failed: %d\n", 
-                       conn->ibc_peer->ibp_nid, e->status);
+                CERROR("Rx from %s failed: %d\n", 
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), e->status);
                 goto failed;
         }
 
-        rc = kibnal_unpack_msg(msg, e->bytes_transferred);
+        LASSERT (e->bytes_transferred >= 0);
+        rx->rx_nob = e->bytes_transferred;
+        mb();
+
+        rc = kibnal_unpack_msg(msg, conn->ibc_version, rx->rx_nob);
         if (rc != 0) {
-                CERROR ("Error %d unpacking rx from "LPX64"\n",
-                        rc, conn->ibc_peer->ibp_nid);
+                CERROR ("Error %d unpacking rx from %s\n",
+                        rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                 goto failed;
         }
 
-        if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+        if (!lnet_ptlcompat_matchnid(conn->ibc_peer->ibp_nid,
+                                     msg->ibm_srcnid) ||
+            !lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
+                                     msg->ibm_dstnid) ||
             msg->ibm_srcstamp != conn->ibc_incarnation ||
-            msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
             msg->ibm_dststamp != kibnal_data.kib_incarnation) {
-                CERROR ("Stale rx from "LPX64"\n",
-                        conn->ibc_peer->ibp_nid);
+                CERROR ("Stale rx from %s\n",
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                err = -ESTALE;
                 goto failed;
         }
 
@@ -349,7 +327,7 @@ kibnal_rx_callback (struct ib_cq_entry *e)
 
         switch (msg->ibm_type) {
         case IBNAL_MSG_NOOP:
-                kibnal_post_rx (rx, 1);
+                kibnal_post_rx (rx, 1, 0);
                 return;
 
         case IBNAL_MSG_IMMEDIATE:
@@ -373,15 +351,23 @@ kibnal_rx_callback (struct ib_cq_entry *e)
                 kibnal_complete_passive_rdma (conn, 
                                               msg->ibm_u.completion.ibcm_cookie,
                                               msg->ibm_u.completion.ibcm_status);
-                kibnal_post_rx (rx, 1);
+
+                if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
+                        kibnal_post_rx (rx, 1, 0);
+                } else {
+                        /* this reply buffer was pre-reserved */
+                        kibnal_post_rx (rx, 0, 1);
+                }
                 return;
                         
         default:
-                CERROR ("Bad msg type %x from "LPX64"\n",
-                        msg->ibm_type, conn->ibc_peer->ibp_nid);
+                CERROR ("Bad msg type %x from %s\n",
+                        msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                 goto failed;
         }
 
+        kibnal_peer_alive(conn->ibc_peer);
+
         /* schedule for kibnal_rx() in thread context */
         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
         
@@ -393,61 +379,43 @@ kibnal_rx_callback (struct ib_cq_entry *e)
         
  failed:
         CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
-        kibnal_close_conn(conn, -ECONNABORTED);
+        kibnal_close_conn(conn, err);
 
         /* Don't re-post rx & drop its ref on conn */
-        kibnal_put_conn(conn);
+        kibnal_conn_decref(conn);
 }
 
 void
 kibnal_rx (kib_rx_t *rx)
 {
+        int          rc = 0;
         kib_msg_t   *msg = rx->rx_msg;
 
-        /* Clear flag so I can detect if I've sent an RDMA completion */
-        rx->rx_rdma = 0;
-
         switch (msg->ibm_type) {
         case IBNAL_MSG_GET_RDMA:
-                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
-                /* If the incoming get was matched, I'll have initiated the
-                 * RDMA and the completion message... */
-                if (rx->rx_rdma)
-                        break;
-
-                /* Otherwise, I'll send a failed completion now to prevent
-                 * the peer's GET blocking for the full timeout. */
-                CERROR ("Completing unmatched RDMA GET from "LPX64"\n",
-                        rx->rx_conn->ibc_peer->ibp_nid);
-                kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO,
-                                          rx, NULL, 0, NULL, NULL, 0, 0);
+                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr,
+                                msg->ibm_srcnid, rx, 1);
                 break;
                 
         case IBNAL_MSG_PUT_RDMA:
-                lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx);
-                if (rx->rx_rdma)
-                        break;
-                /* This is most unusual, since even if lib_parse() didn't
-                 * match anything, it should have asked us to read (and
-                 * discard) the payload.  The portals header must be
-                 * inconsistent with this message type, so it's the
-                 * sender's fault for sending garbage and she can time
-                 * herself out... */
-                CERROR ("Uncompleted RMDA PUT from "LPX64"\n",
-                        rx->rx_conn->ibc_peer->ibp_nid);
+                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr,
+                                msg->ibm_srcnid, rx, 1);
                 break;
 
         case IBNAL_MSG_IMMEDIATE:
-                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
-                LASSERT (!rx->rx_rdma);
+                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr,
+                                msg->ibm_srcnid, rx, 0);
                 break;
-                
+
         default:
                 LBUG();
                 break;
         }
 
-        kibnal_post_rx (rx, 1);
+        if (rc < 0) {
+                kibnal_close_conn(rx->rx_conn, rc);
+                kibnal_post_rx (rx, 1, 0);
+        }
 }
 
 #if 0
@@ -472,14 +440,14 @@ kibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp)
             !VALID_PAGE (page))
                 return (-EFAULT);
 
-        *physp = kibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1));
+        *physp = lnet_page2phys(page) + (vaddr & (PAGE_SIZE - 1));
         return (0);
 }
 #endif
 
 int
-kibnal_map_iov (kib_tx_t *tx, enum ib_memory_access access,
-                 int niov, struct iovec *iov, int offset, int nob)
+kibnal_map_iov (kib_tx_t *tx, int access,
+                unsigned int niov, struct iovec *iov, int offset, int nob)
                  
 {
         void   *vaddr;
@@ -521,8 +489,8 @@ kibnal_map_iov (kib_tx_t *tx, enum ib_memory_access access,
 }
 
 int
-kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access,
-                  int nkiov, ptl_kiov_t *kiov,
+kibnal_map_kiov (kib_tx_t *tx, int access,
+                  int nkiov, lnet_kiov_t *kiov,
                   int offset, int nob)
 {
 #if IBNAL_FMR
@@ -552,7 +520,7 @@ kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access,
         }
 
         phys_size = nkiov * sizeof (*phys);
-        PORTAL_ALLOC(phys, phys_size);
+        LIBCFS_ALLOC(phys, phys_size);
         if (phys == NULL) {
                 CERROR ("Can't allocate tmp phys\n");
                 return (-ENOMEM);
@@ -560,9 +528,9 @@ kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access,
 
         page_offset = kiov->kiov_offset + offset;
 #if IBNAL_FMR
-        phys[0] = kibnal_page2phys(kiov->kiov_page);
+        phys[0] = lnet_page2phys(kiov->kiov_page);
 #else
-        phys[0].address = kibnal_page2phys(kiov->kiov_page);
+        phys[0].address = lnet_page2phys(kiov->kiov_page);
         phys[0].size = PAGE_SIZE;
 #endif
         nphys = 1;
@@ -592,7 +560,7 @@ kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access,
                         goto out;
                 }
 
-                if (nphys == PTL_MD_MAX_IOV) {
+                if (nphys == LNET_MAX_IOV) {
                         CERROR ("payload too big (%d)\n", nphys);
                         rc = -EMSGSIZE;
                         goto out;
@@ -600,9 +568,9 @@ kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access,
 
                 LASSERT (nphys * sizeof (*phys) < phys_size);
 #if IBNAL_FMR
-                phys[nphys] = kibnal_page2phys(kiov->kiov_page);
+                phys[nphys] = lnet_page2phys(kiov->kiov_page);
 #else
-                phys[nphys].address = kibnal_page2phys(kiov->kiov_page);
+                phys[nphys].address = lnet_page2phys(kiov->kiov_page);
                 phys[nphys].size = PAGE_SIZE;
 #endif
                 nphys++;
@@ -640,7 +608,7 @@ kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access,
         }
 
  out:
-        PORTAL_FREE(phys, phys_size);
+        LIBCFS_FREE(phys, phys_size);
         return (rc);
 }
 
@@ -664,31 +632,57 @@ kibnal_check_sends (kib_conn_t *conn)
         kib_tx_t       *tx;
         int             rc;
         int             i;
+        int             consume_credit;
         int             done;
         int             nwork;
 
         spin_lock_irqsave (&conn->ibc_lock, flags);
 
-        LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
+        LASSERT (conn->ibc_nsends_posted <= IBNAL_RX_MSGS);
+        LASSERT (conn->ibc_reserved_credits >= 0);
+
+        while (conn->ibc_reserved_credits > 0 &&
+               !list_empty(&conn->ibc_tx_queue_rsrvd)) {
+                LASSERT (conn->ibc_version !=
+                         IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
+                tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
+                                kib_tx_t, tx_list);
+                list_del(&tx->tx_list);
+                list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+                conn->ibc_reserved_credits--;
+        }
 
         if (list_empty(&conn->ibc_tx_queue) &&
-            conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
+            list_empty(&conn->ibc_tx_queue_nocred) &&
+            (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER ||
+             kibnal_send_keepalive(conn))) {
                 spin_unlock_irqrestore(&conn->ibc_lock, flags);
                 
-                tx = kibnal_get_idle_tx(0);     /* don't block */
+                tx = kibnal_get_idle_tx();
                 if (tx != NULL)
                         kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
 
                 spin_lock_irqsave(&conn->ibc_lock, flags);
                 
-                if (tx != NULL) {
-                        atomic_inc(&conn->ibc_refcount);
+                if (tx != NULL)
                         kibnal_queue_tx_locked(tx, conn);
-                }
         }
 
-        while (!list_empty (&conn->ibc_tx_queue)) {
-                tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
+        for (;;) {
+                if (!list_empty(&conn->ibc_tx_queue_nocred)) {
+                        LASSERT (conn->ibc_version !=
+                                 IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
+                        tx = list_entry(conn->ibc_tx_queue_nocred.next,
+                                        kib_tx_t, tx_list);
+                        consume_credit = 0;
+                } else if (!list_empty (&conn->ibc_tx_queue)) {
+                        tx = list_entry (conn->ibc_tx_queue.next, 
+                                         kib_tx_t, tx_list);
+                        consume_credit = 1;
+                } else {
+                        /* nothing waiting */
+                        break;
+                }
 
                 /* We rely on this for QP sizing */
                 LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= 2);
@@ -701,21 +695,25 @@ kibnal_check_sends (kib_conn_t *conn)
                 /* Not on ibc_rdma_queue */
                 LASSERT (!tx->tx_passive_rdma_wait);
 
-                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
+                if (conn->ibc_nsends_posted == IBNAL_RX_MSGS)
                         break;
 
-                if (conn->ibc_credits == 0)     /* no credits */
-                        break;
+                if (consume_credit) {
+                        if (conn->ibc_credits == 0)     /* no credits */
+                                break;
+                
+                        if (conn->ibc_credits == 1 &&   /* last credit reserved for */
+                            conn->ibc_outstanding_credits == 0) /* giving back credits */
+                                break;
+                }
                 
-                if (conn->ibc_credits == 1 &&   /* last credit reserved for */
-                    conn->ibc_outstanding_credits == 0) /* giving back credits */
-                        break;
-
                 list_del (&tx->tx_list);
 
                 if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
                     (!list_empty(&conn->ibc_tx_queue) ||
-                     conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
+                     !list_empty(&conn->ibc_tx_queue_nocred) ||
+                     (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER &&
+                      !kibnal_send_keepalive(conn)))) {
                         /* redundant NOOP */
                         spin_unlock_irqrestore(&conn->ibc_lock, flags);
                         kibnal_tx_done(tx);
@@ -723,12 +721,14 @@ kibnal_check_sends (kib_conn_t *conn)
                         continue;
                 }
 
-                kibnal_pack_msg(tx->tx_msg, conn->ibc_outstanding_credits,
+                kibnal_pack_msg(tx->tx_msg, conn->ibc_version,
+                                conn->ibc_outstanding_credits,
                                 conn->ibc_peer->ibp_nid, conn->ibc_incarnation);
 
                 conn->ibc_outstanding_credits = 0;
                 conn->ibc_nsends_posted++;
-                conn->ibc_credits--;
+                if (consume_credit)
+                        conn->ibc_credits--;
 
                 tx->tx_sending = tx->tx_nsp;
                 tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
@@ -747,19 +747,22 @@ kibnal_check_sends (kib_conn_t *conn)
                         tx->tx_status = 0;
                         /* Driver only accepts 1 item at a time */
                         for (i = 0; i < tx->tx_nsp; i++) {
-                                rc = ib_send (conn->ibc_qp, &tx->tx_sp[i], 1);
+                                rc = kibnal_ib_send(conn->ibc_qp, &tx->tx_sp[i]);
                                 if (rc != 0)
                                         break;
                                 nwork++;
                         }
                 }
 
+                conn->ibc_last_send = jiffies;
+
                 spin_lock_irqsave (&conn->ibc_lock, flags);
                 if (rc != 0) {
                         /* NB credits are transferred in the actual
                          * message, which can only be the last work item */
                         conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
-                        conn->ibc_credits++;
+                        if (consume_credit)
+                                conn->ibc_credits++;
                         conn->ibc_nsends_posted--;
 
                         tx->tx_status = rc;
@@ -773,11 +776,11 @@ kibnal_check_sends (kib_conn_t *conn)
                         spin_unlock_irqrestore (&conn->ibc_lock, flags);
                         
                         if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
-                                CERROR ("Error %d posting transmit to "LPX64"\n", 
-                                        rc, conn->ibc_peer->ibp_nid);
+                                CERROR ("Error %d posting transmit to %s\n", 
+                                        rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                         else
-                                CDEBUG (D_NET, "Error %d posting transmit to "
-                                        LPX64"\n", rc, conn->ibc_peer->ibp_nid);
+                                CDEBUG (D_NET, "Error %d posting transmit to %s\n",
+                                        rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 
                         kibnal_close_conn (conn, rc);
 
@@ -820,10 +823,7 @@ kibnal_tx_callback (struct ib_cq_entry *e)
         if (idle)
                 list_del(&tx->tx_list);
 
-        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-               atomic_read (&conn->ibc_refcount));
-        atomic_inc (&conn->ibc_refcount);
+        kibnal_conn_addref(conn);
 
         if (tx->tx_sending == 0)
                 conn->ibc_nsends_posted--;
@@ -838,19 +838,20 @@ kibnal_tx_callback (struct ib_cq_entry *e)
                 kibnal_tx_done (tx);
 
         if (e->status != IB_COMPLETION_STATUS_SUCCESS) {
-                CERROR ("Tx completion to "LPX64" failed: %d\n", 
-                        conn->ibc_peer->ibp_nid, e->status);
+                CDEBUG (D_NETERROR, "Tx completion to %s failed: %d\n", 
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid), e->status);
                 kibnal_close_conn (conn, -ENETDOWN);
         } else {
+                kibnal_peer_alive(conn->ibc_peer);
                 /* can I shovel some more sends out the door? */
                 kibnal_check_sends(conn);
         }
 
-        kibnal_put_conn (conn);
+        kibnal_conn_decref(conn);
 }
 
 void
-kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg)
+kibnal_callback (ib_cq_t *cq, struct ib_cq_entry *e, void *arg)
 {
         if (kibnal_wreqid_is_rx(e->work_request_id))
                 kibnal_rx_callback (e);
@@ -921,7 +922,7 @@ kibnal_schedule_active_connect_locked (kib_peer_t *peer)
         /* Called with exclusive kib_global_lock */
 
         peer->ibp_connecting++;
-        atomic_inc (&peer->ibp_refcount); /* extra ref for connd */
+        kibnal_peer_addref(peer); /* extra ref for connd */
         
         spin_lock (&kibnal_data.kib_connd_lock);
         
@@ -934,11 +935,13 @@ kibnal_schedule_active_connect_locked (kib_peer_t *peer)
 }
 
 void
-kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
+kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid)
 {
         unsigned long    flags;
         kib_peer_t      *peer;
         kib_conn_t      *conn;
+        int              retry;
+        int              rc;
         rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
 
         /* If I get here, I've committed to send, so I complete the tx with
@@ -947,55 +950,65 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
         LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
         LASSERT (tx->tx_nsp > 0);               /* work items have been set up */
 
-        read_lock_irqsave(g_lock, flags);
+        for (retry = 0; ; retry = 1) {
+                read_lock_irqsave(g_lock, flags);
         
-        peer = kibnal_find_peer_locked (nid);
-        if (peer == NULL) {
-                read_unlock_irqrestore(g_lock, flags);
-                tx->tx_status = -EHOSTUNREACH;
-                kibnal_tx_done (tx);
-                return;
-        }
-
-        conn = kibnal_find_conn_locked (peer);
-        if (conn != NULL) {
-                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                       atomic_read (&conn->ibc_refcount));
-                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
-                read_unlock_irqrestore(g_lock, flags);
+                peer = kibnal_find_peer_locked (nid);
+                if (peer != NULL) {
+                        conn = kibnal_find_conn_locked (peer);
+                        if (conn != NULL) {
+                                kibnal_conn_addref(conn); /* 1 ref for me...*/
+                                read_unlock_irqrestore(g_lock, flags);
                 
-                kibnal_queue_tx (tx, conn);
-                return;
-        }
-        
-        /* Making one or more connections; I'll need a write lock... */
-        read_unlock(g_lock);
-        write_lock(g_lock);
+                                kibnal_queue_tx (tx, conn);
+                                kibnal_conn_decref(conn); /* ...until here */
+                                return;
+                        }
+                }
+                
+                /* Making one or more connections; I'll need a write lock... */
+                read_unlock(g_lock);
+                write_lock(g_lock);
 
-        peer = kibnal_find_peer_locked (nid);
-        if (peer == NULL) {
+                peer = kibnal_find_peer_locked (nid);
+                if (peer != NULL)
+                        break;
+                
                 write_unlock_irqrestore (g_lock, flags);
-                tx->tx_status = -EHOSTUNREACH;
-                kibnal_tx_done (tx);
-                return;
+
+                if (retry) {
+                        CERROR("Can't find peer %s\n", libcfs_nid2str(nid));
+                        tx->tx_status = -EHOSTUNREACH;
+                        kibnal_tx_done (tx);
+                        return;
+                }
+
+                rc = kibnal_add_persistent_peer(nid, LNET_NIDADDR(nid),
+                                                lnet_acceptor_port());
+                if (rc != 0) {
+                        CERROR("Can't add peer %s: %d\n",
+                               libcfs_nid2str(nid), rc);
+                        tx->tx_status = rc;
+                        kibnal_tx_done(tx);
+                        return;
+                }
         }
 
         conn = kibnal_find_conn_locked (peer);
         if (conn != NULL) {
                 /* Connection exists; queue message on it */
-                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                       atomic_read (&conn->ibc_refcount));
-                atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
+                kibnal_conn_addref(conn);       /* +1 ref from me... */
                 write_unlock_irqrestore (g_lock, flags);
                 
                 kibnal_queue_tx (tx, conn);
+                kibnal_conn_decref(conn);       /* ...until here */
                 return;
         }
 
-        if (peer->ibp_connecting == 0) {
-                if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
+        if (peer->ibp_connecting == 0 &&
+            peer->ibp_accepting == 0) {
+                if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */
+                      time_after_eq(jiffies, peer->ibp_reconnect_time))) {
                         write_unlock_irqrestore (g_lock, flags);
                         tx->tx_status = -EHOSTUNREACH;
                         kibnal_tx_done (tx);
@@ -1011,11 +1024,27 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
         write_unlock_irqrestore (g_lock, flags);
 }
 
-ptl_err_t
-kibnal_start_passive_rdma (int type, ptl_nid_t nid,
-                            lib_msg_t *libmsg, ptl_hdr_t *hdr)
+void
+kibnal_txlist_done (struct list_head *txlist, int status)
+{
+        kib_tx_t *tx;
+
+        while (!list_empty(txlist)) {
+                tx = list_entry (txlist->next, kib_tx_t, tx_list);
+
+                list_del (&tx->tx_list);
+                /* complete now */
+                tx->tx_status = status;
+                kibnal_tx_done (tx);
+        }
+}
+
+int
+kibnal_start_passive_rdma (int type, lnet_msg_t *lntmsg,
+                           int niov, struct iovec *iov, lnet_kiov_t *kiov,
+                           int nob)
 {
-        int         nob = libmsg->md->length;
+        lnet_nid_t  nid = lntmsg->msg_target.nid;
         kib_tx_t   *tx;
         kib_msg_t  *ibmsg;
         int         rc;
@@ -1033,32 +1062,33 @@ kibnal_start_passive_rdma (int type, ptl_nid_t nid,
                          IB_ACCESS_LOCAL_WRITE;
         }
 
-        tx = kibnal_get_idle_tx (1);           /* May block; caller is an app thread */
-        LASSERT (tx != NULL);
+        tx = kibnal_get_idle_tx ();
+        if (tx == NULL) {
+                CERROR("Can't allocate %s txd for %s\n",
+                       (type == IBNAL_MSG_PUT_RDMA) ? "PUT/REPLY" : "GET",
+                       libcfs_nid2str(nid));
+                return -ENOMEM;
+        }
 
-        if ((libmsg->md->options & PTL_MD_KIOV) == 0) 
-                rc = kibnal_map_iov (tx, access,
-                                     libmsg->md->md_niov,
-                                     libmsg->md->md_iov.iov,
-                                     0, nob);
+        
+        if (iov != NULL) 
+                rc = kibnal_map_iov (tx, access, niov, iov, 0, nob);
         else
-                rc = kibnal_map_kiov (tx, access,
-                                      libmsg->md->md_niov, 
-                                      libmsg->md->md_iov.kiov,
-                                      0, nob);
+                rc = kibnal_map_kiov (tx, access, niov, kiov, 0, nob);
 
         if (rc != 0) {
-                CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc);
+                CERROR ("Can't map RDMA for %s: %d\n", 
+                        libcfs_nid2str(nid), rc);
                 goto failed;
         }
         
         if (type == IBNAL_MSG_GET_RDMA) {
                 /* reply gets finalized when tx completes */
-                tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib,
-                                                        nid, libmsg);
-                if (tx->tx_libmsg[1] == NULL) {
-                        CERROR ("Can't create reply for GET -> "LPX64"\n",
-                                nid);
+                tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni,
+                                                         lntmsg);
+                if (tx->tx_lntmsg[1] == NULL) {
+                        CERROR ("Can't create reply for GET -> %s\n",
+                                libcfs_nid2str(nid));
                         rc = -ENOMEM;
                         goto failed;
                 }
@@ -1068,7 +1098,7 @@ kibnal_start_passive_rdma (int type, ptl_nid_t nid,
 
         ibmsg = tx->tx_msg;
 
-        ibmsg->ibm_u.rdma.ibrm_hdr = *hdr;
+        ibmsg->ibm_u.rdma.ibrm_hdr = lntmsg->msg_hdr;
         ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
         ibmsg->ibm_u.rdma.ibrm_desc.rd_key = tx->tx_md.md_rkey;
         ibmsg->ibm_u.rdma.ibrm_desc.rd_addr = tx->tx_md.md_addr;
@@ -1081,24 +1111,24 @@ kibnal_start_passive_rdma (int type, ptl_nid_t nid,
                tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
                tx->tx_md.md_addr, nob);
         
-        /* libmsg gets finalized when tx completes. */
-        tx->tx_libmsg[0] = libmsg;
+        /* lntmsg gets finalized when tx completes. */
+        tx->tx_lntmsg[0] = lntmsg;
 
         kibnal_launch_tx(tx, nid);
-        return (PTL_OK);
+        return (0);
 
  failed:
         tx->tx_status = rc;
         kibnal_tx_done (tx);
-        return (PTL_FAIL);
+        return (-EIO);
 }
 
 void
 kibnal_start_active_rdma (int type, int status,
-                           kib_rx_t *rx, lib_msg_t *libmsg, 
-                           unsigned int niov,
-                           struct iovec *iov, ptl_kiov_t *kiov,
-                           int offset, int nob)
+                          kib_rx_t *rx, lnet_msg_t *lntmsg, 
+                          unsigned int niov,
+                          struct iovec *iov, lnet_kiov_t *kiov,
+                          int offset, int nob)
 {
         kib_msg_t    *rxmsg = rx->rx_msg;
         kib_msg_t    *txmsg;
@@ -1122,12 +1152,6 @@ kibnal_start_active_rdma (int type, int status,
         LASSERT (type == IBNAL_MSG_GET_DONE ||
                  type == IBNAL_MSG_PUT_DONE);
 
-        /* Flag I'm completing the RDMA.  Even if I fail to send the
-         * completion message, I will have tried my best so further
-         * attempts shouldn't be tried. */
-        LASSERT (!rx->rx_rdma);
-        rx->rx_rdma = 1;
-
         if (type == IBNAL_MSG_GET_DONE) {
                 access   = 0;
                 rdma_op  = IB_OP_RDMA_WRITE;
@@ -1138,12 +1162,12 @@ kibnal_start_active_rdma (int type, int status,
                 LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
         }
 
-        tx = kibnal_get_idle_tx (0);           /* Mustn't block */
+        tx = kibnal_get_idle_tx ();
         if (tx == NULL) {
-                CERROR ("tx descs exhausted on RDMA from "LPX64
+                CERROR ("tx descs exhausted on RDMA from %s"
                         " completing locally with failure\n",
-                        rx->rx_conn->ibc_peer->ibp_nid);
-                lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE);
+                        libcfs_nid2str(rx->rx_conn->ibc_peer->ibp_nid));
+                lnet_finalize (kibnal_data.kib_ni, lntmsg, -ENOMEM);
                 return;
         }
         LASSERT (tx->tx_nsp == 0);
@@ -1161,8 +1185,9 @@ kibnal_start_active_rdma (int type, int status,
                                              niov, iov, offset, nob);
                 
                 if (rc != 0) {
-                        CERROR ("Can't map RDMA -> "LPX64": %d\n", 
-                                rx->rx_conn->ibc_peer->ibp_nid, rc);
+                        CERROR ("Can't map RDMA -> %s: %d\n", 
+                                libcfs_nid2str(rx->rx_conn->ibc_peer->ibp_nid), 
+                                rc);
                         /* We'll skip the RDMA and complete with failure. */
                         status = rc;
                         nob = 0;
@@ -1201,53 +1226,45 @@ kibnal_start_active_rdma (int type, int status,
 
         if (status == 0 && nob != 0) {
                 LASSERT (tx->tx_nsp > 1);
-                /* RDMA: libmsg gets finalized when the tx completes.  This
+                /* RDMA: lntmsg gets finalized when the tx completes.  This
                  * is after the completion message has been sent, which in
                  * turn is after the RDMA has finished. */
-                tx->tx_libmsg[0] = libmsg;
+                tx->tx_lntmsg[0] = lntmsg;
         } else {
                 LASSERT (tx->tx_nsp == 1);
                 /* No RDMA: local completion happens now! */
                 CDEBUG(D_NET, "No data: immediate completion\n");
-                lib_finalize (&kibnal_lib, NULL, libmsg,
-                              status == 0 ? PTL_OK : PTL_FAIL);
+                lnet_finalize (kibnal_data.kib_ni, lntmsg,
+                              status == 0 ? 0 : -EIO);
         }
 
-        /* +1 ref for this tx... */
-        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-               rx->rx_conn, rx->rx_conn->ibc_state, 
-               rx->rx_conn->ibc_peer->ibp_nid,
-               atomic_read (&rx->rx_conn->ibc_refcount));
-        atomic_inc (&rx->rx_conn->ibc_refcount);
-        /* ...and queue it up */
         kibnal_queue_tx(tx, rx->rx_conn);
 }
 
-ptl_err_t
-kibnal_sendmsg(lib_nal_t    *nal, 
-                void         *private,
-                lib_msg_t    *libmsg,
-                ptl_hdr_t    *hdr, 
-                int           type, 
-                ptl_nid_t     nid, 
-                ptl_pid_t     pid,
-                unsigned int  payload_niov, 
-                struct iovec *payload_iov, 
-                ptl_kiov_t   *payload_kiov,
-                int           payload_offset,
-                int           payload_nob)
+int
+kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
 {
-        kib_msg_t  *ibmsg;
-        kib_tx_t   *tx;
-        int         nob;
+        lnet_hdr_t       *hdr = &lntmsg->msg_hdr; 
+        int               type = lntmsg->msg_type; 
+        lnet_process_id_t target = lntmsg->msg_target;
+        int               target_is_router = lntmsg->msg_target_is_router;
+        int               routing = lntmsg->msg_routing;
+        unsigned int      payload_niov = lntmsg->msg_niov; 
+        struct iovec     *payload_iov = lntmsg->msg_iov; 
+        lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
+        unsigned int      payload_offset = lntmsg->msg_offset;
+        unsigned int      payload_nob = lntmsg->msg_len;
+        kib_msg_t        *ibmsg;
+        kib_tx_t         *tx;
+        int               nob;
 
         /* NB 'private' is different depending on what we're sending.... */
 
-        CDEBUG(D_NET, "sending %d bytes in %d frags to nid:"LPX64" pid %d\n",
-               payload_nob, payload_niov, nid , pid);
+        CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+               payload_nob, payload_niov, libcfs_id2str(target));
 
         LASSERT (payload_nob == 0 || payload_niov > 0);
-        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+        LASSERT (payload_niov <= LNET_MAX_IOV);
 
         /* Thread context if we're sending payload */
         LASSERT (!in_interrupt() || payload_niov == 0);
@@ -1257,126 +1274,111 @@ kibnal_sendmsg(lib_nal_t    *nal,
         switch (type) {
         default:
                 LBUG();
-                return (PTL_FAIL);
-                
-        case PTL_MSG_REPLY: {
-                /* reply's 'private' is the incoming receive */
-                kib_rx_t *rx = private;
-
-                /* RDMA reply expected? */
-                if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) {
-                        kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
-                                                 rx, libmsg, payload_niov, 
-                                                 payload_iov, payload_kiov,
-                                                 payload_offset, payload_nob);
-                        return (PTL_OK);
-                }
+                return (-EIO);
                 
-                /* Incoming message consistent with immediate reply? */
-                if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
-                        CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n",
-                                nid, rx->rx_msg->ibm_type);
-                        return (PTL_FAIL);
-                }
-
-                /* Will it fit in a message? */
-                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
-                if (nob > IBNAL_MSG_SIZE) {
-                        CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", 
-                               nid, payload_nob);
-                        return (PTL_FAIL);
-                }
-                break;
-        }
-
-        case PTL_MSG_GET:
-                /* might the REPLY message be big enough to need RDMA? */
-                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
-                if (nob > IBNAL_MSG_SIZE)
-                        return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 
-                                                          nid, libmsg, hdr));
-                break;
-
-        case PTL_MSG_ACK:
+        case LNET_MSG_ACK:
                 LASSERT (payload_nob == 0);
                 break;
 
-        case PTL_MSG_PUT:
-                /* Is the payload big enough to need RDMA? */
+        case LNET_MSG_GET:
+                if (routing || target_is_router)
+                        break;                  /* send IMMEDIATE */
+
+                /* is the REPLY message too small for RDMA? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
+                if (nob <= IBNAL_MSG_SIZE)
+                        break;                  /* send IMMEDIATE */
+
+                if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+                        return kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, lntmsg, 
+                                                         lntmsg->msg_md->md_niov, 
+                                                         lntmsg->msg_md->md_iov.iov, NULL,
+                                                         lntmsg->msg_md->md_length);
+
+                return kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, lntmsg, 
+                                                 lntmsg->msg_md->md_niov, 
+                                                 NULL, lntmsg->msg_md->md_iov.kiov,
+                                                 lntmsg->msg_md->md_length);
+
+        case LNET_MSG_REPLY:
+        case LNET_MSG_PUT:
+                /* Is the payload small enough not to need RDMA? */
                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
-                if (nob > IBNAL_MSG_SIZE)
-                        return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
-                                                          nid, libmsg, hdr));
+                if (nob <= IBNAL_MSG_SIZE)
+                        break;                  /* send IMMEDIATE */
                 
-                break;
+                return kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, lntmsg,
+                                                 payload_niov,
+                                                 payload_iov, payload_kiov,
+                                                 payload_nob);
         }
 
-        tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
-                                  type == PTL_MSG_REPLY ||
-                                  in_interrupt()));
+        /* Send IMMEDIATE */
+
+        tx = kibnal_get_idle_tx();
         if (tx == NULL) {
-                CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", 
-                        type, nid, in_interrupt() ? " (intr)" : "");
-                return (PTL_NO_SPACE);
+                CERROR ("Can't send %d to %s: tx descs exhausted%s\n", 
+                        type, libcfs_nid2str(target.nid), 
+                        in_interrupt() ? " (intr)" : "");
+                return (-ENOMEM);
         }
 
         ibmsg = tx->tx_msg;
         ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
 
-        if (payload_nob > 0) {
-                if (payload_kiov != NULL)
-                        lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
-                                          payload_niov, payload_kiov,
-                                          payload_offset, payload_nob);
-                else
-                        lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
-                                         payload_niov, payload_iov,
-                                         payload_offset, payload_nob);
-        }
+        if (payload_kiov != NULL)
+                lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg,
+                                    offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                    payload_niov, payload_kiov, 
+                                    payload_offset, payload_nob);
+        else
+                lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg,
+                                   offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                   payload_niov, payload_iov, 
+                                   payload_offset, payload_nob);
 
         kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
                             offsetof(kib_immediate_msg_t, 
                                      ibim_payload[payload_nob]));
 
-        /* libmsg gets finalized when tx completes */
-        tx->tx_libmsg[0] = libmsg;
+        /* lntmsg gets finalized when tx completes */
+        tx->tx_lntmsg[0] = lntmsg;
 
-        kibnal_launch_tx(tx, nid);
-        return (PTL_OK);
+        kibnal_launch_tx(tx, target.nid);
+        return (0);
 }
 
-ptl_err_t
-kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
-               ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
-               unsigned int payload_niov, struct iovec *payload_iov,
-               size_t payload_offset, size_t payload_len)
+int
+kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+                   void **new_private)
 {
-        return (kibnal_sendmsg(nal, private, cookie,
-                               hdr, type, nid, pid,
-                               payload_niov, payload_iov, NULL,
-                               payload_offset, payload_len));
-}
+        kib_rx_t    *rx = private;
+        kib_conn_t  *conn = rx->rx_conn;
 
-ptl_err_t
-kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
-                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
-                     unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
-                     size_t payload_offset, size_t payload_len)
-{
-        return (kibnal_sendmsg(nal, private, cookie,
-                               hdr, type, nid, pid,
-                               payload_niov, NULL, payload_kiov,
-                               payload_offset, payload_len));
+        if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
+                /* Can't block if RDMA completions need normal credits */
+                LCONSOLE_ERROR("Dropping message from %s: no buffers free. "
+                               "%s is running an old version of LNET that may "
+                               "deadlock if messages wait for buffers)\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                return -EDEADLK;
+        }
+        
+        *new_private = private;
+        return 0;
 }
 
-ptl_err_t
-kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
-                 unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
-                 int offset, int mlen, int rlen)
+int
+kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+             int delayed, unsigned int niov,
+             struct iovec *iov, lnet_kiov_t *kiov,
+             unsigned int offset, unsigned int mlen, unsigned int rlen)
 {
         kib_rx_t    *rx = private;
         kib_msg_t   *rxmsg = rx->rx_msg;
         int          msg_nob;
+        int          rc = 0;
         
         LASSERT (mlen <= rlen);
         LASSERT (!in_interrupt ());
@@ -1386,59 +1388,58 @@ kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
         switch (rxmsg->ibm_type) {
         default:
                 LBUG();
-                return (PTL_FAIL);
-                
+
         case IBNAL_MSG_IMMEDIATE:
                 msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
-                if (msg_nob > IBNAL_MSG_SIZE) {
-                        CERROR ("Immediate message from "LPX64" too big: %d\n",
-                                rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
-                        return (PTL_FAIL);
+                if (msg_nob > rx->rx_nob) {
+                        CERROR ("Immediate message from %s too big: %d(%d)\n",
+                                libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
+                                msg_nob, rx->rx_nob);
+                        rc = -EPROTO;
+                        break;
                 }
 
                 if (kiov != NULL)
-                        lib_copy_buf2kiov(niov, kiov, offset,
-                                          rxmsg->ibm_u.immediate.ibim_payload,
-                                          mlen);
+                        lnet_copy_flat2kiov(
+                                niov, kiov, offset, 
+                                IBNAL_MSG_SIZE, rxmsg,
+                                offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                mlen);
                 else
-                        lib_copy_buf2iov(niov, iov, offset,
-                                         rxmsg->ibm_u.immediate.ibim_payload,
-                                         mlen);
+                        lnet_copy_flat2iov(
+                                niov, iov, offset,
+                                IBNAL_MSG_SIZE, rxmsg,
+                                offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                mlen);
 
-                lib_finalize (nal, NULL, libmsg, PTL_OK);
-                return (PTL_OK);
+                lnet_finalize (ni, lntmsg, 0);
+                break;
 
         case IBNAL_MSG_GET_RDMA:
-                /* We get called here just to discard any junk after the
-                 * GET hdr. */
-                LASSERT (libmsg == NULL);
-                lib_finalize (nal, NULL, libmsg, PTL_OK);
-                return (PTL_OK);
+                if (lntmsg != NULL) {
+                        /* GET matched: RDMA lntmsg's payload */
+                        kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
+                                                 rx, lntmsg, 
+                                                 lntmsg->msg_niov, 
+                                                 lntmsg->msg_iov, 
+                                                 lntmsg->msg_kiov,
+                                                 lntmsg->msg_offset, 
+                                                 lntmsg->msg_len);
+                } else {
+                        /* GET didn't match anything */
+                        kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -ENODATA,
+                                                  rx, NULL, 0, NULL, NULL, 0, 0);
+                }
+                break;
 
         case IBNAL_MSG_PUT_RDMA:
-                kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0,
-                                          rx, libmsg, 
+                kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, rx, lntmsg,
                                           niov, iov, kiov, offset, mlen);
-                return (PTL_OK);
+                break;
         }
-}
 
-ptl_err_t
-kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
-              unsigned int niov, struct iovec *iov, 
-              size_t offset, size_t mlen, size_t rlen)
-{
-        return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
-                                offset, mlen, rlen));
-}
-
-ptl_err_t
-kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
-                     unsigned int niov, ptl_kiov_t *kiov, 
-                     size_t offset, size_t mlen, size_t rlen)
-{
-        return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
-                                offset, mlen, rlen));
+        kibnal_post_rx(rx, 1, 0);
+        return rc;
 }
 
 int
@@ -1460,6 +1461,40 @@ kibnal_thread_fini (void)
 }
 
 void
+kibnal_peer_alive (kib_peer_t *peer)
+{
+        /* This is racy, but everyone's only writing cfs_time_current() */
+        peer->ibp_last_alive = cfs_time_current();
+        mb();
+}
+
+void
+kibnal_peer_notify (kib_peer_t *peer)
+{
+        time_t        last_alive = 0;
+        int           error = 0;
+        unsigned long flags;
+        
+        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+
+        if (list_empty(&peer->ibp_conns) &&
+            peer->ibp_accepting == 0 &&
+            peer->ibp_connecting == 0 &&
+            peer->ibp_error != 0) {
+                error = peer->ibp_error;
+                peer->ibp_error = 0;
+                last_alive = cfs_time_current_sec() -
+                             cfs_duration_sec(cfs_time_current() -
+                                              peer->ibp_last_alive);
+        }
+        
+        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+        
+        if (error != 0)
+                lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive);
+}
+
+void
 kibnal_close_conn_locked (kib_conn_t *conn, int error)
 {
         /* This just does the immmediate housekeeping, and schedules the
@@ -1467,8 +1502,9 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error)
          * Caller holds kib_global_lock exclusively in irq context */
         kib_peer_t   *peer = conn->ibc_peer;
 
-        CDEBUG (error == 0 ? D_NET : D_ERROR,
-                "closing conn to "LPX64": error %d\n", peer->ibp_nid, error);
+        CDEBUG (error == 0 ? D_NET : D_NETERROR,
+                "closing conn to %s: error %d\n", 
+                libcfs_nid2str(peer->ibp_nid), error);
         
         LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED ||
                  conn->ibc_state == IBNAL_CONN_CONNECTING);
@@ -1478,16 +1514,15 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error)
                 list_del (&conn->ibc_list);
         } else {
                 /* new ref for kib_reaper_conns */
-                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                       atomic_read (&conn->ibc_refcount));
-                atomic_inc (&conn->ibc_refcount);
+                kibnal_conn_addref(conn);
         }
         
-        if (list_empty (&peer->ibp_conns) &&    /* no more conns */
-            peer->ibp_persistence == 0 &&       /* non-persistent peer */
-            kibnal_peer_active(peer)) {         /* still in peer table */
-                kibnal_unlink_peer_locked (peer);
+        if (list_empty (&peer->ibp_conns)) {   /* no more conns */
+                if (peer->ibp_persistence == 0 && /* non-persistent peer */
+                    kibnal_peer_active(peer))     /* still in peer table */
+                        kibnal_unlink_peer_locked (peer);
+
+                peer->ibp_error = error; /* set/clear error on last conn */
         }
 
         conn->ibc_state = IBNAL_CONN_DEATHROW;
@@ -1521,21 +1556,25 @@ kibnal_close_conn (kib_conn_t *conn, int why)
 }
 
 void
-kibnal_peer_connect_failed (kib_peer_t *peer, int rc)
+kibnal_peer_connect_failed (kib_peer_t *peer, int active, int error)
 {
         LIST_HEAD        (zombies);
-        kib_tx_t         *tx;
         unsigned long     flags;
 
-        LASSERT (rc != 0);
-        LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
+        LASSERT(error != 0);
 
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
-        LASSERT (peer->ibp_connecting != 0);
-        peer->ibp_connecting--;
+        if (active) {
+                LASSERT (peer->ibp_connecting != 0);
+                peer->ibp_connecting--;
+        } else {
+                LASSERT (peer->ibp_accepting != 0);
+                peer->ibp_accepting--;
+        }
 
-        if (peer->ibp_connecting != 0) {
+        if (peer->ibp_connecting != 0 ||
+            peer->ibp_accepting != 0) {
                 /* another connection attempt under way... */
                 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
                 return;
@@ -1543,26 +1582,29 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int rc)
 
         if (list_empty(&peer->ibp_conns)) {
                 /* Say when active connection can be re-attempted */
-                peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
-                /* Increase reconnection interval */
-                peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
-                                                    IBNAL_MAX_RECONNECT_INTERVAL);
+                peer->ibp_reconnect_interval *= 2;
+                peer->ibp_reconnect_interval =
+                        MAX(peer->ibp_reconnect_interval,
+                            *kibnal_tunables.kib_min_reconnect_interval);
+                peer->ibp_reconnect_interval =
+                        MIN(peer->ibp_reconnect_interval,
+                            *kibnal_tunables.kib_max_reconnect_interval);
+                
+                peer->ibp_reconnect_time = jiffies + 
+                                           peer->ibp_reconnect_interval * HZ;
         
-                /* Take peer's blocked blocked transmits; I'll complete
+                /* Take peer's blocked transmits; I'll complete
                  * them with error */
-                while (!list_empty (&peer->ibp_tx_queue)) {
-                        tx = list_entry (peer->ibp_tx_queue.next,
-                                         kib_tx_t, tx_list);
-                        
-                        list_del (&tx->tx_list);
-                        list_add_tail (&tx->tx_list, &zombies);
-                }
+                list_add(&zombies, &peer->ibp_tx_queue);
+                list_del_init(&peer->ibp_tx_queue);
                 
                 if (kibnal_peer_active(peer) &&
                     (peer->ibp_persistence == 0)) {
                         /* failed connection attempt on non-persistent peer */
                         kibnal_unlink_peer_locked (peer);
                 }
+
+                peer->ibp_error = error;
         } else {
                 /* Can't have blocked transmits if there are connections */
                 LASSERT (list_empty(&peer->ibp_tx_queue));
@@ -1570,22 +1612,17 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int rc)
         
         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
+        kibnal_peer_notify(peer);
+        
         if (!list_empty (&zombies))
-                CERROR ("Deleting messages for "LPX64": connection failed\n",
-                        peer->ibp_nid);
-
-        while (!list_empty (&zombies)) {
-                tx = list_entry (zombies.next, kib_tx_t, tx_list);
+                CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n",
+                        libcfs_nid2str(peer->ibp_nid));
 
-                list_del (&tx->tx_list);
-                /* complete now */
-                tx->tx_status = -EHOSTUNREACH;
-                kibnal_tx_done (tx);
-        }
+        kibnal_txlist_done(&zombies, -EHOSTUNREACH);
 }
 
 void
-kibnal_connreq_done (kib_conn_t *conn, int status)
+kibnal_connreq_done (kib_conn_t *conn, int active, int status)
 {
         int               state = conn->ibc_state;
         kib_peer_t       *peer = conn->ibc_peer;
@@ -1595,7 +1632,7 @@ kibnal_connreq_done (kib_conn_t *conn, int status)
         int               i;
 
         if (conn->ibc_connreq != NULL) {
-                PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
+                LIBCFS_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
                 conn->ibc_connreq = NULL;
         }
 
@@ -1628,24 +1665,29 @@ kibnal_connreq_done (kib_conn_t *conn, int status)
         
         write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
 
-        LASSERT (peer->ibp_connecting != 0);
+        if (active)
+                LASSERT (peer->ibp_connecting != 0);
+        else
+                LASSERT (peer->ibp_accepting != 0);
         
         if (status == 0 &&                      /* connection established */
             kibnal_peer_active(peer)) {         /* peer not deleted */
 
-                peer->ibp_connecting--;
+                if (active)
+                        peer->ibp_connecting--;
+                else
+                        peer->ibp_accepting--;
+
+                conn->ibc_last_send = jiffies;
                 conn->ibc_state = IBNAL_CONN_ESTABLISHED;
+                kibnal_peer_alive(peer);
 
                 /* +1 ref for ibc_list; caller(== CM)'s ref remains until
                  * the IB_CM_IDLE callback */
-                CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                       atomic_read (&conn->ibc_refcount));
-                atomic_inc (&conn->ibc_refcount);
+                kibnal_conn_addref(conn);
                 list_add (&conn->ibc_list, &peer->ibp_conns);
-                
-                /* reset reconnect interval for next attempt */
-                peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+
+                peer->ibp_reconnect_interval = 0; /* OK to reconnect at any time */
 
                 /* post blocked sends to the new connection */
                 spin_lock (&conn->ibc_lock);
@@ -1656,11 +1698,6 @@ kibnal_connreq_done (kib_conn_t *conn, int status)
                         
                         list_del (&tx->tx_list);
 
-                        /* +1 ref for each tx */
-                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                               atomic_read (&conn->ibc_refcount));
-                        atomic_inc (&conn->ibc_refcount);
                         kibnal_queue_tx_locked (tx, conn);
                 }
                 
@@ -1675,16 +1712,13 @@ kibnal_connreq_done (kib_conn_t *conn, int status)
                 /* queue up all the receives */
                 for (i = 0; i < IBNAL_RX_MSGS; i++) {
                         /* +1 ref for rx desc */
-                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                               conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                               atomic_read (&conn->ibc_refcount));
-                        atomic_inc (&conn->ibc_refcount);
+                        kibnal_conn_addref(conn);
 
                         CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n",
                                i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
                                conn->ibc_rxs[i].rx_vaddr);
 
-                        kibnal_post_rx (&conn->ibc_rxs[i], 0);
+                        kibnal_post_rx (&conn->ibc_rxs[i], 0, 0);
                 }
 
                 kibnal_check_sends (conn);
@@ -1703,12 +1737,12 @@ kibnal_connreq_done (kib_conn_t *conn, int status)
 
         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
-        kibnal_peer_connect_failed (conn->ibc_peer, status);
+        kibnal_peer_connect_failed (conn->ibc_peer, active, status);
 }
 
 int
-kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
-               kib_msg_t *msg, int nob)
+kibnal_accept_connreq (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
+                       kib_msg_t *msg, int nob)
 {
         kib_conn_t    *conn;
         kib_peer_t    *peer;
@@ -1716,23 +1750,24 @@ kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
         unsigned long  flags;
         int            rc;
 
-        rc = kibnal_unpack_msg(msg, nob);
+        rc = kibnal_unpack_msg(msg, 0, nob);
         if (rc != 0) {
                 CERROR("Can't unpack connreq msg: %d\n", rc);
                 return -EPROTO;
         }
 
-        CDEBUG(D_NET, "connreq from "LPX64"\n", msg->ibm_srcnid);
+        CDEBUG(D_NET, "connreq from %s\n", libcfs_nid2str(msg->ibm_srcnid));
 
         if (msg->ibm_type != IBNAL_MSG_CONNREQ) {
-                CERROR("Unexpected connreq msg type: %x from "LPX64"\n",
-                       msg->ibm_type, msg->ibm_srcnid);
+                CERROR("Unexpected connreq msg type: %x from %s\n",
+                       msg->ibm_type, libcfs_nid2str(msg->ibm_srcnid));
                 return -EPROTO;
         }
                 
         if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
-                CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n",
-                       msg->ibm_srcnid, msg->ibm_u.connparams.ibcp_queue_depth, 
+                CERROR("Can't accept %s: bad queue depth %d (%d expected)\n",
+                       libcfs_nid2str(msg->ibm_srcnid), 
+                       msg->ibm_u.connparams.ibcp_queue_depth, 
                        IBNAL_MSG_QUEUE_SIZE);
                 return (-EPROTO);
         }
@@ -1742,13 +1777,9 @@ kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
                 return (-ENOMEM);
 
         /* assume 'nid' is a new peer */
-        peer = kibnal_create_peer (msg->ibm_srcnid);
-        if (peer == NULL) {
-                CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n",
-                       conn, conn->ibc_state, conn->ibc_peer->ibp_nid,
-                       atomic_read (&conn->ibc_refcount));
-                atomic_dec (&conn->ibc_refcount);
-                kibnal_destroy_conn(conn);
+        rc = kibnal_create_peer(&peer, msg->ibm_srcnid);
+        if (rc != 0) {
+                kibnal_conn_decref(conn);
                 return (-ENOMEM);
         }
         
@@ -1758,31 +1789,47 @@ kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
          * NB If my incarnation changes after this, the peer will get nuked and
          * we'll spot that when the connection is finally added into the peer's
          * connlist */
-        if (msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
+        if (!lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
+                                     msg->ibm_dstnid) ||
             msg->ibm_dststamp != kibnal_data.kib_incarnation) {
                 write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
                 
-                CERROR("Stale connection params from "LPX64"\n",
-                       msg->ibm_srcnid);
-                atomic_dec(&conn->ibc_refcount);
-                kibnal_destroy_conn(conn);
-                kibnal_put_peer(peer);
+                CERROR("Stale connection params from %s\n",
+                       libcfs_nid2str(msg->ibm_srcnid));
+                kibnal_conn_decref(conn);
+                kibnal_peer_decref(peer);
                 return -ESTALE;
         }
 
         peer2 = kibnal_find_peer_locked(msg->ibm_srcnid);
         if (peer2 == NULL) {
+                /* Brand new peer */
+                LASSERT (peer->ibp_accepting == 0);
+
                 /* peer table takes my ref on peer */
                 list_add_tail (&peer->ibp_list,
                                kibnal_nid2peerlist(msg->ibm_srcnid));
         } else {
-                kibnal_put_peer (peer);
+                /* tie-break connection race in favour of the higher NID */                
+                if (peer2->ibp_connecting != 0 &&
+                    msg->ibm_srcnid < kibnal_data.kib_ni->ni_nid) {
+                        write_unlock_irqrestore(&kibnal_data.kib_global_lock,
+                                                flags);
+                        CWARN("Conn race %s\n",
+                              libcfs_nid2str(peer2->ibp_nid));
+
+                        kibnal_conn_decref(conn);
+                        kibnal_peer_decref(peer);
+                        return -EALREADY;
+                }
+
+                kibnal_peer_decref(peer);
                 peer = peer2;
         }
 
         /* +1 ref for conn */
-        atomic_inc (&peer->ibp_refcount);
-        peer->ibp_connecting++;
+        kibnal_peer_addref(peer);
+        peer->ibp_accepting++;
 
         write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
 
@@ -1791,6 +1838,8 @@ kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid,
         conn->ibc_comm_id = cid;
         conn->ibc_incarnation = msg->ibm_srcstamp;
         conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+        conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
+        conn->ibc_version = msg->ibm_version;
 
         *connp = conn;
         return (0);
@@ -1807,39 +1856,72 @@ kibnal_bad_conn_callback (tTS_IB_CM_EVENT event,
         return TS_IB_CM_CALLBACK_PROCEED;
 }
 
-tTS_IB_CM_CALLBACK_RETURN
-kibnal_conn_callback (tTS_IB_CM_EVENT event,
-                       tTS_IB_CM_COMM_ID cid,
-                       void *param,
-                       void *arg)
+void
+kibnal_abort_txs (kib_conn_t *conn, struct list_head *txs)
 {
-        kib_conn_t       *conn = arg;
         LIST_HEAD        (zombies); 
         struct list_head *tmp;
         struct list_head *nxt;
         kib_tx_t         *tx;
         unsigned long     flags;
-        int               done;
+
+        spin_lock_irqsave (&conn->ibc_lock, flags);
+
+        list_for_each_safe (tmp, nxt, txs) {
+                tx = list_entry (tmp, kib_tx_t, tx_list);
+
+                if (txs == &conn->ibc_active_txs) {
+                        LASSERT (tx->tx_passive_rdma ||
+                                 !tx->tx_passive_rdma_wait);
+
+                        LASSERT (tx->tx_passive_rdma_wait ||
+                                 tx->tx_sending != 0);
+                } else {
+                        LASSERT (!tx->tx_passive_rdma_wait);
+                        LASSERT (tx->tx_sending == 0);
+                }
+
+                tx->tx_status = -ECONNABORTED;
+                tx->tx_passive_rdma_wait = 0;
+
+                if (tx->tx_sending == 0) {
+                        list_del (&tx->tx_list);
+                        list_add (&tx->tx_list, &zombies);
+                }
+        }
+        
+        spin_unlock_irqrestore (&conn->ibc_lock, flags);
+
+        kibnal_txlist_done (&zombies, -ECONNABORTED);
+}
+
+tTS_IB_CM_CALLBACK_RETURN
+kibnal_conn_callback (tTS_IB_CM_EVENT event,
+                      tTS_IB_CM_COMM_ID cid,
+                      void *param,
+                      void *arg)
+{
+        kib_conn_t       *conn = arg;
         int               rc;
 
         /* Established Connection Notifier */
 
         switch (event) {
         default:
-                CERROR("Connection %p -> "LPX64" ERROR %d\n",
-                       conn, conn->ibc_peer->ibp_nid, event);
+                CDEBUG(D_NETERROR, "Connection %p -> %s ERROR %d\n",
+                       conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), event);
                 kibnal_close_conn (conn, -ECONNABORTED);
                 break;
                 
         case TS_IB_CM_DISCONNECTED:
-                CWARN("Connection %p -> "LPX64" DISCONNECTED.\n",
-                       conn, conn->ibc_peer->ibp_nid);
+                CDEBUG(D_NETERROR, "Connection %p -> %s DISCONNECTED.\n",
+                       conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                 kibnal_close_conn (conn, 0);
                 break;
 
         case TS_IB_CM_IDLE:
-                CDEBUG(D_NET, "Connection %p -> "LPX64" IDLE.\n",
-                       conn, conn->ibc_peer->ibp_nid);
+                CDEBUG(D_NET, "Connection %p -> %s IDLE.\n",
+                       conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 
                 /* LASSERT (no further callbacks) */
                 rc = tsIbCmCallbackModify(cid, kibnal_bad_conn_callback, conn);
@@ -1849,51 +1931,12 @@ kibnal_conn_callback (tTS_IB_CM_EVENT event,
                  * completing outstanding passive RDMAs so we can be sure
                  * the network can't touch the mapped memory any more. */
 
-                spin_lock_irqsave (&conn->ibc_lock, flags);
-
-                /* grab passive RDMAs not waiting for the tx callback */
-                list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
-                        tx = list_entry (tmp, kib_tx_t, tx_list);
-
-                        LASSERT (tx->tx_passive_rdma ||
-                                 !tx->tx_passive_rdma_wait);
-
-                        LASSERT (tx->tx_passive_rdma_wait ||
-                                 tx->tx_sending != 0);
-
-                        /* still waiting for tx callback? */
-                        if (!tx->tx_passive_rdma_wait)
-                                continue;
-
-                        tx->tx_status = -ECONNABORTED;
-                        tx->tx_passive_rdma_wait = 0;
-                        done = (tx->tx_sending == 0);
-
-                        if (!done)
-                                continue;
-
-                        list_del (&tx->tx_list);
-                        list_add (&tx->tx_list, &zombies);
-                }
-
-                /* grab all blocked transmits */
-                list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
-                        tx = list_entry (tmp, kib_tx_t, tx_list);
-                        
-                        list_del (&tx->tx_list);
-                        list_add (&tx->tx_list, &zombies);
-                }
+                kibnal_abort_txs(conn, &conn->ibc_tx_queue);
+                kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
+                kibnal_abort_txs(conn, &conn->ibc_tx_queue_nocred);
+                kibnal_abort_txs(conn, &conn->ibc_active_txs);
                 
-                spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
-                while (!list_empty(&zombies)) {
-                        tx = list_entry (zombies.next, kib_tx_t, tx_list);
-
-                        list_del(&tx->tx_list);
-                        kibnal_tx_done (tx);
-                }
-
-                kibnal_put_conn (conn);        /* Lose CM's ref */
+                kibnal_conn_decref(conn);        /* Lose CM's ref */
                 break;
         }
 
@@ -1902,9 +1945,9 @@ kibnal_conn_callback (tTS_IB_CM_EVENT event,
 
 tTS_IB_CM_CALLBACK_RETURN
 kibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
-                               tTS_IB_CM_COMM_ID cid,
-                               void *param,
-                               void *arg)
+                              tTS_IB_CM_COMM_ID cid,
+                              void *param,
+                              void *arg)
 {
         kib_conn_t  *conn = arg;
         int          rc;
@@ -1917,11 +1960,11 @@ kibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
                         return TS_IB_CM_CALLBACK_ABORT;
                 }
                 
-                CERROR ("%s event %p -> "LPX64": %d\n",
+                CERROR ("%s event %p -> %s: %d\n",
                         (event == TS_IB_CM_IDLE) ? "IDLE" : "Unexpected",
-                        conn, conn->ibc_peer->ibp_nid, event);
-                kibnal_connreq_done(conn, -ECONNABORTED);
-                kibnal_put_conn(conn); /* drop CM's ref */
+                        conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), event);
+                kibnal_connreq_done(conn, 0, -ECONNABORTED);
+                kibnal_conn_decref(conn); /* drop CM's ref */
                 return TS_IB_CM_CALLBACK_ABORT;
                 
         case TS_IB_CM_REQ_RECEIVED: {
@@ -1931,13 +1974,13 @@ kibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
                 LASSERT (conn == NULL);
 
                 /* Don't really know srcnid until successful unpack */
-                CDEBUG(D_NET, "REQ from ?"LPX64"?\n", msg->ibm_srcnid);
+                CDEBUG(D_NET, "REQ from ?%s?\n", libcfs_nid2str(msg->ibm_srcnid));
 
-                rc = kibnal_accept(&conn, cid, msg, 
-                                   req->remote_private_data_len);
+                rc = kibnal_accept_connreq(&conn, cid, msg, 
+                                           req->remote_private_data_len);
                 if (rc != 0) {
-                        CERROR ("Can't accept ?"LPX64"?: %d\n",
-                                msg->ibm_srcnid, rc);
+                        CERROR ("Can't accept ?%s?: %d\n",
+                                libcfs_nid2str(msg->ibm_srcnid), rc);
                         return TS_IB_CM_CALLBACK_ABORT;
                 }
 
@@ -1951,7 +1994,7 @@ kibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
 
                 msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
 
-                kibnal_pack_msg(msg, 0, 
+                kibnal_pack_msg(msg, conn->ibc_version, 0, 
                                 conn->ibc_peer->ibp_nid, 
                                 conn->ibc_incarnation);
 
@@ -1968,19 +2011,19 @@ kibnal_passive_conn_callback (tTS_IB_CM_EVENT event,
 
         case TS_IB_CM_ESTABLISHED:
                 LASSERT (conn != NULL);
-                CWARN("Connection %p -> "LPX64" ESTABLISHED.\n",
-                       conn, conn->ibc_peer->ibp_nid);
+                CWARN("Connection %p -> %s ESTABLISHED.\n",
+                       conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 
-                kibnal_connreq_done(conn, 0);
+                kibnal_connreq_done(conn, 0, 0);
                 return TS_IB_CM_CALLBACK_PROCEED;
         }
 }
 
 tTS_IB_CM_CALLBACK_RETURN
 kibnal_active_conn_callback (tTS_IB_CM_EVENT event,
-                              tTS_IB_CM_COMM_ID cid,
-                              void *param,
-                              void *arg)
+                             tTS_IB_CM_COMM_ID cid,
+                             void *param,
+                             void *arg)
 {
         kib_conn_t    *conn = arg;
         unsigned long  flags;
@@ -1992,75 +2035,79 @@ kibnal_active_conn_callback (tTS_IB_CM_EVENT event,
                 int                              nob = rep->remote_private_data_len;
                 int                              rc;
 
-                rc = kibnal_unpack_msg(msg, nob);
+                rc = kibnal_unpack_msg(msg, conn->ibc_version, nob);
                 if (rc != 0) {
-                        CERROR ("Error %d unpacking conn ack from "LPX64"\n",
-                                rc, conn->ibc_peer->ibp_nid);
-                        kibnal_connreq_done(conn, rc);
-                        kibnal_put_conn(conn); /* drop CM's ref */
+                        CERROR ("Error %d unpacking conn ack from %s\n",
+                                rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        kibnal_connreq_done(conn, 1, rc);
+                        kibnal_conn_decref(conn); /* drop CM's ref */
                         return TS_IB_CM_CALLBACK_ABORT;
                 }
 
                 if (msg->ibm_type != IBNAL_MSG_CONNACK) {
-                        CERROR ("Unexpected conn ack type %d from "LPX64"\n",
-                                msg->ibm_type, conn->ibc_peer->ibp_nid);
-                        kibnal_connreq_done(conn, -EPROTO);
-                        kibnal_put_conn(conn); /* drop CM's ref */
+                        CERROR ("Unexpected conn ack type %d from %s\n",
+                                msg->ibm_type, 
+                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        kibnal_connreq_done(conn, 1, -EPROTO);
+                        kibnal_conn_decref(conn); /* drop CM's ref */
                         return TS_IB_CM_CALLBACK_ABORT;
                 }
 
-                if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+                if (!lnet_ptlcompat_matchnid(conn->ibc_peer->ibp_nid,
+                                             msg->ibm_srcnid) ||
+                    !lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
+                                             msg->ibm_dstnid) ||
                     msg->ibm_srcstamp != conn->ibc_incarnation ||
-                    msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
                     msg->ibm_dststamp != kibnal_data.kib_incarnation) {
-                        CERROR("Stale conn ack from "LPX64"\n",
-                               conn->ibc_peer->ibp_nid);
-                        kibnal_connreq_done(conn, -ESTALE);
-                        kibnal_put_conn(conn); /* drop CM's ref */
+                        CERROR("Stale conn ack from %s\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        kibnal_connreq_done(conn, 1, -ESTALE);
+                        kibnal_conn_decref(conn); /* drop CM's ref */
                         return TS_IB_CM_CALLBACK_ABORT;
                 }
 
                 if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
-                        CERROR ("Bad queue depth %d from "LPX64"\n",
+                        CERROR ("Bad queue depth %d from %s\n",
                                 msg->ibm_u.connparams.ibcp_queue_depth,
-                                conn->ibc_peer->ibp_nid);
-                        kibnal_connreq_done(conn, -EPROTO);
-                        kibnal_put_conn(conn); /* drop CM's ref */
+                                libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        kibnal_connreq_done(conn, 1, -EPROTO);
+                        kibnal_conn_decref(conn); /* drop CM's ref */
                         return TS_IB_CM_CALLBACK_ABORT;
                 }
                                 
-                CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n",
-                       conn, conn->ibc_peer->ibp_nid);
+                CDEBUG(D_NET, "Connection %p -> %s REP_RECEIVED.\n",
+                       conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 
                 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+                conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
                 return TS_IB_CM_CALLBACK_PROCEED;
         }
 
         case TS_IB_CM_ESTABLISHED:
-                CWARN("Connection %p -> "LPX64" ESTABLISHED\n",
-                       conn, conn->ibc_peer->ibp_nid);
+                CWARN("Connection %p -> %s ESTABLISHED\n",
+                       conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 
-                kibnal_connreq_done(conn, 0);
+                kibnal_connreq_done(conn, 1, 0);
                 return TS_IB_CM_CALLBACK_PROCEED;
 
         case TS_IB_CM_IDLE:
-                CERROR("Connection %p -> "LPX64" IDLE\n",
-                       conn, conn->ibc_peer->ibp_nid);
+                CDEBUG(D_NETERROR, "Connection %p -> %s IDLE\n",
+                       conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                 /* I assume this connection attempt was rejected because the
                  * peer found a stale QP; I'll just try again */
                 write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
                 kibnal_schedule_active_connect_locked(conn->ibc_peer);
                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 
-                kibnal_connreq_done(conn, -ECONNABORTED);
-                kibnal_put_conn(conn); /* drop CM's ref */
+                kibnal_connreq_done(conn, 1, -ECONNABORTED);
+                kibnal_conn_decref(conn); /* drop CM's ref */
                 return TS_IB_CM_CALLBACK_ABORT;
 
         default:
-                CERROR("Connection %p -> "LPX64" ERROR %d\n",
-                       conn, conn->ibc_peer->ibp_nid, event);
-                kibnal_connreq_done(conn, -ECONNABORTED);
-                kibnal_put_conn(conn); /* drop CM's ref */
+                CDEBUG(D_NETERROR, "Connection %p -> %s ERROR %d\n",
+                       conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), event);
+                kibnal_connreq_done(conn, 1, -ECONNABORTED);
+                kibnal_conn_decref(conn); /* drop CM's ref */
                 return TS_IB_CM_CALLBACK_ABORT;
         }
 }
@@ -2075,10 +2122,10 @@ kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
         kib_msg_t  *msg = &conn->ibc_connreq->cr_msg;
 
         if (status != 0) {
-                CERROR ("Pathreq %p -> "LPX64" failed: %d\n",
-                        conn, conn->ibc_peer->ibp_nid, status);
-                kibnal_connreq_done(conn, status);
-                kibnal_put_conn(conn); /* drop callback's ref */
+                CDEBUG (D_NETERROR, "Pathreq %p -> %s failed: %d\n",
+                        conn, libcfs_nid2str(peer->ibp_nid), status);
+                kibnal_connreq_done(conn, 1, status);
+                kibnal_conn_decref(conn); /* drop callback's ref */
                 return 1;    /* non-zero prevents further callbacks */
         }
 
@@ -2086,7 +2133,8 @@ kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
 
         kibnal_init_msg(msg, IBNAL_MSG_CONNREQ, sizeof(msg->ibm_u.connparams));
         msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
-        kibnal_pack_msg(msg, 0, peer->ibp_nid, conn->ibc_incarnation);
+        kibnal_pack_msg(msg, conn->ibc_version, 0, 
+                        peer->ibp_nid, conn->ibc_incarnation);
 
         conn->ibc_connreq->cr_connparam = (struct ib_cm_active_param) {
                 .qp                   = conn->ibc_qp,
@@ -2096,7 +2144,7 @@ kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
                 .initiator_depth      = IBNAL_RESPONDER_RESOURCES,
                 .retry_count          = IBNAL_RETRY,
                 .rnr_retry_count      = IBNAL_RNR_RETRY,
-                .cm_response_timeout  = kibnal_tunables.kib_io_timeout,
+                .cm_response_timeout  = *kibnal_tunables.kib_timeout,
                 .max_cm_retries       = IBNAL_CM_RETRY,
                 .flow_control         = IBNAL_FLOW_CONTROL,
         };
@@ -2107,8 +2155,9 @@ kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
         /* Flag I'm getting involved with the CM... */
         conn->ibc_state = IBNAL_CONN_CONNECTING;
 
-        CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n",
-               conn->ibc_connreq->cr_svcrsp.ibsr_svc_id, peer->ibp_nid);
+        CDEBUG(D_NET, "Connecting to, service id "LPX64", on %s\n",
+               conn->ibc_connreq->cr_svcrsp.ibsr_svc_id, 
+               libcfs_nid2str(peer->ibp_nid));
 
         /* kibnal_connect_callback gets my conn ref */
         status = ib_cm_connect (&conn->ibc_connreq->cr_connparam, 
@@ -2117,12 +2166,12 @@ kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status,
                                 kibnal_active_conn_callback, conn,
                                 &conn->ibc_comm_id);
         if (status != 0) {
-                CERROR ("Connect %p -> "LPX64" failed: %d\n",
-                        conn, conn->ibc_peer->ibp_nid, status);
+                CERROR ("Connect %p -> %s failed: %d\n",
+                        conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), status);
                 /* Back out state change: I've not got a CM comm_id yet... */
                 conn->ibc_state = IBNAL_CONN_INIT_QP;
-                kibnal_connreq_done(conn, status);
-                kibnal_put_conn(conn); /* Drop callback's ref */
+                kibnal_connreq_done(conn, 1, status);
+                kibnal_conn_decref(conn); /* Drop callback's ref */
         }
         
         return 1;    /* non-zero to prevent further callbacks */
@@ -2137,18 +2186,18 @@ kibnal_connect_peer (kib_peer_t *peer)
         conn = kibnal_create_conn();
         if (conn == NULL) {
                 CERROR ("Can't allocate conn\n");
-                kibnal_peer_connect_failed (peer, -ENOMEM);
+                kibnal_peer_connect_failed (peer, 1, -ENOMEM);
                 return;
         }
 
         conn->ibc_peer = peer;
-        atomic_inc (&peer->ibp_refcount);
+        kibnal_peer_addref(peer);
 
-        PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
+        LIBCFS_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
         if (conn->ibc_connreq == NULL) {
                 CERROR ("Can't allocate connreq\n");
-                kibnal_connreq_done(conn, -ENOMEM);
-                kibnal_put_conn(conn); /* drop my ref */
+                kibnal_connreq_done(conn, 1, -ENOMEM);
+                kibnal_conn_decref(conn); /* drop my ref */
                 return;
         }
 
@@ -2156,8 +2205,8 @@ kibnal_connect_peer (kib_peer_t *peer)
 
         rc = kibnal_make_svcqry(conn);
         if (rc != 0) {
-                kibnal_connreq_done (conn, rc);
-                kibnal_put_conn(conn); /* drop my ref */
+                kibnal_connreq_done (conn, 1, rc);
+                kibnal_conn_decref(conn); /* drop my ref */
                 return;
         }
 
@@ -2173,58 +2222,60 @@ kibnal_connect_peer (kib_peer_t *peer)
                                     conn->ibc_connreq->cr_svcrsp.ibsr_svc_gid,
                                     conn->ibc_connreq->cr_svcrsp.ibsr_svc_pkey,
                                     0,
-                                    kibnal_tunables.kib_io_timeout * HZ,
+                                    *kibnal_tunables.kib_timeout * HZ,
                                     0,
                                     kibnal_pathreq_callback, conn, 
                                     &conn->ibc_connreq->cr_tid);
         if (rc == 0)
                 return; /* callback now has my ref on conn */
 
-        CERROR ("Path record request %p -> "LPX64" failed: %d\n",
-                conn, conn->ibc_peer->ibp_nid, rc);
-        kibnal_connreq_done(conn, rc);
-        kibnal_put_conn(conn); /* drop my ref */
+        CERROR ("Path record request %p -> %s failed: %d\n",
+                conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+        kibnal_connreq_done(conn, 1, rc);
+        kibnal_conn_decref(conn); /* drop my ref */
 }
 
 int
-kibnal_conn_timed_out (kib_conn_t *conn)
+kibnal_check_txs (kib_conn_t *conn, struct list_head *txs)
 {
         kib_tx_t          *tx;
         struct list_head  *ttmp;
         unsigned long      flags;
+        int                timed_out = 0;
 
         spin_lock_irqsave (&conn->ibc_lock, flags);
 
-        list_for_each (ttmp, &conn->ibc_tx_queue) {
+        list_for_each (ttmp, txs) {
                 tx = list_entry (ttmp, kib_tx_t, tx_list);
 
-                LASSERT (!tx->tx_passive_rdma_wait);
-                LASSERT (tx->tx_sending == 0);
+                if (txs == &conn->ibc_active_txs) {
+                        LASSERT (tx->tx_passive_rdma ||
+                                 !tx->tx_passive_rdma_wait);
 
-                if (time_after_eq (jiffies, tx->tx_deadline)) {
-                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
-                        return 1;
+                        LASSERT (tx->tx_passive_rdma_wait ||
+                                 tx->tx_sending != 0);
+                } else {
+                        LASSERT (!tx->tx_passive_rdma_wait);
+                        LASSERT (tx->tx_sending == 0);
                 }
-        }
-
-        list_for_each (ttmp, &conn->ibc_active_txs) {
-                tx = list_entry (ttmp, kib_tx_t, tx_list);
-
-                LASSERT (tx->tx_passive_rdma ||
-                         !tx->tx_passive_rdma_wait);
-
-                LASSERT (tx->tx_passive_rdma_wait ||
-                         tx->tx_sending != 0);
-
+                
                 if (time_after_eq (jiffies, tx->tx_deadline)) {
-                        spin_unlock_irqrestore (&conn->ibc_lock, flags);
-                        return 1;
+                        timed_out = 1;
+                        break;
                 }
         }
 
         spin_unlock_irqrestore (&conn->ibc_lock, flags);
+        return timed_out;
+}
 
-        return 0;
+int
+kibnal_conn_timed_out (kib_conn_t *conn)
+{
+        return  kibnal_check_txs(conn, &conn->ibc_tx_queue) ||
+                kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) ||
+                kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) ||
+                kibnal_check_txs(conn, &conn->ibc_active_txs);
 }
 
 void
@@ -2260,19 +2311,16 @@ kibnal_check_conns (int idx)
                         if (!kibnal_conn_timed_out(conn))
                                 continue;
                         
-                        CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
-                               conn, conn->ibc_state, peer->ibp_nid,
-                               atomic_read (&conn->ibc_refcount));
+                        kibnal_conn_addref(conn);
 
-                        atomic_inc (&conn->ibc_refcount);
                         read_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                                flags);
 
-                        CERROR("Timed out RDMA with "LPX64"\n",
-                               peer->ibp_nid);
+                        CERROR("Timed out RDMA with %s\n",
+                               libcfs_nid2str(peer->ibp_nid));
 
                         kibnal_close_conn (conn, -ETIMEDOUT);
-                        kibnal_put_conn (conn);
+                        kibnal_conn_decref(conn);
 
                         /* start again now I've dropped the lock */
                         goto again;
@@ -2293,8 +2341,10 @@ kibnal_terminate_conn (kib_conn_t *conn)
 
         rc = ib_cm_disconnect (conn->ibc_comm_id);
         if (rc != 0)
-                CERROR ("Error %d disconnecting conn %p -> "LPX64"\n",
-                        rc, conn, conn->ibc_peer->ibp_nid);
+                CERROR ("Error %d disconnecting conn %p -> %s\n",
+                        rc, conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+
+        kibnal_peer_notify(conn->ibc_peer);
 }
 
 int
@@ -2308,8 +2358,8 @@ kibnal_reaper (void *arg)
         int                peer_index = 0;
         unsigned long      deadline = jiffies;
         
-        kportal_daemonize ("kibnal_reaper");
-        kportal_blockallsigs ();
+        cfs_daemonize ("kibnal_reaper");
+        cfs_block_allsigs ();
 
         init_waitqueue_entry (&wait, current);
 
@@ -2330,9 +2380,10 @@ kibnal_reaper (void *arg)
                                  * callback and last ref reschedules it
                                  * here... */
                                 kibnal_terminate_conn(conn);
-                                kibnal_put_conn (conn);
+                                kibnal_conn_decref(conn);
                                 break;
-                                
+
+                        case IBNAL_CONN_INIT_QP:
                         case IBNAL_CONN_ZOMBIE:
                                 kibnal_destroy_conn (conn);
                                 break;
@@ -2363,9 +2414,9 @@ kibnal_reaper (void *arg)
                          * connection within (n+1)/n times the timeout
                          * interval. */
 
-                        if (kibnal_tunables.kib_io_timeout > n * p)
+                        if (*kibnal_tunables.kib_timeout > n * p)
                                 chunk = (chunk * n * p) / 
-                                        kibnal_tunables.kib_io_timeout;
+                                        *kibnal_tunables.kib_timeout;
                         if (chunk == 0)
                                 chunk = 1;
 
@@ -2409,8 +2460,8 @@ kibnal_connd (void *arg)
         int                did_something;
 
         snprintf(name, sizeof(name), "kibnal_connd_%02ld", id);
-        kportal_daemonize(name);
-        kportal_blockallsigs();
+        cfs_daemonize(name);
+        cfs_block_allsigs();
 
         init_waitqueue_entry (&wait, current);
 
@@ -2427,32 +2478,37 @@ kibnal_connd (void *arg)
                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
                         kibnal_handle_svcqry(as->ibas_sock);
-                        sock_release(as->ibas_sock);
-                        PORTAL_FREE(as, sizeof(*as));
+                        kibnal_free_acceptsock(as);
                         
                         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
                         did_something = 1;
                 }
                         
-                if (!list_empty (&kibnal_data.kib_connd_peers)) {
+                /* Only handle an outgoing connection request if there is someone left
+                 * to handle an incoming svcqry */
+                if (!list_empty (&kibnal_data.kib_connd_peers) &&
+                    ((kibnal_data.kib_connd_connecting + 1) < 
+                     *kibnal_tunables.kib_n_connd)) {
                         peer = list_entry (kibnal_data.kib_connd_peers.next,
                                            kib_peer_t, ibp_connd_list);
                         
                         list_del_init (&peer->ibp_connd_list);
+                        kibnal_data.kib_connd_connecting++;
                         spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
                         kibnal_connect_peer (peer);
-                        kibnal_put_peer (peer);
+                        kibnal_peer_decref(peer);
 
                         spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
                         did_something = 1;
+                        kibnal_data.kib_connd_connecting--;
                 }
 
                 if (did_something)
                         continue;
 
                 set_current_state (TASK_INTERRUPTIBLE);
-                add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
+                add_wait_queue_exclusive(&kibnal_data.kib_connd_waitq, &wait);
 
                 spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
 
@@ -2483,8 +2539,8 @@ kibnal_scheduler(void *arg)
         int             did_something;
 
         snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
-        kportal_daemonize(name);
-        kportal_blockallsigs();
+        cfs_daemonize(name);
+        cfs_block_allsigs();
 
         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
 
@@ -2524,7 +2580,7 @@ kibnal_scheduler(void *arg)
                         counter = 0;
 
                         if (!did_something) {
-                                rc = wait_event_interruptible(
+                                rc = wait_event_interruptible_exclusive(
                                         kibnal_data.kib_sched_waitq,
                                         !list_empty(&kibnal_data.kib_sched_txq) || 
                                         !list_empty(&kibnal_data.kib_sched_rxq) || 
@@ -2543,13 +2599,3 @@ kibnal_scheduler(void *arg)
         kibnal_thread_fini();
         return (0);
 }
-
-
-lib_nal_t kibnal_lib = {
-        libnal_data:        &kibnal_data,      /* NAL private data */
-        libnal_send:         kibnal_send,
-        libnal_send_pages:   kibnal_send_pages,
-        libnal_recv:         kibnal_recv,
-        libnal_recv_pages:   kibnal_recv_pages,
-        libnal_dist:         kibnal_dist
-};
diff --git a/lnet/klnds/openiblnd/openiblnd_modparams.c b/lnet/klnds/openiblnd/openiblnd_modparams.c
new file mode 100644 (file)
index 0000000..f40004b
--- /dev/null
@@ -0,0 +1,149 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "openiblnd.h"
+
+static char *ipif_basename = "ib";
+CFS_MODULE_PARM(ipif_basename, "s", charp, 0444,
+                "IPoIB interface base name");
+
+static int n_connd = 4;
+CFS_MODULE_PARM(n_connd, "i", int, 0444,
+                "# of connection daemons");
+
+static int min_reconnect_interval = 1;
+CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644,
+               "minimum connection retry interval (seconds)");
+
+static int max_reconnect_interval = 60;
+CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644,
+               "maximum connection retry interval (seconds)");
+
+static int concurrent_peers = 1152;
+CFS_MODULE_PARM(concurrent_peers, "i", int, 0444,
+               "maximum number of peers that may connect");
+
+static int cksum = 0;
+CFS_MODULE_PARM(cksum, "i", int, 0644,
+               "set non-zero to enable message (not RDMA) checksums");
+
+static int timeout = 50;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+               "timeout (seconds)");
+
+static int ntx = 384;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+               "# of message descriptors");
+
+static int credits = 256;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+               "# concurrent sends");
+
+static int peer_credits = 16;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+               "# concurrent sends to 1 peer");
+
+static int keepalive = 100;
+CFS_MODULE_PARM(keepalive, "i", int, 0644,
+                "Idle time in seconds before sending a keepalive");
+
+kib_tunables_t kibnal_tunables = {
+        .kib_ipif_basename          = &ipif_basename,
+       .kib_n_connd                = &n_connd,
+        .kib_min_reconnect_interval = &min_reconnect_interval,
+        .kib_max_reconnect_interval = &max_reconnect_interval,
+        .kib_concurrent_peers       = &concurrent_peers,
+       .kib_cksum                  = &cksum,
+        .kib_timeout                = &timeout,
+        .kib_ntx                    = &ntx,
+        .kib_credits                = &credits,
+        .kib_peercredits            = &peer_credits,
+        .kib_keepalive              = &keepalive,
+};
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+
+static ctl_table kibnal_ctl_table[] = {
+       {1, "ipif_basename", &ipif_basename, 
+         1024, 0444, NULL, &proc_dostring},
+       {2, "n_connd", &n_connd, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {3, "min_reconnect_interval", &min_reconnect_interval, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {4, "max_reconnect_interval", &max_reconnect_interval, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {5, "concurrent_peers", &concurrent_peers, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {6, "cksum", &cksum, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {7, "timeout", &timeout, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {8, "ntx", &ntx, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {9, "credits", &credits, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {10, "peer_credits", &peer_credits, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {11, "keepalive", &keepalive, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {0}
+};
+
+static ctl_table kibnal_top_ctl_table[] = {
+       {203, "openibnal", NULL, 0, 0555, kibnal_ctl_table},
+       {0}
+};
+
+int
+kibnal_tunables_init ()
+{
+       kibnal_tunables.kib_sysctl =
+               register_sysctl_table(kibnal_top_ctl_table, 0);
+       
+       if (kibnal_tunables.kib_sysctl == NULL)
+               CWARN("Can't setup /proc tunables\n");
+
+       return 0;
+}
+
+void
+kibnal_tunables_fini ()
+{
+       if (kibnal_tunables.kib_sysctl != NULL)
+               unregister_sysctl_table(kibnal_tunables.kib_sysctl);
+}
+
+#else
+
+int
+kibnal_tunables_init ()
+{
+       return 0;
+}
+
+void
+kibnal_tunables_fini ()
+{
+}
+
+#endif
diff --git a/lnet/klnds/ptllnd/.cvsignore b/lnet/klnds/ptllnd/.cvsignore
new file mode 100644 (file)
index 0000000..0586565
--- /dev/null
@@ -0,0 +1,11 @@
+.deps
+Makefile
+autoMakefile.in
+autoMakefile
+*.ko
+*.mod.c
+.*.flags
+.*.cmd
+.tmp_versions
+.depend
+wirecheck
diff --git a/lnet/klnds/ptllnd/Makefile.in b/lnet/klnds/ptllnd/Makefile.in
new file mode 100755 (executable)
index 0000000..ec2f9bb
--- /dev/null
@@ -0,0 +1,13 @@
+MODULES := kptllnd
+
+EXTRA_POST_CFLAGS := @PTLLNDCPPFLAGS@
+
+kptllnd-objs := ptllnd.o                \
+                ptllnd_cb.o             \
+                ptllnd_modparams.o      \
+                ptllnd_peer.o           \
+                ptllnd_rx_buf.o         \
+                ptllnd_tx.o            \
+                ptllnd_ptltrace.o
+
+@INCLUDE_RULES@
diff --git a/lnet/klnds/ptllnd/README b/lnet/klnds/ptllnd/README
new file mode 100644 (file)
index 0000000..5cb6cfc
--- /dev/null
@@ -0,0 +1,47 @@
+1. This version of the Portals LND is intended to work on the Cray XT3 using
+   Cray Portals as a network transport.
+
+2. To enable the building of the Portals LND (ptllnd.ko) configure with the
+   following option:
+   ./configure --with-portals=<path-to-portals-headers>
+
+3. The following configuration options are supported
+
+        ntx:
+            The total number of message descriptors
+
+        concurrent_peers:
+            The maximum number of concurrent peers.  Peers attempting
+            to connect beyond the maximum will not be allowed.
+
+        peer_hash_table_size:
+            The number of hash table slots for the peers. This number
+            should scale with concurrent_peers.
+
+        cksum:
+            Set to non-zero to enable message (not RDMA) checksums for
+            outgoing packets.   Incoming packets will always be checksummed
+            if necessary, independent of this value.
+
+        timeout:
+            The amount of time a request can linger in a peer's active
+            queue, before the peer is considered dead.  Units: seconds.
+
+        portal:
+            The portal ID to use for the ptllnd traffic.
+
+        rxb_npages:
+            The number of pages in a RX Buffer.
+
+        credits:
+            The maximum total number of concurrent sends that are
+            outstanding at any given instant.
+
+        peercredits:
+            The maximum number of concurrent sends that are
+            outstanding to a single peer at any given instant.
+
+        max_msg_size:
+            The maximum immediate message size.  This MUST be
+            the same on all nodes in a cluster.  A peer connecting
+            with a different max_msg_size will be rejected.
diff --git a/lnet/klnds/ptllnd/autoMakefile.am b/lnet/klnds/ptllnd/autoMakefile.am
new file mode 100755 (executable)
index 0000000..bd8cc9c
--- /dev/null
@@ -0,0 +1,8 @@
+if MODULES
+if BUILD_PTLLND
+modulenet_DATA = kptllnd$(KMODEXT)
+endif
+endif
+
+MOSTLYCLEANFILES = @MOSTLYCLEANFILES@
+DIST_SOURCES = $(kptllnd-objs:%.o=%.c) ptllnd.h
diff --git a/lnet/klnds/ptllnd/ptllnd.c b/lnet/klnds/ptllnd/ptllnd.c
new file mode 100755 (executable)
index 0000000..a82babe
--- /dev/null
@@ -0,0 +1,836 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
+ *   Author: PJ Kirner <pjkirner@clusterfs.com>
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   This file is confidential source code owned by Cluster File Systems.
+ *   No viewing, modification, compilation, redistribution, or any other
+ *   form of use is permitted except through a signed license agreement.
+ *
+ *   If you have not signed such an agreement, then you have no rights to
+ *   this file.  Please destroy it immediately and contact CFS.
+ *
+ */
+
+#include "ptllnd.h"
+
+lnd_t kptllnd_lnd = {
+        .lnd_type       = PTLLND,
+        .lnd_startup    = kptllnd_startup,
+        .lnd_shutdown   = kptllnd_shutdown,
+        .lnd_ctl        = kptllnd_ctl,
+        .lnd_send       = kptllnd_send,
+        .lnd_recv       = kptllnd_recv,
+        .lnd_eager_recv = kptllnd_eager_recv,
+};
+
+kptl_data_t kptllnd_data;
+
+char *
+kptllnd_ptlid2str(ptl_process_id_t id)
+{
+        static char    strs[64][32];
+        static int     idx = 0;
+
+        unsigned long  flags;
+        char          *str;
+        
+        spin_lock_irqsave(&kptllnd_data.kptl_ptlid2str_lock, flags);
+        str = strs[idx++];
+        if (idx >= sizeof(strs)/sizeof(strs[0]))
+                idx = 0;
+        spin_unlock_irqrestore(&kptllnd_data.kptl_ptlid2str_lock, flags);
+
+        snprintf(str, sizeof(strs[0]), FMT_PTLID, id.pid, id.nid);
+        return str;
+}
+
+void 
+kptllnd_assert_wire_constants (void)
+{
+        /* Wire protocol assertions generated by 'wirecheck'
+         * running on Linux fedora 2.6.11-co-0.6.4 #1 Mon Jun 19 05:36:13 UTC 2006 i686 i686 i386 GNU
+         * with gcc version 4.1.1 20060525 (Red Hat 4.1.1-1) */
+
+
+        /* Constants... */
+        CLASSERT (PTL_RESERVED_MATCHBITS == 0x100);
+        CLASSERT (LNET_MSG_MATCHBITS == 0);
+        CLASSERT (PTLLND_MSG_MAGIC == 0x50746C4E);
+        CLASSERT (PTLLND_MSG_VERSION == 0x04);
+        CLASSERT (PTLLND_RDMA_OK == 0x00);
+        CLASSERT (PTLLND_RDMA_FAIL == 0x01);
+        CLASSERT (PTLLND_MSG_TYPE_INVALID == 0x00);
+        CLASSERT (PTLLND_MSG_TYPE_PUT == 0x01);
+        CLASSERT (PTLLND_MSG_TYPE_GET == 0x02);
+        CLASSERT (PTLLND_MSG_TYPE_IMMEDIATE == 0x03);
+        CLASSERT (PTLLND_MSG_TYPE_NOOP == 0x04);
+        CLASSERT (PTLLND_MSG_TYPE_HELLO == 0x05);
+        CLASSERT (PTLLND_MSG_TYPE_NAK == 0x06);
+
+        /* Checks for struct kptl_msg_t */
+        CLASSERT ((int)sizeof(kptl_msg_t) == 136);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_magic) == 0);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_magic) == 4);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_version) == 4);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_version) == 2);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_type) == 6);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_type) == 1);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_credits) == 7);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_credits) == 1);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_nob) == 8);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_nob) == 4);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_cksum) == 12);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_cksum) == 4);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_srcnid) == 16);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_srcnid) == 8);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_srcstamp) == 24);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_srcstamp) == 8);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_dstnid) == 32);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_dstnid) == 8);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_dststamp) == 40);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_dststamp) == 8);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_srcpid) == 48);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_srcpid) == 4);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_dstpid) == 52);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_dstpid) == 4);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_u.immediate) == 56);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_u.immediate) == 72);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_u.rdma) == 56);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_u.rdma) == 80);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_u.hello) == 56);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_u.hello) == 12);
+
+        /* Checks for struct kptl_immediate_msg_t */
+        CLASSERT ((int)sizeof(kptl_immediate_msg_t) == 72);
+        CLASSERT ((int)offsetof(kptl_immediate_msg_t, kptlim_hdr) == 0);
+        CLASSERT ((int)sizeof(((kptl_immediate_msg_t *)0)->kptlim_hdr) == 72);
+        CLASSERT ((int)offsetof(kptl_immediate_msg_t, kptlim_payload[13]) == 85);
+        CLASSERT ((int)sizeof(((kptl_immediate_msg_t *)0)->kptlim_payload[13]) == 1);
+
+        /* Checks for struct kptl_rdma_msg_t */
+        CLASSERT ((int)sizeof(kptl_rdma_msg_t) == 80);
+        CLASSERT ((int)offsetof(kptl_rdma_msg_t, kptlrm_hdr) == 0);
+        CLASSERT ((int)sizeof(((kptl_rdma_msg_t *)0)->kptlrm_hdr) == 72);
+        CLASSERT ((int)offsetof(kptl_rdma_msg_t, kptlrm_matchbits) == 72);
+        CLASSERT ((int)sizeof(((kptl_rdma_msg_t *)0)->kptlrm_matchbits) == 8);
+
+        /* Checks for struct kptl_hello_msg_t */
+        CLASSERT ((int)sizeof(kptl_hello_msg_t) == 12);
+        CLASSERT ((int)offsetof(kptl_hello_msg_t, kptlhm_matchbits) == 0);
+        CLASSERT ((int)sizeof(((kptl_hello_msg_t *)0)->kptlhm_matchbits) == 8);
+        CLASSERT ((int)offsetof(kptl_hello_msg_t, kptlhm_max_msg_size) == 8);
+        CLASSERT ((int)sizeof(((kptl_hello_msg_t *)0)->kptlhm_max_msg_size) == 4);
+}
+
+const char *kptllnd_evtype2str(int type)
+{
+#define DO_TYPE(x) case x: return #x;
+        switch(type)
+        {
+                DO_TYPE(PTL_EVENT_GET_START);
+                DO_TYPE(PTL_EVENT_GET_END);
+                DO_TYPE(PTL_EVENT_PUT_START);
+                DO_TYPE(PTL_EVENT_PUT_END);
+                DO_TYPE(PTL_EVENT_REPLY_START);
+                DO_TYPE(PTL_EVENT_REPLY_END);
+                DO_TYPE(PTL_EVENT_ACK);
+                DO_TYPE(PTL_EVENT_SEND_START);
+                DO_TYPE(PTL_EVENT_SEND_END);
+                DO_TYPE(PTL_EVENT_UNLINK);
+        default:
+                return "<unknown event type>";
+        }
+#undef DO_TYPE
+}
+
+const char *kptllnd_msgtype2str(int type)
+{
+#define DO_TYPE(x) case x: return #x;
+        switch(type)
+        {
+                DO_TYPE(PTLLND_MSG_TYPE_INVALID);
+                DO_TYPE(PTLLND_MSG_TYPE_PUT);
+                DO_TYPE(PTLLND_MSG_TYPE_GET);
+                DO_TYPE(PTLLND_MSG_TYPE_IMMEDIATE);
+                DO_TYPE(PTLLND_MSG_TYPE_HELLO);
+                DO_TYPE(PTLLND_MSG_TYPE_NOOP);
+                DO_TYPE(PTLLND_MSG_TYPE_NAK);
+        default:
+                return "<unknown msg type>";
+        }
+#undef DO_TYPE
+}
+
+__u32
+kptllnd_cksum (void *ptr, int nob)
+{
+        char  *c  = ptr;
+        __u32  sum = 0;
+
+        while (nob-- > 0)
+                sum = ((sum << 1) | (sum >> 31)) + *c++;
+
+        /* ensure I don't return 0 (== no checksum) */
+        return (sum == 0) ? 1 : sum;
+}
+
+void
+kptllnd_init_msg(kptl_msg_t *msg, int type, int body_nob)
+{
+        msg->ptlm_type = type;
+        msg->ptlm_nob  = (offsetof(kptl_msg_t, ptlm_u) + body_nob + 7) & ~7;
+        
+        LASSERT(msg->ptlm_nob <= *kptllnd_tunables.kptl_max_msg_size);
+}
+
+void
+kptllnd_msg_pack(kptl_msg_t *msg, kptl_peer_t *peer)
+{
+        msg->ptlm_magic    = PTLLND_MSG_MAGIC;
+        msg->ptlm_version  = PTLLND_MSG_VERSION;
+        /* msg->ptlm_type  Filled in kptllnd_init_msg()  */
+        msg->ptlm_credits  = peer->peer_outstanding_credits;
+        /* msg->ptlm_nob   Filled in kptllnd_init_msg()  */
+        msg->ptlm_cksum    = 0;
+        msg->ptlm_srcnid   = kptllnd_data.kptl_ni->ni_nid;
+        msg->ptlm_srcstamp = kptllnd_data.kptl_incarnation;
+        msg->ptlm_dstnid   = peer->peer_id.nid;
+        msg->ptlm_dststamp = peer->peer_incarnation;
+        msg->ptlm_srcpid   = the_lnet.ln_pid;
+        msg->ptlm_dstpid   = peer->peer_id.pid;
+
+        if (*kptllnd_tunables.kptl_checksum) {
+                /* NB ptlm_cksum zero while computing cksum */
+                msg->ptlm_cksum = kptllnd_cksum(msg, 
+                                                offsetof(kptl_msg_t, ptlm_u));
+        }
+}
+
+int
+kptllnd_msg_unpack(kptl_msg_t *msg, int nob)
+{
+        const int hdr_size = offsetof(kptl_msg_t, ptlm_u);
+        __u32     msg_cksum;
+        __u16     msg_version;
+        int       flip;
+
+        /* 6 bytes are enough to have received magic + version */
+        if (nob < 6) {
+                CERROR("Very Short message: %d\n", nob);
+                return -EPROTO;
+        }
+
+        /*
+         * Determine if we need to flip
+         */
+        if (msg->ptlm_magic == PTLLND_MSG_MAGIC) {
+                flip = 0;
+        } else if (msg->ptlm_magic == __swab32(PTLLND_MSG_MAGIC)) {
+                flip = 1;
+        } else {
+                CERROR("Bad magic: %08x\n", msg->ptlm_magic);
+                return -EPROTO;
+        }
+
+        msg_version = flip ? __swab16(msg->ptlm_version) : msg->ptlm_version;
+
+        if (msg_version != PTLLND_MSG_VERSION) {
+                CERROR("Bad version: got %04x expected %04x\n",
+                        (__u32)msg_version, PTLLND_MSG_VERSION);
+                return -EPROTO;
+        }
+
+        if (nob < hdr_size) {
+                CERROR("Short message: got %d, wanted at least %d\n",
+                       nob, hdr_size);
+                return -EPROTO;
+        }
+
+        /* checksum must be computed with
+         * 1) ptlm_cksum zero and
+         * 2) BEFORE anything gets modified/flipped
+         */
+        msg_cksum = flip ? __swab32(msg->ptlm_cksum) : msg->ptlm_cksum;
+        msg->ptlm_cksum = 0;
+        if (msg_cksum != 0 &&
+            msg_cksum != kptllnd_cksum(msg, hdr_size)) {
+                CERROR("Bad checksum\n");
+                return -EPROTO;
+        }
+
+        msg->ptlm_version = msg_version;
+        msg->ptlm_cksum = msg_cksum;
+        
+        if (flip) {
+                /* These two are 1 byte long so we don't swap them
+                   But check this assumtion*/
+                CLASSERT (sizeof(msg->ptlm_type) == 1);
+                CLASSERT (sizeof(msg->ptlm_credits) == 1);
+                /* src & dst stamps are opaque cookies */
+                __swab32s(&msg->ptlm_nob);
+                __swab64s(&msg->ptlm_srcnid);
+                __swab64s(&msg->ptlm_dstnid);
+                __swab32s(&msg->ptlm_srcpid);
+                __swab32s(&msg->ptlm_dstpid);
+        }
+
+        if (msg->ptlm_nob != nob) {
+                CERROR("msg_nob corrupt: got 0x%08x, wanted %08x\n",
+                       msg->ptlm_nob, nob);
+                return -EPROTO;
+        }
+
+        switch(msg->ptlm_type)
+        {
+        case PTLLND_MSG_TYPE_PUT:
+        case PTLLND_MSG_TYPE_GET:
+                if (nob < hdr_size + sizeof(kptl_rdma_msg_t)) {
+                        CERROR("Short rdma request: got %d, want %d\n",
+                               nob, hdr_size + (int)sizeof(kptl_rdma_msg_t));
+                        return -EPROTO;
+                }
+
+                if (flip)
+                        __swab64s(&msg->ptlm_u.rdma.kptlrm_matchbits);
+
+                if (msg->ptlm_u.rdma.kptlrm_matchbits < PTL_RESERVED_MATCHBITS) {
+                        CERROR("Bad matchbits "LPX64"\n",
+                               msg->ptlm_u.rdma.kptlrm_matchbits);
+                        return -EPROTO;
+                }
+                break;
+
+        case PTLLND_MSG_TYPE_IMMEDIATE:
+                if (nob < offsetof(kptl_msg_t, 
+                                   ptlm_u.immediate.kptlim_payload)) {
+                        CERROR("Short immediate: got %d, want %d\n", nob,
+                               (int)offsetof(kptl_msg_t, 
+                                             ptlm_u.immediate.kptlim_payload));
+                        return -EPROTO;
+                }
+                /* Do nothing */
+                break;
+                        
+        case PTLLND_MSG_TYPE_NOOP:
+        case PTLLND_MSG_TYPE_NAK:
+                /* Do nothing */
+                break;
+
+        case PTLLND_MSG_TYPE_HELLO:
+                if (nob < hdr_size + sizeof(kptl_hello_msg_t)) {
+                        CERROR("Short hello: got %d want %d\n",
+                               nob, hdr_size + (int)sizeof(kptl_hello_msg_t));
+                        return -EPROTO;
+                }
+                if (flip) {
+                        __swab64s(&msg->ptlm_u.hello.kptlhm_matchbits);
+                        __swab32s(&msg->ptlm_u.hello.kptlhm_max_msg_size);
+                }
+                break;
+
+        default:
+                CERROR("Bad message type: 0x%02x\n", (__u32)msg->ptlm_type);
+                return -EPROTO;
+        }
+
+        return 0;
+}
+
+int
+kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+        struct libcfs_ioctl_data *data = arg;
+        int          rc = -EINVAL;
+
+        CDEBUG(D_NET, ">>> kptllnd_ctl cmd=%u arg=%p\n", cmd, arg);
+
+        /*
+         * Validate that the context block is actually
+         * pointing to this interface
+         */
+        LASSERT (ni == kptllnd_data.kptl_ni);
+
+        switch(cmd) {
+        case IOC_LIBCFS_DEL_PEER: {
+                lnet_process_id_t id;
+                
+                id.nid = data->ioc_nid;
+                id.pid = data->ioc_u32[1];
+                
+                rc = kptllnd_peer_del(id);
+                break;
+        }
+
+        case IOC_LIBCFS_GET_PEER: {
+                lnet_process_id_t   id = {.nid = LNET_NID_ANY,
+                                          .pid = LNET_PID_ANY};
+                __u64               incarnation = 0;
+                __u64               next_matchbits = 0;
+                __u64               last_matchbits_seen = 0;
+                int                 state = 0;
+                int                 sent_hello = 0;
+                int                 refcount = 0;
+                int                 nsendq = 0;
+                int                 nactiveq = 0;
+                int                 credits = 0;
+                int                 outstanding_credits = 0;
+
+                rc = kptllnd_get_peer_info(data->ioc_count, &id,
+                                           &state, &sent_hello,
+                                           &refcount, &incarnation,
+                                           &next_matchbits, &last_matchbits_seen,
+                                           &nsendq, &nactiveq,
+                                           &credits, &outstanding_credits);
+                /* wince... */
+                data->ioc_nid = id.nid;
+                data->ioc_net = state;
+                data->ioc_flags  = sent_hello;
+                data->ioc_count = refcount;
+                data->ioc_u64[0] = incarnation;
+                data->ioc_u32[0] = (__u32)next_matchbits;
+                data->ioc_u32[1] = (__u32)(next_matchbits >> 32);
+                data->ioc_u32[2] = (__u32)last_matchbits_seen;
+                data->ioc_u32[3] = (__u32)(last_matchbits_seen >> 32);
+                data->ioc_u32[4] = id.pid;
+                data->ioc_u32[5] = (nsendq << 16) | nactiveq;
+                data->ioc_u32[6] = (credits << 16) | outstanding_credits;
+                break;
+        }
+                
+        default:
+                rc=-EINVAL;
+                break;
+        }
+        CDEBUG(D_NET, "<<< kptllnd_ctl rc=%d\n", rc);
+        return rc;
+}
+
+int
+kptllnd_startup (lnet_ni_t *ni)
+{
+        int             rc;
+        int             i;
+        int             spares;
+        struct timeval  tv;
+        ptl_err_t       ptl_rc;
+
+        LASSERT (ni->ni_lnd == &kptllnd_lnd);
+
+        if (kptllnd_data.kptl_init != PTLLND_INIT_NOTHING) {
+                CERROR("Only 1 instance supported\n");
+                return -EPERM;
+        }
+
+        if (*kptllnd_tunables.kptl_max_procs_per_node < 1) {
+                CERROR("max_procs_per_node must be > 1\n");
+                return -EINVAL;
+        }
+
+        *kptllnd_tunables.kptl_max_msg_size &= ~7;
+        if (*kptllnd_tunables.kptl_max_msg_size < sizeof(kptl_msg_t))
+                *kptllnd_tunables.kptl_max_msg_size =
+                        (sizeof(kptl_msg_t) + 7) & ~7;
+        /*
+         * zero pointers, flags etc
+         * put everything into a known state.
+         */
+        memset (&kptllnd_data, 0, sizeof (kptllnd_data));
+        kptllnd_data.kptl_eqh = PTL_INVALID_HANDLE;
+        kptllnd_data.kptl_nih = PTL_INVALID_HANDLE;
+
+        /*
+         * Uptick the module reference count
+         */
+        PORTAL_MODULE_USE;
+
+        /*
+         * Setup pointers between the ni and context data block
+         */
+        kptllnd_data.kptl_ni = ni;
+        ni->ni_data = &kptllnd_data;
+
+        /*
+         * Setup Credits
+         */
+        ni->ni_maxtxcredits = *kptllnd_tunables.kptl_credits;
+        ni->ni_peertxcredits = *kptllnd_tunables.kptl_peercredits;
+
+        kptllnd_data.kptl_expected_peers =
+                *kptllnd_tunables.kptl_max_nodes *
+                *kptllnd_tunables.kptl_max_procs_per_node;
+        
+        /*
+         * Initialize the Network interface instance
+         * We use the default because we don't have any
+         * way to choose a better interface.
+         * Requested and actual limits are ignored.
+         */
+        ptl_rc = PtlNIInit(
+#ifdef _USING_LUSTRE_PORTALS_
+                PTL_IFACE_DEFAULT,
+#else
+                CRAY_KERN_NAL,
+#endif
+                *kptllnd_tunables.kptl_pid, NULL, NULL,
+                &kptllnd_data.kptl_nih);
+
+        /*
+         * Note: PTL_IFACE_DUP simply means that the requested
+         * interface was already inited and that we're sharing it.
+         * Which is ok.
+         */
+        if (ptl_rc != PTL_OK && ptl_rc != PTL_IFACE_DUP) {
+                CERROR ("PtlNIInit: error %d\n", ptl_rc);
+                rc = -EINVAL;
+                goto failed;
+        }
+
+        /* NB eq size irrelevant if using a callback */
+        ptl_rc = PtlEQAlloc(kptllnd_data.kptl_nih,
+                            8,                       /* size */
+                            kptllnd_eq_callback,     /* handler callback */
+                            &kptllnd_data.kptl_eqh); /* output handle */
+        if (ptl_rc != PTL_OK) {
+                CERROR("PtlEQAlloc failed %d\n", ptl_rc);
+                rc = -ENOMEM;
+                goto failed;
+        }
+
+        /*
+         * Fetch the lower NID
+         */
+        ptl_rc = PtlGetId(kptllnd_data.kptl_nih,
+                          &kptllnd_data.kptl_portals_id);
+        if (ptl_rc != PTL_OK) {
+                CERROR ("PtlGetID: error %d\n", ptl_rc);
+                rc = -EINVAL;
+                goto failed;
+        }
+
+        if (kptllnd_data.kptl_portals_id.pid != *kptllnd_tunables.kptl_pid) {
+                /* The kernel ptllnd must have the expected PID */
+                CERROR("Unexpected PID: %u (%u expected)\n",
+                       kptllnd_data.kptl_portals_id.pid,
+                       *kptllnd_tunables.kptl_pid);
+                rc = -EINVAL;
+                goto failed;
+        }
+
+        ni->ni_nid = kptllnd_ptl2lnetnid(kptllnd_data.kptl_portals_id.nid);
+
+        CDEBUG(D_NET, "ptl id=%s, lnet id=%s\n", 
+               kptllnd_ptlid2str(kptllnd_data.kptl_portals_id),
+               libcfs_nid2str(ni->ni_nid));
+
+        /*
+         * Initialized the incarnation
+         */
+        do_gettimeofday(&tv);
+        kptllnd_data.kptl_incarnation = (((__u64)tv.tv_sec) * 1000000) +
+                                        tv.tv_usec;
+        CDEBUG(D_NET, "Incarnation="LPX64"\n", kptllnd_data.kptl_incarnation);
+
+        /*
+         * Setup the sched locks/lists/waitq
+         */
+        spin_lock_init(&kptllnd_data.kptl_sched_lock);
+        init_waitqueue_head(&kptllnd_data.kptl_sched_waitq);
+        INIT_LIST_HEAD(&kptllnd_data.kptl_sched_txq);
+        INIT_LIST_HEAD(&kptllnd_data.kptl_sched_rxq);
+        INIT_LIST_HEAD(&kptllnd_data.kptl_sched_rxbq);
+
+        /*
+         * Setup the tx locks/lists
+         */
+        spin_lock_init(&kptllnd_data.kptl_tx_lock);
+        INIT_LIST_HEAD(&kptllnd_data.kptl_idle_txs);
+        atomic_set(&kptllnd_data.kptl_ntx, 0);
+
+        /*
+         * Allocate and setup the peer hash table
+         */
+        rwlock_init(&kptllnd_data.kptl_peer_rw_lock);
+        init_waitqueue_head(&kptllnd_data.kptl_watchdog_waitq);
+        INIT_LIST_HEAD(&kptllnd_data.kptl_closing_peers);
+        INIT_LIST_HEAD(&kptllnd_data.kptl_zombie_peers);
+
+        spin_lock_init(&kptllnd_data.kptl_ptlid2str_lock);
+
+        kptllnd_data.kptl_peer_hash_size =
+                *kptllnd_tunables.kptl_peer_hash_table_size;
+        LIBCFS_ALLOC(kptllnd_data.kptl_peers,
+                     (kptllnd_data.kptl_peer_hash_size * 
+                      sizeof(struct list_head)));
+        if (kptllnd_data.kptl_peers == NULL) {
+                CERROR("Failed to allocate space for peer hash table size=%d\n",
+                        kptllnd_data.kptl_peer_hash_size);
+                rc = -ENOMEM;
+                goto failed;
+        }
+        for (i = 0; i < kptllnd_data.kptl_peer_hash_size; i++)
+                INIT_LIST_HEAD(&kptllnd_data.kptl_peers[i]);
+
+        LIBCFS_ALLOC(kptllnd_data.kptl_nak_msg, offsetof(kptl_msg_t, ptlm_u));
+        if (kptllnd_data.kptl_nak_msg == NULL) {
+                CERROR("Can't allocate NAK msg\n");
+                rc = -ENOMEM;
+                goto failed;
+        }
+        memset(kptllnd_data.kptl_nak_msg, 0, offsetof(kptl_msg_t, ptlm_u));
+        kptllnd_init_msg(kptllnd_data.kptl_nak_msg, PTLLND_MSG_TYPE_NAK, 0);
+        kptllnd_data.kptl_nak_msg->ptlm_magic    = PTLLND_MSG_MAGIC;
+        kptllnd_data.kptl_nak_msg->ptlm_version  = PTLLND_MSG_VERSION;
+        kptllnd_data.kptl_nak_msg->ptlm_srcpid   = the_lnet.ln_pid;
+        kptllnd_data.kptl_nak_msg->ptlm_srcnid   = ni->ni_nid;
+        kptllnd_data.kptl_nak_msg->ptlm_srcstamp = kptllnd_data.kptl_incarnation;
+        kptllnd_data.kptl_nak_msg->ptlm_dstpid   = LNET_PID_ANY;
+        kptllnd_data.kptl_nak_msg->ptlm_dstnid   = LNET_NID_ANY;
+
+        kptllnd_rx_buffer_pool_init(&kptllnd_data.kptl_rx_buffer_pool);
+
+        kptllnd_data.kptl_rx_cache = 
+                cfs_mem_cache_create("ptllnd_rx",
+                                     sizeof(kptl_rx_t) + 
+                                     *kptllnd_tunables.kptl_max_msg_size,
+                                     0,    /* offset */
+                                     0);   /* flags */
+        if (kptllnd_data.kptl_rx_cache == NULL) {
+                CERROR("Can't create slab for RX descriptors\n");
+                rc = -ENOMEM;
+                goto failed;
+        }
+
+        /* lists/ptrs/locks initialised */
+        kptllnd_data.kptl_init = PTLLND_INIT_DATA;
+
+        /*****************************************************/
+
+        rc = kptllnd_setup_tx_descs();
+        if (rc != 0) {
+                CERROR("Can't pre-allocate %d TX descriptors: %d\n",
+                       *kptllnd_tunables.kptl_ntx, rc);
+                goto failed;
+        }
+        
+        /* Start the scheduler threads for handling incoming requests.  No need
+         * to advance the state because this will be automatically cleaned up
+         * now that PTLLND_INIT_DATA state has been entered */
+        CDEBUG(D_NET, "starting %d scheduler threads\n", PTLLND_N_SCHED);
+        for (i = 0; i < PTLLND_N_SCHED; i++) {
+                rc = kptllnd_thread_start(kptllnd_scheduler, (void *)((long)i));
+                if (rc != 0) {
+                        CERROR("Can't spawn scheduler[%d]: %d\n", i, rc);
+                        goto failed;
+                }
+        }
+
+        rc = kptllnd_thread_start(kptllnd_watchdog, NULL);
+        if (rc != 0) {
+                CERROR("Can't spawn watchdog: %d\n", rc);
+                goto failed;
+        }
+
+        /* Ensure that 'rxb_nspare' buffers can be off the net (being emptied)
+         * and we will still have enough buffers posted for all our peers */
+        spares = *kptllnd_tunables.kptl_rxb_nspare *
+                 ((*kptllnd_tunables.kptl_rxb_npages * PAGE_SIZE)/
+                  *kptllnd_tunables.kptl_max_msg_size);
+
+        /* reserve and post the buffers */
+        rc = kptllnd_rx_buffer_pool_reserve(&kptllnd_data.kptl_rx_buffer_pool,
+                                            kptllnd_data.kptl_expected_peers +
+                                            spares);
+        if (rc != 0) {
+                CERROR("Can't reserve RX Buffer pool: %d\n", rc);
+                goto failed;
+        }
+
+        /* flag everything initialised */
+        kptllnd_data.kptl_init = PTLLND_INIT_ALL;
+
+        /*****************************************************/
+
+        if (*kptllnd_tunables.kptl_checksum)
+                CWARN("Checksumming enabled\n");
+        
+        CDEBUG(D_NET, "<<< kptllnd_startup SUCCESS\n");
+        return 0;
+
+ failed:
+        CDEBUG(D_NET, "kptllnd_startup failed rc=%d\n", rc);
+        kptllnd_shutdown(ni);
+        return rc;
+}
+
+/* Tear down the LND instance.  Safe to call from any initialisation state,
+ * including the partial-startup 'failed:' path in kptllnd_startup(): the
+ * switch below falls through from most-initialised to least-initialised. */
+void
+kptllnd_shutdown (lnet_ni_t *ni)
+{
+        int               i;
+        ptl_err_t         prc;
+        lnet_process_id_t process_id;
+        unsigned long     flags;
+
+        CDEBUG(D_MALLOC, "before LND cleanup: kmem %d\n",
+               atomic_read (&libcfs_kmemory));
+
+        LASSERT (ni == kptllnd_data.kptl_ni);
+
+        switch (kptllnd_data.kptl_init) {
+        default:
+                LBUG();
+
+        case PTLLND_INIT_ALL:
+        case PTLLND_INIT_DATA:
+                /* Stop receiving */
+                kptllnd_rx_buffer_pool_fini(&kptllnd_data.kptl_rx_buffer_pool);
+                LASSERT (list_empty(&kptllnd_data.kptl_sched_rxq));
+                LASSERT (list_empty(&kptllnd_data.kptl_sched_rxbq));
+
+                /* Hold peertable lock to interleave cleanly with peer birth/death */
+                write_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+                LASSERT (kptllnd_data.kptl_shutdown == 0);
+                kptllnd_data.kptl_shutdown = 1; /* phase 1 == destroy peers */
+
+                /* no new peers possible now */
+                write_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, 
+                                        flags);
+
+                /* nuke all existing peers */
+                process_id.nid = LNET_NID_ANY;
+                process_id.pid = LNET_PID_ANY;
+                kptllnd_peer_del(process_id);
+
+                read_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+                LASSERT (kptllnd_data.kptl_n_active_peers == 0);
+
+                /* Poll (1s period) until the last peer ref drains; warn only
+                 * at power-of-2 iterations to avoid flooding the console. */
+                i = 2;
+                while (kptllnd_data.kptl_npeers != 0) {
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET,
+                               "Waiting for %d peers to terminate\n",
+                               kptllnd_data.kptl_npeers);
+
+                        read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, 
+                                               flags);
+
+                        cfs_pause(cfs_time_seconds(1));
+
+                        read_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, 
+                                          flags);
+                }
+
+                LASSERT(list_empty(&kptllnd_data.kptl_closing_peers));
+                LASSERT(list_empty(&kptllnd_data.kptl_zombie_peers));
+                LASSERT (kptllnd_data.kptl_peers != NULL);
+                for (i = 0; i < kptllnd_data.kptl_peer_hash_size; i++)
+                        LASSERT (list_empty (&kptllnd_data.kptl_peers[i]));
+
+                read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags);
+                CDEBUG(D_NET, "All peers deleted\n");
+
+                /* Shutdown phase 2: kill the daemons... */
+                kptllnd_data.kptl_shutdown = 2;
+                mb();   /* make phase 2 visible before waking the threads */
+                
+                i = 2;
+                while (atomic_read (&kptllnd_data.kptl_nthreads) != 0) {
+                        /* Wake up all threads*/
+                        wake_up_all(&kptllnd_data.kptl_sched_waitq);
+                        wake_up_all(&kptllnd_data.kptl_watchdog_waitq);
+
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                               "Waiting for %d threads to terminate\n",
+                               atomic_read(&kptllnd_data.kptl_nthreads));
+                        cfs_pause(cfs_time_seconds(1));
+                }
+
+                CDEBUG(D_NET, "All Threads stopped\n");
+                LASSERT(list_empty(&kptllnd_data.kptl_sched_txq));
+
+                kptllnd_cleanup_tx_descs();
+
+                /* Nothing here now, but libcfs might soon require
+                 * us to explicitly destroy wait queues and semaphores
+                 * that would be done here */
+
+                /* fall through */
+
+        case PTLLND_INIT_NOTHING:
+                CDEBUG(D_NET, "PTLLND_INIT_NOTHING\n");
+                break;
+        }
+
+        /* Release portals handles; these are valid checks even if startup
+         * failed before the handles were obtained (PTL_INVALID_HANDLE). */
+        if (!PtlHandleIsEqual(kptllnd_data.kptl_eqh, PTL_INVALID_HANDLE)) {
+                prc = PtlEQFree(kptllnd_data.kptl_eqh);
+                if (prc != PTL_OK)
+                        CERROR("Error %d freeing portals EQ\n", prc);
+        }
+
+        if (!PtlHandleIsEqual(kptllnd_data.kptl_nih, PTL_INVALID_HANDLE)) {
+                prc = PtlNIFini(kptllnd_data.kptl_nih);
+                if (prc != PTL_OK)
+                        CERROR("Error %d finalizing portals NI\n", prc);
+        }
+        
+        LASSERT (atomic_read(&kptllnd_data.kptl_ntx) == 0);
+        LASSERT (list_empty(&kptllnd_data.kptl_idle_txs));
+
+        if (kptllnd_data.kptl_rx_cache != NULL)
+                cfs_mem_cache_destroy(kptllnd_data.kptl_rx_cache);
+
+        if (kptllnd_data.kptl_peers != NULL)
+                LIBCFS_FREE (kptllnd_data.kptl_peers,
+                             sizeof (struct list_head) *
+                             kptllnd_data.kptl_peer_hash_size);
+
+        if (kptllnd_data.kptl_nak_msg != NULL)
+                LIBCFS_FREE (kptllnd_data.kptl_nak_msg,
+                             offsetof(kptl_msg_t, ptlm_u));
+
+        /* zeroing resets kptl_init to PTLLND_INIT_NOTHING (== 0) */
+        memset(&kptllnd_data, 0, sizeof(kptllnd_data));
+
+        CDEBUG(D_MALLOC, "after LND cleanup: kmem %d\n",
+               atomic_read (&libcfs_kmemory));
+
+        PORTAL_MODULE_UNUSE;
+}
+
+/* Module entry point: check wire-protocol invariants, initialise the
+ * tunables, then register this LND with LNET.  Returns 0 or the error
+ * from kptllnd_tunables_init(). */
+int __init
+kptllnd_module_init (void)
+{
+        int    rc;
+
+        /* sanity-check kptl_msg_t wire layout before talking to anyone */
+        kptllnd_assert_wire_constants();
+
+        rc = kptllnd_tunables_init();
+        if (rc != 0)
+                return rc;
+
+        kptllnd_init_ptltrace();
+
+        lnet_register_lnd(&kptllnd_lnd);
+
+        return 0;
+}
+
+/* Module exit: unregister from LNET first (no new instances can start),
+ * then release the tunables state set up in kptllnd_module_init(). */
+void __exit
+kptllnd_module_fini (void)
+{
+        lnet_unregister_lnd(&kptllnd_lnd);
+        kptllnd_tunables_fini();
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_DESCRIPTION("Kernel Portals LND v1.00");
+MODULE_LICENSE("GPL");
+
+module_init(kptllnd_module_init);
+module_exit(kptllnd_module_fini);
diff --git a/lnet/klnds/ptllnd/ptllnd.h b/lnet/klnds/ptllnd/ptllnd.h
new file mode 100755 (executable)
index 0000000..7243a6b
--- /dev/null
@@ -0,0 +1,538 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
+ *   Author: PJ Kirner <pjkirner@clusterfs.com>
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   This file is confidential source code owned by Cluster File Systems.
+ *   No viewing, modification, compilation, redistribution, or any other
+ *   form of use is permitted except through a signed license agreement.
+ *
+ *   If you have not signed such an agreement, then you have no rights to
+ *   this file.  Please destroy it immediately and contact CFS.
+ *
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/smp_lock.h>
+#include <linux/unistd.h>
+#include <linux/uio.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+#include <linux/random.h>
+
+#include <net/sock.h>
+#include <linux/in.h>
+
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include <libcfs/kp30.h>
+#include <lnet/lnet.h>
+#include <lnet/lib-lnet.h>
+#include <portals/p30.h>
+#ifdef CRAY_XT3
+#include <portals/ptltrace.h>
+#endif
+#include <lnet/ptllnd.h>        /* Depends on portals/p30.h */
+
+/*
+ * Define this to enable console debug logging
+ * and simulation
+ */
+//#define PJK_DEBUGGING
+
+/* Number of scheduler threads: one per online CPU on SMP, else one.
+ * Use #ifdef: CONFIG_SMP is simply undefined on UP builds, and relying on
+ * the undefined-macro-is-0 rule in #if trips -Wundef. */
+#ifdef CONFIG_SMP
+# define PTLLND_N_SCHED         num_online_cpus()   /* # schedulers */
+#else
+# define PTLLND_N_SCHED         1                   /* # schedulers */
+#endif
+
+#define PTLLND_CREDIT_HIGHWATER ((*kptllnd_tunables.kptl_peercredits)-1)
+  /* threshold at which to eagerly return credits */
+
+/* Module tunables; every member points at the backing storage set up by
+ * kptllnd_tunables_init() (presumably module parameters — confirm there). */
+typedef struct
+{
+        int             *kptl_ntx;              /* # tx descs to pre-allocate */
+        int             *kptl_max_nodes;        /* max # nodes all talking to me */
+        int             *kptl_max_procs_per_node; /* max # processes per node */
+        int             *kptl_checksum;         /* checksum kptl_msg_t? */
+        int             *kptl_timeout;          /* comms timeout (seconds) */
+        int             *kptl_portal;           /* portal number */
+        int             *kptl_pid;              /* portals PID (self + kernel peers) */
+        int             *kptl_rxb_npages;       /* number of pages for rx buffer */
+        int             *kptl_rxb_nspare;       /* number of spare rx buffers */
+        int             *kptl_credits;          /* # concurrent sends */
+        int             *kptl_peercredits;      /* # credits per peer (rx buffers reserved per peer) */
+        int             *kptl_max_msg_size;     /* max immd message size*/
+        int             *kptl_peer_hash_table_size; /* # slots in peer hash table */
+        int             *kptl_reschedule_loops; /* scheduler yield loops */
+#ifdef CRAY_XT3
+        int             *kptl_ptltrace_on_timeout; /* dump pltrace on timeout? */
+        char           **kptl_ptltrace_basename;  /* ptltrace dump file basename */
+#endif
+#ifdef PJK_DEBUGGING
+        int             *kptl_simulation_bitmap;/* simulation bitmap */
+#endif
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+        struct ctl_table_header *kptl_sysctl;    /* sysctl interface */
+#endif
+} kptl_tunables_t;
+
+#include "lnet/ptllnd_wire.h"
+
+/***********************************************************************/
+
+/* Forward declarations: these types refer to each other below. */
+typedef struct kptl_data kptl_data_t;
+typedef struct kptl_rx_buffer kptl_rx_buffer_t;
+typedef struct kptl_peer kptl_peer_t;
+
+/* Tag embedded in objects handed to portals as event->md.user_ptr; the
+ * event callback uses it to recover the containing object — see
+ * kptllnd_eventarg2obj(). */
+typedef struct {
+        char      eva_type;
+} kptl_eventarg_t;
+
+#define PTLLND_EVENTARG_TYPE_MSG    0x1
+#define PTLLND_EVENTARG_TYPE_RDMA   0x2
+#define PTLLND_EVENTARG_TYPE_BUF    0x3
+
+typedef struct kptl_rx                          /* receive message */
+{
+        struct list_head        rx_list;        /* queue for attention */
+        kptl_rx_buffer_t       *rx_rxb;         /* the rx buffer pointer */
+        kptl_msg_t             *rx_msg;         /* received message */
+        int                     rx_nob;         /* received message size */
+        ptl_process_id_t        rx_initiator;   /* sender's address */
+#ifdef CRAY_XT3
+        ptl_uid_t               rx_uid;         /* sender's uid */
+#endif
+        kptl_peer_t            *rx_peer;        /* pointer to peer */
+        char                    rx_space[0];    /* in-line copy of incoming request (zero-length array) */
+} kptl_rx_t;
+
+/* Pool of receive buffers shared by all peers. */
+typedef struct kptl_rx_buffer_pool
+{
+        spinlock_t              rxbp_lock;      /* serialises pool state and rxb_refcount */
+        struct list_head        rxbp_list;      /* all allocated buffers */
+        int                     rxbp_count;     /* # allocated buffers */
+        int                     rxbp_reserved;  /* # requests to buffer */
+        int                     rxbp_shutdown;  /* shutdown flag */
+} kptl_rx_buffer_pool_t;
+
+struct kptl_rx_buffer
+{
+        kptl_rx_buffer_pool_t  *rxb_pool;       /* owning pool */
+        struct list_head        rxb_list;       /* for the rxb_pool list */
+        struct list_head        rxb_repost_list;/* for the kptl_sched_rxbq list */
+        int                     rxb_posted:1;   /* on the net */
+        int                     rxb_idle:1;     /* all done */
+        kptl_eventarg_t         rxb_eventarg;   /* event->md.user_ptr */
+        int                     rxb_refcount;   /* reference count (under rxbp_lock) */
+        ptl_handle_md_t         rxb_mdh;        /* the portals memory descriptor (MD) handle */
+        char                   *rxb_buffer;     /* the buffer */
+
+};
+
+/* What a tx descriptor is currently being used for. */
+enum kptl_tx_type
+{
+        TX_TYPE_RESERVED                = 0,
+        TX_TYPE_SMALL_MESSAGE           = 1,
+        TX_TYPE_PUT_REQUEST             = 2,
+        TX_TYPE_GET_REQUEST             = 3,
+        TX_TYPE_PUT_RESPONSE            = 4,
+        TX_TYPE_GET_RESPONSE            = 5,
+};
+
+/* Scatter/gather fragment array, sized to the portals MD IOV limit. */
+typedef union {
+#ifdef _USING_LUSTRE_PORTALS_
+        struct iovec iov[PTL_MD_MAX_IOV];
+        lnet_kiov_t kiov[PTL_MD_MAX_IOV];
+#else
+        ptl_md_iovec_t iov[PTL_MD_MAX_IOV];
+#endif
+} kptl_fragvec_t;
+
+typedef struct kptl_tx                           /* transmit message */
+{
+        struct list_head        tx_list;      /* queue on idle_txs etc */
+        atomic_t                tx_refcount;  /* reference count (see kptllnd_tx_addref/decref) */
+        enum kptl_tx_type       tx_type;      /* small msg/{put,get}{req,resp} */
+        int                     tx_active:1;  /* queued on the peer */
+        int                     tx_idle:1;    /* on the free list */
+        kptl_eventarg_t         tx_msg_eventarg; /* event->md.user_ptr */
+        kptl_eventarg_t         tx_rdma_eventarg; /* event->md.user_ptr */
+        int                     tx_status;    /* the status of this tx descriptor */
+        ptl_handle_md_t         tx_rdma_mdh;  /* RDMA buffer */
+        ptl_handle_md_t         tx_msg_mdh;   /* the portals MD handle for the initial message */
+        lnet_msg_t             *tx_lnet_msg;  /* LNET message to finalize */
+        lnet_msg_t             *tx_lnet_replymsg; /* LNET reply message to finalize */
+        kptl_msg_t             *tx_msg;       /* the message data */
+        kptl_peer_t            *tx_peer;      /* the peer this is waiting on */
+        unsigned long           tx_deadline;  /* completion deadline (presumably jiffies — confirm) */
+        ptl_md_t                tx_rdma_md;   /* rdma buffer */
+        kptl_fragvec_t         *tx_rdma_frags; /* buffer fragments */
+} kptl_tx_t;
+
+/* Peer lifecycle states (see kptllnd_peer_close*() / kptllnd_peer_destroy()). */
+enum kptllnd_peer_state
+{
+        PEER_STATE_UNINITIALIZED        = 0,
+        PEER_STATE_ALLOCATED            = 1,
+        PEER_STATE_WAITING_HELLO        = 2,
+        PEER_STATE_ACTIVE               = 3,
+        PEER_STATE_CLOSING              = 4,
+        PEER_STATE_ZOMBIE               = 5,
+};
+
+/* Per-peer connection state, hashed in kptl_data.kptl_peers. */
+struct kptl_peer
+{
+        struct list_head        peer_list;
+        atomic_t                peer_refcount;          /* the current references */
+        enum kptllnd_peer_state peer_state;
+        spinlock_t              peer_lock;              /* serialize */
+        struct list_head        peer_sendq;             /* txs waiting for mh handles */
+        struct list_head        peer_activeq;           /* txs awaiting completion */
+        lnet_process_id_t       peer_id;                /* Peer's LNET id */
+        ptl_process_id_t        peer_ptlid;             /* Peer's portals id */
+        __u64                   peer_incarnation;       /* peer's incarnation */
+        int                     peer_sent_hello;        /* have I sent HELLO? */
+        int                     peer_credits;           /* number of send credits */
+        int                     peer_outstanding_credits;/* number of peer credits */
+        int                     peer_error;             /* errno on closing this peer */
+        cfs_time_t              peer_last_alive;        /* when (in jiffies) I was last alive */
+        __u64                   peer_next_matchbits;    /* Next value to register RDMA from peer */
+        __u64                   peer_last_matchbits_seen; /* last matchbits used to RDMA to peer */
+};
+
+/* Global state for the single LND instance (kptl_ni). */
+struct kptl_data
+{
+        int                     kptl_init;             /* initialisation state */
+        volatile int            kptl_shutdown;         /* shut down? */
+        atomic_t                kptl_nthreads;         /* # live threads */
+        lnet_ni_t              *kptl_ni;               /* _the_ LND instance */
+        ptl_handle_ni_t         kptl_nih;              /* network interface handle */
+        ptl_process_id_t        kptl_portals_id;       /* Portals ID of interface */
+        __u64                   kptl_incarnation;      /* which one am I */
+        ptl_handle_eq_t         kptl_eqh;              /* Event Queue (EQ) */
+
+        spinlock_t              kptl_sched_lock;       /* serialise... */
+        wait_queue_head_t       kptl_sched_waitq;      /* schedulers sleep here */
+        struct list_head        kptl_sched_txq;        /* tx requiring attention */
+        struct list_head        kptl_sched_rxq;        /* rx requiring attention */
+        struct list_head        kptl_sched_rxbq;       /* rxb requiring reposting */
+
+        wait_queue_head_t       kptl_watchdog_waitq;   /* watchdog sleeps here */
+
+        kptl_rx_buffer_pool_t   kptl_rx_buffer_pool;   /* rx buffer pool */
+        cfs_mem_cache_t*        kptl_rx_cache;         /* rx descriptor cache */
+
+        atomic_t                kptl_ntx;              /* # tx descs allocated */
+        spinlock_t              kptl_tx_lock;          /* serialise idle tx list*/
+        struct list_head        kptl_idle_txs;         /* idle tx descriptors */
+
+        rwlock_t                kptl_peer_rw_lock;     /* lock for peer table */
+        struct list_head       *kptl_peers;            /* hash table of all my known peers */
+        struct list_head        kptl_closing_peers;    /* peers being closed */
+        struct list_head        kptl_zombie_peers;     /* peers waiting for refs to drain */
+        int                     kptl_peer_hash_size;   /* size of kptl_peers */
+        int                     kptl_npeers;           /* # peers extant */
+        int                     kptl_n_active_peers;   /* # active peers */
+        int                     kptl_expected_peers;   /* # peers I can buffer HELLOs from */
+
+        kptl_msg_t             *kptl_nak_msg;          /* common NAK message */
+        spinlock_t              kptl_ptlid2str_lock;   /* serialise str ops */
+};
+
+/* Progressive initialisation states of kptl_data; kptllnd_shutdown()
+ * switches on these and falls through from most- to least-initialised. */
+enum 
+{
+        PTLLND_INIT_NOTHING = 0,
+        PTLLND_INIT_DATA,
+        PTLLND_INIT_ALL,
+};
+
+extern kptl_tunables_t  kptllnd_tunables;
+extern kptl_data_t      kptllnd_data;
+
+/* Translate a portals NID into an LNET NID on my own LNET network. */
+static inline lnet_nid_t
+kptllnd_ptl2lnetnid(ptl_nid_t ptl_nid)
+{
+        __u32 net = LNET_NIDNET(kptllnd_data.kptl_ni->ni_nid);
+
+#ifdef _USING_LUSTRE_PORTALS_
+        return LNET_MKNID(net, LNET_NIDADDR(ptl_nid));
+#else
+        return LNET_MKNID(net, ptl_nid);
+#endif
+}
+
+/* Translate an LNET NID to a portals NID.
+ * NOTE(review): the non-lustre-portals branch keeps only the address part
+ * — presumably portals NIDs are flat there; confirm against portals docs. */
+static inline ptl_nid_t 
+kptllnd_lnet2ptlnid(lnet_nid_t lnet_nid)
+{
+#ifdef _USING_LUSTRE_PORTALS_
+        return LNET_MKNID(LNET_NIDNET(kptllnd_data.kptl_portals_id.nid),
+                          LNET_NIDADDR(lnet_nid));
+#else
+       return LNET_NIDADDR(lnet_nid);
+#endif
+}
+
+int  kptllnd_startup(lnet_ni_t *ni);
+void kptllnd_shutdown(lnet_ni_t *ni);
+int  kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int  kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int  kptllnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+                  int delayed, unsigned int niov, 
+                  struct iovec *iov, lnet_kiov_t *kiov,
+                  unsigned int offset, unsigned int mlen, unsigned int rlen);
+int  kptllnd_eager_recv(struct lnet_ni *ni, void *private, 
+                        lnet_msg_t *msg, void **new_privatep);
+void kptllnd_eq_callback(ptl_event_t *evp);
+int  kptllnd_scheduler(void *arg);
+int  kptllnd_watchdog(void *arg);
+int  kptllnd_thread_start(int (*fn)(void *arg), void *arg);
+int  kptllnd_tunables_init(void);
+void kptllnd_tunables_fini(void);
+
+const char *kptllnd_evtype2str(int evtype);
+const char *kptllnd_msgtype2str(int msgtype);
+
+/* Recover the object containing an embedded kptl_eventarg_t (container_of
+ * done via list_entry); eva_type says which structure 'eva' lives in. */
+static inline void *
+kptllnd_eventarg2obj (kptl_eventarg_t *eva)
+{
+        switch (eva->eva_type) {
+        default:
+                LBUG();         /* unknown tag: assert (not expected to return) */
+        case PTLLND_EVENTARG_TYPE_BUF:
+                return list_entry(eva, kptl_rx_buffer_t, rxb_eventarg);
+        case PTLLND_EVENTARG_TYPE_RDMA:
+                return list_entry(eva, kptl_tx_t, tx_rdma_eventarg);
+        case PTLLND_EVENTARG_TYPE_MSG:
+                return list_entry(eva, kptl_tx_t, tx_msg_eventarg);
+        }
+}
+
+/*
+ * RX BUFFER SUPPORT FUNCTIONS
+ */
+void kptllnd_rx_buffer_pool_init(kptl_rx_buffer_pool_t *rxbp);
+void kptllnd_rx_buffer_pool_fini(kptl_rx_buffer_pool_t *rxbp);
+int  kptllnd_rx_buffer_pool_reserve(kptl_rx_buffer_pool_t *rxbp, int count);
+void kptllnd_rx_buffer_pool_unreserve(kptl_rx_buffer_pool_t *rxbp, int count);
+void kptllnd_rx_buffer_callback(ptl_event_t *ev);
+void kptllnd_rx_buffer_post(kptl_rx_buffer_t *rxb);
+
+/* Size in bytes of one rx buffer: a whole number of pages. */
+static inline int
+kptllnd_rx_buffer_size(void)
+{
+        int npages = *kptllnd_tunables.kptl_rxb_npages;
+
+        return npages * PAGE_SIZE;
+}
+
+/* Take a reference on an rx buffer; the pool lock serialises the count. */
+static inline void
+kptllnd_rx_buffer_addref(kptl_rx_buffer_t *rxb)
+{
+        kptl_rx_buffer_pool_t *pool = rxb->rxb_pool;
+        unsigned long          irq_flags;
+
+        spin_lock_irqsave(&pool->rxbp_lock, irq_flags);
+        rxb->rxb_refcount++;
+        spin_unlock_irqrestore(&pool->rxbp_lock, irq_flags);
+}
+
+/* Drop a buffer reference with the pool lock already held by the caller.
+ * The last reference queues the buffer for reposting by a scheduler. */
+static inline void
+kptllnd_rx_buffer_decref_locked(kptl_rx_buffer_t *rxb)
+{
+        rxb->rxb_refcount--;
+        if (rxb->rxb_refcount != 0)
+                return;
+
+        spin_lock(&kptllnd_data.kptl_sched_lock);
+        list_add_tail(&rxb->rxb_repost_list,
+                      &kptllnd_data.kptl_sched_rxbq);
+        wake_up(&kptllnd_data.kptl_sched_waitq);
+        spin_unlock(&kptllnd_data.kptl_sched_lock);
+}
+
+/* Drop a buffer reference (taking the pool lock); repost on the last ref. */
+static inline void
+kptllnd_rx_buffer_decref(kptl_rx_buffer_t *rxb)
+{
+        kptl_rx_buffer_pool_t *pool = rxb->rxb_pool;
+        unsigned long          irq_flags;
+        int                    refs;
+
+        spin_lock_irqsave(&pool->rxbp_lock, irq_flags);
+        refs = --rxb->rxb_refcount;
+        spin_unlock_irqrestore(&pool->rxbp_lock, irq_flags);
+
+        if (refs == 0)
+                kptllnd_rx_buffer_post(rxb);
+}
+
+/*
+ * RX SUPPORT FUNCTIONS
+ */
+void kptllnd_rx_done(kptl_rx_t *rx);
+void kptllnd_rx_parse(kptl_rx_t *rx);
+
+/*
+ * PEER SUPPORT FUNCTIONS
+ */
+int kptllnd_get_peer_info(int index,
+                          lnet_process_id_t *id, 
+                          int *state, int *sent_hello,
+                          int *refcount, __u64 *incarnation,
+                          __u64 *next_matchbits, __u64 *last_matchbits_seen,
+                          int *nsendq, int *nactiveq,
+                          int *credits, int *outstanding_credits);
+void kptllnd_peer_destroy(kptl_peer_t *peer);
+int  kptllnd_peer_del(lnet_process_id_t id);
+void kptllnd_peer_close_locked(kptl_peer_t *peer, int why);
+void kptllnd_peer_close(kptl_peer_t *peer, int why);
+void kptllnd_handle_closing_peers(void);
+int  kptllnd_peer_connect(kptl_tx_t *tx, lnet_nid_t nid);
+void kptllnd_peer_check_sends(kptl_peer_t *peer);
+void kptllnd_peer_check_bucket(int idx);
+void kptllnd_tx_launch(kptl_tx_t *tx, lnet_process_id_t target);
+kptl_peer_t *kptllnd_peer_handle_hello(ptl_process_id_t initiator,
+                                       kptl_msg_t *msg);
+kptl_peer_t *kptllnd_id2peer_locked(lnet_process_id_t id);
+void kptllnd_peer_alive(kptl_peer_t *peer);
+
+/* Take a peer reference; paired with kptllnd_peer_decref(). */
+static inline void
+kptllnd_peer_addref (kptl_peer_t *peer)
+{
+        atomic_inc(&peer->peer_refcount);
+}
+
+/* Drop a peer reference; the last reference destroys the peer. */
+static inline void
+kptllnd_peer_decref (kptl_peer_t *peer)
+{
+        if (atomic_dec_and_test(&peer->peer_refcount))
+                kptllnd_peer_destroy(peer);
+}
+
+/* Bind 'tx' to 'peer', taking a peer ref held for the tx's lifetime. */
+static inline void
+kptllnd_set_tx_peer(kptl_tx_t *tx, kptl_peer_t *peer) 
+{
+        LASSERT (tx->tx_peer == NULL);
+        
+        kptllnd_peer_addref(peer);
+        tx->tx_peer = peer;
+}
+
+/* Hash a NID (low 32 bits) to its bucket in the peer hash table. */
+static inline struct list_head *
+kptllnd_nid2peerlist(lnet_nid_t nid)
+{
+        unsigned int bucket = ((unsigned int)nid) %
+                              kptllnd_data.kptl_peer_hash_size;
+
+        return kptllnd_data.kptl_peers + bucket;
+}
+
+/* Look up the peer for 'id' under the peer table read lock.
+ * NOTE(review): ref semantics come from kptllnd_id2peer_locked() —
+ * confirm at its definition whether a reference is returned. */
+static inline kptl_peer_t *
+kptllnd_id2peer(lnet_process_id_t id)
+{
+        unsigned long  irq_flags;
+        kptl_peer_t   *peer;
+
+        read_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, irq_flags);
+        peer = kptllnd_id2peer_locked(id);
+        read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, irq_flags);
+
+        return peer;
+}
+
+/* Reserve 'n' rx buffers in the global pool; returns 0 on success
+ * (see kptllnd_rx_buffer_pool_reserve()). */
+static inline int
+kptllnd_reserve_buffers(int n)
+{
+        return kptllnd_rx_buffer_pool_reserve(&kptllnd_data.kptl_rx_buffer_pool,
+                                              n);
+}
+
+/* Reserve one peer's worth (kptl_peercredits) of rx buffers. */
+static inline int
+kptllnd_peer_reserve_buffers(void)
+{
+        return kptllnd_reserve_buffers(*kptllnd_tunables.kptl_peercredits);
+}
+
+/* Undo kptllnd_peer_reserve_buffers(). */
+static inline void
+kptllnd_peer_unreserve_buffers(void)
+{
+        kptllnd_rx_buffer_pool_unreserve(&kptllnd_data.kptl_rx_buffer_pool,
+                                         *kptllnd_tunables.kptl_peercredits);
+}
+
+/*
+ * TX SUPPORT FUNCTIONS
+ */
+int  kptllnd_setup_tx_descs(void);
+void kptllnd_cleanup_tx_descs(void);
+void kptllnd_tx_fini(kptl_tx_t *tx);
+kptl_tx_t *kptllnd_get_idle_tx(enum kptl_tx_type purpose);
+void kptllnd_tx_callback(ptl_event_t *ev);
+const char *kptllnd_tx_typestr(int type);
+
+/* Take a tx reference; paired with kptllnd_tx_decref(). */
+static inline void
+kptllnd_tx_addref(kptl_tx_t *tx)
+{
+        atomic_inc(&tx->tx_refcount);
+}
+
+/* Drop a tx reference; the last reference finalises the descriptor. */
+static inline void 
+kptllnd_tx_decref(kptl_tx_t *tx)
+{
+        LASSERT (!in_interrupt());        /* Thread context only */
+
+        if (atomic_dec_and_test(&tx->tx_refcount))
+                kptllnd_tx_fini(tx);
+}
+
+/*
+ * MESSAGE SUPPORT FUNCTIONS
+ */
+void kptllnd_init_msg(kptl_msg_t *msg, int type, int body_nob);
+void kptllnd_msg_pack(kptl_msg_t *msg, kptl_peer_t *peer);
+int  kptllnd_msg_unpack(kptl_msg_t *msg, int nob);
+
+/*
+ * MISC SUPPORT FUNCTIONS
+ */
+void kptllnd_init_rdma_md(kptl_tx_t *tx, unsigned int niov,
+                          struct iovec *iov, lnet_kiov_t *kiov,
+                          unsigned int offset, unsigned int nob);
+char *kptllnd_ptlid2str(ptl_process_id_t id);
+
+void kptllnd_init_ptltrace(void);
+void kptllnd_dump_ptltrace(void);
+
+#ifdef PJK_DEBUGGING
+#define SIMULATION_FAIL_TX_PUT_ALLOC   0       /* 0x00000001 */
+#define SIMULATION_FAIL_TX_GET_ALLOC   1       /* 0x00000002 */
+#define SIMULATION_FAIL_TX             2       /* 0x00000004 */
+#define SIMULATION_FAIL_RX_ALLOC       3       /* 0x00000008 */
+
+#define IS_SIMULATION_ENABLED(x) \
+        (((*kptllnd_tunables.kptl_simulation_bitmap) & 1<< SIMULATION_##x) != 0)
+#else
+#define IS_SIMULATION_ENABLED(x)       0
+#endif
+
diff --git a/lnet/klnds/ptllnd/ptllnd_cb.c b/lnet/klnds/ptllnd/ptllnd_cb.c
new file mode 100644 (file)
index 0000000..89456ac
--- /dev/null
@@ -0,0 +1,760 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
+ *   Author: PJ Kirner <pjkirner@clusterfs.com>
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   This file is confidential source code owned by Cluster File Systems.
+ *   No viewing, modification, compilation, redistribution, or any other
+ *   form of use is permitted except through a signed license agreement.
+ *
+ *   If you have not signed such an agreement, then you have no rights to
+ *   this file.  Please destroy it immediately and contact CFS.
+ *
+ */
+
+#include "ptllnd.h"
+
+#ifndef _USING_LUSTRE_PORTALS_
+/* Fill 'dst' (at most 'dst_niov' entries) with the byte range
+ * ['offset', 'offset'+'len') of the iovec array 'src'; returns the
+ * number of dst entries used (0 when len == 0).  'src' is unchanged.
+ * Out-of-range offset/len or dst overflow trip LASSERTs rather than
+ * returning an error. */
+int
+kptllnd_extract_iov (int dst_niov, ptl_md_iovec_t *dst,
+                     int src_niov, struct iovec *src,
+                     unsigned int offset, unsigned int len)
+{
+        /* Initialise 'dst' to the subset of 'src' starting at 'offset',
+         * for exactly 'len' bytes, and return the number of entries.
+         * NB not destructive to 'src' */
+        unsigned int    frag_len;
+        unsigned int    niov;
+
+        if (len == 0)                           /* no data => */
+                return (0);                     /* no frags */
+
+        LASSERT (src_niov > 0);
+        while (offset >= src->iov_len) {      /* skip initial frags */
+                offset -= src->iov_len;
+                src_niov--;
+                src++;
+                LASSERT (src_niov > 0);
+        }
+
+        niov = 1;
+        for (;;) {
+                LASSERT (src_niov > 0);
+                LASSERT (niov <= dst_niov);
+
+                frag_len = src->iov_len - offset;
+                dst->iov_base = ((char *)src->iov_base) + offset;
+
+                /* final (possibly partial) fragment? */
+                if (len <= frag_len) {
+                        dst->iov_len = len;
+                        return (niov);
+                }
+
+                dst->iov_len = frag_len;
+
+                len -= frag_len;
+                dst++;
+                src++;
+                niov++;
+                src_niov--;
+                offset = 0;                     /* subsequent frags taken whole */
+        }
+}
+
+/* Like kptllnd_extract_iov(), but 'src' is a page-based kiov and 'dst'
+ * receives the *physical* addresses of the byte range ['offset',
+ * 'offset'+'len').  On 32-bit kernels (sizeof(void *) == 4) every
+ * physical address must fit in 32 bits since it is stored in the
+ * pointer-sized iov_base; this is LASSERTed below. */
+int
+kptllnd_extract_phys (int dst_niov, ptl_md_iovec_t *dst,
+                      int src_niov, lnet_kiov_t *src,
+                      unsigned int offset, unsigned int len)
+{
+        /* Initialise 'dst' to the physical addresses of the subset of 'src'
+         * starting at 'offset', for exactly 'len' bytes, and return the number
+         * of entries.  NB not destructive to 'src' */
+        unsigned int    frag_len;
+        unsigned int    niov;
+        __u64           phys_page;
+        __u64           phys;
+
+        if (len == 0)                           /* no data => */
+                return (0);                     /* no frags */
+
+        LASSERT (src_niov > 0);
+        while (offset >= src->kiov_len) {      /* skip initial frags */
+                offset -= src->kiov_len;
+                src_niov--;
+                src++;
+                LASSERT (src_niov > 0);
+        }
+
+        niov = 1;
+        for (;;) {
+                LASSERT (src_niov > 0);
+                LASSERT (niov <= dst_niov);
+
+                frag_len = min(src->kiov_len - offset, len);
+                phys_page = lnet_page2phys(src->kiov_page);
+                phys = phys_page + src->kiov_offset + offset;
+
+                /* physical address must be representable in iov_base */
+                LASSERT (sizeof(void *) > 4 || 
+                         (phys <= 0xffffffffULL &&
+                          phys + (frag_len - 1) <= 0xffffffffULL));
+
+                dst->iov_base = (void *)((unsigned long)phys);
+                dst->iov_len = frag_len;
+                
+                if (frag_len == len)
+                        return niov;
+
+                len -= frag_len;
+                dst++;
+                src++;
+                niov++;
+                src_niov--;
+                offset = 0;
+        }
+}
+#endif
+
+/* Initialise tx->tx_rdma_md, the memory descriptor for the RDMA phase
+ * of 'tx', to cover 'nob' bytes at 'offset' within 'iov' OR 'kiov'
+ * (at most one may be non-NULL).  The event threshold and remote
+ * GET/PUT permission depend on tx->tx_type: passive descriptors allow
+ * the peer to GET/PUT, active descriptors only expect my own SEND
+ * (plus a REPLY when I am the one getting). */
+void
+kptllnd_init_rdma_md(kptl_tx_t *tx, unsigned int niov,
+                     struct iovec *iov, lnet_kiov_t *kiov,
+                     unsigned int offset, unsigned int nob)
+{
+        LASSERT (iov == NULL || kiov == NULL);
+        
+        memset(&tx->tx_rdma_md, 0, sizeof(tx->tx_rdma_md));
+
+        tx->tx_rdma_md.start     = tx->tx_rdma_frags;
+        tx->tx_rdma_md.user_ptr  = &tx->tx_rdma_eventarg;
+        tx->tx_rdma_md.eq_handle = kptllnd_data.kptl_eqh;
+        tx->tx_rdma_md.options   = PTL_MD_LUSTRE_COMPLETION_SEMANTICS |
+                                   PTL_MD_EVENT_START_DISABLE;
+        switch (tx->tx_type) {
+        default:
+                LBUG();
+                
+        case TX_TYPE_PUT_REQUEST:               /* passive: peer gets */
+                tx->tx_rdma_md.threshold = 1;   /* GET event */
+                tx->tx_rdma_md.options |= PTL_MD_OP_GET;
+                break;
+
+        case TX_TYPE_GET_REQUEST:               /* passive: peer puts */
+                tx->tx_rdma_md.threshold = 1;   /* PUT event */
+                tx->tx_rdma_md.options |= PTL_MD_OP_PUT;
+                break;
+                
+        case TX_TYPE_PUT_RESPONSE:              /* active: I get */
+                tx->tx_rdma_md.threshold = 2;   /* SEND + REPLY */
+                break;
+                
+        case TX_TYPE_GET_RESPONSE:              /* active: I put */
+                tx->tx_rdma_md.threshold = 1;   /* SEND */
+                break;
+        }
+
+        /* zero-length RDMA: no fragments to describe */
+        if (nob == 0) {
+                tx->tx_rdma_md.length = 0;
+                return;
+        }
+
+#ifdef _USING_LUSTRE_PORTALS_
+        if (iov != NULL) {
+                tx->tx_rdma_md.options |= PTL_MD_IOVEC;
+                tx->tx_rdma_md.length = 
+                        lnet_extract_iov(PTL_MD_MAX_IOV, tx->tx_rdma_frags->iov,
+                                         niov, iov, offset, nob);
+                return;
+        }
+
+        /* Cheating OK since ptl_kiov_t == lnet_kiov_t */
+        CLASSERT(sizeof(ptl_kiov_t) == sizeof(lnet_kiov_t));
+        CLASSERT(offsetof(ptl_kiov_t, kiov_offset) ==
+                 offsetof(lnet_kiov_t, kiov_offset));
+        CLASSERT(offsetof(ptl_kiov_t, kiov_page) ==
+                 offsetof(lnet_kiov_t, kiov_page));
+        CLASSERT(offsetof(ptl_kiov_t, kiov_len) ==
+                 offsetof(lnet_kiov_t, kiov_len));
+        
+        tx->tx_rdma_md.options |= PTL_MD_KIOV;
+        tx->tx_rdma_md.length = 
+                lnet_extract_kiov(PTL_MD_MAX_IOV, tx->tx_rdma_frags->kiov,
+                                  niov, kiov, offset, nob);
+#else
+        if (iov != NULL) {
+                tx->tx_rdma_md.options |= PTL_MD_IOVEC;
+                tx->tx_rdma_md.length = 
+                        kptllnd_extract_iov(PTL_MD_MAX_IOV, tx->tx_rdma_frags->iov,
+                                            niov, iov, offset, nob);
+                return;
+        }
+
+        /* non-Lustre portals can't take a kiov directly: convert pages
+         * to physical-address iovec fragments */
+        tx->tx_rdma_md.options |= PTL_MD_IOVEC | PTL_MD_PHYS;
+        tx->tx_rdma_md.length =
+                kptllnd_extract_phys(PTL_MD_MAX_IOV, tx->tx_rdma_frags->iov,
+                                     niov, kiov, offset, nob);
+#endif
+}
+
+/* Start the active side of an RDMA for 'rx': PtlGet the peer's source
+ * buffer (TX_TYPE_PUT_RESPONSE) or PtlPut into its sink buffer
+ * (TX_TYPE_GET_RESPONSE), matched on the matchbits the peer supplied
+ * in its rdma request message.
+ *
+ * Error returns (-ENOMEM/-EIO) happen only before the tx has been
+ * queued on the peer.  Once it is queued, 0 is always returned and
+ * completion — including failure of the Ptl operation itself, which
+ * closes the peer — is reported via lnet_finalize() at tx teardown. */
+int
+kptllnd_active_rdma(kptl_rx_t *rx, lnet_msg_t *lntmsg, int type,
+                    unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+                    unsigned int offset, int nob)
+{
+        kptl_tx_t       *tx;
+        ptl_err_t        ptlrc;
+        kptl_msg_t      *rxmsg = rx->rx_msg;
+        kptl_peer_t     *peer = rx->rx_peer;
+        unsigned long    flags;
+        ptl_handle_md_t  mdh;
+
+        LASSERT (type == TX_TYPE_PUT_RESPONSE || 
+                 type == TX_TYPE_GET_RESPONSE);
+
+        tx = kptllnd_get_idle_tx(type);
+        if (tx == NULL) {
+                CERROR ("Can't do %s rdma to %s: can't allocate descriptor\n",
+                        type == TX_TYPE_PUT_RESPONSE ? "GET" : "PUT",
+                        libcfs_id2str(peer->peer_id));
+                return -ENOMEM;
+        }
+
+        kptllnd_set_tx_peer(tx, peer);
+        kptllnd_init_rdma_md(tx, niov, iov, kiov, offset, nob);
+
+        ptlrc = PtlMDBind(kptllnd_data.kptl_nih, tx->tx_rdma_md, 
+                          PTL_UNLINK, &mdh);
+        if (ptlrc != PTL_OK) {
+                CERROR("PtlMDBind(%s) failed: %d\n",
+                       libcfs_id2str(peer->peer_id), ptlrc);
+                tx->tx_status = -EIO;
+                kptllnd_tx_decref(tx);
+                return -EIO;
+        }
+        
+        spin_lock_irqsave(&peer->peer_lock, flags);
+
+        tx->tx_lnet_msg = lntmsg;
+        /* lnet_finalize() will be called when tx is torn down, so I must
+         * return success from here on... */
+
+        tx->tx_deadline = jiffies + (*kptllnd_tunables.kptl_timeout * HZ);
+        tx->tx_rdma_mdh = mdh;
+        tx->tx_active = 1;
+        list_add_tail(&tx->tx_list, &peer->peer_activeq);
+
+        /* peer has now got my ref on 'tx' */
+
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+        if (type == TX_TYPE_GET_RESPONSE)
+                ptlrc = PtlPut(mdh,
+                               PTL_NOACK_REQ,
+                               rx->rx_initiator,
+                               *kptllnd_tunables.kptl_portal,
+                               0,                     /* acl cookie */
+                               rxmsg->ptlm_u.rdma.kptlrm_matchbits,
+                               0,                     /* offset */
+                               (lntmsg != NULL) ?     /* header data */
+                               PTLLND_RDMA_OK :
+                               PTLLND_RDMA_FAIL);
+        else
+                ptlrc = PtlGet(mdh,
+                               rx->rx_initiator,
+                               *kptllnd_tunables.kptl_portal,
+                               0,                     /* acl cookie */
+                               rxmsg->ptlm_u.rdma.kptlrm_matchbits,
+                               0);                    /* offset */
+
+        if (ptlrc != PTL_OK) {
+                CERROR("Ptl%s failed: %d\n", 
+                       (type == TX_TYPE_GET_RESPONSE) ? "Put" : "Get", ptlrc);
+                
+                kptllnd_peer_close(peer, -EIO);
+                /* Everything (including this RDMA) queued on the peer will
+                 * be completed with failure */
+        }
+
+        return 0;
+}
+
+/* LND API: transmit 'lntmsg' to its target.
+ *
+ * Payloads small enough to fit in kptl_max_msg_size travel inline in a
+ * PTLLND_MSG_TYPE_IMMEDIATE message.  Larger PUT/REPLY payloads post
+ * an RDMA descriptor and send a PTLLND_MSG_TYPE_PUT request; large,
+ * non-routed GETs post the reply buffer for RDMA and send a
+ * PTLLND_MSG_TYPE_GET request (routed GETs fall back to immediate).
+ *
+ * Returns 0 once the tx is launched (completion is then reported via
+ * lnet_finalize() at tx teardown) or -ENOMEM when no descriptor or
+ * reply message could be allocated. */
+int
+kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+        lnet_hdr_t       *hdr = &lntmsg->msg_hdr;
+        int               type = lntmsg->msg_type;
+        lnet_process_id_t target = lntmsg->msg_target;
+        int               target_is_router = lntmsg->msg_target_is_router;
+        int               routing = lntmsg->msg_routing;
+        unsigned int      payload_niov = lntmsg->msg_niov;
+        struct iovec     *payload_iov = lntmsg->msg_iov;
+        lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
+        unsigned int      payload_offset = lntmsg->msg_offset;
+        unsigned int      payload_nob = lntmsg->msg_len;
+        kptl_tx_t        *tx;
+        int               nob;
+
+        LASSERT (payload_nob == 0 || payload_niov > 0);
+        LASSERT (payload_niov <= LNET_MAX_IOV);
+        LASSERT (payload_niov <= PTL_MD_MAX_IOV); /* !!! */
+        LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+        LASSERT (!in_interrupt());
+
+        switch (type) {
+        default:
+                LBUG();
+                return -EINVAL;
+
+        case LNET_MSG_REPLY:
+        case LNET_MSG_PUT:
+                /* Is the payload small enough not to need RDMA? */
+                nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[payload_nob]);
+                if (nob <= *kptllnd_tunables.kptl_max_msg_size)
+                        break;
+
+                tx = kptllnd_get_idle_tx(TX_TYPE_PUT_REQUEST);
+                if (tx == NULL) {
+                        CERROR("Can't send %s to %s: can't allocate descriptor\n",
+                               lnet_msgtyp2str(type),
+                               libcfs_id2str(target));
+                        return -ENOMEM;
+                }
+
+                kptllnd_init_rdma_md(tx, payload_niov, 
+                                     payload_iov, payload_kiov,
+                                     payload_offset, payload_nob);
+
+                tx->tx_lnet_msg = lntmsg;
+                tx->tx_msg->ptlm_u.rdma.kptlrm_hdr = *hdr;
+                kptllnd_init_msg (tx->tx_msg, PTLLND_MSG_TYPE_PUT,
+                                  sizeof(kptl_rdma_msg_t));
+                kptllnd_tx_launch(tx, target);
+                return 0;
+
+        case LNET_MSG_GET:
+                /* routed gets don't RDMA */
+                if (target_is_router || routing)
+                        break;
+
+                /* Is the payload small enough not to need RDMA? */
+                nob = lntmsg->msg_md->md_length;
+                nob = offsetof(kptl_msg_t, 
+                               ptlm_u.immediate.kptlim_payload[nob]);
+                if (nob <= *kptllnd_tunables.kptl_max_msg_size)
+                        break;
+
+                tx = kptllnd_get_idle_tx(TX_TYPE_GET_REQUEST);
+                if (tx == NULL) {
+                        CERROR("Can't send GET to %s: can't allocate descriptor\n",
+                               libcfs_id2str(target));
+                        return -ENOMEM;
+                }
+
+                /* reply message must exist before the peer RDMAs into it */
+                tx->tx_lnet_replymsg =
+                        lnet_create_reply_msg(kptllnd_data.kptl_ni, lntmsg);
+                if (tx->tx_lnet_replymsg == NULL) {
+                        CERROR("Failed to allocate LNET reply for %s\n",
+                               libcfs_id2str(target));
+                        kptllnd_tx_decref(tx);
+                        return -ENOMEM;
+                }
+
+                if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+                        kptllnd_init_rdma_md(tx, lntmsg->msg_md->md_niov,
+                                             lntmsg->msg_md->md_iov.iov, NULL,
+                                             0, lntmsg->msg_md->md_length);
+                else
+                        kptllnd_init_rdma_md(tx, lntmsg->msg_md->md_niov,
+                                             NULL, lntmsg->msg_md->md_iov.kiov,
+                                             0, lntmsg->msg_md->md_length);
+                
+                tx->tx_lnet_msg = lntmsg;
+                tx->tx_msg->ptlm_u.rdma.kptlrm_hdr = *hdr;
+                kptllnd_init_msg (tx->tx_msg, PTLLND_MSG_TYPE_GET,
+                                  sizeof(kptl_rdma_msg_t));
+                kptllnd_tx_launch(tx, target);
+                return 0;
+
+        case LNET_MSG_ACK:
+                CDEBUG(D_NET, "LNET_MSG_ACK\n");
+                LASSERT (payload_nob == 0);
+                break;
+        }
+
+        /* send as an immediate message: payload copied into the msg body */
+        tx = kptllnd_get_idle_tx(TX_TYPE_SMALL_MESSAGE);
+        if (tx == NULL) {
+                CERROR("Can't send %s to %s: can't allocate descriptor\n",
+                       lnet_msgtyp2str(type), libcfs_id2str(target));
+                return -ENOMEM;
+        }
+
+        tx->tx_lnet_msg = lntmsg;
+        tx->tx_msg->ptlm_u.immediate.kptlim_hdr = *hdr;
+
+        if (payload_kiov != NULL)
+                lnet_copy_kiov2flat(*kptllnd_tunables.kptl_max_msg_size,
+                                    tx->tx_msg->ptlm_u.immediate.kptlim_payload,
+                                    0,
+                                    payload_niov, payload_kiov,
+                                    payload_offset, payload_nob);
+        else
+                lnet_copy_iov2flat(*kptllnd_tunables.kptl_max_msg_size,
+                                   tx->tx_msg->ptlm_u.immediate.kptlim_payload,
+                                   0,
+                                   payload_niov, payload_iov,
+                                   payload_offset, payload_nob);
+
+        nob = offsetof(kptl_immediate_msg_t, kptlim_payload[payload_nob]);
+        kptllnd_init_msg(tx->tx_msg, PTLLND_MSG_TYPE_IMMEDIATE, nob);
+        kptllnd_tx_launch(tx, target);
+        return 0;
+}
+
+/* LND API: make 'rx' an eager receive.  Copies the incoming request
+ * out of the posted rx buffer into space reserved in the descriptor
+ * (rx->rx_space) and drops the buffer ref so the buffer can be
+ * reposted.  When PTL_MD_LOCAL_ALIGN8 is disabled the copy may already
+ * have been done to fix alignment (rx->rx_rxb == NULL). */
+int 
+kptllnd_eager_recv(struct lnet_ni *ni, void *private,
+                   lnet_msg_t *msg, void **new_privatep)
+{
+        kptl_rx_t        *rx = private;
+
+        CDEBUG(D_NET, "Eager RX=%p RXB=%p\n", rx, rx->rx_rxb);
+
+        /* I have to release my ref on rxb (if I have one) to ensure I'm an
+         * eager receiver, so I copy the incoming request from the buffer it
+         * landed in, into space reserved in the descriptor... */
+
+#if (PTL_MD_LOCAL_ALIGN8 == 0)
+        if (rx->rx_rxb == NULL)                 /* already copied */
+                return 0;                       /* to fix alignment */
+#else
+        LASSERT(rx->rx_rxb != NULL);
+#endif
+        LASSERT(rx->rx_nob <= *kptllnd_tunables.kptl_max_msg_size);
+
+        memcpy(rx->rx_space, rx->rx_msg, rx->rx_nob);
+        rx->rx_msg = (kptl_msg_t *)rx->rx_space;
+
+        kptllnd_rx_buffer_decref(rx->rx_rxb);
+        rx->rx_rxb = NULL;
+
+        return 0;
+}
+
+
+/* LND API: deliver the payload of a received message 'rx' into the
+ * caller's buffers ('iov' OR 'kiov', never both).  Immediate payloads
+ * are copied straight out of the message; PUT/GET requests start the
+ * active side of the RDMA via kptllnd_active_rdma().  The rx is always
+ * released before returning. */
+int 
+kptllnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+              unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+              unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+        kptl_rx_t    *rx = private;
+        kptl_msg_t   *rxmsg = rx->rx_msg;
+        int           nob;
+        int           rc;
+
+        CDEBUG(D_NET, "%s niov=%d offset=%d mlen=%d rlen=%d\n",
+               kptllnd_msgtype2str(rxmsg->ptlm_type),
+               niov, offset, mlen, rlen);
+
+        LASSERT (mlen <= rlen);
+        /* NOTE(review): 'mlen' is unsigned, so this assertion is always
+         * true and verifies nothing. */
+        LASSERT (mlen >= 0);
+        LASSERT (!in_interrupt());
+        LASSERT (!(kiov != NULL && iov != NULL)); /* never both */
+        LASSERT (niov <= PTL_MD_MAX_IOV);       /* !!! */
+
+#ifdef CRAY_XT3
+        if (lntmsg != NULL &&
+            rx->rx_uid != 0) {
+                /* Set the UID if the sender's uid isn't 0; i.e. non-root
+                 * running in userspace (e.g. a catamount node; linux kernel
+                 * senders, including routers have uid 0).  If this is a lustre
+                 * RPC request, this tells lustre not to trust the creds in the
+                 * RPC message body. */
+                lnet_set_msg_uid(ni, lntmsg, rx->rx_uid);
+        }
+#endif
+        switch(rxmsg->ptlm_type)
+        {
+        default:
+                LBUG();
+                rc = -EINVAL;
+                break;
+
+        case PTLLND_MSG_TYPE_IMMEDIATE:
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_IMMEDIATE %d,%d\n", mlen, rlen);
+
+                nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[rlen]);
+                if (nob > rx->rx_nob) {
+                        CERROR ("Immediate message from %s too big: %d(%d)\n",
+                                libcfs_id2str(rx->rx_peer->peer_id), nob,
+                                rx->rx_nob);
+                        rc = -EINVAL;
+                        break;
+                }
+
+                if (kiov != NULL)
+                        lnet_copy_flat2kiov(
+                                niov, kiov, offset,
+                                *kptllnd_tunables.kptl_max_msg_size,
+                                rxmsg->ptlm_u.immediate.kptlim_payload,
+                                0,
+                                mlen);
+                else
+                        lnet_copy_flat2iov(
+                                niov, iov, offset,
+                                *kptllnd_tunables.kptl_max_msg_size,
+                                rxmsg->ptlm_u.immediate.kptlim_payload,
+                                0,
+                                mlen);
+
+                lnet_finalize (ni, lntmsg, 0);
+                rc = 0;
+                break;
+
+        case PTLLND_MSG_TYPE_GET:
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_GET %d,%d\n", mlen, rlen);
+
+                /* NB always send RDMA so the peer can complete.  I send
+                 * success/failure in the portals 'hdr_data' */
+
+                if (lntmsg == NULL)
+                        rc = kptllnd_active_rdma(rx, NULL,
+                                                 TX_TYPE_GET_RESPONSE,
+                                                 0, NULL, NULL, 0, 0);
+                else
+                        rc = kptllnd_active_rdma(rx, lntmsg, 
+                                                 TX_TYPE_GET_RESPONSE,
+                                                 lntmsg->msg_niov,
+                                                 lntmsg->msg_iov, 
+                                                 lntmsg->msg_kiov,
+                                                 lntmsg->msg_offset, 
+                                                 lntmsg->msg_len);
+                break;
+
+        case PTLLND_MSG_TYPE_PUT:
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_PUT %d,%d\n", mlen, rlen);
+
+                /* NB always send RDMA so the peer can complete; it'll be 0
+                 * bytes if there was no match (lntmsg == NULL). I have no way
+                 * to let my peer know this, but she's only interested in when
+                 * the net has stopped accessing her buffer in any case. */
+
+                rc = kptllnd_active_rdma(rx, lntmsg, TX_TYPE_PUT_RESPONSE,
+                                         niov, iov, kiov, offset, mlen);
+                break;
+        }
+
+        /*
+         * We're done with the RX
+         */
+        kptllnd_rx_done(rx);
+        return rc;
+}
+
+/* Portals event queue handler: demultiplex 'ev' to the tx or rx-buffer
+ * callback according to the kptl_eventarg_t stashed in the MD's
+ * user_ptr at MD-creation time. */
+void
+kptllnd_eq_callback(ptl_event_t *ev)
+{
+        kptl_eventarg_t *eva = ev->md.user_ptr;
+
+        switch (eva->eva_type) {
+        default:
+                LBUG();
+                
+        case PTLLND_EVENTARG_TYPE_MSG:
+        case PTLLND_EVENTARG_TYPE_RDMA:
+                kptllnd_tx_callback(ev);
+                break;
+                
+        case PTLLND_EVENTARG_TYPE_BUF:
+                kptllnd_rx_buffer_callback(ev);
+                break;
+        }
+}
+
+/* Called by every LND thread on exit to drop its count in
+ * kptl_nthreads (shutdown waits for this to reach zero). */
+void
+kptllnd_thread_fini (void)
+{
+        atomic_dec(&kptllnd_data.kptl_nthreads);
+}
+
+/* Spawn a kernel thread running fn(arg), accounting it in
+ * kptl_nthreads.  Returns 0 on success; on failure the thread count is
+ * rolled back and the (negative) kernel_thread error is returned. */
+int
+kptllnd_thread_start (int (*fn)(void *arg), void *arg)
+{
+        long                pid;
+
+        /* count the thread before it exists so shutdown can't miss it */
+        atomic_inc(&kptllnd_data.kptl_nthreads);
+
+        pid = kernel_thread (fn, arg, 0);
+        if (pid >= 0)
+                return 0;
+        
+        CERROR("Failed to start kernel_thread: error %d\n", (int)pid);
+        kptllnd_thread_fini();
+        return (int)pid;
+}
+
+/* Watchdog thread 'id': periodically sweeps a proportion of the peer
+ * hash table checking for RDMA timeouts, and handles peers that are
+ * closing.  Between sweeps it sleeps on kptl_watchdog_waitq.  Runs
+ * until shutdown phase 2 (all peers destroyed). */
+int
+kptllnd_watchdog(void *arg)
+{
+        int                 id = (long)arg;
+        char                name[16];
+        wait_queue_t        waitlink;
+        int                 peer_index = 0;
+        unsigned long       deadline = jiffies;
+        int                 timeout;
+        int                 i;
+
+        snprintf(name, sizeof(name), "kptllnd_wd_%02d", id);
+        cfs_daemonize(name);
+        cfs_block_allsigs();
+
+        init_waitqueue_entry(&waitlink, current);
+
+        /* threads shut down in phase 2 after all peers have been destroyed */
+        while (kptllnd_data.kptl_shutdown < 2) {
+
+                timeout = (int)(deadline - jiffies);
+
+                if (timeout <= 0) {
+                        const int n = 4;
+                        const int p = 1;
+                        int       chunk = kptllnd_data.kptl_peer_hash_size;
+
+                        /* Time to check for RDMA timeouts on a few more
+                         * peers: I do checks every 'p' seconds on a
+                         * proportion of the peer table and I need to check
+                         * every connection 'n' times within a timeout
+                         * interval, to ensure I detect a timeout on any
+                         * connection within (n+1)/n times the timeout
+                         * interval. */
+
+                        if ((*kptllnd_tunables.kptl_timeout) > n * p)
+                                chunk = (chunk * n * p) /
+                                        (*kptllnd_tunables.kptl_timeout);
+                        if (chunk == 0)
+                                chunk = 1;
+
+                        for (i = 0; i < chunk; i++) {
+                                kptllnd_peer_check_bucket(peer_index);
+                                peer_index = (peer_index + 1) %
+                                     kptllnd_data.kptl_peer_hash_size;
+                        }
+
+                        deadline += p * HZ;
+                        continue;
+                }
+
+                kptllnd_handle_closing_peers();
+
+                set_current_state(TASK_INTERRUPTIBLE);
+                add_wait_queue_exclusive(&kptllnd_data.kptl_watchdog_waitq,
+                                         &waitlink);
+
+                schedule_timeout(timeout);
+
+                set_current_state (TASK_RUNNING);
+                remove_wait_queue(&kptllnd_data.kptl_watchdog_waitq, &waitlink);
+        }
+
+        kptllnd_thread_fini();
+        CDEBUG(D_NET, "<<<\n");
+        return (0);
+}
+
+/* Scheduler thread 'id': drains the rx-parse, rx-buffer-repost and
+ * tx-finalise queues, dropping kptl_sched_lock around each callback.
+ * After kptl_reschedule_loops consecutive busy passes it yields with
+ * cond_resched(); when idle it sleeps on kptl_sched_waitq.  Exits at
+ * shutdown phase 2 (all peers destroyed). */
+int
+kptllnd_scheduler (void *arg)
+{
+        int                 id = (long)arg;
+        char                name[16];
+        wait_queue_t        waitlink;
+        unsigned long       flags;
+        int                 did_something;
+        int                 counter = 0;
+        kptl_rx_t          *rx;
+        kptl_rx_buffer_t   *rxb;
+        kptl_tx_t          *tx;
+
+        snprintf(name, sizeof(name), "kptllnd_sd_%02d", id);
+        cfs_daemonize(name);
+        cfs_block_allsigs();
+
+        init_waitqueue_entry(&waitlink, current);
+
+        spin_lock_irqsave(&kptllnd_data.kptl_sched_lock, flags);
+
+        /* threads shut down in phase 2 after all peers have been destroyed */
+        while (kptllnd_data.kptl_shutdown < 2) {
+
+                did_something = 0;
+
+                /* received messages waiting to be parsed */
+                if (!list_empty(&kptllnd_data.kptl_sched_rxq)) {
+                        rx = list_entry (kptllnd_data.kptl_sched_rxq.next,
+                                         kptl_rx_t, rx_list);
+                        list_del(&rx->rx_list);
+                        
+                        spin_unlock_irqrestore(&kptllnd_data.kptl_sched_lock,
+                                               flags);
+
+                        kptllnd_rx_parse(rx);
+                        did_something = 1;
+
+                        spin_lock_irqsave(&kptllnd_data.kptl_sched_lock, flags);
+                }
+
+                /* rx buffers waiting to be reposted */
+                if (!list_empty(&kptllnd_data.kptl_sched_rxbq)) {
+                        rxb = list_entry (kptllnd_data.kptl_sched_rxbq.next,
+                                          kptl_rx_buffer_t, rxb_repost_list);
+                        list_del(&rxb->rxb_repost_list);
+
+                        spin_unlock_irqrestore(&kptllnd_data.kptl_sched_lock,
+                                               flags);
+
+                        kptllnd_rx_buffer_post(rxb);
+                        did_something = 1;
+
+                        spin_lock_irqsave(&kptllnd_data.kptl_sched_lock, flags);
+                }
+
+                /* txs waiting to be finalised */
+                if (!list_empty(&kptllnd_data.kptl_sched_txq)) {
+                        tx = list_entry (kptllnd_data.kptl_sched_txq.next,
+                                         kptl_tx_t, tx_list);
+                        list_del_init(&tx->tx_list);
+
+                        spin_unlock_irqrestore(&kptllnd_data.kptl_sched_lock, flags);
+
+                        kptllnd_tx_fini(tx);
+                        did_something = 1;
+
+                        spin_lock_irqsave(&kptllnd_data.kptl_sched_lock, flags);
+                }
+
+                /* keep draining until kptl_reschedule_loops busy passes */
+                if (did_something) {
+                        if (++counter != *kptllnd_tunables.kptl_reschedule_loops)
+                                continue;
+                }
+
+                /* queue on the waitq *before* dropping the lock so a
+                 * wakeup between unlock and schedule() isn't lost */
+                set_current_state(TASK_INTERRUPTIBLE);
+                add_wait_queue_exclusive(&kptllnd_data.kptl_sched_waitq,
+                                         &waitlink);
+                spin_unlock_irqrestore(&kptllnd_data.kptl_sched_lock, flags);
+
+                if (!did_something)
+                        schedule(); 
+                else
+                        cond_resched();
+
+                set_current_state(TASK_RUNNING);
+                remove_wait_queue(&kptllnd_data.kptl_sched_waitq, &waitlink);
+
+                spin_lock_irqsave(&kptllnd_data.kptl_sched_lock, flags);
+
+                counter = 0;
+        }
+
+        spin_unlock_irqrestore(&kptllnd_data.kptl_sched_lock, flags);
+
+        kptllnd_thread_fini();
+        return 0;
+}
+
diff --git a/lnet/klnds/ptllnd/ptllnd_modparams.c b/lnet/klnds/ptllnd/ptllnd_modparams.c
new file mode 100644 (file)
index 0000000..84b62d6
--- /dev/null
@@ -0,0 +1,217 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
+ *   Author: PJ Kirner <pjkirner@clusterfs.com>
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   This file is confidential source code owned by Cluster File Systems.
+ *   No viewing, modification, compilation, redistribution, or any other
+ *   form of use is permitted except through a signed license agreement.
+ *
+ *   If you have not signed such an agreement, then you have no rights to
+ *   this file.  Please destroy it immediately and contact CFS.
+ *
+ */
+
+
+#include "ptllnd.h"
+
+/* Module parameters.  Each variable below is exported read-only (0444) or
+ * read-write (0644) via CFS_MODULE_PARM and referenced by pointer from the
+ * kptllnd_tunables table further down, so the rest of the LND reads them
+ * through *kptllnd_tunables.kptl_<name>. */
+static int ntx = 256;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+               "# of TX descriptors");
+
+static int max_nodes = 1152;
+CFS_MODULE_PARM(max_nodes, "i", int, 0444,
+               "maximum number of peer nodes");
+
+static int max_procs_per_node = 2;
+CFS_MODULE_PARM(max_procs_per_node, "i", int, 0444,
+               "maximum number of processes per peer node to cache");
+
+static int checksum = 0;
+CFS_MODULE_PARM(checksum, "i", int, 0644,
+               "set non-zero to enable message (not RDMA) checksums");
+
+static int timeout = 50;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+               "timeout (seconds)");
+
+/* Wire-protocol constants come from <lnet/ptllnd_wire.h>; both ends of a
+ * connection must agree on portal/pid. */
+static int portal = PTLLND_PORTAL;              /* <lnet/ptllnd_wire.h> */
+CFS_MODULE_PARM(portal, "i", int, 0444,
+               "portal id");
+
+static int pid = PTLLND_PID;                    /* <lnet/ptllnd_wire.h> */
+CFS_MODULE_PARM(pid, "i", int, 0444,
+               "portals pid");
+
+static int rxb_npages = 1;
+CFS_MODULE_PARM(rxb_npages, "i", int, 0444,
+               "# of pages per rx buffer");
+
+static int rxb_nspare = 8;
+CFS_MODULE_PARM(rxb_nspare, "i", int, 0444,
+                "# of spare rx buffers");
+
+static int credits = 128;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+               "concurrent sends");
+
+static int peercredits = PTLLND_PEERCREDITS;    /* <lnet/ptllnd_wire.h> */
+CFS_MODULE_PARM(peercredits, "i", int, 0444,
+               "concurrent sends to 1 peer");
+
+static int max_msg_size = PTLLND_MAX_MSG_SIZE;  /* <lnet/ptllnd_wire.h> */
+CFS_MODULE_PARM(max_msg_size, "i", int, 0444,
+               "max size of immediate message");
+
+static int peer_hash_table_size = 101;
+CFS_MODULE_PARM(peer_hash_table_size, "i", int, 0444,
+               "# of slots in the peer hash table");
+
+static int reschedule_loops = 100;
+CFS_MODULE_PARM(reschedule_loops, "i", int, 0644,
+                "# of loops before scheduler does cond_resched()");
+
+#ifdef CRAY_XT3
+static int ptltrace_on_timeout = 0;
+CFS_MODULE_PARM(ptltrace_on_timeout, "i", int, 0644,
+               "dump ptltrace on timeout");
+
+static char *ptltrace_basename = "/tmp/lnet-ptltrace";
+CFS_MODULE_PARM(ptltrace_basename, "s", charp, 0644,
+                "ptltrace dump file basename");
+#endif
+#ifdef PJK_DEBUGGING
+static int simulation_bitmap = 0;
+CFS_MODULE_PARM(simulation_bitmap, "i", int, 0444,
+               "simulation bitmap");
+#endif
+
+
+/* Global tunables table: one pointer per module parameter above.  Consumers
+ * dereference through this struct rather than touching the statics, which
+ * keeps the parameter storage private to this file. */
+kptl_tunables_t kptllnd_tunables = {
+        .kptl_ntx                    = &ntx,
+        .kptl_max_nodes              = &max_nodes,
+        .kptl_max_procs_per_node     = &max_procs_per_node,
+        .kptl_checksum               = &checksum,
+        .kptl_portal                 = &portal,
+        .kptl_pid                    = &pid,
+        .kptl_timeout                = &timeout,
+        .kptl_rxb_npages             = &rxb_npages,
+        .kptl_rxb_nspare             = &rxb_nspare,
+        .kptl_credits                = &credits,
+        .kptl_peercredits            = &peercredits,
+        .kptl_max_msg_size           = &max_msg_size,
+        .kptl_peer_hash_table_size   = &peer_hash_table_size,
+        .kptl_reschedule_loops       = &reschedule_loops,
+#ifdef CRAY_XT3
+        .kptl_ptltrace_on_timeout    = &ptltrace_on_timeout,
+        .kptl_ptltrace_basename      = &ptltrace_basename,
+#endif
+#ifdef PJK_DEBUGGING
+        .kptl_simulation_bitmap      = &simulation_bitmap,
+#endif
+};
+
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+#ifdef CRAY_XT3
+/* Static backing store for the ptltrace_basename sysctl entry: sysctl string
+ * handlers need a fixed-size writable buffer, not the module-param charp. */
+static char ptltrace_basename_space[1024];
+
+/* Copy the current value of a string module parameter into 'space' (always
+ * NUL-terminating) and repoint the parameter at that buffer, so sysctl writes
+ * land in memory we own. */
+static void
+kptllnd_init_strtunable(char **str_param, char *space, int size)
+{
+        strncpy(space, *str_param, size);
+        space[size - 1] = 0;
+        *str_param = space;
+}
+#endif
+
+/* /proc/sys tunables.  Entry layout appears to be the legacy ctl_table
+ * initializer {ctl_name, procname, data, maxlen, mode, child, proc_handler}
+ * -- TODO confirm against the kernel version this targets.
+ * NOTE(review): rxb_nspare has a module parameter but no sysctl entry here
+ * (numbering jumps 8 -> 9); and reschedule_loops is 0444 here but 0644 as a
+ * module param -- confirm whether either is intentional. */
+static ctl_table kptllnd_ctl_table[] = {
+       {1, "ntx", &ntx,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {2, "max_nodes", &max_nodes,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {3, "max_procs_per_node", &max_procs_per_node,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {4, "checksum", &checksum,
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {5, "timeout", &timeout,
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {6, "portal", &portal,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {7, "pid", &pid,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {8, "rxb_npages", &rxb_npages,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {9, "credits", &credits,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {10, "peercredits", &peercredits,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {11, "max_msg_size", &max_msg_size,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {12, "peer_hash_table_size", &peer_hash_table_size,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {13, "reschedule_loops", &reschedule_loops,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+#ifdef CRAY_XT3
+       {14, "ptltrace_on_timeout", &ptltrace_on_timeout,
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {15, "ptltrace_basename", ptltrace_basename_space,
+        sizeof(ptltrace_basename_space), 0644, NULL, &proc_dostring,
+        &sysctl_string},
+#endif
+#ifdef PJK_DEBUGGING
+       {16, "simulation_bitmap", &simulation_bitmap,
+        sizeof(int), 0444, NULL, &proc_dointvec},
+#endif
+
+       {0}
+};
+
+/* Parent directory: /proc/sys/ptllnd (ctl_name 203). */
+static ctl_table kptllnd_top_ctl_table[] = {
+       {203, "ptllnd", NULL, 0, 0555, kptllnd_ctl_table},
+       {0}
+};
+
+/* Register the /proc/sys/ptllnd tree.  Registration failure is non-fatal:
+ * the module still works, it just loses runtime tunability, hence the CWARN
+ * and unconditional return 0. */
+int
+kptllnd_tunables_init ()
+{
+#ifdef CRAY_XT3
+        /* Move ptltrace_basename into a sysctl-writable buffer first. */
+        kptllnd_init_strtunable(&ptltrace_basename,
+                                ptltrace_basename_space,
+                                sizeof(ptltrace_basename_space));
+#endif
+       kptllnd_tunables.kptl_sysctl =
+               register_sysctl_table(kptllnd_top_ctl_table, 0);
+
+       if (kptllnd_tunables.kptl_sysctl == NULL)
+               CWARN("Can't setup /proc tunables\n");
+
+       return 0;
+}
+
+/* Unregister the sysctl tree if registration succeeded. */
+void
+kptllnd_tunables_fini ()
+{
+       if (kptllnd_tunables.kptl_sysctl != NULL)
+               unregister_sysctl_table(kptllnd_tunables.kptl_sysctl);
+}
+
+#else
+
+/* No sysctl support configured: module parameters only. */
+int
+kptllnd_tunables_init ()
+{
+       return 0;
+}
+
+void
+kptllnd_tunables_fini ()
+{
+}
+
+#endif
+
diff --git a/lnet/klnds/ptllnd/ptllnd_peer.c b/lnet/klnds/ptllnd/ptllnd_peer.c
new file mode 100644 (file)
index 0000000..cefcb7d
--- /dev/null
@@ -0,0 +1,1209 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
+ *   Author: PJ Kirner <pjkirner@clusterfs.com>
+ *           E Barton <eeb@bartonsoftware.com>
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   This file is confidential source code owned by Cluster File Systems.
+ *   No viewing, modification, compilation, redistribution, or any other
+ *   form of use is permitted except through a signed license agreement.
+ *
+ *   If you have not signed such an agreement, then you have no rights to
+ *   this file.  Please destroy it immediately and contact CFS.
+ *
+ */
+
+#include "ptllnd.h"
+#include <libcfs/list.h>
+
+/* Return the number of entries on list 'q'.  O(n); callers must hold
+ * whatever lock protects the list. */
+static int
+kptllnd_count_queue(struct list_head *q)
+{
+        struct list_head *e;
+        int               n = 0;
+        
+        list_for_each(e, q) {
+                n++;
+        }
+
+        return n;
+}
+
+/* Snapshot state of the 'index'th peer (in hash-table iteration order) into
+ * the caller's out-parameters.  Returns 0 on success, -ENOENT if fewer than
+ * index+1 peers exist.  Used by the proc/ioctl inspection path; iteration
+ * order is not stable across calls if the table changes. */
+int
+kptllnd_get_peer_info(int index, 
+                      lnet_process_id_t *id,
+                      int *state, int *sent_hello,
+                      int *refcount, __u64 *incarnation,
+                      __u64 *next_matchbits, __u64 *last_matchbits_seen,
+                      int *nsendq, int *nactiveq,
+                      int *credits, int *outstanding_credits) 
+{
+        rwlock_t         *g_lock = &kptllnd_data.kptl_peer_rw_lock;
+        unsigned long     flags;
+        struct list_head *ptmp;
+        kptl_peer_t      *peer;
+        int               i;
+        int               rc = -ENOENT;
+
+        read_lock_irqsave(g_lock, flags);
+
+        for (i = 0; i < kptllnd_data.kptl_peer_hash_size; i++) {
+                
+                list_for_each (ptmp, &kptllnd_data.kptl_peers[i]) {
+                        peer = list_entry(ptmp, kptl_peer_t, peer_list);
+
+                        /* Skip peers until we reach the requested index. */
+                        if (index-- > 0)
+                                continue;
+                        
+                        *id          = peer->peer_id;
+                        *state       = peer->peer_state;
+                        *sent_hello  = peer->peer_sent_hello;
+                        *refcount    = atomic_read(&peer->peer_refcount);
+                        *incarnation = peer->peer_incarnation;
+
+                        /* per-peer lock nests inside the table read lock */
+                        spin_lock(&peer->peer_lock);
+
+                        *next_matchbits      = peer->peer_next_matchbits;
+                        *last_matchbits_seen = peer->peer_last_matchbits_seen;
+                        *credits             = peer->peer_credits;
+                        *outstanding_credits = peer->peer_outstanding_credits;
+
+                        *nsendq   = kptllnd_count_queue(&peer->peer_sendq);
+                        *nactiveq = kptllnd_count_queue(&peer->peer_activeq);
+
+                        spin_unlock(&peer->peer_lock);
+
+                        rc = 0;
+                        goto out;
+                }
+        }
+        
+ out:
+        read_unlock_irqrestore(g_lock, flags);
+        return rc;
+}
+
+/* Add 'peer' to the peer hash table, taking a list ref.  Caller must hold
+ * kptl_peer_rw_lock for writing and must already have reserved table space
+ * (asserted via kptl_n_active_peers < kptl_expected_peers). */
+void
+kptllnd_peer_add_peertable_locked (kptl_peer_t *peer)
+{
+        LASSERT (kptllnd_data.kptl_n_active_peers <
+                 kptllnd_data.kptl_expected_peers);
+
+        LASSERT (peer->peer_state == PEER_STATE_WAITING_HELLO ||
+                 peer->peer_state == PEER_STATE_ACTIVE);
+        
+        kptllnd_data.kptl_n_active_peers++;
+        atomic_inc(&peer->peer_refcount);       /* +1 ref for the list */
+
+        /* NB add to HEAD of peer list for MRU order!
+         * (see kptllnd_cull_peertable) */
+        list_add(&peer->peer_list, kptllnd_nid2peerlist(peer->peer_id.nid));
+}
+
+/* Close the least-recently-added peers sharing pid.nid so that at most
+ * (max_procs_per_node - 1) remain, making room for a new peer with this NID.
+ * Caller holds kptl_peer_rw_lock for writing. */
+void
+kptllnd_cull_peertable_locked (lnet_process_id_t pid)
+{
+        /* I'm about to add a new peer with this portals ID to the peer table,
+         * so (a) this peer should not exist already and (b) I want to leave at
+         * most (max_procs_per_nid - 1) peers with this NID in the table. */
+        struct list_head  *peers = kptllnd_nid2peerlist(pid.nid);
+        int                cull_count = *kptllnd_tunables.kptl_max_procs_per_node;
+        int                count;
+        struct list_head  *tmp;
+        struct list_head  *nxt;
+        kptl_peer_t       *peer;
+        
+        count = 0;
+        list_for_each_safe (tmp, nxt, peers) {
+                /* NB I rely on kptllnd_peer_add_peertable_locked to add peers
+                 * in MRU order */
+                peer = list_entry(tmp, kptl_peer_t, peer_list);
+                        
+                if (peer->peer_id.nid != pid.nid)
+                        continue;
+
+                /* the new peer must not already be in the table */
+                LASSERT (peer->peer_id.pid != pid.pid);
+                        
+                count++;
+
+                if (count < cull_count) /* recent (don't cull) */
+                        continue;
+
+                CDEBUG(D_NET, "Cull %s(%s)\n",
+                       libcfs_id2str(peer->peer_id),
+                       kptllnd_ptlid2str(peer->peer_ptlid));
+                
+                kptllnd_peer_close_locked(peer, 0);
+        }
+}
+
+/* Allocate and initialise a peer in PEER_STATE_ALLOCATED with one reference
+ * for the caller.  Returns NULL on allocation failure or if the LND is
+ * shutting down (kptl_npeers is only bumped under the write lock so shutdown
+ * can wait for it to drain). */
+kptl_peer_t *
+kptllnd_peer_allocate (lnet_process_id_t lpid, ptl_process_id_t ppid)
+{
+        unsigned long    flags;
+        kptl_peer_t     *peer;
+
+        LIBCFS_ALLOC(peer, sizeof (*peer));
+        if (peer == NULL) {
+                CERROR("Can't create peer %s (%s)\n",
+                       libcfs_id2str(lpid), 
+                       kptllnd_ptlid2str(ppid));
+                return NULL;
+        }
+
+        memset(peer, 0, sizeof(*peer));         /* zero flags etc */
+
+        INIT_LIST_HEAD (&peer->peer_sendq);
+        INIT_LIST_HEAD (&peer->peer_activeq);
+        spin_lock_init (&peer->peer_lock);
+
+        peer->peer_state = PEER_STATE_ALLOCATED;
+        peer->peer_error = 0;
+        peer->peer_last_alive = cfs_time_current();
+        peer->peer_id = lpid;
+        peer->peer_ptlid = ppid;
+        peer->peer_credits = 1;                 /* enough for HELLO */
+        peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS;
+        /* one outstanding credit is consumed by our own HELLO */
+        peer->peer_outstanding_credits = *kptllnd_tunables.kptl_peercredits - 1;
+
+        atomic_set(&peer->peer_refcount, 1);    /* 1 ref for caller */
+
+        write_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+        /* Only increase # peers under lock, to guarantee we dont grow it
+         * during shutdown */
+        if (kptllnd_data.kptl_shutdown) {
+                write_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, 
+                                        flags);
+                LIBCFS_FREE(peer, sizeof(*peer));
+                return NULL;
+        }
+
+        kptllnd_data.kptl_npeers++;
+        write_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags);
+        
+        return peer;
+}
+
+/* Final teardown when the last reference is dropped: unlink a zombie from
+ * its list, decrement the global peer count and free the memory.  Must not
+ * be called from interrupt context (takes a blocking-context write lock and
+ * frees memory). */
+void
+kptllnd_peer_destroy (kptl_peer_t *peer)
+{
+        unsigned long flags;
+        
+        CDEBUG(D_NET, "Peer=%p\n", peer);
+
+        LASSERT (!in_interrupt());
+        LASSERT (atomic_read(&peer->peer_refcount) == 0);
+        LASSERT (peer->peer_state == PEER_STATE_ALLOCATED ||
+                 peer->peer_state == PEER_STATE_ZOMBIE);
+        LASSERT (list_empty(&peer->peer_sendq));
+        LASSERT (list_empty(&peer->peer_activeq));
+
+        write_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+        /* zombies still sit on kptl_zombie_peers; ALLOCATED peers were
+         * never added to any list */
+        if (peer->peer_state == PEER_STATE_ZOMBIE)
+                list_del(&peer->peer_list);
+
+        kptllnd_data.kptl_npeers--;
+
+        write_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+        LIBCFS_FREE (peer, sizeof (*peer));
+}
+
+/* Abort every tx queued or active on 'peer': mark each inactive and fail it
+ * with -EIO.  The queues are spliced onto local list heads under the peer
+ * lock so the (potentially blocking) decrefs can happen unlocked. */
+void
+kptllnd_peer_cancel_txs(kptl_peer_t *peer)
+{
+        struct list_head   sendq;
+        struct list_head   activeq;
+        struct list_head  *tmp;
+        struct list_head  *nxt;
+        kptl_tx_t         *tx;
+        unsigned long      flags;
+
+        /* atomically grab all the peer's tx-es... */
+
+        spin_lock_irqsave(&peer->peer_lock, flags);
+
+        /* splice peer_sendq onto local 'sendq': insert the local head into
+         * the ring, then detach the old head */
+        list_add(&sendq, &peer->peer_sendq);
+        list_del_init(&peer->peer_sendq);
+        list_for_each (tmp, &sendq) {
+                tx = list_entry(tmp, kptl_tx_t, tx_list);
+                tx->tx_active = 0;
+        }
+
+        /* same splice trick for the active queue */
+        list_add(&activeq, &peer->peer_activeq);
+        list_del_init(&peer->peer_activeq);
+        list_for_each (tmp, &activeq) {
+                tx = list_entry(tmp, kptl_tx_t, tx_list);
+                tx->tx_active = 0;
+        }
+
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+        /* ...then drop the peer's ref on them at leasure.  This will get
+         * kptllnd_tx_fini() to abort outstanding comms if necessary. */
+
+        list_for_each_safe (tmp, nxt, &sendq) {
+                tx = list_entry(tmp, kptl_tx_t, tx_list);
+                list_del(&tx->tx_list);
+                tx->tx_status = -EIO;
+                kptllnd_tx_decref(tx);
+        }
+
+        list_for_each_safe (tmp, nxt, &activeq) {
+                tx = list_entry(tmp, kptl_tx_t, tx_list);
+                list_del(&tx->tx_list);
+                tx->tx_status = -EIO;
+                kptllnd_tx_decref(tx);
+        }
+}
+
+/* Record that 'peer' showed signs of life just now.  Lockless by design. */
+void
+kptllnd_peer_alive (kptl_peer_t *peer)
+{
+        /* This is racy, but everyone's only writing cfs_time_current() */
+        peer->peer_last_alive = cfs_time_current();
+        mb();
+}
+
+/* If the peer has a pending error, consume it (peer_error is cleared so the
+ * notification fires only once per error) and tell LNET the peer went down,
+ * passing the wall-clock time it was last seen alive. */
+void
+kptllnd_peer_notify (kptl_peer_t *peer)
+{
+        unsigned long flags;
+        time_t        last_alive = 0;
+        int           error = 0;
+        
+        spin_lock_irqsave(&peer->peer_lock, flags);
+
+        if (peer->peer_error != 0) {
+                error = peer->peer_error;
+                peer->peer_error = 0;
+                
+                /* convert 'jiffies since last alive' into seconds-since-epoch */
+                last_alive = cfs_time_current_sec() - 
+                             cfs_duration_sec(cfs_time_current() - 
+                                              peer->peer_last_alive);
+        }
+        
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+        if (error != 0)
+                lnet_notify (kptllnd_data.kptl_ni, peer->peer_id.nid, 0,
+                             last_alive);
+}
+
+/* Watchdog helper: move every peer on kptl_closing_peers to the zombie list,
+ * notify LNET and cancel its txs, then drop the ref the closing list held.
+ * The write lock is dropped around the per-peer work (which may block), and
+ * re-taken; the comment below notes this is only safe with one watchdog. */
+void
+kptllnd_handle_closing_peers ()
+{
+        unsigned long           flags;
+        kptl_peer_t            *peer;
+        struct list_head       *tmp;
+        struct list_head       *nxt;
+        int                     idle;
+
+        /* Check with a read lock first to avoid blocking anyone */
+
+        read_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
+        idle = list_empty(&kptllnd_data.kptl_closing_peers);
+        read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+        if (idle)
+                return;
+
+        /* Scan the closing peers and cancel their txs.
+         * NB only safe while there is only a single watchdog */
+
+        write_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+        list_for_each_safe (tmp, nxt, &kptllnd_data.kptl_closing_peers) {
+                peer = list_entry (tmp, kptl_peer_t, peer_list);
+
+                LASSERT (peer->peer_state == PEER_STATE_CLOSING);
+
+                list_del(&peer->peer_list);
+                list_add_tail(&peer->peer_list,
+                              &kptllnd_data.kptl_zombie_peers);
+                peer->peer_state = PEER_STATE_ZOMBIE;
+
+                write_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+                kptllnd_peer_notify(peer);
+                kptllnd_peer_cancel_txs(peer);
+                kptllnd_peer_decref(peer);      /* drop the closing-list ref */
+
+                write_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
+        }
+
+        write_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags);
+}
+
+/* Transition 'peer' to PEER_STATE_CLOSING and queue it for the watchdog.
+ * Caller holds kptl_peer_rw_lock for writing.  'why' is stashed as
+ * peer_error only on the first close (later closes preserve the original
+ * reason); closing an already-CLOSING peer is a no-op. */
+void
+kptllnd_peer_close_locked(kptl_peer_t *peer, int why)
+{
+        switch (peer->peer_state) {
+        default:
+                LBUG();
+
+        case PEER_STATE_WAITING_HELLO:
+        case PEER_STATE_ACTIVE:
+                /* Removing from peer table */
+                kptllnd_data.kptl_n_active_peers--;
+                LASSERT (kptllnd_data.kptl_n_active_peers >= 0);
+
+                list_del(&peer->peer_list);
+                kptllnd_peer_unreserve_buffers();
+
+                peer->peer_error = why; /* stash 'why' only on first close */
+
+                /* Schedule for immediate attention, taking peer table's ref */
+                list_add_tail(&peer->peer_list, 
+                              &kptllnd_data.kptl_closing_peers);
+                wake_up(&kptllnd_data.kptl_watchdog_waitq);
+                break;
+
+        case PEER_STATE_ZOMBIE:
+                /* Schedule for attention at next timeout */
+                kptllnd_peer_addref(peer);
+                list_del(&peer->peer_list);
+                list_add_tail(&peer->peer_list, 
+                              &kptllnd_data.kptl_closing_peers);
+                break;
+                
+        case PEER_STATE_CLOSING:
+                break;
+        }
+
+        peer->peer_state = PEER_STATE_CLOSING;
+}
+
+/* Locking wrapper around kptllnd_peer_close_locked(). */
+void
+kptllnd_peer_close(kptl_peer_t *peer, int why)
+{
+        unsigned long      flags;
+
+        write_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
+        kptllnd_peer_close_locked(peer, why);
+        write_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags);
+}
+
+/* Close every peer matching 'id' (LNET_NID_ANY/LNET_PID_ANY act as
+ * wildcards; a pid wildcard requires a nid wildcard or -EINVAL).  Returns 0
+ * if anything matched, -ENOENT otherwise.  The read lock must be dropped to
+ * call kptllnd_peer_close(), so the scan restarts from 'again' after every
+ * match. */
+int
+kptllnd_peer_del(lnet_process_id_t id)
+{
+        struct list_head  *ptmp;
+        struct list_head  *pnxt;
+        kptl_peer_t       *peer;
+        int                lo;
+        int                hi;
+        int                i;
+        unsigned long      flags;
+        int                rc = -ENOENT;
+
+        /*
+         * Find the single bucket we are supposed to look at or if nid is a
+         * wildcard (LNET_NID_ANY) then look at all of the buckets
+         */
+        if (id.nid != LNET_NID_ANY) {
+                struct list_head *l = kptllnd_nid2peerlist(id.nid);
+                
+                lo = hi =  l - kptllnd_data.kptl_peers;
+        } else {
+                if (id.pid != LNET_PID_ANY)
+                        return -EINVAL;
+                
+                lo = 0;
+                hi = kptllnd_data.kptl_peer_hash_size - 1;
+        }
+
+again:
+        read_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+        for (i = lo; i <= hi; i++) {
+                list_for_each_safe (ptmp, pnxt, &kptllnd_data.kptl_peers[i]) {
+                        peer = list_entry (ptmp, kptl_peer_t, peer_list);
+
+                        if (!(id.nid == LNET_NID_ANY || 
+                              (peer->peer_id.nid == id.nid &&
+                               (id.pid == LNET_PID_ANY || 
+                                peer->peer_id.pid == id.pid))))
+                                continue;
+
+                        kptllnd_peer_addref(peer); /* 1 ref for me... */
+
+                        read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock,
+                                               flags);
+
+                        kptllnd_peer_close(peer, 0);
+                        kptllnd_peer_decref(peer); /* ...until here */
+
+                        rc = 0;         /* matched something */
+
+                        /* start again now I've dropped the lock */
+                        goto again;
+                }
+        }
+
+        read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+        return (rc);
+}
+
+/* Queue 'tx' for transmission to 'peer'.  For PUT/GET requests, first
+ * allocate fresh matchbits and attach an ME/MD for the bulk RDMA; then bind
+ * an MD for the message itself and put the tx on peer_sendq (actual PtlPut
+ * happens in kptllnd_peer_check_sends).  On any Portals failure the tx is
+ * failed with -EIO and the caller's ref dropped via kptllnd_tx_decref. */
+void
+kptllnd_post_tx(kptl_peer_t *peer, kptl_tx_t *tx)
+{
+        /* CAVEAT EMPTOR: I take over caller's ref on 'tx' */
+        ptl_handle_md_t  rdma_mdh = PTL_INVALID_HANDLE;
+        ptl_handle_md_t  msg_mdh = PTL_INVALID_HANDLE;
+        ptl_handle_me_t  meh;
+        ptl_md_t         md;
+        ptl_err_t        prc;
+        unsigned long    flags;
+
+        LASSERT (!tx->tx_idle);
+        LASSERT (!tx->tx_active);
+        LASSERT (PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE));
+        LASSERT (PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE));
+        LASSERT (tx->tx_type == TX_TYPE_SMALL_MESSAGE ||
+                 tx->tx_type == TX_TYPE_PUT_REQUEST ||
+                 tx->tx_type == TX_TYPE_GET_REQUEST);
+
+        kptllnd_set_tx_peer(tx, peer);
+
+        if (tx->tx_type == TX_TYPE_PUT_REQUEST ||
+            tx->tx_type == TX_TYPE_GET_REQUEST) {
+
+                spin_lock_irqsave(&peer->peer_lock, flags);
+
+                /* Assume 64-bit matchbits can't wrap */
+                LASSERT (peer->peer_next_matchbits >= PTL_RESERVED_MATCHBITS);
+                tx->tx_msg->ptlm_u.rdma.kptlrm_matchbits =
+                        peer->peer_next_matchbits++;
+                        
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+                /* ME to match the peer's bulk GET/PUT on our RDMA buffer */
+                prc = PtlMEAttach(kptllnd_data.kptl_nih,
+                                  *kptllnd_tunables.kptl_portal,
+                                  peer->peer_ptlid,
+                                  tx->tx_msg->ptlm_u.rdma.kptlrm_matchbits,
+                                  0,             /* ignore bits */
+                                  PTL_UNLINK,
+                                  PTL_INS_BEFORE,
+                                  &meh);
+                if (prc != PTL_OK) {
+                        CERROR("PtlMEAttach(%s) failed: %d\n",
+                               libcfs_id2str(peer->peer_id), prc);
+                        goto failed;
+                }
+
+                prc = PtlMDAttach(meh, tx->tx_rdma_md, PTL_UNLINK, &rdma_mdh);
+                if (prc != PTL_OK) {
+                        CERROR("PtlMDAttach(%s) failed: %d\n",
+                               libcfs_id2str(tx->tx_peer->peer_id), prc);
+                        /* undo the ME attach; MD attach failed so the ME
+                         * would otherwise leak */
+                        prc = PtlMEUnlink(meh);
+                        LASSERT(prc == PTL_OK);
+                        rdma_mdh = PTL_INVALID_HANDLE;
+                        goto failed;
+                }
+
+                /* I'm not racing with the event callback here.  It's a bug if
+                 * there's an event on the MD I just attached before I actually
+                 * send the RDMA request message which the event callback
+                 * catches by asserting 'rdma_mdh' is valid. */
+        }
+
+        /* MD covering the immediate message buffer */
+        memset(&md, 0, sizeof(md));
+        
+        md.start = tx->tx_msg;
+        md.length = tx->tx_msg->ptlm_nob;
+        md.threshold = 1;
+        md.options = PTL_MD_OP_PUT |
+                     PTL_MD_LUSTRE_COMPLETION_SEMANTICS |
+                     PTL_MD_EVENT_START_DISABLE;
+        md.user_ptr = &tx->tx_msg_eventarg;
+        md.eq_handle = kptllnd_data.kptl_eqh;
+
+        prc = PtlMDBind(kptllnd_data.kptl_nih, md, PTL_UNLINK, &msg_mdh);
+        if (prc != PTL_OK) {
+                msg_mdh = PTL_INVALID_HANDLE;
+                goto failed;
+        }
+        
+        spin_lock_irqsave(&peer->peer_lock, flags);
+
+        tx->tx_deadline = jiffies + (*kptllnd_tunables.kptl_timeout * HZ);
+        tx->tx_active = 1;
+        tx->tx_rdma_mdh = rdma_mdh;
+        tx->tx_msg_mdh = msg_mdh;
+
+       /* Ensure HELLO is sent first */
+       if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_HELLO)
+               list_add(&tx->tx_list, &peer->peer_sendq);
+       else
+               list_add_tail(&tx->tx_list, &peer->peer_sendq);
+
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
+        return;
+        
+ failed:
+        /* record the handles so kptllnd_tx_fini can clean up whatever was
+         * successfully attached, then fail the tx */
+        spin_lock_irqsave(&peer->peer_lock, flags);
+
+        tx->tx_status = -EIO;
+        tx->tx_rdma_mdh = rdma_mdh;
+        tx->tx_msg_mdh = msg_mdh;
+
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+        kptllnd_tx_decref(tx);
+}
+
+/* Drain peer_sendq as far as credit flow-control allows: post a NOOP if we
+ * owe the peer enough credits, enforce HELLO-first ordering, discard
+ * redundant NOOPs, and PtlPut each remaining message, moving it to
+ * peer_activeq.  The peer lock is dropped around every blocking call and
+ * re-taken; a PtlPut failure closes the peer. */
+void
+kptllnd_peer_check_sends (kptl_peer_t *peer)
+{
+
+        kptl_tx_t       *tx;
+        int              rc;
+        unsigned long    flags;
+
+        LASSERT(!in_interrupt());
+
+        spin_lock_irqsave(&peer->peer_lock, flags);
+
+        if (list_empty(&peer->peer_sendq) &&
+            peer->peer_outstanding_credits >= PTLLND_CREDIT_HIGHWATER) {
+
+                /* post a NOOP to return credits */
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+                tx = kptllnd_get_idle_tx(TX_TYPE_SMALL_MESSAGE);
+                if (tx == NULL) {
+                        CERROR("Can't return credits to %s: can't allocate descriptor\n",
+                               libcfs_id2str(peer->peer_id));
+                } else {
+                        kptllnd_init_msg(tx->tx_msg, PTLLND_MSG_TYPE_NOOP, 0);
+                        kptllnd_post_tx(peer, tx);
+                }
+
+                spin_lock_irqsave(&peer->peer_lock, flags);
+        }
+
+        while (!list_empty(&peer->peer_sendq)) {
+                tx = list_entry (peer->peer_sendq.next, kptl_tx_t, tx_list);
+
+                LASSERT (tx->tx_active);
+                LASSERT (!PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE));
+                LASSERT (tx->tx_type == TX_TYPE_SMALL_MESSAGE ||
+                         !PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE));
+
+                LASSERT (peer->peer_outstanding_credits >= 0);
+                LASSERT (peer->peer_outstanding_credits <= 
+                         *kptllnd_tunables.kptl_peercredits);
+                LASSERT (peer->peer_credits >= 0);
+                LASSERT (peer->peer_credits <= 
+                         *kptllnd_tunables.kptl_peercredits);
+
+               /* Ensure HELLO is sent first */
+               if (!peer->peer_sent_hello) {
+                       if (tx->tx_msg->ptlm_type != PTLLND_MSG_TYPE_HELLO)
+                               break;
+                       peer->peer_sent_hello = 1;
+               }
+
+                if (peer->peer_credits == 0) {
+                        CDEBUG(D_NET, "%s: no credits\n",
+                               libcfs_id2str(peer->peer_id));
+                        break;
+                }
+
+                /* Don't use the last credit unless I've got credits to
+                 * return */
+                if (peer->peer_credits == 1 &&
+                    peer->peer_outstanding_credits == 0) {
+                        CDEBUG(D_NET, "%s: not using last credit\n",
+                               libcfs_id2str(peer->peer_id));
+                        break;
+                }
+
+                list_del(&tx->tx_list);
+
+                /* Discard any NOOP I queued if I'm not at the high-water mark
+                 * any more or more messages have been queued */
+                if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_NOOP &&
+                    (!list_empty(&peer->peer_sendq) ||
+                     peer->peer_outstanding_credits < PTLLND_CREDIT_HIGHWATER)) {
+
+                        tx->tx_active = 0;
+
+                        spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+                        CDEBUG(D_NET, "%s: redundant noop\n", 
+                               libcfs_id2str(peer->peer_id));
+                        kptllnd_tx_decref(tx);
+
+                        spin_lock_irqsave(&peer->peer_lock, flags);
+                        continue;
+                }
+
+                CDEBUG(D_NET, "tx=%p nob=%d to %s(%s)\n",
+                       tx, tx->tx_msg->ptlm_nob,
+                       libcfs_id2str(peer->peer_id), 
+                       kptllnd_ptlid2str(peer->peer_ptlid));
+
+                /* fill last-minute msg header fields */
+                kptllnd_msg_pack(tx->tx_msg, peer);
+
+                /* sending consumes one credit and returns all owed credits */
+                peer->peer_outstanding_credits = 0;
+                peer->peer_credits--;
+
+                list_add_tail(&tx->tx_list, &peer->peer_activeq);
+
+                kptllnd_tx_addref(tx);          /* 1 ref for me... */
+
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+                rc = PtlPut (tx->tx_msg_mdh,
+                             PTL_NOACK_REQ,
+                             peer->peer_ptlid,
+                             *kptllnd_tunables.kptl_portal,
+                             0,                 /* acl cookie */
+                             LNET_MSG_MATCHBITS,
+                             0,                 /* offset */
+                             0);                /* header data */
+                if (rc != PTL_OK) {
+                        CERROR("PtlPut %s error %d\n",
+                               libcfs_id2str(peer->peer_id), rc);
+
+                        /* Nuke everything (including this tx) */
+                        kptllnd_peer_close(peer, -EIO);
+                        return;
+                }
+
+                kptllnd_tx_decref(tx);          /* drop my ref */
+
+                spin_lock_irqsave(&peer->peer_lock, flags);
+        }
+
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
+}
+
+/* Return the first tx on the peer's send or active queues whose deadline
+ * has passed, with a reference held for the caller, or NULL if nothing
+ * has timed out yet. */
+kptl_tx_t *
+kptllnd_find_timed_out_tx(kptl_peer_t *peer)
+{
+        kptl_tx_t         *tx;
+        struct list_head  *tmp;
+        unsigned long      flags;
+
+        spin_lock_irqsave(&peer->peer_lock, flags);
+
+        list_for_each(tmp, &peer->peer_sendq) {
+                /* NB list_entry() on the cursor 'tmp': the previous code
+                 * used peer_sendq.next and so re-examined only the head
+                 * of the queue on every iteration */
+                tx = list_entry(tmp, kptl_tx_t, tx_list);
+
+                if (time_after_eq(jiffies, tx->tx_deadline)) {
+                        kptllnd_tx_addref(tx);  /* ref for the caller */
+                        spin_unlock_irqrestore(&peer->peer_lock, flags);
+                        return tx;
+                }
+        }
+
+        list_for_each(tmp, &peer->peer_activeq) {
+                /* same fix as above: examine each entry, not activeq.next */
+                tx = list_entry(tmp, kptl_tx_t, tx_list);
+
+                if (time_after_eq(jiffies, tx->tx_deadline)) {
+                        kptllnd_tx_addref(tx);  /* ref for the caller */
+                        spin_unlock_irqrestore(&peer->peer_lock, flags);
+                        return tx;
+                }
+        }
+
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
+        return NULL;
+}
+
+
+/* Scan hash bucket 'idx' of the peer table: give every peer a chance to
+ * send, and close any peer with a tx that has passed its deadline.  The
+ * scan restarts from scratch whenever the table lock is dropped. */
+void
+kptllnd_peer_check_bucket (int idx)
+{
+        struct list_head  *peers = &kptllnd_data.kptl_peers[idx];
+        struct list_head  *ptmp;
+        kptl_peer_t       *peer;
+        kptl_tx_t         *tx;
+        unsigned long      flags;
+        int                nsend;
+        int                nactive;
+
+        CDEBUG(D_NET, "Bucket=%d\n", idx);
+
+ again:
+        /* NB. Shared lock while I just look */
+        read_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+        list_for_each (ptmp, peers) {
+                peer = list_entry (ptmp, kptl_peer_t, peer_list);
+
+                CDEBUG(D_NET, "Peer=%s Credits=%d Outstanding=%d\n",
+                       libcfs_id2str(peer->peer_id),
+                       peer->peer_credits, peer->peer_outstanding_credits);
+
+                /* In case we have enough credits to return via a
+                 * NOOP, but there were no non-blocking tx descs
+                 * free to do it last time... */
+                kptllnd_peer_check_sends(peer);
+
+                tx = kptllnd_find_timed_out_tx(peer);
+                if (tx == NULL)
+                        continue;
+
+                kptllnd_peer_addref(peer); /* 1 ref for me... */
+
+                /* drop the shared lock before the heavyweight error
+                 * reporting and peer close below */
+                read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock,
+                                       flags);
+
+                spin_lock_irqsave(&peer->peer_lock, flags);
+                nsend = kptllnd_count_queue(&peer->peer_sendq);
+                nactive = kptllnd_count_queue(&peer->peer_activeq);
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+                LCONSOLE_ERROR("Timing out %s: please check Portals\n",
+                               libcfs_id2str(peer->peer_id));
+
+                CERROR("%s timed out: cred %d outstanding %d sendq %d "
+                       "activeq %d Tx %s (%s%s%s) status %d T/O %ds\n",
+                       libcfs_id2str(peer->peer_id),
+                       peer->peer_credits, peer->peer_outstanding_credits,
+                       nsend, nactive, kptllnd_tx_typestr(tx->tx_type),
+                       tx->tx_active ? "A" : "",
+                       PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE) ?
+                       "" : "M",
+                       PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE) ?
+                       "" : "D",
+                       tx->tx_status, *kptllnd_tunables.kptl_timeout);
+
+                kptllnd_dump_ptltrace();
+
+                /* drop the ref kptllnd_find_timed_out_tx() took */
+                kptllnd_tx_decref(tx);
+
+                kptllnd_peer_close(peer, -ETIMEDOUT);
+                kptllnd_peer_decref(peer); /* ...until here */
+
+                /* start again now I've dropped the lock */
+                goto again;
+        }
+
+        read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags);
+}
+
+/* Look up the peer with LNET process id 'id' in its hash bucket and
+ * return it with a reference held for the caller, or NULL if not found.
+ * The caller must already hold kptl_peer_rw_lock. */
+kptl_peer_t *
+kptllnd_id2peer_locked (lnet_process_id_t id)
+{
+        struct list_head *bucket = kptllnd_nid2peerlist(id.nid);
+        struct list_head *pos;
+        kptl_peer_t      *peer;
+
+        list_for_each (pos, bucket) {
+                peer = list_entry (pos, kptl_peer_t, peer_list);
+
+                /* only connecting or established peers live in the table */
+                LASSERT(peer->peer_state == PEER_STATE_WAITING_HELLO ||
+                        peer->peer_state == PEER_STATE_ACTIVE);
+
+                if (peer->peer_id.nid == id.nid &&
+                    peer->peer_id.pid == id.pid) {
+                        /* found her: take a ref for the caller */
+                        kptllnd_peer_addref(peer);
+
+                        CDEBUG(D_NET, "%s -> %s (%d)\n",
+                               libcfs_id2str(id),
+                               kptllnd_ptlid2str(peer->peer_ptlid),
+                               atomic_read (&peer->peer_refcount));
+                        return peer;
+                }
+        }
+
+        return NULL;
+}
+
+/* Log a console error that peer 'id' cannot be handled because the peer
+ * table is full ('str' gives the direction, e.g. "Connection from "),
+ * plus a hint naming the tunables to raise. */
+void
+kptllnd_peertable_overflow_msg(char *str, lnet_process_id_t id)
+{
+        LCONSOLE_ERROR("%s %s overflows the peer table[%d]: "
+                       "messages may be dropped\n",
+                       str, libcfs_id2str(id),
+                       kptllnd_data.kptl_n_active_peers);
+        LCONSOLE_ERROR("Please correct by increasing "
+                       "'max_nodes' or 'max_procs_per_node'\n");
+}
+
+__u64
+kptllnd_get_last_seen_matchbits_locked(lnet_process_id_t lpid)
+{
+        struct list_head       *lists[2];
+        struct list_head       *pos;
+        kptl_peer_t            *peer;
+        int                     i;
+
+        /* Find the last matchbits I saw this new peer using.  Note..
+           A. This peer cannot be in the peer table - she's new!
+           B. If I can't find the peer in the closing/zombie peers, all
+              matchbits are safe because all refs to the (old) peer have gone
+              so all txs have completed so there's no risk of matchbit
+              collision!
+         */
+
+        LASSERT(kptllnd_id2peer_locked(lpid) == NULL);
+
+        /* peer's last matchbits can't change after it comes out of the peer
+         * table, so first match is fine.  Search closing peers first, then
+         * zombies, exactly as the separate loops did. */
+
+        lists[0] = &kptllnd_data.kptl_closing_peers;
+        lists[1] = &kptllnd_data.kptl_zombie_peers;
+
+        for (i = 0; i < 2; i++) {
+                list_for_each (pos, lists[i]) {
+                        peer = list_entry (pos, kptl_peer_t, peer_list);
+
+                        if (peer->peer_id.nid == lpid.nid &&
+                            peer->peer_id.pid == lpid.pid)
+                                return peer->peer_last_matchbits_seen;
+                }
+        }
+
+        return PTL_RESERVED_MATCHBITS;
+}
+
+/* Handle an incoming HELLO from 'initiator'.  Validates the handshake
+ * parameters, then either completes a handshake we started (peer was in
+ * PEER_STATE_WAITING_HELLO), or creates a new peer and sends our HELLO
+ * reply.  Returns the peer with a ref held for the caller, or NULL on
+ * any validation/allocation failure. */
+kptl_peer_t *
+kptllnd_peer_handle_hello (ptl_process_id_t  initiator,
+                           kptl_msg_t       *msg)
+{
+        rwlock_t           *g_lock = &kptllnd_data.kptl_peer_rw_lock;
+        kptl_peer_t        *peer;
+        kptl_peer_t        *new_peer;
+        lnet_process_id_t   lpid;
+        unsigned long       flags;
+        kptl_tx_t          *hello_tx;
+        int                 rc;
+        __u64               safe_matchbits;
+        __u64               last_matchbits_seen;
+
+        lpid.nid = msg->ptlm_srcnid;
+        lpid.pid = msg->ptlm_srcpid;
+
+        CDEBUG(D_NET, "hello from %s(%s)\n",
+               libcfs_id2str(lpid), kptllnd_ptlid2str(initiator));
+
+        if (initiator.pid != kptllnd_data.kptl_portals_id.pid &&
+            (msg->ptlm_srcpid & LNET_PID_USERFLAG) == 0) {
+                /* If the peer's PID isn't _the_ ptllnd kernel pid, she must be
+                 * userspace.  Refuse the connection if she hasn't set the
+                 * correct flag in her PID... */
+                CERROR("Userflag not set in hello from %s (%s)\n",
+                       libcfs_id2str(lpid), kptllnd_ptlid2str(initiator));
+                return NULL;
+        }
+        
+        /* kptlhm_matchbits are the highest matchbits my peer may have used to
+         * RDMA to me.  I ensure I never register buffers for RDMA that could
+         * match any she used */
+        safe_matchbits = msg->ptlm_u.hello.kptlhm_matchbits + 1;
+
+        if (safe_matchbits < PTL_RESERVED_MATCHBITS) {
+                CERROR("Illegal matchbits "LPX64" in HELLO from %s\n",
+                      safe_matchbits, libcfs_id2str(lpid));
+               return NULL;
+       }
+       
+        /* both sides must agree on the maximum message size */
+        if (msg->ptlm_u.hello.kptlhm_max_msg_size !=
+            *kptllnd_tunables.kptl_max_msg_size) {
+                CERROR("max message size MUST be equal for all peers: "
+                       "got %d expected %d from %s\n",
+                       msg->ptlm_u.hello.kptlhm_max_msg_size,
+                       *kptllnd_tunables.kptl_max_msg_size,
+                       libcfs_id2str(lpid));
+                return NULL;
+        }
+
+        /* ...and on the credit count (peer advertises one less than the
+         * tunable; the +1 accounts for that) */
+        if (msg->ptlm_credits + 1 != *kptllnd_tunables.kptl_peercredits) {
+                CERROR("peercredits MUST be equal on all peers: "
+                       "got %d expected %d from %s\n",
+                       msg->ptlm_credits + 1,
+                       *kptllnd_tunables.kptl_peercredits,
+                       libcfs_id2str(lpid));
+                return NULL;
+        }
+        
+        /* first pass: is this completing a handshake I initiated, or a
+         * reconnect from a rebooted peer? */
+        write_lock_irqsave(g_lock, flags);
+
+        peer = kptllnd_id2peer_locked(lpid);
+        if (peer != NULL) {
+                if (peer->peer_state == PEER_STATE_WAITING_HELLO) {
+                        /* Completing HELLO handshake */
+                        LASSERT(peer->peer_incarnation == 0);
+
+                        peer->peer_state = PEER_STATE_ACTIVE;
+                        peer->peer_incarnation = msg->ptlm_srcstamp;
+                        peer->peer_next_matchbits = safe_matchbits;
+
+                        /* return the ref id2peer took */
+                        write_unlock_irqrestore(g_lock, flags);
+                        return peer;
+                }
+
+                /* remove old incarnation of this peer */
+                kptllnd_peer_close_locked(peer, 0);
+        }
+
+        /* NOTE(review): presumably evicts stale peers from this hash
+         * bucket to make room - confirm in kptllnd_cull_peertable_locked */
+        kptllnd_cull_peertable_locked(lpid);
+
+        write_unlock_irqrestore(g_lock, flags);
+
+        if (peer != NULL) {
+                CDEBUG(D_NET, "Peer %s (%s) reconnecting:"
+                       " stamp "LPX64"("LPX64")\n",
+                       libcfs_id2str(lpid), kptllnd_ptlid2str(initiator),
+                       msg->ptlm_srcstamp, peer->peer_incarnation);
+
+                kptllnd_peer_decref(peer);
+        }
+
+        /* allocate my HELLO reply and the new peer before retaking the
+         * table lock (both can block) */
+        hello_tx = kptllnd_get_idle_tx(TX_TYPE_SMALL_MESSAGE);
+        if (hello_tx == NULL) {
+                CERROR("Unable to allocate HELLO message for %s\n",
+                       libcfs_id2str(lpid));
+                return NULL;
+        }
+
+        kptllnd_init_msg(hello_tx->tx_msg, PTLLND_MSG_TYPE_HELLO,
+                         sizeof(kptl_hello_msg_t));
+
+        new_peer = kptllnd_peer_allocate(lpid, initiator);
+        if (new_peer == NULL) {
+                kptllnd_tx_decref(hello_tx);
+                return NULL;
+        }
+
+        rc = kptllnd_peer_reserve_buffers();
+        if (rc != 0) {
+                kptllnd_peer_decref(new_peer);
+                kptllnd_tx_decref(hello_tx);
+
+                CERROR("Failed to reserve buffers for %s\n",
+                       libcfs_id2str(lpid));
+                return NULL;
+        }
+
+        /* second pass: I dropped the lock above, so re-check for a race
+         * with another thread instantiating this peer */
+        write_lock_irqsave(g_lock, flags);
+
+        peer = kptllnd_id2peer_locked(lpid);
+        if (peer != NULL) {
+                if (peer->peer_state == PEER_STATE_WAITING_HELLO) {
+                        /* An outgoing message instantiated 'peer' for me and
+                        * presumably provoked this reply */
+                        CWARN("Outgoing instantiated peer %s\n", libcfs_id2str(lpid));
+                        LASSERT(peer->peer_incarnation == 0);
+
+                        peer->peer_state = PEER_STATE_ACTIVE;
+                        peer->peer_incarnation = msg->ptlm_srcstamp;
+                        peer->peer_next_matchbits = safe_matchbits;
+               } else {
+                       LASSERT (peer->peer_state == PEER_STATE_ACTIVE);
+                       /* WOW!  Somehow this peer completed the HELLO
+                        * handshake while I slept.  I guess I could have slept
+                        * while it rebooted and sent a new HELLO, so I'll fail
+                        * this one... */
+                        CWARN("Wow! peer %s\n", libcfs_id2str(lpid));
+                       kptllnd_peer_decref(peer);
+                       peer = NULL;
+               }
+               
+                write_unlock_irqrestore(g_lock, flags);
+
+                /* my speculative allocations are no longer needed */
+                kptllnd_peer_unreserve_buffers();
+                kptllnd_peer_decref(new_peer);
+                kptllnd_tx_decref(hello_tx);
+                return peer;
+        }
+
+        if (kptllnd_data.kptl_n_active_peers ==
+            kptllnd_data.kptl_expected_peers) {
+                /* peer table full */
+                write_unlock_irqrestore(g_lock, flags);
+
+                kptllnd_peertable_overflow_msg("Connection from ", lpid);
+
+                rc = kptllnd_reserve_buffers(1); /* HELLO headroom */
+                if (rc != 0) {
+                        CERROR("Refusing connection from %s\n",
+                               libcfs_id2str(lpid));
+                        kptllnd_peer_unreserve_buffers();
+                        kptllnd_peer_decref(new_peer);
+                        kptllnd_tx_decref(hello_tx);
+                        return NULL;
+                }
+                
+                write_lock_irqsave(g_lock, flags);
+                kptllnd_data.kptl_expected_peers++;
+        }
+
+        last_matchbits_seen = kptllnd_get_last_seen_matchbits_locked(lpid);
+
+        hello_tx->tx_msg->ptlm_u.hello.kptlhm_matchbits = last_matchbits_seen;
+        hello_tx->tx_msg->ptlm_u.hello.kptlhm_max_msg_size =
+                *kptllnd_tunables.kptl_max_msg_size;
+
+        new_peer->peer_state = PEER_STATE_ACTIVE;
+        new_peer->peer_incarnation = msg->ptlm_srcstamp;
+        new_peer->peer_next_matchbits = safe_matchbits;
+        new_peer->peer_last_matchbits_seen = last_matchbits_seen;
+
+        kptllnd_peer_add_peertable_locked(new_peer);
+
+        write_unlock_irqrestore(g_lock, flags);
+
+       /* NB someone else could get in now and post a message before I post
+        * the HELLO, but post_tx/check_sends take care of that! */
+
+        kptllnd_post_tx(new_peer, hello_tx);
+        kptllnd_peer_check_sends(new_peer);
+
+        return new_peer;
+}
+
+/* Launch 'tx' towards 'target': look up the peer (creating it and
+ * initiating the HELLO handshake when necessary) and queue the tx on it.
+ * On failure tx_status is set and the tx's ref is dropped, which
+ * presumably completes the LNET message with that error - confirm in
+ * kptllnd_tx_decref. */
+void
+kptllnd_tx_launch(kptl_tx_t *tx, lnet_process_id_t target)
+{
+        rwlock_t         *g_lock = &kptllnd_data.kptl_peer_rw_lock;
+        ptl_process_id_t  ptl_id;
+        kptl_peer_t      *peer;
+        kptl_peer_t      *new_peer = NULL;
+        kptl_tx_t        *hello_tx = NULL;
+        unsigned long     flags;
+        int               rc;
+        __u64             last_matchbits_seen;
+
+        LASSERT (tx->tx_lnet_msg != NULL);
+        LASSERT (tx->tx_peer == NULL);
+
+        /* I expect to find the peer, so I only take a read lock... */
+        read_lock_irqsave(g_lock, flags);
+        peer = kptllnd_id2peer_locked(target);
+        read_unlock_irqrestore(g_lock, flags);
+
+        if (peer != NULL) {
+                goto post;
+        }
+        
+        /* no peer yet; I can only connect out to kernel peers */
+        if ((target.pid & LNET_PID_USERFLAG) != 0) {
+                CWARN("Refusing to create a new connection to %s "
+                      "(non-kernel peer)\n", libcfs_id2str(target));
+                tx->tx_status = -EHOSTUNREACH;
+                goto failed;
+        }
+
+        /* The new peer is a kernel ptllnd, and kernel ptllnds all have
+         * the same portals PID */
+        ptl_id.nid = kptllnd_lnet2ptlnid(target.nid);
+        ptl_id.pid = kptllnd_data.kptl_portals_id.pid;
+
+        /* re-check under the exclusive lock before creating anything */
+        write_lock_irqsave(g_lock, flags);
+
+        peer = kptllnd_id2peer_locked(target);
+        if (peer != NULL) {
+                write_unlock_irqrestore(g_lock, flags);
+                goto post;
+        }
+        
+        kptllnd_cull_peertable_locked(target);
+
+        write_unlock_irqrestore(g_lock, flags);
+                
+        /* allocate the HELLO tx and the peer outside the lock (both can
+         * block) */
+        hello_tx = kptllnd_get_idle_tx(TX_TYPE_SMALL_MESSAGE);
+        if (hello_tx == NULL) {
+                CERROR("Unable to allocate connect message for %s\n",
+                       libcfs_id2str(target));
+                tx->tx_status = -ENOMEM;
+                goto failed;
+        }
+
+        kptllnd_init_msg(hello_tx->tx_msg, PTLLND_MSG_TYPE_HELLO,
+                         sizeof(kptl_hello_msg_t));
+
+        new_peer = kptllnd_peer_allocate(target, ptl_id);
+        if (new_peer == NULL) {
+                tx->tx_status = -ENOMEM;
+                goto failed;
+        }
+
+        rc = kptllnd_peer_reserve_buffers();
+        if (rc != 0) {
+                tx->tx_status = rc;
+                goto failed;
+        }
+
+        /* final check for a connection race now everything is allocated */
+        write_lock_irqsave(g_lock, flags);
+
+        peer = kptllnd_id2peer_locked(target);
+        if (peer != NULL) {                     /* someone else beat me to it */
+                write_unlock_irqrestore(g_lock, flags);
+
+                kptllnd_peer_unreserve_buffers();
+                kptllnd_peer_decref(new_peer);
+                kptllnd_tx_decref(hello_tx);
+                goto post;
+        }
+                
+        if (kptllnd_data.kptl_n_active_peers ==
+            kptllnd_data.kptl_expected_peers) {
+                /* peer table full */
+                write_unlock_irqrestore(g_lock, flags);
+
+                kptllnd_peertable_overflow_msg("Connection to ", target);
+
+                rc = kptllnd_reserve_buffers(1); /* HELLO headroom */
+                if (rc != 0) {
+                        CERROR("Can't create connection to %s\n",
+                               libcfs_id2str(target));
+                        kptllnd_peer_unreserve_buffers();
+                        tx->tx_status = -ENOMEM;
+                        goto failed;
+                }
+                write_lock_irqsave(g_lock, flags);
+                kptllnd_data.kptl_expected_peers++;
+        }
+
+        last_matchbits_seen = kptllnd_get_last_seen_matchbits_locked(target);
+
+        hello_tx->tx_msg->ptlm_u.hello.kptlhm_matchbits = last_matchbits_seen;
+        hello_tx->tx_msg->ptlm_u.hello.kptlhm_max_msg_size =
+                *kptllnd_tunables.kptl_max_msg_size;
+                
+        new_peer->peer_state = PEER_STATE_WAITING_HELLO;
+        new_peer->peer_last_matchbits_seen = last_matchbits_seen;
+        
+        kptllnd_peer_add_peertable_locked(new_peer);
+
+        write_unlock_irqrestore(g_lock, flags);
+
+       /* NB someone else could get in now and post a message before I post
+        * the HELLO, but post_tx/check_sends take care of that! */
+
+        peer = new_peer;
+        kptllnd_post_tx(peer, hello_tx);
+
+ post:
+        /* queue the caller's tx and kick the send pipeline; then drop
+         * the lookup ref taken above */
+        kptllnd_post_tx(peer, tx);
+        kptllnd_peer_check_sends(peer);
+        kptllnd_peer_decref(peer);
+        return;
+        
+ failed:
+        if (hello_tx != NULL)
+                kptllnd_tx_decref(hello_tx);
+
+        if (new_peer != NULL)
+                kptllnd_peer_decref(new_peer);
+
+        LASSERT (tx->tx_status != 0);
+        kptllnd_tx_decref(tx);
+        
+}
diff --git a/lnet/klnds/ptllnd/ptllnd_ptltrace.c b/lnet/klnds/ptllnd/ptllnd_ptltrace.c
new file mode 100644 (file)
index 0000000..30064dc
--- /dev/null
@@ -0,0 +1,172 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2006 Cluster File Systems, Inc. All rights reserved.
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   This file is confidential source code owned by Cluster File Systems.
+ *   No viewing, modification, compilation, redistribution, or any other
+ *   form of use is permitted except through a signed license agreement.
+ *
+ *   If you have not signed such an agreement, then you have no rights to
+ *   this file.  Please destroy it immediately and contact CFS.
+ *
+ */
+
+#include "ptllnd.h"
+
+#ifdef CRAY_XT3
+static struct semaphore   ptltrace_mutex;
+static struct semaphore   ptltrace_signal;
+
+/* Dump the Portals trace (read page-at-a-time via ptl_proc_read) into a
+ * newly created file 'filename'.  O_EXCL means the dump is refused if
+ * the file already exists.  Kernel-space file I/O is bracketed by the
+ * CFS_PUSH_JOURNAL/CFS_POP_JOURNAL and CFS_MMSPACE_* macros. */
+void
+kptllnd_ptltrace_to_file(char *filename)
+{
+        CFS_DECL_JOURNAL_DATA;
+        CFS_DECL_MMSPACE;
+
+        cfs_file_t *filp;
+        char       *start;
+        char       *tmpbuf;
+        int         len;
+        int         rc;
+        loff_t      offset = 0;
+        int         eof = 0;
+
+        CWARN("dumping ptltrace to %s\n", filename);
+
+        LIBCFS_ALLOC(tmpbuf, PAGE_SIZE);
+        if (tmpbuf == NULL) {
+                CERROR("Can't allocate page buffer to dump %s\n", filename);
+                return;
+        }
+        
+        CFS_PUSH_JOURNAL;
+
+        filp = cfs_filp_open(filename,
+                             O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600, &rc);
+        if (filp == NULL) {
+                CERROR("Error %d creating %s\n", rc, filename);
+                goto out;
+        }
+
+        CFS_MMSPACE_OPEN;
+
+        while (!eof) { 
+                start = NULL; 
+                len = ptl_proc_read(tmpbuf, &start, offset,
+                                    PAGE_SIZE, &eof, NULL);
+
+                /* we don't allow ptl_proc_read to mimic case 0 or 1 behavior
+                 * for a proc_read method, only #2: from proc_file_read
+                 *
+                 * 2) Set *start = an address within the buffer.
+                 *    Put the data of the requested offset at *start.
+                 *    Return the number of bytes of data placed there.
+                 *    If this number is greater than zero and you
+                 *    didn't signal eof and the reader is prepared to
+                 *    take more data you will be called again with the
+                 *    requested offset advanced by the number of bytes
+                 *    absorbed.
+                 */
+
+                if (len == 0)   /* end of file */
+                        break;
+
+                if (len < 0) {
+                        CERROR("ptl_proc_read: error %d\n", len);
+                        break;
+                }
+
+                /* returned data must lie within my page buffer */
+                LASSERT (start >= tmpbuf && start + len <= tmpbuf + PAGE_SIZE);
+
+                rc = cfs_filp_write(filp, start, len, cfs_filp_poff(filp));
+                if (rc != len) {
+                        if (rc < 0)
+                                CERROR("Error %d writing %s\n", rc, filename);
+                        else
+                                CERROR("Partial write %d(%d) to %s\n",
+                                       rc, len, filename);
+                        break;
+                }
+
+                offset += len;
+        }
+
+        CFS_MMSPACE_CLOSE;
+
+        /* flush to disk before closing; a trace we can't sync is suspect */
+        rc = cfs_filp_fsync(filp);
+        if (rc != 0)
+                CERROR("Error %d syncing %s\n", rc, filename);
+
+        cfs_filp_close(filp);
+out:
+        CFS_POP_JOURNAL;
+        LIBCFS_FREE(tmpbuf, PAGE_SIZE);
+}
+
+/* Body of the ptltrace dump thread.  'arg' is the spawning process's pid
+ * (cast through long), used to make the dump file name unique.  Dumps to
+ * <basename>.<seconds>.<pid>, then wakes the spawner. */
+int
+kptllnd_dump_ptltrace_thread(void *arg)
+{
+        /* static buffer is safe: instances are serialised by ptltrace_mutex */
+        static char fname[1024];
+
+        libcfs_daemonize("ptltracedump");
+
+        /* serialise with other instances of me */
+        mutex_down(&ptltrace_mutex);
+
+        snprintf(fname, sizeof(fname), "%s.%ld.%ld",
+                 *kptllnd_tunables.kptl_ptltrace_basename,
+                 cfs_time_current_sec(), (long)arg);
+
+        kptllnd_ptltrace_to_file(fname);
+
+        mutex_up(&ptltrace_mutex);
+
+        /* unblock my creator */
+        mutex_up(&ptltrace_signal);
+        
+        return 0;
+}
+
+/* Dump the Portals trace to a file (if enabled by the
+ * ptltrace_on_timeout tunable) by spawning a dumper thread and blocking
+ * until it has finished. */
+void
+kptllnd_dump_ptltrace(void)
+{
+        int     thread_rc;
+
+        /* dumping is optional; bail out unless the tunable enables it */
+        if (!*kptllnd_tunables.kptl_ptltrace_on_timeout)
+                return;
+
+        /* pass my pid so the dump thread can build a unique file name */
+        thread_rc = cfs_kernel_thread(kptllnd_dump_ptltrace_thread,
+                                      (void *)(long)cfs_curproc_pid(),
+                                      CLONE_VM | CLONE_FS | CLONE_FILES);
+        if (thread_rc >= 0) {
+                /* block until thread completes */
+                mutex_down(&ptltrace_signal);
+                return;
+        }
+
+        CERROR("Error %d starting ptltrace dump thread\n", thread_rc);
+}
+
+/* One-time setup of the ptltrace synchronisation primitives. */
+void
+kptllnd_init_ptltrace(void)
+{
+        init_mutex(&ptltrace_mutex);
+        /* starts locked so kptllnd_dump_ptltrace() blocks on it until the
+         * dump thread ups it */
+        init_mutex_locked(&ptltrace_signal);
+}
+
+#else
+
+/* No-op: ptltrace dumping is only available with CRAY_XT3 */
+void
+kptllnd_dump_ptltrace(void)
+{
+}
+
+/* No-op: ptltrace dumping is only available with CRAY_XT3 */
+void
+kptllnd_init_ptltrace(void)
+{
+}
+
+#endif
diff --git a/lnet/klnds/ptllnd/ptllnd_rx_buf.c b/lnet/klnds/ptllnd/ptllnd_rx_buf.c
new file mode 100644 (file)
index 0000000..74019ef
--- /dev/null
@@ -0,0 +1,720 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
+ *   Author: PJ Kirner <pjkirner@clusterfs.com>
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   This file is confidential source code owned by Cluster File Systems.
+ *   No viewing, modification, compilation, redistribution, or any other
+ *   form of use is permitted except through a signed license agreement.
+ *
+ *   If you have not signed such an agreement, then you have no rights to
+ *   this file.  Please destroy it immediately and contact CFS.
+ *
+ */
+
+#include "ptllnd.h"
+
+/* One-time initialisation of an rx buffer pool to the empty state. */
+void
+kptllnd_rx_buffer_pool_init(kptl_rx_buffer_pool_t *rxbp)
+{
+        memset(rxbp, 0, sizeof(*rxbp));
+        spin_lock_init(&rxbp->rxbp_lock);
+        INIT_LIST_HEAD(&rxbp->rxbp_list);
+}
+
+/* Free one rx buffer and unlink it from its pool.  The buffer must be
+ * completely idle: no refs, no MD attached, not posted.
+ * NOTE(review): rxb_list/rxbp_count are updated without rxbp_lock held
+ * (pool_fini drops the lock before calling here) - presumably safe once
+ * the buffer is idle and rxbp_shutdown is set; confirm. */
+void
+kptllnd_rx_buffer_destroy(kptl_rx_buffer_t *rxb)
+{
+        kptl_rx_buffer_pool_t *rxbp = rxb->rxb_pool;
+
+        LASSERT(rxb->rxb_refcount == 0);
+        LASSERT(PtlHandleIsEqual(rxb->rxb_mdh, PTL_INVALID_HANDLE));
+        LASSERT(!rxb->rxb_posted);
+        LASSERT(rxb->rxb_idle);
+
+        list_del(&rxb->rxb_list);
+        rxbp->rxbp_count--;
+
+        LIBCFS_FREE(rxb->rxb_buffer, kptllnd_rx_buffer_size());
+        LIBCFS_FREE(rxb, sizeof(*rxb));
+}
+
+/* Reserve 'count' message slots in the rx buffer pool, growing the pool
+ * (allocating and posting new buffers) until it can cover all
+ * reservations.  Returns 0 on success, -ESHUTDOWN if the pool is being
+ * torn down, or -ENOMEM.  The loop drops rxbp_lock around blocking
+ * allocations and re-checks conditions after retaking it. */
+int
+kptllnd_rx_buffer_pool_reserve(kptl_rx_buffer_pool_t *rxbp, int count)
+{
+        int               bufsize;
+        int               msgs_per_buffer;
+        int               rc;
+        kptl_rx_buffer_t *rxb;
+        char             *buffer;
+        unsigned long     flags;
+
+        bufsize = kptllnd_rx_buffer_size();
+        msgs_per_buffer = bufsize / (*kptllnd_tunables.kptl_max_msg_size);
+
+        CDEBUG(D_NET, "kptllnd_rx_buffer_pool_reserve(%d)\n", count);
+
+        spin_lock_irqsave(&rxbp->rxbp_lock, flags);
+
+        for (;;) {
+                if (rxbp->rxbp_shutdown) {
+                        rc = -ESHUTDOWN;
+                        break;
+                }
+                
+                /* enough capacity for everyone (including this request)? */
+                if (rxbp->rxbp_reserved + count <= 
+                    rxbp->rxbp_count * msgs_per_buffer) {
+                        rc = 0;
+                        break;
+                }
+                
+                /* need another buffer; can't allocate under the lock */
+                spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
+                
+                LIBCFS_ALLOC(rxb, sizeof(*rxb));
+                LIBCFS_ALLOC(buffer, bufsize);
+
+                if (rxb == NULL || buffer == NULL) {
+                        CERROR("Failed to allocate rx buffer\n");
+
+                        if (rxb != NULL)
+                                LIBCFS_FREE(rxb, sizeof(*rxb));
+                        if (buffer != NULL)
+                                LIBCFS_FREE(buffer, bufsize);
+                        
+                        spin_lock_irqsave(&rxbp->rxbp_lock, flags);
+                        rc = -ENOMEM;
+                        break;
+                }
+
+                memset(rxb, 0, sizeof(*rxb));
+
+                rxb->rxb_eventarg.eva_type = PTLLND_EVENTARG_TYPE_BUF;
+                rxb->rxb_refcount = 0;
+                rxb->rxb_pool = rxbp;
+                rxb->rxb_idle = 0;
+                rxb->rxb_posted = 0;
+                rxb->rxb_buffer = buffer;
+                rxb->rxb_mdh = PTL_INVALID_HANDLE;
+
+                spin_lock_irqsave(&rxbp->rxbp_lock, flags);
+                
+                /* shutdown may have started while the lock was dropped */
+                if (rxbp->rxbp_shutdown) {
+                        spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
+                        
+                        LIBCFS_FREE(rxb, sizeof(*rxb));
+                        LIBCFS_FREE(buffer, bufsize);
+
+                        spin_lock_irqsave(&rxbp->rxbp_lock, flags);
+                        rc = -ESHUTDOWN;
+                        break;
+                }
+                
+                list_add_tail(&rxb->rxb_list, &rxbp->rxbp_list);
+                rxbp->rxbp_count++;
+
+                spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
+                
+                kptllnd_rx_buffer_post(rxb);
+
+                spin_lock_irqsave(&rxbp->rxbp_lock, flags);
+        }
+
+        if (rc == 0)
+                rxbp->rxbp_reserved += count;
+
+        spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
+
+        return rc;
+}
+
+/* Return 'count' previously reserved message slots to the pool. */
+void
+kptllnd_rx_buffer_pool_unreserve(kptl_rx_buffer_pool_t *rxbp,
+                                 int count)
+{
+        unsigned long irq_flags;
+
+        spin_lock_irqsave(&rxbp->rxbp_lock, irq_flags);
+
+        CDEBUG(D_NET, "kptllnd_rx_buffer_pool_unreserve(%d)\n", count);
+
+        rxbp->rxbp_reserved -= count;
+
+        spin_unlock_irqrestore(&rxbp->rxbp_lock, irq_flags);
+}
+
+/* Tear down the rx buffer pool: set rxbp_shutdown, unlink every posted
+ * buffer's MD, destroy idle buffers, and poll until the list drains. */
+void
+kptllnd_rx_buffer_pool_fini(kptl_rx_buffer_pool_t *rxbp)
+{
+        kptl_rx_buffer_t       *rxb;
+        int                     rc;
+        int                     i;
+        unsigned long           flags;
+        struct list_head       *tmp;
+        struct list_head       *nxt;
+        ptl_handle_md_t         mdh;
+
+        /* CAVEAT EMPTOR: I'm racing with everything here!!!  
+         *
+         * Buffers can still be posted after I set rxbp_shutdown because I
+         * can't hold rxbp_lock while I'm posting them.
+         *
+         * Calling PtlMDUnlink() here races with auto-unlinks; i.e. a buffer's
+         * MD handle could become invalid under me.  I am vulnerable to portals
+         * re-using handles (i.e. make the same handle valid again, but for a
+         * different MD) from when the MD is actually unlinked, to when the
+         * event callback tells me it has been unlinked. */
+
+        spin_lock_irqsave(&rxbp->rxbp_lock, flags);
+
+        rxbp->rxbp_shutdown = 1;
+
+        /* NOTE(review): 'i' starts at 9, so the power-of-2 test below first
+         * fires at i == 16 - presumably to suppress early warnings; confirm
+         * this wasn't meant to be a small value like 2 */
+        for (i = 9;; i++) {
+                list_for_each_safe(tmp, nxt, &rxbp->rxbp_list) {
+                        rxb = list_entry (tmp, kptl_rx_buffer_t, rxb_list);
+                
+                        if (rxb->rxb_idle) {
+                                spin_unlock_irqrestore(&rxbp->rxbp_lock, 
+                                                       flags);
+                                kptllnd_rx_buffer_destroy(rxb);
+                                spin_lock_irqsave(&rxbp->rxbp_lock, 
+                                                  flags);
+                                continue;
+                        }
+
+                        mdh = rxb->rxb_mdh;
+                        if (PtlHandleIsEqual(mdh, PTL_INVALID_HANDLE))
+                                continue;
+                        
+                        /* can't call into portals under a spinlock */
+                        spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
+
+                        rc = PtlMDUnlink(mdh);
+
+                        spin_lock_irqsave(&rxbp->rxbp_lock, flags);
+                        
+#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
+                        /* callback clears rxb_mdh and drops net's ref
+                         * (which causes repost, but since I set
+                         * shutdown, it will just set the buffer
+                         * idle) */
+#else
+                        if (rc == PTL_OK) {
+                                rxb->rxb_posted = 0;
+                                rxb->rxb_mdh = PTL_INVALID_HANDLE;
+                                kptllnd_rx_buffer_decref_locked(rxb);
+                        }
+#endif
+                }
+
+                if (list_empty(&rxbp->rxbp_list))
+                        break;
+
+                spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
+
+                /* Wait a bit for references to be dropped */
+                CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                       "Waiting for %d Busy RX Buffers\n",
+                       rxbp->rxbp_count);
+
+                cfs_pause(cfs_time_seconds(1));
+
+                spin_lock_irqsave(&rxbp->rxbp_lock, flags);
+        }
+
+        spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
+}
+
+/* Post (or re-post) a receive buffer: attach an ME/MD pair on the ptllnd
+ * portal so incoming messages can land anywhere in rxb->rxb_buffer.  If the
+ * pool is shutting down the buffer is marked idle instead of being posted.
+ * On attach failure the net's ref taken here is dropped again, which (per
+ * the XXX below) retries the post immediately. */
+void
+kptllnd_rx_buffer_post(kptl_rx_buffer_t *rxb)
+{
+        int                     rc;
+        ptl_md_t                md;
+        ptl_handle_me_t         meh;
+        ptl_handle_md_t         mdh;
+        ptl_process_id_t        any;
+        kptl_rx_buffer_pool_t  *rxbp = rxb->rxb_pool;
+        unsigned long           flags;
+
+        LASSERT (!in_interrupt());
+        LASSERT (rxb->rxb_refcount == 0);
+        LASSERT (!rxb->rxb_idle);
+        LASSERT (!rxb->rxb_posted);
+        LASSERT (PtlHandleIsEqual(rxb->rxb_mdh, PTL_INVALID_HANDLE));
+
+        /* Accept messages from any NID/PID */
+        any.nid = PTL_NID_ANY;
+        any.pid = PTL_PID_ANY;
+
+        spin_lock_irqsave(&rxbp->rxbp_lock, flags);
+
+        /* Pool is being torn down: park the buffer as idle so the shutdown
+         * loop can destroy it, rather than posting a new MD */
+        if (rxbp->rxbp_shutdown) {
+                rxb->rxb_idle = 1;
+                spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
+                return;
+        }
+
+        rxb->rxb_refcount = 1;                  /* net's ref */
+        rxb->rxb_posted = 1;                    /* I'm posting */
+        
+        spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
+
+        rc = PtlMEAttach(kptllnd_data.kptl_nih,
+                         *kptllnd_tunables.kptl_portal,
+                         any,
+                         LNET_MSG_MATCHBITS,
+                         0, /* all matchbits are valid - ignore none */
+                         PTL_UNLINK,
+                         PTL_INS_AFTER,
+                         &meh);
+        if (rc != PTL_OK) {
+                CERROR("PtlMeAttach rxb failed %d\n", rc);
+                goto failed;
+        }
+
+        /*
+         * Setup MD: infinite threshold, max_size-limited PUTs only,
+         * events delivered to the shared kptl_eqh event queue with
+         * rxb_eventarg identifying this buffer in the callback.
+         */
+        md.start = rxb->rxb_buffer;
+        md.length = PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages;
+        md.threshold = PTL_MD_THRESH_INF;
+        md.options = PTL_MD_OP_PUT |
+                     PTL_MD_LUSTRE_COMPLETION_SEMANTICS |
+                     PTL_MD_EVENT_START_DISABLE |
+                     PTL_MD_MAX_SIZE |
+                     PTL_MD_LOCAL_ALIGN8;
+        md.user_ptr = &rxb->rxb_eventarg;
+        md.max_size = *kptllnd_tunables.kptl_max_msg_size;
+        md.eq_handle = kptllnd_data.kptl_eqh;
+
+        rc = PtlMDAttach(meh, md, PTL_UNLINK, &mdh);
+        if (rc == PTL_OK) {
+                spin_lock_irqsave(&rxbp->rxbp_lock, flags);
+                if (rxb->rxb_posted)            /* Not auto-unlinked yet!!! */
+                        rxb->rxb_mdh = mdh;
+                spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
+                return;
+        }
+        
+        CERROR("PtlMDAttach rxb failed %d\n", rc);
+        rc = PtlMEUnlink(meh);
+        LASSERT(rc == PTL_OK);
+
+ failed:
+        spin_lock_irqsave(&rxbp->rxbp_lock, flags);
+        rxb->rxb_posted = 0;
+        /* XXX this will just try again immediately */
+        kptllnd_rx_buffer_decref_locked(rxb);
+        spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
+}
+
+/* Allocate a zeroed rx descriptor from the rx slab cache (atomic context
+ * safe).  Returns NULL on allocation failure or when the FAIL_RX_ALLOC
+ * failure simulation is armed. */
+kptl_rx_t *
+kptllnd_rx_alloc(void)
+{
+        kptl_rx_t *new_rx = NULL;
+
+        if (IS_SIMULATION_ENABLED(FAIL_RX_ALLOC)) {
+                CERROR ("FAIL_RX_ALLOC SIMULATION triggered\n");
+        } else {
+                new_rx = cfs_mem_cache_alloc(kptllnd_data.kptl_rx_cache,
+                                             CFS_ALLOC_ATOMIC);
+                if (new_rx != NULL)
+                        memset(new_rx, 0, sizeof(*new_rx));
+                else
+                        CERROR("Failed to allocate rx\n");
+        }
+
+        return new_rx;
+}
+
+/* Complete an rx: drop the ref on its buffer (if any), return a credit to
+ * the peer it came from, poke the peer's send path in case credits now need
+ * returning, and free the descriptor back to the slab cache. */
+void
+kptllnd_rx_done(kptl_rx_t *rx)
+{
+        kptl_rx_buffer_t *rxb = rx->rx_rxb;
+        kptl_peer_t      *peer = rx->rx_peer;
+        unsigned long     flags;
+
+        CDEBUG(D_NET, "rx=%p rxb %p peer %p\n", rx, rxb, peer);
+
+        /* rx_rxb is NULL when the message was copied into rx_space */
+        if (rxb != NULL)
+                kptllnd_rx_buffer_decref(rxb);
+
+        if (peer != NULL) {
+                /* Update credits (after I've decref-ed the buffer) */
+                spin_lock_irqsave(&peer->peer_lock, flags);
+
+                peer->peer_outstanding_credits++;
+                LASSERT (peer->peer_outstanding_credits <=
+                         *kptllnd_tunables.kptl_peercredits);
+
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+                CDEBUG(D_NET, "Peer=%s Credits=%d Outstanding=%d\n", 
+                       libcfs_id2str(peer->peer_id), 
+                       peer->peer_credits, peer->peer_outstanding_credits);
+
+                /* I might have to send back credits */
+                kptllnd_peer_check_sends(peer);
+                kptllnd_peer_decref(peer);      /* drop rx's ref on peer */
+        }
+
+        cfs_mem_cache_free(kptllnd_data.kptl_rx_cache, rx);
+}
+
+void
+kptllnd_rx_buffer_callback (ptl_event_t *ev)
+{
+        kptl_eventarg_t        *eva = ev->md.user_ptr;
+        kptl_rx_buffer_t       *rxb = kptllnd_eventarg2obj(eva);
+        kptl_rx_buffer_pool_t  *rxbp = rxb->rxb_pool;
+        kptl_rx_t              *rx;
+        int                     unlinked;
+        unsigned long           flags;
+
+#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
+        unlinked = ev->unlinked;
+#else
+        unlinked = ev->type == PTL_EVENT_UNLINK;
+#endif
+
+        CDEBUG(D_NET, "RXB Callback %s(%d) rxb=%p id=%s unlink=%d rc %d\n",
+               kptllnd_evtype2str(ev->type), ev->type, rxb, 
+               kptllnd_ptlid2str(ev->initiator), 
+               unlinked, ev->ni_fail_type);
+
+        LASSERT (!rxb->rxb_idle);
+        LASSERT (ev->md.start == rxb->rxb_buffer);
+        LASSERT (ev->offset + ev->mlength <= 
+                 PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages);
+        LASSERT (ev->type == PTL_EVENT_PUT_END || 
+                 ev->type == PTL_EVENT_UNLINK);
+        LASSERT (ev->type == PTL_EVENT_UNLINK ||
+                 ev->match_bits == LNET_MSG_MATCHBITS);
+
+        if (ev->ni_fail_type != PTL_NI_OK)
+                CERROR("event type %d, status %d from %s\n",
+                       ev->type, ev->ni_fail_type,
+                       kptllnd_ptlid2str(ev->initiator));
+
+        if (ev->type == PTL_EVENT_PUT_END &&
+            ev->ni_fail_type == PTL_NI_OK &&
+            !rxbp->rxbp_shutdown) {
+
+                /* rxbp_shutdown sampled without locking!  I only treat it as a
+                 * hint since shutdown can start while rx's are queued on
+                 * kptl_sched_rxq. */
+#if (PTL_MD_LOCAL_ALIGN8 == 0)
+                /* Portals can't force message alignment - someone sending an
+                 * odd-length message will misalign subsequent messages and
+                 * force the fixup below...  */
+                if ((ev->mlength & 7) != 0)
+                        CWARN("Message from %s has odd length %d: "
+                              "probable version incompatibility\n",
+                              kptllnd_ptlid2str(ev->initiator),
+                              ev->mlength);
+#endif
+                rx = kptllnd_rx_alloc();
+                if (rx == NULL) {
+                        CERROR("Message from %s dropped: ENOMEM",
+                               kptllnd_ptlid2str(ev->initiator));
+                } else {
+                        if ((ev->offset & 7) == 0) {
+                                kptllnd_rx_buffer_addref(rxb);
+                                rx->rx_rxb = rxb;
+                                rx->rx_nob = ev->mlength;
+                                rx->rx_msg = (kptl_msg_t *)
+                                             (rxb->rxb_buffer + ev->offset);
+                        } else {
+#if (PTL_MD_LOCAL_ALIGN8 == 0)
+                                /* Portals can't force alignment - copy into
+                                 * rx_space (avoiding overflow) to fix */
+                                int maxlen = *kptllnd_tunables.kptl_max_msg_size;
+                                
+                                rx->rx_rxb = NULL;
+                                rx->rx_nob = MIN(maxlen, ev->mlength);
+                                rx->rx_msg = (kptl_msg_t *)rx->rx_space;
+                                memcpy(rx->rx_msg, rxb->rxb_buffer + ev->offset,
+                                       rx->rx_nob);
+#else
+                                /* Portals should have forced the alignment */
+                                LBUG();
+#endif
+                        }
+
+                        rx->rx_initiator = ev->initiator;
+#ifdef CRAY_XT3
+                        rx->rx_uid = ev->uid;
+#endif
+                        /* Queue for attention */
+                        spin_lock_irqsave(&kptllnd_data.kptl_sched_lock, 
+                                          flags);
+
+                        list_add_tail(&rx->rx_list, 
+                                      &kptllnd_data.kptl_sched_rxq);
+                        wake_up(&kptllnd_data.kptl_sched_waitq);
+
+                        spin_unlock_irqrestore(&kptllnd_data.kptl_sched_lock, 
+                                               flags);
+                }
+        }
+
+        if (unlinked) {
+                spin_lock_irqsave(&rxbp->rxbp_lock, flags);
+
+                rxb->rxb_posted = 0;
+                rxb->rxb_mdh = PTL_INVALID_HANDLE;
+                kptllnd_rx_buffer_decref_locked(rxb);
+
+                spin_unlock_irqrestore(&rxbp->rxbp_lock, flags);
+        }
+}
+
+void
+kptllnd_nak (kptl_rx_t *rx)
+{
+        /* Fire-and-forget a stub message that will let the peer know my
+         * protocol magic/version and make her drop/refresh any peer state she
+         * might have with me. */
+        ptl_md_t md = {
+                .start        = kptllnd_data.kptl_nak_msg,
+                .length       = kptllnd_data.kptl_nak_msg->ptlm_nob,
+                .threshold    = 1,      /* single use; PTL_UNLINK below so
+                                         * the MD cleans itself up */
+                .options      = 0,
+                .user_ptr     = NULL,
+                .eq_handle    = PTL_EQ_NONE};   /* no completion events */
+        ptl_handle_md_t   mdh;
+        int               rc;
+
+        rc = PtlMDBind(kptllnd_data.kptl_nih, md, PTL_UNLINK, &mdh);
+        if (rc != PTL_OK) {
+                CWARN("Can't NAK %s: bind failed %d\n",
+                      kptllnd_ptlid2str(rx->rx_initiator), rc);
+                return;
+        }
+
+        /* no ACK requested; failure is only logged (fire-and-forget) */
+        rc = PtlPut(mdh, PTL_NOACK_REQ, rx->rx_initiator,
+                    *kptllnd_tunables.kptl_portal, 0,
+                    LNET_MSG_MATCHBITS, 0, 0);
+
+        if (rc != PTL_OK)
+                CWARN("Can't NAK %s: put failed %d\n",
+                      kptllnd_ptlid2str(rx->rx_initiator), rc);
+}
+
+/* Parse a received message: validate protocol magic/version, unpack it,
+ * check source/destination ids and incarnation stamps, match it to (or, for
+ * HELLO, create) a peer, apply any returned credits, and dispatch by type.
+ * For IMMEDIATE/PUT/GET a successful lnet_parse() takes ownership of 'rx';
+ * every other path completes the rx here via kptllnd_rx_done().
+ *
+ * Fix: the last-matchbits update locked &peer->peer_lock but updated and
+ * unlocked via rx->rx_peer.  The two are the same peer (rx->rx_peer = peer
+ * is assigned just above), but the mixed spellings obscured the lock
+ * pairing; use 'peer' consistently. */
+void
+kptllnd_rx_parse(kptl_rx_t *rx)
+{
+        kptl_msg_t             *msg = rx->rx_msg;
+        kptl_peer_t            *peer;
+        int                     rc;
+        int                     credits;
+        unsigned long           flags;
+        lnet_process_id_t       srcid;
+
+        LASSERT (rx->rx_peer == NULL);
+
+        CDEBUG (D_NET, "%s: nob=%d %08x %04x %02x %d %d\n",
+                kptllnd_ptlid2str(rx->rx_initiator),
+                rx->rx_nob, msg->ptlm_magic, msg->ptlm_version,
+                msg->ptlm_type, msg->ptlm_credits, msg->ptlm_nob);
+
+        /* Recognise LNET-magic senders and wrong-version ptllnd senders
+         * (either endianness) before attempting to unpack */
+        if ((rx->rx_nob >= 4 &&
+             (msg->ptlm_magic == LNET_PROTO_MAGIC ||
+              msg->ptlm_magic == __swab32(LNET_PROTO_MAGIC))) ||
+            (rx->rx_nob >= 6 &&
+             ((msg->ptlm_magic == PTLLND_MSG_MAGIC &&
+               msg->ptlm_version != PTLLND_MSG_VERSION) ||
+              (msg->ptlm_magic == __swab32(PTLLND_MSG_MAGIC) &&
+               msg->ptlm_version != __swab16(PTLLND_MSG_VERSION))))) {
+                /* NAK incompatible versions
+                 * See other LNDs for how to handle this if/when ptllnd begins
+                 * to allow different versions to co-exist */
+                CERROR("Bad version: got %04x expected %04x from %s\n",
+                       (__u32)(msg->ptlm_magic == PTLLND_MSG_MAGIC ?
+                               msg->ptlm_version : __swab16(msg->ptlm_version)),
+                        PTLLND_MSG_VERSION, kptllnd_ptlid2str(rx->rx_initiator));
+                kptllnd_nak(rx);
+                goto rx_done;
+        }
+        
+        /* Byte-swap (if required) and validate the message */
+        rc = kptllnd_msg_unpack(msg, rx->rx_nob);
+        if (rc != 0) {
+                CERROR ("Error %d unpacking rx from %s\n",
+                        rc, kptllnd_ptlid2str(rx->rx_initiator));
+                goto rx_done;
+        }
+
+        CDEBUG(D_NET, "rx=%p type=%s(%d) nob %d cred %d\n",
+               rx, kptllnd_msgtype2str(msg->ptlm_type), msg->ptlm_type,
+               msg->ptlm_nob, msg->ptlm_credits);
+
+        srcid.nid = msg->ptlm_srcnid;
+        srcid.pid = msg->ptlm_srcpid;
+
+        /* The claimed source NID must match the portals initiator */
+        if (srcid.nid != kptllnd_ptl2lnetnid(rx->rx_initiator.nid)) {
+                CERROR("Bad source id %s from %s\n",
+                       libcfs_id2str(srcid),
+                       kptllnd_ptlid2str(rx->rx_initiator));
+                goto rx_done;
+        }
+
+        if (msg->ptlm_type == PTLLND_MSG_TYPE_NAK) {
+                peer = kptllnd_id2peer(srcid);
+                if (peer == NULL)               /* no connection to nuke */
+                        goto rx_done;
+                
+                CWARN("NAK from %s (%s)\n",
+                      libcfs_id2str(srcid),
+                      kptllnd_ptlid2str(rx->rx_initiator));
+
+                rc = -EPROTO;
+                goto failed;
+        }
+
+        if (msg->ptlm_dstnid != kptllnd_data.kptl_ni->ni_nid ||
+            msg->ptlm_dstpid != the_lnet.ln_pid) {
+                CERROR("Bad dstid %s (expected %s) from %s\n",
+                       libcfs_id2str((lnet_process_id_t) {
+                               .nid = msg->ptlm_dstnid,
+                               .pid = msg->ptlm_dstpid}),
+                       libcfs_id2str((lnet_process_id_t) {
+                               .nid = kptllnd_data.kptl_ni->ni_nid,
+                               .pid = the_lnet.ln_pid}),
+                       kptllnd_ptlid2str(rx->rx_initiator));
+                goto rx_done;
+        }
+
+        if (msg->ptlm_type == PTLLND_MSG_TYPE_HELLO) {
+                /* establishes (or refreshes) the connection */
+                peer = kptllnd_peer_handle_hello(rx->rx_initiator, msg);
+                if (peer == NULL) {
+                        CWARN("No peer for %s\n",
+                              kptllnd_ptlid2str(rx->rx_initiator));
+                        goto rx_done;
+                }
+        } else {
+                peer = kptllnd_id2peer(srcid);
+                if (peer == NULL) {
+                        CWARN("NAK %s: no connection; peer must reconnect\n",
+                              libcfs_id2str(srcid));
+                        /* NAK to make the peer reconnect */
+                        kptllnd_nak(rx);
+                        goto rx_done;
+                }
+
+                /* Ignore anything else while I'm waiting for HELLO */
+                if (peer->peer_state == PEER_STATE_WAITING_HELLO) {
+                        kptllnd_peer_decref(peer);
+                        goto rx_done;
+                }
+        }
+
+        LASSERT (msg->ptlm_srcnid == peer->peer_id.nid &&
+                 msg->ptlm_srcpid == peer->peer_id.pid);
+
+        /* Reject messages from a previous incarnation of the peer... */
+        if (msg->ptlm_srcstamp != peer->peer_incarnation) {
+                CERROR("Stale rx from %s srcstamp "LPX64" expected "LPX64"\n",
+                       libcfs_id2str(peer->peer_id),
+                       msg->ptlm_srcstamp,
+                       peer->peer_incarnation);
+                rc = -EPROTO;
+                goto failed;
+        }
+
+        /* ...and messages intended for a previous incarnation of me */
+        if (msg->ptlm_dststamp != kptllnd_data.kptl_incarnation &&
+            (msg->ptlm_type != PTLLND_MSG_TYPE_HELLO || /* HELLO sends a */
+             msg->ptlm_dststamp != 0)) {                /* zero dststamp */
+                CERROR("Stale rx from %s dststamp "LPX64" expected "LPX64"\n",
+                       libcfs_id2str(peer->peer_id), msg->ptlm_dststamp,
+                       kptllnd_data.kptl_incarnation);
+                rc = -EPROTO;
+                goto failed;
+        }
+
+        /* Bank any credits the peer has returned to me */
+        if (msg->ptlm_credits != 0) {
+                spin_lock_irqsave(&peer->peer_lock, flags);
+
+                if (peer->peer_credits + msg->ptlm_credits >
+                    *kptllnd_tunables.kptl_peercredits) {
+                        credits = peer->peer_credits;
+                        spin_unlock_irqrestore(&peer->peer_lock, flags);
+                        
+                        CERROR("Credit overflow from %s: %d + %d > %d\n",
+                               libcfs_id2str(peer->peer_id),
+                               credits, msg->ptlm_credits,
+                               *kptllnd_tunables.kptl_peercredits);
+                        rc = -EPROTO;
+                        goto failed;
+                }
+                               
+                peer->peer_credits += msg->ptlm_credits;
+
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+                /* new credits may unblock queued sends */
+                kptllnd_peer_check_sends(peer);
+        }
+
+        /* ptllnd-level protocol correct - rx takes my ref on peer and increments
+         * peer_outstanding_credits when it completes */
+        rx->rx_peer = peer;
+        kptllnd_peer_alive(peer);
+
+        switch (msg->ptlm_type) {
+        default:
+                /* already checked by kptllnd_msg_unpack() */
+                LBUG();
+
+        case PTLLND_MSG_TYPE_HELLO:
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_HELLO\n");
+                goto rx_done;
+
+        case PTLLND_MSG_TYPE_NOOP:
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_NOOP\n");
+                goto rx_done;
+
+        case PTLLND_MSG_TYPE_IMMEDIATE:
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_IMMEDIATE\n");
+                rc = lnet_parse(kptllnd_data.kptl_ni,
+                                &msg->ptlm_u.immediate.kptlim_hdr,
+                                msg->ptlm_srcnid,
+                                rx, 0);
+                if (rc >= 0)                    /* kptllnd_recv owns 'rx' now */
+                        return;
+                goto failed;
+                
+        case PTLLND_MSG_TYPE_PUT:
+        case PTLLND_MSG_TYPE_GET:
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_%s\n",
+                        msg->ptlm_type == PTLLND_MSG_TYPE_PUT ?
+                        "PUT" : "GET");
+
+                /* checked in kptllnd_msg_unpack() */
+                LASSERT (msg->ptlm_u.rdma.kptlrm_matchbits >= 
+                         PTL_RESERVED_MATCHBITS);
+
+                /* Update last match bits seen */
+                spin_lock_irqsave(&peer->peer_lock, flags);
+
+                if (msg->ptlm_u.rdma.kptlrm_matchbits >
+                    peer->peer_last_matchbits_seen)
+                        peer->peer_last_matchbits_seen =
+                                msg->ptlm_u.rdma.kptlrm_matchbits;
+
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+                rc = lnet_parse(kptllnd_data.kptl_ni,
+                                &msg->ptlm_u.rdma.kptlrm_hdr,
+                                msg->ptlm_srcnid,
+                                rx, 1);
+                if (rc >= 0)                    /* kptllnd_recv owns 'rx' now */
+                        return;
+                goto failed;
+        }
+
+ failed:
+        kptllnd_peer_close(peer, rc);
+        if (rx->rx_peer == NULL)                /* drop ref on peer */
+                kptllnd_peer_decref(peer);      /* unless rx_done will */
+ rx_done:
+        kptllnd_rx_done(rx);
+}
diff --git a/lnet/klnds/ptllnd/ptllnd_tx.c b/lnet/klnds/ptllnd/ptllnd_tx.c
new file mode 100644 (file)
index 0000000..1c086d8
--- /dev/null
@@ -0,0 +1,494 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
+ *   Author: PJ Kirner <pjkirner@clusterfs.com>
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   This file is confidential source code owned by Cluster File Systems.
+ *   No viewing, modification, compilation, redistribution, or any other
+ *   form of use is permitted except through a signed license agreement.
+ *
+ *   If you have not signed such an agreement, then you have no rights to
+ *   this file.  Please destroy it immediately and contact CFS.
+ *
+ */
+
+ #include "ptllnd.h"
+
+/* Free a tx descriptor and its attached buffers, and keep the global tx
+ * count (and its mirrored tunable) in step. */
+void
+kptllnd_free_tx(kptl_tx_t *tx)
+{
+        /* The payload members may be NULL (partially-constructed tx from a
+         * failed kptllnd_alloc_tx); free whichever are present, then the
+         * descriptor itself. */
+        if (tx->tx_rdma_frags != NULL)
+                LIBCFS_FREE(tx->tx_rdma_frags,
+                            sizeof(*tx->tx_rdma_frags));
+
+        if (tx->tx_msg != NULL)
+                LIBCFS_FREE(tx->tx_msg,
+                            *kptllnd_tunables.kptl_max_msg_size);
+
+        LIBCFS_FREE(tx, sizeof(*tx));
+
+        atomic_dec(&kptllnd_data.kptl_ntx);
+
+        /* Keep the tunable in step for visibility */
+        *kptllnd_tunables.kptl_ntx = atomic_read(&kptllnd_data.kptl_ntx);
+}
+
+/* Allocate and initialise a tx descriptor, including its message buffer
+ * and rdma frag array.  Returns NULL on any allocation failure (partial
+ * allocations are released via kptllnd_free_tx, which tolerates NULL
+ * members).  Also maintains the global tx count and its mirrored tunable. */
+kptl_tx_t *
+kptllnd_alloc_tx(void)
+{
+        kptl_tx_t       *tx;
+
+        LIBCFS_ALLOC(tx, sizeof(*tx));
+        if (tx == NULL) {
+                CERROR("Failed to allocate TX\n");
+                return NULL;
+        }
+
+        atomic_inc(&kptllnd_data.kptl_ntx);
+
+        /* Keep the tunable in step for visibility */
+        *kptllnd_tunables.kptl_ntx = atomic_read(&kptllnd_data.kptl_ntx);
+
+        /* NULL the buffer pointers before the allocations below so the
+         * failure path can free only what was actually allocated */
+        tx->tx_idle = 1;
+        tx->tx_rdma_mdh = PTL_INVALID_HANDLE;
+        tx->tx_msg_mdh = PTL_INVALID_HANDLE;
+        tx->tx_rdma_eventarg.eva_type = PTLLND_EVENTARG_TYPE_RDMA;
+        tx->tx_msg_eventarg.eva_type = PTLLND_EVENTARG_TYPE_MSG;
+        tx->tx_msg = NULL;
+        tx->tx_rdma_frags = NULL;
+                
+        LIBCFS_ALLOC(tx->tx_msg, *kptllnd_tunables.kptl_max_msg_size);
+        if (tx->tx_msg == NULL) {
+                CERROR("Failed to allocate TX payload\n");
+                goto failed;
+        }
+
+        LIBCFS_ALLOC(tx->tx_rdma_frags, sizeof(*tx->tx_rdma_frags));
+        if (tx->tx_rdma_frags == NULL) {
+                CERROR("Failed to allocate TX frags\n");
+                goto failed;
+        }
+
+        return tx;
+
+ failed:
+        kptllnd_free_tx(tx);
+        return NULL;
+}
+
+/* Pre-allocate the configured number (kptl_ntx) of tx descriptors and park
+ * them on the idle list.  Returns 0 on success or -ENOMEM if any allocation
+ * fails (descriptors already queued are left for the normal cleanup path).
+ *
+ * Fix: declare the parameter list as (void) - an empty () in C declares an
+ * unprototyped function, which disables argument checking. */
+int
+kptllnd_setup_tx_descs(void)
+{
+        int       n = *kptllnd_tunables.kptl_ntx;
+        int       i;
+        
+        for (i = 0; i < n; i++) {
+                kptl_tx_t *tx = kptllnd_alloc_tx();
+                
+                if (tx == NULL)
+                        return -ENOMEM;
+                
+                spin_lock(&kptllnd_data.kptl_tx_lock);
+                
+                list_add_tail(&tx->tx_list, &kptllnd_data.kptl_idle_txs);
+                
+                spin_unlock(&kptllnd_data.kptl_tx_lock);
+        }
+        
+        return 0;
+}
+
+/* Free every tx descriptor on the idle list.  Called only after shutdown
+ * has quiesced everything (asserted below), so no locking is needed, and
+ * every tx must be back on the idle list by now (final LASSERT).
+ *
+ * Fix: declare the parameter list as (void) - an empty () in C declares an
+ * unprototyped function, which disables argument checking. */
+void
+kptllnd_cleanup_tx_descs(void)
+{
+        kptl_tx_t       *tx;
+
+        /* No locking; single threaded now */
+        LASSERT (kptllnd_data.kptl_shutdown == 2);
+
+        while (!list_empty(&kptllnd_data.kptl_idle_txs)) {
+                tx = list_entry(kptllnd_data.kptl_idle_txs.next,
+                                kptl_tx_t, tx_list);
+                
+                list_del(&tx->tx_list);
+                kptllnd_free_tx(tx);
+        }
+
+        LASSERT (atomic_read(&kptllnd_data.kptl_ntx) == 0);
+}
+
+/* Get a tx descriptor of the given type: from the idle list if one is
+ * available, otherwise freshly allocated.  Returns NULL on allocation
+ * failure or when a matching failure simulation is armed.  The returned tx
+ * holds one reference and is marked busy. */
+kptl_tx_t *
+kptllnd_get_idle_tx(enum kptl_tx_type type)
+{
+        kptl_tx_t      *tx = NULL;
+
+        /* Failure injection hooks for testing tx-exhaustion paths */
+        if (IS_SIMULATION_ENABLED(FAIL_TX_PUT_ALLOC) && 
+            type == TX_TYPE_PUT_REQUEST) {
+                CERROR("FAIL_TX_PUT_ALLOC SIMULATION triggered\n");
+                return NULL;
+        }
+
+        if (IS_SIMULATION_ENABLED(FAIL_TX_GET_ALLOC) && 
+            type == TX_TYPE_GET_REQUEST) {
+                CERROR ("FAIL_TX_GET_ALLOC SIMULATION triggered\n");
+                return NULL;
+        }
+
+        if (IS_SIMULATION_ENABLED(FAIL_TX)) {
+                CERROR ("FAIL_TX SIMULATION triggered\n");
+                return NULL;
+        }
+
+        spin_lock(&kptllnd_data.kptl_tx_lock);
+
+        /* NB: both branches below release kptl_tx_lock - the empty case
+         * drops it before allocating (kptllnd_alloc_tx can sleep) */
+        if (list_empty (&kptllnd_data.kptl_idle_txs)) {
+                spin_unlock(&kptllnd_data.kptl_tx_lock);
+
+                tx = kptllnd_alloc_tx();
+                if (tx == NULL)
+                        return NULL;
+        } else {
+                tx = list_entry(kptllnd_data.kptl_idle_txs.next, 
+                                kptl_tx_t, tx_list);
+                list_del(&tx->tx_list);
+
+                spin_unlock(&kptllnd_data.kptl_tx_lock);
+        }
+
+        /* Whatever its origin, the tx must be pristine */
+        LASSERT (atomic_read(&tx->tx_refcount)== 0);
+        LASSERT (tx->tx_idle);
+        LASSERT (!tx->tx_active);
+        LASSERT (tx->tx_lnet_msg == NULL);
+        LASSERT (tx->tx_lnet_replymsg == NULL);
+        LASSERT (tx->tx_peer == NULL);
+        LASSERT (PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE));
+        LASSERT (PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE));
+        
+        tx->tx_type = type;
+        atomic_set(&tx->tx_refcount, 1);        /* caller's ref */
+        tx->tx_status = 0;
+        tx->tx_idle = 0;
+
+        CDEBUG(D_NET, "tx=%p\n", tx);
+        return tx;
+}
+
+#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
+/* (LUSTRE_PORTALS_UNLINK_SEMANTICS variant)
+ * Abort any network I/O still outstanding on a completed-or-failed tx.
+ * Returns 0 if both MDs are already invalid (nothing to do); otherwise
+ * re-activates the tx on its peer's active queue, unlinks the busy MDs
+ * (their unlink events will complete the tx), and returns -EAGAIN so the
+ * caller (kptllnd_tx_fini) defers finalisation. */
+int
+kptllnd_tx_abort_netio(kptl_tx_t *tx)
+{
+        kptl_peer_t     *peer = tx->tx_peer;
+        ptl_handle_md_t  msg_mdh;
+        ptl_handle_md_t  rdma_mdh;
+        unsigned long    flags;
+
+        LASSERT (atomic_read(&tx->tx_refcount) == 0);
+        LASSERT (!tx->tx_active);
+
+        spin_lock_irqsave(&peer->peer_lock, flags);
+
+        /* sample both handles under the lock */
+        msg_mdh = tx->tx_msg_mdh;
+        rdma_mdh = tx->tx_rdma_mdh;
+
+        if (PtlHandleIsEqual(msg_mdh, PTL_INVALID_HANDLE) &&
+            PtlHandleIsEqual(rdma_mdh, PTL_INVALID_HANDLE)) {
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
+                return 0;
+        }
+        
+        /* Uncompleted comms: there must have been some error and it must be
+         * propagated to LNET... */
+        LASSERT (tx->tx_status != 0 ||
+                 (tx->tx_lnet_msg == NULL && 
+                  tx->tx_lnet_replymsg == NULL));
+
+        /* stash the tx on its peer until it completes */
+        atomic_set(&tx->tx_refcount, 1);
+        tx->tx_active = 1;
+        list_add_tail(&tx->tx_list, &peer->peer_activeq);
+        
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+        /* These unlinks will ensure completion events (normal or unlink) will
+         * happen ASAP */
+
+        if (!PtlHandleIsEqual(msg_mdh, PTL_INVALID_HANDLE))
+                PtlMDUnlink(msg_mdh);
+        
+        if (!PtlHandleIsEqual(rdma_mdh, PTL_INVALID_HANDLE))
+                PtlMDUnlink(rdma_mdh);
+
+        return -EAGAIN;
+}
+#else
+/* (non-LUSTRE_PORTALS_UNLINK_SEMANTICS variant)
+ * Abort any network I/O still outstanding on a completed-or-failed tx.
+ * Unlike the variant above, a successful PtlMDUnlink() here does NOT
+ * generate an unlink event, so unlinked handles are cleared directly; only
+ * if an MD remains busy after the unlink attempts is the tx parked on the
+ * peer's active queue and the peer closed (so the watchdog retries).
+ * Returns 0 when nothing remains outstanding, -EAGAIN otherwise.
+ *
+ * Fixes: 'peer' was declared as ptl_peer_t instead of kptl_peer_t (the
+ * type used by the sibling #ifdef branch and the rest of this file), and
+ * the LASSERT referenced tx->tx_replymsg instead of tx->tx_lnet_replymsg
+ * (the field name used by the sibling branch and kptllnd_tx_fini). */
+int
+kptllnd_tx_abort_netio(kptl_tx_t *tx)
+{
+        kptl_peer_t     *peer = tx->tx_peer;
+        ptl_handle_md_t  msg_mdh;
+        ptl_handle_md_t  rdma_mdh;
+        unsigned long    flags;
+        ptl_err_t        prc;
+
+        LASSERT (atomic_read(&tx->tx_refcount) == 0);
+        LASSERT (!tx->tx_active);
+
+        spin_lock_irqsave(&peer->peer_lock, flags);
+
+        /* sample both handles under the lock */
+        msg_mdh = tx->tx_msg_mdh;
+        rdma_mdh = tx->tx_rdma_mdh;
+
+        if (PtlHandleIsEqual(msg_mdh, PTL_INVALID_HANDLE) &&
+            PtlHandleIsEqual(rdma_mdh, PTL_INVALID_HANDLE)) {
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
+                return 0;
+        }
+        
+        /* Uncompleted comms: there must have been some error and it must be
+         * propagated to LNET... */
+        LASSERT (tx->tx_status != 0 ||
+                 (tx->tx_lnet_msg == NULL && 
+                  tx->tx_lnet_replymsg == NULL));
+
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+        /* try to unlink (no event will fire on success) */
+        if (!PtlHandleIsEqual(msg_mdh, PTL_INVALID_HANDLE)) {
+                prc = PtlMDUnlink(msg_mdh);
+                if (prc == PTL_OK)
+                        msg_mdh = PTL_INVALID_HANDLE;
+        }
+
+        if (!PtlHandleIsEqual(rdma_mdh, PTL_INVALID_HANDLE)) {
+                prc = PtlMDUnlink(rdma_mdh);
+                if (prc == PTL_OK)
+                        rdma_mdh = PTL_INVALID_HANDLE;
+        }
+
+        spin_lock_irqsave(&peer->peer_lock, flags);
+
+        /* update tx_???_mdh if callback hasn't fired */
+        if (PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE))
+                msg_mdh = PTL_INVALID_HANDLE;
+        else
+                tx->tx_msg_mdh = msg_mdh;
+        
+        if (PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE))
+                rdma_mdh = PTL_INVALID_HANDLE;
+        else
+                tx->tx_rdma_mdh = rdma_mdh;
+
+        if (PtlHandleIsEqual(msg_mdh, PTL_INVALID_HANDLE) &&
+            PtlHandleIsEqual(rdma_mdh, PTL_INVALID_HANDLE)) {
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
+                return 0;
+        }
+
+        /* stash the tx on its peer until it completes */
+        atomic_set(&tx->tx_refcount, 1);
+        tx->tx_active = 1;
+        list_add_tail(&tx->tx_list, &peer->peer_activeq);
+
+        kptllnd_peer_addref(peer);              /* extra ref for me... */
+
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+        /* This will get the watchdog thread to try aborting all the peer's
+         * comms again.  NB, this deems it fair that 1 failing tx which can't
+         * be aborted immediately (i.e. its MDs are still busy) is valid cause
+         * to nuke everything to the same peer! */
+        kptllnd_peer_close(peer, tx->tx_status);
+
+        kptllnd_peer_decref(peer);
+
+        return -EAGAIN;
+}
+#endif
+
+/* Finalise a tx whose last reference has been dropped.  If network I/O is
+ * still outstanding, kptllnd_tx_abort_netio() re-activates the tx and this
+ * function simply returns - it will be called again when the I/O really
+ * completes.  Otherwise the tx is returned to the idle list and any LNET
+ * messages attached to it are finalised. */
+void
+kptllnd_tx_fini (kptl_tx_t *tx)
+{
+        /* snapshot everything needed after the tx is back on the idle list */
+        lnet_msg_t     *replymsg = tx->tx_lnet_replymsg;
+        lnet_msg_t     *msg      = tx->tx_lnet_msg;
+        kptl_peer_t    *peer     = tx->tx_peer;
+        int             status   = tx->tx_status;
+        int             rc;
+
+        LASSERT (!in_interrupt());
+        LASSERT (atomic_read(&tx->tx_refcount) == 0);
+        LASSERT (!tx->tx_idle);
+        LASSERT (!tx->tx_active);
+
+        /* TX has completed or failed */
+
+        if (peer != NULL) {
+                rc = kptllnd_tx_abort_netio(tx);
+                if (rc != 0)            /* MDs busy: completion deferred */
+                        return;
+        }
+
+        LASSERT (PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE));
+        LASSERT (PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE));
+
+        tx->tx_lnet_msg = tx->tx_lnet_replymsg = NULL;
+        tx->tx_peer = NULL;
+        tx->tx_idle = 1;
+
+        spin_lock(&kptllnd_data.kptl_tx_lock);
+        list_add_tail(&tx->tx_list, &kptllnd_data.kptl_idle_txs);
+        spin_unlock(&kptllnd_data.kptl_tx_lock);
+
+        /* Must finalize AFTER freeing 'tx' */
+        if (msg != NULL)
+                lnet_finalize(kptllnd_data.kptl_ni, msg,
+                              (replymsg == NULL) ? status : 0);
+
+        if (replymsg != NULL)
+                lnet_finalize(kptllnd_data.kptl_ni, replymsg, status);
+
+        if (peer != NULL)
+                kptllnd_peer_decref(peer);
+}
+
+/* Return a short human-readable name for a tx type (for logging).
+ * Fix: removed the unreachable 'break' statements that followed returns. */
+const char *
+kptllnd_tx_typestr(int type)
+{
+        switch (type) {
+        default:
+                return "<TYPE UNKNOWN>";
+                
+        case TX_TYPE_SMALL_MESSAGE:
+                return "msg";
+
+        case TX_TYPE_PUT_REQUEST:
+                return "put_req";
+
+        case TX_TYPE_GET_REQUEST:
+                return "get_req";
+
+        case TX_TYPE_PUT_RESPONSE:
+                return "put_rsp";
+
+        case TX_TYPE_GET_RESPONSE:
+                return "get_rsp";
+        }
+}
+
+/* Portals event callback for a tx's message MD or its RDMA MD.
+ * Sanity-checks the event against the tx type, updates peer liveness,
+ * and when this MD unlinks -- and the tx's other MD is already gone and
+ * the tx is no longer active -- drops the tx reference; the last ref
+ * queues the tx on the scheduler queue for finalization in thread
+ * context (see kptllnd_tx_fini). */
+void
+kptllnd_tx_callback(ptl_event_t *ev)
+{
+        kptl_eventarg_t *eva = ev->md.user_ptr;
+        int              ismsg = (eva->eva_type == PTLLND_EVENTARG_TYPE_MSG);
+        kptl_tx_t       *tx = kptllnd_eventarg2obj(eva);
+        kptl_peer_t     *peer = tx->tx_peer;
+        int              ok = (ev->ni_fail_type == PTL_OK);
+        int              unlinked;
+        unsigned long    flags;
+
+        LASSERT (peer != NULL);
+        LASSERT (eva->eva_type == PTLLND_EVENTARG_TYPE_MSG ||
+                 eva->eva_type == PTLLND_EVENTARG_TYPE_RDMA);
+        LASSERT (!ismsg || !PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE));
+        LASSERT (ismsg || !PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE));
+
+/* Two unlink conventions exist: Lustre-patched Portals flags unlink on
+ * any event; stock Portals delivers a separate PTL_EVENT_UNLINK. */
+#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
+        unlinked = ev->unlinked;
+#else
+        unlinked = (ev->type == PTL_EVENT_UNLINK);
+#endif
+        CDEBUG(D_NET, "%s(%d) tx=%p(%s) fail=%d unlinked=%d\n",
+               kptllnd_evtype2str(ev->type), ev->type, 
+               tx, libcfs_id2str(peer->peer_id), 
+               ev->ni_fail_type, unlinked);
+
+        /* Assert that the event type is legal for this tx type */
+        switch (tx->tx_type) {
+        default:
+                LBUG();
+                
+        case TX_TYPE_SMALL_MESSAGE:
+                LASSERT (ismsg);
+                LASSERT (ev->type == PTL_EVENT_UNLINK ||
+                         ev->type == PTL_EVENT_SEND_END);
+                break;
+
+        case TX_TYPE_PUT_REQUEST:
+                LASSERT (ev->type == PTL_EVENT_UNLINK ||
+                         (ismsg && ev->type == PTL_EVENT_SEND_END) ||
+                         (!ismsg && ev->type == PTL_EVENT_GET_END));
+                break;
+
+        case TX_TYPE_GET_REQUEST:
+                LASSERT (ev->type == PTL_EVENT_UNLINK ||
+                         (ismsg && ev->type == PTL_EVENT_SEND_END) ||
+                         (!ismsg && ev->type == PTL_EVENT_PUT_END));
+
+                if (!ismsg && ok && ev->type == PTL_EVENT_PUT_END) {
+                        if (ev->hdr_data == PTLLND_RDMA_OK) {
+                                /* Peer PUT the GET reply data; record how
+                                 * much actually arrived */
+                                lnet_set_reply_msg_len(
+                                        kptllnd_data.kptl_ni,
+                                        tx->tx_lnet_replymsg,
+                                        ev->mlength);
+                        } else {
+                                /* no match at peer */
+                                tx->tx_status = -EIO;
+                        }
+                }
+                break;
+
+        case TX_TYPE_PUT_RESPONSE:
+                LASSERT (!ismsg);
+                LASSERT (ev->type == PTL_EVENT_UNLINK ||
+                         ev->type == PTL_EVENT_SEND_END ||
+                         ev->type == PTL_EVENT_REPLY_END);
+                break;
+
+        case TX_TYPE_GET_RESPONSE:
+                LASSERT (!ismsg);
+                LASSERT (ev->type == PTL_EVENT_UNLINK ||
+                         ev->type == PTL_EVENT_SEND_END);
+                break;
+        }
+
+        /* Any NI failure nukes the peer; success refreshes its liveness */
+        if (!ok)
+                kptllnd_peer_close(peer, -EIO);
+        else
+                kptllnd_peer_alive(peer);
+
+        if (!unlinked)
+                return;
+
+        spin_lock_irqsave(&peer->peer_lock, flags);
+
+        /* This MD is gone now */
+        if (ismsg)
+                tx->tx_msg_mdh = PTL_INVALID_HANDLE;
+        else
+                tx->tx_rdma_mdh = PTL_INVALID_HANDLE;
+
+        /* Nothing more to do while the other MD is still live, or the tx
+         * has already been removed from the peer's active list */
+        if (!PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE) ||
+            !PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE) ||
+            !tx->tx_active) {
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
+                return;
+        }
+
+        list_del(&tx->tx_list);
+        tx->tx_active = 0;
+
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+        /* drop peer's ref, but if it was the last one... */
+        if (atomic_dec_and_test(&tx->tx_refcount)) {
+                /* ...finalize it in thread context! */
+                spin_lock_irqsave(&kptllnd_data.kptl_sched_lock, flags);
+
+                list_add_tail(&tx->tx_list, &kptllnd_data.kptl_sched_txq);
+                wake_up(&kptllnd_data.kptl_sched_waitq);
+
+                spin_unlock_irqrestore(&kptllnd_data.kptl_sched_lock, flags);
+        }
+}
diff --git a/lnet/klnds/ptllnd/wirecheck.c b/lnet/klnds/ptllnd/wirecheck.c
new file mode 100644 (file)
index 0000000..8111cbb
--- /dev/null
@@ -0,0 +1,206 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
+ *   Author: PJ Kirner <pjkirner@clusterfs.com>
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   This file is confidential source code owned by Cluster File Systems.
+ *   No viewing, modification, compilation, redistribution, or any other
+ *   form of use is permitted except through a signed license agreement.
+ *
+ *   If you have not signed such an agreement, then you have no rights to
+ *   this file.  Please destroy it immediately and contact CFS.
+ *
+ */
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include <config.h>
+
+#include <lnet/api-support.h>
+
+/* This ghastly hack allows me to include lib-types.h.  It doesn't affect
+ * any assertions generated here (but fails safe if it ever does) */
+typedef struct {
+        int     counter;
+} atomic_t;
+
+#include <lnet/lib-types.h>
+#include <lnet/ptllnd_wire.h>
+
+/* Fallback for C libraries without strnlen(); safe here because every
+ * string passed in is NUL-terminated within its buffer */
+#ifndef HAVE_STRNLEN
+#define strnlen(s, i) strlen(s)
+#endif
+
+/* Code-generation helpers: each macro printf()s one line of the body of
+ * kptllnd_assert_wire_constants(), a function of CLASSERT()s that pins
+ * the wire protocol's constants and struct layouts at compile time. */
+#define BLANK_LINE()                            \
+do {                                            \
+        printf ("\n");                          \
+} while (0)
+
+/* Emit a C comment into the generated function */
+#define COMMENT(c)                              \
+do {                                            \
+        printf ("        /* "c" */\n");         \
+} while (0)
+
+#undef STRINGIFY
+#define STRINGIFY(a) #a
+
+/* Assert that symbolic constant 'a' keeps its current textual expansion */
+#define CHECK_DEFINE(a)                                         \
+do {                                                            \
+        printf ("        CLASSERT ("#a" == "STRINGIFY(a)");\n"); \
+} while (0)
+
+/* Assert that expression 'a' keeps its current numeric value */
+#define CHECK_VALUE(a)                                  \
+do {                                                    \
+        printf ("        CLASSERT ("#a" == %d);\n", a);  \
+} while (0)
+
+/* Assert the byte offset of member m within struct s */
+#define CHECK_MEMBER_OFFSET(s,m)                \
+do {                                            \
+        CHECK_VALUE((int)offsetof(s, m));       \
+} while (0)
+
+/* Assert the size of member m of struct s */
+#define CHECK_MEMBER_SIZEOF(s,m)                \
+do {                                            \
+        CHECK_VALUE((int)sizeof(((s *)0)->m));  \
+} while (0)
+
+/* Assert both offset and size of a struct member */
+#define CHECK_MEMBER(s,m)                       \
+do {                                            \
+        CHECK_MEMBER_OFFSET(s, m);              \
+        CHECK_MEMBER_SIZEOF(s, m);              \
+} while (0)
+
+/* Start the checks for struct s: comment header + overall size */
+#define CHECK_STRUCT(s)                         \
+do {                                            \
+        BLANK_LINE ();                          \
+        COMMENT ("Checks for struct "#s);       \
+        CHECK_VALUE((int)sizeof(s));            \
+} while (0)
+
+/* Run 'cmdline' through the shell and capture the first line of its
+ * stdout into str (at most len bytes, always NUL-terminated, trailing
+ * newline stripped).  Any failure abort()s: this runs at build time to
+ * generate code, so dying loudly is the right behaviour. */
+void
+system_string (char *cmdline, char *str, int len)
+{
+        int   fds[2];
+        int   rc;
+        pid_t pid;
+
+        rc = pipe (fds);
+        if (rc != 0)
+                abort ();
+
+        pid = fork ();
+        if (pid == 0) {
+                /* child */
+                int   fd = fileno(stdout);
+
+                /* Route stdout into the pipe's write end, then run the
+                 * command via the shell; exit status propagates to the
+                 * parent's waitpid() below */
+                rc = dup2(fds[1], fd);
+                if (rc != fd)
+                        abort();
+
+                exit(system(cmdline));
+                /* notreached */
+        } else if ((int)pid < 0) {
+                abort();
+        } else {
+                /* parent: read the command's first output line */
+                FILE *f = fdopen (fds[0], "r");
+
+                if (f == NULL)
+                        abort();
+
+                close(fds[1]);
+
+                if (fgets(str, len, f) == NULL)
+                        abort();
+
+                if (waitpid(pid, &rc, 0) != pid)
+                        abort();
+
+                if (!WIFEXITED(rc) ||
+                    WEXITSTATUS(rc) != 0)
+                        abort();
+
+                /* Ensure NUL termination if the line filled the buffer */
+                if (strnlen(str, len) == len)
+                        str[len - 1] = 0;
+
+                /* Strip the trailing newline fgets() retains */
+                if (str[strlen(str) - 1] == '\n')
+                        str[strlen(str) - 1] = 0;
+
+                fclose(f);
+        }
+}
+
+/* Generate the C source of kptllnd_assert_wire_constants() on stdout:
+ * a function full of CLASSERT()s pinning every ptllnd wire-protocol
+ * constant and the offset/size of every message struct member, so that
+ * a build with an incompatible layout fails at compile time. */
+int
+main (int argc, char **argv)
+{
+        char unameinfo[80];
+        char gccinfo[80];
+
+        /* Record the generating environment in the emitted comment */
+        system_string("uname -a", unameinfo, sizeof(unameinfo));
+        system_string("gcc -v 2>&1 | tail -1", gccinfo, sizeof(gccinfo));
+
+        printf ("void kptllnd_assert_wire_constants (void)\n"
+                "{\n"
+                "        /* Wire protocol assertions generated by 'wirecheck'\n"
+                "         * running on %s\n"
+                "         * with %s */\n"
+                "\n", unameinfo, gccinfo);
+
+        BLANK_LINE ();
+
+        COMMENT ("Constants...");
+        CHECK_DEFINE (PTL_RESERVED_MATCHBITS);
+        CHECK_DEFINE (LNET_MSG_MATCHBITS);
+        
+        CHECK_DEFINE (PTLLND_MSG_MAGIC);
+        CHECK_DEFINE (PTLLND_MSG_VERSION);
+
+        CHECK_DEFINE (PTLLND_RDMA_OK);
+        CHECK_DEFINE (PTLLND_RDMA_FAIL);
+
+        CHECK_DEFINE (PTLLND_MSG_TYPE_INVALID);
+        CHECK_DEFINE (PTLLND_MSG_TYPE_PUT);
+        CHECK_DEFINE (PTLLND_MSG_TYPE_GET);
+        CHECK_DEFINE (PTLLND_MSG_TYPE_IMMEDIATE);
+        CHECK_DEFINE (PTLLND_MSG_TYPE_NOOP);
+        CHECK_DEFINE (PTLLND_MSG_TYPE_HELLO);
+        CHECK_DEFINE (PTLLND_MSG_TYPE_NAK);
+
+        /* Layout of the common message header and its union members */
+        CHECK_STRUCT (kptl_msg_t);
+        CHECK_MEMBER (kptl_msg_t, ptlm_magic);
+        CHECK_MEMBER (kptl_msg_t, ptlm_version);
+        CHECK_MEMBER (kptl_msg_t, ptlm_type);
+        CHECK_MEMBER (kptl_msg_t, ptlm_credits);
+        CHECK_MEMBER (kptl_msg_t, ptlm_nob);
+        CHECK_MEMBER (kptl_msg_t, ptlm_cksum);
+        CHECK_MEMBER (kptl_msg_t, ptlm_srcnid);
+        CHECK_MEMBER (kptl_msg_t, ptlm_srcstamp);
+        CHECK_MEMBER (kptl_msg_t, ptlm_dstnid);
+        CHECK_MEMBER (kptl_msg_t, ptlm_dststamp);
+        CHECK_MEMBER (kptl_msg_t, ptlm_srcpid);
+        CHECK_MEMBER (kptl_msg_t, ptlm_dstpid);
+        CHECK_MEMBER (kptl_msg_t, ptlm_u.immediate);
+        CHECK_MEMBER (kptl_msg_t, ptlm_u.rdma);
+        CHECK_MEMBER (kptl_msg_t, ptlm_u.hello);
+
+        CHECK_STRUCT (kptl_immediate_msg_t);
+        CHECK_MEMBER (kptl_immediate_msg_t, kptlim_hdr);
+        /* NOTE(review): element [13] looks like an arbitrary probe into the
+         * payload array to pin its stride/packing -- confirm */
+        CHECK_MEMBER (kptl_immediate_msg_t, kptlim_payload[13]);
+
+        CHECK_STRUCT (kptl_rdma_msg_t);
+        CHECK_MEMBER (kptl_rdma_msg_t, kptlrm_hdr);
+        CHECK_MEMBER (kptl_rdma_msg_t, kptlrm_matchbits);
+
+        CHECK_STRUCT (kptl_hello_msg_t);
+        CHECK_MEMBER (kptl_hello_msg_t, kptlhm_matchbits);
+        CHECK_MEMBER (kptl_hello_msg_t, kptlhm_max_msg_size);
+
+        printf ("}\n\n");
+
+        return (0);
+}
index d27240c..b623e02 100644 (file)
@@ -1,5 +1,5 @@
-MODULES := kqswnal
-kqswnal-objs := qswnal.o qswnal_cb.o
+MODULES := kqswlnd
+kqswlnd-objs := qswlnd.o qswlnd_cb.o qswlnd_modparams.o
 
 EXTRA_POST_CFLAGS := @QSWCPPFLAGS@ -I/usr/include
 
index 228689d..721e86f 100644 (file)
@@ -4,12 +4,10 @@
 # See the file COPYING in this distribution
 
 if MODULES
-if !CRAY_PORTALS
-if BUILD_QSWNAL
-modulenet_DATA = kqswnal$(KMODEXT)
-endif
+if BUILD_QSWLND
+modulenet_DATA = kqswlnd$(KMODEXT)
 endif
 endif
 
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES = $(kqswnal-objs:%.o=%.c) qswnal.h
+MOSTLYCLEANFILES = @MOSTLYCLEANFILES@
+DIST_SOURCES = $(kqswlnd-objs:%.o=%.c) qswlnd.h
index be01f5d..a8ecaca 100644 (file)
  *
  */
 
-#include "qswnal.h"
+#include "qswlnd.h"
 
-nal_t                  kqswnal_api;
-kqswnal_data_t         kqswnal_data;
-ptl_handle_ni_t         kqswnal_ni;
-kqswnal_tunables_t      kqswnal_tunables;
-
-kpr_nal_interface_t kqswnal_router_interface = {
-       kprni_nalid:    QSWNAL,
-       kprni_arg:      NULL,
-       kprni_fwd:      kqswnal_fwd_packet,
-       kprni_notify:   NULL,                   /* we're connectionless */
-};
-
-#if CONFIG_SYSCTL
-#define QSWNAL_SYSCTL  201
-
-#define QSWNAL_SYSCTL_OPTIMIZED_GETS     1
-#define QSWNAL_SYSCTL_OPTIMIZED_PUTS     2
 
-static ctl_table kqswnal_ctl_table[] = {
-       {QSWNAL_SYSCTL_OPTIMIZED_PUTS, "optimized_puts",
-        &kqswnal_tunables.kqn_optimized_puts, sizeof (int),
-        0644, NULL, &proc_dointvec},
-       {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_gets",
-        &kqswnal_tunables.kqn_optimized_gets, sizeof (int),
-        0644, NULL, &proc_dointvec},
-       {0}
+lnd_t the_kqswlnd =
+{
+       .lnd_type       = QSWLND,
+       .lnd_startup    = kqswnal_startup,
+       .lnd_shutdown   = kqswnal_shutdown,
+       .lnd_ctl        = kqswnal_ctl,
+       .lnd_send       = kqswnal_send,
+        .lnd_recv       = kqswnal_recv,
 };
 
-static ctl_table kqswnal_top_ctl_table[] = {
-       {QSWNAL_SYSCTL, "qswnal", NULL, 0, 0555, kqswnal_ctl_table},
-       {0}
-};
-#endif
+kqswnal_data_t         kqswnal_data;
 
 int
-kqswnal_get_tx_desc (struct portals_cfg *pcfg)
+kqswnal_get_tx_desc (struct libcfs_ioctl_data *data)
 {
        unsigned long      flags;
        struct list_head  *tmp;
        kqswnal_tx_t      *ktx;
-       ptl_hdr_t         *hdr;
-       int                index = pcfg->pcfg_count;
+       lnet_hdr_t        *hdr;
+       int                index = data->ioc_count;
        int                rc = -ENOENT;
 
        spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
@@ -72,18 +51,15 @@ kqswnal_get_tx_desc (struct portals_cfg *pcfg)
                        continue;
 
                ktx = list_entry (tmp, kqswnal_tx_t, ktx_list);
-               hdr = (ptl_hdr_t *)ktx->ktx_buffer;
-
-               memcpy(pcfg->pcfg_pbuf, ktx,
-                      MIN(sizeof(*ktx), pcfg->pcfg_plen1));
-               pcfg->pcfg_count = le32_to_cpu(hdr->type);
-               pcfg->pcfg_size  = le32_to_cpu(hdr->payload_length);
-               pcfg->pcfg_nid   = le64_to_cpu(hdr->dest_nid);
-               pcfg->pcfg_nid2  = ktx->ktx_nid;
-               pcfg->pcfg_misc  = ktx->ktx_launcher;
-               pcfg->pcfg_flags = (list_empty (&ktx->ktx_delayed_list) ? 0 : 1) |
-                                 (!ktx->ktx_isnblk                    ? 0 : 2) |
-                                 (ktx->ktx_state << 2);
+               hdr = (lnet_hdr_t *)ktx->ktx_buffer;
+
+               data->ioc_count  = le32_to_cpu(hdr->payload_length);
+               data->ioc_nid    = le64_to_cpu(hdr->dest_nid);
+               data->ioc_u64[0] = ktx->ktx_nid;
+               data->ioc_u32[0] = le32_to_cpu(hdr->type);
+               data->ioc_u32[1] = ktx->ktx_launcher;
+               data->ioc_flags  = (list_empty (&ktx->ktx_schedlist) ? 0 : 1) |
+                                  (ktx->ktx_state << 2);
                rc = 0;
                break;
        }
@@ -93,44 +69,42 @@ kqswnal_get_tx_desc (struct portals_cfg *pcfg)
 }
 
 int
-kqswnal_cmd (struct portals_cfg *pcfg, void *private)
+kqswnal_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg)
 {
-       LASSERT (pcfg != NULL);
-       
-       switch (pcfg->pcfg_command) {
-       case NAL_CMD_GET_TXDESC:
-               return (kqswnal_get_tx_desc (pcfg));
-
-       case NAL_CMD_REGISTER_MYNID:
-               CDEBUG (D_IOCTL, "setting NID offset to "LPX64" (was "LPX64")\n",
-                       pcfg->pcfg_nid - kqswnal_data.kqn_elanid,
-                       kqswnal_data.kqn_nid_offset);
-               kqswnal_data.kqn_nid_offset =
-                       pcfg->pcfg_nid - kqswnal_data.kqn_elanid;
-               kqswnal_lib.libnal_ni.ni_pid.nid = pcfg->pcfg_nid;
-               return (0);
+       struct libcfs_ioctl_data *data = arg;
+
+       LASSERT (ni == kqswnal_data.kqn_ni);
+
+       switch (cmd) {
+       case IOC_LIBCFS_GET_TXDESC:
+               return (kqswnal_get_tx_desc (data));
+
+       case IOC_LIBCFS_REGISTER_MYNID:
+               if (data->ioc_nid == ni->ni_nid)
+                       return 0;
+               
+               LASSERT (LNET_NIDNET(data->ioc_nid) == LNET_NIDNET(ni->ni_nid));
+
+               CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID for %s(%s)\n",
+                      libcfs_nid2str(data->ioc_nid),
+                      libcfs_nid2str(ni->ni_nid));
+               return 0;
                
        default:
                return (-EINVAL);
        }
 }
 
-static void
-kqswnal_shutdown(nal_t *nal)
+void
+kqswnal_shutdown(lnet_ni_t *ni)
 {
        unsigned long flags;
        kqswnal_tx_t *ktx;
        kqswnal_rx_t *krx;
-       int           do_lib_fini = 0;
-
-       /* NB The first ref was this module! */
-       if (nal->nal_refct != 0) {
-               PORTAL_MODULE_UNUSE;
-               return;
-       }
-
+       
        CDEBUG (D_NET, "shutdown\n");
-       LASSERT (nal == &kqswnal_api);
+       LASSERT (ni->ni_data == &kqswnal_data);
+       LASSERT (ni == kqswnal_data.kqn_ni);
 
        switch (kqswnal_data.kqn_init)
        {
@@ -138,46 +112,26 @@ kqswnal_shutdown(nal_t *nal)
                LASSERT (0);
 
        case KQN_INIT_ALL:
-                libcfs_nal_cmd_unregister(QSWNAL);
-               /* fall through */
-
-       case KQN_INIT_LIB:
-               do_lib_fini = 1;
-               /* fall through */
-
        case KQN_INIT_DATA:
                break;
-
-       case KQN_INIT_NOTHING:
-               return;
        }
 
        /**********************************************************************/
-       /* Tell router we're shutting down.  Any router calls my threads
-        * make will now fail immediately and the router will stop calling
-        * into me. */
-       kpr_shutdown (&kqswnal_data.kqn_router);
-       
-       /**********************************************************************/
        /* Signal the start of shutdown... */
        spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags);
        kqswnal_data.kqn_shuttingdown = 1;
        spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags);
 
-       wake_up_all(&kqswnal_data.kqn_idletxd_waitq);
-
        /**********************************************************************/
        /* wait for sends that have allocated a tx desc to launch or give up */
        while (atomic_read (&kqswnal_data.kqn_pending_txs) != 0) {
                CDEBUG(D_NET, "waiting for %d pending sends\n",
                       atomic_read (&kqswnal_data.kqn_pending_txs));
-               set_current_state (TASK_UNINTERRUPTIBLE);
-               schedule_timeout (HZ);
+               cfs_pause(cfs_time_seconds(1));
        }
 
        /**********************************************************************/
        /* close elan comms */
-#if MULTIRAIL_EKC
        /* Shut down receivers first; rx callbacks might try sending... */
        if (kqswnal_data.kqn_eprx_small != NULL)
                ep_free_rcvr (kqswnal_data.kqn_eprx_small);
@@ -188,7 +142,7 @@ kqswnal_shutdown(nal_t *nal)
        /* NB ep_free_rcvr() returns only after we've freed off all receive
         * buffers (see shutdown handling in kqswnal_requeue_rx()).  This
         * means we must have completed any messages we passed to
-        * lib_parse() or kpr_fwd_start(). */
+        * lnet_parse() */
 
        if (kqswnal_data.kqn_eptx != NULL)
                ep_free_xmtr (kqswnal_data.kqn_eptx);
@@ -196,25 +150,7 @@ kqswnal_shutdown(nal_t *nal)
        /* NB ep_free_xmtr() returns only after all outstanding transmits
         * have called their callback... */
        LASSERT(list_empty(&kqswnal_data.kqn_activetxds));
-#else
-       /* "Old" EKC just pretends to shutdown cleanly but actually
-        * provides no guarantees */
-       if (kqswnal_data.kqn_eprx_small != NULL)
-               ep_remove_large_rcvr (kqswnal_data.kqn_eprx_small);
-
-       if (kqswnal_data.kqn_eprx_large != NULL)
-               ep_remove_large_rcvr (kqswnal_data.kqn_eprx_large);
 
-       /* wait for transmits to complete */
-       while (!list_empty(&kqswnal_data.kqn_activetxds)) {
-               CWARN("waiting for active transmits to complete\n");
-               set_current_state(TASK_UNINTERRUPTIBLE);
-               schedule_timeout(HZ);
-       }
-
-       if (kqswnal_data.kqn_eptx != NULL)
-               ep_free_large_xmtr (kqswnal_data.kqn_eptx);
-#endif
        /**********************************************************************/
        /* flag threads to terminate, wake them and wait for them to die */
        kqswnal_data.kqn_shuttingdown = 2;
@@ -223,8 +159,7 @@ kqswnal_shutdown(nal_t *nal)
        while (atomic_read (&kqswnal_data.kqn_nthreads) != 0) {
                CDEBUG(D_NET, "waiting for %d threads to terminate\n",
                       atomic_read (&kqswnal_data.kqn_nthreads));
-               set_current_state (TASK_UNINTERRUPTIBLE);
-               schedule_timeout (HZ);
+               cfs_pause(cfs_time_seconds(1));
        }
 
        /**********************************************************************/
@@ -232,37 +167,14 @@ kqswnal_shutdown(nal_t *nal)
         * I control the horizontals and the verticals...
         */
 
-#if MULTIRAIL_EKC
        LASSERT (list_empty (&kqswnal_data.kqn_readyrxds));
+       LASSERT (list_empty (&kqswnal_data.kqn_donetxds));
        LASSERT (list_empty (&kqswnal_data.kqn_delayedtxds));
-       LASSERT (list_empty (&kqswnal_data.kqn_delayedfwds));
-#endif
-
-       /**********************************************************************/
-       /* Complete any blocked forwarding packets, with error
-        */
-
-       while (!list_empty (&kqswnal_data.kqn_idletxd_fwdq))
-       {
-               kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next,
-                                                 kpr_fwd_desc_t, kprfd_list);
-               list_del (&fwd->kprfd_list);
-               kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -ESHUTDOWN);
-       }
-
-       /**********************************************************************/
-       /* finalise router and portals lib */
-
-       kpr_deregister (&kqswnal_data.kqn_router);
-
-       if (do_lib_fini)
-               lib_fini (&kqswnal_lib);
 
        /**********************************************************************/
        /* Unmap message buffers and free all descriptors and buffers
         */
 
-#if MULTIRAIL_EKC
        /* FTTB, we need to unmap any remaining mapped memory.  When
         * ep_dvma_release() get fixed (and releases any mappings in the
         * region), we can delete all the code from here -------->  */
@@ -294,38 +206,15 @@ kqswnal_shutdown(nal_t *nal)
 
        if (kqswnal_data.kqn_ep_tx_nmh != NULL)
                ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_tx_nmh);
-#else
-       if (kqswnal_data.kqn_eprxdmahandle != NULL)
-       {
-               elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState,
-                                 kqswnal_data.kqn_eprxdmahandle, 0,
-                                 KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
-                                 KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE);
-
-               elan3_dma_release(kqswnal_data.kqn_ep->DmaState,
-                                 kqswnal_data.kqn_eprxdmahandle);
-       }
-
-       if (kqswnal_data.kqn_eptxdmahandle != NULL)
-       {
-               elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState,
-                                 kqswnal_data.kqn_eptxdmahandle, 0,
-                                 KQSW_NTXMSGPAGES * (KQSW_NTXMSGS +
-                                                     KQSW_NNBLK_TXMSGS));
-
-               elan3_dma_release(kqswnal_data.kqn_ep->DmaState,
-                                 kqswnal_data.kqn_eptxdmahandle);
-       }
-#endif
 
        while (kqswnal_data.kqn_txds != NULL) {
                ktx = kqswnal_data.kqn_txds;
 
                if (ktx->ktx_buffer != NULL)
-                       PORTAL_FREE(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
+                       LIBCFS_FREE(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
 
                kqswnal_data.kqn_txds = ktx->ktx_alloclist;
-               PORTAL_FREE(ktx, sizeof(*ktx));
+               LIBCFS_FREE(ktx, sizeof(*ktx));
        }
 
        while (kqswnal_data.kqn_rxds != NULL) {
@@ -337,106 +226,96 @@ kqswnal_shutdown(nal_t *nal)
                                __free_page (krx->krx_kiov[i].kiov_page);
 
                kqswnal_data.kqn_rxds = krx->krx_alloclist;
-               PORTAL_FREE(krx, sizeof (*krx));
+               LIBCFS_FREE(krx, sizeof (*krx));
        }
 
        /* resets flags, pointers to NULL etc */
        memset(&kqswnal_data, 0, sizeof (kqswnal_data));
 
-       CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory));
+       CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&libcfs_kmemory));
 
-       printk (KERN_INFO "Lustre: Routing QSW NAL unloaded (final mem %d)\n",
-                atomic_read(&portal_kmemory));
+       PORTAL_MODULE_UNUSE;
 }
 
-static int
-kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
-                ptl_ni_limits_t *requested_limits, 
-                ptl_ni_limits_t *actual_limits)
+int
+kqswnal_startup (lnet_ni_t *ni)
 {
-#if MULTIRAIL_EKC
        EP_RAILMASK       all_rails = EP_RAILMASK_ALL;
-#else
-       ELAN3_DMA_REQUEST dmareq;
-#endif
        int               rc;
        int               i;
        kqswnal_rx_t     *krx;
        kqswnal_tx_t     *ktx;
        int               elan_page_idx;
-       ptl_process_id_t  my_process_id;
-       int               pkmem = atomic_read(&portal_kmemory);
 
-       LASSERT (nal == &kqswnal_api);
+       LASSERT (ni->ni_lnd == &the_kqswlnd);
 
-       if (nal->nal_refct != 0) {
-               if (actual_limits != NULL)
-                       *actual_limits = kqswnal_lib.libnal_ni.ni_actual_limits;
-               /* This module got the first ref */
-               PORTAL_MODULE_USE;
-               return (PTL_OK);
+#if KQSW_CKSUM
+       if (the_lnet.ln_ptlcompat != 0) {
+               CERROR("Checksumming version not portals compatible\n");
+               return -ENODEV;
        }
-
-       LASSERT (kqswnal_data.kqn_init == KQN_INIT_NOTHING);
-
-       CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory));
-
+#endif
+       /* Only 1 instance supported */
+       if (kqswnal_data.kqn_init != KQN_INIT_NOTHING) {
+                CERROR ("Only 1 instance supported\n");
+                return -EPERM;
+        }
+
+        if (ni->ni_interfaces[0] != NULL) {
+                CERROR("Explicit interface config not supported\n");
+                return -EPERM;
+        }
+
+       if (*kqswnal_tunables.kqn_credits >=
+           *kqswnal_tunables.kqn_ntxmsgs) {
+               LCONSOLE_ERROR("Configuration error: please set "
+                              "ntxmsgs(%d) > credits(%d)\n",
+                              *kqswnal_tunables.kqn_ntxmsgs,
+                              *kqswnal_tunables.kqn_credits);
+       }
+        
+       CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&libcfs_kmemory));
+       
        /* ensure all pointers NULL etc */
        memset (&kqswnal_data, 0, sizeof (kqswnal_data));
 
+       kqswnal_data.kqn_ni = ni;
+       ni->ni_data = &kqswnal_data;
+       ni->ni_peertxcredits = *kqswnal_tunables.kqn_peercredits;
+       ni->ni_maxtxcredits = *kqswnal_tunables.kqn_credits;
+
        INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds);
-       INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds);
        INIT_LIST_HEAD (&kqswnal_data.kqn_activetxds);
        spin_lock_init (&kqswnal_data.kqn_idletxd_lock);
-       init_waitqueue_head (&kqswnal_data.kqn_idletxd_waitq);
-       INIT_LIST_HEAD (&kqswnal_data.kqn_idletxd_fwdq);
 
-       INIT_LIST_HEAD (&kqswnal_data.kqn_delayedfwds);
        INIT_LIST_HEAD (&kqswnal_data.kqn_delayedtxds);
+       INIT_LIST_HEAD (&kqswnal_data.kqn_donetxds);
        INIT_LIST_HEAD (&kqswnal_data.kqn_readyrxds);
 
        spin_lock_init (&kqswnal_data.kqn_sched_lock);
        init_waitqueue_head (&kqswnal_data.kqn_sched_waitq);
 
-       /* Leave kqn_rpc_success zeroed */
-#if MULTIRAIL_EKC
-       kqswnal_data.kqn_rpc_failed.Data[0] = -ECONNREFUSED;
-#else
-       kqswnal_data.kqn_rpc_failed.Status = -ECONNREFUSED;
-#endif
-
        /* pointers/lists/locks initialised */
        kqswnal_data.kqn_init = KQN_INIT_DATA;
+       PORTAL_MODULE_USE;
        
-#if MULTIRAIL_EKC
        kqswnal_data.kqn_ep = ep_system();
        if (kqswnal_data.kqn_ep == NULL) {
                CERROR("Can't initialise EKC\n");
-               kqswnal_shutdown(nal);
-               return (PTL_IFACE_INVALID);
+               kqswnal_shutdown(ni);
+               return (-ENODEV);
        }
 
        if (ep_waitfor_nodeid(kqswnal_data.kqn_ep) == ELAN_INVALID_NODE) {
                CERROR("Can't get elan ID\n");
-               kqswnal_shutdown(nal);
-               return (PTL_IFACE_INVALID);
+               kqswnal_shutdown(ni);
+               return (-ENODEV);
        }
-#else
-       /**********************************************************************/
-       /* Find the first Elan device */
 
-       kqswnal_data.kqn_ep = ep_device (0);
-       if (kqswnal_data.kqn_ep == NULL)
-       {
-               CERROR ("Can't get elan device 0\n");
-               kqswnal_shutdown(nal);
-               return (PTL_IFACE_INVALID);
-       }
-#endif
+       kqswnal_data.kqn_nnodes = ep_numnodes (kqswnal_data.kqn_ep);
+       kqswnal_data.kqn_elanid = ep_nodeid (kqswnal_data.kqn_ep);
 
-       kqswnal_data.kqn_nid_offset = 0;
-       kqswnal_data.kqn_nnodes     = ep_numnodes (kqswnal_data.kqn_ep);
-       kqswnal_data.kqn_elanid     = ep_nodeid (kqswnal_data.kqn_ep);
+       ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), kqswnal_data.kqn_elanid);
        
        /**********************************************************************/
        /* Get the transmitter */
@@ -445,172 +324,129 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
        if (kqswnal_data.kqn_eptx == NULL)
        {
                CERROR ("Can't allocate transmitter\n");
-               kqswnal_shutdown (nal);
-               return (PTL_NO_SPACE);
+               kqswnal_shutdown (ni);
+               return (-ENOMEM);
        }
 
        /**********************************************************************/
        /* Get the receivers */
 
-       kqswnal_data.kqn_eprx_small = ep_alloc_rcvr (kqswnal_data.kqn_ep,
-                                                    EP_MSG_SVC_PORTALS_SMALL,
-                                                    KQSW_EP_ENVELOPES_SMALL);
+       kqswnal_data.kqn_eprx_small = 
+               ep_alloc_rcvr (kqswnal_data.kqn_ep,
+                              EP_MSG_SVC_PORTALS_SMALL,
+                              *kqswnal_tunables.kqn_ep_envelopes_small);
        if (kqswnal_data.kqn_eprx_small == NULL)
        {
                CERROR ("Can't install small msg receiver\n");
-               kqswnal_shutdown (nal);
-               return (PTL_NO_SPACE);
+               kqswnal_shutdown (ni);
+               return (-ENOMEM);
        }
 
-       kqswnal_data.kqn_eprx_large = ep_alloc_rcvr (kqswnal_data.kqn_ep,
-                                                    EP_MSG_SVC_PORTALS_LARGE,
-                                                    KQSW_EP_ENVELOPES_LARGE);
+       kqswnal_data.kqn_eprx_large = 
+               ep_alloc_rcvr (kqswnal_data.kqn_ep,
+                              EP_MSG_SVC_PORTALS_LARGE,
+                              *kqswnal_tunables.kqn_ep_envelopes_large);
        if (kqswnal_data.kqn_eprx_large == NULL)
        {
                CERROR ("Can't install large msg receiver\n");
-               kqswnal_shutdown (nal);
-               return (PTL_NO_SPACE);
+               kqswnal_shutdown (ni);
+               return (-ENOMEM);
        }
 
        /**********************************************************************/
        /* Reserve Elan address space for transmit descriptors NB we may
         * either send the contents of associated buffers immediately, or
         * map them for the peer to suck/blow... */
-#if MULTIRAIL_EKC
        kqswnal_data.kqn_ep_tx_nmh = 
                ep_dvma_reserve(kqswnal_data.kqn_ep,
-                               KQSW_NTXMSGPAGES*(KQSW_NTXMSGS+KQSW_NNBLK_TXMSGS),
+                               KQSW_NTXMSGPAGES*(*kqswnal_tunables.kqn_ntxmsgs),
                                EP_PERM_WRITE);
        if (kqswnal_data.kqn_ep_tx_nmh == NULL) {
                CERROR("Can't reserve tx dma space\n");
-               kqswnal_shutdown(nal);
-               return (PTL_NO_SPACE);
-       }
-#else
-        dmareq.Waitfn   = DDI_DMA_SLEEP;
-        dmareq.ElanAddr = (E3_Addr) 0;
-        dmareq.Attr     = PTE_LOAD_LITTLE_ENDIAN;
-        dmareq.Perm     = ELAN_PERM_REMOTEWRITE;
-
-       rc = elan3_dma_reserve(kqswnal_data.kqn_ep->DmaState,
-                             KQSW_NTXMSGPAGES*(KQSW_NTXMSGS+KQSW_NNBLK_TXMSGS),
-                             &dmareq, &kqswnal_data.kqn_eptxdmahandle);
-       if (rc != DDI_SUCCESS)
-       {
-               CERROR ("Can't reserve rx dma space\n");
-               kqswnal_shutdown (nal);
-               return (PTL_NO_SPACE);
+               kqswnal_shutdown(ni);
+               return (-ENOMEM);
        }
-#endif
+
        /**********************************************************************/
        /* Reserve Elan address space for receive buffers */
-#if MULTIRAIL_EKC
        kqswnal_data.kqn_ep_rx_nmh =
                ep_dvma_reserve(kqswnal_data.kqn_ep,
-                               KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
-                               KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE,
+                               KQSW_NRXMSGPAGES_SMALL * 
+                               (*kqswnal_tunables.kqn_nrxmsgs_small) +
+                               KQSW_NRXMSGPAGES_LARGE * 
+                               (*kqswnal_tunables.kqn_nrxmsgs_large),
                                EP_PERM_WRITE);
        if (kqswnal_data.kqn_ep_tx_nmh == NULL) {
                CERROR("Can't reserve rx dma space\n");
-               kqswnal_shutdown(nal);
-               return (PTL_NO_SPACE);
-       }
-#else
-        dmareq.Waitfn   = DDI_DMA_SLEEP;
-        dmareq.ElanAddr = (E3_Addr) 0;
-        dmareq.Attr     = PTE_LOAD_LITTLE_ENDIAN;
-        dmareq.Perm     = ELAN_PERM_REMOTEWRITE;
-
-       rc = elan3_dma_reserve (kqswnal_data.kqn_ep->DmaState,
-                               KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL +
-                               KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE,
-                               &dmareq, &kqswnal_data.kqn_eprxdmahandle);
-       if (rc != DDI_SUCCESS)
-       {
-               CERROR ("Can't reserve rx dma space\n");
-               kqswnal_shutdown (nal);
-               return (PTL_NO_SPACE);
+               kqswnal_shutdown(ni);
+               return (-ENOMEM);
        }
-#endif
+
        /**********************************************************************/
        /* Allocate/Initialise transmit descriptors */
 
        kqswnal_data.kqn_txds = NULL;
-       for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++)
+       for (i = 0; i < (*kqswnal_tunables.kqn_ntxmsgs); i++)
        {
                int           premapped_pages;
                int           basepage = i * KQSW_NTXMSGPAGES;
 
-               PORTAL_ALLOC (ktx, sizeof(*ktx));
+               LIBCFS_ALLOC (ktx, sizeof(*ktx));
                if (ktx == NULL) {
-                       kqswnal_shutdown (nal);
-                       return (PTL_NO_SPACE);
+                       kqswnal_shutdown (ni);
+                       return (-ENOMEM);
                }
 
                memset(ktx, 0, sizeof(*ktx));   /* NULL pointers; zero flags */
                ktx->ktx_alloclist = kqswnal_data.kqn_txds;
                kqswnal_data.kqn_txds = ktx;
 
-               PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
+               LIBCFS_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
                if (ktx->ktx_buffer == NULL)
                {
-                       kqswnal_shutdown (nal);
-                       return (PTL_NO_SPACE);
+                       kqswnal_shutdown (ni);
+                       return (-ENOMEM);
                }
 
                /* Map pre-allocated buffer NOW, to save latency on transmit */
                premapped_pages = kqswnal_pages_spanned(ktx->ktx_buffer,
                                                        KQSW_TX_BUFFER_SIZE);
-#if MULTIRAIL_EKC
                ep_dvma_load(kqswnal_data.kqn_ep, NULL, 
                             ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE, 
                             kqswnal_data.kqn_ep_tx_nmh, basepage,
                             &all_rails, &ktx->ktx_ebuffer);
-#else
-               elan3_dvma_kaddr_load (kqswnal_data.kqn_ep->DmaState,
-                                      kqswnal_data.kqn_eptxdmahandle,
-                                      ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE,
-                                      basepage, &ktx->ktx_ebuffer);
-#endif
+
                ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */
                ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */
 
-               INIT_LIST_HEAD (&ktx->ktx_delayed_list);
+               INIT_LIST_HEAD (&ktx->ktx_schedlist);
 
                ktx->ktx_state = KTX_IDLE;
-#if MULTIRAIL_EKC
                ktx->ktx_rail = -1;             /* unset rail */
-#endif
-               ktx->ktx_isnblk = (i >= KQSW_NTXMSGS);
-               list_add_tail (&ktx->ktx_list, 
-                              ktx->ktx_isnblk ? &kqswnal_data.kqn_nblk_idletxds :
-                                                &kqswnal_data.kqn_idletxds);
+
+               list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_idletxds);
        }
 
        /**********************************************************************/
        /* Allocate/Initialise receive descriptors */
        kqswnal_data.kqn_rxds = NULL;
        elan_page_idx = 0;
-       for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++)
+       for (i = 0; i < *kqswnal_tunables.kqn_nrxmsgs_small + *kqswnal_tunables.kqn_nrxmsgs_large; i++)
        {
-#if MULTIRAIL_EKC
                EP_NMD        elanbuffer;
-#else
-               E3_Addr       elanbuffer;
-#endif
                int           j;
 
-               PORTAL_ALLOC(krx, sizeof(*krx));
+               LIBCFS_ALLOC(krx, sizeof(*krx));
                if (krx == NULL) {
-                       kqswnal_shutdown(nal);
-                       return (PTL_NO_SPACE);
+                       kqswnal_shutdown(ni);
+                       return (-ENOMEM);
                }
 
                memset(krx, 0, sizeof(*krx)); /* clear flags, null pointers etc */
                krx->krx_alloclist = kqswnal_data.kqn_rxds;
                kqswnal_data.kqn_rxds = krx;
 
-               if (i < KQSW_NRXMSGS_SMALL)
+               if (i < *kqswnal_tunables.kqn_nrxmsgs_small)
                {
                        krx->krx_npages = KQSW_NRXMSGPAGES_SMALL;
                        krx->krx_eprx   = kqswnal_data.kqn_eprx_small;
@@ -627,14 +463,15 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
                        struct page *page = alloc_page(GFP_KERNEL);
                        
                        if (page == NULL) {
-                               kqswnal_shutdown (nal);
-                               return (PTL_NO_SPACE);
+                               kqswnal_shutdown (ni);
+                               return (-ENOMEM);
                        }
 
-                       krx->krx_kiov[j].kiov_page = page;
+                       krx->krx_kiov[j] = (lnet_kiov_t) {.kiov_page = page,
+                                                         .kiov_offset = 0,
+                                                         .kiov_len = PAGE_SIZE};
                        LASSERT(page_address(page) != NULL);
 
-#if MULTIRAIL_EKC
                        ep_dvma_load(kqswnal_data.kqn_ep, NULL,
                                     page_address(page),
                                     PAGE_SIZE, kqswnal_data.kqn_ep_rx_nmh,
@@ -649,42 +486,13 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
                                /* NB contiguous mapping */
                                LASSERT(rc);
                        }
-#else
-                       elan3_dvma_kaddr_load(kqswnal_data.kqn_ep->DmaState,
-                                             kqswnal_data.kqn_eprxdmahandle,
-                                             page_address(page),
-                                             PAGE_SIZE, elan_page_idx,
-                                             &elanbuffer);
-                       if (j == 0)
-                               krx->krx_elanbuffer = elanbuffer;
-
-                       /* NB contiguous mapping */
-                       LASSERT (elanbuffer == krx->krx_elanbuffer + j * PAGE_SIZE);
-#endif
                        elan_page_idx++;
 
                }
        }
        LASSERT (elan_page_idx ==
-                (KQSW_NRXMSGS_SMALL * KQSW_NRXMSGPAGES_SMALL) +
-                (KQSW_NRXMSGS_LARGE * KQSW_NRXMSGPAGES_LARGE));
-
-       /**********************************************************************/
-       /* Network interface ready to initialise */
-
-       my_process_id.nid = kqswnal_elanid2nid(kqswnal_data.kqn_elanid);
-       my_process_id.pid = requested_pid;
-
-       rc = lib_init(&kqswnal_lib, nal, my_process_id,
-                     requested_limits, actual_limits);
-        if (rc != PTL_OK)
-       {
-               CERROR ("lib_init failed %d\n", rc);
-               kqswnal_shutdown (nal);
-               return (rc);
-       }
-
-       kqswnal_data.kqn_init = KQN_INIT_LIB;
+                (*kqswnal_tunables.kqn_nrxmsgs_small * KQSW_NRXMSGPAGES_SMALL) +
+                (*kqswnal_tunables.kqn_nrxmsgs_large * KQSW_NRXMSGPAGES_LARGE));
 
        /**********************************************************************/
        /* Queue receives, now that it's OK to run their completion callbacks */
@@ -692,19 +500,12 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
        for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
                /* NB this enqueue can allocate/sleep (attr == 0) */
                krx->krx_state = KRX_POSTED;
-#if MULTIRAIL_EKC
                rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx,
                                      &krx->krx_elanbuffer, 0);
-#else
-               rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx,
-                                     krx->krx_elanbuffer,
-                                     krx->krx_npages * PAGE_SIZE, 0);
-#endif
-               if (rc != EP_SUCCESS)
-               {
+               if (rc != EP_SUCCESS) {
                        CERROR ("failed ep_queue_receive %d\n", rc);
-                       kqswnal_shutdown (nal);
-                       return (PTL_FAIL);
+                       kqswnal_shutdown (ni);
+                       return (-EIO);
                }
        }
 
@@ -715,83 +516,36 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid,
                if (rc != 0)
                {
                        CERROR ("failed to spawn scheduling thread: %d\n", rc);
-                       kqswnal_shutdown (nal);
-                       return (PTL_FAIL);
+                       kqswnal_shutdown (ni);
+                       return (-ESRCH);
                }
        }
 
-       /**********************************************************************/
-       /* Connect to the router */
-       rc = kpr_register (&kqswnal_data.kqn_router, &kqswnal_router_interface);
-       CDEBUG(D_NET, "Can't initialise routing interface (rc = %d): not routing\n",rc);
-
-       rc = libcfs_nal_cmd_register (QSWNAL, &kqswnal_cmd, NULL);
-       if (rc != 0) {
-               CERROR ("Can't initialise command interface (rc = %d)\n", rc);
-               kqswnal_shutdown (nal);
-               return (PTL_FAIL);
-       }
-
        kqswnal_data.kqn_init = KQN_INIT_ALL;
-
-       printk(KERN_INFO "Lustre: Routing QSW NAL loaded on node %d of %d "
-              "(Routing %s, initial mem %d)\n", 
-              kqswnal_data.kqn_elanid, kqswnal_data.kqn_nnodes,
-              kpr_routing (&kqswnal_data.kqn_router) ? "enabled" : "disabled",
-              pkmem);
-
-       return (PTL_OK);
+       return (0);
 }
 
 void __exit
 kqswnal_finalise (void)
 {
-#if CONFIG_SYSCTL
-       if (kqswnal_tunables.kqn_sysctl != NULL)
-               unregister_sysctl_table (kqswnal_tunables.kqn_sysctl);
-#endif
-       PtlNIFini(kqswnal_ni);
-
-       ptl_unregister_nal(QSWNAL);
+       lnet_unregister_lnd(&the_kqswlnd);
+       kqswnal_tunables_fini();
 }
 
 static int __init
 kqswnal_initialise (void)
 {
-       int   rc;
-
-       kqswnal_api.nal_ni_init = kqswnal_startup;
-       kqswnal_api.nal_ni_fini = kqswnal_shutdown;
-
-       /* Initialise dynamic tunables to defaults once only */
-       kqswnal_tunables.kqn_optimized_puts = KQSW_OPTIMIZED_PUTS;
-       kqswnal_tunables.kqn_optimized_gets = KQSW_OPTIMIZED_GETS;
+       int   rc = kqswnal_tunables_init();
        
-       rc = ptl_register_nal(QSWNAL, &kqswnal_api);
-       if (rc != PTL_OK) {
-               CERROR("Can't register QSWNAL: %d\n", rc);
-               return (-ENOMEM);               /* or something... */
-       }
-
-       /* Pure gateways, and the workaround for 'EKC blocks forever until
-        * the service is active' want the NAL started up at module load
-        * time... */
-       rc = PtlNIInit(QSWNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kqswnal_ni);
-       if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
-               ptl_unregister_nal(QSWNAL);
-               return (-ENODEV);
-       }
+       if (rc != 0)
+               return rc;
 
-#if CONFIG_SYSCTL
-        /* Press on regardless even if registering sysctl doesn't work */
-        kqswnal_tunables.kqn_sysctl = 
-               register_sysctl_table (kqswnal_top_ctl_table, 0);
-#endif
+       lnet_register_lnd(&the_kqswlnd);
        return (0);
 }
 
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel Quadrics/Elan NAL v1.01");
+MODULE_DESCRIPTION("Kernel Quadrics/Elan LND v1.01");
 MODULE_LICENSE("GPL");
 
 module_init (kqswnal_initialise);
index c138be4..0fe2a5e 100644 (file)
 #include <linux/config.h>
 #include <linux/module.h>
 
-#if MULTIRAIL_EKC
-# include <elan/epcomms.h>
-#else
-# include <elan3/elanregs.h>
-# include <elan3/elandev.h>
-# include <elan3/elanvp.h>
-# include <elan3/elan3mmu.h>
-# include <elan3/elanctxt.h>
-# include <elan3/elandebug.h>
-# include <elan3/urom_addrs.h>
-# include <elan3/busops.h>
-# include <elan3/kcomm.h>
-#endif
+#include <elan/epcomms.h>
 
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/sysctl.h>
 #include <asm/segment.h>
 
-#define DEBUG_SUBSYSTEM S_NAL
+#define DEBUG_SUBSYSTEM S_LND
 
 #include <libcfs/kp30.h>
-#include <portals/kpr.h>
-#include <portals/p30.h>
-#include <portals/lib-p30.h>
-#include <portals/nal.h>
-
-#define KQSW_CHECKSUM   0
-#if KQSW_CHECKSUM
-typedef unsigned long kqsw_csum_t;
-#define KQSW_CSUM_SIZE  (2 * sizeof (kqsw_csum_t))
-#else
-#define KQSW_CSUM_SIZE  0
-#endif
-#define KQSW_HDR_SIZE   (sizeof (ptl_hdr_t) + KQSW_CSUM_SIZE)
-
-/*
- * Performance Tuning defines
- * NB no mention of PAGE_SIZE for interoperability
- */
-#define KQSW_MAXPAYLOAD                 PTL_MTU
-#define KQSW_SMALLPAYLOAD               ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */
-
-#define KQSW_TX_MAXCONTIG               (1<<10) /* largest payload that gets made contiguous on transmit */
-
-#define KQSW_NTXMSGS                    8       /* # normal transmit messages */
-#define KQSW_NNBLK_TXMSGS               (PAGE_SIZE == 4096 ? 512 : 256)     /* # reserved transmit messages if can't block */ /* avoid qsnet crash b=5291 */
-
-#define KQSW_NRXMSGS_LARGE              64      /* # large receive buffers */
-#define KQSW_EP_ENVELOPES_LARGE         256     /* # large ep envelopes */
-
-#define KQSW_NRXMSGS_SMALL              256     /* # small receive buffers */
-#define KQSW_EP_ENVELOPES_SMALL         2048    /* # small ep envelopes */
+#include <lnet/lnet.h>
+#include <lnet/lib-lnet.h>
 
+/* fixed constants */
+#define KQSW_SMALLMSG                  (4<<10)  /* small/large ep receiver breakpoint */
 #define KQSW_RESCHED                    100     /* # busy loops that forces scheduler to yield */
 
-#define KQSW_OPTIMIZED_GETS             1       /* optimize gets >= this size */
-#define KQSW_OPTIMIZED_PUTS            (32<<10) /* optimize puts >= this size */
-#define KQSW_COPY_SMALL_FWD             0       /* copy small fwd messages to pre-mapped buffer? */
+#define KQSW_CKSUM                      0       /* enable checksumming (protocol incompatible) */
 
 /*
  * derived constants
  */
 
-#define KQSW_TX_BUFFER_SIZE     (KQSW_HDR_SIZE + KQSW_TX_MAXCONTIG)
+#define KQSW_TX_BUFFER_SIZE     (offsetof(kqswnal_msg_t, \
+                                          kqm_u.immediate.kqim_payload[*kqswnal_tunables.kqn_tx_maxcontig]))
 /* The pre-allocated tx buffer (hdr + small payload) */
 
-#define KQSW_NTXMSGPAGES        (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(KQSW_MAXPAYLOAD) + 1)
+#define KQSW_NTXMSGPAGES        (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(LNET_MAX_PAYLOAD) + 1)
 /* Reserve elan address space for pre-allocated and pre-mapped transmit
  * buffer and a full payload too.  Extra pages allow for page alignment */
 
-#define KQSW_NRXMSGPAGES_SMALL  (btopr(KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD))
+#define KQSW_NRXMSGPAGES_SMALL  (btopr(KQSW_SMALLMSG))
 /* receive hdr/payload always contiguous and page aligned */
 #define KQSW_NRXMSGBYTES_SMALL  (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE)
 
-#define KQSW_NRXMSGPAGES_LARGE  (btopr(KQSW_HDR_SIZE + KQSW_MAXPAYLOAD))
+#define KQSW_NRXMSGPAGES_LARGE  (btopr(sizeof(lnet_msg_t) + LNET_MAX_PAYLOAD))
 /* receive hdr/payload always contiguous and page aligned */
 #define KQSW_NRXMSGBYTES_LARGE  (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE)
 /* biggest complete packet we can receive (or transmit) */
 
+/* Wire messages */
 /* Remote memory descriptor */
 typedef struct
 {
         __u32            kqrmd_nfrag;           /* # frags */
-#if MULTIRAIL_EKC
         EP_NMD           kqrmd_frag[0];         /* actual frags */
+} kqswnal_remotemd_t;
+
+/* Immediate data */
+typedef struct
+{
+        lnet_hdr_t       kqim_hdr;              /* LNET header */
+        char             kqim_payload[0];       /* piggy-backed payload */
+} WIRE_ATTR kqswnal_immediate_msg_t;
+
+/* RDMA request */
+typedef struct
+{
+        lnet_hdr_t          kqrm_hdr;           /* LNET header */
+        kqswnal_remotemd_t  kqrm_rmd;           /* peer's buffer */
+} WIRE_ATTR kqswnal_rdma_msg_t;
+
+typedef struct
+{
+        __u32            kqm_magic;             /* I'm a qswlnd message */
+        __u16            kqm_version;           /* this is my version number */
+        __u16            kqm_type;              /* msg type */
+#if KQSW_CKSUM
+        __u32            kqm_cksum;             /* crc32 checksum */
+        __u32            kqm_nob;               /* original msg length */
+#endif
+        union {
+                kqswnal_immediate_msg_t  immediate;
+                kqswnal_rdma_msg_t       rdma;
+        } WIRE_ATTR kqm_u;
+} WIRE_ATTR kqswnal_msg_t;
+
+#if KQSW_CKSUM                                           /* enable checksums ? */
+# include <linux/crc32.h>
+static inline __u32 kqswnal_csum(__u32 crc, unsigned char const *p, size_t len)
+{
+#if 1
+        return crc32_le(crc, p, len);
 #else
-        EP_IOVEC         kqrmd_frag[0];         /* actual frags */
+        while (len-- > 0)
+                crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ;
+        return crc;
 #endif
-} kqswnal_remotemd_t;
+}
+# define QSWLND_PROTO_VERSION         0xbeef
+#else
+# define QSWLND_PROTO_VERSION         1
+#endif
+
+#define QSWLND_MSG_IMMEDIATE          0
+#define QSWLND_MSG_RDMA               1
+
+typedef union {
+        EP_STATUSBLK     ep_statusblk;
+        struct {
+                __u32       status;
+                __u32       magic;
+                __u32       version;
+                union {
+                        struct {
+                                __u32    len;
+                                __u32    cksum;
+                        } WIRE_ATTR get;
+                } WIRE_ATTR u;
+        } WIRE_ATTR     msg;
+} kqswnal_rpc_reply_t;
 
 typedef struct kqswnal_rx
 {
@@ -150,19 +171,18 @@ typedef struct kqswnal_rx
         struct kqswnal_rx *krx_alloclist;       /* stack in kqn_rxds */
         EP_RCVR         *krx_eprx;              /* port to post receives to */
         EP_RXD          *krx_rxd;               /* receive descriptor (for repost) */
-#if MULTIRAIL_EKC
         EP_NMD           krx_elanbuffer;        /* contiguous Elan buffer */
-#else
-        E3_Addr          krx_elanbuffer;        /* contiguous Elan buffer */
-#endif
         int              krx_npages;            /* # pages in receive buffer */
         int              krx_nob;               /* Number Of Bytes received into buffer */
-        int              krx_rpc_reply_needed /* peer waiting for EKC RPC reply */
-        int              krx_rpc_reply_status;  /* what status to send */
+        int              krx_rpc_reply_needed:1; /* peer waiting for EKC RPC reply */
+        int              krx_raw_lnet_hdr:1;    /* msg is a raw lnet hdr (portals compatible) */
         int              krx_state;             /* what this RX is doing */
         atomic_t         krx_refcount;          /* how to tell when rpc is done */
-        kpr_fwd_desc_t   krx_fwd;               /* embedded forwarding descriptor */
-        ptl_kiov_t       krx_kiov[KQSW_NRXMSGPAGES_LARGE]; /* buffer frags */
+#if KQSW_CKSUM
+        __u32            krx_cksum;             /* checksum */
+#endif
+        kqswnal_rpc_reply_t krx_rpc_reply;      /* rpc reply status block */
+        lnet_kiov_t      krx_kiov[KQSW_NRXMSGPAGES_LARGE]; /* buffer frags */
 }  kqswnal_rx_t;
 
 #define KRX_POSTED       1                      /* receiving */
@@ -173,48 +193,56 @@ typedef struct kqswnal_rx
 typedef struct kqswnal_tx
 {
         struct list_head  ktx_list;             /* enqueue idle/active */
-        struct list_head  ktx_delayed_list;     /* enqueue delayedtxds */
+        struct list_head  ktx_schedlist;        /* enqueue on scheduler */
         struct kqswnal_tx *ktx_alloclist;       /* stack in kqn_txds */
-        unsigned int      ktx_isnblk:1;         /* reserved descriptor? */
         unsigned int      ktx_state:7;          /* What I'm doing */
         unsigned int      ktx_firsttmpfrag:1;   /* ktx_frags[0] is in my ebuffer ? 0 : 1 */
         uint32_t          ktx_basepage;         /* page offset in reserved elan tx vaddrs for mapping pages */
         int               ktx_npages;           /* pages reserved for mapping messages */
         int               ktx_nmappedpages;     /* # pages mapped for current message */
         int               ktx_port;             /* destination ep port */
-        ptl_nid_t         ktx_nid;              /* destination node */
+        lnet_nid_t        ktx_nid;              /* destination node */
         void             *ktx_args[3];          /* completion passthru */
         char             *ktx_buffer;           /* pre-allocated contiguous buffer for hdr + small payloads */
         unsigned long     ktx_launchtime;       /* when (in jiffies) the transmit was launched */
-
+        int               ktx_status;           /* completion status */
+#if KQSW_CKSUM
+        __u32             ktx_cksum;            /* optimized GET payload checksum */
+#endif
         /* debug/info fields */
         pid_t             ktx_launcher;         /* pid of launching process */
 
         int               ktx_nfrag;            /* # message frags */
-#if MULTIRAIL_EKC
         int               ktx_rail;             /* preferred rail */
         EP_NMD            ktx_ebuffer;          /* elan mapping of ktx_buffer */
         EP_NMD            ktx_frags[EP_MAXFRAG];/* elan mapping of msg frags */
-#else
-        E3_Addr           ktx_ebuffer;          /* elan address of ktx_buffer */
-        EP_IOVEC          ktx_frags[EP_MAXFRAG];/* msg frags (elan vaddrs) */
-#endif
 } kqswnal_tx_t;
 
-#define KTX_IDLE        0                       /* on kqn_(nblk_)idletxds */
-#define KTX_FORWARDING  1                       /* sending a forwarded packet */
-#define KTX_SENDING     2                       /* normal send */
-#define KTX_GETTING     3                       /* sending optimised get */
-#define KTX_PUTTING     4                       /* sending optimised put */
-#define KTX_RDMAING     5                       /* handling optimised put/get */
+#define KTX_IDLE        0                       /* on kqn_idletxds */
+#define KTX_SENDING     1                       /* normal send */
+#define KTX_GETTING     2                       /* sending optimised get */
+#define KTX_PUTTING     3                       /* sending optimised put */
+#define KTX_RDMA_FETCH  4                       /* handling optimised put */
+#define KTX_RDMA_STORE  5                       /* handling optimised get */
 
 typedef struct
 {
-        /* dynamic tunables... */
-        int                      kqn_optimized_puts;  /* optimized PUTs? */
-        int                      kqn_optimized_gets;  /* optimized GETs? */
-#if CONFIG_SYSCTL
-        struct ctl_table_header *kqn_sysctl;          /* sysctl interface */
+        int               *kqn_tx_maxcontig;    /* maximum payload to defrag */
+        int               *kqn_ntxmsgs;         /* # normal tx msgs */
+        int               *kqn_credits;         /* # concurrent sends */
+        int               *kqn_peercredits;     /* # concurrent sends to 1 peer */
+        int               *kqn_nrxmsgs_large;   /* # 'large' rx msgs */
+        int               *kqn_ep_envelopes_large; /* # 'large' rx ep envelopes */
+        int               *kqn_nrxmsgs_small;   /* # 'small' rx msgs */
+        int               *kqn_ep_envelopes_small; /* # 'small' rx ep envelopes */
+        int               *kqn_optimized_puts;  /* optimized PUTs? */
+        int               *kqn_optimized_gets;  /* optimized GETs? */
+#if KQSW_CKSUM
+        int               *kqn_inject_csum_error; /* # csum errors to inject */
+#endif
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+        struct ctl_table_header *kqn_sysctl;    /* sysctl interface */
 #endif
 } kqswnal_tunables_t;
 
@@ -223,82 +251,68 @@ typedef struct
         char               kqn_init;            /* what's been initialised */
         char               kqn_shuttingdown;    /* I'm trying to shut down */
         atomic_t           kqn_nthreads;        /* # threads running */
+        lnet_ni_t         *kqn_ni;              /* _the_ instance of me */
 
         kqswnal_rx_t      *kqn_rxds;            /* stack of all the receive descriptors */
         kqswnal_tx_t      *kqn_txds;            /* stack of all the transmit descriptors */
 
         struct list_head   kqn_idletxds;        /* transmit descriptors free to use */
-        struct list_head   kqn_nblk_idletxds;   /* reserved free transmit descriptors */
         struct list_head   kqn_activetxds;      /* transmit descriptors being used */
         spinlock_t         kqn_idletxd_lock;    /* serialise idle txd access */
-        wait_queue_head_t  kqn_idletxd_waitq;   /* sender blocks here waiting for idle txd */
-        struct list_head   kqn_idletxd_fwdq;    /* forwarded packets block here waiting for idle txd */
         atomic_t           kqn_pending_txs;     /* # transmits being prepped */
 
         spinlock_t         kqn_sched_lock;      /* serialise packet schedulers */
         wait_queue_head_t  kqn_sched_waitq;     /* scheduler blocks here */
 
         struct list_head   kqn_readyrxds;       /* rxds full of data */
-        struct list_head   kqn_delayedfwds;     /* delayed forwards */
+        struct list_head   kqn_donetxds;        /* completed transmits */
         struct list_head   kqn_delayedtxds;     /* delayed transmits */
 
-#if MULTIRAIL_EKC
         EP_SYS            *kqn_ep;              /* elan system */
         EP_NMH            *kqn_ep_tx_nmh;       /* elan reserved tx vaddrs */
         EP_NMH            *kqn_ep_rx_nmh;       /* elan reserved rx vaddrs */
-#else
-        EP_DEV            *kqn_ep;              /* elan device */
-        ELAN3_DMA_HANDLE  *kqn_eptxdmahandle;   /* elan reserved tx vaddrs */
-        ELAN3_DMA_HANDLE  *kqn_eprxdmahandle;   /* elan reserved rx vaddrs */
-#endif
         EP_XMTR           *kqn_eptx;            /* elan transmitter */
         EP_RCVR           *kqn_eprx_small;      /* elan receiver (small messages) */
         EP_RCVR           *kqn_eprx_large;      /* elan receiver (large messages) */
-        kpr_router_t       kqn_router;          /* connection to Kernel Portals Router module */
 
-        ptl_nid_t          kqn_nid_offset;      /* this cluster's NID offset */
         int                kqn_nnodes;          /* this cluster's size */
         int                kqn_elanid;          /* this nodes's elan ID */
 
         EP_STATUSBLK       kqn_rpc_success;     /* preset RPC reply status blocks */
         EP_STATUSBLK       kqn_rpc_failed;
+        EP_STATUSBLK       kqn_rpc_version;     /* reply to future version query */
+        EP_STATUSBLK       kqn_rpc_magic;       /* reply to future version query */
 }  kqswnal_data_t;
 
 /* kqn_init state */
 #define KQN_INIT_NOTHING        0               /* MUST BE ZERO so zeroed state is initialised OK */
 #define KQN_INIT_DATA           1
-#define KQN_INIT_LIB            2
-#define KQN_INIT_ALL            3
+#define KQN_INIT_ALL            2
 
-extern lib_nal_t           kqswnal_lib;
-extern nal_t               kqswnal_api;
 extern kqswnal_tunables_t  kqswnal_tunables;
 extern kqswnal_data_t      kqswnal_data;
 
 extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg);
 extern void kqswnal_rxhandler(EP_RXD *rxd);
 extern int kqswnal_scheduler (void *);
-extern void kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
 extern void kqswnal_rx_done (kqswnal_rx_t *krx);
 
-static inline ptl_nid_t
+static inline lnet_nid_t
 kqswnal_elanid2nid (int elanid)
 {
-        return (kqswnal_data.kqn_nid_offset + elanid);
+        return LNET_MKNID(LNET_NIDNET(kqswnal_data.kqn_ni->ni_nid), elanid);
 }
 
 static inline int
-kqswnal_nid2elanid (ptl_nid_t nid)
+kqswnal_nid2elanid (lnet_nid_t nid)
 {
-        /* not in this cluster? */
-        if (nid < kqswnal_data.kqn_nid_offset ||
-            nid >= kqswnal_data.kqn_nid_offset + kqswnal_data.kqn_nnodes)
-                return (-1);
+        __u32 elanid = LNET_NIDADDR(nid);
 
-        return (nid - kqswnal_data.kqn_nid_offset);
+        /* not in this cluster? */
+        return (elanid >= kqswnal_data.kqn_nnodes) ? -1 : elanid;
 }
 
-static inline ptl_nid_t
+static inline lnet_nid_t
 kqswnal_rx_nid(kqswnal_rx_t *krx)
 {
         return (kqswnal_elanid2nid(ep_rxd_node(krx->krx_rxd)));
@@ -314,18 +328,6 @@ kqswnal_pages_spanned (void *base, int nob)
         return (last_page - first_page + 1);
 }
 
-#if KQSW_CHECKSUM
-static inline kqsw_csum_t kqsw_csum (kqsw_csum_t sum, void *base, int nob)
-{
-        unsigned char *ptr = (unsigned char *)base;
-
-        while (nob-- > 0)
-                sum += *ptr++;
-
-        return (sum);
-}
-#endif
-
 static inline void kqswnal_rx_decref (kqswnal_rx_t *krx)
 {
         LASSERT (atomic_read (&krx->krx_refcount) > 0);
@@ -333,44 +335,16 @@ static inline void kqswnal_rx_decref (kqswnal_rx_t *krx)
                 kqswnal_rx_done(krx);
 }
 
-#if MULTIRAIL_EKC
-# ifndef EP_RAILMASK_ALL
-#  error "old (unsupported) version of EKC headers"
-# endif
-#else
-/* multirail defines these in <elan/epcomms.h> */
-#define EP_MSG_SVC_PORTALS_SMALL      (0x10)  /* Portals over elan port number (large payloads) */
-#define EP_MSG_SVC_PORTALS_LARGE      (0x11)  /* Portals over elan port number (small payloads) */
-/* NB small/large message sizes are GLOBAL constants */
-
-/* A minimal attempt to minimise inline #ifdeffing */
-
-#define EP_SUCCESS      ESUCCESS
-#define EP_ENOMEM      ENOMEM
-
-static inline EP_XMTR *
-ep_alloc_xmtr(EP_DEV *e)
-{
-        return (ep_alloc_large_xmtr(e));
-}
-
-static inline EP_RCVR *
-ep_alloc_rcvr(EP_DEV *e, int svc, int nenv)
-{
-        return (ep_install_large_rcvr(e, svc, nenv));
-}
-
-static inline void
-ep_free_xmtr(EP_XMTR *x)
-{
-        ep_free_large_xmtr(x);
-}
-
-static inline void
-ep_free_rcvr(EP_RCVR *r)
-{
-        ep_remove_large_rcvr(r);
-}
-#endif
+int kqswnal_startup (lnet_ni_t *ni);
+void kqswnal_shutdown (lnet_ni_t *ni);
+int kqswnal_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg);
+int kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int kqswnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, 
+                 int delayed, unsigned int niov, 
+                 struct iovec *iov, lnet_kiov_t *kiov,
+                 unsigned int offset, unsigned int mlen, unsigned int rlen);
+
+int kqswnal_tunables_init(void);
+void kqswnal_tunables_fini(void);
 
 #endif /* _QSWNAL_H */
index 22e2cd9..86a1f8f 100644 (file)
  *
  */
 
-#include "qswnal.h"
-
-/*
- *  LIB functions follow
- *
- */
-static int
-kqswnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
-{
-        if (nid == nal->libnal_ni.ni_pid.nid)
-                *dist = 0;                      /* it's me */
-        else if (kqswnal_nid2elanid (nid) >= 0)
-                *dist = 1;                      /* it's my peer */
-        else
-                *dist = 2;                      /* via router */
-        return (0);
-}
+#include "qswlnd.h"
 
 void
 kqswnal_notify_peer_down(kqswnal_tx_t *ktx)
@@ -48,22 +32,19 @@ kqswnal_notify_peer_down(kqswnal_tx_t *ktx)
         do_gettimeofday (&now);
         then = now.tv_sec - (jiffies - ktx->ktx_launchtime)/HZ;
 
-        kpr_notify(&kqswnal_data.kqn_router, ktx->ktx_nid, 0, then);
+        lnet_notify(kqswnal_data.kqn_ni, ktx->ktx_nid, 0, then);
 }
 
 void
 kqswnal_unmap_tx (kqswnal_tx_t *ktx)
 {
-#if MULTIRAIL_EKC
         int      i;
 
         ktx->ktx_rail = -1;                     /* unset rail */
-#endif
 
         if (ktx->ktx_nmappedpages == 0)
                 return;
         
-#if MULTIRAIL_EKC
         CDEBUG(D_NET, "%p unloading %d frags starting at %d\n",
                ktx, ktx->ktx_nfrag, ktx->ktx_firsttmpfrag);
 
@@ -71,30 +52,20 @@ kqswnal_unmap_tx (kqswnal_tx_t *ktx)
                 ep_dvma_unload(kqswnal_data.kqn_ep,
                                kqswnal_data.kqn_ep_tx_nmh,
                                &ktx->ktx_frags[i]);
-#else
-        CDEBUG (D_NET, "%p[%d] unloading pages %d for %d\n",
-                ktx, ktx->ktx_nfrag, ktx->ktx_basepage, ktx->ktx_nmappedpages);
 
-        LASSERT (ktx->ktx_nmappedpages <= ktx->ktx_npages);
-        LASSERT (ktx->ktx_basepage + ktx->ktx_nmappedpages <=
-                 kqswnal_data.kqn_eptxdmahandle->NumDvmaPages);
-
-        elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState,
-                          kqswnal_data.kqn_eptxdmahandle,
-                          ktx->ktx_basepage, ktx->ktx_nmappedpages);
-#endif
         ktx->ktx_nmappedpages = 0;
 }
 
 int
-kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_t *kiov)
+kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, 
+                     unsigned int niov, lnet_kiov_t *kiov)
 {
         int       nfrags    = ktx->ktx_nfrag;
         int       nmapped   = ktx->ktx_nmappedpages;
         int       maxmapped = ktx->ktx_npages;
         uint32_t  basepage  = ktx->ktx_basepage + nmapped;
         char     *ptr;
-#if MULTIRAIL_EKC
+
         EP_RAILMASK railmask;
         int         rail;
 
@@ -104,11 +75,11 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_
                                                  kqswnal_nid2elanid(ktx->ktx_nid));
         rail = ktx->ktx_rail;
         if (rail < 0) {
-                CERROR("No rails available for "LPX64"\n", ktx->ktx_nid);
+                CERROR("No rails available for %s\n", libcfs_nid2str(ktx->ktx_nid));
                 return (-ENETDOWN);
         }
         railmask = 1 << rail;
-#endif
+
         LASSERT (nmapped <= maxmapped);
         LASSERT (nfrags >= ktx->ktx_firsttmpfrag);
         LASSERT (nfrags <= EP_MAXFRAG);
@@ -154,7 +125,6 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_
                        "%p[%d] loading %p for %d, page %d, %d total\n",
                         ktx, nfrags, ptr, fraglen, basepage, nmapped);
 
-#if MULTIRAIL_EKC
                 ep_dvma_load(kqswnal_data.kqn_ep, NULL,
                              ptr, fraglen,
                              kqswnal_data.kqn_ep_tx_nmh, basepage,
@@ -167,22 +137,6 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_
                         /* new frag if this is the first or can't merge */
                         nfrags++;
                 }
-#else
-                elan3_dvma_kaddr_load (kqswnal_data.kqn_ep->DmaState,
-                                       kqswnal_data.kqn_eptxdmahandle,
-                                       ptr, fraglen,
-                                       basepage, &ktx->ktx_frags[nfrags].Base);
-
-                if (nfrags > 0 &&                /* previous frag mapped */
-                    ktx->ktx_frags[nfrags].Base == /* contiguous with this one */
-                    (ktx->ktx_frags[nfrags-1].Base + ktx->ktx_frags[nfrags-1].Len))
-                        /* just extend previous */
-                        ktx->ktx_frags[nfrags - 1].Len += fraglen;
-                else {
-                        ktx->ktx_frags[nfrags].Len = fraglen;
-                        nfrags++;                /* new frag */
-                }
-#endif
 
                 kunmap (kiov->kiov_page);
                 
@@ -207,15 +161,65 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_
         return (0);
 }
 
+#if KQSW_CKSUM
+__u32
+kqswnal_csum_kiov (__u32 csum, int offset, int nob, 
+                   unsigned int niov, lnet_kiov_t *kiov)
+{
+        char     *ptr;
+
+        if (nob == 0)
+                return csum;
+
+        LASSERT (niov > 0);
+        LASSERT (nob > 0);
+
+        /* skip complete frags before 'offset' */
+        while (offset >= kiov->kiov_len) {
+                offset -= kiov->kiov_len;
+                kiov++;
+                niov--;
+                LASSERT (niov > 0);
+        }
+
+        do {
+                int  fraglen = kiov->kiov_len - offset;
+
+                /* each page frag is contained in one page */
+                LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE);
+
+                if (fraglen > nob)
+                        fraglen = nob;
+
+                ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset;
+
+                csum = kqswnal_csum(csum, ptr, fraglen);
+
+                kunmap (kiov->kiov_page);
+                
+                kiov++;
+                niov--;
+                nob -= fraglen;
+                offset = 0;
+
+                /* iov must not run out before end of data */
+                LASSERT (nob == 0 || niov > 0);
+
+        } while (nob > 0);
+
+        return csum;
+}
+#endif
+
 int
 kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob, 
-                    int niov, struct iovec *iov)
+                    unsigned int niov, struct iovec *iov)
 {
         int       nfrags    = ktx->ktx_nfrag;
         int       nmapped   = ktx->ktx_nmappedpages;
         int       maxmapped = ktx->ktx_npages;
         uint32_t  basepage  = ktx->ktx_basepage + nmapped;
-#if MULTIRAIL_EKC
+
         EP_RAILMASK railmask;
         int         rail;
         
@@ -225,11 +229,11 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob,
                                                  kqswnal_nid2elanid(ktx->ktx_nid));
         rail = ktx->ktx_rail;
         if (rail < 0) {
-                CERROR("No rails available for "LPX64"\n", ktx->ktx_nid);
+                CERROR("No rails available for %s\n", libcfs_nid2str(ktx->ktx_nid));
                 return (-ENETDOWN);
         }
         railmask = 1 << rail;
-#endif
+
         LASSERT (nmapped <= maxmapped);
         LASSERT (nfrags >= ktx->ktx_firsttmpfrag);
         LASSERT (nfrags <= EP_MAXFRAG);
@@ -270,7 +274,6 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob,
                        ktx, nfrags, iov->iov_base + offset, fraglen, 
                        basepage, npages, nmapped);
 
-#if MULTIRAIL_EKC
                 ep_dvma_load(kqswnal_data.kqn_ep, NULL,
                              iov->iov_base + offset, fraglen,
                              kqswnal_data.kqn_ep_tx_nmh, basepage,
@@ -283,22 +286,6 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob,
                         /* new frag if this is the first or can't merge */
                         nfrags++;
                 }
-#else
-                elan3_dvma_kaddr_load (kqswnal_data.kqn_ep->DmaState,
-                                       kqswnal_data.kqn_eptxdmahandle,
-                                       iov->iov_base + offset, fraglen,
-                                       basepage, &ktx->ktx_frags[nfrags].Base);
-
-                if (nfrags > 0 &&                /* previous frag mapped */
-                    ktx->ktx_frags[nfrags].Base == /* contiguous with this one */
-                    (ktx->ktx_frags[nfrags-1].Base + ktx->ktx_frags[nfrags-1].Len))
-                        /* just extend previous */
-                        ktx->ktx_frags[nfrags - 1].Len += fraglen;
-                else {
-                        ktx->ktx_frags[nfrags].Len = fraglen;
-                        nfrags++;                /* new frag */
-                }
-#endif
 
                 /* keep in loop for failure case */
                 ktx->ktx_nmappedpages = nmapped;
@@ -321,11 +308,50 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob,
         return (0);
 }
 
+#if KQSW_CKSUM
+__u32
+kqswnal_csum_iov (__u32 csum, int offset, int nob, 
+                  unsigned int niov, struct iovec *iov)
+{
+        if (nob == 0)
+                return csum;
+        
+        LASSERT (niov > 0);
+        LASSERT (nob > 0);
+
+        /* skip complete frags before offset */
+        while (offset >= iov->iov_len) {
+                offset -= iov->iov_len;
+                iov++;
+                niov--;
+                LASSERT (niov > 0);
+        }
+        
+        do {
+                int  fraglen = iov->iov_len - offset;
+                
+                if (fraglen > nob)
+                        fraglen = nob;
+
+                csum = kqswnal_csum(csum, iov->iov_base + offset, fraglen);
+
+                iov++;
+                niov--;
+                nob -= fraglen;
+                offset = 0;
+
+                /* iov must not run out before end of data */
+                LASSERT (nob == 0 || niov > 0);
+
+        } while (nob > 0);
+
+        return csum;
+}
+#endif
 
 void
 kqswnal_put_idle_tx (kqswnal_tx_t *ktx)
 {
-        kpr_fwd_desc_t   *fwd = NULL;
         unsigned long     flags;
 
         kqswnal_unmap_tx (ktx);                 /* release temporary mappings */
@@ -334,133 +360,133 @@ kqswnal_put_idle_tx (kqswnal_tx_t *ktx)
         spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
 
         list_del (&ktx->ktx_list);              /* take off active list */
-
-        if (ktx->ktx_isnblk) {
-                /* reserved for non-blocking tx */
-                list_add (&ktx->ktx_list, &kqswnal_data.kqn_nblk_idletxds);
-                spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
-                return;
-        }
-
         list_add (&ktx->ktx_list, &kqswnal_data.kqn_idletxds);
 
-        /* anything blocking for a tx descriptor? */
-        if (!kqswnal_data.kqn_shuttingdown &&
-            !list_empty(&kqswnal_data.kqn_idletxd_fwdq)) /* forwarded packet? */
-        {
-                CDEBUG(D_NET,"wakeup fwd\n");
-
-                fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next,
-                                  kpr_fwd_desc_t, kprfd_list);
-                list_del (&fwd->kprfd_list);
-        }
-
-        wake_up (&kqswnal_data.kqn_idletxd_waitq);
-
         spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
-
-        if (fwd == NULL)
-                return;
-
-        /* schedule packet for forwarding again */
-        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
-
-        list_add_tail (&fwd->kprfd_list, &kqswnal_data.kqn_delayedfwds);
-        wake_up (&kqswnal_data.kqn_sched_waitq);
-
-        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
 }
 
 kqswnal_tx_t *
-kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block)
+kqswnal_get_idle_tx (void)
 {
         unsigned long  flags;
-        kqswnal_tx_t  *ktx = NULL;
+        kqswnal_tx_t  *ktx;
 
-        for (;;) {
-                spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
-
-                if (kqswnal_data.kqn_shuttingdown)
-                        break;
-
-                /* "normal" descriptor is free */
-                if (!list_empty (&kqswnal_data.kqn_idletxds)) {
-                        ktx = list_entry (kqswnal_data.kqn_idletxds.next,
-                                          kqswnal_tx_t, ktx_list);
-                        break;
-                }
-
-                if (fwd != NULL)                /* forwarded packet? */
-                        break;
-
-                /* doing a local transmit */
-                if (!may_block) {
-                        if (list_empty (&kqswnal_data.kqn_nblk_idletxds)) {
-                                CERROR ("intr tx desc pool exhausted\n");
-                                break;
-                        }
-
-                        ktx = list_entry (kqswnal_data.kqn_nblk_idletxds.next,
-                                          kqswnal_tx_t, ktx_list);
-                        break;
-                }
-
-                /* block for idle tx */
+        spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
 
+        if (kqswnal_data.kqn_shuttingdown ||
+            list_empty (&kqswnal_data.kqn_idletxds)) {
                 spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
 
-                CDEBUG (D_NET, "blocking for tx desc\n");
-                wait_event (kqswnal_data.kqn_idletxd_waitq,
-                            !list_empty (&kqswnal_data.kqn_idletxds) ||
-                            kqswnal_data.kqn_shuttingdown);
+                return NULL;
         }
 
-        if (ktx != NULL) {
-                list_del (&ktx->ktx_list);
-                list_add (&ktx->ktx_list, &kqswnal_data.kqn_activetxds);
-                ktx->ktx_launcher = current->pid;
-                atomic_inc(&kqswnal_data.kqn_pending_txs);
-        } else if (fwd != NULL) {
-                /* queue forwarded packet until idle txd available */
-                CDEBUG (D_NET, "blocked fwd [%p]\n", fwd);
-                list_add_tail (&fwd->kprfd_list,
-                               &kqswnal_data.kqn_idletxd_fwdq);
-        }
+        ktx = list_entry (kqswnal_data.kqn_idletxds.next, kqswnal_tx_t, ktx_list);
+        list_del (&ktx->ktx_list);
+
+        list_add (&ktx->ktx_list, &kqswnal_data.kqn_activetxds);
+        ktx->ktx_launcher = current->pid;
+        atomic_inc(&kqswnal_data.kqn_pending_txs);
 
         spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
 
         /* Idle descs can't have any mapped (as opposed to pre-mapped) pages */
-        LASSERT (ktx == NULL || ktx->ktx_nmappedpages == 0);
-
+        LASSERT (ktx->ktx_nmappedpages == 0);
         return (ktx);
 }
 
 void
-kqswnal_tx_done (kqswnal_tx_t *ktx, int error)
+kqswnal_tx_done_in_thread_context (kqswnal_tx_t *ktx)
 {
+        lnet_msg_t    *lnetmsg0 = NULL;
+        lnet_msg_t    *lnetmsg1 = NULL;
+        int            status0  = 0;
+        int            status1  = 0;
+        kqswnal_rx_t  *krx;
+        
+        LASSERT (!in_interrupt());
+        
+        if (ktx->ktx_status == -EHOSTDOWN)
+                kqswnal_notify_peer_down(ktx);
+
         switch (ktx->ktx_state) {
-        case KTX_FORWARDING:       /* router asked me to forward this packet */
-                kpr_fwd_done (&kqswnal_data.kqn_router,
-                              (kpr_fwd_desc_t *)ktx->ktx_args[0], error);
+        case KTX_RDMA_FETCH:                    /* optimized PUT/REPLY handled */
+                krx      = (kqswnal_rx_t *)ktx->ktx_args[0];
+                lnetmsg0 = (lnet_msg_t *)ktx->ktx_args[1];
+                status0  = ktx->ktx_status;
+#if KQSW_CKSUM
+                if (status0 == 0) {             /* RDMA succeeded */
+                        kqswnal_msg_t *msg;
+                        __u32          csum;
+
+                        msg = (kqswnal_msg_t *)
+                              page_address(krx->krx_kiov[0].kiov_page);
+
+                        csum = (lnetmsg0->msg_kiov != NULL) ?
+                               kqswnal_csum_kiov(krx->krx_cksum,
+                                                 lnetmsg0->msg_offset,
+                                                 lnetmsg0->msg_wanted,
+                                                 lnetmsg0->msg_niov,
+                                                 lnetmsg0->msg_kiov) :
+                               kqswnal_csum_iov(krx->krx_cksum,
+                                                lnetmsg0->msg_offset,
+                                                lnetmsg0->msg_wanted,
+                                                lnetmsg0->msg_niov,
+                                                lnetmsg0->msg_iov);
+
+                        /* Can only check csum if I got it all */
+                        if (lnetmsg0->msg_wanted == lnetmsg0->msg_len &&
+                            csum != msg->kqm_cksum) {
+                                ktx->ktx_status = -EIO;
+                                krx->krx_rpc_reply.msg.status = -EIO;
+                                CERROR("RDMA checksum failed %u(%u) from %s\n",
+                                       csum, msg->kqm_cksum,
+                                       libcfs_nid2str(kqswnal_rx_nid(krx)));
+                        }
+                }
+#endif       
+                LASSERT (krx->krx_state == KRX_COMPLETING);
+                kqswnal_rx_decref (krx);
                 break;
 
-        case KTX_RDMAING:          /* optimized GET/PUT handled */
+        case KTX_RDMA_STORE:       /* optimized GET handled */
         case KTX_PUTTING:          /* optimized PUT sent */
         case KTX_SENDING:          /* normal send */
-                lib_finalize (&kqswnal_lib, NULL,
-                              (lib_msg_t *)ktx->ktx_args[1],
-                              (error == 0) ? PTL_OK : PTL_FAIL);
+                lnetmsg0 = (lnet_msg_t *)ktx->ktx_args[1];
+                status0  = ktx->ktx_status;
                 break;
 
-        case KTX_GETTING:          /* optimized GET sent & REPLY received */
+        case KTX_GETTING:          /* optimized GET sent & payload received */
                 /* Complete the GET with success since we can't avoid
                  * delivering a REPLY event; we committed to it when we
                  * launched the GET */
-                lib_finalize (&kqswnal_lib, NULL, 
-                              (lib_msg_t *)ktx->ktx_args[1], PTL_OK);
-                lib_finalize (&kqswnal_lib, NULL,
-                              (lib_msg_t *)ktx->ktx_args[2],
-                              (error == 0) ? PTL_OK : PTL_FAIL);
+                lnetmsg0 = (lnet_msg_t *)ktx->ktx_args[1];
+                status0  = 0;
+                lnetmsg1 = (lnet_msg_t *)ktx->ktx_args[2];
+                status1  = ktx->ktx_status;
+#if KQSW_CKSUM
+                if (status1 == 0) {             /* RDMA succeeded */
+                        lnet_msg_t   *lnetmsg0 = (lnet_msg_t *)ktx->ktx_args[1];
+                        lnet_libmd_t *md = lnetmsg0->msg_md;
+                        __u32         csum;
+                
+                        csum = ((md->md_options & LNET_MD_KIOV) != 0) ? 
+                               kqswnal_csum_kiov(~0, 0,
+                                                 md->md_length,
+                                                 md->md_niov, 
+                                                 md->md_iov.kiov) :
+                               kqswnal_csum_iov(~0, 0,
+                                                md->md_length,
+                                                md->md_niov,
+                                                md->md_iov.iov);
+
+                        if (csum != ktx->ktx_cksum) {
+                                CERROR("RDMA checksum failed %u(%u) from %s\n",
+                                       csum, ktx->ktx_cksum,
+                                       libcfs_nid2str(ktx->ktx_nid));
+                                status1 = -EIO;
+                        }
+                }
+#endif                
                 break;
 
         default:
@@ -468,12 +494,39 @@ kqswnal_tx_done (kqswnal_tx_t *ktx, int error)
         }
 
         kqswnal_put_idle_tx (ktx);
+
+        lnet_finalize (kqswnal_data.kqn_ni, lnetmsg0, status0);
+        if (lnetmsg1 != NULL)
+                lnet_finalize (kqswnal_data.kqn_ni, lnetmsg1, status1);
+}
+
+void
+kqswnal_tx_done (kqswnal_tx_t *ktx, int status)
+{
+        unsigned long      flags;
+
+        ktx->ktx_status = status;
+
+        if (!in_interrupt()) {
+                kqswnal_tx_done_in_thread_context(ktx);
+                return;
+        }
+
+        /* Complete the send in thread context */
+        spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags);
+        
+        list_add_tail(&ktx->ktx_schedlist, 
+                      &kqswnal_data.kqn_donetxds);
+        wake_up(&kqswnal_data.kqn_sched_waitq);
+        
+        spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, flags);
 }
 
 static void
 kqswnal_txhandler(EP_TXD *txd, void *arg, int status)
 {
-        kqswnal_tx_t      *ktx = (kqswnal_tx_t *)arg;
+        kqswnal_tx_t         *ktx = (kqswnal_tx_t *)arg;
+        kqswnal_rpc_reply_t  *reply;
 
         LASSERT (txd != NULL);
         LASSERT (ktx != NULL);
@@ -482,26 +535,57 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status)
 
         if (status != EP_SUCCESS) {
 
-                CERROR ("Tx completion to "LPX64" failed: %d\n", 
-                        ktx->ktx_nid, status);
+                CDEBUG (D_NETERROR, "Tx completion to %s failed: %d\n", 
+                        libcfs_nid2str(ktx->ktx_nid), status);
 
-                kqswnal_notify_peer_down(ktx);
                 status = -EHOSTDOWN;
 
         } else switch (ktx->ktx_state) {
 
         case KTX_GETTING:
         case KTX_PUTTING:
-                /* RPC completed OK; but what did our peer put in the status
-                 * block? */
-#if MULTIRAIL_EKC
-                status = ep_txd_statusblk(txd)->Data[0];
-#else
-                status = ep_txd_statusblk(txd)->Status;
+                /* RPC complete! */
+                reply = (kqswnal_rpc_reply_t *)ep_txd_statusblk(txd);
+                if (reply->msg.magic == 0) {    /* "old" peer */
+                        status = reply->msg.status;
+                        break;
+                }
+                
+                if (reply->msg.magic != LNET_PROTO_QSW_MAGIC) {
+                        if (reply->msg.magic != swab32(LNET_PROTO_QSW_MAGIC)) {
+                                CERROR("%s unexpected rpc reply magic %08x\n",
+                                       libcfs_nid2str(ktx->ktx_nid),
+                                       reply->msg.magic);
+                                status = -EPROTO;
+                                break;
+                        }
+
+                        __swab32s(&reply->msg.status);
+                        __swab32s(&reply->msg.version);
+                        
+                        if (ktx->ktx_state == KTX_GETTING) {
+                                __swab32s(&reply->msg.u.get.len);
+                                __swab32s(&reply->msg.u.get.cksum);
+                        }
+                }
+                        
+                status = reply->msg.status;
+                if (status != 0) {
+                        CERROR("%s RPC status %08x\n",
+                               libcfs_nid2str(ktx->ktx_nid), status);
+                        break;
+                }
+
+                if (ktx->ktx_state == KTX_GETTING) {
+                        lnet_set_reply_msg_len(kqswnal_data.kqn_ni,
+                                               (lnet_msg_t *)ktx->ktx_args[2],
+                                               reply->msg.u.get.len);
+#if KQSW_CKSUM
+                        ktx->ktx_cksum = reply->msg.u.get.cksum;
 #endif
+                }
                 break;
                 
-        case KTX_FORWARDING:
         case KTX_SENDING:
                 status = 0;
                 break;
@@ -511,7 +595,7 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status)
                 break;
         }
 
-        kqswnal_tx_done (ktx, status);
+        kqswnal_tx_done(ktx, status);
 }
 
 int
@@ -530,14 +614,31 @@ kqswnal_launch (kqswnal_tx_t *ktx)
 
         LASSERT (dest >= 0);                    /* must be a peer */
 
-#if MULTIRAIL_EKC
         if (ktx->ktx_nmappedpages != 0)
                 attr = EP_SET_PREFRAIL(attr, ktx->ktx_rail);
-#endif
 
         switch (ktx->ktx_state) {
         case KTX_GETTING:
         case KTX_PUTTING:
+                if (the_lnet.ln_testprotocompat != 0 &&
+                    the_lnet.ln_ptlcompat == 0) {
+                        kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer;
+
+                        /* single-shot proto test:
+                         * Future version queries will use an RPC, so I'll
+                         * co-opt one of the existing ones */
+                        LNET_LOCK();
+                        if ((the_lnet.ln_testprotocompat & 1) != 0) {
+                                msg->kqm_version++;
+                                the_lnet.ln_testprotocompat &= ~1;
+                        }
+                        if ((the_lnet.ln_testprotocompat & 2) != 0) {
+                                msg->kqm_magic = LNET_PROTO_MAGIC;
+                                the_lnet.ln_testprotocompat &= ~2;
+                        }
+                        LNET_UNLOCK();
+                }
+
                 /* NB ktx_frag[0] is the GET/PUT hdr + kqswnal_remotemd_t.
                  * The other frags are the payload, awaiting RDMA */
                 rc = ep_transmit_rpc(kqswnal_data.kqn_eptx, dest,
@@ -546,19 +647,11 @@ kqswnal_launch (kqswnal_tx_t *ktx)
                                      NULL, ktx->ktx_frags, 1);
                 break;
 
-        case KTX_FORWARDING:
         case KTX_SENDING:
-#if MULTIRAIL_EKC
                 rc = ep_transmit_message(kqswnal_data.kqn_eptx, dest,
                                          ktx->ktx_port, attr,
                                          kqswnal_txhandler, ktx,
                                          NULL, ktx->ktx_frags, ktx->ktx_nfrag);
-#else
-                rc = ep_transmit_large(kqswnal_data.kqn_eptx, dest,
-                                       ktx->ktx_port, attr, 
-                                       kqswnal_txhandler, ktx, 
-                                       ktx->ktx_frags, ktx->ktx_nfrag);
-#endif
                 break;
                 
         default:
@@ -574,14 +667,14 @@ kqswnal_launch (kqswnal_tx_t *ktx)
         case EP_ENOMEM: /* can't allocate ep txd => queue for later */
                 spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
 
-                list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds);
+                list_add_tail (&ktx->ktx_schedlist, &kqswnal_data.kqn_delayedtxds);
                 wake_up (&kqswnal_data.kqn_sched_waitq);
 
                 spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
                 return (0);
 
         default: /* fatal error */
-                CERROR ("Tx to "LPX64" failed: %d\n", ktx->ktx_nid, rc);
+                CDEBUG (D_NETERROR, "Tx to %s failed: %d\n", libcfs_nid2str(ktx->ktx_nid), rc);
                 kqswnal_notify_peer_down(ktx);
                 return (-EHOSTUNREACH);
         }
@@ -589,16 +682,16 @@ kqswnal_launch (kqswnal_tx_t *ktx)
 
 #if 0
 static char *
-hdr_type_string (ptl_hdr_t *hdr)
+hdr_type_string (lnet_hdr_t *hdr)
 {
         switch (hdr->type) {
-        case PTL_MSG_ACK:
+        case LNET_MSG_ACK:
                 return ("ACK");
-        case PTL_MSG_PUT:
+        case LNET_MSG_PUT:
                 return ("PUT");
-        case PTL_MSG_GET:
+        case LNET_MSG_GET:
                 return ("GET");
-        case PTL_MSG_REPLY:
+        case LNET_MSG_REPLY:
                 return ("REPLY");
         default:
                 return ("<UNKNOWN>");
@@ -606,7 +699,7 @@ hdr_type_string (ptl_hdr_t *hdr)
 }
 
 static void
-kqswnal_cerror_hdr(ptl_hdr_t * hdr)
+kqswnal_cerror_hdr(lnet_hdr_t * hdr)
 {
         char *type_str = hdr_type_string (hdr);
 
@@ -618,7 +711,7 @@ kqswnal_cerror_hdr(ptl_hdr_t * hdr)
                le32_to_cpu(hdr->dest_pid));
 
         switch (le32_to_cpu(hdr->type)) {
-        case PTL_MSG_PUT:
+        case LNET_MSG_PUT:
                 CERROR("    Ptl index %d, ack md "LPX64"."LPX64", "
                        "match bits "LPX64"\n",
                        le32_to_cpu(hdr->msg.put.ptl_index),
@@ -630,7 +723,7 @@ kqswnal_cerror_hdr(ptl_hdr_t * hdr)
                        hdr->msg.put.hdr_data);
                 break;
 
-        case PTL_MSG_GET:
+        case LNET_MSG_GET:
                 CERROR("    Ptl index %d, return md "LPX64"."LPX64", "
                        "match bits "LPX64"\n",
                        le32_to_cpu(hdr->msg.get.ptl_index),
@@ -642,14 +735,14 @@ kqswnal_cerror_hdr(ptl_hdr_t * hdr)
                        le32_to_cpu(hdr->msg.get.src_offset));
                 break;
 
-        case PTL_MSG_ACK:
+        case LNET_MSG_ACK:
                 CERROR("    dst md "LPX64"."LPX64", manipulated length %d\n",
                        hdr->msg.ack.dst_wmd.wh_interface_cookie,
                        hdr->msg.ack.dst_wmd.wh_object_cookie,
                        le32_to_cpu(hdr->msg.ack.mlength));
                 break;
 
-        case PTL_MSG_REPLY:
+        case LNET_MSG_REPLY:
                 CERROR("    dst md "LPX64"."LPX64"\n",
                        hdr->msg.reply.dst_wmd.wh_interface_cookie,
                        hdr->msg.reply.dst_wmd.wh_object_cookie);
@@ -658,67 +751,6 @@ kqswnal_cerror_hdr(ptl_hdr_t * hdr)
 }                               /* end of print_hdr() */
 #endif
 
-#if !MULTIRAIL_EKC
-void
-kqswnal_print_eiov (int how, char *str, int n, EP_IOVEC *iov) 
-{
-        int          i;
-
-        CDEBUG (how, "%s: %d\n", str, n);
-        for (i = 0; i < n; i++) {
-                CDEBUG (how, "   %08x for %d\n", iov[i].Base, iov[i].Len);
-        }
-}
-
-int
-kqswnal_eiovs2datav (int ndv, EP_DATAVEC *dv,
-                     int nsrc, EP_IOVEC *src,
-                     int ndst, EP_IOVEC *dst) 
-{
-        int        count;
-        int        nob;
-
-        LASSERT (ndv > 0);
-        LASSERT (nsrc > 0);
-        LASSERT (ndst > 0);
-
-        for (count = 0; count < ndv; count++, dv++) {
-
-                if (nsrc == 0 || ndst == 0) {
-                        if (nsrc != ndst) {
-                                /* For now I'll barf on any left over entries */
-                                CERROR ("mismatched src and dst iovs\n");
-                                return (-EINVAL);
-                        }
-                        return (count);
-                }
-
-                nob = (src->Len < dst->Len) ? src->Len : dst->Len;
-                dv->Len    = nob;
-                dv->Source = src->Base;
-                dv->Dest   = dst->Base;
-
-                if (nob >= src->Len) {
-                        src++;
-                        nsrc--;
-                } else {
-                        src->Len -= nob;
-                        src->Base += nob;
-                }
-                
-                if (nob >= dst->Len) {
-                        dst++;
-                        ndst--;
-                } else {
-                        src->Len -= nob;
-                        src->Base += nob;
-                }
-        }
-
-        CERROR ("DATAVEC too small\n");
-        return (-E2BIG);
-}
-#else
 int
 kqswnal_check_rdma (int nlfrag, EP_NMD *lfrag,
                     int nrfrag, EP_NMD *rfrag)
@@ -741,36 +773,17 @@ kqswnal_check_rdma (int nlfrag, EP_NMD *lfrag,
         
         return (0);
 }
-#endif
 
 kqswnal_remotemd_t *
-kqswnal_parse_rmd (kqswnal_rx_t *krx, int type, ptl_nid_t expected_nid)
+kqswnal_get_portalscompat_rmd (kqswnal_rx_t *krx)
 {
+        /* Check that the RMD sent after the "raw" LNET header in a
+         * portals-compatible QSWLND message is OK */
         char               *buffer = (char *)page_address(krx->krx_kiov[0].kiov_page);
-        ptl_hdr_t          *hdr = (ptl_hdr_t *)buffer;
-        kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(buffer + KQSW_HDR_SIZE);
-        ptl_nid_t           nid = kqswnal_rx_nid(krx);
-
-        /* Note (1) lib_parse has already flipped hdr.
-         *      (2) RDMA addresses are sent in native endian-ness.  When
-         *      EKC copes with different endian nodes, I'll fix this (and
-         *      eat my hat :) */
+        kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(buffer + sizeof(lnet_hdr_t));
 
-        LASSERT (krx->krx_nob >= sizeof(*hdr));
-
-        if (hdr->type != type) {
-                CERROR ("Unexpected optimized get/put type %d (%d expected)"
-                        "from "LPX64"\n", hdr->type, type, nid);
-                return (NULL);
-        }
-        
-        if (hdr->src_nid != nid) {
-                CERROR ("Unexpected optimized get/put source NID "
-                        LPX64" from "LPX64"\n", hdr->src_nid, nid);
-                return (NULL);
-        }
-
-        LASSERT (nid == expected_nid);
+        /* Note RDMA addresses are sent in native endian-ness in the "old"
+         * portals protocol so no swabbing... */
 
         if (buffer + krx->krx_nob < (char *)(rmd + 1)) {
                 /* msg too small to discover rmd size */
@@ -800,116 +813,96 @@ kqswnal_rdma_store_complete (EP_RXD *rxd)
         CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR,
                "rxd %p, ktx %p, status %d\n", rxd, ktx, status);
 
-        LASSERT (ktx->ktx_state == KTX_RDMAING);
+        LASSERT (ktx->ktx_state == KTX_RDMA_STORE);
         LASSERT (krx->krx_rxd == rxd);
         LASSERT (krx->krx_rpc_reply_needed);
 
         krx->krx_rpc_reply_needed = 0;
         kqswnal_rx_decref (krx);
 
-        /* free ktx & finalize() its lib_msg_t */
+        /* free ktx & finalize() its lnet_msg_t */
         kqswnal_tx_done(ktx, (status == EP_SUCCESS) ? 0 : -ECONNABORTED);
 }
 
 void
 kqswnal_rdma_fetch_complete (EP_RXD *rxd) 
 {
-        /* Completed fetching the PUT data */
+        /* Completed fetching the PUT/REPLY data */
         int           status = ep_rxd_status(rxd);
         kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd);
         kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0];
-        unsigned long flags;
         
         CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR,
                "rxd %p, ktx %p, status %d\n", rxd, ktx, status);
 
-        LASSERT (ktx->ktx_state == KTX_RDMAING);
+        LASSERT (ktx->ktx_state == KTX_RDMA_FETCH);
         LASSERT (krx->krx_rxd == rxd);
         /* RPC completes with failure by default */
         LASSERT (krx->krx_rpc_reply_needed);
-        LASSERT (krx->krx_rpc_reply_status != 0);
+        LASSERT (krx->krx_rpc_reply.msg.status != 0);
 
         if (status == EP_SUCCESS) {
-                status = krx->krx_rpc_reply_status = 0;
+                krx->krx_rpc_reply.msg.status = 0;
+                status = 0;
         } else {
                 /* Abandon RPC since get failed */
                 krx->krx_rpc_reply_needed = 0;
                 status = -ECONNABORTED;
         }
 
-        /* free ktx & finalize() its lib_msg_t */
-        kqswnal_tx_done(ktx, status);
-
-        if (!in_interrupt()) {
-                /* OK to complete the RPC now (iff I had the last ref) */
-                kqswnal_rx_decref (krx);
-                return;
-        }
-
+        /* krx gets decref'd in kqswnal_tx_done_in_thread_context() */
         LASSERT (krx->krx_state == KRX_PARSE);
         krx->krx_state = KRX_COMPLETING;
 
-        /* Complete the RPC in thread context */
-        spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
-
-        list_add_tail (&krx->krx_list, &kqswnal_data.kqn_readyrxds);
-        wake_up (&kqswnal_data.kqn_sched_waitq);
-
-        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+        /* free ktx & finalize() its lnet_msg_t */
+        kqswnal_tx_done(ktx, status);
 }
 
 int
-kqswnal_rdma (kqswnal_rx_t *krx, lib_msg_t *libmsg, int type,
-              int niov, struct iovec *iov, ptl_kiov_t *kiov,
-              size_t offset, size_t len)
+kqswnal_rdma (kqswnal_rx_t *krx, lnet_msg_t *lntmsg,
+              int type, kqswnal_remotemd_t *rmd,
+              unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+              unsigned int offset, unsigned int len)
 {
-        kqswnal_remotemd_t *rmd;
         kqswnal_tx_t       *ktx;
         int                 eprc;
         int                 rc;
-#if !MULTIRAIL_EKC
-        EP_DATAVEC          datav[EP_MAXFRAG];
-        int                 ndatav;
-#endif
 
-        LASSERT (type == PTL_MSG_GET || type == PTL_MSG_PUT);
         /* Not both mapped and paged payload */
         LASSERT (iov == NULL || kiov == NULL);
         /* RPC completes with failure by default */
         LASSERT (krx->krx_rpc_reply_needed);
-        LASSERT (krx->krx_rpc_reply_status != 0);
-
-        rmd = kqswnal_parse_rmd(krx, type, libmsg->ev.initiator.nid);
-        if (rmd == NULL)
-                return (-EPROTO);
+        LASSERT (krx->krx_rpc_reply.msg.status != 0);
 
         if (len == 0) {
                 /* data got truncated to nothing. */
-                lib_finalize(&kqswnal_lib, krx, libmsg, PTL_OK);
+                lnet_finalize(kqswnal_data.kqn_ni, lntmsg, 0);
                 /* Let kqswnal_rx_done() complete the RPC with success */
-                krx->krx_rpc_reply_status = 0;
+                krx->krx_rpc_reply.msg.status = 0;
                 return (0);
         }
         
         /* NB I'm using 'ktx' just to map the local RDMA buffers; I'm not
            actually sending a portals message with it */
-        ktx = kqswnal_get_idle_tx(NULL, 0);
+        ktx = kqswnal_get_idle_tx();
         if (ktx == NULL) {
-                CERROR ("Can't get txd for RDMA with "LPX64"\n",
-                        libmsg->ev.initiator.nid);
+                CERROR ("Can't get txd for RDMA with %s\n",
+                        libcfs_nid2str(kqswnal_rx_nid(krx)));
                 return (-ENOMEM);
         }
 
-        ktx->ktx_state   = KTX_RDMAING;
-        ktx->ktx_nid     = libmsg->ev.initiator.nid;
+        ktx->ktx_state   = type;
+        ktx->ktx_nid     = kqswnal_rx_nid(krx);
         ktx->ktx_args[0] = krx;
-        ktx->ktx_args[1] = libmsg;
+        ktx->ktx_args[1] = lntmsg;
+
+        LASSERT (atomic_read(&krx->krx_refcount) > 0);
+        /* Take an extra ref for the completion callback */
+        atomic_inc(&krx->krx_refcount);
 
-#if MULTIRAIL_EKC
         /* Map on the rail the RPC prefers */
         ktx->ktx_rail = ep_rcvr_prefrail(krx->krx_eprx,
                                          ep_rxd_railmask(krx->krx_rxd));
-#endif
 
         /* Start mapping at offset 0 (we're not mapping any headers) */
         ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0;
@@ -924,60 +917,36 @@ kqswnal_rdma (kqswnal_rx_t *krx, lib_msg_t *libmsg, int type,
                 goto out;
         }
 
-#if MULTIRAIL_EKC
         rc = kqswnal_check_rdma (ktx->ktx_nfrag, ktx->ktx_frags,
                                  rmd->kqrmd_nfrag, rmd->kqrmd_frag);
         if (rc != 0) {
                 CERROR ("Incompatible RDMA descriptors\n");
                 goto out;
         }
-#else
+
         switch (type) {
         default:
                 LBUG();
-
-        case PTL_MSG_GET:
-                ndatav = kqswnal_eiovs2datav(EP_MAXFRAG, datav,
-                                             ktx->ktx_nfrag, ktx->ktx_frags,
-                                             rmd->kqrmd_nfrag, rmd->kqrmd_frag);
-                break;
-
-        case PTL_MSG_PUT:
-                ndatav = kqswnal_eiovs2datav(EP_MAXFRAG, datav,
-                                             rmd->kqrmd_nfrag, rmd->kqrmd_frag,
-                                             ktx->ktx_nfrag, ktx->ktx_frags);
-                break;
-        }
                 
-        if (ndatav < 0) {
-                CERROR ("Can't create datavec: %d\n", ndatav);
-                rc = ndatav;
-                goto out;
-        }
+        case KTX_RDMA_STORE:
+                krx->krx_rpc_reply.msg.status    = 0;
+                krx->krx_rpc_reply.msg.magic     = LNET_PROTO_QSW_MAGIC;
+                krx->krx_rpc_reply.msg.version   = QSWLND_PROTO_VERSION;
+                krx->krx_rpc_reply.msg.u.get.len = len;
+#if KQSW_CKSUM
+                krx->krx_rpc_reply.msg.u.get.cksum = (kiov != NULL) ?
+                            kqswnal_csum_kiov(~0, offset, len, niov, kiov) :
+                            kqswnal_csum_iov(~0, offset, len, niov, iov);
+                if (*kqswnal_tunables.kqn_inject_csum_error == 4) {
+                        krx->krx_rpc_reply.msg.u.get.cksum++;
+                        *kqswnal_tunables.kqn_inject_csum_error = 0;
+                }
 #endif
-
-        LASSERT (atomic_read(&krx->krx_refcount) > 0);
-        /* Take an extra ref for the completion callback */
-        atomic_inc(&krx->krx_refcount);
-
-        switch (type) {
-        default:
-                LBUG();
-
-        case PTL_MSG_GET:
-#if MULTIRAIL_EKC
                 eprc = ep_complete_rpc(krx->krx_rxd, 
                                        kqswnal_rdma_store_complete, ktx, 
-                                       &kqswnal_data.kqn_rpc_success,
-                                       ktx->ktx_frags, rmd->kqrmd_frag, rmd->kqrmd_nfrag);
-#else
-                eprc = ep_complete_rpc (krx->krx_rxd, 
-                                        kqswnal_rdma_store_complete, ktx,
-                                        &kqswnal_data.kqn_rpc_success, 
-                                        datav, ndatav);
-                if (eprc != EP_SUCCESS) /* "old" EKC destroys rxd on failed completion */
-                        krx->krx_rxd = NULL;
-#endif
+                                       &krx->krx_rpc_reply.ep_statusblk, 
+                                       ktx->ktx_frags, rmd->kqrmd_frag, 
+                                       rmd->kqrmd_nfrag);
                 if (eprc != EP_SUCCESS) {
                         CERROR("can't complete RPC: %d\n", eprc);
                         /* don't re-attempt RPC completion */
@@ -986,16 +955,10 @@ kqswnal_rdma (kqswnal_rx_t *krx, lib_msg_t *libmsg, int type,
                 }
                 break;
                 
-        case PTL_MSG_PUT:
-#if MULTIRAIL_EKC
+        case KTX_RDMA_FETCH:
                 eprc = ep_rpc_get (krx->krx_rxd, 
                                    kqswnal_rdma_fetch_complete, ktx,
                                    rmd->kqrmd_frag, ktx->ktx_frags, ktx->ktx_nfrag);
-#else
-                eprc = ep_rpc_get (krx->krx_rxd,
-                                   kqswnal_rdma_fetch_complete, ktx,
-                                   datav, ndatav);
-#endif
                 if (eprc != EP_SUCCESS) {
                         CERROR("ep_rpc_get failed: %d\n", eprc);
                         /* Don't attempt RPC completion: 
@@ -1016,228 +979,243 @@ kqswnal_rdma (kqswnal_rx_t *krx, lib_msg_t *libmsg, int type,
         return (rc);
 }
 
-static ptl_err_t
-kqswnal_sendmsg (lib_nal_t    *nal,
-                 void         *private,
-                 lib_msg_t    *libmsg,
-                 ptl_hdr_t    *hdr,
-                 int           type,
-                 ptl_nid_t     nid,
-                 ptl_pid_t     pid,
-                 unsigned int  payload_niov,
-                 struct iovec *payload_iov,
-                 ptl_kiov_t   *payload_kiov,
-                 size_t        payload_offset,
-                 size_t        payload_nob)
+int
+kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
 {
-        kqswnal_tx_t      *ktx;
-        int                rc;
-        ptl_nid_t          targetnid;
-#if KQSW_CHECKSUM
-        int                i;
-        kqsw_csum_t        csum;
-        int                sumoff;
-        int                sumnob;
-#endif
+        lnet_hdr_t       *hdr = &lntmsg->msg_hdr;
+        int               type = lntmsg->msg_type;
+        lnet_process_id_t target = lntmsg->msg_target;
+        int               target_is_router = lntmsg->msg_target_is_router;
+        int               routing = lntmsg->msg_routing;
+        unsigned int      payload_niov = lntmsg->msg_niov;
+        struct iovec     *payload_iov = lntmsg->msg_iov;
+        lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
+        unsigned int      payload_offset = lntmsg->msg_offset;
+        unsigned int      payload_nob = lntmsg->msg_len;
+        int               nob;
+        kqswnal_tx_t     *ktx;
+        int               rc;
+
         /* NB 1. hdr is in network byte order */
         /*    2. 'private' depends on the message type */
         
-        CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid: "LPX64
-               " pid %u\n", payload_nob, payload_niov, nid, pid);
+        CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n",
+               payload_nob, payload_niov, libcfs_id2str(target));
 
         LASSERT (payload_nob == 0 || payload_niov > 0);
-        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+        LASSERT (payload_niov <= LNET_MAX_IOV);
 
         /* It must be OK to kmap() if required */
         LASSERT (payload_kiov == NULL || !in_interrupt ());
         /* payload is either all vaddrs or all pages */
         LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
 
-        if (payload_nob > KQSW_MAXPAYLOAD) {
-                CERROR ("request exceeds MTU size "LPSZ" (max %u).\n",
-                        payload_nob, KQSW_MAXPAYLOAD);
-                return (PTL_FAIL);
-        }
-
-        if (type == PTL_MSG_REPLY &&            /* can I look in 'private' */
-            ((kqswnal_rx_t *)private)->krx_rpc_reply_needed) { /* is it an RPC */
-                /* Must be a REPLY for an optimized GET */
-                rc = kqswnal_rdma ((kqswnal_rx_t *)private, libmsg, PTL_MSG_GET,
-                                   payload_niov, payload_iov, payload_kiov, 
-                                   payload_offset, payload_nob);
-                return ((rc == 0) ? PTL_OK : PTL_FAIL);
-        }
-
-        targetnid = nid;
-        if (kqswnal_nid2elanid (nid) < 0) {     /* Can't send direct: find gateway? */
-                rc = kpr_lookup (&kqswnal_data.kqn_router, nid, 
-                                 sizeof (ptl_hdr_t) + payload_nob, &targetnid);
-                if (rc != 0) {
-                        CERROR("Can't route to "LPX64": router error %d\n",
-                               nid, rc);
-                        return (PTL_FAIL);
-                }
-                if (kqswnal_nid2elanid (targetnid) < 0) {
-                        CERROR("Bad gateway "LPX64" for "LPX64"\n",
-                               targetnid, nid);
-                        return (PTL_FAIL);
-                }
+        if (kqswnal_nid2elanid (target.nid) < 0) {
+                CERROR("%s not in my cluster\n", libcfs_nid2str(target.nid));
+                return -EIO;
         }
 
         /* I may not block for a transmit descriptor if I might block the
-         * receiver, or an interrupt handler. */
-        ktx = kqswnal_get_idle_tx(NULL, !(type == PTL_MSG_ACK ||
-                                          type == PTL_MSG_REPLY ||
-                                          in_interrupt()));
+         * router, receiver, or an interrupt handler. */
+        ktx = kqswnal_get_idle_tx();
         if (ktx == NULL) {
-                CERROR ("Can't get txd for msg type %d for "LPX64"\n",
-                        type, libmsg->ev.initiator.nid);
-                return (PTL_NO_SPACE);
+                CERROR ("Can't get txd for msg type %d for %s\n",
+                        type, libcfs_nid2str(target.nid));
+                return (-ENOMEM);
         }
 
         ktx->ktx_state   = KTX_SENDING;
-        ktx->ktx_nid     = targetnid;
+        ktx->ktx_nid     = target.nid;
         ktx->ktx_args[0] = private;
-        ktx->ktx_args[1] = libmsg;
+        ktx->ktx_args[1] = lntmsg;
         ktx->ktx_args[2] = NULL;    /* set when a GET commits to REPLY */
 
-        memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */
-
-#if KQSW_CHECKSUM
-        csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr));
-        memcpy (ktx->ktx_buffer + sizeof (*hdr), &csum, sizeof (csum));
-        for (csum = 0, i = 0, sumoff = payload_offset, sumnob = payload_nob; sumnob > 0; i++) {
-                LASSERT(i < niov);
-                if (payload_kiov != NULL) {
-                        ptl_kiov_t *kiov = &payload_kiov[i];
-
-                        if (sumoff >= kiov->kiov_len) {
-                                sumoff -= kiov->kiov_len;
-                        } else {
-                                char *addr = ((char *)kmap (kiov->kiov_page)) +
-                                             kiov->kiov_offset + sumoff;
-                                int   fragnob = kiov->kiov_len - sumoff;
-
-                                csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob));
-                                sumnob -= fragnob;
-                                sumoff = 0;
-                                kunmap(kiov->kiov_page);
-                        }
-                } else {
-                        struct iovec *iov = &payload_iov[i];
-
-                        if (sumoff > iov->iov_len) {
-                                sumoff -= iov->iov_len;
-                        } else {
-                                char *addr = iov->iov_base + sumoff;
-                                int   fragnob = iov->iov_len - sumoff;
-                                
-                                csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob));
-                                sumnob -= fragnob;
-                                sumoff = 0;
-                        }
-                }
-        }
-        memcpy(ktx->ktx_buffer + sizeof(*hdr) + sizeof(csum), &csum, sizeof(csum));
-#endif
-
-        /* The first frag will be the pre-mapped buffer for (at least) the
-         * portals header. */
+        /* The first frag will be the pre-mapped buffer. */
         ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1;
 
-        if (nid == targetnid &&                 /* not forwarding */
-            ((type == PTL_MSG_GET &&            /* optimize GET? */
-              kqswnal_tunables.kqn_optimized_gets != 0 &&
-              le32_to_cpu(hdr->msg.get.sink_length) >= kqswnal_tunables.kqn_optimized_gets) ||
-             (type == PTL_MSG_PUT &&            /* optimize PUT? */
-              kqswnal_tunables.kqn_optimized_puts != 0 &&
-              payload_nob >= kqswnal_tunables.kqn_optimized_puts))) {
-                lib_md_t           *md = libmsg->md;
-                kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(ktx->ktx_buffer + KQSW_HDR_SIZE);
-                
+        if ((!target_is_router &&               /* target.nid is final dest */
+             !routing &&                        /* I'm the source */
+             type == LNET_MSG_GET &&            /* optimize GET? */
+             *kqswnal_tunables.kqn_optimized_gets != 0 &&
+             lntmsg->msg_md->md_length >= 
+             *kqswnal_tunables.kqn_optimized_gets) ||
+            ((type == LNET_MSG_PUT ||            /* optimize PUT? */
+              type == LNET_MSG_REPLY) &&         /* optimize REPLY? */
+             *kqswnal_tunables.kqn_optimized_puts != 0 &&
+             payload_nob >= *kqswnal_tunables.kqn_optimized_puts)) {
+                lnet_libmd_t       *md = lntmsg->msg_md;
+                kqswnal_msg_t      *msg = (kqswnal_msg_t *)ktx->ktx_buffer;
+                lnet_hdr_t         *mhdr;
+                kqswnal_remotemd_t *rmd;
+
                 /* Optimised path: I send over the Elan vaddrs of the local
                  * buffers, and my peer DMAs directly to/from them.
                  *
                  * First I set up ktx as if it was going to send this
                  * payload, (it needs to map it anyway).  This fills
                  * ktx_frags[1] and onward with the network addresses
-                 * of the GET sink frags.  I copy these into ktx_buffer,
-                 * immediately after the header, and send that as my
-                 * message. */
+                 * of the buffer frags. */
+
+                if (the_lnet.ln_ptlcompat == 2) {
+                        /* Strong portals compatibility: send "raw" LNET
+                         * header + rdma descriptor */
+                        mhdr = (lnet_hdr_t *)ktx->ktx_buffer;
+                        rmd  = (kqswnal_remotemd_t *)(mhdr + 1);
+                } else {
+                        /* Send an RDMA message */
+                        msg->kqm_magic = LNET_PROTO_QSW_MAGIC;
+                        msg->kqm_version = QSWLND_PROTO_VERSION;
+                        msg->kqm_type = QSWLND_MSG_RDMA;
+
+                        mhdr = &msg->kqm_u.rdma.kqrm_hdr;
+                        rmd  = &msg->kqm_u.rdma.kqrm_rmd;
+                }
 
-                ktx->ktx_state = (type == PTL_MSG_PUT) ? KTX_PUTTING : KTX_GETTING;
+                *mhdr = *hdr;
+                nob = (((char *)rmd) - ktx->ktx_buffer);
+
+                if (type == LNET_MSG_GET) {
+                        if ((md->md_options & LNET_MD_KIOV) != 0) 
+                                rc = kqswnal_map_tx_kiov (ktx, 0, md->md_length,
+                                                          md->md_niov, md->md_iov.kiov);
+                        else
+                                rc = kqswnal_map_tx_iov (ktx, 0, md->md_length,
+                                                         md->md_niov, md->md_iov.iov);
+                        ktx->ktx_state = KTX_GETTING;
+                } else {
+                        if (payload_kiov != NULL)
+                                rc = kqswnal_map_tx_kiov(ktx, 0, payload_nob,
+                                                         payload_niov, payload_kiov);
+                        else
+                                rc = kqswnal_map_tx_iov(ktx, 0, payload_nob,
+                                                        payload_niov, payload_iov);
+                        ktx->ktx_state = KTX_PUTTING;
+                }
 
-                if ((libmsg->md->options & PTL_MD_KIOV) != 0) 
-                        rc = kqswnal_map_tx_kiov (ktx, 0, md->length,
-                                                  md->md_niov, md->md_iov.kiov);
-                else
-                        rc = kqswnal_map_tx_iov (ktx, 0, md->length,
-                                                 md->md_niov, md->md_iov.iov);
                 if (rc != 0)
                         goto out;
 
                 rmd->kqrmd_nfrag = ktx->ktx_nfrag - 1;
+                nob += offsetof(kqswnal_remotemd_t,
+                                kqrmd_frag[rmd->kqrmd_nfrag]);
+                LASSERT (nob <= KQSW_TX_BUFFER_SIZE);
 
-                payload_nob = offsetof(kqswnal_remotemd_t,
-                                       kqrmd_frag[rmd->kqrmd_nfrag]);
-                LASSERT (KQSW_HDR_SIZE + payload_nob <= KQSW_TX_BUFFER_SIZE);
-
-#if MULTIRAIL_EKC
                 memcpy(&rmd->kqrmd_frag[0], &ktx->ktx_frags[1],
                        rmd->kqrmd_nfrag * sizeof(EP_NMD));
 
-                ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer,
-                              0, KQSW_HDR_SIZE + payload_nob);
-#else
-                memcpy(&rmd->kqrmd_frag[0], &ktx->ktx_frags[1],
-                       rmd->kqrmd_nfrag * sizeof(EP_IOVEC));
-                
-                ktx->ktx_frags[0].Base = ktx->ktx_ebuffer;
-                ktx->ktx_frags[0].Len = KQSW_HDR_SIZE + payload_nob;
+                ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, nob);
+#if KQSW_CKSUM
+                LASSERT (the_lnet.ln_ptlcompat != 2);
+                msg->kqm_nob   = nob + payload_nob;
+                msg->kqm_cksum = 0;
+                msg->kqm_cksum = kqswnal_csum(~0, (char *)msg, nob);
 #endif
-                if (type == PTL_MSG_GET) {
+                if (type == LNET_MSG_GET) {
                         /* Allocate reply message now while I'm in thread context */
-                        ktx->ktx_args[2] = lib_create_reply_msg (&kqswnal_lib,
-                                                                 nid, libmsg);
+                        ktx->ktx_args[2] = lnet_create_reply_msg (
+                                kqswnal_data.kqn_ni, lntmsg);
                         if (ktx->ktx_args[2] == NULL)
                                 goto out;
 
                         /* NB finalizing the REPLY message is my
                          * responsibility now, whatever happens. */
+#if KQSW_CKSUM
+                        if (*kqswnal_tunables.kqn_inject_csum_error ==  3) {
+                                msg->kqm_cksum++;
+                                *kqswnal_tunables.kqn_inject_csum_error = 0;
+                        }
+
+                } else if (payload_kiov != NULL) {
+                        /* must checksum payload after header so receiver can
+                         * compute partial header cksum before swab.  Sadly
+                         * this causes 2 rounds of kmap */
+                        msg->kqm_cksum =
+                                kqswnal_csum_kiov(msg->kqm_cksum, 0, payload_nob,
+                                                  payload_niov, payload_kiov);
+                        if (*kqswnal_tunables.kqn_inject_csum_error ==  2) {
+                                msg->kqm_cksum++;
+                                *kqswnal_tunables.kqn_inject_csum_error = 0;
+                        }
+                } else {
+                        msg->kqm_cksum =
+                                kqswnal_csum_iov(msg->kqm_cksum, 0, payload_nob,
+                                                 payload_niov, payload_iov);
+                        if (*kqswnal_tunables.kqn_inject_csum_error ==  2) {
+                                msg->kqm_cksum++;
+                                *kqswnal_tunables.kqn_inject_csum_error = 0;
+                        }
+#endif
                 }
                 
-        } else if (payload_nob <= KQSW_TX_MAXCONTIG) {
+        } else if (payload_nob <= *kqswnal_tunables.kqn_tx_maxcontig) {
+                lnet_hdr_t    *mhdr;
+                char          *payload;
+                kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer;
 
                 /* small message: single frag copied into the pre-mapped buffer */
+                if (the_lnet.ln_ptlcompat == 2) {
+                        /* Strong portals compatibility: send "raw" LNET header
+                         * + payload */
+                        mhdr = (lnet_hdr_t *)ktx->ktx_buffer;
+                        payload = (char *)(mhdr + 1);
+                } else {
+                        /* Send an IMMEDIATE message */
+                        msg->kqm_magic = LNET_PROTO_QSW_MAGIC;
+                        msg->kqm_version = QSWLND_PROTO_VERSION;
+                        msg->kqm_type = QSWLND_MSG_IMMEDIATE;
 
-#if MULTIRAIL_EKC
-                ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer,
-                              0, KQSW_HDR_SIZE + payload_nob);
-#else
-                ktx->ktx_frags[0].Base = ktx->ktx_ebuffer;
-                ktx->ktx_frags[0].Len = KQSW_HDR_SIZE + payload_nob;
-#endif
-                if (payload_nob > 0) {
-                        if (payload_kiov != NULL)
-                                lib_copy_kiov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
-                                                   payload_niov, payload_kiov, 
-                                                   payload_offset, payload_nob);
-                        else
-                                lib_copy_iov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE,
-                                                  payload_niov, payload_iov, 
-                                                  payload_offset, payload_nob);
+                        mhdr = &msg->kqm_u.immediate.kqim_hdr;
+                        payload = msg->kqm_u.immediate.kqim_payload;
+                }
+
+                *mhdr = *hdr;
+                nob = (payload - ktx->ktx_buffer) + payload_nob;
+
+                ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, nob);
+
+                if (payload_kiov != NULL)
+                        lnet_copy_kiov2flat(KQSW_TX_BUFFER_SIZE, payload, 0,
+                                            payload_niov, payload_kiov, 
+                                            payload_offset, payload_nob);
+                else
+                        lnet_copy_iov2flat(KQSW_TX_BUFFER_SIZE, payload, 0,
+                                           payload_niov, payload_iov, 
+                                           payload_offset, payload_nob);
+#if KQSW_CKSUM
+                LASSERT (the_lnet.ln_ptlcompat != 2);
+                msg->kqm_nob   = nob;
+                msg->kqm_cksum = 0;
+                msg->kqm_cksum = kqswnal_csum(~0, (char *)msg, nob);
+                if (*kqswnal_tunables.kqn_inject_csum_error == 1) {
+                        msg->kqm_cksum++;
+                        *kqswnal_tunables.kqn_inject_csum_error = 0;
                 }
+#endif
         } else {
+                lnet_hdr_t    *mhdr;
+                kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer;
 
                 /* large message: multiple frags: first is hdr in pre-mapped buffer */
+                if (the_lnet.ln_ptlcompat == 2) {
+                        /* Strong portals compatibility: send "raw" LNET header
+                         * + payload */
+                        mhdr = (lnet_hdr_t *)ktx->ktx_buffer;
+                        nob = sizeof(lnet_hdr_t);
+                } else {
+                        /* Send an IMMEDIATE message */
+                        msg->kqm_magic = LNET_PROTO_QSW_MAGIC;
+                        msg->kqm_version = QSWLND_PROTO_VERSION;
+                        msg->kqm_type = QSWLND_MSG_IMMEDIATE;
+
+                        mhdr = &msg->kqm_u.immediate.kqim_hdr;
+                        nob = offsetof(kqswnal_msg_t,
+                                       kqm_u.immediate.kqim_payload);
+                }
+
+                *mhdr = *hdr;
+
+                ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, nob);
 
-#if MULTIRAIL_EKC
-                ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer,
-                              0, KQSW_HDR_SIZE);
-#else
-                ktx->ktx_frags[0].Base = ktx->ktx_ebuffer;
-                ktx->ktx_frags[0].Len = KQSW_HDR_SIZE;
-#endif
                 if (payload_kiov != NULL)
                         rc = kqswnal_map_tx_kiov (ktx, payload_offset, payload_nob, 
                                                   payload_niov, payload_kiov);
@@ -1246,175 +1224,61 @@ kqswnal_sendmsg (lib_nal_t    *nal,
                                                  payload_niov, payload_iov);
                 if (rc != 0)
                         goto out;
+
+#if KQSW_CKSUM
+                msg->kqm_nob   = nob + payload_nob;
+                msg->kqm_cksum = 0;
+                msg->kqm_cksum = kqswnal_csum(~0, (char *)msg, nob);
+
+                msg->kqm_cksum = (payload_kiov != NULL) ?
+                                 kqswnal_csum_kiov(msg->kqm_cksum,
+                                                   payload_offset, payload_nob,
+                                                   payload_niov, payload_kiov) :
+                                 kqswnal_csum_iov(msg->kqm_cksum,
+                                                  payload_offset, payload_nob,
+                                                  payload_niov, payload_iov);
+
+                if (*kqswnal_tunables.kqn_inject_csum_error == 1) {
+                        msg->kqm_cksum++;
+                        *kqswnal_tunables.kqn_inject_csum_error = 0;
+                }
+#endif
+                nob += payload_nob;
         }
         
-        ktx->ktx_port = (payload_nob <= KQSW_SMALLPAYLOAD) ?
+        ktx->ktx_port = (nob <= KQSW_SMALLMSG) ?
                         EP_MSG_SVC_PORTALS_SMALL : EP_MSG_SVC_PORTALS_LARGE;
 
         rc = kqswnal_launch (ktx);
 
  out:
-        CDEBUG(rc == 0 ? D_NET : D_ERROR, 
-               "%s "LPSZ" bytes to "LPX64" via "LPX64": rc %d\n", 
-               rc == 0 ? "Sent" : "Failed to send",
-               payload_nob, nid, targetnid, rc);
+        CDEBUG(rc == 0 ? D_NET : D_NETERROR, "%s %d bytes to %s%s: rc %d\n",
+               routing ? (rc == 0 ? "Routed" : "Failed to route") :
+                         (rc == 0 ? "Sent" : "Failed to send"),
+               nob, libcfs_nid2str(target.nid),
+               target_is_router ? "(router)" : "", rc);
 
         if (rc != 0) {
-                if (ktx->ktx_state == KTX_GETTING &&
-                    ktx->ktx_args[2] != NULL) {
+                lnet_msg_t *repmsg = (lnet_msg_t *)ktx->ktx_args[2];
+                int         state = ktx->ktx_state;
+                
+                kqswnal_put_idle_tx (ktx);
+
+                if (state == KTX_GETTING && repmsg != NULL) {
                         /* We committed to reply, but there was a problem
                          * launching the GET.  We can't avoid delivering a
                          * REPLY event since we committed above, so we
                          * pretend the GET succeeded but the REPLY
                          * failed. */
                         rc = 0;
-                        lib_finalize (&kqswnal_lib, private, libmsg, PTL_OK);
-                        lib_finalize (&kqswnal_lib, private,
-                                      (lib_msg_t *)ktx->ktx_args[2], PTL_FAIL);
+                        lnet_finalize (kqswnal_data.kqn_ni, lntmsg, 0);
+                        lnet_finalize (kqswnal_data.kqn_ni, repmsg, -EIO);
                 }
                 
-                kqswnal_put_idle_tx (ktx);
         }
         
         atomic_dec(&kqswnal_data.kqn_pending_txs);
-        return (rc == 0 ? PTL_OK : PTL_FAIL);
-}
-
-static ptl_err_t
-kqswnal_send (lib_nal_t    *nal,
-              void         *private,
-              lib_msg_t    *libmsg,
-              ptl_hdr_t    *hdr,
-              int           type,
-              ptl_nid_t     nid,
-              ptl_pid_t     pid,
-              unsigned int  payload_niov,
-              struct iovec *payload_iov,
-              size_t        payload_offset,
-              size_t        payload_nob)
-{
-        return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid,
-                                 payload_niov, payload_iov, NULL, 
-                                 payload_offset, payload_nob));
-}
-
-static ptl_err_t
-kqswnal_send_pages (lib_nal_t    *nal,
-                    void         *private,
-                    lib_msg_t    *libmsg,
-                    ptl_hdr_t    *hdr,
-                    int           type,
-                    ptl_nid_t     nid,
-                    ptl_pid_t     pid,
-                    unsigned int  payload_niov,
-                    ptl_kiov_t   *payload_kiov,
-                    size_t        payload_offset,
-                    size_t        payload_nob)
-{
-        return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid,
-                                 payload_niov, NULL, payload_kiov, 
-                                 payload_offset, payload_nob));
-}
-
-void
-kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
-{
-        int             rc;
-        kqswnal_tx_t   *ktx;
-        ptl_kiov_t     *kiov = fwd->kprfd_kiov;
-        int             niov = fwd->kprfd_niov;
-        int             nob = fwd->kprfd_nob;
-        ptl_nid_t       nid = fwd->kprfd_gateway_nid;
-
-#if KQSW_CHECKSUM
-        CERROR ("checksums for forwarded packets not implemented\n");
-        LBUG ();
-#endif
-        /* The router wants this NAL to forward a packet */
-        CDEBUG (D_NET, "forwarding [%p] to "LPX64", payload: %d frags %d bytes\n",
-                fwd, nid, niov, nob);
-
-        ktx = kqswnal_get_idle_tx (fwd, 0);
-        if (ktx == NULL)        /* can't get txd right now */
-                return;         /* fwd will be scheduled when tx desc freed */
-
-        if (nid == kqswnal_lib.libnal_ni.ni_pid.nid) /* gateway is me */
-                nid = fwd->kprfd_target_nid;    /* target is final dest */
-
-        /* copy hdr into pre-mapped buffer */
-        memcpy(ktx->ktx_buffer, fwd->kprfd_hdr, sizeof(ptl_hdr_t));
-
-        ktx->ktx_port    = (nob <= KQSW_SMALLPAYLOAD) ?
-                           EP_MSG_SVC_PORTALS_SMALL : EP_MSG_SVC_PORTALS_LARGE;
-        ktx->ktx_nid     = nid;
-        ktx->ktx_state   = KTX_FORWARDING;
-        ktx->ktx_args[0] = fwd;
-        ktx->ktx_nfrag   = ktx->ktx_firsttmpfrag = 1;
-
-        if (kqswnal_nid2elanid (nid) < 0) {
-                CERROR("Can't forward [%p] to "LPX64": not a peer\n", fwd, nid);
-                rc = -EHOSTUNREACH;
-                goto out;
-        }
-
-        if (nob <= KQSW_TX_MAXCONTIG) 
-        {
-                /* send payload from ktx's pre-mapped contiguous buffer */
-#if MULTIRAIL_EKC
-                ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer,
-                              0, KQSW_HDR_SIZE + nob);
-#else
-                ktx->ktx_frags[0].Base = ktx->ktx_ebuffer;
-                ktx->ktx_frags[0].Len = KQSW_HDR_SIZE + nob;
-#endif
-                if (nob > 0)
-                        lib_copy_kiov2buf(ktx->ktx_buffer + KQSW_HDR_SIZE,
-                                          niov, kiov, 0, nob);
-        }
-        else
-        {
-                /* zero copy payload */
-#if MULTIRAIL_EKC
-                ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer,
-                              0, KQSW_HDR_SIZE);
-#else
-                ktx->ktx_frags[0].Base = ktx->ktx_ebuffer;
-                ktx->ktx_frags[0].Len = KQSW_HDR_SIZE;
-#endif
-                rc = kqswnal_map_tx_kiov (ktx, 0, nob, niov, kiov);
-                if (rc != 0)
-                        goto out;
-        }
-
-        rc = kqswnal_launch (ktx);
- out:
-        if (rc != 0) {
-                CERROR ("Failed to forward [%p] to "LPX64": %d\n", fwd, nid, rc);
-
-                /* complete now (with failure) */
-                kqswnal_tx_done (ktx, rc);
-        }
-
-        atomic_dec(&kqswnal_data.kqn_pending_txs);
-}
-
-void
-kqswnal_fwd_callback (void *arg, int error)
-{
-        kqswnal_rx_t *krx = (kqswnal_rx_t *)arg;
-
-        /* The router has finished forwarding this packet */
-
-        if (error != 0)
-        {
-                ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_kiov[0].kiov_page);
-
-                CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n",
-                       le64_to_cpu(hdr->src_nid), le64_to_cpu(hdr->dest_nid),error);
-        }
-
-        LASSERT (atomic_read(&krx->krx_refcount) == 1);
-        kqswnal_rx_decref (krx);
+        return (rc == 0 ? 0 : -EIO);
 }
 
 void
@@ -1425,7 +1289,6 @@ kqswnal_requeue_rx (kqswnal_rx_t *krx)
 
         krx->krx_state = KRX_POSTED;
 
-#if MULTIRAIL_EKC
         if (kqswnal_data.kqn_shuttingdown) {
                 /* free EKC rxd on shutdown */
                 ep_complete_receive(krx->krx_rxd);
@@ -1435,26 +1298,6 @@ kqswnal_requeue_rx (kqswnal_rx_t *krx)
                                    kqswnal_rxhandler, krx,
                                    &krx->krx_elanbuffer, 0);
         }
-#else                
-        if (kqswnal_data.kqn_shuttingdown)
-                return;
-
-        if (krx->krx_rxd == NULL) {
-                /* We had a failed ep_complete_rpc() which nukes the
-                 * descriptor in "old" EKC */
-                int eprc = ep_queue_receive(krx->krx_eprx, 
-                                            kqswnal_rxhandler, krx,
-                                            krx->krx_elanbuffer, 
-                                            krx->krx_npages * PAGE_SIZE, 0);
-                LASSERT (eprc == EP_SUCCESS);
-                /* We don't handle failure here; it's incredibly rare
-                 * (never reported?) and only happens with "old" EKC */
-        } else {
-                ep_requeue_receive(krx->krx_rxd, kqswnal_rxhandler, krx,
-                                   krx->krx_elanbuffer, 
-                                   krx->krx_npages * PAGE_SIZE);
-        }
-#endif
 }
 
 void
@@ -1477,33 +1320,23 @@ void
 kqswnal_rx_done (kqswnal_rx_t *krx) 
 {
         int           rc;
-        EP_STATUSBLK *sblk;
 
         LASSERT (atomic_read(&krx->krx_refcount) == 0);
 
         if (krx->krx_rpc_reply_needed) {
                 /* We've not completed the peer's RPC yet... */
-                sblk = (krx->krx_rpc_reply_status == 0) ? 
-                       &kqswnal_data.kqn_rpc_success : 
-                       &kqswnal_data.kqn_rpc_failed;
+                krx->krx_rpc_reply.msg.magic   = LNET_PROTO_QSW_MAGIC;
+                krx->krx_rpc_reply.msg.version = QSWLND_PROTO_VERSION;
 
                 LASSERT (!in_interrupt());
-#if MULTIRAIL_EKC
-                rc = ep_complete_rpc(krx->krx_rxd, 
-                                     kqswnal_rpc_complete, krx,
-                                     sblk, NULL, NULL, 0);
-                if (rc == EP_SUCCESS)
-                        return;
-#else
+
                 rc = ep_complete_rpc(krx->krx_rxd, 
                                      kqswnal_rpc_complete, krx,
-                                     sblk, NULL, 0);
+                                     &krx->krx_rpc_reply.ep_statusblk, 
+                                     NULL, NULL, 0);
                 if (rc == EP_SUCCESS)
                         return;
 
-                /* "old" EKC destroys rxd on failed completion */
-                krx->krx_rxd = NULL;
-#endif
                 CERROR("can't complete RPC: %d\n", rc);
                 krx->krx_rpc_reply_needed = 0;
         }
@@ -1514,60 +1347,199 @@ kqswnal_rx_done (kqswnal_rx_t *krx)
 void
 kqswnal_parse (kqswnal_rx_t *krx)
 {
-        ptl_hdr_t      *hdr = (ptl_hdr_t *) page_address(krx->krx_kiov[0].kiov_page);
-        ptl_nid_t       dest_nid = le64_to_cpu(hdr->dest_nid);
-        int             payload_nob;
+        lnet_ni_t      *ni = kqswnal_data.kqn_ni;
+        kqswnal_msg_t  *msg = (kqswnal_msg_t *)page_address(krx->krx_kiov[0].kiov_page);
+        lnet_nid_t      fromnid = kqswnal_rx_nid(krx);
+        int             swab;
+        int             n;
+        int             i;
         int             nob;
-        int             niov;
+        int             rc;
 
         LASSERT (atomic_read(&krx->krx_refcount) == 1);
 
-        if (dest_nid == kqswnal_lib.libnal_ni.ni_pid.nid) { /* It's for me :) */
-                /* I ignore parse errors since I'm not consuming a byte
-                 * stream */
-                (void)lib_parse (&kqswnal_lib, hdr, krx);
-
-                /* Drop my ref; any RDMA activity takes an additional ref */
-                kqswnal_rx_decref(krx);
-                return;
+        /* If ln_ptlcompat is set, peers may send me an "old" unencapsulated
+         * lnet hdr */
+        LASSERT (offsetof(kqswnal_msg_t, kqm_u) <= sizeof(lnet_hdr_t));
+        
+        if (krx->krx_nob < offsetof(kqswnal_msg_t, kqm_u)) {
+                CERROR("Short message %d received from %s\n",
+                       krx->krx_nob, libcfs_nid2str(fromnid));
+                goto done;
         }
 
-#if KQSW_CHECKSUM
-        LASSERTF (0, "checksums for forwarded packets not implemented\n");
+        swab = msg->kqm_magic == __swab32(LNET_PROTO_QSW_MAGIC);
+
+        if (swab || msg->kqm_magic == LNET_PROTO_QSW_MAGIC) {
+#if KQSW_CKSUM
+                __u32 csum0;
+                __u32 csum1;
+
+                /* csum byte array before swab */
+                csum1 = msg->kqm_cksum;
+                msg->kqm_cksum = 0;
+                csum0 = kqswnal_csum_kiov(~0, 0, krx->krx_nob,
+                                          krx->krx_npages, krx->krx_kiov);
+                msg->kqm_cksum = csum1;
 #endif
 
-        if (kqswnal_nid2elanid (dest_nid) >= 0)  /* should have gone direct to peer */
-        {
-                CERROR("dropping packet from "LPX64" for "LPX64
-                       ": target is peer\n", le64_to_cpu(hdr->src_nid), dest_nid);
+                if (swab) {
+                        __swab16s(&msg->kqm_version);
+                        __swab16s(&msg->kqm_type);
+#if KQSW_CKSUM
+                        __swab32s(&msg->kqm_cksum);
+                        __swab32s(&msg->kqm_nob);
+#endif
+                }
 
-                kqswnal_rx_decref (krx);
-                return;
+                if (msg->kqm_version != QSWLND_PROTO_VERSION) {
+                        /* Future protocol version compatibility support!
+                         * The next qswlnd-specific protocol rev will first
+                         * send an RPC to check version.
+                         * 1.4.6 and 1.4.7.early reply with a status
+                         * block containing its current version.
+                         * Later versions send a failure (-ve) status +
+                         * magic/version */
+
+                        if (!krx->krx_rpc_reply_needed) {
+                                CERROR("Unexpected version %d from %s\n",
+                                       msg->kqm_version, libcfs_nid2str(fromnid));
+                                goto done;
+                        }
+
+                        LASSERT (krx->krx_rpc_reply.msg.status == -EPROTO);
+                        goto done;
+                }
+
+                switch (msg->kqm_type) {
+                default:
+                        CERROR("Bad request type %x from %s\n",
+                               msg->kqm_type, libcfs_nid2str(fromnid));
+                        goto done;
+
+                case QSWLND_MSG_IMMEDIATE:
+                        if (krx->krx_rpc_reply_needed) {
+                                /* Should have been a simple message */
+                                CERROR("IMMEDIATE sent as RPC from %s\n",
+                                       libcfs_nid2str(fromnid));
+                                goto done;
+                        }
+
+                        nob = offsetof(kqswnal_msg_t, kqm_u.immediate.kqim_payload);
+                        if (krx->krx_nob < nob) {
+                                CERROR("Short IMMEDIATE %d(%d) from %s\n",
+                                       krx->krx_nob, nob, libcfs_nid2str(fromnid));
+                                goto done;
+                        }
+
+#if KQSW_CKSUM
+                        if (csum0 != msg->kqm_cksum) {
+                                CERROR("Bad IMMEDIATE checksum %08x(%08x) from %s\n",
+                                       csum0, msg->kqm_cksum, libcfs_nid2str(fromnid));
+                                CERROR("nob %d (%d)\n", krx->krx_nob, msg->kqm_nob);
+                                goto done;
+                        }
+#endif
+                        rc = lnet_parse(ni, &msg->kqm_u.immediate.kqim_hdr,
+                                        fromnid, krx, 0);
+                        if (rc < 0)
+                                goto done;
+                        return;
+
+                case QSWLND_MSG_RDMA:
+                        if (!krx->krx_rpc_reply_needed) {
+                                /* Should have been a simple message */
+                                CERROR("RDMA sent as simple message from %s\n",
+                                       libcfs_nid2str(fromnid));
+                                goto done;
+                        }
+
+                        nob = offsetof(kqswnal_msg_t,
+                                       kqm_u.rdma.kqrm_rmd.kqrmd_frag[0]);
+                        if (krx->krx_nob < nob) {
+                                CERROR("Short RDMA message %d(%d) from %s\n",
+                                       krx->krx_nob, nob, libcfs_nid2str(fromnid));
+                                goto done;
+                        }
+
+                        if (swab)
+                                __swab32s(&msg->kqm_u.rdma.kqrm_rmd.kqrmd_nfrag);
+
+                        n = msg->kqm_u.rdma.kqrm_rmd.kqrmd_nfrag;
+                        nob = offsetof(kqswnal_msg_t,
+                                       kqm_u.rdma.kqrm_rmd.kqrmd_frag[n]);
+
+                        if (krx->krx_nob < nob) {
+                                CERROR("short RDMA message %d(%d) from %s\n",
+                                       krx->krx_nob, nob, libcfs_nid2str(fromnid));
+                                goto done;
+                        }
+
+                        if (swab) {
+                                for (i = 0; i < n; i++) {
+                                        EP_NMD *nmd = &msg->kqm_u.rdma.kqrm_rmd.kqrmd_frag[i];
+
+                                        __swab32s(&nmd->nmd_addr);
+                                        __swab32s(&nmd->nmd_len);
+                                        __swab32s(&nmd->nmd_attr);
+                                }
+                        }
+
+#if KQSW_CKSUM
+                        krx->krx_cksum = csum0; /* stash checksum so far */
+#endif
+                        rc = lnet_parse(ni, &msg->kqm_u.rdma.kqrm_hdr,
+                                        fromnid, krx, 1);
+                        if (rc < 0)
+                                goto done;
+                        return;
+                }
+                /* Not Reached */
         }
 
-        nob = payload_nob = krx->krx_nob - KQSW_HDR_SIZE;
-        niov = 0;
-        if (nob > 0) {
-                krx->krx_kiov[0].kiov_offset = KQSW_HDR_SIZE;
-                krx->krx_kiov[0].kiov_len = MIN(PAGE_SIZE - KQSW_HDR_SIZE, nob);
-                niov = 1;
-                nob -= PAGE_SIZE - KQSW_HDR_SIZE;
-                
-                while (nob > 0) {
-                        LASSERT (niov < krx->krx_npages);
-                        
-                        krx->krx_kiov[niov].kiov_offset = 0;
-                        krx->krx_kiov[niov].kiov_len = MIN(PAGE_SIZE, nob);
-                        niov++;
-                        nob -= PAGE_SIZE;
+        if (msg->kqm_magic == LNET_PROTO_MAGIC ||
+            msg->kqm_magic == __swab32(LNET_PROTO_MAGIC)) {
+                /* Future protocol version compatibility support!
+                 * When LNET unifies protocols over all LNDs, the first thing a
+                 * peer will send will be a version query RPC.  
+                 * 1.4.6 and 1.4.7.early reply with a status block containing
+                 * LNET_PROTO_QSW_MAGIC..
+                 * Later versions send a failure (-ve) status +
+                 * magic/version */
+
+                if (!krx->krx_rpc_reply_needed) {
+                        CERROR("Unexpected magic %08x from %s\n",
+                               msg->kqm_magic, libcfs_nid2str(fromnid));
+                        goto done;
                 }
+
+                LASSERT (krx->krx_rpc_reply.msg.status == -EPROTO);
+                goto done;
         }
 
-        kpr_fwd_init (&krx->krx_fwd, dest_nid, 
-                      hdr, payload_nob, niov, krx->krx_kiov,
-                      kqswnal_fwd_callback, krx);
+        if (the_lnet.ln_ptlcompat != 0) {
+                /* Portals compatibility (strong or weak)
+                 * This could be an unencapsulated LNET header.  If it's big
+                 * enough, let LNET's parser sort it out */
+
+                if (krx->krx_nob < sizeof(lnet_hdr_t)) {
+                        CERROR("Short portals-compatible message from %s\n",
+                               libcfs_nid2str(fromnid));
+                        goto done;
+                }
+
+                krx->krx_raw_lnet_hdr = 1;
+                rc = lnet_parse(ni, (lnet_hdr_t *)msg,
+                                fromnid, krx, krx->krx_rpc_reply_needed);
+                if (rc < 0)
+                        goto done;
+                return;
+        }
 
-        kpr_fwd_start (&kqswnal_data.kqn_router, &krx->krx_fwd);
+        CERROR("Unrecognised magic %08x from %s\n",
+               msg->kqm_magic, libcfs_nid2str(fromnid));
+ done:
+        kqswnal_rx_decref(krx);
 }
 
 /* Receive Interrupt Handler: posts to schedulers */
@@ -1578,7 +1550,6 @@ kqswnal_rxhandler(EP_RXD *rxd)
         int           nob    = ep_rxd_len (rxd);
         int           status = ep_rxd_status (rxd);
         kqswnal_rx_t *krx    = (kqswnal_rx_t *)ep_rxd_arg (rxd);
-
         CDEBUG(D_NET, "kqswnal_rxhandler: rxd %p, krx %p, nob %d, status %d\n",
                rxd, krx, nob, status);
 
@@ -1588,6 +1559,7 @@ kqswnal_rxhandler(EP_RXD *rxd)
         krx->krx_state = KRX_PARSE;
         krx->krx_rxd = rxd;
         krx->krx_nob = nob;
+        krx->krx_raw_lnet_hdr = 0;
 
         /* RPC reply iff rpc request received without error */
         krx->krx_rpc_reply_needed = ep_rxd_isrpc(rxd) &&
@@ -1595,24 +1567,16 @@ kqswnal_rxhandler(EP_RXD *rxd)
                                      status == EP_MSG_TOO_BIG);
 
         /* Default to failure if an RPC reply is requested but not handled */
-        krx->krx_rpc_reply_status = -EPROTO;
+        krx->krx_rpc_reply.msg.status = -EPROTO;
         atomic_set (&krx->krx_refcount, 1);
 
-        /* must receive a whole header to be able to parse */
-        if (status != EP_SUCCESS || nob < sizeof (ptl_hdr_t))
-        {
+        if (status != EP_SUCCESS) {
                 /* receives complete with failure when receiver is removed */
-#if MULTIRAIL_EKC
                 if (status == EP_SHUTDOWN)
                         LASSERT (kqswnal_data.kqn_shuttingdown);
                 else
                         CERROR("receive status failed with status %d nob %d\n",
                                ep_rxd_status(rxd), nob);
-#else
-                if (!kqswnal_data.kqn_shuttingdown)
-                        CERROR("receive status failed with status %d nob %d\n",
-                               ep_rxd_status(rxd), nob);
-#endif
                 kqswnal_rx_decref(krx);
                 return;
         }
@@ -1630,249 +1594,124 @@ kqswnal_rxhandler(EP_RXD *rxd)
         spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
 }
 
-#if KQSW_CHECKSUM
-void
-kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr)
-{
-        ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_kiov[0].kiov_page);
-
-        CERROR ("%s checksum mismatch %p: dnid "LPX64", snid "LPX64
-                ", dpid %d, spid %d, type %d\n",
-                ishdr ? "Header" : "Payload", krx,
-                le64_to_cpu(hdr->dest_nid), le64_to_cpu(hdr->src_nid)
-                le32_to_cpu(hdr->dest_pid), le32_to_cpu(hdr->src_pid),
-                le32_to_cpu(hdr->type));
-
-        switch (le32_to_cpu(hdr->type))
-        {
-        case PTL_MSG_ACK:
-                CERROR("ACK: mlen %d dmd "LPX64"."LPX64" match "LPX64
-                       " len %u\n",
-                       le32_to_cpu(hdr->msg.ack.mlength),
-                       hdr->msg.ack.dst_wmd.handle_cookie,
-                       hdr->msg.ack.dst_wmd.handle_idx,
-                       le64_to_cpu(hdr->msg.ack.match_bits),
-                       le32_to_cpu(hdr->msg.ack.length));
-                break;
-        case PTL_MSG_PUT:
-                CERROR("PUT: ptl %d amd "LPX64"."LPX64" match "LPX64
-                       " len %u off %u data "LPX64"\n",
-                       le32_to_cpu(hdr->msg.put.ptl_index),
-                       hdr->msg.put.ack_wmd.handle_cookie,
-                       hdr->msg.put.ack_wmd.handle_idx,
-                       le64_to_cpu(hdr->msg.put.match_bits),
-                       le32_to_cpu(hdr->msg.put.length),
-                       le32_to_cpu(hdr->msg.put.offset),
-                       hdr->msg.put.hdr_data);
-                break;
-        case PTL_MSG_GET:
-                CERROR ("GET: <>\n");
-                break;
-        case PTL_MSG_REPLY:
-                CERROR ("REPLY: <>\n");
-                break;
-        default:
-                CERROR ("TYPE?: <>\n");
-        }
-}
-#endif
-
-static ptl_err_t
-kqswnal_recvmsg (lib_nal_t    *nal,
-                 void         *private,
-                 lib_msg_t    *libmsg,
-                 unsigned int  niov,
-                 struct iovec *iov,
-                 ptl_kiov_t   *kiov,
-                 size_t        offset,
-                 size_t        mlen,
-                 size_t        rlen)
+int
+kqswnal_recv (lnet_ni_t     *ni,
+              void          *private,
+              lnet_msg_t    *lntmsg,
+              int            delayed,
+              unsigned int   niov,
+              struct iovec  *iov,
+              lnet_kiov_t   *kiov,
+              unsigned int   offset,
+              unsigned int   mlen,
+              unsigned int   rlen)
 {
-        kqswnal_rx_t *krx = (kqswnal_rx_t *)private;
-        char         *buffer = page_address(krx->krx_kiov[0].kiov_page);
-        ptl_hdr_t    *hdr = (ptl_hdr_t *)buffer;
-        int           page;
-        char         *page_ptr;
-        int           page_nob;
-        char         *iov_ptr;
-        int           iov_nob;
-        int           frag;
-        int           rc;
-#if KQSW_CHECKSUM
-        kqsw_csum_t   senders_csum;
-        kqsw_csum_t   payload_csum = 0;
-        kqsw_csum_t   hdr_csum = kqsw_csum(0, hdr, sizeof(*hdr));
-        size_t        csum_len = mlen;
-        int           csum_frags = 0;
-        int           csum_nob = 0;
-        static atomic_t csum_counter;
-        int           csum_verbose = (atomic_read(&csum_counter)%1000001) == 0;
-
-        atomic_inc (&csum_counter);
-
-        memcpy (&senders_csum, buffer + sizeof (ptl_hdr_t), sizeof (kqsw_csum_t));
-        if (senders_csum != hdr_csum)
-                kqswnal_csum_error (krx, 1);
-#endif
-        /* NB lib_parse() has already flipped *hdr */
-
-        CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen);
-
-        if (libmsg == NULL) {                   /* portals is discarding. */
-                LASSERT (mlen == 0);
-                return PTL_OK;                  /* ignored by caller! */
-        }
-        
-        if (krx->krx_rpc_reply_needed &&
-            hdr->type == PTL_MSG_PUT) {
-                /* This must be an optimized PUT */
-                rc = kqswnal_rdma (krx, libmsg, PTL_MSG_PUT,
-                                   niov, iov, kiov, offset, mlen);
-                return (rc == 0 ? PTL_OK : PTL_FAIL);
-        }
-
-        /* What was actually received must be >= payload. */
-        LASSERT (mlen <= rlen);
-        if (krx->krx_nob < KQSW_HDR_SIZE + mlen) {
-                CERROR("Bad message size: have %d, need %d + %d\n",
-                       krx->krx_nob, (int)KQSW_HDR_SIZE, (int)mlen);
-                return (PTL_FAIL);
-        }
+        kqswnal_rx_t       *krx = (kqswnal_rx_t *)private;
+        lnet_nid_t          fromnid;
+        kqswnal_msg_t      *msg;
+        lnet_hdr_t         *hdr;
+        kqswnal_remotemd_t *rmd;
+        int                 msg_offset;
+        int                 rc;
 
-        /* It must be OK to kmap() if required */
-        LASSERT (kiov == NULL || !in_interrupt ());
+        LASSERT (!in_interrupt ());             /* OK to map */
         /* Either all pages or all vaddrs */
         LASSERT (!(kiov != NULL && iov != NULL));
 
-        if (mlen != 0) {
-                page     = 0;
-                page_ptr = buffer + KQSW_HDR_SIZE;
-                page_nob = PAGE_SIZE - KQSW_HDR_SIZE;
-
-                LASSERT (niov > 0);
+        fromnid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ep_rxd_node(krx->krx_rxd));
+        msg = (kqswnal_msg_t *)page_address(krx->krx_kiov[0].kiov_page);
 
-                if (kiov != NULL) {
-                        /* skip complete frags */
-                        while (offset >= kiov->kiov_len) {
-                                offset -= kiov->kiov_len;
-                                kiov++;
-                                niov--;
-                                LASSERT (niov > 0);
-                        }
-                        iov_ptr = ((char *)kmap (kiov->kiov_page)) +
-                                kiov->kiov_offset + offset;
-                        iov_nob = kiov->kiov_len - offset;
+        if (krx->krx_rpc_reply_needed) {
+                /* optimized (rdma) request sent as RPC */
+
+                if (krx->krx_raw_lnet_hdr) {
+                        LASSERT (the_lnet.ln_ptlcompat != 0);
+                        hdr = (lnet_hdr_t *)msg;
+                        rmd = kqswnal_get_portalscompat_rmd(krx);
+                        if (rmd == NULL)
+                                return (-EPROTO);
                 } else {
-                        /* skip complete frags */
-                        while (offset >= iov->iov_len) {
-                                offset -= iov->iov_len;
-                                iov++;
-                                niov--;
-                                LASSERT (niov > 0);
-                        }
-                        iov_ptr = iov->iov_base + offset;
-                        iov_nob = iov->iov_len - offset;
+                        LASSERT (msg->kqm_type == QSWLND_MSG_RDMA);
+                        hdr = &msg->kqm_u.rdma.kqrm_hdr;
+                        rmd = &msg->kqm_u.rdma.kqrm_rmd;
                 }
-                
-                for (;;)
-                {
-                        frag = mlen;
-                        if (frag > page_nob)
-                                frag = page_nob;
-                        if (frag > iov_nob)
-                                frag = iov_nob;
-
-                        memcpy (iov_ptr, page_ptr, frag);
-#if KQSW_CHECKSUM
-                        payload_csum = kqsw_csum (payload_csum, iov_ptr, frag);
-                        csum_nob += frag;
-                        csum_frags++;
-#endif
-                        mlen -= frag;
-                        if (mlen == 0)
+
+                /* NB header is still in wire byte order */
+
+                switch (le32_to_cpu(hdr->type)) {
+                        case LNET_MSG_PUT:
+                        case LNET_MSG_REPLY:
+                                /* This is an optimized PUT/REPLY */
+                                rc = kqswnal_rdma(krx, lntmsg, 
+                                                  KTX_RDMA_FETCH, rmd,
+                                                  niov, iov, kiov, offset, mlen);
                                 break;
 
-                        page_nob -= frag;
-                        if (page_nob != 0)
-                                page_ptr += frag;
-                        else
-                        {
-                                page++;
-                                LASSERT (page < krx->krx_npages);
-                                page_ptr = page_address(krx->krx_kiov[page].kiov_page);
-                                page_nob = PAGE_SIZE;
-                        }
+                        case LNET_MSG_GET:
+#if KQSW_CKSUM
+                                if (krx->krx_cksum != msg->kqm_cksum) {
+                                        CERROR("Bad GET checksum %08x(%08x) from %s\n",
+                                               krx->krx_cksum, msg->kqm_cksum,
+                                               libcfs_nid2str(fromnid));
+                                        rc = -EIO;
+                                        break;
+                                }
+#endif                                
+                                if (lntmsg == NULL) {
+                                        /* No buffer match: my decref will
+                                         * complete the RPC with failure */
+                                        rc = 0;
+                                } else {
+                                        /* Matched something! */
+                                        rc = kqswnal_rdma(krx, lntmsg,
+                                                          KTX_RDMA_STORE, rmd,
+                                                          lntmsg->msg_niov,
+                                                          lntmsg->msg_iov,
+                                                          lntmsg->msg_kiov,
+                                                          lntmsg->msg_offset,
+                                                          lntmsg->msg_len);
+                                }
+                                break;
 
-                        iov_nob -= frag;
-                        if (iov_nob != 0)
-                                iov_ptr += frag;
-                        else if (kiov != NULL) {
-                                kunmap (kiov->kiov_page);
-                                kiov++;
-                                niov--;
-                                LASSERT (niov > 0);
-                                iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset;
-                                iov_nob = kiov->kiov_len;
-                        } else {
-                                iov++;
-                                niov--;
-                                LASSERT (niov > 0);
-                                iov_ptr = iov->iov_base;
-                                iov_nob = iov->iov_len;
-                        }
+                        default:
+                                CERROR("Bad RPC type %d\n",
+                                       le32_to_cpu(hdr->type));
+                                rc = -EPROTO;
+                                break;
                 }
 
-                if (kiov != NULL)
-                        kunmap (kiov->kiov_page);
+                kqswnal_rx_decref(krx);
+                return rc;
         }
 
-#if KQSW_CHECKSUM
-        memcpy (&senders_csum, buffer + sizeof(ptl_hdr_t) + sizeof(kqsw_csum_t), 
-                sizeof(kqsw_csum_t));
-
-        if (csum_len != rlen)
-                CERROR("Unable to checksum data in user's buffer\n");
-        else if (senders_csum != payload_csum)
-                kqswnal_csum_error (krx, 0);
-
-        if (csum_verbose)
-                CERROR("hdr csum %lx, payload_csum %lx, csum_frags %d, "
-                       "csum_nob %d\n",
-                        hdr_csum, payload_csum, csum_frags, csum_nob);
-#endif
-        lib_finalize(nal, private, libmsg, PTL_OK);
-
-        return (PTL_OK);
-}
+        if (krx->krx_raw_lnet_hdr) {
+                LASSERT (the_lnet.ln_ptlcompat != 0);
+                msg_offset = sizeof(lnet_hdr_t);
+        } else {
+                LASSERT (msg->kqm_type == QSWLND_MSG_IMMEDIATE);
+                msg_offset = offsetof(kqswnal_msg_t, kqm_u.immediate.kqim_payload);
+        }
+        
+        if (krx->krx_nob < msg_offset + rlen) {
+                CERROR("Bad message size from %s: have %d, need %d + %d\n",
+                       libcfs_nid2str(fromnid), krx->krx_nob,
+                       msg_offset, rlen);
+                kqswnal_rx_decref(krx);
+                return -EPROTO;
+        }
 
-static ptl_err_t
-kqswnal_recv(lib_nal_t    *nal,
-             void         *private,
-             lib_msg_t    *libmsg,
-             unsigned int  niov,
-             struct iovec *iov,
-             size_t        offset,
-             size_t        mlen,
-             size_t        rlen)
-{
-        return (kqswnal_recvmsg(nal, private, libmsg, 
-                                niov, iov, NULL, 
-                                offset, mlen, rlen));
-}
+        if (kiov != NULL)
+                lnet_copy_kiov2kiov(niov, kiov, offset,
+                                    krx->krx_npages, krx->krx_kiov, 
+                                    msg_offset, mlen);
+        else
+                lnet_copy_kiov2iov(niov, iov, offset,
+                                   krx->krx_npages, krx->krx_kiov, 
+                                   msg_offset, mlen);
 
-static ptl_err_t
-kqswnal_recv_pages (lib_nal_t    *nal,
-                    void         *private,
-                    lib_msg_t    *libmsg,
-                    unsigned int  niov,
-                    ptl_kiov_t   *kiov,
-                    size_t        offset,
-                    size_t        mlen,
-                    size_t        rlen)
-{
-        return (kqswnal_recvmsg(nal, private, libmsg, 
-                                niov, NULL, kiov, 
-                                offset, mlen, rlen));
+        lnet_finalize(ni, lntmsg, 0);
+        kqswnal_rx_decref(krx);
+        return 0;
 }
 
 int
@@ -1898,14 +1737,13 @@ kqswnal_scheduler (void *arg)
 {
         kqswnal_rx_t    *krx;
         kqswnal_tx_t    *ktx;
-        kpr_fwd_desc_t  *fwd;
         unsigned long    flags;
         int              rc;
         int              counter = 0;
         int              did_something;
 
-        kportal_daemonize ("kqswnal_sched");
-        kportal_blockallsigs ();
+        cfs_daemonize ("kqswnal_sched");
+        cfs_block_allsigs ();
         
         spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
 
@@ -1921,49 +1759,42 @@ kqswnal_scheduler (void *arg)
                         spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
                                                flags);
 
-                        switch (krx->krx_state) {
-                        case KRX_PARSE:
-                                kqswnal_parse (krx);
-                                break;
-                        case KRX_COMPLETING:
-                                kqswnal_rx_decref (krx);
-                                break;
-                        default:
-                                LBUG();
-                        }
+                        LASSERT (krx->krx_state == KRX_PARSE);
+                        kqswnal_parse (krx);
 
                         did_something = 1;
                         spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags);
                 }
 
-                if (!list_empty (&kqswnal_data.kqn_delayedtxds))
+                if (!list_empty (&kqswnal_data.kqn_donetxds))
                 {
-                        ktx = list_entry(kqswnal_data.kqn_delayedtxds.next,
-                                         kqswnal_tx_t, ktx_delayed_list);
-                        list_del_init (&ktx->ktx_delayed_list);
+                        ktx = list_entry(kqswnal_data.kqn_donetxds.next,
+                                         kqswnal_tx_t, ktx_schedlist);
+                        list_del_init (&ktx->ktx_schedlist);
                         spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
                                                flags);
 
-                        rc = kqswnal_launch (ktx);
-                        if (rc != 0) {
-                                CERROR("Failed delayed transmit to "LPX64
-                                       ": %d\n", ktx->ktx_nid, rc);
-                                kqswnal_tx_done (ktx, rc);
-                        }
-                        atomic_dec (&kqswnal_data.kqn_pending_txs);
+                        kqswnal_tx_done_in_thread_context(ktx);
 
                         did_something = 1;
                         spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
                 }
 
-                if (!list_empty (&kqswnal_data.kqn_delayedfwds))
+                if (!list_empty (&kqswnal_data.kqn_delayedtxds))
                 {
-                        fwd = list_entry (kqswnal_data.kqn_delayedfwds.next, kpr_fwd_desc_t, kprfd_list);
-                        list_del (&fwd->kprfd_list);
-                        spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+                        ktx = list_entry(kqswnal_data.kqn_delayedtxds.next,
+                                         kqswnal_tx_t, ktx_schedlist);
+                        list_del_init (&ktx->ktx_schedlist);
+                        spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock,
+                                               flags);
 
-                        /* If we're shutting down, this will just requeue fwd on kqn_idletxd_fwdq */
-                        kqswnal_fwd_packet (NULL, fwd);
+                        rc = kqswnal_launch (ktx);
+                        if (rc != 0) {
+                                CERROR("Failed delayed transmit to %s: %d\n", 
+                                       libcfs_nid2str(ktx->ktx_nid), rc);
+                                kqswnal_tx_done (ktx, rc);
+                        }
+                        atomic_dec (&kqswnal_data.kqn_pending_txs);
 
                         did_something = 1;
                         spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
@@ -1982,11 +1813,12 @@ kqswnal_scheduler (void *arg)
                                          * there's nothing left to do */
                                         break;
                                 }
-                                rc = wait_event_interruptible (kqswnal_data.kqn_sched_waitq,
-                                                               kqswnal_data.kqn_shuttingdown == 2 ||
-                                                               !list_empty(&kqswnal_data.kqn_readyrxds) ||
-                                                               !list_empty(&kqswnal_data.kqn_delayedtxds) ||
-                                                               !list_empty(&kqswnal_data.kqn_delayedfwds));
+                                rc = wait_event_interruptible_exclusive (
+                                        kqswnal_data.kqn_sched_waitq,
+                                        kqswnal_data.kqn_shuttingdown == 2 ||
+                                        !list_empty(&kqswnal_data.kqn_readyrxds) ||
+                                        !list_empty(&kqswnal_data.kqn_donetxds) ||
+                                        !list_empty(&kqswnal_data.kqn_delayedtxds));
                                 LASSERT (rc == 0);
                         } else if (need_resched())
                                 schedule ();
@@ -1998,13 +1830,3 @@ kqswnal_scheduler (void *arg)
         kqswnal_thread_fini ();
         return (0);
 }
-
-lib_nal_t kqswnal_lib =
-{
-        libnal_data:       &kqswnal_data,         /* NAL private data */
-        libnal_send:        kqswnal_send,
-        libnal_send_pages:  kqswnal_send_pages,
-        libnal_recv:        kqswnal_recv,
-        libnal_recv_pages:  kqswnal_recv_pages,
-        libnal_dist:        kqswnal_dist
-};
diff --git a/lnet/klnds/qswlnd/qswlnd_modparams.c b/lnet/klnds/qswlnd/qswlnd_modparams.c
new file mode 100644 (file)
index 0000000..62f8924
--- /dev/null
@@ -0,0 +1,149 @@
+/*
+ * Copyright (C) 2002-2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ * This file is part of Portals, http://www.lustre.org
+ *
+ * Portals is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "qswlnd.h"
+
+static int tx_maxcontig = (1<<10);
+CFS_MODULE_PARM(tx_maxcontig, "i", int, 0444,
+               "maximum payload to de-fragment");
+
+static int ntxmsgs = 256;
+CFS_MODULE_PARM(ntxmsgs, "i", int, 0444,
+               "# 'normal' tx msg buffers");
+
+static int credits = 128;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+               "# concurrent sends");
+
+static int peer_credits = 8;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+               "# per-peer concurrent sends");
+
+static int nrxmsgs_large = 64;
+CFS_MODULE_PARM(nrxmsgs_large, "i", int, 0444,
+               "# 'large' rx msg buffers");
+
+static int ep_envelopes_large = 256;
+CFS_MODULE_PARM(ep_envelopes_large, "i", int, 0444,
+               "# 'large' rx msg envelope buffers");
+
+static int nrxmsgs_small = 256;
+CFS_MODULE_PARM(nrxmsgs_small, "i", int, 0444,
+               "# 'small' rx msg buffers");
+
+static int ep_envelopes_small = 2048;
+CFS_MODULE_PARM(ep_envelopes_small, "i", int, 0444,
+               "# 'small' rx msg envelope buffers");
+
+static int optimized_puts = (32<<10);
+CFS_MODULE_PARM(optimized_puts, "i", int, 0644,
+               "zero-copy puts >= this size");
+
+static int optimized_gets = 2048;
+CFS_MODULE_PARM(optimized_gets, "i", int, 0644,
+               "zero-copy gets >= this size");
+
+#if KQSW_CKSUM
+static int inject_csum_error = 0;
+CFS_MODULE_PARM(inject_csum_error, "i", int, 0644,
+               "test checksumming");
+#endif
+
+kqswnal_tunables_t kqswnal_tunables = {
+       .kqn_tx_maxcontig       = &tx_maxcontig,
+       .kqn_ntxmsgs            = &ntxmsgs,
+       .kqn_credits            = &credits,
+       .kqn_peercredits        = &peer_credits,
+       .kqn_nrxmsgs_large      = &nrxmsgs_large,
+       .kqn_ep_envelopes_large = &ep_envelopes_large,
+       .kqn_nrxmsgs_small      = &nrxmsgs_small,
+       .kqn_ep_envelopes_small = &ep_envelopes_small,
+       .kqn_optimized_puts     = &optimized_puts,
+       .kqn_optimized_gets     = &optimized_gets,
+#if KQSW_CKSUM
+       .kqn_inject_csum_error  = &inject_csum_error,
+#endif
+};
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+static ctl_table kqswnal_ctl_table[] = {
+       {1, "tx_maxcontig", &tx_maxcontig, 
+        sizeof (int), 0444, NULL, &proc_dointvec},
+       {2, "ntxmsgs", &ntxmsgs, 
+        sizeof (int), 0444, NULL, &proc_dointvec},
+       {3, "credits", &credits, 
+        sizeof (int), 0444, NULL, &proc_dointvec},
+       {4, "peer_credits", &peer_credits, 
+        sizeof (int), 0444, NULL, &proc_dointvec},
+       {5, "nrxmsgs_large", &nrxmsgs_large, 
+        sizeof (int), 0444, NULL, &proc_dointvec},
+       {6, "ep_envelopes_large", &ep_envelopes_large, 
+        sizeof (int), 0444, NULL, &proc_dointvec},
+       {7, "nrxmsgs_small", &nrxmsgs_small, 
+        sizeof (int), 0444, NULL, &proc_dointvec},
+       {8, "ep_envelopes_small", &ep_envelopes_small, 
+        sizeof (int), 0444, NULL, &proc_dointvec},
+       {9, "optimized_puts", &optimized_puts, 
+        sizeof (int), 0644, NULL, &proc_dointvec},
+       {10, "optimized_gets", &optimized_gets, 
+        sizeof (int), 0644, NULL, &proc_dointvec},
+#if KQSW_CKSUM
+       {11, "inject_csum_error", &inject_csum_error, 
+        sizeof (int), 0644, NULL, &proc_dointvec},
+#endif
+       {0}
+};
+
+static ctl_table kqswnal_top_ctl_table[] = {
+       {201, "qswnal", NULL, 0, 0555, kqswnal_ctl_table},
+       {0}
+};
+
+int
+kqswnal_tunables_init ()
+{
+       kqswnal_tunables.kqn_sysctl =
+               register_sysctl_table(kqswnal_top_ctl_table, 0);
+       
+       if (kqswnal_tunables.kqn_sysctl == NULL)
+               CWARN("Can't setup /proc tunables\n");
+
+       return 0;
+}
+
+void
+kqswnal_tunables_fini ()
+{
+       if (kqswnal_tunables.kqn_sysctl != NULL)
+               unregister_sysctl_table(kqswnal_tunables.kqn_sysctl);
+}
+#else
+int 
+kqswnal_tunables_init ()
+{
+       return 0;
+}
+
+void
+kqswnal_tunables_fini ()
+{
+}
+#endif
index 1772cc2..e1f5e82 100644 (file)
@@ -1,5 +1,5 @@
-MODULES := kranal
-kranal-objs := ranal.o ranal_cb.o
+MODULES := kralnd
+kralnd-objs := ralnd.o ralnd_cb.o ralnd_modparams.o
 
 EXTRA_POST_CFLAGS := @RACPPFLAGS@
 
index 3bb7642..7f3df4c 100644 (file)
@@ -4,12 +4,10 @@
 # See the file COPYING in this distribution
 
 if MODULES
-if !CRAY_PORTALS
-if BUILD_RANAL
-modulenet_DATA = kranal$(KMODEXT)
-endif
+if BUILD_RALND
+modulenet_DATA = kralnd$(KMODEXT)
 endif
 endif
 
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES = $(kranal-objs:%.o=%.c) ranal.h
+MOSTLYCLEANFILES = @MOSTLYCLEANFILES@
+DIST_SOURCES = $(kralnd-objs:%.o=%.c) ralnd.h
index eb13d73..a0a4d93 100644 (file)
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
  */
-#include "ranal.h"
-
-static int        kranal_devids[] = {RAPK_MAIN_DEVICE_ID,
-                                     RAPK_EXPANSION_DEVICE_ID};
-
-nal_t                   kranal_api;
-ptl_handle_ni_t         kranal_ni;
-kra_data_t              kranal_data;
-kra_tunables_t          kranal_tunables;
-
-#define RANAL_SYSCTL_TIMEOUT           1
-#define RANAL_SYSCTL_LISTENER_TIMEOUT  2
-#define RANAL_SYSCTL_BACKLOG           3
-#define RANAL_SYSCTL_PORT              4
-#define RANAL_SYSCTL_MAX_IMMEDIATE     5
-
-#define RANAL_SYSCTL                   202
-
-static ctl_table kranal_ctl_table[] = {
-        {RANAL_SYSCTL_TIMEOUT, "timeout",
-         &kranal_tunables.kra_timeout, sizeof(int),
-         0644, NULL, &proc_dointvec},
-        {RANAL_SYSCTL_LISTENER_TIMEOUT, "listener_timeout",
-         &kranal_tunables.kra_listener_timeout, sizeof(int),
-         0644, NULL, &proc_dointvec},
-        {RANAL_SYSCTL_BACKLOG, "backlog",
-         &kranal_tunables.kra_backlog, sizeof(int),
-         0644, NULL, kranal_listener_procint},
-        {RANAL_SYSCTL_PORT, "port",
-         &kranal_tunables.kra_port, sizeof(int),
-         0644, NULL, kranal_listener_procint},
-        {RANAL_SYSCTL_MAX_IMMEDIATE, "max_immediate",
-         &kranal_tunables.kra_max_immediate, sizeof(int),
-         0644, NULL, &proc_dointvec},
-        { 0 }
+#include "ralnd.h"
+
+static int        kranal_devids[RANAL_MAXDEVS] = {RAPK_MAIN_DEVICE_ID,
+                                                  RAPK_EXPANSION_DEVICE_ID};
+
+lnd_t the_kralnd = {
+        .lnd_type       = RALND,
+        .lnd_startup    = kranal_startup,
+        .lnd_shutdown   = kranal_shutdown,
+        .lnd_ctl        = kranal_ctl,
+        .lnd_send       = kranal_send,
+        .lnd_recv       = kranal_recv,
+        .lnd_eager_recv = kranal_eager_recv,
+        .lnd_accept     = kranal_accept,
 };
 
-static ctl_table kranal_top_ctl_table[] = {
-        {RANAL_SYSCTL, "ranal", NULL, 0, 0555, kranal_ctl_table},
-        { 0 }
-};
-
-int
-kranal_sock_write (struct socket *sock, void *buffer, int nob)
-{
-        int           rc;
-        mm_segment_t  oldmm = get_fs();
-        struct iovec  iov = {
-                .iov_base = buffer,
-                .iov_len  = nob
-        };
-        struct msghdr msg = {
-                .msg_name       = NULL,
-                .msg_namelen    = 0,
-                .msg_iov        = &iov,
-                .msg_iovlen     = 1,
-                .msg_control    = NULL,
-                .msg_controllen = 0,
-                .msg_flags      = MSG_DONTWAIT
-        };
-
-        /* We've set up the socket's send buffer to be large enough for
-         * everything we send, so a single non-blocking send should
-         * complete without error. */
-
-        set_fs(KERNEL_DS);
-        rc = sock_sendmsg(sock, &msg, iov.iov_len);
-        set_fs(oldmm);
-
-        if (rc == nob)
-                return 0;
-
-        if (rc >= 0)
-                return -EAGAIN;
-
-        return rc;
-}
-
-int
-kranal_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
-{
-        int            rc;
-        mm_segment_t   oldmm = get_fs();
-        long           ticks = timeout * HZ;
-        unsigned long  then;
-        struct timeval tv;
-
-        LASSERT (nob > 0);
-        LASSERT (ticks > 0);
-
-        for (;;) {
-                struct iovec  iov = {
-                        .iov_base = buffer,
-                        .iov_len  = nob
-                };
-                struct msghdr msg = {
-                        .msg_name       = NULL,
-                        .msg_namelen    = 0,
-                        .msg_iov        = &iov,
-                        .msg_iovlen     = 1,
-                        .msg_control    = NULL,
-                        .msg_controllen = 0,
-                        .msg_flags      = 0
-                };
-
-                /* Set receive timeout to remaining time */
-                tv = (struct timeval) {
-                        .tv_sec = ticks / HZ,
-                        .tv_usec = ((ticks % HZ) * 1000000) / HZ
-                };
-                set_fs(KERNEL_DS);
-                rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
-                                     (char *)&tv, sizeof(tv));
-                set_fs(oldmm);
-                if (rc != 0) {
-                        CERROR("Can't set socket recv timeout %d: %d\n",
-                               timeout, rc);
-                        return rc;
-                }
-
-                set_fs(KERNEL_DS);
-                then = jiffies;
-                rc = sock_recvmsg(sock, &msg, iov.iov_len, 0);
-                ticks -= jiffies - then;
-                set_fs(oldmm);
-
-                if (rc < 0)
-                        return rc;
-
-                if (rc == 0)
-                        return -ECONNABORTED;
-
-                buffer = ((char *)buffer) + rc;
-                nob -= rc;
-
-                if (nob == 0)
-                        return 0;
-
-                if (ticks <= 0)
-                        return -ETIMEDOUT;
-        }
-}
-
-int
-kranal_create_sock(struct socket **sockp)
-{
-        struct socket       *sock;
-        int                  rc;
-        int                  option;
-        mm_segment_t         oldmm = get_fs();
-
-        rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock);
-        if (rc != 0) {
-                CERROR("Can't create socket: %d\n", rc);
-                return rc;
-        }
-
-        /* Ensure sending connection info doesn't block */
-        option = 2 * sizeof(kra_connreq_t);
-        set_fs(KERNEL_DS);
-        rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
-                             (char *)&option, sizeof(option));
-        set_fs(oldmm);
-        if (rc != 0) {
-                CERROR("Can't set send buffer %d: %d\n", option, rc);
-                goto failed;
-        }
-
-        option = 1;
-        set_fs(KERNEL_DS);
-        rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
-                             (char *)&option, sizeof(option));
-        set_fs(oldmm);
-        if (rc != 0) {
-                CERROR("Can't set SO_REUSEADDR: %d\n", rc);
-                goto failed;
-        }
-
-        *sockp = sock;
-        return 0;
-
- failed:
-        sock_release(sock);
-        return rc;
-}
-
-void
-kranal_pause(int ticks)
-{
-        set_current_state(TASK_UNINTERRUPTIBLE);
-        schedule_timeout(ticks);
-}
+kra_data_t              kranal_data;
 
 void
-kranal_pack_connreq(kra_connreq_t *connreq, kra_conn_t *conn, ptl_nid_t dstnid)
+kranal_pack_connreq(kra_connreq_t *connreq, kra_conn_t *conn, lnet_nid_t dstnid)
 {
         RAP_RETURN   rrc;
 
@@ -222,8 +47,13 @@ kranal_pack_connreq(kra_connreq_t *connreq, kra_conn_t *conn, ptl_nid_t dstnid)
 
         connreq->racr_magic     = RANAL_MSG_MAGIC;
         connreq->racr_version   = RANAL_MSG_VERSION;
+
+        if (conn == NULL)                       /* prepping a "stub" reply */
+                return;
+
         connreq->racr_devid     = conn->rac_device->rad_id;
-        connreq->racr_srcnid    = kranal_lib.libnal_ni.ni_pid.nid;
+        connreq->racr_srcnid    = lnet_ptlcompat_srcnid(kranal_data.kra_ni->ni_nid,
+                                                        dstnid);
         connreq->racr_dstnid    = dstnid;
         connreq->racr_peerstamp = kranal_data.kra_peerstamp;
         connreq->racr_connstamp = conn->rac_my_connstamp;
@@ -234,22 +64,101 @@ kranal_pack_connreq(kra_connreq_t *connreq, kra_conn_t *conn, ptl_nid_t dstnid)
 }
 
 int
-kranal_recv_connreq(struct socket *sock, kra_connreq_t *connreq, int timeout)
+kranal_recv_connreq(struct socket *sock, kra_connreq_t *connreq, int active)
 {
+        int         timeout = active ? *kranal_tunables.kra_timeout :
+                                        lnet_acceptor_timeout();
+        int         swab;
         int         rc;
 
-        rc = kranal_sock_read(sock, connreq, sizeof(*connreq), timeout);
+        /* return 0 on success, -ve on error, +ve to tell the peer I'm "old" */
+
+        rc = libcfs_sock_read(sock, &connreq->racr_magic, 
+                              sizeof(connreq->racr_magic), timeout);
         if (rc != 0) {
-                CERROR("Read failed: %d\n", rc);
-                return rc;
+                CERROR("Read(magic) failed(1): %d\n", rc);
+                return -EIO;
+        }
+
+        if (connreq->racr_magic != RANAL_MSG_MAGIC &&
+            connreq->racr_magic != __swab32(RANAL_MSG_MAGIC)) {
+                /* Unexpected magic! */
+                if (!active &&
+                    the_lnet.ln_ptlcompat == 0 &&
+                    (connreq->racr_magic == LNET_PROTO_MAGIC ||
+                     connreq->racr_magic == __swab32(LNET_PROTO_MAGIC))) {
+                        /* future protocol version compatibility!
+                         * When LNET unifies protocols over all LNDs, the first
+                         * thing sent will be a version query.  +ve rc means I
+                         * reply with my current magic/version */
+                        return EPROTO;
+                }
+
+                if (active ||
+                    the_lnet.ln_ptlcompat == 0) {
+                        CERROR("Unexpected magic %08x (1)\n",
+                               connreq->racr_magic);
+                        return -EPROTO;
+                }
+
+                /* When portals compatibility is set, I may be passed a new
+                 * connection "blindly" by the acceptor, and I have to
+                 * determine if my peer has sent an acceptor connection request
+                 * or not.  This isn't a connreq, so I'll get the acceptor to
+                 * look at it... */
+                rc = lnet_accept(kranal_data.kra_ni, sock, connreq->racr_magic);
+                if (rc != 0)
+                        return -EPROTO;
+
+                /* ...and if it's OK I'm back to looking for a connreq... */
+                rc = libcfs_sock_read(sock, &connreq->racr_magic,
+                                      sizeof(connreq->racr_magic), timeout);
+                if (rc != 0) {
+                        CERROR("Read(magic) failed(2): %d\n", rc);
+                        return -EIO;
+                }
+
+                if (connreq->racr_magic != RANAL_MSG_MAGIC &&
+                    connreq->racr_magic != __swab32(RANAL_MSG_MAGIC)) {
+                        CERROR("Unexpected magic %08x(2)\n",
+                               connreq->racr_magic);
+                        return -EPROTO;
+                }
+        }
+
+        swab = (connreq->racr_magic == __swab32(RANAL_MSG_MAGIC));
+
+        rc = libcfs_sock_read(sock, &connreq->racr_version,
+                              sizeof(connreq->racr_version), timeout);
+        if (rc != 0) {
+                CERROR("Read(version) failed: %d\n", rc);
+                return -EIO;
         }
 
-        if (connreq->racr_magic != RANAL_MSG_MAGIC) {
-                if (__swab32(connreq->racr_magic) != RANAL_MSG_MAGIC) {
-                        CERROR("Unexpected magic %08x\n", connreq->racr_magic);
+        if (swab)
+                __swab16s(&connreq->racr_version);
+        
+        if (connreq->racr_version != RANAL_MSG_VERSION) {
+                if (active) {
+                        CERROR("Unexpected version %d\n", connreq->racr_version);
                         return -EPROTO;
                 }
+                /* If this is a future version of the ralnd protocol, and I'm
+                 * passive (accepted the connection), tell my peer I'm "old"
+                 * (+ve rc) */
+                return EPROTO;
+        }
+
+        rc = libcfs_sock_read(sock, &connreq->racr_devid,
+                              sizeof(connreq->racr_version) -
+                              offsetof(kra_connreq_t, racr_devid),
+                              timeout);
+        if (rc != 0) {
+                CERROR("Read(body) failed: %d\n", rc);
+                return -EIO;
+        }
 
+        if (swab) {
                 __swab32s(&connreq->racr_magic);
                 __swab16s(&connreq->racr_version);
                 __swab16s(&connreq->racr_devid);
@@ -265,14 +174,9 @@ kranal_recv_connreq(struct socket *sock, kra_connreq_t *connreq, int timeout)
                 __swab32s(&connreq->racr_riparams.CompletionCookie);
         }
 
-        if (connreq->racr_version != RANAL_MSG_VERSION) {
-                CERROR("Unexpected version %d\n", connreq->racr_version);
-                return -EPROTO;
-        }
-
-        if (connreq->racr_srcnid == PTL_NID_ANY ||
-            connreq->racr_dstnid == PTL_NID_ANY) {
-                CERROR("Received PTL_NID_ANY\n");
+        if (connreq->racr_srcnid == LNET_NID_ANY ||
+            connreq->racr_dstnid == LNET_NID_ANY) {
+                CERROR("Received LNET_NID_ANY\n");
                 return -EPROTO;
         }
 
@@ -294,7 +198,7 @@ kranal_close_stale_conns_locked (kra_peer_t *peer, kra_conn_t *newconn)
         int                 loopback;
         int                 count = 0;
 
-        loopback = peer->rap_nid == kranal_lib.libnal_ni.ni_pid.nid;
+        loopback = peer->rap_nid == kranal_data.kra_ni->ni_nid;
 
         list_for_each_safe (ctmp, cnxt, &peer->rap_conns) {
                 conn = list_entry(ctmp, kra_conn_t, rac_list);
@@ -303,8 +207,9 @@ kranal_close_stale_conns_locked (kra_peer_t *peer, kra_conn_t *newconn)
                         continue;
 
                 if (conn->rac_peerstamp != newconn->rac_peerstamp) {
-                        CDEBUG(D_NET, "Closing stale conn nid:"LPX64
-                               " peerstamp:"LPX64"("LPX64")\n", peer->rap_nid,
+                        CDEBUG(D_NET, "Closing stale conn nid: %s "
+                               " peerstamp:"LPX64"("LPX64")\n", 
+                               libcfs_nid2str(peer->rap_nid),
                                conn->rac_peerstamp, newconn->rac_peerstamp);
                         LASSERT (conn->rac_peerstamp < newconn->rac_peerstamp);
                         count++;
@@ -322,8 +227,9 @@ kranal_close_stale_conns_locked (kra_peer_t *peer, kra_conn_t *newconn)
 
                 LASSERT (conn->rac_peer_connstamp < newconn->rac_peer_connstamp);
 
-                CDEBUG(D_NET, "Closing stale conn nid:"LPX64
-                       " connstamp:"LPX64"("LPX64")\n", peer->rap_nid,
+                CDEBUG(D_NET, "Closing stale conn nid: %s"
+                       " connstamp:"LPX64"("LPX64")\n", 
+                       libcfs_nid2str(peer->rap_nid),
                        conn->rac_peer_connstamp, newconn->rac_peer_connstamp);
 
                 count++;
@@ -340,7 +246,7 @@ kranal_conn_isdup_locked(kra_peer_t *peer, kra_conn_t *newconn)
         struct list_head *tmp;
         int               loopback;
 
-        loopback = peer->rap_nid == kranal_lib.libnal_ni.ni_pid.nid;
+        loopback = peer->rap_nid == kranal_data.kra_ni->ni_nid;
 
         list_for_each(tmp, &peer->rap_conns) {
                 conn = list_entry(tmp, kra_conn_t, rac_list);
@@ -404,7 +310,7 @@ kranal_create_conn(kra_conn_t **connp, kra_device_t *dev)
         RAP_RETURN     rrc;
 
         LASSERT (!in_interrupt());
-        PORTAL_ALLOC(conn, sizeof(*conn));
+        LIBCFS_ALLOC(conn, sizeof(*conn));
 
         if (conn == NULL)
                 return -ENOMEM;
@@ -422,14 +328,14 @@ kranal_create_conn(kra_conn_t **connp, kra_device_t *dev)
         kranal_set_conn_uniqueness(conn);
 
         conn->rac_device = dev;
-        conn->rac_timeout = MAX(kranal_tunables.kra_timeout, RANAL_MIN_TIMEOUT);
+        conn->rac_timeout = MAX(*kranal_tunables.kra_timeout, RANAL_MIN_TIMEOUT);
         kranal_update_reaper_timeout(conn->rac_timeout);
 
         rrc = RapkCreateRi(dev->rad_handle, conn->rac_cqid,
                            &conn->rac_rihandle);
         if (rrc != RAP_SUCCESS) {
                 CERROR("RapkCreateRi failed: %d\n", rrc);
-                PORTAL_FREE(conn, sizeof(*conn));
+                LIBCFS_FREE(conn, sizeof(*conn));
                 return -ENETDOWN;
         }
 
@@ -460,7 +366,7 @@ kranal_destroy_conn(kra_conn_t *conn)
         if (conn->rac_peer != NULL)
                 kranal_peer_decref(conn->rac_peer);
 
-        PORTAL_FREE(conn, sizeof(*conn));
+        LIBCFS_FREE(conn, sizeof(*conn));
         atomic_dec(&kranal_data.kra_nconns);
 }
 
@@ -488,8 +394,9 @@ kranal_close_conn_locked (kra_conn_t *conn, int error)
 {
         kra_peer_t        *peer = conn->rac_peer;
 
-        CDEBUG(error == 0 ? D_NET : D_ERROR,
-               "closing conn to "LPX64": error %d\n", peer->rap_nid, error);
+        CDEBUG(error == 0 ? D_NET : D_NETERROR,
+               "closing conn to %s: error %d\n", 
+               libcfs_nid2str(peer->rap_nid), error);
 
         LASSERT (!in_interrupt());
         LASSERT (conn->rac_state == RANAL_CONN_ESTABLISHED);
@@ -576,10 +483,9 @@ kranal_set_conn_params(kra_conn_t *conn, kra_connreq_t *connreq,
 }
 
 int
-kranal_passive_conn_handshake (struct socket *sock, ptl_nid_t *src_nidp,
-                               ptl_nid_t *dst_nidp, kra_conn_t **connp)
+kranal_passive_conn_handshake (struct socket *sock, lnet_nid_t *src_nidp,
+                               lnet_nid_t *dst_nidp, kra_conn_t **connp)
 {
-        struct sockaddr_in   addr;
         __u32                peer_ip;
         unsigned int         peer_port;
         kra_connreq_t        rx_connreq;
@@ -587,33 +493,36 @@ kranal_passive_conn_handshake (struct socket *sock, ptl_nid_t *src_nidp,
         kra_conn_t          *conn;
         kra_device_t        *dev;
         int                  rc;
-        int                  len;
         int                  i;
 
-        len = sizeof(addr);
-        rc = sock->ops->getname(sock, (struct sockaddr *)&addr, &len, 2);
+        rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
         if (rc != 0) {
                 CERROR("Can't get peer's IP: %d\n", rc);
                 return rc;
         }
 
-        peer_ip = ntohl(addr.sin_addr.s_addr);
-        peer_port = ntohs(addr.sin_port);
+        rc = kranal_recv_connreq(sock, &rx_connreq, 0);
 
-        if (peer_port >= 1024) {
-                CERROR("Refusing unprivileged connection from %u.%u.%u.%u/%d\n",
-                       HIPQUAD(peer_ip), peer_port);
-                return -ECONNREFUSED;
-        }
-
-        rc = kranal_recv_connreq(sock, &rx_connreq,
-                                 kranal_tunables.kra_listener_timeout);
-        if (rc != 0) {
+        if (rc < 0) {
                 CERROR("Can't rx connreq from %u.%u.%u.%u/%d: %d\n",
                        HIPQUAD(peer_ip), peer_port, rc);
                 return rc;
         }
 
+        if (rc > 0) {
+                /* Request from "new" peer: send reply with my MAGIC/VERSION to
+                 * tell her I'm old... */
+                kranal_pack_connreq(&tx_connreq, NULL, LNET_NID_ANY);
+
+                rc = libcfs_sock_write(sock, &tx_connreq, sizeof(tx_connreq),
+                                       lnet_acceptor_timeout());
+                if (rc != 0)
+                        CERROR("Can't tx stub connreq to %u.%u.%u.%u/%d: %d\n",
+                               HIPQUAD(peer_ip), peer_port, rc);
+
+                return -EPROTO;
+        }
+
         for (i = 0;;i++) {
                 if (i == kranal_data.kra_ndevs) {
                         CERROR("Can't match dev %d from %u.%u.%u.%u/%d\n",
@@ -631,7 +540,8 @@ kranal_passive_conn_handshake (struct socket *sock, ptl_nid_t *src_nidp,
 
         kranal_pack_connreq(&tx_connreq, conn, rx_connreq.racr_srcnid);
 
-        rc = kranal_sock_write(sock, &tx_connreq, sizeof(tx_connreq));
+        rc = libcfs_sock_write(sock, &tx_connreq, sizeof(tx_connreq),
+                               lnet_acceptor_timeout());
         if (rc != 0) {
                 CERROR("Can't tx connreq to %u.%u.%u.%u/%d: %d\n",
                        HIPQUAD(peer_ip), peer_port, rc);
@@ -652,72 +562,8 @@ kranal_passive_conn_handshake (struct socket *sock, ptl_nid_t *src_nidp,
 }
 
 int
-ranal_connect_sock(kra_peer_t *peer, struct socket **sockp)
-{
-        struct sockaddr_in  locaddr;
-        struct sockaddr_in  srvaddr;
-        struct socket      *sock;
-        unsigned int        port;
-        int                 rc;
-
-        for (port = 1023; port >= 512; port--) {
-
-                memset(&locaddr, 0, sizeof(locaddr));
-                locaddr.sin_family      = AF_INET;
-                locaddr.sin_port        = htons(port);
-                locaddr.sin_addr.s_addr = htonl(INADDR_ANY);
-
-                memset (&srvaddr, 0, sizeof (srvaddr));
-                srvaddr.sin_family      = AF_INET;
-                srvaddr.sin_port        = htons (peer->rap_port);
-                srvaddr.sin_addr.s_addr = htonl (peer->rap_ip);
-
-                rc = kranal_create_sock(&sock);
-                if (rc != 0)
-                        return rc;
-
-                rc = sock->ops->bind(sock,
-                                     (struct sockaddr *)&locaddr, sizeof(locaddr));
-                if (rc != 0) {
-                        sock_release(sock);
-
-                        if (rc == -EADDRINUSE) {
-                                CDEBUG(D_NET, "Port %d already in use\n", port);
-                                continue;
-                        }
-
-                        CERROR("Can't bind to reserved port %d: %d\n", port, rc);
-                        return rc;
-                }
-
-                rc = sock->ops->connect(sock,
-                                        (struct sockaddr *)&srvaddr, sizeof(srvaddr),
-                                        0);
-                if (rc == 0) {
-                        *sockp = sock;
-                        return 0;
-                }
-
-                sock_release(sock);
-
-                if (rc != -EADDRNOTAVAIL) {
-                        CERROR("Can't connect port %d to %u.%u.%u.%u/%d: %d\n",
-                               port, HIPQUAD(peer->rap_ip), peer->rap_port, rc);
-                        return rc;
-                }
-
-                CDEBUG(D_NET, "Port %d not available for %u.%u.%u.%u/%d\n",
-                       port, HIPQUAD(peer->rap_ip), peer->rap_port);
-        }
-
-        /* all ports busy */
-        return -EHOSTUNREACH;
-}
-
-
-int
 kranal_active_conn_handshake(kra_peer_t *peer,
-                             ptl_nid_t *dst_nidp, kra_conn_t **connp)
+                             lnet_nid_t *dst_nidp, kra_conn_t **connp)
 {
         kra_connreq_t       connreq;
         kra_conn_t         *conn;
@@ -728,7 +574,7 @@ kranal_active_conn_handshake(kra_peer_t *peer,
 
         /* spread connections over all devices using both peer NIDs to ensure
          * all nids use all devices */
-        idx = peer->rap_nid + kranal_lib.libnal_ni.ni_pid.nid;
+        idx = peer->rap_nid + kranal_data.kra_ni->ni_nid;
         dev = &kranal_data.kra_devices[idx % kranal_data.kra_ndevs];
 
         rc = kranal_create_conn(&conn, dev);
@@ -737,7 +583,22 @@ kranal_active_conn_handshake(kra_peer_t *peer,
 
         kranal_pack_connreq(&connreq, conn, peer->rap_nid);
 
-        rc = ranal_connect_sock(peer, &sock);
+        if (the_lnet.ln_testprotocompat != 0) {
+                /* single-shot proto test */
+                LNET_LOCK();
+                if ((the_lnet.ln_testprotocompat & 1) != 0) {
+                        connreq.racr_version++;
+                        the_lnet.ln_testprotocompat &= ~1;
+                }
+                if ((the_lnet.ln_testprotocompat & 2) != 0) {
+                        connreq.racr_magic = LNET_PROTO_MAGIC;
+                        the_lnet.ln_testprotocompat &= ~2;
+                }
+                LNET_UNLOCK();
+        }
+
+        rc = lnet_connect(&sock, peer->rap_nid,
+                         0, peer->rap_ip, peer->rap_port);
         if (rc != 0)
                 goto failed_0;
 
@@ -745,29 +606,31 @@ kranal_active_conn_handshake(kra_peer_t *peer,
          * immediately after accepting a connection, so we connect and then
          * send immediately. */
 
-        rc = kranal_sock_write(sock, &connreq, sizeof(connreq));
+        rc = libcfs_sock_write(sock, &connreq, sizeof(connreq),
+                               lnet_acceptor_timeout());
         if (rc != 0) {
                 CERROR("Can't tx connreq to %u.%u.%u.%u/%d: %d\n",
                        HIPQUAD(peer->rap_ip), peer->rap_port, rc);
-                goto failed_1;
+                goto failed_2;
         }
 
-        rc = kranal_recv_connreq(sock, &connreq, kranal_tunables.kra_timeout);
+        rc = kranal_recv_connreq(sock, &connreq, 1);
         if (rc != 0) {
                 CERROR("Can't rx connreq from %u.%u.%u.%u/%d: %d\n",
                        HIPQUAD(peer->rap_ip), peer->rap_port, rc);
-                goto failed_1;
+                goto failed_2;
         }
 
-        sock_release(sock);
+        libcfs_sock_release(sock);
         rc = -EPROTO;
 
         if (connreq.racr_srcnid != peer->rap_nid) {
                 CERROR("Unexpected srcnid from %u.%u.%u.%u/%d: "
-                       "received "LPX64" expected "LPX64"\n",
+                       "received %s expected %s\n",
                        HIPQUAD(peer->rap_ip), peer->rap_port,
-                       connreq.racr_srcnid, peer->rap_nid);
-                goto failed_0;
+                       libcfs_nid2str(connreq.racr_srcnid), 
+                       libcfs_nid2str(peer->rap_nid));
+                goto failed_1;
         }
 
         if (connreq.racr_devid != dev->rad_id) {
@@ -775,20 +638,23 @@ kranal_active_conn_handshake(kra_peer_t *peer,
                        "received %d expected %d\n",
                        HIPQUAD(peer->rap_ip), peer->rap_port,
                        connreq.racr_devid, dev->rad_id);
-                goto failed_0;
+                goto failed_1;
         }
 
         rc = kranal_set_conn_params(conn, &connreq,
                                     peer->rap_ip, peer->rap_port);
         if (rc != 0)
-                goto failed_0;
+                goto failed_1;
 
         *connp = conn;
         *dst_nidp = connreq.racr_dstnid;
         return 0;
 
+ failed_2:
+        libcfs_sock_release(sock);
  failed_1:
-        sock_release(sock);
+        lnet_connect_console_error(rc, peer->rap_nid,
+                                  peer->rap_ip, peer->rap_port);
  failed_0:
         kranal_conn_decref(conn);
         return rc;
@@ -799,8 +665,8 @@ kranal_conn_handshake (struct socket *sock, kra_peer_t *peer)
 {
         kra_peer_t        *peer2;
         kra_tx_t          *tx;
-        ptl_nid_t          peer_nid;
-        ptl_nid_t          dst_nid;
+        lnet_nid_t          peer_nid;
+        lnet_nid_t          dst_nid;
         unsigned long      flags;
         kra_conn_t        *conn;
         int                rc;
@@ -837,9 +703,10 @@ kranal_conn_handshake (struct socket *sock, kra_peer_t *peer)
                         return rc;
 
                 /* assume this is a new peer */
-                peer = kranal_create_peer(peer_nid);
-                if (peer == NULL) {
-                        CERROR("Can't allocate peer for "LPX64"\n", peer_nid);
+                rc = kranal_create_peer(&peer, peer_nid);
+                if (rc != 0) {
+                        CERROR("Can't create conn for %s\n", 
+                               libcfs_nid2str(peer_nid));
                         kranal_conn_decref(conn);
                         return -ENOMEM;
                 }
@@ -861,12 +728,12 @@ kranal_conn_handshake (struct socket *sock, kra_peer_t *peer)
         /* Refuse connection if peer thinks we are a different NID.  We check
          * this while holding the global lock, to synch with connection
          * destruction on NID change. */
-        if (dst_nid != kranal_lib.libnal_ni.ni_pid.nid) {
+        if (!lnet_ptlcompat_matchnid(kranal_data.kra_ni->ni_nid, dst_nid)) {
                 write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
 
-                CERROR("Stale/bad connection with "LPX64
-                       ": dst_nid "LPX64", expected "LPX64"\n",
-                       peer_nid, dst_nid, kranal_lib.libnal_ni.ni_pid.nid);
+                CERROR("Stale/bad connection with %s: dst_nid %s, expected %s\n",
+                       libcfs_nid2str(peer_nid), libcfs_nid2str(dst_nid), 
+                       libcfs_nid2str(kranal_data.kra_ni->ni_nid));
                 rc = -ESTALE;
                 goto failed;
         }
@@ -879,8 +746,8 @@ kranal_conn_handshake (struct socket *sock, kra_peer_t *peer)
                 LASSERT (!list_empty(&peer->rap_conns));
                 LASSERT (list_empty(&peer->rap_tx_queue));
                 write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
-                CWARN("Not creating duplicate connection to "LPX64": %d\n",
-                      peer_nid, rc);
+                CWARN("Not creating duplicate connection to %s: %d\n",
+                      libcfs_nid2str(peer_nid), rc);
                 rc = 0;
                 goto failed;
         }
@@ -918,10 +785,12 @@ kranal_conn_handshake (struct socket *sock, kra_peer_t *peer)
         /* CAVEAT EMPTOR: passive peer can disappear NOW */
 
         if (nstale != 0)
-                CWARN("Closed %d stale conns to "LPX64"\n", nstale, peer_nid);
+                CWARN("Closed %d stale conns to %s\n", nstale, 
+                      libcfs_nid2str(peer_nid));
 
-        CWARN("New connection to "LPX64" on devid[%d] = %d\n",
-               peer_nid, conn->rac_device->rad_idx, conn->rac_device->rad_id);
+        CWARN("New connection to %s on devid[%d] = %d\n",
+               libcfs_nid2str(peer_nid), 
+               conn->rac_device->rad_idx, conn->rac_device->rad_id);
 
         /* Ensure conn gets checked.  Transmits may have been queued and an
          * FMA event may have happened before it got in the cq hash table */
@@ -945,11 +814,13 @@ kranal_connect (kra_peer_t *peer)
 
         LASSERT (peer->rap_connecting);
 
-        CDEBUG(D_NET, "About to handshake "LPX64"\n", peer->rap_nid);
+        CDEBUG(D_NET, "About to handshake %s\n", 
+               libcfs_nid2str(peer->rap_nid));
 
         rc = kranal_conn_handshake(NULL, peer);
 
-        CDEBUG(D_NET, "Done handshake "LPX64":%d \n", peer->rap_nid, rc);
+        CDEBUG(D_NET, "Done handshake %s:%d \n", 
+               libcfs_nid2str(peer->rap_nid), rc);
 
         write_lock_irqsave(&kranal_data.kra_global_lock, flags);
 
@@ -961,18 +832,21 @@ kranal_connect (kra_peer_t *peer)
                  * success to avoid messages jumping the queue */
                 LASSERT (list_empty(&peer->rap_tx_queue));
 
-                /* reset reconnection timeouts */
-                peer->rap_reconnect_interval = RANAL_MIN_RECONNECT_INTERVAL;
-                peer->rap_reconnect_time = CURRENT_SECONDS;
+                peer->rap_reconnect_interval = 0; /* OK to reconnect at any time */
 
                 write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
                 return;
         }
 
-        LASSERT (peer->rap_reconnect_interval != 0);
-        peer->rap_reconnect_time = CURRENT_SECONDS + peer->rap_reconnect_interval;
-        peer->rap_reconnect_interval = MAX(RANAL_MAX_RECONNECT_INTERVAL,
-                                           1 * peer->rap_reconnect_interval);
+        peer->rap_reconnect_interval *= 2;
+        peer->rap_reconnect_interval =
+                MAX(peer->rap_reconnect_interval,
+                    *kranal_tunables.kra_min_reconnect_interval);
+        peer->rap_reconnect_interval =
+                MIN(peer->rap_reconnect_interval,
+                    *kranal_tunables.kra_max_reconnect_interval);
+
+        peer->rap_reconnect_time = jiffies + peer->rap_reconnect_interval * HZ;
 
         /* Grab all blocked packets while we have the global lock */
         list_add(&zombies, &peer->rap_tx_queue);
@@ -983,8 +857,8 @@ kranal_connect (kra_peer_t *peer)
         if (list_empty(&zombies))
                 return;
 
-        CWARN("Dropping packets for "LPX64": connection failed\n",
-              peer->rap_nid);
+        CDEBUG(D_NETERROR, "Dropping packets for %s: connection failed\n",
+               libcfs_nid2str(peer->rap_nid));
 
         do {
                 tx = list_entry(zombies.next, kra_tx_t, tx_list);
@@ -998,309 +872,51 @@ kranal_connect (kra_peer_t *peer)
 void
 kranal_free_acceptsock (kra_acceptsock_t *ras)
 {
-        sock_release(ras->ras_sock);
-        PORTAL_FREE(ras, sizeof(*ras));
+        libcfs_sock_release(ras->ras_sock);
+        LIBCFS_FREE(ras, sizeof(*ras));
 }
 
 int
-kranal_listener (void *arg)
+kranal_accept (lnet_ni_t *ni, struct socket *sock)
 {
-        struct sockaddr_in addr;
-        wait_queue_t       wait;
-        struct socket     *sock;
         kra_acceptsock_t  *ras;
-        int                port;
-        char               name[16];
         int                rc;
+        __u32              peer_ip;
+        int                peer_port;
         unsigned long      flags;
 
-        /* Parent thread holds kra_nid_mutex, and is, or is about to
-         * block on kra_listener_signal */
-
-        port = kranal_tunables.kra_port;
-        snprintf(name, sizeof(name), "kranal_lstn%03d", port);
-        kportal_daemonize(name);
-        kportal_blockallsigs();
-
-        init_waitqueue_entry(&wait, current);
-
-        rc = kranal_create_sock(&sock);
-        if (rc != 0)
-                goto out_0;
-
-        memset(&addr, 0, sizeof(addr));
-        addr.sin_family      = AF_INET;
-        addr.sin_port        = htons(port);
-        addr.sin_addr.s_addr = INADDR_ANY;
-
-        rc = sock->ops->bind(sock, (struct sockaddr *)&addr, sizeof(addr));
-        if (rc != 0) {
-                CERROR("Can't bind to port %d\n", port);
-                goto out_1;
-        }
-
-        rc = sock->ops->listen(sock, kranal_tunables.kra_backlog);
-        if (rc != 0) {
-                CERROR("Can't set listen backlog %d: %d\n",
-                       kranal_tunables.kra_backlog, rc);
-                goto out_1;
-        }
-
-        LASSERT (kranal_data.kra_listener_sock == NULL);
-        kranal_data.kra_listener_sock = sock;
-
-        /* unblock waiting parent */
-        LASSERT (kranal_data.kra_listener_shutdown == 0);
-        up(&kranal_data.kra_listener_signal);
-
-        /* Wake me any time something happens on my socket */
-        add_wait_queue(sock->sk->sk_sleep, &wait);
-        ras = NULL;
-
-        while (kranal_data.kra_listener_shutdown == 0) {
-
-                if (ras == NULL) {
-                        PORTAL_ALLOC(ras, sizeof(*ras));
-                        if (ras == NULL) {
-                                CERROR("Out of Memory: pausing...\n");
-                                kranal_pause(HZ);
-                                continue;
-                        }
-                        ras->ras_sock = NULL;
-                }
-
-                if (ras->ras_sock == NULL) {
-                        ras->ras_sock = sock_alloc();
-                        if (ras->ras_sock == NULL) {
-                                CERROR("Can't allocate socket: pausing...\n");
-                                kranal_pause(HZ);
-                                continue;
-                        }
-                        /* XXX this should add a ref to sock->ops->owner, if
-                         * TCP could be a module */
-                        ras->ras_sock->type = sock->type;
-                        ras->ras_sock->ops = sock->ops;
-                }
-
-                set_current_state(TASK_INTERRUPTIBLE);
-
-                rc = sock->ops->accept(sock, ras->ras_sock, O_NONBLOCK);
-
-                /* Sleep for socket activity? */
-                if (rc == -EAGAIN &&
-                    kranal_data.kra_listener_shutdown == 0)
-                        schedule();
-
-                set_current_state(TASK_RUNNING);
-
-                if (rc == 0) {
-                        spin_lock_irqsave(&kranal_data.kra_connd_lock, flags);
-
-                        list_add_tail(&ras->ras_list,
-                                      &kranal_data.kra_connd_acceptq);
-
-                        spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags);
-                        wake_up(&kranal_data.kra_connd_waitq);
-
-                        ras = NULL;
-                        continue;
-                }
-
-                if (rc != -EAGAIN) {
-                        CERROR("Accept failed: %d, pausing...\n", rc);
-                        kranal_pause(HZ);
-                }
-        }
+        rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+        LASSERT (rc == 0);                      /* we succeeded before */
 
-        if (ras != NULL) {
-                if (ras->ras_sock != NULL)
-                        sock_release(ras->ras_sock);
-                PORTAL_FREE(ras, sizeof(*ras));
-        }
-
-        rc = 0;
-        remove_wait_queue(sock->sk->sk_sleep, &wait);
- out_1:
-        sock_release(sock);
-        kranal_data.kra_listener_sock = NULL;
- out_0:
-        /* set completion status and unblock thread waiting for me
-         * (parent on startup failure, executioner on normal shutdown) */
-        kranal_data.kra_listener_shutdown = rc;
-        up(&kranal_data.kra_listener_signal);
-
-        return 0;
-}
-
-int
-kranal_start_listener (void)
-{
-        long           pid;
-        int            rc;
-
-        CDEBUG(D_NET, "Starting listener\n");
-
-        /* Called holding kra_nid_mutex: listener stopped */
-        LASSERT (kranal_data.kra_listener_sock == NULL);
-
-        kranal_data.kra_listener_shutdown = 0;
-        pid = kernel_thread(kranal_listener, NULL, 0);
-        if (pid < 0) {
-                CERROR("Can't spawn listener: %ld\n", pid);
-                return (int)pid;
+        LIBCFS_ALLOC(ras, sizeof(*ras));
+        if (ras == NULL) {
+                CERROR("ENOMEM allocating connection request from "
+                       "%u.%u.%u.%u\n", HIPQUAD(peer_ip));
+                return -ENOMEM;
         }
 
-        /* Block until listener has started up. */
-        down(&kranal_data.kra_listener_signal);
-
-        rc = kranal_data.kra_listener_shutdown;
-        LASSERT ((rc != 0) == (kranal_data.kra_listener_sock == NULL));
-
-        CDEBUG(D_NET, "Listener %ld started OK\n", pid);
-        return rc;
-}
-
-void
-kranal_stop_listener(int clear_acceptq)
-{
-        struct list_head  zombie_accepts;
-        unsigned long     flags;
-        kra_acceptsock_t *ras;
-
-        CDEBUG(D_NET, "Stopping listener\n");
+        ras->ras_sock = sock;
 
-        /* Called holding kra_nid_mutex: listener running */
-        LASSERT (kranal_data.kra_listener_sock != NULL);
-
-        kranal_data.kra_listener_shutdown = 1;
-        wake_up_all(kranal_data.kra_listener_sock->sk->sk_sleep);
-
-        /* Block until listener has torn down. */
-        down(&kranal_data.kra_listener_signal);
-
-        LASSERT (kranal_data.kra_listener_sock == NULL);
-        CDEBUG(D_NET, "Listener stopped\n");
-
-        if (!clear_acceptq)
-                return;
-
-        /* Close any unhandled accepts */
         spin_lock_irqsave(&kranal_data.kra_connd_lock, flags);
 
-        list_add(&zombie_accepts, &kranal_data.kra_connd_acceptq);
-        list_del_init(&kranal_data.kra_connd_acceptq);
+        list_add_tail(&ras->ras_list, &kranal_data.kra_connd_acceptq);
+        wake_up(&kranal_data.kra_connd_waitq);
 
         spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags);
-
-        while (!list_empty(&zombie_accepts)) {
-                ras = list_entry(zombie_accepts.next,
-                                 kra_acceptsock_t, ras_list);
-                list_del(&ras->ras_list);
-                kranal_free_acceptsock(ras);
-        }
-}
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8)
-int
-kranal_listener_procint(ctl_table *table, int write, struct file *filp,
-                        void *buffer, size_t *lenp)
-#else
-int
-kranal_listener_procint(ctl_table *table, int write, struct file *filp,
-                        void *buffer, size_t *lenp, loff_t *ppos)
-#endif
-{
-        int   *tunable = (int *)table->data;
-        int    old_val;
-        int    rc;
-
-        /* No race with nal initialisation since the nal is setup all the time
-         * it's loaded.  When that changes, change this! */
-        LASSERT (kranal_data.kra_init == RANAL_INIT_ALL);
-
-        down(&kranal_data.kra_nid_mutex);
-
-        LASSERT (tunable == &kranal_tunables.kra_port ||
-                 tunable == &kranal_tunables.kra_backlog);
-        old_val = *tunable;
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8)
-        rc = proc_dointvec(table, write, filp, buffer, lenp);
-#else
-        rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
-#endif
-
-        if (write &&
-            (*tunable != old_val ||
-             kranal_data.kra_listener_sock == NULL)) {
-
-                if (kranal_data.kra_listener_sock != NULL)
-                        kranal_stop_listener(0);
-
-                rc = kranal_start_listener();
-
-                if (rc != 0) {
-                        CWARN("Unable to start listener with new tunable:"
-                              " reverting to old value\n");
-                        *tunable = old_val;
-                        kranal_start_listener();
-                }
-        }
-
-        up(&kranal_data.kra_nid_mutex);
-
-        LASSERT (kranal_data.kra_init == RANAL_INIT_ALL);
-        return rc;
+        return 0;
 }
 
 int
-kranal_set_mynid(ptl_nid_t nid)
-{
-        unsigned long    flags;
-        lib_ni_t        *ni = &kranal_lib.libnal_ni;
-        int              rc = 0;
-
-        CDEBUG(D_NET, "setting mynid to "LPX64" (old nid="LPX64")\n",
-               nid, ni->ni_pid.nid);
-
-        down(&kranal_data.kra_nid_mutex);
-
-        if (nid == ni->ni_pid.nid) {
-                /* no change of NID */
-                up(&kranal_data.kra_nid_mutex);
-                return 0;
-        }
-
-        if (kranal_data.kra_listener_sock != NULL)
-                kranal_stop_listener(1);
-
-        write_lock_irqsave(&kranal_data.kra_global_lock, flags);
-        kranal_data.kra_peerstamp++;
-        ni->ni_pid.nid = nid;
-        write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
-
-        /* Delete all existing peers and their connections after new
-         * NID/connstamp set to ensure no old connections in our brave
-         * new world. */
-        kranal_del_peer(PTL_NID_ANY, 0);
-
-        if (nid != PTL_NID_ANY)
-                rc = kranal_start_listener();
-
-        up(&kranal_data.kra_nid_mutex);
-        return rc;
-}
-
-kra_peer_t *
-kranal_create_peer (ptl_nid_t nid)
+kranal_create_peer (kra_peer_t **peerp, lnet_nid_t nid)
 {
-        kra_peer_t *peer;
+        kra_peer_t    *peer;
+        unsigned long  flags;
 
-        LASSERT (nid != PTL_NID_ANY);
+        LASSERT (nid != LNET_NID_ANY);
 
-        PORTAL_ALLOC(peer, sizeof(*peer));
+        LIBCFS_ALLOC(peer, sizeof(*peer));
         if (peer == NULL)
-                return NULL;
+                return -ENOMEM;
 
         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
 
@@ -1312,17 +928,32 @@ kranal_create_peer (ptl_nid_t nid)
         INIT_LIST_HEAD(&peer->rap_conns);
         INIT_LIST_HEAD(&peer->rap_tx_queue);
 
-        peer->rap_reconnect_time = CURRENT_SECONDS;
-        peer->rap_reconnect_interval = RANAL_MIN_RECONNECT_INTERVAL;
+        peer->rap_reconnect_interval = 0;       /* OK to connect at any time */
+
+        write_lock_irqsave(&kranal_data.kra_global_lock, flags);
+
+        if (kranal_data.kra_nonewpeers) {
+                /* shutdown has started already */
+                write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
+                
+                LIBCFS_FREE(peer, sizeof(*peer));
+                CERROR("Can't create peer: network shutdown\n");
+                return -ESHUTDOWN;
+        }
 
         atomic_inc(&kranal_data.kra_npeers);
-        return peer;
+
+        write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
+
+        *peerp = peer;
+        return 0;
 }
 
 void
 kranal_destroy_peer (kra_peer_t *peer)
 {
-        CDEBUG(D_NET, "peer "LPX64" %p deleted\n", peer->rap_nid, peer);
+        CDEBUG(D_NET, "peer %s %p deleted\n", 
+               libcfs_nid2str(peer->rap_nid), peer);
 
         LASSERT (atomic_read(&peer->rap_refcount) == 0);
         LASSERT (peer->rap_persistence == 0);
@@ -1332,7 +963,7 @@ kranal_destroy_peer (kra_peer_t *peer)
         LASSERT (list_empty(&peer->rap_tx_queue));
         LASSERT (list_empty(&peer->rap_connd_list));
 
-        PORTAL_FREE(peer, sizeof(*peer));
+        LIBCFS_FREE(peer, sizeof(*peer));
 
         /* NB a peer's connections keep a reference on their peer until
          * they are destroyed, so we can be assured that _all_ state to do
@@ -1342,7 +973,7 @@ kranal_destroy_peer (kra_peer_t *peer)
 }
 
 kra_peer_t *
-kranal_find_peer_locked (ptl_nid_t nid)
+kranal_find_peer_locked (lnet_nid_t nid)
 {
         struct list_head *peer_list = kranal_nid2peerlist(nid);
         struct list_head *tmp;
@@ -1358,15 +989,16 @@ kranal_find_peer_locked (ptl_nid_t nid)
                 if (peer->rap_nid != nid)
                         continue;
 
-                CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
-                       peer, nid, atomic_read(&peer->rap_refcount));
+                CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
+                       peer, libcfs_nid2str(nid), 
+                       atomic_read(&peer->rap_refcount));
                 return peer;
         }
         return NULL;
 }
 
 kra_peer_t *
-kranal_find_peer (ptl_nid_t nid)
+kranal_find_peer (lnet_nid_t nid)
 {
         kra_peer_t     *peer;
 
@@ -1393,7 +1025,7 @@ kranal_unlink_peer_locked (kra_peer_t *peer)
 }
 
 int
-kranal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, int *portp,
+kranal_get_peer_info (int index, lnet_nid_t *nidp, __u32 *ipp, int *portp,
                       int *persistencep)
 {
         kra_peer_t        *peer;
@@ -1428,18 +1060,19 @@ kranal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, int *portp,
 }
 
 int
-kranal_add_persistent_peer (ptl_nid_t nid, __u32 ip, int port)
+kranal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port)
 {
         unsigned long      flags;
         kra_peer_t        *peer;
         kra_peer_t        *peer2;
+        int                rc;
 
-        if (nid == PTL_NID_ANY)
+        if (nid == LNET_NID_ANY)
                 return -EINVAL;
 
-        peer = kranal_create_peer(nid);
-        if (peer == NULL)
-                return -ENOMEM;
+        rc = kranal_create_peer(&peer, nid);
+        if (rc != 0)
+                return rc;
 
         write_lock_irqsave(&kranal_data.kra_global_lock, flags);
 
@@ -1462,19 +1095,13 @@ kranal_add_persistent_peer (ptl_nid_t nid, __u32 ip, int port)
 }
 
 void
-kranal_del_peer_locked (kra_peer_t *peer, int single_share)
+kranal_del_peer_locked (kra_peer_t *peer)
 {
         struct list_head *ctmp;
         struct list_head *cnxt;
         kra_conn_t       *conn;
 
-        if (!single_share)
-                peer->rap_persistence = 0;
-        else if (peer->rap_persistence > 0)
-                peer->rap_persistence--;
-
-        if (peer->rap_persistence != 0)
-                return;
+        peer->rap_persistence = 0;
 
         if (list_empty(&peer->rap_conns)) {
                 kranal_unlink_peer_locked(peer);
@@ -1489,7 +1116,7 @@ kranal_del_peer_locked (kra_peer_t *peer, int single_share)
 }
 
 int
-kranal_del_peer (ptl_nid_t nid, int single_share)
+kranal_del_peer (lnet_nid_t nid)
 {
         unsigned long      flags;
         struct list_head  *ptmp;
@@ -1502,7 +1129,7 @@ kranal_del_peer (ptl_nid_t nid, int single_share)
 
         write_lock_irqsave(&kranal_data.kra_global_lock, flags);
 
-        if (nid != PTL_NID_ANY)
+        if (nid != LNET_NID_ANY)
                 lo = hi = kranal_nid2peerlist(nid) - kranal_data.kra_peers;
         else {
                 lo = 0;
@@ -1515,17 +1142,14 @@ kranal_del_peer (ptl_nid_t nid, int single_share)
                         LASSERT (peer->rap_persistence > 0 ||
                                  !list_empty(&peer->rap_conns));
 
-                        if (!(nid == PTL_NID_ANY || peer->rap_nid == nid))
+                        if (!(nid == LNET_NID_ANY || peer->rap_nid == nid))
                                 continue;
 
-                        kranal_del_peer_locked(peer, single_share);
+                        kranal_del_peer_locked(peer);
                         rc = 0;         /* matched something */
-
-                        if (single_share)
-                                goto out;
                 }
         }
- out:
+
         write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
 
         return rc;
@@ -1554,8 +1178,8 @@ kranal_get_conn_by_idx (int index)
                                         continue;
 
                                 conn = list_entry(ctmp, kra_conn_t, rac_list);
-                                CDEBUG(D_NET, "++conn[%p] -> "LPX64" (%d)\n",
-                                       conn, conn->rac_peer->rap_nid,
+                                CDEBUG(D_NET, "++conn[%p] -> %s (%d)\n", conn, 
+                                       libcfs_nid2str(conn->rac_peer->rap_nid),
                                        atomic_read(&conn->rac_refcount));
                                 atomic_inc(&conn->rac_refcount);
                                 read_unlock(&kranal_data.kra_global_lock);
@@ -1587,7 +1211,7 @@ kranal_close_peer_conns_locked (kra_peer_t *peer, int why)
 }
 
 int
-kranal_close_matching_conns (ptl_nid_t nid)
+kranal_close_matching_conns (lnet_nid_t nid)
 {
         unsigned long       flags;
         kra_peer_t         *peer;
@@ -1600,7 +1224,7 @@ kranal_close_matching_conns (ptl_nid_t nid)
 
         write_lock_irqsave(&kranal_data.kra_global_lock, flags);
 
-        if (nid != PTL_NID_ANY)
+        if (nid != LNET_NID_ANY)
                 lo = hi = kranal_nid2peerlist(nid) - kranal_data.kra_peers;
         else {
                 lo = 0;
@@ -1614,7 +1238,7 @@ kranal_close_matching_conns (ptl_nid_t nid)
                         LASSERT (peer->rap_persistence > 0 ||
                                  !list_empty(&peer->rap_conns));
 
-                        if (!(nid == PTL_NID_ANY || nid == peer->rap_nid))
+                        if (!(nid == LNET_NID_ANY || nid == peer->rap_nid))
                                 continue;
 
                         count += kranal_close_peer_conns_locked(peer, 0);
@@ -1624,72 +1248,72 @@ kranal_close_matching_conns (ptl_nid_t nid)
         write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
 
         /* wildcards always succeed */
-        if (nid == PTL_NID_ANY)
+        if (nid == LNET_NID_ANY)
                 return 0;
 
         return (count == 0) ? -ENOENT : 0;
 }
 
 int
-kranal_cmd(struct portals_cfg *pcfg, void * private)
+kranal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
 {
-        int rc = -EINVAL;
+        struct libcfs_ioctl_data *data = arg;
+        int                       rc = -EINVAL;
 
-        LASSERT (pcfg != NULL);
+        LASSERT (ni == kranal_data.kra_ni);
 
-        switch(pcfg->pcfg_command) {
-        case NAL_CMD_GET_PEER: {
-                ptl_nid_t   nid = 0;
+        switch(cmd) {
+        case IOC_LIBCFS_GET_PEER: {
+                lnet_nid_t   nid = 0;
                 __u32       ip = 0;
                 int         port = 0;
                 int         share_count = 0;
 
-                rc = kranal_get_peer_info(pcfg->pcfg_count,
+                rc = kranal_get_peer_info(data->ioc_count,
                                           &nid, &ip, &port, &share_count);
-                pcfg->pcfg_nid   = nid;
-                pcfg->pcfg_size  = 0;
-                pcfg->pcfg_id    = ip;
-                pcfg->pcfg_misc  = port;
-                pcfg->pcfg_count = 0;
-                pcfg->pcfg_wait  = share_count;
+                data->ioc_nid    = nid;
+                data->ioc_count  = share_count;
+                data->ioc_u32[0] = ip;
+                data->ioc_u32[1] = port;
                 break;
         }
-        case NAL_CMD_ADD_PEER: {
-                rc = kranal_add_persistent_peer(pcfg->pcfg_nid,
-                                                pcfg->pcfg_id, /* IP */
-                                                pcfg->pcfg_misc); /* port */
+        case IOC_LIBCFS_ADD_PEER: {
+                rc = kranal_add_persistent_peer(data->ioc_nid,
+                                                data->ioc_u32[0], /* IP */
+                                                data->ioc_u32[1]); /* port */
                 break;
         }
-        case NAL_CMD_DEL_PEER: {
-                rc = kranal_del_peer(pcfg->pcfg_nid,
-                                     /* flags == single_share */
-                                     pcfg->pcfg_flags != 0);
+        case IOC_LIBCFS_DEL_PEER: {
+                rc = kranal_del_peer(data->ioc_nid);
                 break;
         }
-        case NAL_CMD_GET_CONN: {
-                kra_conn_t *conn = kranal_get_conn_by_idx(pcfg->pcfg_count);
+        case IOC_LIBCFS_GET_CONN: {
+                kra_conn_t *conn = kranal_get_conn_by_idx(data->ioc_count);
 
                 if (conn == NULL)
                         rc = -ENOENT;
                 else {
                         rc = 0;
-                        pcfg->pcfg_nid   = conn->rac_peer->rap_nid;
-                        pcfg->pcfg_id    = conn->rac_device->rad_id;
-                        pcfg->pcfg_misc  = 0;
-                        pcfg->pcfg_flags = 0;
+                        data->ioc_nid    = conn->rac_peer->rap_nid;
+                        data->ioc_u32[0] = conn->rac_device->rad_id;
                         kranal_conn_decref(conn);
                 }
                 break;
         }
-        case NAL_CMD_CLOSE_CONNECTION: {
-                rc = kranal_close_matching_conns(pcfg->pcfg_nid);
+        case IOC_LIBCFS_CLOSE_CONNECTION: {
+                rc = kranal_close_matching_conns(data->ioc_nid);
                 break;
         }
-        case NAL_CMD_REGISTER_MYNID: {
-                if (pcfg->pcfg_nid == PTL_NID_ANY)
+        case IOC_LIBCFS_REGISTER_MYNID: {
+                /* Ignore if this is a noop */
+                if (data->ioc_nid == ni->ni_nid) {
+                        rc = 0;
+                } else {
+                        CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+                               libcfs_nid2str(data->ioc_nid),
+                               libcfs_nid2str(ni->ni_nid));
                         rc = -EINVAL;
-                else
-                        rc = kranal_set_mynid(pcfg->pcfg_nid);
+                }
                 break;
         }
         }
@@ -1706,44 +1330,39 @@ kranal_free_txdescs(struct list_head *freelist)
                 tx = list_entry(freelist->next, kra_tx_t, tx_list);
 
                 list_del(&tx->tx_list);
-                PORTAL_FREE(tx->tx_phys, PTL_MD_MAX_IOV * sizeof(*tx->tx_phys));
-                PORTAL_FREE(tx, sizeof(*tx));
+                LIBCFS_FREE(tx->tx_phys, LNET_MAX_IOV * sizeof(*tx->tx_phys));
+                LIBCFS_FREE(tx, sizeof(*tx));
         }
 }
 
 int
 kranal_alloc_txdescs(struct list_head *freelist, int n)
 {
-        int            isnblk = (freelist == &kranal_data.kra_idle_nblk_txs);
         int            i;
         kra_tx_t      *tx;
 
-        LASSERT (freelist == &kranal_data.kra_idle_txs ||
-                 freelist == &kranal_data.kra_idle_nblk_txs);
+        LASSERT (freelist == &kranal_data.kra_idle_txs);
         LASSERT (list_empty(freelist));
 
         for (i = 0; i < n; i++) {
 
-                PORTAL_ALLOC(tx, sizeof(*tx));
+                LIBCFS_ALLOC(tx, sizeof(*tx));
                 if (tx == NULL) {
-                        CERROR("Can't allocate %stx[%d]\n",
-                               isnblk ? "nblk " : "", i);
+                        CERROR("Can't allocate tx[%d]\n", i);
                         kranal_free_txdescs(freelist);
                         return -ENOMEM;
                 }
 
-                PORTAL_ALLOC(tx->tx_phys,
-                             PTL_MD_MAX_IOV * sizeof(*tx->tx_phys));
+                LIBCFS_ALLOC(tx->tx_phys,
+                             LNET_MAX_IOV * sizeof(*tx->tx_phys));
                 if (tx->tx_phys == NULL) {
-                        CERROR("Can't allocate %stx[%d]->tx_phys\n",
-                               isnblk ? "nblk " : "", i);
+                        CERROR("Can't allocate tx[%d]->tx_phys\n", i);
 
-                        PORTAL_FREE(tx, sizeof(*tx));
+                        LIBCFS_FREE(tx, sizeof(*tx));
                         kranal_free_txdescs(freelist);
                         return -ENOMEM;
                 }
 
-                tx->tx_isnblk = isnblk;
                 tx->tx_buftype = RANAL_BUF_NONE;
                 tx->tx_msg.ram_type = RANAL_MSG_NONE;
 
@@ -1756,7 +1375,7 @@ kranal_alloc_txdescs(struct list_head *freelist, int n)
 int
 kranal_device_init(int id, kra_device_t *dev)
 {
-        const int         total_ntx = RANAL_NTX + RANAL_NTX_NBLK;
+        int               total_ntx = *kranal_tunables.kra_ntx;
         RAP_RETURN        rrc;
 
         dev->rad_id = id;
@@ -1777,16 +1396,17 @@ kranal_device_init(int id, kra_device_t *dev)
         rrc = RapkCreateCQ(dev->rad_handle, total_ntx, RAP_CQTYPE_SEND,
                            &dev->rad_rdma_cqh);
         if (rrc != RAP_SUCCESS) {
-                CERROR("Can't create rdma cq size %d"
-                       " for device %d: %d\n", total_ntx, id, rrc);
+                CERROR("Can't create rdma cq size %d for device %d: %d\n",
+                       total_ntx, id, rrc);
                 goto failed_1;
         }
 
-        rrc = RapkCreateCQ(dev->rad_handle, RANAL_FMA_CQ_SIZE, RAP_CQTYPE_RECV,
-                           &dev->rad_fma_cqh);
+        rrc = RapkCreateCQ(dev->rad_handle, 
+                           *kranal_tunables.kra_fma_cq_size, 
+                           RAP_CQTYPE_RECV, &dev->rad_fma_cqh);
         if (rrc != RAP_SUCCESS) {
-                CERROR("Can't create fma cq size %d"
-                       " for device %d: %d\n", RANAL_FMA_CQ_SIZE, id, rrc);
+                CERROR("Can't create fma cq size %d for device %d: %d\n", 
+                       *kranal_tunables.kra_fma_cq_size, id, rrc);
                 goto failed_2;
         }
 
@@ -1803,6 +1423,13 @@ kranal_device_init(int id, kra_device_t *dev)
 void
 kranal_device_fini(kra_device_t *dev)
 {
+        LASSERT (list_empty(&dev->rad_ready_conns));
+        LASSERT (list_empty(&dev->rad_new_conns));
+        LASSERT (dev->rad_nphysmap == 0);
+        LASSERT (dev->rad_nppphysmap == 0);
+        LASSERT (dev->rad_nvirtmap == 0);
+        LASSERT (dev->rad_nobvirtmap == 0);
+                
         LASSERT(dev->rad_scheduler == NULL);
         RapkDestroyCQ(dev->rad_handle, dev->rad_fma_cqh);
         RapkDestroyCQ(dev->rad_handle, dev->rad_rdma_cqh);
@@ -1810,21 +1437,16 @@ kranal_device_fini(kra_device_t *dev)
 }
 
 void
-kranal_api_shutdown (nal_t *nal)
+kranal_shutdown (lnet_ni_t *ni)
 {
         int           i;
         unsigned long flags;
 
-        if (nal->nal_refct != 0) {
-                /* This module got the first ref */
-                PORTAL_MODULE_UNUSE;
-                return;
-        }
-
         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
-               atomic_read(&portal_kmemory));
+               atomic_read(&libcfs_kmemory));
 
-        LASSERT (nal == &kranal_api);
+        LASSERT (ni == kranal_data.kra_ni);
+        LASSERT (ni->ni_data == &kranal_data);
 
         switch (kranal_data.kra_init) {
         default:
@@ -1832,54 +1454,57 @@ kranal_api_shutdown (nal_t *nal)
                 LBUG();
 
         case RANAL_INIT_ALL:
-                /* stop calls to nal_cmd */
-                libcfs_nal_cmd_unregister(RANAL);
-                /* No new persistent peers */
-
-                /* resetting my NID to unadvertises me, removes my
-                 * listener and nukes all current peers */
-                kranal_set_mynid(PTL_NID_ANY);
-                /* no new peers or conns */
+                /* Prevent new peers from being created */
+                write_lock_irqsave(&kranal_data.kra_global_lock, flags);
+                kranal_data.kra_nonewpeers = 1;
+                write_unlock_irqrestore(&kranal_data.kra_global_lock, flags);
+                
+                /* Remove all existing peers from the peer table */
+                kranal_del_peer(LNET_NID_ANY);
 
-                /* Wait for all peer/conn state to clean up */
+                /* Wait for pending conn reqs to be handled */
                 i = 2;
-                while (atomic_read(&kranal_data.kra_nconns) != 0 ||
-                       atomic_read(&kranal_data.kra_npeers) != 0) {
+                spin_lock_irqsave(&kranal_data.kra_connd_lock, flags);
+                while (!list_empty(&kranal_data.kra_connd_acceptq)) {
+                        spin_unlock_irqrestore(&kranal_data.kra_connd_lock, 
+                                               flags);
                         i++;
-                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
-                               "waiting for %d peers and %d conns to close down\n",
-                               atomic_read(&kranal_data.kra_npeers),
-                               atomic_read(&kranal_data.kra_nconns));
-                        kranal_pause(HZ);
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */
+                               "waiting for conn reqs to clean up\n");
+                        cfs_pause(cfs_time_seconds(1));
+
+                        spin_lock_irqsave(&kranal_data.kra_connd_lock, flags);
                 }
-                /* fall through */
+                spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags);
 
-        case RANAL_INIT_LIB:
-                lib_fini(&kranal_lib);
+                /* Wait for all peers to be freed */
+                i = 2;
+                while (atomic_read(&kranal_data.kra_npeers) != 0) {
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */
+                               "waiting for %d peers to close down\n",
+                               atomic_read(&kranal_data.kra_npeers));
+                        cfs_pause(cfs_time_seconds(1));
+                }
                 /* fall through */
 
         case RANAL_INIT_DATA:
                 break;
         }
 
-        /* Conn/Peer state all cleaned up BEFORE setting shutdown, so threads
-         * don't have to worry about shutdown races */
-        LASSERT (atomic_read(&kranal_data.kra_nconns) == 0);
+        /* Peer state all cleaned up BEFORE setting shutdown, so threads don't
+         * have to worry about shutdown races.  NB connections may be created
+         * while there are still active connds, but these will be temporary
+         * since peer creation always fails after the listener has started to
+         * shut down. */
         LASSERT (atomic_read(&kranal_data.kra_npeers) == 0);
         
-        /* flag threads to terminate; wake and wait for them to die */
+        /* Flag threads to terminate */
         kranal_data.kra_shutdown = 1;
 
         for (i = 0; i < kranal_data.kra_ndevs; i++) {
                 kra_device_t *dev = &kranal_data.kra_devices[i];
 
-                LASSERT (list_empty(&dev->rad_ready_conns));
-                LASSERT (list_empty(&dev->rad_new_conns));
-                LASSERT (dev->rad_nphysmap == 0);
-                LASSERT (dev->rad_nppphysmap == 0);
-                LASSERT (dev->rad_nvirtmap == 0);
-                LASSERT (dev->rad_nobvirtmap == 0);
-                
                 spin_lock_irqsave(&dev->rad_lock, flags);
                 wake_up(&dev->rad_waitq);
                 spin_unlock_irqrestore(&dev->rad_lock, flags);
@@ -1894,13 +1519,14 @@ kranal_api_shutdown (nal_t *nal)
         wake_up_all(&kranal_data.kra_connd_waitq);
         spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags);
 
+        /* Wait for threads to exit */
         i = 2;
         while (atomic_read(&kranal_data.kra_nthreads) != 0) {
                 i++;
                 CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                        "Waiting for %d threads to terminate\n",
                        atomic_read(&kranal_data.kra_nthreads));
-                kranal_pause(HZ);
+                cfs_pause(cfs_time_seconds(1));
         }
 
         LASSERT (atomic_read(&kranal_data.kra_npeers) == 0);
@@ -1908,7 +1534,7 @@ kranal_api_shutdown (nal_t *nal)
                 for (i = 0; i < kranal_data.kra_peer_hash_size; i++)
                         LASSERT (list_empty(&kranal_data.kra_peers[i]));
 
-                PORTAL_FREE(kranal_data.kra_peers,
+                LIBCFS_FREE(kranal_data.kra_peers,
                             sizeof (struct list_head) *
                             kranal_data.kra_peer_hash_size);
         }
@@ -1918,7 +1544,7 @@ kranal_api_shutdown (nal_t *nal)
                 for (i = 0; i < kranal_data.kra_conn_hash_size; i++)
                         LASSERT (list_empty(&kranal_data.kra_conns[i]));
 
-                PORTAL_FREE(kranal_data.kra_conns,
+                LIBCFS_FREE(kranal_data.kra_conns,
                             sizeof (struct list_head) *
                             kranal_data.kra_conn_hash_size);
         }
@@ -1927,42 +1553,51 @@ kranal_api_shutdown (nal_t *nal)
                 kranal_device_fini(&kranal_data.kra_devices[i]);
 
         kranal_free_txdescs(&kranal_data.kra_idle_txs);
-        kranal_free_txdescs(&kranal_data.kra_idle_nblk_txs);
 
         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
-               atomic_read(&portal_kmemory));
-        printk(KERN_INFO "Lustre: RapidArray NAL unloaded (final mem %d)\n",
-               atomic_read(&portal_kmemory));
+               atomic_read(&libcfs_kmemory));
 
         kranal_data.kra_init = RANAL_INIT_NOTHING;
+        PORTAL_MODULE_UNUSE;
 }
 
 int
-kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
-                    ptl_ni_limits_t *requested_limits,
-                    ptl_ni_limits_t *actual_limits)
+kranal_startup (lnet_ni_t *ni)
 {
         struct timeval    tv;
-        ptl_process_id_t  process_id;
-        int               pkmem = atomic_read(&portal_kmemory);
+        int               pkmem = atomic_read(&libcfs_kmemory);
         int               rc;
         int               i;
         kra_device_t     *dev;
 
-        LASSERT (nal == &kranal_api);
+        LASSERT (ni->ni_lnd == &the_kralnd);
 
-        if (nal->nal_refct != 0) {
-                if (actual_limits != NULL)
-                        *actual_limits = kranal_lib.libnal_ni.ni_actual_limits;
-                /* This module got the first ref */
-                PORTAL_MODULE_USE;
-                return PTL_OK;
+        /* Only 1 instance supported */
+        if (kranal_data.kra_init != RANAL_INIT_NOTHING) {
+                CERROR ("Only 1 instance supported\n");
+                return -EPERM;
         }
 
-        LASSERT (kranal_data.kra_init == RANAL_INIT_NOTHING);
+        if (lnet_set_ip_niaddr(ni) != 0) {
+                CERROR ("Can't determine my NID\n");
+                return -EPERM;
+        }
 
+        if (*kranal_tunables.kra_credits > *kranal_tunables.kra_ntx) {
+                CERROR ("Can't set credits(%d) > ntx(%d)\n",
+                        *kranal_tunables.kra_credits,
+                        *kranal_tunables.kra_ntx);
+                return -EINVAL;
+        }
+        
         memset(&kranal_data, 0, sizeof(kranal_data)); /* zero pointers, flags etc */
 
+        ni->ni_maxtxcredits = *kranal_tunables.kra_credits;
+        ni->ni_peertxcredits = *kranal_tunables.kra_peercredits;
+
+        ni->ni_data = &kranal_data;
+        kranal_data.kra_ni = ni;
+
         /* CAVEAT EMPTOR: Every 'Fma' message includes the sender's NID and
          * a unique (for all time) connstamp so we can uniquely identify
          * the sender.  The connstamp is an incrementing counter
@@ -1973,9 +1608,6 @@ kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         kranal_data.kra_connstamp =
         kranal_data.kra_peerstamp = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
 
-        init_MUTEX(&kranal_data.kra_nid_mutex);
-        init_MUTEX_LOCKED(&kranal_data.kra_listener_signal);
-
         rwlock_init(&kranal_data.kra_global_lock);
 
         for (i = 0; i < RANAL_MAXDEVS; i++ ) {
@@ -1998,15 +1630,14 @@ kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         spin_lock_init(&kranal_data.kra_connd_lock);
 
         INIT_LIST_HEAD(&kranal_data.kra_idle_txs);
-        INIT_LIST_HEAD(&kranal_data.kra_idle_nblk_txs);
-        init_waitqueue_head(&kranal_data.kra_idle_tx_waitq);
         spin_lock_init(&kranal_data.kra_tx_lock);
 
         /* OK to call kranal_api_shutdown() to cleanup now */
         kranal_data.kra_init = RANAL_INIT_DATA;
+        PORTAL_MODULE_USE;
 
         kranal_data.kra_peer_hash_size = RANAL_PEER_HASH_SIZE;
-        PORTAL_ALLOC(kranal_data.kra_peers,
+        LIBCFS_ALLOC(kranal_data.kra_peers,
                      sizeof(struct list_head) * kranal_data.kra_peer_hash_size);
         if (kranal_data.kra_peers == NULL)
                 goto failed;
@@ -2015,7 +1646,7 @@ kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                 INIT_LIST_HEAD(&kranal_data.kra_peers[i]);
 
         kranal_data.kra_conn_hash_size = RANAL_PEER_HASH_SIZE;
-        PORTAL_ALLOC(kranal_data.kra_conns,
+        LIBCFS_ALLOC(kranal_data.kra_conns,
                      sizeof(struct list_head) * kranal_data.kra_conn_hash_size);
         if (kranal_data.kra_conns == NULL)
                 goto failed;
@@ -2023,35 +1654,18 @@ kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         for (i = 0; i < kranal_data.kra_conn_hash_size; i++)
                 INIT_LIST_HEAD(&kranal_data.kra_conns[i]);
 
-        rc = kranal_alloc_txdescs(&kranal_data.kra_idle_txs, RANAL_NTX);
+        rc = kranal_alloc_txdescs(&kranal_data.kra_idle_txs, 
+                                  *kranal_tunables.kra_ntx);
         if (rc != 0)
                 goto failed;
 
-        rc = kranal_alloc_txdescs(&kranal_data.kra_idle_nblk_txs,RANAL_NTX_NBLK);
-        if (rc != 0)
-                goto failed;
-
-        process_id.pid = requested_pid;
-        process_id.nid = PTL_NID_ANY;           /* don't know my NID yet */
-
-        rc = lib_init(&kranal_lib, nal, process_id,
-                      requested_limits, actual_limits);
-        if (rc != PTL_OK) {
-                CERROR("lib_init failed: error %d\n", rc);
-                goto failed;
-        }
-
-        /* lib interface initialised */
-        kranal_data.kra_init = RANAL_INIT_LIB;
-        /*****************************************************/
-
         rc = kranal_thread_start(kranal_reaper, NULL);
         if (rc != 0) {
                 CERROR("Can't spawn ranal reaper: %d\n", rc);
                 goto failed;
         }
 
-        for (i = 0; i < RANAL_N_CONND; i++) {
+        for (i = 0; i < *kranal_tunables.kra_n_connd; i++) {
                 rc = kranal_thread_start(kranal_connd, (void *)(unsigned long)i);
                 if (rc != 0) {
                         CERROR("Can't spawn ranal connd[%d]: %d\n",
@@ -2062,16 +1676,15 @@ kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
 
         LASSERT (kranal_data.kra_ndevs == 0);
 
-        for (i = 0; i < sizeof(kranal_devids)/sizeof(kranal_devids[0]); i++) {
-                LASSERT (i < RANAL_MAXDEVS);
-
+        /* Use all available RapidArray devices */
+        for (i = 0; i < RANAL_MAXDEVS; i++) {
                 dev = &kranal_data.kra_devices[kranal_data.kra_ndevs];
 
                 rc = kranal_device_init(kranal_devids[i], dev);
                 if (rc == 0)
                         kranal_data.kra_ndevs++;
         }
-        
+
         if (kranal_data.kra_ndevs == 0) {
                 CERROR("Can't initialise any RapidArray devices\n");
                 goto failed;
@@ -2087,36 +1700,23 @@ kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                 }
         }
 
-        rc = libcfs_nal_cmd_register(RANAL, &kranal_cmd, NULL);
-        if (rc != 0) {
-                CERROR("Can't initialise command interface (rc = %d)\n", rc);
-                goto failed;
-        }
-
         /* flag everything initialised */
         kranal_data.kra_init = RANAL_INIT_ALL;
         /*****************************************************/
 
-        CDEBUG(D_MALLOC, "initial kmem %d\n", atomic_read(&portal_kmemory));
-        printk(KERN_INFO "Lustre: RapidArray NAL loaded "
-               "(initial mem %d)\n", pkmem);
-
-        return PTL_OK;
+        CDEBUG(D_MALLOC, "initial kmem %d\n", pkmem);
+        return 0;
 
  failed:
-        kranal_api_shutdown(&kranal_api);
-        return PTL_FAIL;
+        kranal_shutdown(ni);
+        return -ENETDOWN;
 }
 
 void __exit
 kranal_module_fini (void)
 {
-        if (kranal_tunables.kra_sysctl != NULL)
-                unregister_sysctl_table(kranal_tunables.kra_sysctl);
-
-        PtlNIFini(kranal_ni);
-
-        ptl_unregister_nal(RANAL);
+        lnet_unregister_lnd(&the_kralnd);
+        kranal_tunables_fini();
 }
 
 int __init
@@ -2124,51 +1724,17 @@ kranal_module_init (void)
 {
         int    rc;
 
-        /* the following must be sizeof(int) for
-         * proc_dointvec/kranal_listener_procint() */
-        LASSERT (sizeof(kranal_tunables.kra_timeout) == sizeof(int));
-        LASSERT (sizeof(kranal_tunables.kra_listener_timeout) == sizeof(int));
-        LASSERT (sizeof(kranal_tunables.kra_backlog) == sizeof(int));
-        LASSERT (sizeof(kranal_tunables.kra_port) == sizeof(int));
-        LASSERT (sizeof(kranal_tunables.kra_max_immediate) == sizeof(int));
-
-        kranal_api.nal_ni_init = kranal_api_startup;
-        kranal_api.nal_ni_fini = kranal_api_shutdown;
-
-        /* Initialise dynamic tunables to defaults once only */
-        kranal_tunables.kra_timeout = RANAL_TIMEOUT;
-        kranal_tunables.kra_listener_timeout = RANAL_LISTENER_TIMEOUT;
-        kranal_tunables.kra_backlog = RANAL_BACKLOG;
-        kranal_tunables.kra_port = RANAL_PORT;
-        kranal_tunables.kra_max_immediate = RANAL_MAX_IMMEDIATE;
-
-        rc = ptl_register_nal(RANAL, &kranal_api);
-        if (rc != PTL_OK) {
-                CERROR("Can't register RANAL: %d\n", rc);
-                return -ENOMEM;               /* or something... */
-        }
-
-        /* Pure gateways want the NAL started up at module load time... */
-        rc = PtlNIInit(RANAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kranal_ni);
-        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
-                ptl_unregister_nal(RANAL);
-                return -ENODEV;
-        }
-
-        kranal_tunables.kra_sysctl =
-                register_sysctl_table(kranal_top_ctl_table, 0);
-        if (kranal_tunables.kra_sysctl == NULL) {
-                CERROR("Can't register sysctl table\n");
-                PtlNIFini(kranal_ni);
-                ptl_unregister_nal(RANAL);
-                return -ENOMEM;
-        }
+        rc = kranal_tunables_init();
+        if (rc != 0)
+                return rc;
+
+        lnet_register_lnd(&the_kralnd);
 
         return 0;
 }
 
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel RapidArray NAL v0.01");
+MODULE_DESCRIPTION("Kernel RapidArray LND v0.01");
 MODULE_LICENSE("GPL");
 
 module_init(kranal_module_init);
index aa269c3..300cf40 100644 (file)
 #include <net/sock.h>
 #include <linux/in.h>
 
-#define DEBUG_SUBSYSTEM S_NAL
+#define DEBUG_SUBSYSTEM S_LND
 
 #include <libcfs/kp30.h>
-#include <portals/p30.h>
-#include <portals/lib-p30.h>
-#include <portals/nal.h>
+#include <lnet/lnet.h>
+#include <lnet/lib-lnet.h>
 
 #include <rapl.h>
 
-#define RANAL_MAXDEVS       2                   /* max # devices RapidArray supports */
+/* tunables determined at compile time */
+#define RANAL_RESCHED             100           /* # scheduler loops before reschedule */
 
-#define RANAL_N_CONND       4                   /* # connection daemons */
+#define RANAL_PEER_HASH_SIZE      101           /* # peer lists */
+#define RANAL_CONN_HASH_SIZE      101           /* # conn lists */
 
-#define RANAL_MIN_RECONNECT_INTERVAL 1          /* first failed connection retry (seconds)... */
-#define RANAL_MAX_RECONNECT_INTERVAL 60         /* ...exponentially increasing to this */
+#define RANAL_MIN_TIMEOUT         5             /* minimum timeout interval (seconds) */
+#define RANAL_TIMEOUT2KEEPALIVE(t) (((t)+1)/2)  /* timeout -> keepalive interval */
 
-#define RANAL_FMA_MAX_PREFIX      232           /* max size of FMA "Prefix" */
+/* fixed constants */
+#define RANAL_MAXDEVS             2             /* max # devices RapidArray supports */
+#define RANAL_FMA_MAX_PREFIX      232           /* max bytes in FMA "Prefix" we can use */
 #define RANAL_FMA_MAX_DATA        ((7<<10)-256) /* Max FMA MSG is 7K including prefix */
 
-#define RANAL_PEER_HASH_SIZE  101               /* # peer lists */
-#define RANAL_CONN_HASH_SIZE  101               /* # conn lists */
-
-#define RANAL_NTX             64                /* # tx descs */
-#define RANAL_NTX_NBLK        256               /* # reserved tx descs */
-
-#define RANAL_FMA_CQ_SIZE     8192              /* # entries in receive CQ
-                                                 * (overflow is a performance hit) */
-
-#define RANAL_RESCHED         100               /* # scheduler loops before reschedule */
-
-#define RANAL_MIN_TIMEOUT     5                 /* minimum timeout interval (seconds) */
-#define RANAL_TIMEOUT2KEEPALIVE(t) (((t)+1)/2)  /* timeout -> keepalive interval */
-
-/* default vals for runtime tunables */
-#define RANAL_TIMEOUT           30              /* comms timeout (seconds) */
-#define RANAL_LISTENER_TIMEOUT   5              /* listener timeout (seconds) */
-#define RANAL_BACKLOG          127              /* listener's backlog */
-#define RANAL_PORT             988              /* listener's port */
-#define RANAL_MAX_IMMEDIATE    (2<<10)          /* immediate payload breakpoint */
 
 typedef struct
 {
-        int               kra_timeout;          /* comms timeout (seconds) */
-        int               kra_listener_timeout; /* max time the listener can block */
-        int               kra_backlog;          /* listener's backlog */
-        int               kra_port;             /* listener's TCP/IP port */
-        int               kra_max_immediate;    /* immediate payload breakpoint */
-
+        int              *kra_n_connd;          /* # connection daemons */
+        int              *kra_min_reconnect_interval; /* first failed connection retry... */
+        int              *kra_max_reconnect_interval; /* ...exponentially increasing to this */
+        int              *kra_ntx;              /* # tx descs */
+        int              *kra_credits;          /* # concurrent sends */
+        int              *kra_peercredits;      /* # concurrent sends to 1 peer */
+        int              *kra_fma_cq_size;      /* # entries in receive CQ */
+        int              *kra_timeout;          /* comms timeout (seconds) */
+        int              *kra_max_immediate;    /* immediate payload breakpoint */
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
         struct ctl_table_header *kra_sysctl;    /* sysctl interface */
+#endif
 } kra_tunables_t;
 
 typedef struct
@@ -126,12 +115,8 @@ typedef struct
         int               kra_init;             /* initialisation state */
         int               kra_shutdown;         /* shut down? */
         atomic_t          kra_nthreads;         /* # live threads */
-
-        struct semaphore  kra_nid_mutex;        /* serialise NID/listener ops */
-        struct semaphore  kra_listener_signal;  /* block for listener startup/shutdown */
-        struct socket    *kra_listener_sock;    /* listener's socket */
-        int               kra_listener_shutdown; /* ask listener to close */
-
+        lnet_ni_t        *kra_ni;               /* _the_ nal instance */
+        
         kra_device_t      kra_devices[RANAL_MAXDEVS]; /* device/ptag/cq etc */
         int               kra_ndevs;            /* # devices */
 
@@ -140,6 +125,7 @@ typedef struct
         struct list_head *kra_peers;            /* hash table of all my known peers */
         int               kra_peer_hash_size;   /* size of kra_peers */
         atomic_t          kra_npeers;           /* # peers extant */
+        int               kra_nonewpeers;       /* prevent new peers */
 
         struct list_head *kra_conns;            /* conns hashed by cqid */
         int               kra_conn_hash_size;   /* size of kra_conns */
@@ -158,16 +144,13 @@ typedef struct
         spinlock_t        kra_connd_lock;       /* serialise */
 
         struct list_head  kra_idle_txs;         /* idle tx descriptors */
-        struct list_head  kra_idle_nblk_txs;    /* idle reserved tx descriptors */
         __u64             kra_next_tx_cookie;   /* RDMA completion cookie */
-        wait_queue_head_t kra_idle_tx_waitq;    /* block here for tx descriptor */
         spinlock_t        kra_tx_lock;          /* serialise */
 } kra_data_t;
 
 #define RANAL_INIT_NOTHING         0
 #define RANAL_INIT_DATA            1
-#define RANAL_INIT_LIB             2
-#define RANAL_INIT_ALL             3
+#define RANAL_INIT_ALL             2
 
 typedef struct kra_acceptsock                   /* accepted socket queued for connd */
 {
@@ -202,13 +185,13 @@ typedef struct
 
 typedef struct
 {
-        ptl_hdr_t         raim_hdr;             /* portals header */
+        lnet_hdr_t        raim_hdr;             /* portals header */
         /* Portals payload is in FMA "Message Data" */
 } kra_immediate_msg_t;
 
 typedef struct
 {
-        ptl_hdr_t         raprm_hdr;            /* portals header */
+        lnet_hdr_t        raprm_hdr;            /* portals header */
         __u64             raprm_cookie;         /* opaque completion cookie */
 } kra_putreq_msg_t;
 
@@ -221,7 +204,7 @@ typedef struct
 
 typedef struct
 {
-        ptl_hdr_t         ragm_hdr;             /* portals header */
+        lnet_hdr_t        ragm_hdr;             /* portals header */
         __u64             ragm_cookie;          /* opaque completion cookie */
         kra_rdma_desc_t   ragm_desc;            /* sender's sink buffer */
 } kra_get_msg_t;
@@ -248,7 +231,7 @@ typedef struct                                  /* NB must fit in FMA "Prefix" *
         __u32             ram_seq;              /* incrementing sequence number */
 } kra_msg_t;
 
-#define RANAL_MSG_MAGIC       0x0be91b92        /* unique magic */
+#define RANAL_MSG_MAGIC     LNET_PROTO_RA_MAGIC /* unique magic */
 #define RANAL_MSG_VERSION              1        /* current protocol version */
 
 #define RANAL_MSG_FENCE             0x80        /* fence RDMA */
@@ -271,9 +254,8 @@ typedef struct kra_tx                           /* message descriptor */
 {
         struct list_head          tx_list;      /* queue on idle_txs/rac_sendq/rac_waitq */
         struct kra_conn          *tx_conn;      /* owning conn */
-        lib_msg_t                *tx_libmsg[2]; /* lib msgs to finalize on completion */
+        lnet_msg_t               *tx_lntmsg[2]; /* ptl msgs to finalize on completion */
         unsigned long             tx_qtime;     /* when tx started to wait for something (jiffies) */
-        int                       tx_isnblk;    /* I'm reserved for non-blocking sends */
         int                       tx_nob;       /* # bytes of payload */
         int                       tx_buftype;   /* payload buffer type */
         void                     *tx_buffer;    /* source/sink buffer */
@@ -334,7 +316,7 @@ typedef struct kra_peer
         struct list_head    rap_connd_list;     /* schedule on kra_connd_peers */
         struct list_head    rap_conns;          /* all active connections */
         struct list_head    rap_tx_queue;       /* msgs waiting for a conn */
-        ptl_nid_t           rap_nid;            /* who's on the other end(s) */
+        lnet_nid_t           rap_nid;            /* who's on the other end(s) */
         __u32               rap_ip;             /* IP address of peer */
         int                 rap_port;           /* port on which peer listens */
         atomic_t            rap_refcount;       /* # users */
@@ -344,20 +326,6 @@ typedef struct kra_peer
         unsigned long       rap_reconnect_interval; /* exponential backoff */
 } kra_peer_t;
 
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
-# define sk_allocation  allocation
-# define sk_data_ready  data_ready
-# define sk_write_space write_space
-# define sk_user_data   user_data
-# define sk_prot        prot
-# define sk_sndbuf      sndbuf
-# define sk_socket      socket
-# define sk_wmem_queued wmem_queued
-# define sk_err         err
-# define sk_sleep       sleep
-#endif
-
-extern lib_nal_t       kranal_lib;
 extern kra_data_t      kranal_data;
 extern kra_tunables_t  kranal_tunables;
 
@@ -367,7 +335,7 @@ extern void kranal_destroy_conn(kra_conn_t *conn);
 static inline void
 kranal_peer_addref(kra_peer_t *peer)
 {
-        CDEBUG(D_NET, "%p->"LPX64"\n", peer, peer->rap_nid);
+        CDEBUG(D_NET, "%p->%s\n", peer, libcfs_nid2str(peer->rap_nid));
         LASSERT(atomic_read(&peer->rap_refcount) > 0);
         atomic_inc(&peer->rap_refcount);
 }
@@ -375,14 +343,14 @@ kranal_peer_addref(kra_peer_t *peer)
 static inline void
 kranal_peer_decref(kra_peer_t *peer)
 {
-        CDEBUG(D_NET, "%p->"LPX64"\n", peer, peer->rap_nid);
+        CDEBUG(D_NET, "%p->%s\n", peer, libcfs_nid2str(peer->rap_nid));
         LASSERT(atomic_read(&peer->rap_refcount) > 0);
         if (atomic_dec_and_test(&peer->rap_refcount))
                 kranal_destroy_peer(peer);
 }
 
 static inline struct list_head *
-kranal_nid2peerlist (ptl_nid_t nid)
+kranal_nid2peerlist (lnet_nid_t nid)
 {
         unsigned int hash = ((unsigned int)nid) % kranal_data.kra_peer_hash_size;
 
@@ -399,7 +367,8 @@ kranal_peer_active(kra_peer_t *peer)
 static inline void
 kranal_conn_addref(kra_conn_t *conn)
 {
-        CDEBUG(D_NET, "%p->"LPX64"\n", conn, conn->rac_peer->rap_nid);
+        CDEBUG(D_NET, "%p->%s\n", conn, 
+               libcfs_nid2str(conn->rac_peer->rap_nid));
         LASSERT(atomic_read(&conn->rac_refcount) > 0);
         atomic_inc(&conn->rac_refcount);
 }
@@ -407,7 +376,8 @@ kranal_conn_addref(kra_conn_t *conn)
 static inline void
 kranal_conn_decref(kra_conn_t *conn)
 {
-        CDEBUG(D_NET, "%p->"LPX64"\n", conn, conn->rac_peer->rap_nid);
+        CDEBUG(D_NET, "%p->%s\n", conn,
+               libcfs_nid2str(conn->rac_peer->rap_nid));
         LASSERT(atomic_read(&conn->rac_refcount) > 0);
         if (atomic_dec_and_test(&conn->rac_refcount))
                 kranal_destroy_conn(conn);
@@ -445,11 +415,17 @@ kranal_tx_mapped (kra_tx_t *tx)
                 tx->tx_buftype == RANAL_BUF_PHYS_MAPPED);
 }
 
-static inline __u64
-kranal_page2phys (struct page *p)
-{
-        return page_to_phys(p);
-}
+int kranal_startup (lnet_ni_t *ni);
+void kranal_shutdown (lnet_ni_t *ni);
+int kranal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int kranal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int kranal_eager_recv(lnet_ni_t *ni, void *private, 
+                        lnet_msg_t *lntmsg, void **new_private);
+int kranal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, 
+                int delayed, unsigned int niov, 
+                struct iovec *iov, lnet_kiov_t *kiov,
+                unsigned int offset, unsigned int mlen, unsigned int rlen);
+int kranal_accept(lnet_ni_t *ni, struct socket *sock);
 
 extern void kranal_free_acceptsock (kra_acceptsock_t *ras);
 extern int kranal_listener_procint (ctl_table *table,
@@ -459,17 +435,21 @@ extern void kranal_update_reaper_timeout (long timeout);
 extern void kranal_tx_done (kra_tx_t *tx, int completion);
 extern void kranal_unlink_peer_locked (kra_peer_t *peer);
 extern void kranal_schedule_conn (kra_conn_t *conn);
-extern kra_peer_t *kranal_create_peer (ptl_nid_t nid);
-extern kra_peer_t *kranal_find_peer_locked (ptl_nid_t nid);
+extern int kranal_create_peer (kra_peer_t **peerp, lnet_nid_t nid);
+extern int kranal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port);
+extern kra_peer_t *kranal_find_peer_locked (lnet_nid_t nid);
 extern void kranal_post_fma (kra_conn_t *conn, kra_tx_t *tx);
-extern int kranal_del_peer (ptl_nid_t nid, int single_share);
+extern int kranal_del_peer (lnet_nid_t nid);
 extern void kranal_device_callback (RAP_INT32 devid, RAP_PVOID arg);
 extern int kranal_thread_start (int(*fn)(void *arg), void *arg);
 extern int kranal_connd (void *arg);
 extern int kranal_reaper (void *arg);
 extern int kranal_scheduler (void *arg);
 extern void kranal_close_conn_locked (kra_conn_t *conn, int error);
+extern void kranal_close_conn (kra_conn_t *conn, int error);
 extern void kranal_terminate_conn_locked (kra_conn_t *conn);
 extern void kranal_connect (kra_peer_t *peer);
 extern int kranal_conn_handshake (struct socket *sock, kra_peer_t *peer);
-extern void kranal_pause(int ticks);
+extern int kranal_tunables_init(void);
+extern void kranal_tunables_fini(void);
+extern void kranal_init_msg(kra_msg_t *msg, int type);
index dd910ce..969efd2 100644 (file)
  *
  */
 
-#include "ranal.h"
-
-int
-kranal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
-{
-        /* I would guess that if kranal_get_peer (nid) == NULL,
-           and we're not routing, then 'nid' is very distant :) */
-        if ( nal->libnal_ni.ni_pid.nid == nid ) {
-                *dist = 0;
-        } else {
-                *dist = 1;
-        }
-
-        return 0;
-}
+#include "ralnd.h"
 
 void
 kranal_device_callback(RAP_INT32 devid, RAP_PVOID arg)
@@ -85,56 +71,33 @@ kranal_schedule_conn(kra_conn_t *conn)
 }
 
 kra_tx_t *
-kranal_get_idle_tx (int may_block)
+kranal_get_idle_tx (void)
 {
         unsigned long  flags;
-        kra_tx_t      *tx = NULL;
-
-        for (;;) {
-                spin_lock_irqsave(&kranal_data.kra_tx_lock, flags);
+        kra_tx_t      *tx;
 
-                /* "normal" descriptor is free */
-                if (!list_empty(&kranal_data.kra_idle_txs)) {
-                        tx = list_entry(kranal_data.kra_idle_txs.next,
-                                        kra_tx_t, tx_list);
-                        break;
-                }
-
-                if (!may_block) {
-                        /* may dip into reserve pool */
-                        if (list_empty(&kranal_data.kra_idle_nblk_txs)) {
-                                CERROR("reserved tx desc pool exhausted\n");
-                                break;
-                        }
-
-                        tx = list_entry(kranal_data.kra_idle_nblk_txs.next,
-                                        kra_tx_t, tx_list);
-                        break;
-                }
+        spin_lock_irqsave(&kranal_data.kra_tx_lock, flags);
 
-                /* block for idle tx */
+        if (list_empty(&kranal_data.kra_idle_txs)) {
                 spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags);
-
-                wait_event(kranal_data.kra_idle_tx_waitq,
-                           !list_empty(&kranal_data.kra_idle_txs));
+                return NULL;
         }
 
-        if (tx != NULL) {
-                list_del(&tx->tx_list);
+        tx = list_entry(kranal_data.kra_idle_txs.next, kra_tx_t, tx_list);
+        list_del(&tx->tx_list);
 
-                /* Allocate a new completion cookie.  It might not be
-                 * needed, but we've got a lock right now... */
-                tx->tx_cookie = kranal_data.kra_next_tx_cookie++;
-
-                LASSERT (tx->tx_buftype == RANAL_BUF_NONE);
-                LASSERT (tx->tx_msg.ram_type == RANAL_MSG_NONE);
-                LASSERT (tx->tx_conn == NULL);
-                LASSERT (tx->tx_libmsg[0] == NULL);
-                LASSERT (tx->tx_libmsg[1] == NULL);
-        }
+        /* Allocate a new completion cookie.  It might not be needed, but we've
+         * got a lock right now... */
+        tx->tx_cookie = kranal_data.kra_next_tx_cookie++;
 
         spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags);
 
+        LASSERT (tx->tx_buftype == RANAL_BUF_NONE);
+        LASSERT (tx->tx_msg.ram_type == RANAL_MSG_NONE);
+        LASSERT (tx->tx_conn == NULL);
+        LASSERT (tx->tx_lntmsg[0] == NULL);
+        LASSERT (tx->tx_lntmsg[1] == NULL);
+
         return tx;
 }
 
@@ -144,24 +107,24 @@ kranal_init_msg(kra_msg_t *msg, int type)
         msg->ram_magic = RANAL_MSG_MAGIC;
         msg->ram_version = RANAL_MSG_VERSION;
         msg->ram_type = type;
-        msg->ram_srcnid = kranal_lib.libnal_ni.ni_pid.nid;
+        msg->ram_srcnid = kranal_data.kra_ni->ni_nid;
         /* ram_connstamp gets set when FMA is sent */
 }
 
 kra_tx_t *
-kranal_new_tx_msg (int may_block, int type)
+kranal_new_tx_msg (int type)
 {
-        kra_tx_t *tx = kranal_get_idle_tx(may_block);
+        kra_tx_t *tx = kranal_get_idle_tx();
 
-        if (tx == NULL)
-                return NULL;
+        if (tx != NULL)
+                kranal_init_msg(&tx->tx_msg, type);
 
-        kranal_init_msg(&tx->tx_msg, type);
         return tx;
 }
 
 int
-kranal_setup_immediate_buffer (kra_tx_t *tx, int niov, struct iovec *iov,
+kranal_setup_immediate_buffer (kra_tx_t *tx, 
+                               unsigned int niov, struct iovec *iov,
                                int offset, int nob)
 
 {
@@ -198,7 +161,8 @@ kranal_setup_immediate_buffer (kra_tx_t *tx, int niov, struct iovec *iov,
 }
 
 int
-kranal_setup_virt_buffer (kra_tx_t *tx, int niov, struct iovec *iov,
+kranal_setup_virt_buffer (kra_tx_t *tx, 
+                          unsigned int niov, struct iovec *iov,
                           int offset, int nob)
 
 {
@@ -225,7 +189,7 @@ kranal_setup_virt_buffer (kra_tx_t *tx, int niov, struct iovec *iov,
 }
 
 int
-kranal_setup_phys_buffer (kra_tx_t *tx, int nkiov, ptl_kiov_t *kiov,
+kranal_setup_phys_buffer (kra_tx_t *tx, int nkiov, lnet_kiov_t *kiov,
                           int offset, int nob)
 {
         RAP_PHYS_REGION *phys = tx->tx_phys;
@@ -248,7 +212,7 @@ kranal_setup_phys_buffer (kra_tx_t *tx, int nkiov, ptl_kiov_t *kiov,
         tx->tx_nob = nob;
         tx->tx_buffer = (void *)((unsigned long)(kiov->kiov_offset + offset));
 
-        phys->Address = kranal_page2phys(kiov->kiov_page);
+        phys->Address = lnet_page2phys(kiov->kiov_page);
         phys++;
 
         resid = nob - (kiov->kiov_len - offset);
@@ -268,12 +232,12 @@ kranal_setup_phys_buffer (kra_tx_t *tx, int nkiov, ptl_kiov_t *kiov,
                         return -EINVAL;
                 }
 
-                if ((phys - tx->tx_phys) == PTL_MD_MAX_IOV) {
+                if ((phys - tx->tx_phys) == LNET_MAX_IOV) {
                         CERROR ("payload too big (%d)\n", (int)(phys - tx->tx_phys));
                         return -EMSGSIZE;
                 }
 
-                phys->Address = kranal_page2phys(kiov->kiov_page);
+                phys->Address = lnet_page2phys(kiov->kiov_page);
                 phys++;
 
                 resid -= PAGE_SIZE;
@@ -284,8 +248,8 @@ kranal_setup_phys_buffer (kra_tx_t *tx, int nkiov, ptl_kiov_t *kiov,
 }
 
 static inline int
-kranal_setup_rdma_buffer (kra_tx_t *tx, int niov,
-                          struct iovec *iov, ptl_kiov_t *kiov,
+kranal_setup_rdma_buffer (kra_tx_t *tx, unsigned int niov,
+                          struct iovec *iov, lnet_kiov_t *kiov,
                           int offset, int nob)
 {
         LASSERT ((iov == NULL) != (kiov == NULL));
@@ -404,7 +368,7 @@ kranal_unmap_buffer (kra_tx_t *tx)
 void
 kranal_tx_done (kra_tx_t *tx, int completion)
 {
-        ptl_err_t        ptlrc = (completion == 0) ? PTL_OK : PTL_FAIL;
+        lnet_msg_t      *lnetmsg[2];
         unsigned long    flags;
         int              i;
 
@@ -412,14 +376,8 @@ kranal_tx_done (kra_tx_t *tx, int completion)
 
         kranal_unmap_buffer(tx);
 
-        for (i = 0; i < 2; i++) {
-                /* tx may have up to 2 libmsgs to finalise */
-                if (tx->tx_libmsg[i] == NULL)
-                        continue;
-
-                lib_finalize(&kranal_lib, NULL, tx->tx_libmsg[i], ptlrc);
-                tx->tx_libmsg[i] = NULL;
-        }
+        lnetmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
+        lnetmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
 
         tx->tx_buftype = RANAL_BUF_NONE;
         tx->tx_msg.ram_type = RANAL_MSG_NONE;
@@ -427,14 +385,17 @@ kranal_tx_done (kra_tx_t *tx, int completion)
 
         spin_lock_irqsave(&kranal_data.kra_tx_lock, flags);
 
-        if (tx->tx_isnblk) {
-                list_add_tail(&tx->tx_list, &kranal_data.kra_idle_nblk_txs);
-        } else {
-                list_add_tail(&tx->tx_list, &kranal_data.kra_idle_txs);
-                wake_up(&kranal_data.kra_idle_tx_waitq);
-        }
+        list_add_tail(&tx->tx_list, &kranal_data.kra_idle_txs);
 
         spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags);
+
+        /* finalize AFTER freeing lnet msgs */
+        for (i = 0; i < 2; i++) {
+                if (lnetmsg[i] == NULL)
+                        continue;
+
+                lnet_finalize(kranal_data.kra_ni, lnetmsg[i], completion);
+        }
 }
 
 kra_conn_t *
@@ -466,12 +427,13 @@ kranal_post_fma (kra_conn_t *conn, kra_tx_t *tx)
 }
 
 void
-kranal_launch_tx (kra_tx_t *tx, ptl_nid_t nid)
+kranal_launch_tx (kra_tx_t *tx, lnet_nid_t nid)
 {
         unsigned long    flags;
         kra_peer_t      *peer;
         kra_conn_t      *conn;
-        unsigned long    now;
+        int              rc;
+        int              retry;
         rwlock_t        *g_lock = &kranal_data.kra_global_lock;
 
         /* If I get here, I've committed to send, so I complete the tx with
@@ -479,33 +441,46 @@ kranal_launch_tx (kra_tx_t *tx, ptl_nid_t nid)
 
         LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
 
-        read_lock(g_lock);
+        for (retry = 0; ; retry = 1) {
 
-        peer = kranal_find_peer_locked(nid);
-        if (peer == NULL) {
-                read_unlock(g_lock);
-                kranal_tx_done(tx, -EHOSTUNREACH);
-                return;
-        }
+                read_lock(g_lock);
 
-        conn = kranal_find_conn_locked(peer);
-        if (conn != NULL) {
-                kranal_post_fma(conn, tx);
+                peer = kranal_find_peer_locked(nid);
+                if (peer != NULL) {
+                        conn = kranal_find_conn_locked(peer);
+                        if (conn != NULL) {
+                                kranal_post_fma(conn, tx);
+                                read_unlock(g_lock);
+                                return;
+                        }
+                }
+                
+                /* Making connections; I'll need a write lock... */
                 read_unlock(g_lock);
-                return;
-        }
+                write_lock_irqsave(g_lock, flags);
 
-        /* Making one or more connections; I'll need a write lock... */
-        read_unlock(g_lock);
-        write_lock_irqsave(g_lock, flags);
-
-        peer = kranal_find_peer_locked(nid);
-        if (peer == NULL) {
+                peer = kranal_find_peer_locked(nid);
+                if (peer != NULL)
+                        break;
+                
                 write_unlock_irqrestore(g_lock, flags);
-                kranal_tx_done(tx, -EHOSTUNREACH);
-                return;
-        }
+                
+                if (retry) {
+                        CERROR("Can't find peer %s\n", libcfs_nid2str(nid));
+                        kranal_tx_done(tx, -EHOSTUNREACH);
+                        return;
+                }
 
+                rc = kranal_add_persistent_peer(nid, LNET_NIDADDR(nid),
+                                                lnet_acceptor_port());
+                if (rc != 0) {
+                        CERROR("Can't add peer %s: %d\n",
+                               libcfs_nid2str(nid), rc);
+                        kranal_tx_done(tx, rc);
+                        return;
+                }
+        }
+        
         conn = kranal_find_conn_locked(peer);
         if (conn != NULL) {
                 /* Connection exists; queue message on it */
@@ -513,14 +488,14 @@ kranal_launch_tx (kra_tx_t *tx, ptl_nid_t nid)
                 write_unlock_irqrestore(g_lock, flags);
                 return;
         }
-
+                        
         LASSERT (peer->rap_persistence > 0);
 
         if (!peer->rap_connecting) {
                 LASSERT (list_empty(&peer->rap_tx_queue));
 
-                now = CURRENT_SECONDS;
-                if (now < peer->rap_reconnect_time) {
+                if (!(peer->rap_reconnect_interval == 0 || /* first attempt */
+                      time_after_eq(jiffies, peer->rap_reconnect_time))) {
                         write_unlock_irqrestore(g_lock, flags);
                         kranal_tx_done(tx, -EHOSTUNREACH);
                         return;
@@ -603,219 +578,205 @@ kranal_consume_rxmsg (kra_conn_t *conn, void *buffer, int nob)
         conn->rac_rxmsg = NULL;
 
         if (nob_received < nob) {
-                CWARN("Incomplete immediate msg from "LPX64
-                      ": expected %d, got %d\n",
-                      conn->rac_peer->rap_nid, nob, nob_received);
+                CWARN("Incomplete immediate msg from %s: expected %d, got %d\n",
+                      libcfs_nid2str(conn->rac_peer->rap_nid), 
+                      nob, nob_received);
                 return -EPROTO;
         }
 
         return 0;
 }
 
-ptl_err_t
-kranal_do_send (lib_nal_t    *nal,
-                void         *private,
-                lib_msg_t    *libmsg,
-                ptl_hdr_t    *hdr,
-                int           type,
-                ptl_nid_t     nid,
-                ptl_pid_t     pid,
-                unsigned int  niov,
-                struct iovec *iov,
-                ptl_kiov_t   *kiov,
-                int           offset,
-                int           nob)
+int
+kranal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
 {
-        kra_conn_t *conn;
-        kra_tx_t   *tx;
-        int         rc;
+        lnet_hdr_t       *hdr = &lntmsg->msg_hdr;
+        int               type = lntmsg->msg_type;
+        lnet_process_id_t target = lntmsg->msg_target;
+        int               target_is_router = lntmsg->msg_target_is_router;
+        int               routing = lntmsg->msg_routing;
+        unsigned int      niov = lntmsg->msg_niov;
+        struct iovec     *iov = lntmsg->msg_iov;
+        lnet_kiov_t      *kiov = lntmsg->msg_kiov;
+        unsigned int      offset = lntmsg->msg_offset;
+        unsigned int      nob = lntmsg->msg_len;
+        kra_tx_t         *tx;
+        int               rc;
 
         /* NB 'private' is different depending on what we're sending.... */
 
-        CDEBUG(D_NET, "sending %d bytes in %d frags to nid:"LPX64" pid %d\n",
-               nob, niov, nid, pid);
+        CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+               nob, niov, libcfs_id2str(target));
 
         LASSERT (nob == 0 || niov > 0);
-        LASSERT (niov <= PTL_MD_MAX_IOV);
+        LASSERT (niov <= LNET_MAX_IOV);
 
         LASSERT (!in_interrupt());
         /* payload is either all vaddrs or all pages */
         LASSERT (!(kiov != NULL && iov != NULL));
 
+        if (routing) {
+                CERROR ("Can't route\n");
+                return -EIO;
+        }
+
         switch(type) {
         default:
                 LBUG();
 
-        case PTL_MSG_REPLY: {
-                /* reply's 'private' is the conn that received the GET_REQ */
-                conn = private;
-                LASSERT (conn->rac_rxmsg != NULL);
-
-                if (conn->rac_rxmsg->ram_type == RANAL_MSG_IMMEDIATE) {
-                        if (nob > RANAL_FMA_MAX_DATA) {
-                                CERROR("Can't REPLY IMMEDIATE %d to "LPX64"\n",
-                                       nob, nid);
-                                return PTL_FAIL;
-                        }
-                        break;                  /* RDMA not expected */
-                }
-
-                /* Incoming message consistent with RDMA? */
-                if (conn->rac_rxmsg->ram_type != RANAL_MSG_GET_REQ) {
-                        CERROR("REPLY to "LPX64" bad msg type %x!!!\n",
-                               nid, conn->rac_rxmsg->ram_type);
-                        return PTL_FAIL;
-                }
-
-                tx = kranal_get_idle_tx(0);
-                if (tx == NULL)
-                        return PTL_FAIL;
-
-                rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob);
-                if (rc != 0) {
-                        kranal_tx_done(tx, rc);
-                        return PTL_FAIL;
-                }
-
-                tx->tx_conn = conn;
-                tx->tx_libmsg[0] = libmsg;
-
-                rc = kranal_map_buffer(tx);
-                if (rc != 0) {
-                        kranal_tx_done(tx, rc);
-                        return PTL_FAIL;
-                }
-
-                kranal_rdma(tx, RANAL_MSG_GET_DONE,
-                            &conn->rac_rxmsg->ram_u.get.ragm_desc, nob,
-                            conn->rac_rxmsg->ram_u.get.ragm_cookie);
-
-                /* flag matched by consuming rx message */
-                kranal_consume_rxmsg(conn, NULL, 0);
-                return PTL_OK;
-        }
+        case LNET_MSG_ACK:
+                LASSERT (nob == 0);
+                break;
 
-        case PTL_MSG_GET:
+        case LNET_MSG_GET:
                 LASSERT (niov == 0);
                 LASSERT (nob == 0);
                 /* We have to consider the eventual sink buffer rather than any
                  * payload passed here (there isn't any, and strictly, looking
-                 * inside libmsg is a layering violation).  We send a simple
+                 * inside lntmsg is a layering violation).  We send a simple
                  * IMMEDIATE GET if the sink buffer is mapped already and small
                  * enough for FMA */
 
-                if ((libmsg->md->options & PTL_MD_KIOV) == 0 &&
-                    libmsg->md->length <= RANAL_FMA_MAX_DATA &&
-                    libmsg->md->length <= kranal_tunables.kra_max_immediate)
-                        break;
+                if (routing || target_is_router)
+                        break;                  /* send IMMEDIATE */
+
+                if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0 &&
+                    lntmsg->msg_md->md_length <= RANAL_FMA_MAX_DATA &&
+                    lntmsg->msg_md->md_length <= *kranal_tunables.kra_max_immediate)
+                        break;                  /* send IMMEDIATE */
 
-                tx = kranal_new_tx_msg(!in_interrupt(), RANAL_MSG_GET_REQ);
+                tx = kranal_new_tx_msg(RANAL_MSG_GET_REQ);
                 if (tx == NULL)
-                        return PTL_NO_SPACE;
+                        return -ENOMEM;
 
-                if ((libmsg->md->options & PTL_MD_KIOV) == 0)
-                        rc = kranal_setup_virt_buffer(tx, libmsg->md->md_niov,
-                                                      libmsg->md->md_iov.iov,
-                                                      0, libmsg->md->length);
+                if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+                        rc = kranal_setup_virt_buffer(tx, lntmsg->msg_md->md_niov,
+                                                      lntmsg->msg_md->md_iov.iov,
+                                                      0, lntmsg->msg_md->md_length);
                 else
-                        rc = kranal_setup_phys_buffer(tx, libmsg->md->md_niov,
-                                                      libmsg->md->md_iov.kiov,
-                                                      0, libmsg->md->length);
+                        rc = kranal_setup_phys_buffer(tx, lntmsg->msg_md->md_niov,
+                                                      lntmsg->msg_md->md_iov.kiov,
+                                                      0, lntmsg->msg_md->md_length);
                 if (rc != 0) {
                         kranal_tx_done(tx, rc);
-                        return PTL_FAIL;
+                        return -EIO;
                 }
 
-                tx->tx_libmsg[1] = lib_create_reply_msg(&kranal_lib, nid, libmsg);
-                if (tx->tx_libmsg[1] == NULL) {
-                        CERROR("Can't create reply for GET to "LPX64"\n", nid);
+                tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
+                if (tx->tx_lntmsg[1] == NULL) {
+                        CERROR("Can't create reply for GET to %s\n", 
+                               libcfs_nid2str(target.nid));
                         kranal_tx_done(tx, rc);
-                        return PTL_FAIL;
+                        return -EIO;
                 }
 
-                tx->tx_libmsg[0] = libmsg;
+                tx->tx_lntmsg[0] = lntmsg;
                 tx->tx_msg.ram_u.get.ragm_hdr = *hdr;
                 /* rest of tx_msg is setup just before it is sent */
-                kranal_launch_tx(tx, nid);
-                return PTL_OK;
-
-        case PTL_MSG_ACK:
-                LASSERT (nob == 0);
-                break;
+                kranal_launch_tx(tx, target.nid);
+                return 0;
 
-        case PTL_MSG_PUT:
+        case LNET_MSG_REPLY:
+        case LNET_MSG_PUT:
                 if (kiov == NULL &&             /* not paged */
                     nob <= RANAL_FMA_MAX_DATA && /* small enough */
-                    nob <= kranal_tunables.kra_max_immediate)
+                    nob <= *kranal_tunables.kra_max_immediate)
                         break;                  /* send IMMEDIATE */
 
-                tx = kranal_new_tx_msg(!in_interrupt(), RANAL_MSG_PUT_REQ);
+                tx = kranal_new_tx_msg(RANAL_MSG_PUT_REQ);
                 if (tx == NULL)
-                        return PTL_NO_SPACE;
+                        return -ENOMEM;
 
                 rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob);
                 if (rc != 0) {
                         kranal_tx_done(tx, rc);
-                        return PTL_FAIL;
+                        return -EIO;
                 }
 
-                tx->tx_libmsg[0] = libmsg;
+                tx->tx_lntmsg[0] = lntmsg;
                 tx->tx_msg.ram_u.putreq.raprm_hdr = *hdr;
                 /* rest of tx_msg is setup just before it is sent */
-                kranal_launch_tx(tx, nid);
-                return PTL_OK;
+                kranal_launch_tx(tx, target.nid);
+                return 0;
         }
 
+        /* send IMMEDIATE */
+
         LASSERT (kiov == NULL);
         LASSERT (nob <= RANAL_FMA_MAX_DATA);
 
-        tx = kranal_new_tx_msg(!(type == PTL_MSG_ACK ||
-                                 type == PTL_MSG_REPLY ||
-                                 in_interrupt()),
-                               RANAL_MSG_IMMEDIATE);
+        tx = kranal_new_tx_msg(RANAL_MSG_IMMEDIATE);
         if (tx == NULL)
-                return PTL_NO_SPACE;
+                return -ENOMEM;
 
         rc = kranal_setup_immediate_buffer(tx, niov, iov, offset, nob);
         if (rc != 0) {
                 kranal_tx_done(tx, rc);
-                return PTL_FAIL;
+                return -EIO;
         }
 
         tx->tx_msg.ram_u.immediate.raim_hdr = *hdr;
-        tx->tx_libmsg[0] = libmsg;
-        kranal_launch_tx(tx, nid);
-        return PTL_OK;
+        tx->tx_lntmsg[0] = lntmsg;
+        kranal_launch_tx(tx, target.nid);
+        return 0;
 }
 
-ptl_err_t
-kranal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
-             ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
-             unsigned int niov, struct iovec *iov,
-             size_t offset, size_t len)
+void
+kranal_reply(lnet_ni_t *ni, kra_conn_t *conn, lnet_msg_t *lntmsg)
 {
-        return kranal_do_send(nal, private, cookie,
-                              hdr, type, nid, pid,
-                              niov, iov, NULL,
-                              offset, len);
+        kra_msg_t     *rxmsg = conn->rac_rxmsg;
+        unsigned int   niov = lntmsg->msg_niov;
+        struct iovec  *iov = lntmsg->msg_iov;
+        lnet_kiov_t   *kiov = lntmsg->msg_kiov;
+        unsigned int   offset = lntmsg->msg_offset;
+        unsigned int   nob = lntmsg->msg_len;
+        kra_tx_t      *tx;
+        int            rc;
+
+        tx = kranal_get_idle_tx();
+        if (tx == NULL)
+                goto failed_0;
+
+        rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob);
+        if (rc != 0)
+                goto failed_1;
+
+        tx->tx_conn = conn;
+
+        rc = kranal_map_buffer(tx);
+        if (rc != 0)
+                goto failed_1;
+
+        tx->tx_lntmsg[0] = lntmsg;
+
+        kranal_rdma(tx, RANAL_MSG_GET_DONE,
+                    &rxmsg->ram_u.get.ragm_desc, nob,
+                    rxmsg->ram_u.get.ragm_cookie);
+        return;
+
+ failed_1:
+        kranal_tx_done(tx, -EIO);
+ failed_0:
+        lnet_finalize(ni, lntmsg, -EIO);
 }
 
-ptl_err_t
-kranal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie,
-                   ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
-                   unsigned int niov, ptl_kiov_t *kiov,
-                   size_t offset, size_t len)
+int
+kranal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+                   void **new_private)
 {
-        return kranal_do_send(nal, private, cookie,
-                              hdr, type, nid, pid,
-                              niov, NULL, kiov,
-                              offset, len);
+        kra_conn_t *conn = (kra_conn_t *)private;
+
+        LCONSOLE_ERROR("Dropping message from %s: no buffers free.\n",
+                       libcfs_nid2str(conn->rac_peer->rap_nid));
+
+        return -EDEADLK;
 }
 
-ptl_err_t
-kranal_do_recv (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
-                unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
-                int offset, int mlen, int rlen)
+int
+kranal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+             int delayed, unsigned int niov, 
+             struct iovec *iov, lnet_kiov_t *kiov,
+             unsigned int offset, unsigned int mlen, unsigned int rlen)
 {
         kra_conn_t  *conn = private;
         kra_msg_t   *rxmsg = conn->rac_rxmsg;
@@ -828,26 +789,18 @@ kranal_do_recv (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
         /* Either all pages or all vaddrs */
         LASSERT (!(kiov != NULL && iov != NULL));
 
-        CDEBUG(D_NET, "conn %p, rxmsg %p, libmsg %p\n", conn, rxmsg, libmsg);
-
-        if (libmsg == NULL) {
-                /* GET or ACK or portals is discarding */
-                LASSERT (mlen == 0);
-                lib_finalize(nal, NULL, libmsg, PTL_OK);
-                return PTL_OK;
-        }
+        CDEBUG(D_NET, "conn %p, rxmsg %p, lntmsg %p\n", conn, rxmsg, lntmsg);
 
         switch(rxmsg->ram_type) {
         default:
                 LBUG();
-                return PTL_FAIL;
 
         case RANAL_MSG_IMMEDIATE:
                 if (mlen == 0) {
                         buffer = NULL;
                 } else if (kiov != NULL) {
                         CERROR("Can't recv immediate into paged buffer\n");
-                        return PTL_FAIL;
+                        return -EIO;
                 } else {
                         LASSERT (niov > 0);
                         while (offset >= iov->iov_len) {
@@ -858,30 +811,34 @@ kranal_do_recv (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
                         }
                         if (mlen > iov->iov_len - offset) {
                                 CERROR("Can't handle immediate frags\n");
-                                return PTL_FAIL;
+                                return -EIO;
                         }
                         buffer = ((char *)iov->iov_base) + offset;
                 }
                 rc = kranal_consume_rxmsg(conn, buffer, mlen);
-                lib_finalize(nal, NULL, libmsg, (rc == 0) ? PTL_OK : PTL_FAIL);
-                return PTL_OK;
+                lnet_finalize(ni, lntmsg, (rc == 0) ? 0 : -EIO);
+                return 0;
 
         case RANAL_MSG_PUT_REQ:
-                tx = kranal_new_tx_msg(0, RANAL_MSG_PUT_ACK);
-                if (tx == NULL)
-                        return PTL_NO_SPACE;
-
+                tx = kranal_new_tx_msg(RANAL_MSG_PUT_ACK);
+                if (tx == NULL) {
+                        kranal_consume_rxmsg(conn, NULL, 0);
+                        return -ENOMEM;
+                }
+                
                 rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, offset, mlen);
                 if (rc != 0) {
                         kranal_tx_done(tx, rc);
-                        return PTL_FAIL;
+                        kranal_consume_rxmsg(conn, NULL, 0);
+                        return -EIO;
                 }
 
                 tx->tx_conn = conn;
                 rc = kranal_map_buffer(tx);
                 if (rc != 0) {
                         kranal_tx_done(tx, rc);
-                        return PTL_FAIL;
+                        kranal_consume_rxmsg(conn, NULL, 0);
+                        return -EIO;
                 }
 
                 tx->tx_msg.ram_u.putack.rapam_src_cookie =
@@ -892,34 +849,30 @@ kranal_do_recv (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
                         (__u64)((unsigned long)tx->tx_buffer);
                 tx->tx_msg.ram_u.putack.rapam_desc.rard_nob = mlen;
 
-                tx->tx_libmsg[0] = libmsg; /* finalize this on RDMA_DONE */
+                tx->tx_lntmsg[0] = lntmsg; /* finalize this on RDMA_DONE */
 
                 kranal_post_fma(conn, tx);
+                kranal_consume_rxmsg(conn, NULL, 0);
+                return 0;
 
-                /* flag matched by consuming rx message */
+        case RANAL_MSG_GET_REQ:
+                if (lntmsg != NULL) {
+                        /* Matched! */
+                        kranal_reply(ni, conn, lntmsg);
+                } else {
+                        /* No match */
+                        tx = kranal_new_tx_msg(RANAL_MSG_GET_NAK);
+                        if (tx != NULL) {
+                                tx->tx_msg.ram_u.completion.racm_cookie = 
+                                        rxmsg->ram_u.get.ragm_cookie;
+                                kranal_post_fma(conn, tx);
+                        }
+                }
                 kranal_consume_rxmsg(conn, NULL, 0);
-                return PTL_OK;
+                return 0;
         }
 }
 
-ptl_err_t
-kranal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
-             unsigned int niov, struct iovec *iov,
-             size_t offset, size_t mlen, size_t rlen)
-{
-        return kranal_do_recv(nal, private, msg, niov, iov, NULL,
-                              offset, mlen, rlen);
-}
-
-ptl_err_t
-kranal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
-                   unsigned int niov, ptl_kiov_t *kiov,
-                   size_t offset, size_t mlen, size_t rlen)
-{
-        return kranal_do_recv(nal, private, msg, niov, NULL, kiov,
-                              offset, mlen, rlen);
-}
-
 int
 kranal_thread_start (int(*fn)(void *arg), void *arg)
 {
@@ -953,8 +906,8 @@ kranal_check_conn_timeouts (kra_conn_t *conn)
         if (!conn->rac_close_sent &&
             time_after_eq(now, conn->rac_last_tx + conn->rac_keepalive * HZ)) {
                 /* not sent in a while; schedule conn so scheduler sends a keepalive */
-                CDEBUG(D_NET, "Scheduling keepalive %p->"LPX64"\n",
-                       conn, conn->rac_peer->rap_nid);
+                CDEBUG(D_NET, "Scheduling keepalive %p->%s\n",
+                       conn, libcfs_nid2str(conn->rac_peer->rap_nid));
                 kranal_schedule_conn(conn);
         }
 
@@ -962,10 +915,11 @@ kranal_check_conn_timeouts (kra_conn_t *conn)
 
         if (!conn->rac_close_recvd &&
             time_after_eq(now, conn->rac_last_rx + timeout)) {
-                CERROR("%s received from "LPX64" within %lu seconds\n",
+                CERROR("%s received from %s within %lu seconds\n",
                        (conn->rac_state == RANAL_CONN_ESTABLISHED) ?
                        "Nothing" : "CLOSE not",
-                       conn->rac_peer->rap_nid, (now - conn->rac_last_rx)/HZ);
+                       libcfs_nid2str(conn->rac_peer->rap_nid), 
+                       (now - conn->rac_last_rx)/HZ);
                 return -ETIMEDOUT;
         }
 
@@ -983,8 +937,9 @@ kranal_check_conn_timeouts (kra_conn_t *conn)
 
                 if (time_after_eq(now, tx->tx_qtime + timeout)) {
                         spin_unlock_irqrestore(&conn->rac_lock, flags);
-                        CERROR("tx on fmaq for "LPX64" blocked %lu seconds\n",
-                               conn->rac_peer->rap_nid, (now - tx->tx_qtime)/HZ);
+                        CERROR("tx on fmaq for %s blocked %lu seconds\n",
+                               libcfs_nid2str(conn->rac_peer->rap_nid),
+                               (now - tx->tx_qtime)/HZ);
                         return -ETIMEDOUT;
                 }
         }
@@ -994,8 +949,9 @@ kranal_check_conn_timeouts (kra_conn_t *conn)
 
                 if (time_after_eq(now, tx->tx_qtime + timeout)) {
                         spin_unlock_irqrestore(&conn->rac_lock, flags);
-                        CERROR("tx on rdmaq for "LPX64" blocked %lu seconds\n",
-                               conn->rac_peer->rap_nid, (now - tx->tx_qtime)/HZ);
+                        CERROR("tx on rdmaq for %s blocked %lu seconds\n",
+                               libcfs_nid2str(conn->rac_peer->rap_nid), 
+                               (now - tx->tx_qtime)/HZ);
                         return -ETIMEDOUT;
                 }
         }
@@ -1005,8 +961,9 @@ kranal_check_conn_timeouts (kra_conn_t *conn)
 
                 if (time_after_eq(now, tx->tx_qtime + timeout)) {
                         spin_unlock_irqrestore(&conn->rac_lock, flags);
-                        CERROR("tx on replyq for "LPX64" blocked %lu seconds\n",
-                               conn->rac_peer->rap_nid, (now - tx->tx_qtime)/HZ);
+                        CERROR("tx on replyq for %s blocked %lu seconds\n",
+                               libcfs_nid2str(conn->rac_peer->rap_nid),
+                               (now - tx->tx_qtime)/HZ);
                         return -ETIMEDOUT;
                 }
         }
@@ -1044,8 +1001,9 @@ kranal_reaper_check (int idx, unsigned long *min_timeoutp)
                 kranal_conn_addref(conn);
                 read_unlock(&kranal_data.kra_global_lock);
 
-                CERROR("Conn to "LPX64", cqid %d timed out\n",
-                       conn->rac_peer->rap_nid, conn->rac_cqid);
+                CERROR("Conn to %s, cqid %d timed out\n",
+                       libcfs_nid2str(conn->rac_peer->rap_nid), 
+                       conn->rac_cqid);
 
                 write_lock_irqsave(&kranal_data.kra_global_lock, flags);
 
@@ -1085,8 +1043,8 @@ kranal_connd (void *arg)
         int                did_something;
 
         snprintf(name, sizeof(name), "kranal_connd_%02ld", id);
-        kportal_daemonize(name);
-        kportal_blockallsigs();
+        cfs_daemonize(name);
+        cfs_block_allsigs();
 
         init_waitqueue_entry(&wait, current);
 
@@ -1131,7 +1089,7 @@ kranal_connd (void *arg)
                         continue;
 
                 set_current_state(TASK_INTERRUPTIBLE);
-                add_wait_queue(&kranal_data.kra_connd_waitq, &wait);
+                add_wait_queue_exclusive(&kranal_data.kra_connd_waitq, &wait);
 
                 spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags);
 
@@ -1178,8 +1136,8 @@ kranal_reaper (void *arg)
         long               next_min_timeout = MAX_SCHEDULE_TIMEOUT;
         long               current_min_timeout = 1;
 
-        kportal_daemonize("kranal_reaper");
-        kportal_blockallsigs();
+        cfs_daemonize("kranal_reaper");
+        cfs_block_allsigs();
 
         init_waitqueue_entry(&wait, current);
 
@@ -1376,7 +1334,8 @@ kranal_check_fma_cq (kra_device_t *dev)
                 }
 
                 /* FMA CQ has overflowed: check ALL conns */
-                CWARN("Scheduling ALL conns on device %d\n", dev->rad_id);
+                CWARN("FMA CQ overflow: scheduling ALL conns on device %d\n", 
+                      dev->rad_id);
 
                 for (i = 0; i < kranal_data.kra_conn_hash_size; i++) {
 
@@ -1482,7 +1441,8 @@ kranal_process_fmaq (kra_conn_t *conn)
                 if (conn->rac_close_sent)
                         return;
 
-                CWARN("sending CLOSE to "LPX64"\n", conn->rac_peer->rap_nid);
+                CWARN("sending CLOSE to %s\n", 
+                      libcfs_nid2str(conn->rac_peer->rap_nid));
                 kranal_init_msg(&conn->rac_msg, RANAL_MSG_CLOSE);
                 rc = kranal_sendmsg(conn, &conn->rac_msg, NULL, 0);
                 if (rc != 0)
@@ -1509,8 +1469,8 @@ kranal_process_fmaq (kra_conn_t *conn)
 
                 if (time_after_eq(jiffies,
                                   conn->rac_last_tx + conn->rac_keepalive * HZ)) {
-                        CDEBUG(D_NET, "sending NOOP -> "LPX64" (%p idle %lu(%ld))\n",
-                               conn->rac_peer->rap_nid, conn,
+                        CDEBUG(D_NET, "sending NOOP -> %s (%p idle %lu(%ld))\n",
+                               libcfs_nid2str(conn->rac_peer->rap_nid), conn,
                                (jiffies - conn->rac_last_tx)/HZ, conn->rac_keepalive);
                         kranal_init_msg(&conn->rac_msg, RANAL_MSG_NOOP);
                         kranal_sendmsg(conn, &conn->rac_msg, NULL, 0);
@@ -1634,9 +1594,9 @@ kranal_match_reply(kra_conn_t *conn, int type, __u64 cookie)
                 if (tx->tx_msg.ram_type != type) {
                         spin_unlock_irqrestore(&conn->rac_lock, flags);
                         CWARN("Unexpected type %x (%x expected) "
-                              "matched reply from "LPX64"\n",
+                              "matched reply from %s\n",
                               tx->tx_msg.ram_type, type,
-                              conn->rac_peer->rap_nid);
+                              libcfs_nid2str(conn->rac_peer->rap_nid));
                         return NULL;
                 }
 
@@ -1646,8 +1606,8 @@ kranal_match_reply(kra_conn_t *conn, int type, __u64 cookie)
         }
 
         spin_unlock_irqrestore(&conn->rac_lock, flags);
-        CWARN("Unmatched reply %02x/"LPX64" from "LPX64"\n",
-              type, cookie, conn->rac_peer->rap_nid);
+        CWARN("Unmatched reply %02x/"LPX64" from %s\n",
+              type, cookie, libcfs_nid2str(conn->rac_peer->rap_nid));
         return NULL;
 }
 
@@ -1661,6 +1621,8 @@ kranal_check_fma_rx (kra_conn_t *conn)
         void         *prefix;
         RAP_RETURN    rrc = RapkFmaGetPrefix(conn->rac_rihandle, &prefix);
         kra_peer_t   *peer = conn->rac_peer;
+        int           rc = 0;
+        int           repost = 1;
 
         if (rrc == RAP_NOT_DONE)
                 return;
@@ -1679,8 +1641,9 @@ kranal_check_fma_rx (kra_conn_t *conn)
 
         if (msg->ram_magic != RANAL_MSG_MAGIC) {
                 if (__swab32(msg->ram_magic) != RANAL_MSG_MAGIC) {
-                        CERROR("Unexpected magic %08x from "LPX64"\n",
-                               msg->ram_magic, peer->rap_nid);
+                        CERROR("Unexpected magic %08x from %s\n",
+                               msg->ram_magic, libcfs_nid2str(peer->rap_nid));
+                        rc = -EPROTO;
                         goto out;
                 }
 
@@ -1707,45 +1670,55 @@ kranal_check_fma_rx (kra_conn_t *conn)
         }
 
         if (msg->ram_version != RANAL_MSG_VERSION) {
-                CERROR("Unexpected protocol version %d from "LPX64"\n",
-                       msg->ram_version, peer->rap_nid);
+                CERROR("Unexpected protocol version %d from %s\n",
+                       msg->ram_version, libcfs_nid2str(peer->rap_nid));
+                rc = -EPROTO;
                 goto out;
         }
 
         if (msg->ram_srcnid != peer->rap_nid) {
-                CERROR("Unexpected peer "LPX64" from "LPX64"\n",
-                       msg->ram_srcnid, peer->rap_nid);
+                CERROR("Unexpected peer %s from %s\n",
+                       libcfs_nid2str(msg->ram_srcnid), 
+                       libcfs_nid2str(peer->rap_nid));
+                rc = -EPROTO;
                 goto out;
         }
 
         if (msg->ram_connstamp != conn->rac_peer_connstamp) {
                 CERROR("Unexpected connstamp "LPX64"("LPX64
-                       " expected) from "LPX64"\n",
+                       " expected) from %s\n",
                        msg->ram_connstamp, conn->rac_peer_connstamp,
-                       peer->rap_nid);
+                       libcfs_nid2str(peer->rap_nid));
+                rc = -EPROTO;
                 goto out;
         }
 
         if (msg->ram_seq != seq) {
-                CERROR("Unexpected sequence number %d(%d expected) from "
-                       LPX64"\n", msg->ram_seq, seq, peer->rap_nid);
+                CERROR("Unexpected sequence number %d(%d expected) from %s\n",
+                       msg->ram_seq, seq, libcfs_nid2str(peer->rap_nid));
+                rc = -EPROTO;
                 goto out;
         }
 
         if ((msg->ram_type & RANAL_MSG_FENCE) != 0) {
                 /* This message signals RDMA completion... */
                 rrc = RapkFmaSyncWait(conn->rac_rihandle);
-                LASSERT (rrc == RAP_SUCCESS);
+                if (rrc != RAP_SUCCESS) {
+                        CERROR("RapkFmaSyncWait failed: %d\n", rrc);
+                        rc = -ENETDOWN;
+                        goto out;
+                }
         }
 
         if (conn->rac_close_recvd) {
-                CERROR("Unexpected message %d after CLOSE from "LPX64"\n",
-                       msg->ram_type, conn->rac_peer->rap_nid);
+                CERROR("Unexpected message %d after CLOSE from %s\n",
+                       msg->ram_type, libcfs_nid2str(conn->rac_peer->rap_nid));
+                rc = -EPROTO;
                 goto out;
         }
 
         if (msg->ram_type == RANAL_MSG_CLOSE) {
-                CWARN("RX CLOSE from "LPX64"\n", conn->rac_peer->rap_nid);
+                CWARN("RX CLOSE from %s\n", libcfs_nid2str(conn->rac_peer->rap_nid));
                 conn->rac_close_recvd = 1;
                 write_lock_irqsave(&kranal_data.kra_global_lock, flags);
 
@@ -1770,23 +1743,16 @@ kranal_check_fma_rx (kra_conn_t *conn)
 
         case RANAL_MSG_IMMEDIATE:
                 CDEBUG(D_NET, "RX IMMEDIATE on %p\n", conn);
-                lib_parse(&kranal_lib, &msg->ram_u.immediate.raim_hdr, conn);
+                rc = lnet_parse(kranal_data.kra_ni, &msg->ram_u.immediate.raim_hdr, 
+                                msg->ram_srcnid, conn, 0);
+                repost = rc < 0;
                 break;
 
         case RANAL_MSG_PUT_REQ:
                 CDEBUG(D_NET, "RX PUT_REQ on %p\n", conn);
-                lib_parse(&kranal_lib, &msg->ram_u.putreq.raprm_hdr, conn);
-
-                if (conn->rac_rxmsg == NULL)    /* lib_parse matched something */
-                        break;
-
-                tx = kranal_new_tx_msg(0, RANAL_MSG_PUT_NAK);
-                if (tx == NULL)
-                        break;
-
-                tx->tx_msg.ram_u.completion.racm_cookie =
-                        msg->ram_u.putreq.raprm_cookie;
-                kranal_post_fma(conn, tx);
+                rc = lnet_parse(kranal_data.kra_ni, &msg->ram_u.putreq.raprm_hdr, 
+                                msg->ram_srcnid, conn, 1);
+                repost = rc < 0;
                 break;
 
         case RANAL_MSG_PUT_NAK:
@@ -1828,17 +1794,9 @@ kranal_check_fma_rx (kra_conn_t *conn)
 
         case RANAL_MSG_GET_REQ:
                 CDEBUG(D_NET, "RX GET_REQ on %p\n", conn);
-                lib_parse(&kranal_lib, &msg->ram_u.get.ragm_hdr, conn);
-
-                if (conn->rac_rxmsg == NULL)    /* lib_parse matched something */
-                        break;
-
-                tx = kranal_new_tx_msg(0, RANAL_MSG_GET_NAK);
-                if (tx == NULL)
-                        break;
-
-                tx->tx_msg.ram_u.completion.racm_cookie = msg->ram_u.get.ragm_cookie;
-                kranal_post_fma(conn, tx);
+                rc = lnet_parse(kranal_data.kra_ni, &msg->ram_u.get.ragm_hdr, 
+                                msg->ram_srcnid, conn, 1);
+                repost = rc < 0;
                 break;
 
         case RANAL_MSG_GET_NAK:
@@ -1862,12 +1820,20 @@ kranal_check_fma_rx (kra_conn_t *conn)
 
                 LASSERT (tx->tx_buftype == RANAL_BUF_PHYS_MAPPED ||
                          tx->tx_buftype == RANAL_BUF_VIRT_MAPPED);
+#if 0
+                /* completion message should send rdma length if we ever allow
+                 * GET truncation */
+                lnet_set_reply_msg_len(kranal_data.kra_ni, tx->tx_lntmsg[1], ???);
+#endif
                 kranal_tx_done(tx, 0);
                 break;
         }
 
  out:
-        if (conn->rac_rxmsg != NULL)
+        if (rc < 0)                             /* protocol/comms error */
+                kranal_close_conn (conn, rc);
+
+        if (repost && conn->rac_rxmsg != NULL)
                 kranal_consume_rxmsg(conn, NULL, 0);
 
         /* check again later */
@@ -1901,8 +1867,8 @@ kranal_complete_closed_conn (kra_conn_t *conn)
                 kranal_tx_done(tx, -ECONNABORTED);
         }
 
-        CWARN("Closed conn %p -> "LPX64": nmsg %d nreplies %d\n",
-              conn, conn->rac_peer->rap_nid, nfma, nreplies);
+        CWARN("Closed conn %p -> %s: nmsg %d nreplies %d\n",
+               conn, libcfs_nid2str(conn->rac_peer->rap_nid), nfma, nreplies);
 }
 
 int
@@ -1944,8 +1910,8 @@ kranal_scheduler (void *arg)
         int               busy_loops = 0;
 
         snprintf(name, sizeof(name), "kranal_sd_%02d", dev->rad_idx);
-        kportal_daemonize(name);
-        kportal_blockallsigs();
+        cfs_daemonize(name);
+        cfs_block_allsigs();
 
         dev->rad_scheduler = current;
         init_waitqueue_entry(&wait, current);
@@ -2043,7 +2009,7 @@ kranal_scheduler (void *arg)
                         continue;
 
                 set_current_state(TASK_INTERRUPTIBLE);
-                add_wait_queue(&dev->rad_waitq, &wait);
+                add_wait_queue_exclusive(&dev->rad_waitq, &wait);
                 spin_unlock_irqrestore(&dev->rad_lock, flags);
 
                 if (nsoonest == 0) {
@@ -2068,13 +2034,3 @@ kranal_scheduler (void *arg)
         kranal_thread_fini();
         return 0;
 }
-
-
-lib_nal_t kranal_lib = {
-        libnal_data:        &kranal_data,      /* NAL private data */
-        libnal_send:         kranal_send,
-        libnal_send_pages:   kranal_send_pages,
-        libnal_recv:         kranal_recv,
-        libnal_recv_pages:   kranal_recv_pages,
-        libnal_dist:         kranal_dist
-};
diff --git a/lnet/klnds/ralnd/ralnd_modparams.c b/lnet/klnds/ralnd/ralnd_modparams.c
new file mode 100644 (file)
index 0000000..45f42e1
--- /dev/null
@@ -0,0 +1,135 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "ralnd.h"
+
+static int n_connd = 4;
+CFS_MODULE_PARM(n_connd, "i", int, 0444,
+                "# of connection daemons");
+
+static int min_reconnect_interval = 1;
+CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644,
+               "minimum connection retry interval (seconds)");
+
+static int max_reconnect_interval = 60;
+CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644,
+               "maximum connection retry interval (seconds)");
+
+static int ntx = 256;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+               "# of transmit descriptors");
+
+static int credits = 128;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+               "# concurrent sends");
+
+static int peer_credits = 32;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+               "# concurrent sends to 1 peer");
+
+static int fma_cq_size = 8192;
+CFS_MODULE_PARM(fma_cq_size, "i", int, 0444,
+               "size of the completion queue");
+
+static int timeout = 30;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+               "communications timeout (seconds)");
+
+static int max_immediate = (2<<10);
+CFS_MODULE_PARM(max_immediate, "i", int, 0644,
+               "immediate/RDMA breakpoint");
+
+kra_tunables_t kranal_tunables = {
+       .kra_n_connd                = &n_connd,
+       .kra_min_reconnect_interval = &min_reconnect_interval,
+       .kra_max_reconnect_interval = &max_reconnect_interval,
+       .kra_ntx                    = &ntx,
+       .kra_credits                = &credits,
+       .kra_peercredits            = &peer_credits,
+       .kra_fma_cq_size            = &fma_cq_size,
+       .kra_timeout                = &timeout,
+       .kra_max_immediate          = &max_immediate,
+};
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+static ctl_table kranal_ctl_table[] = {
+       {1, "n_connd", &n_connd, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {2, "min_reconnect_interval", &min_reconnect_interval, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {3, "max_reconnect_interval", &max_reconnect_interval, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {4, "ntx", &ntx, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {5, "credits", &credits, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {6, "peer_credits", &peer_credits, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {7, "fma_cq_size", &fma_cq_size, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {8, "timeout", &timeout, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {9, "max_immediate", &max_immediate, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {0}
+};
+
+static ctl_table kranal_top_ctl_table[] = {
+       {202, "ranal", NULL, 0, 0555, kranal_ctl_table},
+       {0}
+};
+
+int
+kranal_tunables_init ()
+{
+       kranal_tunables.kra_sysctl =
+               register_sysctl_table(kranal_top_ctl_table, 0);
+       
+       if (kranal_tunables.kra_sysctl == NULL)
+               CWARN("Can't setup /proc tunables\n");
+       
+       return 0;
+}
+
+void
+kranal_tunables_fini ()
+{
+       if (kranal_tunables.kra_sysctl != NULL)
+               unregister_sysctl_table(kranal_tunables.kra_sysctl);
+}
+
+#else
+
+int
+kranal_tunables_init ()
+{
+       return 0;
+}
+
+void
+kranal_tunables_fini ()
+{
+}
+
+#endif
+
index 11be93d..f5a5460 100644 (file)
@@ -5,11 +5,11 @@
        <key>CFBundleDevelopmentRegion</key>
        <string>English</string>
        <key>CFBundleExecutable</key>
-       <string>ksocknal</string>
+       <string>ksocklnd</string>
        <key>CFBundleIconFile</key>
        <string></string>
        <key>CFBundleIdentifier</key>
-       <string>com.clusterfs.lustre.ksocknal</string>
+       <string>com.clusterfs.lustre.ksocklnd</string>
        <key>CFBundleInfoDictionaryVersion</key>
        <string>6.0</string>
        <key>CFBundlePackageType</key>
        <string>1.0.0</string>
        <key>OSBundleLibraries</key>
        <dict> 
-               <key>com.apple.kernel.bsd</key> 
-               <string>1.1</string> 
-               <key>com.apple.kernel.iokit</key> 
-               <string>1.0.0b1</string> 
-               <key>com.apple.kernel.mach</key> 
-               <string>1.0.0b1</string> 
+               <key>com.apple.kpi.bsd</key>
+               <string>8.0.0b1</string>
+               <key>com.apple.kpi.libkern</key>
+               <string>8.0.0b1</string>
+               <key>com.apple.kpi.mach</key>
+               <string>8.0.0b1</string>
+               <key>com.apple.kpi.unsupported</key>
+               <string>8.0.0b1</string>
                <key>com.clusterfs.lustre.libcfs</key> 
                <string>1.0.0</string>
-               <key>com.clusterfs.lustre.portals</key> 
+               <key>com.clusterfs.lustre.lnet</key> 
                <string>1.0.0</string>
        </dict>
 </dict>
index 7fe9638..3a6c3f7 100644 (file)
@@ -1,9 +1,5 @@
-MODULES := ksocknal
+MODULES := ksocklnd
 
-ksocknal-objs := socknal.o socknal_cb.o socknal_lib-linux.o
-
-# If you don't build with -O2, your modules won't insert, becahse htonl is
-# just special that way.
-EXTRA_POST_CFLAGS := -O2
+ksocklnd-objs := socklnd.o socklnd_cb.o socklnd_modparams.o socklnd_lib-linux.o
 
 @INCLUDE_RULES@
index 71a3633..0dbe697 100644 (file)
@@ -1,25 +1,23 @@
 if MODULES
 if LINUX
-if !CRAY_PORTALS
 
-  modulenet_DATA := ksocknal$(KMODEXT)
+  modulenet_DATA := ksocklnd$(KMODEXT)
 
-endif # !CRAY_PORTALS
 endif # LINUX
 endif # MODULES
 
-DIST_SOURCES := $(ksocknal-objs:%.o=%.c) socknal_lib-linux.h socknal.h
+DIST_SOURCES := $(ksocklnd-objs:%.o=%.c) socklnd_lib-linux.h socklnd.h
 
 if DARWIN
 
-  macos_PROGRAMS := ksocknal
+  macos_PROGRAMS := ksocklnd
 
-  nodist_ksocknal_SOURCES := socknal.c socknal_cb.c socknal_lib-darwin.c
-  DIST_SOURCES += socknal_lib-darwin.c socknal_lib-darwin.h
+  nodist_ksocklnd_SOURCES := socklnd.c socklnd_cb.c socklnd_modparams.c socklnd_lib-darwin.c
+  DIST_SOURCES += socklnd_lib-darwin.c socklnd_lib-darwin.h
 
-  ksocknal_CFLAGS := $(EXTRA_KCFLAGS)
-  ksocknal_LDFLAGS := $(EXTRA_KLDFLAGS)
-  ksocknal_LDADD := $(EXTRA_KLIBS)
+  ksocklnd_CFLAGS := $(EXTRA_KCFLAGS)
+  ksocklnd_LDFLAGS := $(EXTRA_KLDFLAGS)
+  ksocklnd_LDADD := $(EXTRA_KLIBS)
 
   plist_DATA := Info.plist
   install_data_hook := fix-kext-ownership
@@ -29,4 +27,4 @@ endif # DARWIN
 EXTRA_DIST := $(plist_DATA)
 install-data-hook: $(install_data_hook)
 
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@  socknal_lib.c
+MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ socklnd_lib.c
index 295ec35..9f6ba9c 100644 (file)
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#include "socknal.h"
-
-nal_t                   ksocknal_api;
-ksock_nal_data_t        ksocknal_data;
-ptl_handle_ni_t         ksocknal_ni;
-ksock_tunables_t        ksocknal_tunables;
-
-kpr_nal_interface_t ksocknal_router_interface = {
-        kprni_nalid:      SOCKNAL,
-        kprni_arg:        &ksocknal_data,
-        kprni_fwd:        ksocknal_fwd_packet,
-        kprni_notify:     ksocknal_notify,
+#include "socklnd.h"
+
+lnd_t the_ksocklnd = {
+        .lnd_type       = SOCKLND,
+        .lnd_startup    = ksocknal_startup,
+        .lnd_shutdown   = ksocknal_shutdown,
+        .lnd_ctl        = ksocknal_ctl,
+        .lnd_send       = ksocknal_send,
+        .lnd_recv       = ksocknal_recv,
+        .lnd_notify     = ksocknal_notify,
+        .lnd_accept     = ksocknal_accept,
 };
 
-int
-ksocknal_set_mynid(ptl_nid_t nid)
-{
-        lib_ni_t *ni = &ksocknal_lib.libnal_ni;
-
-        /* FIXME: we have to do this because we call lib_init() at module
-         * insertion time, which is before we have 'mynid' available.  lib_init
-         * sets the NAL's nid, which it uses to tell other nodes where packets
-         * are coming from.  This is not a very graceful solution to this
-         * problem. */
-
-        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
-               nid, ni->ni_pid.nid);
-
-        ni->ni_pid.nid = nid;
-        return (0);
-}
+ksock_nal_data_t        ksocknal_data;
 
 ksock_interface_t *
-ksocknal_ip2iface(__u32 ip)
+ksocknal_ip2iface(lnet_ni_t *ni, __u32 ip)
 {
+        ksock_net_t       *net = ni->ni_data;
         int                i;
         ksock_interface_t *iface;
 
-        for (i = 0; i < ksocknal_data.ksnd_ninterfaces; i++) {
-                LASSERT(i < SOCKNAL_MAX_INTERFACES);
-                iface = &ksocknal_data.ksnd_interfaces[i];
+        for (i = 0; i < net->ksnn_ninterfaces; i++) {
+                LASSERT(i < LNET_MAX_INTERFACES);
+                iface = &net->ksnn_interfaces[i];
 
                 if (iface->ksni_ipaddr == ip)
                         return (iface);
@@ -77,21 +61,22 @@ ksocknal_create_route (__u32 ipaddr, int port)
 {
         ksock_route_t *route;
 
-        PORTAL_ALLOC (route, sizeof (*route));
+        LIBCFS_ALLOC (route, sizeof (*route));
         if (route == NULL)
                 return (NULL);
 
         atomic_set (&route->ksnr_refcount, 1);
         route->ksnr_peer = NULL;
-        route->ksnr_timeout = cfs_time_current();
-        route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL;
+        route->ksnr_retry_interval = 0;         /* OK to connect at any time */
         route->ksnr_ipaddr = ipaddr;
         route->ksnr_port = port;
+        route->ksnr_scheduled = 0;
         route->ksnr_connecting = 0;
         route->ksnr_connected = 0;
         route->ksnr_deleted = 0;
         route->ksnr_conn_count = 0;
         route->ksnr_share_count = 0;
+        route->ksnr_proto = &ksocknal_protocol_v2x;
 
         return (route);
 }
@@ -99,86 +84,90 @@ ksocknal_create_route (__u32 ipaddr, int port)
 void
 ksocknal_destroy_route (ksock_route_t *route)
 {
-        if (route->ksnr_peer != NULL)
-                ksocknal_put_peer (route->ksnr_peer);
+        LASSERT (atomic_read(&route->ksnr_refcount) == 0);
 
-        PORTAL_FREE (route, sizeof (*route));
-}
-
-void
-ksocknal_put_route (ksock_route_t *route)
-{
-        CDEBUG (D_OTHER, "putting route[%p] (%d)\n",
-                route, atomic_read (&route->ksnr_refcount));
-
-        LASSERT (atomic_read (&route->ksnr_refcount) > 0);
-        if (!atomic_dec_and_test (&route->ksnr_refcount))
-             return;
+        if (route->ksnr_peer != NULL)
+                ksocknal_peer_decref(route->ksnr_peer);
 
-        ksocknal_destroy_route (route);
+        LIBCFS_FREE (route, sizeof (*route));
 }
 
-ksock_peer_t *
-ksocknal_create_peer (ptl_nid_t nid)
+int
+ksocknal_create_peer (ksock_peer_t **peerp, lnet_ni_t *ni, lnet_process_id_t id)
 {
-        ksock_peer_t *peer;
+        ksock_net_t   *net = ni->ni_data;
+        ksock_peer_t  *peer;
 
-        LASSERT (nid != PTL_NID_ANY);
+        LASSERT (id.nid != LNET_NID_ANY);
+        LASSERT (id.pid != LNET_PID_ANY);
+        LASSERT (!in_interrupt());
 
-        PORTAL_ALLOC (peer, sizeof (*peer));
+        LIBCFS_ALLOC (peer, sizeof (*peer));
         if (peer == NULL)
-                return (NULL);
+                return -ENOMEM;
 
         memset (peer, 0, sizeof (*peer));       /* NULL pointers/clear flags etc */
 
-        peer->ksnp_nid = nid;
+        peer->ksnp_ni = ni;
+        peer->ksnp_id = id;
         atomic_set (&peer->ksnp_refcount, 1);   /* 1 ref for caller */
         peer->ksnp_closing = 0;
+        peer->ksnp_accepting = 0;
+        peer->ksnp_zc_next_cookie = 1;
         CFS_INIT_LIST_HEAD (&peer->ksnp_conns);
         CFS_INIT_LIST_HEAD (&peer->ksnp_routes);
         CFS_INIT_LIST_HEAD (&peer->ksnp_tx_queue);
+        CFS_INIT_LIST_HEAD (&peer->ksnp_zc_req_list);
+        spin_lock_init(&peer->ksnp_lock);
 
-        atomic_inc (&ksocknal_data.ksnd_npeers);
-        return (peer);
+        spin_lock_bh (&net->ksnn_lock);
+
+        if (net->ksnn_shutdown) {
+                spin_unlock_bh (&net->ksnn_lock);
+                
+                LIBCFS_FREE(peer, sizeof(*peer));
+                CERROR("Can't create peer: network shutdown\n");
+                return -ESHUTDOWN;
+        }
+
+        net->ksnn_npeers++;
+
+        spin_unlock_bh (&net->ksnn_lock);
+
+        *peerp = peer;
+        return 0;
 }
 
 void
 ksocknal_destroy_peer (ksock_peer_t *peer)
 {
-        CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ksnp_nid, peer);
+        ksock_net_t    *net = peer->ksnp_ni->ni_data;
+
+        CDEBUG (D_NET, "peer %s %p deleted\n", 
+                libcfs_id2str(peer->ksnp_id), peer);
 
         LASSERT (atomic_read (&peer->ksnp_refcount) == 0);
+        LASSERT (peer->ksnp_accepting == 0);
         LASSERT (list_empty (&peer->ksnp_conns));
         LASSERT (list_empty (&peer->ksnp_routes));
         LASSERT (list_empty (&peer->ksnp_tx_queue));
+        LASSERT (list_empty (&peer->ksnp_zc_req_list));
 
-        PORTAL_FREE (peer, sizeof (*peer));
-
-        /* NB a peer's connections and autoconnect routes keep a reference
-         * on their peer until they are destroyed, so we can be assured
-         * that _all_ state to do with this peer has been cleaned up when
-         * its refcount drops to zero. */
-        atomic_dec (&ksocknal_data.ksnd_npeers);
-}
-
-void
-ksocknal_put_peer (ksock_peer_t *peer)
-{
-        CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n",
-                peer, peer->ksnp_nid,
-                atomic_read (&peer->ksnp_refcount));
-
-        LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
-        if (!atomic_dec_and_test (&peer->ksnp_refcount))
-                return;
+        LIBCFS_FREE (peer, sizeof (*peer));
 
-        ksocknal_destroy_peer (peer);
+        /* NB a peer's connections and routes keep a reference on their peer
+         * until they are destroyed, so we can be assured that _all_ state to
+         * do with this peer has been cleaned up when its refcount drops to
+         * zero. */
+        spin_lock_bh (&net->ksnn_lock);
+        net->ksnn_npeers--;
+        spin_unlock_bh (&net->ksnn_lock);
 }
 
 ksock_peer_t *
-ksocknal_find_peer_locked (ptl_nid_t nid)
+ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id)
 {
-        struct list_head *peer_list = ksocknal_nid2peerlist (nid);
+        struct list_head *peer_list = ksocknal_nid2peerlist(id.nid);
         struct list_head *tmp;
         ksock_peer_t     *peer;
 
@@ -188,25 +177,30 @@ ksocknal_find_peer_locked (ptl_nid_t nid)
 
                 LASSERT (!peer->ksnp_closing);
 
-                if (peer->ksnp_nid != nid)
+                if (peer->ksnp_ni != ni)
                         continue;
 
-                CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
-                       peer, nid, atomic_read (&peer->ksnp_refcount));
+                if (peer->ksnp_id.nid != id.nid ||
+                    peer->ksnp_id.pid != id.pid)
+                        continue;
+
+                CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
+                       peer, libcfs_id2str(id), 
+                       atomic_read(&peer->ksnp_refcount));
                 return (peer);
         }
         return (NULL);
 }
 
 ksock_peer_t *
-ksocknal_get_peer (ptl_nid_t nid)
+ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id)
 {
         ksock_peer_t     *peer;
 
         read_lock (&ksocknal_data.ksnd_global_lock);
-        peer = ksocknal_find_peer_locked (nid);
+        peer = ksocknal_find_peer_locked (ni, id);
         if (peer != NULL)                       /* +1 ref for caller? */
-                atomic_inc (&peer->ksnp_refcount);
+                ksocknal_peer_addref(peer);
         read_unlock (&ksocknal_data.ksnd_global_lock);
 
         return (peer);
@@ -219,10 +213,10 @@ ksocknal_unlink_peer_locked (ksock_peer_t *peer)
         __u32              ip;
 
         for (i = 0; i < peer->ksnp_n_passive_ips; i++) {
-                LASSERT (i < SOCKNAL_MAX_INTERFACES);
+                LASSERT (i < LNET_MAX_INTERFACES);
                 ip = peer->ksnp_passive_ips[i];
 
-                ksocknal_ip2iface(ip)->ksni_npeers--;
+                ksocknal_ip2iface(peer->ksnp_ni, ip)->ksni_npeers--;
         }
 
         LASSERT (list_empty(&peer->ksnp_conns));
@@ -231,12 +225,12 @@ ksocknal_unlink_peer_locked (ksock_peer_t *peer)
         peer->ksnp_closing = 1;
         list_del (&peer->ksnp_list);
         /* lose peerlist's ref */
-        ksocknal_put_peer (peer);
+        ksocknal_peer_decref(peer);
 }
 
 int
-ksocknal_get_peer_info (int index, ptl_nid_t *nid,
-                        __u32 *myip, __u32 *peer_ip, int *port,
+ksocknal_get_peer_info (lnet_ni_t *ni, int index, 
+                        lnet_process_id_t *id, __u32 *myip, __u32 *peer_ip, int *port,
                         int *conn_count, int *share_count)
 {
         ksock_peer_t      *peer;
@@ -254,12 +248,15 @@ ksocknal_get_peer_info (int index, ptl_nid_t *nid,
                 list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) {
                         peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
 
+                        if (peer->ksnp_ni != ni)
+                                continue;
+
                         if (peer->ksnp_n_passive_ips == 0 &&
                             list_empty(&peer->ksnp_routes)) {
                                 if (index-- > 0)
                                         continue;
 
-                                *nid = peer->ksnp_nid;
+                                *id = peer->ksnp_id;
                                 *myip = 0;
                                 *peer_ip = 0;
                                 *port = 0;
@@ -273,7 +270,7 @@ ksocknal_get_peer_info (int index, ptl_nid_t *nid,
                                 if (index-- > 0)
                                         continue;
 
-                                *nid = peer->ksnp_nid;
+                                *id = peer->ksnp_id;
                                 *myip = peer->ksnp_passive_ips[j];
                                 *peer_ip = 0;
                                 *port = 0;
@@ -290,7 +287,7 @@ ksocknal_get_peer_info (int index, ptl_nid_t *nid,
                                 route = list_entry(rtmp, ksock_route_t,
                                                    ksnr_list);
 
-                                *nid = peer->ksnp_nid;
+                                *id = peer->ksnp_id;
                                 *myip = route->ksnr_myipaddr;
                                 *peer_ip = route->ksnr_ipaddr;
                                 *port = route->ksnr_port;
@@ -314,29 +311,31 @@ ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn)
         ksock_interface_t *iface;
 
         conn->ksnc_route = route;
-        atomic_inc (&route->ksnr_refcount);
+        ksocknal_route_addref(route);
 
         if (route->ksnr_myipaddr != conn->ksnc_myipaddr) {
                 if (route->ksnr_myipaddr == 0) {
                         /* route wasn't bound locally yet (the initial route) */
-                        CWARN("Binding "LPX64" %u.%u.%u.%u to %u.%u.%u.%u\n",
-                              peer->ksnp_nid,
-                              HIPQUAD(route->ksnr_ipaddr),
-                              HIPQUAD(conn->ksnc_myipaddr));
+                        CDEBUG(D_NET, "Binding %s %u.%u.%u.%u to %u.%u.%u.%u\n",
+                               libcfs_id2str(peer->ksnp_id),
+                               HIPQUAD(route->ksnr_ipaddr),
+                               HIPQUAD(conn->ksnc_myipaddr));
                 } else {
-                        CWARN("Rebinding "LPX64" %u.%u.%u.%u from "
-                              "%u.%u.%u.%u to %u.%u.%u.%u\n",
-                              peer->ksnp_nid,
-                              HIPQUAD(route->ksnr_ipaddr),
-                              HIPQUAD(route->ksnr_myipaddr),
-                              HIPQUAD(conn->ksnc_myipaddr));
-
-                        iface = ksocknal_ip2iface(route->ksnr_myipaddr);
+                        CDEBUG(D_NET, "Rebinding %s %u.%u.%u.%u from "
+                               "%u.%u.%u.%u to %u.%u.%u.%u\n",
+                               libcfs_id2str(peer->ksnp_id),
+                               HIPQUAD(route->ksnr_ipaddr),
+                               HIPQUAD(route->ksnr_myipaddr),
+                               HIPQUAD(conn->ksnc_myipaddr));
+
+                        iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+                                                  route->ksnr_myipaddr);
                         if (iface != NULL)
                                 iface->ksni_nroutes--;
                 }
                 route->ksnr_myipaddr = conn->ksnc_myipaddr;
-                iface = ksocknal_ip2iface(route->ksnr_myipaddr);
+                iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+                                          route->ksnr_myipaddr);
                 if (iface != NULL)
                         iface->ksni_nroutes++;
         }
@@ -346,8 +345,7 @@ ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn)
 
         /* Successful connection => further attempts can
          * proceed immediately */
-        route->ksnr_timeout = cfs_time_current();
-        route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL;
+        route->ksnr_retry_interval = 0;
 }
 
 void
@@ -355,10 +353,11 @@ ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route)
 {
         struct list_head  *tmp;
         ksock_conn_t      *conn;
-        int                type;
         ksock_route_t     *route2;
 
+        LASSERT (!peer->ksnp_closing);
         LASSERT (route->ksnr_peer == NULL);
+        LASSERT (!route->ksnr_scheduled);
         LASSERT (!route->ksnr_connecting);
         LASSERT (route->ksnr_connected == 0);
 
@@ -367,20 +366,20 @@ ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route)
                 route2 = list_entry(tmp, ksock_route_t, ksnr_list);
 
                 if (route2->ksnr_ipaddr == route->ksnr_ipaddr) {
-                        CERROR ("Duplicate route "LPX64" %u.%u.%u.%u\n",
-                                peer->ksnp_nid, HIPQUAD(route->ksnr_ipaddr));
+                        CERROR ("Duplicate route %s %u.%u.%u.%u\n",
+                                libcfs_id2str(peer->ksnp_id), 
+                                HIPQUAD(route->ksnr_ipaddr));
                         LBUG();
                 }
         }
 
         route->ksnr_peer = peer;
-        atomic_inc (&peer->ksnp_refcount);
+        ksocknal_peer_addref(peer);
         /* peer's routelist takes over my ref on 'route' */
         list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
 
         list_for_each(tmp, &peer->ksnp_conns) {
                 conn = list_entry(tmp, ksock_conn_t, ksnc_list);
-                type = conn->ksnc_type;
 
                 if (conn->ksnc_ipaddr != route->ksnr_ipaddr)
                         continue;
@@ -412,57 +411,59 @@ ksocknal_del_route_locked (ksock_route_t *route)
         }
 
         if (route->ksnr_myipaddr != 0) {
-                iface = ksocknal_ip2iface(route->ksnr_myipaddr);
+                iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni,
+                                          route->ksnr_myipaddr);
                 if (iface != NULL)
                         iface->ksni_nroutes--;
         }
 
         route->ksnr_deleted = 1;
         list_del (&route->ksnr_list);
-        ksocknal_put_route (route);             /* drop peer's ref */
+        ksocknal_route_decref(route);             /* drop peer's ref */
 
         if (list_empty (&peer->ksnp_routes) &&
             list_empty (&peer->ksnp_conns)) {
-                /* I've just removed the last autoconnect route of a peer
-                 * with no active connections */
+                /* I've just removed the last route to a peer with no active
+                 * connections */
                 ksocknal_unlink_peer_locked (peer);
         }
 }
 
 int
-ksocknal_add_peer (ptl_nid_t nid, __u32 ipaddr, int port)
+ksocknal_add_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port)
 {
-        unsigned long      flags;
         struct list_head  *tmp;
         ksock_peer_t      *peer;
         ksock_peer_t      *peer2;
         ksock_route_t     *route;
         ksock_route_t     *route2;
+        int                rc;
 
-        if (nid == PTL_NID_ANY)
+        if (id.nid == LNET_NID_ANY ||
+            id.pid == LNET_PID_ANY)
                 return (-EINVAL);
 
         /* Have a brand new peer ready... */
-        peer = ksocknal_create_peer (nid);
-        if (peer == NULL)
-                return (-ENOMEM);
+        rc = ksocknal_create_peer(&peer, ni, id);
+        if (rc != 0)
+                return rc;
 
         route = ksocknal_create_route (ipaddr, port);
         if (route == NULL) {
-                ksocknal_put_peer (peer);
+                ksocknal_peer_decref(peer);
                 return (-ENOMEM);
         }
 
-        write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+        write_lock_bh (&ksocknal_data.ksnd_global_lock);
 
-        peer2 = ksocknal_find_peer_locked (nid);
+        peer2 = ksocknal_find_peer_locked (ni, id);
         if (peer2 != NULL) {
-                ksocknal_put_peer (peer);
+                ksocknal_peer_decref(peer);
                 peer = peer2;
         } else {
                 /* peer table takes my ref on peer */
                 list_add_tail (&peer->ksnp_list,
-                               ksocknal_nid2peerlist (nid));
+                               ksocknal_nid2peerlist (id.nid));
         }
 
         route2 = NULL;
@@ -478,17 +479,17 @@ ksocknal_add_peer (ptl_nid_t nid, __u32 ipaddr, int port)
                 ksocknal_add_route_locked(peer, route);
                 route->ksnr_share_count++;
         } else {
-                ksocknal_put_route(route);
+                ksocknal_route_decref(route);
                 route2->ksnr_share_count++;
         }
 
-        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
+        write_unlock_bh (&ksocknal_data.ksnd_global_lock);
 
         return (0);
 }
 
 void
-ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip, int single_share)
+ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip)
 {
         ksock_conn_t     *conn;
         ksock_route_t    *route;
@@ -499,30 +500,18 @@ ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip, int single_share)
         LASSERT (!peer->ksnp_closing);
 
         /* Extra ref prevents peer disappearing until I'm done with it */
-        atomic_inc(&peer->ksnp_refcount);
+        ksocknal_peer_addref(peer);
 
         list_for_each_safe (tmp, nxt, &peer->ksnp_routes) {
                 route = list_entry(tmp, ksock_route_t, ksnr_list);
 
-                if (single_share && route->ksnr_share_count == 0)
-                        continue;
-
                 /* no match */
                 if (!(ip == 0 || route->ksnr_ipaddr == ip))
                         continue;
 
-                if (!single_share)
-                        route->ksnr_share_count = 0;
-                else if (route->ksnr_share_count > 0)
-                        route->ksnr_share_count--;
-
-                if (route->ksnr_share_count == 0) {
-                        /* This deletes associated conns too */
-                        ksocknal_del_route_locked (route);
-                }
-
-                if (single_share)
-                        break;
+                route->ksnr_share_count = 0;
+                /* This deletes associated conns too */
+                ksocknal_del_route_locked (route);
         }
 
         nshared = 0;
@@ -550,14 +539,14 @@ ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip, int single_share)
                 }
         }
 
-        ksocknal_put_peer(peer);
+        ksocknal_peer_decref(peer);
         /* NB peer unlinks itself when last conn/route is removed */
 }
 
 int
-ksocknal_del_peer (ptl_nid_t nid, __u32 ip, int single_share)
+ksocknal_del_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ip)
 {
-        unsigned long      flags;
+        CFS_LIST_HEAD     (zombies);
         struct list_head  *ptmp;
         struct list_head  *pnxt;
         ksock_peer_t      *peer;
@@ -566,10 +555,10 @@ ksocknal_del_peer (ptl_nid_t nid, __u32 ip, int single_share)
         int                i;
         int                rc = -ENOENT;
 
-        write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+        write_lock_bh (&ksocknal_data.ksnd_global_lock);
 
-        if (nid != PTL_NID_ANY)
-                lo = hi = ksocknal_nid2peerlist(nid) - ksocknal_data.ksnd_peers;
+        if (id.nid != LNET_NID_ANY)
+                lo = hi = ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers;
         else {
                 lo = 0;
                 hi = ksocknal_data.ksnd_peer_hash_size - 1;
@@ -579,24 +568,39 @@ ksocknal_del_peer (ptl_nid_t nid, __u32 ip, int single_share)
                 list_for_each_safe (ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) {
                         peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
 
-                        if (!(nid == PTL_NID_ANY || peer->ksnp_nid == nid))
+                        if (peer->ksnp_ni != ni)
                                 continue;
 
-                        ksocknal_del_peer_locked (peer, ip, single_share);
-                        rc = 0;                 /* matched! */
+                        if (!((id.nid == LNET_NID_ANY || peer->ksnp_id.nid == id.nid) &&
+                              (id.pid == LNET_PID_ANY || peer->ksnp_id.pid == id.pid)))
+                                continue;
 
-                        if (single_share)
-                                break;
+                        ksocknal_peer_addref(peer);     /* a ref for me... */
+
+                        ksocknal_del_peer_locked (peer, ip);
+
+                        if (peer->ksnp_closing && !list_empty(&peer->ksnp_tx_queue)) {
+                                LASSERT (list_empty(&peer->ksnp_conns));
+                                LASSERT (list_empty(&peer->ksnp_routes));
+
+                                list_splice_init(&peer->ksnp_tx_queue, &zombies);
+                        }
+
+                        ksocknal_peer_decref(peer);     /* ...till here */
+
+                        rc = 0;                 /* matched! */
                 }
         }
 
-        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
+        write_unlock_bh (&ksocknal_data.ksnd_global_lock);
+
+        ksocknal_txlist_done(ni, &zombies, 1);
 
         return (rc);
 }
 
 ksock_conn_t *
-ksocknal_get_conn_by_idx (int index)
+ksocknal_get_conn_by_idx (lnet_ni_t *ni, int index)
 {
         ksock_peer_t      *peer;
         struct list_head  *ptmp;
@@ -612,12 +616,15 @@ ksocknal_get_conn_by_idx (int index)
 
                         LASSERT (!peer->ksnp_closing);
 
+                        if (peer->ksnp_ni != ni)
+                                continue;
+
                         list_for_each (ctmp, &peer->ksnp_conns) {
                                 if (index-- > 0)
                                         continue;
 
                                 conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
-                                atomic_inc (&conn->ksnc_refcount);
+                                ksocknal_conn_addref(conn);
                                 read_unlock (&ksocknal_data.ksnd_global_lock);
                                 return (conn);
                         }
@@ -663,18 +670,26 @@ ksocknal_choose_scheduler_locked (unsigned int irq)
 }
 
 int
-ksocknal_local_ipvec (__u32 *ipaddrs)
+ksocknal_local_ipvec (lnet_ni_t *ni, __u32 *ipaddrs)
 {
+        ksock_net_t       *net = ni->ni_data;
         int                i;
         int                nip;
 
         read_lock (&ksocknal_data.ksnd_global_lock);
 
-        nip = ksocknal_data.ksnd_ninterfaces;
-        for (i = 0; i < nip; i++) {
-                LASSERT (i < SOCKNAL_MAX_INTERFACES);
+        nip = net->ksnn_ninterfaces;
+        LASSERT (nip < LNET_MAX_INTERFACES);
 
-                ipaddrs[i] = ksocknal_data.ksnd_interfaces[i].ksni_ipaddr;
+        /* Only offer interfaces for additional connections if I have 
+         * more than one. */
+        if (nip < 2) {
+                read_unlock (&ksocknal_data.ksnd_global_lock);
+                return 0;
+        }
+        
+        for (i = 0; i < nip; i++) {
+                ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr;
                 LASSERT (ipaddrs[i] != 0);
         }
 
@@ -718,7 +733,7 @@ int
 ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
 {
         rwlock_t           *global_lock = &ksocknal_data.ksnd_global_lock;
-        unsigned long       flags;
+        ksock_net_t        *net = peer->ksnp_ni->ni_data;
         ksock_interface_t  *iface;
         ksock_interface_t  *best_iface;
         int                 n_ips;
@@ -739,12 +754,15 @@ ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
         /* Also note that I'm not going to return more than n_peerips
          * interfaces, even if I have more myself */
 
-        write_lock_irqsave(global_lock, flags);
+        write_lock_bh (global_lock);
 
-        LASSERT (n_peerips <= SOCKNAL_MAX_INTERFACES);
-        LASSERT (ksocknal_data.ksnd_ninterfaces <= SOCKNAL_MAX_INTERFACES);
+        LASSERT (n_peerips <= LNET_MAX_INTERFACES);
+        LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
 
-        n_ips = MIN(n_peerips, ksocknal_data.ksnd_ninterfaces);
+        /* Only match interfaces for additional connections 
+         * if I have > 1 interface */
+        n_ips = (net->ksnn_ninterfaces < 2) ? 0 :
+                MIN(n_peerips, net->ksnn_ninterfaces);
 
         for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) {
                 /*              ^ yes really... */
@@ -758,7 +776,7 @@ ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
                 if (i < peer->ksnp_n_passive_ips) {
                         /* Old interface. */
                         ip = peer->ksnp_passive_ips[i];
-                        best_iface = ksocknal_ip2iface(ip);
+                        best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip);
 
                         /* peer passive ips are kept up to date */
                         LASSERT(best_iface != NULL);
@@ -770,8 +788,8 @@ ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
                         best_netmatch = 0;
                         best_npeers = 0;
 
-                        for (j = 0; j < ksocknal_data.ksnd_ninterfaces; j++) {
-                                iface = &ksocknal_data.ksnd_interfaces[j];
+                        for (j = 0; j < net->ksnn_ninterfaces; j++) {
+                                iface = &net->ksnn_interfaces[j];
                                 ip = iface->ksni_ipaddr;
 
                                 for (k = 0; k < peer->ksnp_n_passive_ips; k++)
@@ -812,7 +830,7 @@ ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips)
         /* Overwrite input peer IP addresses */
         memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips));
 
-        write_unlock_irqrestore(global_lock, flags);
+        write_unlock_bh (global_lock);
 
         return (n_ips);
 }
@@ -823,7 +841,8 @@ ksocknal_create_routes(ksock_peer_t *peer, int port,
 {
         ksock_route_t      *newroute = NULL;
         rwlock_t           *global_lock = &ksocknal_data.ksnd_global_lock;
-        unsigned long       flags;
+        lnet_ni_t          *ni = peer->ksnp_ni;
+        ksock_net_t        *net = ni->ni_data;
         struct list_head   *rtmp;
         ksock_route_t      *route;
         ksock_interface_t  *iface;
@@ -839,21 +858,33 @@ ksocknal_create_routes(ksock_peer_t *peer, int port,
          * expecting to be dealing with small numbers of interfaces, so the
          * O(n**3)-ness here shouldn't matter */
 
-        write_lock_irqsave(global_lock, flags);
+        write_lock_bh (global_lock);
 
-        LASSERT (npeer_ipaddrs <= SOCKNAL_MAX_INTERFACES);
+        if (net->ksnn_ninterfaces < 2) {
+                /* Only create additional connections 
+                 * if I have > 1 interface */
+                write_unlock_bh (global_lock);
+                return;
+        }
+        
+        LASSERT (npeer_ipaddrs <= LNET_MAX_INTERFACES);
 
         for (i = 0; i < npeer_ipaddrs; i++) {
                 if (newroute != NULL) {
                         newroute->ksnr_ipaddr = peer_ipaddrs[i];
                 } else {
-                        write_unlock_irqrestore(global_lock, flags);
+                        write_unlock_bh (global_lock);
 
                         newroute = ksocknal_create_route(peer_ipaddrs[i], port);
                         if (newroute == NULL)
                                 return;
 
-                        write_lock_irqsave(global_lock, flags);
+                        write_lock_bh (global_lock);
+                }
+
+                if (peer->ksnp_closing) {
+                        /* peer got closed under me */
+                        break;
                 }
 
                 /* Already got a route? */
@@ -873,11 +904,11 @@ ksocknal_create_routes(ksock_peer_t *peer, int port,
                 best_nroutes = 0;
                 best_netmatch = 0;
 
-                LASSERT (ksocknal_data.ksnd_ninterfaces <= SOCKNAL_MAX_INTERFACES);
+                LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES);
 
                 /* Select interface to connect from */
-                for (j = 0; j < ksocknal_data.ksnd_ninterfaces; j++) {
-                        iface = &ksocknal_data.ksnd_interfaces[j];
+                for (j = 0; j < net->ksnn_ninterfaces; j++) {
+                        iface = &net->ksnn_interfaces[j];
 
                         /* Using this interface already? */
                         list_for_each(rtmp, &peer->ksnp_routes) {
@@ -916,143 +947,266 @@ ksocknal_create_routes(ksock_peer_t *peer, int port,
                 newroute = NULL;
         }
 
-        write_unlock_irqrestore(global_lock, flags);
+        write_unlock_bh (global_lock);
         if (newroute != NULL)
-                ksocknal_put_route(newroute);
+                ksocknal_route_decref(newroute);
 }
 
 int
-ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
+ksocknal_accept (lnet_ni_t *ni, cfs_socket_t *sock)
+{
+        ksock_connreq_t    *cr;
+        int                 rc;
+        __u32               peer_ip;
+        int                 peer_port;
+
+        rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+        LASSERT (rc == 0);                      /* we succeeded before */
+
+        LIBCFS_ALLOC(cr, sizeof(*cr));
+        if (cr == NULL) {
+                LCONSOLE_ERROR("Dropping connection request from "
+                               "%u.%u.%u.%u: memory exhausted\n",
+                               HIPQUAD(peer_ip));
+                return -ENOMEM;
+        }
+
+        lnet_ni_addref(ni);
+        cr->ksncr_ni   = ni;
+        cr->ksncr_sock = sock;
+
+        spin_lock_bh (&ksocknal_data.ksnd_connd_lock);
+
+        list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs);
+        cfs_waitq_signal(&ksocknal_data.ksnd_connd_waitq);
+                        
+        spin_unlock_bh (&ksocknal_data.ksnd_connd_lock);
+        return 0;
+}
+
+int
+ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route, 
+                      cfs_socket_t *sock, int type)
 {
-        int                passive = (type == SOCKNAL_CONN_NONE);
         rwlock_t          *global_lock = &ksocknal_data.ksnd_global_lock;
-        __u32              ipaddrs[SOCKNAL_MAX_INTERFACES];
-        int                nipaddrs;
-        ptl_nid_t          nid;
+        CFS_LIST_HEAD     (zombies);
+        lnet_process_id_t  peerid;
         struct list_head  *tmp;
         __u64              incarnation;
-        unsigned long      flags;
         ksock_conn_t      *conn;
         ksock_conn_t      *conn2;
         ksock_peer_t      *peer = NULL;
         ksock_peer_t      *peer2;
         ksock_sched_t     *sched;
+        ksock_hello_msg_t *hello;
         unsigned int       irq;
         ksock_tx_t        *tx;
         int                rc;
+        int                active;
+        char              *warn = NULL;
 
-        /* NB, sock has an associated file since (a) this connection might
-         * have been created in userland and (b) we need to refcount the
-         * socket so that we don't close it while I/O is being done on
-         * it, and sock->file has that pre-cooked... */
-        LASSERT (KSN_SOCK2FILE(sock) != NULL);
-        LASSERT (cfs_file_count(KSN_SOCK2FILE(sock)) > 0);
-        LASSERT (route == NULL || !passive);
+        active = (route != NULL);
 
-        rc = ksocknal_lib_setup_sock (sock);
-        if (rc != 0)
-                return (rc);
+        LASSERT (active == (type != SOCKLND_CONN_NONE));
+        LASSERT (route == NULL || route->ksnr_proto != NULL);
 
         irq = ksocknal_lib_sock_irq (sock);
 
-        PORTAL_ALLOC(conn, sizeof(*conn));
-        if (conn == NULL)
-                return (-ENOMEM);
+        LIBCFS_ALLOC(conn, sizeof(*conn));
+        if (conn == NULL) {
+                rc = -ENOMEM;
+                goto failed_0;
+        }
 
         memset (conn, 0, sizeof (*conn));
         conn->ksnc_peer = NULL;
         conn->ksnc_route = NULL;
         conn->ksnc_sock = sock;
+        atomic_set (&conn->ksnc_sock_refcount, 1); /* 1 ref for conn */
         conn->ksnc_type = type;
         ksocknal_lib_save_callback(sock, conn);
-        atomic_set (&conn->ksnc_refcount, 1);    /* 1 ref for me */
+        atomic_set (&conn->ksnc_conn_refcount, 1); /* 1 ref for me */
+
+        conn->ksnc_zc_capable = ksocknal_lib_zc_capable(sock);
 
         conn->ksnc_rx_ready = 0;
         conn->ksnc_rx_scheduled = 0;
-        ksocknal_new_packet (conn, 0);
 
         CFS_INIT_LIST_HEAD (&conn->ksnc_tx_queue);
         conn->ksnc_tx_ready = 0;
         conn->ksnc_tx_scheduled = 0;
+        conn->ksnc_tx_mono = NULL;
         atomic_set (&conn->ksnc_tx_nob, 0);
 
+        LIBCFS_ALLOC(hello, offsetof(ksock_hello_msg_t,
+                                     kshm_ips[LNET_MAX_INTERFACES]));
+        if (hello == NULL) {
+                rc = -ENOMEM;
+                goto failed_1;
+        }
+
         /* stash conn's local and remote addrs */
         rc = ksocknal_lib_get_conn_addrs (conn);
         if (rc != 0)
-                goto failed_0;
+                goto failed_1;
+
+        /* Find out/confirm peer's NID and connection type and get the
+         * vector of interfaces she's willing to let me connect to.
+         * Passive connections use the listener timeout since the peer sends
+         * eagerly */
+
+        if (active) {
+                LASSERT(ni == route->ksnr_peer->ksnp_ni);
 
-        if (!passive) {
                 /* Active connection sends HELLO eagerly */
-                rc = ksocknal_local_ipvec(ipaddrs);
-                if (rc < 0)
-                        goto failed_0;
-                nipaddrs = rc;
+                hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips);
+                peerid = route->ksnr_peer->ksnp_id;
+                conn->ksnc_proto = route->ksnr_proto;
 
-                rc = ksocknal_send_hello (conn, ipaddrs, nipaddrs);
+                rc = ksocknal_send_hello (ni, conn, peerid.nid, hello);
                 if (rc != 0)
-                        goto failed_0;
+                        goto failed_1;
+        } else {
+                peerid.nid = LNET_NID_ANY;
+                peerid.pid = LNET_PID_ANY;
+
+                /* Passive, get protocol from peer */
+                conn->ksnc_proto = NULL;
         }
 
-        /* Find out/confirm peer's NID and connection type and get the
-         * vector of interfaces she's willing to let me connect to */
-        nid = (route == NULL) ? PTL_NID_ANY : route->ksnr_peer->ksnp_nid;
-        rc = ksocknal_recv_hello (conn, &nid, &incarnation, ipaddrs);
-        if (rc < 0)
-                goto failed_0;
-        nipaddrs = rc;
-        LASSERT (nid != PTL_NID_ANY);
+        rc = ksocknal_recv_hello (ni, conn, hello, &peerid, &incarnation);
+        if (rc < 0) {
+                if (rc == -EALREADY) {
+                        /* only active connection loses conn race */
+                        LASSERT (active);
 
-        if (route != NULL) {
+                        CDEBUG(D_NET, "Lost connection race with %s\n", 
+                               libcfs_id2str(peerid));
+                        /* Not an actual failure: return +ve RC so active
+                         * connector can back off */
+                        rc = EALREADY;
+                }
+                goto failed_1;
+        }
+
+        if (active && route->ksnr_proto != conn->ksnc_proto) {
+                /* Active connecting, and different protocol is returned */
+                CDEBUG(D_NET, "Connecting by %d.x protocol is rejected,"
+                              " compatible version %d.x found.\n",
+                       route->ksnr_proto->pro_version,
+                       conn->ksnc_proto->pro_version);
+                /* Not an actual failure: return +ve RC so active
+                 * connector can back off */
+                rc = EPROTO;
+
+                /* Retry with peer's protocol later */
+                route->ksnr_proto = conn->ksnc_proto;
+
+                goto failed_1;
+        }
+        
+        LASSERT (peerid.nid != LNET_NID_ANY);
+
+        if (active) {
                 peer = route->ksnr_peer;
-                atomic_inc(&peer->ksnp_refcount);
+                ksocknal_peer_addref(peer);
+
+                /* additional routes after interface exchange? */
+                ksocknal_create_routes(peer, conn->ksnc_port,
+                                       hello->kshm_ips, hello->kshm_nips);
+
+                /* setup the socket AFTER I've received hello (it disables
+                 * SO_LINGER).  I might call back to the acceptor who may want
+                 * to send a protocol version response and then close the
+                 * socket; this ensures the socket only tears down after the
+                 * response has been sent. */
+                rc = ksocknal_lib_setup_sock(sock);
+
+                write_lock_bh (global_lock);
+
+                if (rc != 0)
+                        goto failed_2;
         } else {
-                peer = ksocknal_create_peer(nid);
-                if (peer == NULL) {
-                        rc = -ENOMEM;
-                        goto failed_0;
-                }
+                rc = ksocknal_create_peer(&peer, ni, peerid);
+                if (rc != 0)
+                        goto failed_1;
 
-                write_lock_irqsave(global_lock, flags);
+                write_lock_bh (global_lock);
 
-                peer2 = ksocknal_find_peer_locked(nid);
+                peer2 = ksocknal_find_peer_locked(ni, peerid);
                 if (peer2 == NULL) {
                         /* NB this puts an "empty" peer in the peer
                          * table (which takes my ref) */
                         list_add_tail(&peer->ksnp_list,
-                                      ksocknal_nid2peerlist(nid));
-                } else  {
-                        ksocknal_put_peer(peer);
+                                      ksocknal_nid2peerlist(peerid.nid));
+                } else {
+                        ksocknal_peer_decref(peer);
                         peer = peer2;
                 }
+
                 /* +1 ref for me */
-                atomic_inc(&peer->ksnp_refcount);
+                ksocknal_peer_addref(peer);
+                peer->ksnp_accepting++;
+                
+                /* Am I already connecting to this guy?  Resolve in
+                 * favour of higher NID... */
+                rc = 0;
+                if (peerid.nid < ni->ni_nid) {
+                        list_for_each(tmp, &peer->ksnp_routes) {
+                                route = list_entry(tmp, ksock_route_t, 
+                                                   ksnr_list);
 
-                write_unlock_irqrestore(global_lock, flags);
-        }
+                                if (route->ksnr_ipaddr != conn->ksnc_ipaddr)
+                                        continue;
+                        
+                                if (route->ksnr_connecting) {
+                                        rc = EALREADY;  /* not a failure */
+                                        warn = "connection race";
+                                }
 
-        if (!passive) {
-                ksocknal_create_routes(peer, conn->ksnc_port,
-                                       ipaddrs, nipaddrs);
-                rc = 0;
-        } else {
-                rc = ksocknal_select_ips(peer, ipaddrs, nipaddrs);
-                LASSERT (rc >= 0);
-                rc = ksocknal_send_hello (conn, ipaddrs, rc);
-        }
-        if (rc < 0)
-                goto failed_1;
+                                break;
+                        }
+                }
+                route = NULL;
+                
+                write_unlock_bh (global_lock);
 
-        write_lock_irqsave (global_lock, flags);
+                if (rc != 0) {
+                        /* set CONN_NONE makes returned HELLO acknowledge I
+                         * lost a connection race */
+                        conn->ksnc_type = SOCKLND_CONN_NONE;
+                        hello->kshm_nips = 0;
+                        ksocknal_send_hello(ni, conn, peerid.nid, hello);
+                } else {
+                        hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips,
+                                                               hello->kshm_nips);
+                        rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
+
+                        /* Setup the socket (it disables SO_LINGER).  I don't
+                         * do it if I'm sending a negative response to ensure
+                         * the response isn't discarded when I close the socket
+                         * immediately after sending it. */
+                        if (rc == 0)
+                                rc = ksocknal_lib_setup_sock(sock);
+                }
+                
+                write_lock_bh (global_lock);
+                peer->ksnp_accepting--;
+                
+                if (rc != 0)
+                        goto failed_2;
+        }
 
         if (peer->ksnp_closing ||
-            (route != NULL && route->ksnr_deleted)) {
-                /* route/peer got closed under me */
+            (active && route->ksnr_deleted)) {
+                /* peer/route got closed under me */
                 rc = -ESTALE;
+                warn = "peer/route removed";
                 goto failed_2;
         }
 
-        /* Refuse to duplicate an existing connection (both sides might
-         * autoconnect at once), unless this is a loopback connection */
+        /* Refuse to duplicate an existing connection, unless this is a
+         * loopback connection */
         if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) {
                 list_for_each(tmp, &peer->ksnp_conns) {
                         conn2 = list_entry(tmp, ksock_conn_t, ksnc_list);
@@ -1063,10 +1217,8 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
                             conn2->ksnc_incarnation != incarnation)
                                 continue;
 
-                        CWARN("Not creating duplicate connection to "
-                              "%u.%u.%u.%u type %d\n",
-                              HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_type);
-                        rc = -EALREADY;
+                        rc = 0;    /* more of a NOOP than a failure */
+                        warn = "duplicate";
                         goto failed_2;
                 }
         }
@@ -1074,10 +1226,10 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
         /* If the connection created by this route didn't bind to the IP
          * address the route connected to, the connection/route matching
          * code below probably isn't going to work. */
-        if (route != NULL &&
+        if (active &&
             route->ksnr_ipaddr != conn->ksnc_ipaddr) {
-                CERROR("Route "LPX64" %u.%u.%u.%u connected to %u.%u.%u.%u\n",
-                       peer->ksnp_nid,
+                CERROR("Route %s %u.%u.%u.%u connected to %u.%u.%u.%u\n",
+                       libcfs_id2str(peer->ksnp_id),
                        HIPQUAD(route->ksnr_ipaddr),
                        HIPQUAD(conn->ksnc_ipaddr));
         }
@@ -1096,9 +1248,6 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
                 break;
         }
 
-        /* Give conn a ref on sock->file since we're going to return success */
-        cfs_get_file(KSN_SOCK2FILE(sock));
-
         conn->ksnc_peer = peer;                 /* conn takes my ref on peer */
         conn->ksnc_incarnation = incarnation;
         peer->ksnp_last_alive = cfs_time_current();
@@ -1110,11 +1259,13 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
 
         /* Set the deadline for the outgoing HELLO to drain */
         conn->ksnc_tx_bufnob = SOCK_WMEM_QUEUED(sock);
-        conn->ksnc_tx_deadline = cfs_time_shift(ksocknal_tunables.ksnd_io_timeout);
+        conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
         mb();       /* order with adding to peer's conn list */
 
         list_add (&conn->ksnc_list, &peer->ksnp_conns);
-        atomic_inc (&conn->ksnc_refcount);
+        ksocknal_conn_addref(conn);
+
+        ksocknal_new_packet(conn, 0);
 
         /* NB my callbacks block while I hold ksnd_global_lock */
         ksocknal_lib_set_callback(sock, conn);
@@ -1131,46 +1282,67 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type)
         }
 
         rc = ksocknal_close_stale_conns_locked(peer, incarnation);
+        write_unlock_bh (global_lock);
+
         if (rc != 0)
-                CDEBUG(D_HA,
-                       "Closed %d stale conns to nid "LPX64" ip %d.%d.%d.%d\n",
-                       rc, conn->ksnc_peer->ksnp_nid,
+                CDEBUG(D_NET, "Closed %d stale conns to %s ip %d.%d.%d.%d\n",
+                       rc, libcfs_id2str(conn->ksnc_peer->ksnp_id),
                        HIPQUAD(conn->ksnc_ipaddr));
 
-        write_unlock_irqrestore (global_lock, flags);
-
         ksocknal_lib_bind_irq (irq);
 
         /* Call the callbacks right now to get things going. */
-        if (ksocknal_getconnsock(conn) == 0) {
-                ksocknal_lib_act_callback(sock, conn);
-                ksocknal_putconnsock(conn);
+        if (ksocknal_connsock_addref(conn) == 0) {
+                ksocknal_read_callback(conn);
+                ksocknal_write_callback(conn);
+                ksocknal_connsock_decref(conn);
         }
 
-        CDEBUG(D_HA, "New conn nid:"LPX64" %u.%u.%u.%u -> %u.%u.%u.%u/%d "
-               "incarnation:"LPX64" sched[%d]/%d\n",
-               nid, HIPQUAD(conn->ksnc_myipaddr),
+        CDEBUG(D_NET, "New conn %s %u.%u.%u.%u -> %u.%u.%u.%u/%d"
+               " incarnation:"LPD64" sched[%d]/%d\n",
+               libcfs_id2str(peerid), HIPQUAD(conn->ksnc_myipaddr),
                HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation,
-               (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers),irq);
+               (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq);
+
+        LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t,
+                                    kshm_ips[LNET_MAX_INTERFACES]));
 
-        ksocknal_put_conn (conn);
+        ksocknal_conn_decref(conn);
         return (0);
 
  failed_2:
         if (!peer->ksnp_closing &&
             list_empty (&peer->ksnp_conns) &&
-            list_empty (&peer->ksnp_routes))
+            list_empty (&peer->ksnp_routes)) {
+                list_add(&zombies, &peer->ksnp_tx_queue);
+                list_del_init(&peer->ksnp_tx_queue);
                 ksocknal_unlink_peer_locked(peer);
-        write_unlock_irqrestore(global_lock, flags);
+        }
+        
+        write_unlock_bh (global_lock);
+
+        if (warn != NULL) {
+                if (rc < 0)
+                        CERROR("Not creating conn %s type %d: %s\n",
+                               libcfs_id2str(peerid), conn->ksnc_type, warn);
+                else
+                        CDEBUG(D_NET, "Not creating conn %s type %d: %s\n",
+                              libcfs_id2str(peerid), conn->ksnc_type, warn);
+        }
+
+        ksocknal_txlist_done(ni, &zombies, 1);
+        ksocknal_peer_decref(peer);
 
  failed_1:
-        ksocknal_put_peer (peer);
+        if (hello != NULL)
+                LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t,
+                                            kshm_ips[LNET_MAX_INTERFACES]));
 
- failed_0:
-        PORTAL_FREE (conn, sizeof(*conn));
+        LIBCFS_FREE (conn, sizeof(*conn));
 
-        LASSERT (rc != 0);
-        return (rc);
+ failed_0:
+        libcfs_sock_release(sock);
+        return rc;
 }
 
 void
@@ -1187,7 +1359,6 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
         LASSERT (peer->ksnp_error == 0);
         LASSERT (!conn->ksnc_closing);
         conn->ksnc_closing = 1;
-        atomic_inc (&ksocknal_data.ksnd_nclosing_conns);
 
         /* ksnd_deathrow_conns takes over peer's ref */
         list_del (&conn->ksnc_list);
@@ -1217,7 +1388,7 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
                 list_del (&route->ksnr_list);   /* make route least favourite */
                 list_add_tail (&route->ksnr_list, &peer->ksnp_routes);
 #endif
-                ksocknal_put_route (route);     /* drop conn's ref on route */
+                ksocknal_route_decref(route);     /* drop conn's ref on route */
         }
 
         if (list_empty (&peer->ksnp_conns)) {
@@ -1227,17 +1398,46 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
 
                 if (list_empty (&peer->ksnp_routes)) {
                         /* I've just closed last conn belonging to a
-                         * non-autoconnecting peer */
+                         * peer with no routes to it */
                         ksocknal_unlink_peer_locked (peer);
                 }
         }
 
-        spin_lock (&ksocknal_data.ksnd_reaper_lock);
+        spin_lock_bh (&ksocknal_data.ksnd_reaper_lock);
 
         list_add_tail (&conn->ksnc_list, &ksocknal_data.ksnd_deathrow_conns);
         cfs_waitq_signal (&ksocknal_data.ksnd_reaper_waitq);
 
-        spin_unlock (&ksocknal_data.ksnd_reaper_lock);
+        spin_unlock_bh (&ksocknal_data.ksnd_reaper_lock);
+}
+
+void
+ksocknal_peer_failed (ksock_peer_t *peer)
+{
+        time_t    last_alive = 0;
+        int       notify = 0;
+
+        /* There has been a connection failure or comms error; but I'll only
+         * tell LNET I think the peer is dead if it's to another kernel and
+         * there are no connections or connection attempts in existance. */
+        
+        read_lock (&ksocknal_data.ksnd_global_lock);
+
+        if ((peer->ksnp_id.pid & LNET_PID_USERFLAG) == 0 &&
+            list_empty(&peer->ksnp_conns) &&
+            peer->ksnp_accepting == 0 &&
+            ksocknal_find_connecting_route_locked(peer) == NULL) {
+                notify = 1;
+                last_alive = cfs_time_current_sec() - 
+                             cfs_duration_sec(cfs_time_current() - 
+                                              peer->ksnp_last_alive);
+        }
+        
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+
+        if (notify)
+                lnet_notify (peer->ksnp_ni, peer->ksnp_id.nid, 0,
+                             last_alive);
 }
 
 void
@@ -1247,17 +1447,14 @@ ksocknal_terminate_conn (ksock_conn_t *conn)
          * disengage the socket from its callbacks and close it.
          * ksnc_refcount will eventually hit zero, and then the reaper will
          * destroy it. */
-        unsigned long   flags;
         ksock_peer_t   *peer = conn->ksnc_peer;
         ksock_sched_t  *sched = conn->ksnc_scheduler;
-        struct timeval  now;
-        time_t          then = 0;
-        int             notify = 0;
+        int             failed = 0;
 
         LASSERT(conn->ksnc_closing);
 
         /* wake up the scheduler to "send" all remaining packets to /dev/null */
-        spin_lock_irqsave(&sched->kss_lock, flags);
+        spin_lock_bh (&sched->kss_lock);
 
         if (!conn->ksnc_tx_scheduled &&
             !list_empty(&conn->ksnc_tx_queue)){
@@ -1267,15 +1464,43 @@ ksocknal_terminate_conn (ksock_conn_t *conn)
                 conn->ksnc_tx_ready = 1;
                 conn->ksnc_tx_scheduled = 1;
                 /* extra ref for scheduler */
-                atomic_inc (&conn->ksnc_refcount);
+                ksocknal_conn_addref(conn);
 
                 cfs_waitq_signal (&sched->kss_waitq);
         }
 
-        spin_unlock_irqrestore (&sched->kss_lock, flags);
+        spin_unlock_bh (&sched->kss_lock);
+
+        spin_lock(&peer->ksnp_lock);
+        if (!list_empty(&peer->ksnp_zc_req_list)) {
+                struct list_head *tmp;
+                struct list_head *nxt;
+                ksock_tx_t       *tx;
+                LIST_HEAD         (zlist);
+
+                list_for_each_safe(tmp, nxt, &peer->ksnp_zc_req_list) {
+                        tx = list_entry(tmp, ksock_tx_t, tx_zc_list);
+
+                        if (tx->tx_conn != conn)
+                                continue;
+                        list_del(&tx->tx_zc_list);
+                        /* tell scheduler it's deleted */
+                        tx->tx_msg.ksm_zc_req_cookie = 0;
+                        list_add(&tx->tx_zc_list, &zlist);
+                }
+                spin_unlock(&peer->ksnp_lock);
+
+                list_for_each_safe(tmp, nxt, &zlist) {
+                        tx = list_entry(tmp, ksock_tx_t, tx_zc_list);
+                        list_del(&tx->tx_zc_list);
+                        ksocknal_tx_decref(tx);
+                }
+        } else {
+                spin_unlock(&peer->ksnp_lock);
+        }
 
         /* serialise with callbacks */
-        write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+        write_lock_bh (&ksocknal_data.ksnd_global_lock);
 
         ksocknal_lib_reset_callback(conn->ksnc_sock, conn);
 
@@ -1286,26 +1511,35 @@ ksocknal_terminate_conn (ksock_conn_t *conn)
         if (peer->ksnp_error != 0) {
                 /* peer's last conn closed in error */
                 LASSERT (list_empty (&peer->ksnp_conns));
-
-                /* convert peer's last-known-alive timestamp from jiffies */
-                do_gettimeofday (&now);
-                then = now.tv_sec - cfs_duration_sec(cfs_time_sub(cfs_time_current(),
-                                                                  peer->ksnp_last_alive));
-                notify = 1;
+                failed = 1;
+                peer->ksnp_error = 0;     /* avoid multiple notifications */
         }
 
-        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
+        write_unlock_bh (&ksocknal_data.ksnd_global_lock);
+
+        if (failed)
+                ksocknal_peer_failed(peer);
 
         /* The socket is closed on the final put; either here, or in
          * ksocknal_{send,recv}msg().  Since we set up the linger2 option
          * when the connection was established, this will close the socket
          * immediately, aborting anything buffered in it. Any hung
          * zero-copy transmits will therefore complete in finite time. */
-        ksocknal_putconnsock (conn);
+        ksocknal_connsock_decref(conn);
+}
 
-        if (notify)
-                kpr_notify (&ksocknal_data.ksnd_router, peer->ksnp_nid,
-                            0, then);
+void
+ksocknal_queue_zombie_conn (ksock_conn_t *conn)
+{
+        /* Queue the conn for the reaper to destroy */
+
+        LASSERT (atomic_read(&conn->ksnc_conn_refcount) == 0);
+        spin_lock_bh (&ksocknal_data.ksnd_reaper_lock);
+
+        list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns);
+        cfs_waitq_signal(&ksocknal_data.ksnd_reaper_waitq);
+        
+        spin_unlock_bh (&ksocknal_data.ksnd_reaper_lock);
 }
 
 void
@@ -1314,7 +1548,9 @@ ksocknal_destroy_conn (ksock_conn_t *conn)
         /* Final coup-de-grace of the reaper */
         CDEBUG (D_NET, "connection %p\n", conn);
 
-        LASSERT (atomic_read (&conn->ksnc_refcount) == 0);
+        LASSERT (atomic_read (&conn->ksnc_conn_refcount) == 0);
+        LASSERT (atomic_read (&conn->ksnc_sock_refcount) == 0);
+        LASSERT (conn->ksnc_sock == NULL);
         LASSERT (conn->ksnc_route == NULL);
         LASSERT (!conn->ksnc_tx_scheduled);
         LASSERT (!conn->ksnc_rx_scheduled);
@@ -1322,49 +1558,45 @@ ksocknal_destroy_conn (ksock_conn_t *conn)
 
         /* complete current receive if any */
         switch (conn->ksnc_rx_state) {
-        case SOCKNAL_RX_BODY:
-                CERROR("Completing partial receive from "LPX64
+        case SOCKNAL_RX_LNET_PAYLOAD:
+                CERROR("Completing partial receive from %s"
                        ", ip %d.%d.%d.%d:%d, with error\n",
-                       conn->ksnc_peer->ksnp_nid,
+                       libcfs_id2str(conn->ksnc_peer->ksnp_id),
                        HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
-                lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_FAIL);
+                lnet_finalize (conn->ksnc_peer->ksnp_ni, 
+                               conn->ksnc_cookie, -EIO);
                 break;
-        case SOCKNAL_RX_BODY_FWD:
-                ksocknal_fmb_callback (conn->ksnc_cookie, -ECONNABORTED);
+        case SOCKNAL_RX_LNET_HEADER:
+                if (conn->ksnc_rx_started)
+                        CERROR("Incomplete receive of lnet header from %s"
+                               ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n",
+                               libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                               HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+                               conn->ksnc_proto->pro_version);
                 break;
-        case SOCKNAL_RX_HEADER:
-        case SOCKNAL_RX_SLOP:
+        case SOCKNAL_RX_KSM_HEADER:
+                if (conn->ksnc_rx_started)
+                        CERROR("Incomplete receive of ksock message from %s"
+                               ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n",
+                               libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                               HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port,
+                               conn->ksnc_proto->pro_version);
                 break;
+        case SOCKNAL_RX_SLOP:
+                if (conn->ksnc_rx_started)
+                        CERROR("Incomplete receive of slops from %s"
+                               ", ip %d.%d.%d.%d:%d, with error\n",
+                               libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                               HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+                break;
         default:
                 LBUG ();
                 break;
         }
 
-        ksocknal_put_peer (conn->ksnc_peer);
+        ksocknal_peer_decref(conn->ksnc_peer);
 
-        PORTAL_FREE (conn, sizeof (*conn));
-        atomic_dec (&ksocknal_data.ksnd_nclosing_conns);
-}
-
-void
-ksocknal_put_conn (ksock_conn_t *conn)
-{
-        unsigned long flags;
-
-        CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n",
-                conn, conn->ksnc_peer->ksnp_nid,
-                atomic_read (&conn->ksnc_refcount));
-
-        LASSERT (atomic_read (&conn->ksnc_refcount) > 0);
-        if (!atomic_dec_and_test (&conn->ksnc_refcount))
-                return;
-
-        spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
-
-        list_add (&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns);
-        cfs_waitq_signal (&ksocknal_data.ksnd_reaper_waitq);
-
-        spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+        LIBCFS_FREE (conn, sizeof (*conn));
 }
 
 int
@@ -1402,10 +1634,11 @@ ksocknal_close_stale_conns_locked (ksock_peer_t *peer, __u64 incarnation)
                 if (conn->ksnc_incarnation == incarnation)
                         continue;
 
-                CWARN("Closing stale conn nid:"LPX64" ip:%08x/%d "
-                      "incarnation:"LPX64"("LPX64")\n",
-                      peer->ksnp_nid, conn->ksnc_ipaddr, conn->ksnc_port,
-                      conn->ksnc_incarnation, incarnation);
+                CDEBUG(D_NET, "Closing stale conn %s ip:%08x/%d "
+                       "incarnation:"LPD64"("LPD64")\n",
+                       libcfs_id2str(peer->ksnp_id), 
+                       conn->ksnc_ipaddr, conn->ksnc_port,
+                       conn->ksnc_incarnation, incarnation);
 
                 count++;
                 ksocknal_close_conn_locked (conn, -ESTALE);
@@ -1419,22 +1652,20 @@ ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why)
 {
         ksock_peer_t     *peer = conn->ksnc_peer;
         __u32             ipaddr = conn->ksnc_ipaddr;
-        unsigned long     flags;
         int               count;
 
-        write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+        write_lock_bh (&ksocknal_data.ksnd_global_lock);
 
         count = ksocknal_close_peer_conns_locked (peer, ipaddr, why);
 
-        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
+        write_unlock_bh (&ksocknal_data.ksnd_global_lock);
 
         return (count);
 }
 
 int
-ksocknal_close_matching_conns (ptl_nid_t nid, __u32 ipaddr)
+ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr)
 {
-        unsigned long       flags;
         ksock_peer_t       *peer;
         struct list_head   *ptmp;
         struct list_head   *pnxt;
@@ -1443,10 +1674,10 @@ ksocknal_close_matching_conns (ptl_nid_t nid, __u32 ipaddr)
         int                 i;
         int                 count = 0;
 
-        write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+        write_lock_bh (&ksocknal_data.ksnd_global_lock);
 
-        if (nid != PTL_NID_ANY)
-                lo = hi = ksocknal_nid2peerlist(nid) - ksocknal_data.ksnd_peers;
+        if (id.nid != LNET_NID_ANY)
+                lo = hi = ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers;
         else {
                 lo = 0;
                 hi = ksocknal_data.ksnd_peer_hash_size - 1;
@@ -1457,33 +1688,36 @@ ksocknal_close_matching_conns (ptl_nid_t nid, __u32 ipaddr)
 
                         peer = list_entry (ptmp, ksock_peer_t, ksnp_list);
 
-                        if (!(nid == PTL_NID_ANY || nid == peer->ksnp_nid))
+                        if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) &&
+                              (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid)))
                                 continue;
 
                         count += ksocknal_close_peer_conns_locked (peer, ipaddr, 0);
                 }
         }
 
-        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
+        write_unlock_bh (&ksocknal_data.ksnd_global_lock);
 
         /* wildcards always succeed */
-        if (nid == PTL_NID_ANY || ipaddr == 0)
+        if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0)
                 return (0);
 
         return (count == 0 ? -ENOENT : 0);
 }
 
 void
-ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive)
+ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive)
 {
         /* The router is telling me she's been notified of a change in
          * gateway state.... */
+        lnet_process_id_t  id = {.nid = gw_nid, .pid = LNET_PID_ANY};
 
-        CDEBUG (D_NET, "gw "LPX64" %s\n", gw_nid, alive ? "up" : "down");
+        CDEBUG (D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid), 
+                alive ? "up" : "down");
 
         if (!alive) {
                 /* If the gateway crashed, close all open connections... */
-                ksocknal_close_matching_conns (gw_nid, 0);
+                ksocknal_close_matching_conns (id, 0);
                 return;
         }
 
@@ -1508,7 +1742,7 @@ ksocknal_push_peer (ksock_peer_t *peer)
                 list_for_each (tmp, &peer->ksnp_conns) {
                         if (i++ == index) {
                                 conn = list_entry (tmp, ksock_conn_t, ksnc_list);
-                                atomic_inc (&conn->ksnc_refcount);
+                                ksocknal_conn_addref(conn);
                                 break;
                         }
                 }
@@ -1519,12 +1753,12 @@ ksocknal_push_peer (ksock_peer_t *peer)
                         break;
 
                 ksocknal_lib_push_conn (conn);
-                ksocknal_put_conn (conn);
+                ksocknal_conn_decref(conn);
         }
 }
 
 int
-ksocknal_push (ptl_nid_t nid)
+ksocknal_push (lnet_ni_t *ni, lnet_process_id_t id)
 {
         ksock_peer_t      *peer;
         struct list_head  *tmp;
@@ -1533,17 +1767,6 @@ ksocknal_push (ptl_nid_t nid)
         int                j;
         int                rc = -ENOENT;
 
-        if (nid != PTL_NID_ANY) {
-                peer = ksocknal_get_peer (nid);
-
-                if (peer != NULL) {
-                        rc = 0;
-                        ksocknal_push_peer (peer);
-                        ksocknal_put_peer (peer);
-                }
-                return (rc);
-        }
-
         for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
                 for (j = 0; ; j++) {
                         read_lock (&ksocknal_data.ksnd_global_lock);
@@ -1552,10 +1775,19 @@ ksocknal_push (ptl_nid_t nid)
                         peer = NULL;
 
                         list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) {
+                                peer = list_entry(tmp, ksock_peer_t,
+                                                  ksnp_list);
+
+                                if (!((id.nid == LNET_NID_ANY ||
+                                       id.nid == peer->ksnp_id.nid) &&
+                                      (id.pid == LNET_PID_ANY ||
+                                       id.pid == peer->ksnp_id.pid))) {
+                                        peer = NULL;
+                                        continue;
+                                }
+
                                 if (index++ == j) {
-                                        peer = list_entry(tmp, ksock_peer_t,
-                                                          ksnp_list);
-                                        atomic_inc (&peer->ksnp_refcount);
+                                        ksocknal_peer_addref(peer);
                                         break;
                                 }
                         }
@@ -1565,7 +1797,7 @@ ksocknal_push (ptl_nid_t nid)
                         if (peer != NULL) {
                                 rc = 0;
                                 ksocknal_push_peer (peer);
-                                ksocknal_put_peer (peer);
+                                ksocknal_peer_decref(peer);
                         }
                 }
 
@@ -1575,9 +1807,9 @@ ksocknal_push (ptl_nid_t nid)
 }
 
 int
-ksocknal_add_interface(__u32 ipaddress, __u32 netmask)
+ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask)
 {
-        unsigned long      flags;
+        ksock_net_t       *net = ni->ni_data;
         ksock_interface_t *iface;
         int                rc;
         int                i;
@@ -1591,16 +1823,16 @@ ksocknal_add_interface(__u32 ipaddress, __u32 netmask)
             netmask == 0)
                 return (-EINVAL);
 
-        write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags);
+        write_lock_bh (&ksocknal_data.ksnd_global_lock);
 
-        iface = ksocknal_ip2iface(ipaddress);
+        iface = ksocknal_ip2iface(ni, ipaddress);
         if (iface != NULL) {
                 /* silently ignore dups */
                 rc = 0;
-        } else if (ksocknal_data.ksnd_ninterfaces == SOCKNAL_MAX_INTERFACES) {
+        } else if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) {
                 rc = -ENOSPC;
         } else {
-                iface = &ksocknal_data.ksnd_interfaces[ksocknal_data.ksnd_ninterfaces++];
+                iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++];
 
                 iface->ksni_ipaddr = ipaddress;
                 iface->ksni_netmask = netmask;
@@ -1628,7 +1860,7 @@ ksocknal_add_interface(__u32 ipaddress, __u32 netmask)
                 /* NB only new connections will pay attention to the new interface! */
         }
 
-        write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags);
+        write_unlock_bh (&ksocknal_data.ksnd_global_lock);
 
         return (rc);
 }
@@ -1675,10 +1907,10 @@ ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr)
 }
 
 int
-ksocknal_del_interface(__u32 ipaddress)
+ksocknal_del_interface(lnet_ni_t *ni, __u32 ipaddress)
 {
+        ksock_net_t       *net = ni->ni_data;
         int                rc = -ENOENT;
-        unsigned long      flags;
         struct list_head  *tmp;
         struct list_head  *nxt;
         ksock_peer_t      *peer;
@@ -1686,10 +1918,10 @@ ksocknal_del_interface(__u32 ipaddress)
         int                i;
         int                j;
 
-        write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags);
+        write_lock_bh (&ksocknal_data.ksnd_global_lock);
 
-        for (i = 0; i < ksocknal_data.ksnd_ninterfaces; i++) {
-                this_ip = ksocknal_data.ksnd_interfaces[i].ksni_ipaddr;
+        for (i = 0; i < net->ksnn_ninterfaces; i++) {
+                this_ip = net->ksnn_interfaces[i].ksni_ipaddr;
 
                 if (!(ipaddress == 0 ||
                       ipaddress == this_ip))
@@ -1697,266 +1929,217 @@ ksocknal_del_interface(__u32 ipaddress)
 
                 rc = 0;
 
-                for (j = i+1; j < ksocknal_data.ksnd_ninterfaces; j++)
-                        ksocknal_data.ksnd_interfaces[j-1] =
-                                ksocknal_data.ksnd_interfaces[j];
+                for (j = i+1; j < net->ksnn_ninterfaces; j++)
+                        net->ksnn_interfaces[j-1] =
+                                net->ksnn_interfaces[j];
 
-                ksocknal_data.ksnd_ninterfaces--;
+                net->ksnn_ninterfaces--;
 
                 for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) {
                         list_for_each_safe(tmp, nxt, &ksocknal_data.ksnd_peers[j]) {
                                 peer = list_entry(tmp, ksock_peer_t, ksnp_list);
 
+                                if (peer->ksnp_ni != ni)
+                                        continue;
+
                                 ksocknal_peer_del_interface_locked(peer, this_ip);
                         }
                 }
         }
 
-        write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags);
+        write_unlock_bh (&ksocknal_data.ksnd_global_lock);
 
         return (rc);
 }
 
 int
-ksocknal_cmd(struct portals_cfg *pcfg, void * private)
+ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
 {
+        struct libcfs_ioctl_data *data = arg;
         int rc;
 
-        switch(pcfg->pcfg_command) {
-        case NAL_CMD_GET_INTERFACE: {
+        switch(cmd) {
+        case IOC_LIBCFS_GET_INTERFACE: {
+                ksock_net_t       *net = ni->ni_data;
                 ksock_interface_t *iface;
 
                 read_lock (&ksocknal_data.ksnd_global_lock);
 
-                if (pcfg->pcfg_count < 0 ||
-                    pcfg->pcfg_count >= ksocknal_data.ksnd_ninterfaces) {
+                if (data->ioc_count < 0 ||
+                    data->ioc_count >= net->ksnn_ninterfaces) {
                         rc = -ENOENT;
                 } else {
                         rc = 0;
-                        iface = &ksocknal_data.ksnd_interfaces[pcfg->pcfg_count];
+                        iface = &net->ksnn_interfaces[data->ioc_count];
 
-                        pcfg->pcfg_id    = iface->ksni_ipaddr;
-                        pcfg->pcfg_misc  = iface->ksni_netmask;
-                        pcfg->pcfg_fd    = iface->ksni_npeers;
-                        pcfg->pcfg_count = iface->ksni_nroutes;
+                        data->ioc_u32[0] = iface->ksni_ipaddr;
+                        data->ioc_u32[1] = iface->ksni_netmask;
+                        data->ioc_u32[2] = iface->ksni_npeers;
+                        data->ioc_u32[3] = iface->ksni_nroutes;
                 }
 
                 read_unlock (&ksocknal_data.ksnd_global_lock);
-                break;
-        }
-        case NAL_CMD_ADD_INTERFACE: {
-                rc = ksocknal_add_interface(pcfg->pcfg_id, /* IP address */
-                                            pcfg->pcfg_misc); /* net mask */
-                break;
-        }
-        case NAL_CMD_DEL_INTERFACE: {
-                rc = ksocknal_del_interface(pcfg->pcfg_id); /* IP address */
-                break;
+                return rc;
         }
-        case NAL_CMD_GET_PEER: {
-                ptl_nid_t    nid = 0;
-                __u32        myip = 0;
-                __u32        ip = 0;
-                int          port = 0;
-                int          conn_count = 0;
-                int          share_count = 0;
 
-                rc = ksocknal_get_peer_info(pcfg->pcfg_count, &nid,
-                                            &myip, &ip, &port,
-                                            &conn_count,  &share_count);
-                pcfg->pcfg_nid   = nid;
-                pcfg->pcfg_size  = myip;
-                pcfg->pcfg_id    = ip;
-                pcfg->pcfg_misc  = port;
-                pcfg->pcfg_count = conn_count;
-                pcfg->pcfg_wait  = share_count;
-                break;
-        }
-        case NAL_CMD_ADD_PEER: {
-                rc = ksocknal_add_peer (pcfg->pcfg_nid,
-                                        pcfg->pcfg_id, /* IP */
-                                        pcfg->pcfg_misc); /* port */
-                break;
-        }
-        case NAL_CMD_DEL_PEER: {
-                rc = ksocknal_del_peer (pcfg->pcfg_nid,
-                                        pcfg->pcfg_id, /* IP */
-                                        pcfg->pcfg_flags); /* single_share? */
-                break;
-        }
-        case NAL_CMD_GET_CONN: {
-                ksock_conn_t *conn = ksocknal_get_conn_by_idx (pcfg->pcfg_count);
+        case IOC_LIBCFS_ADD_INTERFACE:
+                return ksocknal_add_interface(ni,
+                                              data->ioc_u32[0], /* IP address */
+                                              data->ioc_u32[1]); /* net mask */
 
-                if (conn == NULL)
-                        rc = -ENOENT;
-                else {
-                        int   txmem;
-                        int   rxmem;
-                        int   nagle;
+        case IOC_LIBCFS_DEL_INTERFACE:
+                return ksocknal_del_interface(ni, 
+                                              data->ioc_u32[0]); /* IP address */
 
-                        ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle);
+        case IOC_LIBCFS_GET_PEER: {
+                lnet_process_id_t id = {0,};
+                __u32            myip = 0;
+                __u32            ip = 0;
+                int              port = 0;
+                int              conn_count = 0;
+                int              share_count = 0;
 
-                        rc = 0;
-                        pcfg->pcfg_nid    = conn->ksnc_peer->ksnp_nid;
-                        pcfg->pcfg_id     = conn->ksnc_ipaddr;
-                        pcfg->pcfg_misc   = conn->ksnc_port;
-                        pcfg->pcfg_fd     = conn->ksnc_myipaddr;
-                        pcfg->pcfg_flags  = conn->ksnc_type;
-                        pcfg->pcfg_gw_nal = conn->ksnc_scheduler -
-                                            ksocknal_data.ksnd_schedulers;
-                        pcfg->pcfg_count  = txmem;
-                        pcfg->pcfg_size   = rxmem;
-                        pcfg->pcfg_wait   = nagle;
-                        ksocknal_put_conn (conn);
-                }
-                break;
-        }
-        case NAL_CMD_REGISTER_PEER_FD: {
-                struct socket *sock = sockfd_lookup (pcfg->pcfg_fd, &rc);
-                int            type = pcfg->pcfg_misc;
-
-                if (sock == NULL)
-                        break;
+                rc = ksocknal_get_peer_info(ni, data->ioc_count,
+                                            &id, &myip, &ip, &port,
+                                            &conn_count,  &share_count);
+                if (rc != 0)
+                        return rc;
+                        
+                data->ioc_nid    = id.nid;
+                data->ioc_count  = share_count;
+                data->ioc_u32[0] = ip;
+                data->ioc_u32[1] = port;
+                data->ioc_u32[2] = myip;
+                data->ioc_u32[3] = conn_count;
+                data->ioc_u32[4] = id.pid;
+                return 0;
+        }
+
+        case IOC_LIBCFS_ADD_PEER: {
+                lnet_process_id_t  id = {.nid = data->ioc_nid,
+                                         .pid = LUSTRE_SRV_LNET_PID};
+                return ksocknal_add_peer (ni, id,
+                                          data->ioc_u32[0], /* IP */
+                                          data->ioc_u32[1]); /* port */
+        }
+        case IOC_LIBCFS_DEL_PEER: {
+                lnet_process_id_t  id = {.nid = data->ioc_nid,
+                                         .pid = LNET_PID_ANY};
+                return ksocknal_del_peer (ni, id,
+                                          data->ioc_u32[0]); /* IP */
+        }
+        case IOC_LIBCFS_GET_CONN: {
+                int           txmem;
+                int           rxmem;
+                int           nagle;
+                ksock_conn_t *conn = ksocknal_get_conn_by_idx (ni, data->ioc_count);
 
-                switch (type) {
-                case SOCKNAL_CONN_NONE:
-                case SOCKNAL_CONN_ANY:
-                case SOCKNAL_CONN_CONTROL:
-                case SOCKNAL_CONN_BULK_IN:
-                case SOCKNAL_CONN_BULK_OUT:
-                        rc = ksocknal_create_conn(NULL, sock, type);
-                        break;
-                default:
-                        rc = -EINVAL;
-                        break;
-                }
-                cfs_put_file (KSN_SOCK2FILE(sock));
-                break;
-        }
-        case NAL_CMD_CLOSE_CONNECTION: {
-                rc = ksocknal_close_matching_conns (pcfg->pcfg_nid,
-                                                    pcfg->pcfg_id);
-                break;
-        }
-        case NAL_CMD_REGISTER_MYNID: {
-                rc = ksocknal_set_mynid (pcfg->pcfg_nid);
-                break;
-        }
-        case NAL_CMD_PUSH_CONNECTION: {
-                rc = ksocknal_push (pcfg->pcfg_nid);
-                break;
+                if (conn == NULL)
+                        return -ENOENT;
+
+                ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle);
+
+                data->ioc_count  = txmem;
+                data->ioc_nid    = conn->ksnc_peer->ksnp_id.nid;
+                data->ioc_flags  = nagle;
+                data->ioc_u32[0] = conn->ksnc_ipaddr;
+                data->ioc_u32[1] = conn->ksnc_port;
+                data->ioc_u32[2] = conn->ksnc_myipaddr;
+                data->ioc_u32[3] = conn->ksnc_type;
+                data->ioc_u32[4] = conn->ksnc_scheduler -
+                                   ksocknal_data.ksnd_schedulers;
+                data->ioc_u32[5] = rxmem;
+                data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid;
+                ksocknal_conn_decref(conn);
+                return 0;
+        }
+
+        case IOC_LIBCFS_CLOSE_CONNECTION: {
+                lnet_process_id_t  id = {.nid = data->ioc_nid,
+                                        .pid = LNET_PID_ANY};
+
+                return ksocknal_close_matching_conns (id,
+                                                      data->ioc_u32[0]);
+        }
+        case IOC_LIBCFS_REGISTER_MYNID:
+                /* Ignore if this is a noop */
+                if (data->ioc_nid == ni->ni_nid)
+                        return 0;
+
+                CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+                       libcfs_nid2str(data->ioc_nid),
+                       libcfs_nid2str(ni->ni_nid));
+                return -EINVAL;
+
+        case IOC_LIBCFS_PUSH_CONNECTION: {
+                lnet_process_id_t  id = {.nid = data->ioc_nid,
+                                        .pid = LNET_PID_ANY};
+                
+                return ksocknal_push(ni, id);
         }
         default:
-                rc = -EINVAL;
-                break;
-        }
-
-        return rc;
-}
-
-void
-ksocknal_free_fmbs (ksock_fmb_pool_t *p)
-{
-        int          npages = p->fmp_buff_pages;
-        ksock_fmb_t *fmb;
-        int          i;
-
-        LASSERT (list_empty(&p->fmp_blocked_conns));
-        LASSERT (p->fmp_nactive_fmbs == 0);
-
-        while (!list_empty(&p->fmp_idle_fmbs)) {
-
-                fmb = list_entry(p->fmp_idle_fmbs.next,
-                                 ksock_fmb_t, fmb_list);
-
-                for (i = 0; i < npages; i++)
-                        if (fmb->fmb_kiov[i].kiov_page != NULL)
-                                cfs_free_page(fmb->fmb_kiov[i].kiov_page);
-
-                list_del(&fmb->fmb_list);
-                PORTAL_FREE(fmb, offsetof(ksock_fmb_t, fmb_kiov[npages]));
+                return -EINVAL;
         }
+        /* not reached */
 }
 
 void
 ksocknal_free_buffers (void)
 {
-        ksocknal_free_fmbs(&ksocknal_data.ksnd_small_fmp);
-        ksocknal_free_fmbs(&ksocknal_data.ksnd_large_fmp);
-
-        LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_ltxs) == 0);
+        LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0);
 
         if (ksocknal_data.ksnd_schedulers != NULL)
-                PORTAL_FREE (ksocknal_data.ksnd_schedulers,
+                LIBCFS_FREE (ksocknal_data.ksnd_schedulers,
                              sizeof (ksock_sched_t) * ksocknal_data.ksnd_nschedulers);
 
-        PORTAL_FREE (ksocknal_data.ksnd_peers,
+        LIBCFS_FREE (ksocknal_data.ksnd_peers,
                      sizeof (struct list_head) *
                      ksocknal_data.ksnd_peer_hash_size);
+
+        spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+        if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
+                struct list_head  zlist;
+                ksock_tx_t       *tx;
+
+                list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs);
+                list_del_init(&ksocknal_data.ksnd_idle_noop_txs);
+                spin_unlock(&ksocknal_data.ksnd_tx_lock);
+
+                while(!list_empty(&zlist)) {
+                        tx = list_entry(zlist.next, ksock_tx_t, tx_list);
+                        list_del(&tx->tx_list);
+                        LIBCFS_FREE(tx, tx->tx_desc_size);
+                }
+        } else {
+                spin_unlock(&ksocknal_data.ksnd_tx_lock);
+        }
 }
 
 void
-ksocknal_api_shutdown (nal_t *nal)
+ksocknal_base_shutdown (void)
 {
         ksock_sched_t *sched;
         int            i;
 
-        if (nal->nal_refct != 0) {
-                /* This module got the first ref */
-                PORTAL_MODULE_UNUSE;
-                return;
-        }
-
         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
-               atomic_read (&portal_kmemory));
-
-        LASSERT(nal == &ksocknal_api);
+               atomic_read (&libcfs_kmemory));
+        LASSERT (ksocknal_data.ksnd_nnets == 0);
 
         switch (ksocknal_data.ksnd_init) {
         default:
                 LASSERT (0);
 
         case SOCKNAL_INIT_ALL:
-                libcfs_nal_cmd_unregister(SOCKNAL);
-
-                ksocknal_data.ksnd_init = SOCKNAL_INIT_LIB;
-                /* fall through */
-
-        case SOCKNAL_INIT_LIB:
-                /* No more calls to ksocknal_cmd() to create new
-                 * autoroutes/connections since we're being unloaded. */
-
-                /* Delete all peers */
-                ksocknal_del_peer(PTL_NID_ANY, 0, 0);
-
-                /* Wait for all peer state to clean up */
-                i = 2;
-                while (atomic_read (&ksocknal_data.ksnd_npeers) != 0) {
-                        i++;
-                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
-                               "waiting for %d peers to disconnect\n",
-                               atomic_read (&ksocknal_data.ksnd_npeers));
-                        set_current_state (TASK_UNINTERRUPTIBLE);
-                        schedule_timeout (cfs_time_seconds(1));
-                }
-
-                /* Tell lib we've stopped calling into her. */
-                lib_fini(&ksocknal_lib);
-
-                ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
-                /* fall through */
-
         case SOCKNAL_INIT_DATA:
-                LASSERT (atomic_read (&ksocknal_data.ksnd_npeers) == 0);
                 LASSERT (ksocknal_data.ksnd_peers != NULL);
                 for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) {
                         LASSERT (list_empty (&ksocknal_data.ksnd_peers[i]));
                 }
                 LASSERT (list_empty (&ksocknal_data.ksnd_enomem_conns));
                 LASSERT (list_empty (&ksocknal_data.ksnd_zombie_conns));
-                LASSERT (list_empty (&ksocknal_data.ksnd_autoconnectd_routes));
-                LASSERT (list_empty (&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns));
-                LASSERT (list_empty (&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns));
+                LASSERT (list_empty (&ksocknal_data.ksnd_connd_connreqs));
+                LASSERT (list_empty (&ksocknal_data.ksnd_connd_routes));
 
                 if (ksocknal_data.ksnd_schedulers != NULL)
                         for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
@@ -1965,57 +2148,49 @@ ksocknal_api_shutdown (nal_t *nal)
 
                                 LASSERT (list_empty (&kss->kss_tx_conns));
                                 LASSERT (list_empty (&kss->kss_rx_conns));
+                                LASSERT (list_empty (&kss->kss_zombie_noop_txs));
                                 LASSERT (kss->kss_nconns == 0);
                         }
 
-                /* stop router calling me */
-                kpr_shutdown (&ksocknal_data.ksnd_router);
-
                 /* flag threads to terminate; wake and wait for them to die */
                 ksocknal_data.ksnd_shuttingdown = 1;
-                cfs_waitq_broadcast (&ksocknal_data.ksnd_autoconnectd_waitq);
+                cfs_waitq_broadcast (&ksocknal_data.ksnd_connd_waitq);
                 cfs_waitq_broadcast (&ksocknal_data.ksnd_reaper_waitq);
 
-                for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
-                        sched = &ksocknal_data.ksnd_schedulers[i];
-                        cfs_waitq_broadcast(&sched->kss_waitq);
-                }
+                if (ksocknal_data.ksnd_schedulers != NULL)
+                        for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
+                                sched = &ksocknal_data.ksnd_schedulers[i];
+                                cfs_waitq_broadcast(&sched->kss_waitq);
+                        }
 
                 i = 4;
-                read_lock(&ksocknal_data.ksnd_global_lock);
+                read_lock (&ksocknal_data.ksnd_global_lock);
                 while (ksocknal_data.ksnd_nthreads != 0) {
                         i++;
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                                "waiting for %d threads to terminate\n",
                                 ksocknal_data.ksnd_nthreads);
-                        read_unlock(&ksocknal_data.ksnd_global_lock);
-                        set_current_state (TASK_UNINTERRUPTIBLE);
-                        schedule_timeout (cfs_time_seconds(1));
-                        read_lock(&ksocknal_data.ksnd_global_lock);
+                        read_unlock (&ksocknal_data.ksnd_global_lock);
+                        cfs_pause(cfs_time_seconds(1));
+                        read_lock (&ksocknal_data.ksnd_global_lock);
                 }
-                read_unlock(&ksocknal_data.ksnd_global_lock);
-
-                kpr_deregister (&ksocknal_data.ksnd_router);
+                read_unlock (&ksocknal_data.ksnd_global_lock);
 
                 ksocknal_free_buffers();
 
                 ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING;
-                /* fall through */
-
-        case SOCKNAL_INIT_NOTHING:
                 break;
         }
 
         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
-               atomic_read (&portal_kmemory));
+               atomic_read (&libcfs_kmemory));
 
-        printk(KERN_INFO "Lustre: Routing socket NAL unloaded (final mem %d)\n",
-               atomic_read(&portal_kmemory));
+        PORTAL_MODULE_UNUSE;
 }
 
 
-void
-ksocknal_init_incarnation (void)
+__u64
+ksocknal_new_incarnation (void)
 {
         struct timeval tv;
 
@@ -2026,81 +2201,57 @@ ksocknal_init_incarnation (void)
 
         do_gettimeofday(&tv);
 
-        ksocknal_data.ksnd_incarnation =
-                (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+        return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
 }
 
 int
-ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
-                      ptl_ni_limits_t *requested_limits,
-                      ptl_ni_limits_t *actual_limits)
+ksocknal_base_startup (void)
 {
-        ptl_process_id_t  process_id;
-        int               pkmem = atomic_read(&portal_kmemory);
         int               rc;
         int               i;
-        int               j;
-
-        LASSERT (nal == &ksocknal_api);
-
-        if (nal->nal_refct != 0) {
-                if (actual_limits != NULL)
-                        *actual_limits = ksocknal_lib.libnal_ni.ni_actual_limits;
-                /* This module got the first ref */
-                PORTAL_MODULE_USE;
-                return (PTL_OK);
-        }
 
         LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
+        LASSERT (ksocknal_data.ksnd_nnets == 0);
 
         memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */
 
-        ksocknal_init_incarnation();
-
         ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE;
-        PORTAL_ALLOC (ksocknal_data.ksnd_peers,
+        LIBCFS_ALLOC (ksocknal_data.ksnd_peers,
                       sizeof (struct list_head) * ksocknal_data.ksnd_peer_hash_size);
         if (ksocknal_data.ksnd_peers == NULL)
-                return (-ENOMEM);
+                return -ENOMEM;
 
         for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++)
                 CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]);
 
         rwlock_init(&ksocknal_data.ksnd_global_lock);
 
-        spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock);
-        CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs);
-        CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns);
-        ksocknal_data.ksnd_small_fmp.fmp_buff_pages = SOCKNAL_SMALL_FWD_PAGES;
-
-        spin_lock_init(&ksocknal_data.ksnd_large_fmp.fmp_lock);
-        CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_idle_fmbs);
-        CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns);
-        ksocknal_data.ksnd_large_fmp.fmp_buff_pages = SOCKNAL_LARGE_FWD_PAGES;
-
         spin_lock_init (&ksocknal_data.ksnd_reaper_lock);
         CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns);
         CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_zombie_conns);
         CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_deathrow_conns);
         cfs_waitq_init(&ksocknal_data.ksnd_reaper_waitq);
 
-        spin_lock_init (&ksocknal_data.ksnd_autoconnectd_lock);
-        CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_autoconnectd_routes);
-        cfs_waitq_init(&ksocknal_data.ksnd_autoconnectd_waitq);
+        spin_lock_init (&ksocknal_data.ksnd_connd_lock);
+        CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_connreqs);
+        CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_routes);
+        cfs_waitq_init(&ksocknal_data.ksnd_connd_waitq);
+
+        spin_lock_init (&ksocknal_data.ksnd_tx_lock);
+        CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_idle_noop_txs);
 
         /* NB memset above zeros whole of ksocknal_data, including
          * ksocknal_data.ksnd_irqinfo[all].ksni_valid */
 
         /* flag lists/ptrs/locks initialised */
         ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA;
+        PORTAL_MODULE_USE;
 
         ksocknal_data.ksnd_nschedulers = ksocknal_nsched();
-        PORTAL_ALLOC(ksocknal_data.ksnd_schedulers,
+        LIBCFS_ALLOC(ksocknal_data.ksnd_schedulers,
                      sizeof(ksock_sched_t) * ksocknal_data.ksnd_nschedulers);
-        if (ksocknal_data.ksnd_schedulers == NULL) {
-                ksocknal_api_shutdown (nal);
-                return (-ENOMEM);
-        }
+        if (ksocknal_data.ksnd_schedulers == NULL)
+                goto failed;
 
         for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
                 ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i];
@@ -2108,197 +2259,252 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                 spin_lock_init (&kss->kss_lock);
                 CFS_INIT_LIST_HEAD (&kss->kss_rx_conns);
                 CFS_INIT_LIST_HEAD (&kss->kss_tx_conns);
-#if SOCKNAL_ZC
-                CFS_INIT_LIST_HEAD (&kss->kss_zctxdone_list);
-#endif
+                CFS_INIT_LIST_HEAD (&kss->kss_zombie_noop_txs);
                 cfs_waitq_init (&kss->kss_waitq);
         }
 
-        /* NB we have to wait to be told our true NID... */
-        process_id.pid = requested_pid;
-        process_id.nid = 0;
-
-        rc = lib_init(&ksocknal_lib, nal, process_id,
-                      requested_limits, actual_limits);
-        if (rc != PTL_OK) {
-                CERROR("lib_init failed: error %d\n", rc);
-                ksocknal_api_shutdown (nal);
-                return (rc);
-        }
-
-        ksocknal_data.ksnd_init = SOCKNAL_INIT_LIB; // flag lib_init() called
-
         for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) {
                 rc = ksocknal_thread_start (ksocknal_scheduler,
                                             &ksocknal_data.ksnd_schedulers[i]);
                 if (rc != 0) {
                         CERROR("Can't spawn socknal scheduler[%d]: %d\n",
                                i, rc);
-                        ksocknal_api_shutdown (nal);
-                        return (rc);
+                        goto failed;
                 }
         }
 
-        for (i = 0; i < SOCKNAL_N_AUTOCONNECTD; i++) {
-                rc = ksocknal_thread_start (ksocknal_autoconnectd, (void *)((long)i));
+        /* must have at least 2 connds to remain responsive to accepts while
+         * connecting */
+        if (*ksocknal_tunables.ksnd_nconnds < 2)
+                *ksocknal_tunables.ksnd_nconnds = 2;
+        
+        for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) {
+                rc = ksocknal_thread_start (ksocknal_connd, (void *)((long)i));
                 if (rc != 0) {
-                        CERROR("Can't spawn socknal autoconnectd: %d\n", rc);
-                        ksocknal_api_shutdown (nal);
-                        return (rc);
+                        CERROR("Can't spawn socknal connd: %d\n", rc);
+                        goto failed;
                 }
         }
 
         rc = ksocknal_thread_start (ksocknal_reaper, NULL);
         if (rc != 0) {
                 CERROR ("Can't spawn socknal reaper: %d\n", rc);
-                ksocknal_api_shutdown (nal);
-                return (rc);
+                goto failed;
         }
 
-        rc = kpr_register(&ksocknal_data.ksnd_router,
-                          &ksocknal_router_interface);
-        if (rc != 0) {
-                CDEBUG(D_NET, "Can't initialise routing interface "
-                       "(rc = %d): not routing\n", rc);
-        } else {
-                /* Only allocate forwarding buffers if there's a router */
+        /* flag everything initialised */
+        ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
 
-                for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS +
-                                 SOCKNAL_LARGE_FWD_NMSGS); i++) {
-                        ksock_fmb_t      *fmb;
-                        ksock_fmb_pool_t *pool;
+        return 0;
 
+ failed:
+        ksocknal_base_shutdown();
+        return -ENETDOWN;
+}
 
-                        if (i < SOCKNAL_SMALL_FWD_NMSGS)
-                                pool = &ksocknal_data.ksnd_small_fmp;
-                        else
-                                pool = &ksocknal_data.ksnd_large_fmp;
+void
+ksocknal_shutdown (lnet_ni_t *ni)
+{
+        ksock_net_t      *net = ni->ni_data;
+        int               i;
+        lnet_process_id_t  anyid = {.nid = LNET_NID_ANY,
+                                   .pid = LNET_PID_ANY};
 
-                        PORTAL_ALLOC(fmb, offsetof(ksock_fmb_t,
-                                                   fmb_kiov[pool->fmp_buff_pages]));
-                        if (fmb == NULL) {
-                                ksocknal_api_shutdown(nal);
-                                return (-ENOMEM);
-                        }
+        LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL);
+        LASSERT(ksocknal_data.ksnd_nnets > 0);
 
-                        fmb->fmb_pool = pool;
+        spin_lock_bh (&net->ksnn_lock);
+        net->ksnn_shutdown = 1;                 /* prevent new peers */
+        spin_unlock_bh (&net->ksnn_lock);
 
-                        for (j = 0; j < pool->fmp_buff_pages; j++) {
-                                fmb->fmb_kiov[j].kiov_page = cfs_alloc_page(CFS_ALLOC_STD);
+        /* Delete all peers */
+        ksocknal_del_peer(ni, anyid, 0);
 
-                                if (fmb->fmb_kiov[j].kiov_page == NULL) {
-                                        ksocknal_api_shutdown (nal);
-                                        return (-ENOMEM);
-                                }
+        /* Wait for all peer state to clean up */
+        i = 2;
+        spin_lock_bh (&net->ksnn_lock);
+        while (net->ksnn_npeers != 0) {
+                spin_unlock_bh (&net->ksnn_lock);
 
-                                LASSERT(cfs_page_address(fmb->fmb_kiov[j].kiov_page) != NULL);
-                        }
+                i++;
+                CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                       "waiting for %d peers to disconnect\n",
+                       net->ksnn_npeers);
+                cfs_pause(cfs_time_seconds(1));
+
+                spin_lock_bh (&net->ksnn_lock);
+        }
+        spin_unlock_bh (&net->ksnn_lock);
+
+        for (i = 0; i < net->ksnn_ninterfaces; i++) {
+                LASSERT (net->ksnn_interfaces[i].ksni_npeers == 0);
+                LASSERT (net->ksnn_interfaces[i].ksni_nroutes == 0);
+        }
+
+        LIBCFS_FREE(net, sizeof(*net));
+        
+        ksocknal_data.ksnd_nnets--;
+        if (ksocknal_data.ksnd_nnets == 0)
+                ksocknal_base_shutdown();
+}
 
-                        list_add(&fmb->fmb_list, &pool->fmp_idle_fmbs);
+int
+ksocknal_enumerate_interfaces(ksock_net_t *net)
+{
+        char      **names;
+        int         i;
+        int         j;
+        int         rc;
+        int         n;
+                
+        n = libcfs_ipif_enumerate(&names);
+        if (n <= 0) {
+                CERROR("Can't enumerate interfaces: %d\n", n);
+                return n;
+        }
+
+        for (i = j = 0; i < n; i++) {
+                int        up;
+                __u32      ip;
+                __u32      mask;
+
+                if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+                        continue;
+
+                rc = libcfs_ipif_query(names[i], &up, &ip, &mask);
+                if (rc != 0) {
+                        CWARN("Can't get interface %s info: %d\n",
+                              names[i], rc);
+                        continue;
                 }
+                
+                if (!up) {
+                        CWARN("Ignoring interface %s (down)\n",
+                              names[i]);
+                        continue;
+                }
+
+                if (j == LNET_MAX_INTERFACES) {
+                        CWARN("Ignoring interface %s (too many interfaces)\n",
+                              names[i]);
+                        continue;
+                }
+
+                net->ksnn_interfaces[j].ksni_ipaddr = ip;
+                net->ksnn_interfaces[j].ksni_netmask = mask;
+                j++;
         }
 
-        rc = libcfs_nal_cmd_register(SOCKNAL, &ksocknal_cmd, NULL);
-        if (rc != 0) {
-                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
-                ksocknal_api_shutdown (nal);
-                return (rc);
+        libcfs_ipif_free_enumeration(names, n);
+        
+        if (j == 0)
+                CERROR("Can't find any usable interfaces\n");
+        
+        return j;
+}
+
+int
+ksocknal_startup (lnet_ni_t *ni)
+{
+        ksock_net_t  *net;
+        int           rc;
+        int           i;
+
+        LASSERT (ni->ni_lnd == &the_ksocklnd);
+
+        if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) {
+                rc = ksocknal_base_startup();
+                if (rc != 0)
+                        return rc;
+        }
+        
+        LIBCFS_ALLOC(net, sizeof(*net));
+        if (net == NULL)
+                goto fail_0;
+                
+        memset(net, 0, sizeof(*net));
+        spin_lock_init(&net->ksnn_lock);
+        net->ksnn_incarnation = ksocknal_new_incarnation();
+        ni->ni_data = net;
+        ni->ni_maxtxcredits = *ksocknal_tunables.ksnd_credits;
+        ni->ni_peertxcredits = *ksocknal_tunables.ksnd_peercredits;
+        
+        if (ni->ni_interfaces[0] == NULL) {
+                rc = ksocknal_enumerate_interfaces(net);
+                if (rc <= 0)
+                        goto fail_1;
+
+                net->ksnn_ninterfaces = 1;
+        } else {
+                for (i = 0; i < LNET_MAX_INTERFACES; i++) {
+                        int    up;
+
+                        if (ni->ni_interfaces[i] == NULL)
+                                break;
+
+                        rc = libcfs_ipif_query(
+                                ni->ni_interfaces[i], &up,
+                                &net->ksnn_interfaces[i].ksni_ipaddr,
+                                &net->ksnn_interfaces[i].ksni_netmask);
+                        
+                        if (rc != 0) {
+                                CERROR("Can't get interface %s info: %d\n",
+                                       ni->ni_interfaces[i], rc);
+                                goto fail_1;
+                        }
+                        
+                        if (!up) {
+                                CERROR("Interface %s is down\n",
+                                       ni->ni_interfaces[i]);
+                                goto fail_1;
+                        }
+                }
+                net->ksnn_ninterfaces = i;
         }
 
-        /* flag everything initialised */
-        ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
+        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
+                                net->ksnn_interfaces[0].ksni_ipaddr);
 
-        printk(KERN_INFO "Lustre: Routing socket NAL loaded "
-               "(Routing %s, initial mem %d, incarnation "LPX64")\n",
-               kpr_routing (&ksocknal_data.ksnd_router) ?
-               "enabled" : "disabled", pkmem, ksocknal_data.ksnd_incarnation);
+        ksocknal_data.ksnd_nnets++;
 
-        return (0);
+        return 0;
+        
+ fail_1:
+        LIBCFS_FREE(net, sizeof(*net));
+ fail_0:
+        if (ksocknal_data.ksnd_nnets == 0)
+                ksocknal_base_shutdown();
+
+        return -ENETDOWN;
 }
 
+
 void __exit
 ksocknal_module_fini (void)
 {
-#ifdef CONFIG_SYSCTL
-        if (ksocknal_tunables.ksnd_sysctl != NULL)
-                unregister_sysctl_table (ksocknal_tunables.ksnd_sysctl);
-#endif
-        PtlNIFini(ksocknal_ni);
-
-        ptl_unregister_nal(SOCKNAL);
+        lnet_unregister_lnd(&the_ksocklnd);
+        ksocknal_lib_tunables_fini();
 }
 
-extern cfs_sysctl_table_t ksocknal_top_ctl_table[];
-
 int __init
 ksocknal_module_init (void)
 {
         int    rc;
 
-        /* packet descriptor must fit in a router descriptor's scratchpad */
-        LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t));
-        /* the following must be sizeof(int) for proc_dointvec() */
-        LASSERT(sizeof (ksocknal_tunables.ksnd_io_timeout) == sizeof (int));
-        LASSERT(sizeof (ksocknal_tunables.ksnd_eager_ack) == sizeof (int));
-        LASSERT(sizeof (ksocknal_tunables.ksnd_typed_conns) == sizeof (int));
-        LASSERT(sizeof (ksocknal_tunables.ksnd_min_bulk) == sizeof (int));
-        LASSERT(sizeof (ksocknal_tunables.ksnd_buffer_size) == sizeof (int));
-        LASSERT(sizeof (ksocknal_tunables.ksnd_nagle) == sizeof (int));
-        LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_idle) == sizeof (int));
-        LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_count) == sizeof (int));
-        LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_intvl) == sizeof (int));
-#if CPU_AFFINITY
-        LASSERT(sizeof (ksocknal_tunables.ksnd_irq_affinity) == sizeof (int));
-#endif
-#if SOCKNAL_ZC
-        LASSERT(sizeof (ksocknal_tunables.ksnd_zc_min_frag) == sizeof (int));
-#endif
         /* check ksnr_connected/connecting field large enough */
-        LASSERT(SOCKNAL_CONN_NTYPES <= 4);
-
-        ksocknal_api.nal_ni_init = ksocknal_api_startup;
-        ksocknal_api.nal_ni_fini = ksocknal_api_shutdown;
-
-        /* Initialise dynamic tunables to defaults once only */
-        ksocknal_tunables.ksnd_io_timeout      = SOCKNAL_IO_TIMEOUT;
-        ksocknal_tunables.ksnd_eager_ack       = SOCKNAL_EAGER_ACK;
-        ksocknal_tunables.ksnd_typed_conns     = SOCKNAL_TYPED_CONNS;
-        ksocknal_tunables.ksnd_min_bulk        = SOCKNAL_MIN_BULK;
-        ksocknal_tunables.ksnd_buffer_size     = SOCKNAL_BUFFER_SIZE;
-        ksocknal_tunables.ksnd_nagle           = SOCKNAL_NAGLE;
-        ksocknal_tunables.ksnd_keepalive_idle  = SOCKNAL_KEEPALIVE_IDLE;
-        ksocknal_tunables.ksnd_keepalive_count = SOCKNAL_KEEPALIVE_COUNT;
-        ksocknal_tunables.ksnd_keepalive_intvl = SOCKNAL_KEEPALIVE_INTVL;
-#if CPU_AFFINITY
-        ksocknal_tunables.ksnd_irq_affinity = SOCKNAL_IRQ_AFFINITY;
-#endif
-#if SOCKNAL_ZC
-        ksocknal_tunables.ksnd_zc_min_frag  = SOCKNAL_ZC_MIN_FRAG;
-#endif
-
-        rc = ptl_register_nal(SOCKNAL, &ksocknal_api);
-        if (rc != PTL_OK) {
-                CERROR("Can't register SOCKNAL: %d\n", rc);
-                return (-ENOMEM);               /* or something... */
-        }
+        CLASSERT(SOCKLND_CONN_NTYPES <= 4);
+        
+        rc = ksocknal_lib_tunables_init();
+        if (rc != 0)
+                return rc;
 
-        /* Pure gateways want the NAL started up at module load time... */
-        rc = PtlNIInit(SOCKNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &ksocknal_ni);
-        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
-                ptl_unregister_nal(SOCKNAL);
-                return (-ENODEV);
-        }
+        lnet_register_lnd(&the_ksocklnd);
 
-#ifdef CONFIG_SYSCTL
-        /* Press on regardless even if registering sysctl doesn't work */
-        ksocknal_tunables.ksnd_sysctl =
-                register_sysctl_table (ksocknal_top_ctl_table, 0);
-#endif
-        return (0);
+        return 0;
 }
 
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel TCP Socket NAL v1.0.0");
+MODULE_DESCRIPTION("Kernel TCP Socket LND v2.0.0");
 MODULE_LICENSE("GPL");
 
-cfs_module(ksocknal, "1.0.0", ksocknal_module_init, ksocknal_module_fini);
+cfs_module(ksocknal, "2.0.0", ksocknal_module_init, ksocknal_module_fini);
index 8c69aa0..a1f1861 100644 (file)
 # define EXPORT_SYMTAB
 #endif
 
-#define DEBUG_SUBSYSTEM S_NAL
+#define DEBUG_SUBSYSTEM S_LND
 
 #if defined(__linux__)
-#include "socknal_lib-linux.h"
+#include "socklnd_lib-linux.h"
 #elif defined(__APPLE__)
-#include "socknal_lib-darwin.h"
+#include "socklnd_lib-darwin.h"
+#elif defined(__WINNT__)
+#include "socklnd_lib-winnt.h"
 #else
 #error Unsupported Operating System
 #endif
 
 #include <libcfs/kp30.h>
-#include <portals/kpr.h>
-#include <portals/p30.h>
-#include <portals/lib-p30.h>
-#include <portals/nal.h>
-#include <portals/socknal.h>
-
-#define SOCKNAL_N_AUTOCONNECTD  4               /* # socknal autoconnect daemons */
-
-#define SOCKNAL_MIN_RECONNECT_INTERVAL cfs_time_seconds(1)   /* first failed connection retry... */
-#define SOCKNAL_MAX_RECONNECT_INTERVAL cfs_time_seconds(60)  /* ...exponentially increasing to this */
-
-/* default vals for runtime tunables */
-#define SOCKNAL_IO_TIMEOUT       50             /* default comms timeout (seconds) */
-#define SOCKNAL_EAGER_ACK        SOCKNAL_ARCH_EAGER_ACK  /* default eager ack (boolean) */
-#define SOCKNAL_TYPED_CONNS      1              /* unidirectional large, bidirectional small? */
-#define SOCKNAL_ZC_MIN_FRAG     (2<<10)         /* default smallest zerocopy fragment */
-#define SOCKNAL_MIN_BULK        (1<<10)         /* smallest "large" message */
-#define SOCKNAL_BUFFER_SIZE     (8<<20)         /* default socket buffer size */
-#define SOCKNAL_NAGLE            0              /* enable/disable NAGLE? */
-#define SOCKNAL_IRQ_AFFINITY     1              /* enable/disable IRQ affinity? */
-#define SOCKNAL_KEEPALIVE_IDLE   35             /* # seconds idle before 1st probe */
-
-#define SOCKNAL_KEEPALIVE_COUNT  5             /* # unanswered probes to determine peer death */
-#define SOCKNAL_KEEPALIVE_INTVL  5              /* seconds between probes */
-
-#define SOCKNAL_PEER_HASH_SIZE   101            /* # peer lists */
-
-#define SOCKNAL_SMALL_FWD_NMSGS        128             /* # small messages I can be forwarding at any time */
-#define SOCKNAL_LARGE_FWD_NMSGS 64              /* # large messages I can be forwarding at any time */
-
-#define SOCKNAL_SMALL_FWD_PAGES        1               /* # pages in a small message fwd buffer */
-
-#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN(PTL_MTU) >> PAGE_SHIFT)
-                                               /* # pages in a large message fwd buffer */
+#include <lnet/lnet.h>
+#include <lnet/lib-lnet.h>
+#include <lnet/socklnd.h>
 
+#define SOCKNAL_PEER_HASH_SIZE  101             /* # peer lists */
 #define SOCKNAL_RESCHED         100             /* # scheduler loops before reschedule */
-#define SOCKNAL_ENOMEM_RETRY    CFS_MIN_DELAY   /* jiffies between retries */
-
-#define SOCKNAL_MAX_INTERFACES  16              /* Largest number of interfaces we bind */
+#define SOCKNAL_ENOMEM_RETRY    CFS_TICK        /* jiffies between retries */
 
 #define SOCKNAL_ROUND_ROBIN     0               /* round robin / load balance */
 
 # define SOCKNAL_RISK_KMAP_DEADLOCK  1
 #endif
 
-typedef struct                                  /* pool of forwarding buffers */
-{
-        spinlock_t        fmp_lock;             /* serialise */
-        struct list_head  fmp_idle_fmbs;        /* free buffers */
-        struct list_head  fmp_blocked_conns;    /* connections waiting for a buffer */
-        int               fmp_nactive_fmbs;     /* # buffers in use */
-        int               fmp_buff_pages;       /* # pages per buffer */
-} ksock_fmb_pool_t;
-
-
 typedef struct                                  /* per scheduler state */
 {
         spinlock_t        kss_lock;             /* serialise */
         struct list_head  kss_rx_conns;         /* conn waiting to be read */
         struct list_head  kss_tx_conns;         /* conn waiting to be written */
-#if SOCKNAL_ZC
-        struct list_head  kss_zctxdone_list;    /* completed ZC transmits */
-#endif
+        struct list_head  kss_zombie_noop_txs;  /* zombie noop tx list */
         cfs_waitq_t       kss_waitq;            /* where scheduler sleeps */
         int               kss_nconns;           /* # connections assigned to this scheduler */
 } ksock_sched_t;
 
 typedef struct
 {
-        int               ksni_valid:1;         /* been set yet? */
-        int               ksni_bound:1;         /* bound to a cpu yet? */
-        int               ksni_sched:6;         /* which scheduler (assumes < 64) */
+        unsigned int      ksni_valid:1;         /* been set yet? */
+        unsigned int      ksni_bound:1;         /* bound to a cpu yet? */
+        unsigned int      ksni_sched:6;         /* which scheduler (assumes < 64) */
 } ksock_irqinfo_t;
 
-typedef struct
+typedef struct                                  /* in-use interface */
 {
         __u32             ksni_ipaddr;          /* interface's IP address */
         __u32             ksni_netmask;         /* interface's network mask */
         int               ksni_nroutes;         /* # routes using (active) */
         int               ksni_npeers;          /* # peers using (passive) */
+        char              ksni_name[16];        /* interface name */
 } ksock_interface_t;
 
 typedef struct
 {
-        int               ksnd_io_timeout;      /* "stuck" socket timeout (seconds) */
-        int               ksnd_eager_ack;       /* make TCP ack eagerly? */
-        int               ksnd_typed_conns;     /* drive sockets by type? */
-        int               ksnd_min_bulk;        /* smallest "large" message */
-        int               ksnd_buffer_size;     /* socket buffer size */
-        int               ksnd_nagle;           /* enable NAGLE? */
-        int               ksnd_irq_affinity;    /* enable IRQ affinity? */
-        int               ksnd_keepalive_idle;  /* # idle secs before 1st probe */
-        int               ksnd_keepalive_count; /* # probes */
-        int               ksnd_keepalive_intvl; /* time between probes */
-#if SOCKNAL_ZC
-        unsigned int      ksnd_zc_min_frag;     /* minimum zero copy frag size */
+        int              *ksnd_timeout;         /* "stuck" socket timeout (seconds) */
+        int              *ksnd_nconnds;         /* # connection daemons */
+        int              *ksnd_min_reconnectms; /* first connection retry after (ms)... */
+        int              *ksnd_max_reconnectms; /* ...exponentially increasing to this */
+        int              *ksnd_eager_ack;       /* make TCP ack eagerly? */
+        int              *ksnd_typed_conns;     /* drive sockets by type? */
+        int              *ksnd_min_bulk;        /* smallest "large" message */
+        int              *ksnd_tx_buffer_size;  /* socket tx buffer size */
+        int              *ksnd_rx_buffer_size;  /* socket rx buffer size */
+        int              *ksnd_nagle;           /* enable NAGLE? */
+        int              *ksnd_keepalive_idle;  /* # idle secs before 1st probe */
+        int              *ksnd_keepalive_count; /* # probes */
+        int              *ksnd_keepalive_intvl; /* time between probes */
+        int              *ksnd_credits;         /* # concurrent sends */
+        int              *ksnd_peercredits;     /* # concurrent sends to 1 peer */
+        int              *ksnd_enable_csum;     /* enable check sum */
+        int              *ksnd_inject_csum_error; /* set non-zero to inject checksum error */
+        unsigned int     *ksnd_zc_min_frag;     /* minimum zero copy frag size */
+#ifdef CPU_AFFINITY
+        int              *ksnd_irq_affinity;    /* enable IRQ affinity? */
 #endif
+#ifdef SOCKNAL_BACKOFF
+        int              *ksnd_backoff_init;    /* initial TCP backoff */
+        int              *ksnd_backoff_max;     /* maximum TCP backoff */
+#endif
+#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
         cfs_sysctl_table_header_t *ksnd_sysctl;   /* sysctl interface */
+#endif
 } ksock_tunables_t;
 
 typedef struct
 {
+        __u64             ksnn_incarnation;     /* my epoch */
+        spinlock_t        ksnn_lock;            /* serialise */
+        int               ksnn_npeers;          /* # peers */
+        int               ksnn_shutdown;        /* shutting down? */
+        int               ksnn_ninterfaces;     /* IP interfaces */
+        ksock_interface_t ksnn_interfaces[LNET_MAX_INTERFACES];
+} ksock_net_t;
+
+typedef struct
+{
         int               ksnd_init;            /* initialisation state */
-        __u64             ksnd_incarnation;     /* my epoch */
-        
+        int               ksnd_nnets;           /* # networks set up */
+
         rwlock_t          ksnd_global_lock;     /* stabilize peer/conn ops */
         struct list_head *ksnd_peers;           /* hash table of all my known peers */
         int               ksnd_peer_hash_size;  /* size of ksnd_peers */
@@ -161,15 +144,7 @@ typedef struct
         int               ksnd_nschedulers;     /* # schedulers */
         ksock_sched_t    *ksnd_schedulers;      /* their state */
 
-        atomic_t          ksnd_npeers;          /* total # peers extant */
-        atomic_t          ksnd_nclosing_conns;  /* # closed conns extant */
-
-        kpr_router_t      ksnd_router;          /* THE router */
-
-        ksock_fmb_pool_t  ksnd_small_fmp;       /* small message forwarding buffers */
-        ksock_fmb_pool_t  ksnd_large_fmp;       /* large message forwarding buffers */
-
-        atomic_t          ksnd_nactive_ltxs;    /* #active ltxs */
+        atomic_t          ksnd_nactive_txs;     /* #active txs */
 
         struct list_head  ksnd_deathrow_conns;  /* conns to close: reaper_lock*/
         struct list_head  ksnd_zombie_conns;    /* conns to free: reaper_lock */
@@ -182,163 +157,148 @@ typedef struct
         int               ksnd_stall_tx;        /* test sluggish sender */
         int               ksnd_stall_rx;        /* test sluggish receiver */
 
-        struct list_head  ksnd_autoconnectd_routes; /* routes waiting to be connected */
-        cfs_waitq_t       ksnd_autoconnectd_waitq; /* autoconnectds sleep here */
-        spinlock_t        ksnd_autoconnectd_lock; /* serialise */
+        struct list_head  ksnd_connd_connreqs;  /* incoming connection requests */
+        struct list_head  ksnd_connd_routes;    /* routes waiting to be connected */
+        cfs_waitq_t       ksnd_connd_waitq;     /* connds sleep here */
+        int               ksnd_connd_connecting;/* # connds connecting */
+        spinlock_t        ksnd_connd_lock;      /* serialise */
+
+        struct list_head  ksnd_idle_noop_txs;   /* list head for freed noop tx */
+        spinlock_t        ksnd_tx_lock;         /* serialise, NOT safe in g_lock */
 
         ksock_irqinfo_t   ksnd_irqinfo[NR_IRQS];/* irq->scheduler lookup */
 
-        int               ksnd_ninterfaces;
-        ksock_interface_t ksnd_interfaces[SOCKNAL_MAX_INTERFACES]; /* published interfaces */
 } ksock_nal_data_t;
 
 #define SOCKNAL_INIT_NOTHING    0
 #define SOCKNAL_INIT_DATA       1
-#define SOCKNAL_INIT_LIB        2
-#define SOCKNAL_INIT_ALL        3
+#define SOCKNAL_INIT_ALL        2
 
 /* A packet just assembled for transmission is represented by 1 or more
  * struct iovec fragments (the first frag contains the portals header),
- * followed by 0 or more ptl_kiov_t fragments.
+ * followed by 0 or more lnet_kiov_t fragments.
  *
  * On the receive side, initially 1 struct iovec fragment is posted for
  * receive (the header).  Once the header has been received, the payload is
- * received into either struct iovec or ptl_kiov_t fragments, depending on
+ * received into either struct iovec or lnet_kiov_t fragments, depending on
  * what the header matched or whether the message needs forwarding. */
 
 struct ksock_conn;                              /* forward ref */
 struct ksock_peer;                              /* forward ref */
 struct ksock_route;                             /* forward ref */
+struct ksock_protocol;                          /* forward ref */
 
 typedef struct                                  /* transmit packet */
 {
         struct list_head        tx_list;        /* queue on conn for transmission etc */
-        char                    tx_isfwd;       /* forwarding / sourced here */
+        struct list_head        tx_zc_list;     /* queue on peer for ZC request */
+        atomic_t                tx_refcount;    /* tx reference count */
         int                     tx_nob;         /* # packet bytes */
         int                     tx_resid;       /* residual bytes */
         int                     tx_niov;        /* # packet iovec frags */
         struct iovec           *tx_iov;         /* packet iovec frags */
         int                     tx_nkiov;       /* # packet page frags */
-        ptl_kiov_t             *tx_kiov;        /* packet page frags */
+        lnet_kiov_t            *tx_kiov;        /* packet page frags */
         struct ksock_conn      *tx_conn;        /* owning conn */
-        ptl_hdr_t              *tx_hdr;         /* packet header (for debug only) */
-#if SOCKNAL_ZC        
-        zccd_t                  tx_zccd;        /* zero copy callback descriptor */
-#endif
+        lnet_msg_t             *tx_lnetmsg;     /* lnet message for lnet_finalize() */
+        ksock_msg_t             tx_msg;         /* socklnd message buffer */
+        int                     tx_desc_size;   /* size of this descriptor */
+        union {
+                struct {
+                        struct iovec iov;       /* virt hdr */
+                        lnet_kiov_t  kiov[0];   /* paged payload */
+                }                  paged;
+                struct {
+                        struct iovec iov[1];    /* virt hdr + payload */
+                }                  virt;
+        }                       tx_frags;
 } ksock_tx_t;
 
-typedef struct                                  /* forwarded packet */
-{
-        ksock_tx_t             ftx_tx;          /* send info */
-        struct iovec           ftx_iov;         /* hdr iovec */
-} ksock_ftx_t;
+#define KSOCK_NOOP_TX_SIZE      offsetof(ksock_tx_t, tx_frags.paged.kiov[0])
 
-#define KSOCK_ZCCD_2_TX(ptr)   list_entry (ptr, ksock_tx_t, tx_zccd)
 /* network zero copy callback descriptor embedded in ksock_tx_t */
 
-typedef struct                                  /* locally transmitted packet */
-{
-        ksock_tx_t              ltx_tx;         /* send info */
-        void                   *ltx_private;    /* lib_finalize() callback arg */
-        void                   *ltx_cookie;     /* lib_finalize() callback arg */
-        ptl_hdr_t               ltx_hdr;        /* buffer for packet header */
-        int                     ltx_desc_size;  /* bytes allocated for this desc */
-        struct iovec            ltx_iov[1];     /* iov for hdr + payload */
-        ptl_kiov_t              ltx_kiov[0];    /* kiov for payload */
-} ksock_ltx_t;
-
-#define KSOCK_TX_2_KPR_FWD_DESC(ptr)    list_entry ((kprfd_scratch_t *)ptr, kpr_fwd_desc_t, kprfd_scratch)
-/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch */
-
-#define KSOCK_TX_2_KSOCK_LTX(ptr)       list_entry (ptr, ksock_ltx_t, ltx_tx)
-/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */
-
-/* NB list_entry() is used here as convenient macro for calculating a
- * pointer to a struct from the address of a member. */
-
-typedef struct                                  /* Kernel portals Socket Forwarding message buffer */
-{                                               /* (socknal->router) */
-        struct list_head        fmb_list;       /* queue idle */
-        kpr_fwd_desc_t          fmb_fwd;        /* router's descriptor */
-        ksock_fmb_pool_t       *fmb_pool;       /* owning pool */
-        struct ksock_peer      *fmb_peer;       /* peer received from */
-        ptl_hdr_t               fmb_hdr;        /* message header */
-        ptl_kiov_t              fmb_kiov[0];    /* payload frags */
-} ksock_fmb_t;
-
 /* space for the rx frag descriptors; we either read a single contiguous
- * header, or up to PTL_MD_MAX_IOV frags of payload of either type. */
+ * header, or up to LNET_MAX_IOV frags of payload of either type. */
 typedef union {
-        struct iovec    iov[PTL_MD_MAX_IOV];
-        ptl_kiov_t      kiov[PTL_MD_MAX_IOV];
+        struct iovec     iov[LNET_MAX_IOV];
+        lnet_kiov_t      kiov[LNET_MAX_IOV];
 } ksock_rxiovspace_t;
 
-#define SOCKNAL_RX_HEADER       1               /* reading header */
-#define SOCKNAL_RX_BODY         2               /* reading body (to deliver here) */
-#define SOCKNAL_RX_BODY_FWD     3               /* reading body (to forward) */
-#define SOCKNAL_RX_SLOP         4               /* skipping body */
-#define SOCKNAL_RX_GET_FMB      5               /* scheduled for forwarding */
-#define SOCKNAL_RX_FMB_SLEEP    6               /* blocked waiting for a fwd desc */
+#define SOCKNAL_RX_KSM_HEADER   1               /* reading ksock message header */
+#define SOCKNAL_RX_LNET_HEADER  2               /* reading lnet message header */
+#define SOCKNAL_RX_PARSE        3               /* Calling lnet_parse() */
+#define SOCKNAL_RX_PARSE_WAIT   4               /* waiting to be told to read the body */
+#define SOCKNAL_RX_LNET_PAYLOAD 5               /* reading lnet payload (to deliver here) */
+#define SOCKNAL_RX_SLOP         6               /* skipping body */
 
 typedef struct ksock_conn
-{ 
+{
         struct ksock_peer  *ksnc_peer;          /* owning peer */
         struct ksock_route *ksnc_route;         /* owning route */
         struct list_head    ksnc_list;          /* stash on peer's conn list */
-        struct socket      *ksnc_sock;          /* actual socket */
+        cfs_socket_t       *ksnc_sock;          /* actual socket */
         void               *ksnc_saved_data_ready; /* socket's original data_ready() callback */
         void               *ksnc_saved_write_space; /* socket's original write_space() callback */
-        atomic_t            ksnc_refcount;      /* # users */
+        atomic_t            ksnc_conn_refcount; /* conn refcount */
+        atomic_t            ksnc_sock_refcount; /* sock refcount */
         ksock_sched_t     *ksnc_scheduler;     /* who schedules this connection */
         __u32               ksnc_myipaddr;      /* my IP */
         __u32               ksnc_ipaddr;        /* peer's IP */
         int                 ksnc_port;          /* peer's port */
-        int                 ksnc_closing;       /* being shut down */
-        int                 ksnc_type;          /* type of connection */
+        int                 ksnc_type:3;        /* type of connection, should be signed value */
+        int                 ksnc_closing:1;     /* being shut down */
+        int                 ksnc_flip:1;        /* flip or not, only for V2.x */
+        int                 ksnc_zc_capable:1;  /* enable to ZC */
         __u64               ksnc_incarnation;   /* peer's incarnation */
-        
+
         /* reader */
         struct list_head    ksnc_rx_list;       /* where I enq waiting input or a forwarding descriptor */
         cfs_time_t          ksnc_rx_deadline;   /* when (in jiffies) receive times out */
-        int                 ksnc_rx_started;    /* started receiving a message */
-        int                 ksnc_rx_ready;      /* data ready to read */
-        int                 ksnc_rx_scheduled;  /* being progressed */
-        int                 ksnc_rx_state;      /* what is being read */
+        __u8                ksnc_rx_started;    /* started receiving a message */
+        __u8                ksnc_rx_ready;      /* data ready to read */
+        __u8                ksnc_rx_scheduled;  /* being progressed */
+        __u8                ksnc_rx_state;      /* what is being read */
         int                 ksnc_rx_nob_left;   /* # bytes to next hdr/body  */
         int                 ksnc_rx_nob_wanted; /* bytes actually wanted */
         int                 ksnc_rx_niov;       /* # iovec frags */
         struct iovec       *ksnc_rx_iov;        /* the iovec frags */
         int                 ksnc_rx_nkiov;      /* # page frags */
-        ptl_kiov_t         *ksnc_rx_kiov;       /* the page frags */
+        lnet_kiov_t        *ksnc_rx_kiov;       /* the page frags */
         ksock_rxiovspace_t  ksnc_rx_iov_space;  /* space for frag descriptors */
-        void               *ksnc_cookie;        /* rx lib_finalize passthru arg */
-        ptl_hdr_t           ksnc_hdr;           /* where I read headers into */
+        __u32               ksnc_rx_csum;       /* partial checksum for incoming data */
+        void               *ksnc_cookie;        /* rx lnet_finalize passthru arg */
+        ksock_msg_t         ksnc_msg;           /* incoming message buffer:
+                                                 * V2.x message takes the whole struct
+                                                 * V1.x message is a bare lnet_hdr_t, it's stored
+                                                 * in ksnc_msg.ksm_u.lnetmsg */
 
         /* WRITER */
         struct list_head    ksnc_tx_list;       /* where I enq waiting for output space */
         struct list_head    ksnc_tx_queue;      /* packets waiting to be sent */
+        ksock_tx_t         *ksnc_tx_mono;       /* V2.x only, next mono-packet, mono-packet is :
+                                                 * a. lnet packet without piggyback
+                                                 * b. noop ZC-ACK packet */
         cfs_time_t          ksnc_tx_deadline;   /* when (in jiffies) tx times out */
         int                 ksnc_tx_bufnob;     /* send buffer marker */
         atomic_t            ksnc_tx_nob;        /* # bytes queued */
         int                 ksnc_tx_ready;      /* write space */
         int                 ksnc_tx_scheduled;  /* being progressed */
 
+        struct ksock_protocol *ksnc_proto;      /* protocol table for the connection */
+        
 #if !SOCKNAL_SINGLE_FRAG_RX
-        struct iovec        ksnc_rx_scratch_iov[PTL_MD_MAX_IOV];
+        struct iovec        ksnc_rx_scratch_iov[LNET_MAX_IOV];
 #endif
 #if !SOCKNAL_SINGLE_FRAG_TX
-        struct iovec        ksnc_tx_scratch_iov[PTL_MD_MAX_IOV];
+        struct iovec        ksnc_tx_scratch_iov[LNET_MAX_IOV];
 #endif
 } ksock_conn_t;
 
-#define KSNR_TYPED_ROUTES   ((1 << SOCKNAL_CONN_CONTROL) |      \
-                             (1 << SOCKNAL_CONN_BULK_IN) |      \
-                             (1 << SOCKNAL_CONN_BULK_OUT))
-
 typedef struct ksock_route
 {
         struct list_head    ksnr_list;          /* chain on peer route list */
-        struct list_head    ksnr_connect_list;  /* chain on autoconnect list */
+        struct list_head    ksnr_connd_list;    /* chain on ksnr_connd_routes */
         struct ksock_peer  *ksnr_peer;          /* owning peer */
         atomic_t            ksnr_refcount;      /* # users */
         cfs_time_t          ksnr_timeout;       /* when (in jiffies) reconnection can happen next */
@@ -346,51 +306,109 @@ typedef struct ksock_route
         __u32               ksnr_myipaddr;      /* my IP */
         __u32               ksnr_ipaddr;        /* IP address to connect to */
         int                 ksnr_port;          /* port to connect to */
-        unsigned int        ksnr_connecting:1;  /* autoconnect in progress */
+        unsigned int        ksnr_scheduled:1;   /* scheduled for attention */
+        unsigned int        ksnr_connecting:1;  /* connection establishment in progress */
         unsigned int        ksnr_connected:4;   /* connections established by type */
         unsigned int        ksnr_deleted:1;     /* been removed from peer? */
         unsigned int        ksnr_share_count;   /* created explicitly? */
         int                 ksnr_conn_count;    /* # conns established by this route */
+        struct ksock_protocol *ksnr_proto  ;    /* protocol table for connecting */
 } ksock_route_t;
 
 typedef struct ksock_peer
 {
         struct list_head    ksnp_list;          /* stash on global peer list */
-        ptl_nid_t           ksnp_nid;           /* who's on the other end(s) */
+        lnet_process_id_t   ksnp_id;            /* who's on the other end(s) */
         atomic_t            ksnp_refcount;      /* # users */
         int                 ksnp_sharecount;    /* lconf usage counter */
         int                 ksnp_closing;       /* being closed */
+        int                 ksnp_accepting;     /* # passive connections pending */
         int                 ksnp_error;         /* errno on closing last conn */
+        __u64               ksnp_zc_next_cookie;/* ZC completion cookie */
         struct list_head    ksnp_conns;         /* all active connections */
         struct list_head    ksnp_routes;        /* routes */
         struct list_head    ksnp_tx_queue;      /* waiting packets */
+        spinlock_t          ksnp_lock;          /* serialize, NOT safe in g_lock */
+        struct list_head    ksnp_zc_req_list;   /* zero copy requests wait for ACK  */
         cfs_time_t          ksnp_last_alive;    /* when (in jiffies) I was last alive */
+        lnet_ni_t          *ksnp_ni;            /* which network */
         int                 ksnp_n_passive_ips; /* # of... */
-        __u32               ksnp_passive_ips[SOCKNAL_MAX_INTERFACES]; /* preferred local interfaces */
+        __u32               ksnp_passive_ips[LNET_MAX_INTERFACES]; /* preferred local interfaces */
 } ksock_peer_t;
 
+typedef struct ksock_connreq
+{
+        struct list_head    ksncr_list;         /* stash on ksnd_connd_connreqs */
+        lnet_ni_t          *ksncr_ni;           /* chosen NI */
+        cfs_socket_t       *ksncr_sock;         /* accepted socket */
+} ksock_connreq_t;
 
-extern lib_nal_t        ksocknal_lib;
 extern ksock_nal_data_t ksocknal_data;
 extern ksock_tunables_t ksocknal_tunables;
 
+typedef struct ksock_protocol
+{
+        int     pro_version;                                                /* version number of protocol */
+        int     (*pro_send_hello)(ksock_conn_t *, ksock_hello_msg_t *);     /* handshake function */
+        int     (*pro_recv_hello)(ksock_conn_t *, ksock_hello_msg_t *, int);/* handshake function */
+        void    (*pro_pack)(ksock_tx_t *);                                  /* message pack */
+        void    (*pro_unpack)(ksock_msg_t *);                               /* message unpack */
+} ksock_protocol_t;
+
+extern ksock_protocol_t ksocknal_protocol_v1x;
+extern ksock_protocol_t ksocknal_protocol_v2x;
+
+#define KSOCK_PROTO_V1_MAJOR    LNET_PROTO_TCP_VERSION_MAJOR
+#define KSOCK_PROTO_V1_MINOR    LNET_PROTO_TCP_VERSION_MINOR
+#define KSOCK_PROTO_V1          KSOCK_PROTO_V1_MAJOR
+#define KSOCK_PROTO_V2          2
+
+static inline int
+ksocknal_route_mask(void) 
+{
+        if (!*ksocknal_tunables.ksnd_typed_conns)
+                return (1 << SOCKLND_CONN_ANY);
+        
+        return ((1 << SOCKLND_CONN_CONTROL) |
+                (1 << SOCKLND_CONN_BULK_IN) |
+                (1 << SOCKLND_CONN_BULK_OUT));
+}
+
 static inline struct list_head *
-ksocknal_nid2peerlist (ptl_nid_t nid)
+ksocknal_nid2peerlist (lnet_nid_t nid)
 {
         unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size;
 
         return (&ksocknal_data.ksnd_peers [hash]);
 }
 
+static inline void
+ksocknal_conn_addref (ksock_conn_t *conn)
+{
+        LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+        atomic_inc(&conn->ksnc_conn_refcount);
+}
+
+extern void ksocknal_queue_zombie_conn (ksock_conn_t *conn);
+
+static inline void
+ksocknal_conn_decref (ksock_conn_t *conn)
+{
+        LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
+        if (atomic_dec_and_test(&conn->ksnc_conn_refcount))
+                ksocknal_queue_zombie_conn(conn);
+}
+
 static inline int
-ksocknal_getconnsock (ksock_conn_t *conn)
+ksocknal_connsock_addref (ksock_conn_t *conn)
 {
         int   rc = -ESHUTDOWN;
 
         read_lock (&ksocknal_data.ksnd_global_lock);
         if (!conn->ksnc_closing) {
+                LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0);
+                atomic_inc(&conn->ksnc_sock_refcount);
                 rc = 0;
-                cfs_get_file (KSN_CONN2FILE(conn));
         }
         read_unlock (&ksocknal_data.ksnd_global_lock);
 
@@ -398,61 +416,127 @@ ksocknal_getconnsock (ksock_conn_t *conn)
 }
 
 static inline void
-ksocknal_putconnsock (ksock_conn_t *conn)
+ksocknal_connsock_decref (ksock_conn_t *conn)
+{
+        LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0);
+        if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) {
+                LASSERT (conn->ksnc_closing);
+                libcfs_sock_release(conn->ksnc_sock);
+                conn->ksnc_sock = NULL;
+        }
+}
+
+static inline void
+ksocknal_tx_addref (ksock_tx_t *tx)
+{
+        LASSERT (atomic_read(&tx->tx_refcount) > 0);
+        atomic_inc(&tx->tx_refcount);
+}
+
+extern void ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx);
+
+static inline void
+ksocknal_tx_decref (ksock_tx_t *tx)
 {
-        cfs_put_file (KSN_CONN2FILE(conn));
+        LASSERT (atomic_read(&tx->tx_refcount) > 0);
+        if (atomic_dec_and_test(&tx->tx_refcount))
+                ksocknal_tx_done(NULL, tx);
 }
 
-extern void ksocknal_put_route (ksock_route_t *route);
-extern void ksocknal_put_peer (ksock_peer_t *peer);
-extern ksock_peer_t *ksocknal_find_peer_locked (ptl_nid_t nid);
-extern ksock_peer_t *ksocknal_get_peer (ptl_nid_t nid);
-extern int ksocknal_del_route (ptl_nid_t nid, __u32 ipaddr,
-                               int single, int keep_conn);
-extern int ksocknal_create_conn (ksock_route_t *route,
-                                 struct socket *sock, int type);
+static inline void
+ksocknal_route_addref (ksock_route_t *route)
+{
+        LASSERT (atomic_read(&route->ksnr_refcount) > 0);
+        atomic_inc(&route->ksnr_refcount);
+}
+
+extern void ksocknal_destroy_route (ksock_route_t *route);
+
+static inline void
+ksocknal_route_decref (ksock_route_t *route)
+{
+        LASSERT (atomic_read (&route->ksnr_refcount) > 0);
+        if (atomic_dec_and_test(&route->ksnr_refcount))
+                ksocknal_destroy_route (route);
+}
+
+static inline void
+ksocknal_peer_addref (ksock_peer_t *peer)
+{
+        LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
+        atomic_inc(&peer->ksnp_refcount);
+}
+
+extern void ksocknal_destroy_peer (ksock_peer_t *peer);
+
+static inline void
+ksocknal_peer_decref (ksock_peer_t *peer)
+{
+        LASSERT (atomic_read (&peer->ksnp_refcount) > 0);
+        if (atomic_dec_and_test(&peer->ksnp_refcount))
+                ksocknal_destroy_peer (peer);
+}
+
+int ksocknal_startup (lnet_ni_t *ni);
+void ksocknal_shutdown (lnet_ni_t *ni);
+int ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int ksocknal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, 
+                  int delayed, unsigned int niov, 
+                  struct iovec *iov, lnet_kiov_t *kiov,
+                  unsigned int offset, unsigned int mlen, unsigned int rlen);
+int ksocknal_accept(lnet_ni_t *ni, cfs_socket_t *sock);
+
+extern int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip, int port);
+extern ksock_peer_t *ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id);
+extern ksock_peer_t *ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id);
+extern void ksocknal_peer_failed (ksock_peer_t *peer);
+extern int ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
+                                 cfs_socket_t *sock, int type);
 extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why);
 extern void ksocknal_terminate_conn (ksock_conn_t *conn);
 extern void ksocknal_destroy_conn (ksock_conn_t *conn);
-extern void ksocknal_put_conn (ksock_conn_t *conn);
 extern int ksocknal_close_stale_conns_locked (ksock_peer_t *peer, __u64 incarnation);
 extern int ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why);
-extern int ksocknal_close_matching_conns (ptl_nid_t nid, __u32 ipaddr);
+extern int ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr);
 
 extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn);
-extern void ksocknal_tx_done (ksock_tx_t *tx, int asynch);
-extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
-extern void ksocknal_fmb_callback (void *arg, int error);
-extern void ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive);
+extern void ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error);
+extern void ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive);
 extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg);
+extern void ksocknal_thread_fini (void);
+extern ksock_route_t *ksocknal_find_connecting_route_locked (ksock_peer_t *peer);
 extern int ksocknal_new_packet (ksock_conn_t *conn, int skip);
 extern int ksocknal_scheduler (void *arg);
-extern int ksocknal_autoconnectd (void *arg);
+extern int ksocknal_connd (void *arg);
 extern int ksocknal_reaper (void *arg);
-extern int ksocknal_setup_sock (struct socket *sock);
-extern int ksocknal_send_hello (ksock_conn_t *conn, __u32 *ipaddrs, int nipaddrs);
-extern int ksocknal_recv_hello (ksock_conn_t *conn,
-                                ptl_nid_t *nid, __u64 *incarnation, __u32 *ipaddrs);
-
-extern void ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn);
-extern void ksocknal_lib_set_callback(struct socket *sock,  ksock_conn_t *conn);
-extern void ksocknal_lib_act_callback(struct socket *sock, ksock_conn_t *conn);
-extern void ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn);
+extern ksock_protocol_t * ksocknal_compat_protocol(ksock_hello_msg_t *);
+extern int ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+                                lnet_nid_t peer_nid, ksock_hello_msg_t *hello);
+extern int ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn, 
+                                ksock_hello_msg_t *hello, lnet_process_id_t *id,
+                                __u64 *incarnation);
+extern void ksocknal_read_callback(ksock_conn_t *conn);
+extern void ksocknal_write_callback(ksock_conn_t *conn);
+
+extern int ksocknal_lib_zc_capable(cfs_socket_t *sock);
+extern void ksocknal_lib_save_callback(cfs_socket_t *sock, ksock_conn_t *conn);
+extern void ksocknal_lib_set_callback(cfs_socket_t *sock,  ksock_conn_t *conn);
+extern void ksocknal_lib_reset_callback(cfs_socket_t *sock, ksock_conn_t *conn);
 extern void ksocknal_lib_push_conn (ksock_conn_t *conn);
 extern void ksocknal_lib_bind_irq (unsigned int irq);
 extern int ksocknal_lib_get_conn_addrs (ksock_conn_t *conn);
-extern unsigned int ksocknal_lib_sock_irq (struct socket *sock);
-extern int ksocknal_lib_setup_sock (struct socket *so);
+extern unsigned int ksocknal_lib_sock_irq (cfs_socket_t *sock);
+extern int ksocknal_lib_setup_sock (cfs_socket_t *so);
 extern int ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx);
 extern int ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx);
 extern void ksocknal_lib_eager_ack (ksock_conn_t *conn);
 extern int ksocknal_lib_recv_iov (ksock_conn_t *conn);
 extern int ksocknal_lib_recv_kiov (ksock_conn_t *conn);
-extern int ksocknal_lib_sock_write (struct socket *sock, 
-                                    void *buffer, int nob);
-extern int ksocknal_lib_sock_read (struct socket *sock, 
-                                   void *buffer, int nob);
 extern int ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, 
                                            int *rxmem, int *nagle);
-extern int ksocknal_lib_connect_sock(struct socket **sockp, int *may_retry,
-                                     ksock_route_t *route, int local_port);
+
+extern int ksocknal_lib_tunables_init(void);
+extern void ksocknal_lib_tunables_fini(void);
+
+extern void ksocknal_lib_csum_tx(ksock_tx_t *tx);
index bd26027..7ca80cd 100644 (file)
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#include "socknal.h"
+#include "socklnd.h"
 
-/*
- *  LIB functions follow
- *
- */
-int
-ksocknal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
+ksock_tx_t *
+ksocknal_alloc_tx (int size) 
 {
-        /* I would guess that if ksocknal_get_peer (nid) == NULL,
-           and we're not routing, then 'nid' is very distant :) */
-        if (nal->libnal_ni.ni_pid.nid == nid) {
-                *dist = 0;
-        } else {
-                *dist = 1;
+        ksock_tx_t *tx = NULL;
+
+        if (size == KSOCK_NOOP_TX_SIZE) {
+                /* searching for a noop tx in free list */
+                spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+                if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) {
+                        tx = list_entry(ksocknal_data.ksnd_idle_noop_txs.next,
+                                        ksock_tx_t, tx_list);
+                        LASSERT(tx->tx_desc_size == size);
+                        list_del(&tx->tx_list);
+                }
+                             
+                spin_unlock(&ksocknal_data.ksnd_tx_lock);
         }
+                
+        if (tx == NULL)
+                LIBCFS_ALLOC(tx, size);
 
-        return 0;
+        if (tx == NULL) 
+                return NULL;
+
+        atomic_set(&tx->tx_refcount, 1);
+        tx->tx_desc_size = size;
+        atomic_inc(&ksocknal_data.ksnd_nactive_txs);
+
+        return tx;
+}
+
+void
+ksocknal_free_tx (ksock_tx_t *tx)
+{
+        atomic_dec(&ksocknal_data.ksnd_nactive_txs);
+
+        if (tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) {
+                /* it's a noop tx */
+                spin_lock(&ksocknal_data.ksnd_tx_lock);
+
+                list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs);
+
+                spin_unlock(&ksocknal_data.ksnd_tx_lock);
+        } else {
+                LIBCFS_FREE(tx, tx->tx_desc_size);
+        }
 }
 
 void
-ksocknal_free_ltx (ksock_ltx_t *ltx)
+ksocknal_init_msg(ksock_msg_t *msg, int type)
 {
-        atomic_dec(&ksocknal_data.ksnd_nactive_ltxs);
-        PORTAL_FREE(ltx, ltx->ltx_desc_size);
+        msg->ksm_type           = type;
+        msg->ksm_csum           = 0;
+        msg->ksm_zc_req_cookie  = 0;
+        msg->ksm_zc_ack_cookie  = 0;
 }
 
 int
@@ -90,7 +123,7 @@ ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
 int
 ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
 { 
-        ptl_kiov_t    *kiov = tx->tx_kiov;
+        lnet_kiov_t    *kiov = tx->tx_kiov;
         int     nob;
         int     rc;
 
@@ -102,7 +135,7 @@ ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
 
         if (rc <= 0)                            /* sent nothing? */ 
                 return (rc); 
-
+        
         nob = rc; 
         LASSERT (nob <= tx->tx_resid); 
         tx->tx_resid -= nob; 
@@ -130,15 +163,14 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
 {
         int      rc;
         int      bufnob;
-
+        
         if (ksocknal_data.ksnd_stall_tx != 0) {
-                set_current_state (TASK_UNINTERRUPTIBLE);
-                schedule_timeout (cfs_time_seconds(ksocknal_data.ksnd_stall_tx));
+                cfs_pause(cfs_time_seconds(ksocknal_data.ksnd_stall_tx));
         }
 
         LASSERT (tx->tx_resid != 0);
 
-        rc = ksocknal_getconnsock (conn);
+        rc = ksocknal_connsock_addref(conn);
         if (rc != 0) {
                 LASSERT (conn->ksnc_closing);
                 return (-ESHUTDOWN);
@@ -158,18 +190,18 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
                 bufnob = SOCK_WMEM_QUEUED(conn->ksnc_sock);
                 if (rc > 0)                     /* sent something? */
                         conn->ksnc_tx_bufnob += rc; /* account it */
-
+                
                 if (bufnob < conn->ksnc_tx_bufnob) {
                         /* allocated send buffer bytes < computed; infer
                          * something got ACKed */
-                        conn->ksnc_tx_deadline = cfs_time_shift(ksocknal_tunables.ksnd_io_timeout);
+                        conn->ksnc_tx_deadline = 
+                                cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
                         conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
                         conn->ksnc_tx_bufnob = bufnob;
                         mb();
                 }
 
                 if (rc <= 0) { /* Didn't write anything? */
-                        unsigned long  flags;
                         ksock_sched_t *sched;
 
                         if (rc == 0) /* some stacks return 0 instead of -EAGAIN */
@@ -181,8 +213,8 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
                         /* Check if EAGAIN is due to memory pressure */
 
                         sched = conn->ksnc_scheduler;
-                        spin_lock_irqsave(&sched->kss_lock, flags);
-
+                        spin_lock_bh (&sched->kss_lock);
+                                
                         if (!SOCK_TEST_NOSPACE(conn->ksnc_sock) &&
                             !conn->ksnc_tx_ready) {
                                 /* SOCK_NOSPACE is set when the socket fills
@@ -196,7 +228,7 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
                                 rc = -ENOMEM;
                         }
 
-                        spin_unlock_irqrestore(&sched->kss_lock, flags);
+                        spin_unlock_bh (&sched->kss_lock);
                         break;
                 }
 
@@ -206,7 +238,7 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
 
         } while (tx->tx_resid != 0);
 
-        ksocknal_putconnsock (conn);
+        ksocknal_connsock_decref(conn);
         return (rc);
 }
 
@@ -228,12 +260,13 @@ ksocknal_recv_iov (ksock_conn_t *conn)
 
         /* received something... */ 
         nob = rc; 
-
+        
         conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); 
-        conn->ksnc_rx_deadline = cfs_time_shift (ksocknal_tunables.ksnd_io_timeout); 
+        conn->ksnc_rx_deadline = 
+                cfs_time_shift(*ksocknal_tunables.ksnd_timeout); 
         mb();                           /* order with setting rx_started */ 
         conn->ksnc_rx_started = 1; 
-
+        
         conn->ksnc_rx_nob_wanted -= nob; 
         conn->ksnc_rx_nob_left -= nob;
 
@@ -257,7 +290,7 @@ ksocknal_recv_iov (ksock_conn_t *conn)
 int
 ksocknal_recv_kiov (ksock_conn_t *conn)
 {
-        ptl_kiov_t   *kiov = conn->ksnc_rx_kiov;
+        lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
         int     nob;
         int     rc;
         LASSERT (conn->ksnc_rx_nkiov > 0);
@@ -265,21 +298,22 @@ ksocknal_recv_kiov (ksock_conn_t *conn)
         /* Never touch conn->ksnc_rx_kiov or change connection 
          * status inside ksocknal_lib_recv_iov */
         rc = ksocknal_lib_recv_kiov(conn); 
-
+        
         if (rc <= 0) 
                 return (rc); 
-
+        
         /* received something... */ 
         nob = rc; 
 
         conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); 
-        conn->ksnc_rx_deadline = cfs_time_shift (ksocknal_tunables.ksnd_io_timeout); 
+        conn->ksnc_rx_deadline = 
+                cfs_time_shift(*ksocknal_tunables.ksnd_timeout); 
         mb();                           /* order with setting rx_started */ 
         conn->ksnc_rx_started = 1;
 
         conn->ksnc_rx_nob_wanted -= nob; 
         conn->ksnc_rx_nob_left -= nob; 
-
+        
         do { 
                 LASSERT (conn->ksnc_rx_nkiov > 0); 
 
@@ -305,13 +339,12 @@ ksocknal_receive (ksock_conn_t *conn)
          * progress/completion. */
         int     rc;
         ENTRY;
-
+        
         if (ksocknal_data.ksnd_stall_rx != 0) {
-                set_current_state (TASK_UNINTERRUPTIBLE);
-                schedule_timeout(cfs_time_seconds (ksocknal_data.ksnd_stall_rx));
+                cfs_pause(cfs_time_seconds (ksocknal_data.ksnd_stall_rx));
         }
 
-        rc = ksocknal_getconnsock (conn);
+        rc = ksocknal_connsock_addref(conn);
         if (rc != 0) {
                 LASSERT (conn->ksnc_closing);
                 return (-ESHUTDOWN);
@@ -337,107 +370,128 @@ ksocknal_receive (ksock_conn_t *conn)
                 /* Completed a fragment */
 
                 if (conn->ksnc_rx_nob_wanted == 0) {
-                        /* Completed a message segment (header or payload) */
-                        if ((ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0 &&
-                            (conn->ksnc_rx_state ==  SOCKNAL_RX_BODY ||
-                             conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD)) {
-                                /* Remind the socket to ack eagerly... */
-                                ksocknal_lib_eager_ack(conn);
-                        }
                         rc = 1;
                         break;
                 }
         }
 
-        ksocknal_putconnsock (conn);
+        ksocknal_connsock_decref(conn);
         RETURN (rc);
 }
 
-#if SOCKNAL_ZC
 void
-ksocknal_zc_callback (zccd_t *zcd)
+ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx)
 {
-        ksock_tx_t    *tx = KSOCK_ZCCD_2_TX(zcd);
-        ksock_sched_t *sched = tx->tx_conn->ksnc_scheduler;
-        unsigned long  flags;
+        lnet_msg_t  *lnetmsg = tx->tx_lnetmsg;
+        int          rc = (tx->tx_resid == 0) ? 0 : -EIO;
         ENTRY;
 
-        /* Schedule tx for cleanup (can't do it now due to lock conflicts) */
+        LASSERT(ni != NULL || tx->tx_conn != NULL);
+
+        if (tx->tx_conn != NULL)
+                ksocknal_conn_decref(tx->tx_conn);
 
-        spin_lock_irqsave (&sched->kss_lock, flags);
+        if (ni == NULL && tx->tx_conn != NULL) 
+                ni = tx->tx_conn->ksnc_peer->ksnp_ni;
 
-        list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list);
-        cfs_waitq_signal (&sched->kss_waitq);
+        ksocknal_free_tx (tx);
+        if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */
+                lnet_finalize (ni, lnetmsg, rc);
 
-        spin_unlock_irqrestore (&sched->kss_lock, flags);
         EXIT;
 }
-#endif
 
 void
-ksocknal_tx_done (ksock_tx_t *tx, int asynch)
+ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error)
 {
-        ksock_ltx_t   *ltx;
-        ENTRY;
+        ksock_tx_t *tx;
+        
+        while (!list_empty (txlist)) {
+                tx = list_entry (txlist->next, ksock_tx_t, tx_list);
+
+                if (error && tx->tx_lnetmsg != NULL) {
+                        CDEBUG (D_NETERROR, "Deleting packet type %d len %d %s->%s\n",
+                                le32_to_cpu (tx->tx_lnetmsg->msg_hdr.type),
+                                le32_to_cpu (tx->tx_lnetmsg->msg_hdr.payload_length),
+                                libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)),
+                                libcfs_nid2str(le64_to_cpu (tx->tx_lnetmsg->msg_hdr.dest_nid)));
+                } else if (error) {
+                        CDEBUG (D_NETERROR, "Deleting noop packet\n");
+                }
 
-        if (tx->tx_conn != NULL) {
-#if SOCKNAL_ZC
-                /* zero copy completion isn't always from
-                 * process_transmit() so it needs to keep a ref on
-                 * tx_conn... */
-                if (asynch)
-                        ksocknal_put_conn (tx->tx_conn);
-#else
-                LASSERT (!asynch);
-#endif
-        }
+                list_del (&tx->tx_list);
 
-        if (tx->tx_isfwd) {             /* was a forwarded packet? */
-                kpr_fwd_done (&ksocknal_data.ksnd_router,
-                              KSOCK_TX_2_KPR_FWD_DESC (tx), 
-                              (tx->tx_resid == 0) ? 0 : -ECONNABORTED);
-                EXIT;
-                return;
+                LASSERT (atomic_read(&tx->tx_refcount) == 1);
+                ksocknal_tx_done (ni, tx);
         }
+}
+
+int
+ksocknal_zc_req(ksock_tx_t *tx)
+{
+        lnet_kiov_t    *kiov = tx->tx_kiov;
+        int             nkiov = tx->tx_nkiov;
 
-        /* local send */
-        ltx = KSOCK_TX_2_KSOCK_LTX (tx);
+        if (!tx->tx_conn->ksnc_zc_capable)
+                return 0;
 
-        lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie,
-                      (tx->tx_resid == 0) ? PTL_OK : PTL_FAIL);
+        while (nkiov > 0) {
+                if (kiov->kiov_len >= *ksocknal_tunables.ksnd_zc_min_frag)
+                        return 1;
+                --nkiov;
+                ++kiov;
+        } 
 
-        ksocknal_free_ltx (ltx);
-        EXIT;
+        return 0;
 }
 
-void
-ksocknal_tx_launched (ksock_tx_t *tx) 
+static void
+ksocknal_queue_zc_req(ksock_tx_t *tx)
 {
-#if SOCKNAL_ZC
-        if (atomic_read (&tx->tx_zccd.zccd_count) != 1) {
-                ksock_conn_t  *conn = tx->tx_conn;
+        ksock_peer_t   *peer = tx->tx_conn->ksnc_peer;
 
-                /* zccd skbufs are still in-flight.  First take a ref on
-                 * conn, so it hangs about for ksocknal_tx_done... */
-                atomic_inc (&conn->ksnc_refcount);
+        /* assign cookie and queue tx to pending list, it will be 
+         * released while getting ack, see ksocknal_handle_zc_ack() */
 
-                /* ...then drop the initial ref on zccd, so the zero copy
-                 * callback can occur */
-                zccd_put (&tx->tx_zccd);
-                return;
-        }
-#endif
-        /* Any zero-copy-ness (if any) has completed; I can complete the
-         * transmit now, avoiding an extra schedule */
-        ksocknal_tx_done (tx, 0);
+        ksocknal_tx_addref(tx); /* +1 ref */
+
+        spin_lock(&peer->ksnp_lock);
+
+        tx->tx_msg.ksm_zc_req_cookie = peer->ksnp_zc_next_cookie++; 
+        list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list);
+
+        spin_unlock(&peer->ksnp_lock);
 }
 
+static void
+ksocknal_dequeue_zc_req(ksock_tx_t *tx)
+{
+        ksock_peer_t   *peer = tx->tx_conn->ksnc_peer;
+
+        spin_lock(&peer->ksnp_lock);
+
+        if (tx->tx_msg.ksm_zc_req_cookie != 0) {
+                /* not deleted by ksocknal_terminate_conn() */
+                list_del(&tx->tx_zc_list);
+        }
+
+        spin_unlock(&peer->ksnp_lock);
+
+        if (tx->tx_msg.ksm_zc_req_cookie != 0)
+                ksocknal_tx_decref(tx); /* -1 ref */
+}
 int
 ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
 {
-        unsigned long  flags;
         int            rc;
 
+        if (conn->ksnc_proto == &ksocknal_protocol_v2x &&
+            tx->tx_msg.ksm_zc_req_cookie == 0 &&
+            ksocknal_zc_req(tx)) {
+                /* wait for ACK */
+                ksocknal_queue_zc_req(tx);
+        }
+      
         rc = ksocknal_transmit (conn, tx);
 
         CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc);
@@ -446,7 +500,6 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
                 /* Sent everything OK */
                 LASSERT (rc == 0);
 
-                ksocknal_tx_launched (tx);
                 return (0);
         }
 
@@ -458,11 +511,11 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
 
                 counter++;   /* exponential backoff warnings */
                 if ((counter & (-counter)) == counter)
-                        CWARN("%d ENOMEM tx %p (%u allocated)\n",
-                              counter, conn, atomic_read(&portal_kmemory));
+                        CWARN("%u ENOMEM tx %p (%u allocated)\n",
+                              counter, conn, atomic_read(&libcfs_kmemory));
 
                 /* Queue on ksnd_enomem_conns for retry after a timeout */
-                spin_lock_irqsave(&ksocknal_data.ksnd_reaper_lock, flags);
+                spin_lock_bh (&ksocknal_data.ksnd_reaper_lock);
 
                 /* enomem list takes over scheduler's ref... */
                 LASSERT (conn->ksnc_tx_scheduled);
@@ -472,8 +525,8 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
                                                    SOCKNAL_ENOMEM_RETRY),
                                    ksocknal_data.ksnd_reaper_waketime))
                         cfs_waitq_signal (&ksocknal_data.ksnd_reaper_waitq);
-
-                spin_unlock_irqrestore(&ksocknal_data.ksnd_reaper_lock, flags);
+                
+                spin_unlock_bh (&ksocknal_data.ksnd_reaper_lock);
                 return (rc);
         }
 
@@ -494,76 +547,47 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx)
                                       HIPQUAD(conn->ksnc_ipaddr), rc);
                         break;
                 }
-                CDEBUG(D_HA, "[%p] Error %d on write to "LPX64
+                CDEBUG(D_NET, "[%p] Error %d on write to %s"
                        " ip %d.%d.%d.%d:%d\n", conn, rc,
-                       conn->ksnc_peer->ksnp_nid,
+                       libcfs_id2str(conn->ksnc_peer->ksnp_id),
                        HIPQUAD(conn->ksnc_ipaddr),
                        conn->ksnc_port);
+        } else {
+                /* closed, dequeue the ZC request if needed */
+                ksocknal_dequeue_zc_req(tx);
         }
 
-        ksocknal_close_conn_and_siblings (conn, rc);
-        ksocknal_tx_launched (tx);
+        /* it's not an error if conn is being closed */
+        ksocknal_close_conn_and_siblings (conn, 
+                                          (conn->ksnc_closing) ? 0 : rc);
 
         return (rc);
 }
 
 void
-ksocknal_launch_autoconnect_locked (ksock_route_t *route)
+ksocknal_launch_connection_locked (ksock_route_t *route)
 {
-        unsigned long     flags;
 
         /* called holding write lock on ksnd_global_lock */
-        LASSERT (!route->ksnr_connecting);
-
-        route->ksnr_connecting = 1;             /* scheduling conn for autoconnectd */
-        atomic_inc (&route->ksnr_refcount);     /* extra ref for autoconnectd */
-
-        spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags);
-
-        list_add_tail (&route->ksnr_connect_list,
-                       &ksocknal_data.ksnd_autoconnectd_routes);
-        cfs_waitq_signal (&ksocknal_data.ksnd_autoconnectd_waitq);
-
-        spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags);
-}
-
-ksock_peer_t *
-ksocknal_find_target_peer_locked (ksock_tx_t *tx, ptl_nid_t nid)
-{
-        char          ipbuf[PTL_NALFMT_SIZE];
-        ptl_nid_t     target_nid;
-        int           rc;
-        ksock_peer_t *peer = ksocknal_find_peer_locked (nid);
-
-        if (peer != NULL)
-                return (peer);
-
-        if (tx->tx_isfwd) {
-                CERROR ("Can't send packet to "LPX64
-                       " %s: routed target is not a peer\n",
-                        nid, portals_nid2str(SOCKNAL, nid, ipbuf));
-                return (NULL);
-        }
-
-        rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, tx->tx_nob,
-                         &target_nid);
-        if (rc != 0) {
-                CERROR ("Can't route to "LPX64" %s: router error %d\n",
-                        nid, portals_nid2str(SOCKNAL, nid, ipbuf), rc);
-                return (NULL);
-        }
-
-        peer = ksocknal_find_peer_locked (target_nid);
-        if (peer != NULL)
-                return (peer);
 
-        CERROR ("Can't send packet to "LPX64" %s: no peer entry\n",
-                target_nid, portals_nid2str(SOCKNAL, target_nid, ipbuf));
-        return (NULL);
+        LASSERT (!route->ksnr_scheduled);
+        LASSERT (!route->ksnr_connecting);
+        LASSERT ((ksocknal_route_mask() & ~route->ksnr_connected) != 0);
+        
+        route->ksnr_scheduled = 1;              /* scheduling conn for connd */
+        ksocknal_route_addref(route);           /* extra ref for connd */
+        
+        spin_lock_bh (&ksocknal_data.ksnd_connd_lock);
+        
+        list_add_tail (&route->ksnr_connd_list,
+                       &ksocknal_data.ksnd_connd_routes);
+        cfs_waitq_signal (&ksocknal_data.ksnd_connd_waitq);
+        
+        spin_unlock_bh (&ksocknal_data.ksnd_connd_lock);
 }
 
 ksock_conn_t *
-ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer)
+ksocknal_find_conn_locked (int payload_nob, ksock_peer_t *peer)
 {
         struct list_head *tmp;
         ksock_conn_t     *typed = NULL;
@@ -574,6 +598,7 @@ ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer)
 
         list_for_each (tmp, &peer->ksnp_conns) {
                 ksock_conn_t *c = list_entry(tmp, ksock_conn_t, ksnc_list);
+                int           hdr_nob = 0;
 #if SOCKNAL_ROUND_ROBIN
                 const int     nob = 0;
 #else
@@ -581,29 +606,40 @@ ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer)
                                         SOCK_WMEM_QUEUED(c->ksnc_sock);
 #endif
                 LASSERT (!c->ksnc_closing);
+                LASSERT(c->ksnc_proto != NULL);
 
                 if (fallback == NULL || nob < fnob) {
                         fallback = c;
                         fnob     = nob;
                 }
 
-                if (!ksocknal_tunables.ksnd_typed_conns)
+                if (!*ksocknal_tunables.ksnd_typed_conns)
                         continue;
 
+                if (payload_nob == 0) {
+                        /* noop packet */
+                        hdr_nob = offsetof(ksock_msg_t, ksm_u);
+                } else {
+                        /* lnet packet */
+                        hdr_nob = (c->ksnc_proto == &ksocknal_protocol_v2x)?
+                                  offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_payload):
+                                  sizeof(lnet_hdr_t);
+                }
+
                 switch (c->ksnc_type) {
                 default:
                         CERROR("ksnc_type bad: %u\n", c->ksnc_type);
                         LBUG();
-                case SOCKNAL_CONN_ANY:
+                case SOCKLND_CONN_ANY:
                         break;
-                case SOCKNAL_CONN_BULK_IN:
+                case SOCKLND_CONN_BULK_IN:
                         continue;
-                case SOCKNAL_CONN_BULK_OUT:
-                        if (tx->tx_nob < ksocknal_tunables.ksnd_min_bulk)
+                case SOCKLND_CONN_BULK_OUT:
+                        if ((hdr_nob + payload_nob) < *ksocknal_tunables.ksnd_min_bulk)
                                 continue;
                         break;
-                case SOCKNAL_CONN_CONTROL:
-                        if (tx->tx_nob >= ksocknal_tunables.ksnd_min_bulk)
+                case SOCKLND_CONN_CONTROL:
+                        if ((hdr_nob + payload_nob) >= *ksocknal_tunables.ksnd_min_bulk)
                                 continue;
                         break;
                 }
@@ -628,54 +664,179 @@ ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer)
 }
 
 void
+ksocknal_next_mono_tx(ksock_conn_t *conn)
+{
+        ksock_tx_t     *tx = conn->ksnc_tx_mono;
+
+        /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */
+        LASSERT(conn->ksnc_proto == &ksocknal_protocol_v2x);
+        LASSERT(!list_empty(&conn->ksnc_tx_queue));
+        LASSERT(tx != NULL);
+
+        if (tx->tx_list.next == &conn->ksnc_tx_queue) {
+                /* no more packets queued */
+                conn->ksnc_tx_mono = NULL;
+        } else {
+                conn->ksnc_tx_mono = list_entry(tx->tx_list.next, ksock_tx_t, tx_list);
+                LASSERT(conn->ksnc_tx_mono->tx_msg.ksm_type == tx->tx_msg.ksm_type);
+        }
+}
+
+int
+ksocknal_piggyback_zcack(ksock_conn_t *conn, __u64 cookie)
+{
+        ksock_tx_t     *tx = conn->ksnc_tx_mono;
+
+        /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */
+
+        if (tx == NULL)
+                return 0;
+
+        if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) {
+                /* tx is noop zc-ack, can't piggyback zc-ack cookie */
+                return 0;
+        }
+
+        LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET);
+        LASSERT(tx->tx_msg.ksm_zc_ack_cookie == 0);
+
+        /* piggyback the zc-ack cookie */
+        tx->tx_msg.ksm_zc_ack_cookie = cookie;
+        ksocknal_next_mono_tx(conn);
+
+        return 1;
+}
+
+void
 ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn)
 {
-        unsigned long  flags;
         ksock_sched_t *sched = conn->ksnc_scheduler;
+        ksock_msg_t   *msg = &tx->tx_msg;
+        ksock_tx_t    *ztx;
+        int            bufnob = 0;
 
         /* called holding global lock (read or irq-write) and caller may
          * not have dropped this lock between finding conn and calling me,
          * so we don't need the {get,put}connsock dance to deref
          * ksnc_sock... */
         LASSERT(!conn->ksnc_closing);
-        LASSERT(tx->tx_resid == tx->tx_nob);
 
-        CDEBUG (D_NET, "Sending to "LPX64" ip %d.%d.%d.%d:%d\n", 
-                conn->ksnc_peer->ksnp_nid,
+        CDEBUG (D_NET, "Sending to %s ip %d.%d.%d.%d:%d\n", 
+                libcfs_id2str(conn->ksnc_peer->ksnp_id),
                 HIPQUAD(conn->ksnc_ipaddr),
                 conn->ksnc_port);
 
+        conn->ksnc_proto->pro_pack(tx);
+
+        /* Ensure the frags we've been given EXACTLY match the number of
+         * bytes we want to send.  Many TCP/IP stacks disregard any total
+         * size parameters passed to them and just look at the frags. 
+         *
+         * We always expect at least 1 mapped fragment containing the
+         * complete ksocknal message header. */
+        LASSERT (lnet_iov_nob (tx->tx_niov, tx->tx_iov) +
+                 lnet_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob);
+        LASSERT (tx->tx_niov >= 1);
+        LASSERT (tx->tx_resid == tx->tx_nob);
+
+        CDEBUG (D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n",
+                tx, (tx->tx_lnetmsg != NULL)? tx->tx_lnetmsg->msg_hdr.type: 
+                                              KSOCK_MSG_NOOP, 
+                tx->tx_nob, tx->tx_niov, tx->tx_nkiov);
+        
         atomic_add (tx->tx_nob, &conn->ksnc_tx_nob);
         tx->tx_conn = conn;
+        ksocknal_conn_addref(conn); /* +1 ref for tx */
 
-#if SOCKNAL_ZC
-        zccd_init (&tx->tx_zccd, ksocknal_zc_callback);
-        /* NB this sets 1 ref on zccd, so the callback can only occur after
-         * I've released this ref. */
-#endif
-        spin_lock_irqsave (&sched->kss_lock, flags);
+        /* 
+         * NB Darwin: SOCK_WMEM_QUEUED()->sock_getsockopt() will take
+         * a blockable lock(socket lock), so SOCK_WMEM_QUEUED can't be
+         * put in spinlock. 
+         */
+        bufnob = SOCK_WMEM_QUEUED(conn->ksnc_sock);
+        spin_lock_bh (&sched->kss_lock);
 
-        if (list_empty(&conn->ksnc_tx_queue) &&
-            SOCK_WMEM_QUEUED(conn->ksnc_sock) == 0) {
+        if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) {
                 /* First packet starts the timeout */
-                conn->ksnc_tx_deadline = cfs_time_shift(ksocknal_tunables.ksnd_io_timeout);
+                conn->ksnc_tx_deadline = 
+                        cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
                 conn->ksnc_tx_bufnob = 0;
                 mb();    /* order with adding to tx_queue */
         }
 
-        list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
+        ztx = NULL;
+
+        if (msg->ksm_type == KSOCK_MSG_NOOP) {
+                /* The packet is noop ZC ACK, try to piggyback the ack_cookie
+                 * on a normal packet so I don't need to send it */
+                LASSERT(msg->ksm_zc_req_cookie == 0);
+                LASSERT(msg->ksm_zc_ack_cookie != 0);
+
+                if (conn->ksnc_tx_mono != NULL) {
+                        if (ksocknal_piggyback_zcack(conn, msg->ksm_zc_ack_cookie)) {
+                                /* zc-ack cookie is piggybacked */
+                                atomic_sub (tx->tx_nob, &conn->ksnc_tx_nob);
+                                ztx = tx;       /* Put to freelist later */
+                        } else {
+                                /* no packet can piggyback zc-ack cookie */
+                                list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
+                        }
+                } else {
+                        /* It's the first mono-packet */
+                        conn->ksnc_tx_mono = tx;
+                        list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
+                }
+
+        } else {
+                /* It's a normal packet - can it piggback a noop zc-ack that
+                 * has been queued already? */
+                LASSERT(msg->ksm_zc_ack_cookie == 0);
+
+                if (conn->ksnc_proto == &ksocknal_protocol_v2x &&  /* V2.x packet */
+                    conn->ksnc_tx_mono != NULL) {          
+                        if (conn->ksnc_tx_mono->tx_msg.ksm_type == KSOCK_MSG_NOOP) {
+                                /* There is a noop zc-ack can be piggybacked */
+                                ztx = conn->ksnc_tx_mono;
+
+                                msg->ksm_zc_ack_cookie = ztx->tx_msg.ksm_zc_ack_cookie;
+                                ksocknal_next_mono_tx(conn);
+
+                                /* use tx to replace the noop zc-ack packet, ztx will
+                                 * be put to freelist later */
+                                list_add(&tx->tx_list, &ztx->tx_list);
+                                list_del(&ztx->tx_list);
+
+                                atomic_sub (ztx->tx_nob, &conn->ksnc_tx_nob);
+                        } else {
+                                /* no noop zc-ack packet, just enqueue it */
+                                LASSERT(conn->ksnc_tx_mono->tx_msg.ksm_type == KSOCK_MSG_LNET);
+                                list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
+                        }
+
+                } else if (conn->ksnc_proto == &ksocknal_protocol_v2x) {
+                        /* it's the first mono-packet, enqueue it */
+                        conn->ksnc_tx_mono = tx;
+                        list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
+                } else {
+                        /* V1.x packet, just enqueue it */
+                        list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
+                }
+        }
 
+        if (ztx != NULL)
+                list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs);
+                
         if (conn->ksnc_tx_ready &&      /* able to send */
             !conn->ksnc_tx_scheduled) { /* not scheduled to send */
                 /* +1 ref for scheduler */
-                atomic_inc (&conn->ksnc_refcount);
+                ksocknal_conn_addref(conn);
                 list_add_tail (&conn->ksnc_tx_list, 
                                &sched->kss_tx_conns);
                 conn->ksnc_tx_scheduled = 1;
                 cfs_waitq_signal (&sched->kss_waitq);
         }
 
-        spin_unlock_irqrestore (&sched->kss_lock, flags);
+        spin_unlock_bh (&sched->kss_lock);
 }
 
 ksock_route_t *
@@ -683,33 +844,28 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
 {
         struct list_head  *tmp;
         ksock_route_t     *route;
-        int                bits;
-
+        
         list_for_each (tmp, &peer->ksnp_routes) {
                 route = list_entry (tmp, ksock_route_t, ksnr_list);
-                bits  = route->ksnr_connected;
 
-                if (ksocknal_tunables.ksnd_typed_conns) {
-                        /* All typed connections established? */
-                        if ((bits & KSNR_TYPED_ROUTES) == KSNR_TYPED_ROUTES)
-                                continue;
-                } else {
-                        /* Untyped connection established? */
-                        if ((bits & (1 << SOCKNAL_CONN_ANY)) != 0)
-                                continue;
-                }
+                LASSERT (!route->ksnr_connecting || route->ksnr_scheduled);
 
-                /* connection being established? */
-                if (route->ksnr_connecting)
+                if (route->ksnr_scheduled)      /* connections being established */
                         continue;
 
-                /* too soon to retry this guy? */
-                if (!cfs_time_aftereq (cfs_time_current(), route->ksnr_timeout))
+                /* all route types connected ? */
+                if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0)
                         continue;
 
+                /* too soon to retry this guy? */
+                if (!(route->ksnr_retry_interval == 0 || /* first attempt */
+                      cfs_time_aftereq (cfs_time_current(), 
+                                        route->ksnr_timeout)))
+                        continue;
+                
                 return (route);
         }
-
+        
         return (NULL);
 }
 
@@ -722,687 +878,560 @@ ksocknal_find_connecting_route_locked (ksock_peer_t *peer)
         list_for_each (tmp, &peer->ksnp_routes) {
                 route = list_entry (tmp, ksock_route_t, ksnr_list);
 
-                if (route->ksnr_connecting)
+                LASSERT (!route->ksnr_connecting || route->ksnr_scheduled);
+                
+                if (route->ksnr_scheduled)
                         return (route);
         }
-
+        
         return (NULL);
 }
 
 int
-ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid)
+ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id)
 {
-        unsigned long     flags;
         ksock_peer_t     *peer;
         ksock_conn_t     *conn;
         ksock_route_t    *route;
         rwlock_t         *g_lock;
-
-        /* Ensure the frags we've been given EXACTLY match the number of
-         * bytes we want to send.  Many TCP/IP stacks disregard any total
-         * size parameters passed to them and just look at the frags. 
-         *
-         * We always expect at least 1 mapped fragment containing the
-         * complete portals header. */
-        LASSERT (lib_iov_nob (tx->tx_niov, tx->tx_iov) +
-                 lib_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob);
-        LASSERT (tx->tx_niov >= 1);
-        LASSERT (tx->tx_iov[0].iov_len >= sizeof (ptl_hdr_t));
-
-        CDEBUG (D_NET, "packet %p type %d, nob %d niov %d nkiov %d\n",
-                tx, ((ptl_hdr_t *)tx->tx_iov[0].iov_base)->type, 
-                tx->tx_nob, tx->tx_niov, tx->tx_nkiov);
-
-        tx->tx_conn = NULL;                     /* only set when assigned a conn */
-        tx->tx_resid = tx->tx_nob;
-        tx->tx_hdr = (ptl_hdr_t *)tx->tx_iov[0].iov_base;
+        int               retry;
+        int               rc;
+        
+        LASSERT (tx->tx_conn == NULL);
+        LASSERT (tx->tx_lnetmsg != NULL);
 
         g_lock = &ksocknal_data.ksnd_global_lock;
+        
+        for (retry = 0;; retry = 1) {
 #if !SOCKNAL_ROUND_ROBIN
-        read_lock (g_lock);
-
-        peer = ksocknal_find_target_peer_locked (tx, nid);
-        if (peer == NULL) {
-                read_unlock (g_lock);
-                return (-EHOSTUNREACH);
-        }
-
-        if (ksocknal_find_connectable_route_locked(peer) == NULL) {
-                conn = ksocknal_find_conn_locked (tx, peer);
-                if (conn != NULL) {
-                        /* I've got no autoconnect routes that need to be
-                         * connecting and I do have an actual connection... */
-                        ksocknal_queue_tx_locked (tx, conn);
-                        read_unlock (g_lock);
-                        return (0);
+                read_lock (g_lock);
+                peer = ksocknal_find_peer_locked(ni, id);
+                if (peer != NULL) {
+                        if (ksocknal_find_connectable_route_locked(peer) == NULL) {
+                                conn = ksocknal_find_conn_locked (tx->tx_lnetmsg->msg_len, peer);
+                                if (conn != NULL) {
+                                        /* I've got no routes that need to be
+                                         * connecting and I do have an actual
+                                         * connection... */
+                                        ksocknal_queue_tx_locked (tx, conn);
+                                        read_unlock (g_lock);
+                                        return (0);
+                                }
+                        }
                 }
-        }
-
-        /* I'll need a write lock... */
-        read_unlock (g_lock);
+                /* I'll need a write lock... */
+                read_unlock (g_lock);
 #endif
-        write_lock_irqsave(g_lock, flags);
+                write_lock_bh (g_lock);
+
+                peer = ksocknal_find_peer_locked(ni, id);
+                if (peer != NULL) 
+                        break;
+                
+                write_unlock_bh (g_lock);
 
-        peer = ksocknal_find_target_peer_locked (tx, nid);
-        if (peer == NULL) {
-                write_unlock_irqrestore(g_lock, flags);
-                return (-EHOSTUNREACH);
+                if ((id.pid & LNET_PID_USERFLAG) != 0) {
+                        CERROR("Refusing to create a connection to "
+                               "userspace process %s\n", libcfs_id2str(id));
+                        return -EHOSTUNREACH;
+                }
+                
+                if (retry) {
+                        CERROR("Can't find peer %s\n", libcfs_id2str(id));
+                        return -EHOSTUNREACH;
+                }
+                
+                rc = ksocknal_add_peer(ni, id, 
+                                       LNET_NIDADDR(id.nid),
+                                       lnet_acceptor_port());
+                if (rc != 0) {
+                        CERROR("Can't add peer %s: %d\n",
+                               libcfs_id2str(id), rc);
+                        return rc;
+                }
         }
 
         for (;;) {
-                /* launch any/all autoconnections that need it */
+                /* launch any/all connections that need it */
                 route = ksocknal_find_connectable_route_locked (peer);
                 if (route == NULL)
                         break;
 
-                ksocknal_launch_autoconnect_locked (route);
+                ksocknal_launch_connection_locked (route);
         }
 
-        conn = ksocknal_find_conn_locked (tx, peer);
+        conn = ksocknal_find_conn_locked (tx->tx_lnetmsg->msg_len, peer);
         if (conn != NULL) {
                 /* Connection exists; queue message on it */
                 ksocknal_queue_tx_locked (tx, conn);
-                write_unlock_irqrestore (g_lock, flags);
+                write_unlock_bh (g_lock);
                 return (0);
         }
 
-        route = ksocknal_find_connecting_route_locked (peer);
-        if (route != NULL) {
-                /* At least 1 connection is being established; queue the
-                 * message... */
+        if (peer->ksnp_accepting > 0 ||
+            ksocknal_find_connecting_route_locked (peer) != NULL) {
+                /* Queue the message until a connection is established */
                 list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue);
-                write_unlock_irqrestore (g_lock, flags);
-                return (0);
+                write_unlock_bh (g_lock);
+                return 0;
         }
+        
+        write_unlock_bh (g_lock);
 
-        write_unlock_irqrestore (g_lock, flags);
+        /* NB Routes may be ignored if connections to them failed recently */
+        CDEBUG(D_NETERROR, "No usable routes to %s\n", libcfs_id2str(id));
         return (-EHOSTUNREACH);
 }
 
-ptl_err_t
-ksocknal_sendmsg(lib_nal_t     *nal, 
-                 void         *private, 
-                 lib_msg_t    *cookie,
-                 ptl_hdr_t    *hdr, 
-                 int           type, 
-                 ptl_nid_t     nid, 
-                 ptl_pid_t     pid,
-                 unsigned int  payload_niov, 
-                 struct iovec *payload_iov, 
-                 ptl_kiov_t   *payload_kiov,
-                 size_t        payload_offset,
-                 size_t        payload_nob)
+int
+ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
 {
-        ksock_ltx_t  *ltx;
-        int           desc_size;
-        int           rc;
+        int               type = lntmsg->msg_type; 
+        lnet_process_id_t target = lntmsg->msg_target;
+        unsigned int      payload_niov = lntmsg->msg_niov; 
+        struct iovec     *payload_iov = lntmsg->msg_iov; 
+        lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
+        unsigned int      payload_offset = lntmsg->msg_offset;
+        unsigned int      payload_nob = lntmsg->msg_len;
+        ksock_tx_t       *tx;
+        int               desc_size;
+        int               rc;
 
         /* NB 'private' is different depending on what we're sending.
          * Just ignore it... */
 
-        CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64
-               " pid %d\n", payload_nob, payload_niov, nid , pid);
+        CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n",
+               payload_nob, payload_niov, libcfs_id2str(target));
 
         LASSERT (payload_nob == 0 || payload_niov > 0);
-        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
-
-        /* It must be OK to kmap() if required */
-        LASSERT (payload_kiov == NULL || !in_interrupt ());
+        LASSERT (payload_niov <= LNET_MAX_IOV);
         /* payload is either all vaddrs or all pages */
         LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
-
+        LASSERT (!in_interrupt ());
+        
         if (payload_iov != NULL)
-                desc_size = offsetof(ksock_ltx_t, ltx_iov[1 + payload_niov]);
+                desc_size = offsetof(ksock_tx_t, 
+                                     tx_frags.virt.iov[1 + payload_niov]);
         else
-                desc_size = offsetof(ksock_ltx_t, ltx_kiov[payload_niov]);
-
-        if (in_interrupt() ||
-            type == PTL_MSG_ACK ||
-            type == PTL_MSG_REPLY) {
-                /* Can't block if in interrupt or responding to an incoming
-                 * message */
-                PORTAL_ALLOC_ATOMIC(ltx, desc_size);
-        } else {
-                PORTAL_ALLOC(ltx, desc_size);
-        }
-
-        if (ltx == NULL) {
-                CERROR("Can't allocate tx desc type %d size %d %s\n",
-                       type, desc_size, in_interrupt() ? "(intr)" : "");
-                return (PTL_NO_SPACE);
+                desc_size = offsetof(ksock_tx_t, 
+                                     tx_frags.paged.kiov[payload_niov]);
+        
+        tx = ksocknal_alloc_tx(desc_size);
+        if (tx == NULL) {
+                CERROR("Can't allocate tx desc type %d size %d\n",
+                       type, desc_size);
+                return (-ENOMEM);
         }
 
-        atomic_inc(&ksocknal_data.ksnd_nactive_ltxs);
-
-        ltx->ltx_desc_size = desc_size;
-
-        /* We always have 1 mapped frag for the header */
-        ltx->ltx_tx.tx_iov = ltx->ltx_iov;
-        ltx->ltx_iov[0].iov_base = &ltx->ltx_hdr;
-        ltx->ltx_iov[0].iov_len = sizeof(*hdr);
-        ltx->ltx_hdr = *hdr;
-
-        ltx->ltx_private = private;
-        ltx->ltx_cookie = cookie;
-
-        ltx->ltx_tx.tx_isfwd = 0;
-        ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_nob;
+        tx->tx_conn = NULL;                     /* set when assigned a conn */
+        tx->tx_lnetmsg = lntmsg;
 
         if (payload_iov != NULL) {
-                /* payload is all mapped */
-                ltx->ltx_tx.tx_kiov  = NULL;
-                ltx->ltx_tx.tx_nkiov = 0;
-
-                ltx->ltx_tx.tx_niov = 
-                        1 + lib_extract_iov(payload_niov, &ltx->ltx_iov[1],
-                                            payload_niov, payload_iov,
-                                            payload_offset, payload_nob);
+                tx->tx_kiov = NULL;
+                tx->tx_nkiov = 0;
+                tx->tx_iov = tx->tx_frags.virt.iov;
+                tx->tx_niov = 1 + 
+                              lnet_extract_iov(payload_niov, &tx->tx_iov[1],
+                                               payload_niov, payload_iov,
+                                               payload_offset, payload_nob);
         } else {
-                /* payload is all pages */
-                ltx->ltx_tx.tx_niov = 1;
-
-                ltx->ltx_tx.tx_kiov = ltx->ltx_kiov;
-                ltx->ltx_tx.tx_nkiov =
-                        lib_extract_kiov(payload_niov, ltx->ltx_kiov,
-                                         payload_niov, payload_kiov,
-                                         payload_offset, payload_nob);
+                tx->tx_niov = 1;
+                tx->tx_iov = &tx->tx_frags.paged.iov;
+                tx->tx_kiov = tx->tx_frags.paged.kiov;
+                tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov,
+                                                 payload_niov, payload_kiov,
+                                                 payload_offset, payload_nob);
         }
 
-        rc = ksocknal_launch_packet(&ltx->ltx_tx, nid);
-        if (rc == 0)
-                return (PTL_OK);
-
-        ksocknal_free_ltx(ltx);
-        return (PTL_FAIL);
-}
-
-ptl_err_t
-ksocknal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
-               ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
-               unsigned int payload_niov, struct iovec *payload_iov,
-               size_t payload_offset, size_t payload_len)
-{
-        return (ksocknal_sendmsg(nal, private, cookie,
-                                 hdr, type, nid, pid,
-                                 payload_niov, payload_iov, NULL,
-                                 payload_offset, payload_len));
-}
-
-ptl_err_t
-ksocknal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
-                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
-                     unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
-                     size_t payload_offset, size_t payload_len)
-{
-        return (ksocknal_sendmsg(nal, private, cookie,
-                                 hdr, type, nid, pid,
-                                 payload_niov, NULL, payload_kiov,
-                                 payload_offset, payload_len));
-}
-
-void
-ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
-{
-        ptl_nid_t     nid = fwd->kprfd_gateway_nid;
-        ksock_ftx_t  *ftx = (ksock_ftx_t *)&fwd->kprfd_scratch;
-        int           rc;
-
-        CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd,
-                fwd->kprfd_gateway_nid, fwd->kprfd_target_nid);
+        ksocknal_init_msg(&tx->tx_msg, KSOCK_MSG_LNET);
 
-        /* I'm the gateway; must be the last hop */
-        if (nid == ksocknal_lib.libnal_ni.ni_pid.nid)
-                nid = fwd->kprfd_target_nid;
-
-        /* setup iov for hdr */
-        ftx->ftx_iov.iov_base = fwd->kprfd_hdr;
-        ftx->ftx_iov.iov_len = sizeof(ptl_hdr_t);
-
-        ftx->ftx_tx.tx_isfwd = 1;                  /* This is a forwarding packet */
-        ftx->ftx_tx.tx_nob   = sizeof(ptl_hdr_t) + fwd->kprfd_nob;
-        ftx->ftx_tx.tx_niov  = 1;
-        ftx->ftx_tx.tx_iov   = &ftx->ftx_iov;
-        ftx->ftx_tx.tx_nkiov = fwd->kprfd_niov;
-        ftx->ftx_tx.tx_kiov  = fwd->kprfd_kiov;
-
-        rc = ksocknal_launch_packet (&ftx->ftx_tx, nid);
-        if (rc != 0)
-                kpr_fwd_done (&ksocknal_data.ksnd_router, fwd, rc);
+        /* The first fragment will be set later in pro_pack */
+        rc = ksocknal_launch_packet(ni, tx, target);
+        if (rc == 0)
+                return (0);
+        
+        ksocknal_free_tx(tx);
+        return (-EIO);
 }
 
 int
 ksocknal_thread_start (int (*fn)(void *arg), void *arg)
 {
         long          pid = cfs_kernel_thread (fn, arg, 0);
-        unsigned long flags;
 
         if (pid < 0)
                 return ((int)pid);
 
-        write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags);
+        write_lock_bh (&ksocknal_data.ksnd_global_lock);
         ksocknal_data.ksnd_nthreads++;
-        write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags);
+        write_unlock_bh (&ksocknal_data.ksnd_global_lock);
         return (0);
 }
 
 void
 ksocknal_thread_fini (void)
 {
-        unsigned long flags;
-
-        write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags);
+        write_lock_bh (&ksocknal_data.ksnd_global_lock);
         ksocknal_data.ksnd_nthreads--;
-        write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags);
+        write_unlock_bh (&ksocknal_data.ksnd_global_lock);
 }
 
-void
-ksocknal_fmb_callback (void *arg, int error)
+int
+ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip)
 {
-        ksock_fmb_t       *fmb = (ksock_fmb_t *)arg;
-        ksock_fmb_pool_t  *fmp = fmb->fmb_pool;
-        ptl_hdr_t         *hdr = &fmb->fmb_hdr;
-        ksock_conn_t      *conn = NULL;
-        ksock_sched_t     *sched;
-        unsigned long      flags;
-        char               ipbuf[PTL_NALFMT_SIZE];
-        char               ipbuf2[PTL_NALFMT_SIZE];
-
-        if (error != 0)
-                CERROR("Failed to route packet from "
-                       LPX64" %s to "LPX64" %s: %d\n",
-                       le64_to_cpu(hdr->src_nid),
-                       portals_nid2str(SOCKNAL, le64_to_cpu(hdr->src_nid), ipbuf),
-                       le64_to_cpu(hdr->dest_nid),
-                       portals_nid2str(SOCKNAL, le64_to_cpu(hdr->dest_nid), ipbuf2),
-                       error);
-        else
-                CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": OK\n",
-                        le64_to_cpu(hdr->src_nid), le64_to_cpu(hdr->dest_nid));
-
-        /* drop peer ref taken on init */
-        ksocknal_put_peer (fmb->fmb_peer);
+        static char ksocknal_slop_buffer[4096];
 
-        spin_lock_irqsave (&fmp->fmp_lock, flags);
+        int            nob;
+        unsigned int   niov;
+        int            skipped;
 
-        list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs);
-        fmp->fmp_nactive_fmbs--;
+        LASSERT(conn->ksnc_proto != NULL);
 
-        if (!list_empty (&fmp->fmp_blocked_conns)) {
-                conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next,
-                                   ksock_conn_t, ksnc_rx_list);
-                list_del (&conn->ksnc_rx_list);
+        if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0) {
+                /* Remind the socket to ack eagerly... */
+                ksocknal_lib_eager_ack(conn);
         }
 
-        spin_unlock_irqrestore (&fmp->fmp_lock, flags);
+        if (nob_to_skip == 0) {         /* right at next packet boundary now */
+                conn->ksnc_rx_started = 0;
+                mb ();                          /* racing with timeout thread */
+                
+                switch (conn->ksnc_proto->pro_version) {
+                case  KSOCK_PROTO_V2:
+                        conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER;
+                        conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+                        conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg;
+
+                        if (conn->ksnc_type == SOCKLND_CONN_BULK_IN) {
+                                /* always expect lnet_hdr_t to avoid extra-read for better performance */
+                                conn->ksnc_rx_nob_wanted = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_payload);
+                                conn->ksnc_rx_nob_left = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_payload);
+                                conn->ksnc_rx_iov[0].iov_len  = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_payload);
+
+                        } else {
+                                /* can't tell yet whether it's a noop or not */
+                                conn->ksnc_rx_nob_wanted = offsetof(ksock_msg_t, ksm_u);
+                                conn->ksnc_rx_nob_left = offsetof(ksock_msg_t, ksm_u);
+                                conn->ksnc_rx_iov[0].iov_len  = offsetof(ksock_msg_t, ksm_u);
+                        } 
+                        break;
 
-        if (conn == NULL)
-                return;
+                case KSOCK_PROTO_V1:
+                        /* Receiving bare lnet_hdr_t */
+                        conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
+                        conn->ksnc_rx_nob_wanted = sizeof(lnet_hdr_t);
+                        conn->ksnc_rx_nob_left = sizeof(lnet_hdr_t);
 
-        CDEBUG (D_NET, "Scheduling conn %p\n", conn);
-        LASSERT (conn->ksnc_rx_scheduled);
-        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP);
+                        conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+                        conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg;
+                        conn->ksnc_rx_iov[0].iov_len  = sizeof (lnet_hdr_t);
+                        break;
 
-        conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;
+                default:
+                        LBUG ();
+                } 
+                conn->ksnc_rx_niov = 1;
 
-        sched = conn->ksnc_scheduler;
+                conn->ksnc_rx_kiov = NULL;
+                conn->ksnc_rx_nkiov = 0;
+                conn->ksnc_rx_csum = ~0;
+                return (1);
+        }
+
+        /* Set up to skip as much as possible now.  If there's more left
+         * (ran out of iov entries) we'll get called again */
+
+        conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
+        conn->ksnc_rx_nob_left = nob_to_skip;
+        conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+        skipped = 0;
+        niov = 0;
+
+        do {
+                nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer));
 
-        spin_lock_irqsave (&sched->kss_lock, flags);
+                conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer;
+                conn->ksnc_rx_iov[niov].iov_len  = nob;
+                niov++;
+                skipped += nob;
+                nob_to_skip -=nob;
 
-        list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns);
-        cfs_waitq_signal (&sched->kss_waitq);
+        } while (nob_to_skip != 0 &&    /* mustn't overflow conn's rx iov */
+                 niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec));
 
-        spin_unlock_irqrestore (&sched->kss_lock, flags);
+        conn->ksnc_rx_niov = niov;
+        conn->ksnc_rx_kiov = NULL;
+        conn->ksnc_rx_nkiov = 0;
+        conn->ksnc_rx_nob_wanted = skipped;
+        return (0);
 }
 
-ksock_fmb_t *
-ksocknal_get_idle_fmb (ksock_conn_t *conn)
+/* (Sink) handle incoming ZC request from sender */
+static int
+ksocknal_handle_zc_req(ksock_peer_t *peer, __u64 cookie)
 {
-        int               payload_nob = conn->ksnc_rx_nob_left;
-        unsigned long     flags;
-        ksock_fmb_pool_t *pool;
-        ksock_fmb_t      *fmb;
+        ksock_conn_t   *conn;
+        ksock_tx_t     *tx;
+        ksock_sched_t  *sched;
+        int             rc;
 
-        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
-        LASSERT (kpr_routing(&ksocknal_data.ksnd_router));
+        read_lock (&ksocknal_data.ksnd_global_lock);
 
-        if (payload_nob <= SOCKNAL_SMALL_FWD_PAGES * CFS_PAGE_SIZE)
-                pool = &ksocknal_data.ksnd_small_fmp;
-        else
-                pool = &ksocknal_data.ksnd_large_fmp;
+        conn = ksocknal_find_conn_locked (0, peer);
+        if (conn == NULL) {
+                read_unlock (&ksocknal_data.ksnd_global_lock);
+                CERROR("Can't find connection to send zcack.\n");
+                return -ECONNRESET;
+        }
 
-        spin_lock_irqsave (&pool->fmp_lock, flags);
+        sched = conn->ksnc_scheduler;
 
-        if (!list_empty (&pool->fmp_idle_fmbs)) {
-                fmb = list_entry(pool->fmp_idle_fmbs.next,
-                                 ksock_fmb_t, fmb_list);
-                list_del (&fmb->fmb_list);
-                pool->fmp_nactive_fmbs++;
-                spin_unlock_irqrestore (&pool->fmp_lock, flags);
+        spin_lock_bh (&sched->kss_lock);
+        rc = ksocknal_piggyback_zcack(conn, cookie);
+        spin_unlock_bh (&sched->kss_lock);
 
-                return (fmb);
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+        if (rc) {
+                /* Ack cookie is piggybacked */
+                return 0;
         }
 
-        /* deschedule until fmb free */
+        tx = ksocknal_alloc_tx(KSOCK_NOOP_TX_SIZE);
+        if (tx == NULL) {
+                CERROR("Can't allocate noop tx desc\n");
+                return -ENOMEM;
+        }
 
-        conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP;
+        tx->tx_conn     = NULL;
+        tx->tx_lnetmsg  = NULL;
+        tx->tx_kiov     = NULL;
+        tx->tx_nkiov    = 0;
+        tx->tx_iov      = tx->tx_frags.virt.iov;
+        tx->tx_niov     = 1;
 
-        list_add_tail (&conn->ksnc_rx_list,
-                       &pool->fmp_blocked_conns);
+        ksocknal_init_msg(&tx->tx_msg, KSOCK_MSG_NOOP);
+        tx->tx_msg.ksm_zc_ack_cookie = cookie; /* incoming cookie */
 
-        spin_unlock_irqrestore (&pool->fmp_lock, flags);
-        return (NULL);
-}
-
-int
-ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb)
-{
-        int       payload_nob = conn->ksnc_rx_nob_left;
-        ptl_nid_t dest_nid = le64_to_cpu(conn->ksnc_hdr.dest_nid);
-        int       niov = 0;
-        int       nob = payload_nob;
-
-        LASSERT (conn->ksnc_rx_scheduled);
-        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB);
-        LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left);
-        LASSERT (payload_nob >= 0);
-        LASSERT (payload_nob <= fmb->fmb_pool->fmp_buff_pages * CFS_PAGE_SIZE);
-        LASSERT (sizeof (ptl_hdr_t) < CFS_PAGE_SIZE);
-        LASSERT (fmb->fmb_kiov[0].kiov_offset == 0);
-
-        /* Take a ref on the conn's peer to prevent module unload before
-         * forwarding completes. */
-        fmb->fmb_peer = conn->ksnc_peer;
-        atomic_inc (&conn->ksnc_peer->ksnp_refcount);
-
-        /* Copy the header we just read into the forwarding buffer.  If
-         * there's payload, start reading reading it into the buffer,
-         * otherwise the forwarding buffer can be kicked off
-         * immediately. */
-        fmb->fmb_hdr = conn->ksnc_hdr;
-
-        while (nob > 0) {
-                LASSERT (niov < fmb->fmb_pool->fmp_buff_pages);
-                LASSERT (fmb->fmb_kiov[niov].kiov_offset == 0);
-                fmb->fmb_kiov[niov].kiov_len = MIN (CFS_PAGE_SIZE, nob);
-                nob -= CFS_PAGE_SIZE;
-                niov++;
-        }
-
-        kpr_fwd_init(&fmb->fmb_fwd, dest_nid, &fmb->fmb_hdr,
-                     payload_nob, niov, fmb->fmb_kiov,
-                     ksocknal_fmb_callback, fmb);
-
-        if (payload_nob == 0) {         /* got complete packet already */
-                CDEBUG (D_NET, "%p "LPX64"->"LPX64" fwd_start (immediate)\n",
-                        conn, le64_to_cpu(conn->ksnc_hdr.src_nid), dest_nid);
-
-                kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd);
+        read_lock (&ksocknal_data.ksnd_global_lock);
 
-                ksocknal_new_packet (conn, 0);  /* on to next packet */
-                return (1);
+        conn = ksocknal_find_conn_locked (0, peer);
+        if (conn == NULL) {
+                read_unlock (&ksocknal_data.ksnd_global_lock);
+                ksocknal_free_tx(tx);
+                CERROR("Can't find connection to send zcack.\n");
+                return -ECONNRESET;
         }
+        ksocknal_queue_tx_locked(tx, conn);
 
-        conn->ksnc_cookie = fmb;                /* stash fmb for later */
-        conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */
-
-        /* Set up conn->ksnc_rx_kiov to read the payload into fmb's kiov-ed
-         * buffer */
-        LASSERT (niov <= sizeof(conn->ksnc_rx_iov_space)/sizeof(ptl_kiov_t));
-
-        conn->ksnc_rx_niov = 0;
-        conn->ksnc_rx_nkiov = niov;
-        conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
-        memcpy(conn->ksnc_rx_kiov, fmb->fmb_kiov, niov * sizeof(ptl_kiov_t));
-
-        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn,
-                le64_to_cpu(conn->ksnc_hdr.src_nid), dest_nid, payload_nob);
-        return (0);
-}
-
-void
-ksocknal_fwd_parse (ksock_conn_t *conn)
-{
-        ksock_peer_t *peer;
-        ptl_nid_t     dest_nid = le64_to_cpu(conn->ksnc_hdr.dest_nid);
-        ptl_nid_t     src_nid = le64_to_cpu(conn->ksnc_hdr.src_nid);
-        int           body_len = le32_to_cpu(conn->ksnc_hdr.payload_length);
-        char str[PTL_NALFMT_SIZE];
-        char str2[PTL_NALFMT_SIZE];
-
-        CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn,
-                src_nid, dest_nid, conn->ksnc_rx_nob_left);
-
-        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER);
-        LASSERT (conn->ksnc_rx_scheduled);
+        read_unlock (&ksocknal_data.ksnd_global_lock);
 
-        if (body_len < 0) {                 /* length corrupt (overflow) */
-                CERROR("dropping packet from "LPX64" (%s) for "LPX64" (%s): "
-                       "packet size %d illegal\n",
-                       src_nid, portals_nid2str(TCPNAL, src_nid, str),
-                       dest_nid, portals_nid2str(TCPNAL, dest_nid, str2),
-                       body_len);
-
-                ksocknal_new_packet (conn, 0);  /* on to new packet */
-                return;
-        }
-
-        if (!kpr_routing(&ksocknal_data.ksnd_router)) {    /* not forwarding */
-                CERROR("dropping packet from "LPX64" (%s) for "LPX64
-                       " (%s): not forwarding\n",
-                       src_nid, portals_nid2str(TCPNAL, src_nid, str),
-                       dest_nid, portals_nid2str(TCPNAL, dest_nid, str2));
-                /* on to new packet (skip this one's body) */
-                ksocknal_new_packet (conn, body_len);
-                return;
-        }
-
-        if (body_len > PTL_MTU) {      /* too big to forward */
-                CERROR ("dropping packet from "LPX64" (%s) for "LPX64
-                        "(%s): packet size %d too big\n",
-                        src_nid, portals_nid2str(TCPNAL, src_nid, str),
-                        dest_nid, portals_nid2str(TCPNAL, dest_nid, str2),
-                        body_len);
-                /* on to new packet (skip this one's body) */
-                ksocknal_new_packet (conn, body_len);
-                return;
-        }
-
-        /* should have gone direct */
-        peer = ksocknal_get_peer (conn->ksnc_hdr.dest_nid);
-        if (peer != NULL) {
-                CERROR ("dropping packet from "LPX64" (%s) for "LPX64
-                        "(%s): target is a peer\n",
-                        src_nid, portals_nid2str(TCPNAL, src_nid, str),
-                        dest_nid, portals_nid2str(TCPNAL, dest_nid, str2));
-                ksocknal_put_peer (peer);  /* drop ref from get above */
-
-                /* on to next packet (skip this one's body) */
-                ksocknal_new_packet (conn, body_len);
-                return;
-        }
-
-        conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB;       /* Getting FMB now */
-        conn->ksnc_rx_nob_left = body_len;              /* stash packet size */
-        conn->ksnc_rx_nob_wanted = body_len;            /* (no slop) */
+        return 0;
 }
 
-int
-ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip)
+/* (Sender) handle ZC_ACK from sink */
+static int
+ksocknal_handle_zc_ack(ksock_peer_t *peer, __u64 cookie)
 {
-        static char ksocknal_slop_buffer[4096];
+        ksock_tx_t             *tx;
+        struct list_head       *ctmp;
 
-        int   nob;
-        int   niov;
-        int   skipped;
+        spin_lock(&peer->ksnp_lock);
 
-        if (nob_to_skip == 0) {         /* right at next packet boundary now */
-                conn->ksnc_rx_started = 0;
-                mb ();                          /* racing with timeout thread */
+        list_for_each(ctmp, &peer->ksnp_zc_req_list) {
+                tx = list_entry (ctmp, ksock_tx_t, tx_zc_list); 
+                if (tx->tx_msg.ksm_zc_req_cookie != cookie) 
+                        continue;
 
-                conn->ksnc_rx_state = SOCKNAL_RX_HEADER;
-                conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t);
-                conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t);
+                list_del(&tx->tx_zc_list);
+                spin_unlock(&peer->ksnp_lock);
 
-                conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
-                conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr;
-                conn->ksnc_rx_iov[0].iov_len  = sizeof (ptl_hdr_t);
-                conn->ksnc_rx_niov = 1;
+                ksocknal_tx_decref(tx);
 
-                conn->ksnc_rx_kiov = NULL;
-                conn->ksnc_rx_nkiov = 0;
-                return (1);
+                return 0;
         }
+        spin_unlock(&peer->ksnp_lock);
 
-        /* Set up to skip as much a possible now.  If there's more left
-         * (ran out of iov entries) we'll get called again */
-
-        conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
-        conn->ksnc_rx_nob_left = nob_to_skip;
-        conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
-        skipped = 0;
-        niov = 0;
-
-        do {
-                nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer));
-
-                conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer;
-                conn->ksnc_rx_iov[niov].iov_len  = nob;
-                niov++;
-                skipped += nob;
-                nob_to_skip -=nob;
-
-        } while (nob_to_skip != 0 &&    /* mustn't overflow conn's rx iov */
-                 niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec));
-
-        conn->ksnc_rx_niov = niov;
-        conn->ksnc_rx_kiov = NULL;
-        conn->ksnc_rx_nkiov = 0;
-        conn->ksnc_rx_nob_wanted = skipped;
-        return (0);
+        return -EPROTO;
 }
 
 int
 ksocknal_process_receive (ksock_conn_t *conn)
 {
-        ksock_fmb_t  *fmb;
         int           rc;
+        
+        LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
 
-        LASSERT (atomic_read (&conn->ksnc_refcount) > 0);
-
-        /* doesn't need a forwarding buffer */
-        if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB)
-                goto try_read;
-
- get_fmb:
-        fmb = ksocknal_get_idle_fmb (conn);
-        if (fmb == NULL) {
-                /* conn descheduled waiting for idle fmb */
-                return (0);
-        }
-
-        if (ksocknal_init_fmb (conn, fmb)) {
-                /* packet forwarded */
-                return (0);
-        }
-
- try_read:
         /* NB: sched lock NOT held */
-        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER ||
-                 conn->ksnc_rx_state == SOCKNAL_RX_BODY ||
-                 conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD ||
+        /* SOCKNAL_RX_LNET_HEADER is here for backward compatability */
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER ||
                  conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+ again:
+        if (conn->ksnc_rx_nob_wanted != 0) {
+                rc = ksocknal_receive(conn);
 
-        LASSERT (conn->ksnc_rx_nob_wanted > 0);
-
-        rc = ksocknal_receive(conn);
-
-        if (rc <= 0) {
-                LASSERT (rc != -EAGAIN);
-
-                if (rc == 0)
-                        CWARN ("[%p] EOF from "LPX64" ip %d.%d.%d.%d:%d\n",
-                               conn, conn->ksnc_peer->ksnp_nid,
-                               HIPQUAD(conn->ksnc_ipaddr),
-                               conn->ksnc_port);
-                else if (!conn->ksnc_closing)
-                        CERROR ("[%p] Error %d on read from "LPX64
-                                " ip %d.%d.%d.%d:%d\n",
-                                conn, rc, conn->ksnc_peer->ksnp_nid,
-                                HIPQUAD(conn->ksnc_ipaddr),
-                                conn->ksnc_port);
+                if (rc <= 0) {
+                        LASSERT (rc != -EAGAIN);
 
-                ksocknal_close_conn_and_siblings (conn, rc);
-                return (rc == 0 ? -ESHUTDOWN : rc);
+                        if (rc == 0)
+                                CDEBUG (D_NET, "[%p] EOF from %s"
+                                        " ip %d.%d.%d.%d:%d\n", conn, 
+                                        libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                                        HIPQUAD(conn->ksnc_ipaddr),
+                                        conn->ksnc_port);
+                        else if (!conn->ksnc_closing)
+                                CERROR ("[%p] Error %d on read from %s"
+                                        " ip %d.%d.%d.%d:%d\n",
+                                        conn, rc, 
+                                        libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                                        HIPQUAD(conn->ksnc_ipaddr),
+                                        conn->ksnc_port);
+
+                        /* it's not an error if conn is being closed */
+                        ksocknal_close_conn_and_siblings (conn, 
+                                                          (conn->ksnc_closing) ? 0 : rc);
+                        return (rc == 0 ? -ESHUTDOWN : rc);
+                }
+                
+                if (conn->ksnc_rx_nob_wanted != 0) {
+                        /* short read */
+                        return (-EAGAIN);
+                }
         }
+        switch (conn->ksnc_rx_state) {
+        case SOCKNAL_RX_KSM_HEADER:
+                if (conn->ksnc_flip) {
+                        __swab32s(&conn->ksnc_msg.ksm_type);
+                        __swab32s(&conn->ksnc_msg.ksm_csum);
+                        __swab64s(&conn->ksnc_msg.ksm_zc_req_cookie);
+                        __swab64s(&conn->ksnc_msg.ksm_zc_ack_cookie);
+                } 
 
-        if (conn->ksnc_rx_nob_wanted != 0) {
-                /* short read */
-                return (-EAGAIN);
-        }
+                if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP &&
+                    conn->ksnc_msg.ksm_csum != 0 &&     /* has checksum */
+                    conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { 
+                        /* NOOP Checksum error */
+                        CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
+                               libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                               conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
+                        ksocknal_new_packet(conn, 0);
+                        ksocknal_close_conn_and_siblings(conn, -EPROTO);
+                        return (-EIO);
+                }
 
-        switch (conn->ksnc_rx_state) {
-        case SOCKNAL_RX_HEADER:
-                if (conn->ksnc_hdr.type != cpu_to_le32(PTL_MSG_HELLO) &&
-                    le64_to_cpu(conn->ksnc_hdr.dest_nid) != 
-                    ksocknal_lib.libnal_ni.ni_pid.nid) {
-                        /* This packet isn't for me */
-                        ksocknal_fwd_parse (conn);
-                        switch (conn->ksnc_rx_state) {
-                        case SOCKNAL_RX_HEADER: /* skipped (zero payload) */
-                                return (0);     /* => come back later */
-                        case SOCKNAL_RX_SLOP:   /* skipping packet's body */
-                                goto try_read;  /* => go read it */
-                        case SOCKNAL_RX_GET_FMB: /* forwarding */
-                                goto get_fmb;   /* => go get a fwd msg buffer */
-                        default:
-                                LBUG ();
+                if (conn->ksnc_msg.ksm_zc_ack_cookie != 0) {
+                        LASSERT(conn->ksnc_proto == &ksocknal_protocol_v2x);
+
+                        rc = ksocknal_handle_zc_ack(conn->ksnc_peer,
+                                                    conn->ksnc_msg.ksm_zc_ack_cookie);
+                        if (rc != 0) {
+                                CERROR("%s: Unknown zero copy ACK cookie: "LPU64"\n",
+                                       libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                                       conn->ksnc_msg.ksm_zc_ack_cookie);
+                                ksocknal_new_packet(conn, 0);
+                                ksocknal_close_conn_and_siblings(conn, -EPROTO);
+                                return (rc);
                         }
-                        /* Not Reached */
                 }
 
-                /* sets wanted_len, iovs etc */
-                rc = lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn);
+                if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) { 
+                        ksocknal_new_packet (conn, 0);
+                        return 0;       /* NOOP is done and just return */
+                }
+                LASSERT (conn->ksnc_msg.ksm_type == KSOCK_MSG_LNET);
+
+                if (conn->ksnc_type == SOCKLND_CONN_BULK_IN) {
+                        conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
+                        /* has read lnet_hdr_t already (re ksocknal_new_packet), fall through */
+                } else {
+                        conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER;
+                        conn->ksnc_rx_nob_wanted = sizeof(ksock_lnet_msg_t);
+                        conn->ksnc_rx_nob_left = sizeof(ksock_lnet_msg_t);
+        
+                        conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
+                        conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg;
+                        conn->ksnc_rx_iov[0].iov_len  = sizeof(ksock_lnet_msg_t);
+
+                        conn->ksnc_rx_niov = 1;
+                        conn->ksnc_rx_kiov = NULL;
+                        conn->ksnc_rx_nkiov = 0;
+
+                        goto again;     /* read lnet header now */
+                }
+
+        case SOCKNAL_RX_LNET_HEADER:
+                /* unpack message header */
+                conn->ksnc_proto->pro_unpack(&conn->ksnc_msg);
+
+                if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) { 
+                        /* Userspace peer */
+                        lnet_process_id_t *id = &conn->ksnc_peer->ksnp_id;
+                        lnet_hdr_t        *lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr;
+                        
+                        /* Substitute process ID assigned at connection time */
+                        lhdr->src_pid = cpu_to_le32(id->pid);
+                        lhdr->src_nid = cpu_to_le64(id->nid);
+                }
 
-                if (rc != PTL_OK) {
+                conn->ksnc_rx_state = SOCKNAL_RX_PARSE;
+                ksocknal_conn_addref(conn);     /* ++ref while parsing */
+                
+                rc = lnet_parse(conn->ksnc_peer->ksnp_ni, 
+                                &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr, 
+                                conn->ksnc_peer->ksnp_id.nid, conn, 0);
+                if (rc < 0) {
                         /* I just received garbage: give up on this conn */
+                        ksocknal_new_packet(conn, 0);
                         ksocknal_close_conn_and_siblings (conn, rc);
+                        ksocknal_conn_decref(conn);
                         return (-EPROTO);
                 }
 
-                if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */
-                        conn->ksnc_rx_state = SOCKNAL_RX_BODY;
-                        goto try_read;          /* go read the payload */
+                /* I'm racing with ksocknal_recv() */
+                LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_PARSE ||
+                         conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD);
+                
+                if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD)
+                        return 0;
+                
+                /* ksocknal_recv() got called */
+                goto again;
+
+        case SOCKNAL_RX_LNET_PAYLOAD:
+                /* payload all received */
+                rc = 0;
+
+                if (conn->ksnc_rx_nob_left == 0 &&   /* not truncating */
+                    conn->ksnc_msg.ksm_csum != 0 &&  /* has checksum */
+                    conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { 
+                        CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n",
+                               libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                               conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum);
+                        rc = -EIO;
                 }
-                /* Fall through (completed packet for me) */
 
-        case SOCKNAL_RX_BODY:
-                /* payload all received */
-                lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_OK);
+                lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc);
+
+                if (rc == 0 && conn->ksnc_msg.ksm_zc_req_cookie != 0) {
+                        LASSERT(conn->ksnc_proto == &ksocknal_protocol_v2x);
+                        rc = ksocknal_handle_zc_req(conn->ksnc_peer,
+                                                    conn->ksnc_msg.ksm_zc_req_cookie);
+                }
+
+                if (rc != 0) {
+                        ksocknal_new_packet(conn, 0);
+                        ksocknal_close_conn_and_siblings (conn, rc);
+                        return (-EPROTO);
+                }
                 /* Fall through */
 
         case SOCKNAL_RX_SLOP:
                 /* starting new packet? */
                 if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left))
-                        return (0);     /* come back later */
-                goto try_read;          /* try to finish reading slop now */
-
-        case SOCKNAL_RX_BODY_FWD:
-                /* payload all received */
-                CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n",
-                        conn, le64_to_cpu(conn->ksnc_hdr.src_nid),
-                        le64_to_cpu(conn->ksnc_hdr.dest_nid),
-                        conn->ksnc_rx_nob_left);
-
-                /* forward the packet. NB ksocknal_init_fmb() put fmb into
-                 * conn->ksnc_cookie */
-                fmb = (ksock_fmb_t *)conn->ksnc_cookie;
-                kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd);
-
-                /* no slop in forwarded packets */
-                LASSERT (conn->ksnc_rx_nob_left == 0);
-
-                ksocknal_new_packet (conn, 0);  /* on to next packet */
-                return (0);                     /* (later) */
+                        return 0;       /* come back later */
+                goto again;             /* try to finish reading slop now */
 
         default:
                 break;
@@ -1413,78 +1442,76 @@ ksocknal_process_receive (ksock_conn_t *conn)
         return (-EINVAL);                       /* keep gcc happy */
 }
 
-ptl_err_t
-ksocknal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
-               unsigned int niov, struct iovec *iov, 
-               size_t offset, size_t mlen, size_t rlen)
+int
+ksocknal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+               unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+               unsigned int offset, unsigned int mlen, unsigned int rlen)
 {
-        ksock_conn_t *conn = (ksock_conn_t *)private;
+        ksock_conn_t  *conn = (ksock_conn_t *)private;
+        ksock_sched_t *sched = conn->ksnc_scheduler;
 
         LASSERT (mlen <= rlen);
-        LASSERT (niov <= PTL_MD_MAX_IOV);
-
+        LASSERT (niov <= LNET_MAX_IOV);
+        
         conn->ksnc_cookie = msg;
         conn->ksnc_rx_nob_wanted = mlen;
         conn->ksnc_rx_nob_left   = rlen;
 
-        conn->ksnc_rx_nkiov = 0;
-        conn->ksnc_rx_kiov = NULL;
-        conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov;
-        conn->ksnc_rx_niov =
-                lib_extract_iov(PTL_MD_MAX_IOV, conn->ksnc_rx_iov,
-                                niov, iov, offset, mlen);
-
+        if (mlen == 0 || iov != NULL) {
+                conn->ksnc_rx_nkiov = 0;
+                conn->ksnc_rx_kiov = NULL;
+                conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov;
+                conn->ksnc_rx_niov =
+                        lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov,
+                                         niov, iov, offset, mlen);
+        } else {
+                conn->ksnc_rx_niov = 0;
+                conn->ksnc_rx_iov  = NULL;
+                conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
+                conn->ksnc_rx_nkiov = 
+                        lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov,
+                                          niov, kiov, offset, mlen);
+        }
+        
         LASSERT (mlen == 
-                 lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
-                 lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
-
-        return (PTL_OK);
-}
-
-ptl_err_t
-ksocknal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
-                     unsigned int niov, ptl_kiov_t *kiov, 
-                     size_t offset, size_t mlen, size_t rlen)
-{
-        ksock_conn_t *conn = (ksock_conn_t *)private;
-
-        LASSERT (mlen <= rlen);
-        LASSERT (niov <= PTL_MD_MAX_IOV);
+                 lnet_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
+                 lnet_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
 
-        conn->ksnc_cookie = msg;
-        conn->ksnc_rx_nob_wanted = mlen;
-        conn->ksnc_rx_nob_left   = rlen;
+        LASSERT (conn->ksnc_rx_scheduled);
 
-        conn->ksnc_rx_niov = 0;
-        conn->ksnc_rx_iov  = NULL;
-        conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov;
-        conn->ksnc_rx_nkiov = 
-                lib_extract_kiov(PTL_MD_MAX_IOV, conn->ksnc_rx_kiov,
-                                 niov, kiov, offset, mlen);
+        spin_lock_bh (&sched->kss_lock);
 
-        LASSERT (mlen == 
-                 lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) +
-                 lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov));
+        switch (conn->ksnc_rx_state) {
+        case SOCKNAL_RX_PARSE_WAIT:
+                list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns);
+                cfs_waitq_signal (&sched->kss_waitq);
+                LASSERT (conn->ksnc_rx_ready);
+                break;
+                
+        case SOCKNAL_RX_PARSE:
+                /* scheduler hasn't noticed I'm parsing yet */
+                break;
+        }
 
-        return (PTL_OK);
+        conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD;
+        
+        spin_unlock_bh (&sched->kss_lock);
+        ksocknal_conn_decref(conn);
+        return (0);
 }
 
 static inline int
 ksocknal_sched_cansleep(ksock_sched_t *sched)
 {
-        unsigned long flags;
         int           rc;
 
-        spin_lock_irqsave(&sched->kss_lock, flags);
+        spin_lock_bh (&sched->kss_lock);
 
         rc = (!ksocknal_data.ksnd_shuttingdown &&
-#if SOCKNAL_ZC
-              list_empty(&sched->kss_zctxdone_list) &&
-#endif
               list_empty(&sched->kss_rx_conns) &&
               list_empty(&sched->kss_tx_conns));
-
-        spin_unlock_irqrestore(&sched->kss_lock, flags);
+        
+        spin_unlock_bh (&sched->kss_lock);
         return (rc);
 }
 
@@ -1493,15 +1520,14 @@ int ksocknal_scheduler (void *arg)
         ksock_sched_t     *sched = (ksock_sched_t *)arg;
         ksock_conn_t      *conn;
         ksock_tx_t        *tx;
-        unsigned long      flags;
         int                rc;
         int                nloops = 0;
         int                id = sched - ksocknal_data.ksnd_schedulers;
         char               name[16];
 
-        snprintf (name, sizeof (name),"ksocknald_%02d", id);
-        kportal_daemonize (name);
-        kportal_blockallsigs ();
+        snprintf (name, sizeof (name),"socknal_sd%02d", id);
+        cfs_daemonize (name);
+        cfs_block_allsigs ();
 
 #if (CONFIG_SMP && CPU_AFFINITY)
         id = ksocknal_sched2cpu(id);
@@ -1514,7 +1540,7 @@ int ksocknal_scheduler (void *arg)
         }
 #endif /* CONFIG_SMP && CPU_AFFINITY */
 
-        spin_lock_irqsave (&sched->kss_lock, flags);
+        spin_lock_bh (&sched->kss_lock);
 
         while (!ksocknal_data.ksnd_shuttingdown) {
                 int did_something = 0;
@@ -1534,11 +1560,11 @@ int ksocknal_scheduler (void *arg)
                          * data_ready can set it any time after we release
                          * kss_lock. */
                         conn->ksnc_rx_ready = 0;
-                        spin_unlock_irqrestore(&sched->kss_lock, flags);
+                        spin_unlock_bh (&sched->kss_lock);
 
                         rc = ksocknal_process_receive(conn);
 
-                        spin_lock_irqsave(&sched->kss_lock, flags);
+                        spin_lock_bh (&sched->kss_lock);
 
                         /* I'm the only one that can clear this flag */
                         LASSERT(conn->ksnc_rx_scheduled);
@@ -1547,13 +1573,11 @@ int ksocknal_scheduler (void *arg)
                         if (rc == 0)
                                 conn->ksnc_rx_ready = 1;
 
-                        if (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP ||
-                            conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB) {
-                                /* Conn blocked for a forwarding buffer.
-                                 * It will get queued for my attention when
-                                 * one becomes available (and it might just
-                                 * already have been!).  Meanwhile my ref
-                                 * on it stays put. */
+                        if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) {
+                                /* Conn blocked waiting for ksocknal_recv()
+                                 * I change its state (under lock) to signal
+                                 * it can be rescheduled */
+                                conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT;
                         } else if (conn->ksnc_rx_ready) {
                                 /* reschedule for rx */
                                 list_add_tail (&conn->ksnc_rx_list,
@@ -1561,23 +1585,34 @@ int ksocknal_scheduler (void *arg)
                         } else {
                                 conn->ksnc_rx_scheduled = 0;
                                 /* drop my ref */
-                                ksocknal_put_conn(conn);
+                                ksocknal_conn_decref(conn);
                         }
 
                         did_something = 1;
                 }
 
                 if (!list_empty (&sched->kss_tx_conns)) {
+                        CFS_LIST_HEAD    (zlist);
+
+                        if (!list_empty(&sched->kss_zombie_noop_txs)) {
+                                list_add(&zlist, &sched->kss_zombie_noop_txs); 
+                                list_del_init(&sched->kss_zombie_noop_txs);
+                        }
+
                         conn = list_entry(sched->kss_tx_conns.next,
                                           ksock_conn_t, ksnc_tx_list);
                         list_del (&conn->ksnc_tx_list);
-
+                        
                         LASSERT(conn->ksnc_tx_scheduled);
                         LASSERT(conn->ksnc_tx_ready);
                         LASSERT(!list_empty(&conn->ksnc_tx_queue));
-
+                        
                         tx = list_entry(conn->ksnc_tx_queue.next,
                                         ksock_tx_t, tx_list);
+
+                        if (conn->ksnc_tx_mono == tx)
+                                ksocknal_next_mono_tx(conn);
+
                         /* dequeue now so empty list => more to send */
                         list_del(&tx->tx_list);
 
@@ -1586,17 +1621,26 @@ int ksocknal_scheduler (void *arg)
                          * write_space can set it any time after we release
                          * kss_lock. */
                         conn->ksnc_tx_ready = 0;
-                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+                        spin_unlock_bh (&sched->kss_lock);
 
-                        rc = ksocknal_process_transmit(conn, tx);
+                        if (!list_empty(&zlist)) {
+                                /* free zombie noop txs, it's fast because 
+                                 * noop txs are just put in freelist */
+                                ksocknal_txlist_done(NULL, &zlist, 0);
+                        }
 
-                        spin_lock_irqsave (&sched->kss_lock, flags);
+                        rc = ksocknal_process_transmit(conn, tx);
 
                         if (rc == -ENOMEM || rc == -EAGAIN) {
                                 /* Incomplete send: replace tx on HEAD of tx_queue */
+                                spin_lock_bh (&sched->kss_lock);
                                 list_add (&tx->tx_list, &conn->ksnc_tx_queue);
                         } else {
-                                /* Complete send; assume space for more */
+                                /* Complete send; tx -ref */
+                                ksocknal_tx_decref (tx);
+
+                                spin_lock_bh (&sched->kss_lock);
+                                /* assume space for more */
                                 conn->ksnc_tx_ready = 1;
                         }
 
@@ -1611,44 +1655,31 @@ int ksocknal_scheduler (void *arg)
                         } else {
                                 conn->ksnc_tx_scheduled = 0;
                                 /* drop my ref */
-                                ksocknal_put_conn (conn);
+                                ksocknal_conn_decref(conn);
                         }
-
-                        did_something = 1;
-                }
-#if SOCKNAL_ZC
-                if (!list_empty (&sched->kss_zctxdone_list)) {
-                        ksock_tx_t *tx =
-                                list_entry(sched->kss_zctxdone_list.next,
-                                           ksock_tx_t, tx_list);
+                                
                         did_something = 1;
-
-                        list_del (&tx->tx_list);
-                        spin_unlock_irqrestore (&sched->kss_lock, flags);
-
-                        ksocknal_tx_done (tx, 1);
-
-                        spin_lock_irqsave (&sched->kss_lock, flags);
                 }
-#endif
                 if (!did_something ||           /* nothing to do */
                     ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */
-                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+                        spin_unlock_bh (&sched->kss_lock);
 
                         nloops = 0;
 
                         if (!did_something) {   /* wait for something to do */
-                                rc = wait_event_interruptible (sched->kss_waitq,
-                                                               !ksocknal_sched_cansleep(sched));
+                                rc = wait_event_interruptible_exclusive(
+                                        sched->kss_waitq,
+                                        !ksocknal_sched_cansleep(sched));
                                 LASSERT (rc == 0);
-                        } else
-                               our_cond_resched();
+                        } else {
+                                our_cond_resched();
+                        }
 
-                        spin_lock_irqsave (&sched->kss_lock, flags);
+                        spin_lock_bh (&sched->kss_lock);
                 }
         }
 
-        spin_unlock_irqrestore (&sched->kss_lock, flags);
+        spin_unlock_bh (&sched->kss_lock);
         ksocknal_thread_fini ();
         return (0);
 }
@@ -1660,12 +1691,11 @@ int ksocknal_scheduler (void *arg)
 void ksocknal_read_callback (ksock_conn_t *conn)
 {
         ksock_sched_t *sched; 
-        unsigned long  flags;
         ENTRY;
 
         sched = conn->ksnc_scheduler; 
 
-        spin_lock_irqsave (&sched->kss_lock, flags); 
+        spin_lock_bh (&sched->kss_lock);
 
         conn->ksnc_rx_ready = 1; 
 
@@ -1674,11 +1704,11 @@ void ksocknal_read_callback (ksock_conn_t *conn)
                               &sched->kss_rx_conns); 
                 conn->ksnc_rx_scheduled = 1; 
                 /* extra ref for scheduler */ 
-                atomic_inc (&conn->ksnc_refcount); 
+                ksocknal_conn_addref(conn);
 
                 cfs_waitq_signal (&sched->kss_waitq); 
         } 
-        spin_unlock_irqrestore (&sched->kss_lock, flags);
+        spin_unlock_bh (&sched->kss_lock);
 
         EXIT;
 } 
@@ -1690,12 +1720,11 @@ void ksocknal_read_callback (ksock_conn_t *conn)
 void ksocknal_write_callback (ksock_conn_t *conn)
 { 
         ksock_sched_t *sched; 
-        unsigned long  flags;
         ENTRY;
-
+        
         sched = conn->ksnc_scheduler; 
 
-        spin_lock_irqsave (&sched->kss_lock, flags); 
+        spin_lock_bh (&sched->kss_lock);
 
         conn->ksnc_tx_ready = 1; 
 
@@ -1705,375 +1734,714 @@ void ksocknal_write_callback (ksock_conn_t *conn)
                                &sched->kss_tx_conns); 
                 conn->ksnc_tx_scheduled = 1; 
                 /* extra ref for scheduler */ 
-                atomic_inc (&conn->ksnc_refcount); 
+                ksocknal_conn_addref(conn); 
 
                 cfs_waitq_signal (&sched->kss_waitq); 
         } 
 
-        spin_unlock_irqrestore (&sched->kss_lock, flags);
+        spin_unlock_bh (&sched->kss_lock);
 
         EXIT;
 }
 
-int
-ksocknal_sock_write (struct socket *sock, void *buffer, int nob)
+ksock_protocol_t *
+ksocknal_compat_protocol (ksock_hello_msg_t *hello)
 {
-        return ksocknal_lib_sock_write(sock, buffer, nob);
-}
+        if ((hello->kshm_magic   == LNET_PROTO_MAGIC &&
+             hello->kshm_version == KSOCK_PROTO_V2) ||
+            (hello->kshm_magic   == __swab32(LNET_PROTO_MAGIC) &&
+             hello->kshm_version == __swab32(KSOCK_PROTO_V2)))
+                return &ksocknal_protocol_v2x;
 
-int
-ksocknal_sock_read (struct socket *sock, void *buffer, int nob)
-{
-        return ksocknal_lib_sock_read(sock, buffer, nob);
-}
+        if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) {
+                lnet_magicversion_t *hmv = (lnet_magicversion_t *)hello;
 
-int
-ksocknal_send_hello (ksock_conn_t *conn, __u32 *ipaddrs, int nipaddrs)
-{
-        /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */
-        struct socket      *sock = conn->ksnc_sock;
-        ptl_hdr_t           hdr;
-        ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid;
-        int                 i;
-        int                 rc;
+                CLASSERT (sizeof (lnet_magicversion_t) ==
+                          offsetof (ksock_hello_msg_t, kshm_src_nid));
 
-        LASSERT (conn->ksnc_type != SOCKNAL_CONN_NONE);
-        LASSERT (nipaddrs <= SOCKNAL_MAX_INTERFACES);
+                if (hmv->version_major == cpu_to_le16 (KSOCK_PROTO_V1_MAJOR) &&
+                    hmv->version_minor == cpu_to_le16 (KSOCK_PROTO_V1_MINOR))
+                        return &ksocknal_protocol_v1x;
+        }
 
-        /* No need for getconnsock/putconnsock */
-        LASSERT (!conn->ksnc_closing);
+        return NULL;
+}
 
-        LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
-        hmv->magic         = cpu_to_le32 (PORTALS_PROTO_MAGIC);
-        hmv->version_major = cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR);
-        hmv->version_minor = cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR);
+static int
+ksocknal_send_hello_v1 (ksock_conn_t *conn, ksock_hello_msg_t *hello)
+{
+        cfs_socket_t        *sock = conn->ksnc_sock;
+        lnet_hdr_t          *hdr;
+        lnet_magicversion_t *hmv;
+        int                  rc;
+        int                  i;
+
+        CLASSERT(sizeof(lnet_magicversion_t) == offsetof(lnet_hdr_t, src_nid));
+
+        LIBCFS_ALLOC(hdr, sizeof(*hdr));
+        if (hdr == NULL) {
+                CERROR("Can't allocate lnet_hdr_t\n");
+                return -ENOMEM;
+        }
+
+        hmv = (lnet_magicversion_t *)&hdr->dest_nid;
+
+        /* Re-organize V2.x message header to V1.x (lnet_hdr_t)
+         * header and send out */
+        hmv->magic         = cpu_to_le32 (LNET_PROTO_TCP_MAGIC);
+        hmv->version_major = cpu_to_le16 (KSOCK_PROTO_V1_MAJOR);
+        hmv->version_minor = cpu_to_le16 (KSOCK_PROTO_V1_MINOR);
+
+        if (the_lnet.ln_testprotocompat != 0) {
+                /* single-shot proto check */
+                LNET_LOCK();
+                if ((the_lnet.ln_testprotocompat & 1) != 0) {
+                        hmv->version_major++;   /* just different! */
+                        the_lnet.ln_testprotocompat &= ~1;
+                }
+                if ((the_lnet.ln_testprotocompat & 2) != 0) {
+                        hmv->magic = LNET_PROTO_MAGIC;
+                        the_lnet.ln_testprotocompat &= ~2;
+                }
+                LNET_UNLOCK();
+        }
 
-        hdr.src_nid        = cpu_to_le64 (ksocknal_lib.libnal_ni.ni_pid.nid);
-        hdr.type           = cpu_to_le32 (PTL_MSG_HELLO);
-        hdr.payload_length = cpu_to_le32 (nipaddrs * sizeof(*ipaddrs));
+        hdr->src_nid        = cpu_to_le64 (hello->kshm_src_nid);
+        hdr->src_pid        = cpu_to_le32 (hello->kshm_src_pid);
+        hdr->type           = cpu_to_le32 (LNET_MSG_HELLO);
+        hdr->payload_length = cpu_to_le32 (hello->kshm_nips * sizeof(__u32));
+        hdr->msg.hello.type = cpu_to_le32 (hello->kshm_ctype);
+        hdr->msg.hello.incarnation = cpu_to_le64 (hello->kshm_src_incarnation);
 
-        hdr.msg.hello.type = cpu_to_le32 (conn->ksnc_type);
-        hdr.msg.hello.incarnation =
-                cpu_to_le64 (ksocknal_data.ksnd_incarnation);
+        rc = libcfs_sock_write(sock, hdr, sizeof(*hdr), lnet_acceptor_timeout());
 
-        /* Receiver is eager */
-        rc = ksocknal_sock_write (sock, &hdr, sizeof(hdr));
         if (rc != 0) {
-                CERROR ("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
+                CDEBUG (D_NETERROR, "Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
                         rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
-                return (rc);
+                goto out;
         }
 
-        if (nipaddrs == 0)
-                return (0);
+        if (hello->kshm_nips == 0)
+                goto out;
 
-        for (i = 0; i < nipaddrs; i++) {
-                ipaddrs[i] = __cpu_to_le32 (ipaddrs[i]);
+        for (i = 0; i < hello->kshm_nips; i++) {
+                hello->kshm_ips[i] = __cpu_to_le32 (hello->kshm_ips[i]);
         }
-
-        rc = ksocknal_sock_write (sock, ipaddrs, nipaddrs * sizeof(*ipaddrs));
-        if (rc != 0)
-                CERROR ("Error %d sending HELLO payload (%d)"
-                        " to %u.%u.%u.%u/%d\n", rc, nipaddrs, 
+        
+        rc = libcfs_sock_write(sock, hello->kshm_ips,
+                               hello->kshm_nips * sizeof(__u32),
+                               lnet_acceptor_timeout());
+        if (rc != 0) {
+                CDEBUG (D_NETERROR, "Error %d sending HELLO payload (%d)"
+                        " to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips, 
                         HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
-        return (rc);
-}
+        }
+out:
+        LIBCFS_FREE(hdr, sizeof(*hdr));
 
-int
-ksocknal_invert_type(int type)
+        return rc;
+} 
+
+static int
+ksocknal_send_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello)
 {
-        switch (type)
-        {
-        case SOCKNAL_CONN_ANY:
-        case SOCKNAL_CONN_CONTROL:
-                return (type);
-        case SOCKNAL_CONN_BULK_IN:
-                return SOCKNAL_CONN_BULK_OUT;
-        case SOCKNAL_CONN_BULK_OUT:
-                return SOCKNAL_CONN_BULK_IN;
-        default:
-                return (SOCKNAL_CONN_NONE);
+        cfs_socket_t   *sock = conn->ksnc_sock;
+        int             rc;
+
+        hello->kshm_magic       = LNET_PROTO_MAGIC;
+        hello->kshm_version     = KSOCK_PROTO_V2;
+
+        if (the_lnet.ln_testprotocompat != 0) {
+                /* single-shot proto check */
+                LNET_LOCK();
+                if ((the_lnet.ln_testprotocompat & 1) != 0) {
+                        hello->kshm_version++;   /* just different! */
+                        the_lnet.ln_testprotocompat &= ~1;
+                }
+                LNET_UNLOCK();
         }
-}
 
-int
-ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid,
-                     __u64 *incarnation, __u32 *ipaddrs)
-{
-        struct socket      *sock = conn->ksnc_sock;
-        int                 rc;
-        int                 nips;
-        int                 i;
-        int                 type;
-        ptl_hdr_t           hdr;
-        ptl_magicversion_t *hmv;
-        char                ipbuf[PTL_NALFMT_SIZE];
-
-        hmv = (ptl_magicversion_t *)&hdr.dest_nid;
-        LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
-
-        rc = ksocknal_sock_read (sock, hmv, sizeof (*hmv));
+        rc = libcfs_sock_write(sock, hello, offsetof(ksock_hello_msg_t, kshm_ips),
+                               lnet_acceptor_timeout());
+
         if (rc != 0) {
-                CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
-                        rc, HIPQUAD(conn->ksnc_ipaddr));
-                return (rc);
+                CDEBUG (D_NETERROR, "Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n",
+                        rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
+                return rc;
         }
 
-        if (hmv->magic != le32_to_cpu (PORTALS_PROTO_MAGIC)) {
-                CERROR ("Bad magic %#08x (%#08x expected) from %u.%u.%u.%u\n",
-                        __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC,
-                        HIPQUAD(conn->ksnc_ipaddr));
-                return (-EPROTO);
-        }
+        if (hello->kshm_nips == 0)
+                return 0;
 
-        if (hmv->version_major != cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) ||
-            hmv->version_minor != cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) {
-                CERROR ("Incompatible protocol version %d.%d (%d.%d expected)"
-                        " from %u.%u.%u.%u\n",
-                        le16_to_cpu (hmv->version_major),
-                        le16_to_cpu (hmv->version_minor),
-                        PORTALS_PROTO_VERSION_MAJOR,
-                        PORTALS_PROTO_VERSION_MINOR,
-                        HIPQUAD(conn->ksnc_ipaddr));
-                return (-EPROTO);
+        rc = libcfs_sock_write(sock, hello->kshm_ips,
+                               hello->kshm_nips * sizeof(__u32),
+                               lnet_acceptor_timeout());
+        if (rc != 0) {
+                CDEBUG (D_NETERROR, "Error %d sending HELLO payload (%d)"
+                        " to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips,
+                        HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port);
         }
 
-#if (PORTALS_PROTO_VERSION_MAJOR != 1)
-# error "This code only understands protocol version 1.x"
-#endif
-        /* version 1 sends magic/version as the dest_nid of a 'hello'
-         * header, followed by payload full of interface IP addresses.
-         * Read the rest of it in now... */
+        return rc;
+}
+
+static int
+ksocknal_recv_hello_v1(ksock_conn_t *conn, ksock_hello_msg_t *hello,int timeout)
+{
+        cfs_socket_t        *sock = conn->ksnc_sock;
+        lnet_hdr_t          *hdr;
+        int                  rc;
+        int                  i;
+
+        LIBCFS_ALLOC(hdr, sizeof(*hdr));
+        if (hdr == NULL) {
+                CERROR("Can't allocate lnet_hdr_t\n");
+                return -ENOMEM;
+        }
 
-        rc = ksocknal_sock_read (sock, hmv + 1, sizeof (hdr) - sizeof (*hmv));
+        rc = libcfs_sock_read(sock, &hdr->src_nid,
+                              sizeof (*hdr) - offsetof (lnet_hdr_t, src_nid),
+                              timeout);
         if (rc != 0) {
                 CERROR ("Error %d reading rest of HELLO hdr from %u.%u.%u.%u\n",
                         rc, HIPQUAD(conn->ksnc_ipaddr));
-                return (rc);
+                LASSERT (rc < 0 && rc != -EALREADY);
+                goto out;
         }
 
         /* ...and check we got what we expected */
-        if (hdr.type != cpu_to_le32 (PTL_MSG_HELLO)) {
+        if (hdr->type != cpu_to_le32 (LNET_MSG_HELLO)) {
                 CERROR ("Expecting a HELLO hdr,"
                         " but got type %d from %u.%u.%u.%u\n",
-                        le32_to_cpu (hdr.type),
+                        le32_to_cpu (hdr->type),
                         HIPQUAD(conn->ksnc_ipaddr));
-                return (-EPROTO);
+                rc = -EPROTO;
+                goto out;
         }
 
-        if (le64_to_cpu(hdr.src_nid) == PTL_NID_ANY) {
-                CERROR("Expecting a HELLO hdr with a NID, but got PTL_NID_ANY"
-                       "from %u.%u.%u.%u\n", HIPQUAD(conn->ksnc_ipaddr));
-                return (-EPROTO);
+        hello->kshm_src_nid         = le64_to_cpu (hdr->src_nid);
+        hello->kshm_src_pid         = le32_to_cpu (hdr->src_pid);
+        hello->kshm_src_incarnation = le64_to_cpu (hdr->msg.hello.incarnation);
+        hello->kshm_ctype           = le32_to_cpu (hdr->msg.hello.type);
+        hello->kshm_nips            = le32_to_cpu (hdr->payload_length) /
+                                         sizeof (__u32);
+
+        if (hello->kshm_nips > LNET_MAX_INTERFACES) {
+                CERROR("Bad nips %d from ip %u.%u.%u.%u\n",
+                       hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr));
+                rc = -EPROTO;
+                goto out;
         }
 
-        if (*nid == PTL_NID_ANY) {              /* don't know peer's nid yet */
-                *nid = le64_to_cpu(hdr.src_nid);
-        } else if (*nid != le64_to_cpu (hdr.src_nid)) {
-                LCONSOLE_ERROR("Connected successfully to nid "LPX64" on host "
-                               "%u.%u.%u.%u, but they claimed they were nid "
-                               LPX64" (%s); please check your Lustre "
-                               "configuration.\n",
-                               *nid, HIPQUAD(conn->ksnc_ipaddr),
-                               le64_to_cpu(hdr.src_nid),
-                               portals_nid2str(SOCKNAL,
-                                               le64_to_cpu(hdr.src_nid),
-                                               ipbuf));
+        if (hello->kshm_nips == 0)
+                goto out;
 
-                return (-EPROTO);
+        rc = libcfs_sock_read(sock, hello->kshm_ips,
+                              hello->kshm_nips * sizeof(__u32), timeout);
+        if (rc != 0) {
+                CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n",
+                        rc, HIPQUAD(conn->ksnc_ipaddr));
+                LASSERT (rc < 0 && rc != -EALREADY);
+                goto out;
         }
 
-        type = __le32_to_cpu(hdr.msg.hello.type);
-
-        if (conn->ksnc_type == SOCKNAL_CONN_NONE) {
-                /* I've accepted this connection; peer determines type */
-                conn->ksnc_type = ksocknal_invert_type(type);
-                if (conn->ksnc_type == SOCKNAL_CONN_NONE) {
-                        CERROR ("Unexpected type %d from "LPX64"@%u.%u.%u.%u\n",
-                                type, *nid, HIPQUAD(conn->ksnc_ipaddr));
-                        return (-EPROTO);
+        for (i = 0; i < hello->kshm_nips; i++) {
+                hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]);
+                
+                if (hello->kshm_ips[i] == 0) {
+                        CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n",
+                               i, HIPQUAD(conn->ksnc_ipaddr));
+                        rc = -EPROTO;
+                        break;
                 }
-        } else if (ksocknal_invert_type(type) != conn->ksnc_type) {
-                CERROR ("Mismatched types: me %d, "LPX64"@%u.%u.%u.%u %d\n",
-                        conn->ksnc_type, *nid, HIPQUAD(conn->ksnc_ipaddr),
-                        le32_to_cpu(hdr.msg.hello.type));
-                return (-EPROTO);
         }
+out:
+        LIBCFS_FREE(hdr, sizeof(*hdr));
+
+        return rc;
+}
 
-        *incarnation = le64_to_cpu(hdr.msg.hello.incarnation);
+static int
+ksocknal_recv_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello, int timeout) 
+{
+        cfs_socket_t      *sock = conn->ksnc_sock;
+        int                rc;         
+        int                i;
+
+        if (hello->kshm_magic == LNET_PROTO_MAGIC)
+                conn->ksnc_flip = 0;
+        else
+                conn->ksnc_flip = 1;
 
-        nips = __le32_to_cpu (hdr.payload_length) / sizeof (__u32);
+        rc = libcfs_sock_read(sock, &hello->kshm_src_nid,
+                              offsetof(ksock_hello_msg_t, kshm_ips) -
+                                       offsetof(ksock_hello_msg_t, kshm_src_nid),
+                              timeout); 
+        if (rc != 0) {
+                CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+                        rc, HIPQUAD(conn->ksnc_ipaddr));
+                LASSERT (rc < 0 && rc != -EALREADY);
+                return rc;
+        }
 
-        if (nips > SOCKNAL_MAX_INTERFACES ||
-            nips * sizeof(__u32) != __le32_to_cpu (hdr.payload_length)) {
-                CERROR("Bad payload length %d from "LPX64"@%u.%u.%u.%u\n",
-                       __le32_to_cpu (hdr.payload_length),
-                       *nid, HIPQUAD(conn->ksnc_ipaddr));
+        if (conn->ksnc_flip) {
+                __swab32s(&hello->kshm_src_pid);
+                __swab64s(&hello->kshm_src_nid);
+                __swab32s(&hello->kshm_dst_pid);
+                __swab64s(&hello->kshm_dst_nid);
+                __swab64s(&hello->kshm_src_incarnation);
+                __swab64s(&hello->kshm_dst_incarnation);
+                __swab32s(&hello->kshm_ctype);
+                __swab32s(&hello->kshm_nips);
         }
 
-        if (nips == 0)
-                return (0);
+        if (hello->kshm_nips > LNET_MAX_INTERFACES) {
+                CERROR("Bad nips %d from ip %u.%u.%u.%u\n",
+                       hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr));
+                return -EPROTO;
+        }
 
-        rc = ksocknal_sock_read (sock, ipaddrs, nips * sizeof(*ipaddrs));
+        if (hello->kshm_nips == 0)
+                return 0;
+        
+        rc = libcfs_sock_read(sock, hello->kshm_ips,
+                              hello->kshm_nips * sizeof(__u32), timeout);
         if (rc != 0) {
-                CERROR ("Error %d reading IPs from "LPX64"@%u.%u.%u.%u\n",
-                        rc, *nid, HIPQUAD(conn->ksnc_ipaddr));
-                return (rc);
+                CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n",
+                        rc, HIPQUAD(conn->ksnc_ipaddr));
+                LASSERT (rc < 0 && rc != -EALREADY);
+                return rc;
+        }
+
+        for (i = 0; i < hello->kshm_nips; i++) {
+                if (conn->ksnc_flip)
+                        __swab32s(&hello->kshm_ips[i]);
+                
+                if (hello->kshm_ips[i] == 0) {
+                        CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n",
+                               i, HIPQUAD(conn->ksnc_ipaddr));
+                        return -EPROTO;
+                }
         }
 
-        for (i = 0; i < nips; i++) {
-                ipaddrs[i] = __le32_to_cpu(ipaddrs[i]);
+        return 0;
+}
 
-                if (ipaddrs[i] == 0) {
-                        CERROR("Zero IP[%d] from "LPX64"@%u.%u.%u.%u\n",
-                               i, *nid, HIPQUAD(conn->ksnc_ipaddr));
-                        return (-EPROTO);
-                }
+static void
+ksocknal_pack_msg_v1(ksock_tx_t *tx)
+{
+        /* V1.x has no KSOCK_MSG_NOOP */
+        LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+        LASSERT(tx->tx_lnetmsg != NULL);
+
+        tx->tx_iov[0].iov_base = (void *)&tx->tx_lnetmsg->msg_hdr;
+        tx->tx_iov[0].iov_len  = sizeof(lnet_hdr_t);
+
+        tx->tx_resid = tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(lnet_hdr_t);
+}
+
+static void
+ksocknal_pack_msg_v2(ksock_tx_t *tx)
+{
+        tx->tx_iov[0].iov_base = (void *)&tx->tx_msg;
+
+        if (tx->tx_lnetmsg != NULL) {
+                LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP);
+
+                tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr;
+                tx->tx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_payload);
+                tx->tx_resid = tx->tx_nob = offsetof(ksock_msg_t,  ksm_u.lnetmsg.ksnm_payload) +
+                                            tx->tx_lnetmsg->msg_len;
+        } else {
+                LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+                tx->tx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr);
+                tx->tx_resid = tx->tx_nob = offsetof(ksock_msg_t,  ksm_u.lnetmsg.ksnm_hdr);
         }
+        /* Don't checksum before start sending, because packet can be piggybacked with ACK */
+}
+
+static void
+ksocknal_unpack_msg_v1(ksock_msg_t *msg)
+{
+        msg->ksm_type           = KSOCK_MSG_LNET;
+        msg->ksm_csum           = 0;
+        msg->ksm_zc_req_cookie  = 0;
+        msg->ksm_zc_ack_cookie  = 0;
+}
 
-        return (nips);
+static void
+ksocknal_unpack_msg_v2(ksock_msg_t *msg)
+{
+        return;  /* Do nothing */
 }
 
+ksock_protocol_t  ksocknal_protocol_v1x =
+{
+        KSOCK_PROTO_V1,
+        ksocknal_send_hello_v1,
+        ksocknal_recv_hello_v1,
+        ksocknal_pack_msg_v1,
+        ksocknal_unpack_msg_v1
+};
+
+ksock_protocol_t  ksocknal_protocol_v2x =
+{
+        KSOCK_PROTO_V2,
+        ksocknal_send_hello_v2,
+        ksocknal_recv_hello_v2,
+        ksocknal_pack_msg_v2,
+        ksocknal_unpack_msg_v2
+};
+
 int
-ksocknal_connect_peer (ksock_route_t *route, int type)
+ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+                     lnet_nid_t peer_nid, ksock_hello_msg_t *hello)
 {
-        struct socket      *sock;
-        int                 rc;
-        int                 port;
-        int                 may_retry;
+        /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */
+        ksock_net_t         *net = (ksock_net_t *)ni->ni_data;
+        lnet_nid_t           srcnid;
 
-        /* Iterate through reserved ports.  When typed connections are
-         * used, we will need to bind to multiple ports, but we only know
-         * this at connect time.  But, by that time we've already called
-         * bind() so we need a new socket. */
+        LASSERT (0 <= hello->kshm_nips && hello->kshm_nips <= LNET_MAX_INTERFACES);
 
-        for (port = 1023; port > 512; --port) {
+        /* No need for getconnsock/putconnsock */
+        LASSERT (!conn->ksnc_closing);
+        LASSERT (conn->ksnc_proto != NULL);
 
-                rc = ksocknal_lib_connect_sock(&sock, &may_retry, route, port);
+        srcnid = lnet_ptlcompat_srcnid(ni->ni_nid, peer_nid);
 
-                if (rc == 0) {
-                        rc = ksocknal_create_conn(route, sock, type);
-                        cfs_put_file(KSN_SOCK2FILE(sock));
-                        return rc;
+        hello->kshm_src_nid         = srcnid;
+        hello->kshm_dst_nid         = peer_nid;
+        hello->kshm_src_pid         = the_lnet.ln_pid;
+
+        hello->kshm_src_incarnation = net->ksnn_incarnation;
+        hello->kshm_ctype           = conn->ksnc_type;
+
+        return conn->ksnc_proto->pro_send_hello(conn, hello);
+}
+
+int
+ksocknal_invert_type(int type)
+{
+        switch (type)
+        {
+        case SOCKLND_CONN_ANY:
+        case SOCKLND_CONN_CONTROL:
+                return (type);
+        case SOCKLND_CONN_BULK_IN:
+                return SOCKLND_CONN_BULK_OUT;
+        case SOCKLND_CONN_BULK_OUT:
+                return SOCKLND_CONN_BULK_IN;
+        default:
+                return (SOCKLND_CONN_NONE);
+        }
+}
+
+int
+ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn, 
+                     ksock_hello_msg_t *hello, lnet_process_id_t *peerid,
+                     __u64 *incarnation)
+{
+        cfs_socket_t        *sock = conn->ksnc_sock;
+        int                  active;
+        int                  timeout;
+        int                  match = 0;
+        int                  rc;
+        ksock_protocol_t    *proto;
+        lnet_process_id_t    recv_id;
+
+        active = (peerid->nid != LNET_NID_ANY);
+        timeout = active ? *ksocknal_tunables.ksnd_timeout :
+                            lnet_acceptor_timeout();
+
+        rc = libcfs_sock_read(sock, &hello->kshm_magic, sizeof (hello->kshm_magic), timeout);
+        if (rc != 0) {
+                CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+                        rc, HIPQUAD(conn->ksnc_ipaddr));
+                LASSERT (rc < 0 && rc != -EALREADY);
+                return rc;
+        }
+
+        if (hello->kshm_magic != LNET_PROTO_MAGIC &&
+            hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) &&
+            hello->kshm_magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) {
+                /* Unexpected magic! */
+                if (active ||
+                    the_lnet.ln_ptlcompat == 0) {
+                        CERROR ("Bad magic(1) %#08x (%#08x expected) from "
+                                "%u.%u.%u.%u\n", __cpu_to_le32 (hello->kshm_magic),
+                                LNET_PROTO_TCP_MAGIC,
+                                HIPQUAD(conn->ksnc_ipaddr));
+                        return -EPROTO;
                 }
 
-                if (!may_retry)
+                /* When portals compatibility is set, I may be passed a new
+                 * connection "blindly" by the acceptor, and I have to
+                 * determine if my peer has sent an acceptor connection request
+                 * or not.  This isn't a 'hello', so I'll get the acceptor to
+                 * look at it... */
+                rc = lnet_accept(ni, sock, hello->kshm_magic);
+                if (rc != 0)
+                        return -EPROTO;
+
+                /* ...and if it's OK I'm back to looking for a 'hello'... */
+                rc = libcfs_sock_read(sock, &hello->kshm_magic, 
+                                      sizeof (hello->kshm_magic), timeout);
+                if (rc != 0) {
+                        CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+                                rc, HIPQUAD(conn->ksnc_ipaddr));
+                        LASSERT (rc < 0 && rc != -EALREADY);
                         return rc;
+                }
+        
+                /* Only need to check V1.x magic */
+                if (hello->kshm_magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) {
+                        CERROR ("Bad magic(2) %#08x (%#08x expected) from "
+                                "%u.%u.%u.%u\n", __cpu_to_le32 (hello->kshm_magic),
+                                LNET_PROTO_TCP_MAGIC,
+                                HIPQUAD(conn->ksnc_ipaddr));
+                        return -EPROTO;
+                }
+        }
+
+        rc = libcfs_sock_read(sock, &hello->kshm_version,
+                              sizeof(hello->kshm_version), timeout);
+        if (rc != 0) {
+                CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+                        rc, HIPQUAD(conn->ksnc_ipaddr));
+                LASSERT (rc < 0 && rc != -EALREADY);
+                return rc;
+        }
+
+        proto = ksocknal_compat_protocol(hello);
+        if (proto == NULL) {
+                if (!active) { 
+                        /* unknown protocol from peer, tell peer my protocol */
+                        conn->ksnc_proto = &ksocknal_protocol_v2x;
+                        hello->kshm_nips = 0;
+                        ksocknal_send_hello(ni, conn, ni->ni_nid, hello);
+                }
+
+                CERROR ("Unknown protocol version (%d.x expected)"
+                        " from %u.%u.%u.%u\n",
+                        conn->ksnc_proto->pro_version,
+                        HIPQUAD(conn->ksnc_ipaddr));
+
+                return -EPROTO;
+        }
+
+        if (conn->ksnc_proto == proto)
+                match = 1;
+
+        conn->ksnc_proto = proto;
+
+        /* receive the rest of hello message anyway */
+        rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout);
+        if (rc != 0) {
+                CERROR("Error %d reading or checking hello from from %u.%u.%u.%u\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr));
+                return rc;
+        }
+
+        if (hello->kshm_src_nid == LNET_NID_ANY) {
+                CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY"
+                       "from %u.%u.%u.%u\n", HIPQUAD(conn->ksnc_ipaddr));
+                return -EPROTO;
+        }
+
+        if (conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {          
+                /* Userspace NAL assigns peer process ID from socket */
+                recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG;
+                recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr);
+        } else {
+                recv_id.nid = hello->kshm_src_nid;
+
+                if (the_lnet.ln_ptlcompat > 1 && /* portals peers may exist */
+                    LNET_NIDNET(recv_id.nid) == 0) /* this is one */
+                        recv_id.pid = the_lnet.ln_pid; /* give it a sensible pid */
+                else
+                        recv_id.pid = hello->kshm_src_pid;
+
+        }
+        
+        if (!active) {                          /* don't know peer's nid yet */
+                *peerid = recv_id;
+        } else if (peerid->pid != recv_id.pid ||
+                   !lnet_ptlcompat_matchnid(peerid->nid, recv_id.nid)) {
+                LCONSOLE_ERROR("Connected successfully to %s on host "
+                               "%u.%u.%u.%u, but they claimed they were "
+                               "%s; please check your Lustre "
+                               "configuration.\n",
+                               libcfs_id2str(*peerid),
+                               HIPQUAD(conn->ksnc_ipaddr),
+                               libcfs_id2str(recv_id));
+                return -EPROTO;
+        }
+
+        if (conn->ksnc_type == SOCKLND_CONN_NONE) {
+                /* I've accepted this connection; peer determines type */
+                conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype);
+                if (conn->ksnc_type == SOCKLND_CONN_NONE) {
+                        CERROR ("Unexpected type %d from %s ip %u.%u.%u.%u\n",
+                                hello->kshm_ctype, libcfs_id2str(*peerid), 
+                                HIPQUAD(conn->ksnc_ipaddr));
+                        return -EPROTO;
+                }
+        } else if (hello->kshm_ctype == SOCKLND_CONN_NONE) {
+                if (match) {
+                        /* lost a connection race */
+                        return -EALREADY;
+                }
+                /* unmatched protocol get SOCKLND_CONN_NONE anyway */
+        } else if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) {
+                CERROR ("Mismatched types: me %d, %s ip %u.%u.%u.%u %d\n",
+                        conn->ksnc_type, libcfs_id2str(*peerid), 
+                        HIPQUAD(conn->ksnc_ipaddr),
+                        hello->kshm_ctype);
+                return -EPROTO;
         }
 
-        CERROR("Out of ports trying to bind to a reserved port\n");
-        return (-EADDRINUSE);
+        *incarnation = hello->kshm_src_incarnation;
+
+        return 0;
 }
 
 void
-ksocknal_autoconnect (ksock_route_t *route)
+ksocknal_connect (ksock_route_t *route)
 {
         CFS_LIST_HEAD    (zombies);
-        ksock_tx_t       *tx;
-        ksock_peer_t     *peer;
-        unsigned long     flags;
+        ksock_peer_t     *peer = route->ksnr_peer;
         int               type;
+        int               wanted;
+        cfs_socket_t     *sock;
+        cfs_time_t        deadline;
+        int               retry_later = 0;
         int               rc = 0;
 
-        write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+        deadline = cfs_time_add(cfs_time_current(), 
+                                cfs_time_seconds(*ksocknal_tunables.ksnd_timeout));
+
+        write_lock_bh (&ksocknal_data.ksnd_global_lock);
+
+        LASSERT (route->ksnr_scheduled);
+        LASSERT (!route->ksnr_connecting);
+
+        route->ksnr_connecting = 1;
 
         for (;;) {
-                if (!ksocknal_tunables.ksnd_typed_conns) {
-                        if ((route->ksnr_connected & (1<<SOCKNAL_CONN_ANY)) == 0)
-                                type = SOCKNAL_CONN_ANY;
-                        else
-                                break;  /* got connected while route queued */
+                wanted = ksocknal_route_mask() & ~route->ksnr_connected;
+
+                /* stop connecting if peer/route got closed under me, or
+                 * route got connected while queued */
+                if (peer->ksnp_closing || route->ksnr_deleted ||
+                    wanted == 0) {
+                        retry_later = 0;
+                        break;
+                }
+
+                /* reschedule if peer is connecting to me */
+                if (peer->ksnp_accepting > 0) {
+                        CDEBUG(D_NET,
+                               "peer %s(%d) already connecting to me, retry later.\n",
+                               libcfs_nid2str(peer->ksnp_id.nid), peer->ksnp_accepting);
+                        retry_later = 1;
+                }
+
+                if (retry_later) /* needs reschedule */
+                        break;
+                        
+                if ((wanted & (1 << SOCKLND_CONN_ANY)) != 0) {
+                        type = SOCKLND_CONN_ANY;
+                } else if ((wanted & (1 << SOCKLND_CONN_CONTROL)) != 0) {
+                        type = SOCKLND_CONN_CONTROL;
+                } else if ((wanted & (1 << SOCKLND_CONN_BULK_IN)) != 0) {
+                        type = SOCKLND_CONN_BULK_IN;
                 } else {
-                        if ((route->ksnr_connected & (1<<SOCKNAL_CONN_CONTROL)) == 0)
-                                type = SOCKNAL_CONN_CONTROL;
-                        else if ((route->ksnr_connected & (1<<SOCKNAL_CONN_BULK_IN)) == 0)
-                                type = SOCKNAL_CONN_BULK_IN;
-                        else if ((route->ksnr_connected & (1<<SOCKNAL_CONN_BULK_OUT)) == 0)
-                                type = SOCKNAL_CONN_BULK_OUT;
-                        else
-                                break;  /* got connected while route queued */
+                        LASSERT ((wanted & (1 << SOCKLND_CONN_BULK_OUT)) != 0);
+                        type = SOCKLND_CONN_BULK_OUT;
                 }
 
-                write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags);
+                write_unlock_bh (&ksocknal_data.ksnd_global_lock);
 
-                rc = ksocknal_connect_peer (route, type);
+                if (cfs_time_aftereq(cfs_time_current(), deadline)) {
+                        rc = -ETIMEDOUT;
+                        lnet_connect_console_error(rc, peer->ksnp_id.nid,
+                                                   route->ksnr_ipaddr,
+                                                   route->ksnr_port);
+                        goto failed;
+                }
+                
+                rc = lnet_connect(&sock, peer->ksnp_id.nid,
+                                  route->ksnr_myipaddr, 
+                                  route->ksnr_ipaddr, route->ksnr_port);
                 if (rc != 0)
                         goto failed;
 
-                write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+                rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type);
+
+                if (rc < 0) {
+                        lnet_connect_console_error(rc, peer->ksnp_id.nid,
+                                                   route->ksnr_ipaddr, 
+                                                   route->ksnr_port);
+                        goto failed;
+                }
+
+                /* rc == EALREADY means I lost a connection race and my
+                 * peer is connecting to me.
+                 * rc == EPROTO means my peer is speaking an older 
+                 * protocol version. */
+                LASSERT (rc == 0 || rc == EALREADY || rc == EPROTO);
+
+                retry_later = rc != 0;
+                if (retry_later)
+                        CDEBUG(D_NET, "peer %s: conn race, retry later.\n",
+                               libcfs_nid2str(peer->ksnp_id.nid));
+                
+                write_lock_bh (&ksocknal_data.ksnd_global_lock);
         }
 
-        LASSERT (route->ksnr_connecting);
+        route->ksnr_scheduled = 0;
         route->ksnr_connecting = 0;
-        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
-        return;
 
- failed:
-        switch (rc) {
-        /* "normal" errors */
-        case -ECONNREFUSED:
-                LCONSOLE_ERROR("Connection was refused by host %u.%u.%u.%u on "
-                               "port %d; check that Lustre is running on that "
-                               "node.\n",
-                               HIPQUAD(route->ksnr_ipaddr), route->ksnr_port);
-                break;
-        case -EHOSTUNREACH:
-        case -ENETUNREACH:
-                LCONSOLE_ERROR("Host %u.%u.%u.%u was unreachable; the network "
-                               "or that node may be down, or Lustre may be "
-                               "misconfigured.\n",
-                               HIPQUAD(route->ksnr_ipaddr));
-                break;
-        case -ETIMEDOUT:
-                LCONSOLE_ERROR("Connecting to host %u.%u.%u.%u on port %d took "
-                               "too long; that node may be hung or "
-                               "experiencing high load.\n",
-                               HIPQUAD(route->ksnr_ipaddr), route->ksnr_port);
-                break;
-        /* errors that should be rare */
-        case -EPROTO:
-                LCONSOLE_ERROR("Protocol error connecting to host %u.%u.%u.%u "
-                               "on port %d: Is it running a compatible version"
-                               " of Lustre?\n", 
-                               HIPQUAD(route->ksnr_ipaddr), route->ksnr_port);
-                break;
-        case -EADDRINUSE:
-                LCONSOLE_ERROR("No privileged ports available to connect to "
-                               "host %u.%u.%u.%u on port %d\n",
-                               HIPQUAD(route->ksnr_ipaddr), route->ksnr_port);
-                break;
-        default:
-                LCONSOLE_ERROR("Unexpected error %d connecting to "
-                               "host %u.%u.%u.%u on port %d\n", rc,
-                               HIPQUAD(route->ksnr_ipaddr), route->ksnr_port);
-                break;
+        if (retry_later) {
+                /* re-queue for attention; this frees me up to handle
+                 * the peer's incoming connection request */
+                ksocknal_launch_connection_locked(route);
         }
 
-        write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+        write_unlock_bh (&ksocknal_data.ksnd_global_lock);
+        return;
+
+ failed:
+        write_lock_bh (&ksocknal_data.ksnd_global_lock);
 
-        peer = route->ksnr_peer;
-        LASSERT (route->ksnr_connecting);
+        route->ksnr_scheduled = 0;
         route->ksnr_connecting = 0;
 
         /* This is a retry rather than a new connection */
+        route->ksnr_retry_interval *= 2;
+        route->ksnr_retry_interval = 
+                MAX(route->ksnr_retry_interval,
+                    cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000);
+        route->ksnr_retry_interval = 
+                MIN(route->ksnr_retry_interval,
+                    cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000);
+        
         LASSERT (route->ksnr_retry_interval != 0);
         route->ksnr_timeout = cfs_time_add(cfs_time_current(),
                                            route->ksnr_retry_interval);
-        route->ksnr_retry_interval = MIN (route->ksnr_retry_interval * 2,
-                                          SOCKNAL_MAX_RECONNECT_INTERVAL);
 
-        if (!list_empty (&peer->ksnp_tx_queue) &&
-            ksocknal_find_connecting_route_locked (peer) == NULL) {
+        if (!list_empty(&peer->ksnp_tx_queue) &&
+            peer->ksnp_accepting == 0 &&
+            ksocknal_find_connecting_route_locked(peer) == NULL) {
+                /* ksnp_tx_queue is queued on a conn on successful
+                 * connection */
                 LASSERT (list_empty (&peer->ksnp_conns));
 
-                /* None of the connections that the blocked packets are
-                 * waiting for have been successful.  Complete them now... */
-                do {
-                        tx = list_entry (peer->ksnp_tx_queue.next,
-                                         ksock_tx_t, tx_list);
-                        list_del (&tx->tx_list);
-                        list_add_tail (&tx->tx_list, &zombies);
-                } while (!list_empty (&peer->ksnp_tx_queue));
+                /* take all the blocked packets while I've got the lock and
+                 * complete below... */
+                list_add(&zombies, &peer->ksnp_tx_queue);
+                list_del_init(&peer->ksnp_tx_queue);
         }
 
 #if 0           /* irrelevant with only eager routes */
@@ -2083,154 +2451,180 @@ ksocknal_autoconnect (ksock_route_t *route)
                 list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
         }
 #endif
-        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
-
-        while (!list_empty (&zombies)) {
-                char ipbuf[PTL_NALFMT_SIZE];
-                char ipbuf2[PTL_NALFMT_SIZE];
-                tx = list_entry (zombies.next, ksock_tx_t, tx_list);
-
-                CERROR ("Deleting packet type %d len %d ("LPX64" %s->"LPX64" %s)\n",
-                        le32_to_cpu (tx->tx_hdr->type),
-                        le32_to_cpu (tx->tx_hdr->payload_length),
-                        le64_to_cpu (tx->tx_hdr->src_nid),
-                        portals_nid2str(SOCKNAL,
-                                        le64_to_cpu(tx->tx_hdr->src_nid),
-                                        ipbuf),
-                        le64_to_cpu (tx->tx_hdr->dest_nid),
-                        portals_nid2str(SOCKNAL,
-                                        le64_to_cpu(tx->tx_hdr->src_nid),
-                                        ipbuf2));
+        write_unlock_bh (&ksocknal_data.ksnd_global_lock);
 
-                list_del (&tx->tx_list);
-                /* complete now */
-                ksocknal_tx_done (tx, 0);
-        }
+        ksocknal_peer_failed(peer);
+        ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1);
+}
+
+static inline int
+ksocknal_connd_connect_route_locked(void)
+{
+        /* Only handle an outgoing connection request if there is someone left
+         * to handle incoming connections */
+        return !list_empty(&ksocknal_data.ksnd_connd_routes) &&
+                ((ksocknal_data.ksnd_connd_connecting + 1) <
+                 *ksocknal_tunables.ksnd_nconnds);
+}
+
+static inline int
+ksocknal_connd_ready(void)
+{
+        int            rc;
+        
+        spin_lock_bh (&ksocknal_data.ksnd_connd_lock);
+        
+        rc = ksocknal_data.ksnd_shuttingdown ||
+             !list_empty(&ksocknal_data.ksnd_connd_connreqs) ||
+             ksocknal_connd_connect_route_locked();
+        
+        spin_unlock_bh (&ksocknal_data.ksnd_connd_lock);
+
+        return rc;
 }
 
 int
-ksocknal_autoconnectd (void *arg)
+ksocknal_connd (void *arg)
 {
         long               id = (long)arg;
         char               name[16];
-        unsigned long      flags;
+        ksock_connreq_t   *cr;
         ksock_route_t     *route;
-        int                rc;
 
-        snprintf (name, sizeof (name), "ksocknal_ad%02ld", id);
-        kportal_daemonize (name);
-        kportal_blockallsigs ();
+        snprintf (name, sizeof (name), "socknal_cd%02ld", id);
+        cfs_daemonize (name);
+        cfs_block_allsigs ();
 
-        spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags);
+        spin_lock_bh (&ksocknal_data.ksnd_connd_lock);
 
         while (!ksocknal_data.ksnd_shuttingdown) {
 
-                if (!list_empty (&ksocknal_data.ksnd_autoconnectd_routes)) {
-                        route = list_entry (ksocknal_data.ksnd_autoconnectd_routes.next,
-                                            ksock_route_t, ksnr_connect_list);
+                if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) {
+                        /* Connection accepted by the listener */
+                        cr = list_entry(ksocknal_data.ksnd_connd_connreqs.next,
+                                        ksock_connreq_t, ksncr_list);
+                        
+                        list_del(&cr->ksncr_list);
+                        spin_unlock_bh (&ksocknal_data.ksnd_connd_lock);
+                        
+                        ksocknal_create_conn(cr->ksncr_ni, NULL, 
+                                             cr->ksncr_sock, SOCKLND_CONN_NONE);
+                        lnet_ni_decref(cr->ksncr_ni);
+                        LIBCFS_FREE(cr, sizeof(*cr));
+                        
+                        spin_lock_bh (&ksocknal_data.ksnd_connd_lock);
+                }
 
-                        list_del (&route->ksnr_connect_list);
-                        spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags);
+                if (ksocknal_connd_connect_route_locked()) {
+                        /* Connection request */
+                        route = list_entry (ksocknal_data.ksnd_connd_routes.next,
+                                            ksock_route_t, ksnr_connd_list);
 
-                        ksocknal_autoconnect (route);
-                        ksocknal_put_route (route);
+                        list_del (&route->ksnr_connd_list);
+                        ksocknal_data.ksnd_connd_connecting++;
+                        spin_unlock_bh (&ksocknal_data.ksnd_connd_lock);
 
-                        spin_lock_irqsave(&ksocknal_data.ksnd_autoconnectd_lock,
-                                          flags);
-                        continue;
+                        ksocknal_connect (route);
+                        ksocknal_route_decref(route);
+
+                        spin_lock_bh (&ksocknal_data.ksnd_connd_lock);
+                        ksocknal_data.ksnd_connd_connecting--;
                 }
 
-                spin_unlock_irqrestore(&ksocknal_data.ksnd_autoconnectd_lock,
-                                       flags);
+                spin_unlock_bh (&ksocknal_data.ksnd_connd_lock);
 
-                rc = wait_event_interruptible(ksocknal_data.ksnd_autoconnectd_waitq,
-                                              ksocknal_data.ksnd_shuttingdown ||
-                                              !list_empty(&ksocknal_data.ksnd_autoconnectd_routes));
+                wait_event_interruptible_exclusive(
+                        ksocknal_data.ksnd_connd_waitq,
+                        ksocknal_connd_ready());
 
-                spin_lock_irqsave(&ksocknal_data.ksnd_autoconnectd_lock, flags);
+                spin_lock_bh (&ksocknal_data.ksnd_connd_lock);
         }
 
-        spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags);
+        spin_unlock_bh (&ksocknal_data.ksnd_connd_lock);
 
         ksocknal_thread_fini ();
         return (0);
 }
 
 ksock_conn_t *
-ksocknal_find_timed_out_conn (ksock_peer_t *peer) 
+ksocknal_find_timed_out_conn (ksock_peer_t *peer)
 {
         /* We're called with a shared lock on ksnd_global_lock */
         ksock_conn_t      *conn;
         struct list_head  *ctmp;
 
         list_for_each (ctmp, &peer->ksnp_conns) {
+                int     error;
                 conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
 
-                /* Don't need the {get,put}connsock dance to deref ksnc_sock... */
+                /* Don't need the {get,put}connsock dance to deref ksnc_sock */
                 LASSERT (!conn->ksnc_closing);
 
-                if (SOCK_ERROR(conn->ksnc_sock) != 0) {
-                        atomic_inc (&conn->ksnc_refcount);
+                /* SOCK_ERROR may reset the socket's error code on
+                 * some platforms (e.g. Darwin 8.x) */
+                error = SOCK_ERROR(conn->ksnc_sock);
+                if (error != 0) {
+                        ksocknal_conn_addref(conn);
 
-                        switch (SOCK_ERROR(conn->ksnc_sock)) {
+                        switch (error) {
                         case ECONNRESET:
-                                LCONSOLE_WARN("A connection with %u.%u.%u.%u "
-                                              "was reset; they may have "
-                                              "rebooted.\n",
-                                              HIPQUAD(conn->ksnc_ipaddr));
+                                CDEBUG(D_NETERROR, "A connection with %s "
+                                       "(%u.%u.%u.%u:%d) was reset; "
+                                       "it may have rebooted.\n",
+                                       libcfs_id2str(peer->ksnp_id),
+                                       HIPQUAD(conn->ksnc_ipaddr),
+                                       conn->ksnc_port);
                                 break;
                         case ETIMEDOUT:
-                                LCONSOLE_WARN("A connection with %u.%u.%u.%u "
-                                              "timed out; the network or that "
-                                              "node may be down.\n",
-                                              HIPQUAD(conn->ksnc_ipaddr));
+                                CDEBUG(D_NETERROR, "A connection with %s "
+                                       "(%u.%u.%u.%u:%d) timed out; the "
+                                       "network or node may be down.\n",
+                                       libcfs_id2str(peer->ksnp_id),
+                                       HIPQUAD(conn->ksnc_ipaddr),
+                                       conn->ksnc_port);
                                 break;
                         default:
-                                LCONSOLE_WARN("An unexpected network error "
-                                              "occurred with %u.%u.%u.%u: %d\n",
-                                              HIPQUAD(conn->ksnc_ipaddr),
-                                              SOCK_ERROR(conn->ksnc_sock));
+                                CDEBUG(D_NETERROR, "An unexpected network error %d "
+                                       "occurred with %s "
+                                       "(%u.%u.%u.%u:%d\n", error,
+                                       libcfs_id2str(peer->ksnp_id),
+                                       HIPQUAD(conn->ksnc_ipaddr),
+                                       conn->ksnc_port);
                                 break;
                         }
 
-                        /* Something (e.g. failed keepalive) set the socket error */
-                        CDEBUG(D_HA,"Socket error %d: "LPX64" %p %d.%d.%d.%d\n",
-                               SOCK_ERROR(conn->ksnc_sock), peer->ksnp_nid,
-                               conn, HIPQUAD(conn->ksnc_ipaddr));
-
                         return (conn);
                 }
 
                 if (conn->ksnc_rx_started &&
-                    cfs_time_aftereq (cfs_time_current(), 
-                                      conn->ksnc_rx_deadline)) {
+                    cfs_time_aftereq(cfs_time_current(),
+                                     conn->ksnc_rx_deadline)) {
                         /* Timed out incomplete incoming message */
-                        atomic_inc (&conn->ksnc_refcount);
-                        LCONSOLE_ERROR("A timeout occurred receiving data from "
-                                       "%u.%u.%u.%u; the network or that node "
-                                       "may be down.\n",
-                                       HIPQUAD(conn->ksnc_ipaddr));
-                        CERROR ("Timed out RX from "LPX64" %p %d.%d.%d.%d\n",
-                                peer->ksnp_nid,conn,HIPQUAD(conn->ksnc_ipaddr));
+                        ksocknal_conn_addref(conn);
+                        CDEBUG(D_NETERROR, "Timeout receiving from %s "
+                               "(%u.%u.%u.%u:%d), state %d wanted %d left %d\n",
+                               libcfs_id2str(peer->ksnp_id),
+                               HIPQUAD(conn->ksnc_ipaddr),
+                               conn->ksnc_port,
+                               conn->ksnc_rx_state,
+                               conn->ksnc_rx_nob_wanted,
+                               conn->ksnc_rx_nob_left);
                         return (conn);
                 }
 
-                if ((!list_empty (&conn->ksnc_tx_queue) ||
+                if ((!list_empty(&conn->ksnc_tx_queue) ||
                      SOCK_WMEM_QUEUED(conn->ksnc_sock) != 0) &&
-                    cfs_time_aftereq (cfs_time_current(), 
-                                      conn->ksnc_tx_deadline)) {
+                    cfs_time_aftereq(cfs_time_current(),
+                                     conn->ksnc_tx_deadline)) {
                         /* Timed out messages queued for sending or
                          * buffered in the socket's send buffer */
-                        atomic_inc (&conn->ksnc_refcount);
-                        LCONSOLE_ERROR("A timeout occurred sending data to "
-                                       "%u.%u.%u.%u; the network or that node "
-                                       "may be down.\n",
-                                       HIPQUAD(conn->ksnc_ipaddr));
-                        CERROR ("Timed out TX to "LPX64" %s%d %p %d.%d.%d.%d\n",
-                                peer->ksnp_nid,
-                                list_empty (&conn->ksnc_tx_queue) ? "" : "Q ",
-                                SOCK_WMEM_QUEUED(conn->ksnc_sock), conn,
-                                HIPQUAD(conn->ksnc_ipaddr));
+                        ksocknal_conn_addref(conn);
+                        CDEBUG(D_NETERROR, "Timeout sending data to %s "
+                               "(%u.%u.%u.%u:%d) the network or that "
+                               "node may be down.\n",
+                               libcfs_id2str(peer->ksnp_id),
+                               HIPQUAD(conn->ksnc_ipaddr),
+                               conn->ksnc_port);
                         return (conn);
                 }
         }
@@ -2259,15 +2653,12 @@ ksocknal_check_peer_timeouts (int idx)
                 if (conn != NULL) {
                         read_unlock (&ksocknal_data.ksnd_global_lock);
 
-                        CERROR("Timeout out conn->"LPX64" ip %d.%d.%d.%d:%d\n",
-                               peer->ksnp_nid, HIPQUAD(conn->ksnc_ipaddr),
-                               conn->ksnc_port);
                         ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
 
                         /* NB we won't find this one again, but we can't
                          * just proceed with the next peer, since we dropped
                          * ksnd_global_lock and it might be dead already! */
-                        ksocknal_put_conn (conn);
+                        ksocknal_conn_decref(conn);
                         goto again;
                 }
         }
@@ -2279,7 +2670,6 @@ int
 ksocknal_reaper (void *arg)
 {
         cfs_waitlink_t     wait;
-        unsigned long      flags;
         ksock_conn_t      *conn;
         ksock_sched_t     *sched;
         struct list_head   enomem_conns;
@@ -2289,13 +2679,13 @@ ksocknal_reaper (void *arg)
         int                peer_index = 0;
         cfs_time_t         deadline = cfs_time_current();
 
-        kportal_daemonize ("ksocknal_reaper");
-        kportal_blockallsigs ();
+        cfs_daemonize ("socknal_reaper");
+        cfs_block_allsigs ();
 
         CFS_INIT_LIST_HEAD(&enomem_conns);
         cfs_waitlink_init (&wait);
 
-        spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+        spin_lock_bh (&ksocknal_data.ksnd_reaper_lock);
 
         while (!ksocknal_data.ksnd_shuttingdown) {
 
@@ -2303,13 +2693,13 @@ ksocknal_reaper (void *arg)
                         conn = list_entry (ksocknal_data.ksnd_deathrow_conns.next,
                                            ksock_conn_t, ksnc_list);
                         list_del (&conn->ksnc_list);
-
-                        spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+                        
+                        spin_unlock_bh (&ksocknal_data.ksnd_reaper_lock);
 
                         ksocknal_terminate_conn (conn);
-                        ksocknal_put_conn (conn);
+                        ksocknal_conn_decref(conn);
 
-                        spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+                        spin_lock_bh (&ksocknal_data.ksnd_reaper_lock);
                         continue;
                 }
 
@@ -2317,12 +2707,12 @@ ksocknal_reaper (void *arg)
                         conn = list_entry (ksocknal_data.ksnd_zombie_conns.next,
                                            ksock_conn_t, ksnc_list);
                         list_del (&conn->ksnc_list);
-
-                        spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+                        
+                        spin_unlock_bh (&ksocknal_data.ksnd_reaper_lock);
 
                         ksocknal_destroy_conn (conn);
 
-                        spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+                        spin_lock_bh (&ksocknal_data.ksnd_reaper_lock);
                         continue;
                 }
 
@@ -2331,7 +2721,7 @@ ksocknal_reaper (void *arg)
                         list_del_init(&ksocknal_data.ksnd_enomem_conns);
                 }
 
-                spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+                spin_unlock_bh (&ksocknal_data.ksnd_reaper_lock);
 
                 /* reschedule all the connections that stalled with ENOMEM... */
                 nenomem_conns = 0;
@@ -2342,14 +2732,14 @@ ksocknal_reaper (void *arg)
 
                         sched = conn->ksnc_scheduler;
 
-                        spin_lock_irqsave (&sched->kss_lock, flags);
+                        spin_lock_bh (&sched->kss_lock);
 
                         LASSERT (conn->ksnc_tx_scheduled);
                         conn->ksnc_tx_ready = 1;
-                        list_add_tail(&conn->ksnc_tx_list,&sched->kss_tx_conns);
+                        list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns);
                         cfs_waitq_signal (&sched->kss_waitq);
 
-                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+                        spin_unlock_bh (&sched->kss_lock);
                         nenomem_conns++;
                 }
 
@@ -2367,9 +2757,9 @@ ksocknal_reaper (void *arg)
                          * timeout on any connection within (n+1)/n times the
                          * timeout interval. */
 
-                        if (ksocknal_tunables.ksnd_io_timeout > n * p)
+                        if (*ksocknal_tunables.ksnd_timeout > n * p)
                                 chunk = (chunk * n * p) /
-                                        ksocknal_tunables.ksnd_io_timeout;
+                                        *ksocknal_tunables.ksnd_timeout;
                         if (chunk == 0)
                                 chunk = 1;
 
@@ -2397,25 +2787,16 @@ ksocknal_reaper (void *arg)
                 if (!ksocknal_data.ksnd_shuttingdown &&
                     list_empty (&ksocknal_data.ksnd_deathrow_conns) &&
                     list_empty (&ksocknal_data.ksnd_zombie_conns))
-                        cfs_waitq_timedwait (&wait, timeout);
+                        cfs_waitq_timedwait (&wait, CFS_TASK_INTERRUPTIBLE, timeout);
 
                 set_current_state (TASK_RUNNING);
                 cfs_waitq_del (&ksocknal_data.ksnd_reaper_waitq, &wait);
 
-                spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+                spin_lock_bh (&ksocknal_data.ksnd_reaper_lock);
         }
 
-        spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+        spin_unlock_bh (&ksocknal_data.ksnd_reaper_lock);
 
         ksocknal_thread_fini ();
         return (0);
 }
-
-lib_nal_t ksocknal_lib = {
-        libnal_data:       &ksocknal_data,      /* NAL private data */
-        libnal_send:        ksocknal_send,
-        libnal_send_pages:  ksocknal_send_pages,
-        libnal_recv:        ksocknal_recv,
-        libnal_recv_pages:  ksocknal_recv_pages,
-        libnal_dist:        ksocknal_dist
-};
index ada5b64..25d6b45 100644 (file)
 #include <netinet/tcp.h>
 #include <sys/file.h>
 
-#include "socknal.h"
+#include "socklnd.h"
 
-#if 0
-#undef SOCKNAL_SINGLE_FRAG_TX
-#define SOCKNAL_SINGLE_FRAG_TX  1
-#undef SOCKNAL_SINGLE_FRAG_RX
-#define SOCKNAL_SINGLE_FRAG_RX  1
-#endif
+# if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
 
-SYSCTL_DECL(_portals);
+SYSCTL_DECL(_lnet);
 
-SYSCTL_NODE (_portals,           OID_AUTO,       ksocknal,        CTLFLAG_RW, 
-             0,                 "ksocknal_sysctl");
+SYSCTL_NODE (_lnet,           OID_AUTO,         ksocknal,        CTLFLAG_RW, 
+             0,                                 "ksocknal_sysctl");
 
-SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       timeout, 
-           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_io_timeout, 
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         timeout, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_timeout, 
            0,                                   "timeout");
-SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       eager_ack, 
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         credits, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_credits, 
+           0,                                   "credits");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         peer_credits, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_peercredits, 
+           0,                                   "peer_credits");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         nconnds, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_nconnds, 
+           0,                                   "nconnds");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         min_reconnectms, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_min_reconnectms, 
+           0,                                   "min_reconnectms");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         max_reconnectms, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_max_reconnectms, 
+           0,                                   "max_reconnectms");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         eager_ack, 
            CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_eager_ack, 
            0,                                   "eager_ack");
-SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       typed, 
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         typed, 
            CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_typed_conns, 
            0,                                   "typed");
-SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       min_bulk, 
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         min_bulk, 
            CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_min_bulk, 
            0,                                   "min_bulk");
-SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       buffer_size, 
-           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_buffer_size, 
-           0,                                   "buffer_size");
-SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       nagle, 
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         rx_buffer_size, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_rx_buffer_size, 
+           0,                                   "rx_buffer_size");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         tx_buffer_size, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_tx_buffer_size, 
+           0,                                   "tx_buffer_size");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         nagle, 
            CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_nagle, 
            0,                                   "nagle");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         keepalive_idle, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_keepalive_idle, 
+           0,                                   "keepalive_idle");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         keepalive_count, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_keepalive_count, 
+           0,                                   "keepalive_count");
+SYSCTL_INT(_lnet_ksocknal,    OID_AUTO,         keepalive_intvl, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_keepalive_intvl, 
+           0,                                   "keepalive_intvl");
 
 cfs_sysctl_table_t      ksocknal_top_ctl_table [] = {
-        &sysctl__portals_ksocknal,
-        &sysctl__portals_ksocknal_timeout,
-        &sysctl__portals_ksocknal_eager_ack,
-        &sysctl__portals_ksocknal_typed,
-        &sysctl__portals_ksocknal_min_bulk,
-        &sysctl__portals_ksocknal_buffer_size,
-        &sysctl__portals_ksocknal_nagle,
+        &sysctl__lnet_ksocknal,
+        &sysctl__lnet_ksocknal_timeout,
+        &sysctl__lnet_ksocknal_credits,
+        &sysctl__lnet_ksocknal_peer_credits,
+        &sysctl__lnet_ksocknal_nconnds,
+        &sysctl__lnet_ksocknal_min_reconnectms,
+        &sysctl__lnet_ksocknal_max_reconnectms,
+        &sysctl__lnet_ksocknal_eager_ack,
+        &sysctl__lnet_ksocknal_typed,
+        &sysctl__lnet_ksocknal_min_bulk,
+        &sysctl__lnet_ksocknal_rx_buffer_size,
+        &sysctl__lnet_ksocknal_tx_buffer_size,
+        &sysctl__lnet_ksocknal_nagle,
+        &sysctl__lnet_ksocknal_keepalive_idle,
+        &sysctl__lnet_ksocknal_keepalive_count,
+        &sysctl__lnet_ksocknal_keepalive_intvl,
         NULL
 };
 
-static unsigned long  ksocknal_mbuf_size = (u_quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES);
-
-struct socket *
-sockfd_lookup(int fd, void *foo)
+int
+ksocknal_lib_tunables_init ()
 {
-       struct socket *so;
-       struct file *fp;
-        CFS_DECL_FUNNEL_DATA;
+        ksocknal_tunables.ksnd_sysctl =
+                cfs_register_sysctl_table (ksocknal_top_ctl_table, 0);
 
-        CFS_NET_IN;
-       getsock(current_proc()->p_fd, fd, &fp);
-        CFS_NET_EX;
-       so = (struct socket *)fp->f_data;
-       so->reserved4 = fp;
-        CFS_CONE_IN;
-       fref(fp);
-        CFS_CONE_EX;
-       return so;
-}
+        if (ksocknal_tunables.ksnd_sysctl == NULL)
+               return -ENOMEM;
 
-extern struct fileops socketops;
+       return 0;
+}
 
-static int
-sock_map_fd (struct socket *so)
+void
+ksocknal_lib_tunables_fini ()
 {
-       struct file *fp;
-       int fd;
-        CFS_DECL_FUNNEL_DATA;
-       
-        CFS_CONE_IN;
-       falloc(current_proc(), &fp, &fd);
-       fp->f_flag = FREAD|FWRITE;
-       fp->f_type = DTYPE_SOCKET;
-       fp->f_ops = &socketops;
-       fp->f_data = (caddr_t)so;
-       so->reserved4 = fp;
-       *fdflags(current_proc(), fd) &= ~UF_RESERVED;
-        CFS_CONE_EX;
-
-       return fd;
+        if (ksocknal_tunables.ksnd_sysctl != NULL)
+                cfs_unregister_sysctl_table (ksocknal_tunables.ksnd_sysctl);   
 }
-
-static void
-sock_release(struct socket *so)
+#else
+int
+ksocknal_lib_tunables_init ()
 {
-       struct file *fp;
-        CFS_DECL_FUNNEL_DATA;
-
-       fp = (struct file *)so->reserved4;
-       so->reserved4 = NULL;
-       fp->f_data = NULL;
-        CFS_CONE_IN;
-       frele(fp);
-        CFS_CONE_EX;
-        CFS_NET_IN;
-       soshutdown(so, 0);
-        CFS_NET_EX;
+       return 0;
 }
 
-static void
-sock_fdrelse(int fd)
-{ 
-        CFS_DECL_FUNNEL_DATA;
-
-        CFS_CONE_IN;
-        fdrelse(current_proc(), fd);
-        CFS_CONE_EX;
+void
+ksocknal_lib_tunables_fini ()
+{
 }
+#endif
+
+/*
+ * To use bigger buffer for socket:
+ * 1. Increase nmbclusters (cannot be increased by sysctl because it is
+ *    read-only, so we must patch the kernel).
+ * 2. Increase net.inet.tcp.reass.maxsegments
+ * 3. Increase net.inet.tcp.sendspace
+ * 4. Increase net.inet.tcp.recvspace
+ * 5. Increase kern.ipc.maxsockbuf
+ */
+#define KSOCKNAL_MAX_BUFFER        (1152*1024)
 
 void
 ksocknal_lib_bind_irq (unsigned int irq)
@@ -148,7 +152,7 @@ ksocknal_lib_bind_irq (unsigned int irq)
 }
 
 unsigned int
-ksocknal_lib_sock_irq (struct socket *sock)
+ksocknal_lib_sock_irq (cfs_socket_t *sock)
 {
         return 0;
 }
@@ -156,46 +160,374 @@ ksocknal_lib_sock_irq (struct socket *sock)
 int
 ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
 { 
-        struct sockaddr_in *sin; 
-        struct sockaddr    *sa; 
-        int                rc; 
-        CFS_DECL_NET_DATA;
+        int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1,
+                                     &conn->ksnc_ipaddr,
+                                     &conn->ksnc_port);
 
-        CFS_NET_IN; 
-        rc = conn->ksnc_sock->so_proto->pr_usrreqs->pru_peeraddr(conn->ksnc_sock, &sa); 
-        LASSERT (!conn->ksnc_closing); 
-        if (rc != 0) { 
-                CFS_NET_EX; 
-                if (sa) FREE(sa, M_SONAME); 
-                CERROR ("Error %d getting sock peer IP\n", rc); 
-                return rc; 
+        /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
+        LASSERT (!conn->ksnc_closing);
+
+        if (rc != 0) {
+                CERROR ("Error %d getting sock peer IP\n", rc);
+                return rc;
+        }
+
+        rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
+                                 &conn->ksnc_myipaddr, NULL);
+        if (rc != 0) {
+                CERROR ("Error %d getting sock local IP\n", rc);
+                return rc;
+        }
+
+        return 0;
+}
+
+#ifdef __DARWIN8__
+
+int
+ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        socket_t        sock = C2B_SOCK(conn->ksnc_sock);
+        size_t          sndlen;
+        int             nob;
+        int             rc;
+
+#if SOCKNAL_SINGLE_FRAG_TX
+        struct iovec    scratch;
+        struct iovec   *scratchiov = &scratch;
+        unsigned int    niov = 1;
+#else
+        struct iovec   *scratchiov = conn->ksnc_tx_scratch_iov;
+        unsigned int    niov = tx->tx_niov;
+#endif
+        struct msghdr msg = {
+                .msg_name       = NULL,
+                .msg_namelen    = 0,
+                .msg_iov        = scratchiov,
+                .msg_iovlen     = niov,
+                .msg_control    = NULL,
+                .msg_controllen = 0,
+                .msg_flags      = MSG_DONTWAIT
+        };
+        
+        int  i;
+        
+        for (nob = i = 0; i < niov; i++) {
+                scratchiov[i] = tx->tx_iov[i];
+                nob += scratchiov[i].iov_len;
         } 
-        sin = (struct sockaddr_in *)sa; 
-        conn->ksnc_ipaddr = ntohl (sin->sin_addr.s_addr); 
-        conn->ksnc_port = ntohs (sin->sin_port); 
-        if (sa) FREE(sa, M_SONAME); 
-        rc = conn->ksnc_sock->so_proto->pr_usrreqs->pru_sockaddr(conn->ksnc_sock, &sa); 
-        CFS_NET_EX; 
-        if (rc != 0) { 
-                if (sa) FREE(sa, M_SONAME); 
-                CERROR ("Error %d getting sock local IP\n", rc); 
-                return rc; 
+        
+        /* 
+         * XXX Liang:
+         * Linux has MSG_MORE, do we have anything to
+         * reduce number of partial TCP segments sent?
+         */
+        rc = -sock_send(sock, &msg, MSG_DONTWAIT, &sndlen);
+        if (rc == 0)
+                rc = sndlen;
+        return rc;
+}
+
+int
+ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        socket_t       sock = C2B_SOCK(conn->ksnc_sock);
+        lnet_kiov_t   *kiov = tx->tx_kiov;
+        int            rc;
+        int            nob;
+        size_t         sndlen;
+
+#if SOCKNAL_SINGLE_FRAG_TX
+        struct iovec  scratch;
+        struct iovec *scratchiov = &scratch;
+        unsigned int  niov = 1;
+#else
+        struct iovec *scratchiov = conn->ksnc_tx_scratch_iov;
+        unsigned int  niov = tx->tx_nkiov;
+#endif
+        struct msghdr msg = {
+                .msg_name       = NULL,
+                .msg_namelen    = 0,
+                .msg_iov        = scratchiov,
+                .msg_iovlen     = niov,
+                .msg_control    = NULL,
+                .msg_controllen = 0,
+                .msg_flags      = MSG_DONTWAIT
+        };
+        
+        int           i;
+        
+        for (nob = i = 0; i < niov; i++) {
+                scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) +
+                                         kiov[i].kiov_offset;
+                nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+        }
+
+        /* 
+         * XXX Liang:
+         * Linux has MSG_MORE, do we have anything to
+         * reduce number of partial TCP segments sent?
+         */
+        rc = -sock_send(sock, &msg, MSG_DONTWAIT, &sndlen);
+        for (i = 0; i < niov; i++)
+                cfs_kunmap(kiov[i].kiov_page);
+        if (rc == 0)
+                rc = sndlen;
+        return rc;
+}
+
+int
+ksocknal_lib_recv_iov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+        struct iovec  scratch;
+        struct iovec *scratchiov = &scratch;
+        unsigned int  niov = 1;
+#else
+        struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
+        unsigned int  niov = conn->ksnc_rx_niov;
+#endif
+        struct iovec *iov = conn->ksnc_rx_iov;
+        struct msghdr msg = {
+                .msg_name       = NULL,
+                .msg_namelen    = 0,
+                .msg_iov        = scratchiov,
+                .msg_iovlen     = niov,
+                .msg_control    = NULL,
+                .msg_controllen = 0,
+                .msg_flags      = 0
+        };
+        size_t       rcvlen;
+        int          nob;
+        int          i;
+        int          rc;
+
+        LASSERT (niov > 0);
+
+        for (nob = i = 0; i < niov; i++) {
+                scratchiov[i] = iov[i];
+                nob += scratchiov[i].iov_len;
+        }
+        LASSERT (nob <= conn->ksnc_rx_nob_wanted); 
+        rc = -sock_receive (C2B_SOCK(conn->ksnc_sock), &msg, MSG_DONTWAIT, &rcvlen);
+        if (rc == 0)
+                rc = rcvlen;
+
+        return rc;
+}
+
+int
+ksocknal_lib_recv_kiov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+        struct iovec  scratch;
+        struct iovec *scratchiov = &scratch;
+        unsigned int  niov = 1;
+#else
+        struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
+        unsigned int  niov = conn->ksnc_rx_nkiov;
+#endif
+        lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
+        struct msghdr msg = {
+                .msg_name       = NULL,
+                .msg_namelen    = 0,
+                .msg_iov        = scratchiov,
+                .msg_iovlen     = niov,
+                .msg_control    = NULL,
+                .msg_controllen = 0,
+                .msg_flags      = 0
+        };
+        int          nob;
+        int          i;
+        size_t       rcvlen;
+        int          rc;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone. */
+        for (nob = i = 0; i < niov; i++) {
+                scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) + \
+                                         kiov[i].kiov_offset;
+                nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+        }
+        LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+        rc = -sock_receive(C2B_SOCK(conn->ksnc_sock), &msg, MSG_DONTWAIT, &rcvlen); 
+        for (i = 0; i < niov; i++)
+                cfs_kunmap(kiov[i].kiov_page); 
+        if (rc == 0)
+                rc = rcvlen;
+        return (rc);
+}
+
+void
+ksocknal_lib_eager_ack (ksock_conn_t *conn)
+{
+        /* XXX Liang: */
+}
+
+int
+ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
+{
+        socket_t       sock = C2B_SOCK(conn->ksnc_sock);
+        int            len;
+        int            rc;
+
+        rc = ksocknal_connsock_addref(conn);
+        if (rc != 0) {
+                LASSERT (conn->ksnc_closing);
+                *txmem = *rxmem = *nagle = 0;
+                return (-ESHUTDOWN);
+        }
+        rc = libcfs_sock_getbuf(conn->ksnc_sock, txmem, rxmem);
+        if (rc == 0) {
+                len = sizeof(*nagle);
+                rc = -sock_getsockopt(sock, IPPROTO_TCP, TCP_NODELAY,
+                                      nagle, &len);
+        }
+        ksocknal_connsock_decref(conn);
+
+        if (rc == 0)
+                *nagle = !*nagle;
+        else
+                *txmem = *rxmem = *nagle = 0;
+
+        return (rc);
+}
+
+int
+ksocknal_lib_setup_sock (cfs_socket_t *sock)
+{
+        int             rc; 
+        int             option; 
+        int             keep_idle; 
+        int             keep_intvl; 
+        int             keep_count; 
+        int             do_keepalive; 
+        socket_t        so = C2B_SOCK(sock);
+        struct linger   linger;
+
+        /* Ensure this socket aborts active sends immediately when we close
+         * it. */
+        linger.l_onoff = 0;
+        linger.l_linger = 0;
+        rc = -sock_setsockopt(so, SOL_SOCKET, SO_LINGER, &linger, sizeof(linger));
+        if (rc != 0) {
+                CERROR ("Can't set SO_LINGER: %d\n", rc);
+                return (rc);
+        }
+
+        if (!*ksocknal_tunables.ksnd_nagle) { 
+                option = 1; 
+                rc = -sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &option, sizeof(option));
+                if (rc != 0) { 
+                        CERROR ("Can't disable nagle: %d\n", rc); 
+                        return (rc);
+                } 
         } 
-        conn->ksnc_myipaddr = ntohl (sin->sin_addr.s_addr);
 
-        return 0;
+        rc = libcfs_sock_setbuf(sock,
+                                *ksocknal_tunables.ksnd_tx_buffer_size,
+                                *ksocknal_tunables.ksnd_rx_buffer_size);
+        if (rc != 0) {
+                CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
+                        *ksocknal_tunables.ksnd_tx_buffer_size,
+                        *ksocknal_tunables.ksnd_rx_buffer_size, rc);
+                return (rc);
+        }
+
+        /* snapshot tunables */ 
+        keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle; 
+        keep_count = *ksocknal_tunables.ksnd_keepalive_count; 
+        keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
+
+        do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); 
+        option = (do_keepalive ? 1 : 0); 
+
+        rc = -sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &option, sizeof(option)); 
+        if (rc != 0) { 
+                CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); 
+                return (rc);
+        }
+        
+        if (!do_keepalive)
+                return (rc);
+        rc = -sock_setsockopt(so, IPPROTO_TCP, TCP_KEEPALIVE, 
+                              &keep_idle, sizeof(keep_idle));
+        
+        return (rc);
+}
+
+void
+ksocknal_lib_push_conn(ksock_conn_t *conn)
+{ 
+        socket_t        sock; 
+        int             val = 1; 
+        int             rc; 
+        
+        rc = ksocknal_connsock_addref(conn); 
+        if (rc != 0)            /* being shut down */ 
+                return; 
+        sock = C2B_SOCK(conn->ksnc_sock); 
+
+        rc = -sock_setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)); 
+        LASSERT(rc == 0);
+
+        ksocknal_connsock_decref(conn);
+        return;
+}
+
+extern void ksocknal_read_callback (ksock_conn_t *conn);
+extern void ksocknal_write_callback (ksock_conn_t *conn);
+
+static void
+ksocknal_upcall(socket_t so, void *arg, int waitf)
+{
+        ksock_conn_t  *conn = (ksock_conn_t *)arg;
+        ENTRY;
+
+        read_lock (&ksocknal_data.ksnd_global_lock);
+        if (conn == NULL)
+                goto out;
+
+        ksocknal_read_callback (conn);
+        /* XXX Liang */
+        ksocknal_write_callback (conn);
+out:
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+        EXIT;
+}
+
+void
+ksocknal_lib_save_callback(cfs_socket_t *sock, ksock_conn_t *conn)
+{ 
+        /* No callback needs to be saved on OS X */
+        return;
+}
+
+void
+ksocknal_lib_set_callback(cfs_socket_t *sock, ksock_conn_t *conn)
+{ 
+        libcfs_sock_set_cb(sock, ksocknal_upcall, (void *)conn);
+        return;
+}
+
+void 
+ksocknal_lib_reset_callback(cfs_socket_t *sock, ksock_conn_t *conn)
+{ 
+        libcfs_sock_reset_cb(sock);
 }
 
+#else /* !__DARWIN8__ */
+
 int
 ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
 { 
 #if SOCKNAL_SINGLE_FRAG_TX 
         struct iovec    scratch; 
         struct iovec   *scratchiov = &scratch; 
-        int             niov = 1;
+        unsigned int    niov = 1;
 #else 
         struct iovec   *scratchiov = conn->ksnc_tx_scratch_iov; 
-        int             niov = tx->tx_niov;
+        unsigned int    niov = tx->tx_niov;
 #endif
         struct socket *sock = conn->ksnc_sock;
         int            nob;
@@ -248,13 +580,13 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
 #if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK 
         struct iovec  scratch; 
         struct iovec *scratchiov = &scratch; 
-        int           niov = 1;
+        unsigned int  niov = 1;
 #else
         struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; 
-        int           niov = tx->tx_nkiov;
+        unsigned int  niov = tx->tx_nkiov;
 #endif
         struct socket *sock = conn->ksnc_sock;
-        ptl_kiov_t    *kiov = tx->tx_kiov;
+        lnet_kiov_t    *kiov = tx->tx_kiov;
         int            nob;
         int            rc;
         int            i;
@@ -364,6 +696,10 @@ ksocknal_lib_eager_ack (ksock_conn_t *conn)
         CFS_NET_IN;
         s = splnet();
 
+        /*
+         * No TCP_QUICKACK supported in BSD, so I have to call tcp_fasttimo
+         * to send immediate ACK. 
+         */
         if (tp && tp->t_flags & TF_DELACK){
                 tp->t_flags &= ~TF_DELACK;
                 tp->t_flags |= TF_ACKNOW;
@@ -371,14 +707,6 @@ ksocknal_lib_eager_ack (ksock_conn_t *conn)
         }
         splx(s);
 
-        /*
-         * No TCP_QUICKACK supported in BSD, so I have to call tcp_fasttimo
-         * to send immediate ACK. It's not the best resolution because
-         * tcp_fasttimo will send out ACK for all delayed-ack tcp socket.
-         * Anyway, it's working now. 
-         * extern void tcp_fasttimo(); 
-         * tcp_fasttimo();
-         */
         CFS_NET_EX;
 
         return;
@@ -390,10 +718,10 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn)
 #if SOCKNAL_SINGLE_FRAG_RX 
         struct iovec  scratch; 
         struct iovec *scratchiov = &scratch; 
-        int           niov = 1;
+        unsigned int  niov = 1;
 #else 
         struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; 
-        int           niov = conn->ksnc_rx_niov;
+        unsigned int  niov = conn->ksnc_rx_niov;
 #endif
         struct iovec *iov = conn->ksnc_rx_iov;
         int          nob;
@@ -444,12 +772,12 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn)
 #if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK 
         struct iovec  scratch; 
         struct iovec *scratchiov = &scratch; 
-        int           niov = 1;
+        unsigned int  niov = 1;
 #else 
         struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; 
-        int           niov = conn->ksnc_rx_nkiov;
+        unsigned int  niov = conn->ksnc_rx_nkiov;
 #endif
-        ptl_kiov_t    *kiov = conn->ksnc_rx_kiov;
+        lnet_kiov_t    *kiov = conn->ksnc_rx_kiov;
         int           nob;
         int           rc;
         int           i;
@@ -497,138 +825,43 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn)
 }
 
 int
-ksocknal_lib_sock_write (struct socket *sock, void *buffer, int nob)
-{
-        int           rc;
-        CFS_DECL_NET_DATA;
-
-        while (nob > 0) {
-                struct iovec  iov = {
-                        .iov_base = buffer,
-                        .iov_len  = nob
-                };
-                struct  uio suio = {
-                        .uio_iov        = &iov,
-                        .uio_iovcnt     = 1,
-                        .uio_offset     = 0,
-                        .uio_resid      = nob,
-                        .uio_segflg     = UIO_SYSSPACE,
-                        .uio_rw         = UIO_WRITE,
-                        .uio_procp      = NULL
-                };
-
-                CFS_NET_IN;
-                rc = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0, 0);
-                CFS_NET_EX;
-
-                if (rc != 0) {
-                        if ( suio.uio_resid != nob && ( rc == ERESTART || rc == EINTR ||\
-                                                rc == EWOULDBLOCK))
-                                rc = 0;
-                        if ( rc != 0 )
-                                return -rc;
-                        rc = nob - suio.uio_resid;
-                        buffer = ((char *)buffer) + rc;
-                        nob = suio.uio_resid;
-                        continue;
-                }
-                break;
-        }
-
-        return (0);
-}
-
-int
-ksocknal_lib_sock_read (struct socket *sock, void *buffer, int nob)
-{
-        int           rc;
-        CFS_DECL_NET_DATA;
-
-        while (nob > 0) {
-                struct iovec  iov = {
-                        .iov_base = buffer,
-                        .iov_len  = nob
-                };
-                struct uio  ruio = {
-                        .uio_iov        = &iov,
-                        .uio_iovcnt     = 1,
-                        .uio_offset     = 0,
-                        .uio_resid      = nob,
-                        .uio_segflg     = UIO_SYSSPACE,
-                        .uio_rw         = UIO_READ,
-                        .uio_procp      = NULL
-                };
-
-                CFS_NET_IN;
-                rc = soreceive(sock, (struct sockaddr **)0, &ruio, (struct mbuf **)0, (struct mbuf **)0, (int *)0);
-                CFS_NET_EX;
-
-                if (rc != 0) {
-                        if ( ruio.uio_resid != nob && ( rc == ERESTART || rc == EINTR ||\
-                                                rc == EWOULDBLOCK))
-                                rc = 0;
-                        if (rc != 0)
-                                return -rc;
-                        rc = nob - ruio.uio_resid;
-                        buffer = ((char *)buffer) + rc;
-                        nob = ruio.uio_resid;
-                        continue;
-                }
-                break;
-        }
-
-        return (0);
-}
-
-int
 ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
 {
-        struct sockopt  sopt;
         struct socket *sock = conn->ksnc_sock;
-        int            len;
         int            rc;
-        CFS_DECL_NET_DATA;
 
-        rc = ksocknal_getconnsock (conn);
+        rc = ksocknal_connsock_addref(conn);
         if (rc != 0) {
                 LASSERT (conn->ksnc_closing);
                 *txmem = *rxmem = *nagle = 0;
-                rc = -ESHUTDOWN;
-                goto out;
-        }
-        len = sizeof(*txmem);
-        bzero(&sopt, sizeof sopt);
-        sopt.sopt_dir = SOPT_GET; 
-        sopt.sopt_level = SOL_SOCKET; 
-        sopt.sopt_name = SO_SNDBUF; 
-        sopt.sopt_val = txmem; 
-        sopt.sopt_valsize = len;
-
-        CFS_NET_IN;
-        rc = sogetopt(sock, &sopt);
-        if (rc == 0) {
-                len = sizeof(*rxmem);
-                sopt.sopt_name = SO_RCVBUF;
-                sopt.sopt_val = rxmem;
-                rc = sogetopt(sock, &sopt);
+                return -ESHUTDOWN;
         }
+        rc = libcfs_sock_getbuf(sock, txmem, rxmem);
         if (rc == 0) {
+                struct sockopt  sopt;
+                int            len;
+                CFS_DECL_NET_DATA;
+
                 len = sizeof(*nagle);
+                bzero(&sopt, sizeof sopt);
+                sopt.sopt_dir = SOPT_GET; 
                 sopt.sopt_level = IPPROTO_TCP;
                 sopt.sopt_name = TCP_NODELAY;
                 sopt.sopt_val = nagle;
-                rc = sogetopt(sock, &sopt);
+                sopt.sopt_valsize = len;
+
+                CFS_NET_IN;
+                rc = -sogetopt(sock, &sopt);
+                CFS_NET_EX;
         }
-        CFS_NET_EX;
 
-        ksocknal_putconnsock (conn);
+        ksocknal_connsock_decref(conn);
 
         if (rc == 0)
                 *nagle = !*nagle;
         else
                 *txmem = *rxmem = *nagle = 0;
-out:
-        return (-rc);
+        return (rc);
 }
 
 int
@@ -644,9 +877,18 @@ ksocknal_lib_setup_sock (struct socket *so)
         struct linger   linger;
         CFS_DECL_NET_DATA;
 
+        rc = libcfs_sock_setbuf(so,
+                                *ksocknal_tunables.ksnd_tx_buffer_size,
+                                *ksocknal_tunables.ksnd_rx_buffer_size);
+        if (rc != 0) {
+                CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
+                        *ksocknal_tunables.ksnd_tx_buffer_size,
+                        *ksocknal_tunables.ksnd_rx_buffer_size, rc);
+                return (rc);
+        }
+
         /* Ensure this socket aborts active sends immediately when we close
          * it. */
-
         bzero(&sopt, sizeof sopt);
 
         linger.l_onoff = 0;
@@ -658,14 +900,13 @@ ksocknal_lib_setup_sock (struct socket *so)
         sopt.sopt_valsize = sizeof(linger);
 
         CFS_NET_IN;
-        rc = sosetopt(so, &sopt);
+        rc = -sosetopt(so, &sopt);
         if (rc != 0) {
                 CERROR ("Can't set SO_LINGER: %d\n", rc);
                 goto out;
         }
 
-
-        if (!ksocknal_tunables.ksnd_nagle) { 
+        if (!*ksocknal_tunables.ksnd_nagle) { 
                 option = 1; 
                 bzero(&sopt, sizeof sopt);
                 sopt.sopt_dir = SOPT_SET; 
@@ -673,41 +914,17 @@ ksocknal_lib_setup_sock (struct socket *so)
                 sopt.sopt_name = TCP_NODELAY; 
                 sopt.sopt_val = &option; 
                 sopt.sopt_valsize = sizeof(option);
-                rc = sosetopt(so, &sopt);
+                rc = -sosetopt(so, &sopt);
                 if (rc != 0) { 
                         CERROR ("Can't disable nagle: %d\n", rc); 
                         goto out;
                 } 
         } 
-        if (ksocknal_tunables.ksnd_buffer_size > 0) { 
-                option = ksocknal_tunables.ksnd_buffer_size; 
-                if (option > ksocknal_mbuf_size) 
-                        option = ksocknal_mbuf_size; 
-                                                
-                sopt.sopt_dir = SOPT_SET; 
-                sopt.sopt_level = SOL_SOCKET; 
-                sopt.sopt_name = SO_SNDBUF; 
-                sopt.sopt_val = &option; 
-                sopt.sopt_valsize = sizeof(option); 
-                rc = sosetopt(so, &sopt); 
-                if (rc != 0) { 
-                        CERROR ("Can't set send buffer %d: %d\n", 
-                                        option, rc); 
-                        goto out;
-                } 
-                
-                sopt.sopt_name = SO_RCVBUF; 
-                rc = sosetopt(so, &sopt); 
-                if (rc != 0) { 
-                        CERROR ("Can't set receive buffer %d: %d\n", 
-                                        option, rc); 
-                        goto out;
-                }
-        } 
+
         /* snapshot tunables */ 
-        keep_idle  = ksocknal_tunables.ksnd_keepalive_idle; 
-        keep_count = ksocknal_tunables.ksnd_keepalive_count; 
-        keep_intvl = ksocknal_tunables.ksnd_keepalive_intvl;
+        keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle; 
+        keep_count = *ksocknal_tunables.ksnd_keepalive_count; 
+        keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
 
         do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); 
         option = (do_keepalive ? 1 : 0); 
@@ -717,7 +934,7 @@ ksocknal_lib_setup_sock (struct socket *so)
         sopt.sopt_name = SO_KEEPALIVE; 
         sopt.sopt_val = &option; 
         sopt.sopt_valsize = sizeof(option); 
-        rc = sosetopt(so, &sopt); 
+        rc = -sosetopt(so, &sopt); 
         if (rc != 0) { 
                 CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); 
                 goto out; 
@@ -735,161 +952,14 @@ ksocknal_lib_setup_sock (struct socket *so)
         sopt.sopt_name = TCP_KEEPALIVE; 
         sopt.sopt_val = &keep_idle; 
         sopt.sopt_valsize = sizeof(keep_idle); 
-        rc = sosetopt(so, &sopt); 
+        rc = -sosetopt(so, &sopt); 
         if (rc != 0) { 
                 CERROR ("Can't set TCP_KEEPALIVE : %d\n", rc); 
                 goto out; 
         }
 out:
         CFS_NET_EX;
-        return (-rc);
-}
-
-int
-ksocknal_lib_connect_sock (struct socket **sockp, int *may_retry, 
-                           ksock_route_t *route, int local_port)
-{
-        struct sockaddr_in  locaddr;
-        struct sockaddr_in  srvaddr;
-        struct timeval      tv;
-        int                 fd;
-        struct socket      *so;
-        struct sockopt      sopt;
-        int                 option;
-        int                 rc;
-        int                 s;
-        CFS_DECL_FUNNEL_DATA;
-
-        ENTRY; 
-        bzero (&locaddr, sizeof (locaddr)); 
-        locaddr.sin_len = sizeof(struct sockaddr_in); 
-        locaddr.sin_family = AF_INET; 
-        locaddr.sin_port = htons (local_port);
-        locaddr.sin_addr.s_addr = 
-                (route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr)
-                                            : INADDR_ANY;
-        bzero(&srvaddr, sizeof(srvaddr));
-        srvaddr.sin_len = sizeof(struct sockaddr_in);
-        srvaddr.sin_family = AF_INET;
-        srvaddr.sin_port = htons (route->ksnr_port);
-        srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr);
-
-        *may_retry = 0;
-
-        CFS_NET_IN;
-        rc = socreate(PF_INET, &so, SOCK_STREAM, 0); 
-        CFS_NET_EX;
-        *sockp = so;
-        if (rc != 0) {
-                CERROR ("Can't create autoconnect socket: %d\n", rc);
-                return (-rc);
-        }
-
-        /*
-         * XXX
-         * Liang: what do we need here? 
-         */
-        fd = sock_map_fd (so);
-        if (fd < 0) {
-                sock_release (so);
-                CERROR ("sock_map_fd error %d\n", fd);
-                return (fd);
-        }
-        sock_fdrelse(fd);
-
-        /* Set the socket timeouts, so our connection attempt completes in
-         * finite time */
-        tv.tv_sec = ksocknal_tunables.ksnd_io_timeout;
-        tv.tv_usec = 0;
-        bzero(&sopt, sizeof sopt);
-        sopt.sopt_dir = SOPT_SET;
-        sopt.sopt_level = SOL_SOCKET;
-        sopt.sopt_name = SO_SNDTIMEO;
-        sopt.sopt_val = &tv;
-        sopt.sopt_valsize = sizeof(tv);
-
-        CFS_NET_IN;
-        rc = sosetopt(so, &sopt);
-        if (rc != 0) { 
-                CFS_NET_EX;
-                CERROR ("Can't set send timeout %d: %d\n",
-                        ksocknal_tunables.ksnd_io_timeout, rc);
-                goto out;
-        }
-        sopt.sopt_level = SOL_SOCKET;
-        sopt.sopt_name = SO_RCVTIMEO;
-        rc = sosetopt(so, &sopt);
-        if (rc != 0) {
-                CFS_NET_EX;
-                CERROR ("Can't set receive timeout %d: %d\n",
-                        ksocknal_tunables.ksnd_io_timeout, rc);
-                goto out;
-        } 
-        option = 1;
-        sopt.sopt_level = SOL_SOCKET;
-        sopt.sopt_name = SO_REUSEADDR;
-        sopt.sopt_val = &option;
-        sopt.sopt_valsize = sizeof(option);
-        rc = sosetopt(so, &sopt);
-        if (rc != 0) {
-                CFS_NET_EX;
-                CERROR ("Can't set sock reuse address: %d\n", rc);
-                goto out;
-        } 
-        rc = sobind(so, (struct sockaddr *)&locaddr); 
-        if (rc == EADDRINUSE) { 
-                CFS_NET_EX; 
-                CDEBUG(D_NET, "Port %d already in use\n", local_port); 
-                *may_retry = 1; 
-                goto out;
-        }
-        if (rc != 0) { 
-                CFS_NET_EX; 
-                CERROR ("Can't bind to local IP Address %u.%u.%u.%u: %d\n", 
-                        HIPQUAD(route->ksnr_myipaddr), rc); 
-                goto out; 
-        }
-        rc = soconnect(so, (struct sockaddr *)&srvaddr);
-        *may_retry = (rc == EADDRNOTAVAIL || rc == EADDRINUSE);
-        if (rc != 0) { 
-                CFS_NET_EX;
-                if (rc != EADDRNOTAVAIL && rc != EADDRINUSE)
-                        CERROR ("Can't connect to nid "LPX64 
-                                " local IP: %u.%u.%u.%u," 
-                                " remote IP: %u.%u.%u.%u/%d: %d\n", 
-                                route->ksnr_peer->ksnp_nid, 
-                                HIPQUAD(route->ksnr_myipaddr), 
-                                HIPQUAD(route->ksnr_ipaddr), 
-                                route->ksnr_port, rc); 
-                goto out;
-        }
-
-        s = splnet();
-        while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
-                CDEBUG(D_NET, "ksocknal sleep for waiting auto_connect.\n");
-                (void) tsleep((caddr_t)&so->so_timeo, PSOCK, "ksocknal_conn", hz);
-        }
-        LASSERT((so->so_state & SS_ISCONNECTED));
-        splx(s);
-        CFS_NET_EX;
-
-        rc = so->so_error; 
-        if (rc != 0) { 
-                CERROR ("Error %d waiting for connection to nid "LPX64 
-                        " local IP: %u.%u.%u.%u," 
-                        " remote IP: %u.%u.%u.%u/%d: %d\n", rc,
-                        route->ksnr_peer->ksnp_nid, 
-                        HIPQUAD(route->ksnr_myipaddr), 
-                        HIPQUAD(route->ksnr_ipaddr), 
-                        route->ksnr_port, rc); 
-                goto out; 
-        }
-        return (-rc);
-
- out:
-        rele_file(KSN_SOCK2FILE(so));
-
-        return (-rc);
+        return (rc);
 }
 
 void
@@ -901,7 +971,7 @@ ksocknal_lib_push_conn(ksock_conn_t *conn)
         int             rc; 
         CFS_DECL_NET_DATA; 
         
-        rc = ksocknal_getconnsock (conn); 
+        rc = ksocknal_connsock_addref(conn); 
         if (rc != 0)            /* being shut down */ 
                 return; 
         sock = conn->ksnc_sock; 
@@ -916,47 +986,36 @@ ksocknal_lib_push_conn(ksock_conn_t *conn)
         sosetopt(sock, &sopt); 
         CFS_NET_EX; 
 
-        ksocknal_putconnsock (conn);
+        ksocknal_connsock_decref(conn);
         return;
 }
 
+
 extern void ksocknal_read_callback (ksock_conn_t *conn);
 extern void ksocknal_write_callback (ksock_conn_t *conn);
 
 static void
 ksocknal_upcall(struct socket *so, caddr_t arg, int waitf)
 {
-        ksock_conn_t  *conn;
-        CFS_DECL_NET_DATA;
+        ksock_conn_t  *conn = (ksock_conn_t *)arg;
         ENTRY;
 
         read_lock (&ksocknal_data.ksnd_global_lock);
-        conn = so->reserved3;
-
-        if (conn == NULL){
-                /* More processing is needed?  */
+        if (conn == NULL)
                 goto out;
-        }
-        if ((so->so_rcv.sb_flags & SB_UPCALL) || !arg ) {
+
+        if (so->so_rcv.sb_flags & SB_UPCALL) {
                 extern int soreadable(struct socket *so);
-                CFS_NET_IN;
-                if (conn->ksnc_rx_nob_wanted && soreadable(so)){
+                if (conn->ksnc_rx_nob_wanted && soreadable(so))
                         /* To verify whether the upcall is for receive */
-                        CFS_NET_EX;
                         ksocknal_read_callback (conn);
-                }else
-                        CFS_NET_EX;
         }
         /* go foward? */
-        if ((so->so_snd.sb_flags & SB_UPCALL) || !arg){
+        if (so->so_snd.sb_flags & SB_UPCALL){
                 extern int sowriteable(struct socket *so);
-                CFS_NET_IN;
-                if (sowriteable(so)){
+                if (sowriteable(so))
                         /* socket is writable */
-                        CFS_NET_EX;
                         ksocknal_write_callback(conn);
-                } else 
-                        CFS_NET_EX;
         }
 out:
         read_unlock (&ksocknal_data.ksnd_global_lock);
@@ -977,22 +1036,24 @@ ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn)
         CFS_DECL_NET_DATA;
 
         CFS_NET_IN;
-        sock->so_upcallarg = (void *)sock;  /* anything not NULL */ 
+        sock->so_upcallarg = (void *)conn;
         sock->so_upcall = ksocknal_upcall; 
         sock->so_snd.sb_timeo = 0; 
-        sock->so_rcv.sb_timeo = 2 * HZ; 
+        sock->so_rcv.sb_timeo = cfs_time_seconds(2);
         sock->so_rcv.sb_flags |= SB_UPCALL; 
         sock->so_snd.sb_flags |= SB_UPCALL; 
-        sock->reserved3 = conn;
         CFS_NET_EX;
         return;
 }
 
 void
-ksocknal_lib_act_callback(struct socket *sock)
+ksocknal_lib_act_callback(struct socket *sock, ksock_conn_t *conn)
 {
-        /* upcall will take the network funnel */
-        ksocknal_upcall (sock, 0, 0);
+        CFS_DECL_NET_DATA;
+
+        CFS_NET_IN;
+        ksocknal_upcall (sock, (void *)conn, 0);
+        CFS_NET_EX;
 }
 
 void 
@@ -1001,11 +1062,11 @@ ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
         CFS_DECL_NET_DATA;
 
         CFS_NET_IN;
-        sock->so_upcall = NULL; 
-        sock->so_upcallarg = NULL; 
         sock->so_rcv.sb_flags &= ~SB_UPCALL; 
         sock->so_snd.sb_flags &= ~SB_UPCALL;
+        sock->so_upcall = NULL; 
+        sock->so_upcallarg = NULL; 
         CFS_NET_EX;
 }
 
-
+#endif  /* !__DARWIN8__ */
index e3b286b..9e7574a 100644 (file)
 #include <mach/mach_types.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
-#include <netat/sysglue.h>
 #include <stdarg.h>
 
 #include <libcfs/libcfs.h>
 
-#define SOCKNAL_ARCH_EAGER_ACK         1
-
-#define KSN_SOCK2FILE(so)              ((struct file *)(so)->reserved4)
-#define KSN_CONN2FILE(conn)            ((struct file *)(conn)->ksnc_sock->reserved4)
-
-#define SOCK_WMEM_QUEUED(so)           ((so)->so_snd.sb_cc)
-#define SOCK_ERROR(so)                 ((so)->so_error)
-
-#define SOCK_TEST_NOSPACE(so)          (sbspace(&(so)->so_snd) < (so)->so_snd.sb_lowat)
-extern struct socket * sockfd_lookup(int fd, void *foo);
-
 static inline
 int ksocknal_nsched(void)
 { 
+       /* XXX Liang: fix it */
        return 1;
 }
 
index 48a813e..b7e2f49 100644 (file)
-#include "socknal.h"
-
-#ifdef CONFIG_SYSCTL
-#define SOCKNAL_SYSCTL 200
-
-#define SOCKNAL_SYSCTL_TIMEOUT          1
-#define SOCKNAL_SYSCTL_EAGER_ACK        2
-#define SOCKNAL_SYSCTL_ZERO_COPY        3
-#define SOCKNAL_SYSCTL_TYPED            4
-#define SOCKNAL_SYSCTL_MIN_BULK         5
-#define SOCKNAL_SYSCTL_BUFFER_SIZE      6
-#define SOCKNAL_SYSCTL_NAGLE            7
-#define SOCKNAL_SYSCTL_IRQ_AFFINITY     8
-#define SOCKNAL_SYSCTL_KEEPALIVE_IDLE   9
-#define SOCKNAL_SYSCTL_KEEPALIVE_COUNT 10
-#define SOCKNAL_SYSCTL_KEEPALIVE_INTVL 11
-
-static ctl_table ksocknal_ctl_table[] = {
-        {SOCKNAL_SYSCTL_TIMEOUT, "timeout",
-         &ksocknal_tunables.ksnd_io_timeout, sizeof (int),
-         0644, NULL, &proc_dointvec},
-        {SOCKNAL_SYSCTL_EAGER_ACK, "eager_ack",
-         &ksocknal_tunables.ksnd_eager_ack, sizeof (int),
-         0644, NULL, &proc_dointvec},
-#if SOCKNAL_ZC
-        {SOCKNAL_SYSCTL_ZERO_COPY, "zero_copy",
-         &ksocknal_tunables.ksnd_zc_min_frag, sizeof (int),
-         0644, NULL, &proc_dointvec},
-#endif
-        {SOCKNAL_SYSCTL_TYPED, "typed",
-         &ksocknal_tunables.ksnd_typed_conns, sizeof (int),
-         0644, NULL, &proc_dointvec},
-        {SOCKNAL_SYSCTL_MIN_BULK, "min_bulk",
-         &ksocknal_tunables.ksnd_min_bulk, sizeof (int),
-         0644, NULL, &proc_dointvec},
-        {SOCKNAL_SYSCTL_BUFFER_SIZE, "buffer_size",
-         &ksocknal_tunables.ksnd_buffer_size, sizeof(int),
-         0644, NULL, &proc_dointvec},
-        {SOCKNAL_SYSCTL_NAGLE, "nagle",
-         &ksocknal_tunables.ksnd_nagle, sizeof(int),
-         0644, NULL, &proc_dointvec},
-#if CPU_AFFINITY
-        {SOCKNAL_SYSCTL_IRQ_AFFINITY, "irq_affinity",
-         &ksocknal_tunables.ksnd_irq_affinity, sizeof(int),
-         0644, NULL, &proc_dointvec},
-#endif
-        {SOCKNAL_SYSCTL_KEEPALIVE_IDLE, "keepalive_idle",
-         &ksocknal_tunables.ksnd_keepalive_idle, sizeof(int),
-         0644, NULL, &proc_dointvec},
-        {SOCKNAL_SYSCTL_KEEPALIVE_COUNT, "keepalive_count",
-         &ksocknal_tunables.ksnd_keepalive_count, sizeof(int),
-         0644, NULL, &proc_dointvec},
-        {SOCKNAL_SYSCTL_KEEPALIVE_INTVL, "keepalive_intvl",
-         &ksocknal_tunables.ksnd_keepalive_intvl, sizeof(int),
-         0644, NULL, &proc_dointvec},
-        { 0 }
-};
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+
+#include "socklnd.h"
+
+# if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+static ctl_table ksocknal_ctl_table[21];
 
 ctl_table ksocknal_top_ctl_table[] = {
-        {SOCKNAL_SYSCTL, "socknal", NULL, 0, 0555, ksocknal_ctl_table},
+        {200, "socknal", NULL, 0, 0555, ksocknal_ctl_table},
         { 0 }
 };
+
+int
+ksocknal_lib_tunables_init ()
+{
+       int    i = 0;
+       int    j = 1;
+
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "timeout", ksocknal_tunables.ksnd_timeout,
+                sizeof (int), 0644, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "credits", ksocknal_tunables.ksnd_credits,
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "peer_credits", ksocknal_tunables.ksnd_peercredits,
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "nconnds", ksocknal_tunables.ksnd_nconnds,
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "min_reconnectms", ksocknal_tunables.ksnd_min_reconnectms,
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "max_reconnectms", ksocknal_tunables.ksnd_max_reconnectms,
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "eager_ack", ksocknal_tunables.ksnd_eager_ack,
+                sizeof (int), 0644, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "zero_copy", ksocknal_tunables.ksnd_zc_min_frag,
+                sizeof (int), 0644, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "typed", ksocknal_tunables.ksnd_typed_conns,
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "min_bulk", ksocknal_tunables.ksnd_min_bulk,
+                sizeof (int), 0644, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "rx_buffer_size", ksocknal_tunables.ksnd_rx_buffer_size,
+                sizeof(int), 0644, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "tx_buffer_size", ksocknal_tunables.ksnd_tx_buffer_size,
+                sizeof(int), 0644, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "nagle", ksocknal_tunables.ksnd_nagle,
+                sizeof(int), 0644, NULL, &proc_dointvec};
+#if CPU_AFFINITY
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "irq_affinity", ksocknal_tunables.ksnd_irq_affinity,
+                sizeof(int), 0644, NULL, &proc_dointvec};
+#endif
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "keepalive_idle", ksocknal_tunables.ksnd_keepalive_idle,
+                sizeof(int), 0644, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "keepalive_count", ksocknal_tunables.ksnd_keepalive_count,
+                sizeof(int), 0644, NULL, &proc_dointvec};
+       ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "keepalive_intvl", ksocknal_tunables.ksnd_keepalive_intvl,
+                sizeof(int), 0644, NULL, &proc_dointvec};
+#ifdef SOCKNAL_BACKOFF
+        ksocknal_ctl_table[i++] = (ctl_table)
+                {j++, "backoff_init", ksocknal_tunables.ksnd_backoff_init,
+                sizeof(int), 0644, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+                {j++, "backoff_max", ksocknal_tunables.ksnd_backoff_max,
+                sizeof(int), 0644, NULL, &proc_dointvec};
 #endif
 
+       LASSERT (j == i+1);
+       LASSERT (i < sizeof(ksocknal_ctl_table)/sizeof(ksocknal_ctl_table[0]));
+
+        ksocknal_tunables.ksnd_sysctl =
+                register_sysctl_table(ksocknal_top_ctl_table, 0);
+
+        if (ksocknal_tunables.ksnd_sysctl == NULL)
+               CWARN("Can't setup /proc tunables\n");
+
+       return 0;
+}
+
+void
+ksocknal_lib_tunables_fini ()
+{
+        if (ksocknal_tunables.ksnd_sysctl != NULL)
+                unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl);
+}
+#else
+int
+ksocknal_lib_tunables_init ()
+{
+       return 0;
+}
+
+void
+ksocknal_lib_tunables_fini ()
+{
+}
+#endif /* # if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM */
+
 void
 ksocknal_lib_bind_irq (unsigned int irq)
 {
 #if (defined(CONFIG_SMP) && CPU_AFFINITY)
         int              bind;
         int              cpu;
-        unsigned long    flags;
         char             cmdline[64];
         ksock_irqinfo_t *info;
         char            *argv[] = {"/bin/sh",
@@ -85,13 +133,13 @@ ksocknal_lib_bind_irq (unsigned int irq)
 
         info = &ksocknal_data.ksnd_irqinfo[irq];
 
-        write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
+        write_lock_bh (&ksocknal_data.ksnd_global_lock);
 
         LASSERT (info->ksni_valid);
         bind = !info->ksni_bound;
         info->ksni_bound = 1;
 
-        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
+        write_unlock_bh (&ksocknal_data.ksnd_global_lock);
 
         if (!bind)                              /* bound already */
                 return;
@@ -100,8 +148,8 @@ ksocknal_lib_bind_irq (unsigned int irq)
         snprintf (cmdline, sizeof (cmdline),
                   "echo %d > /proc/irq/%u/smp_affinity", 1 << cpu, irq);
 
-        printk (KERN_INFO "Lustre: Binding irq %u to CPU %d with cmd: %s\n",
-                irq, cpu, cmdline);
+        LCONSOLE_INFO("Binding irq %u to CPU %d with cmd: %s\n",
+                     irq, cpu, cmdline);
 
         /* FIXME: Find a better method of setting IRQ affinity...
          */
@@ -113,12 +161,10 @@ ksocknal_lib_bind_irq (unsigned int irq)
 int
 ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
 {
-        struct sockaddr_in sin;
-        int                len = sizeof (sin);
-        int                rc;
+        int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1,
+                                    &conn->ksnc_ipaddr,
+                                    &conn->ksnc_port);
 
-        rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock,
-                                            (struct sockaddr *)&sin, &len, 2);
         /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
         LASSERT (!conn->ksnc_closing);
 
@@ -127,18 +173,13 @@ ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
                 return rc;
         }
 
-        conn->ksnc_ipaddr = ntohl (sin.sin_addr.s_addr);
-        conn->ksnc_port   = ntohs (sin.sin_port);
-
-        rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock,
-                                            (struct sockaddr *)&sin, &len, 0);
+        rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
+                                &conn->ksnc_myipaddr, NULL);
         if (rc != 0) {
                 CERROR ("Error %d getting sock local IP\n", rc);
                 return rc;
         }
 
-        conn->ksnc_myipaddr = ntohl (sin.sin_addr.s_addr);
-
         return 0;
 }
 
@@ -146,9 +187,10 @@ unsigned int
 ksocknal_lib_sock_irq (struct socket *sock)
 {
         int                irq = 0;
+#if CPU_AFFINITY
         struct dst_entry  *dst;
 
-        if (!ksocknal_tunables.ksnd_irq_affinity)
+        if (!*ksocknal_tunables.ksnd_irq_affinity)
                 return 0;
 
         dst = sk_dst_get (sock->sk);
@@ -163,76 +205,45 @@ ksocknal_lib_sock_irq (struct socket *sock)
                 dst_release (dst);
         }
 
-        return (irq);
+#endif
+        return irq;
 }
 
-#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
-static struct page *
-ksocknal_kvaddr_to_page (unsigned long vaddr)
+int
+ksocknal_lib_zc_capable(struct socket *sock)
 {
-        struct page *page;
-
-        if (vaddr >= VMALLOC_START &&
-            vaddr < VMALLOC_END)
-                page = vmalloc_to_page ((void *)vaddr);
-#if CONFIG_HIGHMEM
-        else if (vaddr >= PKMAP_BASE &&
-                 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
-                page = vmalloc_to_page ((void *)vaddr);
-                /* in 2.4 ^ just walks the page tables */
-#endif
-        else
-                page = virt_to_page (vaddr);
-
-        if (page == NULL ||
-            !VALID_PAGE (page))
-                return (NULL);
-
-        return (page);
+        int  caps = sock->sk->sk_route_caps;
+        
+        /* ZC if the socket supports scatter/gather and doesn't need software
+         * checksums */
+        return ((caps & NETIF_F_SG) != 0 &&
+                (caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) != 0);
 }
-#endif
 
 int
 ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
 {
         struct socket *sock = conn->ksnc_sock;
-#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
-        unsigned long  vaddr = (unsigned long)iov->iov_base
-        int            offset = vaddr & (PAGE_SIZE - 1);
-        int            zcsize = MIN (iov->iov_len, PAGE_SIZE - offset);
-        struct page   *page;
-#endif
         int            nob;
         int            rc;
 
+        if (*ksocknal_tunables.ksnd_enable_csum        && /* checksum enabled */
+            conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection  */
+            tx->tx_nob == tx->tx_resid                 && /* frist sending    */
+            tx->tx_msg.ksm_csum == 0)                     /* not checksummed  */
+                ksocknal_lib_csum_tx(tx);
+
         /* NB we can't trust socket ops to either consume our iovs
          * or leave them alone. */
 
-#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
-        if (zcsize >= ksocknal_data.ksnd_zc_min_frag &&
-            (sock->sk->route_caps & NETIF_F_SG) &&
-            (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) &&
-            (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) {
-                int msgflg = MSG_DONTWAIT;
-
-                CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n",
-                       (void *)vaddr, page, page_address(page), offset, zcsize);
-
-                if (!list_empty (&conn->ksnc_tx_queue) ||
-                    zcsize < tx->tx_resid)
-                        msgflg |= MSG_MORE;
-
-                rc = tcp_sendpage_zccd(sock, page, offset, zcsize, msgflg, &tx->tx_zccd);
-        } else
-#endif
         {
 #if SOCKNAL_SINGLE_FRAG_TX
                 struct iovec    scratch;
                 struct iovec   *scratchiov = &scratch;
-                int             niov = 1;
+                unsigned int    niov = 1;
 #else
                 struct iovec   *scratchiov = conn->ksnc_tx_scratch_iov;
-                int             niov = tx->tx_niov;
+                unsigned int    niov = tx->tx_niov;
 #endif
                 struct msghdr msg = {
                         .msg_name       = NULL,
@@ -266,17 +277,16 @@ int
 ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
 {
         struct socket *sock = conn->ksnc_sock;
-        ptl_kiov_t    *kiov = tx->tx_kiov;
+        lnet_kiov_t    *kiov = tx->tx_kiov;
         int            rc;
         int            nob;
 
         /* NB we can't trust socket ops to either consume our iovs
          * or leave them alone. */
 
-#if SOCKNAL_ZC
-        if (kiov->kiov_len >= ksocknal_tunables.ksnd_zc_min_frag &&
-            (sock->sk->route_caps & NETIF_F_SG) &&
-            (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) {
+        if (kiov->kiov_len >= *ksocknal_tunables.ksnd_zc_min_frag &&
+            tx->tx_msg.ksm_zc_req_cookie != 0) {
+                /* Zero copy is enabled */
                 struct page   *page = kiov->kiov_page;
                 int            offset = kiov->kiov_offset;
                 int            fragsize = kiov->kiov_len;
@@ -289,21 +299,18 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
                     fragsize < tx->tx_resid)
                         msgflg |= MSG_MORE;
 
-                rc = tcp_sendpage_zccd(sock, page, offset, fragsize, msgflg,
-                                       &tx->tx_zccd);
-        } else
-#endif
-        {
+                rc = tcp_sendpage(sock, page, offset, fragsize, msgflg);
+        } else {
 #if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
                 struct iovec  scratch;
                 struct iovec *scratchiov = &scratch;
-                int           niov = 1;
+                unsigned int  niov = 1;
 #else
 #ifdef CONFIG_HIGHMEM
 #warning "XXX risk of kmap deadlock on multiple frags..."
 #endif
                 struct iovec *scratchiov = conn->ksnc_tx_scratch_iov;
-                int           niov = tx->tx_nkiov;
+                unsigned int  niov = tx->tx_nkiov;
 #endif
                 struct msghdr msg = {
                         .msg_name       = NULL,
@@ -325,7 +332,7 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
 
                 if (!list_empty(&conn->ksnc_tx_queue) ||
                     nob < tx->tx_resid)
-                        msg.msg_flags |= MSG_DONTWAIT;
+                        msg.msg_flags |= MSG_MORE;
 
                 set_fs (KERNEL_DS);
                 rc = sock_sendmsg(sock, &msg, nob);
@@ -361,10 +368,10 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn)
 #if SOCKNAL_SINGLE_FRAG_RX
         struct iovec  scratch;
         struct iovec *scratchiov = &scratch;
-        int           niov = 1;
+        unsigned int  niov = 1;
 #else
         struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
-        int           niov = conn->ksnc_rx_niov;
+        unsigned int  niov = conn->ksnc_rx_niov;
 #endif
         struct iovec *iov = conn->ksnc_rx_iov;
         struct msghdr msg = {
@@ -380,6 +387,9 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn)
         int          nob;
         int          i;
         int          rc;
+        int          fragnob;
+        int          sum;
+        __u32        saved_csum;
 
         /* NB we can't trust socket ops to either consume our iovs
          * or leave them alone. */
@@ -396,6 +406,27 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn)
         /* NB this is just a boolean..........................^ */
         set_fs (oldmm);
 
+        saved_csum = 0;
+        if (conn->ksnc_proto == &ksocknal_protocol_v2x) {
+                saved_csum = conn->ksnc_msg.ksm_csum;
+                conn->ksnc_msg.ksm_csum = 0;
+        }
+
+        if (saved_csum != 0) {
+                /* accumulate checksum */
+                for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
+                        LASSERT (i < niov);
+
+                        fragnob = iov[i].iov_len;
+                        if (fragnob > sum)
+                                fragnob = sum;
+                
+                        conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, 
+                                                           iov[i].iov_base, fragnob);
+                }
+                conn->ksnc_msg.ksm_csum = saved_csum;
+        }
+
        return rc;
 }
 
@@ -405,15 +436,15 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn)
 #if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
         struct iovec  scratch;
         struct iovec *scratchiov = &scratch;
-        int           niov = 1;
+        unsigned int  niov = 1;
 #else
 #ifdef CONFIG_HIGHMEM
 #warning "XXX risk of kmap deadlock on multiple frags..."
 #endif
         struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
-        int           niov = conn->ksnc_rx_nkiov;
+        unsigned int  niov = conn->ksnc_rx_nkiov;
 #endif
-        ptl_kiov_t   *kiov = conn->ksnc_rx_kiov;
+        lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
         struct msghdr msg = {
                 .msg_name       = NULL,
                 .msg_namelen    = 0,
@@ -427,6 +458,9 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn)
         int          nob;
         int          i;
         int          rc;
+        void        *base;
+        int          sum;
+        int          fragnob;
 
         /* NB we can't trust socket ops to either consume our iovs
          * or leave them alone. */
@@ -441,88 +475,67 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn)
         /* NB this is just a boolean.......................^ */
         set_fs (oldmm);
 
+        if (conn->ksnc_msg.ksm_csum != 0) {
+                for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
+                        LASSERT (i < niov);
+
+                        /* Dang! have to kmap again because I have nowhere to stash the
+                         * mapped address.  But by doing it while the page is still
+                         * mapped, the kernel just bumps the map count and returns me
+                         * the address it stashed. */
+                        base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset;
+                        fragnob = kiov[i].kiov_len;
+                        if (fragnob > sum)
+                                fragnob = sum;
+                
+                        conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum,
+                                                           base, fragnob);
+
+                        kunmap(kiov[i].kiov_page);
+                }
+        }
         for (i = 0; i < niov; i++)
                 kunmap(kiov[i].kiov_page);
 
        return (rc);
 }
 
-int
-ksocknal_lib_sock_write (struct socket *sock, void *buffer, int nob)
+void ksocknal_lib_csum_tx(ksock_tx_t *tx)
 {
-        int           rc;
-        mm_segment_t  oldmm = get_fs();
-
-        while (nob > 0) {
-                struct iovec  iov = {
-                        .iov_base = buffer,
-                        .iov_len  = nob
-                };
-                struct msghdr msg = {
-                        .msg_name       = NULL,
-                        .msg_namelen    = 0,
-                        .msg_iov        = &iov,
-                        .msg_iovlen     = 1,
-                        .msg_control    = NULL,
-                        .msg_controllen = 0,
-                        .msg_flags      = 0
-                };
-
-                set_fs (KERNEL_DS);
-                rc = sock_sendmsg (sock, &msg, iov.iov_len);
-                set_fs (oldmm);
-
-                if (rc < 0)
-                        return (rc);
-
-                if (rc == 0) {
-                        CERROR ("Unexpected zero rc\n");
-                        return (-ECONNABORTED);
-                }
-
-                buffer = ((char *)buffer) + rc;
-                nob -= rc;
-        }
+        int          i;
+        __u32        csum;
+        void        *base;
 
-        return (0);
-}
+        LASSERT(tx->tx_iov[0].iov_base == (void *)&tx->tx_msg);
+        LASSERT(tx->tx_conn != NULL);
+        LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x);
 
-int
-ksocknal_lib_sock_read (struct socket *sock, void *buffer, int nob)
-{
-        int           rc;
-        mm_segment_t  oldmm = get_fs();
+        tx->tx_msg.ksm_csum = 0;
 
-        while (nob > 0) {
-                struct iovec  iov = {
-                        .iov_base = buffer,
-                        .iov_len  = nob
-                };
-                struct msghdr msg = {
-                        .msg_name       = NULL,
-                        .msg_namelen    = 0,
-                        .msg_iov        = &iov,
-                        .msg_iovlen     = 1,
-                        .msg_control    = NULL,
-                        .msg_controllen = 0,
-                        .msg_flags      = 0
-                };
+        csum = ksocknal_csum(~0, (void *)tx->tx_iov[0].iov_base,
+                             tx->tx_iov[0].iov_len);
 
-                set_fs (KERNEL_DS);
-                rc = sock_recvmsg (sock, &msg, iov.iov_len, 0);
-                set_fs (oldmm);
+        if (tx->tx_kiov != NULL) {
+                for (i = 0; i < tx->tx_nkiov; i++) {
+                        base = kmap(tx->tx_kiov[i].kiov_page) +
+                               tx->tx_kiov[i].kiov_offset;
 
-                if (rc < 0)
-                        return (rc);
+                        csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len);
 
-                if (rc == 0)
-                        return (-ECONNABORTED);
+                        kunmap(tx->tx_kiov[i].kiov_page);
+                }
+        } else {
+                for (i = 1; i < tx->tx_niov; i++)
+                        csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base,
+                                             tx->tx_iov[i].iov_len);
+        }
 
-                buffer = ((char *)buffer) + rc;
-                nob -= rc;
+        if (*ksocknal_tunables.ksnd_inject_csum_error) {
+                csum++;
+                *ksocknal_tunables.ksnd_inject_csum_error = 0;
         }
 
-        return (0);
+        tx->tx_msg.ksm_csum = csum;
 }
 
 int
@@ -533,31 +546,23 @@ ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int
         int            len;
         int            rc;
 
-        rc = ksocknal_getconnsock (conn);
+        rc = ksocknal_connsock_addref(conn);
         if (rc != 0) {
                 LASSERT (conn->ksnc_closing);
                 *txmem = *rxmem = *nagle = 0;
                 return (-ESHUTDOWN);
         }
 
-        set_fs (KERNEL_DS);
-
-        len = sizeof(*txmem);
-        rc = sock_getsockopt(sock, SOL_SOCKET, SO_SNDBUF,
-                             (char *)txmem, &len);
-        if (rc == 0) {
-                len = sizeof(*rxmem);
-                rc = sock_getsockopt(sock, SOL_SOCKET, SO_RCVBUF,
-                                     (char *)rxmem, &len);
-        }
+       rc = libcfs_sock_getbuf(sock, txmem, rxmem);
         if (rc == 0) {
                 len = sizeof(*nagle);
+               set_fs(KERNEL_DS);
                 rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY,
                                            (char *)nagle, &len);
+               set_fs(oldmm);
         }
 
-        set_fs (oldmm);
-        ksocknal_putconnsock (conn);
+        ksocknal_connsock_decref(conn);
 
         if (rc == 0)
                 *nagle = !*nagle;
@@ -606,7 +611,7 @@ ksocknal_lib_setup_sock (struct socket *sock)
                 return (rc);
         }
 
-        if (!ksocknal_tunables.ksnd_nagle) {
+        if (!*ksocknal_tunables.ksnd_nagle) {
                 option = 1;
 
                 set_fs (KERNEL_DS);
@@ -619,34 +624,51 @@ ksocknal_lib_setup_sock (struct socket *sock)
                 }
         }
 
-        if (ksocknal_tunables.ksnd_buffer_size > 0) {
-                option = ksocknal_tunables.ksnd_buffer_size;
+       rc = libcfs_sock_setbuf(sock,
+                                *ksocknal_tunables.ksnd_tx_buffer_size,
+                                *ksocknal_tunables.ksnd_rx_buffer_size);
+       if (rc != 0) {
+               CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
+                        *ksocknal_tunables.ksnd_tx_buffer_size,
+                        *ksocknal_tunables.ksnd_rx_buffer_size, rc);
+               return (rc);
+       }
+
+/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */
+#ifdef SOCKNAL_BACKOFF
+        if (*ksocknal_tunables.ksnd_backoff_init > 0) {
+                option = *ksocknal_tunables.ksnd_backoff_init;
 
                 set_fs (KERNEL_DS);
-                rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDBUF,
-                                      (char *)&option, sizeof (option));
+                rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_BACKOFF_INIT,
+                                            (char *)&option, sizeof (option));
                 set_fs (oldmm);
                 if (rc != 0) {
-                        CERROR ("Can't set send buffer %d: %d\n",
+                        CERROR ("Can't set initial tcp backoff %d: %d\n",
                                 option, rc);
                         return (rc);
                 }
+        }
+
+        if (*ksocknal_tunables.ksnd_backoff_max > 0) {
+                option = *ksocknal_tunables.ksnd_backoff_max;
 
                 set_fs (KERNEL_DS);
-                rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF,
-                                      (char *)&option, sizeof (option));
+                rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_BACKOFF_MAX,
+                                            (char *)&option, sizeof (option));
                 set_fs (oldmm);
                 if (rc != 0) {
-                        CERROR ("Can't set receive buffer %d: %d\n",
+                        CERROR ("Can't set maximum tcp backoff %d: %d\n",
                                 option, rc);
                         return (rc);
                 }
         }
+#endif
 
         /* snapshot tunables */
-        keep_idle  = ksocknal_tunables.ksnd_keepalive_idle;
-        keep_count = ksocknal_tunables.ksnd_keepalive_count;
-        keep_intvl = ksocknal_tunables.ksnd_keepalive_intvl;
+        keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle;
+        keep_count = *ksocknal_tunables.ksnd_keepalive_count;
+        keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
 
         do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
 
@@ -693,135 +715,13 @@ ksocknal_lib_setup_sock (struct socket *sock)
         return (0);
 }
 
-int
-ksocknal_lib_connect_sock(struct socket **sockp, int *may_retry,
-                      ksock_route_t *route, int local_port)
-{
-        struct sockaddr_in  locaddr;
-        struct sockaddr_in  srvaddr;
-        struct socket      *sock;
-        int                 rc;
-        int                 option;
-        mm_segment_t        oldmm = get_fs();
-        struct timeval      tv;
-
-        memset(&locaddr, 0, sizeof(locaddr));
-        locaddr.sin_family = AF_INET;
-        locaddr.sin_port = htons(local_port);
-        locaddr.sin_addr.s_addr =
-                (route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr)
-                                            : INADDR_ANY;
-
-        memset (&srvaddr, 0, sizeof (srvaddr));
-        srvaddr.sin_family = AF_INET;
-        srvaddr.sin_port = htons (route->ksnr_port);
-        srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr);
-
-        *may_retry = 0;
-
-        rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
-        *sockp = sock;
-        if (rc != 0) {
-                CERROR ("Can't create autoconnect socket: %d\n", rc);
-                return (rc);
-        }
-
-        /* Ugh; have to map_fd for compatibility with sockets passed in
-         * from userspace.  And we actually need the sock->file refcounting
-         * that this gives you :) */
-
-        rc = sock_map_fd (sock);
-        if (rc < 0) {
-                sock_release (sock);
-                CERROR ("sock_map_fd error %d\n", rc);
-                return (rc);
-        }
-
-        /* NB the file descriptor (rc) now owns the ref on sock->file */
-        LASSERT (sock->file != NULL);
-        LASSERT (file_count(sock->file) == 1);
-
-        get_file(sock->file);                /* extra ref makes sock->file */
-        sys_close(rc);                       /* survive this close */
-
-        /* Still got a single ref on sock->file */
-        LASSERT (file_count(sock->file) == 1);
-
-        /* Set the socket timeouts, so our connection attempt completes in
-         * finite time */
-        tv.tv_sec = ksocknal_tunables.ksnd_io_timeout;
-        tv.tv_usec = 0;
-
-        set_fs (KERNEL_DS);
-        rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO,
-                              (char *)&tv, sizeof (tv));
-        set_fs (oldmm);
-        if (rc != 0) {
-                CERROR ("Can't set send timeout %d: %d\n",
-                        ksocknal_tunables.ksnd_io_timeout, rc);
-                goto failed;
-        }
-
-        set_fs (KERNEL_DS);
-        rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVTIMEO,
-                              (char *)&tv, sizeof (tv));
-        set_fs (oldmm);
-        if (rc != 0) {
-                CERROR ("Can't set receive timeout %d: %d\n",
-                        ksocknal_tunables.ksnd_io_timeout, rc);
-                goto failed;
-        }
-
-        set_fs (KERNEL_DS);
-        option = 1;
-        rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
-                             (char *)&option, sizeof (option));
-        set_fs (oldmm);
-        if (rc != 0) {
-                CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc);
-                goto failed;
-        }
-
-        rc = sock->ops->bind(sock,
-                             (struct sockaddr *)&locaddr, sizeof(locaddr));
-        if (rc == -EADDRINUSE) {
-                CDEBUG(D_NET, "Port %d already in use\n", local_port);
-                *may_retry = 1;
-                goto failed;
-        }
-        if (rc != 0) {
-                CERROR("Error trying to bind to reserved port %d: %d\n",
-                       local_port, rc);
-                goto failed;
-        }
-
-        rc = sock->ops->connect(sock,
-                                (struct sockaddr *)&srvaddr, sizeof(srvaddr),
-                                sock->file->f_flags);
-        if (rc == 0)
-                return 0;
-
-        /* EADDRNOTAVAIL probably means we're already connected to the same
-         * peer/port on the same local port on a differently typed
-         * connection.  Let our caller retry with a different local
-         * port... */
-        *may_retry = (rc == -EADDRNOTAVAIL);
-
-        CDEBUG(*may_retry ? D_NET : D_ERROR,
-               "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
-               HIPQUAD(route->ksnr_myipaddr), local_port,
-               HIPQUAD(route->ksnr_ipaddr), route->ksnr_port);
-
- failed:
-        fput(sock->file);
-        return rc;
-}
-
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
 struct tcp_opt *sock2tcp_opt(struct sock *sk)
 {
         return &(sk->tp_pinfo.af_tcp);
 }
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10))
+#define sock2tcp_opt(sk) tcp_sk(sk)
 #else
 struct tcp_opt *sock2tcp_opt(struct sock *sk)
 {
@@ -834,13 +734,17 @@ void
 ksocknal_lib_push_conn (ksock_conn_t *conn)
 {
         struct sock    *sk;
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11))
         struct tcp_opt *tp;
+#else
+        struct tcp_sock *tp;
+#endif
         int             nonagle;
         int             val = 1;
         int             rc;
         mm_segment_t    oldmm;
 
-        rc = ksocknal_getconnsock (conn);
+        rc = ksocknal_connsock_addref(conn);
         if (rc != 0)                            /* being shut down */
                 return;
 
@@ -865,7 +769,7 @@ ksocknal_lib_push_conn (ksock_conn_t *conn)
         tp->nonagle = nonagle;
         release_sock (sk);
 
-        ksocknal_putconnsock (conn);
+        ksocknal_connsock_decref(conn);
 }
 
 extern void ksocknal_read_callback (ksock_conn_t *conn);
@@ -880,6 +784,7 @@ ksocknal_data_ready (struct sock *sk, int n)
         ENTRY;
 
         /* interleave correctly with closing sockets... */
+        LASSERT(!in_irq());
         read_lock (&ksocknal_data.ksnd_global_lock);
 
         conn = sk->sk_user_data;
@@ -894,22 +799,23 @@ ksocknal_data_ready (struct sock *sk, int n)
         EXIT;
 }
 
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,7))
-#define tcp_wspace(sk) sk_stream_wspace(sk)
-#endif
-
 static void
 ksocknal_write_space (struct sock *sk)
 {
         ksock_conn_t  *conn;
+        int            wspace;
+        int            min_wpace;
 
         /* interleave correctly with closing sockets... */
+        LASSERT(!in_irq());
         read_lock (&ksocknal_data.ksnd_global_lock);
 
         conn = sk->sk_user_data;
+        wspace = SOCKNAL_WSPACE(sk);
+        min_wpace = SOCKNAL_MIN_WSPACE(sk);
 
         CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
-               sk, tcp_wspace(sk), SOCKNAL_TX_LOW_WATER(sk), conn,
+               sk, wspace, min_wpace, conn,
                (conn == NULL) ? "" : (conn->ksnc_tx_ready ?
                                       " ready" : " blocked"),
                (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ?
@@ -925,7 +831,7 @@ ksocknal_write_space (struct sock *sk)
                 return;
         }
 
-        if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */
+        if (wspace >= min_wpace) {              /* got enough space */
                ksocknal_write_callback(conn);
 
                /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the
@@ -955,14 +861,6 @@ ksocknal_lib_set_callback(struct socket *sock,  ksock_conn_t *conn)
 }
 
 void
-ksocknal_lib_act_callback(struct socket *sock, ksock_conn_t *conn)
-{
-       ksocknal_data_ready (sock->sk, 0);
-       ksocknal_write_space (sock->sk);
-       return;
-}
-
-void
 ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
 {
        /* Remove conn's network callbacks.
index 6129fdc..594f29f 100644 (file)
@@ -6,7 +6,9 @@
 #ifndef __LINUX_SOCKNAL_LIB_H__
 #define __LINUX_SOCKNAL_LIB_H__
 
+#ifdef HAVE_KERNEL_CONFIG_H
 #include <linux/config.h>
+#endif
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/version.h>
 #include <net/sock.h>
 #include <net/tcp.h>
 #include <linux/uio.h>
-                                                                                                                                                                         
+
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/irq.h>
-                                                                                                                                                                         
+
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0))
 # include <linux/syscalls.h>
 #endif
-                                                                                                                                                                       
+
 #include <libcfs/kp30.h>
 #include <libcfs/linux/portals_compat25.h>
 
-#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sk_sndbuf*8)/10)
-
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72))
-# define sk_allocation  allocation
-# define sk_data_ready data_ready
-# define sk_write_space write_space
-# define sk_user_data   user_data
-# define sk_prot        prot
-# define sk_sndbuf      sndbuf
-# define sk_socket      socket
+#include <linux/crc32.h>
+static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len)
+{
+#if 1
+        return crc32_le(crc, p, len);
+#else
+        while (len-- > 0)
+                crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ;
+        return crc;
 #endif
+}
 
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0))
-# define sk_wmem_queued wmem_queued
-# define sk_err         err
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,7))
+# define SOCKNAL_WSPACE(sk)       sk_stream_wspace(sk)
+# define SOCKNAL_MIN_WSPACE(sk)   sk_stream_min_wspace(sk)
+#else
+# define SOCKNAL_WSPACE(sk)     tcp_wspace(sk)
+# define SOCKNAL_MIN_WSPACE(sk) (((sk)->sk_sndbuf*8)/10)
 #endif
 
-#define SOCKNAL_ARCH_EAGER_ACK 0
-#define SOCK_WMEM_QUEUED(so)    ((so)->sk->sk_wmem_queued)
-#define SOCK_ERROR(so)          ((so)->sk->sk_err)
-#define SOCK_TEST_NOSPACE(so)  test_bit(SOCK_NOSPACE, &(so)->flags)
-
-#define KSN_SOCK2FILE(so)       ((so)->file)
-#define KSN_CONN2FILE(conn)     ((conn)->ksnc_sock->file)
-
 #ifndef CONFIG_SMP
 static inline
 int ksocknal_nsched(void)
diff --git a/lnet/klnds/socklnd/socklnd_lib-winnt.c b/lnet/klnds/socklnd/socklnd_lib-winnt.c
new file mode 100755 (executable)
index 0000000..7669c77
--- /dev/null
@@ -0,0 +1,832 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2006 Cluster File Systems, Inc, All rights reserved.
+ * Author: Matt Wu
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * This Lustre Software is proprietary - please refer to the license
+ * agreement you received with your software.
+ *
+ * windows socknal library
+ *
+ */ 
+
+#include "socklnd.h"
+
+# if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+static ctl_table ksocknal_ctl_table[18];
+
+ctl_table ksocknal_top_ctl_table[] = {
+        {200, "socknal", NULL, 0, 0555, ksocknal_ctl_table},
+        { 0 }
+};
+
+int
+ksocknal_lib_tunables_init () 
+{
+           int    i = 0;
+           int    j = 1;
+       
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "timeout", ksocknal_tunables.ksnd_timeout, 
+                sizeof (int), 0644, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "credits", ksocknal_tunables.ksnd_credits, 
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "peer_credits", ksocknal_tunables.ksnd_peercredits, 
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "nconnds", ksocknal_tunables.ksnd_nconnds, 
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "min_reconnectms", ksocknal_tunables.ksnd_min_reconnectms, 
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "max_reconnectms", ksocknal_tunables.ksnd_max_reconnectms, 
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "eager_ack", ksocknal_tunables.ksnd_eager_ack, 
+                sizeof (int), 0644, NULL, &proc_dointvec};
+#if SOCKNAL_ZC
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "zero_copy", ksocknal_tunables.ksnd_zc_min_frag, 
+                sizeof (int), 0644, NULL, &proc_dointvec};
+#endif
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "typed", ksocknal_tunables.ksnd_typed_conns, 
+                sizeof (int), 0444, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "min_bulk", ksocknal_tunables.ksnd_min_bulk, 
+                sizeof (int), 0644, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "buffer_size", ksocknal_tunables.ksnd_buffer_size, 
+                sizeof(int), 0644, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "nagle", ksocknal_tunables.ksnd_nagle, 
+                sizeof(int), 0644, NULL, &proc_dointvec};
+#if CPU_AFFINITY
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "irq_affinity", ksocknal_tunables.ksnd_irq_affinity, 
+                sizeof(int), 0644, NULL, &proc_dointvec};
+#endif
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "keepalive_idle", ksocknal_tunables.ksnd_keepalive_idle, 
+                sizeof(int), 0644, NULL, &proc_dointvec};
+        ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "keepalive_count", ksocknal_tunables.ksnd_keepalive_count, 
+                sizeof(int), 0644, NULL, &proc_dointvec};
+       ksocknal_ctl_table[i++] = (ctl_table)
+               {j++, "keepalive_intvl", ksocknal_tunables.ksnd_keepalive_intvl, 
+                sizeof(int), 0644, NULL, &proc_dointvec};
+
+       LASSERT (j == i+1);
+       LASSERT (i < sizeof(ksocknal_ctl_table)/sizeof(ksocknal_ctl_table[0]));
+
+        ksocknal_tunables.ksnd_sysctl =
+                register_sysctl_table(ksocknal_top_ctl_table, 0);
+
+        if (ksocknal_tunables.ksnd_sysctl == NULL)
+               CWARN("Can't setup /proc tunables\n");
+
+       return 0;
+}
+
+void
+ksocknal_lib_tunables_fini () 
+{
+        if (ksocknal_tunables.ksnd_sysctl != NULL)
+                unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl);        
+}
+#else
+int
+ksocknal_lib_tunables_init () 
+{
+       return 0;
+}
+
+void 
+ksocknal_lib_tunables_fini ()
+{
+}
+#endif
+
+void
+ksocknal_lib_bind_irq (unsigned int irq)
+{
+}
+
+int
+ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
+{
+        int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1,
+                                    &conn->ksnc_ipaddr, &conn->ksnc_port);
+
+        /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
+        LASSERT (!conn->ksnc_closing);
+
+        if (rc != 0) {
+                CERROR ("Error %d getting sock peer IP\n", rc);
+                return rc;
+        }
+
+        rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
+                                &conn->ksnc_myipaddr, NULL);
+        if (rc != 0) {
+                CERROR ("Error %d getting sock local IP\n", rc);
+                return rc;
+        }
+
+        return 0;
+}
+
+unsigned int
+ksocknal_lib_sock_irq (struct socket *sock)
+{
+    return 0;
+}
+
+#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
+static struct page *
+ksocknal_kvaddr_to_page (unsigned long vaddr)
+{
+        struct page *page;
+
+        if (vaddr >= VMALLOC_START &&
+            vaddr < VMALLOC_END)
+                page = vmalloc_to_page ((void *)vaddr);
+#if CONFIG_HIGHMEM
+        else if (vaddr >= PKMAP_BASE &&
+                 vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
+                page = vmalloc_to_page ((void *)vaddr);
+                /* in 2.4 ^ just walks the page tables */
+#endif
+        else
+                page = virt_to_page (vaddr);
+
+        if (page == NULL ||
+            !VALID_PAGE (page))
+                return (NULL);
+
+        return (page);
+}
+#endif
+
+/*
+ * ks_lock_iovs
+ *   Lock the i/o vector buffers into MDL structure
+ *
+ * Arguments:
+ *   iov:  the array of i/o vectors
+ *   niov: number of i/o vectors to be locked
+ *   len:  the real length of the iov vectors
+ *
+ * Return Value:
+ *   ksock_mdl_t *: the Mdl of the locked buffers or
+ *         NULL pointer in failure case
+ *
+ * Notes: 
+ *   N/A
+ */
+
+ksock_mdl_t *
+ks_lock_iovs(
+    IN struct iovec  *iov,
+    IN int            niov,
+    IN int            recving,
+    IN int *          len )
+{
+    int             rc = 0;
+
+    int             i = 0;
+    int             total = 0;
+    ksock_mdl_t *   mdl = NULL;
+    ksock_mdl_t *   tail = NULL;
+
+    LASSERT(iov != NULL);
+    LASSERT(niov > 0);
+    LASSERT(len != NULL);
+
+    for (i=0; i < niov; i++) {
+
+        ksock_mdl_t * Iovec = NULL;
+            
+        rc = ks_lock_buffer(
+                iov[i].iov_base,
+                FALSE,
+                iov[i].iov_len,
+                recving ? IoWriteAccess : IoReadAccess,
+                &Iovec );
+
+        if (rc < 0) {
+            break;
+        }
+
+        if (tail) {
+            tail->Next = Iovec;
+        } else {
+            mdl = Iovec;
+        }
+
+        tail = Iovec;
+
+        total +=iov[i].iov_len;
+    }
+
+    if (rc >= 0) {
+        *len = total;
+    } else {
+        if (mdl) {
+            ks_release_mdl(mdl, FALSE);
+            mdl = NULL;
+        }
+    }
+
+    return mdl;
+}
+
+/*
+ * ks_lock_kiovs
+ *   Lock the kiov pages into MDL structure
+ *
+ * Arguments:
+ *   kiov:  the array of kiov pages
+ *   niov:  number of kiov to be locked
+ *   len:   the real length of the kiov arrary
+ *
+ * Return Value:
+ *   PMDL: the Mdl of the locked buffers or NULL
+ *         pointer in failure case
+ *
+ * Notes: 
+ *   N/A
+ */
+ksock_mdl_t *
+ks_lock_kiovs(
+    IN lnet_kiov_t *  kiov,
+    IN int            nkiov,
+    IN int            recving,
+    IN int *          len )
+{
+    int             rc = 0;
+    int             i = 0;
+    int             total = 0;
+    ksock_mdl_t *   mdl = NULL;
+    ksock_mdl_t *   tail = NULL;
+
+    LASSERT(kiov != NULL);
+    LASSERT(nkiov > 0);
+    LASSERT(len != NULL);
+
+    for (i=0; i < nkiov; i++) {
+
+        ksock_mdl_t *        Iovec = NULL;
+
+
+        //
+        //  Lock the kiov page into Iovec Â¡Â­
+        //
+
+        rc = ks_lock_buffer(
+                (PUCHAR)kiov[i].kiov_page->addr + 
+                     kiov[i].kiov_offset,
+                FALSE,
+                kiov[i].kiov_len,
+                recving ? IoWriteAccess : IoReadAccess,
+                &Iovec
+            );
+
+        if (rc < 0) {
+            break;
+        }
+
+        //
+        // Attach the Iovec to the mdl chain
+        //
+
+        if (tail) {
+            tail->Next = Iovec;
+        } else {
+            mdl = Iovec;
+        }
+
+        tail = Iovec;
+
+        total += kiov[i].kiov_len;
+
+    }
+
+    if (rc >= 0) {
+        *len = total;
+    } else {
+        if (mdl) {
+            ks_release_mdl(mdl, FALSE);
+            mdl = NULL;
+        }
+    }
+
+    return mdl;
+}
+
+
+int
+ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        struct socket *sock = conn->ksnc_sock;
+#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
+        unsigned long  vaddr = (unsigned long)iov->iov_base
+        int            offset = vaddr & (PAGE_SIZE - 1);
+        int            zcsize = MIN (iov->iov_len, PAGE_SIZE - offset);
+        struct page   *page;
+#endif
+        int            nob;
+        int            rc;
+        ksock_mdl_t *  mdl;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone. */
+
+#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
+        if (zcsize >= ksocknal_data.ksnd_zc_min_frag &&
+            (sock->sk->sk_route_caps & NETIF_F_SG) &&
+            (sock->sk->sk_route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) &&
+            (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) {
+                int msgflg = MSG_DONTWAIT;
+
+                CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n",
+                       (void *)vaddr, page, page_address(page), offset, zcsize);
+
+                if (!list_empty (&conn->ksnc_tx_queue) ||
+                    zcsize < tx->tx_resid)
+                        msgflg |= MSG_MORE;
+
+                rc = tcp_sendpage_zccd(sock, page, offset, zcsize, msgflg, &tx->tx_zccd);
+        } else
+#endif
+        {
+                /* lock the whole tx iovs into a single mdl chain */
+                mdl = ks_lock_iovs(tx->tx_iov, tx->tx_niov, FALSE, &nob);
+
+                if (mdl) {
+                        /* send the total mdl chain */
+                        rc = ks_send_mdl( conn->ksnc_sock, tx, mdl, nob, 
+                                    (!list_empty (&conn->ksnc_tx_queue) || nob < tx->tx_resid) ? 
+                                    (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT);
+                } else {
+                        rc = -ENOMEM;
+                }
+        }
+
+           return rc;
+}
+
+int
+ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        struct socket *sock = conn->ksnc_sock;
+        lnet_kiov_t    *kiov = tx->tx_kiov;
+        int            rc;
+        int            nob;
+        ksock_mdl_t *  mdl;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone. */
+
+#if SOCKNAL_ZC
+        if (kiov->kiov_len >= *ksocknal_tunables.ksnd_zc_min_frag &&
+            (sock->sk->sk_route_caps & NETIF_F_SG) &&
+            (sock->sk->sk_route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) {
+                struct page   *page = kiov->kiov_page;
+                int            offset = kiov->kiov_offset;
+                int            fragsize = kiov->kiov_len;
+                int            msgflg = MSG_DONTWAIT;
+
+                CDEBUG(D_NET, "page %p + offset %x for %d\n",
+                               page, offset, kiov->kiov_len);
+
+                if (!list_empty(&conn->ksnc_tx_queue) ||
+                    fragsize < tx->tx_resid)
+                        msgflg |= MSG_MORE;
+
+                rc = tcp_sendpage_zccd(sock, page, offset, fragsize, msgflg,
+                                       &tx->tx_zccd);
+        } else
+#endif
+        {
+                /* lock the whole tx kiovs into a single mdl chain */
+                mdl = ks_lock_kiovs(tx->tx_kiov, tx->tx_nkiov, FALSE, &nob);
+
+                if (mdl) {
+                        /* send the total mdl chain */
+                        rc = ks_send_mdl(
+                                    conn->ksnc_sock, tx, mdl, nob,
+                                    (!list_empty(&conn->ksnc_tx_queue) || nob < tx->tx_resid) ?
+                                    (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT);
+                } else {
+                        rc = -ENOMEM;
+                }
+        }
+
+           return rc;
+}
+
+
+int
+ksocknal_lib_recv_iov (ksock_conn_t *conn)
+{
+        struct iovec *iov = conn->ksnc_rx_iov;
+        int           rc;
+        int           size;
+        ksock_mdl_t * mdl;
+
+        /* lock the whole tx iovs into a single mdl chain */
+        mdl = ks_lock_iovs(iov, conn->ksnc_rx_niov, TRUE, &size);
+
+        if (!mdl) {
+            return (-ENOMEM);
+        }
+        
+        LASSERT (size <= conn->ksnc_rx_nob_wanted);
+
+        /* try to request data for the whole mdl chain */
+        rc = ks_recv_mdl (conn->ksnc_sock, mdl, size, MSG_DONTWAIT);
+
+        return rc;
+}
+
+int
+ksocknal_lib_recv_kiov (ksock_conn_t *conn)
+{
+        lnet_kiov_t  *kiov = conn->ksnc_rx_kiov;
+        int           size;
+        int           rc;
+        ksock_mdl_t * mdl;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone, so we only receive 1 frag at a time. */
+        LASSERT (conn->ksnc_rx_nkiov > 0);
+
+        /* lock the whole tx kiovs into a single mdl chain */
+        mdl = ks_lock_kiovs(kiov, conn->ksnc_rx_nkiov, TRUE, &size);
+
+        if (!mdl) {
+            rc = -ENOMEM;
+            return (rc);
+        }
+        
+        LASSERT (size <= conn->ksnc_rx_nob_wanted);
+
+        /* try to request data for the whole mdl chain */
+        rc = ks_recv_mdl (conn->ksnc_sock, mdl, size, MSG_DONTWAIT);
+
+        return rc;
+}
+
+void
+ksocknal_lib_eager_ack (ksock_conn_t *conn)
+{
+        __u32   option = 1;
+        int     rc = 0;
+                
+        rc = ks_set_tcp_option(
+                conn->ksnc_sock, TCP_SOCKET_NODELAY,
+                &option, sizeof(option) );
+        if (rc != 0) {
+                CERROR("Can't disable nagle: %d\n", rc);
+        }
+}
+
+int
+ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
+{
+        ksock_tconn_t * tconn = conn->ksnc_sock;
+        int             len;
+        int             rc;
+
+        ks_get_tconn (tconn);
+        
+        *txmem = *rxmem = 0;
+
+        len = sizeof(*nagle);
+
+        rc = ks_get_tcp_option(
+                    tconn, TCP_SOCKET_NODELAY,
+                    (__u32 *)nagle, &len);
+
+        ks_put_tconn (tconn);
+
+        printk("ksocknal_get_conn_tunables: nodelay = %d rc = %d\n", *nagle, rc);
+
+        if (rc == 0)
+                *nagle = !*nagle;
+        else
+                *txmem = *rxmem = *nagle = 0;
+                
+        return (rc);
+}
+
+int
+ksocknal_lib_buffersize (int current_sz, int tunable_sz)
+{
+           /* ensure >= SOCKNAL_MIN_BUFFER */
+           if (current_sz < SOCKNAL_MIN_BUFFER)
+                       return MAX(SOCKNAL_MIN_BUFFER, tunable_sz);
+
+           if (tunable_sz > SOCKNAL_MIN_BUFFER)
+                       return tunable_sz;
+       
+           /* leave alone */
+           return 0;
+}
+
+int
+ksocknal_lib_setup_sock (struct socket *sock)
+{
+        int             rc;
+
+        int             keep_idle;
+        int             keep_count;
+        int             keep_intvl;
+        int             keep_alive;
+
+        __u32           option;
+
+        /* set the window size */
+
+#if 0
+        tconn->kstc_snd_wnd = ksocknal_tunables.ksnd_buffer_size;
+        tconn->kstc_rcv_wnd = ksocknal_tunables.ksnd_buffer_size;
+#endif
+
+        /* disable nagle */
+        if (!ksocknal_tunables.ksnd_nagle) {
+                option = 1;
+                
+                rc = ks_set_tcp_option(
+                            sock, TCP_SOCKET_NODELAY,
+                            &option, sizeof (option));
+                if (rc != 0) {
+                        printk ("Can't disable nagle: %d\n", rc);
+                        return (rc);
+                }
+        }
+
+        /* snapshot tunables */
+        keep_idle  = *ksocknal_tunables.ksnd_keepalive_idle;
+        keep_count = *ksocknal_tunables.ksnd_keepalive_count;
+        keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
+        
+        keep_alive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
+
+        option = (__u32)(keep_alive ? 1 : 0);
+
+        rc = ks_set_tcp_option(
+                    sock, TCP_SOCKET_KEEPALIVE,
+                    &option, sizeof (option));
+        if (rc != 0) {
+                CERROR ("Can't disable nagle: %d\n", rc);
+                return (rc);
+        }
+
+        return (0);
+}
+
+void
+ksocknal_lib_push_conn (ksock_conn_t *conn)
+{
+        ksock_tconn_t * tconn;
+        __u32           nagle;
+        __u32           val = 1;
+        int             rc;
+
+        tconn = conn->ksnc_sock;
+
+        ks_get_tconn(tconn);
+
+        spin_lock(&tconn->kstc_lock);
+        if (tconn->kstc_type == kstt_sender) {
+            nagle = tconn->sender.kstc_info.nagle;
+            tconn->sender.kstc_info.nagle = 0;
+        } else {
+            LASSERT(tconn->kstc_type == kstt_child);
+            nagle = tconn->child.kstc_info.nagle;
+            tconn->child.kstc_info.nagle = 0;
+        }
+
+        spin_unlock(&tconn->kstc_lock);
+
+        val = 1;
+        rc = ks_set_tcp_option(
+                    tconn,
+                    TCP_SOCKET_NODELAY,
+                    &(val),
+                    sizeof(__u32)
+                    );
+
+        LASSERT (rc == 0);
+        spin_lock(&tconn->kstc_lock);
+
+        if (tconn->kstc_type == kstt_sender) {
+            tconn->sender.kstc_info.nagle = nagle;
+        } else {
+            LASSERT(tconn->kstc_type == kstt_child);
+            tconn->child.kstc_info.nagle = nagle;
+        }
+        spin_unlock(&tconn->kstc_lock);
+
+        ks_put_tconn(tconn);
+}
+
+/* @mode: 0: receiving mode / 1: sending mode */
+void
+ksocknal_sched_conn (ksock_conn_t *conn, int mode, ksock_tx_t *tx)
+{
+        int             flags;
+        ksock_sched_t * sched;
+        ENTRY;
+
+        /* interleave correctly with closing sockets... */
+        read_lock (&ksocknal_data.ksnd_global_lock);
+
+        sched = conn->ksnc_scheduler;
+
+        spin_lock_irqsave (&sched->kss_lock, flags);
+
+        if (mode) { /* transmission can continue ... */ 
+
+                conn->ksnc_tx_ready = 1;
+
+                if (tx) {
+                    /* Incomplete send: place tx on HEAD of tx_queue */
+                    list_add (&tx->tx_list, &conn->ksnc_tx_queue);
+                }
+
+                if ( !conn->ksnc_tx_scheduled && 
+                     !list_empty(&conn->ksnc_tx_queue)) {  //packets to send
+                        list_add_tail (&conn->ksnc_tx_list,
+                                       &sched->kss_tx_conns);
+                        conn->ksnc_tx_scheduled = 1;
+                        /* extra ref for scheduler */
+                        atomic_inc (&conn->ksnc_conn_refcount);
+
+                        cfs_waitq_signal (&sched->kss_waitq);
+                }
+        } else {    /* receiving can continue ... */
+
+                conn->ksnc_rx_ready = 1;
+
+                if ( !conn->ksnc_rx_scheduled) {  /* not being progressed */
+                        list_add_tail(&conn->ksnc_rx_list,
+                                      &sched->kss_rx_conns);
+                        conn->ksnc_rx_scheduled = 1;
+                        /* extra ref for scheduler */
+                        atomic_inc (&conn->ksnc_conn_refcount);
+
+                        cfs_waitq_signal (&sched->kss_waitq);
+                }
+        }
+
+        spin_unlock_irqrestore (&sched->kss_lock, flags);
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+
+        EXIT;
+}
+
+void ksocknal_schedule_callback(struct socket*sock, int mode, void * tx, ulong_ptr bytes)
+{
+    ksock_conn_t * conn = (ksock_conn_t *) sock->kstc_conn;
+
+    if (mode) {
+        ksocknal_sched_conn(conn, mode, tx);
+    } else {
+        if ( CAN_BE_SCHED(bytes, (ulong_ptr)conn->ksnc_rx_nob_wanted )) {
+            ksocknal_sched_conn(conn, mode, tx);
+        }
+    }
+}
+
+extern void
+ksocknal_tx_launched (ksock_tx_t *tx);
+
+void
+ksocknal_fini_sending(ksock_tcpx_fini_t *tcpx)
+{
+    ksocknal_tx_launched(tcpx->tx);
+    cfs_free(tcpx);
+}
+
+void *
+ksocknal_update_tx(
+    struct socket*  tconn,
+    void *          txp,
+    ulong_ptr       rc
+    )
+{
+    ksock_tx_t *    tx = (ksock_tx_t *)txp;
+
+    /*
+     *  the transmission was done, we need update the tx
+     */
+
+    LASSERT(tx->tx_resid >= (int)rc);
+    tx->tx_resid -= (int)rc;
+
+    /*
+     *  just partial of tx is sent out, we need update
+     *  the fields of tx and schedule later transmission.
+     */
+
+    if (tx->tx_resid) {
+
+        if (tx->tx_niov > 0) {
+
+            /* if there's iov, we need process iov first */
+            while (rc > 0 ) {
+                if (rc < tx->tx_iov->iov_len) {
+                    /* didn't send whole iov entry... */
+                    tx->tx_iov->iov_base = 
+                        (char *)(tx->tx_iov->iov_base) + rc;
+                    tx->tx_iov->iov_len -= rc;
+                    rc = 0;
+                 } else {
+                    /* the whole of iov was sent out */
+                    rc -= tx->tx_iov->iov_len;
+                    tx->tx_iov++;
+                    tx->tx_niov--;
+                }
+            }
+
+        } else {
+
+            /* now we need process the kiov queues ... */
+
+            while (rc > 0 ) {
+
+                if (rc < tx->tx_kiov->kiov_len) {
+                    /* didn't send whole kiov entry... */
+                    tx->tx_kiov->kiov_offset += rc;
+                    tx->tx_kiov->kiov_len -= rc;
+                    rc = 0;
+                } else {
+                    /* whole kiov was sent out */
+                    rc -= tx->tx_kiov->kiov_len;
+                    tx->tx_kiov++;
+                    tx->tx_nkiov--;
+                }
+            }
+        }
+
+    } else {
+
+        ksock_tcpx_fini_t * tcpx = 
+                cfs_alloc(sizeof(ksock_tcpx_fini_t), CFS_ALLOC_ZERO);
+
+        ASSERT(tx->tx_resid == 0);
+
+        if (!tcpx) {
+
+            ksocknal_tx_launched (tx);
+
+        } else {
+
+            tcpx->tx = tx;
+            ExInitializeWorkItem(
+                    &(tcpx->item), 
+                    ksocknal_fini_sending,
+                    tcpx
+            );
+            ExQueueWorkItem(
+                    &(tcpx->item),
+                    CriticalWorkQueue
+                    );
+        }
+
+        tx = NULL;
+    }
+
+    return (void *)tx;
+}
+
void
ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
{
        /* nothing to save on the WinNT port: ksocknal_lib_set_callback()
         * installs the TDI hooks directly and ksocknal_lib_reset_callback()
         * simply NULLs them, so there is no prior state to remember */
}
+
void
ksocknal_lib_set_callback(struct socket *sock,  ksock_conn_t *conn)
{
        /* attach the conn to the tconn and install the TDI callbacks.
         * NB kstc_conn is stored before the callbacks are enabled —
         * presumably so a firing callback never sees a NULL conn; keep
         * this order (TODO confirm against the TDI layer) */
        sock->kstc_conn      = conn;
        sock->kstc_sched_cb  = ksocknal_schedule_callback;
        sock->kstc_update_tx = ksocknal_update_tx;
}
+
void
ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
{
        /* detach the conn: clear the conn pointer first, then the
         * callbacks (mirror of ksocknal_lib_set_callback) */
        sock->kstc_conn      = NULL;
        sock->kstc_sched_cb  = NULL;
        sock->kstc_update_tx = NULL;
}
+
diff --git a/lnet/klnds/socklnd/socklnd_lib-winnt.h b/lnet/klnds/socklnd/socklnd_lib-winnt.h
new file mode 100755 (executable)
index 0000000..492c9f5
--- /dev/null
@@ -0,0 +1,42 @@
+#define DEBUG_PORTAL_ALLOC
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#ifndef __WINNT_TDILND_LIB_H__
+#define __WINNT_TDILND_LIB_H__
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+
#ifndef CONFIG_SMP

/* uniprocessor: a single scheduler.  The original UP branch defined only
 * ksocknal_nsched(); sched2cpu/irqsched2cpu existed only under SMP, so
 * any UP caller failed to link/compile.  Provide trivial stubs for
 * parity with the SMP branch. */

static inline
int ksocknal_nsched(void)
{
        return 1;
}

static inline int
ksocknal_sched2cpu(int i)
{
        /* only one CPU to map to */
        return 0;
}

static inline int
ksocknal_irqsched2cpu(int i)
{
        return 0;
}

#else

/* SMP: one scheduler per online CPU, identity-mapped to CPUs */

static inline int
ksocknal_nsched(void)
{
        return num_online_cpus();
}

static inline int
ksocknal_sched2cpu(int i)
{
        return i;
}

static inline int
ksocknal_irqsched2cpu(int i)
{
        return i;
}

#endif
+
+#endif
diff --git a/lnet/klnds/socklnd/socklnd_modparams.c b/lnet/klnds/socklnd/socklnd_modparams.c
new file mode 100644 (file)
index 0000000..917d4d7
--- /dev/null
@@ -0,0 +1,156 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
/* Module parameters for the socklnd; each is exported read-only or
 * read-write (mode 0444/0644) with the description shown. */
static int sock_timeout = 50;
CFS_MODULE_PARM(sock_timeout, "i", int, 0644,
                "dead socket timeout (seconds)");

static int credits = 256;
CFS_MODULE_PARM(credits, "i", int, 0444,
                "# concurrent sends");

static int peer_credits = 8;
CFS_MODULE_PARM(peer_credits, "i", int, 0444,
                "# concurrent sends to 1 peer");

static int nconnds = 4;
CFS_MODULE_PARM(nconnds, "i", int, 0444,
                "# connection daemons");

static int min_reconnectms = 1000;
CFS_MODULE_PARM(min_reconnectms, "i", int, 0644,
                "min connection retry interval (mS)");

static int max_reconnectms = 60000;
CFS_MODULE_PARM(max_reconnectms, "i", int, 0644,
                "max connection retry interval (mS)");

/* pre-Darwin8 OS X needs eager acks on by default */
#if defined(__APPLE__) && !defined(__DARWIN8__)
# define DEFAULT_EAGER_ACK 1
#else
# define DEFAULT_EAGER_ACK 0
#endif
static int eager_ack = DEFAULT_EAGER_ACK;
CFS_MODULE_PARM(eager_ack, "i", int, 0644,
                "send tcp ack packets eagerly");

static int typed_conns = 1;
CFS_MODULE_PARM(typed_conns, "i", int, 0444,
                "use different sockets for bulk");

static int min_bulk = (1<<10);
CFS_MODULE_PARM(min_bulk, "i", int, 0644,
                "smallest 'large' message");

/* Darwin needs explicit socket buffer sizes; elsewhere 0 = system default */
#ifdef __APPLE__
# ifdef __DARWIN8__
#  define DEFAULT_BUFFER_SIZE (224*1024)
# else
#  define DEFAULT_BUFFER_SIZE (1152 * 1024)
# endif
#else
# define DEFAULT_BUFFER_SIZE 0
#endif
static int tx_buffer_size = DEFAULT_BUFFER_SIZE;
CFS_MODULE_PARM(tx_buffer_size, "i", int, 0644,
                "socket tx buffer size (0 for system default)");

static int rx_buffer_size = DEFAULT_BUFFER_SIZE;
CFS_MODULE_PARM(rx_buffer_size, "i", int, 0644,
                "socket rx buffer size (0 for system default)");

static int nagle = 0;
CFS_MODULE_PARM(nagle, "i", int, 0644,
                "enable NAGLE?");

static int keepalive_idle = 30;
CFS_MODULE_PARM(keepalive_idle, "i", int, 0644,
                "# idle seconds before probe");

#ifdef HAVE_BGL_SUPPORT
#define DEFAULT_KEEPALIVE_COUNT  100
#else
#define DEFAULT_KEEPALIVE_COUNT  5
#endif
static int keepalive_count = DEFAULT_KEEPALIVE_COUNT;
CFS_MODULE_PARM(keepalive_count, "i", int, 0644,
                "# missed probes == dead");

static int keepalive_intvl = 5;
CFS_MODULE_PARM(keepalive_intvl, "i", int, 0644,
                "seconds between probes");

static int enable_csum = 0;
CFS_MODULE_PARM(enable_csum, "i", int, 0644,
                "enable check sum");

static int inject_csum_error = 0;
CFS_MODULE_PARM(inject_csum_error, "i", int, 0644,
                "set non-zero to inject a checksum error");
#ifdef CPU_AFFINITY
static int enable_irq_affinity = 1;
CFS_MODULE_PARM(enable_irq_affinity, "i", int, 0644,
                "enable IRQ affinity");
#endif

/* NOTE(review): declared unsigned but registered with parm type "i"
 * (signed int) — harmless for sane values, but verify the mismatch */
static unsigned int zc_min_frag = (2<<10);
CFS_MODULE_PARM(zc_min_frag, "i", int, 0644,
                "minimum fragment to zero copy");

#ifdef SOCKNAL_BACKOFF
static int backoff_init = 3;
CFS_MODULE_PARM(backoff_init, "i", int, 0644,
                "seconds for initial tcp backoff");

static int backoff_max = 3;
CFS_MODULE_PARM(backoff_max, "i", int, 0644,
                "seconds for maximum tcp backoff");
#endif
+
+ksock_tunables_t ksocknal_tunables = {
+        .ksnd_timeout         = &sock_timeout,
+       .ksnd_credits         = &credits,
+       .ksnd_peercredits     = &peer_credits,
+       .ksnd_nconnds         = &nconnds,
+       .ksnd_min_reconnectms = &min_reconnectms,
+       .ksnd_max_reconnectms = &max_reconnectms,
+        .ksnd_eager_ack       = &eager_ack,
+        .ksnd_typed_conns     = &typed_conns,
+        .ksnd_min_bulk        = &min_bulk,
+        .ksnd_tx_buffer_size  = &tx_buffer_size,
+        .ksnd_rx_buffer_size  = &rx_buffer_size,
+        .ksnd_nagle           = &nagle,
+        .ksnd_keepalive_idle  = &keepalive_idle,
+        .ksnd_keepalive_count = &keepalive_count,
+        .ksnd_keepalive_intvl = &keepalive_intvl,
+        .ksnd_enable_csum     = &enable_csum,
+        .ksnd_inject_csum_error = &inject_csum_error,
+        .ksnd_zc_min_frag     = &zc_min_frag,
+#ifdef CPU_AFFINITY
+        .ksnd_irq_affinity    = &enable_irq_affinity,
+#endif
+#ifdef SOCKNAL_BACKOFF
+        .ksnd_backoff_init    = &backoff_init,
+        .ksnd_backoff_max     = &backoff_max,
+#endif
+};
+
index 5287e70..5b5c2db 100644 (file)
@@ -1,5 +1,5 @@
-MODULES := kvibnal
-kvibnal-objs := vibnal.o vibnal_cb.o
+MODULES := kviblnd
+kviblnd-objs := viblnd.o viblnd_cb.o viblnd_modparams.o
 
 EXTRA_POST_CFLAGS := @VIBCPPFLAGS@
 
index f90fbf2..19861a9 100644 (file)
@@ -4,12 +4,10 @@
 # See the file COPYING in this distribution
 
 if MODULES
-if !CRAY_PORTALS
-if BUILD_VIBNAL
-modulenet_DATA = kvibnal$(KMODEXT)
-endif
+if BUILD_VIBLND
+modulenet_DATA = kviblnd$(KMODEXT)
 endif
 endif
 
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES = $(kvibnal-objs:%.o=%.c) vibnal.h vibnal_wire.h
+MOSTLYCLEANFILES = @MOSTLYCLEANFILES@
+DIST_SOURCES = $(kviblnd-objs:%.o=%.c) viblnd.h viblnd_wire.h
index 65cd89c..0d738a1 100644 (file)
  *
  */
 
-#include "vibnal.h"
-
-nal_t                   kibnal_api;
-ptl_handle_ni_t         kibnal_ni;
-kib_data_t              kibnal_data;
-kib_tunables_t          kibnal_tunables;
-
-#ifdef CONFIG_SYSCTL
-#define IBNAL_SYSCTL             202
-
-#define IBNAL_SYSCTL_TIMEOUT     1
-
-static ctl_table kibnal_ctl_table[] = {
-        {IBNAL_SYSCTL_TIMEOUT, "timeout", 
-         &kibnal_tunables.kib_io_timeout, sizeof (int),
-         0644, NULL, &proc_dointvec},
-        { 0 }
+#include "viblnd.h"
+
+lnd_t the_kiblnd = {
+        .lnd_type       = VIBLND,
+        .lnd_startup    = kibnal_startup,
+        .lnd_shutdown   = kibnal_shutdown,
+        .lnd_ctl        = kibnal_ctl,
+        .lnd_send       = kibnal_send,
+        .lnd_recv       = kibnal_recv,
+        .lnd_eager_recv = kibnal_eager_recv,
 };
 
-static ctl_table kibnal_top_ctl_table[] = {
-        {IBNAL_SYSCTL, "vibnal", NULL, 0, 0555, kibnal_ctl_table},
-        { 0 }
-};
-#endif
+kib_data_t              kibnal_data;
 
 void vibnal_assert_wire_constants (void)
 {
@@ -56,7 +45,7 @@ void vibnal_assert_wire_constants (void)
 
         /* Constants... */
         CLASSERT (IBNAL_MSG_MAGIC == 0x0be91b91);
-        CLASSERT (IBNAL_MSG_VERSION == 0x10);
+        CLASSERT (IBNAL_MSG_VERSION == 0x11);
         CLASSERT (IBNAL_MSG_CONNREQ == 0xc0);
         CLASSERT (IBNAL_MSG_CONNACK == 0xc1);
         CLASSERT (IBNAL_MSG_NOOP == 0xd0);
@@ -164,13 +153,6 @@ void vibnal_assert_wire_constants (void)
         CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.completion) == 12);
 }
 
-void
-kibnal_pause(int ticks)
-{
-        set_current_state(TASK_UNINTERRUPTIBLE);
-        schedule_timeout(ticks);
-}
-
 __u32 
 kibnal_cksum (void *ptr, int nob)
 {
@@ -192,33 +174,36 @@ kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
 }
 
 void
-kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid
-                __u64 dststamp, __u64 seq)
+kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits
+                lnet_nid_t dstnid, __u64 dststamp, __u64 seq)
 {
         /* CAVEAT EMPTOR! all message fields not set here should have been
          * initialised previously. */
         msg->ibm_magic    = IBNAL_MSG_MAGIC;
-        msg->ibm_version  = IBNAL_MSG_VERSION;
+        msg->ibm_version  = version;
         /*   ibm_type */
         msg->ibm_credits  = credits;
         /*   ibm_nob */
         msg->ibm_cksum    = 0;
-        msg->ibm_srcnid   = kibnal_lib.libnal_ni.ni_pid.nid;
+        msg->ibm_srcnid   = lnet_ptlcompat_srcnid(kibnal_data.kib_ni->ni_nid,
+                                                  dstnid);
         msg->ibm_srcstamp = kibnal_data.kib_incarnation;
         msg->ibm_dstnid   = dstnid;
         msg->ibm_dststamp = dststamp;
         msg->ibm_seq      = seq;
-#if IBNAL_CKSUM
-        /* NB ibm_cksum zero while computing cksum */
-        msg->ibm_cksum    = kibnal_cksum(msg, msg->ibm_nob);
-#endif
+
+        if (*kibnal_tunables.kib_cksum) {
+                /* NB ibm_cksum zero while computing cksum */
+                msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
+        }
 }
 
 int
-kibnal_unpack_msg(kib_msg_t *msg, int nob)
+kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob)
 {
         const int hdr_size = offsetof(kib_msg_t, ibm_u);
         __u32     msg_cksum;
+        __u32     msg_version;
         int       flip;
         int       msg_nob;
 #if !IBNAL_USE_FMR
@@ -231,18 +216,35 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                 return -EPROTO;
         }
 
+        /* Future protocol version compatibility support!
+         * If the viblnd-specific protocol changes, or when LNET unifies
+         * protocols over all LNDs, the initial connection will negotiate a
+         * protocol version.  If I find this, I avoid any console errors.  If
+         * my peer is doing connection establishment, the reject will tell it
+         * which version I'm running. */
+
         if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
                 flip = 0;
         } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
                 flip = 1;
         } else {
+                if (msg->ibm_magic == LNET_PROTO_MAGIC ||
+                    msg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
+                        return -EPROTO;
+
+                /* Completely out to lunch */
                 CERROR("Bad magic: %08x\n", msg->ibm_magic);
                 return -EPROTO;
         }
 
-        if (msg->ibm_version != 
-            (flip ? __swab16(IBNAL_MSG_VERSION) : IBNAL_MSG_VERSION)) {
-                CERROR("Bad version: %d\n", msg->ibm_version);
+        msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version;
+        if (expected_version == 0) {
+                if (msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
+                    msg_version != IBNAL_MSG_VERSION)
+                        return -EPROTO;
+        } else if (msg_version != expected_version) {
+                CERROR("Bad version: %x(%x expected)\n", 
+                       msg_version, expected_version);
                 return -EPROTO;
         }
 
@@ -270,7 +272,7 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
         
         if (flip) {
                 /* leave magic unflipped as a clue to peer endianness */
-                __swab16s(&msg->ibm_version);
+                msg->ibm_version = msg_version;
                 CLASSERT (sizeof(msg->ibm_type) == 1);
                 CLASSERT (sizeof(msg->ibm_credits) == 1);
                 msg->ibm_nob = msg_nob;
@@ -281,8 +283,8 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                 __swab64s(&msg->ibm_seq);
         }
         
-        if (msg->ibm_srcnid == PTL_NID_ANY) {
-                CERROR("Bad src nid: "LPX64"\n", msg->ibm_srcnid);
+        if (msg->ibm_srcnid == LNET_NID_ANY) {
+                CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
                 return -EPROTO;
         }
 
@@ -311,13 +313,12 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                 break;
 
         case IBNAL_MSG_PUT_ACK:
-#if IBNAL_USE_FMR
                 if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) {
                         CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
                                (int)(hdr_size + sizeof(msg->ibm_u.putack)));
                         return -EPROTO;
                 }
-
+#if IBNAL_USE_FMR
                 if (flip) {
                         __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr);
                         __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob);
@@ -422,103 +423,75 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
 }
 
 int
-kibnal_set_mynid(ptl_nid_t nid)
+kibnal_start_listener (lnet_ni_t *ni)
 {
-        static cm_listen_data_t info;           /* protected by kib_nid_mutex */
+        static cm_listen_data_t info;
 
-        lib_ni_t        *ni = &kibnal_lib.libnal_ni;
-        int              rc;
         cm_return_t      cmrc;
 
-        CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n",
-               nid, ni->ni_pid.nid);
-
-        down (&kibnal_data.kib_nid_mutex);
+        LASSERT (kibnal_data.kib_listen_handle == NULL);
 
-        if (nid == ni->ni_pid.nid) {
-                /* no change of NID */
-                up (&kibnal_data.kib_nid_mutex);
-                return (0);
+        kibnal_data.kib_listen_handle = 
+                cm_create_cep(cm_cep_transp_rc);
+        if (kibnal_data.kib_listen_handle == NULL) {
+                CERROR ("Can't create listen CEP\n");
+                return -ENOMEM;
         }
 
-        CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", ni->ni_pid.nid, nid);
+        CDEBUG(D_NET, "Created CEP %p for listening\n", 
+               kibnal_data.kib_listen_handle);
 
-        if (kibnal_data.kib_listen_handle != NULL) {
-                cmrc = cm_cancel(kibnal_data.kib_listen_handle);
-                if (cmrc != cm_stat_success)
-                        CERROR ("Error %d stopping listener\n", cmrc);
+        memset(&info, 0, sizeof(info));
+        info.listen_addr.end_pt.sid = 
+                (__u64)(*kibnal_tunables.kib_service_number);
 
-                kibnal_pause(HZ/10);            /* ensure no more callbacks */
+        cmrc = cm_listen(kibnal_data.kib_listen_handle, &info,
+                         kibnal_listen_callback, NULL);
+        if (cmrc == cm_stat_success)
+                return 0;
         
-                cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
-                if (cmrc != vv_return_ok)
-                        CERROR ("Error %d destroying CEP\n", cmrc);
-
-                kibnal_data.kib_listen_handle = NULL;
-        }
-
-        /* Change NID.  NB queued passive connection requests (if any) will be
-         * rejected with an incorrect destination NID */
-        ni->ni_pid.nid = nid;
-        kibnal_data.kib_incarnation++;
-        mb();
-
-        /* Delete all existing peers and their connections after new
-         * NID/incarnation set to ensure no old connections in our brave
-         * new world. */
-        kibnal_del_peer (PTL_NID_ANY, 0);
-
-        if (ni->ni_pid.nid != PTL_NID_ANY) {    /* got a new NID to install */
-                kibnal_data.kib_listen_handle = 
-                        cm_create_cep(cm_cep_transp_rc);
-                if (kibnal_data.kib_listen_handle == NULL) {
-                        CERROR ("Can't create listen CEP\n");
-                        rc = -ENOMEM;
-                        goto failed_0;
-                }
+        CERROR ("cm_listen error: %d\n", cmrc);
 
-                CDEBUG(D_NET, "Created CEP %p for listening\n", 
-                       kibnal_data.kib_listen_handle);
+        cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
+        LASSERT (cmrc == cm_stat_success);
 
-                memset(&info, 0, sizeof(info));
-                info.listen_addr.end_pt.sid = kibnal_data.kib_svc_id;
+        kibnal_data.kib_listen_handle = NULL;
+        return -EINVAL;
+}
 
-                cmrc = cm_listen(kibnal_data.kib_listen_handle, &info,
-                                 kibnal_listen_callback, NULL);
-                if (cmrc != 0) {
-                        CERROR ("cm_listen error: %d\n", cmrc);
-                        rc = -EINVAL;
-                        goto failed_1;
-                }
-        }
+void
+kibnal_stop_listener(lnet_ni_t *ni)
+{
+        cm_return_t      cmrc;
 
-        up (&kibnal_data.kib_nid_mutex);
-        return (0);
+        LASSERT (kibnal_data.kib_listen_handle != NULL);
+        
+        cmrc = cm_cancel(kibnal_data.kib_listen_handle);
+        if (cmrc != cm_stat_success)
+                CERROR ("Error %d stopping listener\n", cmrc);
 
- failed_1:
+        cfs_pause(cfs_time_seconds(1)/10);   /* ensure no more callbacks */
+        
         cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle);
-        LASSERT (cmrc == cm_stat_success);
+        if (cmrc != vv_return_ok)
+                CERROR ("Error %d destroying CEP\n", cmrc);
+
         kibnal_data.kib_listen_handle = NULL;
- failed_0:
-        ni->ni_pid.nid = PTL_NID_ANY;
-        kibnal_data.kib_incarnation++;
-        mb();
-        kibnal_del_peer (PTL_NID_ANY, 0);
-        up (&kibnal_data.kib_nid_mutex);
-        return rc;
 }
 
-kib_peer_t *
-kibnal_create_peer (ptl_nid_t nid)
+int
+kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid)
 {
-        kib_peer_t *peer;
+        kib_peer_t     *peer;
+        unsigned long   flags;
+        int             rc;
 
-        LASSERT (nid != PTL_NID_ANY);
+        LASSERT (nid != LNET_NID_ANY);
 
-        PORTAL_ALLOC(peer, sizeof (*peer));
+        LIBCFS_ALLOC(peer, sizeof (*peer));
         if (peer == NULL) {
-                CERROR("Canot allocate perr\n");
-                return (NULL);
+                CERROR("Cannot allocate peer\n");
+                return -ENOMEM;
         }
 
         memset(peer, 0, sizeof(*peer));         /* zero flags etc */
@@ -530,43 +503,62 @@ kibnal_create_peer (ptl_nid_t nid)
         INIT_LIST_HEAD (&peer->ibp_conns);
         INIT_LIST_HEAD (&peer->ibp_tx_queue);
 
-        peer->ibp_reconnect_time = jiffies;
-        peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+        peer->ibp_error = 0;
+        peer->ibp_last_alive = cfs_time_current();
+        peer->ibp_reconnect_interval = 0;       /* OK to connect at any time */
+
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+
+        if (atomic_read(&kibnal_data.kib_npeers) >=
+            *kibnal_tunables.kib_concurrent_peers) {
+                rc = -EOVERFLOW;        /* !! but at least it distinguishes */
+        } else if (kibnal_data.kib_listen_handle == NULL) {
+                rc = -ESHUTDOWN;        /* shutdown has started */
+        } else {
+                rc = 0;
+                /* npeers only grows with the global lock held */
+                atomic_inc(&kibnal_data.kib_npeers);
+        }
+        
+        write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 
-        atomic_inc (&kibnal_data.kib_npeers);
-        if (atomic_read(&kibnal_data.kib_npeers) <= IBNAL_CONCURRENT_PEERS)
-                return peer;
+        if (rc != 0) {
+                CERROR("Can't create peer: %s\n", 
+                       (rc == -ESHUTDOWN) ? "shutting down" : 
+                       "too many peers");
+                LIBCFS_FREE(peer, sizeof(*peer));
+        } else {
+                *peerp = peer;
+        }
         
-        CERROR("Too many peers: CQ will overflow\n");
-        kibnal_peer_decref(peer);
-        return NULL;
+        return rc;
 }
 
 void
 kibnal_destroy_peer (kib_peer_t *peer)
 {
-
         LASSERT (atomic_read (&peer->ibp_refcount) == 0);
         LASSERT (peer->ibp_persistence == 0);
         LASSERT (!kibnal_peer_active(peer));
         LASSERT (peer->ibp_connecting == 0);
+        LASSERT (peer->ibp_accepting == 0);
         LASSERT (list_empty (&peer->ibp_conns));
         LASSERT (list_empty (&peer->ibp_tx_queue));
         
-        PORTAL_FREE (peer, sizeof (*peer));
+        LIBCFS_FREE (peer, sizeof (*peer));
 
         /* NB a peer's connections keep a reference on their peer until
          * they are destroyed, so we can be assured that _all_ state to do
          * with this peer has been cleaned up when its refcount drops to
          * zero. */
-        atomic_dec (&kibnal_data.kib_npeers);
+        atomic_dec(&kibnal_data.kib_npeers);
 }
 
-/* the caller is responsible for accounting for the additional reference
- * that this creates */
 kib_peer_t *
-kibnal_find_peer_locked (ptl_nid_t nid)
+kibnal_find_peer_locked (lnet_nid_t nid)
 {
+        /* the caller is responsible for accounting the additional reference
+         * that this creates */
         struct list_head *peer_list = kibnal_nid2peerlist (nid);
         struct list_head *tmp;
         kib_peer_t       *peer;
@@ -577,13 +569,15 @@ kibnal_find_peer_locked (ptl_nid_t nid)
 
                 LASSERT (peer->ibp_persistence != 0 || /* persistent peer */
                          peer->ibp_connecting != 0 || /* creating conns */
+                         peer->ibp_accepting != 0 ||
                          !list_empty (&peer->ibp_conns));  /* active conn */
 
                 if (peer->ibp_nid != nid)
                         continue;
 
-                CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n",
-                       peer, nid, atomic_read (&peer->ibp_refcount));
+                CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n",
+                       peer, libcfs_nid2str(nid),
+                       atomic_read (&peer->ibp_refcount));
                 return (peer);
         }
         return (NULL);
@@ -602,7 +596,7 @@ kibnal_unlink_peer_locked (kib_peer_t *peer)
 }
 
 int
-kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp,
+kibnal_get_peer_info (int index, lnet_nid_t *nidp, __u32 *ipp,
                       int *persistencep)
 {
         kib_peer_t        *peer;
@@ -619,6 +613,7 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp,
                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
+                                 peer->ibp_accepting != 0 ||
                                  !list_empty (&peer->ibp_conns));
 
                         if (index-- > 0)
@@ -639,20 +634,22 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp,
 }
 
 int
-kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip)
+kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip)
 {
         kib_peer_t        *peer;
         kib_peer_t        *peer2;
         unsigned long      flags;
+        int                rc;
 
-        CDEBUG(D_NET, LPX64"@%08x\n", nid, ip);
+        CDEBUG(D_NET, "%s at %u.%u.%u.%u\n",
+               libcfs_nid2str(nid), HIPQUAD(ip));
         
-        if (nid == PTL_NID_ANY)
+        if (nid == LNET_NID_ANY)
                 return (-EINVAL);
 
-        peer = kibnal_create_peer (nid);
-        if (peer == NULL)
-                return (-ENOMEM);
+        rc = kibnal_create_peer(&peer, nid);
+        if (rc != 0)
+                return rc;
 
         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
@@ -674,19 +671,13 @@ kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip)
 }
 
 void
-kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
+kibnal_del_peer_locked (kib_peer_t *peer)
 {
         struct list_head *ctmp;
         struct list_head *cnxt;
         kib_conn_t       *conn;
 
-        if (!single_share)
-                peer->ibp_persistence = 0;
-        else if (peer->ibp_persistence > 0)
-                peer->ibp_persistence--;
-
-        if (peer->ibp_persistence != 0)
-                return;
+        peer->ibp_persistence = 0;
 
         if (list_empty(&peer->ibp_conns)) {
                 kibnal_unlink_peer_locked(peer);
@@ -704,8 +695,9 @@ kibnal_del_peer_locked (kib_peer_t *peer, int single_share)
 }
 
 int
-kibnal_del_peer (ptl_nid_t nid, int single_share)
+kibnal_del_peer (lnet_nid_t nid)
 {
+        CFS_LIST_HEAD     (zombies);
         struct list_head  *ptmp;
         struct list_head  *pnxt;
         kib_peer_t        *peer;
@@ -717,7 +709,7 @@ kibnal_del_peer (ptl_nid_t nid, int single_share)
 
         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
-        if (nid != PTL_NID_ANY)
+        if (nid != LNET_NID_ANY)
                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
         else {
                 lo = 0;
@@ -729,20 +721,27 @@ kibnal_del_peer (ptl_nid_t nid, int single_share)
                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
+                                 peer->ibp_accepting != 0 ||
                                  !list_empty (&peer->ibp_conns));
 
-                        if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid))
+                        if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid))
                                 continue;
 
-                        kibnal_del_peer_locked (peer, single_share);
-                        rc = 0;         /* matched something */
+                        if (!list_empty(&peer->ibp_tx_queue)) {
+                                LASSERT (list_empty(&peer->ibp_conns));
 
-                        if (single_share)
-                                goto out;
+                                list_splice_init(&peer->ibp_tx_queue, &zombies);
+                        }
+
+                        kibnal_del_peer_locked (peer);
+                        rc = 0;         /* matched something */
                 }
         }
- out:
+
         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+        kibnal_txlist_done(&zombies, -EIO);
+
         return (rc);
 }
 
@@ -764,6 +763,7 @@ kibnal_get_conn_by_idx (int index)
                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence > 0 ||
                                  peer->ibp_connecting != 0 ||
+                                 peer->ibp_accepting != 0 ||
                                  !list_empty (&peer->ibp_conns));
 
                         list_for_each (ctmp, &peer->ibp_conns) {
@@ -783,6 +783,74 @@ kibnal_get_conn_by_idx (int index)
         return (NULL);
 }
 
+void
+kibnal_debug_rx (kib_rx_t *rx)
+{
+        CDEBUG(D_CONSOLE, "      %p nob %d msg_type %x "
+               "cred %d seq "LPD64"\n",
+               rx, rx->rx_nob, rx->rx_msg->ibm_type,
+               rx->rx_msg->ibm_credits, rx->rx_msg->ibm_seq);
+}
+
+void
+kibnal_debug_tx (kib_tx_t *tx)
+{
+        CDEBUG(D_CONSOLE, "      %p snd %d q %d w %d rc %d dl %lx "
+               "cookie "LPX64" msg %s%s type %x cred %d seq "LPD64"\n",
+               tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting,
+               tx->tx_status, tx->tx_deadline, tx->tx_cookie,
+               tx->tx_lntmsg[0] == NULL ? "-" : "!",
+               tx->tx_lntmsg[1] == NULL ? "-" : "!",
+               tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits,
+               tx->tx_msg->ibm_seq);
+}
+
+void
+kibnal_debug_conn (kib_conn_t *conn)
+{
+        struct list_head *tmp;
+        int               i;
+        
+        spin_lock(&conn->ibc_lock);
+        
+        CDEBUG(D_CONSOLE, "conn[%d] %p -> %s: \n", 
+               atomic_read(&conn->ibc_refcount), conn, 
+               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+        CDEBUG(D_CONSOLE, "   txseq "LPD64" rxseq "LPD64" state %d \n",
+               conn->ibc_txseq, conn->ibc_rxseq, conn->ibc_state);
+        CDEBUG(D_CONSOLE, "   nposted %d cred %d o_cred %d r_cred %d\n",
+               conn->ibc_nsends_posted, conn->ibc_credits, 
+               conn->ibc_outstanding_credits, conn->ibc_reserved_credits);
+        CDEBUG(D_CONSOLE, "   disc %d comms_err %d\n",
+               conn->ibc_disconnect, conn->ibc_comms_error);
+
+        CDEBUG(D_CONSOLE, "   early_rxs:\n");
+        list_for_each(tmp, &conn->ibc_early_rxs)
+                kibnal_debug_rx(list_entry(tmp, kib_rx_t, rx_list));
+        
+        CDEBUG(D_CONSOLE, "   tx_queue_nocred:\n");
+        list_for_each(tmp, &conn->ibc_tx_queue_nocred)
+                kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+        CDEBUG(D_CONSOLE, "   tx_queue_rsrvd:\n");
+        list_for_each(tmp, &conn->ibc_tx_queue_rsrvd)
+                kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+
+        CDEBUG(D_CONSOLE, "   tx_queue:\n");
+        list_for_each(tmp, &conn->ibc_tx_queue)
+                kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+        
+        CDEBUG(D_CONSOLE, "   active_txs:\n");
+        list_for_each(tmp, &conn->ibc_active_txs)
+                kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list));
+        
+        CDEBUG(D_CONSOLE, "   rxs:\n");
+        for (i = 0; i < IBNAL_RX_MSGS; i++)
+                kibnal_debug_rx(&conn->ibc_rxs[i]);
+
+        spin_unlock(&conn->ibc_lock);
+}
+
 int
 kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state)
 {
@@ -835,7 +903,7 @@ kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state)
                 rtr->destanation_qp            = cv->cv_remote_qpn;
                 rtr->receive_psn               = cv->cv_rxpsn;
                 rtr->responder_rdma_r_atom_num = IBNAL_OUS_DST_RD;
-                rtr->opt_min_rnr_nak_timer     = IBNAL_RNR_NAK_TIMER;
+                rtr->opt_min_rnr_nak_timer     = *kibnal_tunables.kib_rnr_nak_timer;
 
 
                 // XXX sdp sets VV_QP_AT_OP_F but no actual optional options
@@ -851,9 +919,9 @@ kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state)
                 struct vv_qp_modify_rts_st *rts = &attr.modify.params.rts;
 
                 rts->send_psn                 = cv->cv_txpsn;
-                rts->local_ack_timeout        = IBNAL_LOCAL_ACK_TIMEOUT;
-                rts->retry_num                = IBNAL_RETRY_CNT;
-                rts->rnr_num                  = IBNAL_RNR_CNT;
+                rts->local_ack_timeout        = *kibnal_tunables.kib_local_ack_timeout;
+                rts->retry_num                = *kibnal_tunables.kib_retry_cnt;
+                rts->rnr_num                  = *kibnal_tunables.kib_rnr_cnt;
                 rts->dest_out_rdma_r_atom_num = IBNAL_OUS_DST_RD;
                 
                 attr.modify.vv_qp_attr_mask = VV_QP_AT_S_PSN |
@@ -874,8 +942,9 @@ kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state)
         
         vvrc = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &attr, NULL);
         if (vvrc != vv_return_ok) {
-                CERROR("Can't modify qp -> "LPX64" state to %d: %d\n", 
-                       conn->ibc_peer->ibp_nid, new_state, vvrc);
+                CERROR("Can't modify qp -> %s state to %d: %d\n", 
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                       new_state, vvrc);
                 return -EIO;
         }
         
@@ -899,7 +968,7 @@ kibnal_create_conn (cm_cep_handle_t cep)
         LASSERT(!in_interrupt());
         LASSERT(current == kibnal_data.kib_connd);
         
-        PORTAL_ALLOC(conn, sizeof (*conn));
+        LIBCFS_ALLOC(conn, sizeof (*conn));
         if (conn == NULL) {
                 CERROR ("Can't allocate connection\n");
                 return (NULL);
@@ -908,8 +977,12 @@ kibnal_create_conn (cm_cep_handle_t cep)
         /* zero flags, NULL pointers etc... */
         memset (conn, 0, sizeof (*conn));
 
+        conn->ibc_version = IBNAL_MSG_VERSION;  /* Use latest version at first */
+
         INIT_LIST_HEAD (&conn->ibc_early_rxs);
+        INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred);
         INIT_LIST_HEAD (&conn->ibc_tx_queue);
+        INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd);
         INIT_LIST_HEAD (&conn->ibc_active_txs);
         spin_lock_init (&conn->ibc_lock);
         
@@ -918,7 +991,7 @@ kibnal_create_conn (cm_cep_handle_t cep)
 
         conn->ibc_cep = cep;
 
-        PORTAL_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+        LIBCFS_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
         if (conn->ibc_connvars == NULL) {
                 CERROR("Can't allocate in-progress connection state\n");
                 goto failed;
@@ -928,7 +1001,7 @@ kibnal_create_conn (cm_cep_handle_t cep)
         get_random_bytes(&conn->ibc_connvars->cv_rxpsn,
                          sizeof(conn->ibc_connvars->cv_rxpsn));
 
-        PORTAL_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
+        LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
         if (conn->ibc_rxs == NULL) {
                 CERROR("Cannot allocate RX buffers\n");
                 goto failed;
@@ -976,7 +1049,7 @@ kibnal_create_conn (cm_cep_handle_t cep)
         reqattr.create.cq_send_h                  = kibnal_data.kib_cq;
         reqattr.create.cq_receive_h               = kibnal_data.kib_cq;
         reqattr.create.send_max_outstand_wr       = (1 + IBNAL_MAX_RDMA_FRAGS) * 
-                                                    IBNAL_MSG_QUEUE_SIZE;
+                                                    (*kibnal_tunables.kib_concurrent_sends);
         reqattr.create.receive_max_outstand_wr    = IBNAL_RX_MSGS;
         reqattr.create.max_scatgat_per_send_wr    = 1;
         reqattr.create.max_scatgat_per_receive_wr = 1;
@@ -996,12 +1069,13 @@ kibnal_create_conn (cm_cep_handle_t cep)
         conn->ibc_connvars->cv_local_qpn = rspattr.create_return.qp_num;
 
         if (rspattr.create_return.receive_max_outstand_wr < 
-            IBNAL_MSG_QUEUE_SIZE ||
+            IBNAL_RX_MSGS ||
             rspattr.create_return.send_max_outstand_wr < 
-            (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE) {
+            (1 + IBNAL_MAX_RDMA_FRAGS) * (*kibnal_tunables.kib_concurrent_sends)) {
                 CERROR("Insufficient rx/tx work items: wanted %d/%d got %d/%d\n",
-                       IBNAL_MSG_QUEUE_SIZE, 
-                       (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE,
+                       IBNAL_RX_MSGS, 
+                       (1 + IBNAL_MAX_RDMA_FRAGS) * 
+                       (*kibnal_tunables.kib_concurrent_sends),
                        rspattr.create_return.receive_max_outstand_wr,
                        rspattr.create_return.send_max_outstand_wr);
                 goto failed;
@@ -1033,6 +1107,8 @@ kibnal_destroy_conn (kib_conn_t *conn)
         LASSERT (atomic_read (&conn->ibc_refcount) == 0);
         LASSERT (list_empty(&conn->ibc_early_rxs));
         LASSERT (list_empty(&conn->ibc_tx_queue));
+        LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd));
+        LASSERT (list_empty(&conn->ibc_tx_queue_nocred));
         LASSERT (list_empty(&conn->ibc_active_txs));
         LASSERT (conn->ibc_nsends_posted == 0);
 
@@ -1066,16 +1142,16 @@ kibnal_destroy_conn (kib_conn_t *conn)
                 kibnal_free_pages(conn->ibc_rx_pages);
 
         if (conn->ibc_rxs != NULL)
-                PORTAL_FREE(conn->ibc_rxs, 
+                LIBCFS_FREE(conn->ibc_rxs, 
                             IBNAL_RX_MSGS * sizeof(kib_rx_t));
 
         if (conn->ibc_connvars != NULL)
-                PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+                LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
 
         if (conn->ibc_peer != NULL)
                 kibnal_peer_decref(conn->ibc_peer);
 
-        PORTAL_FREE(conn, sizeof (*conn));
+        LIBCFS_FREE(conn, sizeof (*conn));
 
         atomic_dec(&kibnal_data.kib_nconns);
 }
@@ -1112,8 +1188,9 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
                 if (conn->ibc_incarnation == incarnation)
                         continue;
 
-                CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n",
-                       peer->ibp_nid, conn->ibc_incarnation, incarnation);
+                CDEBUG(D_NET, "Closing stale conn -> %s incarnation:"LPX64"("LPX64")\n",
+                       libcfs_nid2str(peer->ibp_nid),
+                       conn->ibc_incarnation, incarnation);
                 
                 count++;
                 kibnal_close_conn_locked (conn, -ESTALE);
@@ -1123,7 +1200,7 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation)
 }
 
 int
-kibnal_close_matching_conns (ptl_nid_t nid)
+kibnal_close_matching_conns (lnet_nid_t nid)
 {
         kib_peer_t         *peer;
         struct list_head   *ptmp;
@@ -1136,7 +1213,7 @@ kibnal_close_matching_conns (ptl_nid_t nid)
 
         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
-        if (nid != PTL_NID_ANY)
+        if (nid != LNET_NID_ANY)
                 lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers;
         else {
                 lo = 0;
@@ -1149,9 +1226,10 @@ kibnal_close_matching_conns (ptl_nid_t nid)
                         peer = list_entry (ptmp, kib_peer_t, ibp_list);
                         LASSERT (peer->ibp_persistence != 0 ||
                                  peer->ibp_connecting != 0 ||
+                                 peer->ibp_accepting != 0 ||
                                  !list_empty (&peer->ibp_conns));
 
-                        if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid))
+                        if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid))
                                 continue;
 
                         count += kibnal_close_peer_conns_locked (peer, 0);
@@ -1161,70 +1239,69 @@ kibnal_close_matching_conns (ptl_nid_t nid)
         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 
         /* wildcards always succeed */
-        if (nid == PTL_NID_ANY)
+        if (nid == LNET_NID_ANY)
                 return (0);
         
         return (count == 0 ? -ENOENT : 0);
 }
 
 int
-kibnal_cmd(struct portals_cfg *pcfg, void * private)
+kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
 {
-        int rc = -EINVAL;
+        struct libcfs_ioctl_data *data = arg;
+        int                       rc = -EINVAL;
 
-        LASSERT (pcfg != NULL);
+        LASSERT (ni == kibnal_data.kib_ni);
 
-        switch(pcfg->pcfg_command) {
-        case NAL_CMD_GET_PEER: {
-                ptl_nid_t   nid = 0;
-                __u32       ip = 0;
-                int         share_count = 0;
+        switch(cmd) {
+        case IOC_LIBCFS_GET_PEER: {
+                lnet_nid_t   nid = 0;
+                __u32        ip = 0;
+                int          share_count = 0;
 
-                rc = kibnal_get_peer_info(pcfg->pcfg_count,
+                rc = kibnal_get_peer_info(data->ioc_count,
                                           &nid, &ip, &share_count);
-                pcfg->pcfg_nid   = nid;
-                pcfg->pcfg_size  = 0;
-                pcfg->pcfg_id    = ip;
-                pcfg->pcfg_misc  = IBNAL_SERVICE_NUMBER; /* port */
-                pcfg->pcfg_count = 0;
-                pcfg->pcfg_wait  = share_count;
+                data->ioc_nid    = nid;
+                data->ioc_count  = share_count;
+                data->ioc_u32[0] = ip;
+                data->ioc_u32[1] = *kibnal_tunables.kib_service_number; /* port */
                 break;
         }
-        case NAL_CMD_ADD_PEER: {
-                rc = kibnal_add_persistent_peer (pcfg->pcfg_nid,
-                                                 pcfg->pcfg_id); /* IP */
+        case IOC_LIBCFS_ADD_PEER: {
+                rc = kibnal_add_persistent_peer (data->ioc_nid,
+                                                 data->ioc_u32[0]); /* IP */
                 break;
         }
-        case NAL_CMD_DEL_PEER: {
-                rc = kibnal_del_peer (pcfg->pcfg_nid, 
-                                       /* flags == single_share */
-                                       pcfg->pcfg_flags != 0);
+        case IOC_LIBCFS_DEL_PEER: {
+                rc = kibnal_del_peer (data->ioc_nid);
                 break;
         }
-        case NAL_CMD_GET_CONN: {
-                kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count);
+        case IOC_LIBCFS_GET_CONN: {
+                kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count);
 
                 if (conn == NULL)
                         rc = -ENOENT;
                 else {
+                        // kibnal_debug_conn(conn);
                         rc = 0;
-                        pcfg->pcfg_nid   = conn->ibc_peer->ibp_nid;
-                        pcfg->pcfg_id    = 0;
-                        pcfg->pcfg_misc  = 0;
-                        pcfg->pcfg_flags = 0;
+                        data->ioc_nid = conn->ibc_peer->ibp_nid;
                         kibnal_conn_decref(conn);
                 }
                 break;
         }
-        case NAL_CMD_CLOSE_CONNECTION: {
-                rc = kibnal_close_matching_conns (pcfg->pcfg_nid);
+        case IOC_LIBCFS_CLOSE_CONNECTION: {
+                rc = kibnal_close_matching_conns (data->ioc_nid);
                 break;
         }
-        case NAL_CMD_REGISTER_MYNID: {
-                if (pcfg->pcfg_nid == PTL_NID_ANY)
+        case IOC_LIBCFS_REGISTER_MYNID: {
+                if (ni->ni_nid == data->ioc_nid) {
+                        rc = 0;
+                } else {
+                        CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+                               libcfs_nid2str(data->ioc_nid),
+                               libcfs_nid2str(ni->ni_nid));
                         rc = -EINVAL;
-                else
-                        rc = kibnal_set_mynid (pcfg->pcfg_nid);
+                }
                 break;
         }
         }
@@ -1242,7 +1319,7 @@ kibnal_free_pages (kib_pages_t *p)
                 if (p->ibp_pages[i] != NULL)
                         __free_page(p->ibp_pages[i]);
         
-        PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
+        LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages]));
 }
 
 int
@@ -1251,7 +1328,7 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
         kib_pages_t   *p;
         int            i;
 
-        PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
+        LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
         if (p == NULL) {
                 CERROR ("Can't allocate buffer %d\n", npages);
                 return (-ENOMEM);
@@ -1278,36 +1355,36 @@ kibnal_alloc_tx_descs (void)
 {
         int    i;
         
-        PORTAL_ALLOC (kibnal_data.kib_tx_descs,
-                      IBNAL_TX_MSGS * sizeof(kib_tx_t));
+        LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
+                      IBNAL_TX_MSGS() * sizeof(kib_tx_t));
         if (kibnal_data.kib_tx_descs == NULL)
                 return -ENOMEM;
         
         memset(kibnal_data.kib_tx_descs, 0,
-               IBNAL_TX_MSGS * sizeof(kib_tx_t));
+               IBNAL_TX_MSGS() * sizeof(kib_tx_t));
 
-        for (i = 0; i < IBNAL_TX_MSGS; i++) {
+        for (i = 0; i < IBNAL_TX_MSGS(); i++) {
                 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
 
 #if IBNAL_USE_FMR
-                PORTAL_ALLOC(tx->tx_pages, PTL_MD_MAX_IOV *
+                LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV *
                              sizeof(*tx->tx_pages));
                 if (tx->tx_pages == NULL)
                         return -ENOMEM;
 #else
-                PORTAL_ALLOC(tx->tx_wrq, 
+                LIBCFS_ALLOC(tx->tx_wrq, 
                              (1 + IBNAL_MAX_RDMA_FRAGS) * 
                              sizeof(*tx->tx_wrq));
                 if (tx->tx_wrq == NULL)
                         return -ENOMEM;
                 
-                PORTAL_ALLOC(tx->tx_gl, 
+                LIBCFS_ALLOC(tx->tx_gl, 
                              (1 + IBNAL_MAX_RDMA_FRAGS) * 
                              sizeof(*tx->tx_gl));
                 if (tx->tx_gl == NULL)
                         return -ENOMEM;
                 
-                PORTAL_ALLOC(tx->tx_rd, 
+                LIBCFS_ALLOC(tx->tx_rd, 
                              offsetof(kib_rdma_desc_t, 
                                       rd_frags[IBNAL_MAX_RDMA_FRAGS]));
                 if (tx->tx_rd == NULL)
@@ -1326,33 +1403,33 @@ kibnal_free_tx_descs (void)
         if (kibnal_data.kib_tx_descs == NULL)
                 return;
 
-        for (i = 0; i < IBNAL_TX_MSGS; i++) {
+        for (i = 0; i < IBNAL_TX_MSGS(); i++) {
                 kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
 
 #if IBNAL_USE_FMR
                 if (tx->tx_pages != NULL)
-                        PORTAL_FREE(tx->tx_pages, PTL_MD_MAX_IOV *
+                        LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV *
                                     sizeof(*tx->tx_pages));
 #else
                 if (tx->tx_wrq != NULL)
-                        PORTAL_FREE(tx->tx_wrq, 
+                        LIBCFS_FREE(tx->tx_wrq, 
                                     (1 + IBNAL_MAX_RDMA_FRAGS) * 
                                     sizeof(*tx->tx_wrq));
 
                 if (tx->tx_gl != NULL)
-                        PORTAL_FREE(tx->tx_gl, 
+                        LIBCFS_FREE(tx->tx_gl, 
                                     (1 + IBNAL_MAX_RDMA_FRAGS) * 
                                     sizeof(*tx->tx_gl));
 
                 if (tx->tx_rd != NULL)
-                        PORTAL_FREE(tx->tx_rd, 
+                        LIBCFS_FREE(tx->tx_rd, 
                                     offsetof(kib_rdma_desc_t, 
                                              rd_frags[IBNAL_MAX_RDMA_FRAGS]));
 #endif
         }
 
-        PORTAL_FREE(kibnal_data.kib_tx_descs,
-                    IBNAL_TX_MSGS * sizeof(kib_tx_t));
+        LIBCFS_FREE(kibnal_data.kib_tx_descs,
+                    IBNAL_TX_MSGS() * sizeof(kib_tx_t));
 }
 
 #if IBNAL_USE_FMR
@@ -1396,24 +1473,23 @@ kibnal_setup_tx_descs (void)
         /* No fancy arithmetic when we do the buffer calculations */
         CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
 
-        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, 
-                                0);
+        rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, 
+                                IBNAL_TX_MSG_PAGES(), 0);
         if (rc != 0)
                 return (rc);
 
-        for (i = 0; i < IBNAL_TX_MSGS; i++) {
+        for (i = 0; i < IBNAL_TX_MSGS(); i++) {
                 page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
                 tx = &kibnal_data.kib_tx_descs[i];
 
 #if IBNAL_USE_FMR
                 memset(&fmr_props, 0, sizeof(fmr_props));
                 fmr_props.pd_hndl              = kibnal_data.kib_pd;
-                fmr_props.acl                  = (vv_acc_r_mem_read |
-                                                  vv_acc_r_mem_write |
+                fmr_props.acl                  = (vv_acc_r_mem_write |
                                                   vv_acc_l_mem_write);
-                fmr_props.max_pages            = PTL_MD_MAX_IOV;
+                fmr_props.max_pages            = LNET_MAX_IOV;
                 fmr_props.log2_page_sz         = PAGE_SHIFT;
-                fmr_props.max_outstanding_maps = IBNAL_FMR_NMAPS;
+                fmr_props.max_outstanding_maps = *kibnal_tunables.kib_fmr_remaps;
                 
                 vvrc = vv_alloc_fmr(kibnal_data.kib_hca,
                                     &fmr_props,
@@ -1426,7 +1502,7 @@ kibnal_setup_tx_descs (void)
                         return -ENOMEM;
                 }
 
-                tx->tx_md.md_fmrcount = IBNAL_FMR_NMAPS;
+                tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps;
                 tx->tx_md.md_active   = 0;
 #endif
                 tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + 
@@ -1440,17 +1516,10 @@ kibnal_setup_tx_descs (void)
                                             &rkey);
                 LASSERT (vvrc == vv_return_ok);
 
-                tx->tx_isnblk = (i >= IBNAL_NTX);
-
                 CDEBUG(D_NET, "Tx[%d] %p->%p[%x]\n", i, tx, 
                        tx->tx_msg, tx->tx_lkey);
 
-                if (tx->tx_isnblk)
-                        list_add (&tx->tx_list, 
-                                  &kibnal_data.kib_idle_nblk_txs);
-                else
-                        list_add (&tx->tx_list, 
-                                  &kibnal_data.kib_idle_txs);
+                list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
 
                 page_offset += IBNAL_MSG_SIZE;
                 LASSERT (page_offset <= PAGE_SIZE);
@@ -1458,7 +1527,7 @@ kibnal_setup_tx_descs (void)
                 if (page_offset == PAGE_SIZE) {
                         page_offset = 0;
                         ipage++;
-                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
+                        LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
                 }
         }
         
@@ -1466,42 +1535,34 @@ kibnal_setup_tx_descs (void)
 }
 
 void
-kibnal_api_shutdown (nal_t *nal)
+kibnal_shutdown (lnet_ni_t *ni)
 {
-        int         i;
-        vv_return_t vvrc;
-
-        if (nal->nal_refct != 0) {
-                /* This module got the first ref */
-                PORTAL_MODULE_UNUSE;
-                return;
-        }
+        int           i;
+        vv_return_t   vvrc;
 
+        LASSERT (ni == kibnal_data.kib_ni);
+        LASSERT (ni->ni_data == &kibnal_data);
+        
         CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
-               atomic_read (&portal_kmemory));
-
-        LASSERT(nal == &kibnal_api);
+               atomic_read (&libcfs_kmemory));
 
         switch (kibnal_data.kib_init) {
 
         case IBNAL_INIT_ALL:
-                /* stop calls to nal_cmd */
-                libcfs_nal_cmd_unregister(VIBNAL);
-                /* No new peers */
+                /* stop accepting connections and prevent new peers */
+                kibnal_stop_listener(ni);
 
-                /* resetting my NID removes my listener and nukes all current
-                 * peers and their connections */
-                kibnal_set_mynid (PTL_NID_ANY);
+                /* nuke all existing peers */
+                kibnal_del_peer(LNET_NID_ANY);
 
                 /* Wait for all peer state to clean up */
                 i = 2;
-                while (atomic_read (&kibnal_data.kib_npeers) != 0) {
+                while (atomic_read(&kibnal_data.kib_npeers) != 0) {
                         i++;
-                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */
                                "waiting for %d peers to disconnect\n",
-                               atomic_read (&kibnal_data.kib_npeers));
-                        set_current_state (TASK_UNINTERRUPTIBLE);
-                        schedule_timeout (HZ);
+                               atomic_read(&kibnal_data.kib_npeers));
+                        cfs_pause(cfs_time_seconds(1));
                 }
                 /* fall through */
 
@@ -1514,7 +1575,7 @@ kibnal_api_shutdown (nal_t *nal)
         case IBNAL_INIT_TXD:
                 kibnal_free_pages (kibnal_data.kib_tx_pages);
 #if IBNAL_USE_FMR
-                kibnal_free_fmrs(IBNAL_TX_MSGS);
+                kibnal_free_fmrs(IBNAL_TX_MSGS());
 #endif
                 /* fall through */
 
@@ -1542,19 +1603,13 @@ kibnal_api_shutdown (nal_t *nal)
                         CERROR ("Close HCA  error: %d\n", vvrc);
                 /* fall through */
 
-        case IBNAL_INIT_LIB:
-                lib_fini(&kibnal_lib);
-                /* fall through */
-
         case IBNAL_INIT_DATA:
-                LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
+                LASSERT (atomic_read(&kibnal_data.kib_npeers) == 0);
                 LASSERT (kibnal_data.kib_peers != NULL);
                 for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
                         LASSERT (list_empty (&kibnal_data.kib_peers[i]));
                 }
                 LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
-                LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
-                LASSERT (list_empty (&kibnal_data.kib_sched_txq));
                 LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
                 LASSERT (list_empty (&kibnal_data.kib_connd_conns));
                 LASSERT (list_empty (&kibnal_data.kib_connd_pcreqs));
@@ -1571,8 +1626,7 @@ kibnal_api_shutdown (nal_t *nal)
                         CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
                                "Waiting for %d threads to terminate\n",
                                atomic_read (&kibnal_data.kib_nthreads));
-                        set_current_state (TASK_INTERRUPTIBLE);
-                        schedule_timeout (HZ);
+                        cfs_pause(cfs_time_seconds(1));
                 }
                 /* fall through */
                 
@@ -1583,54 +1637,119 @@ kibnal_api_shutdown (nal_t *nal)
         kibnal_free_tx_descs();
 
         if (kibnal_data.kib_peers != NULL)
-                PORTAL_FREE (kibnal_data.kib_peers,
+                LIBCFS_FREE (kibnal_data.kib_peers,
                              sizeof (struct list_head) * 
                              kibnal_data.kib_peer_hash_size);
 
         CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
-               atomic_read (&portal_kmemory));
-        printk(KERN_INFO "Lustre: Voltaire IB NAL unloaded (final mem %d)\n",
-               atomic_read(&portal_kmemory));
+               atomic_read (&libcfs_kmemory));
 
         kibnal_data.kib_init = IBNAL_INIT_NOTHING;
+        PORTAL_MODULE_UNUSE;
 }
 
 int
-kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
-                     ptl_ni_limits_t *requested_limits,
-                     ptl_ni_limits_t *actual_limits)
+kibnal_startup (lnet_ni_t *ni)
 {
+        char                      scratch[32];
+        char                      ipif_name[32];
+        char                     *hca_name;
+        __u32                     ip;
+        __u32                     netmask;
+        int                       up;
+        int                       nob;
+        int                       devno;
         struct timeval            tv;
-        ptl_process_id_t          process_id;
-        int                       pkmem = atomic_read(&portal_kmemory);
         int                       rc;
         int                       i;
         vv_request_event_record_t req_er;
         vv_return_t               vvrc;
 
-        LASSERT (nal == &kibnal_api);
+        LASSERT (ni->ni_lnd == &the_kiblnd);
+
+        /* Only 1 instance supported */
+        if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) {
+                CERROR ("Only 1 instance supported\n");
+                return -EPERM;
+        }
+
+        if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) {
+                CERROR ("Can't set credits(%d) > ntx(%d)\n",
+                        *kibnal_tunables.kib_credits,
+                        *kibnal_tunables.kib_ntx);
+                return -EINVAL;
+        }
+
+        ni->ni_maxtxcredits = *kibnal_tunables.kib_credits;
+        ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits;
+
+        CLASSERT (LNET_MAX_INTERFACES > 1);
+        
+        if (ni->ni_interfaces[0] != NULL) {
+                /* Use the HCA specified in 'networks=' */
+
+                if (ni->ni_interfaces[1] != NULL) {
+                        CERROR("Multiple interfaces not supported\n");
+                        return -EPERM;
+                }
+
+                /* Parse <hca base name><number> */
+                hca_name = ni->ni_interfaces[0];
+                nob = strlen(*kibnal_tunables.kib_hca_basename);
+                
+                if (strncmp(hca_name, *kibnal_tunables.kib_hca_basename, nob) ||
+                    sscanf(hca_name + nob, "%d%n", &devno, &nob) < 1) {
+                        CERROR("Unrecognised HCA %s\n", hca_name);
+                        return -EINVAL;
+                }
 
-        if (nal->nal_refct != 0) {
-                if (actual_limits != NULL)
-                        *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits;
-                /* This module got the first ref */
-                PORTAL_MODULE_USE;
-                return (PTL_OK);
+        } else {
+                /* Use <hca base name>0 */
+                devno = 0;
+
+                hca_name = scratch;
+                snprintf(hca_name, sizeof(scratch), "%s%d",
+                         *kibnal_tunables.kib_hca_basename, devno);
+                if (strlen(hca_name) == sizeof(scratch) - 1) {
+                        CERROR("HCA name %s truncated\n", hca_name);
+                        return -EINVAL;
+                }
         }
 
-        LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING);
+        /* Find IP address from <ipif base name><hca number> */
+        snprintf(ipif_name, sizeof(ipif_name), "%s%d",
+                 *kibnal_tunables.kib_ipif_basename, devno);
+        if (strlen(ipif_name) == sizeof(ipif_name) - 1) {
+                CERROR("IPoIB interface name %s truncated\n", ipif_name);
+                return -EINVAL;
+        }
+        
+        rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask);
+        if (rc != 0) {
+                CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc);
+                return -ENETDOWN;
+        }
+        
+        if (!up) {
+                CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name);
+                return -ENETDOWN;
+        }
+        
+        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
+        
+        PORTAL_MODULE_USE;
         memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */
+
+        kibnal_data.kib_ni = ni;
+        ni->ni_data = &kibnal_data;
         
         do_gettimeofday(&tv);
         kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
-        kibnal_data.kib_svc_id = IBNAL_SERVICE_NUMBER;
-
-        init_MUTEX (&kibnal_data.kib_nid_mutex);
 
         rwlock_init(&kibnal_data.kib_global_lock);
 
         kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
-        PORTAL_ALLOC (kibnal_data.kib_peers,
+        LIBCFS_ALLOC (kibnal_data.kib_peers,
                       sizeof (struct list_head) * kibnal_data.kib_peer_hash_size);
         if (kibnal_data.kib_peers == NULL) {
                 goto failed;
@@ -1646,14 +1765,10 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         init_waitqueue_head (&kibnal_data.kib_connd_waitq);
 
         spin_lock_init (&kibnal_data.kib_sched_lock);
-        INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
-        INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
         init_waitqueue_head (&kibnal_data.kib_sched_waitq);
 
         spin_lock_init (&kibnal_data.kib_tx_lock);
         INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
-        INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs);
-        init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq);
 
         rc = kibnal_alloc_tx_descs();
         if (rc != 0) {
@@ -1665,20 +1780,6 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         kibnal_data.kib_init = IBNAL_INIT_DATA;
         /*****************************************************/
 
-        process_id.pid = requested_pid;
-        process_id.nid = PTL_NID_ANY;
-        
-        rc = lib_init(&kibnal_lib, nal, process_id,
-                      requested_limits, actual_limits);
-        if (rc != PTL_OK) {
-                CERROR("lib_init failed: error %d\n", rc);
-                goto failed;
-        }
-
-        /* lib interface initialised */
-        kibnal_data.kib_init = IBNAL_INIT_LIB;
-        /*****************************************************/
-
         for (i = 0; i < IBNAL_N_SCHED; i++) {
                 rc = kibnal_thread_start (kibnal_scheduler, (void *)((long)i));
                 if (rc != 0) {
@@ -1694,10 +1795,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                 goto failed;
         }
 
-        /* TODO: apparently only one adapter is supported */
-        vvrc = vv_hca_open("InfiniHost0", NULL, &kibnal_data.kib_hca);
+        vvrc = vv_hca_open(hca_name, NULL, &kibnal_data.kib_hca);
         if (vvrc != vv_return_ok) {
-                CERROR ("Can't open CA: %d\n", vvrc);
+                CERROR ("Can't open HCA %s: %d\n", hca_name, vvrc);
                 goto failed;
         }
 
@@ -1709,7 +1809,7 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         vvrc = vv_set_async_event_cb (kibnal_data.kib_hca, req_er,
                                      kibnal_async_callback);
         if (vvrc != vv_return_ok) {
-                CERROR ("Can't open CA: %d\n", vvrc);
+                CERROR ("Can't set HCA %s callback: %d\n", hca_name, vvrc);
                 goto failed; 
         }
 
@@ -1719,7 +1819,7 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
 
         vvrc = vv_hca_query(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs);
         if (vvrc != vv_return_ok) {
-                CERROR ("Can't size port attrs: %d\n", vvrc);
+                CERROR ("Can't size port attrs for %s: %d\n", hca_name, vvrc);
                 goto failed;
         }
 
@@ -1733,8 +1833,8 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
 
                 vvrc = vv_port_query(kibnal_data.kib_hca, port_num, pattr);
                 if (vvrc != vv_return_ok) {
-                        CERROR("vv_port_query failed for port %d: %d\n",
-                               port_num, vvrc);
+                        CERROR("vv_port_query failed for %s port %d: %d\n",
+                               hca_name, port_num, vvrc);
                         continue;
                 }
 
@@ -1752,45 +1852,47 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                         CDEBUG(D_NET, "port[%d] Active\n", port_num);
 
                         /* Found a suitable port. Get its GUID and PKEY. */
-                        kibnal_data.kib_port = port_num;
-                        
                         tbl_count = 1;
                         vvrc = vv_get_port_gid_tbl(kibnal_data.kib_hca, 
                                                    port_num, &tbl_count,
                                                    &kibnal_data.kib_port_gid);
                         if (vvrc != vv_return_ok) {
                                 CERROR("vv_get_port_gid_tbl failed "
-                                       "for port %d: %d\n", port_num, vvrc);
+                                       "for %s port %d: %d\n", 
+                                       hca_name, port_num, vvrc);
                                 continue;
                         }
 
                         tbl_count = 1;
                         vvrc = vv_get_port_partition_tbl(kibnal_data.kib_hca, 
-                                                        port_num, &tbl_count,
-                                                        &kibnal_data.kib_port_pkey);
+                                                         port_num, &tbl_count,
+                                                         &kibnal_data.kib_port_pkey);
                         if (vvrc != vv_return_ok) {
                                 CERROR("vv_get_port_partition_tbl failed "
-                                       "for port %d: %d\n", port_num, vvrc);
+                                       "for %s port %d: %d\n",
+                                       hca_name, port_num, vvrc);
                                 continue;
                         }
 
+                        kibnal_data.kib_port = port_num;
+
                         break;
                 case vv_state_linkActDefer: /* TODO: correct? */
                 case vv_state_linkNoChange:
-                        CERROR("Unexpected port[%d] state %d\n",
-                               i, pattr->port_state);
+                        CERROR("Unexpected %s port[%d] state %d\n",
+                               hca_name, i, pattr->port_state);
                         continue;
                 }
                 break;
         }
 
         if (kibnal_data.kib_port == -1) {
-                CERROR ("Can't find an active port\n");
+                CERROR ("Can't find an active port on %s\n", hca_name);
                 goto failed;
         }
 
-        CDEBUG(D_NET, "Using port %d - GID="LPX64":"LPX64"\n",
-               kibnal_data.kib_port, 
+        CDEBUG(D_NET, "Using %s port %d - GID="LPX64":"LPX64"\n",
+               hca_name, kibnal_data.kib_port, 
                kibnal_data.kib_port_gid.scope.g.subnet, 
                kibnal_data.kib_port_gid.scope.g.eui64);
         
@@ -1820,10 +1922,11 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
         /* flag TX descs initialised */
         kibnal_data.kib_init = IBNAL_INIT_TXD;
         /*****************************************************/
+
         {
                 uint32_t nentries;
 
-                vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
+                vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(),
                                     kibnal_cq_callback, 
                                     NULL, /* context */
                                     &kibnal_data.kib_cq, &nentries);
@@ -1835,9 +1938,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                 /* flag CQ initialised */
                 kibnal_data.kib_init = IBNAL_INIT_CQ;
 
-                if (nentries < IBNAL_CQ_ENTRIES) {
+                if (nentries < IBNAL_CQ_ENTRIES()) {
                         CERROR ("CQ only has %d entries, need %d\n", 
-                                nentries, IBNAL_CQ_ENTRIES);
+                                nentries, IBNAL_CQ_ENTRIES());
                         goto failed;
                 }
 
@@ -1849,40 +1952,30 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid,
                         goto failed;
                 }
         }
-        
-        /*****************************************************/
 
-        rc = libcfs_nal_cmd_register(VIBNAL, &kibnal_cmd, NULL);
+        rc = kibnal_start_listener(ni);
         if (rc != 0) {
-                CERROR ("Can't initialise command interface (rc = %d)\n", rc);
+                CERROR("Can't start listener: %d\n", rc);
                 goto failed;
         }
-
+        
         /* flag everything initialised */
         kibnal_data.kib_init = IBNAL_INIT_ALL;
         /*****************************************************/
 
-        printk(KERN_INFO "Lustre: Voltaire IB NAL loaded "
-               "(initial mem %d)\n", pkmem);
-
-        return (PTL_OK);
+        return (0);
 
  failed:
-        CDEBUG(D_NET, "kibnal_api_startup failed\n");
-        kibnal_api_shutdown (&kibnal_api);    
-        return (PTL_FAIL);
+        CDEBUG(D_NET, "kibnal_startup failed\n");
+        kibnal_shutdown (ni);    
+        return (-ENETDOWN);
 }
 
 void __exit
 kibnal_module_fini (void)
 {
-#ifdef CONFIG_SYSCTL
-        if (kibnal_tunables.kib_sysctl != NULL)
-                unregister_sysctl_table (kibnal_tunables.kib_sysctl);
-#endif
-        PtlNIFini(kibnal_ni);
-
-        ptl_unregister_nal(VIBNAL);
+        lnet_unregister_lnd(&the_kiblnd);
+        kibnal_tunables_fini();
 }
 
 int __init
@@ -1903,38 +1996,17 @@ kibnal_module_init (void)
         CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS])
                   <= IBNAL_MSG_SIZE);
 #endif
-        /* the following must be sizeof(int) for proc_dointvec() */
-        CLASSERT (sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int));
-
-        kibnal_api.nal_ni_init = kibnal_api_startup;
-        kibnal_api.nal_ni_fini = kibnal_api_shutdown;
-
-        /* Initialise dynamic tunables to defaults once only */
-        kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
+        rc = kibnal_tunables_init();
+        if (rc != 0)
+                return rc;
 
-        rc = ptl_register_nal(VIBNAL, &kibnal_api);
-        if (rc != PTL_OK) {
-                CERROR("Can't register IBNAL: %d\n", rc);
-                return (-ENOMEM);               /* or something... */
-        }
+        lnet_register_lnd(&the_kiblnd);
 
-        /* Pure gateways want the NAL started up at module load time... */
-        rc = PtlNIInit(VIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni);
-        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
-                ptl_unregister_nal(VIBNAL);
-                return (-ENODEV);
-        }
-        
-#ifdef CONFIG_SYSCTL
-        /* Press on regardless even if registering sysctl doesn't work */
-        kibnal_tunables.kib_sysctl = 
-                register_sysctl_table (kibnal_top_ctl_table, 0);
-#endif
-        return (0);
+        return 0;
 }
 
 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel Voltaire IB NAL v0.01");
+MODULE_DESCRIPTION("Kernel Voltaire IB LND v1.00");
 MODULE_LICENSE("GPL");
 
 module_init(kibnal_module_init);
index 959c768..12c8df4 100644 (file)
 #include <net/sock.h>
 #include <linux/in.h>
 
-#define DEBUG_SUBSYSTEM S_NAL
+#define DEBUG_SUBSYSTEM S_LND
 
 #include <libcfs/kp30.h>
-#include <portals/p30.h>
-#include <portals/lib-p30.h>
-#include <portals/nal.h>
+#include <lnet/lnet.h>
+#include <lnet/lib-lnet.h>
 
 /* CPU_{L,B}E #defines needed by Voltaire headers */
 #include <asm/byteorder.h>
 # define IBNAL_N_SCHED      1                   /* # schedulers */
 #endif
 
-/* sdp-connection.c */
+#define IBNAL_USE_FMR  1
+
+/* tunables fixed at compile time */
+#define IBNAL_PEER_HASH_SIZE         101        /* # peer lists */
+#define IBNAL_RESCHED                100        /* # scheduler loops before reschedule */
+#define IBNAL_MSG_QUEUE_SIZE         8          /* # messages/RDMAs in-flight */
+#define IBNAL_CREDIT_HIGHWATER       7          /* when eagerly to return credits */
+#define IBNAL_MSG_SIZE              (4<<10)     /* max size of queued messages (inc hdr) */
+
+/* constants derived from sdp-connection.c */
 #define IBNAL_QKEY               0
 #define IBNAL_PKEY               0xffff
 #define IBNAL_PKEY_IDX           0
 #define IBNAL_SGID_IDX           0
 #define IBNAL_SERVICE_LEVEL      0
 #define IBNAL_STATIC_RATE        0
-#define IBNAL_RETRY_CNT          7
-#define IBNAL_RNR_CNT            6 
 #define IBNAL_EE_FLOW_CNT        1
 #define IBNAL_LOCAL_SUB          1
 #define IBNAL_TRAFFIC_CLASS      0
 #define IBNAL_OUS_DST_RD         1
 #define IBNAL_IB_MTU             vv_mtu_1024
 
-/* sdp-hca-params.h */
+/* constants derived from sdp-hca-params.h */
 #define PATH_RATE_2_5GB           2
 #define MLX_IPD_1x                1
 #define MLX_IPD_4x                0
 #define IBNAL_R_2_STATIC_RATE(r)  ((r) == PATH_RATE_2_5GB ? MLX_IPD_1x : MLX_IPD_4x)
 
 /* other low-level IB constants */
-#define IBNAL_LOCAL_ACK_TIMEOUT   0x12
-#define IBNAL_RNR_NAK_TIMER       0x10
 #define IBNAL_PKT_LIFETIME        5
 #define IBNAL_ARB_INITIATOR_DEPTH 0
 #define IBNAL_ARB_RESP_RES        0
 #define IBNAL_FAILOVER_ACCEPTED   0
-#define IBNAL_SERVICE_NUMBER      0x11b9a2      /* Fixed service number */
-
-#define IBNAL_MIN_RECONNECT_INTERVAL HZ         /* first failed connection retry... */
-#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ)    /* ...exponentially increasing to this */
-
-#define IBNAL_MSG_SIZE           (4<<10)        /* max size of queued messages (inc hdr) */
-
-#define IBNAL_MSG_QUEUE_SIZE      8             /* # messages/RDMAs in-flight */
-#define IBNAL_CREDIT_HIGHWATER    7             /* when to eagerly return credits */
-
-#define IBNAL_ARP_RETRIES         3             /* How many times to retry ARP */
-
-#define IBNAL_NTX                 32            /* # tx descs */
-#define IBNAL_NTX_NBLK            256           /* # reserved tx descs */
-
-#define IBNAL_PEER_HASH_SIZE      101           /* # peer lists */
-
-#define IBNAL_RESCHED             100           /* # scheduler loops before reschedule */
-
-#define IBNAL_CONCURRENT_PEERS    1000          /* # nodes all talking at once to me */
-
-#define IBNAL_CKSUM      0
-
-/* default vals for runtime tunables */
-#define IBNAL_IO_TIMEOUT          50            /* default comms timeout (seconds) */
 
 /************************/
 /* derived constants... */
 
 /* TX messages (shared by all connections) */
-#define IBNAL_TX_MSGS       (IBNAL_NTX + IBNAL_NTX_NBLK)
-#define IBNAL_TX_MSG_BYTES  (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
-#define IBNAL_TX_MSG_PAGES  ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
-
-#define IBNAL_USE_FMR   1
+#define IBNAL_TX_MSGS()       (*kibnal_tunables.kib_ntx)
+#define IBNAL_TX_MSG_BYTES()  (IBNAL_TX_MSGS() * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES()  ((IBNAL_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE)
 
 #if IBNAL_USE_FMR
 # define IBNAL_MAX_RDMA_FRAGS 1
-# define IBNAL_FMR_NMAPS      1000
+# define IBNAL_CONCURRENT_SENDS IBNAL_RX_MSGS
 #else
-# define IBNAL_MAX_RDMA_FRAGS PTL_MD_MAX_IOV
+# define IBNAL_MAX_RDMA_FRAGS LNET_MAX_IOV
+# define IBNAL_CONCURRENT_SENDS IBNAL_MSG_QUEUE_SIZE
 #endif
 
 /* RX messages (per connection) */
-#define IBNAL_RX_MSGS       IBNAL_MSG_QUEUE_SIZE
-#define IBNAL_RX_MSG_BYTES  (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
-#define IBNAL_RX_MSG_PAGES  ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+#define IBNAL_RX_MSGS         (IBNAL_MSG_QUEUE_SIZE*2)
+#define IBNAL_RX_MSG_BYTES    (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES    ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
 
-#define IBNAL_CQ_ENTRIES  (IBNAL_TX_MSGS * (1 + IBNAL_MAX_RDMA_FRAGS) + \
-                           IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS)
+#define IBNAL_CQ_ENTRIES()    (IBNAL_TX_MSGS() * (1 + IBNAL_MAX_RDMA_FRAGS) +           \
+                               IBNAL_RX_MSGS * *kibnal_tunables.kib_concurrent_peers)
 
 typedef struct
 {
-        int               kib_io_timeout;       /* comms timeout (seconds) */
+        unsigned int     *kib_service_number;   /* IB service number */
+        int              *kib_min_reconnect_interval; /* first failed connection retry... */
+        int              *kib_max_reconnect_interval; /* ...exponentially increasing to this */
+        int              *kib_concurrent_peers; /* max # nodes all talking to me */
+        int              *kib_cksum;            /* checksum kib_msg_t? */
+        int              *kib_timeout;          /* comms timeout (seconds) */
+        int              *kib_ntx;              /* # tx descs */
+        int              *kib_credits;          /* # concurrent sends */
+        int              *kib_peercredits;      /* # concurrent sends to 1 peer */
+        int              *kib_arp_retries;      /* # times to retry ARP */
+        char            **kib_hca_basename;     /* HCA base name */
+        char            **kib_ipif_basename;    /* IPoIB interface base name */
+        int              *kib_local_ack_timeout; /* IB RC QP ack timeout... */
+        int              *kib_retry_cnt;        /* ...and retry */
+        int              *kib_rnr_cnt;          /* RNR retries... */
+        int              *kib_rnr_nak_timer;    /* ...and interval */
+        int              *kib_keepalive;        /* keepalive interval */
+        int              *kib_concurrent_sends; /* send work queue sizing */
+#if IBNAL_USE_FMR
+        int              *kib_fmr_remaps;       /* # FMR maps before unmap required */
+#endif
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
         struct ctl_table_header *kib_sysctl;    /* sysctl interface */
+#endif
 } kib_tunables_t;
 
 typedef struct
@@ -198,16 +198,14 @@ typedef struct
         __u64             kib_incarnation;      /* which one am I */
         int               kib_shutdown;         /* shut down? */
         atomic_t          kib_nthreads;         /* # live threads */
+        lnet_ni_t        *kib_ni;               /* _the_ nal instance */
 
-        __u64             kib_svc_id;           /* service number I listen on */
         vv_gid_t          kib_port_gid;         /* device/port GID */
         vv_p_key_t        kib_port_pkey;        /* device/port pkey */
         
-        struct semaphore  kib_nid_mutex;        /* serialise NID ops */
         cm_cep_handle_t   kib_listen_handle;    /* IB listen handle */
 
         rwlock_t          kib_global_lock;      /* stabilize peer/conn ops */
-        spinlock_t        kib_vverbs_lock;      /* serialize vverbs calls */
         int               kib_ready;            /* CQ callback fired */
         int               kib_checking_cq;      /* a scheduler is checking the CQ */
         
@@ -225,16 +223,12 @@ typedef struct
         spinlock_t        kib_connd_lock;       /* serialise */
 
         wait_queue_head_t kib_sched_waitq;      /* schedulers sleep here */
-        struct list_head  kib_sched_txq;        /* tx requiring attention */
-        struct list_head  kib_sched_rxq;        /* rx requiring attention */
         spinlock_t        kib_sched_lock;       /* serialise */
 
         struct kib_tx    *kib_tx_descs;         /* all the tx descriptors */
         kib_pages_t      *kib_tx_pages;         /* premapped tx msg pages */
 
         struct list_head  kib_idle_txs;         /* idle tx descriptors */
-        struct list_head  kib_idle_nblk_txs;    /* idle reserved tx descriptors */
-        wait_queue_head_t kib_idle_tx_waitq;    /* block here for tx descriptor */
         __u64             kib_next_tx_cookie;   /* RDMA completion cookie */
         spinlock_t        kib_tx_lock;          /* serialise */
 
@@ -258,7 +252,7 @@ typedef struct
 #define IBNAL_INIT_CQ              7
 #define IBNAL_INIT_ALL             8
 
-#include "vibnal_wire.h"
+#include "viblnd_wire.h"
 
 /***********************************************************************/
 
@@ -266,8 +260,7 @@ typedef struct kib_rx                           /* receive message */
 {
         struct list_head          rx_list;      /* queue for attention */
         struct kib_conn          *rx_conn;      /* owning conn */
-        int                       rx_responded; /* responded to peer? */
-        int                       rx_posted;    /* posted? */
+        int                       rx_nob;       /* # bytes received (-1 while posted) */
         vv_l_key_t                rx_lkey;      /* local key */
         kib_msg_t                *rx_msg;       /* pre-mapped buffer (host vaddr) */
         vv_wr_t                   rx_wrq;       /* receive work item */
@@ -277,7 +270,6 @@ typedef struct kib_rx                           /* receive message */
 typedef struct kib_tx                           /* transmit message */
 {
         struct list_head          tx_list;      /* queue on idle_txs ibc_tx_queue etc. */
-        int                       tx_isnblk;    /* I'm reserved for non-blocking sends */
         struct kib_conn          *tx_conn;      /* owning conn */
         int                       tx_sending;   /* # tx callbacks outstanding */
         int                       tx_queued;    /* queued for sending */
@@ -285,7 +277,7 @@ typedef struct kib_tx                           /* transmit message */
         int                       tx_status;    /* completion status */
         unsigned long             tx_deadline;  /* completion deadline */
         __u64                     tx_cookie;    /* completion cookie */
-        lib_msg_t                *tx_libmsg[2]; /* lib msgs to finalize on completion */
+        lnet_msg_t               *tx_lntmsg[2]; /* lnet msgs to finalize on completion */
         vv_l_key_t                tx_lkey;      /* local key for message buffer */
         kib_msg_t                *tx_msg;       /* message buffer (host vaddr) */
         int                       tx_nwrq;      /* # send work items */
@@ -293,8 +285,8 @@ typedef struct kib_tx                           /* transmit message */
         vv_wr_t                   tx_wrq[2];    /* send work items... */
         vv_scatgat_t              tx_gl[2];     /* ...and their memory */
         kib_rdma_desc_t           tx_rd[1];     /* rdma descriptor */
-        kib_md_t                  tx_md;        /* FMA mapping descriptor */
-        __u64                    *tx_pages;     /* page array for mapping */
+        kib_md_t                  tx_md;        /* FMR mapping descriptor */
+        __u64                    *tx_pages;     /* page phys addrs */
 #else
         vv_wr_t                  *tx_wrq;       /* send work items... */
         vv_scatgat_t             *tx_gl;        /* ...and their memory */
@@ -302,9 +294,6 @@ typedef struct kib_tx                           /* transmit message */
 #endif
 } kib_tx_t;
 
-#define KIB_TX_UNMAPPED       0
-#define KIB_TX_MAPPED         1
-
 /* Passive connection request (listener callback) queued for handling by connd */
 typedef struct kib_pcreq
 {
@@ -337,15 +326,19 @@ typedef struct kib_conn
         __u64               ibc_incarnation;    /* which instance of the peer */
         __u64               ibc_txseq;          /* tx sequence number */
         __u64               ibc_rxseq;          /* rx sequence number */
+        __u32               ibc_version;        /* peer protocol version */
         atomic_t            ibc_refcount;       /* # users */
         int                 ibc_state;          /* what's happening */
-        atomic_t            ibc_nob;            /* # bytes buffered */
         int                 ibc_nsends_posted;  /* # uncompleted sends */
         int                 ibc_credits;        /* # credits I have */
         int                 ibc_outstanding_credits; /* # credits to return */
+        int                 ibc_reserved_credits; /* # credits for ACK/DONE msgs */
         int                 ibc_disconnect;     /* some disconnect callback fired */
         int                 ibc_comms_error;    /* set on comms error */
+        unsigned long       ibc_last_send;      /* time of last send */
         struct list_head    ibc_early_rxs;      /* rxs completed before ESTABLISHED */
+        struct list_head    ibc_tx_queue_nocred; /* sends that don't need a cred */
+        struct list_head    ibc_tx_queue_rsrvd; /* sends that need a reserved cred */
         struct list_head    ibc_tx_queue;       /* send queue */
         struct list_head    ibc_active_txs;     /* active tx awaiting completion */
         spinlock_t          ibc_lock;           /* serialise */
@@ -373,7 +366,7 @@ typedef struct kib_peer
 {
         struct list_head    ibp_list;           /* stash on global peer list */
         struct list_head    ibp_connd_list;     /* schedule on kib_connd_peers */
-        ptl_nid_t           ibp_nid;            /* who's on the other end(s) */
+        lnet_nid_t          ibp_nid;            /* who's on the other end(s) */
         __u32               ibp_ip;             /* IP to query for peer conn params */
         int                 ibp_port;           /* port to query for peer conn params */
         __u64               ibp_incarnation;    /* peer's incarnation */
@@ -381,32 +374,46 @@ typedef struct kib_peer
         int                 ibp_persistence;    /* "known" peer refs */
         struct list_head    ibp_conns;          /* all active connections */
         struct list_head    ibp_tx_queue;       /* msgs waiting for a conn */
-        int                 ibp_connecting;     /* connecting+accepting */
+        int                 ibp_connecting;     /* current active connection attempts */
+        int                 ibp_accepting;      /* current passive connection attempts */
         int                 ibp_arp_count;      /* # arp attempts */
         unsigned long       ibp_reconnect_time; /* when reconnect may be attempted */
         unsigned long       ibp_reconnect_interval; /* exponential backoff */
+        int                 ibp_error;          /* errno on closing this peer */
+        cfs_time_t          ibp_last_alive;     /* when (in jiffies) I was last alive */
 } kib_peer_t;
 
 
-extern lib_nal_t       kibnal_lib;
 extern kib_data_t      kibnal_data;
 extern kib_tunables_t  kibnal_tunables;
 
+int kibnal_startup (lnet_ni_t *ni);
+void kibnal_shutdown (lnet_ni_t *ni);
+int kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int kibnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+extern int kibnal_eager_recv (lnet_ni_t *ni, void *private,
+                              lnet_msg_t *lntmsg, void **new_private);
+int kibnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, 
+                int delayed, unsigned int niov, 
+                struct iovec *iov, lnet_kiov_t *kiov,
+                unsigned int offset, unsigned int mlen, unsigned int rlen);
 extern void kibnal_init_msg(kib_msg_t *msg, int type, int body_nob);
-extern void kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid,
-                            __u64 dststamp, __u64 seq);
-extern int kibnal_unpack_msg(kib_msg_t *msg, int nob);
-extern kib_peer_t *kibnal_create_peer(ptl_nid_t nid);
+extern void kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits,
+                            lnet_nid_t dstnid, __u64 dststamp, __u64 seq);
+extern int  kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob);
+extern int  kibnal_create_peer(kib_peer_t **peerp, lnet_nid_t nid);
 extern void kibnal_destroy_peer(kib_peer_t *peer);
-extern int kibnal_del_peer(ptl_nid_t nid, int single_share);
-extern kib_peer_t *kibnal_find_peer_locked(ptl_nid_t nid);
+extern int  kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip);
+extern int  kibnal_del_peer(lnet_nid_t nid);
+extern kib_peer_t *kibnal_find_peer_locked(lnet_nid_t nid);
 extern void kibnal_unlink_peer_locked(kib_peer_t *peer);
+extern void kibnal_peer_alive(kib_peer_t *peer);
 extern int  kibnal_close_stale_conns_locked(kib_peer_t *peer,
                                             __u64 incarnation);
 extern kib_conn_t *kibnal_create_conn(cm_cep_handle_t cep);
 extern void kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg);
 
-extern int kibnal_alloc_pages(kib_pages_t **pp, int npages, int access);
+extern int  kibnal_alloc_pages(kib_pages_t **pp, int npages, int access);
 extern void kibnal_free_pages(kib_pages_t *p);
 
 extern void kibnal_check_sends(kib_conn_t *conn);
@@ -421,16 +428,12 @@ extern int  kibnal_set_qp_state(kib_conn_t *conn, vv_qp_state_t new_state);
 extern void kibnal_async_callback(vv_event_record_t ev);
 extern void kibnal_cq_callback(unsigned long context);
 extern void kibnal_passive_connreq(kib_pcreq_t *pcr, int reject);
-extern void kibnal_pause(int ticks);
+extern void kibnal_txlist_done (struct list_head *txlist, int status);
 extern void kibnal_queue_tx(kib_tx_t *tx, kib_conn_t *conn);
 extern int  kibnal_init_rdma(kib_tx_t *tx, int type, int nob,
                              kib_rdma_desc_t *dstrd, __u64 dstcookie);
-
-static inline int
-wrq_signals_completion (vv_wr_t *wrq)
-{
-        return wrq->completion_notification != 0;
-}
+extern int  kibnal_tunables_init(void);
+extern void kibnal_tunables_fini(void);
 
 #define kibnal_conn_addref(conn)                                \
 do {                                                            \
@@ -458,8 +461,8 @@ do {                                                                          \
 
 #define kibnal_peer_addref(peer)                                \
 do {                                                            \
-        CDEBUG(D_NET, "peer[%p] -> "LPX64" (%d)++\n",           \
-               (peer), (peer)->ibp_nid,                         \
+        CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n",                \
+               (peer), libcfs_nid2str((peer)->ibp_nid),         \
                atomic_read (&(peer)->ibp_refcount));            \
         LASSERT(atomic_read(&(peer)->ibp_refcount) > 0);        \
         atomic_inc(&(peer)->ibp_refcount);                      \
@@ -467,8 +470,8 @@ do {                                                            \
 
 #define kibnal_peer_decref(peer)                                \
 do {                                                            \
-        CDEBUG(D_NET, "peer[%p] -> "LPX64" (%d)--\n",           \
-               (peer), (peer)->ibp_nid,                         \
+        CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n",                \
+               (peer), libcfs_nid2str((peer)->ibp_nid),         \
                atomic_read (&(peer)->ibp_refcount));            \
         LASSERT(atomic_read(&(peer)->ibp_refcount) > 0);        \
         if (atomic_dec_and_test(&(peer)->ibp_refcount))         \
@@ -476,7 +479,7 @@ do {                                                            \
 } while (0)
 
 static inline struct list_head *
-kibnal_nid2peerlist (ptl_nid_t nid)
+kibnal_nid2peerlist (lnet_nid_t nid)
 {
         unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size;
 
@@ -493,38 +496,67 @@ kibnal_peer_active (kib_peer_t *peer)
 static inline void
 kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn)
 {
-        /* CAVEAT EMPTOR: tx takes caller's ref on conn */
-
+        struct list_head  *q;
+        
         LASSERT (tx->tx_nwrq > 0);              /* work items set up */
         LASSERT (!tx->tx_queued);               /* not queued for sending already */
 
+        tx->tx_queued = 1;
+        tx->tx_deadline = jiffies + (*kibnal_tunables.kib_timeout * HZ);
+
         if (tx->tx_conn == NULL) {
                 kibnal_conn_addref(conn);
                 tx->tx_conn = conn;
+                LASSERT (tx->tx_msg->ibm_type != IBNAL_MSG_PUT_DONE);
         } else {
                 LASSERT (tx->tx_conn == conn);
                 LASSERT (tx->tx_msg->ibm_type == IBNAL_MSG_PUT_DONE);
         }
-        tx->tx_queued = 1;
-        tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ;
-        list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+
+        if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
+                /* All messages have simple credit control */
+                q = &conn->ibc_tx_queue;
+        } else {
+                LASSERT (conn->ibc_version == IBNAL_MSG_VERSION);
+                
+                switch (tx->tx_msg->ibm_type) {
+                case IBNAL_MSG_PUT_REQ:
+                case IBNAL_MSG_GET_REQ:
+                        /* RDMA request: reserve a buffer for the RDMA reply
+                         * before sending */
+                        q = &conn->ibc_tx_queue_rsrvd;
+                        break;
+
+                case IBNAL_MSG_PUT_NAK:
+                case IBNAL_MSG_PUT_ACK:
+                case IBNAL_MSG_PUT_DONE:
+                case IBNAL_MSG_GET_DONE:
+                        /* RDMA reply/completion: no credits; peer has reserved
+                         * a reply buffer */
+                        q = &conn->ibc_tx_queue_nocred;
+                        break;
+                
+                case IBNAL_MSG_NOOP:
+                case IBNAL_MSG_IMMEDIATE:
+                        /* Otherwise: consume a credit before sending */
+                        q = &conn->ibc_tx_queue;
+                        break;
+                
+                default:
+                        LBUG();
+                        q = NULL;
+                }
+        }
+        
+        list_add_tail(&tx->tx_list, q);
 }
 
-static inline __u64
-kibnal_page2phys (struct page *p)
+static inline int
+kibnal_send_keepalive(kib_conn_t *conn) 
 {
-#if IBNAL_32BIT_PAGE2PHYS
-        CLASSERT (sizeof(typeof(page_to_phys(p))) == 4);
-        CLASSERT (sizeof(unsigned long) == 4);
-        /* page_to_phys returns a 32 bit physical address.  This must be a 32
-         * bit machine with <= 4G memory and we must ensure we don't sign
-         * extend when converting to 64 bits. */
-        return (unsigned long)page_to_phys(p);
-#else
-        CLASSERT (sizeof(typeof(page_to_phys(p))) == 8);
-        /* page_to_phys returns a 64 bit physical address :) */
-        return page_to_phys(p);
-#endif
+        return (*kibnal_tunables.kib_keepalive > 0) &&
+                time_after(jiffies, conn->ibc_last_send +
+                           *kibnal_tunables.kib_keepalive*HZ);
 }
 
 #if IBNAL_VOIDSTAR_SGADDR
index 139c5ea..490a7e9 100644 (file)
  *
  */
 
-#include "vibnal.h"
+#include "viblnd.h"
 
 void
 kibnal_tx_done (kib_tx_t *tx)
 {
-        ptl_err_t        ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL;
-        int              i;
+        lnet_msg_t *lntmsg[2];
+        int         rc = tx->tx_status;
+        int         i;
 
         LASSERT (!in_interrupt());
         LASSERT (!tx->tx_queued);               /* mustn't be queued for sending */
@@ -37,7 +38,7 @@ kibnal_tx_done (kib_tx_t *tx)
 
 #if IBNAL_USE_FMR
         if (tx->tx_md.md_fmrcount == 0 ||
-            (ptlrc != PTL_OK && tx->tx_md.md_active)) {
+            (rc != 0 && tx->tx_md.md_active)) {
                 vv_return_t      vvrc;
 
                 /* mapping must be active (it dropped fmrcount to 0) */
@@ -47,18 +48,14 @@ kibnal_tx_done (kib_tx_t *tx)
                                     1, &tx->tx_md.md_fmrhandle);
                 LASSERT (vvrc == vv_return_ok);
 
-                tx->tx_md.md_fmrcount = IBNAL_FMR_NMAPS;
+                tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps;
         }
         tx->tx_md.md_active = 0;
 #endif
-        for (i = 0; i < 2; i++) {
-                /* tx may have up to 2 libmsgs to finalise */
-                if (tx->tx_libmsg[i] == NULL)
-                        continue;
 
-                lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc);
-                tx->tx_libmsg[i] = NULL;
-        }
+        /* tx may have up to 2 lnet msgs to finalise */
+        lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
+        lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;
         
         if (tx->tx_conn != NULL) {
                 kibnal_conn_decref(tx->tx_conn);
@@ -70,77 +67,71 @@ kibnal_tx_done (kib_tx_t *tx)
 
         spin_lock(&kibnal_data.kib_tx_lock);
 
-        if (tx->tx_isnblk) {
-                list_add (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs);
-        } else {
-                list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
-                wake_up (&kibnal_data.kib_idle_tx_waitq);
-        }
+        list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
 
         spin_unlock(&kibnal_data.kib_tx_lock);
+
+        /* delay finalize until my descs have been freed */
+        for (i = 0; i < 2; i++) {
+                if (lntmsg[i] == NULL)
+                        continue;
+
+                lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc);
+        }
 }
 
-kib_tx_t *
-kibnal_get_idle_tx (int may_block) 
+void
+kibnal_txlist_done (struct list_head *txlist, int status)
 {
-        kib_tx_t      *tx = NULL;
-        ENTRY;
-        
-        for (;;) {
-                spin_lock(&kibnal_data.kib_tx_lock);
+        kib_tx_t *tx;
 
-                /* "normal" descriptor is free */
-                if (!list_empty (&kibnal_data.kib_idle_txs)) {
-                        tx = list_entry (kibnal_data.kib_idle_txs.next,
-                                         kib_tx_t, tx_list);
-                        break;
-                }
+        while (!list_empty (txlist)) {
+                tx = list_entry (txlist->next, kib_tx_t, tx_list);
 
-                if (!may_block) {
-                        /* may dip into reserve pool */
-                        if (list_empty (&kibnal_data.kib_idle_nblk_txs)) {
-                                CERROR ("reserved tx desc pool exhausted\n");
-                                break;
-                        }
+                list_del (&tx->tx_list);
+                /* complete now */
+                tx->tx_waiting = 0;
+                tx->tx_status = status;
+                kibnal_tx_done (tx);
+        }
+}
 
-                        tx = list_entry (kibnal_data.kib_idle_nblk_txs.next,
-                                         kib_tx_t, tx_list);
-                        break;
-                }
+kib_tx_t *
+kibnal_get_idle_tx (void) 
+{
+        kib_tx_t      *tx;
+        
+        spin_lock(&kibnal_data.kib_tx_lock);
 
-                /* block for idle tx */
+        if (list_empty (&kibnal_data.kib_idle_txs)) {
                 spin_unlock(&kibnal_data.kib_tx_lock);
-
-                wait_event (kibnal_data.kib_idle_tx_waitq,
-                            !list_empty (&kibnal_data.kib_idle_txs) ||
-                            kibnal_data.kib_shutdown);
+                return NULL;
         }
 
-        if (tx != NULL) {
-                list_del (&tx->tx_list);
-
-                /* Allocate a new completion cookie.  It might not be needed,
-                 * but we've got a lock right now and we're unlikely to
-                 * wrap... */
-                tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;
+        tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list);
+        list_del (&tx->tx_list);
 
-                LASSERT (tx->tx_nwrq == 0);
-                LASSERT (!tx->tx_queued);
-                LASSERT (tx->tx_sending == 0);
-                LASSERT (!tx->tx_waiting);
-                LASSERT (tx->tx_status == 0);
-                LASSERT (tx->tx_conn == NULL);
-                LASSERT (tx->tx_libmsg[0] == NULL);
-                LASSERT (tx->tx_libmsg[1] == NULL);
-        }
+        /* Allocate a new completion cookie.  It might not be needed,
+         * but we've got a lock right now and we're unlikely to
+         * wrap... */
+        tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;
 
         spin_unlock(&kibnal_data.kib_tx_lock);
-        
-        RETURN(tx);
+
+        LASSERT (tx->tx_nwrq == 0);
+        LASSERT (!tx->tx_queued);
+        LASSERT (tx->tx_sending == 0);
+        LASSERT (!tx->tx_waiting);
+        LASSERT (tx->tx_status == 0);
+        LASSERT (tx->tx_conn == NULL);
+        LASSERT (tx->tx_lntmsg[0] == NULL);
+        LASSERT (tx->tx_lntmsg[1] == NULL);
+        
+        return tx;
 }
 
 int
-kibnal_post_rx (kib_rx_t *rx, int credit)
+kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit)
 {
         kib_conn_t   *conn = rx->rx_conn;
         int           rc = 0;
@@ -148,6 +139,9 @@ kibnal_post_rx (kib_rx_t *rx, int credit)
         vv_return_t   vvrc;
 
         LASSERT (!in_interrupt());
+        /* old peers don't reserve rxs for RDMA replies */
+        LASSERT (!rsrvd_credit ||
+                 conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
         
         rx->rx_gl = (vv_scatgat_t) {
                 .v_address = KIBNAL_ADDR2SG(addr),
@@ -164,7 +158,7 @@ kibnal_post_rx (kib_rx_t *rx, int credit)
         };
 
         LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
-        LASSERT (!rx->rx_posted);
+        LASSERT (rx->rx_nob >= 0);              /* not posted */
 
         CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n", 
                rx->rx_wrq.scatgat_list->length,
@@ -177,27 +171,31 @@ kibnal_post_rx (kib_rx_t *rx, int credit)
                 return 0;
         }
         
-        rx->rx_posted = 1;
-
+        rx->rx_nob = -1;                        /* flag posted */
+        
         spin_lock(&conn->ibc_lock);
         /* Serialise vv_post_receive; it's not re-entrant on the same QP */
         vvrc = vv_post_receive(kibnal_data.kib_hca,
                                conn->ibc_qp, &rx->rx_wrq);
-        spin_unlock(&conn->ibc_lock);
 
         if (vvrc == vv_return_ok) {
-                if (credit) {
-                        spin_lock(&conn->ibc_lock);
+                if (credit)
                         conn->ibc_outstanding_credits++;
-                        spin_unlock(&conn->ibc_lock);
+                if (rsrvd_credit)
+                        conn->ibc_reserved_credits++;
+
+                spin_unlock(&conn->ibc_lock);
 
+                if (credit || rsrvd_credit)
                         kibnal_check_sends(conn);
-                }
+
                 return 0;
         }
         
-        CERROR ("post rx -> "LPX64" failed %d\n", 
-                conn->ibc_peer->ibp_nid, vvrc);
+        spin_unlock(&conn->ibc_lock);
+
+        CERROR ("post rx -> %s failed %d\n", 
+                libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc);
         rc = -EIO;
         kibnal_close_conn(rx->rx_conn, rc);
         /* No more posts for this rx; so lose its ref */
@@ -218,7 +216,7 @@ kibnal_post_receives (kib_conn_t *conn)
                 /* +1 ref for rx desc.  This ref remains until kibnal_post_rx
                  * fails (i.e. actual failure or we're disconnecting) */
                 kibnal_conn_addref(conn);
-                rc = kibnal_post_rx (&conn->ibc_rxs[i], 0);
+                rc = kibnal_post_rx (&conn->ibc_rxs[i], 0, 0);
                 if (rc != 0)
                         return rc;
         }
@@ -263,9 +261,8 @@ kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
         if (tx == NULL) {
                 spin_unlock(&conn->ibc_lock);
 
-                CWARN("Unmatched completion type %x cookie "LPX64
-                      " from "LPX64"\n",
-                      txtype, cookie, conn->ibc_peer->ibp_nid);
+                CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
+                      txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                 kibnal_close_conn (conn, -EPROTO);
                 return;
         }
@@ -274,12 +271,8 @@ kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
                 if (status < 0) {               /* failed? */
                         tx->tx_status = status;
                 } else if (txtype == IBNAL_MSG_GET_REQ) { 
-                        /* XXX layering violation: set REPLY data length */
-                        LASSERT (tx->tx_libmsg[1] != NULL);
-                        LASSERT (tx->tx_libmsg[1]->ev.type == 
-                                 PTL_EVENT_REPLY_END);
-
-                        tx->tx_libmsg[1]->ev.mlength = status;
+                        lnet_set_reply_msg_len(kibnal_data.kib_ni,
+                                               tx->tx_lntmsg[1], status);
                 }
         }
         
@@ -298,11 +291,11 @@ kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
 void
 kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie) 
 {
-        kib_tx_t    *tx = kibnal_get_idle_tx(0);
+        kib_tx_t    *tx = kibnal_get_idle_tx();
         
         if (tx == NULL) {
-                CERROR("Can't get tx for completion %x for "LPX64"\n",
-                       type, conn->ibc_peer->ibp_nid);
+                CERROR("Can't get tx for completion %x for %s\n",
+                       type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                 return;
         }
         
@@ -320,12 +313,15 @@ kibnal_handle_rx (kib_rx_t *rx)
         kib_conn_t   *conn = rx->rx_conn;
         int           credits = msg->ibm_credits;
         kib_tx_t     *tx;
-        int           rc;
+        int           rc = 0;
+        int           repost = 1;
+        int           rsrvd_credit = 0;
+        int           rc2;
 
         LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
 
-        CDEBUG (D_NET, "Received %x[%d] from "LPX64"\n",
-                msg->ibm_type, credits, conn->ibc_peer->ibp_nid);
+        CDEBUG (D_NET, "Received %x[%d] from %s\n",
+                msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid));
         
         if (credits != 0) {
                 /* Have I received credits that will let me send? */
@@ -338,37 +334,38 @@ kibnal_handle_rx (kib_rx_t *rx)
 
         switch (msg->ibm_type) {
         default:
-                CERROR("Bad IBNAL message type %x from "LPX64"\n",
-                       msg->ibm_type, conn->ibc_peer->ibp_nid);
+                CERROR("Bad IBNAL message type %x from %s\n",
+                       msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                rc = -EPROTO;
                 break;
 
         case IBNAL_MSG_NOOP:
                 break;
 
         case IBNAL_MSG_IMMEDIATE:
-                lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx);
+                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr,
+                                msg->ibm_srcnid, rx, 0);
+                repost = rc < 0;                /* repost on error */
                 break;
                 
         case IBNAL_MSG_PUT_REQ:
-                rx->rx_responded = 0;
-                lib_parse(&kibnal_lib, &msg->ibm_u.putreq.ibprm_hdr, rx);
-                if (rx->rx_responded)
-                        break;
-
-                /* I wasn't asked to transfer any payload data.  This happens
-                 * if the PUT didn't match, or got truncated. */
-                kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0,
-                                       msg->ibm_u.putreq.ibprm_cookie);
+                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr,
+                                msg->ibm_srcnid, rx, 1);
+                repost = rc < 0;                /* repost on error */
                 break;
 
         case IBNAL_MSG_PUT_NAK:
-                CWARN ("PUT_NACK from "LPX64"\n", conn->ibc_peer->ibp_nid);
+                rsrvd_credit = 1;               /* rdma reply (was pre-reserved) */
+                
+                CWARN ("PUT_NACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid));
                 kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ, 
                                          msg->ibm_u.completion.ibcm_status,
                                          msg->ibm_u.completion.ibcm_cookie);
                 break;
 
         case IBNAL_MSG_PUT_ACK:
+                rsrvd_credit = 1;               /* rdma reply (was pre-reserved) */
+
                 spin_lock(&conn->ibc_lock);
                 tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ,
                                                    msg->ibm_u.putack.ibpam_src_cookie);
@@ -377,9 +374,9 @@ kibnal_handle_rx (kib_rx_t *rx)
                 spin_unlock(&conn->ibc_lock);
 
                 if (tx == NULL) {
-                        CERROR("Unmatched PUT_ACK from "LPX64"\n",
-                               conn->ibc_peer->ibp_nid);
-                        kibnal_close_conn(conn, -EPROTO);
+                        CERROR("Unmatched PUT_ACK from %s\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        rc = -EPROTO;
                         break;
                 }
 
@@ -390,47 +387,55 @@ kibnal_handle_rx (kib_rx_t *rx)
 
                 tx->tx_nwrq = 0;                /* overwrite PUT_REQ */
 
-                rc = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE, 
-                                      kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
-                                      &msg->ibm_u.putack.ibpam_rd,
-                                      msg->ibm_u.putack.ibpam_dst_cookie);
-                if (rc < 0)
-                        CERROR("Can't setup rdma for PUT to "LPX64": %d\n",
-                               conn->ibc_peer->ibp_nid, rc);
+                rc2 = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE, 
+                                       kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
+                                       &msg->ibm_u.putack.ibpam_rd,
+                                       msg->ibm_u.putack.ibpam_dst_cookie);
+                if (rc2 < 0)
+                        CERROR("Can't setup rdma for PUT to %s: %d\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2);
 
                 spin_lock(&conn->ibc_lock);
-                if (tx->tx_status == 0 && rc < 0)
-                        tx->tx_status = rc;
+                if (tx->tx_status == 0 && rc2 < 0)
+                        tx->tx_status = rc2;
                 tx->tx_waiting = 0;             /* clear waiting and queue atomically */
                 kibnal_queue_tx_locked(tx, conn);
                 spin_unlock(&conn->ibc_lock);
                 break;
                 
         case IBNAL_MSG_PUT_DONE:
+                /* This buffer was pre-reserved by not returning the credit
+                 * when the PUT_REQ's buffer was reposted, so I just return it
+                 * now */
                 kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK,
                                          msg->ibm_u.completion.ibcm_status,
                                          msg->ibm_u.completion.ibcm_cookie);
                 break;
 
         case IBNAL_MSG_GET_REQ:
-                rx->rx_responded = 0;
-                lib_parse(&kibnal_lib, &msg->ibm_u.get.ibgm_hdr, rx);
-                if (rx->rx_responded)           /* I responded to the GET_REQ */
-                        break;
-                /* NB GET didn't match (I'd have responded even with no payload
-                 * data) */
-                kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, -ENODATA,
-                                       msg->ibm_u.get.ibgm_cookie);
+                rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr,
+                                msg->ibm_srcnid, rx, 1);
+                repost = rc < 0;                /* repost on error */
                 break;
 
         case IBNAL_MSG_GET_DONE:
+                rsrvd_credit = 1;               /* rdma reply (was pre-reserved) */
+                
                 kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ,
                                          msg->ibm_u.completion.ibcm_status,
                                          msg->ibm_u.completion.ibcm_cookie);
                 break;
         }
 
-        kibnal_post_rx(rx, 1);
+        if (rc < 0)                             /* protocol error */
+                kibnal_close_conn(conn, rc);
+
+        if (repost) {
+                if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD)
+                        rsrvd_credit = 0;       /* peer isn't pre-reserving */
+
+                kibnal_post_rx(rx, !rsrvd_credit, rsrvd_credit);
+        }
 }
 
 void
@@ -441,42 +446,50 @@ kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq)
         unsigned long flags;
         int           rc;
 
-        CDEBUG (D_NET, "rx %p conn %p\n", rx, conn);
-        LASSERT (rx->rx_posted);
-        rx->rx_posted = 0;
+        CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
+        LASSERT (rx->rx_nob < 0);               /* was posted */
+        rx->rx_nob = 0;                         /* isn't now */
 
         if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
                 goto ignore;
 
         if (vvrc != vv_comp_status_success) {
-                CERROR("Rx from "LPX64" failed: %d\n", 
-                       conn->ibc_peer->ibp_nid, vvrc);
+                CERROR("Rx from %s failed: %d\n", 
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc);
                 goto failed;
         }
 
-        rc = kibnal_unpack_msg(msg, nob);
+        rc = kibnal_unpack_msg(msg, conn->ibc_version, nob);
         if (rc != 0) {
-                CERROR ("Error %d unpacking rx from "LPX64"\n",
-                        rc, conn->ibc_peer->ibp_nid);
+                CERROR ("Error %d unpacking rx from %s\n",
+                        rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                 goto failed;
         }
 
-        if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+        rx->rx_nob = nob;                       /* Can trust 'nob' now */
+
+        if (!lnet_ptlcompat_matchnid(conn->ibc_peer->ibp_nid,
+                                     msg->ibm_srcnid) ||
+            !lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid, 
+                                     msg->ibm_dstnid) ||
             msg->ibm_srcstamp != conn->ibc_incarnation ||
-            msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
             msg->ibm_dststamp != kibnal_data.kib_incarnation) {
-                CERROR ("Stale rx from "LPX64"\n",
-                        conn->ibc_peer->ibp_nid);
+                CERROR ("Stale rx from %s\n",
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid));
                 goto failed;
         }
 
         if (msg->ibm_seq != rxseq) {
-                CERROR ("Out-of-sequence rx from "LPX64
+                CERROR ("Out-of-sequence rx from %s"
                         ": got "LPD64" but expected "LPD64"\n",
-                        conn->ibc_peer->ibp_nid, msg->ibm_seq, rxseq);
+                        libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                        msg->ibm_seq, rxseq);
                 goto failed;
         }
 
+        /* set time last known alive */
+        kibnal_peer_alive(conn->ibc_peer);
+
         /* racing with connection establishment/teardown! */
 
         if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
@@ -546,8 +559,10 @@ kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page,
 
         /* Try to create an address that adaptor-tavor will munge into a valid
          * network address, given how it maps all phys mem into 1 region */
-        addr = kibnal_page2phys(page) + page_offset + PAGE_OFFSET;
+        addr = lnet_page2phys(page) + page_offset + PAGE_OFFSET;
 
+        /* NB this relies entirely on there being a single region for the whole
+         * of memory, since "high" memory will wrap in the (void *) cast! */
         vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, 
                                     (void *)((unsigned long)addr),
                                     len, &mem_h, &l_key, &r_key);
@@ -585,7 +600,7 @@ kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page,
 int
 kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, 
                     vv_access_con_bit_mask_t access,
-                    int niov, struct iovec *iov, int offset, int nob)
+                    unsigned int niov, struct iovec *iov, int offset, int nob)
                  
 {
         /* active if I'm sending */
@@ -643,7 +658,7 @@ kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd,
 int
 kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, 
                       vv_access_con_bit_mask_t access,
-                      int nkiov, ptl_kiov_t *kiov, int offset, int nob)
+                      int nkiov, lnet_kiov_t *kiov, int offset, int nob)
 {
         /* active if I'm sending */
         int            active = ((access & vv_acc_r_mem_write) == 0);
@@ -695,7 +710,7 @@ kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
         LASSERT (tx->tx_md.md_fmrcount > 0);
         LASSERT (page_offset < PAGE_SIZE);
         LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT)));
-        LASSERT (npages <= PTL_MD_MAX_IOV);
+        LASSERT (npages <= LNET_MAX_IOV);
 
         memset(&map_props, 0, sizeof(map_props));
 
@@ -730,7 +745,7 @@ kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
 int
 kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                      vv_access_con_bit_mask_t access,
-                     int niov, struct iovec *iov, int offset, int nob)
+                     unsigned int niov, struct iovec *iov, int offset, int nob)
                  
 {
         /* active if I'm sending */
@@ -741,7 +756,7 @@ kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
         int           npages;
         unsigned long page_offset;
         unsigned long vaddr;
-        
+
         LASSERT (nob > 0);
         LASSERT (niov > 0);
 
@@ -764,7 +779,7 @@ kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
         npages = 0;
 
         do {
-                LASSERT (npages < PTL_MD_MAX_IOV);
+                LASSERT (npages < LNET_MAX_IOV);
 
                 page = kibnal_kvaddr_to_page(vaddr);
                 if (page == NULL) {
@@ -772,7 +787,7 @@ kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                         return -EFAULT;
                 }
 
-                tx->tx_pages[npages++] = kibnal_page2phys(page);
+                tx->tx_pages[npages++] = lnet_page2phys(page);
 
                 fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1));
                 vaddr += fragnob;
@@ -786,7 +801,7 @@ kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd,
 int
 kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                       vv_access_con_bit_mask_t access,
-                      int nkiov, ptl_kiov_t *kiov, int offset, int nob)
+                      int nkiov, lnet_kiov_t *kiov, int offset, int nob)
 {
         /* active if I'm sending */
         int            active = ((access & vv_acc_r_mem_write) == 0);
@@ -798,7 +813,7 @@ kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
 
         LASSERT (nob > 0);
         LASSERT (nkiov > 0);
-        LASSERT (nkiov <= PTL_MD_MAX_IOV);
+        LASSERT (nkiov <= LNET_MAX_IOV);
         LASSERT (!tx->tx_md.md_active);
         LASSERT ((rd != tx->tx_rd) == !active);
 
@@ -815,7 +830,7 @@ kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
         npages = 0;
 
         do {
-                LASSERT (npages < PTL_MD_MAX_IOV);
+                LASSERT (npages < LNET_MAX_IOV);
                 LASSERT (nkiov > 0);
 
                 if ((npages > 0 && kiov->kiov_offset != 0) ||
@@ -829,7 +844,7 @@ kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd,
                         return -EINVAL;
                 }
 
-                tx->tx_pages[npages++] = kibnal_page2phys(kiov->kiov_page);
+                tx->tx_pages[npages++] = lnet_page2phys(kiov->kiov_page);
                 resid -= kiov->kiov_len;
                 kiov++;
                 nkiov--;
@@ -856,25 +871,42 @@ void
 kibnal_check_sends (kib_conn_t *conn)
 {
         kib_tx_t       *tx;
-        vv_return_t     vvrc;                        
+        vv_return_t     vvrc;
         int             rc;
+        int             consume_cred;
         int             done;
 
         /* Don't send anything until after the connection is established */
         if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
-                CDEBUG(D_NET, LPX64"too soon\n", conn->ibc_peer->ibp_nid);
+                CDEBUG(D_NET, "%s too soon\n",
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
                 return;
         }
         
         spin_lock(&conn->ibc_lock);
 
-        LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
-
+        LASSERT (conn->ibc_nsends_posted <=
+                 *kibnal_tunables.kib_concurrent_sends);
+        LASSERT (conn->ibc_reserved_credits >= 0);
+        
+        while (conn->ibc_reserved_credits > 0 &&
+               !list_empty(&conn->ibc_tx_queue_rsrvd)) {
+                LASSERT (conn->ibc_version != 
+                         IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
+                tx = list_entry(conn->ibc_tx_queue_rsrvd.next,
+                                kib_tx_t, tx_list);
+                list_del(&tx->tx_list);
+                list_add_tail(&tx->tx_list, &conn->ibc_tx_queue);
+                conn->ibc_reserved_credits--;
+        }
+        
         if (list_empty(&conn->ibc_tx_queue) &&
-            conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
+            list_empty(&conn->ibc_tx_queue_nocred) &&
+            (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER ||
+             kibnal_send_keepalive(conn))) {
                 spin_unlock(&conn->ibc_lock);
                 
-                tx = kibnal_get_idle_tx(0);     /* don't block */
+                tx = kibnal_get_idle_tx();
                 if (tx != NULL)
                         kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
 
@@ -884,9 +916,22 @@ kibnal_check_sends (kib_conn_t *conn)
                         kibnal_queue_tx_locked(tx, conn);
         }
 
-        while (!list_empty (&conn->ibc_tx_queue)) {
-                tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
-
+        for (;;) {
+                if (!list_empty(&conn->ibc_tx_queue_nocred)) {
+                        LASSERT (conn->ibc_version != 
+                                 IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD);
+                        tx = list_entry (conn->ibc_tx_queue_nocred.next, 
+                                         kib_tx_t, tx_list);
+                        consume_cred = 0;
+                } else if (!list_empty (&conn->ibc_tx_queue)) {
+                        tx = list_entry (conn->ibc_tx_queue.next, 
+                                         kib_tx_t, tx_list);
+                        consume_cred = 1;
+                } else {
+                        /* nothing waiting */
+                        break;
+                }
+                
                 LASSERT (tx->tx_queued);
                 /* We rely on this for QP sizing */
                 LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS);
@@ -896,23 +941,27 @@ kibnal_check_sends (kib_conn_t *conn)
                 LASSERT (conn->ibc_credits >= 0);
                 LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
 
-                if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) {
-                        CDEBUG(D_NET, LPX64": posted enough\n",
-                               conn->ibc_peer->ibp_nid);
-                        break;
-                }
-                
-                if (conn->ibc_credits == 0) {   /* no credits */
-                        CDEBUG(D_NET, LPX64": no credits\n",
-                               conn->ibc_peer->ibp_nid);
+                if (conn->ibc_nsends_posted ==
+                    *kibnal_tunables.kib_concurrent_sends) {
+                        /* We've got some tx completions outstanding... */
+                        CDEBUG(D_NET, "%s: posted enough\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
                         break;
                 }
                 
-                if (conn->ibc_credits == 1 &&   /* last credit reserved for */
-                    conn->ibc_outstanding_credits == 0) { /* giving back credits */
-                        CDEBUG(D_NET, LPX64": not using last credit\n",
-                               conn->ibc_peer->ibp_nid);
-                        break;
+                if (consume_cred) {
+                        if (conn->ibc_credits == 0) {   /* no credits */
+                                CDEBUG(D_NET, "%s: no credits\n",
+                                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                                break;
+                        }
+                        
+                        if (conn->ibc_credits == 1 &&   /* last credit reserved for */
+                            conn->ibc_outstanding_credits == 0) { /* giving back credits */
+                                CDEBUG(D_NET, "%s: not using last credit\n",
+                                       libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                                break;
+                        }
                 }
                 
                 list_del (&tx->tx_list);
@@ -922,24 +971,28 @@ kibnal_check_sends (kib_conn_t *conn)
 
                 if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
                     (!list_empty(&conn->ibc_tx_queue) ||
-                     conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
+                     !list_empty(&conn->ibc_tx_queue_nocred) ||
+                     (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER &&
+                      !kibnal_send_keepalive(conn)))) {
                         /* redundant NOOP */
                         spin_unlock(&conn->ibc_lock);
                         kibnal_tx_done(tx);
                         spin_lock(&conn->ibc_lock);
-                        CDEBUG(D_NET, LPX64": redundant noop\n",
-                               conn->ibc_peer->ibp_nid);
+                        CDEBUG(D_NET, "%s: redundant noop\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
                         continue;
                 }
 
-                kibnal_pack_msg(tx->tx_msg, conn->ibc_outstanding_credits,
+                kibnal_pack_msg(tx->tx_msg, conn->ibc_version,
+                                conn->ibc_outstanding_credits,
                                 conn->ibc_peer->ibp_nid, conn->ibc_incarnation,
                                 conn->ibc_txseq);
 
                 conn->ibc_txseq++;
                 conn->ibc_outstanding_credits = 0;
                 conn->ibc_nsends_posted++;
-                conn->ibc_credits--;
+                if (consume_cred)
+                        conn->ibc_credits--;
 
                 /* CAVEAT EMPTOR!  This tx could be the PUT_DONE of an RDMA
                  * PUT.  If so, it was first queued here as a PUT_REQ, sent and
@@ -958,14 +1011,14 @@ kibnal_check_sends (kib_conn_t *conn)
                 LASSERT (tx->tx_nwrq > 0);
 #if 0
                 if (tx->tx_wrq[0].wr_type == vv_wr_rdma_write) 
-                        CDEBUG(D_WARNING, "WORK[0]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
+                        CDEBUG(D_NET, "WORK[0]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
                                tx->tx_wrq[0].scatgat_list->v_address,
                                tx->tx_wrq[0].scatgat_list->length,
                                tx->tx_wrq[0].scatgat_list->l_key,
                                tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_addr,
                                tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_r_key);
                 else
-                        CDEBUG(D_WARNING, "WORK[0]: %s gl %p for %d k %x\n",
+                        CDEBUG(D_NET, "WORK[0]: %s gl %p for %d k %x\n",
                                tx->tx_wrq[0].wr_type == vv_wr_send ? "SEND" : "????",
                                tx->tx_wrq[0].scatgat_list->v_address,
                                tx->tx_wrq[0].scatgat_list->length,
@@ -973,14 +1026,14 @@ kibnal_check_sends (kib_conn_t *conn)
 
                 if (tx->tx_nwrq > 1) {
                         if (tx->tx_wrq[1].wr_type == vv_wr_rdma_write) 
-                                CDEBUG(D_WARNING, "WORK[1]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
+                                CDEBUG(D_NET, "WORK[1]: RDMA gl %p for %d k %x -> "LPX64" k %x\n",
                                        tx->tx_wrq[1].scatgat_list->v_address,
                                        tx->tx_wrq[1].scatgat_list->length,
                                        tx->tx_wrq[1].scatgat_list->l_key,
                                        tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_addr,
                                        tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_r_key);
                         else
-                                CDEBUG(D_WARNING, "WORK[1]: %s gl %p for %d k %x\n",
+                                CDEBUG(D_NET, "WORK[1]: %s gl %p for %d k %x\n",
                                        tx->tx_wrq[1].wr_type == vv_wr_send ? "SEND" : "????",
                                        tx->tx_wrq[1].scatgat_list->v_address,
                                        tx->tx_wrq[1].scatgat_list->length,
@@ -999,11 +1052,14 @@ kibnal_check_sends (kib_conn_t *conn)
                         rc = (vvrc == vv_return_ok) ? 0 : -EIO;
                 }
 
+                conn->ibc_last_send = jiffies;
+
                 if (rc != 0) {
                         /* NB credits are transferred in the actual
                          * message, which can only be the last work item */
                         conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits;
-                        conn->ibc_credits++;
+                        if (consume_cred)
+                                conn->ibc_credits++;
                         conn->ibc_nsends_posted--;
 
                         tx->tx_status = rc;
@@ -1017,11 +1073,11 @@ kibnal_check_sends (kib_conn_t *conn)
                         spin_unlock(&conn->ibc_lock);
                         
                         if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
-                                CERROR ("Error %d posting transmit to "LPX64"\n", 
-                                        vvrc, conn->ibc_peer->ibp_nid);
+                                CERROR ("Error %d posting transmit to %s\n", 
+                                        vvrc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
                         else
-                                CDEBUG (D_NET, "Error %d posting transmit to "
-                                        LPX64"\n", rc, conn->ibc_peer->ibp_nid);
+                                CDEBUG (D_NET, "Error %d posting transmit to %s\n",
+                                        rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
 
                         kibnal_close_conn (conn, rc);
 
@@ -1049,10 +1105,11 @@ kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc)
         if (failed &&
             tx->tx_status == 0 &&
             conn->ibc_state == IBNAL_CONN_ESTABLISHED)
-                CERROR("tx -> "LPX64" type %x cookie "LPX64
+                CDEBUG(D_NETERROR, "tx -> %s type %x cookie "LPX64
                        "sending %d waiting %d: failed %d\n", 
-                       conn->ibc_peer->ibp_nid, tx->tx_msg->ibm_type, 
-                       tx->tx_cookie, tx->tx_sending, tx->tx_waiting, vvrc);
+                       libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                       tx->tx_msg->ibm_type, tx->tx_cookie,
+                       tx->tx_sending, tx->tx_waiting, vvrc);
 
         spin_lock(&conn->ibc_lock);
 
@@ -1080,10 +1137,12 @@ kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc)
         if (idle)
                 kibnal_tx_done (tx);
 
-        if (failed)
+        if (failed) {
                 kibnal_close_conn (conn, -EIO);
-        else
+        } else {
+                kibnal_peer_alive(conn->ibc_peer);
                 kibnal_check_sends(conn);
+        }
 
         kibnal_conn_decref(conn);               /* ...until here */
 }
@@ -1276,12 +1335,14 @@ kibnal_schedule_peer_arp (kib_peer_t *peer)
 }
 
 void
-kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
+kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid)
 {
         kib_peer_t      *peer;
         kib_conn_t      *conn;
         unsigned long    flags;
         rwlock_t        *g_lock = &kibnal_data.kib_global_lock;
+        int              retry;
+        int              rc;
 
         /* If I get here, I've committed to send, so I complete the tx with
          * failure on any problems */
@@ -1289,38 +1350,51 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
         LASSERT (tx->tx_conn == NULL);          /* only set when assigned a conn */
         LASSERT (tx->tx_nwrq > 0);              /* work items have been set up */
 
-        read_lock_irqsave(g_lock, flags);
+        for (retry = 0; ; retry = 1) {
+                read_lock_irqsave(g_lock, flags);
         
-        peer = kibnal_find_peer_locked (nid);
-        if (peer == NULL) {
-                read_unlock_irqrestore(g_lock, flags);
-                tx->tx_status = -EHOSTUNREACH;
-                tx->tx_waiting = 0;
-                kibnal_tx_done (tx);
-                return;
-        }
+                peer = kibnal_find_peer_locked (nid);
+                if (peer != NULL) {
+                        conn = kibnal_find_conn_locked (peer);
+                        if (conn != NULL) {
+                                kibnal_conn_addref(conn); /* 1 ref for me... */
+                                read_unlock_irqrestore(g_lock, flags);
 
-        conn = kibnal_find_conn_locked (peer);
-        if (conn != NULL) {
-                kibnal_conn_addref(conn);       /* 1 ref for me... */
-                read_unlock_irqrestore(g_lock, flags);
+                                kibnal_queue_tx (tx, conn);
+                                kibnal_conn_decref(conn); /* ...to here */
+                                return;
+                        }
+                }
                 
-                kibnal_queue_tx (tx, conn);
-                kibnal_conn_decref(conn);       /* ...to here */
-                return;
-        }
-        
-        /* Making one or more connections; I'll need a write lock... */
-        read_unlock(g_lock);
-        write_lock(g_lock);
+                /* Making one or more connections; I'll need a write lock... */
+                read_unlock(g_lock);
+                write_lock(g_lock);
+
+                peer = kibnal_find_peer_locked (nid);
+                if (peer != NULL)
+                        break;
 
-        peer = kibnal_find_peer_locked (nid);
-        if (peer == NULL) {
                 write_unlock_irqrestore(g_lock, flags);
-                tx->tx_status = -EHOSTUNREACH;
-                tx->tx_waiting = 0;
-                kibnal_tx_done (tx);
-                return;
+
+                if (retry) {
+                        CERROR("Can't find peer %s\n", libcfs_nid2str(nid));
+
+                        tx->tx_status = -EHOSTUNREACH;
+                        tx->tx_waiting = 0;
+                        kibnal_tx_done (tx);
+                        return;
+                }
+
+                rc = kibnal_add_persistent_peer(nid, LNET_NIDADDR(nid));
+                if (rc != 0) {
+                        CERROR("Can't add peer %s: %d\n",
+                               libcfs_nid2str(nid), rc);
+                        
+                        tx->tx_status = -EHOSTUNREACH;
+                        tx->tx_waiting = 0;
+                        kibnal_tx_done (tx);
+                        return;
+                }
         }
 
         conn = kibnal_find_conn_locked (peer);
@@ -1334,17 +1408,19 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
                 return;
         }
 
-        if (peer->ibp_connecting == 0) {
-                if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) {
+        if (peer->ibp_connecting == 0 &&
+            peer->ibp_accepting == 0) {
+                if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */
+                      time_after_eq(jiffies, peer->ibp_reconnect_time))) {
                         write_unlock_irqrestore(g_lock, flags);
                         tx->tx_status = -EHOSTUNREACH;
                         tx->tx_waiting = 0;
                         kibnal_tx_done (tx);
                         return;
                 }
-        
+
                 peer->ibp_connecting = 1;
-                peer->ibp_arp_count = 1 + IBNAL_ARP_RETRIES;
+                peer->ibp_arp_count = 1 + *kibnal_tunables.kib_arp_retries;
                 kibnal_schedule_peer_arp(peer);
         }
         
@@ -1355,45 +1431,30 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
 }
 
 int
-kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist)
-{
-        /* I would guess that if kibnal_get_peer (nid) == NULL,
-           and we're not routing, then 'nid' is very distant :) */
-        if ( nal->libnal_ni.ni_pid.nid == nid ) {
-                *dist = 0;
-        } else {
-                *dist = 1;
-        }
-
-        return 0;
-}
-
-ptl_err_t
-kibnal_sendmsg(lib_nal_t    *nal, 
-               void         *private,
-               lib_msg_t    *libmsg,
-               ptl_hdr_t    *hdr, 
-               int           type, 
-               ptl_nid_t     nid, 
-               ptl_pid_t     pid,
-               unsigned int  payload_niov, 
-               struct iovec *payload_iov, 
-               ptl_kiov_t   *payload_kiov,
-               int           payload_offset,
-               int           payload_nob)
+kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
 {
-        kib_msg_t  *ibmsg;
-        kib_tx_t   *tx;
-        int         nob;
-        int         rc;
+        lnet_hdr_t       *hdr = &lntmsg->msg_hdr; 
+        int               type = lntmsg->msg_type; 
+        lnet_process_id_t target = lntmsg->msg_target;
+        int               target_is_router = lntmsg->msg_target_is_router;
+        int               routing = lntmsg->msg_routing;
+        unsigned int      payload_niov = lntmsg->msg_niov; 
+        struct iovec     *payload_iov = lntmsg->msg_iov; 
+        lnet_kiov_t      *payload_kiov = lntmsg->msg_kiov;
+        unsigned int      payload_offset = lntmsg->msg_offset;
+        unsigned int      payload_nob = lntmsg->msg_len;
+        kib_msg_t        *ibmsg;
+        kib_tx_t         *tx;
+        int               nob;
+        int               rc;
 
         /* NB 'private' is different depending on what we're sending.... */
 
-        CDEBUG(D_NET, "sending %d bytes in %d frags to nid:"LPX64
-               " pid %d\n", payload_nob, payload_niov, nid , pid);
+        CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+               payload_nob, payload_niov, libcfs_id2str(target));
 
         LASSERT (payload_nob == 0 || payload_niov > 0);
-        LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+        LASSERT (payload_niov <= LNET_MAX_IOV);
 
         /* Thread context */
         LASSERT (!in_interrupt());
@@ -1403,108 +1464,49 @@ kibnal_sendmsg(lib_nal_t    *nal,
         switch (type) {
         default:
                 LBUG();
-                return (PTL_FAIL);
+                return (-EIO);
                 
-        case PTL_MSG_REPLY: {
-                /* reply's 'private' is the incoming receive */
-                kib_rx_t *rx = private;
-
-                LASSERT(rx != NULL);
-
-                if (rx->rx_msg->ibm_type == IBNAL_MSG_IMMEDIATE) {
-                        /* RDMA not expected */
-                        nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
-                        if (nob > IBNAL_MSG_SIZE) {
-                                CERROR("REPLY for "LPX64" too big (RDMA not requested):"
-                                       "%d (max for message is %d)\n", 
-                                       nid, payload_nob, IBNAL_MSG_SIZE);
-                                CERROR("Can't REPLY IMMEDIATE %d to "LPX64"\n",
-                                       nob, nid);
-                                return PTL_FAIL;
-                        }
-                        break;
-                }
-
-                /* Incoming message consistent with RDMA? */
-                if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_REQ) {
-                        CERROR("REPLY to "LPX64" bad msg type %x!!!\n",
-                               nid, rx->rx_msg->ibm_type);
-                        return PTL_FAIL;
-                }
+        case LNET_MSG_ACK:
+                LASSERT (payload_nob == 0);
+                break;
 
-                /* NB rx_complete() will send GET_NAK when I return to it from
-                 * here, unless I set rx_responded! */
+        case LNET_MSG_GET:
+                if (routing || target_is_router)
+                        break;                  /* send IMMEDIATE */
+                
+                /* is the REPLY message too small for RDMA? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
+                if (nob <= IBNAL_MSG_SIZE)
+                        break;                  /* send IMMEDIATE */
 
-                tx = kibnal_get_idle_tx(0);
+                tx = kibnal_get_idle_tx();
                 if (tx == NULL) {
-                        CERROR("Can't get tx for REPLY to "LPX64"\n", nid);
-                        return PTL_FAIL;
-                }
-
-                if (payload_nob == 0)
-                        rc = 0;
-                else if (payload_kiov == NULL)
-                        rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, 
-                                                 payload_niov, payload_iov, 
-                                                 payload_offset, payload_nob);
-                else
-                        rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
-                                                  payload_niov, payload_kiov,
-                                                  payload_offset, payload_nob);
-                if (rc != 0) {
-                        CERROR("Can't setup GET src for "LPX64": %d\n", nid, rc);
-                        kibnal_tx_done(tx);
-                        return PTL_FAIL;
+                        CERROR("Can't allocate txd for GET to %s\n",
+                               libcfs_nid2str(target.nid));
+                        return -ENOMEM;
                 }
                 
-                rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, payload_nob,
-                                      &rx->rx_msg->ibm_u.get.ibgm_rd,
-                                      rx->rx_msg->ibm_u.get.ibgm_cookie);
-                if (rc < 0) {
-                        CERROR("Can't setup rdma for GET from "LPX64": %d\n", 
-                               nid, rc);
-                } else if (rc == 0) {
-                        /* No RDMA: local completion may happen now! */
-                        lib_finalize (&kibnal_lib, NULL, libmsg, PTL_OK);
-                } else {
-                        /* RDMA: lib_finalize(libmsg) when it completes */
-                        tx->tx_libmsg[0] = libmsg;
-                }
-
-                kibnal_queue_tx(tx, rx->rx_conn);
-                rx->rx_responded = 1;
-                return (rc >= 0) ? PTL_OK : PTL_FAIL;
-        }
-
-        case PTL_MSG_GET:
-                /* will the REPLY message be small enough not to need RDMA? */
-                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]);
-                if (nob <= IBNAL_MSG_SIZE)
-                        break;
-
-                tx = kibnal_get_idle_tx(1);     /* may block; caller is an app thread */
-                LASSERT (tx != NULL);
-
                 ibmsg = tx->tx_msg;
                 ibmsg->ibm_u.get.ibgm_hdr = *hdr;
                 ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
 
-                if ((libmsg->md->options & PTL_MD_KIOV) == 0)
+                if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
                         rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
                                                  vv_acc_r_mem_write,
-                                                 libmsg->md->md_niov,
-                                                 libmsg->md->md_iov.iov,
-                                                 0, libmsg->md->length);
+                                                 lntmsg->msg_md->md_niov,
+                                                 lntmsg->msg_md->md_iov.iov,
+                                                 0, lntmsg->msg_md->md_length);
                 else
                         rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
                                                   vv_acc_r_mem_write,
-                                                  libmsg->md->md_niov,
-                                                  libmsg->md->md_iov.kiov,
-                                                  0, libmsg->md->length);
+                                                  lntmsg->msg_md->md_niov,
+                                                  lntmsg->msg_md->md_iov.kiov,
+                                                  0, lntmsg->msg_md->md_length);
                 if (rc != 0) {
-                        CERROR("Can't setup GET sink for "LPX64": %d\n", nid, rc);
+                        CERROR("Can't setup GET sink for %s: %d\n",
+                               libcfs_nid2str(target.nid), rc);
                         kibnal_tx_done(tx);
-                        return PTL_FAIL;
+                        return -EIO;
                 }
 
 #if IBNAL_USE_FMR
@@ -1518,30 +1520,34 @@ kibnal_sendmsg(lib_nal_t    *nal,
 #endif
                 kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
 
-                tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, nid, libmsg);
-                if (tx->tx_libmsg[1] == NULL) {
-                        CERROR("Can't create reply for GET -> "LPX64"\n", nid);
+                tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni,
+                                                         lntmsg);
+                if (tx->tx_lntmsg[1] == NULL) {
+                        CERROR("Can't create reply for GET -> %s\n",
+                               libcfs_nid2str(target.nid));
                         kibnal_tx_done(tx);
-                        return PTL_FAIL;
+                        return -EIO;
                 }
 
-                tx->tx_libmsg[0] = libmsg;      /* finalise libmsg[0,1] on completion */
+                tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg[0,1] on completion */
                 tx->tx_waiting = 1;             /* waiting for GET_DONE */
-                kibnal_launch_tx(tx, nid);
-                return PTL_OK;
-
-        case PTL_MSG_ACK:
-                LASSERT (payload_nob == 0);
-                break;
+                kibnal_launch_tx(tx, target.nid);
+                return 0;
 
-        case PTL_MSG_PUT:
+        case LNET_MSG_REPLY:
+        case LNET_MSG_PUT:
                 /* Is the payload small enough not to need RDMA? */
                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
                 if (nob <= IBNAL_MSG_SIZE)
-                        break;
+                        break;                  /* send IMMEDIATE */
 
-                tx = kibnal_get_idle_tx(1);     /* may block: caller is app thread */
-                LASSERT (tx != NULL);
+                tx = kibnal_get_idle_tx();
+                if (tx == NULL) {
+                        CERROR("Can't allocate %s txd for %s\n",
+                               type == LNET_MSG_PUT ? "PUT" : "REPLY",
+                               libcfs_nid2str(target.nid));
+                        return -ENOMEM;
+                }
 
                 if (payload_kiov == NULL)
                         rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0,
@@ -1552,9 +1558,10 @@ kibnal_sendmsg(lib_nal_t    *nal,
                                                   payload_niov, payload_kiov,
                                                   payload_offset, payload_nob);
                 if (rc != 0) {
-                        CERROR("Can't setup PUT src for "LPX64": %d\n", nid, rc);
+                        CERROR("Can't setup PUT src for %s: %d\n",
+                               libcfs_nid2str(target.nid), rc);
                         kibnal_tx_done(tx);
-                        return PTL_FAIL;
+                        return -EIO;
                 }
 
                 ibmsg = tx->tx_msg;
@@ -1562,74 +1569,132 @@ kibnal_sendmsg(lib_nal_t    *nal,
                 ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
 
-                tx->tx_libmsg[0] = libmsg;      /* finalise libmsg on completion */
+                tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
                 tx->tx_waiting = 1;             /* waiting for PUT_{ACK,NAK} */
-                kibnal_launch_tx(tx, nid);
-                return PTL_OK;
+                kibnal_launch_tx(tx, target.nid);
+                return 0;
         }
 
+        /* send IMMEDIATE */
+
         LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
                  <= IBNAL_MSG_SIZE);
 
-        tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK ||
-                                  type == PTL_MSG_REPLY));
+        tx = kibnal_get_idle_tx();
         if (tx == NULL) {
-                CERROR ("Can't send %d to "LPX64": tx descs exhausted\n", type, nid);
-                return PTL_NO_SPACE;
+                CERROR ("Can't send %d to %s: tx descs exhausted\n",
+                        type, libcfs_nid2str(target.nid));
+                return -ENOMEM;
         }
 
         ibmsg = tx->tx_msg;
         ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
 
-        if (payload_nob > 0) {
-                if (payload_kiov != NULL)
-                        lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload,
-                                          payload_niov, payload_kiov,
-                                          payload_offset, payload_nob);
-                else
-                        lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload,
-                                         payload_niov, payload_iov,
-                                         payload_offset, payload_nob);
-        }
+        if (payload_kiov != NULL)
+                lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg,
+                                    offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                    payload_niov, payload_kiov,
+                                    payload_offset, payload_nob);
+        else
+                lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg,
+                                   offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                   payload_niov, payload_iov,
+                                   payload_offset, payload_nob);
 
         nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
         kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob);
 
-        tx->tx_libmsg[0] = libmsg;              /* finalise libmsg on completion */
-        kibnal_launch_tx(tx, nid);
-        return PTL_OK;
+        tx->tx_lntmsg[0] = lntmsg;              /* finalise lntmsg on completion */
+        kibnal_launch_tx(tx, target.nid);
+        return 0;
 }
 
-ptl_err_t
-kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie,
-               ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
-               unsigned int payload_niov, struct iovec *payload_iov,
-               size_t payload_offset, size_t payload_len)
+void
+kibnal_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg)
 {
-        CDEBUG(D_NET, "  pid = %d, nid="LPU64"\n",
-               pid, nid);
-        return (kibnal_sendmsg(nal, private, cookie,
-                               hdr, type, nid, pid,
-                               payload_niov, payload_iov, NULL,
-                               payload_offset, payload_len));
+        lnet_process_id_t target = lntmsg->msg_target;
+        unsigned int      niov = lntmsg->msg_niov; 
+        struct iovec     *iov = lntmsg->msg_iov; 
+        lnet_kiov_t      *kiov = lntmsg->msg_kiov;
+        unsigned int      offset = lntmsg->msg_offset;
+        unsigned int      nob = lntmsg->msg_len;
+        kib_tx_t         *tx;
+        int               rc;
+        
+        tx = kibnal_get_idle_tx();
+        if (tx == NULL) {
+                CERROR("Can't get tx for REPLY to %s\n",
+                       libcfs_nid2str(target.nid));
+                goto failed_0;
+        }
+
+        if (nob == 0)
+                rc = 0;
+        else if (kiov == NULL)
+                rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, 
+                                         niov, iov, offset, nob);
+        else
+                rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0,
+                                          niov, kiov, offset, nob);
+
+        if (rc != 0) {
+                CERROR("Can't setup GET src for %s: %d\n",
+                       libcfs_nid2str(target.nid), rc);
+                goto failed_1;
+        }
+        
+        rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, nob,
+                              &rx->rx_msg->ibm_u.get.ibgm_rd,
+                              rx->rx_msg->ibm_u.get.ibgm_cookie);
+        if (rc < 0) {
+                CERROR("Can't setup rdma for GET from %s: %d\n", 
+                       libcfs_nid2str(target.nid), rc);
+                goto failed_1;
+        }
+        
+        if (rc == 0) {
+                /* No RDMA: local completion may happen now! */
+                lnet_finalize(ni, lntmsg, 0);
+        } else {
+                /* RDMA: lnet_finalize(lntmsg) when it
+                 * completes */
+                tx->tx_lntmsg[0] = lntmsg;
+        }
+        
+        kibnal_queue_tx(tx, rx->rx_conn);
+        return;
+        
+ failed_1:
+        kibnal_tx_done(tx);
+ failed_0:
+        lnet_finalize(ni, lntmsg, -EIO);
 }
 
-ptl_err_t
-kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, 
-                     ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
-                     unsigned int payload_niov, ptl_kiov_t *payload_kiov, 
-                     size_t payload_offset, size_t payload_len)
+int
+kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+                   void **new_private)
 {
-        return (kibnal_sendmsg(nal, private, cookie,
-                               hdr, type, nid, pid,
-                               payload_niov, NULL, payload_kiov,
-                               payload_offset, payload_len));
+        kib_rx_t    *rx = private;
+        kib_conn_t  *conn = rx->rx_conn;
+
+        if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
+                /* Can't block if RDMA completions need normal credits */
+                LCONSOLE_ERROR("Dropping message from %s: no buffers free. "
+                               "%s is running an old version of LNET that may "
+                               "deadlock if messages wait for buffers)\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid),
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                return -EDEADLK;
+        }
+        
+        *new_private = private;
+        return 0;
 }
 
-ptl_err_t
-kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
-                 unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov,
-                 size_t offset, int mlen, int rlen)
+int
+kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+             unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+             unsigned int offset, unsigned int mlen, unsigned int rlen)
 {
         kib_rx_t    *rx = private;
         kib_msg_t   *rxmsg = rx->rx_msg;
@@ -1637,10 +1702,10 @@ kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
         kib_tx_t    *tx;
         kib_msg_t   *txmsg;
         int          nob;
-        int          rc;
+        int          post_cred = 1;
+        int          rc = 0;
         
         LASSERT (mlen <= rlen);
-        LASSERT (mlen >= 0);
         LASSERT (!in_interrupt());
         /* Either all pages or all vaddrs */
         LASSERT (!(kiov != NULL && iov != NULL));
@@ -1651,38 +1716,42 @@ kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
                 
         case IBNAL_MSG_IMMEDIATE:
                 nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
-                if (nob > IBNAL_MSG_SIZE) {
-                        CERROR ("Immediate message from "LPX64" too big: %d\n",
-                                rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen);
-                        return (PTL_FAIL);
+                if (nob > rx->rx_nob) {
+                        CERROR ("Immediate message from %s too big: %d(%d)\n",
+                                libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
+                                nob, rx->rx_nob);
+                        rc = -EPROTO;
+                        break;
                 }
 
                 if (kiov != NULL)
-                        lib_copy_buf2kiov(niov, kiov, offset,
-                                          rxmsg->ibm_u.immediate.ibim_payload,
-                                          mlen);
+                        lnet_copy_flat2kiov(niov, kiov, offset,
+                                            IBNAL_MSG_SIZE, rxmsg,
+                                            offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                            mlen);
                 else
-                        lib_copy_buf2iov(niov, iov, offset,
-                                         rxmsg->ibm_u.immediate.ibim_payload,
-                                         mlen);
-
-                lib_finalize (nal, NULL, libmsg, PTL_OK);
-                return (PTL_OK);
+                        lnet_copy_flat2iov(niov, iov, offset,
+                                           IBNAL_MSG_SIZE, rxmsg,
+                                           offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+                                           mlen);
+                lnet_finalize (ni, lntmsg, 0);
+                break;
 
         case IBNAL_MSG_PUT_REQ:
-                /* NB rx_complete() will send PUT_NAK when I return to it from
-                 * here, unless I set rx_responded!  */
-
-                if (mlen == 0) { /* No payload to RDMA */
-                        lib_finalize(nal, NULL, libmsg, PTL_OK);
-                        return PTL_OK;
+                if (mlen == 0) {
+                        lnet_finalize(ni, lntmsg, 0);
+                        kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0,
+                                               rxmsg->ibm_u.putreq.ibprm_cookie);
+                        break;
                 }
-
-                tx = kibnal_get_idle_tx(0);
+                
+                tx = kibnal_get_idle_tx();
                 if (tx == NULL) {
-                        CERROR("Can't allocate tx for "LPX64"\n",
-                               conn->ibc_peer->ibp_nid);
-                        return PTL_FAIL;
+                        CERROR("Can't allocate tx for %s\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid));
+                        /* Not replying will break the connection */
+                        rc = -ENOMEM;
+                        break;
                 }
 
                 txmsg = tx->tx_msg;
@@ -1697,10 +1766,13 @@ kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
                                                   vv_acc_r_mem_write,
                                                   niov, kiov, offset, mlen);
                 if (rc != 0) {
-                        CERROR("Can't setup PUT sink for "LPX64": %d\n",
-                               conn->ibc_peer->ibp_nid, rc);
+                        CERROR("Can't setup PUT sink for %s: %d\n",
+                               libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
                         kibnal_tx_done(tx);
-                        return PTL_FAIL;
+                        /* tell peer it's over */
+                        kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, rc,
+                                               rxmsg->ibm_u.putreq.ibprm_cookie);
+                        break;
                 }
 
                 txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
@@ -1716,39 +1788,29 @@ kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
 #endif
                 kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
 
-                tx->tx_libmsg[0] = libmsg;      /* finalise libmsg on completion */
+                tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg on completion */
                 tx->tx_waiting = 1;             /* waiting for PUT_DONE */
                 kibnal_queue_tx(tx, conn);
 
-                LASSERT (!rx->rx_responded);
-                rx->rx_responded = 1;
-                return PTL_OK;
+                if (conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD)
+                        post_cred = 0; /* peer still owns 'rx' for sending PUT_DONE */
+                break;
 
         case IBNAL_MSG_GET_REQ:
-                /* We get called here just to discard any junk after the
-                 * GET hdr. */
-                LASSERT (libmsg == NULL);
-                lib_finalize (nal, NULL, libmsg, PTL_OK);
-                return (PTL_OK);
+                if (lntmsg != NULL) {
+                        /* Optimized GET; RDMA lntmsg's payload */
+                        kibnal_reply(ni, rx, lntmsg);
+                } else {
+                        /* GET didn't match anything */
+                        kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, 
+                                               -ENODATA,
+                                               rxmsg->ibm_u.get.ibgm_cookie);
+                }
+                break;
         }
-}
 
-ptl_err_t
-kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg,
-              unsigned int niov, struct iovec *iov, 
-              size_t offset, size_t mlen, size_t rlen)
-{
-        return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL,
-                                offset, mlen, rlen));
-}
-
-ptl_err_t
-kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg,
-                     unsigned int niov, ptl_kiov_t *kiov, 
-                     size_t offset, size_t mlen, size_t rlen)
-{
-        return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov,
-                                offset, mlen, rlen));
+        kibnal_post_rx(rx, post_cred, 0);
+        return rc;
 }
 
 int
@@ -1770,6 +1832,41 @@ kibnal_thread_fini (void)
 }
 
 void
+kibnal_peer_alive (kib_peer_t *peer)
+{
+        /* This is racy, but everyone's only writing cfs_time_current() */
+        peer->ibp_last_alive = cfs_time_current();
+        mb();
+}
+
+void
+kibnal_peer_notify (kib_peer_t *peer)
+{
+        time_t        last_alive = 0;
+        int           error = 0;
+        unsigned long flags;
+        
+        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+
+        if (list_empty(&peer->ibp_conns) &&
+            peer->ibp_accepting == 0 &&
+            peer->ibp_connecting == 0 &&
+            peer->ibp_error != 0) {
+                error = peer->ibp_error;
+                peer->ibp_error = 0;
+                
+                last_alive = cfs_time_current_sec() -
+                             cfs_duration_sec(cfs_time_current() -
+                                              peer->ibp_last_alive);
+        }
+        
+        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+        
+        if (error != 0)
+                lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive);
+}
+
+void
 kibnal_schedule_conn (kib_conn_t *conn)
 {
         unsigned long flags;
@@ -1787,7 +1884,7 @@ kibnal_schedule_conn (kib_conn_t *conn)
 void
 kibnal_close_conn_locked (kib_conn_t *conn, int error)
 {
-        /* This just does the immmediate housekeeping.  'error' is zero for a
+        /* This just does the immediate housekeeping.  'error' is zero for a
          * normal shutdown which can happen only after the connection has been
          * established.  If the connection is established, schedule the
          * connection to be finished off by the connd.  Otherwise the connd is
@@ -1808,48 +1905,33 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error)
 
         if (error == 0 &&
             list_empty(&conn->ibc_tx_queue) &&
+            list_empty(&conn->ibc_tx_queue_rsrvd) &&
+            list_empty(&conn->ibc_tx_queue_nocred) &&
             list_empty(&conn->ibc_active_txs)) {
-                CDEBUG(D_NET, "closing conn to "LPX64
+                CDEBUG(D_NET, "closing conn to %s"
                        " rx# "LPD64" tx# "LPD64"\n", 
-                       peer->ibp_nid, conn->ibc_txseq, conn->ibc_rxseq);
+                       libcfs_nid2str(peer->ibp_nid),
+                       conn->ibc_txseq, conn->ibc_rxseq);
         } else {
-                CERROR("Closing conn to "LPX64": error %d%s%s"
+                CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s"
                        " rx# "LPD64" tx# "LPD64"\n",
-                       peer->ibp_nid, error,
+                       libcfs_nid2str(peer->ibp_nid), error,
                        list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
+                       list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
+                       list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
                        list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
                        conn->ibc_txseq, conn->ibc_rxseq);
-
-#if 0
-                /* can't skip down the queue without holding ibc_lock (see above) */
-                list_for_each(tmp, &conn->ibc_tx_queue) {
-                        kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
-                        
-                        CERROR("   queued tx type %x cookie "LPX64
-                               " sending %d waiting %d ticks %ld/%d\n", 
-                               tx->tx_msg->ibm_type, tx->tx_cookie, 
-                               tx->tx_sending, tx->tx_waiting,
-                               (long)(tx->tx_deadline - jiffies), HZ);
-                }
-
-                list_for_each(tmp, &conn->ibc_active_txs) {
-                        kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
-                        
-                        CERROR("   active tx type %x cookie "LPX64
-                               " sending %d waiting %d ticks %ld/%d\n", 
-                               tx->tx_msg->ibm_type, tx->tx_cookie, 
-                               tx->tx_sending, tx->tx_waiting,
-                               (long)(tx->tx_deadline - jiffies), HZ);
-                }
-#endif
         }
 
         list_del (&conn->ibc_list);
-        
-        if (list_empty (&peer->ibp_conns) &&    /* no more conns */
-            peer->ibp_persistence == 0 &&       /* non-persistent peer */
-            kibnal_peer_active(peer)) {         /* still in peer table */
-                kibnal_unlink_peer_locked (peer);
+
+        if (list_empty (&peer->ibp_conns)) {   /* no more conns */
+                if (peer->ibp_persistence == 0 && /* non-persistent peer */
+                    kibnal_peer_active(peer))     /* still in peer table */
+                        kibnal_unlink_peer_locked (peer);
+
+                /* set/clear error on last conn */
+                peer->ibp_error = conn->ibc_comms_error;
         }
 
         kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1);
@@ -1894,84 +1976,76 @@ kibnal_handle_early_rxs(kib_conn_t *conn)
 }
 
 void
-kibnal_conn_disconnected(kib_conn_t *conn)
+kibnal_abort_txs(kib_conn_t *conn, struct list_head *txs)
 {
-        LIST_HEAD        (zombies); 
-        struct list_head *tmp;
-        struct list_head *nxt;
-        kib_tx_t         *tx;
-
-        /* I'm the connd */
-        LASSERT (!in_interrupt());
-        LASSERT (current == kibnal_data.kib_connd);
-        LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
-        
-        kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
-
-        /* move QP to error state to make posted work items complete */
-        kibnal_set_qp_state(conn, vv_qp_state_error);
+        LIST_HEAD           (zombies); 
+        struct list_head    *tmp;
+        struct list_head    *nxt;
+        kib_tx_t            *tx;
 
         spin_lock(&conn->ibc_lock);
 
-        /* Complete all tx descs not waiting for sends to complete.
-         * NB we should be safe from RDMA now that the QP has changed state */
-
-        list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
+        list_for_each_safe (tmp, nxt, txs) {
                 tx = list_entry (tmp, kib_tx_t, tx_list);
 
-                LASSERT (tx->tx_queued);
-
+                if (txs == &conn->ibc_active_txs) {
+                        LASSERT (!tx->tx_queued);
+                        LASSERT (tx->tx_waiting || tx->tx_sending != 0);
+                } else {
+                        LASSERT (tx->tx_queued);
+                }
+                
                 tx->tx_status = -ECONNABORTED;
                 tx->tx_queued = 0;
                 tx->tx_waiting = 0;
                 
-                if (tx->tx_sending != 0)
-                        continue;
-
-                list_del (&tx->tx_list);
-                list_add (&tx->tx_list, &zombies);
+                if (tx->tx_sending == 0) {
+                        list_del (&tx->tx_list);
+                        list_add (&tx->tx_list, &zombies);
+                }
         }
 
-        list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
-                tx = list_entry (tmp, kib_tx_t, tx_list);
-
-                LASSERT (!tx->tx_queued);
-                LASSERT (tx->tx_waiting ||
-                         tx->tx_sending != 0);
+        spin_unlock(&conn->ibc_lock);
 
-                tx->tx_status = -ECONNABORTED;
-                tx->tx_waiting = 0;
-                
-                if (tx->tx_sending != 0)
-                        continue;
+        kibnal_txlist_done(&zombies, -ECONNABORTED);
+}
 
-                list_del (&tx->tx_list);
-                list_add (&tx->tx_list, &zombies);
-        }
+void
+kibnal_conn_disconnected(kib_conn_t *conn)
+{
+        /* I'm the connd */
+        LASSERT (!in_interrupt());
+        LASSERT (current == kibnal_data.kib_connd);
+        LASSERT (conn->ibc_state >= IBNAL_CONN_INIT);
         
-        spin_unlock(&conn->ibc_lock);
+        kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
 
-        while (!list_empty(&zombies)) {
-                tx = list_entry (zombies.next, kib_tx_t, tx_list);
+        /* move QP to error state to make posted work items complete */
+        kibnal_set_qp_state(conn, vv_qp_state_error);
 
-                list_del(&tx->tx_list);
-                kibnal_tx_done (tx);
-        }
+        /* Complete all tx descs not waiting for sends to complete.
+         * NB we should be safe from RDMA now that the QP has changed state */
+
+        kibnal_abort_txs(conn, &conn->ibc_tx_queue);
+        kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
+        kibnal_abort_txs(conn, &conn->ibc_tx_queue_nocred);
+        kibnal_abort_txs(conn, &conn->ibc_active_txs);
 
         kibnal_handle_early_rxs(conn);
+
+        kibnal_peer_notify(conn->ibc_peer);
 }
 
 void
-kibnal_peer_connect_failed (kib_peer_t *peer, int active)
+kibnal_peer_connect_failed (kib_peer_t *peer, int active, int error)
 {
-        struct list_head  zombies;
-        kib_tx_t         *tx;
+        LIST_HEAD        (zombies);
         unsigned long     flags;
 
         /* Only the connd creates conns => single threaded */
+        LASSERT (error != 0);
         LASSERT (!in_interrupt());
         LASSERT (current == kibnal_data.kib_connd);
-        LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL);
 
         write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
 
@@ -1979,10 +2053,12 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active)
                 LASSERT (peer->ibp_connecting != 0);
                 peer->ibp_connecting--;
         } else {
-                LASSERT (!kibnal_peer_active(peer));
+                LASSERT (peer->ibp_accepting != 0);
+                peer->ibp_accepting--;
         }
         
-        if (peer->ibp_connecting != 0) {
+        if (peer->ibp_connecting != 0 ||
+            peer->ibp_accepting != 0) {
                 /* another connection attempt under way (loopback?)... */
                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
                 return;
@@ -1990,11 +2066,17 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active)
 
         if (list_empty(&peer->ibp_conns)) {
                 /* Say when active connection can be re-attempted */
-                peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
-                /* Increase reconnection interval */
-                peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
-                                                    IBNAL_MAX_RECONNECT_INTERVAL);
-        
+                peer->ibp_reconnect_interval *= 2;
+                peer->ibp_reconnect_interval =
+                        MAX(peer->ibp_reconnect_interval,
+                            *kibnal_tunables.kib_min_reconnect_interval);
+                peer->ibp_reconnect_interval =
+                        MIN(peer->ibp_reconnect_interval,
+                            *kibnal_tunables.kib_max_reconnect_interval);
+                
+                peer->ibp_reconnect_time = jiffies + 
+                                           peer->ibp_reconnect_interval * HZ;
+
                 /* Take peer's blocked transmits to complete with error */
                 list_add(&zombies, &peer->ibp_tx_queue);
                 list_del_init(&peer->ibp_tx_queue);
@@ -2004,6 +2086,8 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active)
                         /* failed connection attempt on non-persistent peer */
                         kibnal_unlink_peer_locked (peer);
                 }
+
+                peer->ibp_error = error;
         } else {
                 /* Can't have blocked transmits if there are connections */
                 LASSERT (list_empty(&peer->ibp_tx_queue));
@@ -2011,31 +2095,49 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active)
         
         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 
+        kibnal_peer_notify(peer);
+
         if (list_empty (&zombies)) 
                 return;
         
-        CERROR ("Deleting messages for "LPX64": connection failed\n", peer->ibp_nid);
-        do {
-                tx = list_entry (zombies.next, kib_tx_t, tx_list);
+        CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n",
+                libcfs_nid2str(peer->ibp_nid));
 
-                list_del (&tx->tx_list);
-                /* complete now */
-                tx->tx_status = -EHOSTUNREACH;
-                kibnal_tx_done (tx);
-        } while (!list_empty (&zombies));
+        kibnal_txlist_done(&zombies, -EHOSTUNREACH);
 }
 
 void
-kibnal_connreq_done(kib_conn_t *conn, int active, int status)
+kibnal_reject(cm_cep_handle_t cep, int why)
 {
-        static cm_reject_data_t   rej;
+        static cm_reject_data_t   rejs[3];
+        cm_reject_data_t         *rej = &rejs[why];
+
+        LASSERT (why >= 0 && why < sizeof(rejs)/sizeof(rejs[0]));
+
+        /* If I wasn't so lazy, I'd initialise this only once; it's effectively
+         * read-only */
+        rej->reason = cm_rej_code_usr_rej;
+        rej->priv_data[0] = (IBNAL_MSG_MAGIC) & 0xff;
+        rej->priv_data[1] = (IBNAL_MSG_MAGIC >> 8) & 0xff;
+        rej->priv_data[2] = (IBNAL_MSG_MAGIC >> 16) & 0xff;
+        rej->priv_data[3] = (IBNAL_MSG_MAGIC >> 24) & 0xff;
+        rej->priv_data[4] = (IBNAL_MSG_VERSION) & 0xff;
+        rej->priv_data[5] = (IBNAL_MSG_VERSION >> 8) & 0xff;
+        rej->priv_data[6] = why;
+
+        cm_reject(cep, rej);
+}
 
+void
+kibnal_connreq_done(kib_conn_t *conn, int active, int status)
+{
         struct list_head   txs;
         kib_peer_t        *peer = conn->ibc_peer;
-        kib_peer_t        *peer2;
         unsigned long      flags;
         kib_tx_t          *tx;
 
+        CDEBUG(D_NET,"%d\n", status);
+
         /* Only the connd creates conns => single threaded */
         LASSERT (!in_interrupt());
         LASSERT (current == kibnal_data.kib_connd);
@@ -2044,10 +2146,10 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status)
         if (active) {
                 LASSERT (peer->ibp_connecting > 0);
         } else {
-                LASSERT (!kibnal_peer_active(peer));
+                LASSERT (peer->ibp_accepting > 0);
         }
         
-        PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+        LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
         conn->ibc_connvars = NULL;
 
         if (status != 0) {
@@ -2059,15 +2161,13 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status)
                 case IBNAL_CONN_ACTIVE_CHECK_REPLY:
                         /* got a connection reply but failed checks */
                         LASSERT (active);
-                        memset(&rej, 0, sizeof(rej));
-                        rej.reason = cm_rej_code_usr_rej;
-                        cm_reject(conn->ibc_cep, &rej);
+                        kibnal_reject(conn->ibc_cep, IBNAL_REJECT_FATAL);
                         break;
 
                 case IBNAL_CONN_ACTIVE_CONNECT:
                         LASSERT (active);
                         cm_cancel(conn->ibc_cep);
-                        kibnal_pause(HZ/10);
+                        cfs_pause(cfs_time_seconds(1)/10);
                         /* cm_connect() failed immediately or
                          * callback returned failure */
                         break;
@@ -2087,7 +2187,7 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status)
                         break;
                 }
 
-                kibnal_peer_connect_failed(conn->ibc_peer, active);
+                kibnal_peer_connect_failed(conn->ibc_peer, active, status);
                 kibnal_conn_disconnected(conn);
                 return;
         }
@@ -2101,24 +2201,10 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status)
                 LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
         }
         
+        conn->ibc_last_send = jiffies;
         kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED);
+        kibnal_peer_alive(peer);
 
-        if (!active) {
-                peer2 = kibnal_find_peer_locked(peer->ibp_nid);
-                if (peer2 != NULL) {
-                        /* already in the peer table; swap */
-                        conn->ibc_peer = peer2;
-                        kibnal_peer_addref(peer2);
-                        kibnal_peer_decref(peer);
-                        peer = conn->ibc_peer;
-                } else {
-                        /* add 'peer' to the peer table */
-                        kibnal_peer_addref(peer);
-                        list_add_tail(&peer->ibp_list,
-                                      kibnal_nid2peerlist(peer->ibp_nid));
-                }
-        }
-        
         /* Add conn to peer's list and nuke any dangling conns from a different
          * peer instance... */
         kibnal_conn_addref(conn);               /* +1 ref for ibc_list */
@@ -2134,19 +2220,21 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status)
                 kibnal_close_conn_locked(conn, -ECONNABORTED);
 
                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
-                kibnal_peer_connect_failed(peer, active);
+                kibnal_peer_connect_failed(peer, active, -ECONNABORTED);
                 return;
         }
 
         if (active)
                 peer->ibp_connecting--;
+        else
+                peer->ibp_accepting--;
 
         /* grab pending txs while I have the lock */
         list_add(&txs, &peer->ibp_tx_queue);
         list_del_init(&peer->ibp_tx_queue);
         
-        /* reset reconnect interval for next attempt */
-        peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL;
+        peer->ibp_reconnect_interval = 0;       /* OK to reconnect at any time */
+
         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 
         /* Schedule blocked txs */
@@ -2205,12 +2293,12 @@ kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
                         break;
 
                 case IBNAL_CONN_DISCONNECT1:
-                        /* kibnal_terminate_conn is getting there; It'll see
+                        /* kibnal_disconnect_conn is getting there; It'll see
                          * ibc_disconnect set... */
                         break;
 
                 case IBNAL_CONN_DISCONNECT2:
-                        /* kibnal_terminate_conn got there already; complete
+                        /* kibnal_disconnect_conn got there already; complete
                          * the disconnect. */
                         kibnal_schedule_conn(conn);
                         break;
@@ -2225,7 +2313,7 @@ kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
                 LASSERT (!conn->ibc_disconnect);
                 conn->ibc_disconnect = 1;
 
-                /* kibnal_terminate_conn sent the disconnect request. */
+                /* kibnal_disconnect_conn sent the disconnect request. */
                 kibnal_schedule_conn(conn);
 
                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
@@ -2279,13 +2367,16 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
         static kib_msg_t        txmsg;
         static kib_msg_t        rxmsg;
         static cm_reply_data_t  reply;
-        static cm_reject_data_t reject;
 
         kib_conn_t         *conn = NULL;
         int                 rc = 0;
+        int                 reason;
         int                 rxmsgnob;
+        rwlock_t           *g_lock = &kibnal_data.kib_global_lock;
+        kib_peer_t         *peer;
+        kib_peer_t         *peer2;
+        unsigned long       flags;
         kib_connvars_t     *cv;
-        kib_peer_t         *tmp_peer;
         cm_return_t         cmrc;
         vv_return_t         vvrc;
         
@@ -2294,9 +2385,10 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
         LASSERT (!in_interrupt());
         LASSERT (current == kibnal_data.kib_connd);
 
-        if (cmreq->sid != IBNAL_SERVICE_NUMBER) {
+        if (cmreq->sid != (__u64)(*kibnal_tunables.kib_service_number)) {
                 CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n",
-                       cmreq->sid, (__u64)IBNAL_SERVICE_NUMBER);
+                       cmreq->sid, (__u64)(*kibnal_tunables.kib_service_number));
+                reason = IBNAL_REJECT_FATAL;
                 goto reject;
         }
 
@@ -2304,63 +2396,121 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
         rxmsgnob = MIN(cm_REQ_priv_data_len, sizeof(rxmsg));
         memcpy(&rxmsg, cmreq->priv_data, rxmsgnob);
 
-        rc = kibnal_unpack_msg(&rxmsg, rxmsgnob);
+        rc = kibnal_unpack_msg(&rxmsg, 0, rxmsgnob);
         if (rc != 0) {
-                CERROR("Can't parse connection request: %d\n", rc);
+                /* SILENT! kibnal_unpack_msg() complains if required */
+                reason = IBNAL_REJECT_FATAL;
                 goto reject;
         }
 
+        if (rxmsg.ibm_version != IBNAL_MSG_VERSION)
+                CWARN("Connection from %s: old protocol version 0x%x\n",
+                      libcfs_nid2str(rxmsg.ibm_srcnid), rxmsg.ibm_version);
+
         if (rxmsg.ibm_type != IBNAL_MSG_CONNREQ) {
-                CERROR("Unexpected connreq msg type: %x from "LPX64"\n",
-                       rxmsg.ibm_type, rxmsg.ibm_srcnid);
+                CERROR("Unexpected connreq msg type: %x from %s\n",
+                       rxmsg.ibm_type, libcfs_nid2str(rxmsg.ibm_srcnid));
+                reason = IBNAL_REJECT_FATAL;
                 goto reject;
         }
 
-        if (rxmsg.ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid) {
-                CERROR("Can't accept "LPX64": bad dst nid "LPX64"\n",
-                       rxmsg.ibm_srcnid, rxmsg.ibm_dstnid);
+        if (!lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
+                                     rxmsg.ibm_dstnid)) {
+                CERROR("Can't accept %s: bad dst nid %s\n",
+                       libcfs_nid2str(rxmsg.ibm_srcnid), 
+                       libcfs_nid2str(rxmsg.ibm_dstnid));
+                reason = IBNAL_REJECT_FATAL;
                 goto reject;
         }
 
         if (rxmsg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
-                CERROR("Can't accept "LPX64": incompatible queue depth %d (%d wanted)\n",
-                       rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_queue_depth, 
+                CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
+                       libcfs_nid2str(rxmsg.ibm_srcnid), 
+                       rxmsg.ibm_u.connparams.ibcp_queue_depth, 
                        IBNAL_MSG_QUEUE_SIZE);
+                reason = IBNAL_REJECT_FATAL;
                 goto reject;
         }
 
         if (rxmsg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
-                CERROR("Can't accept "LPX64": message size %d too big (%d max)\n",
-                       rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_max_msg_size, 
+                CERROR("Can't accept %s: message size %d too big (%d max)\n",
+                       libcfs_nid2str(rxmsg.ibm_srcnid), 
+                       rxmsg.ibm_u.connparams.ibcp_max_msg_size, 
                        IBNAL_MSG_SIZE);
+                reason = IBNAL_REJECT_FATAL;
                 goto reject;
         }
                 
         if (rxmsg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
-                CERROR("Can't accept "LPX64": max frags %d too big (%d max)\n",
-                       rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_max_frags, 
+                CERROR("Can't accept %s: max frags %d too big (%d max)\n",
+                       libcfs_nid2str(rxmsg.ibm_srcnid), 
+                       rxmsg.ibm_u.connparams.ibcp_max_frags, 
                        IBNAL_MAX_RDMA_FRAGS);
+                reason = IBNAL_REJECT_FATAL;
+                goto reject;
+        }
+        
+        /* assume 'rxmsg.ibm_srcnid' is a new peer; create */
+        rc = kibnal_create_peer (&peer, rxmsg.ibm_srcnid);
+        if (rc != 0) {
+                CERROR("Can't create peer for %s\n",
+                       libcfs_nid2str(rxmsg.ibm_srcnid));
+                reason = IBNAL_REJECT_NO_RESOURCES;
                 goto reject;
         }
+
+        write_lock_irqsave(g_lock, flags);
+
+        peer2 = kibnal_find_peer_locked(rxmsg.ibm_srcnid);
+        if (peer2 != NULL) {
+                /* tie-break connection race in favour of the higher NID */                
+                if (peer2->ibp_connecting != 0 &&
+                    rxmsg.ibm_srcnid < kibnal_data.kib_ni->ni_nid) {
+                        write_unlock_irqrestore(g_lock, flags);
+
+                        CWARN("Conn race %s\n",
+                              libcfs_nid2str(peer2->ibp_nid));
+
+                        kibnal_peer_decref(peer);
+                        reason = IBNAL_REJECT_CONN_RACE;
+                        goto reject;
+                }
+
+                peer2->ibp_accepting++;
+                kibnal_peer_addref(peer2);
+
+                write_unlock_irqrestore(g_lock, flags);
+                kibnal_peer_decref(peer);
+                peer = peer2;
+        } else {
+                /* Brand new peer */
+                LASSERT (peer->ibp_accepting == 0);
+                peer->ibp_accepting = 1;
+
+                kibnal_peer_addref(peer);
+                list_add_tail(&peer->ibp_list, kibnal_nid2peerlist(rxmsg.ibm_srcnid));
+
+                write_unlock_irqrestore(g_lock, flags);
+        }
                 
         conn = kibnal_create_conn(cep);
         if (conn == NULL) {
-                CERROR("Can't create conn for "LPX64"\n", rxmsg.ibm_srcnid);
-                goto reject;
-        }
-        
-        /* assume 'rxmsg.ibm_srcnid' is a new peer */
-        tmp_peer = kibnal_create_peer (rxmsg.ibm_srcnid);
-        if (tmp_peer == NULL) {
-                CERROR("Can't create tmp peer for "LPX64"\n", rxmsg.ibm_srcnid);
-                kibnal_conn_decref(conn);
-                conn = NULL;
+                CERROR("Can't create conn for %s\n",
+                       libcfs_nid2str(rxmsg.ibm_srcnid));
+                kibnal_peer_connect_failed(peer, 0, -ENOMEM);
+                kibnal_peer_decref(peer);
+                reason = IBNAL_REJECT_NO_RESOURCES;
                 goto reject;
         }
 
-        conn->ibc_peer = tmp_peer;              /* conn takes over my ref */
+        conn->ibc_version = rxmsg.ibm_version;
+
+        conn->ibc_peer = peer;              /* conn takes over my ref */
         conn->ibc_incarnation = rxmsg.ibm_srcstamp;
         conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+        conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
+        LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
+                 <= IBNAL_RX_MSGS);
 
         cv = conn->ibc_connvars;
 
@@ -2373,25 +2523,43 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
 
         vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
                              &cv->cv_path.sgid, &cv->cv_sgid_index);
-        LASSERT (vvrc == vv_return_ok);
+        if (vvrc != vv_return_ok) {
+                CERROR("gid2gid_index failed for %s: %d\n",
+                       libcfs_nid2str(rxmsg.ibm_srcnid), vvrc);
+                rc = -EIO;
+                reason = IBNAL_REJECT_FATAL;
+                goto reject;
+        }
         
         vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
                                cv->cv_path.pkey, &cv->cv_pkey_index);
-        LASSERT (vvrc == vv_return_ok);
+        if (vvrc != vv_return_ok) {
+                CERROR("pkey2pkey_index failed for %s: %d\n",
+                       libcfs_nid2str(rxmsg.ibm_srcnid), vvrc);
+                rc = -EIO;
+                reason = IBNAL_REJECT_FATAL;
+                goto reject;
+        }
 
         rc = kibnal_set_qp_state(conn, vv_qp_state_init);
-        if (rc != 0)
+        if (rc != 0) {
+                reason = IBNAL_REJECT_FATAL;
                 goto reject;
+        }
 
         rc = kibnal_post_receives(conn);
         if (rc != 0) {
-                CERROR("Can't post receives for "LPX64"\n", rxmsg.ibm_srcnid);
+                CERROR("Can't post receives for %s\n", 
+                       libcfs_nid2str(rxmsg.ibm_srcnid));
+                reason = IBNAL_REJECT_FATAL;
                 goto reject;
         }
 
         rc = kibnal_set_qp_state(conn, vv_qp_state_rtr);
-        if (rc != 0)
+        if (rc != 0) {
+                reason = IBNAL_REJECT_FATAL;
                 goto reject;
+        }
         
         memset(&reply, 0, sizeof(reply));
         reply.qpn                 = cv->cv_local_qpn;
@@ -2411,7 +2579,8 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
         txmsg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
         txmsg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
         txmsg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
-        kibnal_pack_msg(&txmsg, 0, rxmsg.ibm_srcnid, rxmsg.ibm_srcstamp, 0);
+        kibnal_pack_msg(&txmsg, conn->ibc_version,
+                        0, rxmsg.ibm_srcnid, rxmsg.ibm_srcstamp, 0);
 
         /* ...and copy into reply to avoid alignment issues */
         memcpy(&reply.priv_data, &txmsg, txmsg.ibm_nob);
@@ -2427,13 +2596,13 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq)
         /* back out state change (no callback happening) */
         kibnal_set_conn_state(conn, IBNAL_CONN_INIT);
         rc = -EIO;
+        reason = IBNAL_REJECT_FATAL;
                 
  reject:
-        CERROR("Rejected connreq from "LPX64"\n", rxmsg.ibm_srcnid);
+        CDEBUG(D_NET, "Rejecting connreq from %s\n",
+               libcfs_nid2str(rxmsg.ibm_srcnid));
 
-        memset(&reject, 0, sizeof(reject));
-        reject.reason = cm_rej_code_usr_rej;
-        cm_reject(cep, &reject);
+        kibnal_reject(cep, reason);
 
         if (conn != NULL) {
                 LASSERT (rc != 0);
@@ -2458,12 +2627,11 @@ kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg)
                 return;
         }
 
-        PORTAL_ALLOC_ATOMIC(pcr, sizeof(*pcr));
+        LIBCFS_ALLOC_ATOMIC(pcr, sizeof(*pcr));
         if (pcr == NULL) {
                 CERROR("Can't allocate passive connreq\n");
 
-                cm_reject(cep, &((cm_reject_data_t) /* NB RO struct */
-                                 {.reason = cm_rej_code_no_res,}));
+                kibnal_reject(cep, IBNAL_REJECT_NO_RESOURCES);
                 cm_destroy_cep(cep);
                 return;
         }
@@ -2504,7 +2672,7 @@ kibnal_connect_conn (kib_conn_t *conn)
         kib_connvars_t           *cv = conn->ibc_connvars;
         kib_peer_t               *peer = conn->ibc_peer;
         cm_return_t               cmrc;
-        
+
         /* Only called by connd => statics OK */
         LASSERT (!in_interrupt());
         LASSERT (current == kibnal_data.kib_connd);
@@ -2512,12 +2680,12 @@ kibnal_connect_conn (kib_conn_t *conn)
 
         memset(&cmreq, 0, sizeof(cmreq));
         
-        cmreq.sid = IBNAL_SERVICE_NUMBER;
+        cmreq.sid = (__u64)(*kibnal_tunables.kib_service_number);
 
         cmreq.cep_data.ca_guid              = kibnal_data.kib_hca_attrs.guid;
         cmreq.cep_data.qpn                  = cv->cv_local_qpn;
-        cmreq.cep_data.retry_cnt            = IBNAL_RETRY_CNT;
-        cmreq.cep_data.rtr_retry_cnt        = IBNAL_RNR_CNT;
+        cmreq.cep_data.retry_cnt            = *kibnal_tunables.kib_retry_cnt;
+        cmreq.cep_data.rtr_retry_cnt        = *kibnal_tunables.kib_rnr_cnt;
         cmreq.cep_data.start_psn            = cv->cv_rxpsn;
         cmreq.cep_data.end_to_end_flow_ctrl = IBNAL_EE_FLOW_CNT;
         // XXX ack_timeout?
@@ -2534,12 +2702,27 @@ kibnal_connect_conn (kib_conn_t *conn)
         msg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
         msg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
         msg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
-        kibnal_pack_msg(&msg, 0, peer->ibp_nid, 0, 0);
+        kibnal_pack_msg(&msg, conn->ibc_version, 0, peer->ibp_nid, 0, 0);
+
+        if (the_lnet.ln_testprotocompat != 0) {
+                /* single-shot proto check */
+                LNET_LOCK();
+                if ((the_lnet.ln_testprotocompat & 1) != 0) {
+                        msg.ibm_version++;
+                        the_lnet.ln_testprotocompat &= ~1;
+                }
+                if ((the_lnet.ln_testprotocompat & 2) != 0) {
+                        msg.ibm_magic = LNET_PROTO_MAGIC;
+                        the_lnet.ln_testprotocompat &= ~2;
+                }
+                LNET_UNLOCK();
+        }
 
         /* ...and copy into cmreq to avoid alignment issues */
         memcpy(&cmreq.priv_data, &msg, msg.ibm_nob);
         
-        CDEBUG(D_NET, "Connecting %p to "LPX64"\n", conn, peer->ibp_nid);
+        CDEBUG(D_NET, "Connecting %p to %s\n", conn,
+               libcfs_nid2str(peer->ibp_nid));
 
         kibnal_conn_addref(conn);               /* ++ref for CM callback */
         kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CONNECT);
@@ -2547,17 +2730,67 @@ kibnal_connect_conn (kib_conn_t *conn)
         cmrc = cm_connect(conn->ibc_cep, &cmreq, 
                           kibnal_active_connect_callback, conn);
         if (cmrc == cm_stat_success) {
-                CDEBUG(D_NET, "connection REQ sent to "LPX64"\n",
-                       peer->ibp_nid);
+                CDEBUG(D_NET, "connection REQ sent to %s\n",
+                       libcfs_nid2str(peer->ibp_nid));
                 return;
         }
 
-        CERROR ("Connect "LPX64" failed: %d\n", peer->ibp_nid, cmrc);
+        CERROR ("Connect %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), cmrc);
         kibnal_conn_decref(conn);       /* drop callback's ref */
         kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
 }
 
 void
+kibnal_reconnect (kib_conn_t *conn, int why)
+{
+        kib_peer_t      *peer = conn->ibc_peer;
+        int              retry;
+        unsigned long    flags;
+        cm_return_t      cmrc;
+        cm_cep_handle_t  cep;
+        
+        LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
+
+        read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+
+        LASSERT (peer->ibp_connecting > 0);          /* 'conn' at least */
+
+        /* retry connection if it's still needed and no other connection
+         * attempts (active or passive) are in progress.
+         * Immediate reconnect is required, so I don't even look at the
+         * reconnection timeout etc */
+
+        retry = (!list_empty(&peer->ibp_tx_queue) &&
+                 peer->ibp_connecting == 1 &&
+                 peer->ibp_accepting == 0);
+        
+        read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+        if (!retry) {
+                kibnal_connreq_done(conn, 1, why);
+                return;
+        }
+
+        cep = cm_create_cep(cm_cep_transp_rc);
+        if (cep == NULL) {
+                CERROR("Can't create new CEP\n");
+                kibnal_connreq_done(conn, 1, -ENOMEM);
+                return;
+        }
+
+        cmrc = cm_cancel(conn->ibc_cep);
+        LASSERT (cmrc == cm_stat_success);
+        cmrc = cm_destroy_cep(conn->ibc_cep);
+        LASSERT (cmrc == cm_stat_success);
+
+        conn->ibc_cep = cep;
+
+        /* reuse conn; no need to peer->ibp_connecting++ */
+        kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
+        kibnal_connect_conn(conn);
+}
+
+void
 kibnal_check_connreply (kib_conn_t *conn)
 {
         static cm_rtu_data_t  rtu;
@@ -2568,7 +2801,6 @@ kibnal_check_connreply (kib_conn_t *conn)
         kib_peer_t       *peer = conn->ibc_peer;
         int               msgnob;
         cm_return_t       cmrc;
-        cm_cep_handle_t   cep;
         unsigned long     flags;
         int               rc;
 
@@ -2589,64 +2821,73 @@ kibnal_check_connreply (kib_conn_t *conn)
                 msgnob = MIN(cm_REP_priv_data_len, sizeof(msg));
                 memcpy(&msg, &reply->priv_data, msgnob);
 
-                rc = kibnal_unpack_msg(&msg, msgnob);
+                rc = kibnal_unpack_msg(&msg, conn->ibc_version, msgnob);
                 if (rc != 0) {
-                        CERROR("Can't unpack reply from "LPX64"\n",
-                               peer->ibp_nid);
+                        CERROR("Can't unpack reply from %s\n",
+                               libcfs_nid2str(peer->ibp_nid));
                         kibnal_connreq_done(conn, 1, rc);
                         return;
                 }
 
                 if (msg.ibm_type != IBNAL_MSG_CONNACK ) {
-                        CERROR("Unexpected message type %d from "LPX64"\n",
-                               msg.ibm_type, peer->ibp_nid);
+                        CERROR("Unexpected message type %d from %s\n",
+                               msg.ibm_type, libcfs_nid2str(peer->ibp_nid));
                         kibnal_connreq_done(conn, 1, -EPROTO);
                         return;
                 }
 
                 if (msg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) {
-                        CERROR(LPX64" has incompatible queue depth %d(%d wanted)\n",
-                               peer->ibp_nid, msg.ibm_u.connparams.ibcp_queue_depth,
+                        CERROR("%s has incompatible queue depth %d(%d wanted)\n",
+                               libcfs_nid2str(peer->ibp_nid), 
+                               msg.ibm_u.connparams.ibcp_queue_depth,
                                IBNAL_MSG_QUEUE_SIZE);
                         kibnal_connreq_done(conn, 1, -EPROTO);
                         return;
                 }
                 
                 if (msg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) {
-                        CERROR(LPX64" max message size %d too big (%d max)\n",
-                               peer->ibp_nid, msg.ibm_u.connparams.ibcp_max_msg_size, 
+                        CERROR("%s max message size %d too big (%d max)\n",
+                               libcfs_nid2str(peer->ibp_nid), 
+                               msg.ibm_u.connparams.ibcp_max_msg_size, 
                                IBNAL_MSG_SIZE);
                         kibnal_connreq_done(conn, 1, -EPROTO);
                         return;
                 }
 
                 if (msg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
-                        CERROR(LPX64" max frags %d too big (%d max)\n",
-                               peer->ibp_nid, msg.ibm_u.connparams.ibcp_max_frags, 
+                        CERROR("%s max frags %d too big (%d max)\n",
+                               libcfs_nid2str(peer->ibp_nid),
+                               msg.ibm_u.connparams.ibcp_max_frags, 
                                IBNAL_MAX_RDMA_FRAGS);
                         kibnal_connreq_done(conn, 1, -EPROTO);
                         return;
                 }
                 
                 read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
-                rc = (msg.ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid ||
-                      msg.ibm_dststamp != kibnal_data.kib_incarnation) ?
-                     -ESTALE : 0;
+                if (lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid,
+                                            msg.ibm_dstnid) &&
+                    msg.ibm_dststamp == kibnal_data.kib_incarnation)
+                        rc = 0;
+                else
+                        rc = -ESTALE;
                 read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
                 if (rc != 0) {
-                        CERROR("Stale connection reply from "LPX64"\n",
-                               peer->ibp_nid);
+                        CERROR("Stale connection reply from %s\n",
+                               libcfs_nid2str(peer->ibp_nid));
                         kibnal_connreq_done(conn, 1, rc);
                         return;
                 }
 
                 conn->ibc_incarnation = msg.ibm_srcstamp;
                 conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
+                conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE;
+                LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
+                         <= IBNAL_RX_MSGS);
                 
                 rc = kibnal_post_receives(conn);
                 if (rc != 0) {
-                        CERROR("Can't post receives for "LPX64"\n",
-                               peer->ibp_nid);
+                        CERROR("Can't post receives for %s\n",
+                               libcfs_nid2str(peer->ibp_nid));
                         kibnal_connreq_done(conn, 1, rc);
                         return;
                 }
@@ -2676,7 +2917,8 @@ kibnal_check_connreply (kib_conn_t *conn)
                         return;
                 }
 
-                CERROR("cm_accept "LPX64" failed: %d\n", peer->ibp_nid, cmrc);
+                CERROR("cm_accept %s failed: %d\n", 
+                       libcfs_nid2str(peer->ibp_nid), cmrc);
                 /* Back out of RTU: no callback coming */
                 kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY);
                 kibnal_conn_decref(conn);
@@ -2686,37 +2928,72 @@ kibnal_check_connreply (kib_conn_t *conn)
 
         if (cv->cv_conndata.status == cm_event_conn_reject) {
 
-                if (cv->cv_conndata.data.reject.reason != cm_rej_code_stale_conn) {
-                        CERROR("conn -> "LPX64" rejected: %d\n", peer->ibp_nid,
-                               cv->cv_conndata.data.reject.reason);
-                        kibnal_connreq_done(conn, 1, -ECONNREFUSED);
-                        return;
-                }
+                if (cv->cv_conndata.data.reject.reason == cm_rej_code_usr_rej) {
+                        unsigned char *bytes =
+                                cv->cv_conndata.data.reject.priv_data;
+                        int   magic   = (bytes[0]) |
+                                        (bytes[1] << 8) |
+                                        (bytes[2] << 16) |
+                                        (bytes[3] << 24);
+                        int   version = (bytes[4]) |
+                                        (bytes[5] << 8);
+                        int   why     = (bytes[6]);
+
+                        /* Expected proto/version: she just doesn't like me (or
+                         * ran out of resources) */
+                        if (magic == IBNAL_MSG_MAGIC &&
+                            version == conn->ibc_version) {
+                                CERROR("conn -> %s rejected: fatal error %d\n",
+                                       libcfs_nid2str(peer->ibp_nid), why);
+
+                                if (why == IBNAL_REJECT_CONN_RACE) 
+                                        kibnal_reconnect(conn, -EALREADY);
+                                else
+                                        kibnal_connreq_done(conn, 1, -ECONNREFUSED);
+                                return;
+                        }
+                        
+                        /* Fail unless it's worth retrying with an old proto
+                         * version */
+                        if (!(magic == IBNAL_MSG_MAGIC &&
+                              version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD &&
+                              conn->ibc_version == IBNAL_MSG_VERSION)) {
+                                CERROR("conn -> %s rejected: bad protocol "
+                                       "magic/ver %08x/%x why %d\n",
+                                       libcfs_nid2str(peer->ibp_nid),
+                                       magic, version, why);
+
+                                kibnal_connreq_done(conn, 1, -ECONNREFUSED);
+                                return;
+                        }
 
-                CWARN ("conn -> "LPX64" stale: retrying\n", peer->ibp_nid);
+                        conn->ibc_version = version;
+                        CWARN ("Connection to %s refused: "
+                               "retrying with old protocol version 0x%x\n", 
+                               libcfs_nid2str(peer->ibp_nid), version);
 
-                cep = cm_create_cep(cm_cep_transp_rc);
-                if (cep == NULL) {
-                        CERROR("Can't create new CEP\n");
-                        kibnal_connreq_done(conn, 1, -ENOMEM);
+                        kibnal_reconnect(conn, -ECONNREFUSED);
                         return;
-                }
-
-                cmrc = cm_cancel(conn->ibc_cep);
-                LASSERT (cmrc == cm_stat_success);
-                cmrc = cm_destroy_cep(conn->ibc_cep);
-                LASSERT (cmrc == cm_stat_success);
-
-                conn->ibc_cep = cep;
+                } else if (cv->cv_conndata.data.reject.reason == 
+                           cm_rej_code_stale_conn) {
+                        
+                        CWARN ("conn -> %s stale: retrying\n", 
+                               libcfs_nid2str(peer->ibp_nid));
 
-                /* retry connect */
-                kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP);
-                kibnal_connect_conn(conn);
-                return;
+                        kibnal_reconnect(conn, -ESTALE);
+                        return;
+                } else {
+                        CDEBUG(D_NETERROR, "conn -> %s rejected: reason %d\n",
+                               libcfs_nid2str(peer->ibp_nid),
+                               cv->cv_conndata.data.reject.reason);
+                        kibnal_connreq_done(conn, 1, -ECONNREFUSED);
+                        return;
+                }
+                /* NOT REACHED */
         }
 
-        CERROR("conn -> "LPX64" failed: %d\n", peer->ibp_nid,
-               cv->cv_conndata.status);
+        CDEBUG(D_NETERROR, "conn -> %s failed: %d\n", 
+               libcfs_nid2str(peer->ibp_nid), cv->cv_conndata.status);
         kibnal_connreq_done(conn, 1, -ECONNABORTED);
 }
 
@@ -2737,54 +3014,50 @@ kibnal_arp_done (kib_conn_t *conn)
         LASSERT (peer->ibp_arp_count > 0);
         
         if (cv->cv_arprc != ibat_stat_ok) {
-                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
-                peer->ibp_arp_count--;
-                if (peer->ibp_arp_count == 0) {
-                        /* final ARP attempt failed */
-                        write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
-                                                flags);
-                        CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n", 
-                               peer->ibp_nid, HIPQUAD(peer->ibp_ip), 
-                               cv->cv_arprc);
-                } else {
-                        /* Retry ARP: ibp_connecting++ so terminating conn
-                         * doesn't end peer's connection attempt */
-                        peer->ibp_connecting++;
-                        write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
-                                                flags);
-                        CWARN("Arp "LPX64"@%u.%u.%u.%u failed: %d "
-                              "(%d attempts left)\n", 
-                              peer->ibp_nid, HIPQUAD(peer->ibp_ip), 
-                              cv->cv_arprc, peer->ibp_arp_count);
-
-                        kibnal_schedule_peer_arp(peer);
-                }
-                kibnal_connreq_done(conn, 1, -ENETUNREACH);
-                return;
+                CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed: %d\n", 
+                       libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
+                       cv->cv_arprc);
+                goto failed;
         }
 
         if ((arp->mask & IBAT_PRI_PATH_VALID) != 0) {
-                CDEBUG(D_NET, "Got valid path for "LPX64"\n", peer->ibp_nid);
+                CDEBUG(D_NET, "Got valid path for %s\n",
+                       libcfs_nid2str(peer->ibp_nid));
 
                 *path = *arp->primary_path;
 
                 vvrc = base_gid2port_num(kibnal_data.kib_hca, &path->sgid,
                                          &cv->cv_port);
-                LASSERT (vvrc == vv_return_ok);
+                if (vvrc != vv_return_ok) {
+                        CWARN("base_gid2port_num failed for %s @ %u.%u.%u.%u: %d\n", 
+                              libcfs_nid2str(peer->ibp_nid),
+                              HIPQUAD(peer->ibp_ip), vvrc);
+                        goto failed;
+                }
 
                 vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
                                      &path->sgid, &cv->cv_sgid_index);
-                LASSERT (vvrc == vv_return_ok);
+                if (vvrc != vv_return_ok) {
+                        CWARN("gid2gid_index failed for %s @ %u.%u.%u.%u: %d\n", 
+                              libcfs_nid2str(peer->ibp_nid),
+                              HIPQUAD(peer->ibp_ip), vvrc);
+                        goto failed;
+                }
 
                 vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
                                        path->pkey, &cv->cv_pkey_index);
-                LASSERT (vvrc == vv_return_ok);
+                if (vvrc != vv_return_ok) {
+                        CWARN("pkey2pkey_index failed for %s @ %u.%u.%u.%u: %d\n", 
+                              libcfs_nid2str(peer->ibp_nid), 
+                              HIPQUAD(peer->ibp_ip), vvrc);
+                        goto failed;
+                }
 
                 path->mtu = IBNAL_IB_MTU;
 
         } else if ((arp->mask & IBAT_LID_VALID) != 0) {
-                CWARN("Creating new path record for "LPX64"@%u.%u.%u.%u\n",
-                      peer->ibp_nid, HIPQUAD(peer->ibp_ip));
+                CWARN("Creating new path record for %s @ %u.%u.%u.%u\n",
+                      libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
 
                 cv->cv_pkey_index = IBNAL_PKEY_IDX;
                 cv->cv_sgid_index = IBNAL_SGID_IDX;
@@ -2794,11 +3067,21 @@ kibnal_arp_done (kib_conn_t *conn)
 
                 vvrc = port_num2base_gid(kibnal_data.kib_hca, cv->cv_port,
                                          &path->sgid);
-                LASSERT (vvrc == vv_return_ok);
+                if (vvrc != vv_return_ok) {
+                        CWARN("port_num2base_gid failed for %s @ %u.%u.%u.%u: %d\n", 
+                              libcfs_nid2str(peer->ibp_nid),
+                              HIPQUAD(peer->ibp_ip), vvrc);
+                        goto failed;
+                }
 
                 vvrc = port_num2base_lid(kibnal_data.kib_hca, cv->cv_port,
                                          &path->slid);
-                LASSERT (vvrc == vv_return_ok);
+                if (vvrc != vv_return_ok) {
+                        CWARN("port_num2base_lid failed for %s @ %u.%u.%u.%u: %d\n", 
+                              libcfs_nid2str(peer->ibp_nid), 
+                              HIPQUAD(peer->ibp_ip), vvrc);
+                        goto failed;
+                }
 
                 path->dgid          = arp->gid;
                 path->sl            = IBNAL_SERVICE_LEVEL;
@@ -2809,10 +3092,9 @@ kibnal_arp_done (kib_conn_t *conn)
                 path->pkey          = IBNAL_PKEY;
                 path->traffic_class = IBNAL_TRAFFIC_CLASS;
         } else {
-                CERROR("Can't Arp "LPX64"@%u.%u.%u.%u: no PATH or LID\n", 
-                       peer->ibp_nid, HIPQUAD(peer->ibp_ip));
-                kibnal_connreq_done(conn, 1, -ENETUNREACH);
-                return;
+                CWARN("Arp for %s @ %u.%u.%u.%u returned neither PATH nor LID\n",
+                      libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
+                goto failed;
         }
 
         rc = kibnal_set_qp_state(conn, vv_qp_state_init);
@@ -2822,27 +3104,53 @@ kibnal_arp_done (kib_conn_t *conn)
 
         /* do the actual connection request */
         kibnal_connect_conn(conn);
+        return;
+
+ failed:
+        write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+        peer->ibp_arp_count--;
+        if (peer->ibp_arp_count == 0) {
+                /* final ARP attempt failed */
+                write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
+                                        flags);
+                CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (final attempt)\n", 
+                       libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
+        } else {
+                /* Retry ARP: ibp_connecting++ so terminating conn
+                 * doesn't end peer's connection attempt */
+                peer->ibp_connecting++;
+                write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
+                                        flags);
+                CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (%d attempts left)\n",
+                       libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), 
+                       peer->ibp_arp_count);
+                
+                kibnal_schedule_peer_arp(peer);
+        }
+        kibnal_connreq_done(conn, 1, -ENETUNREACH);
 }
 
 void
 kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg)
 {
         /* CAVEAT EMPTOR: tasklet context */
-        kib_conn_t      *conn = (kib_conn_t *)arg;
-        kib_peer_t      *peer = conn->ibc_peer;
+        kib_peer_t *peer;
+        kib_conn_t *conn = (kib_conn_t *)arg;
+
+        LASSERT (conn != NULL);
+        LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
+
+        peer = conn->ibc_peer;
 
         if (arprc != ibat_stat_ok)
-                CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n",
-                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), arprc);
+                CDEBUG(D_NETERROR, "Arp %s at %u.%u.%u.%u failed: %d\n",
+                       libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), arprc);
         else
-                CDEBUG(D_NET, "Arp "LPX64"@%u.%u.%u.%u OK: LID %s PATH %s\n",
-                       peer->ibp_nid, HIPQUAD(peer->ibp_ip), 
+                CDEBUG(D_NET, "Arp %s at %u.%u.%u.%u OK: LID %s PATH %s\n",
+                       libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), 
                        (arp_data->mask & IBAT_LID_VALID) == 0 ? "invalid" : "valid",
                        (arp_data->mask & IBAT_PRI_PATH_VALID) == 0 ? "invalid" : "valid");
 
-        LASSERT (conn != NULL);
-        LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
-
         conn->ibc_connvars->cv_arprc = arprc;
         if (arprc == ibat_stat_ok)
                 conn->ibc_connvars->cv_arp = *arp_data;
@@ -2865,18 +3173,18 @@ kibnal_arp_peer (kib_peer_t *peer)
 
         cep = cm_create_cep(cm_cep_transp_rc);
         if (cep == NULL) {
-                CERROR ("Can't create cep for conn->"LPX64"\n",
-                        peer->ibp_nid);
-                kibnal_peer_connect_failed(peer, 1);
+                CERROR ("Can't create cep for conn->%s\n",
+                        libcfs_nid2str(peer->ibp_nid));
+                kibnal_peer_connect_failed(peer, 1, -ENOMEM);
                 return;
         }
 
         conn = kibnal_create_conn(cep);
         if (conn == NULL) {
-                CERROR ("Can't allocate conn->"LPX64"\n",
-                        peer->ibp_nid);
+                CERROR ("Can't allocate conn->%s\n",
+                        libcfs_nid2str(peer->ibp_nid));
                 cm_destroy_cep(cep);
-                kibnal_peer_connect_failed(peer, 1);
+                kibnal_peer_connect_failed(peer, 1, -ENOMEM);
                 return;
         }
 
@@ -2912,39 +3220,41 @@ kibnal_arp_peer (kib_peer_t *peer)
 }
 
 int
-kibnal_conn_timed_out (kib_conn_t *conn)
+kibnal_check_txs (kib_conn_t *conn, struct list_head *txs)
 {
         kib_tx_t          *tx;
         struct list_head  *ttmp;
+        int                timed_out = 0;
 
         spin_lock(&conn->ibc_lock);
 
-        list_for_each (ttmp, &conn->ibc_tx_queue) {
+        list_for_each (ttmp, txs) {
                 tx = list_entry (ttmp, kib_tx_t, tx_list);
 
-                LASSERT (tx->tx_queued);
-
-                if (time_after_eq (jiffies, tx->tx_deadline)) {
-                        spin_unlock(&conn->ibc_lock);
-                        return 1;
+                if (txs == &conn->ibc_active_txs) {
+                        LASSERT (!tx->tx_queued);
+                        LASSERT (tx->tx_waiting || tx->tx_sending != 0);
+                } else {
+                        LASSERT (tx->tx_queued);
                 }
-        }
-
-        list_for_each (ttmp, &conn->ibc_active_txs) {
-                tx = list_entry (ttmp, kib_tx_t, tx_list);
-
-                LASSERT (!tx->tx_queued);
-                LASSERT (tx->tx_waiting ||
-                         tx->tx_sending != 0);
 
                 if (time_after_eq (jiffies, tx->tx_deadline)) {
-                        spin_unlock(&conn->ibc_lock);
-                        return 1;
+                        timed_out = 1;
+                        break;
                 }
         }
 
         spin_unlock(&conn->ibc_lock);
-        return 0;
+        return timed_out;
+}
+
+int
+kibnal_conn_timed_out (kib_conn_t *conn)
+{
+        return  kibnal_check_txs(conn, &conn->ibc_tx_queue) ||
+                kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) ||
+                kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) ||
+                kibnal_check_txs(conn, &conn->ibc_active_txs);
 }
 
 void
@@ -2985,11 +3295,11 @@ kibnal_check_conns (int idx)
                         
                         kibnal_conn_addref(conn); /* 1 ref for me... */
 
-                        read_unlock_irqrestore(&kibnal_data.kib_global_lock, 
+                        read_unlock_irqrestore(&kibnal_data.kib_global_lock,
                                                flags);
 
-                        CERROR("Timed out RDMA with "LPX64"\n",
-                               peer->ibp_nid);
+                        CERROR("Timed out RDMA with %s\n",
+                               libcfs_nid2str(peer->ibp_nid));
 
                         kibnal_close_conn (conn, -ETIMEDOUT);
                         kibnal_conn_decref(conn); /* ...until here */
@@ -3037,7 +3347,7 @@ kibnal_disconnect_conn (kib_conn_t *conn)
         write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
 
         cm_cancel(conn->ibc_cep);
-        kibnal_pause(HZ/10);
+        cfs_pause(cfs_time_seconds(1)/10);
 
         if (!conn->ibc_disconnect)              /* CM callback will never happen now */
                 kibnal_conn_decref(conn);
@@ -3062,13 +3372,13 @@ kibnal_connd (void *arg)
         int                peer_index = 0;
         unsigned long      deadline = jiffies;
         
-        kportal_daemonize ("kibnal_connd");
-        kportal_blockallsigs ();
+        cfs_daemonize ("kibnal_connd");
+        cfs_block_allsigs ();
 
         init_waitqueue_entry (&wait, current);
         kibnal_data.kib_connd = current;
 
-        spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
 
         while (!kibnal_data.kib_shutdown) {
 
@@ -3096,7 +3406,7 @@ kibnal_connd (void *arg)
                         dropped_lock = 1;
 
                         kibnal_recv_connreq(pcr->pcr_cep, &pcr->pcr_cmreq);
-                        PORTAL_FREE(pcr, sizeof(*pcr));
+                        LIBCFS_FREE(pcr, sizeof(*pcr));
 
                         spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
                 }
@@ -3167,9 +3477,9 @@ kibnal_connd (void *arg)
                          * connection within (n+1)/n times the timeout
                          * interval. */
 
-                        if (kibnal_tunables.kib_io_timeout > n * p)
+                        if (*kibnal_tunables.kib_timeout > n * p)
                                 chunk = (chunk * n * p) / 
-                                        kibnal_tunables.kib_io_timeout;
+                                        *kibnal_tunables.kib_timeout;
                         if (chunk == 0)
                                 chunk = 1;
 
@@ -3216,8 +3526,6 @@ kibnal_cq_callback (unsigned long unused_context)
 {
         unsigned long    flags;
 
-        CDEBUG(D_NET, "!!\n");
-
         spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
         kibnal_data.kib_ready = 1;
         wake_up(&kibnal_data.kib_sched_waitq);
@@ -3239,8 +3547,8 @@ kibnal_scheduler(void *arg)
         int             busy_loops = 0;
 
         snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
-        kportal_daemonize(name);
-        kportal_blockallsigs();
+        cfs_daemonize(name);
+        cfs_block_allsigs();
 
         init_waitqueue_entry(&wait, current);
 
@@ -3333,8 +3641,8 @@ kibnal_scheduler(void *arg)
                                  * I give a scheduler on another CPU a chance
                                  * to get the final SEND completion, so the tx
                                  * descriptor can get freed as I inspect it. */
-                                CERROR ("RDMA failed: %d\n", 
-                                        wc.completion_status);
+                                CDEBUG(D_NETERROR, "RDMA failed: %d\n", 
+                                       wc.completion_status);
                                 break;
 
                         default:
@@ -3348,7 +3656,7 @@ kibnal_scheduler(void *arg)
                 /* Nothing to do; sleep... */
 
                 set_current_state(TASK_INTERRUPTIBLE);
-                add_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
+                add_wait_queue_exclusive(&kibnal_data.kib_sched_waitq, &wait);
                 spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
                                        flags);
 
@@ -3364,13 +3672,3 @@ kibnal_scheduler(void *arg)
         kibnal_thread_fini();
         return (0);
 }
-
-
-lib_nal_t kibnal_lib = {
-        .libnal_data = &kibnal_data,      /* NAL private data */
-        .libnal_send = kibnal_send,
-        .libnal_send_pages = kibnal_send_pages,
-        .libnal_recv = kibnal_recv,
-        .libnal_recv_pages = kibnal_recv_pages,
-        .libnal_dist = kibnal_dist
-};
diff --git a/lnet/klnds/viblnd/viblnd_modparams.c b/lnet/klnds/viblnd/viblnd_modparams.c
new file mode 100644 (file)
index 0000000..1179d72
--- /dev/null
@@ -0,0 +1,237 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2004 Cluster File Systems, Inc.
+ *   Author: Eric Barton <eric@bartonsoftware.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#include "viblnd.h"
+
+static int service_number = 0x11b9a2;
+CFS_MODULE_PARM(service_number, "i", int, 0444,
+                "IB service number");
+
+static int min_reconnect_interval = 1;
+CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644,
+               "minimum connection retry interval (seconds)");
+
+static int max_reconnect_interval = 60;
+CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644,
+               "maximum connection retry interval (seconds)");
+
+static int concurrent_peers = 1152;
+CFS_MODULE_PARM(concurrent_peers, "i", int, 0444,
+               "maximum number of peers that may connect");
+
+static int cksum = 0;
+CFS_MODULE_PARM(cksum, "i", int, 0644,
+               "set non-zero to enable message (not RDMA) checksums");
+
+static int timeout = 50;
+CFS_MODULE_PARM(timeout, "i", int, 0644,
+               "timeout (seconds)");
+
+static int ntx = 256;
+CFS_MODULE_PARM(ntx, "i", int, 0444,
+               "# of message descriptors");
+
+static int credits = 128;
+CFS_MODULE_PARM(credits, "i", int, 0444,
+               "# concurrent sends");
+
+static int peer_credits = 8;
+CFS_MODULE_PARM(peer_credits, "i", int, 0444,
+               "# concurrent sends to 1 peer");
+
+static int arp_retries = 3;
+CFS_MODULE_PARM(arp_retries, "i", int, 0644,
+               "# of times to retry ARP");
+
+static char *hca_basename = "InfiniHost";
+CFS_MODULE_PARM(hca_basename, "s", charp, 0444,
+                "HCA base name");
+
+static char *ipif_basename = "ipoib";
+CFS_MODULE_PARM(ipif_basename, "s", charp, 0444,
+                "IPoIB interface base name");
+
+static int local_ack_timeout = 0x12;
+CFS_MODULE_PARM(local_ack_timeout, "i", int, 0644,
+                "ACK timeout for low-level 'sends'");
+
+static int retry_cnt = 7;
+CFS_MODULE_PARM(retry_cnt, "i", int, 0644,
+                "Retransmissions when no ACK received");
+
+static int rnr_cnt = 6;
+CFS_MODULE_PARM(rnr_cnt, "i", int, 0644,
+                "RNR retransmissions");
+
+static int rnr_nak_timer = 0x10;
+CFS_MODULE_PARM(rnr_nak_timer, "i", int, 0644,
+                "RNR retransmission interval");
+
+static int keepalive = 100;
+CFS_MODULE_PARM(keepalive, "i", int, 0644,
+                "Idle time in seconds before sending a keepalive");
+
+static int concurrent_sends = IBNAL_RX_MSGS;
+CFS_MODULE_PARM(concurrent_sends, "i", int, 0644,
+                "send work-queue sizing");
+
+#if IBNAL_USE_FMR
+static int fmr_remaps = 1000;
+CFS_MODULE_PARM(fmr_remaps, "i", int, 0444,
+                "FMR mappings allowed before unmap");
+#endif
+
+kib_tunables_t kibnal_tunables = {
+        .kib_service_number         = &service_number,
+        .kib_min_reconnect_interval = &min_reconnect_interval,
+        .kib_max_reconnect_interval = &max_reconnect_interval,
+        .kib_concurrent_peers       = &concurrent_peers,
+        .kib_cksum                  = &cksum,
+        .kib_timeout                = &timeout,
+        .kib_ntx                    = &ntx,
+        .kib_credits                = &credits,
+        .kib_peercredits            = &peer_credits,
+        .kib_arp_retries            = &arp_retries,
+        .kib_hca_basename           = &hca_basename,
+        .kib_ipif_basename          = &ipif_basename,
+        .kib_local_ack_timeout      = &local_ack_timeout,
+        .kib_retry_cnt              = &retry_cnt,
+        .kib_rnr_cnt                = &rnr_cnt,
+        .kib_rnr_nak_timer          = &rnr_nak_timer,
+        .kib_keepalive              = &keepalive,
+        .kib_concurrent_sends       = &concurrent_sends,
+#if IBNAL_USE_FMR
+        .kib_fmr_remaps             = &fmr_remaps,
+#endif
+};
+
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
+
+static char hca_basename_space[32];
+static char ipif_basename_space[32];
+
+static ctl_table kibnal_ctl_table[] = {
+       {1, "service_number", &service_number, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {2, "min_reconnect_interval", &min_reconnect_interval, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {3, "max_reconnect_interval", &max_reconnect_interval, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {4, "concurrent_peers", &concurrent_peers, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {5, "cksum", &cksum, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {6, "timeout", &timeout, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {7, "ntx", &ntx, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {8, "credits", &credits, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {9, "peer_credits", &peer_credits, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+       {10, "arp_retries", &arp_retries, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {11, "hca_basename", hca_basename_space, 
+        sizeof(hca_basename_space), 0444, NULL, &proc_dostring},
+       {12, "ipif_basename", ipif_basename_space, 
+        sizeof(ipif_basename_space), 0444, NULL, &proc_dostring},
+       {13, "local_ack_timeout", &local_ack_timeout, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {14, "retry_cnt", &retry_cnt, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {15, "rnr_cnt", &rnr_cnt, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {16, "rnr_nak_timer", &rnr_nak_timer, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {17, "keepalive", &keepalive, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+       {18, "concurrent_sends", &concurrent_sends, 
+        sizeof(int), 0644, NULL, &proc_dointvec},
+#if IBNAL_USE_FMR
+       {19, "fmr_remaps", &fmr_remaps, 
+        sizeof(int), 0444, NULL, &proc_dointvec},
+#endif        
+       {0}
+};
+
+static ctl_table kibnal_top_ctl_table[] = {
+       {203, "vibnal", NULL, 0, 0555, kibnal_ctl_table},
+       {0}
+};
+
+void
+kibnal_initstrtunable(char *space, char *str, int size)
+{
+        strncpy(space, str, size);
+        space[size-1] = 0;
+}
+
+int
+kibnal_tunables_init ()
+{
+        kibnal_initstrtunable(hca_basename_space, hca_basename,
+                              sizeof(hca_basename_space));
+        kibnal_initstrtunable(ipif_basename_space, ipif_basename,
+                              sizeof(ipif_basename_space));
+
+       kibnal_tunables.kib_sysctl =
+               register_sysctl_table(kibnal_top_ctl_table, 0);
+       
+       if (kibnal_tunables.kib_sysctl == NULL)
+               CWARN("Can't setup /proc tunables\n");
+
+        if (*kibnal_tunables.kib_concurrent_sends > IBNAL_RX_MSGS)
+                *kibnal_tunables.kib_concurrent_sends = IBNAL_RX_MSGS;
+        if (*kibnal_tunables.kib_concurrent_sends < IBNAL_MSG_QUEUE_SIZE)
+                *kibnal_tunables.kib_concurrent_sends = IBNAL_MSG_QUEUE_SIZE;
+
+       return 0;
+}
+
+void
+kibnal_tunables_fini ()
+{
+       if (kibnal_tunables.kib_sysctl != NULL)
+               unregister_sysctl_table(kibnal_tunables.kib_sysctl);
+}
+
+#else
+
+int
+kibnal_tunables_init ()
+{
+       return 0;
+}
+
+void
+kibnal_tunables_fini ()
+{
+}
+
+#endif
+       
+               
+               
+
+       
+               
index 6dacf6d..26242c1 100644 (file)
@@ -12,7 +12,7 @@ typedef struct kib_connparams
 
 typedef struct
 {
-        ptl_hdr_t         ibim_hdr;             /* portals header */
+        lnet_hdr_t        ibim_hdr;             /* portals header */
         char              ibim_payload[0];      /* piggy-backed payload */
 } WIRE_ATTR kib_immediate_msg_t;
 
@@ -48,7 +48,7 @@ typedef struct
 
 typedef struct
 {
-        ptl_hdr_t         ibprm_hdr;            /* portals header */
+        lnet_hdr_t        ibprm_hdr;            /* portals header */
         __u64             ibprm_cookie;         /* opaque completion cookie */
 } WIRE_ATTR kib_putreq_msg_t;
 
@@ -61,7 +61,7 @@ typedef struct
 
 typedef struct
 {
-        ptl_hdr_t         ibgm_hdr;             /* portals header */
+        lnet_hdr_t        ibgm_hdr;             /* portals header */
         __u64             ibgm_cookie;          /* opaque completion cookie */
         kib_rdma_desc_t   ibgm_rd;              /* rdma descriptor */
 } WIRE_ATTR kib_get_msg_t;
@@ -98,13 +98,11 @@ typedef struct
         } WIRE_ATTR ibm_u;
 } WIRE_ATTR kib_msg_t;
 
-#define IBNAL_MSG_MAGIC       0x0be91b91        /* unique magic */
+#define IBNAL_MSG_MAGIC LNET_PROTO_VIB_MAGIC   /* unique magic */
 
-#if IBNAL_USE_FMA                              /* ensure version changes on FMA */
-#define IBNAL_MSG_VERSION           0x11
-#else
-#define IBNAL_MSG_VERSION           0x10
-#endif
+#define IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD 0x10 /* previous version */
+
+#define IBNAL_MSG_VERSION           0x11       /* current version */
 
 #define IBNAL_MSG_CONNREQ           0xc0        /* connection request */
 #define IBNAL_MSG_CONNACK           0xc1        /* connection acknowledge */
@@ -116,3 +114,8 @@ typedef struct
 #define IBNAL_MSG_PUT_DONE          0xd5        /* completion (src->sink) */
 #define IBNAL_MSG_GET_REQ           0xd6        /* getreq (sink->src) */
 #define IBNAL_MSG_GET_DONE          0xd7        /* completion (src->sink: all OK) */
+
+/* connection rejection reasons */
+#define IBNAL_REJECT_CONN_RACE       0          /* You lost connection race */
+#define IBNAL_REJECT_NO_RESOURCES    1          /* Out of memory/conns etc */
+#define IBNAL_REJECT_FATAL           2          /* Anything else */
index d42171d..5a0e060 100644 (file)
@@ -5,11 +5,19 @@
 #include <string.h>
 #include <sys/types.h>
 #include <sys/wait.h>
-#include <portals/api-support.h>
-#include <portals/lib-types.h>
+
+#include <lnet/api-support.h>
+
+/* This ghastly hack to allows me to include lib-types.h It doesn't affect any
+ * assertions generated here (but fails-safe if it ever does) */
+typedef struct {
+        int     counter;
+} atomic_t;
+
+#include <lnet/lib-types.h>
 
 #define IBNAL_USE_FMR 1
-#include "vibnal_wire.h"
+#include "viblnd_wire.h"
 
 #ifndef HAVE_STRNLEN
 #define strnlen(s, i) strlen(s)
@@ -146,6 +154,10 @@ main (int argc, char **argv)
         CHECK_DEFINE (IBNAL_MSG_GET_REQ);
         CHECK_DEFINE (IBNAL_MSG_GET_DONE);
 
+        CHECK_DEFINE (IBNAL_REJECT_CONN_RACE);
+        CHECK_DEFINE (IBNAL_REJECT_NO_RESOURCES);
+        CHECK_DEFINE (IBNAL_REJECT_FATAL);
+
         CHECK_STRUCT (kib_connparams_t);
         CHECK_MEMBER (kib_connparams_t, ibcp_queue_depth);
         CHECK_MEMBER (kib_connparams_t, ibcp_max_msg_size);
index 7e3cc08..aaf9b2f 100644 (file)
        <string>1.0.0</string>
        <key>OSBundleLibraries</key>
        <dict>
-               <key>com.apple.kernel.bsd</key>
-               <string>1.1</string>
-               <key>com.apple.kernel.iokit</key>
-               <string>1.0.0b1</string>
-               <key>com.apple.kernel.mach</key>
-               <string>1.0.0b1</string>
+               <key>com.apple.kpi.bsd</key>
+               <string>8.0.0b1</string>
+               <key>com.apple.kpi.libkern</key>
+               <string>8.0.0b1</string>
+               <key>com.apple.kpi.mach</key>
+               <string>8.0.0b1</string>
+               <key>com.apple.kpi.unsupported</key>
+               <string>8.0.0b1</string>
        </dict>
 </dict>
 </plist>
index aaaad93..0940a56 100644 (file)
@@ -2,7 +2,7 @@ MODULES = libcfs
 
 libcfs-linux-objs := linux-tracefile.o linux-debug.o
 libcfs-linux-objs += linux-prim.o linux-mem.o
-libcfs-linux-objs += linux-fs.o linux-sync.o
+libcfs-linux-objs += linux-fs.o linux-sync.o linux-tcpip.o
 libcfs-linux-objs += linux-lwt.o linux-proc.o linux-curproc.o
 libcfs-linux-objs += linux-utils.o linux-module.o
 
@@ -24,10 +24,10 @@ sources:
 
 endif
 
-libcfs-all-objs := debug.o lwt.o module.o tracefile.o watchdog.o
+libcfs-all-objs := debug.o nidstrings.o lwt.o module.o tracefile.o watchdog.o
 
 libcfs-objs := $(libcfs-linux-objs) $(libcfs-all-objs)
 
-EXTRA_PRE_CFLAGS := -I@LUSTRE@/../portals/libcfs
+EXTRA_PRE_CFLAGS := -I@LUSTRE@/../lnet/libcfs
 
 @INCLUDE_RULES@
index a818ab8..18381c1 100644 (file)
@@ -9,6 +9,13 @@ SUBDIRS += darwin
 endif
 DIST_SUBDIRS := $(SUBDIRS)
 
+if LIBLUSTRE
+noinst_LIBRARIES= libcfs.a
+libcfs_a_SOURCES= debug.c user-prim.c user-lock.c
+libcfs_a_CPPFLAGS = $(LLCPPFLAGS)
+libcfs_a_CFLAGS = $(LLCFLAGS)
+endif
+
 if MODULES
 
 if LINUX
@@ -18,12 +25,12 @@ endif
 if DARWIN
 macos_PROGRAMS := libcfs
 
-nodist_libcfs_SOURCES := debug.c module.c tracefile.c               \
-       darwin/darwin-debug.c darwin/darwin-fs.c darwin/darwin-mem.c \
-       darwin/darwin-module.c darwin/darwin-prim.c                  \
-       darwin/darwin-proc.c darwin/darwin-tracefile.c               \
-       darwin/darwin-utils.c darwin/darwin-sync.c                   \
-       darwin/darwin-curproc.c user-prim.c user-lock.c
+nodist_libcfs_SOURCES := darwin/darwin-sync.c darwin/darwin-mem.c      \
+       darwin/darwin-prim.c darwin/darwin-fs.c darwin/darwin-curproc.c \
+       darwin/darwin-tcpip.c darwin/darwin-utils.c                     \
+       darwin/darwin-debug.c darwin/darwin-proc.c                      \
+       darwin/darwin-tracefile.c darwin/darwin-module.c                \
+       debug.c module.c tracefile.c nidstrings.c watchdog.c
 
 libcfs_CFLAGS := $(EXTRA_KCFLAGS)
 libcfs_LDFLAGS := $(EXTRA_KLDFLAGS)
@@ -41,6 +48,5 @@ install-data-hook: $(install_data_hook)
 
 EXTRA_DIST := Info.plist
 
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@  linux-*.c 
-MOSTLYCLEANFILES += linux/*.o darwin/*.o libcfs
-DIST_SOURCES := $(libcfs-all-objs:%.o=%.c) tracefile.h
+MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ linux-*.c linux/*.o darwin/*.o libcfs
+DIST_SOURCES := $(libcfs-all-objs:%.o=%.c) tracefile.h user-prim.c user-lock.c
index 8e77294..3f2077b 100644 (file)
@@ -8,4 +8,5 @@ EXTRA_DIST := \
        darwin-fs.c \
        darwin-prim.c \
        darwin-tracefile.c \
-       darwin-curproc.c
+       darwin-curproc.c \
+       darwin-tcpip.c
index d930051..e12394e 100644 (file)
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 
 #include <libcfs/libcfs.h>
 #include <libcfs/kp30.h>
 
 /*
- * Implementation of cfs_curproc API (see portals/include/libcfs/curproc.h)
+ * Implementation of cfs_curproc API (see lnet/include/libcfs/curproc.h)
  * for XNU kernel.
  */
 
 static inline struct ucred *curproc_ucred(void)
 {
+#ifdef __DARWIN8__
+        return proc_ucred(current_proc());
+#else
         return current_proc()->p_cred->pc_ucred;
+#endif
 }
 
 uid_t  cfs_curproc_uid(void)
@@ -46,17 +50,30 @@ gid_t  cfs_curproc_gid(void)
 
 uid_t  cfs_curproc_fsuid(void)
 {
+#ifdef __DARWIN8__
+        return curproc_ucred()->cr_ruid;
+#else
         return current_proc()->p_cred->p_ruid;
+#endif
 }
 
 gid_t  cfs_curproc_fsgid(void)
 {
+#ifdef __DARWIN8__
+        return curproc_ucred()->cr_rgid;
+#else
         return current_proc()->p_cred->p_rgid;
+#endif
 }
 
 pid_t  cfs_curproc_pid(void)
 {
+#ifdef __DARWIN8__
+        /* no pid for each thread, return address of thread struct */
+        return (pid_t)current_thread();
+#else
         return current_proc()->p_pid;
+#endif
 }
 
 int    cfs_curproc_groups_nr(void)
@@ -94,17 +111,40 @@ void   cfs_curproc_groups_dump(gid_t *array, int size)
 
 mode_t cfs_curproc_umask(void)
 {
+#ifdef __DARWIN8__
+        /*
+         * XXX Liang:
+         *
+         * fd_cmask is not available in kexts, so we just assume
+         * everything is permitted.
+         */
+        return -1;
+#else
         return current_proc()->p_fd->fd_cmask;
+#endif
 }
 
 char  *cfs_curproc_comm(void)
 {
+#ifdef __DARWIN8__
+        /*
+         * Writing to proc->p_comm is not permitted in Darwin8,
+         * because proc_selfname() only returns a copy of proc->p_comm,
+         * so this function does not really work when the user tries
+         * to change the comm of the current process.
+         */
+        static char     pcomm[MAXCOMLEN+1];
+
+        proc_selfname(pcomm, MAXCOMLEN+1);
+        return pcomm;
+#else
         return current_proc()->p_comm;
+#endif
 }
 
 cfs_kernel_cap_t cfs_curproc_cap_get(void)
 {
-        return 0;
+        return -1;
 }
 
 void cfs_curproc_cap_set(cfs_kernel_cap_t cap)
index 970c5b9..2152d40 100644 (file)
@@ -1,25 +1,77 @@
-# define DEBUG_SUBSYSTEM S_PORTALS
+# define DEBUG_SUBSYSTEM S_LNET
 
 #include <libcfs/kp30.h>
 #include <libcfs/libcfs.h>
 #include "tracefile.h"
 
-void portals_debug_dumpstack(cfs_task_t *tsk)
+void libcfs_debug_dumpstack(cfs_task_t *tsk)
 { 
        return;
 }
 
-cfs_task_t *portals_current(void)
-{ 
-       return cfs_current();
+void libcfs_run_lbug_upcall(char *file, const char *fn, const int line)
+{
+}
+
+void lbug_with_loc(char *file, const char *func, const int line)
+{
+        libcfs_catastrophe = 1;
+        CEMERG("LBUG: pid: %u thread: %#x\n",
+              (unsigned)cfs_curproc_pid(), (unsigned)current_thread());
+        libcfs_debug_dumplog();
+        libcfs_run_lbug_upcall(file, func, line);
+        while (1)
+                cfs_schedule();
+
+       /* panic("lbug_with_loc(%s, %s, %d)", file, func, line) */
 }
 
-int portals_arch_debug_init(unsigned long bufsize)
+#if ENTRY_NESTING_SUPPORT
+
+static inline struct cfs_debug_data *__current_cdd(void)
 {
-       return 0;
+       struct cfs_debug_data *cdd;
+
+       cdd = (struct cfs_debug_data *)current_uthread()->uu_nlminfo;
+       if (cdd != NULL &&
+           cdd->magic1 == CDD_MAGIC1 && cdd->magic2 == CDD_MAGIC2 &&
+           cdd->nesting_level < 1000)
+               return cdd;
+       else
+               return NULL;
 }
 
-int portals_arch_debug_cleanup(void)
+static inline void __current_cdd_set(struct cfs_debug_data *cdd)
 {
-       return 0;
+       current_uthread()->uu_nlminfo = (void *)cdd;
+}
+
+void __entry_nesting(struct cfs_debug_data *child)
+{
+       struct cfs_debug_data *parent;
+
+       parent = __current_cdd();
+       if (parent != NULL) {
+               child->parent        = parent;
+               child->nesting_level = parent->nesting_level + 1;
+       }
+       __current_cdd_set(child);
+}
+
+void __exit_nesting(struct cfs_debug_data *child)
+{
+       __current_cdd_set(child->parent);
+}
+
+unsigned int __current_nesting_level(void)
+{
+       struct cfs_debug_data *cdd;
+
+       cdd = __current_cdd();
+       if (cdd != NULL)
+               return cdd->nesting_level;
+       else
+               return 0;
 }
+/* ENTRY_NESTING_SUPPORT */
+#endif
index 5b0f44c..45f37df 100644 (file)
 #include <sys/file.h>
 #include <sys/malloc.h>
 #include <sys/conf.h>
-#include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/uio.h>
 #include <sys/filedesc.h>
 #include <sys/namei.h>
 
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 
 #include <libcfs/libcfs.h>
 #include <libcfs/kp30.h>
  *
  * Public functions
  */
+
+#ifdef __DARWIN8__
+#include <sys/vnode.h>
+
+extern int vn_rdwr(enum uio_rw, vnode_t, caddr_t, int, off_t, enum uio_seg, int, kauth_cred_t, int *, proc_t);
+
+/* vnode_size() is not exported */
+static errno_t
+vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx)
+{
+        struct vnode_attr       va;
+        int                     error; 
+        
+        VATTR_INIT(&va);
+        VATTR_WANTED(&va, va_data_size);
+        error = vnode_getattr(vp, &va, ctx);
+        if (!error)
+                *sizep = va.va_data_size;
+        return(error);
+}
+
+/*
+ * XXX Liang:
+ *
+ * kern_file_*() are not safe for multiple threads now;
+ * however, we need them only for tracefiled, so it's
+ * not so important to make them MT-safe.
+ */
+int
+kern_file_size(struct cfs_kern_file *fp, off_t *psize) 
+{
+        int     error;
+        off_t   size;
+
+        error = vnode_size(fp->f_vp, &size, fp->f_ctxt);
+        if (error) 
+                return error;
+
+        if (psize)
+                *psize = size;
+        return 0;
+}
+
+struct cfs_kern_file *
+kern_file_open(const char * filename, int uflags, int mode, int *err)
+{
+        struct cfs_kern_file    *fp;
+        vnode_t         vp;
+        int             error;
+
+        fp = (struct cfs_kern_file *)_MALLOC(sizeof(struct cfs_kern_file), M_TEMP, M_WAITOK);
+        if (fp == NULL) {
+                if (err != NULL)
+                        *err = -ENOMEM;
+                return NULL;
+        }
+        fp->f_flags = FFLAGS(uflags);
+        fp->f_ctxt = vfs_context_create(NULL);
+
+        if ((error = vnode_open(filename, fp->f_flags, 
+                                mode, 0, &vp, fp->f_ctxt))){
+                if (err != NULL)
+                        *err = -error;
+                _FREE(fp, M_TEMP);
+        } else {
+                if (err != NULL)
+                        *err = 0;
+                fp->f_vp = vp;
+        }
+
+        return fp;
+}
+
+int
+kern_file_close(struct cfs_kern_file *fp)
+{
+        vnode_close(fp->f_vp, fp->f_flags, fp->f_ctxt);
+        vfs_context_rele(fp->f_ctxt);
+        _FREE(fp, M_TEMP);
+
+        return 0;
+}
+
+int
+kern_file_read(struct cfs_kern_file *fp, void *buf, size_t nbytes, loff_t *pos)
+{
+        struct proc *p = current_proc();
+        int     resid;
+        int     error;
+
+        assert(buf != NULL);
+        assert(fp != NULL && fp->f_vp != NULL);
+
+        error = vn_rdwr(UIO_READ, fp->f_vp, buf, nbytes, *pos, 
+                        UIO_SYSSPACE32, 0, vfs_context_ucred(fp->f_ctxt), &resid, p);
+        if ((error) || (nbytes == resid)) {
+                if (!error)
+                        error = -EINVAL;
+                return error;
+        }
+        *pos += nbytes - resid;
+
+        return (int)(nbytes - resid);
+}
+
 int
-filp_node_size(struct file *fp, off_t *size)
+kern_file_write(struct cfs_kern_file *fp, void *buf, size_t nbytes, loff_t *pos)
+{
+        struct proc *p = current_proc();
+        int     resid;
+        int     error;
+
+        assert(buf != NULL);
+        assert(fp != NULL && fp->f_vp != NULL);
+
+        error = vn_rdwr(UIO_WRITE, fp->f_vp, buf, nbytes, *pos, 
+                        UIO_SYSSPACE32, 0, vfs_context_ucred(fp->f_ctxt), &resid, p);
+        if ((error) || (nbytes == resid)) {
+                if (!error)
+                        error = -EINVAL;
+                return error;
+        }
+        *pos += nbytes - resid;
+
+        return (int)(nbytes - resid);
+
+}
+
+int
+kern_file_sync (struct cfs_kern_file *fp)
+{
+        return VNOP_FSYNC(fp->f_vp, MNT_WAIT, fp->f_ctxt);
+}
+
+#else  /* !__DARWIN8__ */
+
+int
+kern_file_size(struct file *fp, off_t *size)
 {
         struct vnode *vp = (struct vnode *)fp->f_data;
         struct stat sb;
@@ -60,12 +195,11 @@ filp_node_size(struct file *fp, off_t *size)
 }
 
 cfs_file_t *
-filp_open(const char * filename, int flags, int mode, int *err)
+kern_file_open(const char * filename, int flags, int mode, int *err)
 {
        struct nameidata nd;
-       register cfs_file_t     *fp;
+       cfs_file_t      *fp;
        register struct vnode   *vp;
-       cfs_file_t              *nfp;
        int                     rc;
        extern struct fileops   vnops;
        extern int nfiles;
@@ -73,16 +207,16 @@ filp_open(const char * filename, int flags, int mode, int *err)
 
         CFS_CONE_IN;
        nfiles++;
-       MALLOC_ZONE(nfp, cfs_file_t *, sizeof(cfs_file_t), M_FILE, M_WAITOK|M_ZERO);
-       bzero(nfp, sizeof(cfs_file_t));
-       nfp->f_count = 1;
-       fp = nfp;
+       MALLOC_ZONE(fp, cfs_file_t *, sizeof(cfs_file_t), M_FILE, M_WAITOK|M_ZERO);
+       bzero(fp, sizeof(cfs_file_t));
+       fp->f_count = 1;
+        LIST_CIRCLE(fp, f_list);
        NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, (char *)filename, current_proc());
        if ((rc = vn_open(&nd, flags, mode)) != 0){
                 printf("filp_open failed at (%d)\n", rc);
                 if (err != NULL)
                         *err = rc;
-               ffree(fp);
+                FREE_ZONE(fp, sizeof *fp, M_FILE);
                 CFS_CONE_EX;
                return NULL;
        }
@@ -117,7 +251,7 @@ frele_internal(cfs_file_t *fp)
 }
 
 int
-filp_close (cfs_file_t *fp)
+kern_file_close (cfs_file_t *fp)
 {
        struct vnode    *vp;
         CFS_DECL_CONE_DATA;
@@ -159,21 +293,28 @@ extern void bwillwrite(void);
  * Write buffer to filp inside kernel
  */
 int
-filp_write (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos)
+kern_file_write (cfs_file_t *fp, void *buf, size_t nbyte, loff_t *pos)
 {
        struct uio auio;
        struct iovec aiov;
        struct proc *p = current_proc();
        long cnt, error = 0;
+        int flags = 0;
         CFS_DECL_CONE_DATA;
 
        aiov.iov_base = (void *)(uintptr_t)buf;
        aiov.iov_len = nbyte;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
-        if (pos != NULL)
+        if (pos != NULL) {
                auio.uio_offset = *pos;
-        else
+                /* 
+                 * Liang: If FOF_OFFSET is not set, vn_write()
+                 * will use fp->f_offset as the real offset.
+                 * The same applies in vn_read().
+                 */
+                flags |= FOF_OFFSET;
+        } else
                 auio.uio_offset = (off_t)-1;
        if (nbyte > INT_MAX)
                return (EINVAL);
@@ -186,7 +327,7 @@ filp_write (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos)
         CFS_CONE_IN;
        if (fp->f_type == DTYPE_VNODE)
                bwillwrite();   /* empty stuff now */
-       if ((error = fo_write(fp, &auio, fp->f_cred, 0, p))) {
+       if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) {
                if (auio.uio_resid != cnt && (error == ERESTART ||\
                    error == EINTR || error == EWOULDBLOCK))
                        error = 0;
@@ -200,7 +341,7 @@ filp_write (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos)
        else
                cnt -= auio.uio_resid;
         if (pos != NULL)
-                *pos = auio.uio_offset;
+                *pos += cnt;
        return cnt;
 }
 
@@ -208,21 +349,23 @@ filp_write (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos)
  * Read from filp inside kernel
  */
 int
-filp_read (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos)
+kern_file_read (cfs_file_t *fp, void *buf, size_t nbyte, loff_t *pos)
 {
        struct uio auio;
        struct iovec aiov;
        struct proc *p = current_proc();
        long cnt, error = 0;
+        int  flags = 0;
         CFS_DECL_CONE_DATA;
 
        aiov.iov_base = (caddr_t)buf;
        aiov.iov_len = nbyte;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
-        if (pos != NULL)
+        if (pos != NULL) {
                auio.uio_offset = *pos;
-        else
+                flags |= FOF_OFFSET;
+        } else
                 auio.uio_offset = (off_t)-1;
        if (nbyte > INT_MAX)
                return (EINVAL);
@@ -233,7 +376,7 @@ filp_read (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos)
 
        cnt = nbyte;
         CFS_CONE_IN;
-       if ((error = fo_read(fp, &auio, fp->f_cred, 0, p)) != 0) {
+       if ((error = fo_read(fp, &auio, fp->f_cred, flags, p)) != 0) {
                if (auio.uio_resid != cnt && (error == ERESTART ||
                    error == EINTR || error == EWOULDBLOCK))
                        error = 0;
@@ -244,13 +387,13 @@ filp_read (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos)
        else
                cnt -= auio.uio_resid;
         if (pos != NULL)
-                *pos = auio.uio_offset;
+                *pos += cnt;
 
        return cnt;
 }
 
 int
-filp_fsync (cfs_file_t *fp)
+kern_file_sync (cfs_file_t *fp)
 {
        struct vnode *vp = (struct vnode *)fp->f_data;
        struct proc *p = current_proc();
@@ -271,60 +414,53 @@ filp_fsync (cfs_file_t *fp)
        return error;
 }
 
-int
-ref_file(cfs_file_t *fp)
+#endif /* !__DARWIN8__ */
+
+cfs_rdev_t cfs_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor)
 {
-        CFS_DECL_CONE_DATA;
+        return makedev(major, minor);
+}
 
-        CFS_CONE_IN;
-        fref(fp);
-        CFS_CONE_EX;
-        return 0;
+cfs_major_nr_t cfs_rdev_major(cfs_rdev_t rdev)
+{
+        return major(rdev);
 }
 
-int 
-rele_file(cfs_file_t *fp)
+cfs_minor_nr_t cfs_rdev_minor(cfs_rdev_t rdev)
 {
-        CFS_DECL_CONE_DATA;
+        return minor(rdev);
+}
 
-        CFS_CONE_IN;
-        frele(fp);
-        CFS_CONE_EX;
-        return 0;
+struct posix_acl *posix_acl_alloc(int count, int flags)
+{
+        static struct posix_acl acl;
+        return &acl;
 }
 
 /*
- * Private functions
+ * XXX Liang: I've not converted all of them, 
+ * more is needed? 
  */
-void vrele_safe(struct vnode *nd)
-{ 
-        CFS_DECL_CONE_DATA; 
-        
-        CFS_CONE_IN; 
-        vrele(nd); 
-        CFS_CONE_EX;
-}
-
-int
-path_lookup(const char *path, unsigned int flags, struct nameidata *nd)
+int cfs_oflags2univ(int flags) 
 {
-       int ret = 0;
-        CFS_DECL_CONE_DATA;
+        int f;
 
-        CFS_CONE_IN;
-       NDINIT(nd, LOOKUP, FOLLOW, UIO_SYSSPACE, (char *)path, current_proc());
-       if ((ret = namei(nd)) != 0){
-               CERROR("path_lookup fail!\n");
-       }
-        CFS_CONE_EX;
-
-       return ret;
+        f = flags & O_ACCMODE;
+        f |= (flags & O_CREAT) ? CFS_O_CREAT: 0;
+        f |= (flags & O_TRUNC) ? CFS_O_TRUNC: 0;
+        f |= (flags & O_EXCL) ? CFS_O_EXCL: 0;
+        f |= (flags & O_NONBLOCK) ? CFS_O_NONBLOCK: 0;
+        f |= (flags & O_APPEND) ? CFS_O_APPEND: 0;
+        f |= (flags & O_NOFOLLOW) ? CFS_O_NOFOLLOW: 0;
+        f |= (flags & O_SYNC)? CFS_O_SYNC: 0;
+        return f;
 }
 
-int 
-file_count(struct file *fp)
+/*
+ * XXX Liang: we don't need it in OSX.
+ * But it should be implemented anyway.
+ */
+int cfs_univ2oflags(int flags)
 {
-        return fcount(fp);
+        return flags;
 }
-
-
diff --git a/lnet/libcfs/darwin/darwin-internal.h b/lnet/libcfs/darwin/darwin-internal.h
new file mode 100644 (file)
index 0000000..6c83577
--- /dev/null
@@ -0,0 +1,22 @@
+#ifndef __LIBCFS_DARWIN_INTERNAL_H__
+#define __LIBCFS_DARWIN_INTERNAL_H__
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+
+int cfs_sysctl_isvalid(void);
+struct sysctl_oid *cfs_alloc_sysctl_node(struct sysctl_oid_list *parent, int nbr, int access,
+                                        const char *name, int (*handler) SYSCTL_HANDLER_ARGS);
+struct sysctl_oid *cfs_alloc_sysctl_int(struct sysctl_oid_list *parent, int n,
+                                       const char *name, int *ptr, int val);
+struct sysctl_oid * cfs_alloc_sysctl_long(struct sysctl_oid_list *parent, int nbr, int access,
+                                         const char *name, int *ptr, int val);
+struct sysctl_oid * cfs_alloc_sysctl_string(struct sysctl_oid_list *parent, int nbr, int access,
+                                           const char *name, char *ptr, int len);
+struct sysctl_oid * cfs_alloc_sysctl_struct(struct sysctl_oid_list *parent, int nbr, int access,
+                                           const char *name, void *ptr, int size);
+
+#endif
index 4cf16d7..3079a56 100644 (file)
@@ -2,7 +2,8 @@
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
  * Copyright (C) 2002 Cluster File Systems, Inc.
- * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Liang Zhen <liangzhen@clusterfs.com>
+ *         Nikita Danilov <nikita@clusterfs.com>
  *
  * This file is part of Lustre, http://www.lustre.org.
  *
  * Darwin porting library
  * Make things easy to port
  */
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 
 #include <mach/mach_types.h>
 #include <string.h>
-#include <netinet/in.h>
-#include <netinet/tcp.h>
-#include <sys/file.h>
-#include <sys/conf.h>
-#include <sys/vnode.h>
-#include <sys/uio.h>
-#include <sys/filedesc.h>
-#include <sys/namei.h>
-#include <miscfs/devfs/devfs.h>
-#include <kern/kalloc.h>
-#include <kern/zalloc.h>
-#include <kern/thread.h>
+#include <sys/malloc.h>
 
 #include <libcfs/libcfs.h>
 #include <libcfs/kp30.h>
+#include "darwin-internal.h"
 
-/*
- * Definition of struct zone, copied from osfmk/kern/zalloc.h.
- */
-struct zone_hack {
-       int             count;          /* Number of elements used now */
-       vm_offset_t     free_elements;
-       vm_size_t       cur_size;       /* current memory utilization */
-       vm_size_t       max_size;       /* how large can this zone grow */
-       vm_size_t       elem_size;      /* size of an element */
-       vm_size_t       alloc_size;     /* size used for more memory */
-       char            *zone_name;     /* a name for the zone */
-       unsigned int
-       /* boolean_t */ exhaustible :1, /* (F) merely return if empty? */
-       /* boolean_t */ collectable :1, /* (F) garbage collect empty pages */
-       /* boolean_t */ expandable :1,  /* (T) expand zone (with message)? */
-       /* boolean_t */ allows_foreign :1,/* (F) allow non-zalloc space */
-       /* boolean_t */ doing_alloc :1, /* is zone expanding now? */
-       /* boolean_t */ waiting :1,     /* is thread waiting for expansion? */
-       /* boolean_t */ async_pending :1;       /* asynchronous allocation pending? */
-       struct zone_hack *      next_zone;      /* Link for all-zones list */
-       /*
-        * more fields follow, but we don't need them. We only need
-        * offset from the beginning of struct zone to ->next_zone
-        * field: it allows us to scan the list of all zones.
-        */
+#if CFS_INDIVIDUAL_ZONE
+extern zone_t zinit( vm_size_t, vm_size_t, vm_size_t, const char *);
+extern void * zalloc(zone_t zone);
+extern void *zalloc_noblock(zone_t zone);
+extern void zfree(zone_t zone, void *addr);
+
+struct cfs_zone_nob {
+        struct list_head       *z_nob;  /* Pointer to z_link */
+        struct list_head        z_link; /* Do NOT access it directly */       
 };
 
-decl_simple_lock_data(extern, all_zones_lock)
+static struct cfs_zone_nob      cfs_zone_nob;
+static spinlock_t               cfs_zone_guard;
 
-/*
- * returns true iff zone with name @name already exists.
- *
- * XXX nikita: this function is defined in this file only because there is no
- * better place to put it in.
- */
-zone_t cfs_find_zone(const char *name)
+cfs_mem_cache_t *mem_cache_find(const char *name, size_t objsize)
 {
-       struct zone_hack *scan;
+        cfs_mem_cache_t         *walker = NULL;
 
-       /* from osfmk/kern/zalloc.c */
-       extern zone_t first_zone;
+        LASSERT(cfs_zone_nob.z_nob != NULL);
 
-       LASSERT(name != NULL);
+        spin_lock(&cfs_zone_guard);
+        list_for_each_entry(walker, cfs_zone_nob.z_nob, mc_link) {
+                if (!strcmp(walker->mc_name, name) && \
+                    walker->mc_size == objsize)
+                        break;
+        }
+        spin_unlock(&cfs_zone_guard);
 
-       simple_lock(&all_zones_lock);
-       for (scan = (struct zone_hack *)first_zone;
-            scan != NULL; scan = scan->next_zone) {
-               if (!strcmp(scan->zone_name, name))
-                       break;
-       }
-       simple_unlock(&all_zones_lock);
-       return((zone_t)scan);
+        return walker;
 }
 
 /*
@@ -103,59 +71,120 @@ zone_t cfs_find_zone(const char *name)
  * survives kext unloading, so that @name cannot be just static string
  * embedded into kext image.
  */
-zone_t cfs_zinit(vm_size_t size, vm_size_t max, int alloc, const char *name)
+cfs_mem_cache_t *mem_cache_create(vm_size_t objsize, const char *name)
 {
+       cfs_mem_cache_t *mc = NULL;
         char *cname;
 
+       MALLOC(mc, cfs_mem_cache_t *, sizeof(cfs_mem_cache_t), M_TEMP, M_WAITOK|M_ZERO);
+       if (mc == NULL){
+               CERROR("cfs_mem_cache created fail!\n");
+               return NULL;
+       }
+
         cname = _MALLOC(strlen(name) + 1, M_TEMP, M_WAITOK);
         LASSERT(cname != NULL);
-        return zinit(size, max, alloc, strcpy(cname, name));
+        mc->mc_cache = zinit(objsize, (KMEM_MAX_ZONE * objsize), 0, strcpy(cname, name));
+        mc->mc_size = objsize;
+        CFS_INIT_LIST_HEAD(&mc->mc_link);
+        strncpy(mc->mc_name, name, 1 + strlen(name));
+        return mc;
+}
+
+void mem_cache_destroy(cfs_mem_cache_t *mc)
+{
+        /*
+         * zone can NOT be destroyed after creating, 
+         * so just keep it in list.
+         *
+         * We will not lose a zone after we unload
+         * libcfs; it can be found via libcfs.zone
+         */
+        return;
 }
 
+#define mem_cache_alloc(mc)     zalloc((mc)->mc_cache)
+#ifdef __DARWIN8__
+# define mem_cache_alloc_nb(mc) zalloc((mc)->mc_cache)
+#else
+/* XXX Liang: Tiger doesn't export zalloc_noblock() */
+# define mem_cache_alloc_nb(mc) zalloc_noblock((mc)->mc_cache)
+#endif
+#define mem_cache_free(mc, p)   zfree((mc)->mc_cache, p)
+
+#else  /* !CFS_INDIVIDUAL_ZONE */
+
 cfs_mem_cache_t *
-cfs_mem_cache_create (const char *name, size_t objsize, size_t off, unsigned long arg1,
-               void (*arg2)(void *, cfs_mem_cache_t *, unsigned long),
-               void (*arg3)(void *, cfs_mem_cache_t *, unsigned long))
+mem_cache_find(const char *name, size_t objsize)
+{
+        return NULL;
+}
+
+cfs_mem_cache_t *mem_cache_create(vm_size_t size, const char *name)
 {
-       cfs_mem_cache_t *new = NULL;
+        cfs_mem_cache_t *mc = NULL;
 
-       MALLOC(new, cfs_mem_cache_t *, objsize, M_TEMP, M_WAITOK|M_ZERO);
-       if (new == NULL){
+       MALLOC(mc, cfs_mem_cache_t *, sizeof(cfs_mem_cache_t), M_TEMP, M_WAITOK|M_ZERO);
+       if (mc == NULL){
                CERROR("cfs_mem_cache created fail!\n");
                return NULL;
        }
-       new->size = objsize;
-        CFS_INIT_LIST_HEAD(&new->link);
-        strncpy(new->name, name, 1 + strlen(name));
-        new->zone = cfs_find_zone(name);
-        if (new->zone == NULL) {
-                new->zone = cfs_zinit (objsize, KMEM_MAX_ZONE * objsize, 0, name);
-                if (new->zone == NULL) {
-                        CERROR("zone create fault!\n");
-                        FREE (new, M_TEMP);
-                        return NULL;
-                }
-        }
-       return new;
+        mc->mc_cache = OSMalloc_Tagalloc(name, OSMT_DEFAULT);
+        mc->mc_size = size;
+        return mc;
 }
 
-int
-cfs_mem_cache_destroy (cfs_mem_cache_t *cachep)
+void mem_cache_destroy(cfs_mem_cache_t *mc)
 {
-        FREE (cachep, M_TEMP);
-       return 0;
+        OSMalloc_Tagfree(mc->mc_cache);
+        FREE(mc, M_TEMP);
 }
 
-void *
-cfs_mem_cache_alloc (cfs_mem_cache_t *cachep, int flags)
+#define mem_cache_alloc(mc)     OSMalloc((mc)->mc_size, (mc)->mc_cache)
+#define mem_cache_alloc_nb(mc)  OSMalloc_noblock((mc)->mc_size, (mc)->mc_cache)
+#define mem_cache_free(mc, p)   OSFree(p, (mc)->mc_size, (mc)->mc_cache)
+
+#endif /* !CFS_INDIVIDUAL_ZONE */
+
+cfs_mem_cache_t *
+cfs_mem_cache_create (const char *name,
+                      size_t objsize, size_t off, unsigned long arg1)
+{
+        cfs_mem_cache_t *mc;
+
+        mc = mem_cache_find(name, objsize);
+        if (mc)
+                return mc;
+        mc = mem_cache_create(objsize, name);
+       return mc;
+}
+
+int cfs_mem_cache_destroy (cfs_mem_cache_t *cachep)
 {
-        return (void *)zalloc(cachep->zone);
+        mem_cache_destroy(cachep);
+        return 0;
 }
 
-void
-cfs_mem_cache_free (cfs_mem_cache_t *cachep, void *objp)
+void *cfs_mem_cache_alloc (cfs_mem_cache_t *cachep, int flags)
 {
-        zfree (cachep->zone, (vm_address_t)objp);
+        void *result;
+
+        /* zalloc_canblock() is not exported... Emulate it. */
+        if (flags & CFS_ALLOC_ATOMIC) {
+                result = (void *)mem_cache_alloc_nb(cachep);
+        } else {
+                LASSERT(get_preemption_level() == 0);
+                result = (void *)mem_cache_alloc(cachep);
+        }
+        if (result != NULL && (flags & CFS_ALLOC_ZERO))
+                memset(result, 0, cachep->mc_size);
+
+        return result;
+}
+
+void cfs_mem_cache_free (cfs_mem_cache_t *cachep, void *objp)
+{
+        mem_cache_free(cachep, objp);
 }
 
 /* ---------------------------------------------------------------------------
@@ -167,38 +196,15 @@ cfs_mem_cache_free (cfs_mem_cache_t *cachep, void *objp)
  * "Raw" pages
  */
 
-extern vm_map_t zone_map;
-static inline vm_map_t page_map(struct xnu_raw_page *pg)
-{
-        LASSERT(pg != NULL);
-
-        return pg->order == 0 ? zone_map : kernel_map;
-}
-
-static int raw_page_init(struct xnu_raw_page *pg)
-{
-       vm_size_t size = (1UL << pg->order) * PAGE_SIZE;
-       int upl_flags = UPL_SET_INTERNAL |
-                UPL_SET_LITE | UPL_SET_IO_WIRE | UPL_COPYOUT_FROM;
-        int     kr = 0;
-
-        /* XXX is it necessary? */
-       kr = vm_map_get_upl(page_map(pg),
-                            pg->virtual, &size, &pg->upl, 0, 0, &upl_flags, 0);
-        return kr;
-}
-
-static void raw_page_done(struct xnu_raw_page *pg)
-{
-       ubc_upl_abort(pg->upl, UPL_ABORT_FREE_ON_EMPTY);
-        return;
-}
+static unsigned int raw_pages = 0;
+static cfs_mem_cache_t  *raw_page_cache = NULL;
 
 static struct xnu_page_ops raw_page_ops;
 static struct xnu_page_ops *page_ops[XNU_PAGE_NTYPES] = {
         [XNU_PAGE_RAW] = &raw_page_ops
 };
 
+#if defined(LIBCFS_DEBUG)
 static int page_type_is_valid(cfs_page_t *page)
 {
         LASSERT(page != NULL);
@@ -209,6 +215,7 @@ static int page_is_raw(cfs_page_t *page)
 {
         return page->type == XNU_PAGE_RAW;
 }
+#endif
 
 static struct xnu_raw_page *as_raw(cfs_page_t *page)
 {
@@ -236,120 +243,83 @@ static struct xnu_page_ops raw_page_ops = {
         .page_address   = raw_page_address
 };
 
+extern int get_preemption_level(void);
 
-extern vm_size_t kalloc_max;
-extern vm_size_t kalloc_max_prerounded;
-extern int first_k_zone;
-extern struct zone *k_zone[16];
-extern vm_offset_t zalloc_canblock( register zone_t, boolean_t );
-extern vm_map_t zone_map;
-
-static inline vm_address_t
-page_zone_alloc(int flags, int order)
-{
-       register int zindex;
-       register vm_size_t allocsize;
-       vm_size_t size = (1UL << order) * PAGE_SIZE;
-       vm_address_t    addr;
-       kern_return_t   kr;
-
-       assert(order >= 0);
-       if (size > PAGE_SIZE){
-               /* XXX Liang:
-                * zalloc_canblock() call kernel_memory_allocate to allocate
-                * pages, kernel_memory_allocate cannot guarantee contig pages!
-                * So any request bigger then PAGE_SIZE should not call zalloc()
-                *
-                * NB. kmem_alloc_contig could be very slow!!!! Anyway, I dont
-                * know what will happen if order >= 1 :-(
-                * */
-               CDEBUG(D_MALLOC, "Allocate contig pages!\n");
-               kr = kmem_alloc_contig(kernel_map, &addr, size, 0, 0);
-               if (kr)
-                       return 0;
-               return addr;
-       }
-       allocsize = KALLOC_MINSIZE;
-       zindex = first_k_zone;
-       while (allocsize < size) {
-               allocsize <<= 1;
-               zindex++;
-       }
-       assert(allocsize < kalloc_max);
-       if (flags & M_NOWAIT != 0)
-               addr = zalloc_canblock(k_zone[zindex], FALSE);
-       else
-               addr = zalloc_canblock(k_zone[zindex], TRUE);
-       return addr;
-}
+struct list_head page_death_row;
+spinlock_t page_death_row_phylax;
 
-/* Allocate a "page", actually upl of darwin */
-struct xnu_raw_page *alloc_raw_pages(u_int32_t flags, u_int32_t order)
+static void raw_page_finish(struct xnu_raw_page *pg)
 {
-       kern_return_t   kr;
-       vm_size_t size = (1UL << order) * PAGE_SIZE;
-        u_int32_t mflags = 0;
-       struct xnu_raw_page *pg;
-
-        if (flags & CFS_ALLOC_ATOMIC != 0)
-                mflags |= M_NOWAIT;
-        else
-                mflags |= M_WAITOK;
-        if (flags & CFS_ALLOC_ZERO != 0)
-                mflags |= M_ZERO;
+        -- raw_pages;
+        if (pg->virtual != NULL)
+                cfs_mem_cache_free(raw_page_cache, pg->virtual);
+        cfs_free(pg);
+}
 
-       MALLOC (pg, struct xnu_raw_page *, sizeof *pg, M_TEMP, mflags);
-       if (pg == NULL)
-               return NULL;
-        pg->header.type = XNU_PAGE_RAW;
-        pg->order = order;
-       cfs_set_page_count(&pg->header, 1);
-       pg->virtual = page_zone_alloc(flags, order);
-       if (!pg->virtual)
-                /*
-                 * XXX nikita: Liang, shouldn't pg be freed here?
-                 */
-               return NULL;
+void raw_page_death_row_clean(void)
+{
+        struct xnu_raw_page *pg;
 
-        kr = raw_page_init(pg);
-       if (kr != 0) {
-               size = (1UL << order) * PAGE_SIZE;
-                kmem_free(page_map(pg), pg->virtual, size);
-               return NULL;
-       }
-       return pg;
+        spin_lock(&page_death_row_phylax);
+        while (!list_empty(&page_death_row)) {
+                pg = container_of(page_death_row.next,
+                                  struct xnu_raw_page, link);
+                list_del(&pg->link);
+                spin_unlock(&page_death_row_phylax);
+                raw_page_finish(pg);
+                spin_lock(&page_death_row_phylax);
+        }
+        spin_unlock(&page_death_row_phylax);
 }
 
 /* Free a "page" */
-void free_raw_pages(struct xnu_raw_page *pg, u_int32_t order)
+void free_raw_page(struct xnu_raw_page *pg)
 {
-       vm_size_t size = (1UL << order) * PAGE_SIZE;
-
        if (!atomic_dec_and_test(&pg->count))
                return;
-        raw_page_done(pg);
-        kmem_free(page_map(pg), pg->virtual, size);
-       FREE(pg, M_TEMP);
-}
-
-cfs_page_t *cfs_alloc_pages(u_int32_t flags, u_int32_t order)
-{
-        return &alloc_raw_pages(flags, order)->header;
+        /*
+         * kmem_free()->vm_map_remove()->vm_map_delete()->lock_write() may
+         * block. (raw_page_done()->upl_abort() can block too) On the other
+         * hand, cfs_free_page() may be called in non-blockable context. To
+         * work around this, park pages on global list when cannot block.
+         */
+        if (get_preemption_level() > 0) {
+                spin_lock(&page_death_row_phylax);
+                list_add(&pg->link, &page_death_row);
+                spin_unlock(&page_death_row_phylax);
+        } else {
+                raw_page_finish(pg);
+                raw_page_death_row_clean();
+        }
 }
 
 cfs_page_t *cfs_alloc_page(u_int32_t flags)
 {
-        return cfs_alloc_pages(flags, 0);
-}
-
-void cfs_free_pages(cfs_page_t *pages, int order)
-{
-        free_raw_pages(as_raw(pages), order);
+        struct xnu_raw_page *page;
+
+        /*
+         * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+         * from here: this will lead to infinite recursion.
+         */
+
+        page = cfs_alloc(sizeof *page, flags);
+        if (page != NULL) {
+                page->virtual = cfs_mem_cache_alloc(raw_page_cache, flags);
+                if (page->virtual != NULL) {
+                        ++ raw_pages;
+                        page->header.type = XNU_PAGE_RAW;
+                        atomic_set(&page->count, 1);
+                } else {
+                        cfs_free(page);
+                        page = NULL;
+                }
+        }
+        return page != NULL ? &page->header : NULL;
 }
 
-void cfs_free_page(cfs_page_t *page)
+void cfs_free_page(cfs_page_t *pages)
 {
-        cfs_free_pages(page, 0);
+        free_raw_page(as_raw(pages));
 }
 
 void cfs_get_page(cfs_page_t *p)
@@ -367,17 +337,16 @@ int cfs_page_count(cfs_page_t *p)
         return atomic_read(&as_raw(p)->count);
 }
 
-void cfs_set_page_count(cfs_page_t *p, int v)
-{
-        atomic_set(&as_raw(p)->count, v);
-}
-
 /*
  * Generic page operations
  */
 
 void *cfs_page_address(cfs_page_t *pg)
 {
+        /*
+         * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+         * from here: this will lead to infinite recursion.
+         */
         LASSERT(page_type_is_valid(pg));
         return page_ops[pg->type]->page_address(pg);
 }
@@ -425,14 +394,14 @@ void *cfs_alloc(size_t nr_bytes, u_int32_t flags)
         int mflags;
 
         mflags = 0;
-        if (flags & CFS_ALLOC_ATOMIC != 0) {
-                mflags |= 0 /* M_NOWAIT */;
+        if (flags & CFS_ALLOC_ATOMIC) {
+                mflags |= M_NOWAIT;
         } else {
                 LASSERT(get_preemption_level() == 0);
                 mflags |= M_WAITOK;
         }
 
-        if (flags & CFS_ALLOC_ZERO != 0)
+        if (flags & CFS_ALLOC_ZERO)
                 mflags |= M_ZERO;
 
         return _MALLOC(nr_bytes, M_TEMP, mflags);
@@ -451,5 +420,61 @@ void *cfs_alloc_large(size_t nr_bytes)
 
 void  cfs_free_large(void *addr)
 {
+        LASSERT(get_preemption_level() == 0);
         return _FREE(addr, M_TEMP);
 }
+
+/*
+ * Look up cfs_zone_nob via sysctl libcfs.zone; if it cannot be
+ * found (first load of libcfs since boot), allocate and register
+ * sysctl libcfs.zone.
+ */
+int cfs_mem_init(void)
+{
+#if     CFS_INDIVIDUAL_ZONE
+        int     rc;
+        size_t  len;
+
+        len = sizeof(struct cfs_zone_nob);
+        rc = sysctlbyname("libcfs.zone",
+                          (void *)&cfs_zone_nob, &len, NULL, 0);
+        if (rc == ENOENT) {
+                /* zone_nob is not registered in libcfs_sysctl */
+                struct cfs_zone_nob  *nob;
+                struct sysctl_oid       *oid;
+
+                assert(cfs_sysctl_isvalid());
+
+                nob = _MALLOC(sizeof(struct cfs_zone_nob), 
+                              M_TEMP, M_WAITOK | M_ZERO);
+                CFS_INIT_LIST_HEAD(&nob->z_link);
+                nob->z_nob = &nob->z_link;
+                oid = cfs_alloc_sysctl_struct(NULL, OID_AUTO, CTLFLAG_RD | CTLFLAG_KERN, 
+                                              "zone", nob, sizeof(struct cfs_zone_nob));
+                if (oid == NULL) {
+                        _FREE(nob, M_TEMP);
+                        return -ENOMEM;
+                }
+                sysctl_register_oid(oid);
+
+                cfs_zone_nob.z_nob = nob->z_nob;
+        }
+        spin_lock_init(&cfs_zone_guard);
+#endif
+        CFS_INIT_LIST_HEAD(&page_death_row);
+        spin_lock_init(&page_death_row_phylax);
+        raw_page_cache = cfs_mem_cache_create("raw-page", CFS_PAGE_SIZE, 0, 0);
+        return 0;
+}
+
+void cfs_mem_fini(void)
+{
+        raw_page_death_row_clean();
+        spin_lock_done(&page_death_row_phylax);
+        cfs_mem_cache_destroy(raw_page_cache);
+
+#if     CFS_INDIVIDUAL_ZONE
+        cfs_zone_nob.z_nob = NULL;
+        spin_lock_done(&cfs_zone_guard);
+#endif
+}
index 4f85862..10cb7d8 100644 (file)
 #include <sys/conf.h>
 #include <miscfs/devfs/devfs.h>
 
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 #include <libcfs/libcfs.h>
 #include <libcfs/kp30.h>
 
-int portal_ioctl_getdata(char *buf, char *end, void *arg)
+int libcfs_ioctl_getdata(char *buf, char *end, void *arg)
 {
-        struct portal_ioctl_hdr *hdr;
-        struct portal_ioctl_data *data;
+        struct libcfs_ioctl_hdr *hdr;
+        struct libcfs_ioctl_data *data;
         int err = 0;
         ENTRY;
 
-        hdr = (struct portal_ioctl_hdr *)buf; 
-        data = (struct portal_ioctl_data *)buf;
-       /* portals_ioctl_data has been copied in by ioctl of osx */
-       memcpy(buf, arg, sizeof(struct portal_ioctl_data));
+        hdr = (struct libcfs_ioctl_hdr *)buf;
+        data = (struct libcfs_ioctl_data *)buf;
+       /* libcfs_ioctl_data has been copied in by ioctl of osx */
+       memcpy(buf, arg, sizeof(struct libcfs_ioctl_data));
 
-        if (hdr->ioc_version != PORTAL_IOCTL_VERSION) {
-                CERROR("PORTALS: version mismatch kernel vs application\n");
+        if (hdr->ioc_version != LIBCFS_IOCTL_VERSION) {
+                CERROR("LIBCFS: version mismatch kernel vs application\n");
                 RETURN(-EINVAL);
         }
 
         if (hdr->ioc_len + buf >= end) {
-                CERROR("PORTALS: user buffer exceeds kernel buffer\n");
+                CERROR("LIBCFS: user buffer exceeds kernel buffer\n");
                 RETURN(-EINVAL);
         }
 
-        if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) {
-                CERROR("PORTALS: user buffer too small for ioctl\n");
+        if (hdr->ioc_len < sizeof(struct libcfs_ioctl_data)) {
+                CERROR("LIBCFS: user buffer too small for ioctl\n");
                 RETURN(-EINVAL);
         }
        buf += size_round(sizeof(*data));
 
-        if (data->ioc_inllen1) { 
-                err = copy_from_user(buf, data->ioc_inlbuf1, size_round(data->ioc_inllen1)); 
+        if (data->ioc_inllen1) {
+                err = copy_from_user(buf, data->ioc_inlbuf1, size_round(data->ioc_inllen1));
                if (err)
                        RETURN(err);
-                data->ioc_inlbuf1 = buf; 
-                buf += size_round(data->ioc_inllen1); 
-        } 
-        
-        if (data->ioc_inllen2) { 
-                copy_from_user(buf, data->ioc_inlbuf2, size_round(data->ioc_inllen2)); 
+                data->ioc_inlbuf1 = buf;
+                buf += size_round(data->ioc_inllen1);
+        }
+
+        if (data->ioc_inllen2) {
+                err = copy_from_user(buf, data->ioc_inlbuf2, size_round(data->ioc_inllen2));
                if (err)
                        RETURN(err);
-                data->ioc_inlbuf2 = buf; 
-        } 
+                data->ioc_inlbuf2 = buf;
+        }
 
         RETURN(err);
 }
 
+int libcfs_ioctl_popdata(void *arg, void *data, int size)
+{
+       /* 
+        * system call will copy out ioctl arg to user space
+        */
+       memcpy(arg, data, size);
+       return 0;
+}
+
 extern struct cfs_psdev_ops            libcfs_psdev_ops;
-struct portals_device_userstate                *mdev_state[16];
+struct libcfs_device_userstate         *mdev_state[16];
 
-static int 
+static int
 libcfs_psdev_open(dev_t dev, int flags, int devtype, struct proc *p)
-{ 
-       struct  portals_device_userstate *mstat = NULL;
+{
+       struct  libcfs_device_userstate *mstat = NULL;
        int     rc = 0;
-       int     devid; 
-       devid = minor(dev);    
+       int     devid;
+       devid = minor(dev);
 
-       if (devid > 16) return (-ENXIO);
+       if (devid > 16) return (ENXIO);
 
        if (libcfs_psdev_ops.p_open != NULL)
-               rc = libcfs_psdev_ops.p_open(0, &mstat);
+               rc = -libcfs_psdev_ops.p_open(0, &mstat);
        else
-               rc = -EPERM;
-       if (!rc)
-               return rc;
-       mdev_state[devid] = mstat;
+               rc = EPERM;
+       if (rc == 0)
+               mdev_state[devid] = mstat;
        return rc;
 }
 
-static int 
+static int
 libcfs_psdev_close(dev_t dev, int flags, int mode, struct proc *p)
 {
-       int     devid; 
-       devid = minor(dev);    
+       int     devid;
+       devid = minor(dev);
        int     rc = 0;
 
-       if (devid > 16) return (-ENXIO);
+       if (devid > 16) return (ENXIO);
 
        if (libcfs_psdev_ops.p_close != NULL)
-               rc = libcfs_psdev_ops.p_close(0, mdev_state[devid]);
+               rc = -libcfs_psdev_ops.p_close(0, mdev_state[devid]);
        else
-               rc = -EPERM;
-       if (rc)
-               return rc;
-       mdev_state[devid] = NULL;
+               rc = EPERM;
+       if (rc == 0)
+               mdev_state[devid] = NULL;
        return rc;
 }
 
-static int 
+static int
 libcfs_ioctl (dev_t dev, u_long cmd, caddr_t arg, int flag, struct proc *p)
-{ 
-       int rc = 0; 
-        struct cfs_psdev_file    pfile; 
-       int     devid; 
-       devid = minor(dev); 
+{
+       int rc = 0;
+        struct cfs_psdev_file    pfile;
+       int     devid;
+       devid = minor(dev);
        
-       if (devid > 16) return (-ENXIO);
+       if (devid > 16) return (ENXIO);
 
-       if (suser(p->p_ucred, &p->p_acflag)) 
-               return (-EPERM); 
+       if (!is_suser())
+               return (EPERM);
        
        pfile.off = 0;
        pfile.private_data = mdev_state[devid];
 
-       if (libcfs_psdev_ops.p_ioctl != NULL) 
-               rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg);
-       else 
-               rc = -EPERM;
+       if (libcfs_psdev_ops.p_ioctl != NULL)
+               rc = -libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg);
+       else
+               rc = EPERM;
        return rc;
 }
 
 static struct cdevsw libcfs_devsw =
-{ 
-       libcfs_psdev_open,            /* open */ 
-       libcfs_psdev_close,           /* close */ 
-       NULL,                   /* read */ 
-       NULL,                   /* write */ 
-       libcfs_ioctl,           /* ioctl */ 
-       NULL,                   /* stop */ 
-       NULL,                   /* reset */ 
-       NULL,                   /* tty's */ 
-       NULL,                   /* select */ 
-       NULL,                   /* mmap */ 
-       NULL,                   /* strategy */ 
-       NULL,                   /* getc */ 
-       NULL,                   /* putc */ 
-       0                       /* type */ 
+{
+       .d_open     = libcfs_psdev_open,
+       .d_close    = libcfs_psdev_close,
+       .d_read     = eno_rdwrt,
+       .d_write    = eno_rdwrt,
+       .d_ioctl    = libcfs_ioctl,
+       .d_stop     = eno_stop,
+       .d_reset    = eno_reset,
+       .d_ttys     = NULL,
+       .d_select   = eno_select,
+       .d_mmap     = eno_mmap,
+       .d_strategy = eno_strat,
+       .d_getc     = eno_getc,
+       .d_putc     = eno_putc,
+       .d_type     = 0
 };
 
-cfs_psdev_t libcfs_dev = { 
-       -1, 
-       NULL, 
-       "portals", 
-       &libcfs_devsw, 
+cfs_psdev_t libcfs_dev = {
+       -1,
+       NULL,
+       "lnet",
+       &libcfs_devsw,
        NULL
 };
 
-void
-kportal_daemonize (char *str)
+extern spinlock_t trace_cpu_serializer;
+extern void cfs_sync_init(void);
+extern void cfs_sync_fini(void);
+extern int cfs_sysctl_init(void);
+extern void cfs_sysctl_fini(void);
+extern int cfs_mem_init(void);
+extern void cfs_mem_fini(void);
+extern void raw_page_death_row_clean(void);
+extern void cfs_thread_agent_init(void);
+extern void cfs_thread_agent_fini(void);
+extern void cfs_symbol_init(void);
+extern void cfs_symbol_fini(void);
+
+int libcfs_arch_init(void)
 {
-       printf("Daemonize request: %s.\n", str);
-       return;
+       cfs_sync_init();
+       cfs_sysctl_init();
+       cfs_mem_init();
+       cfs_thread_agent_init();
+       cfs_symbol_init();
+
+       spin_lock_init(&trace_cpu_serializer);
+
+       return 0;
 }
 
-void 
-kportal_blockallsigs(void)
+void libcfs_arch_cleanup(void)
 {
-       return;
+       spin_lock_done(&trace_cpu_serializer);
+
+       cfs_symbol_fini();
+       cfs_thread_agent_fini();
+       cfs_mem_fini();
+       cfs_sysctl_fini();
+       cfs_sync_fini();
 }
+
index fd2d120..cdcabd9 100644 (file)
  * Darwin porting library
  * Make things easy to port
  */
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 
 #include <mach/mach_types.h>
 #include <string.h>
-#include <netinet/in.h>
-#include <netinet/tcp.h>
 #include <sys/file.h>
 #include <sys/conf.h>
-#include <sys/vnode.h>
 #include <sys/uio.h>
 #include <sys/filedesc.h>
 #include <sys/namei.h>
 #include <miscfs/devfs/devfs.h>
-#include <kern/kalloc.h>
-#include <kern/zalloc.h>
 #include <kern/thread.h>
 
 #include <libcfs/libcfs.h>
 #include <libcfs/kp30.h>
 
-void    *darwin_current_journal_info = NULL;
-int     darwin_current_cap_effective = -1;
-
-/* 
- * cfs pseudo device, actually pseudo char device in darwin 
+/*
+ * cfs pseudo device, actually pseudo char device in darwin
  */
-#define KPORTAL_MAJOR  -1
+#define KLNET_MAJOR  -1
 
 kern_return_t  cfs_psdev_register(cfs_psdev_t *dev) {
-       dev->index = cdevsw_add(KPORTAL_MAJOR, dev->devsw);
+       dev->index = cdevsw_add(KLNET_MAJOR, dev->devsw);
        if (dev->index < 0) {
-               printf("portal_init: failed to allocate a major number!\n");
+               printf("libcfs_init: failed to allocate a major number!\n");
                return KERN_FAILURE;
        }
-       dev->handle = devfs_make_node(makedev (dev->index, 0), 
-                                      DEVFS_CHAR, UID_ROOT, 
+       dev->handle = devfs_make_node(makedev (dev->index, 0),
+                                      DEVFS_CHAR, UID_ROOT,
                                       GID_WHEEL, 0666, (char *)dev->name, 0);
        return KERN_SUCCESS;
 }
@@ -68,11 +60,11 @@ kern_return_t  cfs_psdev_deregister(cfs_psdev_t *dev) {
        return KERN_SUCCESS;
 }
 
-/* 
- * KPortal symbol register / unregister support 
+/*
+ * KPortal symbol register / unregister support
  */
-static struct rw_semaphore cfs_symbol_lock;
-struct list_head           cfs_symbol_list;
+struct rw_semaphore             cfs_symbol_lock;
+struct list_head                cfs_symbol_list;
 
 void *
 cfs_symbol_get(const char *name)
@@ -87,9 +79,9 @@ cfs_symbol_get(const char *name)
                         sym->ref ++;
                         break;
                 }
-        } 
+        }
         up_read(&cfs_symbol_lock);
-        if (sym != NULL) 
+        if (sym != NULL)
                 return sym->value;
         return NULL;
 }
@@ -108,7 +100,7 @@ cfs_symbol_put(const char *name)
                         LASSERT(sym->ref >= 0);
                         break;
                 }
-        } 
+        }
         up_read(&cfs_symbol_lock);
         LASSERT(sym != NULL);
 
@@ -167,7 +159,14 @@ cfs_symbol_unregister(const char *name)
 }
 
 void
-cfs_symbol_clean()
+cfs_symbol_init()
+{
+        CFS_INIT_LIST_HEAD(&cfs_symbol_list);
+        init_rwsem(&cfs_symbol_lock);
+}
+
+void
+cfs_symbol_fini()
 {
         struct list_head    *walker;
         struct cfs_symbol   *sym = NULL;
@@ -180,77 +179,225 @@ cfs_symbol_clean()
                 FREE(sym, M_TEMP);
         }
         up_write(&cfs_symbol_lock);
+
+        fini_rwsem(&cfs_symbol_lock);
         return;
 }
 
-/* 
- * Register sysctl table
- */
-cfs_sysctl_table_header_t *
-register_cfs_sysctl_table (cfs_sysctl_table_t *table, int arg)
+struct kernel_thread_arg
 {
-       cfs_sysctl_table_t      item;
-       int i = 0;
+       spinlock_t      lock;
+       atomic_t        inuse;
+       cfs_thread_t    func;
+       void            *arg;
+};
 
-       while ((item = table[i++]) != NULL) {
-               sysctl_register_oid(item); 
-       }
-       return table;
-}
+struct kernel_thread_arg cfs_thread_arg;
+
+#define THREAD_ARG_FREE                        0
+#define THREAD_ARG_HOLD                        1
+#define THREAD_ARG_RECV                        2
+
+#define set_targ_stat(a, v)            atomic_set(&(a)->inuse, v)
+#define get_targ_stat(a)               atomic_read(&(a)->inuse)
 
 /*
- * Unregister sysctl table
+ * Hold the thread argument and set the status of thread_status
+ * to THREAD_ARG_HOLD, if the thread argument is held by other
+ * threads (It's THREAD_ARG_HOLD already), current-thread has to wait.
  */
-void
-unregister_cfs_sysctl_table (cfs_sysctl_table_header_t *table) {
-       int i = 0;
-       cfs_sysctl_table_t      item;
+#define thread_arg_hold(pta, _func, _arg)                      \
+       do {                                                    \
+               spin_lock(&(pta)->lock);                        \
+               if (get_targ_stat(pta) == THREAD_ARG_FREE) {    \
+                       set_targ_stat((pta), THREAD_ARG_HOLD);  \
+                       (pta)->arg = (void *)_arg;              \
+                       (pta)->func = _func;                    \
+                       spin_unlock(&(pta)->lock);              \
+                       break;                                  \
+               }                                               \
+               spin_unlock(&(pta)->lock);                      \
+               cfs_schedule();                                 \
+       } while(1);                                             \
 
-       while ((item = table[i++]) != NULL) {
-               sysctl_unregister_oid(item); 
-       }
-       return;
-}
+/*
+ * Release the thread argument if the thread argument has been
+ * received by the child-thread (Status of thread_args is
+ * THREAD_ARG_RECV), otherwise current-thread has to wait.
+ * After release, the thread_args' status will be set to
+ * THREAD_ARG_FREE, and others can re-use the thread_args to
+ * create new kernel_thread.
+ */
+#define thread_arg_release(pta)                                        \
+       do {                                                    \
+               spin_lock(&(pta)->lock);                        \
+               if (get_targ_stat(pta) == THREAD_ARG_RECV) {    \
+                       (pta)->arg = NULL;                      \
+                       (pta)->func = NULL;                     \
+                       set_targ_stat(pta, THREAD_ARG_FREE);    \
+                       spin_unlock(&(pta)->lock);              \
+                       break;                                  \
+               }                                               \
+               spin_unlock(&(pta)->lock);                      \
+               cfs_schedule();                                 \
+       } while(1)
 
-struct kernel_thread_arg cfs_thread_arg;
+/*
+ * Receive thread argument (Used in child thread), set the status
+ * of thread_args to THREAD_ARG_RECV.
+ */
+#define __thread_arg_recv_fin(pta, _func, _arg, fin)           \
+       do {                                                    \
+               spin_lock(&(pta)->lock);                        \
+               if (get_targ_stat(pta) == THREAD_ARG_HOLD) {    \
+                       if (fin)                                \
+                           set_targ_stat(pta, THREAD_ARG_RECV);\
+                       _arg = (pta)->arg;                      \
+                       _func = (pta)->func;                    \
+                       spin_unlock(&(pta)->lock);              \
+                       break;                                  \
+               }                                               \
+               spin_unlock(&(pta)->lock);                      \
+               cfs_schedule();                                 \
+       } while (1);                                            \
+
+/*
+ * Just set the thread_args' status to THREAD_ARG_RECV
+ */
+#define thread_arg_fin(pta)                                    \
+       do {                                                    \
+               spin_lock(&(pta)->lock);                        \
+               assert( get_targ_stat(pta) == THREAD_ARG_HOLD); \
+               set_targ_stat(pta, THREAD_ARG_RECV);            \
+               spin_unlock(&(pta)->lock);                      \
+       } while(0)
+
+#define thread_arg_recv(pta, f, a)     __thread_arg_recv_fin(pta, f, a, 1)
+#define thread_arg_keep(pta, f, a)     __thread_arg_recv_fin(pta, f, a, 0)
 
 void
-cfs_thread_agent_init()
-{ 
-        set_targ_stat(&cfs_thread_arg, THREAD_ARG_FREE); 
-        spin_lock_init(&cfs_thread_arg.lock);        
-        cfs_thread_arg.arg = NULL;                       
-        cfs_thread_arg.func = NULL;       
+cfs_thread_agent_init(void)
+{
+        set_targ_stat(&cfs_thread_arg, THREAD_ARG_FREE);
+        spin_lock_init(&cfs_thread_arg.lock);
+        cfs_thread_arg.arg = NULL;
+        cfs_thread_arg.func = NULL;
 }
 
 void
-cfs_thread_agent (void) 
+cfs_thread_agent_fini(void)
+{
+        assert(get_targ_stat(&cfs_thread_arg) == THREAD_ARG_FREE);
+
+        spin_lock_done(&cfs_thread_arg.lock);
+}
+
+/*
+ *
+ * All requests to create kernel thread will create a new
+ * thread instance of cfs_thread_agent, one by one.
+ * cfs_thread_agent will call the caller's thread function
+ * with argument supplied by caller.
+ */
+void
+cfs_thread_agent (void)
 {
         cfs_thread_t           func = NULL;
         void                   *arg = NULL;
 
         thread_arg_recv(&cfs_thread_arg, func, arg);
-        printf("entry of thread agent (func: %08lx).\n", (void *)func);
+        /* printf("entry of thread agent (func: %08lx).\n", (void *)func); */
         assert(func != NULL);
         func(arg);
-        printf("thread agent exit. (func: %08lx)\n", (void *)func);
-        (void) thread_terminate(current_act());
+        /* printf("thread agent exit. (func: %08lx)\n", (void *)func); */
+        (void) thread_terminate(current_thread());
 }
 
+extern thread_t kernel_thread(task_t task, void (*start)(void));
+
 int
 cfs_kernel_thread(cfs_thread_t  func, void *arg, int flag)
-{ 
-        int ret = 0;   
-        thread_t th = NULL;  
-                                                
-        thread_arg_hold(&cfs_thread_arg, func, arg); 
-        th = kernel_thread(kernel_task, cfs_thread_agent);  
-        thread_arg_release(&cfs_thread_arg);      
-        if (th == THREAD_NULL) 
-                ret = -1;  
+{
+        int ret = 0;
+        thread_t th = NULL;
+
+        thread_arg_hold(&cfs_thread_arg, func, arg);
+        th = kernel_thread(kernel_task, cfs_thread_agent);
+        thread_arg_release(&cfs_thread_arg);
+        if (th == THREAD_NULL)
+                ret = -1;
         return ret;
 }
 
+void cfs_daemonize(char *str)
+{
+        snprintf(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX, "%s", str);
+        return;
+}
+
+/*
+ * XXX Liang: kexts cannot access sigmask in Darwin8.
+ * it's almost impossible for us to get/set signal mask
+ * without patching kernel.
+ * Should we provide these functions in xnu?
+ *
+ * These signal functions do almost nothing for now; we
+ * need to investigate more about signal in Darwin.
+ */
+cfs_sigset_t cfs_get_blockedsigs()
+{
+        return (cfs_sigset_t)0;
+}
+
+extern int block_procsigmask(struct proc *p,  int bit);
+
+cfs_sigset_t cfs_block_allsigs()
+{
+        cfs_sigset_t    old = 0;
+#ifdef __DARWIN8__
+#else
+        block_procsigmask(current_proc(), -1);
+#endif
+        return old;
+}
+
+cfs_sigset_t cfs_block_sigs(sigset_t bit)
+{
+        cfs_sigset_t    old = 0;
+#ifdef __DARWIN8__
+#else
+        block_procsigmask(current_proc(), bit);
+#endif
+        return old;
+}
+
+void cfs_restore_sigs(cfs_sigset_t old)
+{
+}
+
+int cfs_signal_pending(void)
+
+{
+#ifdef __DARWIN8__
+        extern int thread_issignal(proc_t, thread_t, sigset_t);
+        return thread_issignal(current_proc(), current_thread(), (sigset_t)-1);
+#else
+        return SHOULDissignal(current_proc(), current_uthread());
+#endif
+}
+
+void cfs_clear_sigpending(void)
+{
+#ifdef __DARWIN8__
+#else
+        clear_procsiglist(current_proc(), -1);
+#endif
+}
+
+#ifdef __DARWIN8__
+
+#else /* !__DARWIN8__ */
+
 void lustre_cone_in(boolean_t *state, funnel_t **cone)
 {
         *cone = thread_funnel_get();
@@ -284,7 +431,7 @@ void lustre_net_ex(boolean_t state, funnel_t *cone)
         else if (cone == NULL)
                 (void) thread_funnel_set(network_flock, state);
 }
-
+#endif /* !__DARWIN8__ */
 
 void cfs_waitq_init(struct cfs_waitq *waitq)
 {
@@ -297,7 +444,7 @@ void cfs_waitlink_init(struct cfs_waitlink *link)
 }
 
 void cfs_waitq_add(struct cfs_waitq *waitq, struct cfs_waitlink *link)
-{ 
+{
         link->wl_waitq = waitq;
        ksleep_add(&waitq->wq_ksleep_chan, &link->wl_ksleep_link);
 }
@@ -329,6 +476,10 @@ int cfs_waitq_active(struct cfs_waitq *waitq)
 
 void cfs_waitq_signal(struct cfs_waitq *waitq)
 {
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
        ksleep_wake(&waitq->wq_ksleep_chan);
 }
 
@@ -342,61 +493,89 @@ void cfs_waitq_broadcast(struct cfs_waitq *waitq)
        ksleep_wake_all(&waitq->wq_ksleep_chan);
 }
 
-void cfs_waitq_wait(struct cfs_waitlink *link)
-{ 
-        ksleep_wait(&link->wl_waitq->wq_ksleep_chan);
+void cfs_waitq_wait(struct cfs_waitlink *link, cfs_task_state_t state)
+{
+        ksleep_wait(&link->wl_waitq->wq_ksleep_chan, state);
 }
 
-cfs_duration_t  cfs_waitq_timedwait(struct cfs_waitlink *link, 
+cfs_duration_t  cfs_waitq_timedwait(struct cfs_waitlink *link,
+                                    cfs_task_state_t state,
                                     cfs_duration_t timeout)
-{ 
-        CDEBUG(D_TRACE, "timeout: %llu\n", (long long unsigned)timeout); 
-        return ksleep_timedwait(&link->chan->c, timeout);
+{
+        return ksleep_timedwait(&link->wl_waitq->wq_ksleep_chan, 
+                                state, timeout);
 }
 
 typedef  void (*ktimer_func_t)(void *);
 void cfs_timer_init(cfs_timer_t *t, void (* func)(unsigned long), void *arg)
-{ 
+{
         ktimer_init(&t->t, (ktimer_func_t)func, arg);
 }
 
 void cfs_timer_done(struct cfs_timer *t)
-{ 
+{
         ktimer_done(&t->t);
 }
 
 void cfs_timer_arm(struct cfs_timer *t, cfs_time_t deadline)
-{ 
+{
         ktimer_arm(&t->t, deadline);
 }
 
 void cfs_timer_disarm(struct cfs_timer *t)
-{ 
+{
         ktimer_disarm(&t->t);
 }
 
 int  cfs_timer_is_armed(struct cfs_timer *t)
-{ 
+{
         return ktimer_is_armed(&t->t);
 }
 
 cfs_time_t cfs_timer_deadline(struct cfs_timer *t)
-{ 
+{
         return ktimer_deadline(&t->t);
 }
 
-int
-libcfs_arch_init(void)
+void cfs_enter_debugger(void)
 {
-       init_rwsem(&cfs_symbol_lock);
-        CFS_INIT_LIST_HEAD(&cfs_symbol_list);
-        cfs_thread_agent_init();
-       return 0;
+#ifdef __DARWIN8__
+        extern void Debugger(const char * reason);
+        Debugger("CFS");
+#else
+        extern void PE_enter_debugger(char *cause);
+        PE_enter_debugger("CFS");
+#endif
 }
 
-void
-libcfs_arch_cleanup(void)
+int cfs_online_cpus(void)
 {
-       cfs_symbol_clean();
-}
+        int     activecpu;
+        size_t  size;
+
+#ifdef __DARWIN8__ 
+        size = sizeof(int);
+        sysctlbyname("hw.activecpu", &activecpu, &size, NULL, 0);
+        return activecpu;
+#else
+        host_basic_info_data_t hinfo;
+        kern_return_t kret;
+        int count = HOST_BASIC_INFO_COUNT;
+#define BSD_HOST 1
+        kret = host_info(BSD_HOST, HOST_BASIC_INFO, &hinfo, &count);
+        if (kret == KERN_SUCCESS) 
+                return (hinfo.avail_cpus);
+        return(-EINVAL);
+#endif
+}
+
+int cfs_ncpus(void)
+{
+        int     ncpu;
+        size_t  size;
 
+        size = sizeof(int);
+
+        sysctlbyname("hw.ncpu", &ncpu, &size, NULL, 0);
+        return ncpu;
+}
index f2b48d5..a38902a 100644 (file)
 #include <sys/unistd.h>
 #include <mach/mach_types.h>
 
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
+
 #include <libcfs/libcfs.h>
 
-static cfs_sysctl_table_header_t *portals_table_header = NULL;
-extern unsigned int portal_debug;
-extern char debug_file_path[1024];
-extern unsigned int portal_subsystem_debug;
-extern unsigned int portal_printk;
-extern unsigned int portals_catastrophe;
-extern atomic_t portal_kmemory;
+#define LIBCFS_SYSCTL           "libcfs"
+#define LIBCFS_SYSCTL_SPRITE    "sprite"
+#define LIBCFS_SYSCTL_MAGIC     0xbabeface
+
+static struct libcfs_sysctl_sprite {
+        int                     ss_magic;
+        struct sysctl_oid_list  *ss_link;
+} libcfs_sysctl_sprite = { 0, NULL };
+
+static cfs_sysctl_table_header_t *libcfs_table_header = NULL;
+extern unsigned int libcfs_debug;
+extern unsigned int libcfs_subsystem_debug;
+extern unsigned int libcfs_printk;
+extern unsigned int libcfs_console_ratelimit;
+extern unsigned int libcfs_catastrophe;
+extern atomic_t libcfs_kmemory;
 
 extern long max_debug_mb;
 extern int cfs_trace_daemon SYSCTL_HANDLER_ARGS;
 extern int cfs_debug_mb SYSCTL_HANDLER_ARGS;
 /*
- * sysctl table for portals
+ * sysctl table for lnet
  */
-SYSCTL_NODE (,                 OID_AUTO,       portals,        CTLFLAG_RW,
-            0,                 "portals sysctl top");
 
-SYSCTL_INT(_portals,                   OID_AUTO,       debug,  
-            CTLTYPE_INT | CTLFLAG_RW ,                 &portal_debug,  
+SYSCTL_NODE (,                 OID_AUTO,       lnet,   CTLFLAG_RW,
+            0,                 "lnet sysctl top");
+
+SYSCTL_INT(_lnet,                      OID_AUTO,       debug,
+            CTLTYPE_INT | CTLFLAG_RW ,                 &libcfs_debug,
             0,         "debug");
-SYSCTL_INT(_portals,                   OID_AUTO,       subsystem_debug,        
-            CTLTYPE_INT | CTLFLAG_RW,                  &portal_subsystem_debug,        
+SYSCTL_INT(_lnet,                      OID_AUTO,       subsystem_debug,
+            CTLTYPE_INT | CTLFLAG_RW,                  &libcfs_subsystem_debug,
             0,         "subsystem debug");
-SYSCTL_INT(_portals,                   OID_AUTO,       printk, 
-            CTLTYPE_INT | CTLFLAG_RW,                  &portal_printk, 
+SYSCTL_INT(_lnet,                      OID_AUTO,       printk,
+            CTLTYPE_INT | CTLFLAG_RW,                  &libcfs_printk,
             0,         "printk");
-SYSCTL_STRING(_portals,                        OID_AUTO,       debug_path,     
-            CTLTYPE_STRING | CTLFLAG_RW,               debug_file_path,        
+SYSCTL_INT(_lnet,                      OID_AUTO,       console_ratelimit,
+            CTLTYPE_INT | CTLFLAG_RW,                  &libcfs_console_ratelimit,
+            0,         "console_ratelimit");
+SYSCTL_STRING(_lnet,                   OID_AUTO,       debug_path,
+            CTLTYPE_STRING | CTLFLAG_RW,               debug_file_path,
             1024,      "debug path");
-SYSCTL_INT(_portals,                   OID_AUTO,       memused,        
-            CTLTYPE_INT | CTLFLAG_RW,                  (int *)&portal_kmemory.counter, 
+SYSCTL_INT(_lnet,                      OID_AUTO,       memused,
+            CTLTYPE_INT | CTLFLAG_RW,                  (int *)&libcfs_kmemory.counter,
             0,         "memused");
-SYSCTL_PROC(_portals,                  OID_AUTO,       trace_daemon,
+SYSCTL_INT(_lnet,                      OID_AUTO,       catastrophe,
+            CTLTYPE_INT | CTLFLAG_RW,                  (int *)&libcfs_catastrophe,
+            0,         "catastrophe");
+SYSCTL_PROC(_lnet,                     OID_AUTO,       trace_daemon,
             CTLTYPE_STRING | CTLFLAG_RW,               0,
             0,         &cfs_trace_daemon,              "A",    "trace daemon");
-SYSCTL_PROC(_portals,                  OID_AUTO,       debug_mb,
+SYSCTL_PROC(_lnet,                     OID_AUTO,       debug_mb,
             CTLTYPE_INT | CTLFLAG_RW,                  &max_debug_mb,
             0,         &cfs_debug_mb,                  "L",    "max debug size");
-#warning "add 'catastrophe' entry for LBUG detection"
 
 
 static cfs_sysctl_table_t      top_table[] = {
-       &sysctl__portals,
-       &sysctl__portals_debug,
-       &sysctl__portals_subsystem_debug,
-       &sysctl__portals_printk,
-       &sysctl__portals_debug_path,
-       &sysctl__portals_memused,
-       &sysctl__portals_trace_daemon,
-       &sysctl__portals_debug_mb,
+       &sysctl__lnet,
+       &sysctl__lnet_debug,
+       &sysctl__lnet_subsystem_debug,
+       &sysctl__lnet_printk,
+       &sysctl__lnet_console_ratelimit,
+       &sysctl__lnet_debug_path,
+       &sysctl__lnet_memused,
+       &sysctl__lnet_catastrophe,
+       &sysctl__lnet_trace_daemon,
+       &sysctl__lnet_debug_mb,
        NULL
 };
 
+/*
+ * Register sysctl table
+ */
+cfs_sysctl_table_header_t *
+cfs_register_sysctl_table (cfs_sysctl_table_t *table, int arg)
+{
+        cfs_sysctl_table_t      item;
+        int i = 0;
+
+        while ((item = table[i++]) != NULL) 
+                sysctl_register_oid(item);
+        return table;
+}
+
+/*
+ * Unregister sysctl table
+ */
+void
+cfs_unregister_sysctl_table (cfs_sysctl_table_header_t *table) {
+        int i = 0;
+        cfs_sysctl_table_t      item;
+
+        while ((item = table[i++]) != NULL) 
+                sysctl_unregister_oid(item);
+        return;
+}
+
+/*
+ * Allocate a sysctl oid. 
+ */
+static struct sysctl_oid *
+cfs_alloc_sysctl(struct sysctl_oid_list *parent, int nbr, int access,
+                 const char *name, void *arg1, int arg2, const char *fmt,
+                 int (*handler) SYSCTL_HANDLER_ARGS)
+{
+        struct sysctl_oid *oid;
+        char    *sname = NULL;
+        char    *sfmt = NULL;
+
+        if (strlen(name) + 1 > CTL_MAXNAME) {
+                printf("libcfs: sysctl name: %s is too long.\n", name);
+                return NULL;
+        }
+        oid = (struct sysctl_oid*)_MALLOC(sizeof(struct sysctl_oid), 
+                                          M_TEMP, M_WAITOK | M_ZERO);
+        if (oid == NULL) 
+                return NULL;
+
+        sname = (char *)_MALLOC(sizeof(CTL_MAXNAME), 
+                                M_TEMP, M_WAITOK | M_ZERO);
+        if (sname == NULL) 
+                goto error;
+        strcpy(sname, name);
+
+        sfmt = (char *)_MALLOC(4, M_TEMP, M_WAITOK | M_ZERO);
+        if (sfmt == NULL) 
+                goto error;
+        strcpy(sfmt, fmt);
+
+        if (parent == NULL)
+                oid->oid_parent = &sysctl__children;
+        else
+                oid->oid_parent = parent;
+        oid->oid_number = nbr;
+        oid->oid_kind = access;
+        oid->oid_name = sname;
+        oid->oid_handler = handler;
+        oid->oid_fmt = sfmt;
+
+        if ((access & CTLTYPE) == CTLTYPE_NODE){
+                /* It's a sysctl node */
+                struct sysctl_oid_list *link;
+
+                link = (struct sysctl_oid_list *)_MALLOC(sizeof(struct sysctl_oid_list), 
+                                                         M_TEMP, M_WAITOK | M_ZERO);
+                if (link == NULL)
+                        goto error;
+                oid->oid_arg1 = link;
+                oid->oid_arg2 = 0;
+        } else {
+                oid->oid_arg1 = arg1;
+                oid->oid_arg2 = arg2;
+        }
+
+        return oid;
+error:
+        if (sfmt != NULL)
+                _FREE(sfmt, M_TEMP);
+        if (sname != NULL)
+                _FREE(sname, M_TEMP);
+        if (oid != NULL)
+                _FREE(oid, M_TEMP);
+        return NULL;
+}
+
+void cfs_free_sysctl(struct sysctl_oid *oid)
+{
+        if (oid->oid_name != NULL)
+                _FREE((void *)oid->oid_name, M_TEMP);
+        if (oid->oid_fmt != NULL)
+                _FREE((void *)oid->oid_fmt, M_TEMP);
+        if ((oid->oid_kind & CTLTYPE_NODE != 0) && oid->oid_arg1)
+                /* XXX Liang: need to assert the list is empty */
+                _FREE(oid->oid_arg1, M_TEMP);
+        _FREE(oid, M_TEMP);
+}
+
+#define CFS_SYSCTL_ISVALID ((libcfs_sysctl_sprite.ss_magic == LIBCFS_SYSCTL_MAGIC) && \
+                            (libcfs_sysctl_sprite.ss_link != NULL))       
+
+int
+cfs_sysctl_isvalid(void)
+{
+        return CFS_SYSCTL_ISVALID;
+}
+
+struct sysctl_oid *
+cfs_alloc_sysctl_node(struct sysctl_oid_list *parent, int nbr, int access,
+                      const char *name, int (*handler) SYSCTL_HANDLER_ARGS)
+{
+        if (parent == NULL && CFS_SYSCTL_ISVALID)
+                parent = libcfs_sysctl_sprite.ss_link;
+        return cfs_alloc_sysctl(parent, nbr, CTLTYPE_NODE | access, name,
+                                NULL, 0, "N", handler);
+}
+
+struct sysctl_oid *
+cfs_alloc_sysctl_int(struct sysctl_oid_list *parent, int nbr, int access,
+                     const char *name, int *ptr, int val)
+{
+        if (parent == NULL && CFS_SYSCTL_ISVALID)
+                parent = libcfs_sysctl_sprite.ss_link;
+        return cfs_alloc_sysctl(parent, nbr, CTLTYPE_INT | access, name, 
+                                ptr, val, "I", sysctl_handle_int);
+}
+
+struct sysctl_oid *
+cfs_alloc_sysctl_long(struct sysctl_oid_list *parent, int nbr, int access,
+                      const char *name, int *ptr, int val)
+{
+        if (parent == NULL && CFS_SYSCTL_ISVALID)
+                parent = libcfs_sysctl_sprite.ss_link;
+        return cfs_alloc_sysctl(parent, nbr, CTLTYPE_INT | access, name, 
+                                ptr, val, "L", sysctl_handle_long);
+}
+
+struct sysctl_oid *
+cfs_alloc_sysctl_string(struct sysctl_oid_list *parent, int nbr, int access,
+                        const char *name, char *ptr, int len)
+{
+        if (parent == NULL && CFS_SYSCTL_ISVALID)
+                parent = libcfs_sysctl_sprite.ss_link;
+        return cfs_alloc_sysctl(parent, nbr, CTLTYPE_STRING | access, name, 
+                                ptr, len, "A", sysctl_handle_string);
+}
+
+struct sysctl_oid *
+cfs_alloc_sysctl_struct(struct sysctl_oid_list *parent, int nbr, int access,
+                        const char *name, void *ptr, int size)
+{
+        if (parent == NULL && CFS_SYSCTL_ISVALID)
+                parent = libcfs_sysctl_sprite.ss_link;
+        return cfs_alloc_sysctl(parent, nbr, CTLTYPE_OPAQUE | access, name,
+                                ptr, size, "S", sysctl_handle_opaque);
+}
+
 /* no proc in osx */
 cfs_proc_dir_entry_t *
 cfs_create_proc_entry(char *name, int mod, cfs_proc_dir_entry_t *parent)
@@ -111,8 +295,8 @@ int
 insert_proc(void)
 {
 #if 1
-        if (!portals_table_header) 
-                portals_table_header = register_cfs_sysctl_table(top_table, 0);
+        if (!libcfs_table_header) 
+                libcfs_table_header = cfs_register_sysctl_table(top_table, 0);
 #endif
        return 0;
 }
@@ -121,11 +305,80 @@ void
 remove_proc(void)
 {
 #if 1
-        if (portals_table_header != NULL) 
-                unregister_cfs_sysctl_table(portals_table_header); 
-        portals_table_header = NULL;
+        if (libcfs_table_header != NULL) 
+                cfs_unregister_sysctl_table(libcfs_table_header); 
+        libcfs_table_header = NULL;
 #endif
        return;
 }
 
+int
+cfs_sysctl_init(void)
+{
+        struct sysctl_oid               *oid_root;
+        struct sysctl_oid               *oid_sprite;
+        struct libcfs_sysctl_sprite     *sprite;
+        size_t  len; 
+        int     rc;
+
+        len = sizeof(struct libcfs_sysctl_sprite);
+        rc = sysctlbyname("libcfs.sprite", 
+                          (void *)&libcfs_sysctl_sprite, &len, NULL, 0);
+        if (rc == 0) {
+                /* 
+                 * XXX Liang: assert (rc == 0 || rc == ENOENT)
+                 *
+                 * libcfs.sprite has been registered by previous 
+                 * loading of libcfs 
+                 */
+                if (libcfs_sysctl_sprite.ss_magic != LIBCFS_SYSCTL_MAGIC) {
+                        printf("libcfs: magic number of libcfs.sprite "
+                               "is not right (%lx, %lx)\n", 
+                               libcfs_sysctl_sprite.ss_magic,
+                               LIBCFS_SYSCTL_MAGIC);
+                        return -1;
+                }
+                assert(libcfs_sysctl_sprite.ss_link != NULL);
+                printf("libcfs: registered libcfs.sprite found.\n");
+                return 0;
+        }
+        oid_root = cfs_alloc_sysctl_node(NULL, OID_AUTO, CTLFLAG_RD | CTLFLAG_KERN,
+                                         LIBCFS_SYSCTL, 0);
+        if (oid_root == NULL)
+                return -1;
+        sysctl_register_oid(oid_root);
+
+        sprite = (struct libcfs_sysctl_sprite *)_MALLOC(sizeof(struct libcfs_sysctl_sprite), 
+                                                        M_TEMP, M_WAITOK | M_ZERO);
+        if (sprite == NULL) {
+                sysctl_unregister_oid(oid_root);
+                cfs_free_sysctl(oid_root);
+                return -1;
+        }
+        sprite->ss_magic = LIBCFS_SYSCTL_MAGIC;
+        sprite->ss_link = (struct sysctl_oid_list *)oid_root->oid_arg1;
+        oid_sprite = cfs_alloc_sysctl_struct((struct sysctl_oid_list *)oid_root->oid_arg1, 
+                                             OID_AUTO, CTLFLAG_RD | CTLFLAG_KERN, 
+                                             LIBCFS_SYSCTL_SPRITE, sprite, 
+                                             sizeof(struct libcfs_sysctl_sprite));
+        if (oid_sprite == NULL) {
+                cfs_free_sysctl(oid_sprite);
+                sysctl_unregister_oid(oid_root);
+                cfs_free_sysctl(oid_root);
+                return -1;
+        }
+        sysctl_register_oid(oid_sprite);
+
+        libcfs_sysctl_sprite.ss_magic = sprite->ss_magic;
+        libcfs_sysctl_sprite.ss_link = sprite->ss_link;
+
+        return 0;
+}
+
+void
+cfs_sysctl_fini(void)
+{
+        libcfs_sysctl_sprite.ss_magic = 0;
+        libcfs_sysctl_sprite.ss_link = NULL;
+}
 
index 7ac24f6..dc2af0f 100644 (file)
@@ -23,7 +23,7 @@
  *
  * Created by nikita on Sun Jul 18 2004.
  *
- * Prototypes of XNU synchronization primitives.
+ * XNU synchronization primitives.
  */
 
 /*
  * A lot can be optimized here.
  */
 
-#include <mach/mach_types.h>
-#include <sys/types.h>
-#include <kern/simple_lock.h>
+#define DEBUG_SUBSYSTEM S_LNET
 
-#define DEBUG_SUBSYSTEM S_PORTALS
+#ifdef __DARWIN8__
+# include <kern/locks.h>
+#else
+# include <mach/mach_types.h>
+# include <sys/types.h>
+# include <kern/simple_lock.h>
+#endif
 
 #include <libcfs/libcfs.h>
 #include <libcfs/kp30.h>
@@ -62,14 +66,35 @@ extern int get_preemption_level(void);
 #define get_preemption_level() (0)
 #endif
 
-/*
- * Warning: low level portals debugging code (portals_debug_msg(), for
- * example), uses spin-locks, so debugging output here may lead to nasty
- * surprises.
- */
-
 #if SMP
+#ifdef __DARWIN8__
+
+static lck_grp_t       *cfs_lock_grp = NULL;
+#warning "Verify definition of lck_spin_t hasn't been changed while building!"
+
+/* hw_lock_* are not exported by Darwin8 */
+static inline void xnu_spin_init(xnu_spin_t *s)
+{
+        SLASSERT(cfs_lock_grp != NULL);
+        //*s = lck_spin_alloc_init(cfs_lock_grp, LCK_ATTR_NULL);
+        lck_spin_init((lck_spin_t *)s, cfs_lock_grp, LCK_ATTR_NULL);
+}
+
+static inline void xnu_spin_done(xnu_spin_t *s)
+{
+        SLASSERT(cfs_lock_grp != NULL);
+        //lck_spin_free(*s, cfs_lock_grp);
+        //*s = NULL;
+        lck_spin_destroy((lck_spin_t *)s, cfs_lock_grp);
+}
 
+#define xnu_spin_lock(s)        lck_spin_lock((lck_spin_t *)(s))
+#define xnu_spin_unlock(s)      lck_spin_unlock((lck_spin_t *)(s))
+
+#warning "Darwin8 does not export lck_spin_try_lock"
+#define xnu_spin_try(s)         (1)
+
+#else /* DARWIN8 */
 extern void                    hw_lock_init(hw_lock_t);
 extern void                    hw_lock_lock(hw_lock_t);
 extern void                    hw_lock_unlock(hw_lock_t);
@@ -77,10 +102,33 @@ extern unsigned int                hw_lock_to(hw_lock_t, unsigned int);
 extern unsigned int            hw_lock_try(hw_lock_t);
 extern unsigned int            hw_lock_held(hw_lock_t);
 
+#define xnu_spin_init(s)        hw_lock_init(s)
+#define xnu_spin_done(s)        do {} while (0)
+#define xnu_spin_lock(s)        hw_lock_lock(s)
+#define xnu_spin_unlock(s)      hw_lock_unlock(s)
+#define xnu_spin_try(s)         hw_lock_try(s)
+#endif /* DARWIN8 */
+
+#else /* SMP */
+#define xnu_spin_init(s)        do {} while (0)
+#define xnu_spin_done(s)        do {} while (0)
+#define xnu_spin_lock(s)        do {} while (0)
+#define xnu_spin_unlock(s)      do {} while (0)
+#define xnu_spin_try(s)         (1)
+#endif /* SMP */
+
+/*
+ * Warning: low level libcfs debugging code (libcfs_debug_msg(), for
+ * example), uses spin-locks, so debugging output here may lead to nasty
+ * surprises.
+ *
+ * In uniprocessor version of spin-lock. Only checks.
+ */
+
 void kspin_init(struct kspin *spin)
 {
        SLASSERT(spin != NULL);
-       hw_lock_init(&spin->lock);
+       xnu_spin_init(&spin->lock);
        ON_SYNC_DEBUG(spin->magic = KSPIN_MAGIC);
        ON_SYNC_DEBUG(spin->owner = NULL);
 }
@@ -90,26 +138,37 @@ void kspin_done(struct kspin *spin)
        SLASSERT(spin != NULL);
        SLASSERT(spin->magic == KSPIN_MAGIC);
        SLASSERT(spin->owner == NULL);
+        xnu_spin_done(&spin->lock);
 }
 
 void kspin_lock(struct kspin *spin)
 {
        SLASSERT(spin != NULL);
        SLASSERT(spin->magic == KSPIN_MAGIC);
-       SLASSERT(spin->owner != current_thread);
+       SLASSERT(spin->owner != current_thread());
+
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
 
-       hw_lock_lock(&spin->lock);
+       xnu_spin_lock(&spin->lock);
        SLASSERT(spin->owner == NULL);
-       ON_SYNC_DEBUG(spin->owner = current_thread);
+       ON_SYNC_DEBUG(spin->owner = current_thread());
 }
 
 void kspin_unlock(struct kspin *spin)
 {
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+
        SLASSERT(spin != NULL);
        SLASSERT(spin->magic == KSPIN_MAGIC);
-       SLASSERT(spin->owner == current_thread);
+       SLASSERT(spin->owner == current_thread());
        ON_SYNC_DEBUG(spin->owner = NULL);
-       hw_lock_unlock(&spin->lock);
+       xnu_spin_unlock(&spin->lock);
 }
 
 int  kspin_trylock(struct kspin *spin)
@@ -117,84 +176,133 @@ int  kspin_trylock(struct kspin *spin)
        SLASSERT(spin != NULL);
        SLASSERT(spin->magic == KSPIN_MAGIC);
 
-       if (hw_lock_try(&spin->lock)) {
+       if (xnu_spin_try(&spin->lock)) {
                SLASSERT(spin->owner == NULL);
-               ON_SYNC_DEBUG(spin->owner = current_thread);
+               ON_SYNC_DEBUG(spin->owner = current_thread());
                return 1;
        } else
                return 0;
 }
 
-/* SMP */
-#else
-
-/*
- * uniprocessor version of spin-lock. Only checks.
- */
-
-void kspin_init(struct kspin *spin)
+#if XNU_SYNC_DEBUG
+int kspin_islocked(struct kspin *spin)
 {
        SLASSERT(spin != NULL);
-       ON_SYNC_DEBUG(spin->magic = KSPIN_MAGIC);
-       ON_SYNC_DEBUG(spin->owner = NULL);
+       SLASSERT(spin->magic == KSPIN_MAGIC);
+       return spin->owner == current_thread();
 }
 
-void kspin_done(struct kspin *spin)
+int kspin_isnotlocked(struct kspin *spin)
 {
        SLASSERT(spin != NULL);
        SLASSERT(spin->magic == KSPIN_MAGIC);
-       SLASSERT(spin->owner == NULL);
+       return spin->owner != current_thread();
 }
+#endif
 
-void kspin_lock(struct kspin *spin)
+/*
+ * read/write spin-lock
+ */
+void krw_spin_init(struct krw_spin *rwspin)
 {
-       SLASSERT(spin != NULL);
-       SLASSERT(spin->magic == KSPIN_MAGIC);
-       SLASSERT(spin->owner == NULL);
-       ON_SYNC_DEBUG(spin->owner = current_thread);
+       SLASSERT(rwspin != NULL);
+
+       kspin_init(&rwspin->guard);
+       rwspin->count = 0;
+       ON_SYNC_DEBUG(rwspin->magic = KRW_SPIN_MAGIC);
 }
 
-void kspin_unlock(struct kspin *spin)
+void krw_spin_done(struct krw_spin *rwspin)
 {
-       SLASSERT(spin != NULL);
-       SLASSERT(spin->magic == KSPIN_MAGIC);
-       SLASSERT(spin->owner == current_thread);
-       ON_SYNC_DEBUG(spin->owner = NULL);
+       SLASSERT(rwspin != NULL);
+       SLASSERT(rwspin->magic == KRW_SPIN_MAGIC);
+       SLASSERT(rwspin->count == 0);
+       kspin_done(&rwspin->guard);
 }
 
-int kspin_trylock(struct kspin *spin)
+void krw_spin_down_r(struct krw_spin *rwspin)
 {
-       SLASSERT(spin != NULL);
-       SLASSERT(spin->magic == KSPIN_MAGIC);
-       SLASSERT(spin->owner == NULL);
-       ON_SYNC_DEBUG(spin->owner = current_thread);
-       return 1;
+        int i;
+       SLASSERT(rwspin != NULL);
+       SLASSERT(rwspin->magic == KRW_SPIN_MAGIC);
+
+       kspin_lock(&rwspin->guard);
+        while(rwspin->count < 0) {
+                i = -1;
+               kspin_unlock(&rwspin->guard);
+                while (--i != 0 && rwspin->count < 0)
+                        continue;
+                kspin_lock(&rwspin->guard);
+        }
+       ++ rwspin->count;
+       kspin_unlock(&rwspin->guard);
 }
 
-/* SMP */
-#endif
+void krw_spin_down_w(struct krw_spin *rwspin)
+{
+        int i;
+       SLASSERT(rwspin != NULL);
+       SLASSERT(rwspin->magic == KRW_SPIN_MAGIC);
+
+       kspin_lock(&rwspin->guard);
+        while (rwspin->count != 0) {
+                i = -1;
+               kspin_unlock(&rwspin->guard);
+                while (--i != 0 && rwspin->count != 0)
+                        continue;
+               kspin_lock(&rwspin->guard);
+        }
+       rwspin->count = -1;
+       kspin_unlock(&rwspin->guard);
+}
 
-#if XNU_SYNC_DEBUG
-int kspin_islocked(struct kspin *spin)
+void krw_spin_up_r(struct krw_spin *rwspin)
 {
-       SLASSERT(spin != NULL);
-       SLASSERT(spin->magic == KSPIN_MAGIC);
-       return spin->owner == current_thread;
+       SLASSERT(rwspin != NULL);
+       SLASSERT(rwspin->magic == KRW_SPIN_MAGIC);
+       SLASSERT(rwspin->count > 0);
+
+       kspin_lock(&rwspin->guard);
+       -- rwspin->count;
+       kspin_unlock(&rwspin->guard);
 }
 
-int kspin_isnotlocked(struct kspin *spin)
+void krw_spin_up_w(struct krw_spin *rwspin)
 {
-       SLASSERT(spin != NULL);
-       SLASSERT(spin->magic == KSPIN_MAGIC);
-       return spin->owner != current_thread;
+       SLASSERT(rwspin != NULL);
+       SLASSERT(rwspin->magic == KRW_SPIN_MAGIC);
+       SLASSERT(rwspin->count == -1);
+
+       kspin_lock(&rwspin->guard);
+       rwspin->count = 0;
+       kspin_unlock(&rwspin->guard);
 }
-#endif
 
+/*
+ * semaphore 
+ */
+#ifdef __DARWIN8__
+
+#define xnu_waitq_init(q, a)            do {} while (0)
+#define xnu_waitq_done(q)               do {} while (0)
+#define xnu_waitq_wakeup_one(q, e, s)   ({wakeup_one((void *)(e)); KERN_SUCCESS;})
+#define xnu_waitq_wakeup_all(q, e, s)   ({wakeup((void *)(e)); KERN_SUCCESS;})
+#define xnu_waitq_assert_wait(q, e, s)  assert_wait((e), s)
+
+#else /* DARWIN8 */
+
+#define xnu_waitq_init(q, a)            wait_queue_init((q), a)
+#define xnu_waitq_done(q)               do {} while (0)
+#define xnu_waitq_wakeup_one(q, e, s)   wait_queue_wakeup_one((q), (event_t)(e), s)
+#define xnu_waitq_wakeup_all(q, e, s)   wait_queue_wakeup_all((q), (event_t)(e), s)
+#define xnu_waitq_assert_wait(q, e, s)  wait_queue_assert_wait((q), (event_t)(e), s)
+
+#endif /* DARWIN8 */
 void ksem_init(struct ksem *sem, int value)
 {
        SLASSERT(sem != NULL);
        kspin_init(&sem->guard);
-       wait_queue_init(&sem->q, SYNC_POLICY_FIFO);
+       xnu_waitq_init(&sem->q, SYNC_POLICY_FIFO);
        sem->value = value;
        ON_SYNC_DEBUG(sem->magic = KSEM_MAGIC);
 }
@@ -221,11 +329,11 @@ int ksem_up(struct ksem *sem, int value)
        kspin_lock(&sem->guard);
        sem->value += value;
        if (sem->value == 0)
-               result = wait_queue_wakeup_one(&sem->q, (event_t)sem,
-                                              THREAD_AWAKENED);
+               result = xnu_waitq_wakeup_one(&sem->q, sem,
+                                             THREAD_AWAKENED);
        else
-               result = wait_queue_wakeup_all(&sem->q, (event_t)sem,
-                                              THREAD_AWAKENED);
+               result = xnu_waitq_wakeup_all(&sem->q, sem,
+                                             THREAD_AWAKENED);
        kspin_unlock(&sem->guard);
        SLASSERT(result == KERN_SUCCESS || result == KERN_NOT_WAITING);
        return (result == KERN_SUCCESS) ? 0 : 1;
@@ -242,8 +350,8 @@ void ksem_down(struct ksem *sem, int value)
 
        kspin_lock(&sem->guard);
        while (sem->value < value) {
-               result = wait_queue_assert_wait(&sem->q, (event_t)sem,
-                                               THREAD_UNINT);
+               result = xnu_waitq_assert_wait(&sem->q, sem,
+                                              THREAD_UNINT);
                SLASSERT(result == THREAD_AWAKENED || result == THREAD_WAITING);
                kspin_unlock(&sem->guard);
                if (result == THREAD_WAITING)
@@ -292,18 +400,18 @@ void kmut_lock(struct kmut *mut)
 {
        SLASSERT(mut != NULL);
        SLASSERT(mut->magic == KMUT_MAGIC);
-       SLASSERT(mut->owner != current_thread);
+       SLASSERT(mut->owner != current_thread());
        SLASSERT(get_preemption_level() == 0);
 
        ksem_down(&mut->s, 1);
-       ON_SYNC_DEBUG(mut->owner = current_thread);
+       ON_SYNC_DEBUG(mut->owner = current_thread());
 }
 
 void kmut_unlock(struct kmut *mut)
 {
        SLASSERT(mut != NULL);
        SLASSERT(mut->magic == KMUT_MAGIC);
-       SLASSERT(mut->owner == current_thread);
+       SLASSERT(mut->owner == current_thread());
 
        ON_SYNC_DEBUG(mut->owner = NULL);
        ksem_up(&mut->s, 1);
@@ -321,14 +429,14 @@ int kmut_islocked(struct kmut *mut)
 {
        SLASSERT(mut != NULL);
        SLASSERT(mut->magic == KMUT_MAGIC);
-       return mut->owner == current_thread;
+       return mut->owner == current_thread();
 }
 
 int kmut_isnotlocked(struct kmut *mut)
 {
        SLASSERT(mut != NULL);
        SLASSERT(mut->magic == KMUT_MAGIC);
-       return mut->owner != current_thread;
+       return mut->owner != current_thread();
 }
 #endif
 
@@ -560,7 +668,7 @@ void ksleep_link_init(struct ksleep_link *link)
 
        CFS_INIT_LIST_HEAD(&link->linkage);
        link->flags = 0;
-       link->event = current_thread;
+       link->event = current_thread();
        link->hits  = 0;
        link->forward = NULL;
        ON_SYNC_DEBUG(link->magic = KSLEEP_LINK_MAGIC);
@@ -620,6 +728,11 @@ static void add_hit(struct ksleep_chan *chan, event_t event)
 {
        struct ksleep_link *scan;
 
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+
        SLASSERT(kspin_islocked(&chan->guard));
        list_for_each_entry(scan, &chan->waiters, linkage) {
                if (scan->event == event) {
@@ -629,7 +742,7 @@ static void add_hit(struct ksleep_chan *chan, event_t event)
        }
 }
 
-void ksleep_wait(struct ksleep_chan *chan)
+void ksleep_wait(struct ksleep_chan *chan, cfs_task_state_t state)
 {
        event_t event;
        int     result;
@@ -640,10 +753,10 @@ void ksleep_wait(struct ksleep_chan *chan)
        SLASSERT(chan->magic == KSLEEP_CHAN_MAGIC);
        SLASSERT(get_preemption_level() == 0);
 
-       event = current_thread;
+       event = current_thread();
        kspin_lock(&chan->guard);
        if (!has_hits(chan, event)) {
-               result = assert_wait(event, THREAD_UNINT);
+               result = assert_wait(event, state);
                kspin_unlock(&chan->guard);
                SLASSERT(result == THREAD_AWAKENED || result == THREAD_WAITING);
                if (result == THREAD_WAITING)
@@ -653,12 +766,16 @@ void ksleep_wait(struct ksleep_chan *chan)
        EXIT;
 }
 
-int64_t ksleep_timedwait(struct ksleep_chan *chan, uint64_t timeout)
+/*
+ * Sleep on @chan for no longer than @timeout nano-seconds. Return remaining
+ * sleep time (non-zero only if thread was waken by a signal (not currently
+ * implemented), or waitq was already in the "signalled" state).
+ */
+int64_t ksleep_timedwait(struct ksleep_chan *chan, 
+                         cfs_task_state_t state,
+                         uint64_t timeout)
 {
        event_t event;
-       int64_t     result; 
-        AbsoluteTime clock_current; 
-        AbsoluteTime clock_delay;
 
        ENTRY;
 
@@ -666,22 +783,20 @@ int64_t ksleep_timedwait(struct ksleep_chan *chan, uint64_t timeout)
        SLASSERT(chan->magic == KSLEEP_CHAN_MAGIC);
        SLASSERT(get_preemption_level() == 0);
 
-       CDEBUG(D_TRACE, "timeout: %llu\n", (long long unsigned)timeout);
-
-       event = current_thread;
-       result = 0;
+       event = current_thread();
        kspin_lock(&chan->guard);
        if (!has_hits(chan, event)) {
-               result = assert_wait(event, THREAD_UNINT);
+                int      result;
+                uint64_t expire;
+               result = assert_wait(event, state);
                if (timeout > 0) {
                        /*
                         * arm a timer. thread_set_timer()'s first argument is
                         * uint32_t, so we have to cook deadline ourselves.
                         */
-                       clock_get_uptime(&clock_current);
-                       nanoseconds_to_absolutetime(timeout, &clock_delay);
-                       ADD_ABSOLUTETIME(&clock_current, &clock_delay);
-                       thread_set_timer_deadline(clock_current);
+                       nanoseconds_to_absolutetime(timeout, &expire);
+                        clock_absolutetime_interval_to_deadline(expire, &expire);
+                       thread_set_timer_deadline(expire);
                }
                kspin_unlock(&chan->guard);
                SLASSERT(result == THREAD_AWAKENED || result == THREAD_WAITING);
@@ -689,19 +804,22 @@ int64_t ksleep_timedwait(struct ksleep_chan *chan, uint64_t timeout)
                        result = thread_block(THREAD_CONTINUE_NULL);
                thread_cancel_timer();
 
-                clock_get_uptime(&clock_delay);
-                SUB_ABSOLUTETIME(&clock_delay, &clock_current);
-                if (result == THREAD_TIMED_OUT)
-                        result = 0;
-                else {
-                        absolutetime_to_nanoseconds(clock_delay, &result);
-                        if (result < 0)
-                                result = 0;
-                }
-       } else
+               if (result == THREAD_TIMED_OUT)
+                        timeout = 0;
+               else {
+                        uint64_t now;
+                        clock_get_uptime(&now);
+                        if (expire > now)
+                               absolutetime_to_nanoseconds(expire - now, &timeout);
+                        else
+                                timeout = 0;
+               }
+       } else  {
+                /* just return timeout, because I've got event and don't need to wait */
                kspin_unlock(&chan->guard);
+        }
 
-        RETURN(result);
+        RETURN(timeout);
 }
 
 /*
@@ -710,9 +828,11 @@ int64_t ksleep_timedwait(struct ksleep_chan *chan, uint64_t timeout)
  */
 void ksleep_wake(struct ksleep_chan *chan)
 {
-       ENTRY;
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
        ksleep_wake_nr(chan, 1);
-       EXIT;
 }
 
 /*
@@ -734,7 +854,10 @@ void ksleep_wake_nr(struct ksleep_chan *chan, int nr)
        struct ksleep_link *scan;
        int result;
 
-       ENTRY;
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
 
        SLASSERT(chan != NULL);
        SLASSERT(chan->magic == KSLEEP_CHAN_MAGIC);
@@ -747,8 +870,6 @@ void ksleep_wake_nr(struct ksleep_chan *chan, int nr)
                if (forward != NULL)
                        kspin_lock(&forward->guard);
                result = thread_wakeup(scan->event);
-               CDEBUG(D_INFO, "waking 0x%x: %d\n",
-                      (unsigned int)scan->event, result);
                SLASSERT(result == KERN_SUCCESS || result == KERN_NOT_WAITING);
                if (result == KERN_NOT_WAITING) {
                        ++ scan->hits;
@@ -761,7 +882,6 @@ void ksleep_wake_nr(struct ksleep_chan *chan, int nr)
                        break;
        }
        kspin_unlock(&chan->guard);
-       EXIT;
 }
 
 void ktimer_init(struct ktimer *t, void (*func)(void *), void *arg)
@@ -807,6 +927,9 @@ static void ktimer_actor(void *arg0, void *arg1)
                t->func(t->arg);
 }
 
+extern boolean_t thread_call_func_cancel(thread_call_func_t, thread_call_param_t, boolean_t);
+extern void thread_call_func_delayed(thread_call_func_t, thread_call_param_t, uint64_t);
+
 static void ktimer_disarm_locked(struct ktimer *t)
 {
        SLASSERT(t != NULL);
@@ -815,15 +938,29 @@ static void ktimer_disarm_locked(struct ktimer *t)
        thread_call_func_cancel(ktimer_actor, t, FALSE);
 }
 
+/*
+ * Received deadline is nanoseconds, but time checked by 
+ * thread_call is absolute time (The abstime unit is equal to 
+ * the length of one bus cycle, so the duration is dependent 
+ * on the bus speed of the computer), so we need to convert
+ * nanotime to abstime by nanoseconds_to_absolutetime().
+ *
+ * Refer to _delayed_call_timer(...)
+ *
+ * if thread_call_func_delayed is not exported in the future,
+ * we can use timeout() or bsd_timeout() to replace it.
+ */
 void ktimer_arm(struct ktimer *t, u_int64_t deadline)
 {
+        cfs_time_t    abstime;
        SLASSERT(t != NULL);
        SLASSERT(t->magic == KTIMER_MAGIC);
 
        kspin_lock(&t->guard);
        ktimer_disarm_locked(t);
        t->armed = 1;
-       thread_call_func_delayed(ktimer_actor, t, *(AbsoluteTime *)&deadline);
+        nanoseconds_to_absolutetime(deadline, &abstime);
+       thread_call_func_delayed(ktimer_actor, t, deadline);
        kspin_unlock(&t->guard);
 }
 
@@ -857,6 +994,26 @@ u_int64_t ktimer_deadline(struct ktimer *t)
        return t->deadline;
 }
 
+void cfs_sync_init(void) 
+{
+#ifdef __DARWIN8__
+        /* Initialize lock group */
+        cfs_lock_grp = lck_grp_alloc_init("libcfs sync", LCK_GRP_ATTR_NULL);
+#endif
+}
+
+void cfs_sync_fini(void)
+{
+#ifdef __DARWIN8__
+        /* 
+         * XXX Liang: destroy lock group. As we haven't called lock_done
+         * for all locks, cfs_lock_grp may not be freed by kernel(reference 
+         * count > 1).
+         */
+        lck_grp_free(cfs_lock_grp);
+        cfs_lock_grp = NULL;
+#endif
+}
 /*
  * Local variables:
  * c-indentation-style: "K&R"
diff --git a/lnet/libcfs/darwin/darwin-tcpip.c b/lnet/libcfs/darwin/darwin-tcpip.c
new file mode 100644 (file)
index 0000000..c6609a7
--- /dev/null
@@ -0,0 +1,1339 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ * 
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * 
+ * This file is part of Lustre, http://www.lustre.org.
+ * 
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ * 
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * 
+ * Darwin porting library
+ * Make things easy to port
+ */ 
+
+#include <mach/mach_types.h>
+#include <sys/file.h>
+#include <sys/mount.h>
+#include <string.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sockio.h>
+#include <sys/protosw.h>
+#include <net/if.h>
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+
+/*
+ * Return a zeroed-address sockaddr_in template with sin_len and
+ * sin_family preset (BSD sockaddr_in begins with those two fields),
+ * ready to be handed to the SIOCGIF* ioctls below.
+ */
+static __inline__ struct sockaddr_in
+blank_sin()
+{
+        struct sockaddr_in  blank = { sizeof(struct sockaddr_in), AF_INET };
+        return (blank);
+}
+
+/*
+ * Free a name table returned by libcfs_ipif_enumerate(): n slots of
+ * IFNAMSIZ-byte names.  Stops at the first NULL slot, so a partially
+ * populated table (allocation failed mid-way) is handled too.
+ */
+void
+libcfs_ipif_free_enumeration (char **names, int n)
+{
+        int      i;
+
+        LASSERT (n > 0);
+
+        for (i = 0; i < n && names[i] != NULL; i++)
+                LIBCFS_FREE(names[i], IFNAMSIZ);
+                
+        LIBCFS_FREE(names, n * sizeof(*names));
+}
+
+#ifdef __DARWIN8__
+/*
+ * Darwin 8.x
+ *
+ * No hacking of kernel structures; everything goes through the public KPI.
+ */
+
+/*
+ * Query interface 'name' through ioctls on a throwaway TCP socket:
+ * *up    <- IFF_UP flag of the interface;
+ * *ip    <- host-order IPv4 address (0 when interface is down);
+ * *mask  <- host-order netmask     (0 when interface is down).
+ * Returns 0 or a negative errno.  The Darwin socket KPI returns
+ * positive errnos, hence the negation on every sock_* call.
+ */
+int
+libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask)
+{
+        struct ifreq    ifr;
+        socket_t        so;
+        __u32           val;
+        int             nob;
+        int             rc;
+
+        rc = -sock_socket(PF_INET, SOCK_STREAM, 0, 
+                          NULL, NULL, &so);
+        if (rc != 0) {
+                CERROR ("Can't create socket: %d\n", rc);
+                return rc;
+        }
+
+        /* reject names with no room for a terminating NUL */
+        nob = strnlen(name, IFNAMSIZ);
+        if (nob == IFNAMSIZ) {
+                CERROR("Interface name %s too long\n", name);
+                rc = -EINVAL;
+                goto out;
+        }
+
+        CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ);
+        bzero(&ifr, sizeof(ifr));
+        strcpy(ifr.ifr_name, name);
+        rc = -sock_ioctl (so, SIOCGIFFLAGS, &ifr);
+
+        if (rc != 0) {
+                CERROR("Can't get flags for interface %s\n", name);
+                goto out;
+        }
+        
+        if ((ifr.ifr_flags & IFF_UP) == 0) {
+                CDEBUG(D_NET, "Interface %s down\n", name);
+                *up = 0;
+                *ip = *mask = 0;
+                goto out;
+        }
+
+        *up = 1;
+
+        bzero(&ifr, sizeof(ifr));
+        strcpy(ifr.ifr_name, name);
+        *((struct sockaddr_in *)&ifr.ifr_addr) = blank_sin();
+        rc = -sock_ioctl(so, SIOCGIFADDR, &ifr);
+
+        if (rc != 0) {
+                CERROR("Can't get IP address for interface %s\n", name);
+                goto out;
+        }
+        
+        val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
+        *ip = ntohl(val);
+
+        bzero(&ifr, sizeof(ifr));
+        strcpy(ifr.ifr_name, name);
+        *((struct sockaddr_in *)&ifr.ifr_addr) = blank_sin();
+        rc = -sock_ioctl(so, SIOCGIFNETMASK, &ifr);
+
+        if (rc != 0) {
+                CERROR("Can't get netmask for interface %s\n", name);
+                goto out;
+        }
+
+        val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
+        *mask = ntohl(val);
+out:
+        sock_close(so);
+        return rc;
+}
+
+/*
+ * Build a table of interface names in *namesp and return the count
+ * (or a negative errno).  The caller releases the table with
+ * libcfs_ipif_free_enumeration().  SIOCGIFCONF cannot be issued from
+ * kernel space on Darwin 8 (see the XXX below), so interfaces en0..en15
+ * are probed individually with SIOCGIFFLAGS instead.
+ */
+int
+libcfs_ipif_enumerate (char ***namesp)
+{
+        /* Allocate and fill in 'names', returning # interfaces/error */
+        char           **names;
+        int             toobig;
+        int             nalloc;
+        int             nfound;
+        socket_t        so;
+        struct ifreq   *ifr;
+        struct ifconf   ifc;
+        int             rc;
+        int             nob;
+        int             i;
+
+        rc = -sock_socket(PF_INET, SOCK_STREAM, 0, 
+                          NULL, NULL, &so);
+        if (rc != 0) {
+                CERROR ("Can't create socket: %d\n", rc);
+                return (rc);
+        }
+
+        nalloc = 16;    /* first guess at max interfaces */
+        toobig = 0;
+        for (;;) {
+                /* cap the request buffer at one page */
+                if (nalloc * sizeof(*ifr) > CFS_PAGE_SIZE) {
+                        toobig = 1;
+                        nalloc = CFS_PAGE_SIZE/sizeof(*ifr);
+                        CWARN("Too many interfaces: only enumerating first %d\n",
+                              nalloc);
+                }
+
+                LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr));
+                if (ifr == NULL) {
+                        CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc);
+                                rc = -ENOMEM;
+                        goto out0;
+                }
+                                
+                ifc.ifc_buf = (char *)ifr;
+                ifc.ifc_len = nalloc * sizeof(*ifr);
+                                        
+#if 1
+                /*
+                 * XXX Liang:
+                 * sock_ioctl(..., SIOCGIFCONF, ...) is not supposed to be used in
+                 * kernel space because it always try to copy result to userspace. 
+                 * So we can't get interfaces name by sock_ioctl(...,SIOCGIFCONF,...).
+                 * I've created a bug for Apple, let's wait...
+                 */
+                nfound = 0;
+                for (i = 0; i < 16; i++) {
+                        struct ifreq    en;
+                        bzero(&en, sizeof(en));
+                        snprintf(en.ifr_name, IFNAMSIZ, "en%d", i);
+                        rc = -sock_ioctl (so, SIOCGIFFLAGS, &en);
+                        if (rc != 0)
+                                continue;
+                        strcpy(ifr[nfound++].ifr_name, en.ifr_name);
+                }
+
+#else           /* NOT in using now */
+                rc = -sock_ioctl(so, SIOCGIFCONF, (caddr_t)&ifc);
+                                
+                if (rc < 0) {
+                        CERROR ("Error %d enumerating interfaces\n", rc);
+                        goto out1;
+                }
+
+                nfound = ifc.ifc_len/sizeof(*ifr);
+                LASSERT (nfound <= nalloc);
+#endif
+
+                /* buffer big enough (or already capped): stop growing */
+                if (nfound < nalloc || toobig)
+                        break;
+
+                LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+                nalloc *= 2;
+        }
+        if (nfound == 0)
+                goto out1;
+
+        LIBCFS_ALLOC(names, nfound * sizeof(*names));
+        if (names == NULL) {
+                rc = -ENOMEM;
+                goto out1;
+        }
+        /* NULL out all names[i] */
+        memset (names, 0, nfound * sizeof(*names));
+
+        for (i = 0; i < nfound; i++) {
+
+                nob = strnlen (ifr[i].ifr_name, IFNAMSIZ);
+                if (nob == IFNAMSIZ) {
+                        /* no space for terminating NULL */
+                        CERROR("interface name %.*s too long (%d max)\n",
+                               nob, ifr[i].ifr_name, IFNAMSIZ);
+                        rc = -ENAMETOOLONG;
+                        goto out2;
+                }
+
+                LIBCFS_ALLOC(names[i], IFNAMSIZ);
+                if (names[i] == NULL) {
+                        rc = -ENOMEM;
+                        goto out2;
+                }
+
+                memcpy(names[i], ifr[i].ifr_name, nob);
+                names[i][nob] = 0;
+        }
+
+        *namesp = names;
+        rc = nfound;
+
+out2:
+        if (rc < 0)
+                libcfs_ipif_free_enumeration(names, nfound);
+out1:
+        LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+out0:
+        sock_close(so);
+        return rc;
+
+}
+
+/*
+ * Public entry point for the socket upcall.
+ *
+ * In Darwin 8.0 an so_upcall can only be installed at socket
+ * create/accept time, so libcfs_sock_upcall() is installed on every
+ * socket we create or accept; it forwards to the user-provided upcall,
+ * which may be set at any time after create/accept.
+ */
+static void libcfs_sock_upcall(socket_t so, void* arg, int waitf)
+{
+        cfs_socket_t    *sock;
+
+        /* 'arg' is the cfs_socket_t we registered at create/accept time */
+        sock = (cfs_socket_t *)arg;
+        LASSERT(sock->s_magic == CFS_SOCK_MAGIC);
+
+        /* forward only when the user has installed an upcall */
+        if ((sock->s_flags & CFS_SOCK_UPCALL) != 0 && sock->s_upcall != NULL)
+                sock->s_upcall(so, sock->s_upcallarg, waitf);
+        return;
+}
+
+/*
+ * Install the user-level upcall (callback + argument) and enable its
+ * dispatch from libcfs_sock_upcall().
+ */
+void libcfs_sock_set_cb(cfs_socket_t *sock, so_upcall callback, void *arg)
+{
+        sock->s_upcall = callback;
+        sock->s_upcallarg = arg;
+        sock->s_flags |= CFS_SOCK_UPCALL;
+        return;
+}
+
+/*
+ * Disable upcall dispatch and clear the user callback installed by
+ * libcfs_sock_set_cb().
+ */
+void libcfs_sock_reset_cb(cfs_socket_t *sock)
+{
+        sock->s_flags &= ~CFS_SOCK_UPCALL;
+        sock->s_upcall = NULL;
+        sock->s_upcallarg = NULL;
+        return;
+}
+
+/*
+ * Allocate a cfs_socket_t and create its backing TCP socket with
+ * SO_REUSEADDR set; optionally bind to local_ip/local_port (0 = any).
+ * Returns 0 or -ve errno.  *fatal is cleared only for EADDRINUSE on
+ * bind, so callers can retry with a different port.  On error the
+ * partially-built socket is torn down here.
+ */
+static int
+libcfs_sock_create (cfs_socket_t **sockp, int *fatal,
+                    __u32 local_ip, int local_port)
+{
+        struct sockaddr_in  locaddr;
+        cfs_socket_t    *sock;
+        int             option;
+        int             optlen;
+        int             rc;
+
+        /* All errors are fatal except bind failure if the port is in use */
+        *fatal = 1;
+
+        sock = _MALLOC(sizeof(cfs_socket_t), M_TEMP, M_WAITOK|M_ZERO);
+        if (!sock) {
+                CERROR("Can't allocate cfs_socket.\n");
+                return -ENOMEM;
+        }
+        *sockp = sock;
+        sock->s_magic = CFS_SOCK_MAGIC;
+
+        /* register libcfs_sock_upcall now: Darwin 8 only accepts an
+         * upcall at socket creation time */
+        rc = -sock_socket(PF_INET, SOCK_STREAM, 0, 
+                          libcfs_sock_upcall, sock, &C2B_SOCK(sock));
+        if (rc != 0) 
+                goto out;
+        option = 1;
+        optlen = sizeof(option);
+        rc = -sock_setsockopt(C2B_SOCK(sock), SOL_SOCKET, 
+                              SO_REUSEADDR, &option, optlen);
+        if (rc != 0)
+                goto out;
+
+        /* can't specify a local port without a local IP */
+        LASSERT (local_ip == 0 || local_port != 0);
+
+        if (local_ip != 0 || local_port != 0) {
+                bzero (&locaddr, sizeof (locaddr));
+                locaddr.sin_len = sizeof(struct sockaddr_in);
+                locaddr.sin_family = AF_INET;
+                locaddr.sin_port = htons (local_port);
+                locaddr.sin_addr.s_addr = (local_ip != 0) ? htonl(local_ip) : INADDR_ANY;
+                rc = -sock_bind(C2B_SOCK(sock), (struct sockaddr *)&locaddr);
+                if (rc == -EADDRINUSE) {
+                        CDEBUG(D_NET, "Port %d already in use\n", local_port);
+                        *fatal = 0;
+                        goto out;
+                }
+                if (rc != 0) {
+                        CERROR("Error trying to bind to port %d: %d\n",
+                               local_port, rc);
+                        goto out;
+                }
+        }
+        return 0;
+out:
+        if (C2B_SOCK(sock) != NULL) 
+                sock_close(C2B_SOCK(sock));
+        FREE(sock, M_TEMP);
+        return rc;
+}
+
+/*
+ * Create a socket bound to local_ip/local_port and put it into listen
+ * state with the given backlog.  Returns 0 with *sockp set, or -ve
+ * errno (the socket is destroyed on failure).
+ */
+int
+libcfs_sock_listen (cfs_socket_t **sockp,
+                   __u32 local_ip, int local_port, int backlog)
+{
+        cfs_socket_t    *sock;
+        int             fatal;
+        int             rc;
+
+        rc = libcfs_sock_create(&sock, &fatal, local_ip, local_port);
+        if (rc != 0)  {
+                if (!fatal)
+                        CERROR("Can't create socket: port %d already in use\n",
+                                local_port);
+                return rc;
+
+        }
+        rc = -sock_listen(C2B_SOCK(sock), backlog);
+        if (rc == 0) {
+                *sockp = sock;
+                return 0;
+        }
+
+        /* listen failed: tear down what libcfs_sock_create() built */
+        if (C2B_SOCK(sock) != NULL) 
+                sock_close(C2B_SOCK(sock));
+        FREE(sock, M_TEMP);
+        return rc;
+}
+
+/*
+ * Block until a connection arrives on listening socket 'sock' and wrap
+ * it in a fresh cfs_socket_t in *newsockp.  Returns 0 or -ve errno;
+ * -EAGAIN is faked when the wait was broken by
+ * libcfs_sock_abort_accept() (CFS_SOCK_DOWN set).
+ */
+int
+libcfs_sock_accept (cfs_socket_t **newsockp, cfs_socket_t *sock)
+{
+        cfs_socket_t   *newsock;
+        int             rc;
+
+        newsock = _MALLOC(sizeof(cfs_socket_t), M_TEMP, M_WAITOK|M_ZERO);
+        if (!newsock) {
+                CERROR("Can't allocate cfs_socket.\n");
+                return -ENOMEM;
+        }
+        newsock->s_magic = CFS_SOCK_MAGIC;
+        /*
+         * thread will sleep in sock_accept by calling of msleep(), 
+         * it can be interrupted because msleep() use PCATCH as argument.
+         */
+        rc = -sock_accept(C2B_SOCK(sock), NULL, 0, 0, 
+                          libcfs_sock_upcall, newsock, &C2B_SOCK(newsock));
+        if (rc) {
+                if (C2B_SOCK(newsock) != NULL) 
+                        sock_close(C2B_SOCK(newsock));
+                FREE(newsock, M_TEMP);
+                if ((sock->s_flags & CFS_SOCK_DOWN) != 0)
+                        /* shutdown by libcfs_sock_abort_accept(), fake 
+                         * error number for lnet_acceptor() */
+                        rc = -EAGAIN;
+                return rc;
+        }
+        *newsockp = newsock;
+        return 0;
+}
+
+/*
+ * Wake a thread blocked in libcfs_sock_accept() by shutting the
+ * listening socket down (see the discussion below for why this is the
+ * only available mechanism).  CFS_SOCK_DOWN tells the accept path the
+ * wakeup was deliberate.
+ */
+void
+libcfs_sock_abort_accept (cfs_socket_t *sock)
+{
+        /*
+         * XXX Liang: 
+         *
+         * we want to wakeup thread blocked by sock_accept, but we don't
+         * know the address where thread is sleeping on, so we cannot 
+         * wakeup it directly.
+         * The thread slept in sock_accept will be waken up while:
+         * 1. interrupt by signal
+         * 2. new connection is coming (sonewconn)
+         * 3. disconnecting of the socket (soisconnected)
+         * 
+         * Cause we can't send signal to a thread directly(no KPI), so the 
+         * only thing can be done here is disconnect the socket (by 
+         * sock_shutdown() or sth else? ).
+         *
+         * Shutdown request of socket with SHUT_WR or SHUT_RDWR will
+         * be issured to the protocol.
+         * sock_shutdown()->tcp_usr_shutdown()->tcp_usrclosed()->
+         * tcp_close()->soisdisconnected(), it will wakeup thread by
+         * wakeup((caddr_t)&so->so_timeo);
+         */
+        sock->s_flags |= CFS_SOCK_DOWN;
+        sock_shutdown(C2B_SOCK(sock), SHUT_RDWR);
+}
+
+/*
+ * Receive exactly 'nob' bytes into 'buffer' within 'timeout' seconds.
+ * Loops over partial reads, charging elapsed time against the budget
+ * 'to'.  Returns 0 on success, -EAGAIN when the budget runs out, or
+ * another -ve errno.
+ */
+int
+libcfs_sock_read (cfs_socket_t *sock, void *buffer, int nob, int timeout)
+{
+        size_t          rcvlen;
+        int             rc;
+        cfs_duration_t  to = cfs_time_seconds(timeout);
+        cfs_time_t      then;
+        struct timeval  tv;
+
+        LASSERT(nob > 0);
+
+        for (;;) {
+                struct iovec  iov = {
+                        .iov_base = buffer,
+                        .iov_len  = nob
+                };
+                struct  msghdr  msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &iov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = 0,
+                };
+                /* bound each receive by the remaining time budget */
+                cfs_duration_usec(to, &tv);
+                rc = -sock_setsockopt(C2B_SOCK(sock), SOL_SOCKET, SO_RCVTIMEO,
+                                      &tv, sizeof(tv));
+                if (rc != 0) {
+                        CERROR("Can't set socket recv timeout "
+                                        "%ld.%06d: %d\n",
+                                        (long)tv.tv_sec, (int)tv.tv_usec, rc);
+                        return rc;
+                }
+
+                then = cfs_time_current();
+                rc = -sock_receive(C2B_SOCK(sock), &msg, 0, &rcvlen);
+                to -= cfs_time_current() - then;
+
+                /* NOTE(review): rcvlen is used below even on -EWOULDBLOCK;
+                 * assumes sock_receive always stores the partial count —
+                 * confirm against the Darwin KPI */
+                if (rc != 0 && rc != -EWOULDBLOCK)
+                        return rc;
+                if (rcvlen == nob)
+                        return 0;
+
+                if (to <= 0)
+                        return -EAGAIN;
+
+                buffer = ((char *)buffer) + rcvlen;
+                nob -= rcvlen;
+        }
+        return 0;
+}
+
+/*
+ * Send exactly 'nob' bytes from 'buffer' within 'timeout' seconds.
+ * timeout == 0 means a single non-blocking attempt per iteration
+ * (MSG_DONTWAIT).  Returns 0 on success, -EAGAIN when the time budget
+ * is exhausted, or another -ve errno.
+ */
+int
+libcfs_sock_write (cfs_socket_t *sock, void *buffer, int nob, int timeout)
+{
+        size_t          sndlen;
+        int             rc;
+        cfs_duration_t  to = cfs_time_seconds(timeout);
+        cfs_time_t      then;
+        struct timeval  tv;
+
+        LASSERT(nob > 0);
+
+        for (;;) {
+                struct iovec  iov = {
+                        .iov_base = buffer,
+                        .iov_len  = nob
+                };
+                struct  msghdr  msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &iov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = (timeout == 0) ? MSG_DONTWAIT : 0,
+                };
+
+                if (timeout != 0) {
+                        /* bound each send by the remaining time budget */
+                        cfs_duration_usec(to, &tv);
+                        rc = -sock_setsockopt(C2B_SOCK(sock), SOL_SOCKET, SO_SNDTIMEO,
+                                              &tv, sizeof(tv));
+                        if (rc != 0) {
+                                CERROR("Can't set socket send timeout "
+                                       "%ld.%06d: %d\n",
+                                       (long)tv.tv_sec, (int)tv.tv_usec, rc);
+                                return rc;
+                        }
+                }
+
+                then = cfs_time_current();
+                rc = -sock_send(C2B_SOCK(sock), &msg, 
+                                ((timeout == 0) ? MSG_DONTWAIT : 0), &sndlen);
+                to -= cfs_time_current() - then;
+
+                if (rc != 0 && rc != -EWOULDBLOCK)
+                        return rc;
+                if (sndlen == nob)
+                        return 0;
+
+                if (to <= 0)
+                        return -EAGAIN;
+                buffer = ((char *)buffer) + sndlen;
+                nob -= sndlen;
+        }
+        return 0;
+
+}
+
+/*
+ * Fetch the peer (remote != 0) or local address of 'sock'.  *ip and
+ * *port are filled in host byte order; either pointer may be NULL to
+ * skip it.  Returns 0 or -ve errno.
+ */
+int
+libcfs_sock_getaddr (cfs_socket_t *sock, int remote, __u32 *ip, int *port)
+{
+        struct sockaddr_in sin;
+        int                rc;
+
+        if (remote != 0) 
+                /* Get remote address */
+                rc = -sock_getpeername(C2B_SOCK(sock), (struct sockaddr *)&sin, sizeof(sin));
+        else 
+                /* Get local address */
+                rc = -sock_getsockname(C2B_SOCK(sock), (struct sockaddr *)&sin, sizeof(sin));
+        if (rc != 0) {
+                CERROR ("Error %d getting sock %s IP/port\n",
+                         rc, remote ? "peer" : "local");
+                return rc;
+        }
+
+        if (ip != NULL)
+                *ip = ntohl (sin.sin_addr.s_addr);
+
+        if (port != NULL)
+                *port = ntohs (sin.sin_port);
+        return 0;
+}
+
+/*
+ * Set the socket send and/or receive buffer sizes; a size of 0 leaves
+ * that buffer unchanged.  Returns 0 or -ve errno (stops at the first
+ * failing option).
+ */
+int
+libcfs_sock_setbuf (cfs_socket_t *sock, int txbufsize, int rxbufsize)
+{
+        int                 option;
+        int                 rc;
+        
+        if (txbufsize != 0) {
+                option = txbufsize;
+                rc = -sock_setsockopt(C2B_SOCK(sock), SOL_SOCKET, SO_SNDBUF,
+                                     (char *)&option, sizeof (option));
+                if (rc != 0) {
+                        CERROR ("Can't set send buffer %d: %d\n",
+                                option, rc);
+                        return (rc);
+                } 
+        } 
+        
+        if (rxbufsize != 0) {
+                option = rxbufsize;
+                rc = -sock_setsockopt (C2B_SOCK(sock), SOL_SOCKET, SO_RCVBUF,
+                                      (char *)&option, sizeof (option));
+                if (rc != 0) {
+                        CERROR ("Can't set receive buffer %d: %d\n",
+                                option, rc);
+                        return (rc);
+                }
+        }
+        return 0;
+}
+
+/*
+ * Read back the socket send and/or receive buffer sizes; a NULL
+ * pointer skips that buffer.  Returns 0 or -ve errno.
+ */
+int
+libcfs_sock_getbuf (cfs_socket_t *sock, int *txbufsize, int *rxbufsize)
+{
+        int                 option;
+        int                 optlen;
+        int                 rc; 
+        
+        if (txbufsize != NULL) {
+                optlen = sizeof(option);
+                rc = -sock_getsockopt(C2B_SOCK(sock), SOL_SOCKET, SO_SNDBUF,
+                                (char *)&option, &optlen);
+                if (rc != 0) {
+                        CERROR ("Can't get send buffer size: %d\n", rc);
+                        return (rc);
+                }
+                *txbufsize = option;
+        } 
+        
+        if (rxbufsize != NULL) {
+                optlen = sizeof(option);
+                rc = -sock_getsockopt (C2B_SOCK(sock), SOL_SOCKET, SO_RCVBUF,
+                                (char *)&option, &optlen);
+                if (rc != 0) {
+                        CERROR ("Can't get receive buffer size: %d\n", rc);
+                        return (rc);
+                }
+                *rxbufsize = option;
+        }
+        return 0;
+}
+
+/*
+ * Shut down and close the underlying BSD socket (if one was created)
+ * and free the cfs_socket_t wrapper itself.
+ */
+void
+libcfs_sock_release (cfs_socket_t *sock)
+{
+        if (C2B_SOCK(sock) != NULL) {
+                sock_shutdown(C2B_SOCK(sock), 2);
+                sock_close(C2B_SOCK(sock));
+        }
+        FREE(sock, M_TEMP);
+}
+
+/*
+ * Create a socket bound to local_ip/local_port and connect it to
+ * peer_ip/peer_port (all addresses in host order).  Returns 0 with
+ * *sockp set, or -ve errno; *fatal is cleared for the retryable
+ * address errors (EADDRNOTAVAIL/EADDRINUSE).
+ */
+int
+libcfs_sock_connect (cfs_socket_t **sockp, int *fatal,
+                     __u32 local_ip, int local_port,
+                     __u32 peer_ip, int peer_port)
+{
+        cfs_socket_t       *sock;
+        struct sockaddr_in  srvaddr;
+        int                 rc; 
+        
+        rc = libcfs_sock_create(&sock, fatal, local_ip, local_port);
+        if (rc != 0)
+                return rc;
+
+        bzero(&srvaddr, sizeof(srvaddr));
+        srvaddr.sin_len = sizeof(struct sockaddr_in);
+        srvaddr.sin_family = AF_INET;
+        srvaddr.sin_port = htons(peer_port);
+        srvaddr.sin_addr.s_addr = htonl(peer_ip);
+
+        rc = -sock_connect(C2B_SOCK(sock), (struct sockaddr *)&srvaddr, 0);
+        if (rc == 0) {
+                *sockp = sock;
+                return 0;
+        }
+
+        /* transient address problems are not fatal: caller may retry */
+        *fatal = !(rc == -EADDRNOTAVAIL || rc == -EADDRINUSE);
+        CDEBUG(*fatal ? D_NETERROR : D_NET,
+               "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
+               HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port);
+
+        libcfs_sock_release(sock);
+        return rc;
+}
+
+#else   /* !__DARWIN8__ */
+
+/*
+ * To use a bigger buffer for a socket:
+ * 1. Increase nmbclusters (cannot be increased by sysctl because it is
+ *    read-only, so we must patch the kernel).
+ * 2. Increase net.inet.tcp.reass.maxsegments
+ * 3. Increase net.inet.tcp.sendspace
+ * 4. Increase net.inet.tcp.recvspace
+ * 5. Increase kern.ipc.maxsockbuf
+ */
+#define KSOCK_MAX_BUF        (1152*1024)
+
+/*
+ * Pre-Darwin8 variant of libcfs_ipif_query(): uses the BSD-internal
+ * socreate()/ifioctl() under the network funnel (CFS_NET_IN/CFS_NET_EX)
+ * instead of the socket KPI.  These calls return +ve errnos, hence the
+ * final 'return -rc'.  Semantics match the Darwin8 version above.
+ */
+int
+libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask)
+{
+        struct socket      *so;
+        struct ifreq       ifr;
+        int                nob;
+        int                rc;
+        __u32              val;
+        CFS_DECL_FUNNEL_DATA;
+
+        CFS_NET_IN;
+        rc = socreate(PF_INET, &so, SOCK_STREAM, 0);
+        CFS_NET_EX;
+        if (rc != 0) {
+                CERROR ("Can't create socket: %d\n", rc);
+                return (-rc);
+        }
+        nob = strnlen(name, IFNAMSIZ);
+        if (nob == IFNAMSIZ) {
+                CERROR("Interface name %s too long\n", name);
+                rc = -EINVAL;
+                goto out;
+        }
+
+        CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ);
+        strcpy(ifr.ifr_name, name);
+        CFS_NET_IN;
+        rc = ifioctl(so, SIOCGIFFLAGS, (caddr_t)&ifr, current_proc());
+        CFS_NET_EX;
+
+        if (rc != 0) {
+                CERROR("Can't get flags for interface %s\n", name);
+                goto out;
+        }
+        if ((ifr.ifr_flags & IFF_UP) == 0) {
+                CDEBUG(D_NET, "Interface %s down\n", name);
+                *up = 0;
+                *ip = *mask = 0;
+                goto out;
+        }
+       
+        *up = 1;
+        strcpy(ifr.ifr_name, name);
+        *((struct sockaddr_in *)&ifr.ifr_addr) = blank_sin();
+        CFS_NET_IN;
+        rc = ifioctl(so, SIOCGIFADDR, (caddr_t)&ifr, current_proc());
+        CFS_NET_EX;
+
+        if (rc != 0) {
+                CERROR("Can't get IP address for interface %s\n", name);
+                goto out;
+        }
+
+        val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
+        *ip = ntohl(val);
+
+        strcpy(ifr.ifr_name, name);
+        *((struct sockaddr_in *)&ifr.ifr_addr) = blank_sin();
+        CFS_NET_IN;
+        rc = ifioctl(so, SIOCGIFNETMASK, (caddr_t)&ifr, current_proc());
+        CFS_NET_EX;
+
+        if (rc != 0) {
+                CERROR("Can't get netmask for interface %s\n", name);
+                goto out;
+        }
+
+        val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
+        *mask = ntohl(val);
+out:
+        CFS_NET_IN;
+        soclose(so);
+        CFS_NET_EX;
+        return -rc;
+}
+
+/*
+ * Pre-Darwin8 variant of libcfs_ipif_enumerate(): here SIOCGIFCONF is
+ * usable (ifioctl() under the funnel), so the interface table is
+ * obtained directly instead of probing en0..en15.  Returns the count
+ * of names in *namesp, or -ve errno; free the table with
+ * libcfs_ipif_free_enumeration().
+ */
+int
+libcfs_ipif_enumerate (char ***namesp)
+{
+        /* Allocate and fill in 'names', returning # interfaces/error */
+        char           **names;
+        int             toobig;
+        int             nalloc;
+        int             nfound;
+        struct socket  *so;
+        struct ifreq   *ifr;
+        struct ifconf   ifc;
+        int             rc;
+        int             nob;
+        int             i;
+        CFS_DECL_FUNNEL_DATA;
+
+        CFS_NET_IN;
+        rc = socreate(PF_INET, &so, SOCK_STREAM, 0);
+        CFS_NET_EX;
+        if (rc != 0) {
+                CERROR ("Can't create socket: %d\n", rc);
+                return (-rc);
+        }
+
+        nalloc = 16;    /* first guess at max interfaces */
+        toobig = 0;
+        for (;;) {
+                /* cap the request buffer at one page */
+                if (nalloc * sizeof(*ifr) > CFS_PAGE_SIZE) {
+                        toobig = 1;
+                        nalloc = CFS_PAGE_SIZE/sizeof(*ifr);
+                        CWARN("Too many interfaces: only enumerating first %d\n",
+                              nalloc);
+                }
+
+                LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr));
+                if (ifr == NULL) {
+                        CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc);
+                                rc = -ENOMEM;
+                        goto out0;
+                }
+                                
+                ifc.ifc_buf = (char *)ifr;
+                ifc.ifc_len = nalloc * sizeof(*ifr);
+                                        
+                CFS_NET_IN;
+                rc = -ifioctl(so, SIOCGIFCONF, (caddr_t)&ifc, current_proc());
+                CFS_NET_EX;
+                                
+                if (rc < 0) {
+                        CERROR ("Error %d enumerating interfaces\n", rc);
+                        goto out1;
+                }
+
+                nfound = ifc.ifc_len/sizeof(*ifr);
+                LASSERT (nfound <= nalloc);
+
+                /* buffer big enough (or already capped): stop growing */
+                if (nfound < nalloc || toobig)
+                        break;
+
+                LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+                nalloc *= 2;
+        }
+        if (nfound == 0)
+                goto out1;
+
+        LIBCFS_ALLOC(names, nfound * sizeof(*names));
+        if (names == NULL) {
+                rc = -ENOMEM;
+                goto out1;
+        }
+        /* NULL out all names[i] */
+        memset (names, 0, nfound * sizeof(*names));
+
+        for (i = 0; i < nfound; i++) {
+
+                nob = strnlen (ifr[i].ifr_name, IFNAMSIZ);
+                if (nob == IFNAMSIZ) {
+                        /* no space for terminating NULL */
+                        CERROR("interface name %.*s too long (%d max)\n",
+                               nob, ifr[i].ifr_name, IFNAMSIZ);
+                        rc = -ENAMETOOLONG;
+                        goto out2;
+                }
+
+                LIBCFS_ALLOC(names[i], IFNAMSIZ);
+                if (names[i] == NULL) {
+                        rc = -ENOMEM;
+                        goto out2;
+                }
+
+                memcpy(names[i], ifr[i].ifr_name, nob);
+                names[i][nob] = 0;
+        }
+
+        *namesp = names;
+        rc = nfound;
+
+out2:
+        if (rc < 0)
+                libcfs_ipif_free_enumeration(names, nfound);
+out1:
+        LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+out0:
+        CFS_NET_IN;
+        soclose(so);
+        CFS_NET_EX;
+        return rc;
+}
+
+/*
+ * Pre-Darwin8 variant of libcfs_sock_create(): builds a raw BSD
+ * 'struct socket' with socreate()/sosetopt()/sobind() under the
+ * network funnel.  SO_REUSEADDR is set, then the socket is optionally
+ * bound to local_ip/local_port (0 = any).  *fatal is cleared only for
+ * EADDRINUSE on bind.  BSD calls return +ve errnos, hence 'return -rc'.
+ */
+static int
+libcfs_sock_create (struct socket **sockp, int *fatal,
+                    __u32 local_ip, int local_port)
+{
+        struct sockaddr_in  locaddr;
+        struct socket      *so;
+        struct sockopt      sopt;
+        int                 option;
+        int                 rc;
+        CFS_DECL_FUNNEL_DATA;
+
+        *fatal = 1;
+        CFS_NET_IN;
+        rc = socreate(PF_INET, &so, SOCK_STREAM, 0);
+        CFS_NET_EX;
+        if (rc != 0) {
+                CERROR ("Can't create socket: %d\n", rc);
+                return (-rc);
+        }
+        
+        bzero(&sopt, sizeof sopt);
+        option = 1;
+        sopt.sopt_level = SOL_SOCKET;
+        sopt.sopt_name = SO_REUSEADDR;
+        sopt.sopt_val = &option;
+        sopt.sopt_valsize = sizeof(option);
+        CFS_NET_IN;
+        rc = sosetopt(so, &sopt);
+        if (rc != 0) {
+                CFS_NET_EX;
+                CERROR ("Can't set sock reuse address: %d\n", rc);
+                goto out;
+        }
+        /* can't specify a local port without a local IP */
+        LASSERT (local_ip == 0 || local_port != 0);
+
+        if (local_ip != 0 || local_port != 0) {
+                bzero (&locaddr, sizeof (locaddr));
+                locaddr.sin_len = sizeof(struct sockaddr_in);
+                locaddr.sin_family = AF_INET;
+                locaddr.sin_port = htons (local_port);
+                locaddr.sin_addr.s_addr = (local_ip != 0) ? htonl(local_ip) :
+                                                            INADDR_ANY;
+
+                rc = sobind(so, (struct sockaddr *)&locaddr);
+                if (rc == EADDRINUSE) {
+                        CFS_NET_EX;
+                        CDEBUG(D_NET, "Port %d already in use\n", local_port);
+                        *fatal = 0;
+                        goto out;
+                }
+                if (rc != 0) {
+                        CFS_NET_EX;
+                        CERROR ("Can't bind to local IP Address %u.%u.%u.%u: %d\n",
+                        HIPQUAD(local_ip), rc);
+                        goto out;
+                }
+        }
+        *sockp = so;
+        return 0;
+out:
+        CFS_NET_IN;
+        soclose(so);
+        CFS_NET_EX;
+        return -rc;
+}
+
+/*
+ * Pre-Darwin8 variant of libcfs_sock_listen(): create/bind via
+ * libcfs_sock_create() above, then solisten() under the funnel.
+ * Returns 0 with *sockp set, or -ve errno (socket closed on failure).
+ */
+int
+libcfs_sock_listen (struct socket **sockp,
+                    __u32 local_ip, int local_port, int backlog)
+{
+        int      fatal;
+        int      rc;
+        CFS_DECL_FUNNEL_DATA;
+
+        rc = libcfs_sock_create(sockp, &fatal, local_ip, local_port);
+        if (rc != 0) {
+                if (!fatal)
+                        CERROR("Can't create socket: port %d already in use\n",
+                               local_port);
+                return rc;
+        }
+        CFS_NET_IN;
+        rc = solisten(*sockp, backlog);
+        CFS_NET_EX;
+        if (rc == 0)
+                return 0;
+        CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
+        CFS_NET_IN;
+        soclose(*sockp);
+        CFS_NET_EX;
+        return -rc;
+}
+
+/*
+ * Accept an incoming connection on listening socket 'sock'.
+ *
+ * Blocks (interruptibly) until a completed connection is queued,
+ * detaches it from the completion queue and returns it in *newsockp.
+ * Returns 0 on success or a negative errno:
+ *   -EINVAL       sock is not listening (SO_ACCEPTCONN clear)
+ *   -EWOULDBLOCK  sock is non-blocking and nothing is pending
+ *   -ECONNABORTED / so_error / tsleep() error otherwise
+ */
+int
+libcfs_sock_accept (struct socket **newsockp, struct socket *sock)
+{
+        struct socket *so;
+        struct sockaddr *sa;
+        int error, s;
+        CFS_DECL_FUNNEL_DATA;
+
+        CFS_NET_IN;
+        s = splnet();
+        if ((sock->so_options & SO_ACCEPTCONN) == 0) {
+                splx(s);
+                CFS_NET_EX;
+                return (-EINVAL);
+        }
+
+        if ((sock->so_state & SS_NBIO) && sock->so_comp.tqh_first == NULL) {
+                splx(s);
+                CFS_NET_EX;
+                return (-EWOULDBLOCK);
+        }
+
+        error = 0;
+        while (TAILQ_EMPTY(&sock->so_comp) && sock->so_error == 0) {
+                if (sock->so_state & SS_CANTRCVMORE) {
+                        sock->so_error = ECONNABORTED;
+                        break;
+                }
+                /* woken via wakeup(&sock->so_timeo) (see
+                 * libcfs_sock_abort_accept); PCATCH allows signals */
+                error = tsleep((caddr_t)&sock->so_timeo, PSOCK | PCATCH,
+                                "accept", 0);
+                if (error) {
+                        splx(s);
+                        CFS_NET_EX;
+                        return (-error);
+                }
+        }
+        if (sock->so_error) {
+                error = sock->so_error;
+                sock->so_error = 0;
+                splx(s);
+                CFS_NET_EX;
+                return (-error);
+        }
+
+        /*
+         * At this point we know that there is at least one connection
+         * ready to be accepted. Remove it from the queue prior to
+         * allocating the file descriptor for it since falloc() may
+         * block allowing another process to accept the connection
+         * instead.
+         */
+        so = TAILQ_FIRST(&sock->so_comp);
+        TAILQ_REMOVE(&sock->so_comp, so, so_list);
+        sock->so_qlen--;
+
+        so->so_state &= ~SS_COMP;
+        so->so_head = NULL;
+        sa = NULL;
+        (void) soaccept(so, &sa);
+
+        *newsockp = so;
+        /* soaccept() may leave sa NULL; guard the FREE() as the other
+         * M_SONAME users in this file do */
+        if (sa != NULL)
+                FREE(sa, M_SONAME);
+        splx(s);
+        CFS_NET_EX;
+        return (-error);
+}
+
+/*
+ * Wake any thread blocked in libcfs_sock_accept() on this socket
+ * (it sleeps on &sock->so_timeo), letting a pending accept abort.
+ */
+void
+libcfs_sock_abort_accept (struct socket *sock)
+{
+        wakeup(&sock->so_timeo);
+}
+
+/*
+ * Write 'nob' bytes from 'buffer' to 'sock', looping on sosend()
+ * until everything is sent.  ERESTART/EINTR/EWOULDBLOCK are retried
+ * as long as the last pass made progress; any other error (or an
+ * interrupt with zero progress) returns a negative errno.
+ * Returns 0 on success.
+ *
+ * XXX Liang: timeout for write is not supported yet ('timeout' is
+ * accepted but ignored).
+ */
+int
+libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout)
+{
+        int            rc;
+        CFS_DECL_NET_DATA;
+
+        while (nob > 0) {
+                struct iovec  iov = {
+                        .iov_base = buffer,
+                        .iov_len  = nob
+                };
+                struct  uio suio = {
+                        .uio_iov        = &iov,
+                        .uio_iovcnt     = 1,
+                        .uio_offset     = 0,
+                        .uio_resid      = nob,
+                        .uio_segflg     = UIO_SYSSPACE,
+                        .uio_rw         = UIO_WRITE,
+                        .uio_procp      = NULL
+                };
+
+                CFS_NET_IN;
+                rc = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0, 0);
+                CFS_NET_EX;
+
+                if (rc != 0) {
+                        /* retry interrupted sends only if this pass
+                         * actually transferred some bytes */
+                        if (suio.uio_resid != nob &&
+                            (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK))
+                                rc = 0;
+                        if (rc != 0)
+                                return -rc;
+                        /* advance past what was sent and go again */
+                        rc = nob - suio.uio_resid;
+                        buffer = ((char *)buffer) + rc;
+                        nob = suio.uio_resid;
+                        continue;
+                }
+                break;
+        }
+        return (0);
+}
+
+/*
+ * Read exactly 'nob' bytes from 'sock' into 'buffer', looping on
+ * soreceive().  ERESTART/EINTR/EWOULDBLOCK are retried as long as the
+ * last pass made progress; any other error (or an interrupt with zero
+ * progress) returns a negative errno.  Returns 0 on success.
+ *
+ * XXX Liang: timeout for read is not supported yet ('timeout' is
+ * accepted but ignored).
+ */
+int
+libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
+{
+        int            rc;
+        CFS_DECL_NET_DATA;
+
+        while (nob > 0) {
+                struct iovec  iov = {
+                        .iov_base = buffer,
+                        .iov_len  = nob
+                };
+                struct uio  ruio = {
+                        .uio_iov        = &iov,
+                        .uio_iovcnt     = 1,
+                        .uio_offset     = 0,
+                        .uio_resid      = nob,
+                        .uio_segflg     = UIO_SYSSPACE,
+                        .uio_rw         = UIO_READ,
+                        .uio_procp      = NULL
+                };
+
+                CFS_NET_IN;
+                rc = soreceive(sock, (struct sockaddr **)0, &ruio, (struct mbuf **)0, (struct mbuf **)0, (int *)0);
+                CFS_NET_EX;
+
+                if (rc != 0) {
+                        /* retry interrupted reads only if this pass
+                         * actually transferred some bytes */
+                        if (ruio.uio_resid != nob &&
+                            (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK))
+                                rc = 0;
+                        if (rc != 0)
+                                return -rc;
+                        /* advance past what was received and go again */
+                        rc = nob - ruio.uio_resid;
+                        buffer = ((char *)buffer) + rc;
+                        nob = ruio.uio_resid;
+                        continue;
+                }
+                break;
+        }
+        return (0);
+}
+
+/*
+ * Set the socket's send and/or receive buffer sizes via SO_SNDBUF /
+ * SO_RCVBUF.  A zero size leaves that buffer unchanged.  Returns 0 on
+ * success, negative errno on the first failing sosetopt().
+ *
+ * NOTE(review): only the send size is clamped to KSOCK_MAX_BUF; the
+ * receive size is passed through unclamped -- confirm this asymmetry
+ * is intended.
+ */
+int
+libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize)
+{
+        struct sockopt  sopt;
+        int             rc = 0;
+        int             option;
+        CFS_DECL_NET_DATA;
+
+        /* both options share one sockopt descriptor; only sopt_name
+         * and the value pointed at by 'option' change per call */
+        bzero(&sopt, sizeof sopt);
+        sopt.sopt_dir = SOPT_SET;
+        sopt.sopt_level = SOL_SOCKET;
+        sopt.sopt_val = &option;
+        sopt.sopt_valsize = sizeof(option);
+
+        if (txbufsize != 0) {
+                option = txbufsize;
+                if (option > KSOCK_MAX_BUF)
+                        option = KSOCK_MAX_BUF;
+
+                sopt.sopt_name = SO_SNDBUF;
+                CFS_NET_IN;
+                rc = sosetopt(sock, &sopt);
+                CFS_NET_EX;
+                if (rc != 0) {
+                        CERROR ("Can't set send buffer %d: %d\n",
+                                option, rc);
+
+                        return -rc;
+                }
+        }
+
+        if (rxbufsize != 0) {
+                option = rxbufsize;
+                sopt.sopt_name = SO_RCVBUF;
+                CFS_NET_IN;
+                rc = sosetopt(sock, &sopt);
+                CFS_NET_EX;
+                if (rc != 0) {
+                        CERROR ("Can't set receive buffer %d: %d\n",
+                                option, rc);
+                        return -rc;
+                }
+        }
+        return 0;
+}
+
+/*
+ * Return the IP address and port of 'sock' in host byte order.
+ *
+ * If 'remote' is non-zero the peer's address is queried via
+ * pru_peeraddr(), otherwise the local address via pru_sockaddr().
+ * Either of 'ip'/'port' may be NULL if the caller does not need that
+ * value.  Returns 0 on success, negative errno on failure.
+ *
+ * NOTE(review): if the protocol reports success but returns a NULL
+ * sockaddr, 0 is returned without touching *ip/*port -- confirm
+ * callers tolerate that.
+ */
+int
+libcfs_sock_getaddr (struct socket *sock, int remote, __u32 *ip, int *port)
+{
+        struct sockaddr_in *sin;
+        struct sockaddr    *sa = NULL;
+        int                rc;
+        CFS_DECL_NET_DATA;
+
+        if (remote != 0) {
+                CFS_NET_IN;
+                rc = sock->so_proto->pr_usrreqs->pru_peeraddr(sock, &sa);
+                CFS_NET_EX;
+
+                if (rc != 0) {
+                        if (sa != NULL)
+                                FREE(sa, M_SONAME);
+                        CERROR ("Error %d getting sock peer IP\n", rc);
+                        return -rc;
+                }
+        } else {
+                CFS_NET_IN;
+                rc = sock->so_proto->pr_usrreqs->pru_sockaddr(sock, &sa);
+                CFS_NET_EX;
+                if (rc != 0) {
+                        if (sa != NULL)
+                                FREE(sa, M_SONAME);
+                        CERROR ("Error %d getting sock local IP\n", rc);
+                        return -rc;
+                }
+        }
+        if (sa != NULL) {
+                sin = (struct sockaddr_in *)sa;
+                if (ip != NULL)
+                        *ip = ntohl (sin->sin_addr.s_addr);
+                if (port != NULL)
+                        *port = ntohs (sin->sin_port);
+                /* no NULL re-check needed: we are inside the
+                 * sa != NULL branch */
+                FREE(sa, M_SONAME);
+        }
+        return 0;
+}
+
+/*
+ * Query the socket's send and/or receive buffer sizes via SO_SNDBUF /
+ * SO_RCVBUF.  A NULL pointer skips the corresponding query.  Returns 0
+ * on success, negative errno from the first failing sogetopt().
+ */
+int
+libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize)
+{
+        struct sockopt  sopt;
+        int rc;
+        CFS_DECL_NET_DATA;
+
+        /* one descriptor is reused for both GET queries; only the
+         * option name and value pointer change between them */
+        bzero(&sopt, sizeof sopt);
+        sopt.sopt_dir = SOPT_GET;
+        sopt.sopt_level = SOL_SOCKET;
+
+        if (txbufsize != NULL) {
+                sopt.sopt_name = SO_SNDBUF;
+                sopt.sopt_val = txbufsize;
+                sopt.sopt_valsize = sizeof(*txbufsize);
+                CFS_NET_IN;
+                rc = sogetopt(sock, &sopt);
+                CFS_NET_EX;
+                if (rc != 0) {
+                        CERROR ("Can't get send buffer size: %d\n", rc);
+                        return -rc;
+                }
+        }
+
+        if (rxbufsize != NULL) {
+                sopt.sopt_name = SO_RCVBUF;
+                sopt.sopt_val = rxbufsize;
+                sopt.sopt_valsize = sizeof(*rxbufsize);
+                CFS_NET_IN;
+                rc = sogetopt(sock, &sopt);
+                CFS_NET_EX;
+                if (rc != 0) {
+                        CERROR ("Can't get receive buffer size: %d\n", rc);
+                        return -rc;
+                }
+        }
+        return 0;
+}
+
+/*
+ * Create a socket (optionally bound to local_ip/local_port) and
+ * connect it to peer_ip/peer_port (both host byte order).
+ *
+ * On success *sockp holds the connected socket and 0 is returned.  On
+ * failure a negative errno is returned and the socket is shut down and
+ * closed; *fatal is cleared by libcfs_sock_create() when the failure
+ * is a transient bind conflict.
+ */
+int
+libcfs_sock_connect (struct socket **sockp, int *fatal,
+                     __u32 local_ip, int local_port,
+                     __u32 peer_ip, int peer_port)
+{
+        struct sockaddr_in  srvaddr;
+        struct socket      *so;
+        int                 s;
+        int                 rc;
+        CFS_DECL_FUNNEL_DATA;
+
+        rc = libcfs_sock_create(sockp, fatal, local_ip, local_port);
+        if (rc != 0)
+                return rc;
+        so = *sockp;
+        bzero(&srvaddr, sizeof(srvaddr));
+        srvaddr.sin_len = sizeof(struct sockaddr_in);
+        srvaddr.sin_family = AF_INET;
+        srvaddr.sin_port = htons (peer_port);
+        srvaddr.sin_addr.s_addr = htonl (peer_ip);
+
+        CFS_NET_IN;
+        rc = soconnect(so, (struct sockaddr *)&srvaddr);
+        if (rc != 0) {
+                CFS_NET_EX;
+                /* address conflicts are expected during connection
+                 * retries; don't spam the log for them */
+                if (rc != EADDRNOTAVAIL && rc != EADDRINUSE)
+                        CDEBUG(D_NETERROR,
+                               "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
+                               HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port);
+                goto out;
+        }
+        /* wait for the in-progress connect to complete or fail */
+        s = splnet();
+        while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
+                CDEBUG(D_NET, "ksocknal sleep for waiting auto_connect.\n");
+                (void) tsleep((caddr_t)&so->so_timeo, PSOCK, "ksocknal_conn", hz);
+        }
+        if ((rc = so->so_error) != 0) {
+                so->so_error = 0;
+                splx(s);
+                CFS_NET_EX;
+                CDEBUG(D_NETERROR,
+                       "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
+                       HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port);
+                goto out;
+        }
+        LASSERT(so->so_state & SS_ISCONNECTED);
+        splx(s);
+        CFS_NET_EX;
+        /* sockp was already dereferenced above, so the old NULL check
+         * here was dead code */
+        *sockp = so;
+        return (0);
+out:
+        CFS_NET_IN;
+        soshutdown(so, 2);
+        soclose(so);
+        CFS_NET_EX;
+        return (-rc);
+}
+
+/*
+ * Shut down 'sock'.  soshutdown(sock, 0) disables further receives
+ * (how == 0).
+ *
+ * NOTE(review): the socket is not soclose()d here -- confirm the
+ * caller (or protocol teardown) releases the socket reference.
+ */
+void
+libcfs_sock_release (struct socket *sock)
+{
+        CFS_DECL_FUNNEL_DATA;
+        CFS_NET_IN;
+        soshutdown(sock, 0);
+        CFS_NET_EX;
+}
+
+#endif
index c621129..bb1dc72 100644 (file)
@@ -1,5 +1,5 @@
 
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 #define LUSTRE_TRACEFILE_PRIVATE
 #include <libcfs/libcfs.h>
 #include <libcfs/kp30.h>
 extern union trace_data_union trace_data[NR_CPUS];
 extern char *tracefile;
 extern long long tracefile_size;
-extern struct rw_semaphore tracefile_sem;
 extern int trace_start_thread(void);
 extern void trace_stop_thread(void);
 
 long max_debug_mb = M_TCD_MAX_PAGES;
 static long max_permit_mb = (64 * 1024);
 
-inline struct trace_cpu_data *
-__trace_get_tcd (unsigned long *flags)
+spinlock_t trace_cpu_serializer;
+
+/*
+ * thread currently executing tracefile code or NULL if none does. Used to
+ * detect recursive calls to libcfs_debug_msg().
+ */
+static thread_t trace_owner = NULL;
+
+extern int get_preemption_level(void);
+extern atomic_t tage_allocated;
+
+struct rw_semaphore tracefile_sem;
+
+int tracefile_init_arch() {
+    init_rwsem(&tracefile_sem);
+#error "Todo: initialise per-cpu console buffers"
+    return 0;
+}
+
+void tracefile_fini_arch() {
+}
+
+void tracefile_read_lock() {
+    down_read(&tracefile_sem);
+}
+
+void tracefile_read_unlock() {
+    up_read(&tracefile_sem);
+}
+
+void tracefile_write_lock() {
+    down_write(&tracefile_sem);
+}
+
+void tracefile_write_unlock() {
+    up_write(&tracefile_sem);
+}
+
+char *trace_get_console_buffer(void)
+{
+#error "todo: return a per-cpu/interrupt console buffer and disable pre-emption"
+}
+
+void trace_put_console_buffer(char *buffer)
+{
+#error "todo: re-enable pre-emption"
+}
+
+/*
+ * Acquire the (single, global on XNU) trace_cpu_data, serialized by
+ * trace_cpu_serializer, and record the owning thread in trace_owner.
+ * The page stock is refilled before the spinlock is taken, and only
+ * when not in a preemption-disabled context.  Paired with
+ * __trace_put_tcd(), which drops the lock.
+ */
+struct trace_cpu_data *trace_get_tcd(void)
+{
+       struct trace_cpu_data *tcd;
+       int nr_pages;
+       struct list_head pages;
+
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+
+       /*
+        * debugging check for recursive call to libcfs_debug_msg()
+        */
+       if (trace_owner == current_thread()) {
+                /*
+                 * Cannot assert here.
+                 */
+               printk(KERN_EMERG "recursive call to %s", __FUNCTION__);
+               /*
+                 * "The death of God left the angels in a strange position."
+                */
+               cfs_enter_debugger();
+       }
+       tcd = &trace_data[0].tcd;
+        CFS_INIT_LIST_HEAD(&pages);
+       /* refill outside the spinlock: allocation may sleep, so it is
+        * only attempted when preemption is enabled */
+       if (get_preemption_level() == 0)
+               nr_pages = trace_refill_stock(tcd, CFS_ALLOC_STD, &pages);
+       else
+               nr_pages = 0;
+       spin_lock(&trace_cpu_serializer);
+       trace_owner = current_thread();
+       tcd->tcd_cur_stock_pages += nr_pages;
+       list_splice(&pages, &tcd->tcd_stock_pages);
+       return tcd;
+}
+
+extern void raw_page_death_row_clean(void);
+
+void __trace_put_tcd(struct trace_cpu_data *tcd)
 {
-       return &trace_data[0].tcd;
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+       LASSERT(trace_owner == current_thread());
+       trace_owner = NULL;
+       spin_unlock(&trace_cpu_serializer);
+       if (get_preemption_level() == 0)
+               /* purge all pending pages */
+               raw_page_death_row_clean();
 }
 
-inline void
-__trace_put_tcd (struct trace_cpu_data *tcd, unsigned long flags)
+int tcd_owns_tage(struct trace_cpu_data *tcd, struct trace_page *tage)
 {
-       return;
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+       /* XNU has global tcd, and all pages are owned by it */
+       return 1;
 }
 
 void
-set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask, 
+set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask,
                    const int line, unsigned long stack)
-{ 
-       struct timeval tv; 
+{
+       struct timeval tv;
        
-       do_gettimeofday(&tv); 
-       header->ph_subsys = subsys; 
-       header->ph_mask = mask; 
-       header->ph_cpu_id = smp_processor_id(); 
-       header->ph_sec = (__u32)tv.tv_sec; 
-       header->ph_usec = tv.tv_usec; 
-       header->ph_stack = stack; 
-       header->ph_pid = 0; 
-       header->ph_line_num = line; 
-       header->ph_extern_pid = 0;
-}
-
-void print_to_console(struct ptldebug_header *hdr, int mask, char *buf, 
-                            int len, char *file, const char *fn)
-{ 
-       char *prefix = NULL, *ptype = NULL;
-                       
-       if ((mask & D_EMERG) != 0) { 
-               prefix = "LustreError"; 
-               ptype = KERN_EMERG; 
-       } else if ((mask & D_ERROR) != 0) { 
-               prefix = "LustreError"; 
-               ptype = KERN_ERR; 
-       } else if ((mask & D_WARNING) != 0) { 
-               prefix = "Lustre"; 
-               ptype = KERN_WARNING; 
-       } else if (portal_printk != 0 || (mask & D_CONSOLE)) {
-               prefix = "Lustre"; 
-               ptype = KERN_INFO; 
-       } 
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+       do_gettimeofday(&tv);
+       header->ph_subsys = subsys;
+       header->ph_mask = mask;
+       header->ph_cpu_id = smp_processor_id();
+       header->ph_sec = (__u32)tv.tv_sec;
+       header->ph_usec = tv.tv_usec;
+       header->ph_stack = stack;
+       header->ph_pid = cfs_curproc_pid();
+       header->ph_line_num = line;
+       header->ph_extern_pid = (__u32)current_thread();
+}
+
+void print_to_console(struct ptldebug_header *hdr, int mask, const char *buf,
+                     int len, const char *file, const char *fn)
+{
+       char *prefix = "Lustre", *ptype = KERN_INFO;
+
+       /*
+        * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
+       if ((mask & D_EMERG) != 0) {
+               prefix = "LustreError";
+               ptype = KERN_EMERG;
+       } else if ((mask & D_ERROR) != 0) {
+               prefix = "LustreError";
+               ptype = KERN_ERR;
+       } else if ((mask & D_WARNING) != 0) {
+               prefix = "Lustre";
+               ptype = KERN_WARNING;
+       } else if ((mask & libcfs_printk) != 0 || (mask & D_CONSOLE)) {
+               prefix = "Lustre";
+               ptype = KERN_INFO;
+       }
 
        if ((mask & D_CONSOLE) != 0) {
                printk("%s%s: %.*s", ptype, prefix, len, buf);
        } else {
-               printk("%s%s: %d:%d:(%s:%d:%s()) %*s", ptype, prefix, hdr->ph_pid, 
-                      hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf);
+               printk("%s%s: %d:%d:(%s:%d:%s()) %*s",
+                      ptype, prefix, hdr->ph_pid, hdr->ph_extern_pid,
+                      file, hdr->ph_line_num, fn, len, buf);
        }
 }
 
 /*
  * Sysctl handle of libcfs
  */
+#define MAX_TRACEFILE_PATH_LEN  256
 int cfs_trace_daemon SYSCTL_HANDLER_ARGS
 {
        int error = 0;
        char *name = NULL;
 
-       MALLOC(name, char *, req->newlen + 1, M_TEMP, M_WAITOK | M_ZERO);
+        if (req->newptr == USER_ADDR_NULL) {
+                /* a read */
+                if (tracefile)
+                        error = sysctl_handle_string(oidp, tracefile, 0, req);
+                else
+                        error = sysctl_handle_string(oidp, "NA", 0, req);
+
+                return error;
+        }
+        
+        /* now handle write requests */
+       MALLOC(name, char *, MAX_TRACEFILE_PATH_LEN + 1, M_TEMP, M_WAITOK | M_ZERO);
        if (name == NULL)
                return -ENOMEM;
-       down_write(&tracefile_sem);
-       error = sysctl_handle_string(oidp, name, req->newlen + 1, req);
-       if (!error || req->newptr != NULL) {
-               /* write */
+        name[0] = '\0';
+       tracefile_write_lock();
+       error = sysctl_handle_string(oidp, name, MAX_TRACEFILE_PATH_LEN + 1, req);
+       if (!error) {
                if (strcmp(name, "stop") == 0) {
                        /* stop tracefile daemon */
                        tracefile = NULL;
                        trace_stop_thread();
-                       goto out; 
-               }else if (strncmp(name, "size=", 5) == 0) { 
-                       tracefile_size = simple_strtoul(name + 5, NULL, 0); 
-                       if (tracefile_size < 10 || tracefile_size > 20480) 
-                               tracefile_size = TRACEFILE_SIZE; 
-                       else 
-                               tracefile_size <<= 20; 
+                       goto out;
+               }else if (strncmp(name, "size=", 5) == 0) {
+                       tracefile_size = simple_strtoul(name + 5, NULL, 0);
+                       if (tracefile_size < 10 || tracefile_size > 20480)
+                               tracefile_size = TRACEFILE_SIZE;
+                       else
+                               tracefile_size <<= 20;
                        goto out;
 
                }
-               if (name[0] != '/') { 
-                       error = -EINVAL; 
-                       goto out; 
-               } 
-               if (tracefile != NULL) 
+               if (name[0] != '/') {
+                       error = -EINVAL;
+                       goto out;
+               }
+               if (tracefile != NULL)
                        cfs_free(tracefile);
-               tracefile = name; 
-               name = NULL; 
+               tracefile = name;
+               name = NULL;
                trace_start_thread();
-       } else if (req->newptr != NULL) {
+       } else {
                /* Something was wrong with the write request */
                printf("sysctl debug daemon failed: %d.\n", error);
                goto out;
-       } else {
-               /* Read request */
-               SYSCTL_OUT(req, tracefile, sizeof(tracefile));
        }
 out:
-       if (name != NULL) 
+       if (name != NULL)
                FREE(name, M_TEMP);
-       up_write(&tracefile_sem);
+       tracefile_write_unlock();
        return error;
 }
+#undef MAX_TRACEFILE_PATH_LEN
 
 
 int cfs_debug_mb SYSCTL_HANDLER_ARGS
@@ -138,27 +255,29 @@ int cfs_debug_mb SYSCTL_HANDLER_ARGS
        int error = 0;
 
        error = sysctl_handle_long(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
-       if (!error && req->newptr != NULL) {
+       if (!error && req->newptr != USER_ADDR_NULL) {
                /* We have a new value stored in the standard location */
                if (max_debug_mb <= 0)
                        return -EINVAL;
                if (max_debug_mb > max_permit_mb) {
                        printf("sysctl debug_mb is too big: %d.\n", max_debug_mb);
                        return 0;
-               } 
-               for (i = 0; i < NR_CPUS; i++) { 
-                       struct trace_cpu_data *tcd; 
-                       tcd = &trace_data[i].tcd; 
+               }
+               for (i = 0; i < NR_CPUS; i++) {
+                       struct trace_cpu_data *tcd;
+                       tcd = &trace_data[i].tcd;
                        tcd->tcd_max_pages = max_debug_mb;
                }
-       } else if (req->newptr != NULL) {
+       } else if (req->newptr != USER_ADDR_NULL) {
                /* Something was wrong with the write request */
                printf ("sysctl debug_mb fault: %d.\n", error);
-       } else {
-               /* Read request */
-               error = SYSCTL_OUT(req, &max_debug_mb, sizeof max_debug_mb);
        }
+
        return error;
 }
 
-
+void
+trace_call_on_all_cpus(void (*fn)(void *arg), void *arg)
+{
+#error "tbd"
+}
index 630db6b..cfd7a2d 100644 (file)
  * Darwin porting library
  * Make things easy to port
  */
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 
 #include <mach/mach_types.h>
 #include <string.h>
 #include <sys/errno.h>
 #include <sys/types.h>
 #include <sys/fcntl.h>
-#include <portals/types.h>
+#include <lnet/types.h>
+
+#include <libcfs/kp30.h>
 
 #ifndef isspace
 inline int
 isspace(char c)
-{ 
+{
         return (c == ' ' || c == '\t' || c == '\n' || c == '\12');
 }
 #endif
@@ -98,12 +100,12 @@ strstr(const char *in, const char *str)
 
 char *
 strrchr(const char *p, int ch)
-{ 
-        const char *end = p + strlen(p); 
-        do { 
-                if (*end == (char)ch) 
-                        return (char *)end; 
-        } while (--end >= p); 
+{
+        const char *end = p + strlen(p);
+        do {
+                if (*end == (char)ch)
+                        return (char *)end;
+        } while (--end >= p);
         return NULL;
 }
 
@@ -273,7 +275,7 @@ int convert_server_error(__u64 ecode)
        int sign;
        int code;
 
-        static int errno_xlate[] = {
+       static int errno_xlate[] = {
                /* success is always success */
                [0]                     = 0,
                [LINUX_EPERM]           = EPERM,
@@ -358,7 +360,8 @@ int convert_server_error(__u64 ecode)
                [LINUX_ELIBMAX]         = EINVAL /* ELIBMAX */,
                [LINUX_ELIBEXEC]        = EINVAL /* ELIBEXEC */,
                [LINUX_EILSEQ]          = EILSEQ,
-               [LINUX_ERESTART]        = ERESTART,
+               [LINUX_ERESTART]        = EINVAL /* because ERESTART is
+                                                  * negative in XNU */,
                [LINUX_ESTRPIPE]        = EINVAL /* ESTRPIPE */,
                [LINUX_EUSERS]          = EUSERS,
                [LINUX_ENOTSOCK]        = ENOTSOCK,
@@ -398,22 +401,19 @@ int convert_server_error(__u64 ecode)
                [LINUX_EDQUOT]          = EDQUOT,
                [LINUX_ENOMEDIUM]       = EINVAL /* ENOMEDIUM */,
                [LINUX_EMEDIUMTYPE]     = EINVAL /* EMEDIUMTYPE */,
-        };
+       };
        code = (int)ecode;
-        if (code >= 0) {
+       if (code >= 0) {
                sign = +1;
        } else {
                sign = -1;
                code = -code;
        }
-       if (code < (sizeof errno_xlate) / (sizeof errno_xlate[0]))
+       if (code < (sizeof errno_xlate) / (sizeof errno_xlate[0])) {
                code = errno_xlate[code];
-       else
-               /*
-                * Unknown error. Reserved for the future.
-                */
-               code = EINVAL;
-        return sign * code;
+               LASSERT(code >= 0);
+        }
+       return sign * code;
 }
 
 enum {
@@ -448,7 +448,7 @@ static inline void obit_convert(int *cflag, int *sflag,
  */
 int convert_client_oflag(int cflag, int *result)
 {
-       int sflag;
+       int sflag = 0;
 
        cflag = 0;
        obit_convert(&cflag, &sflag, O_RDONLY,   LINUX_O_RDONLY);
@@ -480,3 +480,99 @@ int convert_client_oflag(int cflag, int *result)
        } else
                return -EINVAL;
 }
+
+#ifdef __DARWIN8__
+#else /* !__DARWIN8__ */
+extern int unix_syscall();
+extern int unix_syscall_return();
+
+extern int ktrsysret();
+extern int ktrace();
+
+extern int ast_taken();
+extern int ast_check();
+
+extern int trap();
+extern int syscall_trace();
+
+static int is_addr_in_range(void *addr, void *start, void *end)
+{
+       return start <= addr && addr <= end;
+}
+
+extern void cfs_thread_agent (void);
+
+static int is_last_frame(void *addr)
+{
+       if (addr == NULL)
+               return 1;
+       else if (is_addr_in_range(addr, unix_syscall, unix_syscall_return))
+               return 1;
+       else if (is_addr_in_range(addr, ktrsysret, ktrace))
+               return 1;
+       else if (is_addr_in_range(addr, ast_taken, ast_check))
+               return 1;
+       else if (is_addr_in_range(addr, trap, syscall_trace))
+               return 1;
+       else if (is_addr_in_range(addr, cfs_thread_agent, cfs_kernel_thread))
+               return 1;
+       else
+               return 0;
+}
+
+static void *get_frame(int i)
+{
+       void *result;
+
+#define CASE(i) case (i): result = __builtin_return_address(i); break
+       switch (i + 1) {
+               CASE(1);
+               CASE(2);
+               CASE(3);
+               CASE(4);
+               CASE(5);
+               CASE(6);
+               CASE(7);
+               CASE(8);
+               CASE(9);
+               CASE(10);
+               CASE(11);
+               CASE(12);
+               CASE(13);
+               CASE(14);
+               CASE(15);
+               CASE(16);
+               CASE(17);
+               CASE(18);
+               CASE(19);
+               CASE(20);
+       default:
+               panic("impossible frame number: %d\n", i);
+               result = NULL;
+       }
+       return result;
+}
+
+void cfs_stack_trace_fill(struct cfs_stack_trace *trace)
+{
+       int i;
+
+       memset(trace, 0, sizeof *trace);
+       for (i = 0; i < sizeof_array(trace->frame); ++ i) {
+               void *addr;
+
+               addr = get_frame(i);
+               trace->frame[i] = addr;
+               if (is_last_frame(addr))
+                       break;
+       }
+}
+
+void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no)
+{
+        if (0 <= frame_no && frame_no < sizeof_array(trace->frame))
+                return trace->frame[frame_no];
+        else
+                return NULL;
+}
+#endif /* !__DARWIN8__ */
index 3ef33d8..18bc5d5 100644 (file)
 # define EXPORT_SYMTAB
 #endif
 
-# define DEBUG_SUBSYSTEM S_PORTALS
+# define DEBUG_SUBSYSTEM S_LNET
 
 #include <libcfs/kp30.h>
 #include <libcfs/libcfs.h>
-
 #include "tracefile.h"
 
-unsigned int portal_subsystem_debug = ~0 - (S_PORTALS);
-EXPORT_SYMBOL(portal_subsystem_debug);
+static char debug_file_name[1024];
 
-unsigned int portal_debug = (D_WARNING | D_DLMTRACE | D_ERROR | D_EMERG | D_HA |
-                             D_RPCTRACE | D_VFSTRACE | D_CONFIG | D_IOCTL |
-                             D_CONSOLE);
-EXPORT_SYMBOL(portal_debug);
+#ifdef __KERNEL__
+unsigned int libcfs_subsystem_debug = ~0;
+EXPORT_SYMBOL(libcfs_subsystem_debug);
 
-unsigned int portal_printk;
-EXPORT_SYMBOL(portal_printk);
+unsigned int libcfs_debug = (D_EMERG | D_ERROR | D_WARNING | D_CONSOLE |
+                             D_NETERROR | D_HA | D_CONFIG | D_IOCTL |
+                             D_DLMTRACE | D_RPCTRACE | D_VFSTRACE);
+EXPORT_SYMBOL(libcfs_debug);
 
-unsigned int portal_stack;
-EXPORT_SYMBOL(portal_stack);
+unsigned int libcfs_printk;
+EXPORT_SYMBOL(libcfs_printk);
 
-unsigned int portals_catastrophe;
-EXPORT_SYMBOL(portals_catastrophe);
+unsigned int libcfs_console_ratelimit = 1;
+EXPORT_SYMBOL(libcfs_console_ratelimit);
 
-#ifdef __KERNEL__
-atomic_t portal_kmemory = ATOMIC_INIT(0);
-EXPORT_SYMBOL(portal_kmemory);
-#endif
+unsigned int libcfs_debug_binary = 1;
+EXPORT_SYMBOL(libcfs_debug_binary);
+
+unsigned int libcfs_stack;
+EXPORT_SYMBOL(libcfs_stack);
+
+unsigned int portal_enter_debugger;
+EXPORT_SYMBOL(portal_enter_debugger);
+
+unsigned int libcfs_catastrophe;
+EXPORT_SYMBOL(libcfs_catastrophe);
+
+atomic_t libcfs_kmemory = ATOMIC_INIT(0);
+EXPORT_SYMBOL(libcfs_kmemory);
 
 static cfs_waitq_t debug_ctlwq;
 
 char debug_file_path[1024] = "/tmp/lustre-log";
-static char debug_file_name[1024];
 
-void portals_debug_dumplog_internal(void *arg)
+int libcfs_panic_in_progress;
+
+/* Map a single S_* debug subsystem value to its name, or NULL for an
+ * unknown subsystem.  libcfs_debug_token2mask() expects the returned
+ * string in lower-case */
+const char *
+libcfs_debug_subsys2str(int subsys)
+{
+        switch (subsys) {
+        default:
+                return NULL;
+        case S_UNDEFINED:
+                return "undefined";
+        case S_MDC:
+                return "mdc";
+        case S_MDS:
+                return "mds";
+        case S_OSC:
+                return "osc";
+        case S_OST:
+                return "ost";
+        case S_CLASS:
+                return "class";
+        case S_LOG:
+                return "log";
+        case S_LLITE:
+                return "llite";
+        case S_RPC:
+                return "rpc";
+        case S_LNET:
+                return "lnet";
+        case S_LND:
+                return "lnd";
+        case S_PINGER:
+                return "pinger";
+        case S_FILTER:
+                return "filter";
+        case S_ECHO:
+                return "echo";
+        case S_LDLM:
+                return "ldlm";
+        case S_LOV:
+                return "lov";
+        case S_LMV:
+                return "lmv";
+        case S_SEC:
+                return "sec";
+        case S_GSS:
+                return "gss";
+        case S_MGC:
+                return "mgc";
+        case S_MGS:
+                return "mgs";
+        case S_FID:
+                return "fid";
+        case S_FLD:
+                return "fld";
+        }
+}
+
+/* libcfs_debug_token2mask() expects the returned
+ * string in lower-case */
+const char *
+libcfs_debug_dbg2str(int debug)
+{
+        switch (debug) {
+        default:
+                return NULL;
+        case D_TRACE:
+                return "trace";
+        case D_INODE:
+                return "inode";
+        case D_SUPER:
+                return "super";
+        case D_EXT2:
+                return "ext2";
+        case D_MALLOC:
+                return "malloc";
+        case D_CACHE:
+                return "cache";
+        case D_INFO:
+                return "info";
+        case D_IOCTL:
+                return "ioctl";
+        case D_NETERROR:
+                return "neterror";
+        case D_NET:
+                return "net";
+        case D_WARNING:
+                return "warning";
+        case D_BUFFS:
+                return "buffs";
+        case D_OTHER:
+                return "other";
+        case D_DENTRY:
+                return "dentry";
+        case D_PAGE:
+                return "page";
+        case D_DLMTRACE:
+                return "dlmtrace";
+        case D_ERROR:
+                return "error";
+        case D_EMERG:
+                return "emerg";
+        case D_HA:
+                return "ha";
+        case D_RPCTRACE:
+                return "rpctrace";
+        case D_VFSTRACE:
+                return "vfstrace";
+        case D_READA:
+                return "reada";
+        case D_MMAP:
+                return "mmap";
+        case D_CONFIG:
+                return "config";
+        case D_CONSOLE:
+                return "console";
+        case D_QUOTA:
+                return "quota";
+        case D_SEC:
+                return "sec";
+        }
+}
+
+int
+libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys)
+{
+        const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str :
+                                                 libcfs_debug_dbg2str;
+        int           len = 0;
+        const char   *token;
+        int           bit;
+        int           i;
+
+        if (mask == 0) {                        /* "0" */
+                if (size > 0)
+                        str[0] = '0';
+                len = 1;
+        } else {                                /* space-separated tokens */
+                for (i = 0; i < 32; i++) {
+                        bit = 1 << i;
+
+                        if ((mask & bit) == 0)
+                                continue;
+
+                        token = fn(bit);
+                        if (token == NULL)              /* unused bit */
+                                continue;
+
+                        if (len > 0) {                  /* separator? */
+                                if (len < size)
+                                        str[len] = ' ';
+                                len++;
+                        }
+                
+                        while (*token != 0) {
+                                if (len < size)
+                                        str[len] = *token;
+                                token++;
+                                len++;
+                        }
+                }
+        }
+
+        /* terminate 'str' */
+        if (len < size)
+                str[len] = 0;
+        else
+                str[size - 1] = 0;
+
+        return len;
+}
+
+int
+libcfs_debug_token2mask(int *mask, const char *str, int len, int is_subsys)
+{
+        const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str :
+                                                 libcfs_debug_dbg2str;
+        int           i;
+        int           j;
+        int           bit;
+        const char   *token;
+
+        /* match against known tokens */
+        for (i = 0; i < 32; i++) {
+                bit = 1 << i;
+
+                token = fn(bit);
+                if (token == NULL)              /* unused? */
+                        continue;
+                
+                /* strcasecmp */
+                for (j = 0; ; j++) {
+                        if (j == len) {         /* end of token */
+                                if (token[j] == 0) {
+                                        *mask = bit;
+                                        return 0;
+                                }
+                                break;
+                        }
+                        
+                        if (token[j] == 0)
+                                break;
+                                
+                        if (str[j] == token[j])
+                                continue;
+                        
+                        if (str[j] < 'A' || 'Z' < str[j])
+                                break;
+
+                        if (str[j] - 'A' + 'a' != token[j])
+                                break;
+                }
+        }
+        
+        return -EINVAL;                         /* no match */
+}
+
+int
+libcfs_debug_str2mask(int *mask, const char *str, int is_subsys)
+{
+        int         m = 0;
+        int         matched = 0;
+        char        op = 0;
+        int         n;
+        int         t;
+
+        /* <str> must be a list of debug tokens or numbers separated by
+         * whitespace and optionally an operator ('+' or '-').  If an operator
+         * appears first in <str>, '*mask' is used as the starting point
+         * (relative), otherwise 0 is used (absolute).  An operator applies to
+         * all following tokens up to the next operator. */
+        
+        while (*str != 0) {
+                while (isspace(*str)) /* skip whitespace */
+                        str++;
+
+                if (*str == 0)
+                        break;
+
+                if (*str == '+' || *str == '-') {
+                        op = *str++;
+
+                        /* op on first token == relative */
+                        if (!matched)
+                                m = *mask;
+
+                        while (isspace(*str)) /* skip whitespace */
+                                str++;
+
+                        if (*str == 0)          /* trailing op */
+                                return -EINVAL;
+                }
+
+                /* find token length */
+                for (n = 0; str[n] != 0 && !isspace(str[n]); n++);
+
+                /* match token */
+                if (libcfs_debug_token2mask(&t, str, n, is_subsys) != 0)
+                        return -EINVAL;
+                
+                matched = 1;
+                if (op == '-')
+                        m &= ~t;
+                else
+                        m |= t;
+                
+                str += n;
+        }
+
+        if (!matched)
+                return -EINVAL;
+
+        *mask = m;
+        return 0;
+}
+
+void libcfs_debug_dumplog_internal(void *arg)
 {
         CFS_DECL_JOURNAL_DATA;
 
         CFS_PUSH_JOURNAL;
 
-        snprintf(debug_file_name, sizeof(debug_file_path) - 1,
-                 "%s.%ld.%ld", debug_file_path, cfs_time_current_sec(), (long)arg);
+        snprintf(debug_file_name, sizeof(debug_file_path) - 1, "%s.%ld.%ld",
+                 debug_file_path, cfs_time_current_sec(), (long)arg);
         printk(KERN_ALERT "LustreError: dumping log to %s\n", debug_file_name);
         tracefile_dump_all_pages(debug_file_name);
 
         CFS_POP_JOURNAL;
 }
 
-int portals_debug_dumplog_thread(void *arg)
+int libcfs_debug_dumplog_thread(void *arg)
 {
-        kportal_daemonize("");
-        reparent_to_init();
-        portals_debug_dumplog_internal(arg);
+        cfs_daemonize("");
+        libcfs_debug_dumplog_internal(arg);
         cfs_waitq_signal(&debug_ctlwq);
         return 0;
 }
 
-void portals_debug_dumplog(void)
+void libcfs_debug_dumplog(void)
 {
         int            rc;
         cfs_waitlink_t wait;
@@ -94,90 +378,51 @@ void portals_debug_dumplog(void)
         set_current_state(TASK_INTERRUPTIBLE);
         cfs_waitq_add(&debug_ctlwq, &wait);
 
-        rc = cfs_kernel_thread(portals_debug_dumplog_thread,
+        rc = cfs_kernel_thread(libcfs_debug_dumplog_thread,
                                (void *)(long)cfs_curproc_pid(),
                                CLONE_VM | CLONE_FS | CLONE_FILES);
         if (rc < 0)
                 printk(KERN_ERR "LustreError: cannot start log dump thread: "
                        "%d\n", rc);
         else
-                schedule();
+                cfs_waitq_wait(&wait, CFS_TASK_INTERRUPTIBLE);
 
         /* be sure to teardown if kernel_thread() failed */
         cfs_waitq_del(&debug_ctlwq, &wait);
         set_current_state(TASK_RUNNING);
 }
 
-#ifdef PORTALS_DUMP_ON_PANIC
-static int panic_dumplog(struct notifier_block *self, unsigned long unused1,
-                         void *unused2)
+int libcfs_debug_init(unsigned long bufsize)
 {
-        static int handled_panic; /* to avoid recursive calls to notifiers */
+        int    rc;
 
-        if (handled_panic)
-                return 0;
-        else
-                handled_panic = 1;
-
-        if (in_interrupt()) {
-                trace_debug_print();
-                return 0;
-        }
-
-        while (current->lock_depth >= 0)
-                unlock_kernel();
-        portals_debug_dumplog();
-        return 0;
-}
-
-static struct notifier_block lustre_panic_notifier = {
-        notifier_call :     panic_dumplog,
-        next :              NULL,
-        priority :          10000
-};
-#endif
+        cfs_waitq_init(&debug_ctlwq);
+        rc = tracefile_init();
 
-#ifdef CRAY_PORTALS
-extern void *lus_portals_debug;
-#endif
+        if (rc == 0)
+                libcfs_register_panic_notifier();
 
-int portals_debug_init(unsigned long bufsize)
-{
-        cfs_waitq_init(&debug_ctlwq);
-#ifdef CRAY_PORTALS
-        lus_portals_debug = &portals_debug_msg;
-#endif
-#ifdef PORTALS_DUMP_ON_PANIC
-        /* This is currently disabled because it spews far too much to the
-         * console on the rare cases it is ever triggered. */
-        notifier_chain_register(&panic_notifier_list, &lustre_panic_notifier);
-#endif
-        return tracefile_init();
+        return rc;
 }
 
-int portals_debug_cleanup(void)
+int libcfs_debug_cleanup(void)
 {
+        libcfs_unregister_panic_notifier();
         tracefile_exit();
-#ifdef PORTALS_DUMP_ON_PANIC
-        notifier_chain_unregister(&panic_notifier_list, &lustre_panic_notifier);
-#endif
-#ifdef CRAY_PORTALS
-        lus_portals_debug = NULL;
-#endif
         return 0;
 }
 
-int portals_debug_clear_buffer(void)
+int libcfs_debug_clear_buffer(void)
 {
         trace_flush_pages();
         return 0;
 }
 
-/* Debug markers, although printed by S_PORTALS
+/* Debug markers, although printed by S_LNET
  * should not be be marked as such. */
 #undef DEBUG_SUBSYSTEM
 #define DEBUG_SUBSYSTEM S_UNDEFINED
-int portals_debug_mark_buffer(char *text)
+int libcfs_debug_mark_buffer(char *text)
 {
         CDEBUG(D_TRACE,"***************************************************\n");
         CDEBUG(D_WARNING, "DEBUG MARKER: %s\n", text);
@@ -186,75 +431,293 @@ int portals_debug_mark_buffer(char *text)
         return 0;
 }
 #undef DEBUG_SUBSYSTEM
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 
-void portals_debug_set_level(unsigned int debug_level)
+void libcfs_debug_set_level(unsigned int debug_level)
 {
         printk(KERN_WARNING "Lustre: Setting portals debug level to %08x\n",
                debug_level);
-        portal_debug = debug_level;
+        libcfs_debug = debug_level;
+}
+
+EXPORT_SYMBOL(libcfs_debug_dumplog);
+EXPORT_SYMBOL(libcfs_debug_set_level);
+
+
+#else /* !__KERNEL__ */
+
+#include <libcfs/libcfs.h>
+
+#ifdef HAVE_SYS_USER_H
+# include <sys/user.h>
+#endif
+
+#ifdef HAVE_CATAMOUNT_DATA_H
+#include <catamount/data.h>
+#include <catamount/lputs.h>
+
+static char source_nid[16];
+/* 0 indicates no messages to console, 1 is errors, > 1 is all debug messages */
+static int toconsole = 1;
+unsigned int libcfs_console_ratelimit = 1;
+#else /* !HAVE_CATAMOUNT_DATA_H */
+#ifdef HAVE_NETDB_H
+#include <sys/utsname.h>
+#endif /* HAVE_NETDB_H */
+struct utsname *tmp_utsname;
+static char source_nid[sizeof(tmp_utsname->nodename)];
+#endif /* HAVE_CATAMOUNT_DATA_H */
+
+static int source_pid;
+int smp_processor_id = 1;
+char debug_file_path[1024];
+FILE *debug_file_fd;
+
+int portals_do_debug_dumplog(void *arg)
+{
+        printf("Look in %s\n", debug_file_name);
+        return 0;
 }
 
-char *portals_nid2str(int nal, ptl_nid_t nid, char *str)
+
+void portals_debug_print(void)
 {
-        if (nid == PTL_NID_ANY) {
-                snprintf(str, PTL_NALFMT_SIZE, "%s", "PTL_NID_ANY");
-                return str;
-        }
+        return;
+}
 
-        switch(NALID_FROM_IFACE(nal)){
-/* XXX this could be a nal method of some sort, 'cept it's config
- * dependent whether (say) socknal NIDs are actually IP addresses... */
-#if !CRAY_PORTALS
-        case TCPNAL:
-                /* userspace NAL */
-        case IIBNAL:
-        case VIBNAL:
-        case OPENIBNAL:
-        case RANAL:
-        case SOCKNAL: {
-                /* HIPQUAD requires __u32, but we can't cast in it */
-                __u32 nid32 = (__u32)nid;
-                if ((__u32)(nid >> 32)) {
-                        snprintf(str, PTL_NALFMT_SIZE, "%u:%u.%u.%u.%u",
-                                 (__u32)(nid >> 32), HIPQUAD(nid32));
-                } else {
-                        snprintf(str, PTL_NALFMT_SIZE, "%u.%u.%u.%u",
-                                 HIPQUAD(nid32));
-                }
-                break;
+
+void libcfs_debug_dumplog(void)
+{
+        printf("Look in %s\n", debug_file_name);
+        return;
+}
+
+int libcfs_debug_init(unsigned long bufsize)
+{
+        char *debug_mask = NULL;
+        char *debug_subsys = NULL;
+        char *debug_filename;
+
+#ifdef HAVE_CATAMOUNT_DATA_H
+        char *debug_console = NULL;
+        char *debug_ratelimit = NULL;
+
+        snprintf(source_nid, sizeof(source_nid) - 1, "%u", _my_pnid);
+        source_pid = _my_pid;
+
+        debug_console = getenv("LIBLUSTRE_DEBUG_CONSOLE");
+        if (debug_console != NULL) {
+                toconsole = strtoul(debug_console, NULL, 0);
+                CDEBUG(D_INFO, "set liblustre toconsole to %u\n", toconsole);
+        }
+        debug_ratelimit = getenv("LIBLUSTRE_DEBUG_CONSOLE_RATELIMIT");
+        if (debug_ratelimit != NULL) {
+                libcfs_console_ratelimit = strtoul(debug_ratelimit, NULL, 0);
+                CDEBUG(D_INFO, "set liblustre console ratelimit to %u\n", libcfs_console_ratelimit);
         }
-        case QSWNAL:
-        case GMNAL:
-        case LONAL:
-                snprintf(str, PTL_NALFMT_SIZE, "%u:%u",
-                         (__u32)(nid >> 32), (__u32)nid);
-                break;
 #else
-        case PTL_IFACE_SS:
-        case PTL_IFACE_SS_ACCEL:
-                snprintf(str, PTL_NALFMT_SIZE, "%u", (__u32)nid);
-                break;
+        struct utsname myname;
+
+        if (uname(&myname) == 0)
+                strcpy(source_nid, myname.nodename);
+        source_pid = getpid();
 #endif
-        default:
-                snprintf(str, PTL_NALFMT_SIZE, "?%x? %llx",
-                         nal, (long long)nid);
-                break;
+        /* debug masks */
+        debug_mask = getenv("LIBLUSTRE_DEBUG_MASK");
+        if (debug_mask)
+                libcfs_debug = (unsigned int) strtol(debug_mask, NULL, 0);
+
+        debug_subsys = getenv("LIBLUSTRE_DEBUG_SUBSYS");
+        if (debug_subsys)
+                libcfs_subsystem_debug =
+                                (unsigned int) strtol(debug_subsys, NULL, 0);
+
+        debug_filename = getenv("LIBLUSTRE_DEBUG_BASE");
+        if (debug_filename)
+                strncpy(debug_file_path,debug_filename,sizeof(debug_file_path));
+
+        debug_filename = getenv("LIBLUSTRE_DEBUG_FILE");
+        if (debug_filename)
+                strncpy(debug_file_name,debug_filename,sizeof(debug_file_path));
+
+        if (debug_file_name[0] == '\0' && debug_file_path[0] != '\0')
+                snprintf(debug_file_name, sizeof(debug_file_name) - 1,
+                         "%s-%s-%lu.log", debug_file_path, source_nid, time(0));
+
+        if (strcmp(debug_file_name, "stdout") == 0 ||
+            strcmp(debug_file_name, "-") == 0) {
+                debug_file_fd = stdout;
+        } else if (strcmp(debug_file_name, "stderr") == 0) {
+                debug_file_fd = stderr;
+        } else if (debug_file_name[0] != '\0') {
+                debug_file_fd = fopen(debug_file_name, "w");
+                if (debug_file_fd == NULL)
+                        fprintf(stderr, "%s: unable to open '%s': %s\n",
+                                source_nid, debug_file_name, strerror(errno));
         }
-        return str;
+
+        if (debug_file_fd == NULL)
+                debug_file_fd = stdout;
+
+        return 0;
 }
 
-char *portals_id2str(int nal, ptl_process_id_t id, char *str)
+int libcfs_debug_cleanup(void)
 {
-        int   len;
+        if (debug_file_fd != stdout && debug_file_fd != stderr)
+                fclose(debug_file_fd);
+        return 0;
+}
 
-        portals_nid2str(nal, id.nid, str);
-        len = strlen(str);
-        snprintf(str + len, PTL_NALFMT_SIZE - len, "-%u", id.pid);
-        return str;
+int libcfs_debug_clear_buffer(void)
+{
+        return 0;
+}
+
+int libcfs_debug_mark_buffer(char *text)
+{
+
+        fprintf(debug_file_fd, "*******************************************************************************\n");
+        fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text);
+        fprintf(debug_file_fd, "*******************************************************************************\n");
+
+        return 0;
+}
+
+#ifdef HAVE_CATAMOUNT_DATA_H
+#define CATAMOUNT_MAXLINE (256-4)
+void catamount_printline(char *buf, size_t size)
+{
+    char *pos = buf;
+    int prsize = size;
+
+    while (prsize > 0){
+        lputs(pos);
+        pos += CATAMOUNT_MAXLINE;
+        prsize -= CATAMOUNT_MAXLINE;
+    }
+}
+#endif
+
+int
+libcfs_debug_vmsg2(cfs_debug_limit_state_t *cdls,
+                   int subsys, int mask,
+                   const char *file, const char *fn, const int line,
+                   const char *format1, va_list args,
+                   const char *format2, ...)
+{
+        struct timeval tv;
+        int            nob;
+        int            remain;
+        va_list        ap;
+        char           buf[PAGE_SIZE]; /* size 4096 used for compatibility with linux,
+                                        * where a message can't exceed PAGE_SIZE */
+        int            console = 0;
+        char *prefix = "Lustre";
+
+#ifdef HAVE_CATAMOUNT_DATA_H
+        /* toconsole == 0 - all messages to debug_file_fd
+         * toconsole == 1 - warnings to console, all to debug_file_fd
+         * toconsole >  1 - all debug to console */
+        if ( ((mask & D_CANTMASK) &&
+             (toconsole == 1)) || (toconsole > 1)) {
+                console = 1;
+        }
+#endif
+
+        if ((!console) && (!debug_file_fd)) {
+                return 0;
+        }
+
+        if (mask & (D_EMERG | D_ERROR))
+               prefix = "LustreError";
+
+        nob = snprintf(buf, sizeof(buf), "%s: %u-%s:(%s:%d:%s()): ", prefix,
+                       source_pid, source_nid, file, line, fn);
+
+        remain = sizeof(buf) - nob;
+        if (format1) {
+                nob += vsnprintf(&buf[nob], remain, format1, args);
+        }
+
+        remain = sizeof(buf) - nob;
+        if ((format2) && (remain > 0)) {
+                va_start(ap, format2);
+                nob += vsnprintf(&buf[nob], remain, format2, ap);
+                va_end(ap);
+        }
+
+#ifdef HAVE_CATAMOUNT_DATA_H
+        if (console) {
+                /* check rate limit for console */
+                if (cdls != NULL) {
+                        cfs_time_t t = cdls->cdls_next +
+                                       cfs_time_seconds(CDEBUG_MAX_LIMIT + 10);
+                        cfs_duration_t  dmax = cfs_time_seconds(CDEBUG_MAX_LIMIT);
+
+                        if (libcfs_console_ratelimit &&
+                                cdls->cdls_next != 0 &&     /* not first time ever */
+                                !cfs_time_after(cfs_time_current(), cdls->cdls_next)) {
+
+                                /* skipping a console message */
+                                cdls->cdls_count++;
+                                goto out_file;
+                        }
+
+                        if (cfs_time_after(cfs_time_current(), t)) {
+                                /* last timeout was a long time ago */
+                                cdls->cdls_delay /= 8;
+                        } else {
+                                cdls->cdls_delay *= 2;
+
+                                if (cdls->cdls_delay < CFS_TICK)
+                                        cdls->cdls_delay = CFS_TICK;
+                                else if (cdls->cdls_delay > dmax)
+                                        cdls->cdls_delay = dmax;
+                        }
+
+                        /* ensure cdls_next is never zero after it's been seen */
+                        cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1;
+                }
+
+                if (cdls != NULL && cdls->cdls_count != 0) {
+                        char buf2[100];
+
+                        nob = snprintf(buf2, sizeof(buf2),
+                                       "Skipped %d previous similar message%s\n",
+                                       cdls->cdls_count, (cdls->cdls_count > 1) ? "s" : "");
+
+                        catamount_printline(buf2, nob);
+                        cdls->cdls_count = 0;
+                        goto out_file;
+                }
+                catamount_printline(buf, nob);
+       }
+out_file:
+        /* return on toconsole > 1, as we don't want the user getting
+        * spammed by the debug data */
+        if (toconsole > 1)
+                return 0;
+#endif
+        if (debug_file_fd == NULL)
+                return 0;
+
+        gettimeofday(&tv, NULL);
+
+        fprintf(debug_file_fd, "%lu.%06lu:%u:%s:(%s:%d:%s()): %s",
+                tv.tv_sec, tv.tv_usec, source_pid, source_nid,
+                file, line, fn, buf);
+
+        return 0;
+}
+
+void
+libcfs_assertion_failed(const char *expr, const char *file, const char *func,
+                        const int line)
+{
+        libcfs_debug_msg(NULL, 0, D_EMERG, file, func, line,
+                         "ASSERTION(%s) failed\n", expr);
+        abort();
 }
 
-EXPORT_SYMBOL(portals_debug_dumplog);
-EXPORT_SYMBOL(portals_debug_set_level);
-EXPORT_SYMBOL(portals_nid2str);
-EXPORT_SYMBOL(portals_id2str);
+#endif /* __KERNEL__ */
index 49f8e87..8bf35cc 100644 (file)
@@ -1,4 +1,4 @@
 EXTRA_DIST := linux-debug.c linux-lwt.c linux-prim.c linux-tracefile.c \
        linux-fs.c linux-mem.c linux-proc.c linux-utils.c linux-lock.c  \
-       linux-module.c linux-sync.c linux-curproc.c
+       linux-module.c linux-sync.c linux-curproc.c linux-tcpip.c
 
index 719e48b..e446169 100644 (file)
@@ -20,7 +20,7 @@
 
 #include <linux/sched.h>
 
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 
 #include <libcfs/libcfs.h>
 #include <libcfs/kp30.h>
index 62779de..abc07b2 100644 (file)
@@ -24,7 +24,9 @@
 # define EXPORT_SYMTAB
 #endif
 
+#ifdef HAVE_KERNEL_CONFIG_H
 #include <linux/config.h>
+#endif
 #include <linux/module.h>
 #include <linux/kmod.h>
 #include <linux/notifier.h>
@@ -47,7 +49,7 @@
 #include <linux/miscdevice.h>
 #include <linux/version.h>
 
-# define DEBUG_SUBSYSTEM S_PORTALS
+# define DEBUG_SUBSYSTEM S_LNET
 
 #include <libcfs/kp30.h>
 #include <libcfs/linux/portals_compat25.h>
@@ -59,9 +61,9 @@
 #include <linux/kallsyms.h>
 #endif
 
-char portals_upcall[1024] = "/usr/lib/lustre/portals_upcall";
+char lnet_upcall[1024] = "/usr/lib/lustre/lnet_upcall";
 
-void portals_run_upcall(char **argv)
+void libcfs_run_upcall(char **argv)
 {
         int   rc;
         int   argc;
@@ -71,7 +73,7 @@ void portals_run_upcall(char **argv)
                 NULL};
         ENTRY;
 
-        argv[0] = portals_upcall;
+        argv[0] = lnet_upcall;
         argc = 1;
         while (argv[argc] != NULL)
                 argc++;
@@ -80,15 +82,15 @@ void portals_run_upcall(char **argv)
 
         rc = USERMODEHELPER(argv[0], argv, envp);
         if (rc < 0) {
-                CERROR("Error %d invoking portals upcall %s %s%s%s%s%s%s%s%s; "
-                       "check /proc/sys/portals/upcall\n",
+                CERROR("Error %d invoking LNET upcall %s %s%s%s%s%s%s%s%s; "
+                       "check /proc/sys/lnet/upcall\n",
                        rc, argv[0], argv[1],
                        argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
                        argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
                        argc < 5 ? "" : ",", argc < 5 ? "" : argv[4],
                        argc < 6 ? "" : ",...");
         } else {
-                CWARN("Invoked portals upcall %s %s%s%s%s%s%s%s%s\n",
+                CWARN("Invoked LNET upcall %s %s%s%s%s%s%s%s%s\n",
                        argv[0], argv[1],
                        argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
                        argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
@@ -97,7 +99,7 @@ void portals_run_upcall(char **argv)
         }
 }
 
-void portals_run_lbug_upcall(char *file, const char *fn, const int line)
+void libcfs_run_lbug_upcall(char *file, const char *fn, const int line)
 {
         char *argv[6];
         char buf[32];
@@ -111,18 +113,50 @@ void portals_run_lbug_upcall(char *file, const char *fn, const int line)
         argv[4] = buf;
         argv[5] = NULL;
 
-        portals_run_upcall (argv);
+        libcfs_run_upcall (argv);
+}
+
+#ifdef __arch_um__
+void lbug_with_loc(char *file, const char *func, const int line)
+{
+        libcfs_catastrophe = 1;
+        libcfs_debug_msg(NULL, 0, D_EMERG, file, func, line,
+                         "LBUG - trying to dump log to /tmp/lustre-log\n");
+        libcfs_debug_dumplog();
+        libcfs_run_lbug_upcall(file, func, line);
+        asm("int $3");
+        panic("LBUG");
 }
+#else
+/* coverity[+kill] */
+void lbug_with_loc(char *file, const char *func, const int line)
+{
+        libcfs_catastrophe = 1;
+        libcfs_debug_msg(NULL, 0, D_EMERG, file, func, line, "LBUG\n");
+
+        if (in_interrupt()) {
+                panic("LBUG in interrupt.\n");
+                /* not reached */
+        }
+
+        libcfs_debug_dumpstack(NULL);
+        libcfs_debug_dumplog();
+        libcfs_run_lbug_upcall(file, func, line);
+        set_task_state(current, TASK_UNINTERRUPTIBLE);
+        while (1)
+                schedule();
+}
+#endif /* __arch_um__ */
 
 #ifdef __KERNEL__
 
-void portals_debug_dumpstack(struct task_struct *tsk)
+void libcfs_debug_dumpstack(struct task_struct *tsk)
 {
 #if defined(__arch_um__)
         if (tsk != NULL)
                 CWARN("stack dump for pid %d (%d) requested; wake up gdb.\n",
                       tsk->pid, UML_PID(tsk));
-        asm("int $3");
+        //asm("int $3");
 #elif defined(HAVE_SHOW_TASK)
         /* this is exported by lustre kernel version 42 */
         extern void show_task(struct task_struct *);
@@ -133,18 +167,71 @@ void portals_debug_dumpstack(struct task_struct *tsk)
         show_task(tsk);
 #else
         CWARN("can't show stack: kernel doesn't export show_task\n");
+        if ((tsk == NULL) || (tsk == current))
+                dump_stack();
 #endif
 }
 
-cfs_task_t *portals_current(void)
+cfs_task_t *libcfs_current(void)
 {
         CWARN("current task struct is %p\n", current);
         return current;
 }
-EXPORT_SYMBOL(portals_debug_dumpstack);
-EXPORT_SYMBOL(portals_current);
+
+static int panic_notifier(struct notifier_block *self, unsigned long unused1,
+                         void *unused2)
+{
+        if (libcfs_panic_in_progress)
+                return 0;
+
+        libcfs_panic_in_progress = 1;
+        mb();
+
+#ifdef LNET_DUMP_ON_PANIC
+        /* This is currently disabled because it spews far too much to the
+         * console on the rare cases it is ever triggered. */
+
+        if (in_interrupt()) {
+                trace_debug_print();
+        } else {
+                while (current->lock_depth >= 0)
+                        unlock_kernel();
+
+                libcfs_debug_dumplog_internal((void *)(long)cfs_curproc_pid());
+        }
+#endif
+        return 0;
+}
+
+static struct notifier_block libcfs_panic_notifier = {
+        notifier_call :     panic_notifier,
+        next :              NULL,
+        priority :          10000
+};
+
+void libcfs_register_panic_notifier(void)
+{
+#ifdef HAVE_ATOMIC_PANIC_NOTIFIER
+        atomic_notifier_chain_register(&panic_notifier_list, &libcfs_panic_notifier);
+#else
+        notifier_chain_register(&panic_notifier_list, &libcfs_panic_notifier);
+#endif
+}
+
+void libcfs_unregister_panic_notifier(void)
+{
+#ifdef HAVE_ATOMIC_PANIC_NOTIFIER
+        atomic_notifier_chain_unregister(&panic_notifier_list, &libcfs_panic_notifier);
+#else
+        notifier_chain_unregister(&panic_notifier_list, &libcfs_panic_notifier);
+#endif
+}
+
+EXPORT_SYMBOL(libcfs_debug_dumpstack);
+EXPORT_SYMBOL(libcfs_current);
 
 #endif /* __KERNEL__ */
 
-EXPORT_SYMBOL(portals_run_upcall);
-EXPORT_SYMBOL(portals_run_lbug_upcall);
+EXPORT_SYMBOL(libcfs_run_upcall);
+EXPORT_SYMBOL(libcfs_run_lbug_upcall);
+EXPORT_SYMBOL(lbug_with_loc);
index 61b7166..061944c 100644 (file)
@@ -1,6 +1,7 @@
-# define DEBUG_SUBSYSTEM S_PORTALS
+# define DEBUG_SUBSYSTEM S_LNET
 
 #include <linux/fs.h>
+#include <linux/kdev_t.h>
 #include <linux/ctype.h>
 #include <asm/uaccess.h>
 
@@ -14,12 +15,12 @@ cfs_filp_open (const char *name, int flags, int mode, int *err)
         */
        cfs_file_t      *filp = NULL;
 
-       filp = filp_open(name, flags, mode); 
-       if (IS_ERR(filp)) { 
+       filp = filp_open(name, flags, mode);
+       if (IS_ERR(filp)) {
                int rc;
 
-               rc = PTR_ERR(filp); 
-               printk(KERN_ERR "LustreError: can't open %s file: err %d\n", 
+               rc = PTR_ERR(filp);
+               printk(KERN_ERR "LustreError: can't open %s file: err %d\n",
                                name, rc);
                if (err)
                        *err = rc;
@@ -28,4 +29,87 @@ cfs_filp_open (const char *name, int flags, int mode, int *err)
        return filp;
 }
 
+/* write a userspace buffer to disk.
+ * NOTE: this returns 0 on success, not the number of bytes written. */
+ssize_t
+cfs_user_write (cfs_file_t *filp, const char *buf, size_t count, loff_t *offset)
+{
+       mm_segment_t fs;
+       ssize_t size = 0;
+
+       fs = get_fs();
+       set_fs(KERNEL_DS);
+       while (count > 0) {
+               size = filp->f_op->write(filp, (char *)buf, count, offset);
+               if (size < 0)
+                       break;
+               count -= size;
+               size = 0;
+       }
+       set_fs(fs);
+
+       return size;
+}
+
+cfs_rdev_t cfs_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor)
+{
+        return MKDEV(major, minor);
+}
+
+cfs_major_nr_t cfs_rdev_major(cfs_rdev_t rdev)
+{
+        return MAJOR(rdev);
+}
+
+cfs_minor_nr_t cfs_rdev_minor(cfs_rdev_t rdev)
+{
+        return MINOR(rdev);
+}
+
+#if !(CFS_O_CREAT == O_CREAT && CFS_O_EXCL == O_EXCL &&        \
+     CFS_O_TRUNC == O_TRUNC && CFS_O_APPEND == O_APPEND &&\
+     CFS_O_NONBLOCK == O_NONBLOCK && CFS_O_NDELAY == O_NDELAY &&\
+     CFS_O_SYNC == O_SYNC && CFS_O_ASYNC == FASYNC &&\
+     CFS_O_DIRECT == O_DIRECT && CFS_O_LARGEFILE == O_LARGEFILE &&\
+     CFS_O_DIRECTORY == O_DIRECTORY && CFS_O_NOFOLLOW == O_NOFOLLOW)
+
+int cfs_oflags2univ(int flags)
+{
+       int f; 
+       
+       f = flags & O_ACCMODE;
+       f |= (flags & O_CREAT) ? CFS_O_CREAT: 0;
+       f |= (flags & O_EXCL) ? CFS_O_EXCL: 0;
+       f |= (flags & O_NOCTTY) ? CFS_O_NOCTTY: 0;
+       f |= (flags & O_TRUNC) ? CFS_O_TRUNC: 0;
+       f |= (flags & O_APPEND) ? CFS_O_APPEND: 0;
+       f |= (flags & O_NONBLOCK) ? CFS_O_NONBLOCK: 0;
+       f |= (flags & O_SYNC)? CFS_O_SYNC: 0;
+       f |= (flags & FASYNC)? CFS_O_ASYNC: 0;
+       f |= (flags & O_DIRECTORY)? CFS_O_DIRECTORY: 0;
+       f |= (flags & O_DIRECT)? CFS_O_DIRECT: 0;
+       f |= (flags & O_LARGEFILE)? CFS_O_LARGEFILE: 0;
+       f |= (flags & O_NOFOLLOW)? CFS_O_NOFOLLOW: 0;
+       f |= (flags & O_NOATIME)? CFS_O_NOATIME: 0;
+       return f;
+}
+#else
+
+int cfs_oflags2univ(int flags)
+{
+       return (flags);
+}
+#endif
+
+/* 
+ * XXX Liang: we don't need cfs_univ2oflags() now.
+ */
+int cfs_univ2oflags(int flags)
+{
+       return (flags);
+}
+
 EXPORT_SYMBOL(cfs_filp_open);
+EXPORT_SYMBOL(cfs_user_write);
+EXPORT_SYMBOL(cfs_oflags2univ);
+EXPORT_SYMBOL(cfs_univ2oflags);
index a1d713e..01511d6 100644 (file)
@@ -1,4 +1,4 @@
-# define DEBUG_SUBSYSTEM S_PORTALS
+# define DEBUG_SUBSYSTEM S_LNET
 
 #include <arch-linux/cfs_lock.h>
 #include <libcfs/libcfs.h>
index 32adc80..520c54c 100644 (file)
@@ -1,2 +1,2 @@
-# define DEBUG_SUBSYSTEM S_PORTALS
+# define DEBUG_SUBSYSTEM S_LNET
 
index fb2c6a0..f327814 100644 (file)
@@ -18,7 +18,7 @@
  *   along with Lustre; if not, write to the Free Software
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 
 #include <linux/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 #include <libcfs/libcfs.h>
 
-void *
-cfs_alloc(size_t nr_bytes, u_int32_t flags)
+static unsigned int cfs_alloc_flags_to_gfp(u_int32_t flags)
 {
-       void *ptr = NULL;
        unsigned int mflags = 0;
 
 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-       if (flags & CFS_ALLOC_ATOMIC)
-               mflags |= __GFP_HIGH;
+        if (flags & CFS_ALLOC_ATOMIC)
+                mflags |= __GFP_HIGH;
         else if (flags & CFS_ALLOC_WAIT)
                 mflags |= __GFP_WAIT;
-       else
-               mflags |= (__GFP_HIGH | __GFP_WAIT);
-
-       if (flags & CFS_ALLOC_FS)
-               mflags |= __GFP_FS;
-       if (flags & CFS_ALLOC_IO)
-               mflags |= __GFP_IO | __GFP_HIGHIO;
+        else
+                mflags |= (__GFP_HIGH | __GFP_WAIT);
+        if (flags & CFS_ALLOC_IO)
+                mflags |= __GFP_IO | __GFP_HIGHIO;
 #else
         if (flags & CFS_ALLOC_ATOMIC)
                 mflags |= __GFP_HIGH;
         else
                 mflags |= __GFP_WAIT;
-        if (flags & CFS_ALLOC_FS)
-                mflags |= __GFP_FS;
+        if (flags & CFS_ALLOC_NOWARN)
+                mflags |= __GFP_NOWARN;
         if (flags & CFS_ALLOC_IO)
                 mflags |= __GFP_IO;
 #endif
+        if (flags & CFS_ALLOC_FS)
+                mflags |= __GFP_FS;
+        return mflags;
+}
+
+void *
+cfs_alloc(size_t nr_bytes, u_int32_t flags)
+{
+       void *ptr = NULL;
 
-       ptr = kmalloc(nr_bytes, mflags);
+       ptr = kmalloc(nr_bytes, cfs_alloc_flags_to_gfp(flags));
        if (ptr != NULL && (flags & CFS_ALLOC_ZERO))
                memset(ptr, 0, nr_bytes);
        return ptr;
@@ -79,83 +83,37 @@ cfs_free_large(void *addr)
        vfree(addr);
 }
 
-cfs_page_t *
-cfs_alloc_pages(unsigned int flags, unsigned int order)
+cfs_page_t *cfs_alloc_page(unsigned int flags)
 {
-        unsigned int mflags = 0;
-
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-       if (flags & CFS_ALLOC_ATOMIC)
-               mflags |= __GFP_HIGH;
-        else if (flags & CFS_ALLOC_WAIT)
-                mflags |= __GFP_WAIT;
-       else
-               mflags |= (__GFP_HIGH | __GFP_WAIT);
-
-       if (flags & CFS_ALLOC_FS)
-               mflags |= __GFP_FS;
-       if (flags & CFS_ALLOC_IO)
-               mflags |= __GFP_IO | __GFP_HIGHIO;
-        if (flags & CFS_ALLOC_HIGH)
-                mflags |=  __GFP_HIGHMEM;
-#else
-        if (flags & CFS_ALLOC_ATOMIC)
-                mflags |= __GFP_HIGH;
-        else
-                mflags |= __GFP_WAIT;
-        if (flags & CFS_ALLOC_FS)
-                mflags |= __GFP_FS;
-        if (flags & CFS_ALLOC_IO)
-                mflags |= __GFP_IO;
-        if (flags & CFS_ALLOC_HIGH)
-                mflags |=  __GFP_HIGHMEM;
-#endif
-
-        return alloc_pages(mflags, order);
+        /*
+         * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+         * from here: this will lead to infinite recursion.
+         */
+        return alloc_pages(cfs_alloc_flags_to_gfp(flags), 0);
 }
 
 cfs_mem_cache_t *
 cfs_mem_cache_create (const char *name, size_t size, size_t offset,
-                      unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
-                      void (*dtor)(void*, cfs_mem_cache_t *, unsigned long))
+                      unsigned long flags)
 {
-        return kmem_cache_create(name, size, offset, flags, ctor, dtor);
+        return kmem_cache_create(name, size, offset, flags, NULL, NULL);
 }
 
 int
 cfs_mem_cache_destroy (cfs_mem_cache_t * cachep)
 {
+#ifdef HAVE_KMEM_CACHE_DESTROY_INT
         return kmem_cache_destroy(cachep);
+#else
+        kmem_cache_destroy(cachep);
+        return 0;
+#endif
 }
 
 void *
 cfs_mem_cache_alloc(cfs_mem_cache_t *cachep, int flags)
 {
-        unsigned int mflags = 0;
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-       if (flags & CFS_SLAB_ATOMIC)
-               mflags |= __GFP_HIGH;
-        else if (flags & CFS_ALLOC_WAIT)
-                mflags |= __GFP_WAIT;
-       else
-               mflags |= (__GFP_HIGH | __GFP_WAIT);
-
-       if (flags & CFS_SLAB_FS)
-               mflags |= __GFP_FS;
-       if (flags & CFS_SLAB_IO)
-               mflags |= __GFP_IO | __GFP_HIGHIO;
-#else
-        if (flags & CFS_SLAB_ATOMIC)
-                mflags |= __GFP_HIGH;
-        else
-                mflags |= __GFP_WAIT;
-        if (flags & CFS_SLAB_FS)
-                mflags |= __GFP_FS;
-        if (flags & CFS_SLAB_IO)
-                mflags |= __GFP_IO;
-#endif
-
-        return kmem_cache_alloc(cachep, mflags);
+        return kmem_cache_alloc(cachep, cfs_alloc_flags_to_gfp(flags));
 }
 
 void
@@ -168,7 +126,7 @@ EXPORT_SYMBOL(cfs_alloc);
 EXPORT_SYMBOL(cfs_free);
 EXPORT_SYMBOL(cfs_alloc_large);
 EXPORT_SYMBOL(cfs_free_large);
-EXPORT_SYMBOL(cfs_alloc_pages);
+EXPORT_SYMBOL(cfs_alloc_page);
 EXPORT_SYMBOL(cfs_mem_cache_create);
 EXPORT_SYMBOL(cfs_mem_cache_destroy);
 EXPORT_SYMBOL(cfs_mem_cache_alloc);
index 4b2558b..6f21853 100644 (file)
@@ -1,48 +1,25 @@
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 
 #include <libcfs/libcfs.h>
 #include <libcfs/kp30.h>
 
-#define PORTAL_MINOR 240
+#define LNET_MINOR 240
 
-
-void
-kportal_daemonize (char *str)
+int libcfs_ioctl_getdata(char *buf, char *end, void *arg)
 {
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,63)) 
-       daemonize(str);
-#else 
-       daemonize(); 
-       snprintf (current->comm, sizeof (current->comm), "%s", str);
-#endif
-}
-
-void
-kportal_blockallsigs ()
-{ 
-       unsigned long  flags; 
-       
-       SIGNAL_MASK_LOCK(current, flags); 
-       sigfillset(&current->blocked); 
-       RECALC_SIGPENDING; 
-       SIGNAL_MASK_UNLOCK(current, flags);
-}
-
-int portal_ioctl_getdata(char *buf, char *end, void *arg)
-{
-        struct portal_ioctl_hdr *hdr;
-        struct portal_ioctl_data *data;
+        struct libcfs_ioctl_hdr   *hdr;
+        struct libcfs_ioctl_data  *data;
         int err;
         ENTRY;
 
-        hdr = (struct portal_ioctl_hdr *)buf;
-        data = (struct portal_ioctl_data *)buf;
+        hdr = (struct libcfs_ioctl_hdr *)buf;
+        data = (struct libcfs_ioctl_data *)buf;
 
         err = copy_from_user(buf, (void *)arg, sizeof(*hdr));
         if (err)
                 RETURN(err);
 
-        if (hdr->ioc_version != PORTAL_IOCTL_VERSION) {
+        if (hdr->ioc_version != LIBCFS_IOCTL_VERSION) {
                 CERROR("PORTALS: version mismatch kernel vs application\n");
                 RETURN(-EINVAL);
         }
@@ -53,7 +30,7 @@ int portal_ioctl_getdata(char *buf, char *end, void *arg)
         }
 
 
-        if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) {
+        if (hdr->ioc_len < sizeof(struct libcfs_ioctl_data)) {
                 CERROR("PORTALS: user buffer too small for ioctl\n");
                 RETURN(-EINVAL);
         }
@@ -62,7 +39,7 @@ int portal_ioctl_getdata(char *buf, char *end, void *arg)
         if (err)
                 RETURN(err);
 
-        if (portal_ioctl_is_invalid(data)) {
+        if (libcfs_ioctl_is_invalid(data)) {
                 CERROR("PORTALS: ioctl not correctly formatted\n");
                 RETURN(-EINVAL);
         }
@@ -76,18 +53,25 @@ int portal_ioctl_getdata(char *buf, char *end, void *arg)
 
         RETURN(0);
 }
-                                                                                                                                                                        
+
+int libcfs_ioctl_popdata(void *arg, void *data, int size)
+{
+       if (copy_to_user((char *)arg, data, size))
+               return -EFAULT;
+       return 0;
+}
+
 extern struct cfs_psdev_ops          libcfs_psdev_ops;
 
-static int 
+static int
 libcfs_psdev_open(struct inode * inode, struct file * file)
-{ 
-       struct portals_device_userstate **pdu = NULL;
+{
+       struct libcfs_device_userstate **pdu = NULL;
        int    rc = 0;
 
-       if (!inode) 
+       if (!inode)
                return (-EINVAL);
-       pdu = (struct portals_device_userstate **)&file->private_data;
+       pdu = (struct libcfs_device_userstate **)&file->private_data;
        if (libcfs_psdev_ops.p_open != NULL)
                rc = libcfs_psdev_ops.p_open(0, (void *)pdu);
        else
@@ -96,13 +80,13 @@ libcfs_psdev_open(struct inode * inode, struct file * file)
 }
 
 /* called when closing /dev/device */
-static int 
+static int
 libcfs_psdev_release(struct inode * inode, struct file * file)
 {
-       struct portals_device_userstate *pdu;
+       struct libcfs_device_userstate *pdu;
        int    rc = 0;
 
-       if (!inode) 
+       if (!inode)
                return (-EINVAL);
        pdu = file->private_data;
        if (libcfs_psdev_ops.p_close != NULL)
@@ -112,59 +96,56 @@ libcfs_psdev_release(struct inode * inode, struct file * file)
        return rc;
 }
 
-static int 
-libcfs_ioctl(struct inode *inode, struct file *file, 
+static int
+libcfs_ioctl(struct inode *inode, struct file *file,
             unsigned int cmd, unsigned long arg)
-{ 
+{
        struct cfs_psdev_file    pfile;
        int    rc = 0;
 
-       if (current->fsuid != 0) 
-               return -EACCES; 
-       
-       if ( _IOC_TYPE(cmd) != IOC_PORTAL_TYPE || 
-            _IOC_NR(cmd) < IOC_PORTAL_MIN_NR  || 
-            _IOC_NR(cmd) > IOC_PORTAL_MAX_NR ) { 
-               CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n", 
-                      _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); 
-               return (-EINVAL); 
-       } 
-       
+       if (current->fsuid != 0)
+               return -EACCES;
+
+       if ( _IOC_TYPE(cmd) != IOC_LIBCFS_TYPE ||
+            _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR  ||
+            _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR ) {
+               CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n",
+                      _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd));
+               return (-EINVAL);
+       }
+
        /* Handle platform-dependent IOC requests */
-       switch (cmd) { 
-       case IOC_PORTAL_PANIC: 
-               if (!capable (CAP_SYS_BOOT)) 
-                       return (-EPERM); 
-               panic("debugctl-invoked panic"); 
+       switch (cmd) {
+       case IOC_LIBCFS_PANIC:
+               if (!capable (CAP_SYS_BOOT))
+                       return (-EPERM);
+               panic("debugctl-invoked panic");
                return (0);
-       case IOC_PORTAL_MEMHOG: 
-               if (!capable (CAP_SYS_ADMIN)) 
+       case IOC_LIBCFS_MEMHOG:
+               if (!capable (CAP_SYS_ADMIN))
                        return -EPERM;
                /* go thought */
        }
 
        pfile.off = 0;
        pfile.private_data = file->private_data;
-       if (libcfs_psdev_ops.p_ioctl != NULL) 
-               rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg); 
+       if (libcfs_psdev_ops.p_ioctl != NULL)
+               rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg);
        else
                rc = -EPERM;
        return (rc);
 }
 
-static struct file_operations libcfs_fops = { 
-       ioctl:   libcfs_ioctl, 
-       open:    libcfs_psdev_open, 
+static struct file_operations libcfs_fops = {
+       ioctl:   libcfs_ioctl,
+       open:    libcfs_psdev_open,
        release: libcfs_psdev_release
 };
 
-cfs_psdev_t libcfs_dev = { 
-       PORTAL_MINOR, 
-       "portals", 
+cfs_psdev_t libcfs_dev = {
+       LNET_MINOR,
+       "lnet",
        &libcfs_fops
 };
 
-EXPORT_SYMBOL(kportal_blockallsigs);
-EXPORT_SYMBOL(kportal_daemonize);
-
 
index 95365ee..fe5d61f 100644 (file)
-#define DEBUG_SUBSYSTEM S_PORTALS
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#ifdef HAVE_KERNEL_CONFIG_H
 #include <linux/config.h>
+#endif
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <libcfs/libcfs.h>
+
+#if defined(CONFIG_KGDB)
+#include <asm/kgdb.h>
+#endif
+
+void cfs_enter_debugger(void)
+{
+#if defined(CONFIG_KGDB)
+        BREAKPOINT();
+#elif defined(__arch_um__)
+        asm("int $3");
+#else
+        /* nothing */
+#endif
+}
+
+void cfs_daemonize(char *str) {
+        unsigned long flags;
+
+        lock_kernel();
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,63))
+        daemonize(str);
+#else
+        daemonize();
+        exit_files(current);
+        reparent_to_init();
+        snprintf (current->comm, sizeof (current->comm), "%s", str);
+#endif
+        SIGNAL_MASK_LOCK(current, flags);
+        sigfillset(&current->blocked);
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
+        unlock_kernel();
+}
+
+int cfs_daemonize_ctxt(char *str) {
+        struct task_struct *tsk = current;
+        struct fs_struct *fs = NULL;
+
+        cfs_daemonize(str);
+        fs = copy_fs_struct(tsk->fs);
+        if (fs == NULL)
+                return -ENOMEM;
+        exit_fs(tsk);
+        tsk->fs = fs;
+        return 0;
+}
+
+
+sigset_t
+cfs_get_blockedsigs(void)
+{
+        unsigned long          flags;
+        sigset_t        old;
+
+        SIGNAL_MASK_LOCK(current, flags);
+        old = current->blocked;
+        SIGNAL_MASK_UNLOCK(current, flags);
+        return old;
+}
+
+sigset_t
+cfs_block_allsigs(void)
+{
+        unsigned long          flags;
+        sigset_t        old;
+
+        SIGNAL_MASK_LOCK(current, flags);
+        old = current->blocked;
+        sigfillset(&current->blocked);
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
+
+        return old;
+}
+
+sigset_t
+cfs_block_sigs(sigset_t bits)
+{
+        unsigned long  flags;
+        sigset_t        old;
+
+        SIGNAL_MASK_LOCK(current, flags);
+        old = current->blocked;
+        current->blocked = bits;
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
+        return old;
+}
+
+void
+cfs_restore_sigs (cfs_sigset_t old)
+{
+        unsigned long  flags;
+
+        SIGNAL_MASK_LOCK(current, flags);
+        current->blocked = old;
+        RECALC_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
+}
+
+int
+cfs_signal_pending(void)
+{
+        return signal_pending(current);
+}
+
+void
+cfs_clear_sigpending(void)
+{
+        unsigned long flags;
+
+        SIGNAL_MASK_LOCK(current, flags);
+        CLEAR_SIGPENDING;
+        SIGNAL_MASK_UNLOCK(current, flags);
+}
 
 int
 libcfs_arch_init(void)
-{ 
-       return 0;
+{
+        return 0;
 }
 
 void
 libcfs_arch_cleanup(void)
 {
-       return; 
+        return;
 }
 
 EXPORT_SYMBOL(libcfs_arch_init);
 EXPORT_SYMBOL(libcfs_arch_cleanup);
+EXPORT_SYMBOL(cfs_daemonize);
+EXPORT_SYMBOL(cfs_daemonize_ctxt);
+EXPORT_SYMBOL(cfs_block_allsigs);
+EXPORT_SYMBOL(cfs_block_sigs);
+EXPORT_SYMBOL(cfs_get_blockedsigs);
+EXPORT_SYMBOL(cfs_restore_sigs);
+EXPORT_SYMBOL(cfs_signal_pending);
+EXPORT_SYMBOL(cfs_clear_sigpending);
index 77277ba..3efdd46 100644 (file)
@@ -26,7 +26,9 @@
 # define EXPORT_SYMTAB
 #endif
 
+#ifdef HAVE_KERNEL_CONFIG_H
 #include <linux/config.h>
+#endif
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/sysctl.h>
 
-# define DEBUG_SUBSYSTEM S_PORTALS
+# define DEBUG_SUBSYSTEM S_LNET
 
 #include <libcfs/kp30.h>
 #include <asm/div64.h>
 #include "tracefile.h"
 
-static struct ctl_table_header *portals_table_header = NULL;
-extern char debug_file_path[1024];
-extern char portals_upcall[1024];
+static struct ctl_table_header *lnet_table_header = NULL;
+extern char lnet_upcall[1024];
 
-#define PSDEV_PORTALS  (0x100)
+#define PSDEV_LNET  (0x100)
 enum {
         PSDEV_DEBUG = 1,          /* control debugging */
         PSDEV_SUBSYSTEM_DEBUG,    /* control debugging */
-        PSDEV_PRINTK,             /* force all errors to console */
-        PSDEV_CONSOLE,            /* allow _any_ messages to console */
+        PSDEV_PRINTK,             /* force all messages to console */
+        PSDEV_CONSOLE_RATELIMIT,  /* ratelimit console messages */
         PSDEV_DEBUG_PATH,         /* crashdump log location */
         PSDEV_DEBUG_DUMP_PATH,    /* crashdump tracelog location */
-        PSDEV_PORTALS_UPCALL,     /* User mode upcall script  */
-        PSDEV_PORTALS_MEMUSED,    /* bytes currently PORTAL_ALLOCated */
-        PSDEV_PORTALS_CATASTROPHE,/* if we have LBUGged or panic'd */
+        PSDEV_LNET_UPCALL,        /* User mode upcall script  */
+        PSDEV_LNET_MEMUSED,       /* bytes currently PORTAL_ALLOCated */
+        PSDEV_LNET_CATASTROPHE,   /* if we have LBUGged or panic'd */
 };
 
-static struct ctl_table portals_table[] = {
-        {PSDEV_DEBUG, "debug", &portal_debug, sizeof(int), 0644, NULL,
-         &proc_dointvec},
-        {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &portal_subsystem_debug,
+int LL_PROC_PROTO(proc_dobitmasks);
+
+static struct ctl_table lnet_table[] = {
+        {PSDEV_DEBUG, "debug", &libcfs_debug, sizeof(int), 0644, NULL,
+         &proc_dobitmasks},
+        {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &libcfs_subsystem_debug,
+         sizeof(int), 0644, NULL, &proc_dobitmasks},
+        {PSDEV_PRINTK, "printk", &libcfs_printk, sizeof(int), 0644, NULL,
+         &proc_dobitmasks},
+        {PSDEV_CONSOLE_RATELIMIT, "console_ratelimit",&libcfs_console_ratelimit,
          sizeof(int), 0644, NULL, &proc_dointvec},
-        {PSDEV_PRINTK, "printk", &portal_printk, sizeof(int), 0644, NULL,
-         &proc_dointvec},
         {PSDEV_DEBUG_PATH, "debug_path", debug_file_path,
          sizeof(debug_file_path), 0644, NULL, &proc_dostring, &sysctl_string},
-        {PSDEV_PORTALS_UPCALL, "upcall", portals_upcall,
-         sizeof(portals_upcall), 0644, NULL, &proc_dostring,
+        {PSDEV_LNET_UPCALL, "upcall", lnet_upcall,
+         sizeof(lnet_upcall), 0644, NULL, &proc_dostring,
          &sysctl_string},
-        {PSDEV_PORTALS_MEMUSED, "memused", (int *)&portal_kmemory.counter,
+        {PSDEV_LNET_MEMUSED, "memused", (int *)&libcfs_kmemory.counter,
          sizeof(int), 0444, NULL, &proc_dointvec},
-        {PSDEV_PORTALS_CATASTROPHE, "catastrophe", &portals_catastrophe,
+        {PSDEV_LNET_CATASTROPHE, "catastrophe", &libcfs_catastrophe,
          sizeof(int), 0444, NULL, &proc_dointvec},
         {0}
 };
 
 static struct ctl_table top_table[2] = {
-        {PSDEV_PORTALS, "portals", NULL, 0, 0555, portals_table},
+        {PSDEV_LNET, "lnet", NULL, 0, 0555, lnet_table},
         {0}
 };
 
+int LL_PROC_PROTO(proc_dobitmasks)
+{
+        const int     tmpstrlen = 512;
+        char         *str;
+        int           rc = 0;
+        /* the proc filling api stumps me always, coax proc_dointvec
+         * and proc_dostring into doing the drudgery by cheating
+         * with a dummy ctl_table
+         */
+        struct ctl_table dummy = *table;
+        unsigned int *mask = (unsigned int *)table->data;
+        int           is_subsys = (mask == &libcfs_subsystem_debug) ? 1 : 0;
+
+       str = kmalloc(tmpstrlen, GFP_USER);
+        if (str == NULL)
+                return -ENOMEM;
+
+        if (write) {
+                size_t oldlen = *lenp;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,8)
+                loff_t oldpos = *ppos;
+#endif
 
-#ifdef PORTALS_PROFILING
-/*
- * profiling stuff.  we do this statically for now 'cause its simple,
- * but we could do some tricks with elf sections to have this array
- * automatically built.
- */
-#define def_prof(FOO) [PROF__##FOO] = {#FOO, 0, }
-
-struct prof_ent prof_ents[] = {
-        def_prof(our_recvmsg),
-        def_prof(our_sendmsg),
-        def_prof(socknal_recv),
-        def_prof(lib_parse),
-        def_prof(conn_list_walk),
-        def_prof(memcpy),
-        def_prof(lib_finalize),
-        def_prof(pingcli_time),
-        def_prof(gmnal_send),
-        def_prof(gmnal_recv),
-};
+                dummy.proc_handler = &proc_dointvec;
 
-EXPORT_SYMBOL(prof_ents);
+                /* old proc interface allows user to specify just an int
+                 * value; be compatible and don't break userland.
+                 */
+                rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos);
 
-/*
- * this function is as crazy as the proc filling api
- * requires.
- *
- * buffer: page allocated for us to scribble in.  the
- *  data returned to the user will be taken from here.
- * *start: address of the pointer that will tell the 
- *  caller where in buffer the data the user wants is.
- * ppos: offset in the entire /proc file that the user
- *  currently wants.
- * wanted: the amount of data the user wants.
- *
- * while going, 'curpos' is the offset in the entire
- * file where we currently are.  We only actually
- * start filling buffer when we get to a place in
- * the file that the user cares about.
- *
- * we take care to only sprintf when the user cares because
- * we're holding a lock while we do this.
- *
- * we're smart and know that we generate fixed size lines.
- * we only start writing to the buffer when the user cares.
- * This is unpredictable because we don't snapshot the
- * list between calls that are filling in a file from
- * the list.  The list could change mid read and the
- * output will look very weird indeed.  oh well.
- */
+                if (rc != -EINVAL)
+                        goto out;
 
-static int prof_read_proc(char *buffer, char **start, off_t ppos, int wanted,
-                          int *eof, void *data)
-{
-        int len = 0, i;
-        int curpos;
-        char *header = "Interval        Cycles_per (Starts Finishes Total)\n";
-        int header_len = strlen(header);
-        char *format = "%-15s %.12Ld (%.12d %.12d %.12Ld)";
-        int line_len = (15 + 1 + 12 + 2 + 12 + 1 + 12 + 1 + 12 + 1);
-
-        *start = buffer;
-
-        if (ppos < header_len) {
-                int diff = MIN(header_len, wanted);
-                memcpy(buffer, header + ppos, diff);
-                len += diff;
-                ppos += diff;
-        }
+                /* using new interface */
+                dummy.data = str;
+                dummy.maxlen = tmpstrlen;
+                dummy.proc_handler = &proc_dostring;
 
-        if (len >= wanted)
-                goto out;
+                /* proc_dointvec might have changed these */
+                *lenp = oldlen;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,8)
+                *ppos = oldpos;
+#endif
 
-        curpos = header_len;
+                rc = ll_proc_dostring(&dummy, write, filp, buffer, lenp, ppos);
 
-        for ( i = 0; i < MAX_PROFS ; i++) {
-                int copied;
-                struct prof_ent *pe = &prof_ents[i];
-                long long cycles_per;
-                /*
-                 * find the part of the array that the buffer wants
-                 */
-                if (ppos >= (curpos + line_len))  {
-                        curpos += line_len;
-                        continue;
-                }
-                /* the clever caller split a line */
-                if (ppos > curpos) {
-                        *start = buffer + (ppos - curpos);
-                }
-
-                if (pe->finishes == 0)
-                        cycles_per = 0;
-                else
-                {
-                        cycles_per = pe->total_cycles;
-                        do_div (cycles_per, pe->finishes);
-                }
-
-                copied = sprintf(buffer + len, format, pe->str, cycles_per,
-                                 pe->starts, pe->finishes, pe->total_cycles);
-
-                len += copied;
-
-                /* pad to line len, -1 for \n */
-                if ((copied < line_len-1)) {
-                        int diff = (line_len-1) - copied;
-                        memset(buffer + len, ' ', diff);
-                        len += diff;
-                        copied += diff;
-                }
-
-                buffer[len++]= '\n';
-
-                /* bail if we have enough */
-                if (((buffer + len) - *start) >= wanted)
-                        break;
-
-                curpos += line_len;
-        }
+                if (rc != 0)
+                        goto out;
 
-        /* lameness */
-        if (i == MAX_PROFS)
-                *eof = 1;
- out:
+                rc = libcfs_debug_str2mask(mask, dummy.data, is_subsys);
+        } else {
+                dummy.data = str;
+                dummy.maxlen = tmpstrlen;
+                dummy.proc_handler = &proc_dostring;
 
-        return MIN(((buffer + len) - *start), wanted);
-}
+                libcfs_debug_mask2str(dummy.data, dummy.maxlen,*mask,is_subsys);
 
-/*
- * all kids love /proc :/
- */
-static unsigned char basedir[]="net/portals";
-#endif /* PORTALS_PROFILING */
+                rc = ll_proc_dostring(&dummy, write, filp, buffer, lenp, ppos);
+        }
+
+out:
+        kfree(str);
+        return rc;
+}
 
 int insert_proc(void)
 {
         struct proc_dir_entry *ent;
-#if PORTALS_PROFILING
-        unsigned char dir[128];
-
-        if (ARRAY_SIZE(prof_ents) != MAX_PROFS) {
-                CERROR("profiling enum and array are out of sync.\n");
-                return -1;
-        }
-
-        /*
-         * This is pretty lame.  assuming that failure just
-         * means that they already existed.
-         */
-        strcat(dir, basedir);
-        create_proc_entry(dir, S_IFDIR, 0);
-
-        strcat(dir, "/cycles");
-        ent = create_proc_entry(dir, 0, 0);
-        if (!ent) {
-                CERROR("couldn't register %s?\n", dir);
-                return -1;
-        }
-
-        ent->data = NULL;
-        ent->read_proc = prof_read_proc;
-#endif /* PORTALS_PROFILING */
 
 #ifdef CONFIG_SYSCTL
-        if (!portals_table_header)
-                portals_table_header = register_sysctl_table(top_table, 0);
+        if (!lnet_table_header)
+                lnet_table_header = register_sysctl_table(top_table, 0);
 #endif
 
-        ent = create_proc_entry("sys/portals/dump_kernel", 0, NULL);
+        ent = create_proc_entry("sys/lnet/dump_kernel", 0, NULL);
         if (ent == NULL) {
                 CERROR("couldn't register dump_kernel\n");
                 return -1;
         }
         ent->write_proc = trace_dk;
 
-        ent = create_proc_entry("sys/portals/daemon_file", 0, NULL);
+        ent = create_proc_entry("sys/lnet/daemon_file", 0, NULL);
         if (ent == NULL) {
                 CERROR("couldn't register daemon_file\n");
                 return -1;
@@ -283,7 +192,7 @@ int insert_proc(void)
         ent->write_proc = trace_write_daemon_file;
         ent->read_proc = trace_read_daemon_file;
 
-        ent = create_proc_entry("sys/portals/debug_mb", 0, NULL);
+        ent = create_proc_entry("sys/lnet/debug_mb", 0, NULL);
         if (ent == NULL) {
                 CERROR("couldn't register debug_mb\n");
                 return -1;
@@ -296,29 +205,13 @@ int insert_proc(void)
 
 void remove_proc(void)
 {
-#if PORTALS_PROFILING
-        unsigned char dir[128];
-        int end;
-
-        dir[0]='\0';
-        strcat(dir, basedir);
-
-        end = strlen(dir);
-
-        strcat(dir, "/cycles");
-        remove_proc_entry(dir, 0);
-
-        dir[end] = '\0';
-        remove_proc_entry(dir, 0);
-#endif /* PORTALS_PROFILING */
-
-        remove_proc_entry("sys/portals/dump_kernel", NULL);
-        remove_proc_entry("sys/portals/daemon_file", NULL);
-        remove_proc_entry("sys/portals/debug_mb", NULL);
+        remove_proc_entry("sys/lnet/dump_kernel", NULL);
+        remove_proc_entry("sys/lnet/daemon_file", NULL);
+        remove_proc_entry("sys/lnet/debug_mb", NULL);
 
 #ifdef CONFIG_SYSCTL
-        if (portals_table_header)
-                unregister_sysctl_table(portals_table_header);
-        portals_table_header = NULL;
+        if (lnet_table_header)
+                unregister_sysctl_table(lnet_table_header);
+        lnet_table_header = NULL;
 #endif
 }
index 32adc80..520c54c 100644 (file)
@@ -1,2 +1,2 @@
-# define DEBUG_SUBSYSTEM S_PORTALS
+# define DEBUG_SUBSYSTEM S_LNET
 
diff --git a/lnet/libcfs/linux/linux-tcpip.c b/lnet/libcfs/linux/linux-tcpip.c
new file mode 100644 (file)
index 0000000..9cb85ef
--- /dev/null
@@ -0,0 +1,687 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <libcfs/kp30.h>
+#include <libcfs/libcfs.h>
+
+#include <linux/if.h>
+#include <linux/in.h>
+#include <linux/file.h>
+/* For sys_open & sys_close */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
+#include <linux/syscalls.h>
+#else
+#include <linux/fs.h>
+#endif
+
+int
+libcfs_sock_ioctl(int cmd, unsigned long arg)
+{
+        mm_segment_t   oldmm = get_fs();
+        struct socket  *sock;
+        int             fd;
+        int             rc;
+        struct file     *sock_filp;
+
+        rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+        if (rc != 0) {
+                CERROR ("Can't create socket: %d\n", rc);
+                return rc;
+        }
+
+        fd = sock_map_fd(sock);
+        if (fd < 0) {
+                rc = fd;
+                sock_release(sock);
+                goto out;
+        }
+
+        sock_filp = fget(fd);
+        if (!sock_filp) {
+                rc = -ENOMEM;
+                goto out_fd;
+        }
+
+        set_fs(KERNEL_DS);
+#ifdef HAVE_UNLOCKED_IOCTL
+        if (sock_filp->f_op->unlocked_ioctl)
+                rc = sock_filp->f_op->unlocked_ioctl(sock_filp, cmd, arg);
+        else
+#endif
+             {
+                lock_kernel();
+                rc =sock_filp->f_op->ioctl(sock_filp->f_dentry->d_inode,
+                                           sock_filp, cmd, arg);
+                unlock_kernel();
+             }
+        set_fs(oldmm);
+
+        fput(sock_filp);
+
+ out_fd:
+        sys_close(fd);
+ out:
+        return rc;
+}
+
+/* Query the state of network interface \a name.
+ *
+ * \param name  interface name, NUL terminated, shorter than IFNAMSIZ
+ * \param up    set non-zero iff the interface has IFF_UP
+ * \param ip    interface IPv4 address (host byte order); 0 if down
+ * \param mask  interface IPv4 netmask (host byte order); 0 if down
+ * \return 0 on success, negative errno on failure
+ */
+int
+libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask)
+{
+        struct ifreq   ifr;
+        int            nob;
+        int            rc;
+        __u32          val;
+
+        nob = strnlen(name, IFNAMSIZ);
+        if (nob == IFNAMSIZ) {
+                /* no room for the terminating NUL in ifr_name */
+                CERROR("Interface name %s too long\n", name);
+                rc = -EINVAL;
+                goto out;
+        }
+
+        CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ);
+
+        strcpy(ifr.ifr_name, name);
+        rc = libcfs_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr);
+
+        if (rc != 0) {
+                CERROR("Can't get flags for interface %s\n", name);
+                goto out;
+        }
+
+        if ((ifr.ifr_flags & IFF_UP) == 0) {
+                /* interface down: report it and zero the addresses */
+                CDEBUG(D_NET, "Interface %s down\n", name);
+                *up = 0;
+                *ip = *mask = 0;
+                goto out;
+        }
+
+        *up = 1;
+
+        strcpy(ifr.ifr_name, name);
+        ifr.ifr_addr.sa_family = AF_INET;
+        rc = libcfs_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr);
+
+        if (rc != 0) {
+                CERROR("Can't get IP address for interface %s\n", name);
+                goto out;
+        }
+
+        /* kernel returns addresses in network byte order */
+        val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
+        *ip = ntohl(val);
+
+        strcpy(ifr.ifr_name, name);
+        ifr.ifr_addr.sa_family = AF_INET;
+        rc = libcfs_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr);
+
+        if (rc != 0) {
+                CERROR("Can't get netmask for interface %s\n", name);
+                goto out;
+        }
+
+        val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr;
+        *mask = ntohl(val);
+
+ out:
+        return rc;
+}
+
+EXPORT_SYMBOL(libcfs_ipif_query);
+
+/* Enumerate the IPv4 network interfaces known to the kernel.
+ *
+ * On success *namesp points to an array of IFNAMSIZ-sized interface
+ * names; the caller must release it with
+ * libcfs_ipif_free_enumeration().
+ *
+ * \return number of interfaces found (>= 0) or negative errno
+ */
+int
+libcfs_ipif_enumerate (char ***namesp)
+{
+        /* Allocate and fill in 'names', returning # interfaces/error */
+        char           **names;
+        int             toobig;
+        int             nalloc;
+        int             nfound;
+        struct ifreq   *ifr;
+        struct ifconf   ifc;
+        int             rc;
+        int             nob;
+        int             i;
+
+
+        nalloc = 16;        /* first guess at max interfaces */
+        toobig = 0;
+        for (;;) {
+                /* cap the request buffer at one page; SIOCGIFCONF
+                 * silently truncates, so warn when that happens */
+                if (nalloc * sizeof(*ifr) > CFS_PAGE_SIZE) {
+                        toobig = 1;
+                        nalloc = CFS_PAGE_SIZE/sizeof(*ifr);
+                        CWARN("Too many interfaces: only enumerating first %d\n",
+                              nalloc);
+                }
+
+                LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr));
+                if (ifr == NULL) {
+                        CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc);
+                        rc = -ENOMEM;
+                        goto out0;
+                }
+
+                ifc.ifc_buf = (char *)ifr;
+                ifc.ifc_len = nalloc * sizeof(*ifr);
+
+                rc = libcfs_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc);
+
+                if (rc < 0) {
+                        CERROR ("Error %d enumerating interfaces\n", rc);
+                        goto out1;
+                }
+
+                LASSERT (rc == 0);
+
+                nfound = ifc.ifc_len/sizeof(*ifr);
+                LASSERT (nfound <= nalloc);
+
+                /* a completely full buffer may mean more interfaces
+                 * exist: retry with double the space unless already at
+                 * the page-size cap */
+                if (nfound < nalloc || toobig)
+                        break;
+
+                LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+                nalloc *= 2;
+        }
+
+        if (nfound == 0)
+                goto out1;
+
+        LIBCFS_ALLOC(names, nfound * sizeof(*names));
+        if (names == NULL) {
+                rc = -ENOMEM;
+                goto out1;
+        }
+        /* NULL out all names[i] */
+        memset (names, 0, nfound * sizeof(*names));
+
+        for (i = 0; i < nfound; i++) {
+
+                nob = strnlen (ifr[i].ifr_name, IFNAMSIZ);
+                if (nob == IFNAMSIZ) {
+                        /* no space for terminating NULL */
+                        CERROR("interface name %.*s too long (%d max)\n",
+                               nob, ifr[i].ifr_name, IFNAMSIZ);
+                        rc = -ENAMETOOLONG;
+                        goto out2;
+                }
+
+                LIBCFS_ALLOC(names[i], IFNAMSIZ);
+                if (names[i] == NULL) {
+                        rc = -ENOMEM;
+                        goto out2;
+                }
+
+                memcpy(names[i], ifr[i].ifr_name, nob);
+                names[i][nob] = 0;
+        }
+
+        *namesp = names;
+        rc = nfound;
+
+ out2:
+        /* partial failure: release any names already allocated */
+        if (rc < 0)
+                libcfs_ipif_free_enumeration(names, nfound);
+ out1:
+        LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+ out0:
+        return rc;
+}
+
+EXPORT_SYMBOL(libcfs_ipif_enumerate);
+
+/* Release the name array returned by libcfs_ipif_enumerate().
+ * The array holds up to \a n IFNAMSIZ buffers; a NULL entry marks the
+ * end of the successfully-allocated names. */
+void
+libcfs_ipif_free_enumeration (char **names, int n)
+{
+        int      i;
+
+        LASSERT (n > 0);
+
+        for (i = 0; i < n; i++) {
+                if (names[i] == NULL)
+                        break;
+                LIBCFS_FREE(names[i], IFNAMSIZ);
+        }
+
+        LIBCFS_FREE(names, n * sizeof(*names));
+}
+
+EXPORT_SYMBOL(libcfs_ipif_free_enumeration);
+
+/* Send \a nob bytes from \a buffer on \a sock, looping until all data
+ * has been sent or \a timeout seconds have elapsed.
+ *
+ * A zero timeout means the caller believes the whole message fits in
+ * the socket buffer, so a single non-blocking send is attempted.
+ *
+ * \return 0 on success, -EAGAIN on timeout, negative errno otherwise
+ */
+int
+libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout)
+{
+        int            rc;
+        mm_segment_t   oldmm = get_fs();
+        long           ticks = timeout * HZ;
+        unsigned long  then;
+        struct timeval tv;
+
+        LASSERT (nob > 0);
+        /* Caller may pass a zero timeout if she thinks the socket buffer is
+         * empty enough to take the whole message immediately */
+
+        for (;;) {
+                struct iovec  iov = {
+                        .iov_base = buffer,
+                        .iov_len  = nob
+                };
+                struct msghdr msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &iov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = (timeout == 0) ? MSG_DONTWAIT : 0
+                };
+
+                if (timeout != 0) {
+                        /* Set send timeout to remaining time */
+                        tv = (struct timeval) {
+                                .tv_sec = ticks / HZ,
+                                .tv_usec = ((ticks % HZ) * 1000000) / HZ
+                        };
+                        set_fs(KERNEL_DS);
+                        rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
+                                             (char *)&tv, sizeof(tv));
+                        set_fs(oldmm);
+                        if (rc != 0) {
+                                CERROR("Can't set socket send timeout "
+                                       "%ld.%06d: %d\n",
+                                       (long)tv.tv_sec, (int)tv.tv_usec, rc);
+                                return rc;
+                        }
+                }
+
+                /* charge the time spent blocked in sendmsg against the
+                 * remaining budget */
+                set_fs (KERNEL_DS);
+                then = jiffies;
+                rc = sock_sendmsg (sock, &msg, iov.iov_len);
+                ticks -= jiffies - then;
+                set_fs (oldmm);
+
+                if (rc == nob)
+                        return 0;
+
+                if (rc < 0)
+                        return rc;
+
+                if (rc == 0) {
+                        CERROR ("Unexpected zero rc\n");
+                        return (-ECONNABORTED);
+                }
+
+                if (ticks <= 0)
+                        return -EAGAIN;
+
+                /* partial send: advance past the bytes written */
+                buffer = ((char *)buffer) + rc;
+                nob -= rc;
+        }
+
+        return (0);
+}
+EXPORT_SYMBOL(libcfs_sock_write);
+
+/* Receive exactly \a nob bytes into \a buffer from \a sock, looping
+ * until all data has arrived or \a timeout seconds have elapsed.  A
+ * positive timeout is required (unlike libcfs_sock_write()).
+ *
+ * \return 0 on success, -ETIMEDOUT on timeout, -ECONNRESET if the peer
+ *         closed, negative errno otherwise
+ */
+int
+libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout)
+{
+        int            rc;
+        mm_segment_t   oldmm = get_fs();
+        long           ticks = timeout * HZ;
+        unsigned long  then;
+        struct timeval tv;
+
+        LASSERT (nob > 0);
+        LASSERT (ticks > 0);
+
+        for (;;) {
+                struct iovec  iov = {
+                        .iov_base = buffer,
+                        .iov_len  = nob
+                };
+                struct msghdr msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &iov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = 0
+                };
+
+                /* Set receive timeout to remaining time */
+                tv = (struct timeval) {
+                        .tv_sec = ticks / HZ,
+                        .tv_usec = ((ticks % HZ) * 1000000) / HZ
+                };
+                set_fs(KERNEL_DS);
+                rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
+                                     (char *)&tv, sizeof(tv));
+                set_fs(oldmm);
+                if (rc != 0) {
+                        CERROR("Can't set socket recv timeout %ld.%06d: %d\n",
+                               (long)tv.tv_sec, (int)tv.tv_usec, rc);
+                        return rc;
+                }
+
+                /* charge the time spent blocked in recvmsg against the
+                 * remaining budget */
+                set_fs(KERNEL_DS);
+                then = jiffies;
+                rc = sock_recvmsg(sock, &msg, iov.iov_len, 0);
+                ticks -= jiffies - then;
+                set_fs(oldmm);
+
+                if (rc < 0)
+                        return rc;
+
+                if (rc == 0)
+                        return -ECONNRESET;
+
+                /* partial read: advance past the bytes received */
+                buffer = ((char *)buffer) + rc;
+                nob -= rc;
+
+                if (nob == 0)
+                        return 0;
+
+                if (ticks <= 0)
+                        return -ETIMEDOUT;
+        }
+}
+
+EXPORT_SYMBOL(libcfs_sock_read);
+
+/* Create a TCP socket with SO_REUSEADDR set, optionally bound to
+ * \a local_ip/\a local_port (either may be 0 for "any").
+ *
+ * *fatal is cleared only when bind fails with -EADDRINUSE, telling the
+ * caller a retry with a different local port may succeed; every other
+ * error is fatal.
+ */
+static int
+libcfs_sock_create (struct socket **sockp, int *fatal,
+                    __u32 local_ip, int local_port)
+{
+        struct sockaddr_in  locaddr;
+        struct socket      *sock;
+        int                 rc;
+        int                 option;
+        mm_segment_t        oldmm = get_fs();
+
+        /* All errors are fatal except bind failure if the port is in use */
+        *fatal = 1;
+
+        rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock);
+        *sockp = sock;
+        if (rc != 0) {
+                CERROR ("Can't create socket: %d\n", rc);
+                return (rc);
+        }
+
+        set_fs (KERNEL_DS);
+        option = 1;
+        rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+                             (char *)&option, sizeof (option));
+        set_fs (oldmm);
+        if (rc != 0) {
+                CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc);
+                goto failed;
+        }
+
+        if (local_ip != 0 || local_port != 0) {
+                memset(&locaddr, 0, sizeof(locaddr));
+                locaddr.sin_family = AF_INET;
+                locaddr.sin_port = htons(local_port);
+                locaddr.sin_addr.s_addr = (local_ip == 0) ?
+                                          INADDR_ANY : htonl(local_ip);
+
+                rc = sock->ops->bind(sock, (struct sockaddr *)&locaddr,
+                                     sizeof(locaddr));
+                if (rc == -EADDRINUSE) {
+                        /* non-fatal: caller may retry on another port */
+                        CDEBUG(D_NET, "Port %d already in use\n", local_port);
+                        *fatal = 0;
+                        goto failed;
+                }
+                if (rc != 0) {
+                        CERROR("Error trying to bind to port %d: %d\n",
+                               local_port, rc);
+                        goto failed;
+                }
+        }
+
+        return 0;
+
+ failed:
+        sock_release(sock);
+        return rc;
+}
+
+/* Set the send and/or receive buffer sizes of \a sock.  A zero size
+ * leaves the corresponding buffer at its current value.
+ *
+ * \return 0 on success, negative errno on failure
+ */
+int
+libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize)
+{
+        mm_segment_t        oldmm = get_fs();
+        int                 option;
+        int                 rc;
+
+        if (txbufsize != 0) {
+                option = txbufsize;
+                set_fs (KERNEL_DS);
+                rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
+                                     (char *)&option, sizeof (option));
+                set_fs (oldmm);
+                if (rc != 0) {
+                        CERROR ("Can't set send buffer %d: %d\n",
+                                option, rc);
+                        return (rc);
+                }
+        }
+
+        if (rxbufsize != 0) {
+                option = rxbufsize;
+                set_fs (KERNEL_DS);
+                rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF,
+                                      (char *)&option, sizeof (option));
+                set_fs (oldmm);
+                if (rc != 0) {
+                        CERROR ("Can't set receive buffer %d: %d\n",
+                                option, rc);
+                        return (rc);
+                }
+        }
+
+        return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_setbuf);
+
+/* Return the IPv4 address and port of one end of \a sock, in host byte
+ * order.  \a remote selects the peer end (non-zero) or the local end
+ * (zero).  Either out pointer may be NULL if not wanted.
+ *
+ * \return 0 on success, negative errno on failure
+ */
+int
+libcfs_sock_getaddr (struct socket *sock, int remote, __u32 *ip, int *port)
+{
+        struct sockaddr_in sin;
+        int                len = sizeof (sin);
+        int                rc;
+
+        /* getname's last argument: 2 requests the peer's address,
+         * 0 the local address */
+        rc = sock->ops->getname (sock, (struct sockaddr *)&sin, &len,
+                                 remote ? 2 : 0);
+        if (rc != 0) {
+                CERROR ("Error %d getting sock %s IP/port\n",
+                        rc, remote ? "peer" : "local");
+                return rc;
+        }
+
+        if (ip != NULL)
+                *ip = ntohl (sin.sin_addr.s_addr);
+
+        if (port != NULL)
+                *port = ntohs (sin.sin_port);
+
+        return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_getaddr);
+
+/* Report the current send/receive buffer sizes of \a sock.  Either out
+ * pointer may be NULL if the caller is not interested.  Always
+ * succeeds. */
+int
+libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize)
+{
+        struct sock *sk = sock->sk;
+
+        if (txbufsize != NULL)
+                *txbufsize = sk->sk_sndbuf;
+
+        if (rxbufsize != NULL)
+                *rxbufsize = sk->sk_rcvbuf;
+
+        return 0;
+}
+
+EXPORT_SYMBOL(libcfs_sock_getbuf);
+
+/* Create a listening TCP socket bound to \a local_ip/\a local_port
+ * with the given accept \a backlog.  On success *sockp holds the
+ * listening socket, to be released with libcfs_sock_release().
+ *
+ * \return 0 on success, negative errno on failure
+ */
+int
+libcfs_sock_listen (struct socket **sockp,
+                    __u32 local_ip, int local_port, int backlog)
+{
+        int      fatal;
+        int      rc;
+
+        rc = libcfs_sock_create(sockp, &fatal, local_ip, local_port);
+        if (rc != 0) {
+                /* non-fatal create failure == port in use; log a more
+                 * helpful message for that case */
+                if (!fatal)
+                        CERROR("Can't create socket: port %d already in use\n",
+                               local_port);
+                return rc;
+        }
+
+        rc = (*sockp)->ops->listen(*sockp, backlog);
+        if (rc == 0)
+                return 0;
+
+        CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
+        sock_release(*sockp);
+        return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_listen);
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12)
+/* Compatibility shim: sock_create_lite() only appears in kernels
+ * >= 2.6.12.  Provide a minimal equivalent for older kernels:
+ * allocate a bare socket and record its type. */
+int sock_create_lite(int family, int type, int protocol, struct socket **res)
+{
+        int err = 0;
+        struct socket *sock;
+
+        sock = sock_alloc();
+        if (!sock) {
+                err = -ENOMEM;
+                goto out;
+        }
+        sock->type = type;
+out:
+        /* *res is set even on failure (to NULL from sock_alloc) */
+        *res = sock;
+        return err;
+}
+#endif
+
+/* Accept a connection on listening socket \a sock, blocking
+ * interruptibly until one is ready.  The task state is set and the
+ * wait-queue entry registered BEFORE the first accept attempt, so a
+ * connection arriving between the attempt and schedule() still wakes
+ * us.  Use libcfs_sock_abort_accept() to break out of the wait.
+ *
+ * \return 0 with *newsockp set on success, negative errno on failure
+ */
+int
+libcfs_sock_accept (struct socket **newsockp, struct socket *sock)
+{
+        wait_queue_t   wait;
+        struct socket *newsock;
+        int            rc;
+
+        init_waitqueue_entry(&wait, current);
+
+        /* XXX this should add a ref to sock->ops->owner, if
+         * TCP could be a module */
+        rc = sock_create_lite(PF_PACKET, sock->type, IPPROTO_TCP, &newsock);
+        if (rc) {
+                CERROR("Can't allocate socket\n");
+                return rc;
+        }
+
+        /* borrow the listener's ops for the new socket */
+        newsock->ops = sock->ops;
+
+        set_current_state(TASK_INTERRUPTIBLE);
+        add_wait_queue(sock->sk->sk_sleep, &wait);
+
+        rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
+        if (rc == -EAGAIN) {
+                /* Nothing ready, so wait for activity */
+                schedule();
+                rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
+        }
+
+        remove_wait_queue(sock->sk->sk_sleep, &wait);
+        set_current_state(TASK_RUNNING);
+
+        if (rc != 0)
+                goto failed;
+
+        *newsockp = newsock;
+        return 0;
+
+ failed:
+        sock_release(newsock);
+        return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_accept);
+
+/* Wake every thread blocked in libcfs_sock_accept() on \a sock so it
+ * can notice shutdown and bail out. */
+void
+libcfs_sock_abort_accept (struct socket *sock)
+{
+        wait_queue_head_t *sleepq = sock->sk->sk_sleep;
+
+        wake_up_all(sleepq);
+}
+
+EXPORT_SYMBOL(libcfs_sock_abort_accept);
+
+/* Create a TCP socket bound to \a local_ip/\a local_port and connect
+ * it to \a peer_ip/\a peer_port.  On failure *fatal is cleared for
+ * -EADDRNOTAVAIL so the caller can retry with a different local port.
+ *
+ * \return 0 with *sockp set on success, negative errno on failure
+ */
+int
+libcfs_sock_connect (struct socket **sockp, int *fatal,
+                     __u32 local_ip, int local_port,
+                     __u32 peer_ip, int peer_port)
+{
+        struct sockaddr_in  srvaddr;
+        int                 rc;
+
+        rc = libcfs_sock_create(sockp, fatal, local_ip, local_port);
+        if (rc != 0)
+                return rc;
+
+        memset (&srvaddr, 0, sizeof (srvaddr));
+        srvaddr.sin_family = AF_INET;
+        srvaddr.sin_port = htons(peer_port);
+        srvaddr.sin_addr.s_addr = htonl(peer_ip);
+
+        rc = (*sockp)->ops->connect(*sockp,
+                                    (struct sockaddr *)&srvaddr, sizeof(srvaddr),
+                                    0);
+        if (rc == 0)
+                return 0;
+
+        /* EADDRNOTAVAIL probably means we're already connected to the same
+         * peer/port on the same local port on a differently typed
+         * connection.  Let our caller retry with a different local
+         * port... */
+        *fatal = !(rc == -EADDRNOTAVAIL);
+
+        CDEBUG(*fatal ? D_NETERROR : D_NET,
+               "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc,
+               HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port);
+
+        sock_release(*sockp);
+        return rc;
+}
+
+EXPORT_SYMBOL(libcfs_sock_connect);
+
+/* Release a socket obtained from the libcfs_sock_* constructors.
+ * Thin wrapper so callers need not depend on the kernel API directly. */
+void
+libcfs_sock_release (struct socket *sock)
+{
+        sock_release(sock);
+}
+
+EXPORT_SYMBOL(libcfs_sock_release);
index daba696..1fb38cf 100644 (file)
@@ -1,4 +1,4 @@
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 #define LUSTRE_TRACEFILE_PRIVATE
 
 #include <libcfs/libcfs.h>
 extern union trace_data_union trace_data[NR_CPUS];
 extern char *tracefile;
 extern long long tracefile_size;
-extern struct rw_semaphore tracefile_sem;
 
-inline struct trace_cpu_data *
-__trace_get_tcd(unsigned long *flags) 
+char *trace_console_buffers[NR_CPUS][3];
+
+struct rw_semaphore tracefile_sem;
+
+/* One-time arch setup for tracing: initialise the tracefile lock and
+ * allocate one console-message staging buffer per CPU per context
+ * (IRQ / softirq / process; hence the 3).
+ *
+ * \return 0 on success, -ENOMEM after cleaning up partial allocations
+ */
+int tracefile_init_arch()
+{
+       int    i;
+       int    j;
+
+       init_rwsem(&tracefile_sem);
+
+       for (i = 0; i < NR_CPUS; i++)
+               for (j = 0; j < 3; j++) {
+                       trace_console_buffers[i][j] =
+                               kmalloc(TRACE_CONSOLE_BUFFER_SIZE,
+                                       GFP_KERNEL);
+
+                       if (trace_console_buffers[i][j] == NULL) {
+                               /* free whatever was allocated so far */
+                               tracefile_fini_arch();
+                               printk(KERN_ERR
+                                      "Can't allocate "
+                                      "console message buffer\n");
+                               return -ENOMEM;
+                       }
+               }
+
+       return 0;
+}
+
+/* Tear down what tracefile_init_arch() allocated: free every console
+ * staging buffer and NULL its slot (safe on partially-initialised
+ * state, so init can call this on failure). */
+void tracefile_fini_arch()
+{
+       int    i;
+       int    j;
+
+       for (i = 0; i < NR_CPUS; i++) {
+               for (j = 0; j < 3; j++) {
+                       char *buf = trace_console_buffers[i][j];
+
+                       if (buf == NULL)
+                               continue;
+
+                       kfree(buf);
+                       trace_console_buffers[i][j] = NULL;
+               }
+       }
+}
+
+/* Arch-specific wrappers serialising access to the tracefile
+ * configuration (name and size).  Declared with explicit (void):
+ * an empty () parameter list is an unprototyped declaration in C and
+ * disables argument checking. */
+void tracefile_read_lock(void)
+{
+       down_read(&tracefile_sem);
+}
+
+void tracefile_read_unlock(void)
+{
+       up_read(&tracefile_sem);
+}
+
+void tracefile_write_lock(void)
+{
+       down_write(&tracefile_sem);
+}
+
+void tracefile_write_unlock(void)
+{
+       up_write(&tracefile_sem);
+}
+
+char *
+trace_get_console_buffer(void)
 {
-       struct trace_cpu_data *ret;           
+       int  cpu = get_cpu();
+       int  idx;
 
-       int cpu = get_cpu();                
-       local_irq_save(*flags);               
-       ret = &trace_data[cpu].tcd;     
+       if (in_irq()) {
+               idx = 0;
+       } else if (in_softirq()) {
+               idx = 1;
+       } else {
+               idx = 2;
+       }
 
-       return ret;                             
+       return trace_console_buffers[cpu][idx];
 }
 
-inline void 
-trace_put_tcd (struct trace_cpu_data *tcd, unsigned long flags)
+void
+trace_put_console_buffer(char *buffer)
 {
-       local_irq_restore(flags); 
-       put_cpu();               
+       put_cpu();
+}
+
+/* Return the per-CPU trace data for the current CPU, pinning the
+ * caller to that CPU (get_cpu) until trace_put_tcd() is called.
+ * Returns NULL in interrupt context, where logging is disallowed. */
+struct trace_cpu_data *
+trace_get_tcd(void)
+{
+       int cpu;
+
+       if (in_interrupt()) /* no logging in IRQ context */
+               return NULL;
+
+       cpu = get_cpu();
+       return &trace_data[cpu].tcd;
+}
+
+/* Release the CPU pin taken by trace_get_tcd(). */
+void
+trace_put_tcd (struct trace_cpu_data *tcd)
+{
+       __LASSERT (!in_interrupt());
+       put_cpu();
+}
+
+/* Return non-zero iff trace page \a tage was allocated by the CPU that
+ * \a tcd belongs to.
+ *
+ * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+ * from here: this will lead to infinite recursion.
+ */
+int tcd_owns_tage(struct trace_cpu_data *tcd, struct trace_page *tage)
+{
+       return tage->cpu == tcd->tcd_cpu;
+}
 
 void
-set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask, 
+set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask,
                    const int line, unsigned long stack)
-{ 
-       struct timeval tv; 
-       
-       do_gettimeofday(&tv); 
-       
-       header->ph_subsys = subsys; 
-       header->ph_mask = mask; 
-       header->ph_cpu_id = smp_processor_id(); 
-       header->ph_sec = (__u32)tv.tv_sec; 
-       header->ph_usec = tv.tv_usec; 
-       header->ph_stack = stack; 
-       header->ph_pid = current->pid; 
-       header->ph_line_num = line; 
-#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)) 
+{
+       struct timeval tv;
+
+       do_gettimeofday(&tv);
+
+       header->ph_subsys = subsys;
+       header->ph_mask = mask;
+       header->ph_cpu_id = smp_processor_id();
+       header->ph_sec = (__u32)tv.tv_sec;
+       header->ph_usec = tv.tv_usec;
+       header->ph_stack = stack;
+       header->ph_pid = current->pid;
+       header->ph_line_num = line;
+#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20))
        header->ph_extern_pid = current->thread.extern_pid;
-#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) 
+#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
        header->ph_extern_pid = current->thread.mode.tt.extern_pid;
-#else 
+#else
        header->ph_extern_pid = 0;
 #endif
        return;
 }
 
-void print_to_console(struct ptldebug_header *hdr, int mask, char *buf, 
-                            int len, char *file, const char *fn)
-{ 
-       char *prefix = NULL, *ptype = NULL; 
-       
-       if ((mask & D_EMERG) != 0) { 
-               prefix = "LustreError"; 
-               ptype = KERN_EMERG; 
-       } else if ((mask & D_ERROR) != 0) { 
-               prefix = "LustreError"; 
-               ptype = KERN_ERR; 
-       } else if ((mask & D_WARNING) != 0) { 
-               prefix = "Lustre"; 
-               ptype = KERN_WARNING; 
-       } else if (portal_printk != 0 || (mask & D_CONSOLE)) {
-               prefix = "Lustre"; 
-               ptype = KERN_INFO; 
-       } 
+void print_to_console(struct ptldebug_header *hdr, int mask, const char *buf,
+                            int len, const char *file, const char *fn)
+{
+       char *prefix = "Lustre", *ptype = NULL;
+
+       if ((mask & D_EMERG) != 0) {
+               prefix = "LustreError";
+               ptype = KERN_EMERG;
+       } else if ((mask & D_ERROR) != 0) {
+               prefix = "LustreError";
+               ptype = KERN_ERR;
+       } else if ((mask & D_WARNING) != 0) {
+               prefix = "Lustre";
+               ptype = KERN_WARNING;
+       } else if ((mask & libcfs_printk) != 0 || (mask & D_CONSOLE)) {
+               prefix = "Lustre";
+               ptype = KERN_INFO;
+       }
 
        if ((mask & D_CONSOLE) != 0) {
                printk("%s%s: %.*s", ptype, prefix, len, buf);
        } else {
-               printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid, 
+               printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid,
                       hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf);
        }
        return;
 }
 
-int trace_write_daemon_file(struct file *file, const char *buffer, 
+int trace_write_daemon_file(struct file *file, const char *buffer,
                            unsigned long count, void *data)
-{ 
-       char *name; 
-       unsigned long off; 
-       int rc; 
-       
-       name = kmalloc(count + 1, GFP_KERNEL); 
-       if (name == NULL) 
-               return -ENOMEM; 
-       
-       if (copy_from_user(name, buffer, count)) { 
-               rc = -EFAULT; 
-               goto out; 
-       } 
-       
-       /* be nice and strip out trailing '\n' */ 
-       for (off = count ; off > 2 && isspace(name[off - 1]); off--) 
-               ; 
-       
-       name[off] = '\0'; 
-       
-       down_write(&tracefile_sem); 
-       if (strcmp(name, "stop") == 0) { 
-               tracefile = NULL; 
-               trace_stop_thread(); 
-               goto out_sem; 
-       } else if (strncmp(name, "size=", 5) == 0) { 
-               tracefile_size = simple_strtoul(name + 5, NULL, 0); 
-               if (tracefile_size < 10 || tracefile_size > 20480) 
-                       tracefile_size = TRACEFILE_SIZE; 
-               else 
-                       tracefile_size <<= 20; 
-               goto out_sem; 
-       } 
-       
-       if (name[0] != '/') { 
-               rc = -EINVAL; 
-               goto out_sem; 
-       } 
-       
-       if (tracefile != NULL) 
-               kfree(tracefile); 
-       
-       tracefile = name; 
-       name = NULL; 
-       printk(KERN_INFO "Lustre: debug daemon will attempt to start writing " 
-              "to %s (%lukB max)\n", tracefile, (long)(tracefile_size >> 10)); 
-       
-       trace_start_thread(); 
-out_sem: 
-       up_write(&tracefile_sem); 
-out: 
+{
+       char *name;
+       unsigned long off;
+       int rc;
+
+       name = kmalloc(count + 1, GFP_KERNEL);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user(name, buffer, count)) {
+               rc = -EFAULT;
+               goto out;
+       }
+
+       /* be nice and strip out trailing '\n' */
+       for (off = count ; off > 2 && isspace(name[off - 1]); off--)
+               ;
+
+       name[off] = '\0';
+
+       tracefile_write_lock();
+       if (strcmp(name, "stop") == 0) {
+               tracefile = NULL;
+               trace_stop_thread();
+               goto out_sem;
+       } else if (strncmp(name, "size=", 5) == 0) {
+               tracefile_size = simple_strtoul(name + 5, NULL, 0);
+               if (tracefile_size < 10 || tracefile_size > 20480)
+                       tracefile_size = TRACEFILE_SIZE;
+               else
+                       tracefile_size <<= 20;
+               goto out_sem;
+       }
+
+       if (name[0] != '/') {
+               rc = -EINVAL;
+               goto out_sem;
+       }
+
+       if (tracefile != NULL)
+               kfree(tracefile);
+
+       tracefile = name;
+       name = NULL;
+       printk(KERN_INFO "Lustre: debug daemon will attempt to start writing "
+              "to %s (%lukB max)\n", tracefile, (long)(tracefile_size >> 10));
+
+       trace_start_thread();
+out_sem:
+       tracefile_write_unlock();
+out:
        kfree(name);
        return count;
 }
 
-int trace_read_daemon_file(char *page, char **start, off_t off, int count, 
+int trace_read_daemon_file(char *page, char **start, off_t off, int count,
                           int *eof, void *data)
-{ 
-       int rc; 
-       
-       down_read(&tracefile_sem); 
-       rc = snprintf(page, count, "%s", tracefile); 
-       up_read(&tracefile_sem); 
+{
+       int rc;
+
+       tracefile_read_lock();
+       rc = snprintf(page, count, "%s", tracefile);
+       tracefile_read_unlock();
 
        return rc;
 }
 
-int trace_write_debug_mb(struct file *file, const char *buffer, 
+int trace_write_debug_mb(struct file *file, const char *buffer,
                         unsigned long count, void *data)
-{ 
-       char string[32]; 
-       int i; 
-       unsigned max; 
-       
-       if (count >= sizeof(string)) { 
-               printk(KERN_ERR "Lustre: value too large (length %lu bytes)\n", 
-                      count); 
-               return -EOVERFLOW; 
-       } 
-       
-       if (copy_from_user(string, buffer, count)) 
-               return -EFAULT; 
-       
-       max = simple_strtoul(string, NULL, 0); 
-       if (max == 0) 
+{
+       char string[32];
+       int i;
+       unsigned max;
+
+       if (count >= sizeof(string)) {
+               printk(KERN_ERR "Lustre: value too large (length %lu bytes)\n",
+                      count);
+               return -EOVERFLOW;
+       }
+
+       if (copy_from_user(string, buffer, count))
+               return -EFAULT;
+
+       max = simple_strtoul(string, NULL, 0);
+       if (max == 0)
                return -EINVAL;
 
-       if (max > (num_physpages >> (20 - 2 - PAGE_SHIFT)) / 5 || max >= 512) { 
-               printk(KERN_ERR "Lustre: Refusing to set debug buffer size to " 
-                      "%dMB, which is more than 80%% of available RAM (%lu)\n", 
-                      max, (num_physpages >> (20 - 2 - PAGE_SHIFT)) / 5); 
-               return -EINVAL; 
-       } 
-
-       max /= smp_num_cpus; 
-       
-       for (i = 0; i < NR_CPUS; i++) { 
-               struct trace_cpu_data *tcd; 
-               tcd = &trace_data[i].tcd; 
-               tcd->tcd_max_pages = max << (20 - PAGE_SHIFT); 
-       } 
+       if (max > (num_physpages >> (20 - 2 - CFS_PAGE_SHIFT)) / 5 || max >= 512) {
+               printk(KERN_ERR "Lustre: Refusing to set debug buffer size to "
+                      "%dMB, which is more than 80%% of available RAM (%lu)\n",
+                      max, (num_physpages >> (20 - 2 - CFS_PAGE_SHIFT)) / 5);
+               return -EINVAL;
+       }
+
+       max /= smp_num_cpus;
+
+       for (i = 0; i < NR_CPUS; i++) {
+               struct trace_cpu_data *tcd;
+               tcd = &trace_data[i].tcd;
+               tcd->tcd_max_pages = max << (20 - CFS_PAGE_SHIFT);
+       }
        return count;
 }
 
 int trace_read_debug_mb(char *page, char **start, off_t off, int count,
                                        int *eof, void *data)
-{ 
-       struct trace_cpu_data *tcd; 
-       unsigned long flags; 
+{
+       struct trace_cpu_data *tcd;
        int rc;
-                                       
-       tcd = trace_get_tcd(flags); 
-       rc = snprintf(page, count, "%lu\n", 
-                     (tcd->tcd_max_pages >> (20 - PAGE_SHIFT)) * smp_num_cpus); 
-       trace_put_tcd(tcd, flags); 
+
+       tcd = trace_get_tcd();
+       __LASSERT (tcd != NULL);
+
+       rc = snprintf(page, count, "%lu\n",
+                     (tcd->tcd_max_pages >> (20 - CFS_PAGE_SHIFT)) * smp_num_cpus);
+
+       trace_put_tcd(tcd);
        return rc;
 }
 
+/* Run \a fn(arg) once on every online CPU by temporarily migrating the
+ * calling thread to each CPU in turn; the caller's original CPU
+ * affinity is restored after each invocation.  Must be called from
+ * process context (uses set_cpus_allowed). */
+void
+trace_call_on_all_cpus(void (*fn)(void *arg), void *arg)
+{
+        cpumask_t cpus_allowed = current->cpus_allowed;
+       /* use cpus_allowed to quiet 2.4 UP kernel warning only */
+        cpumask_t m = cpus_allowed;
+        int       cpu;
+
+       /* Run the given routine on every CPU in thread context */
+        for (cpu = 0; cpu < NR_CPUS; cpu++) {
+                if (!cpu_online(cpu))
+                       continue;
+
+               /* restrict ourselves to just this CPU, run, then restore */
+               cpus_clear(m);
+               cpu_set(cpu, m);
+               set_cpus_allowed(current, m);
+
+               fn(arg);
+
+               set_cpus_allowed(current, cpus_allowed);
+        }
+}
index 67ecb0c..60f7cb8 100644 (file)
@@ -24,8 +24,8 @@
 /*
  * miscellaneous libcfs stuff
  */
-#define DEBUG_SUBSYSTEM S_PORTALS
-#include <portals/types.h>
+#define DEBUG_SUBSYSTEM S_LNET
+#include <lnet/types.h>
 
 /*
  * Convert server error code to client format. Error codes are from
@@ -35,13 +35,26 @@ int convert_server_error(__u64 ecode)
 {
        return ecode;
 }
+EXPORT_SYMBOL(convert_server_error);
 
 /*
  * convert <fcntl.h> flag from client to server.
  */
-int convert_client_oflag(int cflag)
+int convert_client_oflag(int cflag, int *result)
 {
-       return cflag;
+        *result = cflag;
+       return 0;
 }
+EXPORT_SYMBOL(convert_client_oflag);
 
+void cfs_stack_trace_fill(struct cfs_stack_trace *trace)
+{}
+
+EXPORT_SYMBOL(cfs_stack_trace_fill);
+
+void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no)
+{
+        return NULL;
+}
+EXPORT_SYMBOL(cfs_stack_trace_frame);
 
index b4ae10f..3ed5d45 100644 (file)
@@ -24,7 +24,9 @@
 # define EXPORT_SYMTAB
 #endif
 
+#ifdef HAVE_KERNEL_CONFIG_H
 #include <linux/config.h>
+#endif
 #include <linux/module.h>
 #include <linux/kmod.h>
 #include <linux/kernel.h>
@@ -39,7 +41,7 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 
 #include <libcfs/kp30.h>
 
@@ -118,7 +120,7 @@ lwt_control (int enable, int clear)
                         continue;
 
                 for (j = 0; j < lwt_pages_per_cpu; j++) {
-                        memset (p->lwtp_events, 0, PAGE_SIZE);
+                        memset (p->lwtp_events, 0, CFS_PAGE_SIZE);
 
                         p = list_entry (p->lwtp_list.next,
                                         lwt_page_t, lwtp_list);
@@ -138,7 +140,7 @@ int
 lwt_snapshot (cycles_t *now, int *ncpu, int *total_size, 
               void *user_ptr, int user_size)
 {
-        const int    events_per_page = PAGE_SIZE / sizeof(lwt_event_t);
+        const int    events_per_page = CFS_PAGE_SIZE / sizeof(lwt_event_t);
         const int    bytes_per_page = events_per_page * sizeof(lwt_event_t);
         lwt_page_t  *p;
         int          i;
@@ -189,7 +191,7 @@ lwt_init ()
 
        /* NULL pointers, zero scalars */
        memset (lwt_cpus, 0, sizeof (lwt_cpus));
-        lwt_pages_per_cpu = LWT_MEMORY / (num_online_cpus() * PAGE_SIZE);
+        lwt_pages_per_cpu = LWT_MEMORY / (num_online_cpus() * CFS_PAGE_SIZE);
 
        for (i = 0; i < num_online_cpus(); i++)
                for (j = 0; j < lwt_pages_per_cpu; j++) {
@@ -202,7 +204,7 @@ lwt_init ()
                                return (-ENOMEM);
                        }
 
-                        PORTAL_ALLOC(lwtp, sizeof (*lwtp));
+                        LIBCFS_ALLOC(lwtp, sizeof (*lwtp));
                        if (lwtp == NULL) {
                                CERROR ("Can't allocate lwtp\n");
                                 __free_page(page);
@@ -212,7 +214,7 @@ lwt_init ()
 
                         lwtp->lwtp_page = page;
                         lwtp->lwtp_events = page_address(page);
-                       memset (lwtp->lwtp_events, 0, PAGE_SIZE);
+                       memset (lwtp->lwtp_events, 0, CFS_PAGE_SIZE);
 
                        if (j == 0) {
                                INIT_LIST_HEAD (&lwtp->lwtp_list);
@@ -253,7 +255,7 @@ lwt_fini ()
                         }
                         
                         __free_page (lwtp->lwtp_page);
-                        PORTAL_FREE (lwtp, sizeof (*lwtp));
+                        LIBCFS_FREE (lwtp, sizeof (*lwtp));
                 }
 }
 
diff --git a/lnet/libcfs/misc.c b/lnet/libcfs/misc.c
new file mode 100644 (file)
index 0000000..0ace40d
--- /dev/null
@@ -0,0 +1,53 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc.
+ *   Author: Nikita Danilov <nikita@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <libcfs/libcfs.h>
+
+/*
+ * On-wire format is native kdev_t format of Linux kernel 2.6
+ */
+enum {
+       WIRE_RDEV_MINORBITS = 20,
+       WIRE_RDEV_MINORMASK = ((1U << WIRE_RDEV_MINORBITS) - 1)
+};
+
+cfs_wire_rdev_t cfs_wire_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor)
+{
+        return (major << WIRE_RDEV_MINORBITS) | minor;
+}
+
+cfs_major_nr_t  cfs_wire_rdev_major(cfs_wire_rdev_t rdev)
+{
+        return rdev >> WIRE_RDEV_MINORBITS;
+}
+
+cfs_minor_nr_t  cfs_wire_rdev_minor(cfs_wire_rdev_t rdev)
+{
+        return rdev & WIRE_RDEV_MINORMASK;
+}
+
index 7da61f4..5e273cb 100644 (file)
 #ifndef EXPORT_SYMTAB
 # define EXPORT_SYMTAB
 #endif
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 
-#include <portals/lib-p30.h>
-#include <portals/p30.h>
+#include <lnet/lib-lnet.h>
+#include <lnet/lnet.h>
 #include <libcfs/kp30.h>
-
-struct nal_cmd_handler {
-        int                  nch_number;
-        nal_cmd_handler_fn  *nch_handler;
-        void                *nch_private;
-};
-
-static struct nal_cmd_handler nal_cmd[16];
-struct semaphore nal_cmd_mutex;
-
-#ifdef PORTAL_DEBUG
-void kportal_assertion_failed(char *expr, char *file, const char *func,
-                              const int line)
-{
-        portals_debug_msg(0, D_EMERG, file, func, line, CDEBUG_STACK,
-                          "ASSERTION(%s) failed\n", expr);
-        LBUG_WITH_LOC(file, func, line);
-}
-#endif
+#include "tracefile.h"
 
 void
-kportal_memhog_free (struct portals_device_userstate *pdu)
+kportal_memhog_free (struct libcfs_device_userstate *ldu)
 {
-        cfs_page_t **level0p = &pdu->pdu_memhog_root_page;
+        cfs_page_t **level0p = &ldu->ldu_memhog_root_page;
         cfs_page_t **level1p;
         cfs_page_t **level2p;
         int           count1;
@@ -71,28 +53,28 @@ kportal_memhog_free (struct portals_device_userstate *pdu)
                                *level2p != NULL) {
 
                                 cfs_free_page(*level2p);
-                                pdu->pdu_memhog_pages--;
+                                ldu->ldu_memhog_pages--;
                                 level2p++;
                                 count2++;
                         }
 
                         cfs_free_page(*level1p);
-                        pdu->pdu_memhog_pages--;
+                        ldu->ldu_memhog_pages--;
                         level1p++;
                         count1++;
                 }
 
                 cfs_free_page(*level0p);
-                pdu->pdu_memhog_pages--;
+                ldu->ldu_memhog_pages--;
 
                 *level0p = NULL;
         }
 
-        LASSERT (pdu->pdu_memhog_pages == 0);
+        LASSERT (ldu->ldu_memhog_pages == 0);
 }
 
 int
-kportal_memhog_alloc (struct portals_device_userstate *pdu, int npages, int flags)
+kportal_memhog_alloc (struct libcfs_device_userstate *ldu, int npages, int flags)
 {
         cfs_page_t **level0p;
         cfs_page_t **level1p;
@@ -100,8 +82,8 @@ kportal_memhog_alloc (struct portals_device_userstate *pdu, int npages, int flag
         int           count1;
         int           count2;
 
-        LASSERT (pdu->pdu_memhog_pages == 0);
-        LASSERT (pdu->pdu_memhog_root_page == NULL);
+        LASSERT (ldu->ldu_memhog_pages == 0);
+        LASSERT (ldu->ldu_memhog_root_page == NULL);
 
         if (npages < 0)
                 return -EINVAL;
@@ -109,41 +91,41 @@ kportal_memhog_alloc (struct portals_device_userstate *pdu, int npages, int flag
         if (npages == 0)
                 return 0;
 
-        level0p = &pdu->pdu_memhog_root_page;
+        level0p = &ldu->ldu_memhog_root_page;
         *level0p = cfs_alloc_page(flags);
         if (*level0p == NULL)
                 return -ENOMEM;
-        pdu->pdu_memhog_pages++;
+        ldu->ldu_memhog_pages++;
 
         level1p = (cfs_page_t **)cfs_page_address(*level0p);
         count1 = 0;
         memset(level1p, 0, CFS_PAGE_SIZE);
 
-        while (pdu->pdu_memhog_pages < npages &&
+        while (ldu->ldu_memhog_pages < npages &&
                count1 < CFS_PAGE_SIZE/sizeof(cfs_page_t *)) {
 
-                if (cfs_signal_pending(cfs_current()))
+                if (cfs_signal_pending())
                         return (-EINTR);
 
                 *level1p = cfs_alloc_page(flags);
                 if (*level1p == NULL)
                         return -ENOMEM;
-                pdu->pdu_memhog_pages++;
+                ldu->ldu_memhog_pages++;
 
                 level2p = (cfs_page_t **)cfs_page_address(*level1p);
                 count2 = 0;
                 memset(level2p, 0, CFS_PAGE_SIZE);
 
-                while (pdu->pdu_memhog_pages < npages &&
+                while (ldu->ldu_memhog_pages < npages &&
                        count2 < CFS_PAGE_SIZE/sizeof(cfs_page_t *)) {
 
-                        if (cfs_signal_pending(cfs_current()))
+                        if (cfs_signal_pending())
                                 return (-EINTR);
 
                         *level2p = cfs_alloc_page(flags);
                         if (*level2p == NULL)
                                 return (-ENOMEM);
-                        pdu->pdu_memhog_pages++;
+                        ldu->ldu_memhog_pages++;
 
                         level2p++;
                         count2++;
@@ -159,17 +141,17 @@ kportal_memhog_alloc (struct portals_device_userstate *pdu, int npages, int flag
 /* called when opening /dev/device */
 static int libcfs_psdev_open(unsigned long flags, void *args)
 {
-        struct portals_device_userstate *pdu;
+        struct libcfs_device_userstate *ldu;
         ENTRY;
 
         PORTAL_MODULE_USE;
 
-        PORTAL_ALLOC(pdu, sizeof(*pdu));
-        if (pdu != NULL) {
-                pdu->pdu_memhog_pages = 0;
-                pdu->pdu_memhog_root_page = NULL;
+        LIBCFS_ALLOC(ldu, sizeof(*ldu));
+        if (ldu != NULL) {
+                ldu->ldu_memhog_pages = 0;
+                ldu->ldu_memhog_root_page = NULL;
         }
-        *(struct portals_device_userstate **)args = pdu;
+        *(struct libcfs_device_userstate **)args = ldu;
 
         RETURN(0);
 }
@@ -177,157 +159,49 @@ static int libcfs_psdev_open(unsigned long flags, void *args)
 /* called when closing /dev/device */
 static int libcfs_psdev_release(unsigned long flags, void *args)
 {
-        struct portals_device_userstate *pdu;
+        struct libcfs_device_userstate *ldu;
         ENTRY;
 
-        pdu = (struct portals_device_userstate *)args;
-        if (pdu != NULL) {
-                kportal_memhog_free(pdu);
-                PORTAL_FREE(pdu, sizeof(*pdu));
+        ldu = (struct libcfs_device_userstate *)args;
+        if (ldu != NULL) {
+                kportal_memhog_free(ldu);
+                LIBCFS_FREE(ldu, sizeof(*ldu));
         }
 
         PORTAL_MODULE_UNUSE;
         RETURN(0);
 }
 
-static inline void freedata(void *data, int len)
-{
-        PORTAL_FREE(data, len);
-}
-
-struct nal_cmd_handler *
-libcfs_find_nal_cmd_handler(int nal)
-{
-        int    i;
-
-        for (i = 0; i < sizeof(nal_cmd)/sizeof(nal_cmd[0]); i++)
-                if (nal_cmd[i].nch_handler != NULL &&
-                    nal_cmd[i].nch_number == nal)
-                        return (&nal_cmd[i]);
-
-        return (NULL);
-}
-
-int
-libcfs_nal_cmd_register(int nal, nal_cmd_handler_fn *handler, void *private)
-{
-        struct nal_cmd_handler *cmd;
-        int                     i;
-        int                     rc;
-
-        CDEBUG(D_IOCTL, "Register NAL %x, handler: %p\n", nal, handler);
-
-        mutex_down(&nal_cmd_mutex);
-
-        if (libcfs_find_nal_cmd_handler(nal) != NULL) {
-                mutex_up (&nal_cmd_mutex);
-                return (-EBUSY);
-        }
-
-        cmd = NULL;
-        for (i = 0; i < sizeof(nal_cmd)/sizeof(nal_cmd[0]); i++)
-                if (nal_cmd[i].nch_handler == NULL) {
-                        cmd = &nal_cmd[i];
-                        break;
-                }
-
-        if (cmd == NULL) {
-                rc = -EBUSY;
-        } else {
-                rc = 0;
-                cmd->nch_number = nal;
-                cmd->nch_handler = handler;
-                cmd->nch_private = private;
-        }
-
-        mutex_up(&nal_cmd_mutex);
-
-        return rc;
-}
-EXPORT_SYMBOL(libcfs_nal_cmd_register);
-
-void
-libcfs_nal_cmd_unregister(int nal)
-{
-        struct nal_cmd_handler *cmd;
-
-        CDEBUG(D_IOCTL, "Unregister NAL %x\n", nal);
-
-        mutex_down(&nal_cmd_mutex);
-        cmd = libcfs_find_nal_cmd_handler(nal);
-        LASSERT (cmd != NULL);
-        cmd->nch_handler = NULL;
-        cmd->nch_private = NULL;
-        mutex_up(&nal_cmd_mutex);
-}
-EXPORT_SYMBOL(libcfs_nal_cmd_unregister);
-
-int
-libcfs_nal_cmd(struct portals_cfg *pcfg)
-{
-#if CRAY_PORTALS
-        /* pretend success */
-        RETURN(0);
-#else
-        struct nal_cmd_handler *cmd;
-        __u32 nal = pcfg->pcfg_nal;
-        int   rc = -EINVAL;
-        ENTRY;
-
-        if (pcfg->pcfg_version != PORTALS_CFG_VERSION) {
-                RETURN(-EINVAL);
-        }
-
-        mutex_down(&nal_cmd_mutex);
-        cmd = libcfs_find_nal_cmd_handler(nal);
-        if (cmd != NULL) {
-                CDEBUG(D_IOCTL, "calling handler nal: %x, cmd: %d\n", nal,
-                       pcfg->pcfg_command);
-                rc = cmd->nch_handler(pcfg, cmd->nch_private);
-        } else {
-                CERROR("invalid nal: %x, cmd: %d\n", nal, pcfg->pcfg_command);
-        }
-        mutex_up(&nal_cmd_mutex);
-
-        RETURN(rc);
-#endif
-}
-EXPORT_SYMBOL(libcfs_nal_cmd);
-
 static struct rw_semaphore ioctl_list_sem;
 static struct list_head ioctl_list;
 
 int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand)
 {
         int rc = 0;
-        down_read(&ioctl_list_sem);
+
+        down_write(&ioctl_list_sem);
         if (!list_empty(&hand->item))
                 rc = -EBUSY;
-        up_read(&ioctl_list_sem);
-
-        if (rc == 0) {
-                down_write(&ioctl_list_sem);
+        else
                 list_add_tail(&hand->item, &ioctl_list);
-                up_write(&ioctl_list_sem);
-        }
-        RETURN(0);
+        up_write(&ioctl_list_sem);
+
+        return rc;
 }
 EXPORT_SYMBOL(libcfs_register_ioctl);
 
 int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand)
 {
         int rc = 0;
-        down_read(&ioctl_list_sem);
+
+        down_write(&ioctl_list_sem);
         if (list_empty(&hand->item))
                 rc = -ENOENT;
-        up_read(&ioctl_list_sem);
-
-        if (rc == 0) {
-                down_write(&ioctl_list_sem);
+        else
                 list_del_init(&hand->item);
-                up_write(&ioctl_list_sem);
-        }
-        RETURN(0);
+        up_write(&ioctl_list_sem);
+
+        return rc;
 }
 EXPORT_SYMBOL(libcfs_deregister_ioctl);
 
@@ -335,112 +209,67 @@ static int libcfs_ioctl(struct cfs_psdev_file *pfile, unsigned long cmd, void *a
 {
         char    buf[1024];
         int err = -EINVAL;
-        struct portal_ioctl_data *data;
+        struct libcfs_ioctl_data *data;
         ENTRY;
 
         /* 'cmd' and permissions get checked in our arch-specific caller */
 
-        if (portal_ioctl_getdata(buf, buf + 800, (void *)arg)) {
+        if (libcfs_ioctl_getdata(buf, buf + 800, (void *)arg)) {
                 CERROR("PORTALS ioctl: data error\n");
                 RETURN(-EINVAL);
         }
-        data = (struct portal_ioctl_data *)buf;
+        data = (struct libcfs_ioctl_data *)buf;
 
         switch (cmd) {
-        case IOC_PORTAL_CLEAR_DEBUG:
-                portals_debug_clear_buffer();
+        case IOC_LIBCFS_CLEAR_DEBUG:
+                libcfs_debug_clear_buffer();
                 RETURN(0);
         /*
-         * case IOC_PORTAL_PANIC:
+         * case IOC_LIBCFS_PANIC:
          * Handled in arch/cfs_module.c
          */
-        case IOC_PORTAL_MARK_DEBUG:
-                if (data->ioc_inlbuf1 == NULL ||
-                    data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0')
-                        RETURN(-EINVAL);
-                portals_debug_mark_buffer(data->ioc_inlbuf1);
-                RETURN(0);
-        case IOC_PORTAL_DMSG:
+        case IOC_LIBCFS_MARK_DEBUG:
                 if (data->ioc_inlbuf1 == NULL ||
                     data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0')
                         RETURN(-EINVAL);
-                printk("%s", data->ioc_inlbuf1);
+                libcfs_debug_mark_buffer(data->ioc_inlbuf1);
                 RETURN(0);
 #if LWT_SUPPORT
-        case IOC_PORTAL_LWT_CONTROL:
-                err = lwt_control (data->ioc_flags, data->ioc_misc);
+        case IOC_LIBCFS_LWT_CONTROL:
+                err = lwt_control ((data->ioc_flags & 1) != 0, 
+                                   (data->ioc_flags & 2) != 0);
                 break;
 
-        case IOC_PORTAL_LWT_SNAPSHOT: {
+        case IOC_LIBCFS_LWT_SNAPSHOT: {
                 cycles_t   now;
                 int        ncpu;
                 int        total_size;
 
                 err = lwt_snapshot (&now, &ncpu, &total_size,
                                     data->ioc_pbuf1, data->ioc_plen1);
-                data->ioc_nid = now;
-                data->ioc_count = ncpu;
-                data->ioc_misc = total_size;
+                data->ioc_u64[0] = now;
+                data->ioc_u32[0] = ncpu;
+                data->ioc_u32[1] = total_size;
 
                 /* Hedge against broken user/kernel typedefs (e.g. cycles_t) */
-                data->ioc_nid2 = sizeof(lwt_event_t);
-                data->ioc_nid3 = offsetof(lwt_event_t, lwte_where);
+                data->ioc_u32[2] = sizeof(lwt_event_t);
+                data->ioc_u32[3] = offsetof(lwt_event_t, lwte_where);
 
                 if (err == 0 &&
-                    copy_to_user((char *)arg, data, sizeof (*data)))
+                    libcfs_ioctl_popdata(arg, data, sizeof (*data)))
                         err = -EFAULT;
                 break;
         }
 
-        case IOC_PORTAL_LWT_LOOKUP_STRING:
+        case IOC_LIBCFS_LWT_LOOKUP_STRING:
                 err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1,
                                          data->ioc_pbuf2, data->ioc_plen2);
                 if (err == 0 &&
-                    copy_to_user((char *)arg, data, sizeof (*data)))
+                    libcfs_ioctl_popdata(arg, data, sizeof (*data)))
                         err = -EFAULT;
                 break;
 #endif
-        case IOC_PORTAL_NAL_CMD: {
-                struct portals_cfg pcfg;
-
-                if (data->ioc_plen1 != sizeof(pcfg)) {
-                        CERROR("Bad ioc_plen1 %d (wanted "LPSZ")\n",
-                               data->ioc_plen1, sizeof(pcfg));
-                        err = -EINVAL;
-                        break;
-                }
-
-                if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1,
-                                   sizeof(pcfg))) {
-                        err = -EFAULT;
-                        break;
-                }
-
-                CDEBUG (D_IOCTL, "nal command nal %x cmd %d\n", pcfg.pcfg_nal,
-                        pcfg.pcfg_command);
-                if (pcfg.pcfg_version != PORTALS_CFG_VERSION) {
-                        /* set this so userspace can tell when they
-                         * have an incompatible version and print a
-                         * decent message to the user
-                         */
-                        pcfg.pcfg_version = PORTALS_CFG_VERSION;
-                        if (copy_to_user((char *)data->ioc_pbuf1, &pcfg,
-                                         sizeof (pcfg)))
-                                err = -EFAULT;
-                        else
-                                err = -EINVAL;
-                } else {
-                        err = libcfs_nal_cmd(&pcfg);
-
-                        if (err == 0 &&
-                            copy_to_user((char *)data->ioc_pbuf1, &pcfg,
-                                         sizeof (pcfg)))
-                                err = -EFAULT;
-                }
-                break;
-        }
-
-        case IOC_PORTAL_MEMHOG:
+        case IOC_LIBCFS_MEMHOG:
                 if (pfile->private_data == NULL) {
                         err = -EINVAL;
                 } else {
@@ -454,17 +283,39 @@ static int libcfs_ioctl(struct cfs_psdev_file *pfile, unsigned long cmd, void *a
                 }
                 break;
 
+        case IOC_LIBCFS_PING_TEST: {
+                extern void (kping_client)(struct libcfs_ioctl_data *);
+                void (*ping)(struct libcfs_ioctl_data *);
+
+                CDEBUG(D_IOCTL, "doing %d pings to nid %s (%s)\n",
+                       data->ioc_count, libcfs_nid2str(data->ioc_nid),
+                       libcfs_nid2str(data->ioc_nid));
+                ping = PORTAL_SYMBOL_GET(kping_client);
+                if (!ping)
+                        CERROR("PORTAL_SYMBOL_GET failed\n");
+                else {
+                        ping(data);
+                        PORTAL_SYMBOL_PUT(kping_client);
+                }
+                RETURN(0);
+        }
+
         default: {
                 struct libcfs_ioctl_handler *hand;
                 err = -EINVAL;
                 down_read(&ioctl_list_sem);
                 list_for_each_entry(hand, &ioctl_list, item) {
-                        err = hand->handle_ioctl(data, cmd, (unsigned long)arg);
-                        if (err != -EINVAL)
+                        err = hand->handle_ioctl(cmd, data);
+                        if (err != -EINVAL) {
+                                if (err == 0)
+                                        err = libcfs_ioctl_popdata(arg, 
+                                                        data, sizeof (*data));
                                 break;
+                        }
                 }
                 up_read(&ioctl_list_sem);
-                } break;
+                break;
+        }
         }
 
         RETURN(err);
@@ -488,6 +339,7 @@ extern cfs_psdev_t libcfs_dev;
 extern struct rw_semaphore tracefile_sem;
 extern struct semaphore trace_thread_sem;
 
+extern void libcfs_init_nidstrings(void);
 extern int libcfs_arch_init(void);
 extern void libcfs_arch_cleanup(void);
 
@@ -496,15 +348,15 @@ static int init_libcfs_module(void)
         int rc;
 
         libcfs_arch_init();
+        libcfs_init_nidstrings();
         init_rwsem(&tracefile_sem);
         init_mutex(&trace_thread_sem);
-        init_mutex(&nal_cmd_mutex);
         init_rwsem(&ioctl_list_sem);
         CFS_INIT_LIST_HEAD(&ioctl_list);
 
-        rc = portals_debug_init(5 * 1024 * 1024);
+        rc = libcfs_debug_init(5 * 1024 * 1024);
         if (rc < 0) {
-                printk(KERN_ERR "LustreError: portals_debug_init: %d\n", rc);
+                printk(KERN_ERR "LustreError: libcfs_debug_init: %d\n", rc);
                 return (rc);
         }
 
@@ -537,7 +389,7 @@ static int init_libcfs_module(void)
         lwt_fini();
  cleanup_debug:
 #endif
-        portals_debug_cleanup();
+        libcfs_debug_cleanup();
         return rc;
 }
 
@@ -548,7 +400,7 @@ static void exit_libcfs_module(void)
         remove_proc();
 
         CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n",
-               atomic_read(&portal_kmemory));
+               atomic_read(&libcfs_kmemory));
 
         rc = cfs_psdev_deregister(&libcfs_dev);
         if (rc)
@@ -558,16 +410,14 @@ static void exit_libcfs_module(void)
         lwt_fini();
 #endif
 
-        if (atomic_read(&portal_kmemory) != 0)
+        if (atomic_read(&libcfs_kmemory) != 0)
                 CERROR("Portals memory leaked: %d bytes\n",
-                       atomic_read(&portal_kmemory));
+                       atomic_read(&libcfs_kmemory));
 
-        rc = portals_debug_cleanup();
+        rc = libcfs_debug_cleanup();
         if (rc)
-                printk(KERN_ERR "LustreError: portals_debug_cleanup: %d\n", rc);
+                printk(KERN_ERR "LustreError: libcfs_debug_cleanup: %d\n", rc);
         libcfs_arch_cleanup();
 }
 
-EXPORT_SYMBOL(kportal_assertion_failed);
-
 cfs_module(libcfs, "1.0.0", init_libcfs_module, exit_libcfs_module);
diff --git a/lnet/libcfs/nidstrings.c b/lnet/libcfs/nidstrings.c
new file mode 100644 (file)
index 0000000..78a255d
--- /dev/null
@@ -0,0 +1,533 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Phil Schwan <phil@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <lnet/lnet.h>
+#include <libcfs/kp30.h>
+#ifndef __KERNEL__
+#ifdef HAVE_GETHOSTBYNAME
+# include <netdb.h>
+#endif
+#endif
+
+/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids
+ * consistent in all conversion functions.  Some code fragments are copied
+ * around for the sake of clarity...
+ */
+
+/* CAVEAT EMPTOR! Racy temporary buffer allocation!
+ * Choose the number of nidstrings to support the MAXIMUM expected number of
+ * concurrent users.  If there are more, the returned string will be volatile.
+ * NB this number must allow for a process to be descheduled for a timeslice
+ * between getting its string and using it.
+ */
+
+#define LNET_NIDSTR_COUNT  128     /* # of nidstrings */
+#define LNET_NIDSTR_SIZE   32      /* size of each one (see below for usage) */
+
+static char      libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE];
+static int       libcfs_nidstring_idx = 0;
+
+#ifdef __KERNEL__
+static spinlock_t libcfs_nidstring_lock;
+
+void libcfs_init_nidstrings (void)
+{
+        spin_lock_init(&libcfs_nidstring_lock);
+}
+
+# define NIDSTR_LOCK(f)   spin_lock_irqsave(&libcfs_nidstring_lock, f)
+# define NIDSTR_UNLOCK(f) spin_unlock_irqrestore(&libcfs_nidstring_lock, f)
+#else
+# define NIDSTR_LOCK(f)   (f=0)                 /* avoid unused var warnings */
+# define NIDSTR_UNLOCK(f) (f=0)
+#endif
+
+static char *
+libcfs_next_nidstring (void)
+{
+        char          *str;
+        unsigned long  flags;
+
+        NIDSTR_LOCK(flags);
+
+        str = libcfs_nidstrings[libcfs_nidstring_idx++];
+        if (libcfs_nidstring_idx ==
+            sizeof(libcfs_nidstrings)/sizeof(libcfs_nidstrings[0]))
+                libcfs_nidstring_idx = 0;
+
+        NIDSTR_UNLOCK(flags);
+        return str;
+}
+
+static int  libcfs_lo_str2addr(char *str, int nob, __u32 *addr);
+static void libcfs_ip_addr2str(__u32 addr, char *str);
+static int  libcfs_ip_str2addr(char *str, int nob, __u32 *addr);
+static void libcfs_decnum_addr2str(__u32 addr, char *str);
+static void libcfs_hexnum_addr2str(__u32 addr, char *str);
+static int  libcfs_num_str2addr(char *str, int nob, __u32 *addr);
+
+struct netstrfns {
+        int          nf_type;
+        char        *nf_name;
+        char        *nf_modname;
+        void       (*nf_addr2str)(__u32 addr, char *str);
+        int        (*nf_str2addr)(char *str, int nob, __u32 *addr);
+};
+
+static struct netstrfns  libcfs_netstrfns[] = {
+        {/* .nf_type      */  LOLND,
+         /* .nf_name      */  "lo",
+         /* .nf_modname   */  "klolnd",
+         /* .nf_addr2str  */  libcfs_decnum_addr2str,
+         /* .nf_str2addr  */  libcfs_lo_str2addr},
+        {/* .nf_type      */  SOCKLND,
+         /* .nf_name      */  "tcp",
+         /* .nf_modname   */  "ksocklnd",
+         /* .nf_addr2str  */  libcfs_ip_addr2str,
+         /* .nf_str2addr  */  libcfs_ip_str2addr},
+        {/* .nf_type      */  O2IBLND,
+         /* .nf_name      */  "o2ib",
+         /* .nf_modname   */  "ko2iblnd",
+         /* .nf_addr2str  */  libcfs_ip_addr2str,
+         /* .nf_str2addr  */  libcfs_ip_str2addr},
+        {/* .nf_type      */  CIBLND,
+         /* .nf_name      */  "cib",
+         /* .nf_modname   */  "kciblnd",
+         /* .nf_addr2str  */  libcfs_ip_addr2str,
+         /* .nf_str2addr  */  libcfs_ip_str2addr},
+        {/* .nf_type      */  OPENIBLND,
+         /* .nf_name      */  "openib",
+         /* .nf_modname   */  "kopeniblnd",
+         /* .nf_addr2str  */  libcfs_ip_addr2str,
+         /* .nf_str2addr  */  libcfs_ip_str2addr},
+        {/* .nf_type      */  IIBLND,
+         /* .nf_name      */  "iib",
+         /* .nf_modname   */  "kiiblnd",
+         /* .nf_addr2str  */  libcfs_ip_addr2str,
+         /* .nf_str2addr  */  libcfs_ip_str2addr},
+        {/* .nf_type      */  VIBLND,
+         /* .nf_name      */  "vib",
+         /* .nf_modname   */  "kviblnd",
+         /* .nf_addr2str  */  libcfs_ip_addr2str,
+         /* .nf_str2addr  */  libcfs_ip_str2addr},
+        {/* .nf_type      */  RALND,
+         /* .nf_name      */  "ra",
+         /* .nf_modname   */  "kralnd",
+         /* .nf_addr2str  */  libcfs_ip_addr2str,
+         /* .nf_str2addr  */  libcfs_ip_str2addr},
+        {/* .nf_type      */  QSWLND,
+         /* .nf_name      */  "elan",
+         /* .nf_modname   */  "kqswlnd",
+         /* .nf_addr2str  */  libcfs_decnum_addr2str,
+         /* .nf_str2addr  */  libcfs_num_str2addr},
+        {/* .nf_type      */  GMLND,
+         /* .nf_name      */  "gm",
+         /* .nf_modname   */  "kgmlnd",
+         /* .nf_addr2str  */  libcfs_hexnum_addr2str,
+         /* .nf_str2addr  */  libcfs_num_str2addr},
+        {/* .nf_type      */  MXLND,
+         /* .nf_name      */  "mx",
+         /* .nf_modname   */  "kmxlnd",
+         /* .nf_addr2str  */  libcfs_ip_addr2str,
+         /* .nf_str2addr  */  libcfs_ip_str2addr},
+        {/* .nf_type      */  PTLLND,
+         /* .nf_name      */  "ptl",
+         /* .nf_modname   */  "kptllnd",
+         /* .nf_addr2str  */  libcfs_decnum_addr2str,
+         /* .nf_str2addr  */  libcfs_num_str2addr},
+        /* placeholder for net0 alias.  It MUST BE THE LAST ENTRY */
+        {/* .nf_type      */  -1},
+};
+
+const int libcfs_nnetstrfns = sizeof(libcfs_netstrfns)/sizeof(libcfs_netstrfns[0]);
+
+int
+libcfs_lo_str2addr(char *str, int nob, __u32 *addr)
+{
+        *addr = 0;
+        return 1;
+}
+
+void
+libcfs_ip_addr2str(__u32 addr, char *str)
+{
+#if 0   /* never lookup */
+#if !defined(__KERNEL__) && defined HAVE_GETHOSTBYNAME
+        __u32           netip = htonl(addr);
+        struct hostent *he = gethostbyaddr(&netip, sizeof(netip), AF_INET);
+
+        if (he != NULL) {
+                snprintf(str, LNET_NIDSTR_SIZE, "%s", he->h_name);
+                return;
+        }
+#endif
+#endif
+        snprintf(str, LNET_NIDSTR_SIZE, "%u.%u.%u.%u",
+                 (addr >> 24) & 0xff, (addr >> 16) & 0xff,
+                 (addr >> 8) & 0xff, addr & 0xff);
+}
+
+/* CAVEAT EMPTOR XscanfX
+ * I use "%n" at the end of a sscanf format to detect trailing junk.  However
+ * sscanf may return immediately if it sees the terminating '\0' in a string, so
+ * I initialise the %n variable to the expected length.  If sscanf sets it,
+ * fine, if it doesn't, then the scan ended at the end of the string, which is
+ * fine too :) */
+
+int
+libcfs_ip_str2addr(char *str, int nob, __u32 *addr)
+{
+        int   a;
+        int   b;
+        int   c;
+        int   d;
+        int   n = nob;                          /* XscanfX */
+
+        /* numeric IP? */
+        if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 &&
+            n == nob &&
+            (a & ~0xff) == 0 && (b & ~0xff) == 0 &&
+            (c & ~0xff) == 0 && (d & ~0xff) == 0) {
+                *addr = ((a<<24)|(b<<16)|(c<<8)|d);
+                return 1;
+        }
+
+#if !defined(__KERNEL__) && defined HAVE_GETHOSTBYNAME
+        /* known hostname? */
+        if (('a' <= str[0] && str[0] <= 'z') ||
+            ('A' <= str[0] && str[0] <= 'Z')) {
+                char *tmp;
+
+                LIBCFS_ALLOC(tmp, nob + 1);
+                if (tmp != NULL) {
+                        struct hostent *he;
+
+                        memcpy(tmp, str, nob);
+                        tmp[nob] = 0;
+
+                        he = gethostbyname(tmp);
+
+                        LIBCFS_FREE(tmp, nob);
+
+                        if (he != NULL) {
+                                __u32 ip = *(__u32 *)he->h_addr;
+
+                                *addr = ntohl(ip);
+                                return 1;
+                        }
+                }
+        }
+#endif
+        return 0;
+}
+
+void
+libcfs_decnum_addr2str(__u32 addr, char *str)
+{
+        snprintf(str, LNET_NIDSTR_SIZE, "%u", addr);
+}
+
+void
+libcfs_hexnum_addr2str(__u32 addr, char *str)
+{
+        snprintf(str, LNET_NIDSTR_SIZE, "0x%x", addr);
+}
+
+int
+libcfs_num_str2addr(char *str, int nob, __u32 *addr)
+{
+        int     n;
+
+        n = nob;
+        if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob)
+                return 1;
+
+        n = nob;
+        if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob)
+                return 1;
+
+        n = nob;
+        if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob)
+                return 1;
+        
+        return 0;
+}
+
+struct netstrfns *
+libcfs_lnd2netstrfns(int lnd)
+{
+        int    i;
+
+        if (lnd >= 0)
+                for (i = 0; i < libcfs_nnetstrfns; i++)
+                        if (lnd == libcfs_netstrfns[i].nf_type)
+                                return &libcfs_netstrfns[i];
+
+        return NULL;
+}
+
+struct netstrfns *
+libcfs_name2netstrfns(char *name)
+{
+        int    i;
+
+        for (i = 0; i < libcfs_nnetstrfns; i++)
+                if (libcfs_netstrfns[i].nf_type >= 0 &&
+                    !strcmp(libcfs_netstrfns[i].nf_name, name))
+                        return &libcfs_netstrfns[i];
+
+        return NULL;
+}
+
+int
+libcfs_isknown_lnd(int type)
+{
+        return libcfs_lnd2netstrfns(type) != NULL;
+}
+
+char *
+libcfs_lnd2modname(int lnd)
+{
+        struct netstrfns *nf = libcfs_lnd2netstrfns(lnd);
+
+        return (nf == NULL) ? NULL : nf->nf_modname;
+}
+
+char *
+libcfs_lnd2str(int lnd)
+{
+        char           *str;
+        struct netstrfns *nf = libcfs_lnd2netstrfns(lnd);
+
+        if (nf != NULL)
+                return nf->nf_name;
+
+        str = libcfs_next_nidstring();
+        snprintf(str, LNET_NIDSTR_SIZE, "?%u?", lnd);
+        return str;
+}
+
+int
+libcfs_str2lnd(char *str)
+{
+        struct netstrfns *nf = libcfs_name2netstrfns(str);
+
+        if (nf != NULL)
+                return nf->nf_type;
+
+        return -1;
+}
+
+char *
+libcfs_net2str(__u32 net)
+{
+        int               lnd = LNET_NETTYP(net);
+        int               num = LNET_NETNUM(net);
+        struct netstrfns *nf  = libcfs_lnd2netstrfns(lnd);
+        char             *str = libcfs_next_nidstring();
+
+        if (nf == NULL)
+                snprintf(str, LNET_NIDSTR_SIZE, "<%u:%u>", lnd, num);
+        else if (num == 0)
+                snprintf(str, LNET_NIDSTR_SIZE, "%s", nf->nf_name);
+        else
+                snprintf(str, LNET_NIDSTR_SIZE, "%s%u", nf->nf_name, num);
+
+        return str;
+}
+
+char *
+libcfs_nid2str(lnet_nid_t nid)
+{
+        __u32             addr = LNET_NIDADDR(nid);
+        __u32             net = LNET_NIDNET(nid);
+        int               lnd = LNET_NETTYP(net);
+        int               nnum = LNET_NETNUM(net);
+        struct netstrfns *nf;
+        char             *str;
+        int               nob;
+
+        if (nid == LNET_NID_ANY)
+                return "LNET_NID_ANY";
+
+        nf = libcfs_lnd2netstrfns(lnd);
+        str = libcfs_next_nidstring();
+
+        if (nf == NULL)
+                snprintf(str, LNET_NIDSTR_SIZE, "%x@<%u:%u>", addr, lnd, nnum);
+        else {
+                nf->nf_addr2str(addr, str);
+                nob = strlen(str);
+                if (nnum == 0)
+                        snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s",
+                                 nf->nf_name);
+                else
+                        snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s%u",
+                                 nf->nf_name, nnum);
+        }
+
+        return str;
+}
+
+static struct netstrfns *
+libcfs_str2net_internal(char *str, __u32 *net)
+{
+        struct netstrfns *nf;
+        int               nob;
+        int               netnum;
+        int               i;
+
+        for (i = 0; i < libcfs_nnetstrfns; i++) {
+                nf = &libcfs_netstrfns[i];
+                if (nf->nf_type >= 0 &&
+                    !strncmp(str, nf->nf_name, strlen(nf->nf_name)))
+                        break;
+        }
+
+        if (i == libcfs_nnetstrfns)
+                return NULL;
+
+        nob = strlen(nf->nf_name);
+
+        if (strlen(str) == (unsigned int)nob) {
+                netnum = 0;
+        } else {
+                if (nf->nf_type == LOLND) /* net number not allowed */
+                        return NULL;
+
+                str += nob;
+                i = strlen(str);
+                if (sscanf(str, "%u%n", &netnum, &i) < 1 ||
+                    i != (int)strlen(str))
+                        return NULL;
+        }
+
+        *net = LNET_MKNET(nf->nf_type, netnum);
+        return nf;
+}
+
+__u32
+libcfs_str2net(char *str)
+{
+        __u32  net;
+
+        if (libcfs_str2net_internal(str, &net) != NULL)
+                return net;
+
+        return LNET_NIDNET(LNET_NID_ANY);
+}
+
+lnet_nid_t
+libcfs_str2nid(char *str)
+{
+        char             *sep = strchr(str, '@');
+        struct netstrfns *nf;
+        __u32             net;
+        __u32             addr;
+
+        if (sep != NULL) {
+                nf = libcfs_str2net_internal(sep + 1, &net);
+                if (nf == NULL)
+                        return LNET_NID_ANY;
+        } else {
+                sep = str + strlen(str);
+                net = LNET_MKNET(SOCKLND, 0);
+                nf = libcfs_lnd2netstrfns(SOCKLND);
+                LASSERT (nf != NULL);
+        }
+
+        if (!nf->nf_str2addr(str, sep - str, &addr))
+                return LNET_NID_ANY;
+
+        return LNET_MKNID(net, addr);
+}
+
+char *
+libcfs_id2str(lnet_process_id_t id)
+{
+        char *str = libcfs_next_nidstring();
+
+        snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s",
+                 ((id.pid & LNET_PID_USERFLAG) != 0) ? "U" : "",
+                 (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid));
+        return str;
+}
+
+int
+libcfs_str2anynid(lnet_nid_t *nidp, char *str)
+{
+        if (!strcmp(str, "*")) {
+                *nidp = LNET_NID_ANY;
+                return 1;
+        }
+
+        *nidp = libcfs_str2nid(str);
+        return *nidp != LNET_NID_ANY;
+}
+
+#ifdef __KERNEL__
+void
+libcfs_setnet0alias(int lnd)
+{
+        struct netstrfns *nf = libcfs_lnd2netstrfns(lnd);
+        struct netstrfns *nf0 = &libcfs_netstrfns[libcfs_nnetstrfns - 1];
+
+        /* Ghastly hack to allow LNET to inter-operate with portals.
+         * NET type 0 becomes an alias for whatever local network we have, and
+         * this assignment here means we can parse and print its NIDs */
+
+        LASSERT (nf != NULL);
+        LASSERT (nf0->nf_type < 0);
+
+        nf0->nf_name = "zero"; /* was nf->nf_name */
+        nf0->nf_modname = nf->nf_modname;
+        nf0->nf_addr2str = nf->nf_addr2str;
+        nf0->nf_str2addr = nf->nf_str2addr;
+        mb();
+        nf0->nf_type = 0;
+}
+
+EXPORT_SYMBOL(libcfs_isknown_lnd);
+EXPORT_SYMBOL(libcfs_lnd2modname);
+EXPORT_SYMBOL(libcfs_lnd2str);
+EXPORT_SYMBOL(libcfs_str2lnd);
+EXPORT_SYMBOL(libcfs_net2str);
+EXPORT_SYMBOL(libcfs_nid2str);
+EXPORT_SYMBOL(libcfs_str2net);
+EXPORT_SYMBOL(libcfs_str2nid);
+EXPORT_SYMBOL(libcfs_id2str);
+EXPORT_SYMBOL(libcfs_str2anynid);
+EXPORT_SYMBOL(libcfs_setnet0alias);
+#else  /* __KERNEL__ */
+void
+libcfs_setnet0alias(int lnd)
+{
+        LCONSOLE_ERROR("Liblustre cannot interoperate with old Portals.\n"
+                       "portals_compatibility must be set to 'none'.\n");
+}
+#endif
index e93ff1b..0b8e61e 100644 (file)
@@ -22,7 +22,7 @@
  */
 
 
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 #define LUSTRE_TRACEFILE_PRIVATE
 #include "tracefile.h"
 
 /* XXX move things up to the top, comment */
 union trace_data_union trace_data[NR_CPUS] __cacheline_aligned;
 
-struct rw_semaphore tracefile_sem;
 char *tracefile = NULL;
-long long tracefile_size = TRACEFILE_SIZE;
+int64_t tracefile_size = TRACEFILE_SIZE;
 static struct tracefiled_ctl trace_tctl;
 struct semaphore trace_thread_sem;
 static int thread_running = 0;
 
-static void put_pages_on_daemon_list_on_cpu(void *info);
+atomic_t tage_allocated = ATOMIC_INIT(0);
+
+static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
+                                         struct trace_cpu_data *tcd);
 
 static inline struct trace_page *tage_from_list(struct list_head *list)
 {
@@ -51,71 +53,91 @@ static struct trace_page *tage_alloc(int gfp)
         cfs_page_t        *page;
         struct trace_page *tage;
 
+        /*
+         * Don't spam console with allocation failures: they will be reported
+         * by upper layer anyway.
+         */
+        gfp |= CFS_ALLOC_NOWARN;
         page = cfs_alloc_page(gfp);
         if (page == NULL)
                 return NULL;
-        
+
         tage = cfs_alloc(sizeof(*tage), gfp);
         if (tage == NULL) {
                 cfs_free_page(page);
                 return NULL;
         }
-        
+
         tage->page = page;
+        atomic_inc(&tage_allocated);
         return tage;
 }
 
 static void tage_free(struct trace_page *tage)
 {
-        LASSERT(tage != NULL);
-        LASSERT(tage->page != NULL);
+        __LASSERT(tage != NULL);
+        __LASSERT(tage->page != NULL);
 
         cfs_free_page(tage->page);
         cfs_free(tage);
+        atomic_dec(&tage_allocated);
 }
 
 static void tage_to_tail(struct trace_page *tage, struct list_head *queue)
 {
-        LASSERT(tage != NULL);
-        LASSERT(queue != NULL);
+        __LASSERT(tage != NULL);
+        __LASSERT(queue != NULL);
 
         list_move_tail(&tage->linkage, queue);
 }
 
-static void LASSERT_TAGE_INVARIANT(struct trace_page *tage)
+int trace_refill_stock(struct trace_cpu_data *tcd, int gfp,
+                       struct list_head *stock)
 {
-        LASSERT(tage != NULL);
-        LASSERT(tage->page != NULL);
-        LASSERTF(tage->used <= CFS_PAGE_SIZE, "used = %u, PAGE_SIZE %lu\n",
-                 tage->used, CFS_PAGE_SIZE);
-        LASSERTF(cfs_page_count(tage->page) > 0, "count = %d\n",
-                 cfs_page_count(tage->page));
+        int i;
+
+        /*
+         * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+         * from here: this will lead to infinite recursion.
+         */
+
+        for (i = 0; i + tcd->tcd_cur_stock_pages < TCD_STOCK_PAGES ; ++ i) {
+                struct trace_page *tage;
+
+                tage = tage_alloc(gfp);
+                if (tage == NULL)
+                        break;
+                list_add_tail(&tage->linkage, stock);
+        }
+        return i;
 }
 
 /* return a page that has 'len' bytes left at the end */
-static struct trace_page *trace_get_tage(struct trace_cpu_data *tcd,
-                                         unsigned long len)
+static struct trace_page *trace_get_tage_try(struct trace_cpu_data *tcd,
+                                             unsigned long len)
 {
         struct trace_page *tage;
 
-        if (len > CFS_PAGE_SIZE) {
-                printk(KERN_ERR "cowardly refusing to write %lu bytes in a "
-                       "page\n", len);
-                return NULL;
-        }
-
-        if (!list_empty(&tcd->tcd_pages)) {
+        if (tcd->tcd_cur_pages > 0) {
+                __LASSERT(!list_empty(&tcd->tcd_pages));
                 tage = tage_from_list(tcd->tcd_pages.prev);
                 if (tage->used + len <= CFS_PAGE_SIZE)
                         return tage;
         }
 
         if (tcd->tcd_cur_pages < tcd->tcd_max_pages) {
-                tage = tage_alloc(CFS_ALLOC_ATOMIC);
-                if (tage == NULL) {
-                        /* the kernel should print a message for us.  fall back
-                         * to using the last page in the ring buffer. */
-                        goto ring_buffer;
+                if (tcd->tcd_cur_stock_pages > 0) {
+                        tage = tage_from_list(tcd->tcd_stock_pages.prev);
+                        -- tcd->tcd_cur_stock_pages;
+                        list_del_init(&tage->linkage);
+                } else {
+                        tage = tage_alloc(CFS_ALLOC_ATOMIC);
+                        if (tage == NULL) {
+                                printk(KERN_WARNING
+                                       "failure to allocate a tage (%ld)\n",
+                                       tcd->tcd_cur_pages);
+                                return NULL;
+                        }
                 }
 
                 tage->used = 0;
@@ -125,131 +147,346 @@ static struct trace_page *trace_get_tage(struct trace_cpu_data *tcd,
 
                 if (tcd->tcd_cur_pages > 8 && thread_running) {
                         struct tracefiled_ctl *tctl = &trace_tctl;
+                        /*
+                         * wake up tracefiled to process some pages.
+                         */
                         cfs_waitq_signal(&tctl->tctl_waitq);
                 }
                 return tage;
         }
+        return NULL;
+}
 
- ring_buffer:
-        if (thread_running) {
-                int pgcount = tcd->tcd_cur_pages / 10;
-                struct page_collection pc;
-                struct trace_page *tage;
-                struct trace_page *tmp;
+static void tcd_shrink(struct trace_cpu_data *tcd)
+{
+        int pgcount = tcd->tcd_cur_pages / 10;
+        struct page_collection pc;
+        struct trace_page *tage;
+        struct trace_page *tmp;
 
-                printk(KERN_WARNING "debug daemon buffer overflowed; discarding"
-                       " 10%% of pages (%d)\n", pgcount + 1);
+       /*
+        * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
 
-                CFS_INIT_LIST_HEAD(&pc.pc_pages);
-                spin_lock_init(&pc.pc_lock);
+        printk(KERN_WARNING "debug daemon buffer overflowed; discarding"
+               " 10%% of pages (%d of %ld)\n", pgcount + 1, tcd->tcd_cur_pages);
 
-                list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) {
-                        if (pgcount-- == 0)
-                                break;
+        CFS_INIT_LIST_HEAD(&pc.pc_pages);
+        spin_lock_init(&pc.pc_lock);
 
-                        list_move_tail(&tage->linkage, &pc.pc_pages);
-                        tcd->tcd_cur_pages--;
-                }
-                put_pages_on_daemon_list_on_cpu(&pc);
+        list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) {
+                if (pgcount-- == 0)
+                        break;
 
-                LASSERT(!list_empty(&tcd->tcd_pages));
+                list_move_tail(&tage->linkage, &pc.pc_pages);
+                tcd->tcd_cur_pages--;
         }
+        put_pages_on_tcd_daemon_list(&pc, tcd);
+}
 
-        if (list_empty(&tcd->tcd_pages))
-                return NULL;
+/* return a page that has 'len' bytes left at the end */
+static struct trace_page *trace_get_tage(struct trace_cpu_data *tcd,
+                                         unsigned long len)
+{
+        struct trace_page *tage;
+
+       /*
+        * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
+        * from here: this will lead to infinite recursion.
+        */
 
-        tage = tage_from_list(tcd->tcd_pages.next);
-        tage->used = 0;
-        tage_to_tail(tage, &tcd->tcd_pages);
+        if (len > CFS_PAGE_SIZE) {
+                printk(KERN_ERR
+                       "cowardly refusing to write %lu bytes in a page\n", len);
+                return NULL;
+        }
 
+        tage = trace_get_tage_try(tcd, len);
+        if (tage != NULL)
+                return tage;
+        if (thread_running)
+                tcd_shrink(tcd);
+        if (tcd->tcd_cur_pages > 0) {
+                tage = tage_from_list(tcd->tcd_pages.next);
+                tage->used = 0;
+                tage_to_tail(tage, &tcd->tcd_pages);
+        }
         return tage;
 }
 
-void portals_debug_msg(int subsys, int mask, char *file, const char *fn,
-                       const int line, unsigned long stack, char *format, ...)
+int libcfs_debug_vmsg2(cfs_debug_limit_state_t *cdls, int subsys, int mask,
+                       const char *file, const char *fn, const int line,
+                       const char *format1, va_list args,
+                       const char *format2, ...)
 {
-        struct trace_cpu_data *tcd;
-        struct ptldebug_header header;
-        struct trace_page *tage;
-        char *debug_buf = format;
-        int known_size, needed = 85 /* average message length */, max_nob;
-        va_list       ap;
-        unsigned long flags;
-
-#ifdef CRAY_PORTALS
-        if (mask == D_PORTALS && !(portal_debug & D_PORTALS))
-                return;
-#endif
+        struct trace_cpu_data   *tcd = NULL;
+        struct ptldebug_header   header;
+        struct trace_page       *tage;
+        /* string_buf is used only if tcd != NULL, and is always set then */
+        char                    *string_buf = NULL;
+        char                    *debug_buf;
+        int                      known_size;
+        int                      needed = 85; /* average message length */
+        int                      max_nob;
+        va_list                  ap;
+        int                      depth;
+        int                      i;
+        int                      remain;
+
         if (strchr(file, '/'))
                 file = strrchr(file, '/') + 1;
 
-        if (*(format + strlen(format) - 1) != '\n')
-                printk(KERN_INFO "format at %s:%d:%s doesn't end in newline\n",
-                       file, line, fn);
 
-        tcd = trace_get_tcd(flags);
-        if (tcd->tcd_shutting_down)
-                goto out;
+        set_ptldebug_header(&header, subsys, mask, line, CDEBUG_STACK());
 
-        set_ptldebug_header(&header, subsys, mask, line, stack);
-        known_size = sizeof(header) + strlen(file) + strlen(fn) + 2; // nulls
+        tcd = trace_get_tcd();
+        if (tcd == NULL)                /* arch may not log in IRQ context */
+                goto console;
 
- retry:
-        tage = trace_get_tage(tcd, needed + known_size);
-        if (tage == NULL) {
-                debug_buf = format;
-                if (needed + known_size > CFS_PAGE_SIZE)
-                        mask |= D_ERROR;
-                needed = strlen(format);
-                goto out;
+        if (tcd->tcd_shutting_down) {
+                trace_put_tcd(tcd);
+                tcd = NULL;
+                goto console;
         }
 
-        debug_buf = cfs_page_address(tage->page) + tage->used + known_size;
+        depth = __current_nesting_level();
+        known_size = strlen(file) + 1 + depth;
+        if (fn)
+                known_size += strlen(fn) + 1;
+
+        if (libcfs_debug_binary)
+                known_size += sizeof(header);
+
+        /*
+         * '2' used because vsnprintf returns the real size required for the
+         * output _without_ the terminating NUL, so a second pass is needed
+         * if 'needed' turns out too small for this format.
+         */
+        for (i = 0; i < 2; i++) {
+                tage = trace_get_tage(tcd, needed + known_size + 1);
+                if (tage == NULL) {
+                        if (needed + known_size > CFS_PAGE_SIZE)
+                                mask |= D_ERROR;
 
-        max_nob = CFS_PAGE_SIZE - tage->used - known_size;
-        LASSERT(max_nob > 0);
-        va_start(ap, format);
-        needed = vsnprintf(debug_buf, max_nob, format, ap);
-        va_end(ap);
+                        trace_put_tcd(tcd);
+                        tcd = NULL;
+                        goto console;
+                }
 
-        if (needed > max_nob) /* overflow.  oh poop. */
-                goto retry;
+                string_buf = (char *)cfs_page_address(tage->page)+tage->used+known_size;
+
+                max_nob = CFS_PAGE_SIZE - tage->used - known_size;
+                if (max_nob <= 0) {
+                        printk(KERN_EMERG "negative max_nob: %i\n", max_nob);
+                        mask |= D_ERROR;
+                        trace_put_tcd(tcd);
+                        tcd = NULL;
+                        goto console;
+                }
+
+                needed = 0;
+                if (format1) {
+                        va_copy(ap, args);
+                        needed = vsnprintf(string_buf, max_nob, format1, ap);
+                        va_end(ap);
+                }
+               
+
+                if (format2) {
+                       remain = max_nob - needed;
+                        if (remain < 0)
+                                remain = 0;
+               
+                        va_start(ap, format2);
+                        needed += vsnprintf(string_buf+needed, remain, format2, ap);
+                        va_end(ap);
+                }
 
+                if (needed < max_nob) /* well. printing ok.. */
+                        break;
+        }
+       
+        if (*(string_buf+needed-1) != '\n')
+                printk(KERN_INFO "format at %s:%d:%s doesn't end in newline\n",
+                       file, line, fn);
+       
         header.ph_len = known_size + needed;
-        debug_buf = cfs_page_address(tage->page) + tage->used;
+        debug_buf = (char *)cfs_page_address(tage->page) + tage->used;
 
-        memcpy(debug_buf, &header, sizeof(header));
-        tage->used += sizeof(header);
-        debug_buf += sizeof(header);
+        if (libcfs_debug_binary) {
+                memcpy(debug_buf, &header, sizeof(header));
+                tage->used += sizeof(header);
+                debug_buf += sizeof(header);
+        }
+
+        /* indent message according to the nesting level */
+        while (depth-- > 0) {
+                *(debug_buf++) = '.';
+                ++ tage->used;
+        }
 
         strcpy(debug_buf, file);
         tage->used += strlen(file) + 1;
         debug_buf += strlen(file) + 1;
 
-        strcpy(debug_buf, fn);
-        tage->used += strlen(fn) + 1;
-        debug_buf += strlen(fn) + 1;
+        if (fn) {
+                strcpy(debug_buf, fn);
+                tage->used += strlen(fn) + 1;
+                debug_buf += strlen(fn) + 1;
+        }
+
+        __LASSERT(debug_buf == string_buf);
 
         tage->used += needed;
-        if (tage->used > CFS_PAGE_SIZE)
-                printk(KERN_EMERG
-                       "tage->used == %u in portals_debug_msg\n", tage->used);
+        __LASSERT (tage->used <= CFS_PAGE_SIZE);
+
+console:
+        if (!((mask & D_CANTMASK) != 0 || (mask & libcfs_printk) != 0)) {
+                /* no console output requested */
+                if (tcd != NULL)
+                        trace_put_tcd(tcd);
+                return 1;
+        }
 
- out:
-        if ((mask & (D_EMERG | D_ERROR | D_WARNING | D_CONSOLE)) || portal_printk)
-                print_to_console(&header, mask, debug_buf, needed, file, fn);
+        if (cdls != NULL) {
+                cfs_time_t      t = cdls->cdls_next +
+                                    cfs_time_seconds(CDEBUG_MAX_LIMIT + 10);
+                cfs_duration_t  dmax = cfs_time_seconds(CDEBUG_MAX_LIMIT);
+
+                if (libcfs_console_ratelimit &&
+                    cdls->cdls_next != 0 &&     /* not first time ever */
+                    !cfs_time_after(cfs_time_current(), cdls->cdls_next)) {
+                        /* skipping a console message */
+                        cdls->cdls_count++;
+                        if (tcd != NULL)
+                                trace_put_tcd(tcd);
+                        return 1;
+                }
+
+                if (cfs_time_after(cfs_time_current(), t)) {
+                        /* last timeout was a long time ago */
+                        cdls->cdls_delay /= 8;
+                } else {
+                        cdls->cdls_delay *= 2;
+
+                        if (cdls->cdls_delay < CFS_TICK)
+                                cdls->cdls_delay = CFS_TICK;
+                        else if (cdls->cdls_delay > dmax)
+                                cdls->cdls_delay = dmax;
+                }
+
+                /* ensure cdls_next is never zero after it's been seen */
+                cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1;
+        }
+
+        if (tcd != NULL) {
+                print_to_console(&header, mask, string_buf, needed, file, fn);
+                trace_put_tcd(tcd);
+        } else {
+                string_buf = trace_get_console_buffer();
+
+                needed = 0;
+                if (format1 != NULL) {
+                        va_copy(ap, args);
+                        needed = vsnprintf(string_buf, TRACE_CONSOLE_BUFFER_SIZE, format1, ap);
+                        va_end(ap);
+                }
+                if (format2 != NULL) {
+                        remain = TRACE_CONSOLE_BUFFER_SIZE - needed;
+                        if (remain > 0) {
+                                va_start(ap, format2);
+                                needed += vsnprintf(string_buf+needed, remain, format2, ap);
+                                va_end(ap);
+                        }
+                }
+                print_to_console(&header, mask,
+                                 string_buf, needed, file, fn);
+
+                trace_put_console_buffer(string_buf);
+        }
+
+        if (cdls != NULL && cdls->cdls_count != 0) {
+                string_buf = trace_get_console_buffer();
+
+                needed = snprintf(string_buf, TRACE_CONSOLE_BUFFER_SIZE,
+                         "Skipped %d previous similar message%s\n",
+                         cdls->cdls_count, (cdls->cdls_count > 1) ? "s" : "");
+
+                print_to_console(&header, mask,
+                                 string_buf, needed, file, fn);
 
-        trace_put_tcd(tcd, flags);
+                trace_put_console_buffer(string_buf);
+                cdls->cdls_count = 0;
+        }
+
+        return 0;
+}
+EXPORT_SYMBOL(libcfs_debug_vmsg2);
+
+void
+libcfs_assertion_failed(const char *expr, const char *file,
+                        const char *func, const int line)
+{
+        libcfs_debug_msg(NULL, 0, D_EMERG, file, func, line,
+                         "ASSERTION(%s) failed\n", expr);
+        LBUG();
+}
+EXPORT_SYMBOL(libcfs_assertion_failed);
+
+void
+trace_assertion_failed(const char *str,
+                       const char *fn, const char *file, int line)
+{
+        struct ptldebug_header hdr;
+
+        libcfs_panic_in_progress = 1;
+        libcfs_catastrophe = 1;
+        mb();
+
+        set_ptldebug_header(&hdr, DEBUG_SUBSYSTEM, D_EMERG, line,
+                            CDEBUG_STACK());
+
+        print_to_console(&hdr, D_EMERG, str, strlen(str), file, fn);
+
+        LIBCFS_PANIC("Lustre debug assertion failure\n");
+
+        /* not reached */
+}
+
+static void
+panic_collect_pages(struct page_collection *pc)
+{
+        /* Do the collect_pages job on a single CPU: assumes that all other
+         * CPUs have been stopped during a panic.  If this isn't true for some
+         * arch, this will have to be implemented separately in each arch.  */
+        int                    i;
+        struct trace_cpu_data *tcd;
+
+        CFS_INIT_LIST_HEAD(&pc->pc_pages);
+
+        for (i = 0; i < NR_CPUS; i++) {
+                tcd = &trace_data[i].tcd;
+
+                list_splice(&tcd->tcd_pages, &pc->pc_pages);
+                CFS_INIT_LIST_HEAD(&tcd->tcd_pages);
+                tcd->tcd_cur_pages = 0;
+
+                if (pc->pc_want_daemon_pages) {
+                        list_splice(&tcd->tcd_daemon_pages, &pc->pc_pages);
+                        CFS_INIT_LIST_HEAD(&tcd->tcd_daemon_pages);
+                        tcd->tcd_cur_daemon_pages = 0;
+                }
+        }
 }
-EXPORT_SYMBOL(portals_debug_msg);
 
 static void collect_pages_on_cpu(void *info)
 {
         struct trace_cpu_data *tcd;
-        unsigned long flags;
         struct page_collection *pc = info;
 
-        tcd = trace_get_tcd(flags);
+        tcd = trace_get_tcd();
+        __LASSERT (tcd != NULL);
 
         spin_lock(&pc->pc_lock);
         list_splice(&tcd->tcd_pages, &pc->pc_pages);
@@ -262,15 +499,17 @@ static void collect_pages_on_cpu(void *info)
         }
         spin_unlock(&pc->pc_lock);
 
-        trace_put_tcd(tcd, flags);
+        trace_put_tcd(tcd);
 }
 
 static void collect_pages(struct page_collection *pc)
 {
-        /* needs to be fixed up for preempt */
         CFS_INIT_LIST_HEAD(&pc->pc_pages);
-        collect_pages_on_cpu(pc);
-        smp_call_function(collect_pages_on_cpu, pc, 0, 1);
+
+        if (libcfs_panic_in_progress)
+                panic_collect_pages(pc);
+        else
+                trace_call_on_all_cpus(collect_pages_on_cpu, pc);
 }
 
 static void put_pages_back_on_cpu(void *info)
@@ -278,18 +517,18 @@ static void put_pages_back_on_cpu(void *info)
         struct page_collection *pc = info;
         struct trace_cpu_data *tcd;
         struct list_head *cur_head;
-        unsigned long flags;
         struct trace_page *tage;
         struct trace_page *tmp;
 
-        tcd = trace_get_tcd(flags);
+        tcd = trace_get_tcd();
+        __LASSERT (tcd != NULL);
 
         cur_head = tcd->tcd_pages.next;
 
         spin_lock(&pc->pc_lock);
         list_for_each_entry_safe(tage, tmp, &pc->pc_pages, linkage) {
 
-                LASSERT_TAGE_INVARIANT(tage);
+                __LASSERT_TAGE_INVARIANT(tage);
 
                 if (tage->cpu != smp_processor_id())
                         continue;
@@ -299,34 +538,29 @@ static void put_pages_back_on_cpu(void *info)
         }
         spin_unlock(&pc->pc_lock);
 
-        trace_put_tcd(tcd, flags);
+        trace_put_tcd(tcd);
 }
 
 static void put_pages_back(struct page_collection *pc)
 {
-        /* needs to be fixed up for preempt */
-        put_pages_back_on_cpu(pc);
-        smp_call_function(put_pages_back_on_cpu, pc, 0, 1);
+        if (!libcfs_panic_in_progress)
+                trace_call_on_all_cpus(put_pages_back_on_cpu, pc);
 }
 
 /* Add pages to a per-cpu debug daemon ringbuffer.  This buffer makes sure that
  * we have a good amount of data at all times for dumping during an LBUG, even
  * if we have been steadily writing (and otherwise discarding) pages via the
  * debug daemon. */
-static void put_pages_on_daemon_list_on_cpu(void *info)
+static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
+                                         struct trace_cpu_data *tcd)
 {
-        struct page_collection *pc = info;
-        struct trace_cpu_data *tcd;
         struct trace_page *tage;
         struct trace_page *tmp;
-        unsigned long flags;
-
-        tcd = trace_get_tcd(flags);
 
         spin_lock(&pc->pc_lock);
         list_for_each_entry_safe(tage, tmp, &pc->pc_pages, linkage) {
 
-                LASSERT_TAGE_INVARIANT(tage);
+                __LASSERT_TAGE_INVARIANT(tage);
 
                 if (tage->cpu != smp_processor_id())
                         continue;
@@ -337,10 +571,10 @@ static void put_pages_on_daemon_list_on_cpu(void *info)
                 if (tcd->tcd_cur_daemon_pages > tcd->tcd_max_pages) {
                         struct trace_page *victim;
 
-                        LASSERT(!list_empty(&tcd->tcd_daemon_pages));
+                        __LASSERT(!list_empty(&tcd->tcd_daemon_pages));
                         victim = tage_from_list(tcd->tcd_daemon_pages.next);
 
-                        LASSERT_TAGE_INVARIANT(victim);
+                        __LASSERT_TAGE_INVARIANT(victim);
 
                         list_del(&victim->linkage);
                         tage_free(victim);
@@ -348,14 +582,23 @@ static void put_pages_on_daemon_list_on_cpu(void *info)
                 }
         }
         spin_unlock(&pc->pc_lock);
+}
 
-        trace_put_tcd(tcd, flags);
+static void put_pages_on_daemon_list_on_cpu(void *info)
+{
+        struct trace_cpu_data *tcd;
+
+        tcd = trace_get_tcd();
+        __LASSERT (tcd != NULL);
+
+        put_pages_on_tcd_daemon_list(info, tcd);
+
+        trace_put_tcd(tcd);
 }
 
 static void put_pages_on_daemon_list(struct page_collection *pc)
 {
-        put_pages_on_daemon_list_on_cpu(pc);
-        smp_call_function(put_pages_on_daemon_list_on_cpu, pc, 0, 1);
+        trace_call_on_all_cpus(put_pages_on_daemon_list_on_cpu, pc);
 }
 
 void trace_debug_print(void)
@@ -372,11 +615,11 @@ void trace_debug_print(void)
                 char *p, *file, *fn;
                 cfs_page_t *page;
 
-                LASSERT_TAGE_INVARIANT(tage);
+                __LASSERT_TAGE_INVARIANT(tage);
 
                 page = tage->page;
                 p = cfs_page_address(page);
-                while (p < ((char *)cfs_page_address(page) + CFS_PAGE_SIZE)) {
+                while (p < ((char *)cfs_page_address(page) + tage->used)) {
                         struct ptldebug_header *hdr;
                         int len;
                         hdr = (void *)p;
@@ -388,6 +631,8 @@ void trace_debug_print(void)
                         len = hdr->ph_len - (p - (char *)hdr);
 
                         print_to_console(hdr, D_EMERG, p, len, file, fn);
+
+                        p += len;
                 }
 
                 list_del(&tage->linkage);
@@ -401,13 +646,14 @@ int tracefile_dump_all_pages(char *filename)
         cfs_file_t *filp;
         struct trace_page *tage;
         struct trace_page *tmp;
-        CFS_DECL_MMSPACE;
         int rc;
 
-        down_write(&tracefile_sem);
+        CFS_DECL_MMSPACE;
+
+        tracefile_write_lock();
 
         filp = cfs_filp_open(filename,
-                             O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0666, &rc);
+                             O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600, &rc);
         if (!filp) {
                 printk(KERN_ERR "LustreError: can't open %s for dump: rc %d\n",
                        filename, rc);
@@ -427,14 +673,15 @@ int tracefile_dump_all_pages(char *filename)
         CFS_MMSPACE_OPEN;
         list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
 
-                LASSERT_TAGE_INVARIANT(tage);
+                __LASSERT_TAGE_INVARIANT(tage);
 
                 rc = cfs_filp_write(filp, cfs_page_address(tage->page),
                                     tage->used, cfs_filp_poff(filp));
-                if (rc != tage->used) {
+                if (rc != (int)tage->used) {
                         printk(KERN_WARNING "wanted to write %u but wrote "
                                "%d\n", tage->used, rc);
                         put_pages_back(&pc);
+                        __LASSERT(list_empty(&pc.pc_pages));
                         break;
                 }
                 list_del(&tage->linkage);
@@ -447,7 +694,7 @@ int tracefile_dump_all_pages(char *filename)
  close:
         cfs_filp_close(filp);
  out:
-        up_write(&tracefile_sem);
+        tracefile_write_unlock();
         return rc;
 }
 
@@ -463,7 +710,7 @@ void trace_flush_pages(void)
         collect_pages(&pc);
         list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
 
-                LASSERT_TAGE_INVARIANT(tage);
+                __LASSERT_TAGE_INVARIANT(tage);
 
                 list_del(&tage->linkage);
                 tage_free(tage);
@@ -481,15 +728,17 @@ int trace_dk(struct file *file, const char *buffer, unsigned long count,
         if (name == NULL)
                 return -ENOMEM;
 
-        if (copy_from_user(name, buffer, count)) {
+        if (copy_from_user((void *)name, (void *)buffer, count)) {
                 rc = -EFAULT;
                 goto out;
         }
 
+#if !defined(__WINNT__)
         if (name[0] != '/') {
                 rc = -EINVAL;
                 goto out;
         }
+#endif
 
         /* be nice and strip out trailing '\n' */
         for (off = count ; off > 2 && isspace(name[off - 1]); off--)
@@ -512,13 +761,13 @@ static int tracefiled(void *arg)
         struct trace_page *tmp;
         struct ptldebug_header *hdr;
         cfs_file_t *filp;
-        CFS_DECL_MMSPACE;
         int rc;
 
+        CFS_DECL_MMSPACE;
+
         /* we're started late enough that we pick up init's fs context */
         /* this is so broken in uml?  what on earth is going on? */
-        kportal_daemonize("ktracefiled");
-        reparent_to_init();
+        cfs_daemonize("ktracefiled");
 
         spin_lock_init(&pc.pc_lock);
         complete(&tctl->tctl_start);
@@ -529,7 +778,8 @@ static int tracefiled(void *arg)
                 cfs_waitlink_init(&__wait);
                 cfs_waitq_add(&tctl->tctl_waitq, &__wait);
                 set_current_state(TASK_INTERRUPTIBLE);
-                cfs_waitq_timedwait(&__wait, cfs_time_seconds(1));
+                cfs_waitq_timedwait(&__wait, CFS_TASK_INTERRUPTIBLE,
+                                    cfs_time_seconds(1));
                 cfs_waitq_del(&tctl->tctl_waitq, &__wait);
 
                 if (atomic_read(&tctl->tctl_shutdown))
@@ -541,16 +791,18 @@ static int tracefiled(void *arg)
                         continue;
 
                 filp = NULL;
-                down_read(&tracefile_sem);
+                tracefile_read_lock();
                 if (tracefile != NULL) {
-                        filp = cfs_filp_open(tracefile, O_CREAT|O_RDWR|O_LARGEFILE,
-                                        0600, &rc);
+                        filp = cfs_filp_open(tracefile,
+                                             O_CREAT | O_RDWR | O_LARGEFILE,
+                                             0600, &rc);
                         if (!(filp))
                                 printk("couldn't open %s: %d\n", tracefile, rc);
                 }
-                up_read(&tracefile_sem);
+                tracefile_read_unlock();
                 if (filp == NULL) {
                         put_pages_on_daemon_list(&pc);
+                        __LASSERT(list_empty(&pc.pc_pages));
                         continue;
                 }
 
@@ -558,7 +810,7 @@ static int tracefiled(void *arg)
 
                 /* mark the first header, so we can sort in chunks */
                 tage = tage_from_list(pc.pc_pages.next);
-                LASSERT_TAGE_INVARIANT(tage);
+                __LASSERT_TAGE_INVARIANT(tage);
 
                 hdr = cfs_page_address(tage->page);
                 hdr->ph_flags |= PH_FLAG_FIRST_RECORD;
@@ -566,25 +818,27 @@ static int tracefiled(void *arg)
                 list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
                         static loff_t f_pos;
 
-                        LASSERT_TAGE_INVARIANT(tage);
+                        __LASSERT_TAGE_INVARIANT(tage);
 
-                        if (f_pos >= tracefile_size)
+                        if (f_pos >= (off_t)tracefile_size)
                                 f_pos = 0;
                         else if (f_pos > cfs_filp_size(filp))
                                 f_pos = cfs_filp_size(filp);
 
                         rc = cfs_filp_write(filp, cfs_page_address(tage->page),
                                             tage->used, &f_pos);
-                        if (rc != tage->used) {
+                        if (rc != (int)tage->used) {
                                 printk(KERN_WARNING "wanted to write %u but "
                                        "wrote %d\n", tage->used, rc);
                                 put_pages_back(&pc);
+                                __LASSERT(list_empty(&pc.pc_pages));
                         }
                 }
                 CFS_MMSPACE_CLOSE;
 
                 cfs_filp_close(filp);
                 put_pages_on_daemon_list(&pc);
+                __LASSERT(list_empty(&pc.pc_pages));
         }
         complete(&tctl->tctl_stop);
         return 0;
@@ -633,17 +887,26 @@ void trace_stop_thread(void)
 int tracefile_init(void)
 {
         struct trace_cpu_data *tcd;
-        int i;
+        int                    i;
+        int                    rc;
+
+        rc = tracefile_init_arch();
+        if (rc != 0)
+                return rc;
 
         for (i = 0; i < NR_CPUS; i++) {
                 tcd = &trace_data[i].tcd;
                 CFS_INIT_LIST_HEAD(&tcd->tcd_pages);
+                CFS_INIT_LIST_HEAD(&tcd->tcd_stock_pages);
                 CFS_INIT_LIST_HEAD(&tcd->tcd_daemon_pages);
                 tcd->tcd_cur_pages = 0;
+                tcd->tcd_cur_stock_pages = 0;
                 tcd->tcd_cur_daemon_pages = 0;
                 tcd->tcd_max_pages = TCD_MAX_PAGES;
                 tcd->tcd_shutting_down = 0;
+                tcd->tcd_cpu = i;
         }
+
         return 0;
 }
 
@@ -652,21 +915,21 @@ static void trace_cleanup_on_cpu(void *info)
         struct trace_cpu_data *tcd;
         struct trace_page *tage;
         struct trace_page *tmp;
-        unsigned long flags;
 
-        tcd = trace_get_tcd(flags);
+        tcd = trace_get_tcd();
+        __LASSERT (tcd != NULL);
 
         tcd->tcd_shutting_down = 1;
 
         list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) {
-                LASSERT_TAGE_INVARIANT(tage);
+                __LASSERT_TAGE_INVARIANT(tage);
 
                 list_del(&tage->linkage);
                 tage_free(tage);
         }
         tcd->tcd_cur_pages = 0;
 
-        trace_put_tcd(tcd, flags);
+        trace_put_tcd(tcd);
 }
 
 static void trace_cleanup(void)
@@ -676,8 +939,9 @@ static void trace_cleanup(void)
         CFS_INIT_LIST_HEAD(&pc.pc_pages);
         spin_lock_init(&pc.pc_lock);
 
-        trace_cleanup_on_cpu(&pc);
-        smp_call_function(trace_cleanup_on_cpu, &pc, 0, 1);
+        trace_call_on_all_cpus(trace_cleanup_on_cpu, &pc);
+
+        tracefile_fini_arch();
 }
 
 void tracefile_exit(void)
index 4e7fdde..f3568e9 100644 (file)
@@ -3,6 +3,16 @@
 
 #include <libcfs/libcfs.h>
 
+/* trace file lock routines */
+
+int  tracefile_init_arch(void);
+void tracefile_fini_arch(void);
+
+void tracefile_read_lock(void);
+void tracefile_read_unlock(void);
+void tracefile_write_lock(void);
+void tracefile_write_unlock(void);
+
 int tracefile_dump_all_pages(char *filename);
 void trace_debug_print(void);
 void trace_flush_pages(void);
@@ -21,38 +31,112 @@ int trace_read_debug_mb(char *page, char **start, off_t off, int count,
 int trace_dk(struct file *file, const char *buffer, unsigned long count,
              void *data);
 
+extern void libcfs_debug_dumplog_internal(void *arg);
+extern void libcfs_register_panic_notifier(void);
+extern void libcfs_unregister_panic_notifier(void);
+extern int  libcfs_panic_in_progress;
+
 #ifdef LUSTRE_TRACEFILE_PRIVATE
 /*
  * Private declare for tracefile
  */
-#define TCD_MAX_PAGES (5 << (20 - PAGE_SHIFT))
+#define TCD_MAX_PAGES (5 << (20 - CFS_PAGE_SHIFT))
+#define TCD_STOCK_PAGES (TCD_MAX_PAGES)
 
 #define TRACEFILE_SIZE (500 << 20)
 
+/* Size of a buffer for sprinting console messages to in IRQ context (no
+ * logging in IRQ context) */
+#define TRACE_CONSOLE_BUFFER_SIZE   1024
+
 union trace_data_union {
        struct trace_cpu_data {
+               /*
+                * pages with trace records not yet processed by tracefiled.
+                */
                struct list_head        tcd_pages;
+               /* number of pages on ->tcd_pages */
                unsigned long           tcd_cur_pages;
 
+               /*
+                * pages with trace records already processed by
+                * tracefiled. These pages are kept in memory, so that some
+                * portion of log can be written in the event of LBUG. This
+                * list is maintained in LRU order.
+                *
+                * Pages are moved to ->tcd_daemon_pages by tracefiled()
+                * (put_pages_on_daemon_list()). LRU pages from this list are
+                * discarded when list grows too large.
+                */
                struct list_head        tcd_daemon_pages;
+               /* number of pages on ->tcd_daemon_pages */
                unsigned long           tcd_cur_daemon_pages;
 
+               /*
+                * Maximal number of pages allowed on ->tcd_pages and
+                * ->tcd_daemon_pages each. Always TCD_MAX_PAGES in current
+                * implementation.
+                */
                unsigned long           tcd_max_pages;
+
+               /*
+                * preallocated pages to write trace records into. Pages from
+                * ->tcd_stock_pages are moved to ->tcd_pages by
+                * portals_debug_msg().
+                *
+                * This list is necessary, because on some platforms it's
+                * impossible to perform efficient atomic page allocation in a
+                * non-blockable context.
+                *
+                * Such platforms fill ->tcd_stock_pages "on occasion", when
+                * tracing code is entered in blockable context.
+                *
+                * trace_get_tage_try() tries to get a page from
+                * ->tcd_stock_pages first and resorts to atomic page
+                * allocation only if this queue is empty. ->tcd_stock_pages
+                * is replenished when tracing code is entered in blocking
+                * context (darwin-tracefile.c:trace_get_tcd()). We try to
+                * maintain TCD_STOCK_PAGES (40 by default) pages in this
+                * queue. Atomic allocation is only required if more than
+                * TCD_STOCK_PAGES pagesful are consumed by trace records all
+                * emitted in non-blocking contexts. Which is quite unlikely.
+                */
+               struct list_head        tcd_stock_pages;
+               /* number of pages on ->tcd_stock_pages */
+               unsigned long           tcd_cur_stock_pages;
+
                int                     tcd_shutting_down;
+               int                     tcd_cpu;
        } tcd;
        char __pad[SMP_CACHE_BYTES];
 };
 
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
 struct page_collection {
        struct list_head        pc_pages;
+       /*
+        * spin-lock protecting ->pc_pages. It is taken by smp_call_function()
+        * call-back functions. XXX nikita: Which is horrible: all processors
+        * receive NMI at the same time only to be serialized by this
+        * lock. Probably ->pc_pages should be replaced with an array of
+        * NR_CPUS elements accessed locklessly.
+        */
        spinlock_t              pc_lock;
+       /*
+        * if this flag is set, collect_pages() will spill both
+        * ->tcd_daemon_pages and ->tcd_pages to the ->pc_pages. Otherwise,
+        * only ->tcd_pages are spilled.
+        */
        int                     pc_want_daemon_pages;
 };
 
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
 struct tracefiled_ctl {
        struct completion       tctl_start;
        struct completion       tctl_stop;
-       cfs_waitq_t             tctl_waitq; 
+       cfs_waitq_t             tctl_waitq;
        pid_t                   tctl_pid;
        atomic_t                tctl_shutdown;
 };
@@ -60,6 +144,8 @@ struct tracefiled_ctl {
 /*
  * small data-structure for each page owned by tracefiled.
  */
+/* XXX nikita: this declaration is internal to tracefile.c and should probably
+ * be moved there */
 struct trace_page {
        /*
         * page itself
@@ -83,14 +169,42 @@ struct trace_page {
 extern void set_ptldebug_header(struct ptldebug_header *header,
                           int subsys, int mask, const int line,
                           unsigned long stack);
-extern void print_to_console(struct ptldebug_header *hdr, int mask,
-                            char *buf, int len, char *file, const char *fn);
-extern struct trace_cpu_data * __trace_get_tcd (unsigned long *flags);
-extern void __trace_put_tcd (struct trace_cpu_data *tcd, unsigned long flags);
+extern void print_to_console(struct ptldebug_header *hdr, int mask, const char *buf,
+                            int len, const char *file, const char *fn);
+
+extern struct trace_cpu_data *trace_get_tcd(void);
+extern void trace_put_tcd(struct trace_cpu_data *tcd);
+extern char *trace_get_console_buffer(void);
+extern void trace_put_console_buffer(char *buffer);
+
+extern void trace_call_on_all_cpus(void (*fn)(void *arg), void *arg);
+
+int trace_refill_stock(struct trace_cpu_data *tcd, int gfp,
+                      struct list_head *stock);
+
+
+int tcd_owns_tage(struct trace_cpu_data *tcd, struct trace_page *tage);
+
+extern void trace_assertion_failed(const char *str, const char *fn,
+                                  const char *file, int line);
+
+/* ASSERTION that is safe to use within the debug system */
+#define __LASSERT(cond)                                                                \
+({                                                                             \
+       if (unlikely(!(cond))) {                                                \
+                trace_assertion_failed("ASSERTION("#cond") failed",            \
+                                      __FUNCTION__, __FILE__, __LINE__);       \
+       }                                                                       \
+})
 
-#define trace_get_tcd(f)       __trace_get_tcd(&(f))
-#define trace_put_tcd(t, f)    __trace_put_tcd(t, f)
+#define __LASSERT_TAGE_INVARIANT(tage)                 \
+({                                                     \
+        __LASSERT(tage != NULL);                       \
+        __LASSERT(tage->page != NULL);                 \
+        __LASSERT(tage->used <= CFS_PAGE_SIZE);                \
+        __LASSERT(cfs_page_count(tage->page) > 0);     \
+})
 
 #endif /* LUSTRE_TRACEFILE_PRIVATE */
 
-#endif /* __PORTALS_TRACEFILE_H */
+#endif /* __LIBCFS_TRACEFILE_H__ */
index 99dcd7f..a1a6779 100644 (file)
 
 /*
  * liblustre is single-threaded, so most "synchronization" APIs are trivial.
+ *
+ * XXX Liang: There are several branches share lnet with b_hd_newconfig,
+ * if we define lock APIs at here, there will be conflict with liblustre
+ * in other branches.
  */
 
 #ifndef __KERNEL__
 
+#include <stdlib.h>
+#include <libcfs/libcfs.h>
 /*
  * Optional debugging (magic stamping and checking ownership) can be added.
  */
 
+#if 0
 /*
  * spin_lock
  *
@@ -89,19 +96,6 @@ void spin_unlock_bh(spinlock_t *lock)
         (void)lock;
 }
 
-void spin_lock_irqsave(spinlock_t *lock, unsigned long flags)
-{
-        LASSERT(lock != NULL);
-        (void)lock;
-}
-
-void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
-{
-        LASSERT(lock != NULL);
-        (void)lock;
-}
-
-
 /*
  * Semaphore
  *
@@ -227,6 +221,7 @@ void up_write(struct rw_semaphore *s)
         LASSERT(s != NULL);
         (void)s;
 }
+#endif
 
 /* !__KERNEL__ */
 #endif
index ddc994c..8d968a0 100644 (file)
 #include <sys/mman.h>
 #ifndef  __CYGWIN__
 #include <stdint.h>
+#ifdef HAVE_ASM_PAGE_H
 #include <asm/page.h>
+#endif
+#ifdef HAVE_SYS_USER_H
+#include <sys/user.h>
+#endif
 #else
 #include <sys/types.h>
 #endif
 #include <stdlib.h>
 #include <string.h>
+#include <signal.h>
 #include <errno.h>
 #include <sys/stat.h>
 #include <sys/vfs.h>
 
 #include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
 
 /*
  * Sleep channel. No-op implementation.
@@ -98,6 +105,7 @@ int cfs_waitq_active(struct cfs_waitq *waitq)
 {
         LASSERT(waitq != NULL);
         (void)waitq;
+        return 0;
 }
 
 void cfs_waitq_signal(struct cfs_waitq *waitq)
@@ -112,7 +120,7 @@ void cfs_waitq_signal_nr(struct cfs_waitq *waitq, int nr)
         (void)waitq;
 }
 
-void cfs_waitq_broadcast(struct cfs_waitq *waitq)
+void cfs_waitq_broadcast(struct cfs_waitq *waitq, int state)
 {
         LASSERT(waitq != NULL);
         (void)waitq;
@@ -124,27 +132,24 @@ void cfs_waitq_wait(struct cfs_waitlink *link)
         (void)link;
 }
 
-int64_t cfs_waitq_timedwait(struct cfs_waitlink *link, int64_t timeout)
+int64_t cfs_waitq_timedwait(struct cfs_waitlink *link, int state, int64_t timeout)
 {
         LASSERT(link != NULL);
         (void)link;
+        return 0;
 }
 
 /*
  * Allocator
  */
 
-cfs_page_t *cfs_alloc_pages(unsigned int flags, unsigned int order)
+cfs_page_t *cfs_alloc_page(unsigned int flags)
 {
         cfs_page_t *pg = malloc(sizeof(*pg));
 
         if (!pg)
                 return NULL;
-#if 0 //#ifdef MAP_ANONYMOUS
-        pg->addr = mmap(0, PAGE_SIZE << order, PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
-#else
-        pg->addr = malloc(PAGE_SIZE << order);
-#endif
+        pg->addr = malloc(CFS_PAGE_SIZE);
 
         if (!pg->addr) {
                 free(pg);
@@ -153,26 +158,12 @@ cfs_page_t *cfs_alloc_pages(unsigned int flags, unsigned int order)
         return pg;
 }
 
-void cfs_free_pages(struct page *pg, int what)
+void cfs_free_page(cfs_page_t *pg)
 {
-#if 0 //#ifdef MAP_ANONYMOUS
-        munmap(pg->addr, PAGE_SIZE);
-#else
         free(pg->addr);
-#endif
         free(pg);
 }
 
-cfs_page_t *cfs_alloc_page(unsigned int flags)
-{
-        return cfs_alloc_pages(flags, 0);
-}
-
-void cfs_free_page(cfs_page_t *pg, int what)
-{
-        cfs_free_page(pg, what);
-}
-
 void *cfs_page_address(cfs_page_t *pg)
 {
         return pg->addr;
@@ -188,40 +179,11 @@ void cfs_kunmap(cfs_page_t *pg)
 }
 
 /*
- * Memory allocator
- */
-void *cfs_alloc(size_t nr_bytes, u_int32_t flags)
-{
-        void *result;
-
-        result = malloc(nr_bytes);
-        if (result != NULL && (flags & CFS_ALLOC_ZERO))
-               memset(result, 0, nr_bytes);
-}
-
-void cfs_free(void *addr)
-{
-        free(addr);
-}
-
-void *cfs_alloc_large(size_t nr_bytes)
-{
-        return cfs_alloc(nr_bytes, 0);
-}
-
-void  cfs_free_large(void *addr)
-{
-        return cfs_free(addr);
-}
-
-/*
  * SLAB allocator
  */
 
 cfs_mem_cache_t *
-cfs_mem_cache_create(const char *, size_t, size_t, unsigned long,
-                     void (*)(void *, cfs_mem_cache_t *, unsigned long),
-                     void (*)(void *, cfs_mem_cache_t *, unsigned long))
+cfs_mem_cache_create(const char *name, size_t objsize, size_t off, unsigned long flags)
 {
         cfs_mem_cache_t *c;
 
@@ -243,7 +205,7 @@ int cfs_mem_cache_destroy(cfs_mem_cache_t *c)
 
 void *cfs_mem_cache_alloc(cfs_mem_cache_t *c, int gfp)
 {
-        return cfs_alloc(c, gfp);
+        return cfs_alloc(c->size, gfp);
 }
 
 void cfs_mem_cache_free(cfs_mem_cache_t *c, void *addr)
@@ -251,6 +213,138 @@ void cfs_mem_cache_free(cfs_mem_cache_t *c, void *addr)
         cfs_free(addr);
 }
 
+/*
+ * This uses user-visible declarations from <linux/kdev_t.h>
+ */
+#ifdef __LINUX__
+#include <linux/kdev_t.h>
+#endif
+
+#ifndef MKDEV
+
+#define MAJOR(dev)      ((dev)>>8)
+#define MINOR(dev)      ((dev) & 0xff)
+#define MKDEV(ma,mi)    ((ma)<<8 | (mi))
+
+#endif
+
+cfs_rdev_t cfs_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor)
+{
+        return MKDEV(major, minor);
+}
+
+cfs_major_nr_t cfs_rdev_major(cfs_rdev_t rdev)
+{
+        return MAJOR(rdev);
+}
+
+cfs_minor_nr_t cfs_rdev_minor(cfs_rdev_t rdev)
+{
+        return MINOR(rdev);
+}
+
+void cfs_enter_debugger(void)
+{
+        /*
+         * nothing for now.
+         */
+}
+
+void cfs_daemonize(char *str)
+{
+        return;
+}
+
+cfs_sigset_t cfs_block_allsigs(void)
+{
+        cfs_sigset_t   all;
+        cfs_sigset_t   old;
+        int            rc;
+
+        sigfillset(&all);
+        rc = sigprocmask(SIG_SETMASK, &all, &old);
+        LASSERT(rc == 0);
+
+        return old;
+}
+
+cfs_sigset_t cfs_block_sigs(cfs_sigset_t blocks)
+{
+        cfs_sigset_t   old;
+        int   rc;
+        
+        rc = sigprocmask(SIG_SETMASK, &blocks, &old);
+        LASSERT (rc == 0);
+
+        return old;
+}
+
+void cfs_restore_sigs(cfs_sigset_t old)
+{
+        int   rc = sigprocmask(SIG_SETMASK, &old, NULL);
+
+        LASSERT (rc == 0);
+}
+
+int cfs_signal_pending(void)
+{
+        cfs_sigset_t    empty;
+        cfs_sigset_t    set;
+        int  rc;
+
+        rc = sigpending(&set);
+        LASSERT (rc == 0);
+
+        sigemptyset(&empty);
+
+        return !memcmp(&empty, &set, sizeof(set));
+}
+
+void cfs_clear_sigpending(void)
+{
+        return;
+}
+
+#ifdef __LINUX__
+
+/*
+ * In glibc (NOT in Linux, so check above is not right), implement
+ * stack-back-tracing through backtrace() function.
+ */
+#include <execinfo.h>
+
+void cfs_stack_trace_fill(struct cfs_stack_trace *trace)
+{
+        backtrace(trace->frame, sizeof_array(trace->frame));
+}
+
+void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no)
+{
+        if (0 <= frame_no && frame_no < sizeof_array(trace->frame))
+                return trace->frame[frame_no];
+        else
+                return NULL;
+}
+
+#else
+
+void cfs_stack_trace_fill(struct cfs_stack_trace *trace)
+{}
+void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no)
+{
+        return NULL;
+}
+
+/* __LINUX__ */
+#endif
+
+void lbug_with_loc(char *file, const char *func, const int line)
+{
+        /* No libcfs_catastrophe in userspace! */
+        libcfs_debug_msg(NULL, 0, D_EMERG, file, func, line, "LBUG\n");
+        abort();
+}
+
 
 /* !__KERNEL__ */
 #endif
index c9be01a..3000e8f 100644 (file)
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 
 #include <libcfs/kp30.h>
 #include <libcfs/libcfs.h>
-#include <libcfs/linux/portals_compat25.h>
-
-
+#include "tracefile.h"
 
 struct lc_watchdog {
-        struct timer_list lcw_timer; /* kernel timer */
+        cfs_timer_t       lcw_timer; /* kernel timer */
         struct list_head  lcw_list;
         struct timeval    lcw_last_touched;
-        struct task_struct *lcw_task;
+        cfs_task_t       *lcw_task;
 
-        void (*lcw_callback)(struct lc_watchdog *,
-                            struct task_struct *,
-                            void *data);
-        void *lcw_data;
+        void            (*lcw_callback)(pid_t, void *);
+        void             *lcw_data;
 
-        int lcw_pid;
-        int lcw_time; /* time until watchdog fires, in ms */
+        pid_t             lcw_pid;
+        int               lcw_time; /* time until watchdog fires, in ms */
 
         enum {
                 LC_WATCHDOG_DISABLED,
@@ -49,6 +45,7 @@ struct lc_watchdog {
         } lcw_state;
 };
 
+#ifdef WITH_WATCHDOG
 /*
  * The dispatcher will complete lcw_start_completion when it starts,
  * and lcw_stop_completion when it exits.
@@ -78,36 +75,44 @@ static DECLARE_MUTEX(lcw_refcount_sem);
  * List of timers that have fired that need their callbacks run by the
  * dispatcher.
  */
-static spinlock_t       lcw_pending_timers_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t lcw_pending_timers_lock = SPIN_LOCK_UNLOCKED; /* BH lock! */
 static struct list_head lcw_pending_timers = \
         LIST_HEAD_INIT(lcw_pending_timers);
 
-static struct task_struct *lcw_lookup_task(struct lc_watchdog *lcw)
+#ifdef HAVE_TASKLIST_LOCK
+static void
+lcw_dump(struct lc_watchdog *lcw)
 {
-        struct task_struct *tsk;
-        unsigned long flags;
+        cfs_task_t *tsk;
         ENTRY;
 
-        read_lock_irqsave(&tasklist_lock, flags);
+        read_lock(&tasklist_lock);
         tsk = find_task_by_pid(lcw->lcw_pid);
-        read_unlock_irqrestore(&tasklist_lock, flags);
-        if (!tsk) {
+
+        if (tsk == NULL) {
                 CWARN("Process %d was not found in the task list; "
-                      "watchdog callback may be incomplete\n", lcw->lcw_pid);
+                      "watchdog callback may be incomplete\n", (int)lcw->lcw_pid);
         } else if (tsk != lcw->lcw_task) {
-                tsk = NULL;
                 CWARN("The current process %d did not set the watchdog; "
-                      "watchdog callback may be incomplete\n", lcw->lcw_pid);
+                      "watchdog callback may be incomplete\n", (int)lcw->lcw_pid);
+        } else {
+                libcfs_debug_dumpstack(tsk);
         }
-
-        RETURN(tsk);
+        
+        read_unlock(&tasklist_lock);
+        EXIT;
 }
+#else
+static void
+lcw_dump(struct lc_watchdog *lcw)
+{
+        CERROR("unable to dump stack because of missing export\n");
+}
+#endif
 
 static void lcw_cb(unsigned long data)
 {
         struct lc_watchdog *lcw = (struct lc_watchdog *)data;
-        struct task_struct *tsk;
-        unsigned long flags;
 
         ENTRY;
 
@@ -118,47 +123,47 @@ static void lcw_cb(unsigned long data)
 
         lcw->lcw_state = LC_WATCHDOG_EXPIRED;
 
-        CWARN("Watchdog triggered for pid %d: it was inactive for %dms\n",
-              lcw->lcw_pid, (lcw->lcw_time * 1000) / HZ);
+        /* NB this warning should appear on the console, but may not get into
+         * the logs since we're running in a softirq handler */
+
+        CWARN("Watchdog triggered for pid %d: it was inactive for %ldms\n",
+              (int)lcw->lcw_pid, cfs_duration_sec(lcw->lcw_time) * 1000);
+        lcw_dump(lcw);
 
-        tsk = lcw_lookup_task(lcw);
-        if (tsk != NULL)
-                portals_debug_dumpstack(tsk);
+        spin_lock_bh(&lcw_pending_timers_lock);
 
-        spin_lock_irqsave(&lcw_pending_timers_lock, flags);
         if (list_empty(&lcw->lcw_list)) {
                 list_add(&lcw->lcw_list, &lcw_pending_timers);
                 wake_up(&lcw_event_waitq);
         }
-        spin_unlock_irqrestore(&lcw_pending_timers_lock, flags);
+
+        spin_unlock_bh(&lcw_pending_timers_lock);
 
         EXIT;
 }
 
 static int is_watchdog_fired(void)
 {
-        unsigned long flags;
         int rc;
 
         if (test_bit(LCW_FLAG_STOP, &lcw_flags))
                 return 1;
 
-        spin_lock_irqsave(&lcw_pending_timers_lock, flags);
+        spin_lock_bh(&lcw_pending_timers_lock);
         rc = !list_empty(&lcw_pending_timers);
-        spin_unlock_irqrestore(&lcw_pending_timers_lock, flags);
+        spin_unlock_bh(&lcw_pending_timers_lock);
         return rc;
 }
 
 static int lcw_dispatch_main(void *data)
 {
-        int rc = 0;
-        unsigned long flags;
+        int                 rc = 0;
+        unsigned long       flags;
         struct lc_watchdog *lcw;
-        struct task_struct *tsk;
 
         ENTRY;
 
-        kportal_daemonize("lc_watchdogd");
+        cfs_daemonize("lc_watchdogd");
 
         SIGNAL_MASK_LOCK(current, flags);
         sigfillset(&current->blocked);
@@ -173,9 +178,9 @@ static int lcw_dispatch_main(void *data)
                 if (test_bit(LCW_FLAG_STOP, &lcw_flags)) {
                         CDEBUG(D_INFO, "LCW_FLAG_STOP was set, shutting down...\n");
 
-                        spin_lock_irqsave(&lcw_pending_timers_lock, flags);
+                        spin_lock_bh(&lcw_pending_timers_lock);
                         rc = !list_empty(&lcw_pending_timers);
-                        spin_unlock_irqrestore(&lcw_pending_timers_lock, flags);
+                        spin_unlock_bh(&lcw_pending_timers_lock);
                         if (rc) {
                                 CERROR("pending timers list was not empty at "
                                        "time of watchdog dispatch shutdown\n");
@@ -183,29 +188,24 @@ static int lcw_dispatch_main(void *data)
                         break;
                 }
 
-                spin_lock_irqsave(&lcw_pending_timers_lock, flags);
+                spin_lock_bh(&lcw_pending_timers_lock);
                 while (!list_empty(&lcw_pending_timers)) {
 
                         lcw = list_entry(lcw_pending_timers.next,
                                          struct lc_watchdog,
                                          lcw_list);
                         list_del_init(&lcw->lcw_list);
-                        spin_unlock_irqrestore(&lcw_pending_timers_lock, flags);
+                        spin_unlock_bh(&lcw_pending_timers_lock);
 
-                        CDEBUG(D_INFO, "found lcw for pid %d\n", lcw->lcw_pid);
+                        CDEBUG(D_INFO, "found lcw for pid %d: inactive for %ldms\n", 
+                               (int)lcw->lcw_pid, cfs_duration_sec(lcw->lcw_time) * 1000);
 
-                        if (lcw->lcw_state != LC_WATCHDOG_DISABLED) {
-                                /*
-                                 * sanity check the task against our
-                                 * watchdog
-                                 */
-                                tsk = lcw_lookup_task(lcw);
-                                lcw->lcw_callback(lcw, tsk, lcw->lcw_data);
-                        }
+                        if (lcw->lcw_state != LC_WATCHDOG_DISABLED)
+                                lcw->lcw_callback(lcw->lcw_pid, lcw->lcw_data);
 
-                        spin_lock_irqsave(&lcw_pending_timers_lock, flags);
+                        spin_lock_bh(&lcw_pending_timers_lock);
                 }
-                spin_unlock_irqrestore(&lcw_pending_timers_lock, flags);
+                spin_unlock_bh(&lcw_pending_timers_lock);
         }
 
         complete(&lcw_stop_completion);
@@ -255,26 +255,24 @@ static void lcw_dispatch_stop(void)
 }
 
 struct lc_watchdog *lc_watchdog_add(int timeout_ms,
-                                    void (*callback)(struct lc_watchdog *,
-                                                     struct task_struct *,
-                                                     void *),
+                                    void (*callback)(pid_t, void *),
                                     void *data)
 {
         struct lc_watchdog *lcw = NULL;
         ENTRY;
 
-        PORTAL_ALLOC(lcw, sizeof(*lcw));
-        if (!lcw) {
+        LIBCFS_ALLOC(lcw, sizeof(*lcw));
+        if (lcw == NULL) {
                 CDEBUG(D_INFO, "Could not allocate new lc_watchdog\n");
                 RETURN(ERR_PTR(-ENOMEM));
         }
 
-        lcw->lcw_task = cfs_current();
-        lcw->lcw_pid = cfs_curproc_pid();
-        lcw->lcw_time = (timeout_ms * HZ) / 1000;
-        lcw->lcw_callback = callback ? callback : lc_watchdog_dumplog;
-        lcw->lcw_data = data;
-        lcw->lcw_state = LC_WATCHDOG_DISABLED;
+        lcw->lcw_task     = cfs_current();
+        lcw->lcw_pid      = cfs_curproc_pid();
+        lcw->lcw_time     = cfs_time_seconds(timeout_ms) / 1000;
+        lcw->lcw_callback = (callback != NULL) ? callback : lc_watchdog_dumplog;
+        lcw->lcw_data     = data;
+        lcw->lcw_state    = LC_WATCHDOG_DISABLED;
 
         INIT_LIST_HEAD(&lcw->lcw_list);
 
@@ -298,40 +296,31 @@ struct lc_watchdog *lc_watchdog_add(int timeout_ms,
 }
 EXPORT_SYMBOL(lc_watchdog_add);
 
-static long
-timeval_sub(struct timeval *large, struct timeval *small)
-{
-        return (large->tv_sec - small->tv_sec) * 1000000 +
-                (large->tv_usec - small->tv_usec);
-}
-
 static void lcw_update_time(struct lc_watchdog *lcw, const char *message)
 {
         struct timeval newtime;
-        unsigned long timediff;
+        struct timeval timediff;
 
         do_gettimeofday(&newtime);
         if (lcw->lcw_state == LC_WATCHDOG_EXPIRED) {
-                timediff = timeval_sub(&newtime, &lcw->lcw_last_touched);
+                cfs_timeval_sub(&newtime, &lcw->lcw_last_touched, &timediff);
                 CWARN("Expired watchdog for pid %d %s after %lu.%.4lus\n",
                       lcw->lcw_pid,
                       message,
-                      timediff / 1000000,
-                      (timediff % 1000000) / 100);
+                      timediff.tv_sec,
+                      timediff.tv_usec / 100);
         }
         lcw->lcw_last_touched = newtime;
 }
 
 void lc_watchdog_touch(struct lc_watchdog *lcw)
 {
-        unsigned long flags;
         ENTRY;
         LASSERT(lcw != NULL);
 
-        spin_lock_irqsave(&lcw_pending_timers_lock, flags);
-        if (!list_empty(&lcw->lcw_list))
-                list_del_init(&lcw->lcw_list);
-        spin_unlock_irqrestore(&lcw_pending_timers_lock, flags);
+        spin_lock_bh(&lcw_pending_timers_lock);
+        list_del_init(&lcw->lcw_list);
+        spin_unlock_bh(&lcw_pending_timers_lock);
 
         lcw_update_time(lcw, "touched");
         lcw->lcw_state = LC_WATCHDOG_ENABLED;
@@ -344,14 +333,13 @@ EXPORT_SYMBOL(lc_watchdog_touch);
 
 void lc_watchdog_disable(struct lc_watchdog *lcw)
 {
-        unsigned long flags;
         ENTRY;
         LASSERT(lcw != NULL);
 
-        spin_lock_irqsave(&lcw_pending_timers_lock, flags);
+        spin_lock_bh(&lcw_pending_timers_lock);
         if (!list_empty(&lcw->lcw_list))
                 list_del_init(&lcw->lcw_list);
-        spin_unlock_irqrestore(&lcw_pending_timers_lock, flags);
+        spin_unlock_bh(&lcw_pending_timers_lock);
 
         lcw_update_time(lcw, "disabled");
         lcw->lcw_state = LC_WATCHDOG_DISABLED;
@@ -362,7 +350,6 @@ EXPORT_SYMBOL(lc_watchdog_disable);
 
 void lc_watchdog_delete(struct lc_watchdog *lcw)
 {
-        unsigned long flags;
         ENTRY;
         LASSERT(lcw != NULL);
 
@@ -370,17 +357,17 @@ void lc_watchdog_delete(struct lc_watchdog *lcw)
 
         lcw_update_time(lcw, "deleted");
 
-        spin_lock_irqsave(&lcw_pending_timers_lock, flags);
+        spin_lock_bh(&lcw_pending_timers_lock);
         if (!list_empty(&lcw->lcw_list))
                 list_del_init(&lcw->lcw_list);
-        spin_unlock_irqrestore(&lcw_pending_timers_lock, flags);
+        spin_unlock_bh(&lcw_pending_timers_lock);
 
         down(&lcw_refcount_sem);
         if (--lcw_refcount == 0)
                 lcw_dispatch_stop();
         up(&lcw_refcount_sem);
 
-        PORTAL_FREE(lcw, sizeof(*lcw));
+        LIBCFS_FREE(lcw, sizeof(*lcw));
 
         EXIT;
 }
@@ -390,13 +377,37 @@ EXPORT_SYMBOL(lc_watchdog_delete);
  * Provided watchdog handlers
  */
 
-extern void portals_debug_dumplog_internal(void *arg);
-
-void lc_watchdog_dumplog(struct lc_watchdog *lcw,
-                         struct task_struct *tsk,
-                         void               *data)
+void lc_watchdog_dumplog(pid_t pid, void *data)
 {
-        tsk = tsk ? tsk : current;
-        portals_debug_dumplog_internal((void *)(long)tsk->pid);
+        libcfs_debug_dumplog_internal((void *)((unsigned long)pid));
 }
 EXPORT_SYMBOL(lc_watchdog_dumplog);
+
+#else   /* !defined(WITH_WATCHDOG) */
+
+struct lc_watchdog *lc_watchdog_add(int timeout_ms,
+                                    void (*callback)(pid_t pid, void *),
+                                    void *data)
+{
+        static struct lc_watchdog      watchdog;
+        return &watchdog;
+}
+EXPORT_SYMBOL(lc_watchdog_add);
+
+void lc_watchdog_touch(struct lc_watchdog *lcw)
+{
+}
+EXPORT_SYMBOL(lc_watchdog_touch);
+
+void lc_watchdog_disable(struct lc_watchdog *lcw)
+{
+}
+EXPORT_SYMBOL(lc_watchdog_disable);
+
+void lc_watchdog_delete(struct lc_watchdog *lcw)
+{
+}
+EXPORT_SYMBOL(lc_watchdog_delete);
+
+#endif
+
diff --git a/lnet/libcfs/winnt/winnt-curproc.c b/lnet/libcfs/winnt/winnt-curproc.c
new file mode 100644 (file)
index 0000000..e21c5c9
--- /dev/null
@@ -0,0 +1,453 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ *
+ * Implementation of winnt curproc routines.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+
+
+/*
+ * Implementation of cfs_curproc API (see portals/include/libcfs/curproc.h)
+ * for the Windows NT kernel.
+ */
+
+cfs_task_t this_task = 
+    { 0, 0, 0, 0, 0, 0, 0, 
+      0, 0, 0, 0,  1, 0,  0, 0, 0,
+      "sysetm\0" };
+
+
+uid_t  cfs_curproc_uid(void)
+{
+    return this_task.uid;
+}
+
+gid_t  cfs_curproc_gid(void)
+{
+    return this_task.gid;
+}
+
+uid_t  cfs_curproc_fsuid(void)
+{
+    return this_task.fsuid;
+}
+
+gid_t cfs_curproc_fsgid(void)
+{
+    return this_task.fsgid;
+}
+
+pid_t cfs_curproc_pid(void)
+{
+    return cfs_current()->pid;
+}
+
+int cfs_curproc_groups_nr(void)
+{
+    return this_task.ngroups;
+}
+
+void cfs_curproc_groups_dump(gid_t *array, int size)
+{
+    LASSERT(size <= NGROUPS);
+    size = min_t(int, size, this_task.ngroups);
+    memcpy(array, this_task.groups, size * sizeof(__u32));
+}
+
+int cfs_curproc_is_in_groups(gid_t gid)
+{
+    return in_group_p(gid);
+}
+
+mode_t cfs_curproc_umask(void)
+{
+    return this_task.umask;
+}
+
+char  *cfs_curproc_comm(void)
+{
+    return this_task.comm;
+}
+
+cfs_kernel_cap_t cfs_curproc_cap_get(void)
+{
+    return this_task.cap_effective;
+}
+
+void cfs_curproc_cap_set(cfs_kernel_cap_t cap)
+{
+    this_task.cap_effective = cap;
+}
+
+
+/*
+ * Implementation of linux task management routines
+ */
+
+
+/* global of the task manager structure */
+
+TASK_MAN TaskMan;
+
+
+/*
+ *  task slot routines
+ */
+
+PTASK_SLOT
+alloc_task_slot()
+{
+    PTASK_SLOT task = NULL;
+
+    if (TaskMan.slab) {
+        task = cfs_mem_cache_alloc(TaskMan.slab, 0);
+    } else {
+        task = cfs_alloc(sizeof(TASK_SLOT), 0);
+    }
+
+    return task;
+}
+
+void
+init_task_slot(PTASK_SLOT task)
+{
+    memset(task, 0, sizeof(TASK_SLOT));
+    task->Magic = TASKSLT_MAGIC;
+    task->task  = this_task;
+    task->task.pid = (pid_t)PsGetCurrentThreadId();
+    cfs_init_event(&task->Event, TRUE, FALSE);
+}
+
+
+void
+cleanup_task_slot(PTASK_SLOT task)
+{
+    if (TaskMan.slab) {
+        cfs_mem_cache_free(TaskMan.slab, task);
+    } else {
+        cfs_free(task);
+    }
+}
+
+/*
+ *  task manager related routines
+ */
+
+VOID
+task_manager_notify(
+    IN HANDLE   ProcessId,
+    IN HANDLE   ThreadId,
+    IN BOOLEAN  Create
+    )
+{
+    PLIST_ENTRY ListEntry = NULL; 
+    PTASK_SLOT  TaskSlot  = NULL;
+
+    spin_lock(&(TaskMan.Lock));
+
+    ListEntry = TaskMan.TaskList.Flink;
+
+    while (ListEntry != (&(TaskMan.TaskList))) {
+
+        TaskSlot = CONTAINING_RECORD(ListEntry, TASK_SLOT, Link);
+
+        if (TaskSlot->Pid == ProcessId && TaskSlot->Tid == ThreadId) {
+
+            if (Create) {
+/*
+                DbgPrint("task_manager_notify: Pid=%xh Tid %xh reused (TaskSlot->Tet = %xh)...\n",
+                         ProcessId, ThreadId, TaskSlot->Tet);
+*/
+            } else {
+                /* remove the taskslot */
+                RemoveEntryList(&(TaskSlot->Link));
+                TaskMan.NumOfTasks--;
+
+                /* now free the task slot */
+                cleanup_task_slot(TaskSlot);
+            }
+        }
+
+        ListEntry = ListEntry->Flink;
+    }
+
+    spin_unlock(&(TaskMan.Lock));
+}
+
+int
+init_task_manager()
+{
+    NTSTATUS    status;
+
+    /* initialize the content and magic */
+    memset(&TaskMan, 0, sizeof(TASK_MAN));
+    TaskMan.Magic = TASKMAN_MAGIC;
+
+    /* initialize the spinlock protection */
+    spin_lock_init(&TaskMan.Lock);
+
+    /* create slab memory cache */
+    TaskMan.slab = cfs_mem_cache_create(
+        "TSLT", sizeof(TASK_SLOT), 0, 0);
+
+    /* initialize the list header */
+    InitializeListHead(&(TaskMan.TaskList));
+
+    /* set the thread creation/destruction notify routine */
+    status = PsSetCreateThreadNotifyRoutine(task_manager_notify);
+
+    if (!NT_SUCCESS(status)) {
+        cfs_enter_debugger();
+    }
+
+    return 0;
+}
+
+void
+cleanup_task_manager()
+{
+    PLIST_ENTRY ListEntry = NULL; 
+    PTASK_SLOT  TaskSlot  = NULL;
+
+    /* we must stay in system since we succeed to register the
+       CreateThreadNotifyRoutine: task_manager_notify */
+    cfs_enter_debugger();
+
+
+    /* cleanup all the taskslots attached to the list */
+    spin_lock(&(TaskMan.Lock));
+
+    while (!IsListEmpty(&(TaskMan.TaskList))) {
+
+        ListEntry = TaskMan.TaskList.Flink;
+        TaskSlot = CONTAINING_RECORD(ListEntry, TASK_SLOT, Link);
+
+        RemoveEntryList(ListEntry);
+        cleanup_task_slot(TaskSlot);
+    }
+
+    spin_unlock(&TaskMan.Lock);
+
+    /* destroy the taskslot cache slab */
+    cfs_mem_cache_destroy(TaskMan.slab);
+    memset(&TaskMan, 0, sizeof(TASK_MAN));
+}
+
+
+/*
+ * schedule routines (task slot list)
+ */
+
+
+cfs_task_t *
+cfs_current()
+{
+    HANDLE      Pid = PsGetCurrentProcessId();
+    HANDLE      Tid = PsGetCurrentThreadId();
+    PETHREAD    Tet = PsGetCurrentThread();
+
+    PLIST_ENTRY ListEntry = NULL; 
+    PTASK_SLOT  TaskSlot  = NULL;
+
+    spin_lock(&(TaskMan.Lock));
+
+    ListEntry = TaskMan.TaskList.Flink;
+
+    while (ListEntry != (&(TaskMan.TaskList))) {
+
+        TaskSlot = CONTAINING_RECORD(ListEntry, TASK_SLOT, Link);
+
+        if (TaskSlot->Pid == Pid && TaskSlot->Tid == Tid) {
+            if (TaskSlot->Tet != Tet) {
+
+/*
+                DbgPrint("cfs_current: Pid=%xh Tid %xh Tet = %xh reused (TaskSlot->Tet = %xh)...\n",
+                         Pid, Tid, Tet, TaskSlot->Tet);
+*/
+                //
+                // The old thread has already exited. This must be a
+                // new thread which got the same Tid as the previous one.
+                //
+
+                TaskSlot->Tet = Tet;
+            }
+            break;
+
+        } else {
+
+            if ((ULONG)TaskSlot->Pid > (ULONG)Pid) {
+                TaskSlot = NULL;
+                break;
+            } else if ((ULONG)TaskSlot->Pid == (ULONG)Pid) {
+                if ((ULONG)TaskSlot->Tid > (ULONG)Tid) {
+                    TaskSlot = NULL;
+                    break;
+                }
+            }
+
+            TaskSlot =  NULL;
+        }
+
+        ListEntry = ListEntry->Flink;
+    }
+
+    if (!TaskSlot) {
+
+        TaskSlot = alloc_task_slot();
+
+        if (!TaskSlot) {
+            cfs_enter_debugger();
+            goto errorout;
+        }
+
+        init_task_slot(TaskSlot);
+
+        TaskSlot->Pid = Pid;
+        TaskSlot->Tid = Tid;
+        TaskSlot->Tet = Tet;
+
+        if (ListEntry == (&(TaskMan.TaskList))) {
+            //
+            // Empty case or the biggest case, put it to the tail.
+            //
+            InsertTailList(&(TaskMan.TaskList), &(TaskSlot->Link));
+        } else {
+            //
+            // Found a slot with a larger tid; insert just before it.
+            //
+            InsertHeadList(ListEntry->Blink, &(TaskSlot->Link));
+        }
+
+        TaskMan.NumOfTasks++;
+    }
+
+    //
+    // Check whether the task structures are arranged in the expected order.
+    //
+
+    {
+        PTASK_SLOT  Prev = NULL, Curr = NULL;
+        
+        ListEntry = TaskMan.TaskList.Flink;
+
+        while (ListEntry != (&(TaskMan.TaskList))) {
+
+            Curr = CONTAINING_RECORD(ListEntry, TASK_SLOT, Link);
+            ListEntry = ListEntry->Flink;
+
+            if (Prev) {
+                if ((ULONG)Prev->Pid > (ULONG)Curr->Pid) {
+                    cfs_enter_debugger();
+                } else if ((ULONG)Prev->Pid == (ULONG)Curr->Pid) {
+                    if ((ULONG)Prev->Tid > (ULONG)Curr->Tid) {
+                        cfs_enter_debugger();
+                    }
+                }
+            }
+
+            Prev = Curr;
+        }
+    }
+
+errorout:
+
+    spin_unlock(&(TaskMan.Lock));
+
+    if (!TaskSlot) {
+        cfs_enter_debugger();
+        return NULL;
+    }
+
+    return (&(TaskSlot->task));
+}
+
+int
+schedule_timeout(int64_t time)
+{
+    cfs_task_t * task = cfs_current();
+    PTASK_SLOT   slot = NULL;
+
+    if (!task) {
+        cfs_enter_debugger();
+        return 0;
+    }
+
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    cfs_assert(slot->Magic == TASKSLT_MAGIC);
+
+    if (time == MAX_SCHEDULE_TIMEOUT) {
+        time = 0;
+    }
+
+    return (cfs_wait_event(&(slot->Event), time) != 0);
+}
+
+int
+schedule()
+{
+    return schedule_timeout(0);
+}
+
+int
+wake_up_process(
+    cfs_task_t * task
+    )
+{
+    PTASK_SLOT   slot = NULL;
+
+    if (!task) {
+        cfs_enter_debugger();
+        return 0;
+    }
+
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    cfs_assert(slot->Magic == TASKSLT_MAGIC);
+
+    cfs_wake_event(&(slot->Event));
+
+    return TRUE;
+}
+
+void
+sleep_on(
+    cfs_waitq_t *waitq
+    )
+{
+       cfs_waitlink_t link;
+       
+       cfs_waitlink_init(&link);
+       cfs_waitq_add(waitq, &link);
+       cfs_waitq_wait(&link, CFS_TASK_INTERRUPTIBLE);
+       cfs_waitq_del(waitq, &link);
+}
+
+EXPORT_SYMBOL(cfs_curproc_uid);
+EXPORT_SYMBOL(cfs_curproc_pid);
+EXPORT_SYMBOL(cfs_curproc_gid);
+EXPORT_SYMBOL(cfs_curproc_fsuid);
+EXPORT_SYMBOL(cfs_curproc_fsgid);
+EXPORT_SYMBOL(cfs_curproc_umask);
+EXPORT_SYMBOL(cfs_curproc_comm);
+EXPORT_SYMBOL(cfs_curproc_groups_nr);
+EXPORT_SYMBOL(cfs_curproc_groups_dump);
+EXPORT_SYMBOL(cfs_curproc_is_in_groups);
+EXPORT_SYMBOL(cfs_curproc_cap_get);
+EXPORT_SYMBOL(cfs_curproc_cap_set);
diff --git a/lnet/libcfs/winnt/winnt-debug.c b/lnet/libcfs/winnt/winnt-debug.c
new file mode 100644 (file)
index 0000000..9e94f84
--- /dev/null
@@ -0,0 +1,1057 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <libcfs/kp30.h>
+#include <libcfs/libcfs.h>
+#include "tracefile.h"
+
+void lnet_debug_dumpstack(cfs_task_t *tsk)
+{ 
+       return;
+}
+
+cfs_task_t *lnet_current(void)
+{ 
+       return cfs_current();
+}
+
+int lnet_arch_debug_init(unsigned long bufsize)
+{
+       return 0;
+}
+
+int lnet_arch_debug_cleanup(void)
+{
+       return 0;
+}
+
+void lnet_run_lbug_upcall(char *file, const char *fn, const int line)
+{
+}
+
+void lbug_with_loc(char *file, const char *func, const int line)
+{
+        libcfs_catastrophe = 1;
+        CEMERG("LBUG: pid: %u thread: %#x\n",
+              (unsigned)cfs_curproc_pid(), (unsigned)PsGetCurrentThread());
+        // portals_debug_dumplog();
+        // portals_run_lbug_upcall(file, func, line);
+}
+
+#if TDI_LIBCFS_DBG
+
+/*
+ * Definitions
+ */
+
+LONG  KsDebugLevel = 0x5;
+
+
+/*
+ * Routines
+ */
+
+
+/*
+ * KsNtStatusToString
+ *   Get the error message for a specified nt status
+ *
+ * Arguments:
+ *   Status - nt status code
+ *
+ * Return Value:
+ *   PUCHAR - message string for the status code
+ *
+ * NOTES: 
+ *   N/A
+ */
+
+PUCHAR
+KsNtStatusToString (IN NTSTATUS Status)
+{
+    switch (Status) {
+
+    case 0x00000000: return "STATUS_SUCCESS";
+    case 0x00000001: return "STATUS_WAIT_1";
+    case 0x00000002: return "STATUS_WAIT_2";
+    case 0x00000003: return "STATUS_WAIT_3";
+    case 0x0000003F: return "STATUS_WAIT_63";
+    case 0x00000080: return "STATUS_ABANDONED_WAIT_0";
+    case 0x000000BF: return "STATUS_ABANDONED_WAIT_63";
+    case 0x000000C0: return "STATUS_USER_APC";
+    case 0x00000100: return "STATUS_KERNEL_APC";
+    case 0x00000101: return "STATUS_ALERTED";
+    case 0x00000102: return "STATUS_TIMEOUT";
+    case 0x00000103: return "STATUS_PENDING";
+    case 0x00000104: return "STATUS_REPARSE";
+    case 0x00000105: return "STATUS_MORE_ENTRIES";
+    case 0x00000106: return "STATUS_NOT_ALL_ASSIGNED";
+    case 0x00000107: return "STATUS_SOME_NOT_MAPPED";
+    case 0x00000108: return "STATUS_OPLOCK_BREAK_IN_PROGRESS";
+    case 0x00000109: return "STATUS_VOLUME_MOUNTED";
+    case 0x0000010A: return "STATUS_RXACT_COMMITTED";
+    case 0x0000010B: return "STATUS_NOTIFY_CLEANUP";
+    case 0x0000010C: return "STATUS_NOTIFY_ENUM_DIR";
+    case 0x0000010D: return "STATUS_NO_QUOTAS_FOR_ACCOUNT";
+    case 0x0000010E: return "STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED";
+    case 0x00000110: return "STATUS_PAGE_FAULT_TRANSITION";
+    case 0x00000111: return "STATUS_PAGE_FAULT_DEMAND_ZERO";
+    case 0x00000112: return "STATUS_PAGE_FAULT_COPY_ON_WRITE";
+    case 0x00000113: return "STATUS_PAGE_FAULT_GUARD_PAGE";
+    case 0x00000114: return "STATUS_PAGE_FAULT_PAGING_FILE";
+    case 0x00000115: return "STATUS_CACHE_PAGE_LOCKED";
+    case 0x00000116: return "STATUS_CRASH_DUMP";
+    case 0x00000117: return "STATUS_BUFFER_ALL_ZEROS";
+    case 0x00000118: return "STATUS_REPARSE_OBJECT";
+    case 0x00000119: return "STATUS_RESOURCE_REQUIREMENTS_CHANGED";
+    case 0x00000120: return "STATUS_TRANSLATION_COMPLETE";
+    case 0x00000121: return "STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY";
+    case 0x00010001: return "DBG_EXCEPTION_HANDLED";
+    case 0x00010002: return "DBG_CONTINUE";
+    case 0x40000000: return "STATUS_OBJECT_NAME_EXISTS";
+    case 0x40000001: return "STATUS_THREAD_WAS_SUSPENDED";
+    case 0x40000002: return "STATUS_WORKING_SET_LIMIT_RANGE";
+    case 0x40000003: return "STATUS_IMAGE_NOT_AT_BASE";
+    case 0x40000004: return "STATUS_RXACT_STATE_CREATED";
+    case 0x40000005: return "STATUS_SEGMENT_NOTIFICATION";
+    case 0x40000006: return "STATUS_LOCAL_USER_SESSION_KEY";
+    case 0x40000007: return "STATUS_BAD_CURRENT_DIRECTORY";
+    case 0x40000008: return "STATUS_SERIAL_MORE_WRITES";
+    case 0x40000009: return "STATUS_REGISTRY_RECOVERED";
+    case 0x4000000A: return "STATUS_FT_READ_RECOVERY_FROM_BACKUP";
+    case 0x4000000B: return "STATUS_FT_WRITE_RECOVERY";
+    case 0x4000000C: return "STATUS_SERIAL_COUNTER_TIMEOUT";
+    case 0x4000000D: return "STATUS_NULL_LM_PASSWORD";
+    case 0x4000000E: return "STATUS_IMAGE_MACHINE_TYPE_MISMATCH";
+    case 0x4000000F: return "STATUS_RECEIVE_PARTIAL";
+    case 0x40000010: return "STATUS_RECEIVE_EXPEDITED";
+    case 0x40000011: return "STATUS_RECEIVE_PARTIAL_EXPEDITED";
+    case 0x40000012: return "STATUS_EVENT_DONE";
+    case 0x40000013: return "STATUS_EVENT_PENDING";
+    case 0x40000014: return "STATUS_CHECKING_FILE_SYSTEM";
+    case 0x40000015: return "STATUS_FATAL_APP_EXIT";
+    case 0x40000016: return "STATUS_PREDEFINED_HANDLE";
+    case 0x40000017: return "STATUS_WAS_UNLOCKED";
+    case 0x40000018: return "STATUS_SERVICE_NOTIFICATION";
+    case 0x40000019: return "STATUS_WAS_LOCKED";
+    case 0x4000001A: return "STATUS_LOG_HARD_ERROR";
+    case 0x4000001B: return "STATUS_ALREADY_WIN32";
+    case 0x4000001C: return "STATUS_WX86_UNSIMULATE";
+    case 0x4000001D: return "STATUS_WX86_CONTINUE";
+    case 0x4000001E: return "STATUS_WX86_SINGLE_STEP";
+    case 0x4000001F: return "STATUS_WX86_BREAKPOINT";
+    case 0x40000020: return "STATUS_WX86_EXCEPTION_CONTINUE";
+    case 0x40000021: return "STATUS_WX86_EXCEPTION_LASTCHANCE";
+    case 0x40000022: return "STATUS_WX86_EXCEPTION_CHAIN";
+    case 0x40000023: return "STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE";
+    case 0x40000024: return "STATUS_NO_YIELD_PERFORMED";
+    case 0x40000025: return "STATUS_TIMER_RESUME_IGNORED";
+    case 0x40000026: return "STATUS_ARBITRATION_UNHANDLED";
+    case 0x40000027: return "STATUS_CARDBUS_NOT_SUPPORTED";
+    case 0x40000028: return "STATUS_WX86_CREATEWX86TIB";
+    case 0x40000029: return "STATUS_MP_PROCESSOR_MISMATCH";
+    case 0x40010001: return "DBG_REPLY_LATER";
+    case 0x40010002: return "DBG_UNABLE_TO_PROVIDE_HANDLE";
+    case 0x40010003: return "DBG_TERMINATE_THREAD";
+    case 0x40010004: return "DBG_TERMINATE_PROCESS";
+    case 0x40010005: return "DBG_CONTROL_C";
+    case 0x40010006: return "DBG_PRINTEXCEPTION_C";
+    case 0x40010007: return "DBG_RIPEXCEPTION";
+    case 0x40010008: return "DBG_CONTROL_BREAK";
+    case 0x80000001: return "STATUS_GUARD_PAGE_VIOLATION";
+    case 0x80000002: return "STATUS_DATATYPE_MISALIGNMENT";
+    case 0x80000003: return "STATUS_BREAKPOINT";
+    case 0x80000004: return "STATUS_SINGLE_STEP";
+    case 0x80000005: return "STATUS_BUFFER_OVERFLOW";
+    case 0x80000006: return "STATUS_NO_MORE_FILES";
+    case 0x80000007: return "STATUS_WAKE_SYSTEM_DEBUGGER";
+    case 0x8000000A: return "STATUS_HANDLES_CLOSED";
+    case 0x8000000B: return "STATUS_NO_INHERITANCE";
+    case 0x8000000C: return "STATUS_GUID_SUBSTITUTION_MADE";
+    case 0x8000000D: return "STATUS_PARTIAL_COPY";
+    case 0x8000000E: return "STATUS_DEVICE_PAPER_EMPTY";
+    case 0x8000000F: return "STATUS_DEVICE_POWERED_OFF";
+    case 0x80000010: return "STATUS_DEVICE_OFF_LINE";
+    case 0x80000011: return "STATUS_DEVICE_BUSY";
+    case 0x80000012: return "STATUS_NO_MORE_EAS";
+    case 0x80000013: return "STATUS_INVALID_EA_NAME";
+    case 0x80000014: return "STATUS_EA_LIST_INCONSISTENT";
+    case 0x80000015: return "STATUS_INVALID_EA_FLAG";
+    case 0x80000016: return "STATUS_VERIFY_REQUIRED";
+    case 0x80000017: return "STATUS_EXTRANEOUS_INFORMATION";
+    case 0x80000018: return "STATUS_RXACT_COMMIT_NECESSARY";
+    case 0x8000001A: return "STATUS_NO_MORE_ENTRIES";
+    case 0x8000001B: return "STATUS_FILEMARK_DETECTED";
+    case 0x8000001C: return "STATUS_MEDIA_CHANGED";
+    case 0x8000001D: return "STATUS_BUS_RESET";
+    case 0x8000001E: return "STATUS_END_OF_MEDIA";
+    case 0x8000001F: return "STATUS_BEGINNING_OF_MEDIA";
+    case 0x80000020: return "STATUS_MEDIA_CHECK";
+    case 0x80000021: return "STATUS_SETMARK_DETECTED";
+    case 0x80000022: return "STATUS_NO_DATA_DETECTED";
+    case 0x80000023: return "STATUS_REDIRECTOR_HAS_OPEN_HANDLES";
+    case 0x80000024: return "STATUS_SERVER_HAS_OPEN_HANDLES";
+    case 0x80000025: return "STATUS_ALREADY_DISCONNECTED";
+    case 0x80000026: return "STATUS_LONGJUMP";
+    case 0x80010001: return "DBG_EXCEPTION_NOT_HANDLED";
+    case 0xC0000001: return "STATUS_UNSUCCESSFUL";
+    case 0xC0000002: return "STATUS_NOT_IMPLEMENTED";
+    case 0xC0000003: return "STATUS_INVALID_INFO_CLASS";
+    case 0xC0000004: return "STATUS_INFO_LENGTH_MISMATCH";
+    case 0xC0000005: return "STATUS_ACCESS_VIOLATION";
+    case 0xC0000006: return "STATUS_IN_PAGE_ERROR";
+    case 0xC0000007: return "STATUS_PAGEFILE_QUOTA";
+    case 0xC0000008: return "STATUS_INVALID_HANDLE";
+    case 0xC0000009: return "STATUS_BAD_INITIAL_STACK";
+    case 0xC000000A: return "STATUS_BAD_INITIAL_PC";
+    case 0xC000000B: return "STATUS_INVALID_CID";
+    case 0xC000000C: return "STATUS_TIMER_NOT_CANCELED";
+    case 0xC000000D: return "STATUS_INVALID_PARAMETER";
+    case 0xC000000E: return "STATUS_NO_SUCH_DEVICE";
+    case 0xC000000F: return "STATUS_NO_SUCH_FILE";
+    case 0xC0000010: return "STATUS_INVALID_DEVICE_REQUEST";
+    case 0xC0000011: return "STATUS_END_OF_FILE";
+    case 0xC0000012: return "STATUS_WRONG_VOLUME";
+    case 0xC0000013: return "STATUS_NO_MEDIA_IN_DEVICE";
+    case 0xC0000014: return "STATUS_UNRECOGNIZED_MEDIA";
+    case 0xC0000015: return "STATUS_NONEXISTENT_SECTOR";
+    case 0xC0000016: return "STATUS_MORE_PROCESSING_REQUIRED";
+    case 0xC0000017: return "STATUS_NO_MEMORY";
+    case 0xC0000018: return "STATUS_CONFLICTING_ADDRESSES";
+    case 0xC0000019: return "STATUS_NOT_MAPPED_VIEW";
+    case 0xC000001A: return "STATUS_UNABLE_TO_FREE_VM";
+    case 0xC000001B: return "STATUS_UNABLE_TO_DELETE_SECTION";
+    case 0xC000001C: return "STATUS_INVALID_SYSTEM_SERVICE";
+    case 0xC000001D: return "STATUS_ILLEGAL_INSTRUCTION";
+    case 0xC000001E: return "STATUS_INVALID_LOCK_SEQUENCE";
+    case 0xC000001F: return "STATUS_INVALID_VIEW_SIZE";
+    case 0xC0000020: return "STATUS_INVALID_FILE_FOR_SECTION";
+    case 0xC0000021: return "STATUS_ALREADY_COMMITTED";
+    case 0xC0000022: return "STATUS_ACCESS_DENIED";
+    case 0xC0000023: return "STATUS_BUFFER_TOO_SMALL";
+    case 0xC0000024: return "STATUS_OBJECT_TYPE_MISMATCH";
+    case 0xC0000025: return "STATUS_NONCONTINUABLE_EXCEPTION";
+    case 0xC0000026: return "STATUS_INVALID_DISPOSITION";
+    case 0xC0000027: return "STATUS_UNWIND";
+    case 0xC0000028: return "STATUS_BAD_STACK";
+    case 0xC0000029: return "STATUS_INVALID_UNWIND_TARGET";
+    case 0xC000002A: return "STATUS_NOT_LOCKED";
+    case 0xC000002B: return "STATUS_PARITY_ERROR";
+    case 0xC000002C: return "STATUS_UNABLE_TO_DECOMMIT_VM";
+    case 0xC000002D: return "STATUS_NOT_COMMITTED";
+    case 0xC000002E: return "STATUS_INVALID_PORT_ATTRIBUTES";
+    case 0xC000002F: return "STATUS_PORT_MESSAGE_TOO_LONG";
+    case 0xC0000030: return "STATUS_INVALID_PARAMETER_MIX";
+    case 0xC0000031: return "STATUS_INVALID_QUOTA_LOWER";
+    case 0xC0000032: return "STATUS_DISK_CORRUPT_ERROR";
+    case 0xC0000033: return "STATUS_OBJECT_NAME_INVALID";
+    case 0xC0000034: return "STATUS_OBJECT_NAME_NOT_FOUND";
+    case 0xC0000035: return "STATUS_OBJECT_NAME_COLLISION";
+    case 0xC0000037: return "STATUS_PORT_DISCONNECTED";
+    case 0xC0000038: return "STATUS_DEVICE_ALREADY_ATTACHED";
+    case 0xC0000039: return "STATUS_OBJECT_PATH_INVALID";
+    case 0xC000003A: return "STATUS_OBJECT_PATH_NOT_FOUND";
+    case 0xC000003B: return "STATUS_OBJECT_PATH_SYNTAX_BAD";
+    case 0xC000003C: return "STATUS_DATA_OVERRUN";
+    case 0xC000003D: return "STATUS_DATA_LATE_ERROR";
+    case 0xC000003E: return "STATUS_DATA_ERROR";
+    case 0xC000003F: return "STATUS_CRC_ERROR";
+    case 0xC0000040: return "STATUS_SECTION_TOO_BIG";
+    case 0xC0000041: return "STATUS_PORT_CONNECTION_REFUSED";
+    case 0xC0000042: return "STATUS_INVALID_PORT_HANDLE";
+    case 0xC0000043: return "STATUS_SHARING_VIOLATION";
+    case 0xC0000044: return "STATUS_QUOTA_EXCEEDED";
+    case 0xC0000045: return "STATUS_INVALID_PAGE_PROTECTION";
+    case 0xC0000046: return "STATUS_MUTANT_NOT_OWNED";
+    case 0xC0000047: return "STATUS_SEMAPHORE_LIMIT_EXCEEDED";
+    case 0xC0000048: return "STATUS_PORT_ALREADY_SET";
+    case 0xC0000049: return "STATUS_SECTION_NOT_IMAGE";
+    case 0xC000004A: return "STATUS_SUSPEND_COUNT_EXCEEDED";
+    case 0xC000004B: return "STATUS_THREAD_IS_TERMINATING";
+    case 0xC000004C: return "STATUS_BAD_WORKING_SET_LIMIT";
+    case 0xC000004D: return "STATUS_INCOMPATIBLE_FILE_MAP";
+    case 0xC000004E: return "STATUS_SECTION_PROTECTION";
+    case 0xC000004F: return "STATUS_EAS_NOT_SUPPORTED";
+    case 0xC0000050: return "STATUS_EA_TOO_LARGE";
+    case 0xC0000051: return "STATUS_NONEXISTENT_EA_ENTRY";
+    case 0xC0000052: return "STATUS_NO_EAS_ON_FILE";
+    case 0xC0000053: return "STATUS_EA_CORRUPT_ERROR";
+    case 0xC0000054: return "STATUS_FILE_LOCK_CONFLICT";
+    case 0xC0000055: return "STATUS_LOCK_NOT_GRANTED";
+    case 0xC0000056: return "STATUS_DELETE_PENDING";
+    case 0xC0000057: return "STATUS_CTL_FILE_NOT_SUPPORTED";
+    case 0xC0000058: return "STATUS_UNKNOWN_REVISION";
+    case 0xC0000059: return "STATUS_REVISION_MISMATCH";
+    case 0xC000005A: return "STATUS_INVALID_OWNER";
+    case 0xC000005B: return "STATUS_INVALID_PRIMARY_GROUP";
+    case 0xC000005C: return "STATUS_NO_IMPERSONATION_TOKEN";
+    case 0xC000005D: return "STATUS_CANT_DISABLE_MANDATORY";
+    case 0xC000005E: return "STATUS_NO_LOGON_SERVERS";
+    case 0xC000005F: return "STATUS_NO_SUCH_LOGON_SESSION";
+    case 0xC0000060: return "STATUS_NO_SUCH_PRIVILEGE";
+    case 0xC0000061: return "STATUS_PRIVILEGE_NOT_HELD";
+    case 0xC0000062: return "STATUS_INVALID_ACCOUNT_NAME";
+    case 0xC0000063: return "STATUS_USER_EXISTS";
+    case 0xC0000064: return "STATUS_NO_SUCH_USER";
+    case 0xC0000065: return "STATUS_GROUP_EXISTS";
+    case 0xC0000066: return "STATUS_NO_SUCH_GROUP";
+    case 0xC0000067: return "STATUS_MEMBER_IN_GROUP";
+    case 0xC0000068: return "STATUS_MEMBER_NOT_IN_GROUP";
+    case 0xC0000069: return "STATUS_LAST_ADMIN";
+    case 0xC000006A: return "STATUS_WRONG_PASSWORD";
+    case 0xC000006B: return "STATUS_ILL_FORMED_PASSWORD";
+    case 0xC000006C: return "STATUS_PASSWORD_RESTRICTION";
+    case 0xC000006D: return "STATUS_LOGON_FAILURE";
+    case 0xC000006E: return "STATUS_ACCOUNT_RESTRICTION";
+    case 0xC000006F: return "STATUS_INVALID_LOGON_HOURS";
+    case 0xC0000070: return "STATUS_INVALID_WORKSTATION";
+    case 0xC0000071: return "STATUS_PASSWORD_EXPIRED";
+    case 0xC0000072: return "STATUS_ACCOUNT_DISABLED";
+    case 0xC0000073: return "STATUS_NONE_MAPPED";
+    case 0xC0000074: return "STATUS_TOO_MANY_LUIDS_REQUESTED";
+    case 0xC0000075: return "STATUS_LUIDS_EXHAUSTED";
+    case 0xC0000076: return "STATUS_INVALID_SUB_AUTHORITY";
+    case 0xC0000077: return "STATUS_INVALID_ACL";
+    case 0xC0000078: return "STATUS_INVALID_SID";
+    case 0xC0000079: return "STATUS_INVALID_SECURITY_DESCR";
+    case 0xC000007A: return "STATUS_PROCEDURE_NOT_FOUND";
+    case 0xC000007B: return "STATUS_INVALID_IMAGE_FORMAT";
+    case 0xC000007C: return "STATUS_NO_TOKEN";
+    case 0xC000007D: return "STATUS_BAD_INHERITANCE_ACL";
+    case 0xC000007E: return "STATUS_RANGE_NOT_LOCKED";
+    case 0xC000007F: return "STATUS_DISK_FULL";
+    case 0xC0000080: return "STATUS_SERVER_DISABLED";
+    case 0xC0000081: return "STATUS_SERVER_NOT_DISABLED";
+    case 0xC0000082: return "STATUS_TOO_MANY_GUIDS_REQUESTED";
+    case 0xC0000083: return "STATUS_GUIDS_EXHAUSTED";
+    case 0xC0000084: return "STATUS_INVALID_ID_AUTHORITY";
+    case 0xC0000085: return "STATUS_AGENTS_EXHAUSTED";
+    case 0xC0000086: return "STATUS_INVALID_VOLUME_LABEL";
+    case 0xC0000087: return "STATUS_SECTION_NOT_EXTENDED";
+    case 0xC0000088: return "STATUS_NOT_MAPPED_DATA";
+    case 0xC0000089: return "STATUS_RESOURCE_DATA_NOT_FOUND";
+    case 0xC000008A: return "STATUS_RESOURCE_TYPE_NOT_FOUND";
+    case 0xC000008B: return "STATUS_RESOURCE_NAME_NOT_FOUND";
+    case 0xC000008C: return "STATUS_ARRAY_BOUNDS_EXCEEDED";
+    case 0xC000008D: return "STATUS_FLOAT_DENORMAL_OPERAND";
+    case 0xC000008E: return "STATUS_FLOAT_DIVIDE_BY_ZERO";
+    case 0xC000008F: return "STATUS_FLOAT_INEXACT_RESULT";
+    case 0xC0000090: return "STATUS_FLOAT_INVALID_OPERATION";
+    case 0xC0000091: return "STATUS_FLOAT_OVERFLOW";
+    case 0xC0000092: return "STATUS_FLOAT_STACK_CHECK";
+    case 0xC0000093: return "STATUS_FLOAT_UNDERFLOW";
+    case 0xC0000094: return "STATUS_INTEGER_DIVIDE_BY_ZERO";
+    case 0xC0000095: return "STATUS_INTEGER_OVERFLOW";
+    case 0xC0000096: return "STATUS_PRIVILEGED_INSTRUCTION";
+    case 0xC0000097: return "STATUS_TOO_MANY_PAGING_FILES";
+    case 0xC0000098: return "STATUS_FILE_INVALID";
+    case 0xC0000099: return "STATUS_ALLOTTED_SPACE_EXCEEDED";
+    case 0xC000009A: return "STATUS_INSUFFICIENT_RESOURCES";
+    case 0xC000009B: return "STATUS_DFS_EXIT_PATH_FOUND";
+    case 0xC000009C: return "STATUS_DEVICE_DATA_ERROR";
+    case 0xC000009D: return "STATUS_DEVICE_NOT_CONNECTED";
+    case 0xC000009E: return "STATUS_DEVICE_POWER_FAILURE";
+    case 0xC000009F: return "STATUS_FREE_VM_NOT_AT_BASE";
+    case 0xC00000A0: return "STATUS_MEMORY_NOT_ALLOCATED";
+    case 0xC00000A1: return "STATUS_WORKING_SET_QUOTA";
+    case 0xC00000A2: return "STATUS_MEDIA_WRITE_PROTECTED";
+    case 0xC00000A3: return "STATUS_DEVICE_NOT_READY";
+    case 0xC00000A4: return "STATUS_INVALID_GROUP_ATTRIBUTES";
+    case 0xC00000A5: return "STATUS_BAD_IMPERSONATION_LEVEL";
+    case 0xC00000A6: return "STATUS_CANT_OPEN_ANONYMOUS";
+    case 0xC00000A7: return "STATUS_BAD_VALIDATION_CLASS";
+    case 0xC00000A8: return "STATUS_BAD_TOKEN_TYPE";
+    case 0xC00000A9: return "STATUS_BAD_MASTER_BOOT_RECORD";
+    case 0xC00000AA: return "STATUS_INSTRUCTION_MISALIGNMENT";
+    case 0xC00000AB: return "STATUS_INSTANCE_NOT_AVAILABLE";
+    case 0xC00000AC: return "STATUS_PIPE_NOT_AVAILABLE";
+    case 0xC00000AD: return "STATUS_INVALID_PIPE_STATE";
+    case 0xC00000AE: return "STATUS_PIPE_BUSY";
+    case 0xC00000AF: return "STATUS_ILLEGAL_FUNCTION";
+    case 0xC00000B0: return "STATUS_PIPE_DISCONNECTED";
+    case 0xC00000B1: return "STATUS_PIPE_CLOSING";
+    case 0xC00000B2: return "STATUS_PIPE_CONNECTED";
+    case 0xC00000B3: return "STATUS_PIPE_LISTENING";
+    case 0xC00000B4: return "STATUS_INVALID_READ_MODE";
+    case 0xC00000B5: return "STATUS_IO_TIMEOUT";
+    case 0xC00000B6: return "STATUS_FILE_FORCED_CLOSED";
+    case 0xC00000B7: return "STATUS_PROFILING_NOT_STARTED";
+    case 0xC00000B8: return "STATUS_PROFILING_NOT_STOPPED";
+    case 0xC00000B9: return "STATUS_COULD_NOT_INTERPRET";
+    case 0xC00000BA: return "STATUS_FILE_IS_A_DIRECTORY";
+    case 0xC00000BB: return "STATUS_NOT_SUPPORTED";
+    case 0xC00000BC: return "STATUS_REMOTE_NOT_LISTENING";
+    case 0xC00000BD: return "STATUS_DUPLICATE_NAME";
+    case 0xC00000BE: return "STATUS_BAD_NETWORK_PATH";
+    case 0xC00000BF: return "STATUS_NETWORK_BUSY";
+    case 0xC00000C0: return "STATUS_DEVICE_DOES_NOT_EXIST";
+    case 0xC00000C1: return "STATUS_TOO_MANY_COMMANDS";
+    case 0xC00000C2: return "STATUS_ADAPTER_HARDWARE_ERROR";
+    case 0xC00000C3: return "STATUS_INVALID_NETWORK_RESPONSE";
+    case 0xC00000C4: return "STATUS_UNEXPECTED_NETWORK_ERROR";
+    case 0xC00000C5: return "STATUS_BAD_REMOTE_ADAPTER";
+    case 0xC00000C6: return "STATUS_PRINT_QUEUE_FULL";
+    case 0xC00000C7: return "STATUS_NO_SPOOL_SPACE";
+    case 0xC00000C8: return "STATUS_PRINT_CANCELLED";
+    case 0xC00000C9: return "STATUS_NETWORK_NAME_DELETED";
+    case 0xC00000CA: return "STATUS_NETWORK_ACCESS_DENIED";
+    case 0xC00000CB: return "STATUS_BAD_DEVICE_TYPE";
+    case 0xC00000CC: return "STATUS_BAD_NETWORK_NAME";
+    case 0xC00000CD: return "STATUS_TOO_MANY_NAMES";
+    case 0xC00000CE: return "STATUS_TOO_MANY_SESSIONS";
+    case 0xC00000CF: return "STATUS_SHARING_PAUSED";
+    case 0xC00000D0: return "STATUS_REQUEST_NOT_ACCEPTED";
+    case 0xC00000D1: return "STATUS_REDIRECTOR_PAUSED";
+    case 0xC00000D2: return "STATUS_NET_WRITE_FAULT";
+    case 0xC00000D3: return "STATUS_PROFILING_AT_LIMIT";
+    case 0xC00000D4: return "STATUS_NOT_SAME_DEVICE";
+    case 0xC00000D5: return "STATUS_FILE_RENAMED";
+    case 0xC00000D6: return "STATUS_VIRTUAL_CIRCUIT_CLOSED";
+    case 0xC00000D7: return "STATUS_NO_SECURITY_ON_OBJECT";
+    case 0xC00000D8: return "STATUS_CANT_WAIT";
+    case 0xC00000D9: return "STATUS_PIPE_EMPTY";
+    case 0xC00000DA: return "STATUS_CANT_ACCESS_DOMAIN_INFO";
+    case 0xC00000DB: return "STATUS_CANT_TERMINATE_SELF";
+    case 0xC00000DC: return "STATUS_INVALID_SERVER_STATE";
+    case 0xC00000DD: return "STATUS_INVALID_DOMAIN_STATE";
+    case 0xC00000DE: return "STATUS_INVALID_DOMAIN_ROLE";
+    case 0xC00000DF: return "STATUS_NO_SUCH_DOMAIN";
+    case 0xC00000E0: return "STATUS_DOMAIN_EXISTS";
+    case 0xC00000E1: return "STATUS_DOMAIN_LIMIT_EXCEEDED";
+    case 0xC00000E2: return "STATUS_OPLOCK_NOT_GRANTED";
+    case 0xC00000E3: return "STATUS_INVALID_OPLOCK_PROTOCOL";
+    case 0xC00000E4: return "STATUS_INTERNAL_DB_CORRUPTION";
+    case 0xC00000E5: return "STATUS_INTERNAL_ERROR";
+    case 0xC00000E6: return "STATUS_GENERIC_NOT_MAPPED";
+    case 0xC00000E7: return "STATUS_BAD_DESCRIPTOR_FORMAT";
+    case 0xC00000E8: return "STATUS_INVALID_USER_BUFFER";
+    case 0xC00000E9: return "STATUS_UNEXPECTED_IO_ERROR";
+    case 0xC00000EA: return "STATUS_UNEXPECTED_MM_CREATE_ERR";
+    case 0xC00000EB: return "STATUS_UNEXPECTED_MM_MAP_ERROR";
+    case 0xC00000EC: return "STATUS_UNEXPECTED_MM_EXTEND_ERR";
+    case 0xC00000ED: return "STATUS_NOT_LOGON_PROCESS";
+    case 0xC00000EE: return "STATUS_LOGON_SESSION_EXISTS";
+    case 0xC00000EF: return "STATUS_INVALID_PARAMETER_1";
+    case 0xC00000F0: return "STATUS_INVALID_PARAMETER_2";
+    case 0xC00000F1: return "STATUS_INVALID_PARAMETER_3";
+    case 0xC00000F2: return "STATUS_INVALID_PARAMETER_4";
+    case 0xC00000F3: return "STATUS_INVALID_PARAMETER_5";
+    case 0xC00000F4: return "STATUS_INVALID_PARAMETER_6";
+    case 0xC00000F5: return "STATUS_INVALID_PARAMETER_7";
+    case 0xC00000F6: return "STATUS_INVALID_PARAMETER_8";
+    case 0xC00000F7: return "STATUS_INVALID_PARAMETER_9";
+    case 0xC00000F8: return "STATUS_INVALID_PARAMETER_10";
+    case 0xC00000F9: return "STATUS_INVALID_PARAMETER_11";
+    case 0xC00000FA: return "STATUS_INVALID_PARAMETER_12";
+    case 0xC00000FB: return "STATUS_REDIRECTOR_NOT_STARTED";
+    case 0xC00000FC: return "STATUS_REDIRECTOR_STARTED";
+    case 0xC00000FD: return "STATUS_STACK_OVERFLOW";
+    case 0xC00000FE: return "STATUS_NO_SUCH_PACKAGE";
+    case 0xC00000FF: return "STATUS_BAD_FUNCTION_TABLE";
+    case 0xC0000100: return "STATUS_VARIABLE_NOT_FOUND";
+    case 0xC0000101: return "STATUS_DIRECTORY_NOT_EMPTY";
+    case 0xC0000102: return "STATUS_FILE_CORRUPT_ERROR";
+    case 0xC0000103: return "STATUS_NOT_A_DIRECTORY";
+    case 0xC0000104: return "STATUS_BAD_LOGON_SESSION_STATE";
+    case 0xC0000105: return "STATUS_LOGON_SESSION_COLLISION";
+    case 0xC0000106: return "STATUS_NAME_TOO_LONG";
+    case 0xC0000107: return "STATUS_FILES_OPEN";
+    case 0xC0000108: return "STATUS_CONNECTION_IN_USE";
+    case 0xC0000109: return "STATUS_MESSAGE_NOT_FOUND";
+    case 0xC000010A: return "STATUS_PROCESS_IS_TERMINATING";
+    case 0xC000010B: return "STATUS_INVALID_LOGON_TYPE";
+    case 0xC000010C: return "STATUS_NO_GUID_TRANSLATION";
+    case 0xC000010D: return "STATUS_CANNOT_IMPERSONATE";
+    case 0xC000010E: return "STATUS_IMAGE_ALREADY_LOADED";
+    case 0xC000010F: return "STATUS_ABIOS_NOT_PRESENT";
+    case 0xC0000110: return "STATUS_ABIOS_LID_NOT_EXIST";
+    case 0xC0000111: return "STATUS_ABIOS_LID_ALREADY_OWNED";
+    case 0xC0000112: return "STATUS_ABIOS_NOT_LID_OWNER";
+    case 0xC0000113: return "STATUS_ABIOS_INVALID_COMMAND";
+    case 0xC0000114: return "STATUS_ABIOS_INVALID_LID";
+    case 0xC0000115: return "STATUS_ABIOS_SELECTOR_NOT_AVAILABLE";
+    case 0xC0000116: return "STATUS_ABIOS_INVALID_SELECTOR";
+    case 0xC0000117: return "STATUS_NO_LDT";
+    case 0xC0000118: return "STATUS_INVALID_LDT_SIZE";
+    case 0xC0000119: return "STATUS_INVALID_LDT_OFFSET";
+    case 0xC000011A: return "STATUS_INVALID_LDT_DESCRIPTOR";
+    case 0xC000011B: return "STATUS_INVALID_IMAGE_NE_FORMAT";
+    case 0xC000011C: return "STATUS_RXACT_INVALID_STATE";
+    case 0xC000011D: return "STATUS_RXACT_COMMIT_FAILURE";
+    case 0xC000011E: return "STATUS_MAPPED_FILE_SIZE_ZERO";
+    case 0xC000011F: return "STATUS_TOO_MANY_OPENED_FILES";
+    case 0xC0000120: return "STATUS_CANCELLED";
+    case 0xC0000121: return "STATUS_CANNOT_DELETE";
+    case 0xC0000122: return "STATUS_INVALID_COMPUTER_NAME";
+    case 0xC0000123: return "STATUS_FILE_DELETED";
+    case 0xC0000124: return "STATUS_SPECIAL_ACCOUNT";
+    case 0xC0000125: return "STATUS_SPECIAL_GROUP";
+    case 0xC0000126: return "STATUS_SPECIAL_USER";
+    case 0xC0000127: return "STATUS_MEMBERS_PRIMARY_GROUP";
+    case 0xC0000128: return "STATUS_FILE_CLOSED";
+    case 0xC0000129: return "STATUS_TOO_MANY_THREADS";
+    case 0xC000012A: return "STATUS_THREAD_NOT_IN_PROCESS";
+    case 0xC000012B: return "STATUS_TOKEN_ALREADY_IN_USE";
+    case 0xC000012C: return "STATUS_PAGEFILE_QUOTA_EXCEEDED";
+    case 0xC000012D: return "STATUS_COMMITMENT_LIMIT";
+    case 0xC000012E: return "STATUS_INVALID_IMAGE_LE_FORMAT";
+    case 0xC000012F: return "STATUS_INVALID_IMAGE_NOT_MZ";
+    case 0xC0000130: return "STATUS_INVALID_IMAGE_PROTECT";
+    case 0xC0000131: return "STATUS_INVALID_IMAGE_WIN_16";
+    case 0xC0000132: return "STATUS_LOGON_SERVER_CONFLICT";
+    case 0xC0000133: return "STATUS_TIME_DIFFERENCE_AT_DC";
+    case 0xC0000134: return "STATUS_SYNCHRONIZATION_REQUIRED";
+    case 0xC0000135: return "STATUS_DLL_NOT_FOUND";
+    case 0xC0000136: return "STATUS_OPEN_FAILED";
+    case 0xC0000137: return "STATUS_IO_PRIVILEGE_FAILED";
+    case 0xC0000138: return "STATUS_ORDINAL_NOT_FOUND";
+    case 0xC0000139: return "STATUS_ENTRYPOINT_NOT_FOUND";
+    case 0xC000013A: return "STATUS_CONTROL_C_EXIT";
+    case 0xC000013B: return "STATUS_LOCAL_DISCONNECT";
+    case 0xC000013C: return "STATUS_REMOTE_DISCONNECT";
+    case 0xC000013D: return "STATUS_REMOTE_RESOURCES";
+    case 0xC000013E: return "STATUS_LINK_FAILED";
+    case 0xC000013F: return "STATUS_LINK_TIMEOUT";
+    case 0xC0000140: return "STATUS_INVALID_CONNECTION";
+    case 0xC0000141: return "STATUS_INVALID_ADDRESS";
+    case 0xC0000142: return "STATUS_DLL_INIT_FAILED";
+    case 0xC0000143: return "STATUS_MISSING_SYSTEMFILE";
+    case 0xC0000144: return "STATUS_UNHANDLED_EXCEPTION";
+    case 0xC0000145: return "STATUS_APP_INIT_FAILURE";
+    case 0xC0000146: return "STATUS_PAGEFILE_CREATE_FAILED";
+    case 0xC0000147: return "STATUS_NO_PAGEFILE";
+    case 0xC0000148: return "STATUS_INVALID_LEVEL";
+    case 0xC0000149: return "STATUS_WRONG_PASSWORD_CORE";
+    case 0xC000014A: return "STATUS_ILLEGAL_FLOAT_CONTEXT";
+    case 0xC000014B: return "STATUS_PIPE_BROKEN";
+    case 0xC000014C: return "STATUS_REGISTRY_CORRUPT";
+    case 0xC000014D: return "STATUS_REGISTRY_IO_FAILED";
+    case 0xC000014E: return "STATUS_NO_EVENT_PAIR";
+    case 0xC000014F: return "STATUS_UNRECOGNIZED_VOLUME";
+    case 0xC0000150: return "STATUS_SERIAL_NO_DEVICE_INITED";
+    case 0xC0000151: return "STATUS_NO_SUCH_ALIAS";
+    case 0xC0000152: return "STATUS_MEMBER_NOT_IN_ALIAS";
+    case 0xC0000153: return "STATUS_MEMBER_IN_ALIAS";
+    case 0xC0000154: return "STATUS_ALIAS_EXISTS";
+    case 0xC0000155: return "STATUS_LOGON_NOT_GRANTED";
+    case 0xC0000156: return "STATUS_TOO_MANY_SECRETS";
+    case 0xC0000157: return "STATUS_SECRET_TOO_LONG";
+    case 0xC0000158: return "STATUS_INTERNAL_DB_ERROR";
+    case 0xC0000159: return "STATUS_FULLSCREEN_MODE";
+    case 0xC000015A: return "STATUS_TOO_MANY_CONTEXT_IDS";
+    case 0xC000015B: return "STATUS_LOGON_TYPE_NOT_GRANTED";
+    case 0xC000015C: return "STATUS_NOT_REGISTRY_FILE";
+    case 0xC000015D: return "STATUS_NT_CROSS_ENCRYPTION_REQUIRED";
+    case 0xC000015E: return "STATUS_DOMAIN_CTRLR_CONFIG_ERROR";
+    case 0xC000015F: return "STATUS_FT_MISSING_MEMBER";
+    case 0xC0000160: return "STATUS_ILL_FORMED_SERVICE_ENTRY";
+    case 0xC0000161: return "STATUS_ILLEGAL_CHARACTER";
+    case 0xC0000162: return "STATUS_UNMAPPABLE_CHARACTER";
+    case 0xC0000163: return "STATUS_UNDEFINED_CHARACTER";
+    case 0xC0000164: return "STATUS_FLOPPY_VOLUME";
+    case 0xC0000165: return "STATUS_FLOPPY_ID_MARK_NOT_FOUND";
+    case 0xC0000166: return "STATUS_FLOPPY_WRONG_CYLINDER";
+    case 0xC0000167: return "STATUS_FLOPPY_UNKNOWN_ERROR";
+    case 0xC0000168: return "STATUS_FLOPPY_BAD_REGISTERS";
+    case 0xC0000169: return "STATUS_DISK_RECALIBRATE_FAILED";
+    case 0xC000016A: return "STATUS_DISK_OPERATION_FAILED";
+    case 0xC000016B: return "STATUS_DISK_RESET_FAILED";
+    case 0xC000016C: return "STATUS_SHARED_IRQ_BUSY";
+    case 0xC000016D: return "STATUS_FT_ORPHANING";
+    case 0xC000016E: return "STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT";
+    case 0xC0000172: return "STATUS_PARTITION_FAILURE";
+    case 0xC0000173: return "STATUS_INVALID_BLOCK_LENGTH";
+    case 0xC0000174: return "STATUS_DEVICE_NOT_PARTITIONED";
+    case 0xC0000175: return "STATUS_UNABLE_TO_LOCK_MEDIA";
+    case 0xC0000176: return "STATUS_UNABLE_TO_UNLOAD_MEDIA";
+    case 0xC0000177: return "STATUS_EOM_OVERFLOW";
+    case 0xC0000178: return "STATUS_NO_MEDIA";
+    case 0xC000017A: return "STATUS_NO_SUCH_MEMBER";
+    case 0xC000017B: return "STATUS_INVALID_MEMBER";
+    case 0xC000017C: return "STATUS_KEY_DELETED";
+    case 0xC000017D: return "STATUS_NO_LOG_SPACE";
+    case 0xC000017E: return "STATUS_TOO_MANY_SIDS";
+    case 0xC000017F: return "STATUS_LM_CROSS_ENCRYPTION_REQUIRED";
+    case 0xC0000180: return "STATUS_KEY_HAS_CHILDREN";
+    case 0xC0000181: return "STATUS_CHILD_MUST_BE_VOLATILE";
+    case 0xC0000182: return "STATUS_DEVICE_CONFIGURATION_ERROR";
+    case 0xC0000183: return "STATUS_DRIVER_INTERNAL_ERROR";
+    case 0xC0000184: return "STATUS_INVALID_DEVICE_STATE";
+    case 0xC0000185: return "STATUS_IO_DEVICE_ERROR";
+    case 0xC0000186: return "STATUS_DEVICE_PROTOCOL_ERROR";
+    case 0xC0000187: return "STATUS_BACKUP_CONTROLLER";
+    case 0xC0000188: return "STATUS_LOG_FILE_FULL";
+    case 0xC0000189: return "STATUS_TOO_LATE";
+    case 0xC000018A: return "STATUS_NO_TRUST_LSA_SECRET";
+    case 0xC000018B: return "STATUS_NO_TRUST_SAM_ACCOUNT";
+    case 0xC000018C: return "STATUS_TRUSTED_DOMAIN_FAILURE";
+    case 0xC000018D: return "STATUS_TRUSTED_RELATIONSHIP_FAILURE";
+    case 0xC000018E: return "STATUS_EVENTLOG_FILE_CORRUPT";
+    case 0xC000018F: return "STATUS_EVENTLOG_CANT_START";
+    case 0xC0000190: return "STATUS_TRUST_FAILURE";
+    case 0xC0000191: return "STATUS_MUTANT_LIMIT_EXCEEDED";
+    case 0xC0000192: return "STATUS_NETLOGON_NOT_STARTED";
+    case 0xC0000193: return "STATUS_ACCOUNT_EXPIRED";
+    case 0xC0000194: return "STATUS_POSSIBLE_DEADLOCK";
+    case 0xC0000195: return "STATUS_NETWORK_CREDENTIAL_CONFLICT";
+    case 0xC0000196: return "STATUS_REMOTE_SESSION_LIMIT";
+    case 0xC0000197: return "STATUS_EVENTLOG_FILE_CHANGED";
+    case 0xC0000198: return "STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT";
+    case 0xC0000199: return "STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT";
+    case 0xC000019A: return "STATUS_NOLOGON_SERVER_TRUST_ACCOUNT";
+    case 0xC000019B: return "STATUS_DOMAIN_TRUST_INCONSISTENT";
+    case 0xC000019C: return "STATUS_FS_DRIVER_REQUIRED";
+    case 0xC0000202: return "STATUS_NO_USER_SESSION_KEY";
+    case 0xC0000203: return "STATUS_USER_SESSION_DELETED";
+    case 0xC0000204: return "STATUS_RESOURCE_LANG_NOT_FOUND";
+    case 0xC0000205: return "STATUS_INSUFF_SERVER_RESOURCES";
+    case 0xC0000206: return "STATUS_INVALID_BUFFER_SIZE";
+    case 0xC0000207: return "STATUS_INVALID_ADDRESS_COMPONENT";
+    case 0xC0000208: return "STATUS_INVALID_ADDRESS_WILDCARD";
+    case 0xC0000209: return "STATUS_TOO_MANY_ADDRESSES";
+    case 0xC000020A: return "STATUS_ADDRESS_ALREADY_EXISTS";
+    case 0xC000020B: return "STATUS_ADDRESS_CLOSED";
+    case 0xC000020C: return "STATUS_CONNECTION_DISCONNECTED";
+    case 0xC000020D: return "STATUS_CONNECTION_RESET";
+    case 0xC000020E: return "STATUS_TOO_MANY_NODES";
+    case 0xC000020F: return "STATUS_TRANSACTION_ABORTED";
+    case 0xC0000210: return "STATUS_TRANSACTION_TIMED_OUT";
+    case 0xC0000211: return "STATUS_TRANSACTION_NO_RELEASE";
+    case 0xC0000212: return "STATUS_TRANSACTION_NO_MATCH";
+    case 0xC0000213: return "STATUS_TRANSACTION_RESPONDED";
+    case 0xC0000214: return "STATUS_TRANSACTION_INVALID_ID";
+    case 0xC0000215: return "STATUS_TRANSACTION_INVALID_TYPE";
+    case 0xC0000216: return "STATUS_NOT_SERVER_SESSION";
+    case 0xC0000217: return "STATUS_NOT_CLIENT_SESSION";
+    case 0xC0000218: return "STATUS_CANNOT_LOAD_REGISTRY_FILE";
+    case 0xC0000219: return "STATUS_DEBUG_ATTACH_FAILED";
+    case 0xC000021A: return "STATUS_SYSTEM_PROCESS_TERMINATED";
+    case 0xC000021B: return "STATUS_DATA_NOT_ACCEPTED";
+    case 0xC000021C: return "STATUS_NO_BROWSER_SERVERS_FOUND";
+    case 0xC000021D: return "STATUS_VDM_HARD_ERROR";
+    case 0xC000021E: return "STATUS_DRIVER_CANCEL_TIMEOUT";
+    case 0xC000021F: return "STATUS_REPLY_MESSAGE_MISMATCH";
+    case 0xC0000220: return "STATUS_MAPPED_ALIGNMENT";
+    case 0xC0000221: return "STATUS_IMAGE_CHECKSUM_MISMATCH";
+    case 0xC0000222: return "STATUS_LOST_WRITEBEHIND_DATA";
+    case 0xC0000223: return "STATUS_CLIENT_SERVER_PARAMETERS_INVALID";
+    case 0xC0000224: return "STATUS_PASSWORD_MUST_CHANGE";
+    case 0xC0000225: return "STATUS_NOT_FOUND";
+    case 0xC0000226: return "STATUS_NOT_TINY_STREAM";
+    case 0xC0000227: return "STATUS_RECOVERY_FAILURE";
+    case 0xC0000228: return "STATUS_STACK_OVERFLOW_READ";
+    case 0xC0000229: return "STATUS_FAIL_CHECK";
+    case 0xC000022A: return "STATUS_DUPLICATE_OBJECTID";
+    case 0xC000022B: return "STATUS_OBJECTID_EXISTS";
+    case 0xC000022C: return "STATUS_CONVERT_TO_LARGE";
+    case 0xC000022D: return "STATUS_RETRY";
+    case 0xC000022E: return "STATUS_FOUND_OUT_OF_SCOPE";
+    case 0xC000022F: return "STATUS_ALLOCATE_BUCKET";
+    case 0xC0000230: return "STATUS_PROPSET_NOT_FOUND";
+    case 0xC0000231: return "STATUS_MARSHALL_OVERFLOW";
+    case 0xC0000232: return "STATUS_INVALID_VARIANT";
+    case 0xC0000233: return "STATUS_DOMAIN_CONTROLLER_NOT_FOUND";
+    case 0xC0000234: return "STATUS_ACCOUNT_LOCKED_OUT";
+    case 0xC0000235: return "STATUS_HANDLE_NOT_CLOSABLE";
+    case 0xC0000236: return "STATUS_CONNECTION_REFUSED";
+    case 0xC0000237: return "STATUS_GRACEFUL_DISCONNECT";
+    case 0xC0000238: return "STATUS_ADDRESS_ALREADY_ASSOCIATED";
+    case 0xC0000239: return "STATUS_ADDRESS_NOT_ASSOCIATED";
+    case 0xC000023A: return "STATUS_CONNECTION_INVALID";
+    case 0xC000023B: return "STATUS_CONNECTION_ACTIVE";
+    case 0xC000023C: return "STATUS_NETWORK_UNREACHABLE";
+    case 0xC000023D: return "STATUS_HOST_UNREACHABLE";
+    case 0xC000023E: return "STATUS_PROTOCOL_UNREACHABLE";
+    case 0xC000023F: return "STATUS_PORT_UNREACHABLE";
+    case 0xC0000240: return "STATUS_REQUEST_ABORTED";
+    case 0xC0000241: return "STATUS_CONNECTION_ABORTED";
+    case 0xC0000242: return "STATUS_BAD_COMPRESSION_BUFFER";
+    case 0xC0000243: return "STATUS_USER_MAPPED_FILE";
+    case 0xC0000244: return "STATUS_AUDIT_FAILED";
+    case 0xC0000245: return "STATUS_TIMER_RESOLUTION_NOT_SET";
+    case 0xC0000246: return "STATUS_CONNECTION_COUNT_LIMIT";
+    case 0xC0000247: return "STATUS_LOGIN_TIME_RESTRICTION";
+    case 0xC0000248: return "STATUS_LOGIN_WKSTA_RESTRICTION";
+    case 0xC0000249: return "STATUS_IMAGE_MP_UP_MISMATCH";
+    case 0xC0000250: return "STATUS_INSUFFICIENT_LOGON_INFO";
+    case 0xC0000251: return "STATUS_BAD_DLL_ENTRYPOINT";
+    case 0xC0000252: return "STATUS_BAD_SERVICE_ENTRYPOINT";
+    case 0xC0000253: return "STATUS_LPC_REPLY_LOST";
+    case 0xC0000254: return "STATUS_IP_ADDRESS_CONFLICT1";
+    case 0xC0000255: return "STATUS_IP_ADDRESS_CONFLICT2";
+    case 0xC0000256: return "STATUS_REGISTRY_QUOTA_LIMIT";
+    case 0xC0000257: return "STATUS_PATH_NOT_COVERED";
+    case 0xC0000258: return "STATUS_NO_CALLBACK_ACTIVE";
+    case 0xC0000259: return "STATUS_LICENSE_QUOTA_EXCEEDED";
+    case 0xC000025A: return "STATUS_PWD_TOO_SHORT";
+    case 0xC000025B: return "STATUS_PWD_TOO_RECENT";
+    case 0xC000025C: return "STATUS_PWD_HISTORY_CONFLICT";
+    case 0xC000025E: return "STATUS_PLUGPLAY_NO_DEVICE";
+    case 0xC000025F: return "STATUS_UNSUPPORTED_COMPRESSION";
+    case 0xC0000260: return "STATUS_INVALID_HW_PROFILE";
+    case 0xC0000261: return "STATUS_INVALID_PLUGPLAY_DEVICE_PATH";
+    case 0xC0000262: return "STATUS_DRIVER_ORDINAL_NOT_FOUND";
+    case 0xC0000263: return "STATUS_DRIVER_ENTRYPOINT_NOT_FOUND";
+    case 0xC0000264: return "STATUS_RESOURCE_NOT_OWNED";
+    case 0xC0000265: return "STATUS_TOO_MANY_LINKS";
+    case 0xC0000266: return "STATUS_QUOTA_LIST_INCONSISTENT";
+    case 0xC0000267: return "STATUS_FILE_IS_OFFLINE";
+    case 0xC0000268: return "STATUS_EVALUATION_EXPIRATION";
+    case 0xC0000269: return "STATUS_ILLEGAL_DLL_RELOCATION";
+    case 0xC000026A: return "STATUS_LICENSE_VIOLATION";
+    case 0xC000026B: return "STATUS_DLL_INIT_FAILED_LOGOFF";
+    case 0xC000026C: return "STATUS_DRIVER_UNABLE_TO_LOAD";
+    case 0xC000026D: return "STATUS_DFS_UNAVAILABLE";
+    case 0xC000026E: return "STATUS_VOLUME_DISMOUNTED";
+    case 0xC000026F: return "STATUS_WX86_INTERNAL_ERROR";
+    case 0xC0000270: return "STATUS_WX86_FLOAT_STACK_CHECK";
+    case 0xC0000271: return "STATUS_VALIDATE_CONTINUE";
+    case 0xC0000272: return "STATUS_NO_MATCH";
+    case 0xC0000273: return "STATUS_NO_MORE_MATCHES";
+    case 0xC0000275: return "STATUS_NOT_A_REPARSE_POINT";
+    case 0xC0000276: return "STATUS_IO_REPARSE_TAG_INVALID";
+    case 0xC0000277: return "STATUS_IO_REPARSE_TAG_MISMATCH";
+    case 0xC0000278: return "STATUS_IO_REPARSE_DATA_INVALID";
+    case 0xC0000279: return "STATUS_IO_REPARSE_TAG_NOT_HANDLED";
+    case 0xC0000280: return "STATUS_REPARSE_POINT_NOT_RESOLVED";
+    case 0xC0000281: return "STATUS_DIRECTORY_IS_A_REPARSE_POINT";
+    case 0xC0000282: return "STATUS_RANGE_LIST_CONFLICT";
+    case 0xC0000283: return "STATUS_SOURCE_ELEMENT_EMPTY";
+    case 0xC0000284: return "STATUS_DESTINATION_ELEMENT_FULL";
+    case 0xC0000285: return "STATUS_ILLEGAL_ELEMENT_ADDRESS";
+    case 0xC0000286: return "STATUS_MAGAZINE_NOT_PRESENT";
+    case 0xC0000287: return "STATUS_REINITIALIZATION_NEEDED";
+    case 0x80000288: return "STATUS_DEVICE_REQUIRES_CLEANING";
+    case 0x80000289: return "STATUS_DEVICE_DOOR_OPEN";
+    case 0xC000028A: return "STATUS_ENCRYPTION_FAILED";
+    case 0xC000028B: return "STATUS_DECRYPTION_FAILED";
+    case 0xC000028C: return "STATUS_RANGE_NOT_FOUND";
+    case 0xC000028D: return "STATUS_NO_RECOVERY_POLICY";
+    case 0xC000028E: return "STATUS_NO_EFS";
+    case 0xC000028F: return "STATUS_WRONG_EFS";
+    case 0xC0000290: return "STATUS_NO_USER_KEYS";
+    case 0xC0000291: return "STATUS_FILE_NOT_ENCRYPTED";
+    case 0xC0000292: return "STATUS_NOT_EXPORT_FORMAT";
+    case 0xC0000293: return "STATUS_FILE_ENCRYPTED";
+    case 0x40000294: return "STATUS_WAKE_SYSTEM";
+    case 0xC0000295: return "STATUS_WMI_GUID_NOT_FOUND";
+    case 0xC0000296: return "STATUS_WMI_INSTANCE_NOT_FOUND";
+    case 0xC0000297: return "STATUS_WMI_ITEMID_NOT_FOUND";
+    case 0xC0000298: return "STATUS_WMI_TRY_AGAIN";
+    case 0xC0000299: return "STATUS_SHARED_POLICY";
+    case 0xC000029A: return "STATUS_POLICY_OBJECT_NOT_FOUND";
+    case 0xC000029B: return "STATUS_POLICY_ONLY_IN_DS";
+    case 0xC000029C: return "STATUS_VOLUME_NOT_UPGRADED";
+    case 0xC000029D: return "STATUS_REMOTE_STORAGE_NOT_ACTIVE";
+    case 0xC000029E: return "STATUS_REMOTE_STORAGE_MEDIA_ERROR";
+    case 0xC000029F: return "STATUS_NO_TRACKING_SERVICE";
+    case 0xC00002A0: return "STATUS_SERVER_SID_MISMATCH";
+    case 0xC00002A1: return "STATUS_DS_NO_ATTRIBUTE_OR_VALUE";
+    case 0xC00002A2: return "STATUS_DS_INVALID_ATTRIBUTE_SYNTAX";
+    case 0xC00002A3: return "STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED";
+    case 0xC00002A4: return "STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS";
+    case 0xC00002A5: return "STATUS_DS_BUSY";
+    case 0xC00002A6: return "STATUS_DS_UNAVAILABLE";
+    case 0xC00002A7: return "STATUS_DS_NO_RIDS_ALLOCATED";
+    case 0xC00002A8: return "STATUS_DS_NO_MORE_RIDS";
+    case 0xC00002A9: return "STATUS_DS_INCORRECT_ROLE_OWNER";
+    case 0xC00002AA: return "STATUS_DS_RIDMGR_INIT_ERROR";
+    case 0xC00002AB: return "STATUS_DS_OBJ_CLASS_VIOLATION";
+    case 0xC00002AC: return "STATUS_DS_CANT_ON_NON_LEAF";
+    case 0xC00002AD: return "STATUS_DS_CANT_ON_RDN";
+    case 0xC00002AE: return "STATUS_DS_CANT_MOD_OBJ_CLASS";
+    case 0xC00002AF: return "STATUS_DS_CROSS_DOM_MOVE_FAILED";
+    case 0xC00002B0: return "STATUS_DS_GC_NOT_AVAILABLE";
+    case 0xC00002B1: return "STATUS_DIRECTORY_SERVICE_REQUIRED";
+    case 0xC00002B2: return "STATUS_REPARSE_ATTRIBUTE_CONFLICT";
+    case 0xC00002B3: return "STATUS_CANT_ENABLE_DENY_ONLY";
+    case 0xC00002B4: return "STATUS_FLOAT_MULTIPLE_FAULTS";
+    case 0xC00002B5: return "STATUS_FLOAT_MULTIPLE_TRAPS";
+    case 0xC00002B6: return "STATUS_DEVICE_REMOVED";
+    case 0xC00002B7: return "STATUS_JOURNAL_DELETE_IN_PROGRESS";
+    case 0xC00002B8: return "STATUS_JOURNAL_NOT_ACTIVE";
+    case 0xC00002B9: return "STATUS_NOINTERFACE";
+    case 0xC00002C1: return "STATUS_DS_ADMIN_LIMIT_EXCEEDED";
+    case 0xC00002C2: return "STATUS_DRIVER_FAILED_SLEEP";
+    case 0xC00002C3: return "STATUS_MUTUAL_AUTHENTICATION_FAILED";
+    case 0xC00002C4: return "STATUS_CORRUPT_SYSTEM_FILE";
+    case 0xC00002C5: return "STATUS_DATATYPE_MISALIGNMENT_ERROR";
+    case 0xC00002C6: return "STATUS_WMI_READ_ONLY";
+    case 0xC00002C7: return "STATUS_WMI_SET_FAILURE";
+    case 0xC00002C8: return "STATUS_COMMITMENT_MINIMUM";
+    case 0xC00002C9: return "STATUS_REG_NAT_CONSUMPTION";
+    case 0xC00002CA: return "STATUS_TRANSPORT_FULL";
+    case 0xC00002CB: return "STATUS_DS_SAM_INIT_FAILURE";
+    case 0xC00002CC: return "STATUS_ONLY_IF_CONNECTED";
+    case 0xC00002CD: return "STATUS_DS_SENSITIVE_GROUP_VIOLATION";
+    case 0xC00002CE: return "STATUS_PNP_RESTART_ENUMERATION";
+    case 0xC00002CF: return "STATUS_JOURNAL_ENTRY_DELETED";
+    case 0xC00002D0: return "STATUS_DS_CANT_MOD_PRIMARYGROUPID";
+    case 0xC00002D1: return "STATUS_SYSTEM_IMAGE_BAD_SIGNATURE";
+    case 0xC00002D2: return "STATUS_PNP_REBOOT_REQUIRED";
+    case 0xC00002D3: return "STATUS_POWER_STATE_INVALID";
+    case 0xC00002D4: return "STATUS_DS_INVALID_GROUP_TYPE";
+    case 0xC00002D5: return "STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN";
+    case 0xC00002D6: return "STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN";
+    case 0xC00002D7: return "STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER";
+    case 0xC00002D8: return "STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER";
+    case 0xC00002D9: return "STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER";
+    case 0xC00002DA: return "STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER";
+    case 0xC00002DB: return "STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER";
+    case 0xC00002DC: return "STATUS_DS_HAVE_PRIMARY_MEMBERS";
+    case 0xC00002DD: return "STATUS_WMI_NOT_SUPPORTED";
+    case 0xC00002DE: return "STATUS_INSUFFICIENT_POWER";
+    case 0xC00002DF: return "STATUS_SAM_NEED_BOOTKEY_PASSWORD";
+    case 0xC00002E0: return "STATUS_SAM_NEED_BOOTKEY_FLOPPY";
+    case 0xC00002E1: return "STATUS_DS_CANT_START";
+    case 0xC00002E2: return "STATUS_DS_INIT_FAILURE";
+    case 0xC00002E3: return "STATUS_SAM_INIT_FAILURE";
+    case 0xC00002E4: return "STATUS_DS_GC_REQUIRED";
+    case 0xC00002E5: return "STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY";
+    case 0xC00002E6: return "STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS";
+    case 0xC00002E7: return "STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED";
+    case 0xC00002E8: return "STATUS_MULTIPLE_FAULT_VIOLATION";
+    case 0xC0000300: return "STATUS_NOT_SUPPORTED_ON_SBS";
+    case 0xC0009898: return "STATUS_WOW_ASSERTION";
+    case 0xC0010001: return "DBG_NO_STATE_CHANGE";
+    case 0xC0010002: return "DBG_APP_NOT_IDLE";
+    case 0xC0020001: return "RPC_NT_INVALID_STRING_BINDING";
+    case 0xC0020002: return "RPC_NT_WRONG_KIND_OF_BINDING";
+    case 0xC0020003: return "RPC_NT_INVALID_BINDING";
+    case 0xC0020004: return "RPC_NT_PROTSEQ_NOT_SUPPORTED";
+    case 0xC0020005: return "RPC_NT_INVALID_RPC_PROTSEQ";
+    case 0xC0020006: return "RPC_NT_INVALID_STRING_UUID";
+    case 0xC0020007: return "RPC_NT_INVALID_ENDPOINT_FORMAT";
+    case 0xC0020008: return "RPC_NT_INVALID_NET_ADDR";
+    case 0xC0020009: return "RPC_NT_NO_ENDPOINT_FOUND";
+    case 0xC002000A: return "RPC_NT_INVALID_TIMEOUT";
+    case 0xC002000B: return "RPC_NT_OBJECT_NOT_FOUND";
+    case 0xC002000C: return "RPC_NT_ALREADY_REGISTERED";
+    case 0xC002000D: return "RPC_NT_TYPE_ALREADY_REGISTERED";
+    case 0xC002000E: return "RPC_NT_ALREADY_LISTENING";
+    case 0xC002000F: return "RPC_NT_NO_PROTSEQS_REGISTERED";
+    case 0xC0020010: return "RPC_NT_NOT_LISTENING";
+    case 0xC0020011: return "RPC_NT_UNKNOWN_MGR_TYPE";
+    case 0xC0020012: return "RPC_NT_UNKNOWN_IF";
+    case 0xC0020013: return "RPC_NT_NO_BINDINGS";
+    case 0xC0020014: return "RPC_NT_NO_PROTSEQS";
+    case 0xC0020015: return "RPC_NT_CANT_CREATE_ENDPOINT";
+    case 0xC0020016: return "RPC_NT_OUT_OF_RESOURCES";
+    case 0xC0020017: return "RPC_NT_SERVER_UNAVAILABLE";
+    case 0xC0020018: return "RPC_NT_SERVER_TOO_BUSY";
+    case 0xC0020019: return "RPC_NT_INVALID_NETWORK_OPTIONS";
+    case 0xC002001A: return "RPC_NT_NO_CALL_ACTIVE";
+    case 0xC002001B: return "RPC_NT_CALL_FAILED";
+    case 0xC002001C: return "RPC_NT_CALL_FAILED_DNE";
+    case 0xC002001D: return "RPC_NT_PROTOCOL_ERROR";
+    case 0xC002001F: return "RPC_NT_UNSUPPORTED_TRANS_SYN";
+    case 0xC0020021: return "RPC_NT_UNSUPPORTED_TYPE";
+    case 0xC0020022: return "RPC_NT_INVALID_TAG";
+    case 0xC0020023: return "RPC_NT_INVALID_BOUND";
+    case 0xC0020024: return "RPC_NT_NO_ENTRY_NAME";
+    case 0xC0020025: return "RPC_NT_INVALID_NAME_SYNTAX";
+    case 0xC0020026: return "RPC_NT_UNSUPPORTED_NAME_SYNTAX";
+    case 0xC0020028: return "RPC_NT_UUID_NO_ADDRESS";
+    case 0xC0020029: return "RPC_NT_DUPLICATE_ENDPOINT";
+    case 0xC002002A: return "RPC_NT_UNKNOWN_AUTHN_TYPE";
+    case 0xC002002B: return "RPC_NT_MAX_CALLS_TOO_SMALL";
+    case 0xC002002C: return "RPC_NT_STRING_TOO_LONG";
+    case 0xC002002D: return "RPC_NT_PROTSEQ_NOT_FOUND";
+    case 0xC002002E: return "RPC_NT_PROCNUM_OUT_OF_RANGE";
+    case 0xC002002F: return "RPC_NT_BINDING_HAS_NO_AUTH";
+    case 0xC0020030: return "RPC_NT_UNKNOWN_AUTHN_SERVICE";
+    case 0xC0020031: return "RPC_NT_UNKNOWN_AUTHN_LEVEL";
+    case 0xC0020032: return "RPC_NT_INVALID_AUTH_IDENTITY";
+    case 0xC0020033: return "RPC_NT_UNKNOWN_AUTHZ_SERVICE";
+    case 0xC0020034: return "EPT_NT_INVALID_ENTRY";
+    case 0xC0020035: return "EPT_NT_CANT_PERFORM_OP";
+    case 0xC0020036: return "EPT_NT_NOT_REGISTERED";
+    case 0xC0020037: return "RPC_NT_NOTHING_TO_EXPORT";
+    case 0xC0020038: return "RPC_NT_INCOMPLETE_NAME";
+    case 0xC0020039: return "RPC_NT_INVALID_VERS_OPTION";
+    case 0xC002003A: return "RPC_NT_NO_MORE_MEMBERS";
+    case 0xC002003B: return "RPC_NT_NOT_ALL_OBJS_UNEXPORTED";
+    case 0xC002003C: return "RPC_NT_INTERFACE_NOT_FOUND";
+    case 0xC002003D: return "RPC_NT_ENTRY_ALREADY_EXISTS";
+    case 0xC002003E: return "RPC_NT_ENTRY_NOT_FOUND";
+    case 0xC002003F: return "RPC_NT_NAME_SERVICE_UNAVAILABLE";
+    case 0xC0020040: return "RPC_NT_INVALID_NAF_ID";
+    case 0xC0020041: return "RPC_NT_CANNOT_SUPPORT";
+    case 0xC0020042: return "RPC_NT_NO_CONTEXT_AVAILABLE";
+    case 0xC0020043: return "RPC_NT_INTERNAL_ERROR";
+    case 0xC0020044: return "RPC_NT_ZERO_DIVIDE";
+    case 0xC0020045: return "RPC_NT_ADDRESS_ERROR";
+    case 0xC0020046: return "RPC_NT_FP_DIV_ZERO";
+    case 0xC0020047: return "RPC_NT_FP_UNDERFLOW";
+    case 0xC0020048: return "RPC_NT_FP_OVERFLOW";
+    case 0xC0030001: return "RPC_NT_NO_MORE_ENTRIES";
+    case 0xC0030002: return "RPC_NT_SS_CHAR_TRANS_OPEN_FAIL";
+    case 0xC0030003: return "RPC_NT_SS_CHAR_TRANS_SHORT_FILE";
+    case 0xC0030004: return "RPC_NT_SS_IN_NULL_CONTEXT";
+    case 0xC0030005: return "RPC_NT_SS_CONTEXT_MISMATCH";
+    case 0xC0030006: return "RPC_NT_SS_CONTEXT_DAMAGED";
+    case 0xC0030007: return "RPC_NT_SS_HANDLES_MISMATCH";
+    case 0xC0030008: return "RPC_NT_SS_CANNOT_GET_CALL_HANDLE";
+    case 0xC0030009: return "RPC_NT_NULL_REF_POINTER";
+    case 0xC003000A: return "RPC_NT_ENUM_VALUE_OUT_OF_RANGE";
+    case 0xC003000B: return "RPC_NT_BYTE_COUNT_TOO_SMALL";
+    case 0xC003000C: return "RPC_NT_BAD_STUB_DATA";
+    case 0xC0020049: return "RPC_NT_CALL_IN_PROGRESS";
+    case 0xC002004A: return "RPC_NT_NO_MORE_BINDINGS";
+    case 0xC002004B: return "RPC_NT_GROUP_MEMBER_NOT_FOUND";
+    case 0xC002004C: return "EPT_NT_CANT_CREATE";
+    case 0xC002004D: return "RPC_NT_INVALID_OBJECT";
+    case 0xC002004F: return "RPC_NT_NO_INTERFACES";
+    case 0xC0020050: return "RPC_NT_CALL_CANCELLED";
+    case 0xC0020051: return "RPC_NT_BINDING_INCOMPLETE";
+    case 0xC0020052: return "RPC_NT_COMM_FAILURE";
+    case 0xC0020053: return "RPC_NT_UNSUPPORTED_AUTHN_LEVEL";
+    case 0xC0020054: return "RPC_NT_NO_PRINC_NAME";
+    case 0xC0020055: return "RPC_NT_NOT_RPC_ERROR";
+    case 0x40020056: return "RPC_NT_UUID_LOCAL_ONLY";
+    case 0xC0020057: return "RPC_NT_SEC_PKG_ERROR";
+    case 0xC0020058: return "RPC_NT_NOT_CANCELLED";
+    case 0xC0030059: return "RPC_NT_INVALID_ES_ACTION";
+    case 0xC003005A: return "RPC_NT_WRONG_ES_VERSION";
+    case 0xC003005B: return "RPC_NT_WRONG_STUB_VERSION";
+    case 0xC003005C: return "RPC_NT_INVALID_PIPE_OBJECT";
+    case 0xC003005D: return "RPC_NT_INVALID_PIPE_OPERATION";
+    case 0xC003005E: return "RPC_NT_WRONG_PIPE_VERSION";
+    case 0xC003005F: return "RPC_NT_PIPE_CLOSED";
+    case 0xC0030060: return "RPC_NT_PIPE_DISCIPLINE_ERROR";
+    case 0xC0030061: return "RPC_NT_PIPE_EMPTY";
+    case 0xC0020062: return "RPC_NT_INVALID_ASYNC_HANDLE";
+    case 0xC0020063: return "RPC_NT_INVALID_ASYNC_CALL";
+    case 0x400200AF: return "RPC_NT_SEND_INCOMPLETE";
+    case 0xC0140001: return "STATUS_ACPI_INVALID_OPCODE";
+    case 0xC0140002: return "STATUS_ACPI_STACK_OVERFLOW";
+    case 0xC0140003: return "STATUS_ACPI_ASSERT_FAILED";
+    case 0xC0140004: return "STATUS_ACPI_INVALID_INDEX";
+    case 0xC0140005: return "STATUS_ACPI_INVALID_ARGUMENT";
+    case 0xC0140006: return "STATUS_ACPI_FATAL";
+    case 0xC0140007: return "STATUS_ACPI_INVALID_SUPERNAME";
+    case 0xC0140008: return "STATUS_ACPI_INVALID_ARGTYPE";
+    case 0xC0140009: return "STATUS_ACPI_INVALID_OBJTYPE";
+    case 0xC014000A: return "STATUS_ACPI_INVALID_TARGETTYPE";
+    case 0xC014000B: return "STATUS_ACPI_INCORRECT_ARGUMENT_COUNT";
+    case 0xC014000C: return "STATUS_ACPI_ADDRESS_NOT_MAPPED";
+    case 0xC014000D: return "STATUS_ACPI_INVALID_EVENTTYPE";
+    case 0xC014000E: return "STATUS_ACPI_HANDLER_COLLISION";
+    case 0xC014000F: return "STATUS_ACPI_INVALID_DATA";
+    case 0xC0140010: return "STATUS_ACPI_INVALID_REGION";
+    case 0xC0140011: return "STATUS_ACPI_INVALID_ACCESS_SIZE";
+    case 0xC0140012: return "STATUS_ACPI_ACQUIRE_GLOBAL_LOCK";
+    case 0xC0140013: return "STATUS_ACPI_ALREADY_INITIALIZED";
+    case 0xC0140014: return "STATUS_ACPI_NOT_INITIALIZED";
+    case 0xC0140015: return "STATUS_ACPI_INVALID_MUTEX_LEVEL";
+    case 0xC0140016: return "STATUS_ACPI_MUTEX_NOT_OWNED";
+    case 0xC0140017: return "STATUS_ACPI_MUTEX_NOT_OWNER";
+    case 0xC0140018: return "STATUS_ACPI_RS_ACCESS";
+    case 0xC0140019: return "STATUS_ACPI_INVALID_TABLE";
+    case 0xC0140020: return "STATUS_ACPI_REG_HANDLER_FAILED";
+    case 0xC0140021: return "STATUS_ACPI_POWER_REQUEST_FAILED";
+    case 0xC00A0001: return "STATUS_CTX_WINSTATION_NAME_INVALID";
+    case 0xC00A0002: return "STATUS_CTX_INVALID_PD";
+    case 0xC00A0003: return "STATUS_CTX_PD_NOT_FOUND";
+    case 0x400A0004: return "STATUS_CTX_CDM_CONNECT";
+    case 0x400A0005: return "STATUS_CTX_CDM_DISCONNECT";
+    case 0xC00A0006: return "STATUS_CTX_CLOSE_PENDING";
+    case 0xC00A0007: return "STATUS_CTX_NO_OUTBUF";
+    case 0xC00A0008: return "STATUS_CTX_MODEM_INF_NOT_FOUND";
+    case 0xC00A0009: return "STATUS_CTX_INVALID_MODEMNAME";
+    case 0xC00A000A: return "STATUS_CTX_RESPONSE_ERROR";
+    case 0xC00A000B: return "STATUS_CTX_MODEM_RESPONSE_TIMEOUT";
+    case 0xC00A000C: return "STATUS_CTX_MODEM_RESPONSE_NO_CARRIER";
+    case 0xC00A000D: return "STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE";
+    case 0xC00A000E: return "STATUS_CTX_MODEM_RESPONSE_BUSY";
+    case 0xC00A000F: return "STATUS_CTX_MODEM_RESPONSE_VOICE";
+    case 0xC00A0010: return "STATUS_CTX_TD_ERROR";
+    case 0xC00A0012: return "STATUS_CTX_LICENSE_CLIENT_INVALID";
+    case 0xC00A0013: return "STATUS_CTX_LICENSE_NOT_AVAILABLE";
+    case 0xC00A0014: return "STATUS_CTX_LICENSE_EXPIRED";
+    case 0xC00A0015: return "STATUS_CTX_WINSTATION_NOT_FOUND";
+    case 0xC00A0016: return "STATUS_CTX_WINSTATION_NAME_COLLISION";
+    case 0xC00A0017: return "STATUS_CTX_WINSTATION_BUSY";
+    case 0xC00A0018: return "STATUS_CTX_BAD_VIDEO_MODE";
+    case 0xC00A0022: return "STATUS_CTX_GRAPHICS_INVALID";
+    case 0xC00A0024: return "STATUS_CTX_NOT_CONSOLE";
+    case 0xC00A0026: return "STATUS_CTX_CLIENT_QUERY_TIMEOUT";
+    case 0xC00A0027: return "STATUS_CTX_CONSOLE_DISCONNECT";
+    case 0xC00A0028: return "STATUS_CTX_CONSOLE_CONNECT";
+    case 0xC00A002A: return "STATUS_CTX_SHADOW_DENIED";
+    case 0xC00A002B: return "STATUS_CTX_WINSTATION_ACCESS_DENIED";
+    case 0xC00A002E: return "STATUS_CTX_INVALID_WD";
+    case 0xC00A002F: return "STATUS_CTX_WD_NOT_FOUND";
+    case 0xC00A0030: return "STATUS_CTX_SHADOW_INVALID";
+    case 0xC00A0031: return "STATUS_CTX_SHADOW_DISABLED";
+    case 0xC00A0032: return "STATUS_RDP_PROTOCOL_ERROR";
+    case 0xC00A0033: return "STATUS_CTX_CLIENT_LICENSE_NOT_SET";
+    case 0xC00A0034: return "STATUS_CTX_CLIENT_LICENSE_IN_USE";
+    case 0xC0040035: return "STATUS_PNP_BAD_MPS_TABLE";
+    case 0xC0040036: return "STATUS_PNP_TRANSLATION_FAILED";
+    case 0xC0040037: return "STATUS_PNP_IRQ_TRANSLATION_FAILED";
+    default:         return "STATUS_UNKNOWN";
+    }
+}
+
+
+/*
+ * KsPrintf
+ *   This function is variable-argument, level-sensitive debug print routine.
+ *   If the specified debug level for the print statement is lower or equal
+ *   to the current debug level, the message will be printed.
+ *
+ * Arguments:
+ *   DebugPrintLevel - Specifies at which debugging level the string should
+ *                     be printed
+ *   DebugMessage - Variable argument ascii c string
+ *
+ * Return Value:
+ *   N/A
+ *
+ * NOTES: 
+ *   N/A
+ */
+
+VOID
+KsPrintf(
+    LONG  DebugPrintLevel,
+    PCHAR DebugMessage,
+    ...
+    )
+{
+    va_list  ap;
+
+    va_start(ap, DebugMessage);
+
+    if (DebugPrintLevel <= KsDebugLevel)
+    {
+        CHAR buffer[0x200];
+
+        /* Bound the expansion to the local buffer: the original used
+         * vsprintf, which can overrun the 0x200-byte kernel stack
+         * buffer for long messages.  _vsnprintf does not guarantee
+         * NUL termination on truncation, so terminate explicitly. */
+        _vsnprintf(buffer, sizeof(buffer) - 1, DebugMessage, ap);
+        buffer[sizeof(buffer) - 1] = '\0';
+
+        KdPrint(("TID:%8.8x: %s", PsGetCurrentThread(), buffer));
+    }
+
+    va_end(ap);
+
+} // KsPrintf()
+
+#endif
diff --git a/lnet/libcfs/winnt/winnt-fs.c b/lnet/libcfs/winnt/winnt-fs.c
new file mode 100644 (file)
index 0000000..128781b
--- /dev/null
@@ -0,0 +1,541 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (C) 2001 Cluster File Systems, Inc. <braam@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <libcfs/libcfs.h>
+
+const CHAR *dos_file_prefix = "\\??\\";
+
+/*
+ * cfs_filp_open
+ *     To open or create a file in kernel mode
+ *
+ * Arguments:
+ *   name:  name of the file to be opened or created, no dos path prefix
+ *   flags: open/creation attribute options
+ *   mode:  access mode/permission to open or create
+ *   err:   error code
+ *
+ * Return Value:
+ *   the pointer to the cfs_file_t or NULL if it fails
+ *
+ * Notes: 
+ *   N/A
+ */
+
+cfs_file_t *cfs_filp_open(const char *name, int flags, int mode, int *err)
+{
+    cfs_file_t *        fp = NULL;
+
+    NTSTATUS            Status;
+
+    OBJECT_ATTRIBUTES   ObjectAttributes;
+    HANDLE              FileHandle;
+    IO_STATUS_BLOCK     IoStatus;
+    ACCESS_MASK         DesiredAccess;
+    ULONG               CreateDisposition;
+    ULONG               ShareAccess;
+    ULONG               CreateOptions;
+
+    USHORT              NameLength = 0;
+    USHORT              PrefixLength = 0;
+
+    UNICODE_STRING      UnicodeName;
+    PWCHAR              UnicodeString = NULL;
+
+    ANSI_STRING         AnsiName;
+    PUCHAR              AnsiString = NULL;
+
+    /* Map the POSIX-style open flags onto NT access and share modes */
+
+    if (cfs_is_flag_set(flags, O_WRONLY)) {
+        DesiredAccess = (GENERIC_WRITE | SYNCHRONIZE);
+        ShareAccess = 0;
+    }  else if (cfs_is_flag_set(flags, O_RDWR)) {
+        DesiredAccess = (GENERIC_READ | GENERIC_WRITE | SYNCHRONIZE);
+        ShareAccess = FILE_SHARE_READ | FILE_SHARE_WRITE;
+    } else {
+        DesiredAccess = (GENERIC_READ | SYNCHRONIZE);
+        ShareAccess = FILE_SHARE_READ;
+    }
+
+    if (cfs_is_flag_set(flags, O_CREAT)) {
+        if (cfs_is_flag_set(flags, O_EXCL)) {
+            CreateDisposition = FILE_CREATE;
+        } else {
+            CreateDisposition = FILE_OPEN_IF;
+        }
+    } else {
+        CreateDisposition = FILE_OPEN;
+    }
+
+    /* O_TRUNC overrides the disposition chosen above */
+    if (cfs_is_flag_set(flags, O_TRUNC)) {
+        if (cfs_is_flag_set(flags, O_EXCL)) {
+            CreateDisposition = FILE_OVERWRITE;
+        } else {
+            CreateDisposition = FILE_OVERWRITE_IF;
+        }
+    }
+
+    CreateOptions = 0;
+
+    if (cfs_is_flag_set(flags, O_DIRECTORY)) {
+        cfs_set_flag(CreateOptions,  FILE_DIRECTORY_FILE);
+    }
+
+    if (cfs_is_flag_set(flags, O_SYNC)) {
+         cfs_set_flag(CreateOptions, FILE_WRITE_THROUGH);
+    }
+
+    if (cfs_is_flag_set(flags, O_DIRECT)) {
+         cfs_set_flag(CreateOptions, FILE_NO_INTERMEDIATE_BUFFERING);
+    }
+
+    /* Initialize the unicode path name for the specified file:
+     * relative names get the "\??\" DOS-device prefix prepended */
+
+    NameLength = (USHORT)strlen(name);
+
+    if (name[0] != '\\') {
+        PrefixLength = (USHORT)strlen(dos_file_prefix);
+    }
+
+    AnsiString = cfs_alloc( sizeof(CHAR) * (NameLength + PrefixLength + 1),
+                            CFS_ALLOC_ZERO);
+    if (NULL == AnsiString) {
+        if (err) *err = -ENOMEM;
+        return NULL;
+    }
+
+    UnicodeString = cfs_alloc( sizeof(WCHAR) * (NameLength + PrefixLength + 1),
+                               CFS_ALLOC_ZERO);
+
+    if (NULL == UnicodeString) {
+        if (err) *err = -ENOMEM;
+        cfs_free(AnsiString);
+        return NULL;
+    }
+
+    if (PrefixLength) {
+        RtlCopyMemory(&AnsiString[0], dos_file_prefix , PrefixLength);
+    }
+
+    RtlCopyMemory(&AnsiString[PrefixLength], name, NameLength);
+    NameLength += PrefixLength;
+
+    AnsiName.MaximumLength = NameLength + 1;
+    AnsiName.Length = NameLength;
+    AnsiName.Buffer = AnsiString;
+
+    UnicodeName.MaximumLength = (NameLength + 1) * sizeof(WCHAR);
+    UnicodeName.Length = 0;
+    UnicodeName.Buffer = (PWSTR)UnicodeString;
+
+    RtlAnsiStringToUnicodeString(&UnicodeName, &AnsiName, FALSE);
+
+    /* Setup the object attributes structure for the file. */
+
+    InitializeObjectAttributes(
+            &ObjectAttributes,
+            &UnicodeName,
+            OBJ_CASE_INSENSITIVE |
+            OBJ_KERNEL_HANDLE,
+            NULL,
+            NULL );
+
+    /* Open or create the file */
+
+    Status = ZwCreateFile(
+            &FileHandle,
+            DesiredAccess,
+            &ObjectAttributes,
+            &IoStatus,
+            0,
+            FILE_ATTRIBUTE_NORMAL,
+            ShareAccess,
+            CreateDisposition,
+            CreateOptions,
+            NULL,
+            0 );
+
+    /* Check the returned NTSTATUS: when ZwCreateFile fails outright,
+     * the IO_STATUS_BLOCK may not have been written, so the original
+     * test of IoStatus.Status could read an uninitialized value.
+     * err may be NULL, so every store through it is guarded. */
+
+    if (!NT_SUCCESS(Status)) {
+        if (err) *err = cfs_error_code(Status);
+        cfs_free(UnicodeString);
+        cfs_free(AnsiString);
+        return NULL;
+    }
+
+    /* Allocate the cfs_file_t: libcfs file object (the trailing
+     * NameLength bytes hold the copy of the file name) */
+
+    fp = cfs_alloc(sizeof(cfs_file_t) + NameLength, CFS_ALLOC_ZERO);
+
+    if (NULL == fp) {
+        Status = ZwClose(FileHandle);
+        ASSERT(NT_SUCCESS(Status));
+        if (err) *err = -ENOMEM;
+        cfs_free(UnicodeString);
+        cfs_free(AnsiString);
+        return NULL;
+    }
+
+    fp->f_handle = FileHandle;
+    strcpy(fp->f_name, name);
+    fp->f_flags = flags;
+    fp->f_mode  = (mode_t)mode;
+    fp->f_count = 1;
+    if (err) *err = 0;
+
+    /* free the memory of temporary name strings */
+    cfs_free(UnicodeString);
+    cfs_free(AnsiString);
+
+    return fp;
+}
+
+
+/*
+ * cfs_filp_close
+ *     To close the opened file and release the filp structure
+ *
+ * Arguments:
+ *   fp:   the pointer of the cfs_file_t structure
+ *
+ * Return Value:
+ *   ZERO: on success
+ *   Non-Zero: on failure
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int cfs_filp_close(cfs_file_t *fp)
+{
+    NTSTATUS status;
+
+    ASSERT(fp != NULL);
+    ASSERT(fp->f_handle != NULL);
+
+    /* drop the kernel file handle first ... */
+    status = ZwClose(fp->f_handle);
+    ASSERT(NT_SUCCESS(status));
+
+    /* ... then release the wrapper object itself */
+    cfs_free(fp);
+
+    return 0;
+}
+
+
+/*
+ * cfs_filp_read
+ *     To read data from the opened file
+ *
+ * Arguments:
+ *   fp:   the pointer of the cfs_file_t structure
+ *   buf:  pointer to the buffer to contain the data
+ *   nbytes: size in bytes to be read from the file
+ *   pos:  offset in file where reading starts, if pos
+ *         NULL, then read from current file offset
+ *
+ * Return Value:
+ *   Actual size read into the buffer in success case
+ *   Error code in failure case
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int cfs_filp_read(cfs_file_t *fp, void *buf, size_t nbytes, loff_t *pos)
+{
+    LARGE_INTEGER   offset;
+    NTSTATUS        Status;
+    IO_STATUS_BLOCK IoStatus;
+
+    int             rc = 0;
+
+    /* read starts at *pos when supplied, else at the cached offset */
+
+    if (pos != NULL) {
+        offset.QuadPart = *pos;
+    } else {
+        offset.QuadPart = fp->f_pos;
+    }
+
+    Status = ZwReadFile( fp->f_handle,
+                         0,
+                         NULL,
+                         NULL,
+                         &IoStatus,
+                         buf,
+                         nbytes,
+                         &offset,
+                         NULL );
+
+    /* Test the returned NTSTATUS, not IoStatus.Status: when ZwReadFile
+     * fails outright the IO_STATUS_BLOCK may not have been written.
+     * This also matches the check already used by cfs_filp_write(). */
+    if (!NT_SUCCESS(Status)) {
+        rc = cfs_error_code(Status);
+    } else {
+        /* Information holds the number of bytes actually read */
+        rc = (int)IoStatus.Information;
+        fp->f_pos = offset.QuadPart + rc;
+        if (pos != NULL) {
+            *pos = fp->f_pos;
+        }
+    }
+
+    return rc;
+}
+
+
+/*
+ * cfs_filp_write
+ *     To write specified data to the opened file
+ *
+ * Arguments:
+ *   fp:   the pointer of the cfs_file_t structure
+ *   buf:  pointer to the buffer containing the data
+ *   nbytes: size in bytes to be written to the file
+ *   pos:  offset in file where writing starts, if pos
+ *         NULL, then write to current file offset
+ *
+ * Return Value:
+ *   Actual size written into the buffer in success case
+ *   Error code in failure case
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int cfs_filp_write(cfs_file_t *fp, void *buf, size_t nbytes, loff_t *pos)
+{
+    LARGE_INTEGER   offset;
+    IO_STATUS_BLOCK iosb;
+    NTSTATUS        status;
+    int             rc = 0;
+
+    /* write starts at *pos when supplied, else at the cached offset */
+    offset.QuadPart = (pos != NULL) ? *pos : fp->f_pos;
+
+    status = ZwWriteFile( fp->f_handle,
+                          0,
+                          NULL,
+                          NULL,
+                          &iosb,
+                          buf,
+                          nbytes,
+                          &offset,
+                          NULL );
+
+    if (NT_SUCCESS(status)) {
+        /* Information holds the number of bytes actually written;
+         * advance the cached file offset past them */
+        rc = (int)iosb.Information;
+        fp->f_pos = offset.QuadPart + rc;
+        if (pos != NULL) {
+            *pos = fp->f_pos;
+        }
+    } else {
+        rc = cfs_error_code(status);
+    }
+
+    return rc;
+}
+
+
+NTSTATUS
+CompletionRoutine(
+    PDEVICE_OBJECT DeviceObject,
+    PIRP Irp,
+    PVOID Context)
+{
+    /* propagate the final I/O status into the issuer's stack IoSb */
+    *Irp->UserIosb = Irp->IoStatus;
+
+    /* signal the event the issuer is blocked on in cfs_filp_fsync() */
+    KeSetEvent(Irp->UserEvent, 0, FALSE);
+
+    /* free the Irp we allocated ourselves */
+    IoFreeIrp(Irp);
+
+    /* STATUS_MORE_PROCESSING_REQUIRED stops the I/O manager from
+     * touching the (now freed) Irp any further */
+    return STATUS_MORE_PROCESSING_REQUIRED;
+}
+
+
+/*
+ * cfs_filp_fsync
+ *     To sync the dirty data of the file to disk
+ *
+ * Arguments:
+ *   fp: the pointer of the cfs_file_t structure
+ *
+ * Return Value:
+ *   Zero:  in success case
+ *   Error code: in failure case
+ *
+ * Notes: 
+ *   Nt kernel doesn't export such a routine to flush a file,
+ *   we must allocate our own Irp and issue it to the file
+ *   system driver.
+ */
+
+int cfs_filp_fsync(cfs_file_t *fp)
+{
+
+    PFILE_OBJECT            FileObject;
+    PDEVICE_OBJECT          DeviceObject;
+
+    NTSTATUS                Status;
+    PIRP                    Irp;
+    KEVENT                  Event;
+    IO_STATUS_BLOCK         IoSb;
+    PIO_STACK_LOCATION      IrpSp;
+
+    /* translate the handle into the FILE_OBJECT / DEVICE_OBJECT pair
+     * needed to build the flush Irp by hand */
+
+    Status = ObReferenceObjectByHandle(
+                fp->f_handle,
+                FILE_WRITE_DATA,
+                NULL,
+                KernelMode,
+                (PVOID*)&FileObject,
+                NULL );
+
+    if (!NT_SUCCESS(Status)) {
+        return cfs_error_code(Status);
+    }
+
+    DeviceObject = IoGetRelatedDeviceObject(FileObject);
+
+    /* allocate a new Irp */
+
+    Irp = IoAllocateIrp(DeviceObject->StackSize, FALSE);
+
+    if (!Irp) {
+        ObDereferenceObject(FileObject);
+        return -ENOMEM;
+    }
+
+    /* initialize the event the completion routine will signal */
+    KeInitializeEvent(&Event, SynchronizationEvent, FALSE);
+
+    /* setup the Irp: the completion routine copies the result into
+     * IoSb and signals Event, both of which live on this stack frame */
+    Irp->UserEvent = &Event;
+    Irp->UserIosb = &IoSb;
+    Irp->RequestorMode = KernelMode;
+
+    Irp->Tail.Overlay.Thread = PsGetCurrentThread();
+    Irp->Tail.Overlay.OriginalFileObject = FileObject;
+
+    /* setup the Irp stack location */
+    IrpSp = IoGetNextIrpStackLocation(Irp);
+
+    IrpSp->MajorFunction = IRP_MJ_FLUSH_BUFFERS;
+    IrpSp->DeviceObject = DeviceObject;
+    IrpSp->FileObject = FileObject;
+
+    IoSetCompletionRoutine(Irp, CompletionRoutine, 0, TRUE, TRUE, TRUE);
+
+    /* issue the Irp to the underlying file system driver */
+    IoCallDriver(DeviceObject, Irp);
+
+    /* Wait non-alertable until the completion routine has run.  The
+     * original waited alertable (TRUE): an alert could return control
+     * here before completion, after which CompletionRoutine would
+     * write through UserIosb/UserEvent into stack memory that has
+     * already gone out of scope.  The last argument is the timeout
+     * pointer, so pass NULL (wait forever) rather than 0. */
+    KeWaitForSingleObject(&Event, Executive, KernelMode, FALSE, NULL);
+
+    /* cleanup our reference on the file object */
+    ObDereferenceObject(FileObject);
+
+    Status = IoSb.Status;
+
+    return cfs_error_code(Status);
+}
+
+/*
+ * cfs_get_file
+ *     To increase the reference of the file object
+ *
+ * Arguments:
+ *   fp:   the pointer of the cfs_file_t structure
+ *
+ * Return Value:
+ *   Zero:  in success case
+ *   Non-Zero: in failure case
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int cfs_get_file(cfs_file_t *fp)
+{
+    /* atomically bump the reference count; paired with cfs_put_file() */
+    InterlockedIncrement(&(fp->f_count));
+    return 0;
+}
+
+
+/*
+ * cfs_put_file
+ *     To decrease the reference of the file object
+ *
+ * Arguments:
+ *   fp:   the pointer of the cfs_file_t structure
+ *
+ * Return Value:
+ *   Zero:  in success case
+ *   Non-Zero: in failure case
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int cfs_put_file(cfs_file_t *fp)
+{
+    /* drop one reference; the final release closes and frees the file */
+    if (0 == InterlockedDecrement(&(fp->f_count)))
+        cfs_filp_close(fp);
+
+    return 0;
+}
+
+
+/*
+ * cfs_file_count
+ *   To query the reference count of the file object
+ *
+ * Arguments:
+ *   fp:   the pointer of the cfs_file_t structure
+ *
+ * Return Value:
+ *   the reference count of the file object
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int cfs_file_count(cfs_file_t *fp)
+{
+    /* unsynchronized snapshot of the reference count */
+    return (int)(fp->f_count);
+}
diff --git a/lnet/libcfs/winnt/winnt-lock.c b/lnet/libcfs/winnt/winnt-lock.c
new file mode 100644 (file)
index 0000000..12dbc67
--- /dev/null
@@ -0,0 +1,353 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <libcfs/libcfs.h>
+
+
+#if _X86_
+
+void __declspec (naked) FASTCALL
+atomic_add(
+    int i,
+    atomic_t *v
+    )
+{
+    /* naked FASTCALL: arguments arrive in registers, no prologue.
+     * ECX = i, EDX = v ; [EDX][0] = v->counter */
+
+    __asm {
+        lock add dword ptr [edx][0], ecx
+        ret
+    }
+}
+
+void __declspec (naked) FASTCALL
+atomic_sub(
+    int i,
+    atomic_t *v
+   ) 
+{
+    /* naked FASTCALL: ECX = i, EDX = v ; [EDX][0] = v->counter */
+
+    __asm {
+        lock sub dword ptr [edx][0], ecx
+        ret
+    }
+}
+
+void __declspec (naked) FASTCALL
+atomic_inc(
+    atomic_t *v
+    )
+{
+    /* equivalent to InterlockedIncrement((PULONG)&v->counter), but
+     * without returning the new value.
+     * ECX = v ; [ECX][0] = v->counter */
+
+    __asm {
+        lock inc dword ptr [ecx][0]
+        ret
+    }
+}
+
+void __declspec (naked) FASTCALL
+atomic_dec(
+    atomic_t *v
+    )
+{
+    /* ECX = v ; [ECX][0] = v->counter */
+
+    __asm {
+        lock dec dword ptr [ecx][0]
+        ret
+    }
+}
+
+int __declspec (naked) FASTCALL 
+atomic_sub_and_test(
+    int i,
+    atomic_t *v
+    )
+{
+
+    /* ECX = i, EDX = v ; [EDX][0] = v->counter.
+     * Returns (in EAX, via sete) 1 iff the subtraction left the
+     * counter at exactly zero. */
+
+    __asm {
+        xor eax, eax
+        lock sub dword ptr [edx][0], ecx
+        sete al
+        ret
+    }
+}
+
+int __declspec (naked) FASTCALL
+atomic_inc_and_test(
+    atomic_t *v
+    )
+{
+    /* ECX = v ; [ECX][0] = v->counter.
+     * Returns (in EAX, via sete) 1 iff the increment wrapped the
+     * counter to exactly zero. */
+
+    __asm {
+        xor eax, eax
+        lock inc dword ptr [ecx][0]
+        sete al
+        ret
+    }
+}
+
+int __declspec (naked) FASTCALL
+atomic_dec_and_test(
+    atomic_t *v
+    )
+{
+    /* ECX = v ; [ECX][0] = v->counter.
+     * Returns (in EAX, via sete) 1 iff the decrement brought the
+     * counter to exactly zero. */
+
+    __asm {
+        xor eax, eax
+        lock dec dword ptr [ecx][0]
+        sete al
+        ret
+    }
+}
+
+#else
+
+void FASTCALL
+atomic_add(
+    int i,
+    atomic_t *v
+    )
+{
+    /* atomic v->counter += i via the Windows interlocked primitive */
+    InterlockedExchangeAdd((PULONG)&v->counter, (LONG)i);
+}
+
+void FASTCALL
+atomic_sub(
+    int i,
+    atomic_t *v
+   ) 
+{
+    /* atomic v->counter -= i, expressed as adding the negation */
+    InterlockedExchangeAdd((PULONG)&v->counter, (LONG)(-i));
+}
+
+void FASTCALL
+atomic_inc(
+    atomic_t *v
+    )
+{
+    /* atomic v->counter++ */
+   InterlockedIncrement((PULONG)(&((v)->counter)));
+}
+
+void FASTCALL
+atomic_dec(
+    atomic_t *v
+    )
+{
+    /* atomic v->counter-- */
+    InterlockedDecrement((PULONG)(&((v)->counter)));
+}
+
+int FASTCALL 
+atomic_sub_and_test(
+    int i,
+    atomic_t *v
+    )
+{
+    int counter, result;
+
+    /* optimistic CAS loop: recompute and retry until the counter is
+     * swapped in without interference from another CPU */
+    do {
+
+        counter = v->counter;
+        result = counter - i;
+
+    } while ( InterlockedCompareExchange(
+                &(v->counter),
+                result,
+                counter) !=  counter);
+
+    /* true iff the subtraction brought the counter to zero */
+    return (result == 0);
+}
+
+int FASTCALL
+atomic_inc_and_test(
+    atomic_t *v
+    )
+{
+    int counter, result;
+
+    /* optimistic CAS loop: retry until the increment is applied
+     * without interference from another CPU */
+    do {
+
+        counter = v->counter;
+        result = counter + 1;
+
+    } while ( InterlockedCompareExchange(
+                &(v->counter),
+                result,
+                counter) !=  counter);
+
+    /* true iff the increment wrapped the counter to zero */
+    return (result == 0);
+}
+
+int FASTCALL
+atomic_dec_and_test(
+    atomic_t *v
+    )
+{
+    int counter, result;
+
+    /* Optimistic CAS loop: atomically replace counter with
+     * counter - 1.  The original computed counter + 1 here -- a
+     * copy/paste of atomic_inc_and_test -- so this "decrement"
+     * actually incremented the counter. */
+    do {
+
+        counter = v->counter;
+        result = counter - 1;
+
+    } while ( InterlockedCompareExchange(
+                &(v->counter),
+                result,
+                counter) !=  counter);
+
+    /* true iff the decrement brought the counter to zero */
+    return (result == 0);
+}
+
+#endif
+
+
+/*
+ * rw spinlock
+ */
+
+
+void
+rwlock_init(rwlock_t * rwlock)
+{
+    /* count: 0 = free, >0 = number of readers, -1 = writer holds it
+     * (see read_lock/write_lock below); guard serializes updates */
+    spin_lock_init(&rwlock->guard);
+    rwlock->count = 0;
+}
+
+void
+rwlock_fini(rwlock_t * rwlock)
+{
+    /* nothing to tear down: neither the guard spinlock nor the
+     * counter needs explicit cleanup on this platform */
+}
+
+void
+read_lock(rwlock_t * rwlock)
+{
+    cfs_task_t *task = cfs_current();
+    PTASK_SLOT  slot = NULL;
+
+    if (!task) {
+        /* no task slot bound to this thread: nowhere to stash the
+         * saved IRQL, so trap to the debugger and bail */
+        cfs_enter_debugger();
+        return;
+    }
+
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    ASSERT(slot->Magic == TASKSLT_MAGIC);
+
+    /* raise to DPC level for the lock's lifetime; read_unlock()
+     * restores the IRQL saved in the task slot */
+    slot->irql = KeRaiseIrqlToDpcLevel();
+
+    /* spin until no writer holds the lock (count < 0 means writer) */
+    for (;;) {
+        spin_lock(&rwlock->guard);
+        if (rwlock->count >= 0)
+            break;
+        spin_unlock(&rwlock->guard);
+    }
+
+    /* register ourselves as one more reader */
+    rwlock->count++;
+    spin_unlock(&rwlock->guard);
+}
+
+void
+read_unlock(rwlock_t * rwlock)
+{
+    cfs_task_t * task = cfs_current();
+    PTASK_SLOT   slot = NULL;
+
+    if (!task) {
+        /* no task slot: the saved IRQL is unreachable, trap */
+        cfs_enter_debugger();
+        return;
+    }
+
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    ASSERT(slot->Magic == TASKSLT_MAGIC);
+
+    spin_lock(&rwlock->guard);
+    ASSERT(rwlock->count > 0);
+    rwlock->count--;
+
+    /* Bug fix: the original tested "rwlock < 0" -- a pointer
+     * comparison that is never true -- instead of the reader count,
+     * so an underflow was never caught. */
+    if (rwlock->count < 0) {
+        cfs_enter_debugger();
+    }
+    spin_unlock(&rwlock->guard);
+
+    /* restore the IRQL saved by read_lock() */
+    KeLowerIrql(slot->irql);
+}
+
+void
+write_lock(rwlock_t * rwlock)
+{
+    cfs_task_t *task = cfs_current();
+    PTASK_SLOT  slot = NULL;
+
+    if (!task) {
+        /* no task slot bound to this thread: nowhere to stash the
+         * saved IRQL, so trap to the debugger and bail */
+        cfs_enter_debugger();
+        return;
+    }
+
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    ASSERT(slot->Magic == TASKSLT_MAGIC);
+
+    /* raise to DPC level for the lock's lifetime; write_unlock()
+     * restores the IRQL saved in the task slot */
+    slot->irql = KeRaiseIrqlToDpcLevel();
+
+    /* spin until the lock is completely free (no readers, no writer) */
+    for (;;) {
+        spin_lock(&rwlock->guard);
+        if (rwlock->count == 0)
+            break;
+        spin_unlock(&rwlock->guard);
+    }
+
+    /* -1 marks exclusive (writer) ownership */
+    rwlock->count = -1;
+    spin_unlock(&rwlock->guard);
+}
+
+void
+write_unlock(rwlock_t * rwlock)
+{
+    cfs_task_t *task = cfs_current();
+    PTASK_SLOT  slot = NULL;
+
+    if (!task) {
+        /* no task slot: the saved IRQL is unreachable, trap */
+        cfs_enter_debugger();
+        return;
+    }
+
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    ASSERT(slot->Magic == TASKSLT_MAGIC);
+
+    /* release exclusive ownership (-1 back to free) */
+    spin_lock(&rwlock->guard);
+    ASSERT(rwlock->count == -1);
+    rwlock->count = 0;
+    spin_unlock(&rwlock->guard);
+
+    /* restore the IRQL saved by write_lock() */
+    KeLowerIrql(slot->irql);
+}
diff --git a/lnet/libcfs/winnt/winnt-lwt.c b/lnet/libcfs/winnt/winnt-lwt.c
new file mode 100644 (file)
index 0000000..272cbcf
--- /dev/null
@@ -0,0 +1,20 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
+
diff --git a/lnet/libcfs/winnt/winnt-mem.c b/lnet/libcfs/winnt/winnt-mem.c
new file mode 100644 (file)
index 0000000..6b66a95
--- /dev/null
@@ -0,0 +1,332 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <libcfs/libcfs.h>
+
+
+cfs_mem_cache_t *cfs_page_t_slab = NULL;
+cfs_mem_cache_t *cfs_page_p_slab = NULL;
+
+/*
+ * cfs_alloc_page
+ *   To allocate the cfs_page_t and also 1 page of memory
+ *
+ * Arguments:
+ *   flags:  the allocation options
+ *
+ * Return Value:
+ *   pointer to the cfs_page_t strcture in success or
+ *   NULL in failure case
+ *
+ * Notes: 
+ *   N/A
+ */
+
+cfs_page_t * cfs_alloc_page(int flags)
+{
+    cfs_page_t *pg;
+    pg = cfs_mem_cache_alloc(cfs_page_t_slab, 0);
+    
+    if (NULL == pg) {
+        cfs_enter_debugger();
+        return NULL;
+    }
+
+    memset(pg, 0, sizeof(cfs_page_t));
+    pg->addr = cfs_mem_cache_alloc(cfs_page_p_slab, 0);
+    atomic_set(&pg->count, 1);
+
+    if (pg->addr) {
+        if (cfs_is_flag_set(flags, CFS_ALLOC_ZERO)) {
+            memset(pg->addr, 0, CFS_PAGE_SIZE);
+        }
+    } else {
+        cfs_enter_debugger();
+        cfs_mem_cache_free(cfs_page_t_slab, pg);
+        pg = NULL;
+    }
+
+    return pg;
+}
+
/*
 * cfs_free_page
 *   To free the cfs_page_t including the backing page
 *
 * Arguments:
 *   pg:  pointer to the cfs_page_t structure
 *
 * Return Value:
 *   N/A
 *
 * Notes:
 *   The descriptor must not be shared anymore: count must be <= 1.
 */
void cfs_free_page(cfs_page_t *pg)
{
    ASSERT(pg != NULL);
    ASSERT(pg->addr  != NULL);
    ASSERT(atomic_read(&pg->count) <= 1);

    /* release the backing page first, then the descriptor itself */
    cfs_mem_cache_free(cfs_page_p_slab, pg->addr);
    cfs_mem_cache_free(cfs_page_t_slab, pg);
}
+
+
+/*
+ * cfs_alloc
+ *   To allocate memory from system pool
+ *
+ * Arguments:
+ *   nr_bytes:  length in bytes of the requested buffer
+ *   flags:     flags indiction
+ *
+ * Return Value:
+ *   NULL: if there's no enough memory space in system
+ *   the address of the allocated memory in success.
+ *
+ * Notes: 
+ *   This operation can be treated as atomic.
+ */
+
+void *
+cfs_alloc(size_t nr_bytes, u_int32_t flags)
+{
+       void *ptr;
+
+    /* Ignore the flags: always allcoate from NonPagedPool */
+
+       ptr = ExAllocatePoolWithTag(NonPagedPool, nr_bytes, 'Lufs');
+
+       if (ptr != NULL && (flags & CFS_ALLOC_ZERO)) {
+               memset(ptr, 0, nr_bytes);
+    }
+
+    if (!ptr) {
+        cfs_enter_debugger();
+    }
+
+       return ptr;
+}
+
/*
 * cfs_free
 *   To free the specified memory back to the system pool
 *
 * Arguments:
 *   addr:   pointer to the buffer to be freed (must come from cfs_alloc)
 *
 * Return Value:
 *   N/A
 *
 * Notes:
 *    This operation can be treated as atomic.
 */

void
cfs_free(void *addr)
{
    ExFreePool(addr);
}
+
/*
 * cfs_alloc_large
 *   To allocate a large block of memory from the system pool
 *
 * Arguments:
 *   nr_bytes:  length in bytes of the requested buffer
 *
 * Return Value:
 *   NULL: if there's not enough memory in the system
 *   the address of the allocated memory on success.
 *
 * Notes:
 *   On this platform large allocations are not special-cased: this is
 *   a plain cfs_alloc without flags.
 */

void *
cfs_alloc_large(size_t nr_bytes)
{
    return cfs_alloc(nr_bytes, 0);
}
+
/*
 * cfs_free_large
 *   To free memory obtained via cfs_alloc_large
 *
 * Arguments:
 *   addr:   pointer to the buffer to be freed
 *
 * Return Value:
 *   N/A
 *
 * Notes:
 *   Counterpart of cfs_alloc_large; simply forwards to cfs_free.
 */

void
cfs_free_large(void *addr)
{
    cfs_free(addr);
}
+
+
+/*
+ * cfs_mem_cache_create
+ *   To create a SLAB cache
+ *
+ * Arguments:
+ *   name:   name string of the SLAB cache to be created
+ *   size:   size in bytes of SLAB entry buffer
+ *   offset: offset in the page
+ *   flags:  SLAB creation flags
+*
+ * Return Value:
+ *   The poitner of cfs_memory_cache structure in success.
+ *   NULL pointer in failure case.
+ *
+ * Notes: 
+ *   1, offset won't be used here.
+ *   2, it could be better to induce a lock to protect the access of the
+ *       SLAB structure on SMP if there's not outside lock protection.
+ *   3, parameters C/D are removed.
+ */
+
+cfs_mem_cache_t *
+cfs_mem_cache_create(
+    const char * name,
+    size_t size,
+    size_t offset,
+    unsigned long flags
+    )
+{
+    cfs_mem_cache_t * kmc = NULL;
+
+    /*  The name of the SLAB could not exceed 20 chars */
+
+    if (name && strlen(name) >= 20) {
+        goto errorout;
+    }
+
+    /* Allocate and initialize the SLAB strcture */
+
+    kmc = cfs_alloc (sizeof(cfs_mem_cache_t), 0);
+
+    if (NULL == kmc) {
+        goto errorout;
+    }
+
+    memset(kmc, 0, sizeof(cfs_mem_cache_t));
+
+    kmc->flags = flags;
+
+    if (name) {
+        strcpy(&kmc->name[0], name);
+    }
+
+    /* Initialize the corresponding LookAside list */
+
+    ExInitializeNPagedLookasideList(
+            &(kmc->npll),
+            NULL,
+            NULL,
+            0,
+            size,
+            'pnmk',
+            0);
+errorout:
+
+    return kmc;
+}
+
+/*
+ * cfs_mem_cache_destroy
+ *   To destroy the unused SLAB cache
+ *
+ * Arguments:
+ *   kmc: the SLAB cache to be destroied.
+ *
+ * Return Value:
+ *   0: in success case.
+ *   1: in failure case.
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int cfs_mem_cache_destroy (cfs_mem_cache_t * kmc)
+{
+    ASSERT(kmc != NULL);
+
+    ExDeleteNPagedLookasideList(&(kmc->npll));
+
+    cfs_free(kmc);
+
+    return 0;
+}
+
+/*
+ * cfs_mem_cache_alloc
+ *   To allocate an object (LookAside entry) from the SLAB
+ *
+ * Arguments:
+ *   kmc:   the SLAB cache to be allocated from.
+ *   flags: flags for allocation options
+ *
+ * Return Value:
+ *   object buffer address: in success case.
+ *   NULL: in failure case.
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void *cfs_mem_cache_alloc(cfs_mem_cache_t * kmc, int flags)
+{
+    void *buf = NULL;
+
+    buf = ExAllocateFromNPagedLookasideList(&(kmc->npll));
+
+    return buf;
+}
+
/*
 * cfs_mem_cache_free
 *   To free an object (LookAside entry) back to the SLAB cache
 *
 * Arguments:
 *   kmc: the SLAB cache the object came from
 *   buf: the pointer to the object to be freed
 *
 * Return Value:
 *   N/A
 */

void cfs_mem_cache_free(cfs_mem_cache_t * kmc, void * buf)
{
    ExFreeToNPagedLookasideList(&(kmc->npll), buf);
}
diff --git a/lnet/libcfs/winnt/winnt-module.c b/lnet/libcfs/winnt/winnt-module.c
new file mode 100644 (file)
index 0000000..2b6b008
--- /dev/null
@@ -0,0 +1,160 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+
+#define DEBUG_SUBSYSTEM S_LIBCFS
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+
+#define LIBCFS_MINOR 240
+
/*
 * libcfs_ioctl_getdata
 *   Copy a libcfs ioctl request from user space into [buf, end) and
 *   validate it: version, length bounds, and internal format.
 *
 * Returns 0 on success, a copy error or -EINVAL on bad input.
 */
int libcfs_ioctl_getdata(char *buf, char *end, void *arg)
{
        struct libcfs_ioctl_hdr *hdr;
        struct libcfs_ioctl_data *data;
        int err;
        ENTRY;

        hdr = (struct libcfs_ioctl_hdr *)buf;
        data = (struct libcfs_ioctl_data *)buf;

        /* first pass: fetch only the header to learn version and length */
        err = copy_from_user(buf, (void *)arg, sizeof(*hdr));
        if (err)
                RETURN(err);

        if (hdr->ioc_version != LIBCFS_IOCTL_VERSION) {
                CERROR(("LIBCFS: version mismatch kernel vs application\n"));
                RETURN(-EINVAL);
        }

        /* reject requests that would overflow the kernel buffer */
        if (hdr->ioc_len + buf >= end) {
                CERROR(("LIBCFS: user buffer exceeds kernel buffer\n"));
                RETURN(-EINVAL);
        }

        if (hdr->ioc_len < sizeof(struct libcfs_ioctl_data)) {
                CERROR(("LIBCFS: user buffer too small for ioctl\n"));
                RETURN(-EINVAL);
        }

        /* second pass: fetch the full payload now the length is trusted */
        err = copy_from_user(buf, (void *)arg, hdr->ioc_len);
        if (err)
                RETURN(err);

        if (libcfs_ioctl_is_invalid(data)) {
                CERROR(("LIBCFS: ioctl not correctly formatted\n"));
                RETURN(-EINVAL);
        }

        /* point the inline buffers at their offsets inside the bulk area */
        if (data->ioc_inllen1)
                data->ioc_inlbuf1 = &data->ioc_bulk[0];

        if (data->ioc_inllen2)
                data->ioc_inlbuf2 = &data->ioc_bulk[0] +
                        size_round(data->ioc_inllen1);

        RETURN(0);
}
+                                                                                                                                                                        
+extern struct cfs_psdev_ops          libcfs_psdev_ops;
+
+static int 
+libcfs_psdev_open(cfs_file_t * file)
+{ 
+       struct libcfs_device_userstate **pdu = NULL;
+       int    rc = 0;
+
+       pdu = (struct libcfs_device_userstate **)&file->private_data;
+       if (libcfs_psdev_ops.p_open != NULL)
+               rc = libcfs_psdev_ops.p_open(0, (void *)pdu);
+       else
+               return (-EPERM);
+       return rc;
+}
+
+/* called when closing /dev/device */
+static int 
+libcfs_psdev_release(cfs_file_t * file)
+{
+       struct libcfss_device_userstate *pdu;
+       int    rc = 0;
+
+       pdu = file->private_data;
+       if (libcfs_psdev_ops.p_close != NULL)
+               rc = libcfs_psdev_ops.p_close(0, (void *)pdu);
+       else
+               rc = -EPERM;
+       return rc;
+}
+
/*
 * libcfs_ioctl
 *   Device ioctl dispatch: validates the command range, handles the
 *   platform-specific requests (panic / memhog) and forwards everything
 *   else to the registered psdev ioctl handler.
 */
static int
libcfs_ioctl(cfs_file_t * file, unsigned int cmd, ulong_ptr arg)
{
       struct cfs_psdev_file    pfile;
       int    rc = 0;

       /* reject commands outside the libcfs ioctl range */
       if ( _IOC_TYPE(cmd) != IOC_LIBCFS_TYPE ||
            _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR  ||
            _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR ) {
               CDEBUG(D_IOCTL, ("invalid ioctl ( type %d, nr %d, size %d )\n",
                      _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)));
               return (-EINVAL);
       }

       /* Handle platform-dependent IOC requests */
       switch (cmd) {
       case IOC_LIBCFS_PANIC:
               if (!capable (CAP_SYS_BOOT))
                       return (-EPERM);
               CERROR(("debugctl-invoked panic"));
               /* deliberate bugcheck: the Windows equivalent of panic() */
               KeBugCheckEx('LUFS', (ULONG_PTR)libcfs_ioctl, (ULONG_PTR)NULL, (ULONG_PTR)NULL, (ULONG_PTR)NULL);

               return (0);
       case IOC_LIBCFS_MEMHOG:
               if (!capable (CAP_SYS_ADMIN))
                       return -EPERM;
               break;
       }

       pfile.off = 0;
       pfile.private_data = file->private_data;
       /* everything else goes to the generic psdev handler */
       if (libcfs_psdev_ops.p_ioctl != NULL)
               rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg);
       else
               rc = -EPERM;
       return (rc);
}
+
/* file operations of the libcfs control device; only ioctl, open and
 * release are implemented */
static struct file_operations libcfs_fops = {
    /* lseek: */  NULL,
    /* read: */   NULL,
    /* write: */  NULL,
    /* ioctl: */  libcfs_ioctl,
    /* open: */   libcfs_psdev_open,
    /* release:*/ libcfs_psdev_release
};
+
/* the libcfs pseudo device: minor number, device name, fops table */
cfs_psdev_t libcfs_dev = {
       LIBCFS_MINOR,
       "lnet",
       &libcfs_fops
};
+
diff --git a/lnet/libcfs/winnt/winnt-prim.c b/lnet/libcfs/winnt/winnt-prim.c
new file mode 100644 (file)
index 0000000..064b071
--- /dev/null
@@ -0,0 +1,650 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+
+
+/*
+ *  Thread routines
+ */
+
+/*
+ * cfs_thread_proc
+ *   Lustre thread procedure wrapper routine (It's an internal routine)
+ *
+ * Arguments:
+ *   context:  a structure of cfs_thread_context_t, containing
+ *             all the necessary parameters
+ *
+ * Return Value:
+ *   void: N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void
+cfs_thread_proc(
+    void * context
+    )
+{
+    cfs_thread_context_t * thread_context = 
+        (cfs_thread_context_t *) context;
+
+    /* Execute the specified function ... */
+
+    if (thread_context->func) {
+        (thread_context->func)(thread_context->arg);
+    }
+
+    /* Free the context memory */
+   
+    cfs_free(context);
+
+    /* Terminate this system thread */
+
+    PsTerminateSystemThread(STATUS_SUCCESS);
+}
+
/*
 * cfs_kernel_thread
 *   Create a system thread to execute the routine specified
 *
 * Arguments:
 *   func:  function to be executed in the thread
 *   arg:   argument transferred to func function
 *   flag:  thread creation flags (unused on this platform)
 *
 * Return Value:
 *   int:   0 on success or error codes
 */

int cfs_kernel_thread(int (*func)(void *), void *arg, int flag)
{
    cfs_handle_t  thread = NULL;
    NTSTATUS      status;
    cfs_thread_context_t * context = NULL;

    /* Allocate the context to be transferred to system thread */

    context = cfs_alloc(sizeof(cfs_thread_context_t), CFS_ALLOC_ZERO);

    if (!context) {
        return -ENOMEM;
    }

    context->func  = func;
    context->arg   = arg;

    /* Create system thread with the cfs_thread_proc wrapper */

    status = PsCreateSystemThread(
                &thread,
                (ACCESS_MASK)0L,
                0, 0, 0,
                cfs_thread_proc,
                context);

    if (!NT_SUCCESS(status)) {

        /* on success the wrapper frees the context; on failure we must */
        cfs_free(context);

        /* We need translate the nt status to linux error code */

        return cfs_error_code(status);
    }

    /* the thread runs detached - the handle is not needed, so close it
       (the old "query the thread id" comment was stale) */

    ZwClose(thread);

    return 0;
}
+
+
+/*
+ * Symbols routines
+ */
+
+
+static CFS_DECL_RWSEM(cfs_symbol_lock);
+CFS_LIST_HEAD(cfs_symbol_list);
+
+int MPSystem = FALSE;
+
+/*
+ * cfs_symbol_get
+ *   To query the specified symbol form the symbol table
+ *
+ * Arguments:
+ *   name:  the symbol name to be queried
+ *
+ * Return Value:
+ *   If the symbol is in the table, return the address of it.
+ *   If not, return NULL.
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void *
+cfs_symbol_get(const char *name)
+{
+    struct list_head    *walker;
+    struct cfs_symbol   *sym = NULL;
+
+    down_read(&cfs_symbol_lock);
+    list_for_each(walker, &cfs_symbol_list) {
+        sym = list_entry (walker, struct cfs_symbol, sym_list);
+        if (!strcmp(sym->name, name)) {
+            sym->ref ++;
+            break;
+        } 
+    } 
+    up_read(&cfs_symbol_lock);
+
+    if (sym != NULL) 
+        return sym->value;
+
+    return NULL;
+}
+
+/*
+ * cfs_symbol_put
+ *   To decrease the reference of  the specified symbol
+ *
+ * Arguments:
+ *   name:  the symbol name to be dereferred
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void
+cfs_symbol_put(const char *name)
+{
+    struct list_head    *walker;
+    struct cfs_symbol   *sym = NULL;
+
+    down_read(&cfs_symbol_lock);
+    list_for_each(walker, &cfs_symbol_list) {
+        sym = list_entry (walker, struct cfs_symbol, sym_list);
+        if (!strcmp(sym->name, name)) {
+            LASSERT(sym->ref > 0);
+            sym->ref--;
+            break;
+        } 
+    } 
+    up_read(&cfs_symbol_lock);
+
+    LASSERT(sym != NULL);
+}
+
+
/*
 * cfs_symbol_register
 *   Register a named symbol in the global symbol table.
 *
 * Arguments:
 *   name:  the symbol name (copied, truncated to CFS_SYMBOL_LEN)
 *   value: the value that the symbol stands for
 *
 * Return Value:
 *   Zero: succeeded (also when the name is already registered)
 *   Non-Zero: failed to register the symbol
 *
 * Notes:
 *   NOTE(review): strncpy may leave new->name unterminated when
 *   strlen(name) >= CFS_SYMBOL_LEN - confirm the buffer size.
 */

int
cfs_symbol_register(const char *name, const void *value)
{
    struct list_head    *walker;
    struct cfs_symbol   *sym = NULL;
    struct cfs_symbol   *new = NULL;

    /* build the new entry before taking the lock */
    new = cfs_alloc(sizeof(struct cfs_symbol), CFS_ALLOC_ZERO);
    if (!new) {
        return (-ENOMEM);
    }
    strncpy(new->name, name, CFS_SYMBOL_LEN);
    new->value = (void *)value;
    new->ref = 0;
    CFS_INIT_LIST_HEAD(&new->sym_list);

    down_write(&cfs_symbol_lock);
    /* keep the table duplicate-free: drop the new node if present */
    list_for_each(walker, &cfs_symbol_list) {
        sym = list_entry (walker, struct cfs_symbol, sym_list);
        if (!strcmp(sym->name, name)) {
            up_write(&cfs_symbol_lock);
            cfs_free(new);
            return 0; // already registered
        }
    }
    list_add_tail(&new->sym_list, &cfs_symbol_list);
    up_write(&cfs_symbol_lock);

    return 0;
}
+
/*
 * cfs_symbol_unregister
 *   Remove the named symbol from the global table and free it.
 *
 * Arguments:
 *   name:  the symbol name to be removed
 *
 * Return Value:
 *   N/A
 *
 * Notes:
 *   The symbol must have no outstanding references (ref == 0).
 */

void
cfs_symbol_unregister(const char *name)
{
    struct list_head    *walker;
    struct list_head    *nxt;
    struct cfs_symbol   *sym = NULL;

    down_write(&cfs_symbol_lock);
    /* _safe variant: the matched entry is deleted inside the loop */
    list_for_each_safe(walker, nxt, &cfs_symbol_list) {
        sym = list_entry (walker, struct cfs_symbol, sym_list);
        if (!strcmp(sym->name, name)) {
            LASSERT(sym->ref == 0);
            list_del (&sym->sym_list);
            cfs_free(sym);
            break;
        }
    }
    up_write(&cfs_symbol_lock);
}
+
+/*
+ * cfs_symbol_clean
+ *   To clean all the symbols
+ *
+ * Arguments:
+ *   N/A
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void
+cfs_symbol_clean()
+{
+    struct list_head    *walker;
+    struct cfs_symbol   *sym = NULL;
+
+    down_write(&cfs_symbol_lock);
+    list_for_each(walker, &cfs_symbol_list) {
+        sym = list_entry (walker, struct cfs_symbol, sym_list);
+        LASSERT(sym->ref == 0);
+        list_del (&sym->sym_list);
+        cfs_free(sym);
+    }
+    up_write(&cfs_symbol_lock);
+    return;
+}
+
+
+
+/*
+ * Timer routines
+ */
+
+
/* Timer dpc procedure: runs at DISPATCH_LEVEL when the kernel timer
 * fires; clears the armed flag, then invokes the user callback. */
static void
cfs_timer_dpc_proc (
    IN PKDPC Dpc,
    IN PVOID DeferredContext,
    IN PVOID SystemArgument1,
    IN PVOID SystemArgument2)
{
    cfs_timer_t *   timer;
    KIRQL           Irql;

    timer = (cfs_timer_t *) DeferredContext;

    /* clear the armed flag so cfs_timer_arm can rearm the timer */
    KeAcquireSpinLock(&(timer->Lock), &Irql);
    cfs_clear_flag(timer->Flags, CFS_TIMER_FLAG_TIMERED);
    KeReleaseSpinLock(&(timer->Lock), Irql);

    /* call the user specified timer procedure */
    timer->proc((unsigned long)(timer->arg));
}
+
/*
 * cfs_timer_init
 *   To initialize the cfs_timer_t
 *
 * Arguments:
 *   timer:  the cfs_timer to be initialized
 *   func:   the timer callback procedure
 *   arg:    argument for the callback proc
 *
 * Return Value:
 *   N/A
 */

void cfs_timer_init(cfs_timer_t *timer, void (*func)(unsigned long), void *arg)
{
    memset(timer, 0, sizeof(cfs_timer_t));

    timer->proc = func;
    timer->arg  = arg;

    /* the DPC invokes cfs_timer_dpc_proc with this timer as context */
    KeInitializeSpinLock(&(timer->Lock));
    KeInitializeTimer(&timer->Timer);
    KeInitializeDpc (&timer->Dpc, cfs_timer_dpc_proc, timer);

    cfs_set_flag(timer->Flags, CFS_TIMER_FLAG_INITED);
}
+
/*
 * cfs_timer_done
 *   To finalize the cfs_timer_t - currently a no-op: kernel timer
 *   and DPC objects need no explicit teardown.
 *
 * Arguments:
 *   timer:  the cfs_timer to be cleaned up
 *
 * Return Value:
 *   N/A
 */

void cfs_timer_done(cfs_timer_t *timer)
{
    return;
}
+
/*
 * cfs_timer_arm
 *   Schedule the timer to fire at @deadline (no-op if already armed).
 *
 * Arguments:
 *   timer:    the cfs_timer to be armed
 *   deadline: timeout value to wake up the timer
 *
 * Return Value:
 *   N/A
 */

void cfs_timer_arm(cfs_timer_t *timer, cfs_time_t deadline)
{
    LARGE_INTEGER   timeout;
    KIRQL           Irql;

    KeAcquireSpinLock(&(timer->Lock), &Irql);
    if (!cfs_is_flag_set(timer->Flags, CFS_TIMER_FLAG_TIMERED)){

        /* KeSetTimer: negative value means a relative timeout in
           100ns units; this converts "deadline" HZ ticks to 100ns */
        timeout.QuadPart = (LONGLONG)-1*1000*1000*10/HZ*deadline;

        if (KeSetTimer(&timer->Timer, timeout, &timer->Dpc )) {
            cfs_set_flag(timer->Flags, CFS_TIMER_FLAG_TIMERED);
        }

        timer->deadline = deadline;
    }

    KeReleaseSpinLock(&(timer->Lock), Irql);
}
+
/*
 * cfs_timer_disarm
 *   Cancel a pending timer and clear its armed flag.
 *
 * Arguments:
 *   timer:  the cfs_timer to be discarded
 *
 * Return Value:
 *   N/A
 */

void cfs_timer_disarm(cfs_timer_t *timer)
{
    KIRQL   Irql;

    KeAcquireSpinLock(&(timer->Lock), &Irql);
    KeCancelTimer(&(timer->Timer));
    cfs_clear_flag(timer->Flags, CFS_TIMER_FLAG_TIMERED);
    KeReleaseSpinLock(&(timer->Lock), Irql);
}
+
+
+/*
+ * cfs_timer_is_armed
+ *   To check the timer is scheduled or not
+ *
+ * Arguments:
+ *   timer:  the cfs_timer to be checked
+ *
+ * Return Value:
+ *   1:  if it's armed.
+ *   0:  if it's not.
+ *
+ * Notes: 
+ *   N/A
+ */
+
+int cfs_timer_is_armed(cfs_timer_t *timer)
+{
+    int     rc = 0;
+    KIRQL   Irql;
+
+    KeAcquireSpinLock(&(timer->Lock), &Irql);
+    if (cfs_is_flag_set(timer->Flags, CFS_TIMER_FLAG_TIMERED)) {
+        rc = 1;
+    }
+    KeReleaseSpinLock(&(timer->Lock), Irql);
+
+    return rc;
+}
+
/*
 * cfs_timer_deadline
 *   To query the deadline of the timer (the value last passed to
 *   cfs_timer_arm; read without locking).
 *
 * Arguments:
 *   timer:  the cfs_timer to be queried
 *
 * Return Value:
 *   the deadline value
 */

cfs_time_t cfs_timer_deadline(cfs_timer_t * timer)
{
    return timer->deadline;
}
+
/*
 * daemonize routine stub - system threads on this platform are
 * already detached from any user process, so nothing to do
 */

void cfs_daemonize(char *str)
{
    return;
}
+
+/*
+ *  routine related with sigals
+ */
+
/* stub: no signal support on this platform - report an empty mask */
cfs_sigset_t cfs_get_blockedsigs()
{
        return 0;
}
+
/* stub: nothing to block; returns an empty "previous" mask */
cfs_sigset_t cfs_block_allsigs()
{
        return 0;
}
+
/* stub: nothing to block; "bit" is ignored */
cfs_sigset_t cfs_block_sigs(sigset_t bit)
{
        return 0;
}
+
/* stub: no saved mask to restore */
void cfs_restore_sigs(cfs_sigset_t old)
{
}
+
/* stub: signals never pend on this platform */
int cfs_signal_pending(void)
{
    return 0;
}
+
/* stub: nothing pending to clear */
void cfs_clear_sigpending(void)
{
    return;
}
+
/**
 **  Initialize routines
 **/

/*
 * libcfs_arch_init
 *   Windows-specific libcfs bring-up: detects MP vs UP kernel, creates
 *   the page slabs and initializes the task manager, the proc fs
 *   emulation and the TDI networking data.
 *
 * Return Value:
 *   0 on success, negative errno on failure; partially initialized
 *   state is torn down before returning an error.
 */
int
libcfs_arch_init(void)
{
    int         rc;

    spinlock_t  lock;
    /* Workground to check the system is MP build or UP build */
    spin_lock_init(&lock);
    spin_lock(&lock);
    MPSystem = (int)lock.lock;
    /* MP build system: it's a real spin, for UP build system, it
       only raises the IRQL to DISPATCH_LEVEL */
    spin_unlock(&lock);

    /* create slab memory caches for page alloctors */
    cfs_page_t_slab = cfs_mem_cache_create(
        "CPGT", sizeof(cfs_page_t), 0, 0 );

    cfs_page_p_slab = cfs_mem_cache_create(
        "CPGP", CFS_PAGE_SIZE, 0, 0 );

    if ( cfs_page_t_slab == NULL ||
         cfs_page_p_slab == NULL ){
        rc = -ENOMEM;
        goto errorout;
    }

    rc = init_task_manager();

    if (rc != 0) {
        cfs_enter_debugger();
        KdPrint(("winnt-prim.c:libcfs_arch_init: error initializing task manager ...\n"));
        goto errorout;
    }

    /* initialize the proc file system */
    rc = proc_init_fs();

    if (rc != 0) {
        cfs_enter_debugger();
        KdPrint(("winnt-prim.c:libcfs_arch_init: error initializing proc fs ...\n"));
        cleanup_task_manager();
        goto errorout;
    }

    /* initialize the tdi data */
    rc = ks_init_tdi_data();

    if (rc != 0) {
        cfs_enter_debugger();
        KdPrint(("winnt-prim.c:libcfs_arch_init: error initializing tdi ...\n"));
        proc_destroy_fs();
        cleanup_task_manager();
        goto errorout;
    }

errorout:

    if (rc != 0) {
        /* common failure path: release the page slabs */
        if (cfs_page_t_slab) {
            cfs_mem_cache_destroy(cfs_page_t_slab);
        }
        if (cfs_page_p_slab) {
            cfs_mem_cache_destroy(cfs_page_p_slab);
        }
    }

    return rc;
}
+
/*
 * libcfs_arch_cleanup
 *   Tear down everything libcfs_arch_init set up, in reverse order:
 *   tdi data, the proc fs tree, then the page slabs.
 */
void
libcfs_arch_cleanup(void)
{
    /* finalize the tdi data */
    ks_fini_tdi_data();

    /* destroy the whole proc fs tree and nodes */
    proc_destroy_fs();

    /* destroy the page descriptor / page buffer cache slabs */
    if (cfs_page_t_slab) {
        cfs_mem_cache_destroy(cfs_page_t_slab);
    }

    if (cfs_page_p_slab) {
        cfs_mem_cache_destroy(cfs_page_p_slab);
    }

    return;
}
+
+EXPORT_SYMBOL(libcfs_arch_init);
+EXPORT_SYMBOL(libcfs_arch_cleanup);
diff --git a/lnet/libcfs/winnt/winnt-proc.c b/lnet/libcfs/winnt/winnt-proc.c
new file mode 100644 (file)
index 0000000..ebce30d
--- /dev/null
@@ -0,0 +1,1990 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+#include "tracefile.h"
+
+#ifdef __KERNEL__
+
+
+/*
+ *  /proc emulator routines ...
+ */
+
+/* The root node of the proc fs emulation: /proc */
+cfs_proc_entry_t *              proc_fs_root = NULL;
+
+
+/* The sys root: /proc/sys */
+cfs_proc_entry_t *              proc_sys_root = NULL;
+
+
+/* The sys root: /proc/dev | to implement misc device */
+
+cfs_proc_entry_t *              proc_dev_root = NULL;
+
+
+/* SLAB object for cfs_proc_entry_t allocation */
+
+cfs_mem_cache_t *               proc_entry_cache = NULL;
+
+/* root node for sysctl table */
+
+cfs_sysctl_table_header_t       root_table_header;
+
+/* The global lock to protect all the access */
+
+#if LIBCFS_PROCFS_SPINLOCK
+spinlock_t                      proc_fs_lock;
+
+#define INIT_PROCFS_LOCK()      spin_lock_init(&proc_fs_lock)
+#define LOCK_PROCFS()           spin_lock(&proc_fs_lock)
+#define UNLOCK_PROCFS()         spin_unlock(&proc_fs_lock)
+
+#else
+
+mutex_t                         proc_fs_lock;
+
+#define INIT_PROCFS_LOCK()      init_mutex(&proc_fs_lock)
+#define LOCK_PROCFS()           mutex_down(&proc_fs_lock)
+#define UNLOCK_PROCFS()         mutex_up(&proc_fs_lock)
+
+#endif
+
/*
 * proc_file_read
 *   Generic read for proc entries: repeatedly calls the entry's
 *   read_proc callback into a scratch page and copies the result out
 *   to the user buffer until EOF or the request is satisfied.
 *
 * NOTE(review): "buf" is declared const but is written through a
 * (void *) cast in copy_to_user - the const qualifier looks wrong;
 * confirm against the file_operations prototype.
 */
static ssize_t
proc_file_read(struct file * file, const char * buf, size_t nbytes, loff_t *ppos)
{
    char    *page;
    ssize_t retval=0;
    int eof=0;
    ssize_t n, count;
    char    *start;
    cfs_proc_entry_t * dp;

    dp = (cfs_proc_entry_t  *) file->private_data;
    if (!(page = (char*) cfs_alloc(CFS_PAGE_SIZE, 0)))
        return -ENOMEM;

    while ((nbytes > 0) && !eof) {

        /* read at most one PROC_BLOCK_SIZE chunk per iteration */
        count = min_t(size_t, PROC_BLOCK_SIZE, nbytes);

        start = NULL;
        if (dp->read_proc) {
            n = dp->read_proc( page, &start, (long)*ppos,
                               count, &eof, dp->data);
        } else
            break;

        if (!start) {
            /*
             * For proc files that are less than 4k
             */
            start = page + *ppos;
            n -= (ssize_t)(*ppos);
            if (n <= 0)
                break;
            if (n > count)
                n = count;
        }
        if (n == 0)
            break;  /* End of file */
        if (n < 0) {
            if (retval == 0)
                retval = n;
            break;
        }

        /* copy_to_user returns the number of bytes NOT copied */
        n -= copy_to_user((void *)buf, start, n);
        if (n == 0) {
            if (retval == 0)
                retval = -EFAULT;
            break;
        }

        *ppos += n;
        nbytes -= n;
        buf += n;
        retval += n;
    }
    cfs_free(page);

    return retval;
}
+
/*
 * proc_file_write
 *   Generic write for proc entries: forwards the user buffer to the
 *   entry's write_proc callback; -EIO if the entry is read-only.
 */
static ssize_t
proc_file_write(struct file * file, const char * buffer,
                size_t count, loff_t *ppos)
{
    cfs_proc_entry_t  * dp;

    dp = (cfs_proc_entry_t *) file->private_data;

    if (!dp->write_proc)
        return -EIO;

    /* FIXME: does this routine need ppos?  probably... */
    return dp->write_proc(file, buffer, count, dp->data);
}
+
/* default vtable for proc entries: only read/write are provided */
struct file_operations proc_file_operations = {
    /*lseek:*/      NULL, //proc_file_lseek,
    /*read:*/       proc_file_read,
    /*write:*/      proc_file_write,
    /*ioctl:*/      NULL,
    /*open:*/       NULL,
    /*release:*/    NULL
};
+
+/* allocate proc entry block */
+
+cfs_proc_entry_t *
+proc_alloc_entry()
+{
+    cfs_proc_entry_t * entry = NULL;
+
+    entry = cfs_mem_cache_alloc(proc_entry_cache, 0);
+    if (!entry) {
+        return NULL;
+    }
+
+    memset(entry, 0, sizeof(cfs_proc_entry_t));
+
+    entry->magic = CFS_PROC_ENTRY_MAGIC;
+    RtlInitializeSplayLinks(&(entry->s_link));
+    entry->proc_fops = &proc_file_operations;
+
+    return entry;
+}
+
/* free the proc entry block back to its SLAB cache */

void
proc_free_entry(cfs_proc_entry_t * entry)

{
    /* the magic guards against freeing a corrupt/foreign pointer */
    ASSERT(entry->magic == CFS_PROC_ENTRY_MAGIC);

    cfs_mem_cache_free(proc_entry_cache, entry);
}
+
/* dissect the path string for a given full proc path */

/*
 * Split a '/'-separated proc path into its first component and the rest.
 *
 * path:      full path string (leading '/' characters are skipped);
 *            the string is not modified, *first/*remain point into it.
 * first:     out - start of the first component, or NULL if none.
 * first_len: out - length of the first component (0 if none).
 * remain:    out - text after the first separator, or NULL when the
 *            first component is the last one.
 *
 * Fix: removed the unused local variable 'j'.
 */
void
proc_dissect_name(
    char *path,
    char **first,
    int  *first_len,
    char **remain
    )
{
    int i = 0, len = 0;

    *first = *remain = NULL;
    *first_len = 0;

    len = strlen(path);

    /* skip any leading '/' separators */
    while (i < len && (path[i] == '/')) i++;

    if (i < len) {

        *first = path + i;
        /* scan to the end of the first component */
        while (i < len && (path[i] != '/')) i++;
        *first_len = (path + i - *first);

        /* only report a remainder when something follows the '/' */
        if (i + 1 < len) {
            *remain = path + i + 1;
        }
    }
}
+
/* search the children entries of the parent entry */

/*
 * Look up the child named 'name' in 'parent's splay tree of children.
 * Names are compared with RtlCompareString (TRUE => case-insensitive).
 * On a hit the matched node is splayed to the root of the tree
 * (move-to-root heuristic) and returned; on a miss NULL is returned.
 * Callers in this file invoke this under LOCK_PROCFS().
 */
cfs_proc_entry_t *
proc_search_splay (
    cfs_proc_entry_t *  parent,
    char *              name
    )
{
    cfs_proc_entry_t *  node;
    PRTL_SPLAY_LINKS    link;

    ASSERT(parent->magic == CFS_PROC_ENTRY_MAGIC);
    ASSERT(cfs_is_flag_set(parent->flags, CFS_PROC_FLAG_DIRECTORY));

    link = parent->root;

    while (link) {

        ANSI_STRING ename,nname;
        long        result;

        node = CONTAINING_RECORD(link, cfs_proc_entry_t, s_link);

        ASSERT(node->magic == CFS_PROC_ENTRY_MAGIC);

        /*  Compare the prefix in the tree with the full name */

        RtlInitAnsiString(&ename, name);
        RtlInitAnsiString(&nname, node->name);

        result = RtlCompareString(&nname, &ename,TRUE);

        if (result > 0) {

            /*  The prefix is greater than the full name
                so we go down the left child          */

            link = RtlLeftChild(link);

        } else if (result < 0) {

            /*  The prefix is less than the full name
                so we go down the right child      */
            //

            link = RtlRightChild(link);

        } else {

            /*  We got the entry in the splay tree and
                make it root node instead           */

            parent->root = RtlSplay(link);

            return node;
        }

        /* we need continue searching down the tree ... */
    }

    /*  There's no such entry in the splay tree */

    return NULL;
}
+
/*
 * Insert 'child' into 'parent's splay tree, keyed by its name
 * (case-insensitive compare, matching proc_search_splay).  On success
 * the child is flagged CFS_PROC_FLAG_ATTACHED, the parent's link count
 * is bumped and TRUE is returned.  A name collision with a *different*
 * node returns FALSE; re-inserting the very same node is tolerated.
 */
int
proc_insert_splay (
    cfs_proc_entry_t * parent,
    cfs_proc_entry_t * child
    )
{
    cfs_proc_entry_t * entry;

    ASSERT(parent != NULL && child != NULL);
    ASSERT(parent->magic == CFS_PROC_ENTRY_MAGIC);
    ASSERT(child->magic == CFS_PROC_ENTRY_MAGIC);
    ASSERT(cfs_is_flag_set(parent->flags, CFS_PROC_FLAG_DIRECTORY));

    if (!parent->root) {
        /* empty tree: the child becomes the root */
        parent->root = &(child->s_link);
    } else {
        entry = CONTAINING_RECORD(parent->root, cfs_proc_entry_t, s_link);
        while (TRUE) {
            long        result;
            ANSI_STRING ename, cname;

            ASSERT(entry->magic == CFS_PROC_ENTRY_MAGIC);

            RtlInitAnsiString(&ename, entry->name);
            RtlInitAnsiString(&cname, child->name);

            result = RtlCompareString(&ename, &cname,TRUE);

            if (result == 0) {
                /* duplicate name: only tolerated when it is the very
                 * same node, otherwise refuse the insertion */
                cfs_enter_debugger();
                if (entry == child) {
                    break;
                }
                return FALSE;
            }

            if (result > 0) {
                /* child sorts before entry: descend (or insert) left */
                if (RtlLeftChild(&entry->s_link) == NULL) {
                    RtlInsertAsLeftChild(&entry->s_link, &child->s_link);
                    break;
                } else {
                    entry = CONTAINING_RECORD( RtlLeftChild(&entry->s_link),
                                               cfs_proc_entry_t, s_link);
                }
            } else {
                /* child sorts after entry: descend (or insert) right */
                if (RtlRightChild(&entry->s_link) == NULL) {
                    RtlInsertAsRightChild(&entry->s_link, &child->s_link);
                    break;
                } else {
                    entry = CONTAINING_RECORD( RtlRightChild(&entry->s_link),
                                               cfs_proc_entry_t, s_link );
                }
            }
        }
    }

    cfs_set_flag(child->flags, CFS_PROC_FLAG_ATTACHED);
    parent->nlink++;

    return TRUE;
}
+
+
/* remove a child entry from the splay tree */

/*
 * Detach 'child' from 'parent's splay tree and drop the parent's link
 * count.  The child must already be attached.  Returns TRUE on success,
 * FALSE when the child cannot be found under the parent.
 */
int
proc_remove_splay (
    cfs_proc_entry_t *  parent,
    cfs_proc_entry_t *  child
    )
{
    cfs_proc_entry_t * entry = NULL;

    ASSERT(parent != NULL && child != NULL);
    ASSERT(parent->magic == CFS_PROC_ENTRY_MAGIC);
    ASSERT(child->magic == CFS_PROC_ENTRY_MAGIC);
    ASSERT(cfs_is_flag_set(parent->flags, CFS_PROC_FLAG_DIRECTORY));
    ASSERT(cfs_is_flag_set(child->flags, CFS_PROC_FLAG_ATTACHED));

    /* proc_search_splay also splays the found node to the root */
    entry = proc_search_splay(parent, child->name);

    if (entry) {
        ASSERT(entry == child);
        parent->root = RtlDelete(&(entry->s_link));
        parent->nlink--;
    } else {
        cfs_enter_debugger();
        return FALSE;
    }

    return TRUE;
}
+
+
/* search a node inside the proc fs tree */

/*
 * Resolve the multi-component path 'name' starting at 'root' by
 * repeatedly splitting off the first component (proc_dissect_name) and
 * descending through the per-directory splay trees.  Returns the entry
 * of the final component, or NULL when any component is missing, a
 * component is 0x20 characters or longer, or the temporary buffer
 * cannot be allocated.  Callers invoke this under LOCK_PROCFS().
 */
cfs_proc_entry_t *
proc_search_entry(
    char *              name,
    cfs_proc_entry_t *  root
    )
{
    cfs_proc_entry_t *  entry;
    cfs_proc_entry_t *  parent;
    char *first, *remain;
    int   flen;
    char *ename = NULL;   /* scratch: 0x20 name bytes + trailing NUL */

    parent = root;
    entry = NULL;

    ename = cfs_alloc(0x21, CFS_ALLOC_ZERO);

    if (ename == NULL) {
        goto errorout;
    }

again:

    /* dissect the file name string */
    proc_dissect_name(name, &first, &flen, &remain);

    if (first) {

        if (flen >= 0x20) {
            /* component longer than the fixed name size: give up */
            cfs_enter_debugger();
            entry = NULL;
            goto errorout;
        }

        memset(ename, 0, 0x20);
        memcpy(ename, first, flen);

        entry = proc_search_splay(parent, ename);

        if (!entry) {
            goto errorout;
        }

        if (remain) {
            /* more components left: descend into this directory */
            name = remain;
            parent = entry;

            goto again;
        }
    }

errorout:

    if (ename) {
        cfs_free(ename);
    }

    return entry;
}
+
+/* insert the path nodes to the proc fs tree */
+
+cfs_proc_entry_t *
+proc_insert_entry(
+    char *              name,
+    cfs_proc_entry_t *  root
+    )
+{
+    cfs_proc_entry_t *entry;
+    cfs_proc_entry_t *parent;
+    char *first, *remain;
+    int flen;
+    char ename[0x20];
+
+    parent = root;
+    entry = NULL;
+
+again:
+
+    proc_dissect_name(name, &first, &flen, &remain);
+
+    if (first) {
+
+        if (flen >= 0x20) {
+            return NULL;
+        }
+
+        memset(ename, 0, 0x20);
+        memcpy(ename, first, flen);
+
+        entry = proc_search_splay(parent, ename);
+
+        if (!entry) {
+            entry = proc_alloc_entry();
+            memcpy(entry->name, ename, flen);
+
+            if (entry) {
+                if(!proc_insert_splay(parent, entry)) {
+                    proc_free_entry(entry);
+                    entry = NULL;
+                }
+            }
+        }
+
+        if (!entry) {
+            return NULL;
+        }
+
+        if (remain) {
+            entry->mode |= S_IFDIR | S_IRUGO | S_IXUGO;
+            cfs_set_flag(entry->flags, CFS_PROC_FLAG_DIRECTORY);
+            name = remain;
+            parent = entry;
+            goto again;
+        }
+    }
+
+    return entry;   
+}
+
+/* remove the path nodes from the proc fs tree */
+
+void
+proc_remove_entry(
+    char *              name,
+    cfs_proc_entry_t *  root
+    )
+{
+    cfs_proc_entry_t *entry;
+    char *first, *remain;
+    int  flen;
+    char ename[0x20];
+
+    entry  = NULL;
+
+    proc_dissect_name(name, &first, &flen, &remain);
+
+    if (first) {
+
+        memset(ename, 0, 0x20);
+        memcpy(ename, first, flen);
+
+        entry = proc_search_splay(root, ename);
+
+        if (entry) {
+
+            if (remain) {
+                ASSERT(S_ISDIR(entry->mode));
+                proc_remove_entry(remain, entry);
+            }
+
+            if (!entry->nlink) {
+                proc_remove_splay(root, entry);
+                proc_free_entry(entry);
+            }
+        }
+    } else {
+        cfs_enter_debugger();
+    }
+}
+
+/* create proc entry and insert it into the proc fs */
+
+cfs_proc_entry_t *
+create_proc_entry (
+    char *              name,
+    mode_t              mode,
+    cfs_proc_entry_t *  root
+    )
+{
+    cfs_proc_entry_t *parent = root;
+    cfs_proc_entry_t *entry  = NULL;
+
+    if (S_ISDIR(mode)) {
+        if ((mode & S_IALLUGO) == 0)
+        mode |= S_IRUGO | S_IXUGO;
+    } else {
+        if ((mode & S_IFMT) == 0)
+            mode |= S_IFREG;
+        if ((mode & S_IALLUGO) == 0)
+            mode |= S_IRUGO;
+    }
+
+    LOCK_PROCFS();
+
+    ASSERT(NULL != proc_fs_root);
+
+    if (!parent) {
+        parent = proc_fs_root;
+    }
+
+    entry = proc_search_entry(name, parent);
+
+    if (!entry) {
+        entry = proc_insert_entry(name, parent);
+        if (!entry) {
+            /* Failed to create/insert the splay node ... */
+            cfs_enter_debugger();
+            goto errorout;
+        }
+        /* Initializing entry ... */
+        entry->mode = mode;
+
+        if (S_ISDIR(mode)) {
+            cfs_set_flag(entry->flags, CFS_PROC_FLAG_DIRECTORY);
+        }
+    }
+
+errorout:
+
+    UNLOCK_PROCFS();
+
+    return entry;
+}
+
+
+/* search the specified entry form the proc fs */
+
+cfs_proc_entry_t *
+search_proc_entry(
+    char *              name,
+    cfs_proc_entry_t *  root
+    )
+{
+    cfs_proc_entry_t * entry;
+
+    LOCK_PROCFS();
+    if (root == NULL) {
+        root = proc_fs_root;
+    }
+    entry = proc_search_entry(name, root);
+    UNLOCK_PROCFS();
+
+    return entry;    
+}
+
+/* remove the entry from the proc fs */
+
+void
+remove_proc_entry(
+    char *              name,
+    cfs_proc_entry_t *  parent
+    )
+{
+    LOCK_PROCFS();
+    if (parent == NULL) {
+        parent = proc_fs_root;
+    }
+    proc_remove_entry(name, parent);
+    UNLOCK_PROCFS();
+}
+
+
/*
 * Recursively free 'entry' and, for directories, every node of its
 * child splay tree.
 */
void proc_destroy_splay(cfs_proc_entry_t * entry)
{
    cfs_proc_entry_t * node;

    if (S_ISDIR(entry->mode)) {

        while (entry->root) {
            /* detach the current tree root and destroy its subtree */
            node = CONTAINING_RECORD(entry->root, cfs_proc_entry_t, s_link);
            entry->root = RtlDelete(&(node->s_link));
            proc_destroy_splay(node);
        }
    }

    proc_free_entry(entry);
}
+
+
+/* destory the whole proc fs tree */
+
+void proc_destroy_fs()
+{
+    LOCK_PROCFS();
+
+    if (proc_fs_root) {
+        proc_destroy_splay(proc_fs_root);
+    }
+
+    if (proc_entry_cache) {
+        cfs_mem_cache_destroy(proc_entry_cache);
+    }
+   
+    UNLOCK_PROCFS();
+}
+
/* initialize / build the proc fs tree */

/*
 * Set up the proc fs emulation: the root sysctl table header, the
 * procfs lock, the entry cache and the "proc", "sys" and "dev" nodes.
 * Returns 0 on success or -ENOMEM, unwinding partial setup on failure.
 */
int proc_init_fs()
{
    cfs_proc_entry_t * root = NULL;

    memset(&(root_table_header), 0, sizeof(struct ctl_table_header));
    INIT_LIST_HEAD(&(root_table_header.ctl_entry));

    INIT_PROCFS_LOCK();
    proc_entry_cache = cfs_mem_cache_create(
                            NULL,
                            sizeof(cfs_proc_entry_t),
                            0,
                            0
                            );

    if (!proc_entry_cache) {
        return (-ENOMEM);
    }

    root = proc_alloc_entry();

    if (!root) {
        proc_destroy_fs();
        return (-ENOMEM);
    }

    root->magic = CFS_PROC_ENTRY_MAGIC;
    root->flags = CFS_PROC_FLAG_DIRECTORY;
    root->mode  = S_IFDIR | S_IRUGO | S_IXUGO;
    root->nlink = 3; // root should never be deleted.

    /* the name buffer was zeroed by proc_alloc_entry(), so "proc" is
     * NUL-terminated after these stores */
    root->name[0]='p';
    root->name[1]='r';
    root->name[2]='o';
    root->name[3]='c';

    proc_fs_root = root;

    proc_sys_root = create_proc_entry("sys", S_IFDIR, root);

    if (!proc_sys_root) {
        /* free the root directly and let proc_destroy_fs() reap the
         * cache (proc_fs_root is NULL by then) */
        proc_free_entry(root);
        proc_fs_root = NULL;
        proc_destroy_fs();
        return (-ENOMEM);
    }

    proc_sys_root->nlink = 1;

    proc_dev_root = create_proc_entry("dev", S_IFDIR, root);

    if (!proc_dev_root) {
        /* unwind in reverse order of creation */
        proc_free_entry(proc_sys_root);
        proc_sys_root = NULL;
        proc_free_entry(proc_fs_root);
        proc_fs_root = NULL;
        proc_destroy_fs();
        return (-ENOMEM);
    }

    proc_dev_root->nlink = 1;

    return 0;
}
+
+
/*
 * Common read/write implementation for /proc/sys files: locate the
 * sysctl table entry attached to the proc node and invoke its
 * proc_handler.  Returns the number of bytes processed or a negative
 * errno.  NOTE(review): *ppos is neither consulted nor updated here
 * (see the FIXME below).
 */
static ssize_t do_rw_proc(int write, struct file * file, char * buf,
              size_t count, loff_t *ppos)
{
    int op;
    cfs_proc_entry_t *de;
    struct ctl_table *table;
    size_t res;
    ssize_t error;
    
    de = (cfs_proc_entry_t *) file->proc_dentry; 

    if (!de || !de->data)
        return -ENOTDIR;
    table = (struct ctl_table *) de->data;
    if (!table || !table->proc_handler)
        return -ENOTDIR;
    /* access mode for the (currently disabled) permission check below */
    op = (write ? 002 : 004);

//  if (ctl_perm(table, op))
//      return -EPERM;
    
    res = count;

    /*
     * FIXME: we need to pass on ppos to the handler.
     */

    error = (*table->proc_handler) (table, write, file, buf, &res);
    if (error)
        return error;
    return res;
}
+
/* read(2) entry for /proc/sys files: delegate to do_rw_proc (read) */
static ssize_t proc_readsys(struct file * file, char * buf,
                size_t count, loff_t *ppos)
{
    return do_rw_proc(0, file, buf, count, ppos);
}
+
/* write(2) entry for /proc/sys files: delegate to do_rw_proc (write) */
static ssize_t proc_writesys(struct file * file, const char * buf,
                 size_t count, loff_t *ppos)
{
    return do_rw_proc(1, file, (char *) buf, count, ppos);
}
+
+
/*
 * File operations for /proc/sys entries.  Positional initializers are
 * used, so the order must match the struct file_operations declaration.
 */
struct file_operations proc_sys_file_operations = {
    /*lseek:*/      NULL,
    /*read:*/       proc_readsys,
    /*write:*/      proc_writesys,
    /*ioctl:*/      NULL,
    /*open:*/       NULL,
    /*release:*/    NULL
};
+
+
+/* Scan the sysctl entries in table and add them all into /proc */
+void register_proc_table(cfs_sysctl_table_t * table, cfs_proc_entry_t * root)
+{
+    cfs_proc_entry_t * de;
+    int len;
+    mode_t mode;
+    
+    for (; table->ctl_name; table++) {
+        /* Can't do anything without a proc name. */
+        if (!table->procname)
+            continue;
+        /* Maybe we can't do anything with it... */
+        if (!table->proc_handler && !table->child) {
+            printk(KERN_WARNING "SYSCTL: Can't register %s\n",
+                table->procname);
+            continue;
+        }
+
+        len = strlen(table->procname);
+        mode = table->mode;
+
+        de = NULL;
+        if (table->proc_handler)
+            mode |= S_IFREG;
+        else {
+            de = search_proc_entry(table->procname, root);
+            if (de) {
+                break;
+            }
+            /* If the subdir exists already, de is non-NULL */
+        }
+
+        if (!de) {
+
+            de = create_proc_entry((char *)table->procname, mode, root);
+            if (!de)
+                continue;
+            de->data = (void *) table;
+            if (table->proc_handler) {
+                de->proc_fops = &proc_sys_file_operations;
+            }
+        }
+        table->de = de;
+        if (de->mode & S_IFDIR)
+            register_proc_table(table->child, de);
+    }
+}
+
+
/*
 * Unregister a /proc sysctl table and any subdirectories.
 * Entries (or directories) whose proc node still has a non-zero link
 * count are left in place; only fully-unused nodes are removed.
 */
void unregister_proc_table(cfs_sysctl_table_t * table, cfs_proc_entry_t *root)
{
    cfs_proc_entry_t *de;
    for (; table->ctl_name; table++) {
        if (!(de = table->de))
            continue;
        if (de->mode & S_IFDIR) {
            if (!table->child) {
                printk (KERN_ALERT "Help - malformed sysctl tree on free\n");
                continue;
            }
            unregister_proc_table(table->child, de);

            /* Don't unregister directories which still have entries.. */
            if (de->nlink)
                continue;
        }

        /* Don't unregister proc entries that are still being used.. */
        if (de->nlink)
            continue;

        table->de = NULL;
        remove_proc_entry((char *)table->procname, root);
    }
}
+
/* The generic string strategy routine: */

/*
 * sysctl(2) strategy for string values: copies the current string plus
 * a terminating NUL out to 'oldval' (reporting the copied length via
 * 'oldlenp'), and/or replaces table->data from 'newval', always leaving
 * the kernel buffer NUL-terminated within table->maxlen.
 */
int sysctl_string(cfs_sysctl_table_t *table, int *name, int nlen,
          void *oldval, size_t *oldlenp,
          void *newval, size_t newlen, void **context)
{
    int l, len;
    
    if (!table->data || !table->maxlen) 
        return -ENOTDIR;
    
    if (oldval && oldlenp) {
        if(get_user(len, oldlenp))
            return -EFAULT;
        if (len) {
            /* clamp to the actual string length and the table limit */
            l = strlen(table->data);
            if (len > l) len = l;
            if (len >= table->maxlen)
                len = table->maxlen;
            if(copy_to_user(oldval, table->data, len))
                return -EFAULT;
            if(put_user(0, ((char *) oldval) + len))
                return -EFAULT;
            if(put_user(len, oldlenp))
                return -EFAULT;
        }
    }
    if (newval && newlen) {
        len = newlen;
        if (len > table->maxlen)
            len = table->maxlen;
        if(copy_from_user(table->data, newval, len))
            return -EFAULT;
        /* guarantee NUL termination inside the buffer */
        if (len == table->maxlen)
            len--;
        ((char *) table->data)[len] = 0;
    }
    return 0;
}
+
+/**
+ * simple_strtoul - convert a string to an unsigned long
+ * @cp: The start of the string
+ * @endp: A pointer to the end of the parsed string will be placed here
+ * @base: The number base to use
+ */
+unsigned long simple_strtoul(const char *cp,char **endp,unsigned int base)
+{
+    unsigned long result = 0, value;
+
+    if (!base) {
+        base = 10;
+        if (*cp == '0') {
+            base = 8;
+            cp++;
+            if ((*cp == 'x') && isxdigit(cp[1])) {
+                cp++;
+                base = 16;
+            }
+        }
+    }
+    while (isxdigit(*cp) &&
+           (value = isdigit(*cp) ? *cp-'0' : toupper(*cp)-'A'+10) < base) {
+        result = result*base + value;
+        cp++;
+    }
+    if (endp)
+        *endp = (char *)cp;
+    return result;
+}
+
/* write-side operations applied by do_proc_dointvec() */
#define OP_SET  0
#define OP_AND  1
#define OP_OR   2
#define OP_MAX  3
#define OP_MIN  4
+
+
+static int do_proc_dointvec(cfs_sysctl_table_t *table, int write, struct file *filp,
+          void *buffer, size_t *lenp, int conv, int op)
+{
+    int *i, vleft, first=1, neg, val;
+    size_t left, len;
+    
+    #define TMPBUFLEN 20
+    char buf[TMPBUFLEN], *p;
+    
+    if (!table->data || !table->maxlen || !*lenp)
+    {
+        *lenp = 0;
+        return 0;
+    }
+    
+    i = (int *) table->data;
+    vleft = table->maxlen / sizeof(int);
+    left = *lenp;
+    
+    for (; left && vleft--; i++, first=0) {
+        if (write) {
+            while (left) {
+                char c;
+                if(get_user(c,(char *) buffer))
+                    return -EFAULT;
+                if (!isspace(c))
+                    break;
+                left--;
+                ((char *) buffer)++;
+            }
+            if (!left)
+                break;
+            neg = 0;
+            len = left;
+            if (len > TMPBUFLEN-1)
+                len = TMPBUFLEN-1;
+            if(copy_from_user(buf, buffer, len))
+                return -EFAULT;
+            buf[len] = 0;
+            p = buf;
+            if (*p == '-' && left > 1) {
+                neg = 1;
+                left--, p++;
+            }
+            if (*p < '0' || *p > '9')
+                break;
+            val = simple_strtoul(p, &p, 0) * conv;
+            len = p-buf;
+            if ((len < left) && *p && !isspace(*p))
+                break;
+            if (neg)
+                val = -val;
+            (char *)buffer += len;
+            left -= len;
+            switch(op) {
+            case OP_SET:    *i = val; break;
+            case OP_AND:    *i &= val; break;
+            case OP_OR: *i |= val; break;
+            case OP_MAX:    if(*i < val)
+                        *i = val;
+                    break;
+            case OP_MIN:    if(*i > val)
+                        *i = val;
+                    break;
+            }
+        } else {
+            p = buf;
+            if (!first)
+                *p++ = '\t';
+            sprintf(p, "%d", (*i) / conv);
+            len = strlen(buf);
+            if (len > left)
+                len = left;
+            if(copy_to_user(buffer, buf, len))
+                return -EFAULT;
+            left -= len;
+            (char *)buffer += len;
+        }
+    }
+
+    if (!write && !first && left) {
+        if(put_user('\n', (char *) buffer))
+            return -EFAULT;
+        left--, ((char *)buffer)++;
+    }
+    if (write) {
+        p = (char *) buffer;
+        while (left) {
+            char c;
+            if(get_user(c, p++))
+                return -EFAULT;
+            if (!isspace(c))
+                break;
+            left--;
+        }
+    }
+    if (write && first)
+        return -EINVAL;
+    *lenp -= left;
+    memset(&(filp->f_pos) , 0, sizeof(loff_t));
+    filp->f_pos += (loff_t)(*lenp);
+    return 0;
+}
+
/**
 * proc_dointvec - read a vector of integers
 * @table: the sysctl table
 * @write: %TRUE if this is a write to the sysctl file
 * @filp: the file structure
 * @buffer: the user buffer
 * @lenp: the size of the user buffer
 *
 * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
 * values from/to the user buffer, treated as an ASCII string.
 * (Delegates to do_proc_dointvec with conv == 1 and OP_SET.)
 *
 * Returns 0 on success.
 */
int proc_dointvec(cfs_sysctl_table_t *table, int write, struct file *filp,
             void *buffer, size_t *lenp)
{
    return do_proc_dointvec(table,write,filp,buffer,lenp,1,OP_SET);
}
+
+
/**
 * proc_dostring - read a string sysctl
 * @table: the sysctl table
 * @write: %TRUE if this is a write to the sysctl file
 * @filp: the file structure
 * @buffer: the user buffer
 * @lenp: the size of the user buffer
 *
 * Reads/writes a string from/to the user buffer. If the kernel
 * buffer provided is not large enough to hold the string, the
 * string is truncated. The copied string is %NULL-terminated.
 * If the string is being read by the user process, it is copied
 * and a newline '\n' is added. It is truncated if the buffer is
 * not large enough.
 *
 * Returns 0 on success.
 */
int proc_dostring(cfs_sysctl_table_t *table, int write, struct file *filp,
          void *buffer, size_t *lenp)
{
    size_t len;
    char *p, c;
    
    if (!table->data || !table->maxlen || !*lenp ||
        (filp->f_pos && !write)) {
        *lenp = 0;
        return 0;
    }
    
    if (write) {
        /* measure the incoming string up to NUL, newline or *lenp */
        len = 0;
        p = buffer;
        while (len < *lenp) {
            if(get_user(c, p++))
                return -EFAULT;
            if (c == 0 || c == '\n')
                break;
            len++;
        }
        /* truncate so the terminating NUL still fits in the buffer */
        if (len >= (size_t)table->maxlen)
            len = (size_t)table->maxlen-1;
        if(copy_from_user(table->data, buffer, len))
            return -EFAULT;
        ((char *) table->data)[len] = 0;
        filp->f_pos += *lenp;
    } else {
        len = (size_t)strlen(table->data);
        if (len > (size_t)table->maxlen)
            len = (size_t)table->maxlen;
        if (len > *lenp)
            len = *lenp;
        if (len)
            if(copy_to_user(buffer, table->data, len))
                return -EFAULT;
        /* append the newline when there is room for it */
        if (len < *lenp) {
            if(put_user('\n', ((char *) buffer) + len))
                return -EFAULT;
            len++;
        }
        *lenp = len;
        filp->f_pos += len;
    }
    return 0;
}
+
+/* Perform the actual read/write of a sysctl table entry. */
+int do_sysctl_strategy (cfs_sysctl_table_t *table, 
+            int *name, int nlen,
+            void *oldval, size_t *oldlenp,
+            void *newval, size_t newlen, void **context)
+{
+    int op = 0, rc;
+    size_t len;
+
+    if (oldval)
+        op |= 004;
+    if (newval) 
+        op |= 002;
+
+    if (table->strategy) {
+        rc = table->strategy(table, name, nlen, oldval, oldlenp,
+                     newval, newlen, context);
+        if (rc < 0)
+            return rc;
+        if (rc > 0)
+            return 0;
+    }
+
+    /* If there is no strategy routine, or if the strategy returns
+     * zero, proceed with automatic r/w */
+    if (table->data && table->maxlen) {
+        if (oldval && oldlenp) {
+            get_user(len, oldlenp);
+            if (len) {
+                if (len > (size_t)table->maxlen)
+                    len = (size_t)table->maxlen;
+                if(copy_to_user(oldval, table->data, len))
+                    return -EFAULT;
+                if(put_user(len, oldlenp))
+                    return -EFAULT;
+            }
+        }
+        if (newval && newlen) {
+            len = newlen;
+            if (len > (size_t)table->maxlen)
+                len = (size_t)table->maxlen;
+            if(copy_from_user(table->data, newval, len))
+                return -EFAULT;
+        }
+    }
+    return 0;
+}
+
/*
 * Walk the binary sysctl 'name' id vector down the table hierarchy:
 * each matched numeric id either descends into a child table (after
 * running an optional directory strategy routine) or dispatches the
 * terminal entry to do_sysctl_strategy().  Returns -ENOTDIR when no
 * entry matches or the name vector is exhausted early.
 */
static int parse_table(int *name, int nlen,
               void *oldval, size_t *oldlenp,
               void *newval, size_t newlen,
               cfs_sysctl_table_t *table, void **context)
{
    int n;

repeat:

    if (!nlen)
        return -ENOTDIR;
    if (get_user(n, name))
        return -EFAULT;
    for ( ; table->ctl_name; table++) {
        /* CTL_ANY entries match every id at this level */
        if (n == table->ctl_name || table->ctl_name == CTL_ANY) {
            int error;
            if (table->child) {
/*
                if (ctl_perm(table, 001))
                    return -EPERM;
*/
                if (table->strategy) {
                    error = table->strategy(
                        table, name, nlen,
                        oldval, oldlenp,
                        newval, newlen, context);
                    if (error)
                        return error;
                }
                /* descend: consume this id and restart on the child */
                name++;
                nlen--;
                table = table->child;
                goto repeat;
            }
            error = do_sysctl_strategy(table, name, nlen,
                           oldval, oldlenp,
                           newval, newlen, context);
            return error;
        }
    }
    return -ENOTDIR;
}
+
/*
 * Top-level binary sysctl(2) dispatcher: validate the name vector
 * length and the old-value length pointer, then try every registered
 * table header in root_table_header's list until one resolves the
 * request (any result other than -ENOTDIR).
 */
int do_sysctl(int *name, int nlen, void *oldval, size_t *oldlenp,
           void *newval, size_t newlen)
{
    struct list_head *tmp;

    if (nlen <= 0 || nlen >= CTL_MAXNAME)
        return -ENOTDIR;
    if (oldval) {
        /* an old value can only be read back through a readable length */
        int old_len;
        if (!oldlenp || get_user(old_len, oldlenp))
            return -EFAULT;
    }
    tmp = &root_table_header.ctl_entry;
    do {
        struct ctl_table_header *head =
            list_entry(tmp, struct ctl_table_header, ctl_entry);
        void *context = NULL;
        int error = parse_table(name, nlen, oldval, oldlenp, 
                    newval, newlen, head->ctl_table,
                    &context);
        if (context)
            cfs_free(context);
        if (error != -ENOTDIR)
            return error;
        tmp = tmp->next;
    } while (tmp != &root_table_header.ctl_entry);
    return -ENOTDIR;
}
+
/**
 * register_sysctl_table - register a sysctl hierarchy
 * @table: the top-level table structure
 * @insert_at_head: whether the entry should be inserted in front or at the end
 *
 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
 * array. An entry with a ctl_name of 0 terminates the table. 
 *
 * The members of the &ctl_table structure are used as follows:
 *
 * ctl_name - This is the numeric sysctl value used by sysctl(2). The number
 *            must be unique within that level of sysctl
 *
 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
 *            enter a sysctl file
 *
 * data - a pointer to data for use by proc_handler
 *
 * maxlen - the maximum size in bytes of the data
 *
 * mode - the file permissions for the /proc/sys file, and for sysctl(2)
 *
 * child - a pointer to the child sysctl table if this entry is a directory, or
 *         %NULL.
 *
 * proc_handler - the text handler routine (described below)
 *
 * strategy - the strategy routine (described below)
 *
 * de - for internal use by the sysctl routines
 *
 * extra1, extra2 - extra pointers usable by the proc handler routines
 *
 * Leaf nodes in the sysctl tree will be represented by a single file
 * under /proc; non-leaf nodes will be represented by directories.
 *
 * sysctl(2) can automatically manage read and write requests through
 * the sysctl table.  The data and maxlen fields of the ctl_table
 * struct enable minimal validation of the values being written to be
 * performed, and the mode field allows minimal authentication.
 *
 * More sophisticated management can be enabled by the provision of a
 * strategy routine with the table entry.  This will be called before
 * any automatic read or write of the data is performed.
 *
 * The strategy routine may return
 *
 * < 0 - Error occurred (error is passed to user process)
 *
 * 0   - OK - proceed with automatic read or write.
 *
 * > 0 - OK - read or write has been done by the strategy routine, so
 *       return immediately.
 *
 * There must be a proc_handler routine for any terminal nodes
 * mirrored under /proc/sys (non-terminals are handled by a built-in
 * directory handler).  Several default handlers are available to
 * cover common cases -
 *
 * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
 * proc_dointvec_minmax(), proc_doulongvec_ms_jiffies_minmax(),
 * proc_doulongvec_minmax()
 *
 * It is the handler's job to read the input buffer from user memory
 * and process it. The handler should return 0 on success.
 *
 * This routine returns %NULL on a failure to register, and a pointer
 * to the table header on success.
 */
struct ctl_table_header *register_sysctl_table(cfs_sysctl_table_t * table, 
                           int insert_at_head)
{
    struct ctl_table_header *tmp;
    tmp = cfs_alloc(sizeof(struct ctl_table_header), 0);
    if (!tmp)
        return NULL;
    tmp->ctl_table = table;

    INIT_LIST_HEAD(&tmp->ctl_entry);
    if (insert_at_head)
        list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
    else
        list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
#ifdef CONFIG_PROC_FS
    register_proc_table(table, proc_sys_root);
#endif
    return tmp;
}
+
/**
 * unregister_sysctl_table - unregister a sysctl table hierarchy
 * @header: the header returned from register_sysctl_table
 *
 * Unregisters the sysctl table and all children. proc entries may not
 * actually be removed until they are no longer used by anyone.
 */
void unregister_sysctl_table(struct ctl_table_header * header)
{
    list_del(&header->ctl_entry);
#ifdef CONFIG_PROC_FS
    unregister_proc_table(header->ctl_table, proc_sys_root);
#endif
    cfs_free(header);
}
+
+
+int cfs_psdev_register(cfs_psdev_t * psdev)
+{
+    cfs_proc_entry_t *  entry;
+
+    entry = create_proc_entry (
+                (char *)psdev->name,
+                S_IFREG,
+                proc_dev_root
+            );
+
+    if (!entry) {
+        return -ENOMEM;
+    }
+
+    entry->flags |= CFS_PROC_FLAG_MISCDEV;
+
+    entry->proc_fops = psdev->fops;
+    entry->data = (void *)psdev;
+
+    return 0;
+}
+
+int cfs_psdev_deregister(cfs_psdev_t * psdev)
+{
+    cfs_proc_entry_t *  entry;
+
+    entry = search_proc_entry (
+                (char *)psdev->name,
+                proc_dev_root
+            );
+
+    if (entry) {
+
+        ASSERT(entry->data == (void *)psdev);
+        ASSERT(entry->flags & CFS_PROC_FLAG_MISCDEV);
+
+        remove_proc_entry(
+            (char *)psdev->name,
+            proc_dev_root
+            );
+    }
+
+    return 0;
+}
+
+extern char debug_file_path[1024];
+
#define PSDEV_LNET  (0x100)
/* binary sysctl ids for the entries of lnet_table below */
enum {
        PSDEV_DEBUG = 1,          /* control debugging */
        PSDEV_SUBSYSTEM_DEBUG,    /* control debugging */
        PSDEV_PRINTK,             /* force all messages to console */
        PSDEV_CONSOLE_RATELIMIT,  /* rate limit console messages */
        PSDEV_DEBUG_PATH,         /* crashdump log location */
        PSDEV_DEBUG_DUMP_PATH,    /* crashdump tracelog location */
        PSDEV_LIBCFS_MEMUSED,     /* bytes currently PORTAL_ALLOCated */
};
+
+static struct ctl_table lnet_table[] = {
+        {PSDEV_DEBUG, "debug", &libcfs_debug, sizeof(int), 0644, NULL,
+         &proc_dointvec},
+        {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &libcfs_subsystem_debug,
+         sizeof(int), 0644, NULL, &proc_dointvec},
+        {PSDEV_PRINTK, "printk", &libcfs_printk, sizeof(int), 0644, NULL,
+         &proc_dointvec},
+        {PSDEV_CONSOLE_RATELIMIT, "console_ratelimit", &libcfs_console_ratelimit, 
+         sizeof(int), 0644, NULL, &proc_dointvec},
+        {PSDEV_DEBUG_PATH, "debug_path", debug_file_path,
+         sizeof(debug_file_path), 0644, NULL, &proc_dostring, &sysctl_string},
+/*
+        {PSDEV_PORTALS_UPCALL, "upcall", portals_upcall,
+         sizeof(portals_upcall), 0644, NULL, &proc_dostring,
+         &sysctl_string},
+*/
+        {PSDEV_LIBCFS_MEMUSED, "memused", (int *)&libcfs_kmemory.counter,
+         sizeof(int), 0644, NULL, &proc_dointvec},
+        {0}
+};
+
+static struct ctl_table top_table[2] = {
+        {PSDEV_LNET, "lnet", NULL, 0, 0555, lnet_table},
+        {0}
+};
+
+/*
+ * insert_proc
+ *   Create the tracing control entries under /proc ("sys/lnet/...")
+ *   and hook up their read/write handlers.
+ *
+ * Returns 0 on success or -1 on failure.  On failure the entries
+ * created earlier in this call are removed again, so no partial
+ * state is left behind (the original code leaked them).
+ */
+int insert_proc(void)
+{
+        cfs_proc_entry_t *ent;
+
+        ent = create_proc_entry("sys/lnet/dump_kernel", 0, NULL);
+        if (ent == NULL) {
+                CERROR(("couldn't register dump_kernel\n"));
+                return -1;
+        }
+        ent->write_proc = trace_dk;
+
+        ent = create_proc_entry("sys/lnet/daemon_file", 0, NULL);
+        if (ent == NULL) {
+                CERROR(("couldn't register daemon_file\n"));
+                goto err_dump;
+        }
+        ent->write_proc = trace_write_daemon_file;
+        ent->read_proc = trace_read_daemon_file;
+
+        ent = create_proc_entry("sys/lnet/debug_mb", 0, NULL);
+        if (ent == NULL) {
+                CERROR(("couldn't register debug_mb\n"));
+                goto err_daemon;
+        }
+        ent->write_proc = trace_write_debug_mb;
+        ent->read_proc = trace_read_debug_mb;
+
+        return 0;
+
+        /* unwind partially-created entries on failure */
+err_daemon:
+        remove_proc_entry("sys/lnet/daemon_file", NULL);
+err_dump:
+        remove_proc_entry("sys/lnet/dump_kernel", NULL);
+        return -1;
+}
+
+/*
+ * remove_proc
+ *   Undo insert_proc: remove the tracing entries and unregister the
+ *   sysctl table.
+ *
+ * BUGFIX: insert_proc creates the entries under "sys/lnet/..." but
+ * this function removed "sys/portals/..." (the pre-rename paths), so
+ * the real entries were never deleted.  Use the matching lnet paths.
+ */
+void remove_proc(void)
+{
+        remove_proc_entry("sys/lnet/dump_kernel", NULL);
+        remove_proc_entry("sys/lnet/daemon_file", NULL);
+        remove_proc_entry("sys/lnet/debug_mb", NULL);
+
+#ifdef CONFIG_SYSCTL
+        if (portals_table_header)
+                unregister_sysctl_table(portals_table_header);
+        portals_table_header = NULL;
+#endif
+}
+
+
+/*
+ *  proc process routines of kernel space
+ */
+
+/*
+ * lustre_open_file
+ *   Look up a registered proc entry by name and build a file handle
+ *   bound to that entry's file operations.
+ *
+ * Returns the new cfs_file_t on success, or NULL on failure.
+ * NOTE(review): the local rc records the failure reason (-ENOENT,
+ * -ENOMEM, or the entry's open() error) but is never surfaced to the
+ * caller, who only sees NULL.
+ */
+cfs_file_t *
+lustre_open_file(char * filename)
+{
+    int rc = 0;
+    cfs_file_t * fh = NULL;
+    cfs_proc_entry_t * fp = NULL;
+
+    fp = search_proc_entry(filename, proc_fs_root);
+
+    if (!fp) {
+        rc =  -ENOENT;
+        return NULL;
+    }
+
+    fh = cfs_alloc(sizeof(cfs_file_t), CFS_ALLOC_ZERO);
+
+    if (!fh) {
+        rc =  -ENOMEM;
+        return NULL;
+    }
+
+    /* bind the handle to the proc entry and its fops */
+    fh->private_data = (void *)fp;
+    fh->f_op = fp->proc_fops;
+
+    if (fh->f_op->open) {
+        rc = (fh->f_op->open)(fh);
+    } else {
+        /* no open hook: just bump the entry's link count */
+        fp->nlink++;
+    }
+
+    if (0 != rc) {
+        cfs_free(fh);
+        return NULL;
+    }
+
+    return fh;
+}
+
+/*
+ * lustre_close_file
+ *   Release a handle obtained from lustre_open_file, invoking the
+ *   entry's release hook (or dropping the link count when absent).
+ *
+ * Returns the release hook's status, or 0 when there is no hook.
+ */
+int
+lustre_close_file(cfs_file_t * fh)
+{
+    cfs_proc_entry_t *entry = (cfs_proc_entry_t *) fh->private_data;
+    int               status = 0;
+
+    if (fh->f_op->release)
+        status = (fh->f_op->release)(fh);
+    else
+        entry->nlink--;
+
+    cfs_free(fh);
+
+    return status;
+}
+
+/*
+ * lustre_do_ioctl
+ *   Forward an ioctl to the entry's ioctl hook, if any.
+ *
+ * Returns the hook's status (0 when no hook is installed).
+ *
+ * Fixes: "fialed" typo in the log message, and the %x conversions
+ * which did not match the unsigned long / ulong_ptr arguments.
+ */
+int
+lustre_do_ioctl( cfs_file_t * fh,
+                 unsigned long cmd,
+                 ulong_ptr arg )
+{
+    int rc = 0;
+
+    if (fh->f_op->ioctl) {
+        rc = (fh->f_op->ioctl)(fh, cmd, arg);
+    }
+
+    if (rc != 0) {
+        printk("lustre_do_ioctl: failed: cmd = %lxh arg = %lxh rc = %d\n",
+                cmd, (unsigned long)arg, rc);
+    }
+
+    return rc;
+}
+    
+/*
+ * lustre_ioctl_file
+ *   Unpack a CFS_PROC_IOCTL envelope (header followed inline by the
+ *   payload) and dispatch it through lustre_do_ioctl.
+ *
+ * NOTE(review): the obd payload-pointer fixup for 'f'-type commands
+ * is disabled below (#if 0); pbuf1/pbuf2 are currently passed through
+ * untranslated — confirm this is intentional.
+ */
+int
+lustre_ioctl_file(cfs_file_t * fh, PCFS_PROC_IOCTL devctl)
+{
+    int         rc = 0;
+    ulong_ptr   data;
+
+    /* payload starts immediately after the ioctl header */
+    data = (ulong_ptr)devctl + sizeof(CFS_PROC_IOCTL);
+
+    /* obd ioctl code */
+    if (_IOC_TYPE(devctl->cmd) == 'f') {
+#if 0
+        struct obd_ioctl_data * obd = (struct obd_ioctl_data *) data;
+
+        if ( devctl->cmd != (ULONG)OBD_IOC_BRW_WRITE  &&
+             devctl->cmd != (ULONG)OBD_IOC_BRW_READ ) {
+
+            unsigned long off = obd->ioc_len;
+
+            if (obd->ioc_pbuf1) {
+                obd->ioc_pbuf1 = (char *)(data + off);
+                off += size_round(obd->ioc_plen1);
+            }
+
+            if (obd->ioc_pbuf2) {
+                obd->ioc_pbuf2 = (char *)(data + off);
+            }
+        }
+ #endif
+   }
+
+    rc = lustre_do_ioctl(fh, devctl->cmd, data);
+
+    return rc;
+} 
+
+
+/*
+ * lustre_read_file
+ *   Read up to size bytes from the proc entry behind fh, starting at
+ *   off, into buf.  Returns the byte count reported by the read hook,
+ *   or 0 when no hook is installed.
+ */
+size_t
+lustre_read_file(
+    cfs_file_t *    fh,
+    loff_t          off,
+    size_t          size,
+    char *          buf
+    )
+{
+    size_t nread = 0;
+
+    if (fh->f_op->read)
+        nread = (fh->f_op->read)(fh, buf, size, &off);
+
+    return nread;
+}
+
+/*
+ * lustre_write_file
+ *   Write up to size bytes from buf to the proc entry behind fh at
+ *   offset off.  Returns the byte count reported by the write hook,
+ *   or 0 when no hook is installed.
+ */
+size_t
+lustre_write_file(
+    cfs_file_t *    fh,
+    loff_t          off,
+    size_t          size,
+    char *          buf
+    )
+{
+    size_t nwritten = 0;
+
+    if (fh->f_op->write)
+        nwritten = (fh->f_op->write)(fh, buf, size, &off);
+
+    return nwritten;
+}
+
+#else /* !__KERNEL__ */
+
+#include <lnet/api-support.h>
+#include <liblustre.h>
+#include <lustre_lib.h>
+
+/*
+ * proc process routines of user space
+ */
+
+/*
+ * cfs_proc_open (user space)
+ *   Open a pseudo /proc or /dev path by translating the POSIX-style
+ *   open flags into NT access/disposition/options and issuing
+ *   ZwCreateFile against the lustre proc symlink, passing the entry
+ *   name as an extended attribute.
+ *
+ * Returns the NT file handle, or INVALID_HANDLE_VALUE on failure.
+ * NOTE(review): the local rc records -EINVAL / mapped NT errors but
+ * is never returned — callers only see INVALID_HANDLE_VALUE.
+ */
+HANDLE cfs_proc_open (char * filename, int oflag)
+{
+    NTSTATUS            status;
+    IO_STATUS_BLOCK     iosb;
+    int                 rc;
+
+    HANDLE              FileHandle = INVALID_HANDLE_VALUE;
+    OBJECT_ATTRIBUTES   ObjectAttributes;
+    ACCESS_MASK         DesiredAccess;
+    ULONG               CreateDisposition;
+    ULONG               ShareAccess;
+    ULONG               CreateOptions;
+    UNICODE_STRING      UnicodeName;
+    USHORT              NameLength;
+
+    PFILE_FULL_EA_INFORMATION Ea = NULL;
+    ULONG               EaLength;
+    UCHAR               EaBuffer[EA_MAX_LENGTH];
+
+    /* Check the filename: should start with "/proc" or "/dev" */
+    NameLength = (USHORT)strlen(filename);
+    if (NameLength > 0x05) {
+        if (_strnicmp(filename, "/proc/", 6) == 0) {
+            /* strip the "/proc/" prefix; the rest names the entry */
+            filename += 6;
+            NameLength -=6;
+            if (NameLength <= 0) {
+                rc = -EINVAL;
+                goto errorout;
+            }
+        } else if (_strnicmp(filename, "/dev/", 5) == 0) {
+            /* "/dev/" paths are passed through unchanged */
+        } else {
+            rc = -EINVAL;
+            goto errorout;
+        }
+    } else {
+        rc = -EINVAL;
+        goto errorout;
+    }
+
+    /* Analyze the flags settings */
+
+    if (cfs_is_flag_set(oflag, O_WRONLY)) {
+        DesiredAccess = (GENERIC_WRITE | SYNCHRONIZE);
+        ShareAccess = 0;
+    }  else if (cfs_is_flag_set(oflag, O_RDWR)) {
+        DesiredAccess = (GENERIC_READ | GENERIC_WRITE | SYNCHRONIZE);
+        ShareAccess = FILE_SHARE_READ | FILE_SHARE_WRITE;
+    } else {
+        DesiredAccess = (GENERIC_READ | SYNCHRONIZE);
+        ShareAccess = FILE_SHARE_READ;
+    }
+
+    if (cfs_is_flag_set(oflag, O_CREAT)) {
+        if (cfs_is_flag_set(oflag, O_EXCL)) {
+            /* O_CREAT|O_EXCL is rejected outright for proc entries
+             * (the FILE_CREATE assignment above it is dead) */
+            CreateDisposition = FILE_CREATE;
+            rc = -EINVAL;
+            goto errorout;
+        } else {
+            CreateDisposition = FILE_OPEN_IF;
+        }
+    } else {
+        CreateDisposition = FILE_OPEN;
+    }
+
+    if (cfs_is_flag_set(oflag, O_TRUNC)) {
+        if (cfs_is_flag_set(oflag, O_EXCL)) {
+            CreateDisposition = FILE_OVERWRITE;
+        } else {
+            CreateDisposition = FILE_OVERWRITE_IF;
+        }
+    }
+
+    CreateOptions = 0;
+
+    if (cfs_is_flag_set(oflag, O_DIRECTORY)) {
+        cfs_set_flag(CreateOptions,  FILE_DIRECTORY_FILE);
+    }
+
+    if (cfs_is_flag_set(oflag, O_SYNC)) {
+         cfs_set_flag(CreateOptions, FILE_WRITE_THROUGH);
+    }
+
+    if (cfs_is_flag_set(oflag, O_DIRECT)) {
+         cfs_set_flag(CreateOptions, FILE_NO_INTERMEDIATE_BUFFERING);
+    }
+
+    /* Initialize the unicode path name for the specified file */
+    RtlInitUnicodeString(&UnicodeName, LUSTRE_PROC_SYMLNK);
+
+    /* Setup the object attributes structure for the file. */
+    InitializeObjectAttributes(
+            &ObjectAttributes,
+            &UnicodeName,
+            OBJ_CASE_INSENSITIVE,
+            NULL,
+            NULL );
+
+    /* building EA for the proc entry: the EA name carries the proc
+     * entry's path so the driver can resolve it on open */
+    Ea = (PFILE_FULL_EA_INFORMATION)EaBuffer;
+    Ea->NextEntryOffset = 0;
+    Ea->Flags = 0;
+    Ea->EaNameLength = (UCHAR)NameLength;
+    Ea->EaValueLength = 0;
+    RtlCopyMemory(
+        &(Ea->EaName),
+        filename,
+        NameLength + 1
+        );
+    EaLength = sizeof(FILE_FULL_EA_INFORMATION) - 1 +
+                               Ea->EaNameLength + 1;
+
+    /* Now to open or create the file now */
+    status = ZwCreateFile(
+                &FileHandle,
+                DesiredAccess,
+                &ObjectAttributes,
+                &iosb,
+                0,
+                FILE_ATTRIBUTE_NORMAL,
+                ShareAccess,
+                CreateDisposition,
+                CreateOptions,
+                Ea,
+                EaLength );
+
+    /* Check the returned status of Iosb ... */
+
+    if (!NT_SUCCESS(status)) {
+        rc = cfs_error_code(status);
+        goto errorout;
+    }
+
+errorout:
+
+    return FileHandle;
+}
+
+/*
+ * cfs_proc_close
+ *   Close an NT handle returned by cfs_proc_open; NULL is tolerated.
+ *   Always returns 0.
+ */
+int cfs_proc_close(HANDLE handle)
+{
+    if (handle != NULL)
+        NtClose((HANDLE)handle);
+
+    return 0;
+}
+
+/*
+ * cfs_proc_read
+ *   Read up to count bytes from the proc handle into buffer, always
+ *   starting at offset zero.
+ *
+ * Returns the number of bytes transferred on success, or a negative
+ * errno mapped from the NT status on failure.
+ */
+int cfs_proc_read(HANDLE handle, void *buffer, unsigned int count)
+{
+    NTSTATUS        status;
+    IO_STATUS_BLOCK iosb;
+    LARGE_INTEGER   offset;
+
+    /* proc reads always begin at the start of the entry */
+    offset.QuadPart = 0;
+
+    status = NtReadFile((HANDLE)handle, 0, NULL, NULL,
+                        &iosb, buffer, count, &offset, NULL);
+
+    if (NT_SUCCESS(status)) {
+        /* bytes actually transferred */
+        return iosb.Information;
+    }
+
+    printf("NtReadFile request failed 0x%0x\n", status);
+    return cfs_error_code(status);
+}
+
+
+/*
+ * cfs_proc_write
+ *   Write count bytes from buffer to the proc handle.  The offset is
+ *   set to -1, presumably meaning "current/end position" — TODO
+ *   confirm against the driver's write handler.
+ *
+ * Returns the number of bytes transferred on success, or a negative
+ * errno mapped from the NT status on failure.
+ */
+int cfs_proc_write(HANDLE handle, void *buffer, unsigned int count)
+{
+    NTSTATUS        status;
+    IO_STATUS_BLOCK iosb;
+    LARGE_INTEGER   offset;
+
+    offset.QuadPart = -1;
+
+    status = NtWriteFile((HANDLE)handle, 0, NULL, NULL,
+                         &iosb, buffer, count, &offset, NULL);
+
+    if (NT_SUCCESS(status)) {
+        /* bytes actually transferred */
+        return iosb.Information;
+    }
+
+    printf("NtWriteFile request failed 0x%0x\n", status);
+    return cfs_error_code(status);
+}
+
+/*
+ * cfs_proc_ioctl (user space)
+ *   Marshal an ioctl request into a flat CFS_PROC_IOCTL envelope
+ *   (header + inline payload + optional obd bulk buffers), send it to
+ *   the driver via NtDeviceIoControlFile, and copy the result back
+ *   into the caller's buffer.
+ *
+ * Returns a negative errno mapped from the NT status (0 on success).
+ */
+int cfs_proc_ioctl(HANDLE handle, int cmd, void *buffer)
+{
+    PUCHAR          procdat = NULL;
+    CFS_PROC_IOCTL  procctl;
+    ULONG           length = 0;
+    ULONG           extra = 0;
+
+    NTSTATUS        status;
+    IO_STATUS_BLOCK iosb;
+
+    procctl.cmd = cmd;
+
+    /* work out the inline payload size (and extra bulk space for obd
+     * commands) from the ioctl type */
+    if(_IOC_TYPE(cmd) == IOC_LIBCFS_TYPE) {
+        struct libcfs_ioctl_data * portal;
+        portal = (struct libcfs_ioctl_data *) buffer;
+        length = portal->ioc_len;
+    } else if (_IOC_TYPE(cmd) == 'f') {
+        struct obd_ioctl_data * obd;
+        obd = (struct obd_ioctl_data *) buffer;
+        length = obd->ioc_len;
+        extra = size_round(obd->ioc_plen1) + size_round(obd->ioc_plen2);
+    } else if(_IOC_TYPE(cmd) == 'u') {
+        length = 4;
+        extra  = 0;
+    } else {
+        printf("user:winnt-proc:cfs_proc_ioctl: un-supported ioctl type ...\n");
+        cfs_enter_debugger();
+        status = STATUS_INVALID_PARAMETER;
+        goto errorout;
+    }
+
+    procctl.len = length + extra;
+    procdat = malloc(length + extra + sizeof(CFS_PROC_IOCTL));
+
+    if (NULL == procdat) {
+        printf("user:winnt-proc:cfs_proc_ioctl: no enough memory ...\n");
+        status = STATUS_INSUFFICIENT_RESOURCES;
+        cfs_enter_debugger();
+        goto errorout;
+    }
+    /* envelope layout: [CFS_PROC_IOCTL header][payload][bulk bufs] */
+    memset(procdat, 0, length + extra + sizeof(CFS_PROC_IOCTL));
+    memcpy(procdat, &procctl, sizeof(CFS_PROC_IOCTL));
+    memcpy(&procdat[sizeof(CFS_PROC_IOCTL)], buffer, length);
+    length += sizeof(CFS_PROC_IOCTL);
+
+    if (_IOC_TYPE(cmd) == 'f') {
+
+        char *ptr;
+        struct obd_ioctl_data * data;
+        struct obd_ioctl_data * obd;
+
+        /* rewrite the copied obd struct so its inline buffer pointers
+         * refer into the flat envelope instead of the caller's space */
+        data = (struct obd_ioctl_data *) buffer;
+        obd  = (struct obd_ioctl_data *) (procdat + sizeof(CFS_PROC_IOCTL));
+        ptr = obd->ioc_bulk;
+
+        if (data->ioc_inlbuf1) {
+                obd->ioc_inlbuf1 = ptr;
+                LOGL(data->ioc_inlbuf1, data->ioc_inllen1, ptr);
+        }
+
+        if (data->ioc_inlbuf2) {
+                obd->ioc_inlbuf2 = ptr;
+                LOGL(data->ioc_inlbuf2, data->ioc_inllen2, ptr);
+        }
+        if (data->ioc_inlbuf3) {
+                obd->ioc_inlbuf3 = ptr;
+                LOGL(data->ioc_inlbuf3, data->ioc_inllen3, ptr);
+        }
+        if (data->ioc_inlbuf4) {
+                obd->ioc_inlbuf4 = ptr;
+                LOGL(data->ioc_inlbuf4, data->ioc_inllen4, ptr);
+        }
+    
+        /* BRW commands carry their page buffers out-of-band; for all
+         * other obd commands copy pbuf1/pbuf2 into the envelope tail */
+        if ( cmd != (ULONG)OBD_IOC_BRW_WRITE  &&
+             cmd != (ULONG)OBD_IOC_BRW_READ ) {
+
+            if (data->ioc_pbuf1 && data->ioc_plen1) {
+                obd->ioc_pbuf1 = &procdat[length];
+                memcpy(obd->ioc_pbuf1, data->ioc_pbuf1, data->ioc_plen1); 
+                length += size_round(data->ioc_plen1);
+            }
+
+            if (data->ioc_pbuf2 && data->ioc_plen2) {
+                obd->ioc_pbuf2 = &procdat[length];
+                memcpy(obd->ioc_pbuf2, data->ioc_pbuf2, data->ioc_plen2);
+                length += size_round(data->ioc_plen2);
+            }
+        }
+
+        if (obd_ioctl_is_invalid(obd)) {
+            cfs_enter_debugger();
+        }
+    }
+
+    status = NtDeviceIoControlFile(
+                (HANDLE)handle,
+                NULL, NULL, NULL, &iosb,
+                IOCTL_LIBCFS_ENTRY,
+                procdat, length,
+                procdat, length );
+
+
+    if (NT_SUCCESS(status)) {
+        /* copy the (possibly updated) payload back to the caller */
+        memcpy(buffer, &procdat[sizeof(CFS_PROC_IOCTL)], procctl.len); 
+    }
+
+errorout:
+
+    if (procdat) {
+        free(procdat);
+    }
+
+    return cfs_error_code(status);
+}
+
+#endif /* __KERNEL__ */
diff --git a/lnet/libcfs/winnt/winnt-sync.c b/lnet/libcfs/winnt/winnt-sync.c
new file mode 100644 (file)
index 0000000..5094bef
--- /dev/null
@@ -0,0 +1,449 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LIBCFS
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+
+
+/*
+ * Wait queue routines
+ */
+
+/*
+ * cfs_waitq_init
+ *   Initialize a wait queue: set its magic/flags, empty its waiter
+ *   list and prepare the guard spinlock.
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *
+ * Return Value:
+ *   N/A
+ */
+
+void cfs_waitq_init(cfs_waitq_t *waitq)
+{
+    waitq->magic = CFS_WAITQ_MAGIC;
+    waitq->flags = 0;
+    INIT_LIST_HEAD(&waitq->waiters);
+    spin_lock_init(&waitq->guard);
+}
+
+/*
+ * cfs_waitlink_init
+ *   Initialize a wait link node, binding it to the per-task slot's
+ *   event object and pending-wakeup ("hits") counter.
+ *
+ * Arguments:
+ *   link:  pointer to the cfs_waitlink_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   Must be called in task context; a NULL cfs_current() only drops
+ *   into the debugger (should bugcheck).
+ */
+
+void cfs_waitlink_init(cfs_waitlink_t *link)
+{
+    cfs_task_t * task = cfs_current();
+    PTASK_SLOT   slot = NULL;
+
+    if (!task) {
+        /* should bugchk here */
+        cfs_enter_debugger();
+        return;
+    }
+
+    /* recover the task slot embedding this task structure */
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    cfs_assert(slot->Magic == TASKSLT_MAGIC);
+
+    memset(link, 0, sizeof(cfs_waitlink_t));
+
+    link->magic = CFS_WAITLINK_MAGIC;
+    link->flags = 0;
+
+    /* the link shares the slot's event and hit counter, so all links
+     * of one task wake through the same kernel event */
+    link->event = &(slot->Event);
+    link->hits  = &(slot->hits);
+
+    atomic_inc(&slot->count);
+
+    INIT_LIST_HEAD(&(link->waitq[0].link));
+    INIT_LIST_HEAD(&(link->waitq[1].link));
+
+    /* both channels point back at this link for wakeup lookup */
+    link->waitq[0].waitl = link->waitq[1].waitl = link;
+}
+
+
+/*
+ * cfs_waitlink_fini
+ *   Finalize a wait link node: verify it is fully dequeued from both
+ *   channels and drop the per-task slot reference.
+ *
+ * Arguments:
+ *   link:  pointer to the cfs_waitlink_t structure
+ *
+ * Return Value:
+ *   N/A
+ */
+
+void cfs_waitlink_fini(cfs_waitlink_t *link)
+{
+    cfs_task_t *curr = cfs_current();
+    PTASK_SLOT  slot = NULL;
+
+    if (!curr) {
+        /* no task context: should bugchk here */
+        cfs_enter_debugger();
+        return;
+    }
+
+    slot = CONTAINING_RECORD(curr, TASK_SLOT, task);
+    cfs_assert(slot->Magic == TASKSLT_MAGIC);
+    cfs_assert(link->magic == CFS_WAITLINK_MAGIC);
+    /* the link must not still be queued on any waitq */
+    cfs_assert(link->waitq[0].waitq == NULL);
+    cfs_assert(link->waitq[1].waitq == NULL);
+
+    atomic_dec(&slot->count);
+}
+
+
+/*
+ * cfs_waitq_add_internal
+ *   Queue a wait link on the given channel of a wait queue.
+ *
+ * Arguments:
+ *   waitq:   pointer to the cfs_waitq_t structure
+ *   link:    pointer to the cfs_waitlink_t structure
+ *   waitqid: channel index (normal or forward)
+ *
+ * Return Value:
+ *   N/A
+ */
+
+void cfs_waitq_add_internal(cfs_waitq_t *waitq,
+                            cfs_waitlink_t *link,
+                            __u32 waitqid )
+{ 
+    LASSERT(waitq != NULL);
+    LASSERT(link != NULL);
+    LASSERT(waitq->magic == CFS_WAITQ_MAGIC);
+    LASSERT(link->magic == CFS_WAITLINK_MAGIC);
+    LASSERT(waitqid < CFS_WAITQ_CHANNELS);
+
+    spin_lock(&waitq->guard);
+
+    LASSERT(link->waitq[waitqid].waitq == NULL);
+    link->waitq[waitqid].waitq = waitq;
+
+    /* exclusive waiters go to the tail so head-first wakeup reaches
+     * all non-exclusive waiters before any exclusive one */
+    if (link->flags & CFS_WAITQ_EXCLUSIVE)
+        list_add_tail(&link->waitq[waitqid].link, &waitq->waiters);
+    else
+        list_add(&link->waitq[waitqid].link, &waitq->waiters);
+
+    spin_unlock(&waitq->guard);
+}
+/*
+ * cfs_waitq_add
+ *   Queue a wait link on the normal channel of the wait queue
+ *   (thin wrapper over cfs_waitq_add_internal).
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *   link:  pointer to the cfs_waitlink_t structure
+ *
+ * Return Value:
+ *   N/A
+ */
+
+void cfs_waitq_add(cfs_waitq_t *waitq,
+                   cfs_waitlink_t *link)
+{ 
+    cfs_waitq_add_internal(waitq, link, CFS_WAITQ_CHAN_NORMAL);
+}
+
+/*
+ * cfs_waitq_add_exclusive
+ *   Mark a wait link exclusive and queue it on the wait queue's
+ *   normal channel (exclusive waiters are woken at most nr at a time
+ *   by cfs_waitq_signal_nr).
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *   link:  pointer to the cfs_wait_link structure
+ *
+ * Return Value:
+ *   N/A
+ */
+
+void cfs_waitq_add_exclusive( cfs_waitq_t *waitq,
+                              cfs_waitlink_t *link)
+{
+    LASSERT(waitq != NULL);
+    LASSERT(link != NULL);
+    LASSERT(waitq->magic == CFS_WAITQ_MAGIC);
+    LASSERT(link->magic == CFS_WAITLINK_MAGIC);
+
+    link->flags |= CFS_WAITQ_EXCLUSIVE;
+    cfs_waitq_add(waitq, link);
+}
+
+/*
+ * cfs_waitq_forward
+ *   Queue an already-initialized wait link on a second wait queue via
+ *   the forward channel (semantics to be determined).
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *   link:  pointer to the cfs_waitlink_t structure
+ *
+ * Return Value:
+ *   N/A
+ */
+
+void cfs_waitq_forward( cfs_waitlink_t *link,
+                        cfs_waitq_t *waitq)
+{
+    cfs_waitq_add_internal(waitq, link, CFS_WAITQ_CHAN_FORWARD);
+}
+
+/*
+ * cfs_waitq_del
+ *   Remove a wait link from the waitq, searching both channels for
+ *   the one bound to this queue.  Drops into the debugger if the link
+ *   was not queued here.
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *   link:  pointer to the cfs_waitlink_t structure
+ *
+ * Return Value:
+ *   N/A
+ */
+
+void cfs_waitq_del( cfs_waitq_t *waitq,
+                    cfs_waitlink_t *link)
+{
+    int chan;
+
+    LASSERT(waitq != NULL);
+    LASSERT(link != NULL);
+
+    LASSERT(waitq->magic == CFS_WAITQ_MAGIC);
+    LASSERT(link->magic == CFS_WAITLINK_MAGIC);
+
+    spin_lock(&waitq->guard);
+
+    for (chan = 0; chan < CFS_WAITQ_CHANNELS; chan++) {
+        if (link->waitq[chan].waitq == waitq) {
+            link->waitq[chan].waitq = NULL;
+            list_del_init(&link->waitq[chan].link);
+            break;
+        }
+    }
+
+    if (chan >= CFS_WAITQ_CHANNELS) {
+        /* link not found on this waitq */
+        cfs_enter_debugger();
+    }
+
+    spin_unlock(&waitq->guard);
+}
+
+/*
+ * cfs_waitq_active
+ *   Is the waitq active (not empty)?
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *
+ * Return Value:
+ *   Zero: the waitq is empty
+ *   Non-Zero: the waitq is active
+ *
+ * Notes:
+ *   Conservatively always reports active, matching the Darwin port.
+ */
+
+int cfs_waitq_active(cfs_waitq_t *waitq)
+{
+    LASSERT(waitq != NULL);
+    LASSERT(waitq->magic == CFS_WAITQ_MAGIC);
+
+    return 1;
+}
+
+/*
+ * cfs_waitq_signal_nr
+ *   Wake up all the non-exclusive tasks plus nr exclusive ones in the
+ *   waitq.  nr == 0 means no exclusive limit (used by broadcast):
+ *   --nr then never reaches 0 and every waiter is woken.
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *   nr:    number of exclusive tasks to be woken up
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   Non-exclusive waiters sit at the head of the list (see
+ *   cfs_waitq_add_internal), so they are reached before any
+ *   exclusive waiter.
+ */
+
+
+void cfs_waitq_signal_nr(cfs_waitq_t *waitq, int nr)
+{
+    int     result;
+    cfs_waitlink_channel_t * scan;
+
+    LASSERT(waitq != NULL);
+    LASSERT(waitq->magic == CFS_WAITQ_MAGIC);
+
+    spin_lock(&waitq->guard);
+
+    list_for_each_entry(scan, &waitq->waiters, cfs_waitlink_channel_t, link) {
+
+        cfs_waitlink_t *waitl = scan->waitl;
+
+        /* set the task's event; result says whether it was newly set */
+        result = cfs_wake_event(waitl->event);
+        LASSERT( result == FALSE || result == TRUE );
+
+        if (result) {
+            /* record a pending wakeup the waiter can consume later */
+            atomic_inc(waitl->hits);
+        }
+
+        /* stop after waking nr exclusive waiters */
+        if ((waitl->flags & CFS_WAITQ_EXCLUSIVE) && --nr == 0)
+            break;
+    }
+
+    spin_unlock(&waitq->guard);
+    return;
+}
+
+/*
+ * cfs_waitq_signal
+ *   Wake up all the non-exclusive tasks and at most 1 exclusive one
+ *   (thin wrapper over cfs_waitq_signal_nr).
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *
+ * Return Value:
+ *   N/A
+ */
+
+void cfs_waitq_signal(cfs_waitq_t *waitq)
+{
+    cfs_waitq_signal_nr(waitq, 1);
+}
+
+
+/*
+ * cfs_waitq_broadcast
+ *   To wake up all the tasks in the waitq
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes: 
+ *   N/A
+ */
+
+void cfs_waitq_broadcast(cfs_waitq_t *waitq)
+{
+    LASSERT(waitq != NULL);
+    LASSERT(waitq->magic ==CFS_WAITQ_MAGIC);
+
+       cfs_waitq_signal_nr(waitq, 0);
+}
+
+/*
+ * cfs_waitq_wait
+ *   Wait on the link node until it is signaled.  A pending wakeup
+ *   recorded in the hits counter is consumed without blocking.
+ *
+ * Arguments:
+ *   link:  pointer to the cfs_waitlink_t structure
+ *   state: requested task state — ignored on this platform
+ *
+ * Return Value:
+ *   N/A
+ */
+
+void cfs_waitq_wait(cfs_waitlink_t *link, cfs_task_state_t state)
+{ 
+    LASSERT(link != NULL);
+    LASSERT(link->magic == CFS_WAITLINK_MAGIC);
+
+    if (atomic_read(link->hits) > 0) {
+        /* consume one pending wakeup instead of sleeping */
+        atomic_dec(link->hits);
+        /* guard against the counter going negative (wrapping) */
+        LASSERT((__u32)atomic_read(link->hits) < (__u32)0xFFFFFF00);
+    } else {
+        cfs_wait_event(link->event, 0);
+    }
+}
+
+/*
+ * cfs_waitq_timedwait
+ *   Wait for the link node to be signaled, with a timeout limit.  A
+ *   pending wakeup in the hits counter is consumed without blocking.
+ *
+ * Arguments:
+ *   link:   pointer to the cfs_waitlink_t structure
+ *   state:  requested task state — ignored on this platform
+ *   timeout: the timeout limitation
+ *
+ * Return Value:
+ *   Woken up: return the difference of the current time and
+ *             the timeout (TRUE when a pending hit was consumed)
+ *   Timeout:  return 0
+ *
+ * Notes:
+ *   What if it happens to be woken up at the just timeout time !?
+ */
+
+cfs_duration_t cfs_waitq_timedwait( cfs_waitlink_t *link,
+                                    cfs_task_state_t state,
+                                    cfs_duration_t timeout)
+{ 
+
+    if (atomic_read(link->hits) > 0) {
+        /* consume one pending wakeup instead of sleeping */
+        atomic_dec(link->hits);
+        LASSERT((__u32)atomic_read(link->hits) < (__u32)0xFFFFFF00);
+        return TRUE;
+    }
+
+    return (cfs_duration_t)cfs_wait_event(link->event, timeout);
+}
+
+
diff --git a/lnet/libcfs/winnt/winnt-tcpip.c b/lnet/libcfs/winnt/winnt-tcpip.c
new file mode 100644 (file)
index 0000000..d0c725c
--- /dev/null
@@ -0,0 +1,6706 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LIBCFS
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+#include <lnet/lnet.h>
+
+#define TDILND_MODULE_NAME L"Tdilnd"
+
+ks_data_t ks_data;
+
+/*
+ * ks_tdi_send_flags
+ *   Translate BSD-style MSG_* socket send flags into the equivalent
+ *   TDI_SEND_* flags for TdiSend requests.
+ */
+ULONG
+ks_tdi_send_flags(ULONG SockFlags)
+{
+    ULONG flags = 0;
+
+    if (cfs_is_flag_set(SockFlags, MSG_OOB))
+        cfs_set_flag(flags, TDI_SEND_EXPEDITED);
+
+    if (cfs_is_flag_set(SockFlags, MSG_MORE))
+        cfs_set_flag(flags, TDI_SEND_PARTIAL);
+
+    if (cfs_is_flag_set(SockFlags, MSG_DONTWAIT))
+        cfs_set_flag(flags, TDI_SEND_NON_BLOCKING);
+
+    return flags;
+}
+
+/*
+ * KsIrpCompletionRoutine
+ *   IRP completion routine: signals the event passed as Context (if
+ *   any) and keeps ownership of the IRP so the issuer can inspect and
+ *   free it (see KsSubmitTdiIrp).
+ */
+NTSTATUS
+KsIrpCompletionRoutine(
+    IN PDEVICE_OBJECT    DeviceObject,
+    IN PIRP              Irp,
+    IN PVOID             Context
+    )
+{
+    if (NULL != Context) {
+        KeSetEvent((PKEVENT)Context, IO_NETWORK_INCREMENT, FALSE);
+    }
+
+    /* stop I/O manager completion; the submitter frees the IRP */
+    return STATUS_MORE_PROCESSING_REQUIRED;
+
+    /* placed after return intentionally: marks the parameters as
+     * referenced for the compiler without generating code */
+    UNREFERENCED_PARAMETER(DeviceObject);
+    UNREFERENCED_PARAMETER(Irp);
+}
+
+
+/*
+ * KsBuildTdiIrp
+ *   Allocate a new IRP and initialize it to be issued to tdi
+ *
+ * Arguments:
+ *   DeviceObject:  device object created by the underlying
+ *                  TDI transport driver
+ *
+ * Return Value:
+ *   PIRP:   the allocated Irp in success or NULL in failure.
+ *
+ * NOTES:
+ *   N/A
+ */
+
+PIRP
+KsBuildTdiIrp(
+    IN PDEVICE_OBJECT    DeviceObject
+    )
+{
+    PIRP               Irp;
+    PIO_STACK_LOCATION IrpSp;
+
+    /* allocate an IRP sized for the target device's driver stack */
+    Irp = IoAllocateIrp(DeviceObject->StackSize, FALSE);
+    if (Irp == NULL)
+        return NULL;
+
+    /* prime the next stack location for a TDI internal ioctl */
+    IrpSp = IoGetNextIrpStackLocation(Irp);
+    IrpSp->MajorFunction = IRP_MJ_INTERNAL_DEVICE_CONTROL;
+    IrpSp->Parameters.DeviceIoControl.IoControlCode = 0;
+
+    return Irp;
+}
+
+/*
+ * KsSubmitTdiIrp
+ *   Issue the Irp to the underlying tdi driver
+ *
+ * Arguments:
+ *   DeviceObject:  the device object created by TDI driver
+ *   Irp:           the I/O request packet to be processed
+ *   bSynchronous:  synchronous or not. If true, we need wait
+ *                  until the process is finished.
+ *   Information:   returned info (only filled when bSynchronous)
+ *
+ * Return Value:
+ *   NTSTATUS:      kernel status code
+ *
+ * NOTES:
+ *   In the synchronous case this routine owns completion: the
+ *   completion routine only signals the event, and the IRP is freed
+ *   here.  In the asynchronous case the IRP is NOT freed by this
+ *   routine — the caller's completion path must handle it.
+ */
+
+NTSTATUS
+KsSubmitTdiIrp(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN BOOLEAN          bSynchronous,
+    OUT PULONG          Information
+    )
+{
+    NTSTATUS            Status;
+    KEVENT              Event;
+
+    if (bSynchronous) {
+
+        KeInitializeEvent(
+            &Event,
+            SynchronizationEvent,
+            FALSE
+            );
+
+
+        /* completion routine signals Event and returns
+         * STATUS_MORE_PROCESSING_REQUIRED, keeping the IRP alive */
+        IoSetCompletionRoutine(
+            Irp,
+            KsIrpCompletionRoutine,
+            &Event,
+            TRUE,
+            TRUE,
+            TRUE
+            );
+    }
+
+    Status = IoCallDriver(DeviceObject, Irp);
+
+    if (bSynchronous) {
+
+        if (STATUS_PENDING == Status) {
+
+            /* wait for the completion routine to signal */
+            Status = KeWaitForSingleObject(
+                        &Event,
+                        Executive,
+                        KernelMode,
+                        FALSE,
+                        NULL
+                        );
+        }
+
+        /* final status comes from the IRP itself */
+        Status = Irp->IoStatus.Status;
+
+        if (Information) {
+            *Information = (ULONG)(Irp->IoStatus.Information);
+        }
+
+        /* we own the IRP here: clear the MDL and free it */
+        Irp->MdlAddress = NULL;
+        IoFreeIrp(Irp);
+    }
+
+    if (!NT_SUCCESS(Status)) {
+
+        KsPrint((2, "KsSubmitTdiIrp: Error when submitting the Irp: Status = %xh (%s) ...\n",
+                    Status, KsNtStatusToString(Status)));
+    }
+
+    return (Status);
+}
+
+
+
+/*
+ * KsOpenControl
+ *   Open the Control Channel Object ...
+ *
+ * Arguments:
+ *   DeviceName:   the device name to be opened
+ *   Handle:       opened handle in success case
+ *   FileObject:   the fileobject of the device
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes:
+ *   On success the caller owns both the handle and a reference on the
+ *   FileObject; release them via KsCloseControl.
+ */
+
+NTSTATUS
+KsOpenControl(
+    IN PUNICODE_STRING      DeviceName,
+    OUT HANDLE *            Handle,
+    OUT PFILE_OBJECT *      FileObject
+   )
+{
+    NTSTATUS          Status = STATUS_SUCCESS;
+
+    OBJECT_ATTRIBUTES ObjectAttributes;
+    IO_STATUS_BLOCK   IoStatus;
+
+
+    /* ZwCreateFile must not be called at DISPATCH_LEVEL */
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+    //
+    // Initializing ...
+    //
+
+    InitializeObjectAttributes(
+        &ObjectAttributes,
+        DeviceName,
+        OBJ_CASE_INSENSITIVE |
+        OBJ_KERNEL_HANDLE,
+        NULL,
+        NULL
+        );
+
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+    //
+    // Creating the Transport Address Object ...
+    //
+
+    Status = ZwCreateFile(
+                Handle,
+                FILE_READ_DATA | FILE_WRITE_DATA,
+                &ObjectAttributes,
+                &IoStatus,
+                0,
+                FILE_ATTRIBUTE_NORMAL,
+                FILE_SHARE_READ | FILE_SHARE_WRITE,
+                FILE_OPEN,
+                0,
+                NULL,
+                0
+                );
+
+
+    if (NT_SUCCESS(Status)) {
+
+        //
+        // Now Obtaining the FileObject of the Transport Address ...
+        //
+
+        Status = ObReferenceObjectByHandle(
+                    *Handle,
+                    FILE_ANY_ACCESS,
+                    NULL,
+                    KernelMode,
+                    FileObject,
+                    NULL
+                    );
+
+        if (!NT_SUCCESS(Status)) {
+
+            /* referencing failed: close the handle we just opened so
+             * nothing leaks */
+            cfs_enter_debugger();
+            ZwClose(*Handle);
+        }
+
+    } else {
+
+        cfs_enter_debugger();
+    }
+
+    return (Status);
+}
+
+
+/*
+ * KsCloseControl
+ *   Release the Control Channel Handle and FileObject
+ *
+ * Arguments:
+ *   Handle:       the channel handle to be released
+ *   FileObject:   the fileobject to be released
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsCloseControl(
+    IN HANDLE             Handle,
+    IN PFILE_OBJECT       FileObject
+   )
+{
+    NTSTATUS rc = STATUS_SUCCESS;
+
+    /* ZwClose may block, so we must be below DISPATCH_LEVEL */
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+    /* drop the extra reference taken by ObReferenceObjectByHandle */
+    if (FileObject != NULL)
+        ObDereferenceObject(FileObject);
+
+    /* then close the kernel handle obtained from ZwCreateFile */
+    if (Handle != NULL)
+        rc = ZwClose(Handle);
+
+    ASSERT(NT_SUCCESS(rc));
+
+    return rc;
+}
+
+
+/*
+ * KsOpenAddress
+ *   Open the tdi address object
+ *
+ * Arguments:
+ *   DeviceName:   device name of the address object
+ *   pAddress:     tdi address of the address object
+ *   AddressLength: length in bytes of the tdi address
+ *   Handle:       the newly opened handle
+ *   FileObject:   the newly opened fileobject
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsOpenAddress(
+    IN PUNICODE_STRING      DeviceName,
+    IN PTRANSPORT_ADDRESS   pAddress,
+    IN ULONG                AddressLength,
+    OUT HANDLE *            Handle,
+    OUT PFILE_OBJECT *      FileObject
+   )
+{
+    NTSTATUS          Status = STATUS_SUCCESS;
+
+    PFILE_FULL_EA_INFORMATION Ea = NULL;
+    ULONG             EaLength;
+    UCHAR             EaBuffer[EA_MAX_LENGTH];
+
+    OBJECT_ATTRIBUTES ObjectAttributes;
+    IO_STATUS_BLOCK   IoStatus;
+
+    //
+    // Building EA for the Address Object to be Opened ...
+    // (EA name = TdiTransportAddress, EA value = the TRANSPORT_ADDRESS;
+    // the EA is what tells the transport to create an address object)
+    //
+
+    Ea = (PFILE_FULL_EA_INFORMATION)EaBuffer;
+    Ea->NextEntryOffset = 0;
+    Ea->Flags = 0;
+    Ea->EaNameLength = TDI_TRANSPORT_ADDRESS_LENGTH;
+    Ea->EaValueLength = (USHORT)AddressLength;
+    RtlCopyMemory(
+        &(Ea->EaName),
+        TdiTransportAddress,
+        Ea->EaNameLength + 1
+        );
+    RtlMoveMemory(
+        &(Ea->EaName[Ea->EaNameLength + 1]),
+        pAddress,
+        AddressLength
+        );
+    // NOTE(review): this sizeof-based length over-counts slightly compared
+    // with the "sizeof(...) - 1 + EaNameLength + 1 + value" formula used in
+    // KsOpenConnection; harmless (EaBuffer is EA_MAX_LENGTH) but the two
+    // should probably be unified -- confirm before changing.
+    EaLength =  sizeof(FILE_FULL_EA_INFORMATION) +
+                Ea->EaNameLength + AddressLength;
+
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+
+    //
+    // Initializing ...
+    //
+
+    InitializeObjectAttributes(
+        &ObjectAttributes,
+        DeviceName,
+        OBJ_CASE_INSENSITIVE |
+        OBJ_KERNEL_HANDLE,
+        NULL,
+        NULL
+        );
+
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+    //
+    // Creating the Transport Address Object ...
+    //
+
+    Status = ZwCreateFile(
+                Handle,
+                FILE_READ_DATA | FILE_WRITE_DATA,
+                &ObjectAttributes,
+                &IoStatus,
+                0,
+                FILE_ATTRIBUTE_NORMAL,
+                FILE_SHARE_READ | FILE_SHARE_WRITE, /* 0: DON'T REUSE */
+                FILE_OPEN,
+                0,
+                Ea,
+                EaLength
+                );
+
+
+    if (NT_SUCCESS(Status)) {
+
+        //
+        // Now Obtaining the FileObject of the Transport Address ...
+        // (extra object reference; dropped again by KsCloseAddress)
+        //
+
+        Status = ObReferenceObjectByHandle(
+                    *Handle,
+                    FILE_ANY_ACCESS,
+                    NULL,
+                    KernelMode,
+                    FileObject,
+                    NULL
+                    );
+
+        if (!NT_SUCCESS(Status)) {
+
+            // referencing failed: close the handle so nothing leaks
+            cfs_enter_debugger();
+            ZwClose(*Handle);
+        }
+
+    } else {
+
+        cfs_enter_debugger();
+    }
+
+    return (Status);
+}
+
+/*
+ * KsCloseAddress
+ *   Release the Handle and FileObject of an opened tdi
+ *   address object
+ *
+ * Arguments:
+ *   Handle:       the handle to be released
+ *   FileObject:   the fileobject to be released
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsCloseAddress(
+    IN HANDLE             Handle,
+    IN PFILE_OBJECT       FileObject
+)
+{
+    NTSTATUS rc = STATUS_SUCCESS;
+
+    /* drop the extra reference taken by ObReferenceObjectByHandle */
+    if (FileObject != NULL)
+        ObDereferenceObject(FileObject);
+
+    /* then close the kernel handle from ZwCreateFile */
+    if (Handle != NULL)
+        rc = ZwClose(Handle);
+
+    ASSERT(NT_SUCCESS(rc));
+
+    return rc;
+}
+
+
+/*
+ * KsOpenConnection
+ *   Open a tdi connection object
+ *
+ * Arguments:
+ *   DeviceName:   device name of the connection object
+ *   ConnectionContext: the connection context
+ *   Handle:       the newly opened handle
+ *   FileObject:   the newly opened fileobject
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsOpenConnection(
+    IN PUNICODE_STRING      DeviceName,
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    OUT HANDLE *            Handle,
+    OUT PFILE_OBJECT *      FileObject
+   )
+{
+    NTSTATUS            Status = STATUS_SUCCESS;
+
+    PFILE_FULL_EA_INFORMATION Ea = NULL;
+    ULONG               EaLength;
+    UCHAR               EaBuffer[EA_MAX_LENGTH];
+
+    OBJECT_ATTRIBUTES   ObjectAttributes;
+    IO_STATUS_BLOCK     IoStatus;
+
+    //
+    // Building EA for the Connection Object to be Opened ...
+    // (EA name = TdiConnectionContext, EA value = the caller's context
+    // pointer, copied by value into the EA buffer)
+    //
+
+    Ea = (PFILE_FULL_EA_INFORMATION)EaBuffer;
+    Ea->NextEntryOffset = 0;
+    Ea->Flags = 0;
+    Ea->EaNameLength = TDI_CONNECTION_CONTEXT_LENGTH;
+    Ea->EaValueLength = (USHORT)sizeof(CONNECTION_CONTEXT);
+    RtlCopyMemory(
+        &(Ea->EaName),
+        TdiConnectionContext,
+        Ea->EaNameLength + 1
+        );
+    RtlMoveMemory(
+        &(Ea->EaName[Ea->EaNameLength + 1]),
+        &ConnectionContext,
+        sizeof(CONNECTION_CONTEXT)
+        );
+    EaLength = sizeof(FILE_FULL_EA_INFORMATION) - 1 +
+                               Ea->EaNameLength + 1 + sizeof(CONNECTION_CONTEXT);
+
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+
+    //
+    // Initializing ...
+    //
+
+    InitializeObjectAttributes(
+        &ObjectAttributes,
+        DeviceName,
+        OBJ_CASE_INSENSITIVE |
+        OBJ_KERNEL_HANDLE,
+        NULL,
+        NULL
+        );
+
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+    //
+    // Creating the Connection Object ...
+    // (NULL allocation size; zero share access, i.e. the connection
+    // object is opened exclusively)
+    //
+
+    Status = ZwCreateFile(
+                Handle,
+                FILE_READ_DATA | FILE_WRITE_DATA,
+                &ObjectAttributes,
+                &IoStatus,
+                NULL,
+                FILE_ATTRIBUTE_NORMAL,
+                0,
+                FILE_OPEN,
+                0,
+                Ea,
+                EaLength
+                );
+
+
+    if (NT_SUCCESS(Status)) {
+
+        //
+        // Now Obtaining the FileObject of the Connection Object ...
+        // (extra object reference; dropped again by KsCloseConnection)
+        //
+
+        Status = ObReferenceObjectByHandle(
+                    *Handle,
+                    FILE_ANY_ACCESS,
+                    NULL,
+                    KernelMode,
+                    FileObject,
+                    NULL
+                    );
+
+        if (!NT_SUCCESS(Status)) {
+
+            // referencing failed: close the handle so nothing leaks
+            cfs_enter_debugger();
+            ZwClose(*Handle);
+        }
+
+    } else {
+
+        cfs_enter_debugger();
+    }
+
+    return (Status);
+}
+
+/*
+ * KsCloseConnection
+ *   Release the Handle and FileObject of an opened tdi
+ *   connection object
+ *
+ * Arguments:
+ *   Handle:       the handle to be released
+ *   FileObject:   the fileobject to be released
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsCloseConnection(
+    IN HANDLE             Handle,
+    IN PFILE_OBJECT       FileObject
+    )
+{
+    NTSTATUS rc = STATUS_SUCCESS;
+
+    /* drop the extra reference taken by ObReferenceObjectByHandle */
+    if (FileObject != NULL)
+        ObDereferenceObject(FileObject);
+
+    /* then close the kernel handle from ZwCreateFile */
+    if (Handle != NULL)
+        rc = ZwClose(Handle);
+
+    ASSERT(NT_SUCCESS(rc));
+
+    return rc;
+}
+
+
+/*
+ * KsAssociateAddress
+ *   Associate an address object with a connection object
+ *
+ * Arguments:
+ *   AddressHandle:  the handle of the address object
+ *   ConnectionObject:  the FileObject of the connection
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsAssociateAddress(
+    IN HANDLE           AddressHandle,
+    IN PFILE_OBJECT     ConnectionObject
+    )
+{
+    PDEVICE_OBJECT      DeviceObject;
+    PIRP                Irp;
+
+    /* the transport device that owns the connection object */
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    /* allocate a tdi internal irp for the request */
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (Irp == NULL) {
+        return (STATUS_INSUFFICIENT_RESOURCES);
+    }
+
+    /* fill the irp: associate the address object with the connection */
+    TdiBuildAssociateAddress(
+        Irp,
+        DeviceObject,
+        ConnectionObject,
+        NULL,
+        NULL,
+        AddressHandle
+        );
+
+    /* hand the prepared irp to the transport driver (synchronously) */
+    return KsSubmitTdiIrp(DeviceObject, Irp, TRUE, NULL);
+}
+
+
+/*
+ * KsDisassociateAddress
+ *   Disassociate the connection object (the relationship with
+ *   the corresponding address object will be dismissed. )
+ *
+ * Arguments:
+ *   ConnectionObject:  the FileObject of the connection
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsDisassociateAddress(
+    IN PFILE_OBJECT     ConnectionObject
+    )
+{
+    PDEVICE_OBJECT      DeviceObject;
+    PIRP                Irp;
+
+    /* the transport device that owns the connection object */
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    /* allocate a tdi internal irp for the request */
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (Irp == NULL) {
+        return (STATUS_INSUFFICIENT_RESOURCES);
+    }
+
+    /* fill the irp: break the association with the address object */
+    TdiBuildDisassociateAddress(
+        Irp,
+        DeviceObject,
+        ConnectionObject,
+        NULL,
+        NULL
+        );
+
+    /* hand the prepared irp to the transport driver (synchronously) */
+    return KsSubmitTdiIrp(DeviceObject, Irp, TRUE, NULL);
+}
+
+
+/*
+
+//
+// Connection Control Event Callbacks
+//
+
+TDI_EVENT_CONNECT
+TDI_EVENT_DISCONNECT
+TDI_EVENT_ERROR
+
+//
+// Tcp Event Callbacks
+//
+
+TDI_EVENT_RECEIVE
+TDI_EVENT_RECEIVE_EXPEDITED
+TDI_EVENT_CHAINED_RECEIVE
+TDI_EVENT_CHAINED_RECEIVE_EXPEDITED
+
+//
+// Udp Event Callbacks
+//
+
+TDI_EVENT_RECEIVE_DATAGRAM
+TDI_EVENT_CHAINED_RECEIVE_DATAGRAM
+
+*/
+
+
+/*
+ * KsSetEventHandlers
+ *   Set the tdi event callbacks with an address object
+ *
+ * Arguments:
+ *   AddressObject: the FileObject of the address object
+ *   EventContext:  the parameter for the callbacks
+ *   Handlers:      the handlers indicator array
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * NOTES:
+ *   N/A
+ */
+
+NTSTATUS
+KsSetEventHandlers(
+    IN PFILE_OBJECT                         AddressObject,  // Address File Object
+    IN PVOID                                EventContext,   // Context for Handlers
+    IN PKS_EVENT_HANDLERS                   Handlers        // Handlers indicator
+   )
+{
+    NTSTATUS             Status = STATUS_SUCCESS;
+    PDEVICE_OBJECT       DeviceObject;
+    USHORT               i = 0;
+
+    DeviceObject = IoGetRelatedDeviceObject(AddressObject);
+
+    // walk every known tdi event type and install the requested callbacks
+    for (i=0; i < TDI_EVENT_MAXIMUM_HANDLER; i++) {
+
+        //
+        // Setup the tdi event callback handler if requested.
+        //
+
+        if (Handlers->IsActive[i]) {
+
+            PIRP            Irp;
+
+            //
+            // Building Tdi Internal Irp ...
+            //
+
+            Irp = KsBuildTdiIrp(DeviceObject);
+
+            if (NULL == Irp) {
+
+                Status = STATUS_INSUFFICIENT_RESOURCES;
+
+            } else {
+
+                //
+                // Building the Irp to set the Event Handler ...
+                //
+
+                TdiBuildSetEventHandler(
+                    Irp,
+                    DeviceObject,
+                    AddressObject,
+                    NULL,
+                    NULL,
+                    i,                      /* tdi event type */
+                    Handlers->Handler[i],   /* tdi event handler */
+                    EventContext            /* context for the handler */
+                    );
+
+                //
+                // Calling the Transport Driver with the Prepared Irp
+                //
+
+                Status = KsSubmitTdiIrp(DeviceObject, Irp, TRUE, NULL);
+
+                //
+                // tcp/ip tdi does not support these two event callbacks,
+                // so failures for them are tolerated and ignored
+                //
+
+                if ((!NT_SUCCESS(Status)) && ( i == TDI_EVENT_SEND_POSSIBLE ||
+                     i == TDI_EVENT_CHAINED_RECEIVE_EXPEDITED )) {
+                    cfs_enter_debugger();
+                    Status = STATUS_SUCCESS;
+                }
+            }
+
+            // NOTE(review): a genuine failure bails out here, leaving any
+            // handlers installed by earlier iterations in place
+            if (!NT_SUCCESS(Status)) {
+                cfs_enter_debugger();
+                goto errorout;
+            }
+        }
+    }
+
+
+errorout:
+
+    if (!NT_SUCCESS(Status)) {
+
+        KsPrint((2, "KsSetEventHandlers: Error Status = %xh (%s)\n",
+                    Status, KsNtStatusToString(Status) ));
+    }
+
+    return (Status);
+}
+
+
+
+/*
+ * KsQueryAddressInfo
+ *   Query the address of the FileObject specified
+ *
+ * Arguments:
+ *   FileObject:  the FileObject to be queried
+ *   AddressInfo: buffer to contain the address info
+ *   AddressSize: length of the AddressInfo buffer
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsQueryAddressInfo(
+    PFILE_OBJECT            FileObject,
+    PTDI_ADDRESS_INFO       AddressInfo,
+    PULONG                  AddressSize
+   )
+{
+    NTSTATUS          Status = STATUS_UNSUCCESSFUL;
+    PIRP              Irp = NULL;
+    PMDL              Mdl;
+    PDEVICE_OBJECT    DeviceObject;
+
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+    DeviceObject = IoGetRelatedDeviceObject(FileObject);
+
+    RtlZeroMemory(AddressInfo, *(AddressSize));
+
+    //
+    // Allocating the Tdi Setting Irp ...
+    //
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+
+    } else {
+
+        //
+        // Locking the User Buffer / Allocating a MDL for it
+        //
+
+        Status = KsLockUserBuffer(
+                    AddressInfo,
+                    FALSE,
+                    *(AddressSize),
+                    IoModifyAccess,
+                    &Mdl
+                    );
+
+        if (!NT_SUCCESS(Status)) {
+
+            // lock failed: free the irp; Irp == NULL skips the query below
+            IoFreeIrp(Irp);
+            Irp = NULL;
+        }
+    }
+
+    if (Irp) {
+
+        LASSERT(NT_SUCCESS(Status));
+
+        // query TDI_QUERY_ADDRESS_INFO; the result lands in the
+        // Mdl-described AddressInfo buffer
+        TdiBuildQueryInformation(
+                    Irp,
+                    DeviceObject,
+                    FileObject,
+                    NULL,
+                    NULL,
+                    TDI_QUERY_ADDRESS_INFO,
+                    Mdl
+                    );
+
+        // NOTE(review): AddressSize is handed to the submit routine,
+        // presumably so it can report back the returned length -- confirm
+        // against KsSubmitTdiIrp
+        Status = KsSubmitTdiIrp(
+                    DeviceObject,
+                    Irp,
+                    TRUE,
+                    AddressSize
+                    );
+
+        KsReleaseMdl(Mdl, FALSE);
+    }
+
+    if (!NT_SUCCESS(Status)) {
+
+        cfs_enter_debugger();
+        //TDI_BUFFER_OVERFLOW
+    }
+
+    return (Status);
+}
+
+/*
+ * KsQueryProviderInfo
+ *   Query the underlying transport device's information
+ *
+ * Arguments:
+ *   TdiDeviceName:  the transport device's name string
+ *   ProviderInfo:   TDI_PROVIDER_INFO structure
+ *
+ * Return Value:
+ *   NTSTATUS:       Nt system status code
+ *
+ * NOTES:
+ *   N/A
+ */
+
+NTSTATUS
+KsQueryProviderInfo(
+    PWSTR               TdiDeviceName,
+    PTDI_PROVIDER_INFO  ProviderInfo
+   )
+{
+    NTSTATUS            Status = STATUS_SUCCESS;
+
+    PIRP                Irp = NULL;
+    PMDL                Mdl = NULL;
+
+    UNICODE_STRING      ControlName;
+
+    HANDLE              Handle;
+    PFILE_OBJECT        FileObject;
+    PDEVICE_OBJECT      DeviceObject;
+
+    ULONG               ProviderSize = 0;
+
+    RtlInitUnicodeString(&ControlName, TdiDeviceName);
+
+    //
+    // Open the Tdi Control Channel
+    // (the provider query is issued on the control channel, not on an
+    // address or connection object)
+    //
+
+    Status = KsOpenControl(
+                &ControlName,
+                &Handle,
+                &FileObject
+                );
+
+    if (!NT_SUCCESS(Status)) {
+
+        KsPrint((2, "KsQueryProviderInfo: Fail to open the tdi control channel.\n"));
+        return (Status);
+    }
+
+    //
+    // Obtain The Related Device Object
+    //
+
+    DeviceObject = IoGetRelatedDeviceObject(FileObject);
+
+    ProviderSize = sizeof(TDI_PROVIDER_INFO);
+    RtlZeroMemory(ProviderInfo, ProviderSize);
+
+    //
+    // Allocating the Tdi Setting Irp ...
+    //
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+
+    } else {
+
+        //
+        // Locking the User Buffer / Allocating a MDL for it
+        //
+
+        Status = KsLockUserBuffer(
+                    ProviderInfo,
+                    FALSE,
+                    ProviderSize,
+                    IoModifyAccess,
+                    &Mdl
+                    );
+
+        if (!NT_SUCCESS(Status)) {
+
+            // lock failed: free the irp; Irp == NULL skips the query below
+            IoFreeIrp(Irp);
+            Irp = NULL;
+        }
+    }
+
+    if (Irp) {
+
+        LASSERT(NT_SUCCESS(Status));
+
+        TdiBuildQueryInformation(
+                    Irp,
+                    DeviceObject,
+                    FileObject,
+                    NULL,
+                    NULL,
+                    TDI_QUERY_PROVIDER_INFO,
+                    Mdl
+                    );
+
+        Status = KsSubmitTdiIrp(
+                    DeviceObject,
+                    Irp,
+                    TRUE,
+                    &ProviderSize
+                    );
+
+        KsReleaseMdl(Mdl, FALSE);
+    }
+
+    if (!NT_SUCCESS(Status)) {
+
+        cfs_enter_debugger();
+        //TDI_BUFFER_OVERFLOW
+    }
+
+    // the control channel is only needed for the duration of the query
+    KsCloseControl(Handle, FileObject);
+
+    return (Status);
+}
+
+/*
+ * KsQueryConnectionInfo
+ *   Query the connection info of the FileObject specified
+ *   (some statistics data of the traffic)
+ *
+ * Arguments:
+ *   FileObject:     the FileObject to be queried
+ *   ConnectionInfo: buffer to contain the connection info
+ *   ConnectionSize: length of the ConnectionInfo buffer
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * NOTES:
+ *   N/A
+ */
+
+NTSTATUS
+KsQueryConnectionInfo(
+    PFILE_OBJECT            ConnectionObject,
+    PTDI_CONNECTION_INFO    ConnectionInfo,
+    PULONG                  ConnectionSize
+   )
+{
+    NTSTATUS          Status = STATUS_UNSUCCESSFUL;
+    PIRP              Irp = NULL;
+    PMDL              Mdl;
+    PDEVICE_OBJECT    DeviceObject;
+
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    RtlZeroMemory(ConnectionInfo, *(ConnectionSize));
+
+    //
+    // Allocating the Tdi Query Irp ...
+    //
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+
+    } else {
+
+        //
+        // Locking the User Buffer / Allocating a MDL for it
+        //
+
+        Status = KsLockUserBuffer(
+                    ConnectionInfo,
+                    FALSE,
+                    *(ConnectionSize),
+                    IoModifyAccess,
+                    &Mdl
+                    );
+
+        //
+        // FIX: free the Irp when the lock FAILED, not when it succeeded.
+        // The old test was inverted ("if (NT_SUCCESS(Status))"): on success
+        // it freed the Irp and skipped the query entirely, and on failure
+        // it submitted the Irp with an uninitialized Mdl.  This now mirrors
+        // KsQueryAddressInfo.
+        //
+
+        if (!NT_SUCCESS(Status)) {
+
+            IoFreeIrp(Irp);
+            Irp = NULL;
+        }
+    }
+
+    if (Irp) {
+
+        LASSERT(NT_SUCCESS(Status));
+
+        // query TDI_QUERY_CONNECTION_INFO; the result lands in the
+        // Mdl-described ConnectionInfo buffer
+        TdiBuildQueryInformation(
+                    Irp,
+                    DeviceObject,
+                    ConnectionObject,
+                    NULL,
+                    NULL,
+                    TDI_QUERY_CONNECTION_INFO,
+                    Mdl
+                    );
+
+        Status = KsSubmitTdiIrp(
+                    DeviceObject,
+                    Irp,
+                    TRUE,
+                    ConnectionSize
+                    );
+
+        KsReleaseMdl(Mdl, FALSE);
+    }
+
+    return (Status);
+}
+
+
+/*
+ * KsInitializeTdiAddress
+ *   Initialize the tdi address
+ *
+ * Arguments:
+ *   pTransportAddress: tdi address to be initialized
+ *   IpAddress:         the ip address of object
+ *   IpPort:            the ip port of the object
+ *
+ * Return Value:
+ *   ULONG: the total size of the tdi address
+ *
+ * NOTES:
+ *   N/A
+ */
+
+ULONG
+KsInitializeTdiAddress(
+    IN OUT PTA_IP_ADDRESS   pTransportAddress,
+    IN ULONG                IpAddress,
+    IN USHORT               IpPort
+    )
+{
+    /* fill in a single ipv4 TA_ADDRESS entry */
+    pTransportAddress->TAAddressCount = 1;
+
+    pTransportAddress->Address[0].AddressLength = TDI_ADDRESS_LENGTH_IP;
+    pTransportAddress->Address[0].AddressType   = TDI_ADDRESS_TYPE_IP;
+    pTransportAddress->Address[0].Address[0].in_addr  = IpAddress;
+    pTransportAddress->Address[0].Address[0].sin_port = IpPort;
+
+    /* total bytes occupied: the fixed header plus one ip address payload */
+    return (FIELD_OFFSET(TRANSPORT_ADDRESS, Address->Address) + TDI_ADDRESS_LENGTH_IP);
+}
+
+/*
+ * KsQueryTdiAddressLength
+ *   Query the total size of the tdi address
+ *
+ * Arguments:
+ *   pTransportAddress: tdi address to be queried
+ *
+ * Return Value:
+ *   ULONG: the total size of the tdi address
+ *
+ * NOTES:
+ *   N/A
+ */
+
+ULONG
+KsQueryTdiAddressLength(
+    PTRANSPORT_ADDRESS      pTransportAddress
+    )
+{
+    PTA_ADDRESS UNALIGNED   pTa;
+    ULONG                   Length;
+    LONG                    Index;
+
+    ASSERT (NULL != pTransportAddress);
+
+    /* fixed part: the TRANSPORT_ADDRESS prefix plus one TA_ADDRESS
+       header per entry */
+    Length = FIELD_OFFSET(TRANSPORT_ADDRESS, Address) +
+             FIELD_OFFSET(TA_ADDRESS, Address) * pTransportAddress->TAAddressCount;
+
+    /* walk the variable-length entries and add up their payload sizes */
+    pTa = (TA_ADDRESS UNALIGNED *)pTransportAddress->Address;
+
+    for (Index = 0; Index < pTransportAddress->TAAddressCount; Index++) {
+        Length += pTa->AddressLength;
+        pTa = (TA_ADDRESS UNALIGNED *)((PCHAR)pTa +
+                  FIELD_OFFSET(TA_ADDRESS, Address) + pTa->AddressLength);
+    }
+
+    return (Length);
+}
+
+
+/*
+ * KsQueryIpAddress
+ *   Query the ip address of the tdi object
+ *
+ * Arguments:
+ *   FileObject: tdi object to be queried
+ *   TdiAddress: TdiAddress buffer, to store the queried
+ *               tdi ip address
+ *   AddressLength: buffer length of the TdiAddress
+ *
+ * Return Value:
+ *   ULONG: the total size of the tdi ip address
+ *
+ * NOTES:
+ *   N/A
+ */
+
+NTSTATUS
+KsQueryIpAddress(
+    PFILE_OBJECT    FileObject,
+    PVOID           TdiAddress,
+    ULONG*          AddressLength
+    )
+{
+    NTSTATUS            rc;
+    PTDI_ADDRESS_INFO   Info;
+    ULONG               InfoLength;
+
+    /* scratch buffer sized for a TDI_ADDRESS_INFO carrying one
+       TRANSPORT_ADDRESS */
+    InfoLength = MAX_ADDRESS_LENGTH;
+
+    Info = (PTDI_ADDRESS_INFO)
+               ExAllocatePoolWithTag(
+                   NonPagedPool,
+                   InfoLength,
+                   'KSAI' );
+
+    if (Info == NULL) {
+        return (STATUS_INSUFFICIENT_RESOURCES);
+    }
+
+    /* ask the transport for the object's address information */
+    rc = KsQueryAddressInfo(
+             FileObject,
+             Info,
+             &InfoLength
+             );
+
+    if (NT_SUCCESS(rc)) {
+
+        if (*AddressLength < InfoLength) {
+
+            /* caller's buffer cannot hold the returned address */
+            rc = STATUS_BUFFER_TOO_SMALL;
+
+        } else {
+
+            /* copy out the transport address and report its length */
+            *AddressLength = InfoLength;
+            RtlCopyMemory(
+                TdiAddress,
+                &(Info->Address),
+                InfoLength
+                );
+
+            rc = STATUS_SUCCESS;
+        }
+    }
+
+    ExFreePool(Info);
+
+    return rc;
+}
+
+
+/*
+ * KsErrorEventHandler
+ *   the common error event handler callback
+ *
+ * Arguments:
+ *   TdiEventContext: should be the socket
+ *   Status: the error code
+ *
+ * Return Value:
+ *   Status: STATS_SUCCESS
+ *
+ * NOTES:
+ *   We need not do anything in such a severe
+ *   error case. System will process it for us.
+ */
+
+NTSTATUS
+KsErrorEventHandler(
+    IN PVOID        TdiEventContext,
+    IN NTSTATUS     Status
+   )
+{
+    // nothing to recover here -- the system processes the severe error
+    // itself; just trace the event and break into the debugger if attached
+    KsPrint((2, "KsErrorEventHandler called at Irql = %xh ...\n",
+                KeGetCurrentIrql()));
+
+    cfs_enter_debugger();
+
+    return (STATUS_SUCCESS);
+}
+
+
+/*
+ * ks_set_handlers
+ *   setup all the event handler callbacks
+ *
+ * Arguments:
+ *   tconn: the tdi connection object
+ *
+ * Return Value:
+ *   int: ks error code
+ *
+ * NOTES:
+ *   N/A
+ */
+
+int
+ks_set_handlers(
+    ksock_tconn_t *     tconn
+    )
+{
+    NTSTATUS            status = STATUS_SUCCESS;
+    KS_EVENT_HANDLERS   handlers;
+
+    /* to make sure the address object is opened already; otherwise there
+       is nothing to install the handlers on (status stays SUCCESS) */
+    if (tconn->kstc_addr.FileObject == NULL) {
+        goto errorout;
+    }
+
+    /* initialize the handlers indicator array. for sender and listener
+       there are different sets of callbacks. for child, we just return. */
+
+    memset(&handlers, 0, sizeof(KS_EVENT_HANDLERS));
+
+    SetEventHandler(handlers, TDI_EVENT_ERROR, KsErrorEventHandler);
+    SetEventHandler(handlers, TDI_EVENT_DISCONNECT, KsDisconnectEventHandler);
+    SetEventHandler(handlers, TDI_EVENT_RECEIVE, KsTcpReceiveEventHandler);
+    SetEventHandler(handlers, TDI_EVENT_RECEIVE_EXPEDITED, KsTcpReceiveExpeditedEventHandler);
+    SetEventHandler(handlers, TDI_EVENT_CHAINED_RECEIVE, KsTcpChainedReceiveEventHandler);
+
+    // SetEventHandler(handlers, TDI_EVENT_CHAINED_RECEIVE_EXPEDITED, KsTcpChainedReceiveExpeditedEventHandler);
+
+    /* only the listener gets the incoming-connection callback */
+    if (tconn->kstc_type == kstt_listener) {
+        SetEventHandler(handlers, TDI_EVENT_CONNECT, KsConnectEventHandler);
+    } else if (tconn->kstc_type == kstt_child) {
+        goto errorout;
+    }
+
+    /* set all the event callbacks */
+    status = KsSetEventHandlers(
+                tconn->kstc_addr.FileObject, /* Address File Object  */
+                tconn,                       /* Event Context */
+                &handlers                    /* Event callback handlers */
+                );
+
+errorout:
+
+    return cfs_error_code(status);
+}
+
+
+/*
+ * ks_reset_handlers
+ *   disable all the event handler callbacks (set to NULL)
+ *
+ * Arguments:
+ *   tconn: the tdi connection object
+ *
+ * Return Value:
+ *   int: ks error code
+ *
+ * NOTES:
+ *   N/A
+ */
+
+int
+ks_reset_handlers(
+    ksock_tconn_t *     tconn
+    )
+{
+    NTSTATUS            status = STATUS_SUCCESS;
+    KS_EVENT_HANDLERS   handlers;
+
+    /* to make sure the address object is opened already; otherwise there
+       is nothing to clear (status stays SUCCESS) */
+    if (tconn->kstc_addr.FileObject == NULL) {
+        goto errorout;
+    }
+
+    /* initialize the handlers indicator array with NULL handlers, which
+       clears the callbacks installed by ks_set_handlers (same event set).
+       for child, we just return. */
+
+    memset(&handlers, 0, sizeof(KS_EVENT_HANDLERS));
+
+    SetEventHandler(handlers, TDI_EVENT_ERROR, NULL);
+    SetEventHandler(handlers, TDI_EVENT_DISCONNECT, NULL);
+    SetEventHandler(handlers, TDI_EVENT_RECEIVE, NULL);
+    SetEventHandler(handlers, TDI_EVENT_RECEIVE_EXPEDITED, NULL);
+    SetEventHandler(handlers, TDI_EVENT_CHAINED_RECEIVE, NULL);
+    // SetEventHandler(handlers, TDI_EVENT_CHAINED_RECEIVE_EXPEDITED, NULL);
+
+    /* only the listener had the incoming-connection callback */
+    if (tconn->kstc_type == kstt_listener) {
+        SetEventHandler(handlers, TDI_EVENT_CONNECT, NULL);
+    } else if (tconn->kstc_type == kstt_child) {
+        goto errorout;
+    }
+
+    /* set all the event callbacks */
+    status = KsSetEventHandlers(
+                tconn->kstc_addr.FileObject, /* Address File Object  */
+                tconn,                       /* Event Context */
+                &handlers                    /* Event callback handlers */
+                );
+
+errorout:
+
+    return cfs_error_code(status);
+}
+
+
+/*
+ * KsAcceptCompletionRoutine
+ *   Irp completion routine for TdiBuildAccept (KsConnectEventHandler)
+ *
+ *   Here system gives us a chance to check the conneciton is built
+ *   ready or not.
+ *
+ * Arguments:
+ *   DeviceObject:  the device object of the transport driver
+ *   Irp:           the Irp is being completed.
+ *   Context:       the context we specified when issuing the Irp
+ *
+ * Return Value:
+ *   Nt status code
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsAcceptCompletionRoutine(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    )
+{
+    // Context is the child connection the accept irp was issued for
+    ksock_tconn_t * child = (ksock_tconn_t *) Context;
+    ksock_tconn_t * parent = child->child.kstc_parent;
+
+    KsPrint((2, "KsAcceptCompletionRoutine: called at Irql: %xh\n",
+                KeGetCurrentIrql() ));
+
+    KsPrint((2, "KsAcceptCompletionRoutine: Context = %xh Status = %xh\n",
+                 Context, Irp->IoStatus.Status));
+
+    LASSERT(child->kstc_type == kstt_child);
+
+    // the child's state is inspected and changed under its own lock
+    spin_lock(&(child->kstc_lock));
+
+    LASSERT(parent->kstc_state == ksts_listening);
+    LASSERT(child->kstc_state == ksts_connecting);
+
+    if (NT_SUCCESS(Irp->IoStatus.Status)) {
+
+        // accept succeeded: mark the child as an accepted, connected
+        // connection
+        child->child.kstc_accepted = TRUE;
+
+        child->kstc_state = ksts_connected;
+
+        /* wake up the daemon thread which waits on this event */
+        KeSetEvent(
+            &(parent->listener.kstc_accept_event),
+            0,
+            FALSE
+            );
+
+        spin_unlock(&(child->kstc_lock));
+
+        KsPrint((2, "KsAcceptCompletionRoutine: Get %xh now signal the event ...\n", parent));
+
+    } else {
+
+        /* accept failed: put the child back so it can be re-used */
+        child->child.kstc_accepted = FALSE;
+        child->child.kstc_busy = FALSE;
+        child->kstc_state = ksts_associated;
+
+        spin_unlock(&(child->kstc_lock));
+    }
+
+    /* now free the Irp */
+    IoFreeIrp(Irp);
+
+    /* drop the refer count of the child */
+    ks_put_tconn(child);
+
+    // we freed the irp ourselves, so the i/o manager must not touch it
+    // again: return STATUS_MORE_PROCESSING_REQUIRED
+    return (STATUS_MORE_PROCESSING_REQUIRED);
+}
+
+
+/*
+ * ks_get_vacancy_backlog
+ *   Get a vacant listening child from the backlog list
+ *
+ * Arguments:
+ *   parent: the listener daemon connection
+ *
+ * Return Value:
+ *   the child listening connection or NULL in failure
+ *
+ * Notes
+ *   Parent's lock should be acquired before calling.
+ */
+
+ksock_tconn_t *
+ks_get_vacancy_backlog(
+    ksock_tconn_t *  parent
+    )
+{
+    ksock_tconn_t *    child = NULL;
+    struct list_head * pos;
+
+    LASSERT(parent->kstc_type == kstt_listener);
+    LASSERT(parent->kstc_state == ksts_listening);
+
+    /* scan the backlog queue for the first child that is not busy */
+    if (!list_empty(&(parent->listener.kstc_listening.list))) {
+
+        list_for_each(pos, &(parent->listener.kstc_listening.list)) {
+
+            child = list_entry(pos, ksock_tconn_t, child.kstc_link);
+
+            spin_lock(&(child->kstc_lock));
+
+            if (!child->child.kstc_busy) {
+                /* claim it under the child's lock before anyone else can */
+                LASSERT(child->kstc_state == ksts_associated);
+                child->child.kstc_busy = TRUE;
+                spin_unlock(&(child->kstc_lock));
+                break;
+            }
+
+            spin_unlock(&(child->kstc_lock));
+            child = NULL;
+        }
+    }
+
+    return child;
+}
+
+ks_addr_slot_t *
+KsSearchIpAddress(PUNICODE_STRING  DeviceName)
+{
+    ks_addr_slot_t * found = NULL;
+    PLIST_ENTRY      entry;
+
+    /* scan the global address slot list for an entry whose device
+       name matches DeviceName (case-insensitive comparison) */
+
+    spin_lock(&ks_data.ksnd_addrs_lock);
+
+    for (entry  = ks_data.ksnd_addrs_list.Flink;
+         entry != &ks_data.ksnd_addrs_list;
+         entry  = entry->Flink) {
+
+        ks_addr_slot_t * slot;
+
+        slot = CONTAINING_RECORD(entry, ks_addr_slot_t, link);
+        if (RtlCompareUnicodeString(DeviceName, &slot->devname, TRUE) == 0) {
+            found = slot;
+            break;
+        }
+    }
+
+    spin_unlock(&ks_data.ksnd_addrs_lock);
+
+    /* NULL when no slot carries this device name */
+    return found;
+}
+
+/*
+ * KsCleanupIpAddresses
+ *   Drain the global ip address slot list, freeing every slot and
+ *   decrementing ksnd_naddrs for each; asserts the count reaches zero.
+ *
+ * Notes:
+ *   Runs entirely under ksnd_addrs_lock.
+ */
+
+void
+KsCleanupIpAddresses(void)
+{
+    /* "(void)" gives this definition a proper prototype; the original
+       empty parameter list left the arguments unspecified in C89/C99 */
+
+    spin_lock(&ks_data.ksnd_addrs_lock);
+
+    while (!IsListEmpty(&ks_data.ksnd_addrs_list)) {
+
+        ks_addr_slot_t * slot = NULL;
+        PLIST_ENTRY      list = NULL;
+
+        list = RemoveHeadList(&ks_data.ksnd_addrs_list);
+        slot = CONTAINING_RECORD(list, ks_addr_slot_t, link);
+        cfs_free(slot);
+        ks_data.ksnd_naddrs--;
+    }
+
+    cfs_assert(ks_data.ksnd_naddrs == 0);
+    spin_unlock(&ks_data.ksnd_addrs_lock);
+}
+
+/*
+ * KsAddAddressHandler
+ *   TDI PnP add-address notification: record a newly appeared ipv4
+ *   address in the global ks_data address list. A device seen for the
+ *   first time gets a fresh slot named "ethN".
+ *
+ * Arguments:
+ *   Address:    the TDI address that was added (only ipv4 is handled)
+ *   DeviceName: name of the device owning the address
+ *   Context:    opaque TDI pnp context (only used in the trace output)
+ */
+VOID
+KsAddAddressHandler(
+    IN  PTA_ADDRESS      Address,
+    IN  PUNICODE_STRING  DeviceName,
+    IN  PTDI_PNP_CONTEXT Context
+    )
+{
+    PTDI_ADDRESS_IP IpAddress = NULL;
+
+    if ( Address->AddressType == TDI_ADDRESS_TYPE_IP &&
+         Address->AddressLength == TDI_ADDRESS_LENGTH_IP ) {
+
+        ks_addr_slot_t * slot = NULL;
+
+        IpAddress = (PTDI_ADDRESS_IP) &Address->Address[0];
+        KsPrint((1, "KsAddAddressHandle: Device=%wZ Context=%xh IpAddress=%xh(%d.%d.%d.%d)\n",
+                  DeviceName, Context, IpAddress->in_addr,
+                   (IpAddress->in_addr & 0xFF000000) >> 24,
+                   (IpAddress->in_addr & 0x00FF0000) >> 16,
+                   (IpAddress->in_addr & 0x0000FF00) >> 8,
+                   (IpAddress->in_addr & 0x000000FF) >> 0 ));
+
+        /* NOTE(review): the lookup below and the insertion further down
+           are separate acquisitions of ksnd_addrs_lock; two concurrent
+           notifications for the same device could race and insert
+           duplicate slots — confirm TDI serializes these callbacks. */
+        slot = KsSearchIpAddress(DeviceName);
+
+        if (slot != NULL) {
+            /* known device: refresh its address and mark it up */
+            slot->up = TRUE;
+            slot->ip_addr = ntohl(IpAddress->in_addr);
+        } else {
+            /* new device: allocate a slot with room for the name copy
+               (allocation failure is silently ignored — best effort) */
+            slot = cfs_alloc(sizeof(ks_addr_slot_t) + DeviceName->Length, CFS_ALLOC_ZERO);
+            if (slot != NULL) {
+                spin_lock(&ks_data.ksnd_addrs_lock);
+                InsertTailList(&ks_data.ksnd_addrs_list, &slot->link);
+                sprintf(slot->iface, "eth%d", ks_data.ksnd_naddrs++);
+                slot->ip_addr = ntohl(IpAddress->in_addr);
+                slot->up = TRUE;
+                RtlMoveMemory(&slot->buffer[0], DeviceName->Buffer, DeviceName->Length);
+                slot->devname.Length = DeviceName->Length;
+                slot->devname.MaximumLength = DeviceName->Length + sizeof(WCHAR);
+                slot->devname.Buffer = slot->buffer;
+                spin_unlock(&ks_data.ksnd_addrs_lock);
+            }
+        }
+    }
+}
+
+VOID
+KsDelAddressHandler(
+    IN  PTA_ADDRESS      Address,
+    IN  PUNICODE_STRING  DeviceName,
+    IN  PTDI_PNP_CONTEXT Context
+    )
+{
+    PTDI_ADDRESS_IP IpAddress = NULL;
+    ks_addr_slot_t * found = NULL;
+
+    /* only ipv4 address notifications are of interest here */
+    if ( Address->AddressType != TDI_ADDRESS_TYPE_IP ||
+         Address->AddressLength != TDI_ADDRESS_LENGTH_IP ) {
+        return;
+    }
+
+    /* mark the matching device slot as down, if we know it */
+    found = KsSearchIpAddress(DeviceName);
+    if (found != NULL) {
+        found->up = FALSE;
+    }
+
+    IpAddress = (PTDI_ADDRESS_IP) &Address->Address[0];
+    KsPrint((1, "KsDelAddressHandle: Device=%wZ Context=%xh IpAddress=%xh(%d.%d.%d.%d)\n",
+              DeviceName, Context, IpAddress->in_addr,
+               (IpAddress->in_addr & 0xFF000000) >> 24,
+               (IpAddress->in_addr & 0x00FF0000) >> 16,
+               (IpAddress->in_addr & 0x0000FF00) >> 8,
+               (IpAddress->in_addr & 0x000000FF) >> 0 ));
+}
+
+/*
+ * KsRegisterPnpHandlers
+ *   Initialize the global address bookkeeping in ks_data and register
+ *   the add/del address handlers with the TDI layer.
+ *
+ * Return Value:
+ *   Nt status code from TdiRegisterPnPHandlers
+ */
+
+NTSTATUS
+KsRegisterPnpHandlers(void)
+{
+    /* "(void)" gives this definition a proper prototype; the original
+       empty parameter list left the arguments unspecified in C89/C99 */
+
+    TDI20_CLIENT_INTERFACE_INFO ClientInfo;
+
+    /* initialize the global ks_data members */
+    RtlInitUnicodeString(&ks_data.ksnd_client_name, TDILND_MODULE_NAME);
+    spin_lock_init(&ks_data.ksnd_addrs_lock);
+    InitializeListHead(&ks_data.ksnd_addrs_list);
+
+    /* register the pnp handlers */
+    RtlZeroMemory(&ClientInfo, sizeof(ClientInfo));
+    ClientInfo.TdiVersion = TDI_CURRENT_VERSION;
+
+    ClientInfo.ClientName = &ks_data.ksnd_client_name;
+    ClientInfo.AddAddressHandlerV2 =  KsAddAddressHandler;
+    ClientInfo.DelAddressHandlerV2 =  KsDelAddressHandler;
+
+    return TdiRegisterPnPHandlers(&ClientInfo, sizeof(ClientInfo),
+                                  &ks_data.ksnd_pnp_handle);
+}
+
+VOID
+KsDeregisterPnpHandlers()
+{
+    /* nothing to do when the handlers were never registered */
+    if (!ks_data.ksnd_pnp_handle) {
+        return;
+    }
+
+    /* de-register the pnp handlers from the tdi layer */
+    TdiDeregisterPnPHandlers(ks_data.ksnd_pnp_handle);
+    ks_data.ksnd_pnp_handle = NULL;
+
+    /* release all the ip address slots we have accumulated */
+    KsCleanupIpAddresses();
+}
+
+/*
+ * KsConnectEventHandler
+ *   Connect event handler, called by the underlying TDI
+ *   transport in response to an incoming request to the listening daemon.
+ *
+ *   it will grab a vacant backlog child from the children tconn list, and
+ *   build an accept Irp with it, then transfer the Irp to the TDI driver.
+ *
+ * Arguments:
+ *   TdiEventContext:  the tdi connection object of the listening daemon
+ *   ......
+ *
+ * Return Value:
+ *   Nt kernel status code
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsConnectEventHandler(
+    IN PVOID                    TdiEventContext,
+    IN LONG                     RemoteAddressLength,
+    IN PVOID                    RemoteAddress,
+    IN LONG                     UserDataLength,
+    IN PVOID                    UserData,
+    IN LONG                     OptionsLength,
+    IN PVOID                    Options,
+    OUT CONNECTION_CONTEXT *    ConnectionContext,
+    OUT PIRP *                  AcceptIrp
+    )
+{
+    ksock_tconn_t *             parent;
+    ksock_tconn_t *             child;
+
+    PFILE_OBJECT                FileObject;
+    PDEVICE_OBJECT              DeviceObject;
+    NTSTATUS                    Status;
+
+    PIRP                        Irp = NULL;
+    PTDI_CONNECTION_INFORMATION ConnectionInfo = NULL;
+
+    KsPrint((2,"KsConnectEventHandler: call at Irql: %u\n", KeGetCurrentIrql()));
+    parent = (ksock_tconn_t *) TdiEventContext;
+
+    LASSERT(parent->kstc_type == kstt_listener);
+
+    spin_lock(&(parent->kstc_lock));
+
+    if (parent->kstc_state == ksts_listening) {
+
+        /* allocate a new ConnectionInfo to backup the peer's info */
+
+        ConnectionInfo = (PTDI_CONNECTION_INFORMATION)ExAllocatePoolWithTag(
+                NonPagedPool, sizeof(TDI_CONNECTION_INFORMATION) +
+                RemoteAddressLength, 'iCsK' );
+
+        if (NULL == ConnectionInfo) {
+
+            Status = STATUS_INSUFFICIENT_RESOURCES;
+            cfs_enter_debugger();
+            goto errorout;
+        }
+
+        /* initializing ConnectionInfo structure: the remote address is
+           copied into the space allocated just past the structure */
+
+        ConnectionInfo->UserDataLength = UserDataLength;
+        ConnectionInfo->UserData = UserData;
+        ConnectionInfo->OptionsLength = OptionsLength;
+        ConnectionInfo->Options = Options;
+        ConnectionInfo->RemoteAddressLength = RemoteAddressLength;
+        ConnectionInfo->RemoteAddress = ConnectionInfo + 1;
+
+        RtlCopyMemory(
+                ConnectionInfo->RemoteAddress,
+                RemoteAddress,
+                RemoteAddressLength
+                );
+
+        /* grab a vacant listening child tdi connection */
+
+        child = ks_get_vacancy_backlog(parent);
+
+        if (child) {
+
+            spin_lock(&(child->kstc_lock));
+            child->child.kstc_info.ConnectionInfo = ConnectionInfo;
+            child->child.kstc_info.Remote = ConnectionInfo->RemoteAddress;
+            child->kstc_state = ksts_connecting;
+            spin_unlock(&(child->kstc_lock));
+
+        } else {
+
+            KsPrint((2, "KsConnectEventHandler: Not enough backlogs: refused the connection: %xh\n", parent));
+
+            Status = STATUS_INSUFFICIENT_RESOURCES;
+
+            goto errorout;
+        }
+
+        FileObject = child->child.kstc_info.FileObject;
+        DeviceObject = IoGetRelatedDeviceObject (FileObject);
+
+        Irp = KsBuildTdiIrp(DeviceObject);
+
+        if (NULL == Irp) {
+
+            /* fix: the Irp allocation was previously used unchecked.
+               Return the child to the backlog exactly as the failed
+               accept completion does, then refuse the connection. */
+            spin_lock(&(child->kstc_lock));
+            child->child.kstc_info.ConnectionInfo = NULL;
+            child->child.kstc_info.Remote = NULL;
+            child->child.kstc_busy = FALSE;
+            child->kstc_state = ksts_associated;
+            spin_unlock(&(child->kstc_lock));
+
+            Status = STATUS_INSUFFICIENT_RESOURCES;
+            goto errorout;
+        }
+
+        TdiBuildAccept(
+                Irp,
+                DeviceObject,
+                FileObject,
+                KsAcceptCompletionRoutine,
+                child,
+                NULL,
+                NULL
+                );
+
+        IoSetNextIrpStackLocation(Irp);
+
+        /* grab a reference on the child tdi connection for the Irp */
+        ks_get_tconn(child);
+
+        Status = STATUS_MORE_PROCESSING_REQUIRED;
+
+        *AcceptIrp = Irp;
+        *ConnectionContext = child;
+
+    } else {
+
+        Status = STATUS_CONNECTION_REFUSED;
+        goto errorout;
+    }
+
+    spin_unlock(&(parent->kstc_lock));
+
+    return Status;
+
+errorout:
+
+    spin_unlock(&(parent->kstc_lock));
+
+    {
+        *AcceptIrp = NULL;
+        *ConnectionContext = NULL;
+
+        if (ConnectionInfo) {
+
+            ExFreePool(ConnectionInfo);
+        }
+
+        if (Irp) {
+
+            IoFreeIrp (Irp);
+        }
+    }
+
+    return Status;
+}
+
+/*
+ * KsDisconnectCompletionRoutine
+ *   the Irp completion routine for TdiBuildDisconnect
+ *
+ *   We just signal the event and return MORE_PRO... to
+ *   let the caller take the responsibility of the Irp.
+ *
+ * Arguments:
+ *   DeviceObject:  the device object of the transport
+ *   Irp:           the Irp is being completed.
+ *   Context:       the event specified by the caller
+ *
+ * Return Value:
+ *   Nt status code
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsDisconectCompletionRoutine (
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    )
+{
+    /* fix: UNREFERENCED_PARAMETER used to sit after the return
+       statement, where it was unreachable dead code */
+    UNREFERENCED_PARAMETER(DeviceObject);
+    UNREFERENCED_PARAMETER(Irp);
+
+    /* wake the waiter: Context is the event supplied by the caller */
+    KeSetEvent((PKEVENT) Context, 0, FALSE);
+
+    /* the caller keeps ownership of the Irp */
+    return STATUS_MORE_PROCESSING_REQUIRED;
+}
+
+
+/*
+ * KsDisconnectHelper
+ *   the routine to be executed in the WorkItem procedure
+ *   this routine is to disconnect a tdi connection
+ *
+ * Arguments:
+ *   Workitem:  the context transferred to the workitem
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   tconn is already referred in abort_connecton ...
+ */
+
+VOID
+KsDisconnectHelper(PKS_DISCONNECT_WORKITEM WorkItem)
+{
+    ksock_tconn_t * conn = WorkItem->tconn;
+
+    DbgPrint("KsDisconnectHelper: disconnecting tconn=%p\n", conn);
+
+    /* perform the disconnection with the flags queued in the workitem */
+    ks_disconnect_tconn(conn, WorkItem->Flags);
+
+    /* notify whoever is waiting on the workitem's event */
+    KeSetEvent(&(WorkItem->Event), 0, FALSE);
+
+    /* allow a new disconnect workitem to be queued for this tconn */
+    spin_lock(&(conn->kstc_lock));
+    cfs_clear_flag(conn->kstc_flags, KS_TCONN_DISCONNECT_BUSY);
+    spin_unlock(&(conn->kstc_lock));
+
+    /* drop the reference taken when the workitem was queued */
+    ks_put_tconn(conn);
+}
+
+
+/*
+ * KsDisconnectEventHandler
+ *   Disconnect event handler event handler, called by the underlying TDI transport
+ *   in response to an incoming disconnection notification from a remote node.
+ *
+ * Arguments:
+ *   ConnectionContext:  tdi connection object
+ *   DisconnectFlags:    specifies the nature of the disconnection
+ *   ......
+ *
+ * Return Value:
+ *   Nt kernel status code
+ *
+ * Notes:
+ *   N/A
+ */
+
+
+NTSTATUS
+KsDisconnectEventHandler(
+    IN PVOID                TdiEventContext,
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN LONG                 DisconnectDataLength,
+    IN PVOID                DisconnectData,
+    IN LONG                 DisconnectInformationLength,
+    IN PVOID                DisconnectInformation,
+    IN ULONG                DisconnectFlags
+    )
+{
+    ksock_tconn_t *         tconn;
+    PKS_DISCONNECT_WORKITEM WorkItem;
+
+    /* fix: Status was previously left uninitialized (and returned)
+       when the connection was in ksts_connected state but neither
+       TDI_DISCONNECT_ABORT nor TDI_DISCONNECT_RELEASE was set */
+    NTSTATUS                Status = STATUS_SUCCESS;
+
+    tconn = (ksock_tconn_t *)ConnectionContext;
+
+    KsPrint((2, "KsTcpDisconnectEventHandler: called at Irql: %xh\n",
+                KeGetCurrentIrql() ));
+
+    KsPrint((2, "tconn = %x DisconnectFlags= %xh\n",
+                 tconn, DisconnectFlags));
+
+    ks_get_tconn(tconn);
+    spin_lock(&(tconn->kstc_lock));
+
+    WorkItem = &(tconn->kstc_disconnect);
+
+    if (tconn->kstc_state != ksts_connected) {
+
+        Status = STATUS_SUCCESS;
+
+    } else {
+
+        /* map the TDI disconnect nature onto a status code */
+        if (cfs_is_flag_set(DisconnectFlags, TDI_DISCONNECT_ABORT)) {
+
+            Status = STATUS_REMOTE_DISCONNECT;
+
+        } else if (cfs_is_flag_set(DisconnectFlags, TDI_DISCONNECT_RELEASE)) {
+
+            Status = STATUS_GRACEFUL_DISCONNECT;
+        }
+
+        /* queue a single disconnect workitem unless one is pending */
+        if (!cfs_is_flag_set(tconn->kstc_flags, KS_TCONN_DISCONNECT_BUSY)) {
+
+            /* extra reference dropped by KsDisconnectHelper */
+            ks_get_tconn(tconn);
+
+            WorkItem->Flags = DisconnectFlags;
+            WorkItem->tconn = tconn;
+
+            cfs_set_flag(tconn->kstc_flags, KS_TCONN_DISCONNECT_BUSY);
+
+            /* queue the workitem to call */
+            ExQueueWorkItem(&(WorkItem->WorkItem), DelayedWorkQueue);
+        }
+    }
+
+    spin_unlock(&(tconn->kstc_lock));
+    ks_put_tconn(tconn);
+
+    return  (Status);
+}
+
+/*
+ * KsTcpReceiveCompletionRoutine
+ *   User completion routine for receive Irps: on success it clears the
+ *   RECEIVING flag of the Tsdu slot that was being filled, wakes the
+ *   waiter and notifies the scheduler; on failure it aborts the tconn.
+ *
+ * Arguments:
+ *   Irp:     the receive Irp being completed
+ *   Context: the KS_TCP_COMPLETION_CONTEXT built with the Irp
+ *
+ * Return Value:
+ *   the Irp's completion status
+ */
+NTSTATUS
+KsTcpReceiveCompletionRoutine(
+    IN PIRP                         Irp,
+    IN PKS_TCP_COMPLETION_CONTEXT   Context
+    )
+{
+    NTSTATUS Status = Irp->IoStatus.Status;
+
+    if (NT_SUCCESS(Status)) {
+
+        ksock_tconn_t *tconn = Context->tconn;
+
+        /* the same pointer is viewed as DAT or BUF; TsduType below
+           decides which interpretation is the live one */
+        PKS_TSDU_DAT  KsTsduDat = Context->CompletionContext;
+        PKS_TSDU_BUF  KsTsduBuf = Context->CompletionContext;
+
+        KsPrint((1, "KsTcpReceiveCompletionRoutine: Total %xh bytes.\n",
+                   Context->KsTsduMgr->TotalBytes ));
+
+        spin_lock(&(tconn->kstc_lock));
+
+        if (TSDU_TYPE_DAT == KsTsduDat->TsduType) {
+            if (cfs_is_flag_set(KsTsduDat->TsduFlags, KS_TSDU_DAT_RECEIVING)) {
+                cfs_clear_flag(KsTsduDat->TsduFlags, KS_TSDU_DAT_RECEIVING);
+            } else {
+                cfs_enter_debugger();
+            }
+        } else {
+            ASSERT(TSDU_TYPE_BUF == KsTsduBuf->TsduType);
+            if (cfs_is_flag_set(KsTsduBuf->TsduFlags, KS_TSDU_BUF_RECEIVING)) {
+                cfs_clear_flag(KsTsduBuf->TsduFlags, KS_TSDU_BUF_RECEIVING);
+            } else {
+                cfs_enter_debugger();
+            }
+        }
+
+        spin_unlock(&(tconn->kstc_lock));
+
+        /* wake up the thread waiting for the completion of this Irp */
+        KeSetEvent(Context->Event, 0, FALSE);
+
+        /* re-active the ks connection and wake up the scheduler */
+        if (tconn->kstc_conn && tconn->kstc_sched_cb) {
+            tconn->kstc_sched_cb( tconn, FALSE, NULL,
+                                  Context->KsTsduMgr->TotalBytes );
+        }
+
+    } else {
+
+        /* un-expected errors occur, we must abort the connection */
+        ks_abort_tconn(Context->tconn);
+    }
+
+    /* NOTE(review): Context was already dereferenced unconditionally on
+       both paths above, so this guard is redundant — kept as-is */
+    if (Context) {
+
+        /* Freeing the Context structure... */
+        ExFreePool(Context);
+        Context = NULL;
+    }
+
+
+    /* free the Irp */
+    if (Irp) {
+        IoFreeIrp(Irp);
+    }
+
+    return (Status);
+}
+
+
+/*
+ * KsTcpCompletionRoutine
+ *   the Irp completion routine for TdiBuildSend and TdiBuildReceive ...
+ *   We need to call the user's own CompletionRoutine if specified. Or,
+ *   it's a synchronous case, we need signal the event.
+ *
+ * Arguments:
+ *   DeviceObject:  the device object of the transport
+ *   Irp:           the Irp is being completed.
+ *   Context:       the context we specified when issuing the Irp
+ *
+ * Return Value:
+ *   Nt status code
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsTcpCompletionRoutine(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    )
+{
+    if (Context) {
+
+        PKS_TCP_COMPLETION_CONTEXT  CompletionContext = NULL;
+        ksock_tconn_t * tconn = NULL;
+
+        CompletionContext = (PKS_TCP_COMPLETION_CONTEXT) Context;
+        tconn = CompletionContext->tconn;
+
+        /* release the chained mdl */
+        KsReleaseMdl(Irp->MdlAddress, FALSE);
+        Irp->MdlAddress = NULL;
+
+        if (CompletionContext->CompletionRoutine) {
+
+            /* NOTE(review): when bCounted and the shared refcount has
+               not yet reached zero, the goto below also skips the
+               ks_put_tconn() call at the bottom — presumably each
+               counted completion holds its own reference; confirm. */
+            if ( CompletionContext->bCounted &&
+                 InterlockedDecrement(&CompletionContext->ReferCount) != 0 ) {
+                    goto errorout;
+            }
+
+            //
+            // Giving control to user specified CompletionRoutine ...
+            //
+
+            CompletionContext->CompletionRoutine(
+                    Irp,
+                    CompletionContext
+                    );
+
+        } else {
+
+            //
+            // Signaling  the Event ...
+            //
+
+            KeSetEvent(CompletionContext->Event, 0, FALSE);
+        }
+
+        /* drop the reference count of the tconn object */
+        ks_put_tconn(tconn);
+
+    } else {
+
+        /* a completion with no context is unexpected */
+        cfs_enter_debugger();
+    }
+
+errorout:
+
+    /* the Irp's further processing stops here; ownership stays with
+       whoever dispatches the user CompletionRoutine */
+    return STATUS_MORE_PROCESSING_REQUIRED;
+}
+
+/*
+ * KsTcpSendCompletionRoutine
+ *   the user specified Irp completion routine for asynchronous
+ *   data transmission requests.
+ *
+ *   It will do the cleanup job of the ksock_tx_t and wake up the
+ *   ks scheduler thread
+ *
+ * Arguments:
+ *   Irp:           the Irp is being completed.
+ *   Context:       the context we specified when issuing the Irp
+ *
+ * Return Value:
+ *   Nt status code
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsTcpSendCompletionRoutine(
+    IN PIRP                         Irp,
+    IN PKS_TCP_COMPLETION_CONTEXT   Context
+    )
+{
+    NTSTATUS        Status = Irp->IoStatus.Status;
+    /* Information carries the number of bytes actually transferred */
+    ULONG           rc = Irp->IoStatus.Information;
+    ksock_tconn_t * tconn = Context->tconn;
+    PKS_TSDUMGR     KsTsduMgr = Context->KsTsduMgr;
+
+    ENTRY;
+
+    LASSERT(tconn) ;
+
+    if (NT_SUCCESS(Status)) {
+
+        /* bCounted distinguishes a direct (zero-copy) tx send from a
+           buffered Tsdu send */
+        if (Context->bCounted) {
+            PVOID   tx = Context->CompletionContext;
+
+            ASSERT(tconn->kstc_update_tx != NULL);
+
+            /* update the tx, rebasing the kiov or iov pointers */
+            tx = tconn->kstc_update_tx(tconn, tx, rc);
+
+            /* update the KsTsudMgr total bytes */
+            spin_lock(&tconn->kstc_lock);
+            KsTsduMgr->TotalBytes -= rc;
+            spin_unlock(&tconn->kstc_lock);
+
+            /*
+             * now it's time to re-queue the conns into the
+             * scheduler queue and wake the scheduler thread.
+             */
+
+            if (tconn->kstc_conn && tconn->kstc_sched_cb) {
+                tconn->kstc_sched_cb( tconn, TRUE, tx, 0);
+            }
+
+        } else {
+
+            /* the completion contexts view the same memory as the
+               containing Tsdu and the BUF/DAT record inside it */
+            PKS_TSDU            KsTsdu = Context->CompletionContext;
+            PKS_TSDU_BUF        KsTsduBuf = Context->CompletionContext2;
+            PKS_TSDU_DAT        KsTsduDat = Context->CompletionContext2;
+
+            spin_lock(&tconn->kstc_lock);
+            /* This is bufferred sending ... */
+            ASSERT(KsTsduBuf->StartOffset == 0);
+
+            if (KsTsduBuf->DataLength > Irp->IoStatus.Information) {
+                /* not fully sent .... we have to abort the connection */
+                spin_unlock(&tconn->kstc_lock);
+                ks_abort_tconn(tconn);
+                goto errorout;
+            }
+
+            if (KsTsduBuf->TsduType  == TSDU_TYPE_BUF) {
+                /* free the buffer */
+                ExFreePool(KsTsduBuf->UserBuffer);
+                KsTsduMgr->TotalBytes -= KsTsduBuf->DataLength;
+                KsTsdu->StartOffset   += sizeof(KS_TSDU_BUF);
+            } else if (KsTsduDat->TsduType  == TSDU_TYPE_DAT) {
+                KsTsduMgr->TotalBytes -= KsTsduDat->DataLength;
+                KsTsdu->StartOffset   += KsTsduDat->TotalLength;
+            } else {
+                cfs_enter_debugger(); /* should not get here */
+            }
+
+            /* retire the Tsdu once every record in it is consumed */
+            if (KsTsdu->StartOffset == KsTsdu->LastOffset) {
+
+                list_del(&KsTsdu->Link);
+                KsTsduMgr->NumOfTsdu--;
+                KsPutKsTsdu(KsTsdu);
+            }
+
+            spin_unlock(&tconn->kstc_lock);
+        }
+
+    } else {
+
+        /* cfs_enter_debugger(); */
+
+        /*
+         *  for the case that the transmission is unsuccessful,
+         *  we need abort the tdi connection, but not destroy it.
+         *  the socknal conn will drop the refer count, then the
+         *  tdi connection will be freed.
+         */
+
+        ks_abort_tconn(tconn);
+    }
+
+errorout:
+
+    /* freeing the Context structure... */
+
+    if (Context) {
+        ExFreePool(Context);
+        Context = NULL;
+    }
+
+    /* it's our duty to free the Irp. */
+
+    if (Irp) {
+        IoFreeIrp(Irp);
+        Irp = NULL;
+    }
+
+    EXIT;
+
+    return Status;
+}
+
+/*
+ *  Normal receive event handler
+ *
+ *  It will move data from system Tsdu to our TsduList
+ */
+
+/*
+ * KsTcpReceiveEventHandler
+ *   copies an entirely-indicated Tsdu straight into our Tsdu queue, or
+ *   builds a receive Irp to pull the remaining bytes from the transport
+ *   when only part of the message is indicated.
+ *
+ * Return Value:
+ *   STATUS_SUCCESS when the indicated data was consumed in place, or
+ *   STATUS_MORE_PROCESSING_REQUIRED when a new Irp is handed back via
+ *   IoRequestPacket.
+ */
+NTSTATUS
+KsTcpReceiveEventHandler(
+    IN PVOID                TdiEventContext,
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN ULONG                ReceiveFlags,
+    IN ULONG                BytesIndicated,
+    IN ULONG                BytesAvailable,
+    OUT ULONG *             BytesTaken,
+    IN PVOID                Tsdu,
+    OUT PIRP *              IoRequestPacket
+   )
+{
+    NTSTATUS            Status;
+
+    ksock_tconn_t *     tconn;
+
+    PKS_CHAIN           KsChain;
+    PKS_TSDUMGR         KsTsduMgr;
+    PKS_TSDU            KsTsdu;
+    PKS_TSDU_DAT        KsTsduDat;
+    PKS_TSDU_BUF        KsTsduBuf;
+
+    BOOLEAN             bIsExpedited;
+    BOOLEAN             bIsCompleteTsdu;
+
+    BOOLEAN             bNewTsdu = FALSE;
+    BOOLEAN             bNewBuff = FALSE;
+
+    PCHAR               Buffer = NULL;
+
+    PIRP                Irp = NULL;
+    PMDL                Mdl = NULL;
+    PFILE_OBJECT        FileObject;
+    PDEVICE_OBJECT      DeviceObject;
+
+    ULONG               BytesReceived = 0;
+
+    PKS_TCP_COMPLETION_CONTEXT context = NULL;
+
+
+    tconn = (ksock_tconn_t *) ConnectionContext;
+
+    ks_get_tconn(tconn);
+
+    /* check whether the whole body of payload is received or not */
+    if ( (cfs_is_flag_set(ReceiveFlags, TDI_RECEIVE_ENTIRE_MESSAGE)) &&
+         (BytesIndicated == BytesAvailable) ) {
+        bIsCompleteTsdu = TRUE;
+    } else {
+        bIsCompleteTsdu = FALSE;
+    }
+
+    bIsExpedited = cfs_is_flag_set(ReceiveFlags, TDI_RECEIVE_EXPEDITED);
+
+    KsPrint((2, "KsTcpReceiveEventHandler BytesIndicated = %d BytesAvailable = %d ...\n", BytesIndicated, BytesAvailable));
+    KsPrint((2, "bIsCompleteTsdu = %d bIsExpedited = %d\n", bIsCompleteTsdu, bIsExpedited ));
+
+    spin_lock(&(tconn->kstc_lock));
+
+    /* check whether we are connected (sender or accepted child);
+       listeners and non-connected states are rejected */
+    if ( !((tconn->kstc_state == ksts_connected) &&
+           (tconn->kstc_type == kstt_sender ||
+            tconn->kstc_type == kstt_child))) {
+
+        *BytesTaken = BytesIndicated;
+
+        spin_unlock(&(tconn->kstc_lock));
+        ks_put_tconn(tconn);
+
+        return (STATUS_SUCCESS);
+    }
+
+    if (tconn->kstc_type == kstt_sender) {
+        KsChain = &(tconn->sender.kstc_recv);
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        KsChain = &(tconn->child.kstc_recv);
+    }
+
+    if (bIsExpedited) {
+        KsTsduMgr = &(KsChain->Expedited);
+    } else {
+        KsTsduMgr = &(KsChain->Normal);
+    }
+
+    /* if the Tsdu is even larger than the biggest Tsdu, we have
+       to allocate new buffer and use TSDU_TYPE_BUF to store it */
+
+    if ( KS_TSDU_STRU_SIZE(BytesAvailable) > ks_data.ksnd_tsdu_size -
+         KS_DWORD_ALIGN(sizeof(KS_TSDU))) {
+        bNewBuff = TRUE;
+    }
+
+    /* retrieve the latest Tsdu buffer from the TsduMgr
+       list if the list is not empty. */
+
+    if (list_empty(&(KsTsduMgr->TsduList))) {
+
+        LASSERT(KsTsduMgr->NumOfTsdu == 0);
+        KsTsdu = NULL;
+
+    } else {
+
+        LASSERT(KsTsduMgr->NumOfTsdu > 0);
+        KsTsdu = list_entry(KsTsduMgr->TsduList.prev, KS_TSDU, Link);
+
+        /* if this Tsdu does not contain enough space, we need
+           allocate a new Tsdu queue. */
+
+        if (bNewBuff) {
+            if ( KsTsdu->LastOffset + sizeof(KS_TSDU_BUF) >
+                 KsTsdu->TotalLength )  {
+                KsTsdu = NULL;
+            }
+        } else {
+            if ( KS_TSDU_STRU_SIZE(BytesAvailable) >
+                 KsTsdu->TotalLength - KsTsdu->LastOffset ) {
+                KsTsdu = NULL;
+            }
+        }
+    }
+
+    /* allocating the buffer for TSDU_TYPE_BUF */
+    if (bNewBuff) {
+        Buffer = ExAllocatePool(NonPagedPool, BytesAvailable);
+        if (NULL == Buffer) {
+            /* there's no enough memory for us. We just try to
+               receive maximum bytes with a new Tsdu */
+            bNewBuff = FALSE;
+            KsTsdu = NULL;
+        }
+    }
+
+    /* allocate a new Tsdu in case we are not satisfied. */
+
+    if (NULL == KsTsdu) {
+
+        KsTsdu = KsAllocateKsTsdu();
+
+        if (NULL == KsTsdu) {
+            goto errorout;
+        } else {
+            bNewTsdu = TRUE;
+        }
+    }
+
+    /* both views point at the free space at the tail of the Tsdu;
+       only one of them is committed below depending on bNewBuff */
+    KsTsduBuf = (PKS_TSDU_BUF)((PUCHAR)KsTsdu + KsTsdu->LastOffset);
+    KsTsduDat = (PKS_TSDU_DAT)((PUCHAR)KsTsdu + KsTsdu->LastOffset);
+
+    if (bNewBuff) {
+
+        /* setup up the KS_TSDU_BUF record */
+
+        KsTsduBuf->TsduType     = TSDU_TYPE_BUF;
+        KsTsduBuf->TsduFlags    = 0;
+        KsTsduBuf->StartOffset  = 0;
+        KsTsduBuf->UserBuffer   = Buffer;
+        KsTsduBuf->DataLength   = BytesReceived = BytesAvailable;
+
+        KsTsdu->LastOffset += sizeof(KS_TSDU_BUF);
+
+    } else {
+
+        /* setup the KS_TSDU_DATA to contain all the messages */
+
+        KsTsduDat->TsduType     =  TSDU_TYPE_DAT;
+        KsTsduDat->TsduFlags    = 0;
+
+        if ( KsTsdu->TotalLength - KsTsdu->LastOffset >=
+            KS_TSDU_STRU_SIZE(BytesAvailable) ) {
+            BytesReceived = BytesAvailable;
+        } else {
+            /* only as much as fits, rounded down to dword alignment */
+            BytesReceived = KsTsdu->TotalLength - KsTsdu->LastOffset -
+                            FIELD_OFFSET(KS_TSDU_DAT, Data);
+            BytesReceived &= (~((ULONG)3));
+        }
+        KsTsduDat->DataLength   =  BytesReceived;
+        KsTsduDat->TotalLength  =  KS_TSDU_STRU_SIZE(BytesReceived);
+        KsTsduDat->StartOffset  = 0;
+
+        Buffer = &KsTsduDat->Data[0];
+
+        KsTsdu->LastOffset += KsTsduDat->TotalLength;
+    }
+
+    KsTsduMgr->TotalBytes  +=  BytesReceived;
+
+    if (bIsCompleteTsdu) {
+
+        /* It's a complete receive, we just move all
+           the data from system to our Tsdu */
+
+        RtlMoveMemory(
+            Buffer,
+            Tsdu,
+            BytesReceived
+            );
+
+        *BytesTaken = BytesReceived;
+        Status = STATUS_SUCCESS;
+
+        if (bNewTsdu) {
+            list_add_tail(&(KsTsdu->Link), &(KsTsduMgr->TsduList));
+            KsTsduMgr->NumOfTsdu++;
+        }
+
+        KeSetEvent(&(KsTsduMgr->Event), 0, FALSE);
+
+        /* re-active the ks connection and wake up the scheduler */
+        if (tconn->kstc_conn && tconn->kstc_sched_cb) {
+            tconn->kstc_sched_cb( tconn, FALSE, NULL,
+                                  KsTsduMgr->TotalBytes );
+        }
+
+    } else {
+
+        /* there's still data in tdi internal queue, we need issue a new
+           Irp to receive all of them. first allocate the tcp context */
+
+        context = ExAllocatePoolWithTag(
+                        NonPagedPool,
+                        sizeof(KS_TCP_COMPLETION_CONTEXT),
+                        'cTsK');
+
+        if (!context) {
+
+            Status = STATUS_INSUFFICIENT_RESOURCES;
+            goto errorout;
+        }
+
+        /* setup the context */
+        RtlZeroMemory(context, sizeof(KS_TCP_COMPLETION_CONTEXT));
+
+        context->tconn             = tconn;
+        context->CompletionRoutine = KsTcpReceiveCompletionRoutine;
+        context->CompletionContext = KsTsdu;
+        /* NOTE(review): the assignment above is immediately overwritten
+           by the next line — it was possibly meant for
+           CompletionContext2 (compare the send path); confirm. */
+        context->CompletionContext = bNewBuff ? (PVOID)KsTsduBuf : (PVOID)KsTsduDat;
+        context->KsTsduMgr         = KsTsduMgr;
+        context->Event             = &(KsTsduMgr->Event);
+
+        if (tconn->kstc_type == kstt_sender) {
+            FileObject = tconn->sender.kstc_info.FileObject;
+        } else {
+            FileObject = tconn->child.kstc_info.FileObject;
+        }
+
+        DeviceObject = IoGetRelatedDeviceObject(FileObject);
+
+        /* build new tdi Irp and setup it. */
+        Irp = KsBuildTdiIrp(DeviceObject);
+
+        if (NULL == Irp) {
+            goto errorout;
+        }
+
+        Status = KsLockUserBuffer(
+                    Buffer,
+                    FALSE,
+                    BytesReceived,
+                    IoModifyAccess,
+                    &Mdl
+                    );
+
+        if (!NT_SUCCESS(Status)) {
+            goto errorout;
+        }
+
+        TdiBuildReceive(
+            Irp,
+            DeviceObject,
+            FileObject,
+            KsTcpCompletionRoutine,
+            context,
+            Mdl,
+            ReceiveFlags & (TDI_RECEIVE_NORMAL | TDI_RECEIVE_EXPEDITED),
+            BytesReceived
+          );
+
+        IoSetNextIrpStackLocation(Irp);
+
+        /* return the newly built Irp to transport driver,
+           it will process it to receive all the data */
+
+        *IoRequestPacket = Irp;
+        *BytesTaken = 0;
+
+        if (bNewTsdu) {
+
+            list_add_tail(&(KsTsdu->Link), &(KsTsduMgr->TsduList));
+            KsTsduMgr->NumOfTsdu++;
+        }
+
+        /* mark the slot as being filled until the Irp completes */
+        if (bNewBuff) {
+            cfs_set_flag(KsTsduBuf->TsduFlags, KS_TSDU_BUF_RECEIVING);
+        } else {
+            cfs_set_flag(KsTsduDat->TsduFlags, KS_TSDU_DAT_RECEIVING);
+        }
+        ks_get_tconn(tconn);
+        Status = STATUS_MORE_PROCESSING_REQUIRED;
+    }
+
+    spin_unlock(&(tconn->kstc_lock));
+    ks_put_tconn(tconn);
+
+    return (Status);
+
+errorout:
+
+    spin_unlock(&(tconn->kstc_lock));
+
+    if (bNewTsdu && (KsTsdu != NULL)) {
+        KsFreeKsTsdu(KsTsdu);
+    }
+
+    /* NOTE(review): a Buffer allocated for the bNewBuff case is not
+       released on this path — looks like a leak; verify. */
+    if (Mdl) {
+        KsReleaseMdl(Mdl, FALSE);
+    }
+
+    if (Irp) {
+        IoFreeIrp(Irp);
+    }
+
+    if (context) {
+        ExFreePool(context);
+    }
+
+    ks_abort_tconn(tconn);
+    ks_put_tconn(tconn);
+
+    /* tell the transport we consumed everything even though we failed;
+       the connection is being aborted anyway */
+    *BytesTaken = BytesAvailable;
+    Status = STATUS_SUCCESS;
+
+    return (Status);
+}
+
+/*
+ *  Expedited receive event handler
+ */
+
+NTSTATUS
+KsTcpReceiveExpeditedEventHandler(
+    IN PVOID                TdiEventContext,
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN ULONG                ReceiveFlags,
+    IN ULONG                BytesIndicated,
+    IN ULONG                BytesAvailable,
+    OUT ULONG *             BytesTaken,
+    IN PVOID                Tsdu,
+    OUT PIRP *              IoRequestPacket
+    )
+{
+    /* delegate to the normal receive handler with the expedited flag
+       forced on, so both paths share a single implementation */
+    return KsTcpReceiveEventHandler(TdiEventContext, ConnectionContext,
+                                    ReceiveFlags | TDI_RECEIVE_EXPEDITED,
+                                    BytesIndicated, BytesAvailable,
+                                    BytesTaken, Tsdu, IoRequestPacket);
+}
+
+
+/*
+ *  Bulk receive event handler
+ *
+ *  It will queue all the system Tsdus to our TsduList.
+ *  Then later ks_recv_mdl will release them.
+ */
+
+NTSTATUS
+KsTcpChainedReceiveEventHandler (
+    IN PVOID TdiEventContext,       // the event context
+    IN CONNECTION_CONTEXT ConnectionContext,
+    IN ULONG ReceiveFlags,
+    IN ULONG ReceiveLength,
+    IN ULONG StartingOffset,        // offset of start of client data in TSDU
+    IN PMDL  Tsdu,                  // TSDU data chain
+    IN PVOID TsduDescriptor         // for call to TdiReturnChainedReceives
+    )
+{
+
+    NTSTATUS            Status;
+
+    ksock_tconn_t *     tconn;
+
+    PKS_CHAIN           KsChain;
+    PKS_TSDUMGR         KsTsduMgr;
+    PKS_TSDU            KsTsdu;
+    PKS_TSDU_MDL        KsTsduMdl;
+
+    BOOLEAN             bIsExpedited;
+    BOOLEAN             bNewTsdu = FALSE;
+
+    tconn = (ksock_tconn_t *) ConnectionContext;
+
+    bIsExpedited = cfs_is_flag_set(ReceiveFlags, TDI_RECEIVE_EXPEDITED);
+
+    KsPrint((2, "KsTcpChainedReceive: ReceiveLength = %xh bIsExpedited = %d\n", ReceiveLength, bIsExpedited));
+
+    ks_get_tconn(tconn);
+    spin_lock(&(tconn->kstc_lock));
+
+    /* only a connected sender or accepted child may receive data;
+       silently ignore the indication otherwise (e.g. a listener) */
+    if ( !((tconn->kstc_state == ksts_connected) &&
+         (tconn->kstc_type == kstt_sender ||
+          tconn->kstc_type == kstt_child))) {
+
+        spin_unlock(&(tconn->kstc_lock));
+        ks_put_tconn(tconn);
+
+        return (STATUS_SUCCESS);
+    }
+
+    /* get the latest Tsdu buffer from the TsduMgr list.
+       just set NULL if the list is empty. */
+
+    if (tconn->kstc_type == kstt_sender) {
+        KsChain = &(tconn->sender.kstc_recv);
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        KsChain = &(tconn->child.kstc_recv);
+    }
+
+    if (bIsExpedited) {
+        KsTsduMgr = &(KsChain->Expedited);
+    } else {
+        KsTsduMgr = &(KsChain->Normal);
+    }
+
+    if (list_empty(&(KsTsduMgr->TsduList))) {
+
+        LASSERT(KsTsduMgr->NumOfTsdu == 0);
+        KsTsdu = NULL;
+
+    } else {
+
+        LASSERT(KsTsduMgr->NumOfTsdu > 0);
+        KsTsdu = list_entry(KsTsduMgr->TsduList.prev, KS_TSDU, Link);
+        LASSERT(KsTsdu->Magic == KS_TSDU_MAGIC);
+
+        /* the tail Tsdu must have room for one more KS_TSDU_MDL record */
+        if (sizeof(KS_TSDU_MDL) > KsTsdu->TotalLength - KsTsdu->LastOffset) {
+            KsTsdu = NULL;
+        }
+    }
+
+    /* if there's no Tsdu or the free size is not enough for this
+       KS_TSDU_MDL structure. We need re-allocate a new Tsdu.  */
+
+    if (NULL == KsTsdu) {
+
+        KsTsdu = KsAllocateKsTsdu();
+
+        if (NULL == KsTsdu) {
+            goto errorout;
+        } else {
+            bNewTsdu = TRUE;
+        }
+    }
+
+    /* just queue the KS_TSDU_MDL to the Tsdu buffer: we keep the
+       system's MDL chain and descriptor so the data can be pulled
+       later without a copy */
+
+    KsTsduMdl = (PKS_TSDU_MDL)((PUCHAR)KsTsdu + KsTsdu->LastOffset);
+
+    KsTsduMdl->TsduType     =  TSDU_TYPE_MDL;
+    KsTsduMdl->DataLength   =  ReceiveLength;
+    KsTsduMdl->StartOffset  =  StartingOffset;
+    KsTsduMdl->Mdl          =  Tsdu;
+    KsTsduMdl->Descriptor   =  TsduDescriptor;
+
+    KsTsdu->LastOffset     += sizeof(KS_TSDU_MDL);
+    KsTsduMgr->TotalBytes  += ReceiveLength;
+
+    KsPrint((2, "KsTcpChainedReceiveEventHandler: Total %xh bytes.\n",
+                KsTsduMgr->TotalBytes ));
+
+    Status = STATUS_PENDING;
+
+    /* attach it to the TsduMgr list if the Tsdu is newly created. */
+    if (bNewTsdu) {
+
+        list_add_tail(&(KsTsdu->Link), &(KsTsduMgr->TsduList));
+        KsTsduMgr->NumOfTsdu++;
+    }
+
+    spin_unlock(&(tconn->kstc_lock));
+
+    /* wake up the threads waiting in ks_recv_mdl */
+    KeSetEvent(&(KsTsduMgr->Event), 0, FALSE);
+
+    /* notify the upper-layer scheduler callback, if installed */
+    if (tconn->kstc_conn && tconn->kstc_sched_cb) {
+        tconn->kstc_sched_cb( tconn, FALSE, NULL,
+                              KsTsduMgr->TotalBytes );
+    }
+
+    ks_put_tconn(tconn);
+
+    /* Return STATUS_PENDING to system because we are still
+       owning the MDL resources. ks_recv_mdl is expected
+       to free the MDL resources. */
+
+    return (Status);
+
+errorout:
+
+    spin_unlock(&(tconn->kstc_lock));
+
+    if (bNewTsdu && (KsTsdu != NULL)) {
+        KsFreeKsTsdu(KsTsdu);
+    }
+
+    /* abort the tdi connection */
+    ks_abort_tconn(tconn);
+    ks_put_tconn(tconn);
+
+
+    Status = STATUS_SUCCESS;
+
+    return (Status);
+}
+
+
+/*
+ *  Expedited & Bulk receive event handler
+ */
+
+NTSTATUS
+KsTcpChainedReceiveExpeditedEventHandler (
+    IN PVOID                TdiEventContext,       // the event context
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN ULONG                ReceiveFlags,
+    IN ULONG                ReceiveLength,
+    IN ULONG                StartingOffset,        // offset of start of client data in TSDU
+    IN PMDL                 Tsdu,                  // TSDU data chain
+    IN PVOID                TsduDescriptor         // for call to TdiReturnChainedReceives
+    )
+{
+    /* Same as the chained receive path, but with the expedited
+       flag forced on before delegating. */
+    ULONG Flags = ReceiveFlags | TDI_RECEIVE_EXPEDITED;
+
+    return KsTcpChainedReceiveEventHandler(TdiEventContext,
+                                           ConnectionContext, Flags,
+                                           ReceiveLength, StartingOffset,
+                                           Tsdu, TsduDescriptor);
+}
+
+
+VOID
+KsPrintProviderInfo(
+   PWSTR DeviceName,
+   PTDI_PROVIDER_INFO ProviderInfo
+   )
+{
+    /* service-flag bit / label pairs, dumped in the same order
+       as the original chain of per-flag checks */
+    static const struct {
+        ULONG        Flag;
+        const char * Label;
+    } FlagLabels[] = {
+        { TDI_SERVICE_CONNECTION_MODE,     "  CONNECTION_MODE\n"     },
+        { TDI_SERVICE_ORDERLY_RELEASE,     "  ORDERLY_RELEASE\n"     },
+        { TDI_SERVICE_CONNECTIONLESS_MODE, "  CONNECTIONLESS_MODE\n" },
+        { TDI_SERVICE_ERROR_FREE_DELIVERY, "  ERROR_FREE_DELIVERY\n" },
+        { TDI_SERVICE_SECURITY_LEVEL,      "  SECURITY_LEVEL\n"      },
+        { TDI_SERVICE_BROADCAST_SUPPORTED, "  BROADCAST_SUPPORTED\n" },
+        { TDI_SERVICE_MULTICAST_SUPPORTED, "  MULTICAST_SUPPORTED\n" },
+        { TDI_SERVICE_DELAYED_ACCEPTANCE,  "  DELAYED_ACCEPTANCE\n"  },
+        { TDI_SERVICE_EXPEDITED_DATA,      "  EXPEDITED_DATA\n"      },
+        { TDI_SERVICE_INTERNAL_BUFFERING,  "  INTERNAL_BUFFERING\n"  },
+        { TDI_SERVICE_ROUTE_DIRECTED,      "  ROUTE_DIRECTED\n"      },
+        { TDI_SERVICE_NO_ZERO_LENGTH,      "  NO_ZERO_LENGTH\n"      },
+        { TDI_SERVICE_POINT_TO_POINT,      "  POINT_TO_POINT\n"      },
+        { TDI_SERVICE_MESSAGE_MODE,        "  MESSAGE_MODE\n"        },
+        { TDI_SERVICE_HALF_DUPLEX,         "  HALF_DUPLEX\n"         },
+    };
+
+    int i;
+
+    /* dump the fixed fields of the provider information block */
+    KsPrint((2, "%ws ProviderInfo:\n", DeviceName));
+
+    KsPrint((2, "  Version              : 0x%4.4X\n", ProviderInfo->Version ));
+    KsPrint((2, "  MaxSendSize          : %d\n", ProviderInfo->MaxSendSize ));
+    KsPrint((2, "  MaxConnectionUserData: %d\n", ProviderInfo->MaxConnectionUserData ));
+    KsPrint((2, "  MaxDatagramSize      : %d\n", ProviderInfo->MaxDatagramSize ));
+    KsPrint((2, "  ServiceFlags         : 0x%8.8X\n", ProviderInfo->ServiceFlags ));
+
+    /* then one line per service flag that is set */
+    for (i = 0; i < (int)(sizeof(FlagLabels) / sizeof(FlagLabels[0])); i++) {
+        if (ProviderInfo->ServiceFlags & FlagLabels[i].Flag) {
+            KsPrint((2, "%s", FlagLabels[i].Label));
+        }
+    }
+
+    KsPrint((2, "  MinimumLookaheadData : %d\n", ProviderInfo->MinimumLookaheadData ));
+    KsPrint((2, "  MaximumLookaheadData : %d\n", ProviderInfo->MaximumLookaheadData ));
+    KsPrint((2, "  NumberOfResources    : %d\n", ProviderInfo->NumberOfResources ));
+}
+
+
+/*
+ * KsAllocateKsTsdu
+ *   Reuse a Tsdu from the freelist or allocate a new Tsdu
+ *   from the LookAsideList table or the NonPagedPool
+ *
+ * Arguments:
+ *   N/A
+ *
+ * Return Value:
+ *   PKS_Tsdu: the new Tsdu or NULL if it fails
+ *
+ * Notes:
+ *   N/A
+ */
+
+PKS_TSDU
+KsAllocateKsTsdu()
+{
+    PKS_TSDU    Tsdu = NULL;
+
+    /* First try to reuse an entry from the global free list;
+       fall back to the slab cache when the list is empty. */
+    spin_lock(&(ks_data.ksnd_tsdu_lock));
+
+    if (list_empty(&(ks_data.ksnd_freetsdus))) {
+
+        Tsdu = (PKS_TSDU) cfs_mem_cache_alloc(
+                        ks_data.ksnd_tsdu_slab, 0);
+
+    } else {
+
+        LASSERT(ks_data.ksnd_nfreetsdus > 0);
+
+        Tsdu = list_entry(ks_data.ksnd_freetsdus.next, KS_TSDU, Link);
+        list_del(&(Tsdu->Link));
+        ks_data.ksnd_nfreetsdus--;
+    }
+
+    spin_unlock(&(ks_data.ksnd_tsdu_lock));
+
+    /* a fresh or recycled Tsdu always starts with a clean header */
+    if (Tsdu != NULL) {
+        KsInitializeKsTsdu(Tsdu, ks_data.ksnd_tsdu_size);
+    }
+
+    return (Tsdu);
+}
+
+
+/*
+ * KsPutKsTsdu
+ *   Move the Tsdu to the free tsdu list in ks_data.
+ *
+ * Arguments:
+ *   KsTsdu: Tsdu to be moved.
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+VOID
+KsPutKsTsdu(
+    PKS_TSDU  KsTsdu
+    )
+{
+    /* Park the Tsdu on the global free list for later reuse. */
+    spin_lock(&(ks_data.ksnd_tsdu_lock));
+
+    ks_data.ksnd_nfreetsdus++;
+    list_add_tail(&(KsTsdu->Link), &(ks_data.ksnd_freetsdus));
+
+    spin_unlock(&(ks_data.ksnd_tsdu_lock));
+}
+
+
+/*
+ * KsFreeKsTsdu
+ *   Release a Tsdu: uninitialize then free it.
+ *
+ * Arguments:
+ *   KsTsdu: Tsdu to be freed.
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+VOID
+KsFreeKsTsdu(
+    PKS_TSDU  KsTsdu
+    )
+{
+    /* Hand the Tsdu buffer back to its slab cache. */
+    cfs_mem_cache_free(ks_data.ksnd_tsdu_slab, KsTsdu);
+}
+
+
+/*
+ * KsInitializeKsTsdu
+ *   Initialize the Tsdu buffer header
+ *
+ * Arguments:
+ *   KsTsdu: the Tsdu to be initialized
+ *   Length: the total length of the Tsdu
+ *
+ * Return Value:
+ *   VOID
+ *
+ * NOTES:
+ *   N/A
+ */
+
+VOID
+KsInitializeKsTsdu(
+    PKS_TSDU    KsTsdu,
+    ULONG       Length
+    )
+{
+    /* Wipe the whole buffer, then stamp the header: both data
+       offsets start right after the DWORD-aligned KS_TSDU header. */
+    ULONG HeaderSize = KS_DWORD_ALIGN(sizeof(KS_TSDU));
+
+    RtlZeroMemory(KsTsdu, Length);
+
+    KsTsdu->Magic       = KS_TSDU_MAGIC;
+    KsTsdu->TotalLength = Length;
+    KsTsdu->StartOffset = HeaderSize;
+    KsTsdu->LastOffset  = HeaderSize;
+}
+
+
+/*
+ * KsInitializeKsTsduMgr
+ *   Initialize the management structure of
+ *   Tsdu buffers
+ *
+ * Arguments:
+ *   TsduMgr: the TsduMgr to be initialized
+ *
+ * Return Value:
+ *   VOID
+ *
+ * NOTES:
+ *   N/A
+ */
+
+VOID
+KsInitializeKsTsduMgr(
+    PKS_TSDUMGR     TsduMgr
+    )
+{
+    /* Start with an empty Tsdu queue and zeroed counters, plus a
+       non-signaled notification event for threads waiting on data. */
+    TsduMgr->NumOfTsdu  = 0;
+    TsduMgr->TotalBytes = 0;
+
+    CFS_INIT_LIST_HEAD(&(TsduMgr->TsduList));
+
+    KeInitializeEvent(&(TsduMgr->Event), NotificationEvent, FALSE);
+}
+
+
+/*
+ * KsInitializeKsChain
+ *   Initialize the KsChain structure used for receiving
+ *   or transmitting
+ *
+ * Arguments:
+ *   KsChain: the KsChain to be initialized
+ *
+ * Return Value:
+ *   VOID
+ *
+ * NOTES:
+ *   N/A
+ */
+
+VOID
+KsInitializeKsChain(
+    PKS_CHAIN       KsChain
+    )
+{
+    /* A chain is just a pair of Tsdu managers: one for normal
+       data, one for expedited (out-of-band) data. */
+    KsInitializeKsTsduMgr(&(KsChain->Expedited));
+    KsInitializeKsTsduMgr(&(KsChain->Normal));
+}
+
+
+/*
+ * KsCleanupTsduMgr
+ *   Clean up all the Tsdus in the TsduMgr list
+ *
+ * Arguments:
+ *   KsTsduMgr: the Tsdu list manager
+ *
+ * Return Value:
+ *   NTSTATUS:  nt status code
+ *
+ * NOTES:
+ *   N/A
+ */
+
+NTSTATUS
+KsCleanupTsduMgr(
+    PKS_TSDUMGR     KsTsduMgr
+    )
+{
+    PKS_TSDU        KsTsdu;
+    PKS_TSDU_DAT    KsTsduDat;
+    PKS_TSDU_BUF    KsTsduBuf;
+    PKS_TSDU_MDL    KsTsduMdl;
+
+    LASSERT(NULL != KsTsduMgr);
+
+    /* wake any thread blocked on this manager's event so it can
+       observe the queue being torn down */
+    KeSetEvent(&(KsTsduMgr->Event), 0, FALSE);
+
+    while (!list_empty(&KsTsduMgr->TsduList)) {
+
+        KsTsdu = list_entry(KsTsduMgr->TsduList.next, KS_TSDU, Link);
+        LASSERT(KsTsdu->Magic == KS_TSDU_MAGIC);
+
+        if (KsTsdu->StartOffset == KsTsdu->LastOffset) {
+
+            //
+            // KsTsdu is empty now, we need free it ...
+            //
+
+            list_del(&(KsTsdu->Link));
+            KsTsduMgr->NumOfTsdu--;
+
+            KsFreeKsTsdu(KsTsdu);
+
+        } else {
+
+            /* the unit header at StartOffset carries the type tag;
+               all three views alias the same bytes */
+            KsTsduDat = (PKS_TSDU_DAT)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+            KsTsduBuf = (PKS_TSDU_BUF)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+            KsTsduMdl = (PKS_TSDU_MDL)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+
+            if (TSDU_TYPE_DAT == KsTsduDat->TsduType) {
+
+                /* inline data unit: skip over it */
+                KsTsdu->StartOffset += KsTsduDat->TotalLength;
+
+            } else if (TSDU_TYPE_BUF == KsTsduBuf->TsduType) {
+
+                ASSERT(KsTsduBuf->UserBuffer != NULL);
+
+                /* release the attached user buffer if it still
+                   holds unconsumed data */
+                if (KsTsduBuf->DataLength > KsTsduBuf->StartOffset) {
+                    ExFreePool(KsTsduBuf->UserBuffer);
+                } else {
+                    cfs_enter_debugger();
+                }
+
+                KsTsdu->StartOffset += sizeof(KS_TSDU_BUF);
+
+            } else if (TSDU_TYPE_MDL == KsTsduMdl->TsduType) {
+
+                //
+                // MDL Tsdu Unit: give the chained receive back to TDI
+                //
+
+                TdiReturnChainedReceives(
+                    &(KsTsduMdl->Descriptor),
+                    1 );
+
+                KsTsdu->StartOffset += sizeof(KS_TSDU_MDL);
+
+            } else {
+
+                /* Unknown unit type: the buffer is corrupted. The
+                   original code left StartOffset unchanged here and
+                   would spin forever; drop the rest of this Tsdu so
+                   the loop can terminate. */
+                cfs_enter_debugger();
+                KsTsdu->StartOffset = KsTsdu->LastOffset;
+            }
+        }
+    }
+
+    return STATUS_SUCCESS;
+}
+
+
+/*
+ * KsCleanupKsChain
+ *   Clean up the TsduMgrs of the KsChain
+ *
+ * Arguments:
+ *   KsChain: the chain managing TsduMgr
+ *
+ * Return Value:
+ *   NTSTATUS:  nt status code
+ *
+ * NOTES:
+ *   N/A
+ */
+
+NTSTATUS
+KsCleanupKsChain(
+    PKS_CHAIN   KsChain
+    )
+{
+    NTSTATUS    Status;
+
+    LASSERT(NULL != KsChain);
+
+    /* drain the normal-data queue first, then the expedited one */
+    Status = KsCleanupTsduMgr(&(KsChain->Normal));
+    if (!NT_SUCCESS(Status)) {
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    Status = KsCleanupTsduMgr(&(KsChain->Expedited));
+    if (!NT_SUCCESS(Status)) {
+        cfs_enter_debugger();
+    }
+
+errorout:
+
+    return Status;
+}
+
+
+/*
+ * KsCleanupTsdu
+ *   Clean up all the Tsdus of a tdi connected object
+ *
+ * Arguments:
+ *   tconn: the tdi connection which is connected already.
+ *
+ * Return Value:
+ *   Nt status code
+ *
+ * NOTES:
+ *   N/A
+ */
+
+NTSTATUS
+KsCleanupTsdu(
+    ksock_tconn_t * tconn
+    )
+{
+    NTSTATUS        Status = STATUS_SUCCESS;
+    PKS_CHAIN       RecvChain;
+    PKS_CHAIN       SendChain;
+
+    /* only connected endpoints (sender / accepted child) own Tsdu
+       chains; any other tconn type has nothing to clean up */
+    if (tconn->kstc_type == kstt_sender) {
+        RecvChain = &(tconn->sender.kstc_recv);
+        SendChain = &(tconn->sender.kstc_send);
+    } else if (tconn->kstc_type == kstt_child) {
+        RecvChain = &(tconn->child.kstc_recv);
+        SendChain = &(tconn->child.kstc_send);
+    } else {
+        goto errorout;
+    }
+
+    Status = KsCleanupKsChain(RecvChain);
+    if (!NT_SUCCESS(Status)) {
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    Status = KsCleanupKsChain(SendChain);
+    if (!NT_SUCCESS(Status)) {
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+errorout:
+
+    return (Status);
+}
+
+
+/*
+ * KsCopyMdlChainToMdlChain
+ *   Copy data from a [chained] Mdl to another [chained] Mdl.
+ *   The Tdi library does not provide this function. We have to
+ *   implement it ourselves.
+ *
+ * Arguments:
+ *   SourceMdlChain: the source mdl
+ *   SourceOffset:   start offset of the source
+ *   DestinationMdlChain: the dst mdl
+ *   DestinationOffset: the offset where data are to be copied.
+ *   BytesTobecopied:   the expected number of bytes to be copied
+ *   BytesCopied:    to store the really copied data length
+ *
+ * Return Value:
+ *   NTSTATUS: STATUS_SUCCESS or other error code
+ *
+ * NOTES:
+ *   The length of source mdl must be >= SourceOffset + BytesTobecopied
+ */
+
+NTSTATUS
+KsCopyMdlChainToMdlChain(
+    IN PMDL     SourceMdlChain,
+    IN ULONG    SourceOffset,
+    IN PMDL     DestinationMdlChain,
+    IN ULONG    DestinationOffset,
+    IN ULONG    BytesTobecopied,
+    OUT PULONG  BytesCopied
+    )
+{
+    PMDL        SrcMdl = SourceMdlChain;
+    PMDL        DstMdl = DestinationMdlChain;
+
+    PUCHAR      SrcBuf = NULL;
+    PUCHAR      DstBuf = NULL;
+
+    ULONG       dwBytes = 0;
+
+    NTSTATUS    Status = STATUS_SUCCESS;
+
+
+    while (dwBytes < BytesTobecopied) {
+
+        ULONG   Length = 0;
+
+        /* advance past fully-consumed source MDLs */
+        while (MmGetMdlByteCount(SrcMdl) <= SourceOffset) {
+
+            SourceOffset -= MmGetMdlByteCount(SrcMdl);
+
+            SrcMdl = SrcMdl->Next;
+
+            if (NULL == SrcMdl) {
+
+                Status = STATUS_INVALID_PARAMETER;
+                goto errorout;
+            }
+        }
+
+        /* advance past fully-consumed destination MDLs */
+        while (MmGetMdlByteCount(DstMdl) <= DestinationOffset) {
+
+            DestinationOffset -= MmGetMdlByteCount(DstMdl);
+
+            DstMdl = DstMdl->Next;
+
+            if (NULL == DstMdl) {
+
+                Status = STATUS_INVALID_PARAMETER;
+                goto errorout;
+            }
+        }
+
+        DstBuf = (PUCHAR)KsMapMdlBuffer(DstMdl);
+
+        if (NULL == DstBuf) {
+            Status = STATUS_INSUFFICIENT_RESOURCES;
+            goto errorout;
+        }
+
+        //
+        // Here we need skip the OVERFLOW case via RtlCopyMemory :-(
+        //
+
+        if ( KsQueryMdlsSize(SrcMdl) - SourceOffset >
+             MmGetMdlByteCount(DstMdl) - DestinationOffset ) {
+
+            /* more source data than room left in the current
+               destination MDL: copy manually, bounded by both
+               remaining ranges and the bytes still requested */
+            Length = BytesTobecopied - dwBytes;
+
+            if (Length > KsQueryMdlsSize(SrcMdl) - SourceOffset) {
+                Length = KsQueryMdlsSize(SrcMdl) - SourceOffset;
+            }
+
+            if (Length > MmGetMdlByteCount(DstMdl) - DestinationOffset) {
+                Length = MmGetMdlByteCount(DstMdl) - DestinationOffset;
+            }
+
+            SrcBuf = (PUCHAR)KsMapMdlBuffer(SrcMdl);
+
+            /* BUGFIX: the original tested DstBuf here (copy/paste
+               from above), so a failed source mapping went unnoticed
+               and RtlCopyMemory dereferenced a NULL SrcBuf */
+            if (NULL == SrcBuf) {
+                Status = STATUS_INSUFFICIENT_RESOURCES;
+                goto errorout;
+            }
+
+            /* NOTE(review): KsMapMdlBuffer maps only the first MDL
+               of SrcMdl, while Length is bounded by the size of the
+               whole source chain -- confirm SourceOffset + Length
+               can never exceed MmGetMdlByteCount(SrcMdl) here */
+            RtlCopyMemory(
+                DstBuf + DestinationOffset,
+                SrcBuf + SourceOffset,
+                Length
+                );
+
+        } else {
+
+            /* destination MDL can hold all remaining source data:
+               let TDI walk the source chain for us */
+            Status = TdiCopyMdlToBuffer(
+                        SrcMdl,
+                        SourceOffset,
+                        DstBuf,
+                        DestinationOffset,
+                        MmGetMdlByteCount(DstMdl),
+                        &Length
+                        );
+
+            if (STATUS_BUFFER_OVERFLOW == Status) {
+                cfs_enter_debugger();
+            } else if (!NT_SUCCESS(Status)) {
+                cfs_enter_debugger();
+                goto errorout;
+            }
+        }
+
+        SourceOffset += Length;
+        DestinationOffset += Length;
+        dwBytes += Length;
+    }
+
+errorout:
+
+    /* report the number of bytes actually copied (0 on failure) */
+    if (NT_SUCCESS(Status)) {
+        *BytesCopied = dwBytes;
+    } else {
+        *BytesCopied = 0;
+    }
+
+    return Status;
+}
+
+
+
+/*
+ * KsQueryMdlSize
+ *   Query the whole size of a MDL (may be chained)
+ *
+ * Arguments:
+ *   Mdl:  the Mdl to be queried
+ *
+ * Return Value:
+ *   ULONG: the total size of the mdl
+ *
+ * NOTES:
+ *   N/A
+ */
+
+ULONG
+KsQueryMdlsSize (PMDL Mdl)
+{
+    ULONG   Total;
+    PMDL    Link;
+
+    /* sum the byte counts of every MDL in the (possibly chained) list */
+    for (Total = 0, Link = Mdl; Link != NULL; Link = Link->Next) {
+        Total += MmGetMdlByteCount(Link);
+    }
+
+    return (Total);
+}
+
+
+/*
+ * KsLockUserBuffer
+ *   Allocate MDL for the buffer and lock the pages into
+ *   nonpaged pool
+ *
+ * Arguments:
+ *   UserBuffer:  the user buffer to be locked
+ *   Length:      length in bytes of the buffer
+ *   Operation:   read or write access
+ *   pMdl:        the result of the created mdl
+ *
+ * Return Value:
+ *   NTSTATUS:     kernel status code (STATUS_SUCCESS
+ *                 or other error code)
+ *
+ * NOTES:
+ *   N/A
+ */
+
+NTSTATUS
+KsLockUserBuffer (
+    IN PVOID            UserBuffer,
+    IN BOOLEAN          bPaged,
+    IN ULONG            Length,
+    IN LOCK_OPERATION   Operation,
+    OUT PMDL *          pMdl
+    )
+{
+    NTSTATUS    Status;
+    PMDL        Mdl = NULL;
+
+    LASSERT(UserBuffer != NULL);
+
+    *pMdl = NULL;
+
+    /* describe the buffer with a fresh MDL: not associated with
+       any IRP, no quota charge */
+    Mdl = IoAllocateMdl(
+                UserBuffer,
+                Length,
+                FALSE,
+                FALSE,
+                NULL
+                );
+
+    if (Mdl == NULL) {
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+
+    } else {
+
+        /* probing pageable memory raises an exception on an
+           invalid buffer, hence the SEH guard around both paths */
+        __try {
+
+            if (bPaged) {
+                MmProbeAndLockPages(
+                    Mdl,
+                    KernelMode,
+                    Operation
+                    );
+            } else {
+                /* nonpaged buffers are already resident: just fill
+                   in the MDL's page array */
+                MmBuildMdlForNonPagedPool(
+                    Mdl
+                    );
+            }
+
+            Status = STATUS_SUCCESS;
+
+            *pMdl = Mdl;
+
+        } __except (EXCEPTION_EXECUTE_HANDLER) {
+
+            /* invalid buffer: undo the MDL allocation and fail */
+            IoFreeMdl(Mdl);
+
+            Mdl = NULL;
+
+            cfs_enter_debugger();
+
+            Status = STATUS_INVALID_USER_BUFFER;
+        }
+    }
+
+    return Status;
+}
+
+/*
+ * KsMapMdlBuffer
+ *   Map the mdl into a buffer in kernel space
+ *
+ * Arguments:
+ *   Mdl:  the mdl to be mapped
+ *
+ * Return Value:
+ *   PVOID: the buffer mapped or NULL in failure
+ *
+ * NOTES:
+ *   N/A
+ */
+
+PVOID
+KsMapMdlBuffer (PMDL    Mdl)
+{
+    LASSERT(Mdl != NULL);
+
+    /* the "Safe" variant returns NULL instead of bugchecking
+       when system PTEs are exhausted */
+    return MmGetSystemAddressForMdlSafe(Mdl, NormalPagePriority);
+}
+
+
+/*
+ * KsReleaseMdl
+ *   Unlock all the pages in the mdl
+ *
+ * Arguments:
+ *   Mdl:  memory description list to be released
+ *
+ * Return Value:
+ *   N/A
+ *
+ * NOTES:
+ *   N/A
+ */
+
+VOID
+KsReleaseMdl (IN PMDL   Mdl,
+              IN int    Paged )
+{
+    PMDL    Next;
+
+    LASSERT(Mdl != NULL);
+
+    /* walk the chain, unlocking (for paged buffers) and then
+       freeing each MDL in turn */
+    for (; Mdl != NULL; Mdl = Next) {
+
+        Next = Mdl->Next;
+
+        if (Paged) {
+            MmUnlockPages(Mdl);
+        }
+
+        IoFreeMdl(Mdl);
+    }
+}
+
+
+/*
+ * ks_lock_buffer
+ *   allocate MDL for the user-specified buffer and lock (paging-in)
+ *   all the pages of the buffer into system memory
+ *
+ * Arguments:
+ *   buffer:  the user buffer to be locked
+ *   length:  length in bytes of the buffer
+ *   access:  read or write access
+ *   mdl:     the result of the created mdl
+ *
+ * Return Value:
+ *   int:     the ks error code: 0: success / -x: failure
+ *
+ * Notes:
+ *   N/A
+ */
+
+int
+ks_lock_buffer (
+    void *            buffer,
+    int               paged,
+    int               length,
+    LOCK_OPERATION    access,
+    ksock_mdl_t **    kmdl
+    )
+{
+    /* thin portability wrapper: translate the NT status code
+       returned by KsLockUserBuffer into a ks error code */
+    NTSTATUS status = KsLockUserBuffer(buffer, paged != 0,
+                                       length, access, kmdl);
+
+    return cfs_error_code(status);
+}
+
+
+/*
+ * ks_map_mdl
+ *   Map the mdl pages into kernel space
+ *
+ * Arguments:
+ *   mdl:  the mdl to be mapped
+ *
+ * Return Value:
+ *   void *: the buffer mapped or NULL in failure
+ *
+ * Notes:
+ *   N/A
+ */
+
+void *
+ks_map_mdl (ksock_mdl_t * mdl)
+{
+    /* map the locked pages into kernel address space */
+    LASSERT(mdl != NULL);
+
+    return KsMapMdlBuffer(mdl);
+}
+
+/*
+ *  ks_release_mdl
+ *   Unlock all the pages in the mdl and release the mdl
+ *
+ * Arguments:
+ *   mdl:  memory description list to be released
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void
+ks_release_mdl (ksock_mdl_t *mdl, int paged)
+{
+    /* unlock (if paged) and free the whole mdl chain */
+    LASSERT(mdl != NULL);
+
+    KsReleaseMdl(mdl, paged);
+}
+
+
+/*
+ * ks_create_tconn
+ *   allocate a new tconn structure from the SLAB cache or
+ *   NonPaged system pool
+ *
+ * Arguments:
+ *   N/A
+ *
+ * Return Value:
+ *   ksock_tconn_t *: the address of tconn or NULL if it fails
+ *
+ * NOTES:
+ *   N/A
+ */
+
+ksock_tconn_t *
+ks_create_tconn()
+{
+    ksock_tconn_t * tconn = NULL;
+
+    /* allocate ksoc_tconn_t from the slab cache memory */
+
+    tconn = (ksock_tconn_t *)cfs_mem_cache_alloc(
+                ks_data.ksnd_tconn_slab, CFS_ALLOC_ZERO);
+
+    if (tconn) {
+
+        /* zero tconn elements (NOTE(review): likely redundant with
+           CFS_ALLOC_ZERO above -- confirm the flag's semantics) */
+        memset(tconn, 0, sizeof(ksock_tconn_t));
+
+        /* initialize the tconn ... */
+        tconn->kstc_magic = KS_TCONN_MAGIC;
+
+        /* deferred work item + event used by the disconnect path */
+        ExInitializeWorkItem(
+            &(tconn->kstc_disconnect.WorkItem),
+            KsDisconnectHelper,
+            &(tconn->kstc_disconnect)
+            );
+
+        KeInitializeEvent(
+                &(tconn->kstc_disconnect.Event),
+                SynchronizationEvent,
+                FALSE );
+
+        /* deferred work item that runs ks_destroy_tconn once the
+           last reference is dropped (queued by ks_put_tconn) */
+        ExInitializeWorkItem(
+            &(tconn->kstc_destroy),
+            ks_destroy_tconn,
+            tconn
+            );
+
+        spin_lock_init(&(tconn->kstc_lock));
+
+        /* creation reference, released by the caller via ks_put_tconn */
+        ks_get_tconn(tconn);
+
+        spin_lock(&(ks_data.ksnd_tconn_lock));
+
+        /* attach it into global list in ks_data */
+
+        list_add(&(tconn->kstc_list), &(ks_data.ksnd_tconns));
+        ks_data.ksnd_ntconns++;
+        spin_unlock(&(ks_data.ksnd_tconn_lock));
+
+        /* default send / receive window sizes (64K) */
+        tconn->kstc_rcv_wnd = tconn->kstc_snd_wnd = 0x10000;
+    }
+
+    return (tconn);
+}
+
+
+/*
+ * ks_free_tconn
+ *   free the tconn structure to the SLAB cache or NonPaged
+ *   system pool
+ *
+ * Arguments:
+ *   tconn:  the tcon is to be freed
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void
+ks_free_tconn(ksock_tconn_t * tconn)
+{
+    LASSERT(atomic_read(&(tconn->kstc_refcount)) == 0);
+
+    spin_lock(&(ks_data.ksnd_tconn_lock));
+
+    /* detach the tconn from the global list */
+    list_del(&tconn->kstc_list);
+    ks_data.ksnd_ntconns--;
+
+    /* the last tconn going away unblocks ks_tdi_fini_data */
+    if (0 == ks_data.ksnd_ntconns) {
+        cfs_wake_event(&ks_data.ksnd_tconn_exit);
+    }
+
+    spin_unlock(&(ks_data.ksnd_tconn_lock));
+
+    /* hand the memory back to the slab cache */
+    cfs_mem_cache_free(ks_data.ksnd_tconn_slab, tconn);
+}
+
+
+/*
+ * ks_init_listener
+ *   Initialize the tconn as a listener (daemon)
+ *
+ * Arguments:
+ *   tconn: the listener tconn
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void
+ks_init_listener(
+    ksock_tconn_t * tconn
+    )
+{
+    /* mark this connection as a listening daemon bound to the
+       TCP transport device */
+    tconn->kstc_type = kstt_listener;
+    RtlInitUnicodeString(&(tconn->kstc_dev), TCP_DEVICE_NAME);
+
+    /* backlog queues for child connections: pending and accepted */
+    CFS_INIT_LIST_HEAD(&(tconn->listener.kstc_listening.list));
+    CFS_INIT_LIST_HEAD(&(tconn->listener.kstc_accepted.list));
+
+    cfs_init_event(&(tconn->listener.kstc_accept_event), TRUE, FALSE);
+    cfs_init_event(&(tconn->listener.kstc_destroy_event), TRUE, FALSE);
+
+    tconn->kstc_state = ksts_inited;
+}
+
+
+/*
+ * ks_init_sender
+ *   Initialize the tconn as a sender
+ *
+ * Arguments:
+ *   tconn: the sender tconn
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void
+ks_init_sender(
+    ksock_tconn_t * tconn
+    )
+{
+    /* an outbound (active-connect) endpoint on the TCP device */
+    tconn->kstc_type = kstt_sender;
+    RtlInitUnicodeString(&(tconn->kstc_dev), TCP_DEVICE_NAME);
+
+    /* independent receive / transmit Tsdu chains */
+    KsInitializeKsChain(&(tconn->sender.kstc_recv));
+    KsInitializeKsChain(&(tconn->sender.kstc_send));
+
+    tconn->kstc_rcv_wnd = TDINAL_WINDOW_DEFAULT_SIZE;
+    tconn->kstc_snd_wnd = TDINAL_WINDOW_DEFAULT_SIZE;
+
+    tconn->kstc_state = ksts_inited;
+}
+
+/*
+ * ks_init_child
+ *   Initialize the tconn as a child
+ *
+ * Arguments:
+ *   tconn: the child tconn
+ *
+ * Return Value:
+ *   N/A
+ *
+ * NOTES:
+ *   N/A
+ */
+
+void
+ks_init_child(
+    ksock_tconn_t * tconn
+    )
+{
+    /* a child endpoint spawned by a listener, on the TCP device */
+    tconn->kstc_type = kstt_child;
+    RtlInitUnicodeString(&(tconn->kstc_dev), TCP_DEVICE_NAME);
+
+    /* independent receive / transmit Tsdu chains */
+    KsInitializeKsChain(&(tconn->child.kstc_recv));
+    KsInitializeKsChain(&(tconn->child.kstc_send));
+
+    tconn->kstc_rcv_wnd = TDINAL_WINDOW_DEFAULT_SIZE;
+    tconn->kstc_snd_wnd = TDINAL_WINDOW_DEFAULT_SIZE;
+
+    tconn->kstc_state = ksts_inited;
+}
+
+/*
+ * ks_get_tconn
+ *   increase the reference count of the tconn with 1
+ *
+ * Arguments:
+ *   tconn: the tdi connection to be referred
+ *
+ * Return Value:
+ *   N/A
+ *
+ * NOTES:
+ *   N/A
+ */
+
+void
+ks_get_tconn(
+    ksock_tconn_t * tconn
+    )
+{
+    /* take one reference; ks_put_tconn is the matching release */
+    atomic_inc(&(tconn->kstc_refcount));
+}
+
+/*
+ * ks_put_tconn
+ *   decrease the reference count of the tconn and destroy
+ *   it if the reference count becomes 0.
+ *
+ * Arguments:
+ *   tconn: the tdi connection to be dereferred
+ *
+ * Return Value:
+ *   N/A
+ *
+ * NOTES:
+ *   N/A
+ */
+
+void
+ks_put_tconn(
+    ksock_tconn_t *tconn
+    )
+{
+    if (atomic_dec_and_test(&(tconn->kstc_refcount))) {
+
+        spin_lock(&(tconn->kstc_lock));
+
+        if ( ( tconn->kstc_type == kstt_child ||
+               tconn->kstc_type == kstt_sender ) &&
+             ( tconn->kstc_state == ksts_connected ) ) {
+
+            /* still connected: abort the tdi connection first;
+               presumably destruction is retried when the disconnect
+               path drops its own references -- confirm against
+               ks_abort_tconn */
+            spin_unlock(&(tconn->kstc_lock));
+
+            ks_abort_tconn(tconn);
+
+        } else {
+
+            /* queue the destroy work item exactly once; the flag
+               guards against double-queueing */
+            if (cfs_is_flag_set(tconn->kstc_flags, KS_TCONN_DESTROY_BUSY)) {
+                cfs_enter_debugger();
+            } else {
+                ExQueueWorkItem(
+                        &(tconn->kstc_destroy),
+                        DelayedWorkQueue
+                        );
+
+                cfs_set_flag(tconn->kstc_flags, KS_TCONN_DESTROY_BUSY);
+            }
+
+            spin_unlock(&(tconn->kstc_lock));
+        }
+    }
+}
+
+/*
+ * ks_destroy_tconn
+ *   cleanup the tdi connection and free it
+ *
+ * Arguments:
+ *   tconn: the tdi connection to be cleaned.
+ *
+ * Return Value:
+ *   N/A
+ *
+ * NOTES:
+ *   N/A
+ */
+
+void
+ks_destroy_tconn(
+    ksock_tconn_t *     tconn
+    )
+{
+    LASSERT(tconn->kstc_refcount.counter == 0);
+
+    if (tconn->kstc_type == kstt_listener) {
+
+        ks_reset_handlers(tconn);
+
+        /* for listener, we just need to close the address object */
+        KsCloseAddress(
+                tconn->kstc_addr.Handle,
+                tconn->kstc_addr.FileObject
+                );
+
+        tconn->kstc_state = ksts_inited;
+
+    } else if (tconn->kstc_type == kstt_child) {
+
+        /* for child tdi connections */
+
+        /* disassociate the relation between its connection object
+           and the address object */
+
+        if (tconn->kstc_state == ksts_associated) {
+            KsDisassociateAddress(
+                tconn->child.kstc_info.FileObject
+                );
+        }
+
+        /* release the connection object */
+
+        KsCloseConnection(
+                tconn->child.kstc_info.Handle,
+                tconn->child.kstc_info.FileObject
+                );
+
+        /* release its reference on its parent's address object
+           (NULL handle: only the file object reference is dropped) */
+        KsCloseAddress(
+                NULL,
+                tconn->kstc_addr.FileObject
+                );
+
+        /* lock order: parent first, then child */
+        spin_lock(&tconn->child.kstc_parent->kstc_lock);
+        spin_lock(&tconn->kstc_lock);
+
+        tconn->kstc_state = ksts_inited;
+
+        /* remove it from its parent's backlog queues and fix the
+           matching queue counter */
+
+        if (tconn->child.kstc_queued) {
+
+            list_del(&(tconn->child.kstc_link));
+
+            if (tconn->child.kstc_queueno) {
+
+                LASSERT(tconn->child.kstc_parent->listener.kstc_accepted.num > 0);
+                tconn->child.kstc_parent->listener.kstc_accepted.num -= 1;
+
+            } else {
+
+                LASSERT(tconn->child.kstc_parent->listener.kstc_listening.num > 0);
+                tconn->child.kstc_parent->listener.kstc_listening.num -= 1;
+            }
+
+            tconn->child.kstc_queued = FALSE;
+        }
+
+        spin_unlock(&tconn->kstc_lock);
+        spin_unlock(&tconn->child.kstc_parent->kstc_lock);
+
+        /* drop the reference of the parent tconn */
+        ks_put_tconn(tconn->child.kstc_parent);
+
+    } else if (tconn->kstc_type == kstt_sender) {
+
+        ks_reset_handlers(tconn);
+
+        /* release the connection object */
+
+        KsCloseConnection(
+                tconn->sender.kstc_info.Handle,
+                tconn->sender.kstc_info.FileObject
+                );
+
+        /* release its reference on its address object */
+        KsCloseAddress(
+                tconn->kstc_addr.Handle,
+                tconn->kstc_addr.FileObject
+                );
+
+        tconn->kstc_state = ksts_inited;
+
+    } else {
+        /* unexpected tconn type: should never happen */
+        cfs_enter_debugger();
+    }
+
+    /* free the tconn structure ... */
+
+    ks_free_tconn(tconn);
+}
+
+/*
+ * ks_query_data
+ *   report how many bytes are currently queued on the tconn's
+ *   receive chain (normal or expedited, per bIsExpedited).
+ *
+ * Returns 0 on success, -EINVAL for a tconn type that carries no
+ * data stream, or -ENOTCONN when the connection is not established.
+ */
+int
+ks_query_data(
+    ksock_tconn_t * tconn,
+    size_t *        size,
+    int             bIsExpedited )
+{
+    int             rc = 0;
+    PKS_CHAIN       chain;
+    PKS_TSDUMGR     mgr;
+
+    *size = 0;
+
+    ks_get_tconn(tconn);
+    spin_lock(&(tconn->kstc_lock));
+
+    /* only sender / child connections own receive chains */
+    if ( tconn->kstc_type != kstt_sender &&
+         tconn->kstc_type != kstt_child) {
+        rc = -EINVAL;
+        spin_unlock(&(tconn->kstc_lock));
+        goto errorout;
+    }
+
+    if (tconn->kstc_state != ksts_connected) {
+        rc = -ENOTCONN;
+        spin_unlock(&(tconn->kstc_lock));
+        goto errorout;
+    }
+
+    if (tconn->kstc_type == kstt_sender) {
+        chain = &(tconn->sender.kstc_recv);
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        chain = &(tconn->child.kstc_recv);
+    }
+
+    /* pick the expedited or normal tsdu manager and read its total */
+    mgr = bIsExpedited ? &(chain->Expedited) : &(chain->Normal);
+
+    *size = mgr->TotalBytes;
+    spin_unlock(&(tconn->kstc_lock));
+
+errorout:
+
+    ks_put_tconn(tconn);
+
+    return (rc);
+}
+
+/*
+ * ks_get_tcp_option
+ *   Query the options of the tcp stream connection
+ *
+ * Arguments:
+ *   tconn:         the tdi connection
+ *   ID:            option id
+ *   OptionValue:   buffer to store the option value
+ *   Length:        the length of the value, to be returned
+ *
+ * Return Value:
+ *   int:           ks return code
+ *
+ * NOTES:
+ *   N/A
+ */
+
+int
+ks_get_tcp_option (
+    ksock_tconn_t *     tconn,
+    ULONG               ID,
+    PVOID               OptionValue,
+    PULONG              Length
+    )
+{
+    NTSTATUS            Status = STATUS_SUCCESS;
+
+    IO_STATUS_BLOCK     IoStatus;
+
+    TCP_REQUEST_QUERY_INFORMATION_EX QueryInfoEx;
+
+    PFILE_OBJECT        ConnectionObject;
+    PDEVICE_OBJECT      DeviceObject = NULL;
+
+    PIRP                Irp = NULL;
+    PIO_STACK_LOCATION  IrpSp = NULL;
+
+    KEVENT              Event;
+
+    /* the query is only valid on a fully connected tconn */
+
+    ks_get_tconn(tconn);
+
+    if (tconn->kstc_state != ksts_connected) {
+        Status = STATUS_INVALID_PARAMETER;
+        goto errorout;
+    }
+
+    LASSERT(tconn->kstc_type == kstt_sender ||
+           tconn->kstc_type == kstt_child);
+
+    if (tconn->kstc_type == kstt_sender) {
+        ConnectionObject = tconn->sender.kstc_info.FileObject;
+    } else {
+        ConnectionObject = tconn->child.kstc_info.FileObject;
+    }
+
+    /* build the TDI query: per-connection, protocol-class option ID */
+    QueryInfoEx.ID.toi_id = ID;
+    QueryInfoEx.ID.toi_type   = INFO_TYPE_CONNECTION;
+    QueryInfoEx.ID.toi_class  = INFO_CLASS_PROTOCOL;
+    QueryInfoEx.ID.toi_entity.tei_entity   = CO_TL_ENTITY;
+    QueryInfoEx.ID.toi_entity.tei_instance = 0;
+
+    RtlZeroMemory(&(QueryInfoEx.Context), CONTEXT_SIZE);
+
+    KeInitializeEvent(&Event, NotificationEvent, FALSE);
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    /* issue IOCTL_TCP_QUERY_INFORMATION_EX to the tcp device; the
+       event/IoStatus pair is signalled/filled on completion */
+    Irp = IoBuildDeviceIoControlRequest(
+                IOCTL_TCP_QUERY_INFORMATION_EX,
+                DeviceObject,
+                &QueryInfoEx,
+                sizeof(TCP_REQUEST_QUERY_INFORMATION_EX),
+                OptionValue,
+                *Length,
+                FALSE,
+                &Event,
+                &IoStatus
+                );
+
+    if (Irp == NULL) {
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    IrpSp = IoGetNextIrpStackLocation(Irp);
+
+    if (IrpSp == NULL) {
+
+        IoFreeIrp(Irp);
+        Irp = NULL;
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    /* the tcpip driver locates the connection context through the
+       file object stored in the next stack location */
+    IrpSp->FileObject = ConnectionObject;
+    IrpSp->DeviceObject = DeviceObject;
+
+    Status = IoCallDriver(DeviceObject, Irp);
+
+    if (Status == STATUS_PENDING) {
+
+        /* block (kernel mode, non-alertable) until completion */
+        KeWaitForSingleObject(
+                &Event,
+                Executive,
+                KernelMode,
+                FALSE,
+                NULL
+                );
+
+        Status = IoStatus.Status;
+    }
+
+
+    if (NT_SUCCESS(Status)) {
+        *Length = IoStatus.Information;
+    } else {
+        /* best-effort semantics: on failure the caller's buffer is
+           zeroed and the error is masked, so callers always see a
+           success return with a (possibly zero) option value */
+        cfs_enter_debugger();
+        memset(OptionValue, 0, *Length);
+        Status = STATUS_SUCCESS;
+    }
+
+errorout:
+
+    ks_put_tconn(tconn);
+
+    return cfs_error_code(Status);
+}
+
+/*
+ * ks_set_tcp_option
+ *   Set the options for the tcp stream connection
+ *
+ * Arguments:
+ *   tconn:     the tdi connection
+ *   ID:        option id
+ *   OptionValue: buffer containing the new option value
+ *   Length:    the length of the value
+ *
+ * Return Value:
+ *   int:       ks return code
+ *
+ * NOTES:
+ *   N/A
+ */
+
+NTSTATUS
+ks_set_tcp_option (
+    ksock_tconn_t * tconn,
+    ULONG           ID,
+    PVOID           OptionValue,
+    ULONG           Length
+    )
+{
+    NTSTATUS            Status = STATUS_SUCCESS;
+
+    IO_STATUS_BLOCK     IoStatus;
+
+    ULONG               SetInfoExLength;
+    PTCP_REQUEST_SET_INFORMATION_EX SetInfoEx = NULL;
+
+    PFILE_OBJECT        ConnectionObject;
+    PDEVICE_OBJECT      DeviceObject = NULL;
+
+    PIRP                Irp = NULL;
+    PIO_STACK_LOCATION  IrpSp = NULL;
+
+    PKEVENT             Event;
+
+    /* options may only be set on a fully connected tconn */
+
+    ks_get_tconn(tconn);
+
+    if (tconn->kstc_state != ksts_connected) {
+        Status = STATUS_INVALID_PARAMETER;
+        goto errorout;
+    }
+
+    LASSERT(tconn->kstc_type == kstt_sender ||
+           tconn->kstc_type == kstt_child);
+
+    if (tconn->kstc_type == kstt_sender) {
+        ConnectionObject = tconn->sender.kstc_info.FileObject;
+    } else {
+        ConnectionObject = tconn->child.kstc_info.FileObject;
+    }
+
+    /* single allocation holding the request header (whose Buffer[]
+       already accounts for 1 byte, hence the -1), the option payload,
+       and a KEVENT placed just after the payload */
+    SetInfoExLength =  sizeof(TCP_REQUEST_SET_INFORMATION_EX) - 1 + Length + sizeof(KEVENT);
+
+    SetInfoEx = ExAllocatePoolWithTag(
+                    NonPagedPool,
+                    SetInfoExLength,
+                    'TSSK'
+                    );
+
+    if (SetInfoEx == NULL) {
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    /* per-connection, protocol-class option for the TCP transport */
+    SetInfoEx->ID.toi_id = ID;
+
+    SetInfoEx->ID.toi_type  = INFO_TYPE_CONNECTION;
+    SetInfoEx->ID.toi_class = INFO_CLASS_PROTOCOL;
+    SetInfoEx->ID.toi_entity.tei_entity   = CO_TL_ENTITY;
+    SetInfoEx->ID.toi_entity.tei_instance = TL_INSTANCE;
+
+    SetInfoEx->BufferSize = Length;
+    RtlCopyMemory(&(SetInfoEx->Buffer[0]), OptionValue, Length);
+
+    /* the completion event lives right past the copied option value */
+    Event = (PKEVENT)(&(SetInfoEx->Buffer[Length]));
+    KeInitializeEvent(Event, NotificationEvent, FALSE);
+
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    Irp = IoBuildDeviceIoControlRequest(
+                IOCTL_TCP_SET_INFORMATION_EX,
+                DeviceObject,
+                SetInfoEx,
+                SetInfoExLength,
+                NULL,
+                0,
+                FALSE,
+                Event,
+                &IoStatus
+                );
+
+    if (Irp == NULL) {
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    IrpSp = IoGetNextIrpStackLocation(Irp);
+
+    if (IrpSp == NULL) {
+        IoFreeIrp(Irp);
+        Irp = NULL;
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    /* the tcpip driver locates the connection via the file object in
+       the next stack location */
+    IrpSp->FileObject = ConnectionObject;
+    IrpSp->DeviceObject = DeviceObject;
+
+    Status = IoCallDriver(DeviceObject, Irp);
+
+    if (Status == STATUS_PENDING) {
+
+        /* block until the lower driver signals completion */
+        KeWaitForSingleObject(
+                Event,
+                Executive,
+                KernelMode,
+                FALSE,
+                NULL
+                );
+
+        Status = IoStatus.Status;
+    }
+
+errorout:
+
+    if (SetInfoEx) {
+        ExFreePool(SetInfoEx);
+    }
+
+    /* failures are logged but deliberately masked: callers always
+       get a success code back */
+    if (!NT_SUCCESS(Status)) {
+        printk("ks_set_tcp_option: error setup tcp option: ID (%d), Status = %xh\n",
+               ID, Status);
+        Status = STATUS_SUCCESS;
+    }
+
+    ks_put_tconn(tconn);
+
+    return cfs_error_code(Status);
+}
+
+/*
+ * ks_bind_tconn
+ *   bind the tdi connection object with an address
+ *
+ * Arguments:
+ *   tconn:    tconn to be bound
+ *   parent:   the parent tconn object
+ *   ipaddr:   the ip address
+ *   port:     the port number
+ *
+ * Return Value:
+ *   int:   0 for success or ks error codes.
+ *
+ * NOTES:
+ *   N/A
+ */
+
+int
+ks_bind_tconn (
+    ksock_tconn_t * tconn,
+    ksock_tconn_t * parent,
+    ulong_ptr   addr,
+    unsigned short  port
+    )
+{
+    NTSTATUS            status;
+    int                 rc = 0;
+
+    ksock_tdi_addr_t    taddr;
+
+    memset(&taddr, 0, sizeof(ksock_tdi_addr_t));
+
+    /* binding is only legal from the freshly-initialized state */
+    if (tconn->kstc_state != ksts_inited) {
+
+        status = STATUS_INVALID_PARAMETER;
+        rc = cfs_error_code(status);
+
+        goto errorout;
+
+    } else if (tconn->kstc_type == kstt_child) {
+
+        /* a child (accepted) connection shares its parent's address
+           object instead of opening one of its own */
+        if (NULL == parent) {
+            status = STATUS_INVALID_PARAMETER;
+            rc = cfs_error_code(status);
+
+            goto errorout;
+        }
+
+        /* take a reference on the parent's address object */
+
+        taddr = parent->kstc_addr;
+        ObReferenceObject(taddr.FileObject);
+
+        /* and pin the parent tconn itself (released at teardown) */
+        ks_get_tconn(parent);
+
+    } else {
+
+        PTRANSPORT_ADDRESS TdiAddress = &(taddr.Tdi);
+        ULONG              AddrLen = 0;
+
+        /* initialize the local tdi (ip, port) address */
+
+        TdiAddress->TAAddressCount = 1;
+        TdiAddress->Address[0].AddressLength = TDI_ADDRESS_LENGTH_IP;
+        TdiAddress->Address[0].AddressType   = TDI_ADDRESS_TYPE_IP;
+
+        ((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->sin_port = htons(port);
+        ((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->in_addr = htonl(addr);
+
+        memset(&(((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->sin_zero[0]),0,8);
+
+
+        /* open the transport address object */
+
+        AddrLen = FIELD_OFFSET(TRANSPORT_ADDRESS, Address->Address) +
+                  TDI_ADDRESS_LENGTH_IP;
+
+        status = KsOpenAddress(
+                    &(tconn->kstc_dev),
+                    &(taddr.Tdi),
+                    AddrLen,
+                    &(taddr.Handle),
+                    &(taddr.FileObject)
+                    );
+
+        if (!NT_SUCCESS(status)) {
+
+            KsPrint((0, "ks_bind_tconn: failed to open ip addr object (%x:%d), status = %xh\n",
+                        addr, port,  status ));
+            rc = cfs_error_code(status);
+            goto errorout;
+        }
+    }
+
+    if (tconn->kstc_type == kstt_child) {
+        tconn->child.kstc_parent = parent;
+    }
+
+    /* record the (shared or newly opened) address, advance state */
+    tconn->kstc_state = ksts_bind;
+    tconn->kstc_addr  = taddr;
+
+errorout:
+
+    return (rc);
+}
+
+/*
+ * ks_build_tconn
+ *  build tcp/streaming connection to remote peer
+ *
+ * Arguments:
+ *   tconn:    tconn to be connected to the peer
+ *   addr:     the peer's ip address
+ *   port:     the peer's port number
+ *
+ * Return Value:
+ *   int:   0 for success or ks error codes.
+ *
+ * Notes:
+ *   N/A
+ */
+
+int
+ks_build_tconn(
+    ksock_tconn_t *                 tconn,
+    ulong_ptr                       addr,
+    unsigned short                  port
+    )
+{
+    int                             rc = 0;
+    NTSTATUS                        status = STATUS_SUCCESS;
+
+
+    PFILE_OBJECT                    ConnectionObject = NULL;
+    PDEVICE_OBJECT                  DeviceObject = NULL;
+
+    PTDI_CONNECTION_INFORMATION     ConnectionInfo = NULL;
+    ULONG                           AddrLength;
+
+    PIRP                            Irp = NULL;
+
+    /* only a bound sender tconn may initiate a connection */
+    LASSERT(tconn->kstc_type == kstt_sender);
+    LASSERT(tconn->kstc_state == ksts_bind);
+
+    ks_get_tconn(tconn);
+
+    {
+        /* set the event callbacks */
+        rc = ks_set_handlers(tconn);
+
+        if (rc < 0) {
+            cfs_enter_debugger();
+            goto errorout;
+        }
+    }
+
+    /* create the connection file handle / object  */
+    status = KsOpenConnection(
+                &(tconn->kstc_dev),
+                (CONNECTION_CONTEXT)tconn,
+                &(tconn->sender.kstc_info.Handle),
+                &(tconn->sender.kstc_info.FileObject)
+                );
+
+    if (!NT_SUCCESS(status)) {
+        rc = cfs_error_code(status);
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    /* associate the connection with the address object of the tconn */
+
+    status = KsAssociateAddress(
+                tconn->kstc_addr.Handle,
+                tconn->sender.kstc_info.FileObject
+                );
+
+    if (!NT_SUCCESS(status)) {
+        rc = cfs_error_code(status);
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    tconn->kstc_state = ksts_associated;
+
+    /* allocate the connection info together with the remote address
+       (the address lives in the trailing bytes of the allocation) */
+    AddrLength = FIELD_OFFSET(TRANSPORT_ADDRESS, Address->Address)
+                 + TDI_ADDRESS_LENGTH_IP;
+
+    ConnectionInfo = (PTDI_CONNECTION_INFORMATION)ExAllocatePoolWithTag(
+    NonPagedPool, sizeof(TDI_CONNECTION_INFORMATION) + AddrLength, 'iCsK');
+
+    if (NULL == ConnectionInfo) {
+
+        status = STATUS_INSUFFICIENT_RESOURCES;
+        rc = cfs_error_code(status);
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    /* Initializing ConnectionInfo ... */
+    {
+        PTRANSPORT_ADDRESS TdiAddress;
+
+        /* ConnectionInfo settings */
+
+        ConnectionInfo->UserDataLength = 0;
+        ConnectionInfo->UserData = NULL;
+        ConnectionInfo->OptionsLength = 0;
+        ConnectionInfo->Options = NULL;
+        ConnectionInfo->RemoteAddressLength = AddrLength;
+        ConnectionInfo->RemoteAddress = ConnectionInfo + 1;
+
+
+        /* initialize the tdi address of the remote peer */
+
+        TdiAddress = ConnectionInfo->RemoteAddress;
+
+        TdiAddress->TAAddressCount = 1;
+        TdiAddress->Address[0].AddressLength = TDI_ADDRESS_LENGTH_IP;
+        TdiAddress->Address[0].AddressType   = TDI_ADDRESS_TYPE_IP;
+
+        ((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->sin_port = htons(port);
+        ((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->in_addr = htonl(addr);
+
+        memset(&(((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->sin_zero[0]),0,8);
+    }
+
+    /* Now prepare to connect the remote peer ... */
+
+    ConnectionObject = tconn->sender.kstc_info.FileObject;
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    /* allocate a new Irp */
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        status = STATUS_INSUFFICIENT_RESOURCES;
+        rc = cfs_error_code(status);
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    /* setup the Irp as a TDI connect request */
+
+    TdiBuildConnect(
+            Irp,
+            DeviceObject,
+            ConnectionObject,
+            NULL,
+            NULL,
+            NULL,
+            ConnectionInfo,
+            NULL
+            );
+
+
+    /* submit the Irp to the underlying transport driver and wait.
+       NB: KsSubmitTdiIrp appears to take ownership of the Irp (it is
+       never freed here on success and is NULLed on failure) --
+       confirm against KsSubmitTdiIrp's implementation */
+    status = KsSubmitTdiIrp(
+                    DeviceObject,
+                    Irp,
+                    TRUE,
+                    NULL
+                    );
+
+    spin_lock(&(tconn->kstc_lock));
+
+    if (NT_SUCCESS(status)) {
+
+        /* Connected! the connection is built successfully. */
+
+        tconn->kstc_state = ksts_connected;
+
+        tconn->sender.kstc_info.ConnectionInfo = ConnectionInfo;
+        tconn->sender.kstc_info.Remote         = ConnectionInfo->RemoteAddress;
+
+        spin_unlock(&(tconn->kstc_lock));
+
+    } else {
+
+        /* Not connected! Abort it ... */
+
+        if (rc != 0) {
+            cfs_enter_debugger();
+        }
+
+        Irp = NULL;
+        rc = cfs_error_code(status);
+
+        tconn->kstc_state = ksts_associated;
+        spin_unlock(&(tconn->kstc_lock));
+
+        /* disassociate the connection and the address object,
+           after cleanup,  it's safe to set the state to abort ... */
+
+        if ( NT_SUCCESS(KsDisassociateAddress(
+                        tconn->sender.kstc_info.FileObject))) {
+            tconn->kstc_state = ksts_aborted;
+        }
+
+        /* reset the event callbacks.  Do NOT assign the result to rc:
+           doing so would overwrite the connect failure code and could
+           make a failed connect look successful to the caller. */
+        (void) ks_reset_handlers(tconn);
+
+        goto errorout;
+    }
+
+errorout:
+
+    if (NT_SUCCESS(status)) {
+
+        /* query and record the local address of the new connection */
+        ks_query_local_ipaddr(tconn);
+
+    } else {
+
+        /* on failure, release whatever this routine still owns */
+        if (ConnectionInfo) {
+            ExFreePool(ConnectionInfo);
+        }
+        if (Irp) {
+            IoFreeIrp(Irp);
+        }
+    }
+
+    ks_put_tconn(tconn);
+
+    return (rc);
+}
+
+
+/*
+ * ks_disconnect_tconn
+ *   disconnect the tconn from a connection
+ *
+ * Arguments:
+ *   tconn: the tdi connection object, already connected
+ *   flags: flags & options for disconnecting
+ *
+ * Return Value:
+ *   int: ks error code
+ *
+ * Notes:
+ *   N/A
+ */
+
+int
+ks_disconnect_tconn(
+    ksock_tconn_t *     tconn,
+    ulong_ptr       flags
+    )
+{
+    NTSTATUS            status = STATUS_SUCCESS;
+
+    ksock_tconn_info_t * info;
+
+    PFILE_OBJECT        ConnectionObject;
+    PDEVICE_OBJECT      DeviceObject = NULL;
+
+    PIRP                Irp = NULL;
+
+    KEVENT              Event;
+
+    ks_get_tconn(tconn);
+
+    /* it must be connected already and it
+       must be a sender or a child ...       */
+
+    LASSERT(tconn->kstc_state == ksts_connected);
+    LASSERT( tconn->kstc_type == kstt_sender ||
+            tconn->kstc_type == kstt_child);
+
+    /* reset all the event handlers to NULL (children are skipped --
+       presumably their handlers belong to the parent's address
+       object; confirm against ks_set_handlers) */
+
+    if (tconn->kstc_type != kstt_child) {
+        ks_reset_handlers (tconn);
+    }
+
+    /* Disconnecting to the remote peer ... */
+
+    if (tconn->kstc_type == kstt_sender) {
+        info = &(tconn->sender.kstc_info);
+    } else {
+        info = &(tconn->child.kstc_info);
+    }
+
+    ConnectionObject = info->FileObject;
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    /* allocate an Irp and setup it */
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        status = STATUS_INSUFFICIENT_RESOURCES;
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    KeInitializeEvent(
+            &Event,
+            SynchronizationEvent,
+            FALSE
+            );
+
+    /* the completion routine signals the event above when the lower
+       driver finishes the disconnect */
+    TdiBuildDisconnect(
+            Irp,
+            DeviceObject,
+            ConnectionObject,
+            KsDisconectCompletionRoutine,
+            &Event,
+            NULL,
+            flags,
+            NULL,
+            NULL
+            );
+
+    /* issue the Irp to the underlying transport
+       driver to disconnect the connection    */
+
+    status = IoCallDriver(DeviceObject, Irp);
+
+    if (STATUS_PENDING == status) {
+
+        /* wait for completion; the real result is then taken from
+           the Irp's IoStatus block (the wait result is discarded) */
+        status = KeWaitForSingleObject(
+                     &Event,
+                     Executive,
+                     KernelMode,
+                     FALSE,
+                     NULL
+                     );
+
+        status = Irp->IoStatus.Status;
+    }
+
+    KsPrint((2, "KsDisconnect: Disconnection is done with Status = %xh (%s) ...\n",
+                status, KsNtStatusToString(status)));
+
+    IoFreeIrp(Irp);
+
+    if (info->ConnectionInfo) {
+
+        /* disassociate the association between connection/address objects */
+
+        status = KsDisassociateAddress(ConnectionObject);
+
+        if (!NT_SUCCESS(status)) {
+            cfs_enter_debugger();
+        }
+
+        spin_lock(&(tconn->kstc_lock));
+
+        /* cleanup the tsdumgr Lists */
+        KsCleanupTsdu (tconn);
+
+        /* set the state of the tconn */
+        if (NT_SUCCESS(status)) {
+            tconn->kstc_state = ksts_disconnected;
+        } else {
+            tconn->kstc_state = ksts_associated;
+        }
+
+        /* free  the connection info to system pool*/
+        ExFreePool(info->ConnectionInfo);
+        info->ConnectionInfo = NULL;
+        info->Remote = NULL;
+
+        spin_unlock(&(tconn->kstc_lock));
+    }
+
+    /* NB: any disconnect/disassociation failure above is masked here;
+       callers only see an error if the Irp allocation failed */
+    status = STATUS_SUCCESS;
+
+errorout:
+
+    ks_put_tconn(tconn);
+
+    return cfs_error_code(status);
+}
+
+
+/*
+ * ks_abort_tconn
+ *   The connection was broken unexpectedly; we need to do
+ *   some cleanup.
+ *
+ * Arguments:
+ *   tconn: the tdi connection
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void
+ks_abort_tconn(
+    ksock_tconn_t *     tconn
+    )
+{
+    PKS_DISCONNECT_WORKITEM WorkItem = NULL;
+
+    WorkItem = &(tconn->kstc_disconnect);
+
+    ks_get_tconn(tconn);
+    spin_lock(&(tconn->kstc_lock));
+
+    if (tconn->kstc_state != ksts_connected) {
+        /* nothing to abort: drop the reference taken above */
+        ks_put_tconn(tconn);
+    } else {
+
+        if (!cfs_is_flag_set(tconn->kstc_flags, KS_TCONN_DISCONNECT_BUSY)) {
+
+            /* queue an abortive disconnect to a system worker thread;
+               the reference taken above is kept for the workitem */
+            WorkItem->Flags = TDI_DISCONNECT_ABORT;
+            WorkItem->tconn = tconn;
+
+            cfs_set_flag(tconn->kstc_flags, KS_TCONN_DISCONNECT_BUSY);
+
+            ExQueueWorkItem(
+                    &(WorkItem->WorkItem),
+                    DelayedWorkQueue
+                    );
+        }
+        /* NOTE(review): if KS_TCONN_DISCONNECT_BUSY was already set,
+           the reference taken by ks_get_tconn() above is never
+           released on this path -- looks like a refcount leak;
+           confirm against the disconnect workitem's put. */
+    }
+
+    spin_unlock(&(tconn->kstc_lock));
+}
+
+
+/*
+ * ks_query_local_ipaddr
+ *   query the local connection ip address
+ *
+ * Arguments:
+ *   tconn:  the tconn which is connected
+ *
+ * Return Value:
+ *   int: ks error code
+ *
+ * Notes:
+ *   N/A
+ */
+
+/*
+ * ks_query_local_ipaddr
+ *   fetch the local (ip, port) of a connected tconn into its
+ *   kstc_addr record and log the result.
+ */
+int
+ks_query_local_ipaddr(
+    ksock_tconn_t *     tconn
+    )
+{
+    PFILE_OBJECT       FileObject = NULL;
+    NTSTATUS           status;
+
+    PTRANSPORT_ADDRESS TdiAddress;
+    ULONG              AddressLength;
+
+    /* pick the connection file object for this tconn type; other
+       types own no connection object and cannot be queried */
+    switch (tconn->kstc_type) {
+
+    case kstt_sender:
+        FileObject = tconn->sender.kstc_info.FileObject;
+        break;
+
+    case kstt_child:
+        FileObject = tconn->child.kstc_info.FileObject;
+        break;
+
+    default:
+        status = STATUS_INVALID_PARAMETER;
+        goto errorout;
+    }
+
+    TdiAddress    = &(tconn->kstc_addr.Tdi);
+    AddressLength = MAX_ADDRESS_LENGTH;
+
+    status = KsQueryIpAddress(FileObject, TdiAddress, &AddressLength);
+
+    if (NT_SUCCESS(status)) {
+        KsPrint((0, "ks_query_local_ipaddr: Local ip address = %xh port = %xh\n",
+                ((PTDI_ADDRESS_IP)(&(TdiAddress->Address[0].Address)))->in_addr,
+                ((PTDI_ADDRESS_IP)(&(TdiAddress->Address[0].Address)))->sin_port ));
+    } else {
+        KsPrint((0, "KsQueryonnectionIpAddress: Failed to query the connection local ip address.\n"));
+    }
+
+errorout:
+
+    return cfs_error_code(status);
+}
+
+/*
+ * ks_send_mdl
+ *   send MDL chain to the peer for a stream connection
+ *
+ * Arguments:
+ *   tconn: tdi connection object
+ *   tx:    the transmit context
+ *   mdl:   the mdl chain containing the data
+ *   len:   length of the data
+ *   flags: flags of the transmission
+ *
+ * Return Value:
+ *   ks return code
+ *
+ * Notes:
+ *   N/A
+ */
+
+int
+ks_send_mdl(
+    ksock_tconn_t * tconn,
+    void *          tx,
+    ksock_mdl_t *   mdl,
+    int             len,
+    int             flags
+    )
+{
+    NTSTATUS            Status;
+    int                 rc = 0;
+    ulong_ptr       length;
+    ulong_ptr       tflags;
+    ksock_tdi_tx_t *    context;
+
+    PKS_CHAIN           KsChain;
+    PKS_TSDUMGR         KsTsduMgr;
+    PKS_TSDU            KsTsdu;
+    PKS_TSDU_BUF        KsTsduBuf;
+    PKS_TSDU_DAT        KsTsduDat;
+
+    BOOLEAN             bNewTsdu = FALSE;   /* newly allocated */
+    BOOLEAN             bNewBuff = FALSE;   /* newly allocated */
+
+    BOOLEAN             bBuffed;            /* bufferred sending */
+
+    PUCHAR              Buffer = NULL;
+    ksock_mdl_t *       NewMdl = NULL;
+
+    PIRP                Irp = NULL;
+    PFILE_OBJECT        ConnObject;
+    PDEVICE_OBJECT      DeviceObject;
+
+    BOOLEAN             bIsNonBlock;
+
+    ks_get_tconn(tconn);
+
+    tflags = ks_tdi_send_flags(flags);
+    bIsNonBlock  = cfs_is_flag_set(flags, MSG_DONTWAIT);
+
+    spin_lock(&tconn->kstc_lock);
+
+    LASSERT( tconn->kstc_type == kstt_sender ||
+             tconn->kstc_type == kstt_child );
+
+    if (tconn->kstc_state != ksts_connected) {
+        spin_unlock(&tconn->kstc_lock);
+        ks_put_tconn(tconn);
+        return -ENOTCONN;
+    }
+
+    /* get the latest Tsdu buffer form TsduMgr list.
+       just set NULL if the list is empty. */
+
+    if (tconn->kstc_type == kstt_sender) {
+        KsChain = &(tconn->sender.kstc_send);
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        KsChain = &(tconn->child.kstc_send);
+    }
+
+    if (cfs_is_flag_set(tflags, TDI_SEND_EXPEDITED)) {
+        KsTsduMgr = &(KsChain->Expedited);
+    } else {
+        KsTsduMgr = &(KsChain->Normal);
+    }
+
+    if (KsTsduMgr->TotalBytes + len <= tconn->kstc_snd_wnd) {
+        bBuffed = TRUE;
+    } else {
+        bBuffed = FALSE;
+    }
+
+    /* do the preparation work for bufferred sending */
+
+    if (bBuffed) {
+
+        /* if the data is even larger than the biggest Tsdu, we have
+           to allocate new buffer and use TSDU_TYOE_BUF to store it */
+
+        if ( KS_TSDU_STRU_SIZE((ULONG)len) > ks_data.ksnd_tsdu_size
+             - KS_DWORD_ALIGN(sizeof(KS_TSDU))) {
+            bNewBuff = TRUE;
+        }
+
+        if (list_empty(&(KsTsduMgr->TsduList))) {
+
+            LASSERT(KsTsduMgr->NumOfTsdu == 0);
+            KsTsdu = NULL;
+
+        } else {
+
+            LASSERT(KsTsduMgr->NumOfTsdu > 0);
+            KsTsdu = list_entry(KsTsduMgr->TsduList.prev, KS_TSDU, Link);
+            LASSERT(KsTsdu->Magic == KS_TSDU_MAGIC);
+
+
+            /* check whether KsTsdu free space is enough, or we need alloc new Tsdu */
+            if (bNewBuff) {
+                if (sizeof(KS_TSDU_BUF) + KsTsdu->LastOffset > KsTsdu->TotalLength) {
+                    KsTsdu = NULL;
+                }
+            } else {
+                if ( KS_TSDU_STRU_SIZE((ULONG)len) >
+                     KsTsdu->TotalLength - KsTsdu->LastOffset ) {
+                    KsTsdu = NULL;
+                }
+            }
+        }
+
+        /* if there's no Tsdu or the free size is not enough for the
+           KS_TSDU_BUF or KS_TSDU_DAT. We need re-allocate a new Tsdu.  */
+
+        if (NULL == KsTsdu) {
+
+            KsTsdu = KsAllocateKsTsdu();
+
+            if (NULL == KsTsdu) {
+                bBuffed = FALSE;
+                bNewBuff = FALSE;
+            } else {
+                bNewTsdu = TRUE;
+            }
+        }
+
+        /* process the case that a new buffer is to be allocated from system memory */
+        if (bNewBuff) {
+
+            /* now allocating internal buffer to contain the payload */
+            Buffer = ExAllocatePool(NonPagedPool, len);
+
+            if (NULL == Buffer) {
+                bBuffed = FALSE;
+            }
+        }
+    }
+
+    if (bBuffed) {
+
+        if (bNewBuff) {
+
+            /* queue a new KS_TSDU_BUF to the Tsdu buffer */
+            KsTsduBuf = (PKS_TSDU_BUF)((PUCHAR)KsTsdu + KsTsdu->LastOffset);
+
+            KsTsduBuf->TsduFlags    =  0;
+            KsTsduBuf->DataLength   =  (ULONG)len;
+            KsTsduBuf->StartOffset  =  0;
+            KsTsduBuf->UserBuffer   =  Buffer;
+        } else {
+            /* queue a new KS_TSDU_BUF to the Tsdu buffer */
+            KsTsduDat = (PKS_TSDU_DAT)((PUCHAR)KsTsdu + KsTsdu->LastOffset);
+
+            KsTsduDat->TsduFlags    =  0;
+            KsTsduDat->DataLength   =  (ULONG)len;
+            KsTsduDat->StartOffset  =  0;
+            KsTsduDat->TotalLength  = KS_TSDU_STRU_SIZE((ULONG)len);
+
+            Buffer = &KsTsduDat->Data[0];
+        }
+
+        /* now locking the Buffer and copy user payload into the buffer */
+        ASSERT(Buffer != NULL);
+
+        rc = ks_lock_buffer(Buffer, FALSE, len, IoReadAccess, &NewMdl);
+        if (rc != 0) {
+            printk("ks_send_mdl: bufferred: error allocating mdl.\n");
+            bBuffed = FALSE;
+        } else {
+            ULONG BytesCopied = 0;
+            TdiCopyMdlToBuffer(mdl, 0, Buffer, 0, (ULONG)len, &BytesCopied);
+            if (BytesCopied != (ULONG) len) {
+                bBuffed = FALSE;
+            }
+        }
+
+        /* Do the finializing job if we succeed to to lock the buffer and move
+           user data. Or we need do cleaning up ... */
+        if (bBuffed) {
+
+            if (bNewBuff) {
+                KsTsduBuf->TsduType     =  TSDU_TYPE_BUF;
+                KsTsdu->LastOffset += sizeof(KS_TSDU_BUF);
+
+            } else {
+                KsTsduDat->TsduType     =  TSDU_TYPE_DAT;
+                KsTsdu->LastOffset += KsTsduDat->TotalLength;
+            }
+
+            /* attach it to the TsduMgr list if the Tsdu is newly created. */
+            if (bNewTsdu) {
+
+                list_add_tail(&(KsTsdu->Link), &(KsTsduMgr->TsduList));
+                KsTsduMgr->NumOfTsdu++;
+            }
+
+        } else {
+
+            if (NewMdl) {
+                ks_release_mdl(NewMdl, FALSE);
+                NewMdl = NULL;
+            }
+
+            if (bNewBuff) {
+                ExFreePool(Buffer);
+                Buffer = NULL;
+                bNewBuff = FALSE;
+            }
+        }
+    }
+
+    /* update the TotalBytes being in sending */
+    KsTsduMgr->TotalBytes += (ULONG)len;
+
+    spin_unlock(&tconn->kstc_lock);
+
+    /* cleanup the Tsdu if not successful */
+    if (!bBuffed && bNewTsdu) {
+        KsPutKsTsdu(KsTsdu);
+        bNewTsdu = FALSE;
+        KsTsdu = NULL;
+    }
+
+    /* we need allocate the ksock_tx_t structure from memory pool. */
+
+    context = cfs_alloc(sizeof(ksock_tdi_tx_t) + sizeof(KEVENT),0);
+    if (!context) {
+        /* release the chained mdl */
+        ks_release_mdl(mdl, FALSE);
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    /* intialize the TcpContext */
+
+    memset(context,0, sizeof(ksock_tdi_tx_t) + sizeof(KEVENT));
+
+    context->tconn = tconn;
+    context->Event = (PKEVENT) ((PUCHAR)context + sizeof(ksock_tdi_tx_t));
+
+    KeInitializeEvent(context->Event, SynchronizationEvent, FALSE);
+
+    if (bBuffed) {
+
+         /* for bufferred transmission, we need set
+            the internal completion routine.  */
+
+        context->CompletionRoutine  = KsTcpSendCompletionRoutine;
+        context->KsTsduMgr          = KsTsduMgr;
+        context->CompletionContext  = KsTsdu;
+        context->CompletionContext2 = (bNewBuff ? (PVOID)KsTsduBuf : (PVOID)KsTsduDat);
+        context->bCounted = FALSE;
+
+    } else if (bIsNonBlock) {
+
+         /* for non-blocking transmission, we need set
+            the internal completion routine too.  */
+
+        context->CompletionRoutine = KsTcpSendCompletionRoutine;
+        context->CompletionContext = tx;
+        context->KsTsduMgr         = KsTsduMgr;
+        context->bCounted = TRUE;
+        context->ReferCount = 2;
+    }
+
+    if (tconn->kstc_type == kstt_sender) {
+        ConnObject = tconn->sender.kstc_info.FileObject;
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        ConnObject = tconn->child.kstc_info.FileObject;
+    }
+
+    DeviceObject = IoGetRelatedDeviceObject(ConnObject);
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        /* release the chained mdl */
+        ks_release_mdl(mdl, FALSE);
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    length = KsQueryMdlsSize(mdl);
+
+    LASSERT((ULONG)len <= length);
+
+    ks_get_tconn(tconn);
+
+    TdiBuildSend(
+        Irp,
+        DeviceObject,
+        ConnObject,
+        KsTcpCompletionRoutine,
+        context,
+        (bBuffed ? NewMdl : mdl),
+        (bBuffed ? (tflags | TDI_SEND_NON_BLOCKING) : tflags),
+        (ULONG)len;
+      );
+
+    Status = IoCallDriver(DeviceObject, Irp);
+
+    if (bBuffed) {
+        ks_release_mdl(mdl, FALSE);
+        NewMdl = NULL;
+    }
+
+    if (!NT_SUCCESS(Status)) {
+        cfs_enter_debugger();
+        rc = cfs_error_code(Status);
+        goto errorout;
+    }
+
+    if (bBuffed) {
+        Status = STATUS_SUCCESS;
+        rc  = len;
+        context = NULL;
+    } else {
+        if (bIsNonBlock) {
+            if (InterlockedDecrement(&context->ReferCount) == 0) {
+                Status = Irp->IoStatus.Status;
+            } else {
+                Status = STATUS_PENDING;
+                context = NULL;
+            }
+        } else {
+            if (STATUS_PENDING == Status) {
+                Status = KeWaitForSingleObject(
+                         context->Event,
+                         Executive,
+                         KernelMode,
+                         FALSE,
+                         NULL
+                         );
+
+                if (NT_SUCCESS(Status)) {
+                    Status = Irp->IoStatus.Status;
+                }
+            }
+        }
+
+        if (Status == STATUS_SUCCESS) {
+            rc = (int)(Irp->IoStatus.Information);
+
+            spin_lock(&tconn->kstc_lock);
+            KsTsduMgr->TotalBytes -= rc;
+            spin_unlock(&tconn->kstc_lock);
+
+        } else {
+            rc = cfs_error_code(Status);
+        }
+    }
+
+errorout:
+
+    if (bBuffed) {
+
+        if (NewMdl) {
+            ks_release_mdl(NewMdl, FALSE);
+            NewMdl = NULL;
+        }
+
+        if (bNewBuff) {
+            if (!NT_SUCCESS(Status)) {
+                ExFreePool(Buffer);
+                Buffer = NULL;
+            }
+        }
+
+    } else {
+
+        if (Status != STATUS_PENDING) {
+
+            if (Irp) {
+
+                /* Freeing the Irp ... */
+
+                IoFreeIrp(Irp);
+                Irp = NULL;
+            }
+        }
+    }
+
+    if (!NT_SUCCESS(Status)) {
+
+        spin_lock(&tconn->kstc_lock);
+
+        KsTsduMgr->TotalBytes -= (ULONG)len;
+
+        if (bBuffed) {
+
+            /* attach it to the TsduMgr list if the Tsdu is newly created. */
+            if (bNewTsdu) {
+
+                list_del(&(KsTsdu->Link));
+                KsTsduMgr->NumOfTsdu--;
+
+                KsPutKsTsdu(KsTsdu);
+            } else {
+                if (bNewBuff) {
+                    if ( (ulong_ptr)KsTsduBuf + sizeof(KS_TSDU_BUF) ==
+                         (ulong_ptr)KsTsdu + KsTsdu->LastOffset) {
+                        KsTsdu->LastOffset -= sizeof(KS_TSDU_BUF);
+                        KsTsduBuf->TsduType = 0;
+                    } else {
+                        cfs_enter_debugger();
+                        KsTsduBuf->StartOffset = KsTsduBuf->DataLength;
+                    }
+                } else {
+                    if ( (ulong_ptr)KsTsduDat + KsTsduDat->TotalLength ==
+                         (ulong_ptr)KsTsdu + KsTsdu->LastOffset) {
+                        KsTsdu->LastOffset -= KsTsduDat->TotalLength;
+                        KsTsduDat->TsduType = 0;
+                    } else {
+                        cfs_enter_debugger();
+                        KsTsduDat->StartOffset = KsTsduDat->DataLength;
+                    }
+                }
+            }
+        }
+
+        spin_unlock(&tconn->kstc_lock);
+    }
+
+    /* free the context if is not used at all */
+    if (context) {
+        cfs_free(context);
+    }
+
+    ks_put_tconn(tconn);
+
+    return rc;
+}
+
+/*
+ * ks_recv_mdl
+ *   Receive data from the peer for a stream connection
+ *
+ * Arguments:
+ *   tconn: tdi connection object
+ *   mdl:   the mdl chain to contain the incoming data
+ *   len:   length of the data
+ *   flags: flags of the receiving
+ *
+ * Return Value:
+ *   ks return code
+ *
+ * Notes:
+ *   N/A
+ */
+
+int
+ks_recv_mdl(
+    ksock_tconn_t * tconn,
+    ksock_mdl_t *   mdl,
+    int             size,
+    int             flags
+    )
+{
+    NTSTATUS        Status = STATUS_SUCCESS;
+    int             rc = 0;
+
+    BOOLEAN         bIsNonBlock;    /* MSG_DONTWAIT requested ? */
+    BOOLEAN         bIsExpedited;   /* MSG_OOB (expedited) requested ? */
+
+    PKS_CHAIN       KsChain;
+    PKS_TSDUMGR     KsTsduMgr;
+    PKS_TSDU        KsTsdu;
+    PKS_TSDU_DAT    KsTsduDat;
+    PKS_TSDU_BUF    KsTsduBuf;
+    PKS_TSDU_MDL    KsTsduMdl;
+
+    PUCHAR          Buffer;
+
+    ULONG           BytesRecved = 0;    /* total bytes copied into mdl */
+    ULONG           RecvedOnce;         /* bytes copied in this pass */
+
+    bIsNonBlock  = cfs_is_flag_set(flags, MSG_DONTWAIT);
+    bIsExpedited = cfs_is_flag_set(flags, MSG_OOB);
+
+    ks_get_tconn(tconn);
+
+Again:
+
+    RecvedOnce = 0;
+
+    spin_lock(&(tconn->kstc_lock));
+
+    /* only sender / child connections carry stream data */
+    if ( tconn->kstc_type != kstt_sender &&
+         tconn->kstc_type != kstt_child) {
+
+        rc = -EINVAL;
+        spin_unlock(&(tconn->kstc_lock));
+
+        goto errorout;
+    }
+
+    if (tconn->kstc_state != ksts_connected) {
+
+        rc = -ENOTCONN;
+        spin_unlock(&(tconn->kstc_lock));
+
+        goto errorout;
+    }
+
+    /* select the receiving chain of this connection ... */
+    if (tconn->kstc_type == kstt_sender) {
+        KsChain = &(tconn->sender.kstc_recv);
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        KsChain = &(tconn->child.kstc_recv);
+    }
+
+    /* ... and the normal or expedited tsdu queue in it */
+    if (bIsExpedited) {
+        KsTsduMgr = &(KsChain->Expedited);
+    } else {
+        KsTsduMgr = &(KsChain->Normal);
+    }
+
+NextTsdu:
+
+    if (list_empty(&(KsTsduMgr->TsduList))) {
+
+        //
+        // It's a notification event. We need reset it to
+        // un-signaled state in case there no any tsdus.
+        //
+
+        KeResetEvent(&(KsTsduMgr->Event));
+
+    } else {
+
+        KsTsdu = list_entry(KsTsduMgr->TsduList.next, KS_TSDU, Link);
+        LASSERT(KsTsdu->Magic == KS_TSDU_MAGIC);
+
+        /* remove the KsTsdu from TsduMgr list to release the lock */
+        list_del(&(KsTsdu->Link));
+        KsTsduMgr->NumOfTsdu--;
+
+        spin_unlock(&(tconn->kstc_lock));
+
+        /* drain slots of this tsdu until the request is satisfied
+           or the tsdu is exhausted */
+        while ((ULONG)size > BytesRecved) {
+
+            ULONG BytesCopied = 0;
+            ULONG BytesToCopy = 0;
+            ULONG StartOffset = 0;
+
+            /* the current slot starts at StartOffset; its type tag
+               decides which of the three views below is valid */
+            KsTsduDat = (PKS_TSDU_DAT)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+            KsTsduBuf = (PKS_TSDU_BUF)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+            KsTsduMdl = (PKS_TSDU_MDL)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+
+            if ( TSDU_TYPE_DAT == KsTsduDat->TsduType ||
+                 TSDU_TYPE_BUF == KsTsduBuf->TsduType ) {
+
+
+                //
+                // Data Tsdu Unit ...
+                //
+
+                if (TSDU_TYPE_DAT == KsTsduDat->TsduType) {
+
+                    if (cfs_is_flag_set(KsTsduDat->TsduFlags, KS_TSDU_DAT_RECEIVING)) {
+                        /* data is not ready yet*/
+                        KeResetEvent(&(KsTsduMgr->Event));
+                        printk("ks_recv_mdl: KsTsduDat (%p) is not ready yet !!!!!!!\n", KsTsduDat);
+                        break;
+                    }
+
+                    Buffer = &KsTsduDat->Data[0];
+                    StartOffset = KsTsduDat->StartOffset;
+                    if (KsTsduDat->DataLength - KsTsduDat->StartOffset > size - BytesRecved) {
+                        /* Recvmsg request could be satisfied ... */
+                        BytesToCopy = size - BytesRecved;
+                    } else {
+                        BytesToCopy = KsTsduDat->DataLength - KsTsduDat->StartOffset;
+                    }
+
+                } else {
+
+                    if (cfs_is_flag_set(KsTsduBuf->TsduFlags, KS_TSDU_BUF_RECEIVING)) {
+                        /* data is not ready yet*/
+                        KeResetEvent(&(KsTsduMgr->Event));
+                        DbgPrint("ks_recv_mdl: KsTsduBuf (%p) is not ready yet !!!!!!!\n", KsTsduBuf);
+                        break;
+                    }
+
+                    ASSERT(TSDU_TYPE_BUF == KsTsduBuf->TsduType);
+                    Buffer = KsTsduBuf->UserBuffer;
+                    StartOffset = KsTsduBuf->StartOffset;
+
+                    if (KsTsduBuf->DataLength - KsTsduBuf->StartOffset > size - BytesRecved) {
+                        /* Recvmsg request could be satisfied ... */
+                        BytesToCopy = size - BytesRecved;
+                    } else {
+                        BytesToCopy = KsTsduBuf->DataLength - KsTsduBuf->StartOffset;
+                    }
+                }
+
+                if (BytesToCopy > 0) {
+                    Status = TdiCopyBufferToMdl(
+                                    Buffer,
+                                    StartOffset,
+                                    BytesToCopy,
+                                    mdl,
+                                    BytesRecved,
+                                    &BytesCopied
+                                    );
+
+                    if (NT_SUCCESS(Status)) {
+
+                        if (BytesToCopy != BytesCopied) {
+                            cfs_enter_debugger();
+                        }
+
+                        BytesRecved += BytesCopied;
+                        RecvedOnce  += BytesCopied;
+
+                    } else {
+
+                        cfs_enter_debugger();
+
+                        /* NOTE(review): a truncated copy is silently
+                           ignored here -- TODO confirm this is safe */
+                        if (STATUS_BUFFER_OVERFLOW == Status) {
+                        }
+                    }
+                }
+
+                /* advance the slot cursor; a fully consumed slot also
+                   advances the tsdu's StartOffset past the slot */
+                if (TSDU_TYPE_DAT == KsTsduDat->TsduType) {
+
+                    KsTsduDat->StartOffset += BytesCopied;
+
+                    if (KsTsduDat->StartOffset == KsTsduDat->DataLength) {
+                        KsTsdu->StartOffset += KsTsduDat->TotalLength;
+                    }
+
+                } else {
+
+                    ASSERT(TSDU_TYPE_BUF == KsTsduBuf->TsduType);
+                    KsTsduBuf->StartOffset += BytesCopied;
+                    if (KsTsduBuf->StartOffset == KsTsduBuf->DataLength) {
+                        KsTsdu->StartOffset += sizeof(KS_TSDU_BUF);
+                        /* now we need release the buf to system pool */
+                        ExFreePool(KsTsduBuf->UserBuffer);
+                    }
+                }
+
+            } else if (TSDU_TYPE_MDL == KsTsduMdl->TsduType) {
+
+                //
+                // MDL Tsdu Unit ...
+                //
+
+                if (KsTsduMdl->DataLength > size - BytesRecved) {
+
+                    /* Recvmsg request could be satisfied ... */
+
+                    BytesToCopy = size - BytesRecved;
+
+                } else {
+
+                    BytesToCopy = KsTsduMdl->DataLength;
+                }
+
+                Status = KsCopyMdlChainToMdlChain(
+                            KsTsduMdl->Mdl,
+                            KsTsduMdl->StartOffset,
+                            mdl,
+                            BytesRecved,
+                            BytesToCopy,
+                            &BytesCopied
+                            );
+
+                if (NT_SUCCESS(Status)) {
+
+                    if (BytesToCopy != BytesCopied) {
+                        cfs_enter_debugger();
+                    }
+
+                    KsTsduMdl->StartOffset += BytesCopied;
+                    KsTsduMdl->DataLength  -= BytesCopied;
+
+                    BytesRecved += BytesCopied;
+                    RecvedOnce  += BytesCopied;
+                } else {
+                    cfs_enter_debugger();
+                }
+
+                if (0 == KsTsduMdl->DataLength) {
+
+                    //
+                    // Call TdiReturnChainedReceives to release the Tsdu memory
+                    //
+
+                    TdiReturnChainedReceives(
+                        &(KsTsduMdl->Descriptor),
+                        1 );
+
+                    KsTsdu->StartOffset += sizeof(KS_TSDU_MDL);
+                }
+
+            } else {
+                /* corrupted or unrecognized slot: dump its state.
+                   (format fixed: there were four arguments but only
+                   three conversion specifiers, which is undefined
+                   behavior for varargs; pointers now use %p) */
+                printk("ks_recv_mdl: unknown tsdu slot: slot = %p type = %x Start = %x Length = %x\n",
+                        KsTsduDat, KsTsduDat->TsduType, KsTsduDat->StartOffset, KsTsduDat->DataLength);
+                printk("        Tsdu = %p Magic=%x: Start = %x Last = %x Length = %x\n",
+                        KsTsdu, KsTsdu->Magic, KsTsdu->StartOffset, KsTsdu->LastOffset, KsTsdu->TotalLength);
+                cfs_enter_debugger();
+            }
+
+            if (KsTsdu->StartOffset == KsTsdu->LastOffset) {
+
+                //
+                // KsTsdu is empty now, we need free it ...
+                //
+
+                KsPutKsTsdu(KsTsdu);
+                KsTsdu = NULL;
+
+                break;
+            }
+        }
+
+        spin_lock(&(tconn->kstc_lock));
+
+        /* we need attach the KsTsdu to the list header */
+        if (KsTsdu) {
+            KsTsduMgr->NumOfTsdu++;
+            list_add(&(KsTsdu->Link), &(KsTsduMgr->TsduList));
+        } else if ((ULONG)size > BytesRecved) {
+            /* tsdu consumed but request not yet satisfied: try next */
+            goto NextTsdu;
+        }
+    }
+
+    /* account the bytes drained from the queue in this pass */
+    if (KsTsduMgr->TotalBytes < RecvedOnce) {
+        cfs_enter_debugger();
+        KsTsduMgr->TotalBytes = 0;
+    } else {
+        KsTsduMgr->TotalBytes -= RecvedOnce;
+    }
+
+    spin_unlock(&(tconn->kstc_lock));
+
+    if (NT_SUCCESS(Status)) {
+
+        /* blocking mode: wait for more data until satisfied */
+        if ((BytesRecved < (ulong_ptr)size) && (!bIsNonBlock)) {
+
+            KeWaitForSingleObject(
+                &(KsTsduMgr->Event),
+                Executive,
+                KernelMode,
+                FALSE,
+                NULL
+                );
+
+            goto Again;
+        }
+
+        if (bIsNonBlock && (BytesRecved == 0)) {
+            rc = -EAGAIN;
+        } else {
+            rc = BytesRecved;
+        }
+    }
+
+errorout:
+
+    ks_put_tconn(tconn);
+
+    if (rc > 0) {
+        KsPrint((1, "ks_recv_mdl: recvieving %d bytes ...\n", rc));
+    } else {
+        KsPrint((0, "ks_recv_mdl: recvieving error code = %d Stauts = %xh ...\n", rc, Status));
+    }
+
+    /* release the chained mdl */
+    ks_release_mdl(mdl, FALSE);
+
+    return (rc);
+}
+
+
+/*
+ * ks_init_tdi_data
+ *   initialize the global tdi data (ks_data)
+ *
+ * Arguments:
+ *   N/A
+ *
+ * Return Value:
+ *   int: ks error code
+ *
+ * Notes:
+ *   N/A
+ */
+
+int
+ks_init_tdi_data()
+{
+    int rc = 0;
+
+    /* wipe every tdi global before any field is set up */
+    RtlZeroMemory(&ks_data, sizeof(ks_data_t));
+
+    /* tconn globals: protecting lock, tconn list, exit notification */
+    spin_lock_init(&ks_data.ksnd_tconn_lock);
+    CFS_INIT_LIST_HEAD(&ks_data.ksnd_tconns);
+    cfs_init_event(&ks_data.ksnd_tconn_exit, TRUE, FALSE);
+
+    ks_data.ksnd_tconn_slab = cfs_mem_cache_create(
+        "tcon", sizeof(ksock_tconn_t) , 0, 0);
+
+    if (ks_data.ksnd_tconn_slab == NULL) {
+        rc = -ENOMEM;
+        goto errorout;
+    }
+
+    /* tsdu globals: protecting lock, free list and the tsdu slab */
+    spin_lock_init(&ks_data.ksnd_tsdu_lock);
+    CFS_INIT_LIST_HEAD(&ks_data.ksnd_freetsdus);
+    ks_data.ksnd_tsdu_size = TDINAL_TSDU_DEFAULT_SIZE; /* 64k */
+    ks_data.ksnd_tsdu_slab = cfs_mem_cache_create(
+        "tsdu", ks_data.ksnd_tsdu_size, 0, 0);
+
+    if (ks_data.ksnd_tsdu_slab == NULL) {
+        /* roll back the tconn slab created above */
+        cfs_mem_cache_destroy(ks_data.ksnd_tconn_slab);
+        ks_data.ksnd_tconn_slab = NULL;
+        rc = -ENOMEM;
+        goto errorout;
+    }
+
+    /* daemon globals: protecting lock, daemon list, exit notification */
+    spin_lock_init(&ks_data.ksnd_daemon_lock);
+    CFS_INIT_LIST_HEAD(&ks_data.ksnd_daemons);
+    cfs_init_event(&ks_data.ksnd_daemon_exit, TRUE, FALSE);
+
+    /* start watching network interface arrival / removal */
+    KsRegisterPnpHandlers();
+
+errorout:
+
+    return rc;
+}
+
+
+/*
+ * ks_fini_tdi_data
+ *   finalize the global tdi data (ks_data)
+ *
+ * Arguments:
+ *   N/A
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   blocks until the last tconn has been freed before it destroys
+ *   the slab caches, so it must only be called at module unload.
+ */
+
+void
+ks_fini_tdi_data()
+{
+    PKS_TSDU            KsTsdu = NULL;
+    struct list_head *  list   = NULL;
+
+    /* clean up the pnp handler and address slots */
+    KsDeregisterPnpHandlers();
+
+    /* we need wait until all the tconn are freed */
+    spin_lock(&(ks_data.ksnd_tconn_lock));
+
+    if (list_empty(&(ks_data.ksnd_tconns))) {
+        cfs_wake_event(&ks_data.ksnd_tconn_exit);
+    }
+    spin_unlock(&(ks_data.ksnd_tconn_lock));
+
+    /* now wait on the tconn exit event */
+    cfs_wait_event(&ks_data.ksnd_tconn_exit, 0);
+
+    /* it's safe to delete the tconn slab ... */
+    cfs_mem_cache_destroy(ks_data.ksnd_tconn_slab);
+    ks_data.ksnd_tconn_slab = NULL;
+
+    /* clean up all the tsdu buffers in the free list.  each tsdu is
+     * unlinked BEFORE it is freed: the previous list_for_each walk
+     * read the next pointer out of the node it had just returned to
+     * the slab (use-after-free). */
+    spin_lock(&(ks_data.ksnd_tsdu_lock));
+    while (!list_empty(&ks_data.ksnd_freetsdus)) {
+        list = ks_data.ksnd_freetsdus.next;
+        KsTsdu = list_entry (list, KS_TSDU, Link);
+        list_del (list);
+
+        cfs_mem_cache_free(
+                ks_data.ksnd_tsdu_slab,
+                KsTsdu );
+    }
+    spin_unlock(&(ks_data.ksnd_tsdu_lock));
+
+    /* it's safe to delete the tsdu slab ... */
+    cfs_mem_cache_destroy(ks_data.ksnd_tsdu_slab);
+    ks_data.ksnd_tsdu_slab = NULL;
+
+    /* good! it's smooth to do the cleaning up...*/
+}
+
+/*
+ * ks_create_child_tconn
+ *   Create the backlog child connection for a listener
+ *
+ * Arguments:
+ *   parent: the listener daemon connection
+ *
+ * Return Value:
+ *   the child connection or NULL in failure
+ *
+ * Notes:
+ *   N/A
+ */
+
+ksock_tconn_t *
+ks_create_child_tconn(
+    ksock_tconn_t * parent
+    )
+{
+    NTSTATUS        status;
+    ksock_tconn_t * backlog;
+
+    /* allocate a fresh tdi connection object */
+    backlog = ks_create_tconn();
+    if (backlog == NULL) {
+        return NULL;
+    }
+
+    /* mark it as a child of the listening daemon */
+    ks_init_child(backlog);
+
+    /* bind it to the parent's local address */
+    if (ks_bind_tconn(backlog, parent, 0, 0) < 0) {
+        ks_free_tconn(backlog);
+        return NULL;
+    }
+
+    /* open the tdi connection object for the child */
+    status = KsOpenConnection(
+                &(backlog->kstc_dev),
+                (PVOID)backlog,
+                &(backlog->child.kstc_info.Handle),
+                &(backlog->child.kstc_info.FileObject)
+                );
+
+    if (!NT_SUCCESS(status)) {
+        cfs_enter_debugger();
+        ks_put_tconn(backlog);
+        return NULL;
+    }
+
+    /* associate the connection with its local address object */
+    status = KsAssociateAddress(
+                backlog->kstc_addr.Handle,
+                backlog->child.kstc_info.FileObject
+                );
+
+    if (!NT_SUCCESS(status)) {
+        cfs_enter_debugger();
+        ks_put_tconn(backlog);
+        return NULL;
+    }
+
+    backlog->kstc_state = ksts_associated;
+
+    return backlog;
+}
+
+/*
+ * ks_replenish_backlogs
+ *   to replenish the backlogs listening...
+ *
+ * Arguments:
+ *   tconn: the parent listen tdi connect
+ *   nbacklog: number of child connections in queue
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void
+ks_replenish_backlogs(
+    ksock_tconn_t * parent,
+    int     nbacklog
+    )
+{
+    ksock_tconn_t * backlog;
+    int             queued;
+    int             n;
+
+    /* how many children are already queued (listening + accepted) ? */
+    queued = parent->listener.kstc_listening.num +
+             parent->listener.kstc_accepted.num;
+
+    /* create only enough children to bring the total up to nbacklog */
+    n = (queued < nbacklog) ? (nbacklog - queued) : 0;
+
+    while (n-- > 0) {
+
+        /* build one backlog child tconn */
+        backlog = ks_create_child_tconn(parent);
+
+        spin_lock(&(parent->kstc_lock));
+
+        if (backlog != NULL) {
+            spin_lock(&backlog->kstc_lock);
+
+            /* queue the child on the daemon's listening list */
+            list_add( &backlog->child.kstc_link,
+                      &parent->listener.kstc_listening.list );
+            parent->listener.kstc_listening.num++;
+            backlog->child.kstc_queued = TRUE;
+
+            spin_unlock(&backlog->kstc_lock);
+        } else {
+            cfs_enter_debugger();
+        }
+
+        spin_unlock(&(parent->kstc_lock));
+    }
+}
+
+/*
+ * ks_start_listen
+ *   setup the listener tdi connection and make it listen
+ *    on the user specified ip address and port.
+ *
+ * Arguments:
+ *   tconn: the parent listen tdi connect
+ *   nbacklog: number of child connections in queue
+ *
+ * Return Value:
+ *   ks error code >= 0: success; otherwise error.
+ *
+ * Notes:
+ *   N/A
+ */
+
+int
+ks_start_listen(ksock_tconn_t *tconn, int nbacklog)
+{
+    int rc;
+
+    /* make sure there are enough backlog children queued */
+    ks_replenish_backlogs(tconn, nbacklog);
+
+    /* install the tdi event callback handlers */
+    rc = ks_set_handlers(tconn);
+    if (rc < 0) {
+        return rc;
+    }
+
+    /* flag the tconn as a running listening daemon */
+    spin_lock(&(tconn->kstc_lock));
+    tconn->listener.nbacklog = nbacklog;
+    tconn->kstc_state = ksts_listening;
+    cfs_set_flag(tconn->kstc_flags, KS_TCONN_DAEMON_STARTED);
+    spin_unlock(&(tconn->kstc_lock));
+
+    return rc;
+}
+
+/* ks_stop_listen: tear down a listening daemon tconn -- detach its tdi
+   event handlers, drop the backlog children and release the daemon's
+   own reference.  Counterpart of ks_start_listen. */
+void
+ks_stop_listen(ksock_tconn_t *tconn)
+{
+    struct list_head *      list;
+    ksock_tconn_t *         backlog;
+
+    /* reset all tdi event callbacks to NULL */
+    ks_reset_handlers (tconn);
+
+    spin_lock(&tconn->kstc_lock);
+
+    /* clear the daemon flag so accept waiters know to quit */
+    cfs_clear_flag(tconn->kstc_flags, KS_TCONN_DAEMON_STARTED);
+
+    /* cleanup all the listening backlog child connections */
+    /* NOTE(review): ks_put_tconn drops a reference and may free the
+       backlog; the node is not list_del'ed here and list_for_each
+       then follows the node's next pointer.  Presumably an extra
+       reference keeps the child alive until it is unlinked
+       elsewhere -- TODO confirm. */
+    list_for_each (list, &(tconn->listener.kstc_listening.list)) {
+        backlog = list_entry(list, ksock_tconn_t, child.kstc_link);
+
+        /* destroy and free it */
+        ks_put_tconn(backlog);
+    }
+
+    spin_unlock(&tconn->kstc_lock);
+
+    /* wake up it from the waiting on new incoming connections */
+    KeSetEvent(&tconn->listener.kstc_accept_event, 0, FALSE);
+
+    /* free the listening daemon tconn */
+    ks_put_tconn(tconn);
+}
+
+
+/*
+ * ks_wait_child_tconn
+ *   accept a child connection from peer
+ *
+ * Arguments:
+ *   parent:   the daemon tdi connection listening
+ *   child:    to contain the accepted connection
+ *
+ * Return Value:
+ *   ks error code;
+ *
+ * Notes:
+ *   N/A
+ */
+
+int
+ks_wait_child_tconn(
+    ksock_tconn_t *  parent,
+    ksock_tconn_t ** child
+    )
+{
+    struct list_head * tmp;
+    ksock_tconn_t * backlog = NULL;
+
+    /* top up the backlog queue before waiting */
+    ks_replenish_backlogs(parent, parent->listener.nbacklog);
+
+    spin_lock(&(parent->kstc_lock));
+
+    /* no listening children at all: nothing can ever be accepted */
+    if (parent->listener.kstc_listening.num <= 0) {
+        spin_unlock(&(parent->kstc_lock));
+        return -1;
+    }
+
+again:
+
+    /* check the listening queue and try to search the accepted connection */
+
+    list_for_each(tmp, &(parent->listener.kstc_listening.list)) {
+        backlog = list_entry (tmp, ksock_tconn_t, child.kstc_link);
+
+        spin_lock(&(backlog->kstc_lock));
+
+        if (backlog->child.kstc_accepted) {
+
+            LASSERT(backlog->kstc_state == ksts_connected);
+            LASSERT(backlog->child.kstc_busy);
+
+            /* move the child from the listening queue over to the
+               accepted queue and tag it with queueno 1 (accepted) */
+            list_del(&(backlog->child.kstc_link));
+            list_add(&(backlog->child.kstc_link),
+                     &(parent->listener.kstc_accepted.list));
+            parent->listener.kstc_accepted.num++;
+            parent->listener.kstc_listening.num--;
+            backlog->child.kstc_queueno = 1;
+
+            spin_unlock(&(backlog->kstc_lock));
+
+            break;
+        } else {
+            spin_unlock(&(backlog->kstc_lock));
+            backlog = NULL;
+        }
+    }
+
+    spin_unlock(&(parent->kstc_lock));
+
+    /* we need wait until new incoming connections are requested
+       or the case of shutting down the listening daemon thread  */
+    if (backlog == NULL) {
+
+        NTSTATUS    Status;
+
+        /* NOTE(review): the wait status is ignored; the wake-up may
+           come from a new connection or from ks_stop_listen /
+           libcfs_sock_abort_accept signalling the accept event */
+        Status = KeWaitForSingleObject(
+                &(parent->listener.kstc_accept_event),
+                Executive,
+                KernelMode,
+                FALSE,
+                NULL
+                );
+
+        spin_lock(&(parent->kstc_lock));
+
+        /* check whether it's expected to exit ? */
+        if (!cfs_is_flag_set(parent->kstc_flags, KS_TCONN_DAEMON_STARTED)) {
+            /* daemon is shutting down: fall through with backlog
+               still NULL -- the function returns 0 with *child set
+               to NULL, so callers must check *child as well as rc */
+            spin_unlock(&(parent->kstc_lock));
+        } else {
+            goto again;
+        }
+    }
+
+    if (backlog) {
+        /* query the local ip address of the connection */
+        ks_query_local_ipaddr(backlog);
+    }
+
+    *child = backlog;
+
+    return 0;
+}
+
+/* look up an interface by name in the pnp address slot list; returns
+   0 when found (with *up / *ip / *mask filled in), 1 when not found */
+int libcfs_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask)
+{
+    ks_addr_slot_t * found = NULL;
+    PLIST_ENTRY      entry;
+
+    spin_lock(&ks_data.ksnd_addrs_lock);
+
+    for (entry = ks_data.ksnd_addrs_list.Flink;
+         entry != &ks_data.ksnd_addrs_list;
+         entry = entry->Flink) {
+
+        ks_addr_slot_t * slot = CONTAINING_RECORD(entry, ks_addr_slot_t, link);
+
+        if (_stricmp(name, &slot->iface[0]) == 0) {
+            *up   = slot->up;
+            *ip   = slot->ip_addr;
+            *mask = slot->netmask;
+            found = slot;
+            break;
+        }
+    }
+
+    spin_unlock(&ks_data.ksnd_addrs_lock);
+
+    return (int)(found == NULL);
+}
+
+/* build an array of interface-name pointers for all known address
+   slots; returns the number of entries (0 on allocation failure).
+   the strings remain owned by ks_data -- only the pointer array is
+   allocated here (see libcfs_ipif_free_enumeration). */
+int libcfs_ipif_enumerate(char ***names)
+{
+    PLIST_ENTRY      entry;
+    int              nips = 0;
+
+    spin_lock(&ks_data.ksnd_addrs_lock);
+
+    *names = cfs_alloc(sizeof(char *) * ks_data.ksnd_naddrs, CFS_ALLOC_ZERO);
+    if (*names == NULL) {
+        goto errorout;
+    }
+
+    for (entry = ks_data.ksnd_addrs_list.Flink;
+         entry != &ks_data.ksnd_addrs_list;
+         entry = entry->Flink) {
+
+        ks_addr_slot_t * slot = CONTAINING_RECORD(entry, ks_addr_slot_t, link);
+
+        (*names)[nips++] = slot->iface;
+        cfs_assert(nips <= ks_data.ksnd_naddrs);
+    }
+
+    cfs_assert(nips == ks_data.ksnd_naddrs);
+
+errorout:
+
+    spin_unlock(&ks_data.ksnd_addrs_lock);
+    return nips;
+}
+
+/* release the pointer array built by libcfs_ipif_enumerate; the
+   interface strings themselves belong to the address slots and are
+   not freed here */
+void libcfs_ipif_free_enumeration(char **names, int n)
+{
+    if (names != NULL) {
+        cfs_free(names);
+    }
+}
+
+/* create a listening daemon tconn bound to ip:port with the given
+   backlog depth; on success *sockp receives the listener */
+int libcfs_sock_listen(struct socket **sockp, __u32 ip, int port, int backlog)
+{
+    ksock_tconn_t * parent;
+    int             rc;
+
+    /* allocate a fresh tconn to act as the listening daemon */
+    parent = ks_create_tconn();
+    if (parent == NULL) {
+        return -ENOMEM;
+    }
+
+    /* initialize the tconn as a listener */
+    ks_init_listener(parent);
+
+    /* bind the listener to the requested local address */
+    rc = ks_bind_tconn(parent, NULL, ip, (unsigned short)port);
+    if (rc < 0) {
+        ks_free_tconn(parent);
+        return rc;
+    }
+
+    /* create the backlog children and enter the listening state */
+    rc = ks_start_listen(parent, backlog);
+    if (rc < 0) {
+        ks_stop_listen(parent);
+        return rc;
+    }
+
+    *sockp = parent;
+    return rc;
+}
+
+/* block until a child connection is accepted on the listening tconn;
+   *newsockp may come back NULL when the listener is being shut down
+   (see ks_wait_child_tconn), so callers should check it */
+int libcfs_sock_accept(struct socket **newsockp, struct socket *sock)
+{
+    int rc;
+
+    rc = ks_wait_child_tconn(sock, newsockp);
+
+    return rc;
+}
+
+/* abort a pending libcfs_sock_accept on a listening tconn: clear the
+   daemon flag and signal the accept event so the waiter wakes up and
+   sees the flag cleared */
+void libcfs_sock_abort_accept(struct socket *sock)
+{
+    LASSERT(sock->kstc_type == kstt_listener);
+
+    spin_lock(&(sock->kstc_lock));
+
+    /* clear the daemon flag */
+    cfs_clear_flag(sock->kstc_flags, KS_TCONN_DAEMON_STARTED);
+
+    /* wake up it from the waiting on new incoming connections */
+    KeSetEvent(&sock->listener.kstc_accept_event, 0, FALSE);
+
+    spin_unlock(&(sock->kstc_lock));
+}
+
+/*
+ * libcfs_sock_connect
+ *   build a conntion between local ip/port and the peer ip/port.
+ *
+ * Arguments:
+ *   laddr: local ip address
+ *   lport: local port number
+ *   paddr: peer's ip address
+ *   pport: peer's port number
+ *
+ * Return Value:
+ *   int:   return code ...
+ *
+ * Notes:
+ *   N/A
+ */
+
+
+int libcfs_sock_connect(struct socket **sockp, int *fatal,
+                        __u32 local_ip, int local_port,
+                        __u32 peer_ip, int peer_port)
+{
+    ksock_tconn_t * tconn;
+    int             rc;
+
+    *sockp = NULL;
+
+    KsPrint((1, "libcfs_sock_connect: connecting to %x:%d with %x:%d...\n",
+                peer_ip, peer_port, local_ip, local_port ));
+
+    /* allocate the tdi connection structure */
+    tconn = ks_create_tconn();
+    if (tconn == NULL) {
+        return -ENOMEM;
+    }
+
+    /* set it up as an active (sender) connection */
+    ks_init_sender(tconn);
+
+    /* bind the requested local address to the tconn */
+    rc = ks_bind_tconn(tconn, NULL, local_ip, (unsigned short)local_port);
+    if (rc < 0) {
+        KsPrint((0, "libcfs_sock_connect: failed to bind address %x:%d...\n",
+                    local_ip, local_port ));
+        ks_free_tconn(tconn);
+        return rc;
+    }
+
+    /* issue the connect request to the remote peer */
+    rc = ks_build_tconn(tconn, peer_ip, (unsigned short)peer_port);
+    if (rc < 0) {
+        KsPrint((0, "libcfs_sock_connect: failed to connect %x:%d ...\n",
+                    peer_ip, peer_port ));
+        ks_put_tconn(tconn);
+        return rc;
+    }
+
+    *sockp = tconn;
+    return rc;
+}
+
+/* no-op stub: socket buffer sizing is not implemented for the winnt
+   tdi port; always reports success so callers carry on */
+int libcfs_sock_setbuf(struct socket *socket, int txbufsize, int rxbufsize)
+{
+    return 0;
+}
+
+/* no-op stub: always returns success WITHOUT writing *txbufsize or
+   *rxbufsize -- callers must not rely on the output values here */
+int libcfs_sock_getbuf(struct socket *socket, int *txbufsize, int *rxbufsize)
+{
+    return 0;
+}
+
+/* fetch the remote (remote != 0) or local address of a connection;
+   ip is returned in host order.  -ENOTCONN when no address is known */
+int libcfs_sock_getaddr(struct socket *socket, int remote, __u32 *ip, int *port)
+{
+    PTRANSPORT_ADDRESS  taddr = NULL;
+    int                 rc = 0;
+
+    spin_lock(&socket->kstc_lock);
+
+    if (!remote) {
+        /* the local address lives in the tconn itself */
+        taddr = &(socket->kstc_addr.Tdi);
+    } else if (socket->kstc_type == kstt_sender) {
+        taddr = socket->sender.kstc_info.Remote;
+    } else if (socket->kstc_type == kstt_child) {
+        taddr = socket->child.kstc_info.Remote;
+    }
+
+    if (taddr != NULL) {
+        PTDI_ADDRESS_IP addr = (PTDI_ADDRESS_IP)(&(taddr->Address[0].Address));
+        if (ip != NULL)
+            *ip = ntohl (addr->in_addr);
+        if (port != NULL)
+            *port = ntohs (addr->sin_port);
+    } else {
+        rc = -ENOTCONN;
+    }
+
+    spin_unlock(&socket->kstc_lock);
+    return rc;
+}
+
+/* write nob bytes from buffer to the connection; loops until all of
+   it has been sent.  NOTE(review): the timeout argument is not
+   honoured by this implementation -- TODO confirm callers expect that */
+int libcfs_sock_write(struct socket *sock, void *buffer, int nob, int timeout)
+{
+    int offset;
+
+    for (offset = 0; offset < nob; ) {
+
+        ksock_mdl_t * mdl;
+        int           rc;
+
+        /* pin the remaining part of the user buffer for reading */
+        rc = ks_lock_buffer( (char *)buffer + offset,
+                        FALSE, nob - offset, IoReadAccess, &mdl );
+        if (rc < 0) {
+            return (rc);
+        }
+
+        /* push the whole mdl out on the wire */
+        rc = ks_send_mdl( sock, NULL, mdl, nob - offset, 0 );
+        if (rc <= 0) {
+            return (rc);
+        }
+
+        offset += rc;
+    }
+
+    return (0);
+}
+
+/* read exactly nob bytes from the connection into buffer; loops
+   until the full amount has arrived.  NOTE(review): the timeout
+   argument is not honoured by this implementation */
+int libcfs_sock_read(struct socket *sock, void *buffer, int nob, int timeout)
+{
+    int offset;
+
+    for (offset = 0; offset < nob; ) {
+
+        ksock_mdl_t * mdl;
+        int           rc;
+
+        /* pin the remaining part of the user buffer for writing */
+        rc = ks_lock_buffer( (char *)buffer + offset,
+                               FALSE, nob - offset, IoWriteAccess, &mdl );
+        if (rc < 0) {
+            return (rc);
+        }
+
+        /* pull the next chunk off the receive queues */
+        rc = ks_recv_mdl( sock, mdl, nob - offset, 0 );
+        if (rc <= 0) {
+            return (rc);
+        }
+
+        offset += rc;
+    }
+
+    return (0);
+}
+
+/* drop a socket: a running listener must tear down its backlog
+   children first; anything else just releases its reference */
+void libcfs_sock_release(struct socket *sock)
+{
+    BOOLEAN listening;
+
+    listening = (sock->kstc_type == kstt_listener &&
+                 sock->kstc_state == ksts_listening);
+
+    if (listening) {
+        ks_stop_listen(sock);
+    } else {
+        ks_put_tconn(sock);
+    }
+}
diff --git a/lnet/libcfs/winnt/winnt-tracefile.c b/lnet/libcfs/winnt/winnt-tracefile.c
new file mode 100644 (file)
index 0000000..d172bff
--- /dev/null
@@ -0,0 +1,300 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#define LUSTRE_TRACEFILE_PRIVATE
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+#include "tracefile.h"
+
+#ifndef get_cpu
+#define get_cpu() smp_processor_id()
+#define put_cpu() do { } while (0)
+#endif
+
+extern union trace_data_union trace_data[NR_CPUS];
+extern char *tracefile;
+extern int64_t tracefile_size;
+
+event_t     tracefile_event;
+
+/*
+ * tracefile_init_arch(): allocate the per-CPU console message buffers and
+ * initialise the event used to serialise tracefile access.
+ *
+ * Fix: the function was declared 'void' yet returned -ENOMEM and 0, which
+ * is invalid C; it is now declared 'int' so the returns are legal and
+ * callers can detect allocation failure.
+ *
+ * NOTE(review): only column [0] of trace_console_buffers is allocated here
+ * while tracefile_fini_arch() frees columns [0] and [1]; the NULL check in
+ * fini makes that safe, but confirm whether [1] was meant to be allocated.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure (buffers already
+ * allocated are released via tracefile_fini_arch()).
+ */
+int tracefile_init_arch()
+{
+    int    i;
+    int    j;
+
+    cfs_init_event(&tracefile_event, TRUE, TRUE);
+
+    /* start from a known-zero state so fini can free unconditionally */
+    memset(trace_console_buffers, 0, sizeof(trace_console_buffers));
+
+    for (i = 0; i < NR_CPUS; i++) {
+        for (j = 0; j < 1; j++) {
+            trace_console_buffers[i][j] =
+                cfs_alloc(TRACE_CONSOLE_BUFFER_SIZE, CFS_ALLOC_ZERO);
+
+            if (trace_console_buffers[i][j] == NULL) {
+                tracefile_fini_arch();
+                KsPrint((0, "Can't allocate console message buffer\n"));
+                return -ENOMEM;
+            }
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * tracefile_fini_arch(): release the per-CPU console message buffers.
+ * NOTE(review): columns [0] and [1] are freed although the init path only
+ * allocates column [0]; safe because of the NULL check, but verify intent.
+ */
+void tracefile_fini_arch()
+{
+       int    i;
+       int    j;
+
+       for (i = 0; i < NR_CPUS; i++) {
+               for (j = 0; j < 2; j++) {
+                       if (trace_console_buffers[i][j] != NULL) {
+                               cfs_free(trace_console_buffers[i][j]);
+                               trace_console_buffers[i][j] = NULL;
+                       }
+        }
+    }
+}
+
+/* The tracefile "locks" share one auto-reset event, so the read variant
+ * is an exclusive lock, not a reader side of a rwlock: cfs_wait_event()
+ * claims the event, cfs_wake_event() releases it. */
+void tracefile_read_lock()
+{
+    cfs_wait_event(&tracefile_event, 0);
+}
+
+void tracefile_read_unlock()
+{
+    cfs_wake_event(&tracefile_event);
+}
+
+/* Write-side tracefile lock: identical to the read variant, since both
+ * are backed by the same single auto-reset event (mutual exclusion only,
+ * no reader/writer distinction). */
+void tracefile_write_lock()
+{
+    cfs_wait_event(&tracefile_event, 0);
+}
+
+void tracefile_write_unlock()
+{
+    cfs_wake_event(&tracefile_event);
+}
+
+/*
+ * Return the current CPU's console message buffer (only column [0] is
+ * used on this platform).
+ */
+char *
+trace_get_console_buffer(void)
+{
+#pragma message ("is there possible problem with pre-emption ?")
+    int cpu = (int) KeGetCurrentProcessorNumber();
+    return trace_console_buffers[cpu][0];
+}
+
+/* Nothing to release: console buffers are per-CPU and not refcounted. */
+void
+trace_put_console_buffer(char *buffer)
+{
+}
+
+/*
+ * Return the trace data block of the current CPU.
+ * NOTE(review): nothing prevents migration to another CPU between
+ * KeGetCurrentProcessorNumber() and use of the tcd — see #pragma below.
+ */
+struct trace_cpu_data *
+trace_get_tcd(void)
+{
+#pragma message("todo: return NULL if in interrupt context")
+
+       int cpu = (int) KeGetCurrentProcessorNumber();
+       return &trace_data[cpu].tcd;
+}
+
+/* Counterpart of trace_get_tcd(); no state to undo on this platform. */
+void
+trace_put_tcd (struct trace_cpu_data *tcd, unsigned long flags)
+{
+}
+
+/*
+ * Fill a ptldebug_header for one debug record: subsystem/mask from the
+ * caller, timestamp from do_gettimeofday(), CPU, stack usage, pid and
+ * source line.  ph_extern_pid is always 0 on this platform.
+ */
+void
+set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask,
+                    const int line, unsigned long stack)
+{
+       struct timeval tv;
+
+       do_gettimeofday(&tv);
+
+       header->ph_subsys = subsys;
+       header->ph_mask = mask;
+       header->ph_cpu_id = smp_processor_id();
+       header->ph_sec = (__u32)tv.tv_sec;
+       header->ph_usec = tv.tv_usec;
+       header->ph_stack = stack;
+       header->ph_pid = current->pid;
+       header->ph_line_num = line;
+       header->ph_extern_pid = 0;
+       return;
+}
+
+/*
+ * Emit one debug record to the console, tagged with a severity prefix
+ * derived from the debug mask.  D_CONSOLE messages omit the pid/file/line
+ * decoration.
+ *
+ * Fix: when the mask matched no severity class, both 'prefix' and 'ptype'
+ * remained NULL and were passed to printk("%s...", ...) — undefined
+ * behaviour.  They now default to the lowest severity (the old
+ * libcfs_printk/D_CONSOLE branch assigned exactly these defaults).
+ */
+void print_to_console(struct ptldebug_header *hdr, int mask, const char *buf,
+                                 int len, const char *file, const char *fn)
+{
+       char *prefix = "Lustre", *ptype = KERN_INFO;
+
+       if ((mask & D_EMERG) != 0) {
+               prefix = "LustreError";
+               ptype = KERN_EMERG;
+       } else if ((mask & D_ERROR) != 0) {
+               prefix = "LustreError";
+               ptype = KERN_ERR;
+       } else if ((mask & D_WARNING) != 0) {
+               prefix = "Lustre";
+               ptype = KERN_WARNING;
+       }
+
+       if ((mask & D_CONSOLE) != 0) {
+               printk("%s%s: %s", ptype, prefix, buf);
+       } else {
+               printk("%s%s: %d:%d:(%s:%d:%s()) %s", ptype, prefix, hdr->ph_pid,
+                      hdr->ph_extern_pid, file, hdr->ph_line_num, fn, buf);
+       }
+       return;
+}
+
+/* On this platform every CPU owns its own trace pages outright, so
+ * ownership is always granted. */
+int tcd_owns_tage(struct trace_cpu_data *tcd, struct trace_page *tage)
+{
+       return 1;
+}
+
+
+/*
+ * /proc write handler for the debug daemon control file.  Recognised
+ * inputs: "stop" (halt the daemon), "size=N" (set the tracefile size in
+ * MB, clamped to [10,20480], else reset to TRACEFILE_SIZE), or a path to
+ * start writing the trace to.  Ownership of the allocated name string is
+ * transferred to the global 'tracefile' on the path case.
+ *
+ * Fix: on copy_from_user() failure the old code set rc = -EFAULT but then
+ * fell through to "return count", silently reporting success; 'rc' now
+ * carries the actual result on every path.
+ *
+ * Returns 'count' on success or a negative errno.
+ */
+int trace_write_daemon_file(struct file *file, const char *buffer,
+                            unsigned long count, void *data)
+{
+       char *name;
+       unsigned long off;
+       int rc;
+
+       name = cfs_alloc(count + 1, 0);
+       if (name == NULL)
+               return -ENOMEM;
+
+       if (copy_from_user((void *)name, (void *)buffer, count)) {
+               rc = -EFAULT;
+               goto out;
+       }
+
+       /* be nice and strip out trailing '\n' */
+       for (off = count ; off > 2 && isspace(name[off - 1]); off--)
+               ;
+
+       name[off] = '\0';
+
+       rc = count;             /* success unless a later step fails */
+
+       tracefile_write_lock();
+       if (strcmp(name, "stop") == 0) {
+               tracefile = NULL;
+               trace_stop_thread();
+               goto out_sem;
+       } else if (strncmp(name, "size=", 5) == 0) {
+               tracefile_size = simple_strtoul(name + 5, NULL, 0);
+               if (tracefile_size < 10 || tracefile_size > 20480)
+                       tracefile_size = TRACEFILE_SIZE;
+               else
+                       tracefile_size <<= 20;      /* MB -> bytes */
+               goto out_sem;
+       }
+
+       /* anything else is a tracefile path: take ownership of 'name' */
+       if (tracefile != NULL)
+               cfs_free(tracefile);
+
+       tracefile = name;
+       name = NULL;
+       printk(KERN_INFO "Lustre: debug daemon will attempt to start writing "
+              "to %s (%lukB max)\n", tracefile, (long)(tracefile_size >> 10));
+
+       trace_start_thread();
+out_sem:
+       tracefile_write_unlock();
+out:
+       if (name != NULL)
+               cfs_free(name);
+       return rc;
+}
+
+/*
+ * /proc read handler: report the current tracefile path.
+ *
+ * Fix: 'tracefile' is set to NULL when the daemon is stopped (see
+ * trace_write_daemon_file()), and passing NULL to snprintf("%s") is
+ * undefined behaviour; report an empty string in that case.
+ *
+ * Returns the number of characters written into 'page'.
+ */
+int trace_read_daemon_file(char *page, char **start, off_t off, int count,
+                           int *eof, void *data)
+{
+       int rc;
+
+       tracefile_read_lock();
+       rc = snprintf(page, count, "%s",
+                     tracefile != NULL ? tracefile : "");
+       tracefile_read_unlock();
+
+       return rc;
+}
+
+/*
+ * /proc write handler: set the total debug buffer budget, given in MB.
+ * The value is parsed, range-checked against physical memory, divided
+ * evenly across CPUs and applied to every CPU's tcd_max_pages.
+ *
+ * NOTE(review): the error text claims "more than 80%% of available RAM"
+ * but the value compared (and printed) is num_physpages scaled by
+ * >>(20-2-CFS_PAGE_SHIFT) then /5 — message and check look inconsistent;
+ * verify the intended fraction.
+ */
+int trace_write_debug_mb(struct file *file, const char *buffer,
+                         unsigned long count, void *data)
+{
+       char string[32];
+       int i;
+       unsigned max;
+
+       if (count >= sizeof(string)) {
+               printk(KERN_ERR "Lustre: value too large (length %lu bytes)\n",
+                      count);
+               return -EOVERFLOW;
+       }
+
+       if (copy_from_user((void *)string, (void *)buffer, count))
+               return -EFAULT;
+
+       max = simple_strtoul(string, NULL, 0);
+       if (max == 0)
+               return -EINVAL;
+
+       if (max > (num_physpages >> (20 - 2 - CFS_PAGE_SHIFT)) / 5 || max >= 512) {
+               printk(KERN_ERR "Lustre: Refusing to set debug buffer size to "
+                      "%dMB, which is more than 80%% of available RAM (%lu)\n",
+                      max, (num_physpages >> (20 - 2 - CFS_PAGE_SHIFT)) / 5);
+               return -EINVAL;
+       }
+
+       /* split the MB budget evenly across CPUs, then convert to pages */
+       max /= smp_num_cpus;
+
+       for (i = 0; i < NR_CPUS; i++) {
+               struct trace_cpu_data *tcd;
+               tcd = &trace_data[i].tcd;
+               tcd->tcd_max_pages = max << (20 - CFS_PAGE_SHIFT);
+       }
+       return count;
+}
+
+/*
+ * /proc read handler: report the total debug buffer budget in MB
+ * (per-CPU tcd_max_pages converted back to MB, times the CPU count).
+ *
+ * Fix: trace_put_tcd() is declared in this file with two parameters
+ * (tcd, flags); the call passed only one argument and would not compile.
+ * Flags are unused on this platform, so 0 is passed.
+ */
+int trace_read_debug_mb(char *page, char **start, off_t off, int count,
+                        int *eof, void *data)
+{
+       struct trace_cpu_data *tcd;
+       int rc;
+
+       tcd = trace_get_tcd();
+       LASSERT (tcd != NULL);
+       rc = snprintf(page, count, "%lu\n",
+                     (tcd->tcd_max_pages >> (20 - CFS_PAGE_SHIFT)) * smp_num_cpus);
+       trace_put_tcd(tcd, 0);
+       return rc;
+}
+
+/*
+ * NOTE(review): deliberately unimplemented — the #error stops any build
+ * that pulls in trace_call_on_all_cpus() until a Windows equivalent of
+ * run-on-every-CPU is written.  Do not remove without providing one.
+ */
+void
+trace_call_on_all_cpus(void (*fn)(void *arg), void *arg)
+{
+#error "tbd"
+}
+
diff --git a/lnet/libcfs/winnt/winnt-usr.c b/lnet/libcfs/winnt/winnt-usr.c
new file mode 100644 (file)
index 0000000..f79347b
--- /dev/null
@@ -0,0 +1,85 @@
+
+#ifndef __KERNEL__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <io.h>
+#include <time.h>
+#include <windows.h>
+
+/* User-space stub: debug messages are simply discarded in the
+ * non-kernel Windows build. */
+void portals_debug_msg(int subsys, int mask, char *file, const char *fn,
+                              const int line, unsigned long stack,
+                              char *format, ...) {
+    }
+
+/* User-space stub: there is no /proc on Windows; report success. */
+int cfs_proc_mknod(const char *path, unsigned short  mode,  unsigned int dev)
+{
+    return 0;
+}
+
+
+/*
+ * Print a human-readable description of GetLastError() to stdout,
+ * prefixed by the caller-supplied string.
+ *
+ * Fix: 'lpMsgBuf' was uninitialized; if FormatMessage() failed it was
+ * passed to printf() and LocalFree() anyway — undefined behaviour.  The
+ * return value is now checked and the buffer only used/freed on success.
+ */
+void print_last_error(char* Prefix)
+{
+    LPVOID lpMsgBuf = NULL;
+
+    if (FormatMessage(
+            FORMAT_MESSAGE_ALLOCATE_BUFFER |
+            FORMAT_MESSAGE_FROM_SYSTEM |
+            FORMAT_MESSAGE_IGNORE_INSERTS,
+            NULL,
+            GetLastError(),
+            0,
+            (LPTSTR) &lpMsgBuf,
+            0,
+            NULL
+            ) == 0) {
+        /* FormatMessage itself failed: no buffer was allocated */
+        printf("%s (unknown error)\n", Prefix);
+        return;
+    }
+
+    printf("%s %s", Prefix, (LPTSTR) lpMsgBuf);
+
+    LocalFree(lpMsgBuf);
+}
+
+//
+// The following declarations are defined in io.h of VC.
+// sys/types.h conflicts with io.h, so we need to place
+// these declarations here.
+
+/* _chkesp is normally supplied by the VC debug runtime to verify that
+ * ESP is balanced after a call; it is provided here because that runtime
+ * is not linked.  On x86 it breaks into the debugger (int 3) when the
+ * zero flag is clear, i.e. when the stack pointer was unbalanced. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+    void
+    __declspec (naked) __cdecl _chkesp(void)
+    {
+#if _X86_
+        __asm {  jz      exit_chkesp     };
+        __asm {  int     3               };
+    exit_chkesp:
+        __asm {  ret                     };
+#endif
+    }
+#ifdef __cplusplus
+}
+#endif
+
+/* POSIX-style sleep(): seconds in, Win32 Sleep() takes milliseconds.
+ * Always returns 0 (no unslept remainder is reported). */
+unsigned int sleep (unsigned int seconds)
+{
+    Sleep(seconds * 1000);
+    return 0;
+}
+
+/* Stub: reports success without writing anything into 'name'.
+ * NOTE(review): callers that read the buffer will see whatever was there
+ * before — consider at least NUL-terminating 'name'; confirm intent. */
+int gethostname(char * name, int namelen)
+{
+    return 0;
+}
+
+/*
+ * User-level ioctl() stub for the Windows build: the request is accepted
+ * and ignored.
+ *
+ * Fix: removed a leftover debug printf("hello, world\n") which polluted
+ * the stdout of every program linking this library.
+ */
+int ioctl (
+    int handle,
+    int cmd,
+    void *buffer
+    )
+{
+    return 0;
+}
+
+#endif /* __KERNEL__ */
\ No newline at end of file
diff --git a/lnet/libcfs/winnt/winnt-utils.c b/lnet/libcfs/winnt/winnt-utils.c
new file mode 100644 (file)
index 0000000..cd33aa2
--- /dev/null
@@ -0,0 +1,158 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or modify it under
+ *   the terms of version 2 of the GNU General Public License as published by
+ *   the Free Software Foundation. Lustre is distributed in the hope that it
+ *   will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ *   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details. You should have received a
+ *   copy of the GNU General Public License along with Lustre; if not, write
+ *   to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ *   USA.
+ */
+
+
+/*
+ * miscellaneous libcfs stuff
+ */
+#define DEBUG_SUBSYSTEM S_LNET
+#include <lnet/types.h>
+
+/*
+ * Convert server error code to client format. Error codes are from
+ * Linux errno.h, so for Linux client---identity.
+ */
+/* Map an NT status word received from the server to a negative Unix
+ * errno via cfs_error_code(). */
+int convert_server_error(__u64 ecode)
+{
+       return cfs_error_code((NTSTATUS)ecode);
+}
+
+/*
+ * convert <fcntl.h> flag from client to server.
+ * 
+ * nt kernel uses several members to describe the open flags
+ * such as DesiredAccess/ShareAccess/CreateDisposition/CreateOptions
+ * so it's better to convert when using, not here.
+ */
+
+/* No-op conversion: NT open semantics are expressed at the point of use
+ * (DesiredAccess/ShareAccess/...), so the flag is not translated here.
+ * Always succeeds with *result = 0. */
+int convert_client_oflag(int cflag, int *result)
+{
+    *result = 0;
+       return 0;
+}
+
+
+/*
+ * Map an NTSTATUS to a negative Unix errno.  Statuses not listed map to
+ * 0 when NT_SUCCESS(Status), otherwise to -EINVAL as a catch-all.
+ */
+int cfs_error_code(NTSTATUS Status)
+{
+    switch (Status) {
+
+        case STATUS_ACCESS_DENIED:
+            return (-EACCES);
+
+        case STATUS_ACCESS_VIOLATION:
+            return (-EFAULT);
+    
+        case STATUS_BUFFER_TOO_SMALL:
+            return (-ETOOSMALL);
+
+        case STATUS_INVALID_PARAMETER:
+            return (-EINVAL);
+
+        case STATUS_NOT_IMPLEMENTED:
+        case STATUS_NOT_SUPPORTED:
+            return (-EOPNOTSUPP);
+
+        case STATUS_INVALID_ADDRESS:
+        case STATUS_INVALID_ADDRESS_COMPONENT:
+            return (-EADDRNOTAVAIL);
+
+        case STATUS_NO_SUCH_DEVICE:
+        case STATUS_NO_SUCH_FILE:
+        case STATUS_OBJECT_NAME_NOT_FOUND:
+        case STATUS_OBJECT_PATH_NOT_FOUND:  
+        case STATUS_NETWORK_BUSY:
+        case STATUS_INVALID_NETWORK_RESPONSE:
+        case STATUS_UNEXPECTED_NETWORK_ERROR:
+            return (-ENETDOWN);
+
+        case STATUS_BAD_NETWORK_PATH:
+        case STATUS_NETWORK_UNREACHABLE:
+        case STATUS_PROTOCOL_UNREACHABLE:     
+            return (-ENETUNREACH);
+
+        case STATUS_LOCAL_DISCONNECT:
+        case STATUS_TRANSACTION_ABORTED:
+        case STATUS_CONNECTION_ABORTED:
+            return (-ECONNABORTED);
+
+        case STATUS_REMOTE_DISCONNECT:
+        case STATUS_LINK_FAILED:
+        case STATUS_CONNECTION_DISCONNECTED:
+        case STATUS_CONNECTION_RESET:
+        case STATUS_PORT_UNREACHABLE:
+            return (-ECONNRESET);
+
+        case STATUS_PAGEFILE_QUOTA:
+        case STATUS_NO_MEMORY:
+        case STATUS_CONFLICTING_ADDRESSES:
+        case STATUS_QUOTA_EXCEEDED:
+        case STATUS_TOO_MANY_PAGING_FILES:
+        case STATUS_INSUFFICIENT_RESOURCES:
+        case STATUS_WORKING_SET_QUOTA:
+        case STATUS_COMMITMENT_LIMIT:
+        case STATUS_TOO_MANY_ADDRESSES:
+        case STATUS_REMOTE_RESOURCES:
+            return (-ENOBUFS);
+
+        case STATUS_INVALID_CONNECTION:
+            return (-ENOTCONN);
+
+        case STATUS_PIPE_DISCONNECTED:
+            return (-ESHUTDOWN);
+
+        case STATUS_TIMEOUT:
+        case STATUS_IO_TIMEOUT:
+        case STATUS_LINK_TIMEOUT:
+            return (-ETIMEDOUT);
+
+        case STATUS_REMOTE_NOT_LISTENING:
+        case STATUS_CONNECTION_REFUSED:
+            return (-ECONNREFUSED);
+
+        case STATUS_HOST_UNREACHABLE:
+            return (-EHOSTUNREACH);
+
+        case STATUS_PENDING:
+        case STATUS_DEVICE_NOT_READY:
+            return (-EAGAIN);
+
+        case STATUS_CANCELLED:
+        case STATUS_REQUEST_ABORTED:
+            return (-EINTR);
+
+        case STATUS_BUFFER_OVERFLOW:
+        case STATUS_INVALID_BUFFER_SIZE:
+            return (-EMSGSIZE);
+
+    }
+
+    /* anything unlisted: success maps to 0, failure to -EINVAL */
+    if (NT_SUCCESS(Status)) 
+        return 0;
+
+    return (-EINVAL);
+}
+
+
+/* Stack-trace capture is not implemented on this platform: fill() records
+ * nothing ... */
+void cfs_stack_trace_fill(struct cfs_stack_trace *trace)
+{
+}
+
+/* ... and frame() accordingly always reports an empty trace. */
+void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no)
+{
+    return NULL;
+}
index 60c304b..2b3967f 100644 (file)
@@ -5,11 +5,11 @@
        <key>CFBundleDevelopmentRegion</key>
        <string>English</string>
        <key>CFBundleExecutable</key>
-       <string>portals</string>
+       <string>lnet</string>
        <key>CFBundleIconFile</key>
        <string></string>
        <key>CFBundleIdentifier</key>
-       <string>com.clusterfs.lustre.portals</string>
+       <string>com.clusterfs.lustre.lnet</string>
        <key>CFBundleInfoDictionaryVersion</key>
        <string>6.0</string>
        <key>CFBundlePackageType</key>
         <string>1.0.0</string>
        <key>OSBundleLibraries</key>
        <dict>
-                <key>com.apple.kernel.bsd</key>
-                <string>1.1</string>
-                <key>com.apple.kernel.iokit</key>
-                <string>1.0.0b1</string>
-                <key>com.apple.kernel.mach</key>
-                <string>1.0.0b1</string>
+               <key>com.apple.kpi.bsd</key>
+               <string>8.0.0b1</string>
+               <key>com.apple.kpi.libkern</key>
+               <string>8.0.0b1</string>
+               <key>com.apple.kpi.mach</key>
+               <string>8.0.0b1</string>
+               <key>com.apple.kpi.unsupported</key>
+               <string>8.0.0b1</string>
                 <key>com.clusterfs.lustre.libcfs</key>
                 <string>1.0.0</string>
        </dict>
index c0f2e71..3bc86f6 100644 (file)
@@ -1,6 +1,10 @@
-MODULES := portals
-portals-objs := api-errno.o api-ni.o api-wrap.o
-portals-objs += lib-init.o lib-me.o lib-msg.o lib-eq.o lib-md.o
-portals-objs += lib-move.o lib-ni.o lib-pid.o module.o
+MODULES := lnet
+
+lnet-objs := api-errno.o api-ni.o config.o
+lnet-objs += lib-me.o lib-msg.o lib-eq.o lib-md.o
+lnet-objs += lib-move.o module.o lo.o
+lnet-objs += router.o router_proc.o acceptor.o peer.o
+
+default: all
 
 @INCLUDE_RULES@
diff --git a/lnet/lnet/acceptor.c b/lnet/lnet/acceptor.c
new file mode 100644 (file)
index 0000000..1968f59
--- /dev/null
@@ -0,0 +1,537 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <lnet/lib-lnet.h>
+
+#ifdef __KERNEL__
+static char *accept = "secure";
+CFS_MODULE_PARM(accept, "s", charp, 0444,
+                "Accept connections (secure|all|none)");
+
+static int accept_port = 988;
+CFS_MODULE_PARM(accept_port, "i", int, 0444,
+                "Acceptor's port (same on all nodes)");
+
+static int accept_backlog = 127;
+CFS_MODULE_PARM(accept_backlog, "i", int, 0444,
+                "Acceptor's listen backlog");
+
+static int accept_timeout = 5;
+CFS_MODULE_PARM(accept_timeout, "i", int, 0644,
+               "Acceptor's timeout (seconds)");
+
+struct {
+       int               pta_shutdown;
+       cfs_socket_t     *pta_sock;
+       struct semaphore  pta_signal;
+} lnet_acceptor_state;
+
+/* Accessor: the acceptor's connection timeout in seconds
+ * (module parameter 'accept_timeout', default 5). */
+int
+lnet_acceptor_timeout(void)
+{
+        return accept_timeout;
+}
+EXPORT_SYMBOL(lnet_acceptor_timeout);
+
+/* Accessor: the well-known acceptor port
+ * (module parameter 'accept_port', default 988). */
+int
+lnet_acceptor_port(void)
+{
+        return accept_port;
+}
+EXPORT_SYMBOL(lnet_acceptor_port);
+
+/*
+ * Translate a connect failure into an operator-friendly console message.
+ * "Expected" transient errors (refused/unreachable) go to CDEBUG at
+ * D_NETERROR; everything else is escalated via LCONSOLE_ERROR.
+ */
+void
+lnet_connect_console_error (int rc, lnet_nid_t peer_nid, 
+                           __u32 peer_ip, int peer_port)
+{
+        switch (rc) {
+        /* "normal" errors */
+        case -ECONNREFUSED:
+                CDEBUG(D_NETERROR, "Connection to %s at host %u.%u.%u.%u "
+                       "on port %d was refused: "
+                       "check that Lustre is running on that node.\n",
+                       libcfs_nid2str(peer_nid),
+                       HIPQUAD(peer_ip), peer_port);
+                break;
+        case -EHOSTUNREACH:
+        case -ENETUNREACH:
+                CDEBUG(D_NETERROR, "Connection to %s at host %u.%u.%u.%u "
+                       "was unreachable: the network or that node may "
+                       "be down, or Lustre may be misconfigured.\n",
+                       libcfs_nid2str(peer_nid), HIPQUAD(peer_ip));
+                break;
+        case -ETIMEDOUT:
+                LCONSOLE_ERROR("Connection to %s at host %u.%u.%u.%u on "
+                               "port %d took too long: that node may be hung "
+                               "or experiencing high load.\n",
+                               libcfs_nid2str(peer_nid),
+                               HIPQUAD(peer_ip), peer_port);
+                break;
+        case -ECONNRESET:
+                LCONSOLE_ERROR("Connection to %s at host %u.%u.%u.%u on "
+                               "port %d was reset: "
+                               "is it running a compatible version of Lustre "
+                               "and is %s one of its NIDs?\n",
+                               libcfs_nid2str(peer_nid),
+                               HIPQUAD(peer_ip), peer_port,
+                               libcfs_nid2str(peer_nid));
+                break;
+        case -EPROTO:
+                LCONSOLE_ERROR("Protocol error connecting to %s at host "
+                               "%u.%u.%u.%u on port %d: "
+                               "is it running a compatible version of Lustre?\n",
+                               libcfs_nid2str(peer_nid),
+                               HIPQUAD(peer_ip), peer_port);
+                break;
+        case -EADDRINUSE:
+                LCONSOLE_ERROR("No privileged ports available to connect to "
+                               "%s at host %u.%u.%u.%u on port %d\n",
+                               libcfs_nid2str(peer_nid),
+                               HIPQUAD(peer_ip), peer_port);
+                break;
+        default:
+                LCONSOLE_ERROR("Unexpected error %d connecting to %s at "
+                               "host %u.%u.%u.%u on port %d\n", rc,
+                               libcfs_nid2str(peer_nid),
+                               HIPQUAD(peer_ip), peer_port);
+                break;
+        }
+}
+EXPORT_SYMBOL(lnet_connect_console_error);
+
+/*
+ * Establish an outbound connection to 'peer_nid' at peer_ip:peer_port,
+ * binding the local end to a privileged reserved port (tried from the
+ * top of the reserved range downwards).  Unless portals compatibility is
+ * "strong" (ln_ptlcompat == 2), an acceptor connection request is sent
+ * first so the remote acceptor can route the connection to the right NI.
+ *
+ * On success *sockp holds the connected socket and 0 is returned; on
+ * failure a console error is logged and the negative errno returned.
+ * -EADDRINUSE means every reserved port was exhausted.
+ */
+int
+lnet_connect(cfs_socket_t **sockp, lnet_nid_t peer_nid,
+            __u32 local_ip, __u32 peer_ip, int peer_port)
+{
+        lnet_acceptor_connreq_t cr;
+        cfs_socket_t           *sock;
+        int                     rc;
+        int                     port;
+        int                     fatal;
+
+        CLASSERT (sizeof(cr) <= 16);            /* not too big to be on the stack */
+
+        for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT; 
+             port >= LNET_ACCEPTOR_MIN_RESERVED_PORT; 
+             --port) {
+                /* Iterate through reserved ports. */
+
+                rc = libcfs_sock_connect(&sock, &fatal, 
+                                         local_ip, port, 
+                                         peer_ip, peer_port);
+                if (rc != 0) {
+                        if (fatal)
+                                goto failed;
+                        continue;
+                }
+
+                CLASSERT (LNET_PROTO_ACCEPTOR_VERSION == 1);
+
+                if (the_lnet.ln_ptlcompat != 2) {
+                        /* When portals compatibility is "strong", simply
+                         * connect (i.e. send no acceptor connection request).
+                         * Otherwise send an acceptor connection request. I can
+                         * have no portals peers so everyone else should
+                         * understand my protocol. */
+                        cr.acr_magic   = LNET_PROTO_ACCEPTOR_MAGIC;
+                        cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+                        cr.acr_nid     = peer_nid;
+
+                        if (the_lnet.ln_testprotocompat != 0) {
+                                /* single-shot proto check */
+                                LNET_LOCK();
+                                if ((the_lnet.ln_testprotocompat & 4) != 0) {
+                                        cr.acr_version++;
+                                        the_lnet.ln_testprotocompat &= ~4;
+                                }
+                                if ((the_lnet.ln_testprotocompat & 8) != 0) {
+                                        cr.acr_magic = LNET_PROTO_MAGIC;
+                                        the_lnet.ln_testprotocompat &= ~8;
+                                }
+                                LNET_UNLOCK();
+                        }
+
+                        rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+                                               accept_timeout);
+                        if (rc != 0)
+                                goto failed_sock;
+                }
+                
+                *sockp = sock;
+                return 0;
+        }
+
+        rc = -EADDRINUSE;
+        goto failed;
+        
+ failed_sock:
+        libcfs_sock_release(sock);
+ failed:
+        lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port);
+        return rc;
+}
+EXPORT_SYMBOL(lnet_connect);
+
+/* True if 'magic' matches 'constant' in either byte order (the peer may
+ * have the opposite endianness). */
+static inline int
+lnet_accept_magic(__u32 magic, __u32 constant)
+{
+        return (magic == constant ||
+                magic == __swab32(constant));
+}
+
+/*
+ * Handle a freshly accepted connection whose first 4 bytes ('magic')
+ * have already been read: validate the acceptor protocol magic/version,
+ * read the rest of the connection request, look up the requested NI and
+ * hand the socket to its LND.  'blind_ni' is non-NULL only when a single
+ * portals-compatible NI passed the socket here just for verification.
+ *
+ * Fixes (user-visible strings only): added the missing space in the two
+ * "in response to ..." CERROR messages (adjacent string literals were
+ * concatenated as "responseto"), and corrected "doesn not" -> "does not".
+ *
+ * Returns 0 on success or a negative errno (-EPROTO on protocol
+ * mismatch, -EIO on short reads, -EPERM when no NI matches or the NI
+ * cannot accept IP connections).
+ */
+int
+lnet_accept(lnet_ni_t *blind_ni, cfs_socket_t *sock, __u32 magic)
+{
+        lnet_acceptor_connreq_t cr;
+        __u32                   peer_ip;
+        int                     peer_port;
+        int                     rc;
+        int                     flip;
+        lnet_ni_t              *ni;
+        char                   *str;
+
+        /* CAVEAT EMPTOR: I may be called by an LND in any thread's context if
+         * I passed the new socket "blindly" to the single NI that needed an
+         * acceptor.  If so, blind_ni != NULL... */
+
+        LASSERT (sizeof(cr) <= 16);             /* not too big for the stack */
+        
+        rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port);
+        LASSERT (rc == 0);                      /* we succeeded before */
+
+        if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) {
+
+                if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) {
+                        /* future version compatibility!
+                         * When LNET unifies protocols over all LNDs, the first
+                         * thing sent will be a version query.  I send back
+                         * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */
+
+                        memset (&cr, 0, sizeof(cr));
+                        cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+                        cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+                        rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+                                               accept_timeout);
+
+                        if (rc != 0)
+                                CERROR("Error sending magic+version in response "
+                                       "to LNET magic from %u.%u.%u.%u: %d\n",
+                                       HIPQUAD(peer_ip), rc);
+                        return -EPROTO;
+                }
+
+                if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC))
+                        str = "'old' socknal/tcpnal";
+                else if (lnet_accept_magic(magic, LNET_PROTO_RA_MAGIC))
+                        str = "'old' ranal";
+                else if (lnet_accept_magic(magic, LNET_PROTO_OPENIB_MAGIC))
+                        str = "'old' openibnal";
+                else
+                        str = "unrecognised";
+            
+                LCONSOLE_ERROR("Refusing connection from %u.%u.%u.%u magic %08x: "
+                               " %s acceptor protocol\n",
+                               HIPQUAD(peer_ip), magic, str);
+                return -EPROTO;
+        }
+
+        flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC);
+
+        rc = libcfs_sock_read(sock, &cr.acr_version, 
+                              sizeof(cr.acr_version),
+                              accept_timeout);
+        if (rc != 0) {
+                CERROR("Error %d reading connection request version from "
+                       "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+                return -EIO;
+        }
+
+        if (flip)
+                __swab32s(&cr.acr_version);
+        
+        if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) {
+                /* future version compatibility!
+                 * An acceptor-specific protocol rev will first send a version
+                 * query.  I send back my current version to tell her I'm
+                 * "old". */
+                int peer_version = cr.acr_version;
+
+                memset (&cr, 0, sizeof(cr));
+                cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+                cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+
+                rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+                                       accept_timeout);
+
+                if (rc != 0)
+                        CERROR("Error sending magic+version in response "
+                               "to version %d from %u.%u.%u.%u: %d\n",
+                               peer_version, HIPQUAD(peer_ip), rc);
+                return -EPROTO;
+        }
+
+        rc = libcfs_sock_read(sock, &cr.acr_nid,
+                              sizeof(cr) -
+                              offsetof(lnet_acceptor_connreq_t, acr_nid),
+                              accept_timeout);
+        if (rc != 0) {
+                CERROR("Error %d reading connection request from "
+                       "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+                return -EIO;
+        }
+
+        if (flip)
+                __swab64s(&cr.acr_nid);
+
+        ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid));
+        if (ni == NULL ||               /* no matching net */
+            ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */
+                if (ni != NULL)
+                        lnet_ni_decref(ni);
+                LCONSOLE_ERROR("Refusing connection from %u.%u.%u.%u for %s: "
+                               " No matching NI\n",
+                               HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+                return -EPERM;
+        }
+
+        if (ni->ni_lnd->lnd_accept == NULL) {
+                /* This catches a request for the loopback LND */
+                lnet_ni_decref(ni);
+                LCONSOLE_ERROR("Refusing connection from %u.%u.%u.%u for %s: "
+                               " NI does not accept IP connections\n",
+                               HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+                return -EPERM;
+        }
+
+        CDEBUG(D_NET, "Accept %s from %u.%u.%u.%u%s\n",
+               libcfs_nid2str(cr.acr_nid), HIPQUAD(peer_ip),
+               blind_ni == NULL ? "" : " (blind)");
+
+        if (blind_ni == NULL) {
+                /* called by the acceptor: call into the requested NI... */
+                rc = ni->ni_lnd->lnd_accept(ni, sock);
+        } else {
+                /* portals_compatible set and the (only) NI called me to verify
+                 * and skip the connection request... */
+                LASSERT (the_lnet.ln_ptlcompat != 0);
+                LASSERT (ni == blind_ni);
+                rc = 0;
+        }
+
+        lnet_ni_decref(ni);
+        return rc;
+}
+EXPORT_SYMBOL(lnet_accept);
+        
+/* Acceptor thread entry point.  Listens on 'accept_port' and hands each
+ * incoming connection to the LND owning the requested NI (via lnet_accept()),
+ * or "blindly" to the single NI in portals-compatibility mode.  'arg'
+ * carries the 'secure' flag: refuse peers connecting from non-privileged
+ * ports.  Runs until lnet_acceptor_stop() sets pta_shutdown. */
+int
+lnet_acceptor(void *arg)
+{
+       char           name[16];
+       cfs_socket_t  *newsock;
+       int            rc;
+        int            n_acceptor_nis;
+       __u32          magic;
+       __u32          peer_ip;
+       int            peer_port;
+        lnet_ni_t     *blind_ni = NULL;
+        int            secure = (int)((unsigned long)arg);
+
+       LASSERT (lnet_acceptor_state.pta_sock == NULL);
+
+        if (the_lnet.ln_ptlcompat != 0) {
+                /* When portals_compatibility is enabled, peers may connect
+                 * without sending an acceptor connection request.  There is no
+                 * ambiguity about which network the peer wants to connect to
+                 * since there can only be 1 network, so I pass connections
+                 * "blindly" to it. */
+                n_acceptor_nis = lnet_count_acceptor_nis(&blind_ni);
+                LASSERT (n_acceptor_nis == 1);
+                LASSERT (blind_ni != NULL);
+        }
+
+       snprintf(name, sizeof(name), "acceptor_%03d", accept_port);
+       cfs_daemonize(name);
+       cfs_block_allsigs();
+
+       rc = libcfs_sock_listen(&lnet_acceptor_state.pta_sock,
+                               0, accept_port, accept_backlog);
+       if (rc != 0) {
+                if (rc == -EADDRINUSE)
+                        LCONSOLE_ERROR("Can't start acceptor on port %d: "
+                                       "port already in use\n",
+                                       accept_port);
+                else
+                        LCONSOLE_ERROR("Can't start acceptor on port %d: "
+                                       "unexpected error %d\n",
+                                       accept_port, rc);
+
+               lnet_acceptor_state.pta_sock = NULL;
+        } else {
+                LCONSOLE(0, "Accept %s, port %d%s\n", 
+                         accept, accept_port,
+                         blind_ni == NULL ? "" : " (proto compatible)");
+        }
+        
+       /* set init status and unblock parent */
+       lnet_acceptor_state.pta_shutdown = rc;
+       mutex_up(&lnet_acceptor_state.pta_signal);
+       
+       if (rc != 0)
+               return rc;
+
+       while (lnet_acceptor_state.pta_shutdown == 0) {
+               
+               rc = libcfs_sock_accept(&newsock, lnet_acceptor_state.pta_sock);
+               if (rc != 0) {
+                       if (rc != -EAGAIN) {
+                               /* transient accept failure: back off briefly
+                                * rather than spinning */
+                               CWARN("Accept error %d: pausing...\n", rc);
+                               cfs_pause(cfs_time_seconds(1));
+                       }
+                       continue;
+               }
+
+               rc = libcfs_sock_getaddr(newsock, 1, &peer_ip, &peer_port);
+               if (rc != 0) {
+                       CERROR("Can't determine new connection's address\n");
+                       goto failed;
+               }
+
+                if (secure && peer_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+                        CERROR("Refusing connection from %u.%u.%u.%u: "
+                               "insecure port %d\n",
+                               HIPQUAD(peer_ip), peer_port);
+                        goto failed;
+                }
+
+                if (blind_ni != NULL) {
+                        /* portals-compatible mode: no connection request on
+                         * the wire; hand the socket straight to the only NI */
+                        rc = blind_ni->ni_lnd->lnd_accept(blind_ni, newsock);
+                        if (rc != 0) {
+                                CERROR("NI %s refused 'blind' connection from "
+                                       "%u.%u.%u.%u\n", 
+                                       libcfs_nid2str(blind_ni->ni_nid), 
+                                       HIPQUAD(peer_ip));
+                                goto failed;
+                        }
+                        continue;
+                }
+                
+               /* read the protocol magic to identify the connection request */
+               rc = libcfs_sock_read(newsock, &magic, sizeof(magic),
+                                     accept_timeout);
+               if (rc != 0) {
+                        CERROR("Error %d reading connection request from "
+                               "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+                       goto failed;
+               }
+
+                rc = lnet_accept(NULL, newsock, magic);
+                if (rc != 0)
+                        goto failed;
+                
+                continue;
+                
+       failed:
+               libcfs_sock_release(newsock);
+       }
+       
+       libcfs_sock_release(lnet_acceptor_state.pta_sock);
+        lnet_acceptor_state.pta_sock = NULL;
+
+        if (blind_ni != NULL)
+                lnet_ni_decref(blind_ni);
+
+        LCONSOLE(0,"Acceptor stopping\n");
+       
+       /* unblock lnet_acceptor_stop() */
+       mutex_up(&lnet_acceptor_state.pta_signal);
+       return 0;
+}
+
+/* Start the acceptor thread if any NI requires it.  Parses the 'accept'
+ * module parameter ("secure"|"all"|"none"), spawns lnet_acceptor() and
+ * blocks on pta_signal until the thread reports its startup status.
+ * Returns 0 on success or when no acceptor is needed, -EINVAL on a bad
+ * 'accept' setting, -ESRCH/-ENETDOWN on thread/listen failure. */
+int
+lnet_acceptor_start(void)
+{
+       long   pid;
+        long   secure;
+
+       LASSERT (lnet_acceptor_state.pta_sock == NULL);
+       init_mutex_locked(&lnet_acceptor_state.pta_signal);
+
+        if (!strcmp(accept, "secure")) {
+                secure = 1;
+        } else if (!strcmp(accept, "all")) {
+                secure = 0;
+        } else if (!strcmp(accept, "none")) {
+                return 0;
+        } else {
+                LCONSOLE_ERROR ("Can't parse 'accept=\"%s\"'\n",
+                                accept);
+                return -EINVAL;
+        }
+       
+       if (lnet_count_acceptor_nis(NULL) == 0)  /* not required */
+               return 0;
+       
+       /* 'secure' is smuggled to the thread through the void* arg */
+       pid = cfs_kernel_thread(lnet_acceptor, (void *)secure, 0);
+       if (pid < 0) {
+               CERROR("Can't start acceptor thread: %ld\n", pid);
+               return -ESRCH;
+       }
+
+       mutex_down(&lnet_acceptor_state.pta_signal); /* wait for acceptor to startup */
+
+       /* pta_shutdown doubles as the acceptor's startup status here */
+       if (lnet_acceptor_state.pta_shutdown == 0) {
+                /* started OK */
+                LASSERT (lnet_acceptor_state.pta_sock != NULL);
+               return 0;
+        }
+
+        LASSERT (lnet_acceptor_state.pta_sock == NULL);
+       return -ENETDOWN;
+}
+
+/* Stop the acceptor thread: flag shutdown, abort the blocking accept(),
+ * then wait for the thread to signal its exit.  No-op if not running. */
+void
+lnet_acceptor_stop(void)
+{
+       if (lnet_acceptor_state.pta_sock == NULL) /* not running */
+               return;
+       
+       lnet_acceptor_state.pta_shutdown = 1;
+       libcfs_sock_abort_accept(lnet_acceptor_state.pta_sock);
+
+       /* block until acceptor signals exit */
+       mutex_down(&lnet_acceptor_state.pta_signal);
+}
+
+#else /* __KERNEL__ */
+
+/* Userspace stub: there is no acceptor thread outside the kernel. */
+int
+lnet_acceptor_start(void)
+{
+       return 0;
+}
+
+/* Userspace stub: nothing to stop. */
+void
+lnet_acceptor_stop(void)
+{
+}
+
+#endif /* !__KERNEL__ */
index 2f32cbf..a158d6e 100644 (file)
@@ -9,41 +9,3 @@
  */
 
 /* If you change these, you must update the number table in portals/errno.h */
-const char *ptl_err_str[] = {
-        "PTL_OK",
-        "PTL_SEGV",
-
-        "PTL_NO_SPACE",
-        "PTL_ME_IN_USE",
-        "PTL_VAL_FAILED",
-
-        "PTL_NAL_FAILED",
-        "PTL_NO_INIT",
-        "PTL_IFACE_DUP",
-        "PTL_IFACE_INVALID",
-
-        "PTL_HANDLE_INVALID",
-        "PTL_MD_INVALID",
-        "PTL_ME_INVALID",
-/* If you change these, you must update the number table in portals/errno.h */
-        "PTL_PROCESS_INVALID",
-        "PTL_PT_INDEX_INVALID",
-
-        "PTL_SR_INDEX_INVALID",
-        "PTL_EQ_INVALID",
-        "PTL_EQ_DROPPED",
-
-        "PTL_EQ_EMPTY",
-        "PTL_MD_NO_UPDATE",
-        "PTL_FAIL",
-
-        "PTL_IOV_INVALID",
-
-        "PTL_EQ_IN_USE",
-
-        "PTL_NI_INVALID",
-        "PTL_MD_ILLEGAL",
-
-        "PTL_MAX_ERRNO"
-};
-/* If you change these, you must update the number table in portals/errno.h */
index 91a307a..82c1d75 100644 (file)
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#define DEBUG_SUBSYSTEM S_PORTALS
-#include <portals/api-support.h>
+#define DEBUG_SUBSYSTEM S_LNET
+#include <lnet/lib-lnet.h>
 
-int ptl_init;
-
-/* Put some magic in the NI handle so uninitialised/zeroed handles are easy
- * to spot */
-#define NI_HANDLE_MAGIC  0xebc0de00
-#define NI_HANDLE_MASK   0x000000ff
+#ifdef __KERNEL__
+#define D_LNI D_CONSOLE
+#else
+#define D_LNI D_CONFIG
+#endif
 
-static struct nal_t *ptl_nal_table[NAL_MAX_NR + 1];
+lnet_t      the_lnet;                           /* THE state of the network */
 
 #ifdef __KERNEL__
-struct semaphore ptl_mutex;
 
-static void ptl_mutex_enter (void) 
+static char *ip2nets = "";
+CFS_MODULE_PARM(ip2nets, "s", charp, 0444,
+                "LNET network <- IP table");
+
+static char *networks = "";
+CFS_MODULE_PARM(networks, "s", charp, 0444,
+                "local networks");
+
+static char *routes = "";
+CFS_MODULE_PARM(routes, "s", charp, 0444,
+                "routes to non-local networks");
+
+static char *portals_compatibility = "none";
+CFS_MODULE_PARM(portals_compatibility, "s", charp, 0444,
+                "wire protocol compatibility: 'strong'|'weak'|'none'");
+
+/* Return the 'routes' module parameter string (may be empty). */
+char *
+lnet_get_routes(void)
+{
+        return routes;
+}
+
+/* Return this node's network configuration string, derived from the
+ * 'networks' or 'ip2nets' module parameters; defaults to "tcp" when
+ * neither is set.  Returns NULL on error (both parameters set, or
+ * ip2nets failed to parse). */
+char *
+lnet_get_networks(void)
+{
+        char   *nets;
+        int     rc;
+
+        /* 'networks' and 'ip2nets' are mutually exclusive */
+        if (*networks != 0 && *ip2nets != 0) {
+                LCONSOLE_ERROR("Please specify EITHER 'networks' or 'ip2nets'"
+                               " but not both at once\n");
+                return NULL;
+        }
+        
+        if (*ip2nets != 0) {
+                /* expand the ip2nets table into a networks string */
+                rc = lnet_parse_ip2nets(&nets, ip2nets);
+                return (rc == 0) ? nets : NULL;
+        }
+
+        if (*networks != 0)
+                return networks;
+
+        return "tcp";
+}
+
+/* Parse the 'portals_compatibility' module parameter.
+ * Returns the compatibility level: 0 ("none"), 1 ("weak"), 2 ("strong"),
+ * or -EINVAL for an unrecognised setting. */
+int
+lnet_get_portals_compatibility(void)
 {
-        mutex_down (&ptl_mutex);
+        if (!strcmp(portals_compatibility, "none")) {
+                return 0;
+        }
+
+        if (!strcmp(portals_compatibility, "weak")) {
+                /* warn BEFORE returning: previously this message sat after
+                 * 'return 1;' and was unreachable dead code */
+                LCONSOLE_WARN("Starting in weak portals-compatible mode\n");
+                return 1;
+        }
+
+        if (!strcmp(portals_compatibility, "strong")) {
+                /* likewise: the warning must precede the return */
+                LCONSOLE_WARN("Starting in strong portals-compatible mode\n");
+                return 2;
+        } 
+
+        LCONSOLE_ERROR("portals_compatibility=\"%s\" not supported\n",
+                       portals_compatibility);
+        return -EINVAL;
 }
 
-static void ptl_mutex_exit (void)
+void
+lnet_init_locks(void)
 {
-        mutex_up (&ptl_mutex);
+        spin_lock_init (&the_lnet.ln_lock);
+        cfs_waitq_init (&the_lnet.ln_waitq);
+        init_mutex(&the_lnet.ln_lnd_mutex);
+        init_mutex(&the_lnet.ln_api_mutex);
 }
+
+void
+lnet_fini_locks(void)
+{
+}
+
 #else
-static void ptl_mutex_enter (void)
+
+char *
+lnet_get_routes(void)
 {
+        char *str = getenv("LNET_ROUTES");
+        
+        return (str == NULL) ? "" : str;
 }
 
-static void ptl_mutex_exit (void) 
+char *
+lnet_get_networks (void)
 {
+        static char       default_networks[256];
+        char             *networks = getenv ("LNET_NETWORKS");
+        char             *ip2nets  = getenv ("LNET_IP2NETS");
+        char             *str;
+        char             *sep;
+        int               len;
+        int               nob;
+        int               rc;
+        struct list_head *tmp;
+
+#ifdef NOT_YET
+        if (networks != NULL && ip2nets != NULL) {
+                LCONSOLE_ERROR("Please set EITHER 'LNET_NETWORKS' or "
+                               "'LNET_IP2NETS' but not both at once\n");
+                return NULL;
+        }
+
+        if (ip2nets != NULL) {
+                rc = lnet_parse_ip2nets(&networks, ip2nets);
+                return (rc == 0) ? networks : NULL;
+        }
+#else
+        ip2nets = NULL;
+        rc = 0;
+#endif
+        if (networks != NULL)
+                return networks;
+
+        /* In userland, the default 'networks=' is the list of known net types */
+
+        len = sizeof(default_networks);
+        str = default_networks;
+        *str = 0;
+        sep = "";
+                
+        list_for_each (tmp, &the_lnet.ln_lnds) {
+                        lnd_t *lnd = list_entry(tmp, lnd_t, lnd_list);
+                        
+                        nob = snprintf(str, len, "%s%s", sep,
+                                       libcfs_lnd2str(lnd->lnd_type));
+                        len -= nob;
+                        if (len < 0) {
+                                /* overflowed the string; leave it where it was */
+                                *str = 0;
+                                break;
+                        }
+                        
+                        str += nob;
+                        sep = ",";
+        }
+
+        return default_networks;
+}
+
+/* Userspace: portals compatibility is a kernel-only concern; always
+ * report level 0 ("none"). */
+int
+lnet_get_portals_compatibility(void)
+{
+        return 0;
+}
+
+# if !HAVE_LIBPTHREAD
+
+void lnet_init_locks(void)
+{
+        the_lnet.ln_lock = 0;
+        the_lnet.ln_lnd_mutex = 0;
+        the_lnet.ln_api_mutex = 0;
 }
+
+/* Single-threaded userspace: the "locks" are plain ints; just assert
+ * they were all released (zero) at teardown. */
+void lnet_fini_locks(void)
+{
+        LASSERT (the_lnet.ln_api_mutex == 0);
+        LASSERT (the_lnet.ln_lnd_mutex == 0);
+        LASSERT (the_lnet.ln_lock == 0);
+}
+
+# else
+
+/* Pthread userspace: initialise the global LNET condvar and mutexes. */
+void lnet_init_locks(void)
+{
+        pthread_cond_init(&the_lnet.ln_cond, NULL);
+        pthread_mutex_init(&the_lnet.ln_lock, NULL);
+        pthread_mutex_init(&the_lnet.ln_lnd_mutex, NULL);
+        pthread_mutex_init(&the_lnet.ln_api_mutex, NULL);
+}
+
+/* Pthread userspace: destroy the locks in reverse order of creation. */
+void lnet_fini_locks(void)
+{
+        pthread_mutex_destroy(&the_lnet.ln_api_mutex);
+        pthread_mutex_destroy(&the_lnet.ln_lnd_mutex);
+        pthread_mutex_destroy(&the_lnet.ln_lock);
+        pthread_cond_destroy(&the_lnet.ln_cond);
+}
+
+# endif
 #endif
 
-nal_t *ptl_hndl2nal(ptl_handle_any_t *handle)
+void lnet_assert_wire_constants (void)
 {
-        unsigned int idx = handle->nal_idx;
+        /* Wire protocol assertions generated by 'wirecheck'
+         * running on Linux robert.bartonsoftware.com 2.6.8-1.521
+         * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux
+         * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */
 
-        /* XXX we really rely on the caller NOT racing with interface
-         * setup/teardown.  That ensures her NI handle can't get
-         * invalidated out from under her (or worse, swapped for a
-         * completely different interface!) */
+        /* Constants... */
+        CLASSERT (LNET_PROTO_TCP_MAGIC == 0xeebc0ded);
+        CLASSERT (LNET_PROTO_TCP_VERSION_MAJOR == 1);
+        CLASSERT (LNET_PROTO_TCP_VERSION_MINOR == 0);
+        CLASSERT (LNET_MSG_ACK == 0);
+        CLASSERT (LNET_MSG_PUT == 1);
+        CLASSERT (LNET_MSG_GET == 2);
+        CLASSERT (LNET_MSG_REPLY == 3);
+        CLASSERT (LNET_MSG_HELLO == 4);
 
-        LASSERT (ptl_init);
+        /* Checks for struct ptl_handle_wire_t */
+        CLASSERT ((int)sizeof(lnet_handle_wire_t) == 16);
+        CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_interface_cookie) == 0);
+        CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_interface_cookie) == 8);
+        CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_object_cookie) == 8);
+        CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_object_cookie) == 8);
 
-        if (((idx ^ NI_HANDLE_MAGIC) & ~NI_HANDLE_MASK) != 0)
-                return NULL;
+        /* Checks for struct lnet_magicversion_t */
+        CLASSERT ((int)sizeof(lnet_magicversion_t) == 8);
+        CLASSERT ((int)offsetof(lnet_magicversion_t, magic) == 0);
+        CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->magic) == 4);
+        CLASSERT ((int)offsetof(lnet_magicversion_t, version_major) == 4);
+        CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_major) == 2);
+        CLASSERT ((int)offsetof(lnet_magicversion_t, version_minor) == 6);
+        CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_minor) == 2);
+
+        /* Checks for struct lnet_hdr_t */
+        CLASSERT ((int)sizeof(lnet_hdr_t) == 72);
+        CLASSERT ((int)offsetof(lnet_hdr_t, dest_nid) == 0);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_nid) == 8);
+        CLASSERT ((int)offsetof(lnet_hdr_t, src_nid) == 8);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_nid) == 8);
+        CLASSERT ((int)offsetof(lnet_hdr_t, dest_pid) == 16);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_pid) == 4);
+        CLASSERT ((int)offsetof(lnet_hdr_t, src_pid) == 20);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_pid) == 4);
+        CLASSERT ((int)offsetof(lnet_hdr_t, type) == 24);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->type) == 4);
+        CLASSERT ((int)offsetof(lnet_hdr_t, payload_length) == 28);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->payload_length) == 4);
+        CLASSERT ((int)offsetof(lnet_hdr_t, msg) == 32);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg) == 40);
 
-        idx &= NI_HANDLE_MASK;
+        /* Ack */
+        CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.dst_wmd) == 32);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.dst_wmd) == 16);
+        CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.match_bits) == 48);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.match_bits) == 8);
+        CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.mlength) == 56);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.mlength) == 4);
+
+        /* Put */
+        CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ack_wmd) == 32);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ack_wmd) == 16);
+        CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.match_bits) == 48);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.match_bits) == 8);
+        CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.hdr_data) == 56);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.hdr_data) == 8);
+        CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ptl_index) == 64);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ptl_index) == 4);
+        CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.offset) == 68);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.offset) == 4);
+
+        /* Get */
+        CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.return_wmd) == 32);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.return_wmd) == 16);
+        CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.match_bits) == 48);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.match_bits) == 8);
+        CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.ptl_index) == 56);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.ptl_index) == 4);
+        CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.src_offset) == 60);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.src_offset) == 4);
+        CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.sink_length) == 64);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.get.sink_length) == 4);
+
+        /* Reply */
+        CLASSERT ((int)offsetof(lnet_hdr_t, msg.reply.dst_wmd) == 32);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.reply.dst_wmd) == 16);
+
+        /* Hello */
+        CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.incarnation) == 32);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.incarnation) == 8);
+        CLASSERT ((int)offsetof(lnet_hdr_t, msg.hello.type) == 40);
+        CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.hello.type) == 4);
+}
+
+lnd_t *
+lnet_find_lnd_by_type (int type) 
+{
+        lnd_t              *lnd;
+        struct list_head   *tmp;
+
+        /* holding lnd mutex */
+        list_for_each (tmp, &the_lnet.ln_lnds) {
+                lnd = list_entry(tmp, lnd_t, lnd_list);
+
+                if (lnd->lnd_type == type)
+                        return lnd;
+        }
         
-        if (idx > NAL_MAX_NR ||
-            ptl_nal_table[idx] == NULL ||
-            ptl_nal_table[idx]->nal_refct == 0)
-                return NULL;
+        return NULL;
+}
+
+void
+lnet_register_lnd (lnd_t *lnd)
+{
+        LNET_MUTEX_DOWN(&the_lnet.ln_lnd_mutex);
 
-        return ptl_nal_table[idx];
+        LASSERT (the_lnet.ln_init);
+        LASSERT (libcfs_isknown_lnd(lnd->lnd_type));
+        LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == NULL);
+        
+        list_add_tail (&lnd->lnd_list, &the_lnet.ln_lnds);
+        lnd->lnd_refcount = 0;
+
+        CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type));
+
+        LNET_MUTEX_UP(&the_lnet.ln_lnd_mutex);
 }
 
-int ptl_register_nal (ptl_interface_t interface, nal_t *nal)
+void
+lnet_unregister_lnd (lnd_t *lnd)
 {
-        int    rc;
+        LNET_MUTEX_DOWN(&the_lnet.ln_lnd_mutex);
+
+        LASSERT (the_lnet.ln_init);
+        LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == lnd);
+        LASSERT (lnd->lnd_refcount == 0);
         
-        ptl_mutex_enter();
+        list_del (&lnd->lnd_list);
+        CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type));
+
+        LNET_MUTEX_UP(&the_lnet.ln_lnd_mutex);
+}
+
+#ifndef LNET_USE_LIB_FREELIST
+
+/* No freelists configured (LNET_USE_LIB_FREELIST undefined): descriptors
+ * are allocated on demand, so there is nothing to set up. */
+int
+lnet_descriptor_setup (void)
+{
+        return 0;
+}
+
+/* No freelists configured: nothing to clean up. */
+void
+lnet_descriptor_cleanup (void)
+{
+}
+
+#else
+
+/* Initialise freelist 'fl' with 'n' objects of 'size' payload bytes each.
+ * Allocates one contiguous slab and threads every object onto fl_list.
+ * Returns 0 on success, -ENOMEM if the slab allocation fails. */
+int
+lnet_freelist_init (lnet_freelist_t *fl, int n, int size)
+{
+        char *space;
+
+        LASSERT (n > 0);
+
+        /* each object is prefixed by the freelist linkage header */
+        size += offsetof (lnet_freeobj_t, fo_contents);
+
+        LIBCFS_ALLOC(space, n * size);
+        if (space == NULL)
+                return (-ENOMEM);
+
+        CFS_INIT_LIST_HEAD (&fl->fl_list);
+        fl->fl_objs = space;
+        fl->fl_nobjs = n;
+        fl->fl_objsize = size;
+
+        /* carve the slab into objects and chain them onto the list;
+         * each object starts with its embedded list_head */
+        do
+        {
+                memset (space, 0, size);
+                list_add ((struct list_head *)space, &fl->fl_list);
+                space += size;
+        } while (--n != 0);
+
+        return (0);
+}
+
+/* Tear down a freelist created by lnet_freelist_init().  Every object
+ * must already have been returned to the list (none still in use). */
+void
+lnet_freelist_fini (lnet_freelist_t *fl)
+{
+        struct list_head *el;
+        int               count;
+
+        if (fl->fl_nobjs == 0)
+                return;
+
+        /* verify all objects are back on the freelist */
+        count = 0;
+        for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next)
+                count++;
+
+        LASSERT (count == fl->fl_nobjs);
+
+        LIBCFS_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize);
+        /* fix: was sizeof (fl), which zeroed only pointer-size bytes of
+         * the structure instead of the whole lnet_freelist_t */
+        memset (fl, 0, sizeof (*fl));
+}
+
+/* Pre-allocate the ME/msg/MD/EQ descriptor freelists.  Zeroes all four
+ * lists first so lnet_descriptor_cleanup() is safe to call even when one
+ * of the lnet_freelist_init() calls below fails part way through.
+ * Returns 0 on success or the first freelist-init error. */
+int
+lnet_descriptor_setup (void)
+{
+        /* NB on failure caller must still call lnet_descriptor_cleanup */
+        /*               ******                                         */
+        int        rc;
+
+        memset (&the_lnet.ln_free_mes,  0, sizeof (the_lnet.ln_free_mes));
+        memset (&the_lnet.ln_free_msgs, 0, sizeof (the_lnet.ln_free_msgs));
+        memset (&the_lnet.ln_free_mds,  0, sizeof (the_lnet.ln_free_mds));
+        memset (&the_lnet.ln_free_eqs,  0, sizeof (the_lnet.ln_free_eqs));
+
+        rc = lnet_freelist_init(&the_lnet.ln_free_mes,
+                                MAX_MES, sizeof (lnet_me_t));
+        if (rc != 0)
+                return (rc);
+
+        rc = lnet_freelist_init(&the_lnet.ln_free_msgs,
+                                MAX_MSGS, sizeof (lnet_msg_t));
+        if (rc != 0)
+                return (rc);
+
+        rc = lnet_freelist_init(&the_lnet.ln_free_mds,
+                                MAX_MDS, sizeof (lnet_libmd_t));
+        if (rc != 0)
+                return (rc);
+
+        rc = lnet_freelist_init(&the_lnet.ln_free_eqs,
+                                MAX_EQS, sizeof (lnet_eq_t));
+        return (rc);
+}
+
+/* Free all descriptor freelists.  Safe on partially-initialised state:
+ * lnet_freelist_fini() is a no-op for a zeroed/empty list. */
+void
+lnet_descriptor_cleanup (void)
+{
+        lnet_freelist_fini (&the_lnet.ln_free_mes);
+        lnet_freelist_fini (&the_lnet.ln_free_msgs);
+        lnet_freelist_fini (&the_lnet.ln_free_mds);
+        lnet_freelist_fini (&the_lnet.ln_free_eqs);
+}
+
+#endif
+
+/* Generate a boot-unique interface cookie for wire handles. */
+__u64
+lnet_create_interface_cookie (void)
+{
+        /* NB the interface cookie in wire handles guards against delayed
+         * replies and ACKs appearing valid after reboot. Initialisation time,
+         * even if it's only implemented to millisecond resolution is probably
+         * easily good enough. */
+        struct timeval tv;
+        __u64          cookie;
+#ifndef __KERNEL__
+        int            rc = gettimeofday (&tv, NULL);
+        LASSERT (rc == 0);
+#else
+        do_gettimeofday(&tv);
+#endif
+        /* cookie = microseconds since the epoch */
+        cookie = tv.tv_sec;
+        cookie *= 1000000;
+        cookie += tv.tv_usec;
+        return cookie;
+}
+
+/* Allocate and initialise the cookie -> handle hash table, and seed the
+ * object-cookie counter.  Returns 0 on success, -ENOMEM on allocation
+ * failure. */
+int
+lnet_setup_handle_hash (void) 
+{
+        int       i;
+        
+        /* Arbitrary choice of hash table size */
+#ifdef __KERNEL__
+        the_lnet.ln_lh_hash_size = CFS_PAGE_SIZE / sizeof (struct list_head);
+#else
+        the_lnet.ln_lh_hash_size = (MAX_MES + MAX_MDS + MAX_EQS)/4;
+#endif
+        LIBCFS_ALLOC(the_lnet.ln_lh_hash_table,
+                     the_lnet.ln_lh_hash_size * sizeof (struct list_head));
+        if (the_lnet.ln_lh_hash_table == NULL)
+                return (-ENOMEM);
+        
+        for (i = 0; i < the_lnet.ln_lh_hash_size; i++)
+                CFS_INIT_LIST_HEAD (&the_lnet.ln_lh_hash_table[i]);
+
+        /* first cookie starts past the type bits; cookies then advance in
+         * LNET_COOKIE_TYPES strides so the low bits always encode the type */
+        the_lnet.ln_next_object_cookie = LNET_COOKIE_TYPES;
+        
+        return (0);
+}
+
+/* Free the cookie hash table; safe to call if setup never ran. */
+void
+lnet_cleanup_handle_hash (void)
+{
+        if (the_lnet.ln_lh_hash_table == NULL)
+                return;
+        
+        LIBCFS_FREE(the_lnet.ln_lh_hash_table,
+                    the_lnet.ln_lh_hash_size * sizeof (struct list_head));
+}
+
+lnet_libhandle_t *
+lnet_lookup_cookie (__u64 cookie, int type) 
+{
+        /* ALWAYS called with LNET_LOCK held */
+        struct list_head    *list;
+        struct list_head    *el;
+        unsigned int         hash;
+
+        if ((cookie & (LNET_COOKIE_TYPES - 1)) != type)
+                return (NULL);
         
-        if (interface < 0 || interface > NAL_MAX_NR)
-                rc = PTL_IFACE_INVALID;
-        else if (ptl_nal_table[interface] != NULL)
-                rc = PTL_IFACE_DUP;
-        else {
-                rc = PTL_OK;
-                ptl_nal_table[interface] = nal;
-                LASSERT(nal->nal_refct == 0);
+        hash = ((unsigned int)cookie) % the_lnet.ln_lh_hash_size;
+        list = &the_lnet.ln_lh_hash_table[hash];
+        
+        list_for_each (el, list) {
+                lnet_libhandle_t *lh = list_entry (el, lnet_libhandle_t,
+                                                  lh_hash_chain);
+                
+                if (lh->lh_cookie == cookie)
+                        return (lh);
         }
+        
+        return (NULL);
+}
 
-        ptl_mutex_exit();
-        return (rc);
+/* Assign 'lh' the next object cookie (low bits encode 'type') and insert
+ * it into the cookie hash table. */
+void
+lnet_initialise_handle (lnet_libhandle_t *lh, int type) 
+{
+        /* ALWAYS called with LNET_LOCK held */
+        unsigned int    hash;
+
+        LASSERT (type >= 0 && type < LNET_COOKIE_TYPES);
+        lh->lh_cookie = the_lnet.ln_next_object_cookie | type;
+        the_lnet.ln_next_object_cookie += LNET_COOKIE_TYPES;
+        
+        hash = ((unsigned int)lh->lh_cookie) % the_lnet.ln_lh_hash_size;
+        list_add (&lh->lh_hash_chain, &the_lnet.ln_lh_hash_table[hash]);
+}
+
+/* Remove 'lh' from the cookie hash table so its cookie can no longer be
+ * looked up. */
+void
+lnet_invalidate_handle (lnet_libhandle_t *lh)
+{
+        /* ALWAYS called with LNET_LOCK held */
+        list_del (&lh->lh_hash_chain);
+}
+
+/* Initialise message-finalization state: in the kernel, one finalizer
+ * slot per online CPU; in userspace, a simple 'finalizing' flag.
+ * Returns 0 on success, -ENOMEM on allocation failure. */
+int
+lnet_init_finalizers(void)
+{
+#ifdef __KERNEL__
+        int    i;
+
+        the_lnet.ln_nfinalizers = num_online_cpus();
+
+        LIBCFS_ALLOC(the_lnet.ln_finalizers,
+                     the_lnet.ln_nfinalizers * 
+                     sizeof(*the_lnet.ln_finalizers));
+        if (the_lnet.ln_finalizers == NULL) {
+                CERROR("Can't allocate ln_finalizers\n");
+                return -ENOMEM;
+        }
+
+        for (i = 0; i < the_lnet.ln_nfinalizers; i++)
+                the_lnet.ln_finalizers[i] = NULL;
+#else
+        the_lnet.ln_finalizing = 0;
+#endif
+
+        CFS_INIT_LIST_HEAD(&the_lnet.ln_finalizeq);
+        return 0;
+}
+
+void
+lnet_fini_finalizers(void)
+{
+#ifdef __KERNEL__
+        int    i;
+        
+        for (i = 0; i < the_lnet.ln_nfinalizers; i++)
+                LASSERT (the_lnet.ln_finalizers[i] == NULL);
+
+        LIBCFS_FREE(the_lnet.ln_finalizers,
+                    the_lnet.ln_nfinalizers *
+                    sizeof(*the_lnet.ln_finalizers));
+#else
+        LASSERT (!the_lnet.ln_finalizing);
+#endif
+        LASSERT (list_empty(&the_lnet.ln_finalizeq));
 }
 
-void ptl_unregister_nal (ptl_interface_t interface)
+int
+lnet_prepare(lnet_pid_t requested_pid)
 {
-        LASSERT(interface >= 0 && interface <= NAL_MAX_NR);
-        LASSERT(ptl_nal_table[interface] != NULL);
-        LASSERT(ptl_nal_table[interface]->nal_refct == 0);
+        /* Prepare to bring up the network */
+        int               rc = 0;
+        int               i;
+
+        LASSERT (the_lnet.ln_refcount == 0);
+
+        the_lnet.ln_routing = 0;
+
+#ifdef __KERNEL__
+        LASSERT ((requested_pid & LNET_PID_USERFLAG) == 0);
+        the_lnet.ln_pid = requested_pid;
+#else
+        /* My PID must be unique on this node and flag I'm userspace */
+        the_lnet.ln_pid = getpid() | LNET_PID_USERFLAG;
+#endif
+
+        rc = lnet_descriptor_setup();
+        if (rc != 0)
+                goto failed0;
+
+        memset(&the_lnet.ln_counters, 0, 
+               sizeof(the_lnet.ln_counters));
+
+        CFS_INIT_LIST_HEAD (&the_lnet.ln_active_msgs);
+        CFS_INIT_LIST_HEAD (&the_lnet.ln_active_mds);
+        CFS_INIT_LIST_HEAD (&the_lnet.ln_active_eqs);
+        CFS_INIT_LIST_HEAD (&the_lnet.ln_test_peers);
+        CFS_INIT_LIST_HEAD (&the_lnet.ln_nis);
+        CFS_INIT_LIST_HEAD (&the_lnet.ln_zombie_nis);
+        CFS_INIT_LIST_HEAD (&the_lnet.ln_remote_nets);
+        CFS_INIT_LIST_HEAD (&the_lnet.ln_routers);
+
+        the_lnet.ln_interface_cookie = lnet_create_interface_cookie();
+
+        lnet_init_rtrpools();
+
+        rc = lnet_setup_handle_hash ();
+        if (rc != 0)
+                goto failed0;
+
+        rc = lnet_create_peer_table();
+        if (rc != 0)
+                goto failed1;
+
+        rc = lnet_init_finalizers();
+        if (rc != 0)
+                goto failed2;
+
+        the_lnet.ln_nportals = MAX_PORTALS;
+        LIBCFS_ALLOC(the_lnet.ln_portals, 
+                     the_lnet.ln_nportals * 
+                     sizeof(*the_lnet.ln_portals));
+        if (the_lnet.ln_portals == NULL) {
+                rc = -ENOMEM;
+                goto failed3;
+        }
+
+        for (i = 0; i < the_lnet.ln_nportals; i++) {
+                CFS_INIT_LIST_HEAD(&(the_lnet.ln_portals[i].ptl_ml));
+                CFS_INIT_LIST_HEAD(&(the_lnet.ln_portals[i].ptl_msgq));
+                the_lnet.ln_portals[i].ptl_options = 0;
+        }
+
+        return 0;
         
-        ptl_mutex_enter();
+ failed3:
+        lnet_fini_finalizers();
+ failed2:
+        lnet_destroy_peer_table();
+ failed1:
+        lnet_cleanup_handle_hash();
+ failed0:
+        lnet_descriptor_cleanup();
+        return rc;
+}
+
+int
+lnet_unprepare (void)
+{
+        int       idx;
         
-        ptl_nal_table[interface] = NULL;
+        /* NB no LNET_LOCK since this is the last reference.  All LND instances
+         * have shut down already, so it is safe to unlink and free all
+         * descriptors, even those that appear committed to a network op (eg MD
+         * with non-zero pending count) */
 
-        ptl_mutex_exit();
+        lnet_fail_nid(LNET_NID_ANY, 0);
+
+        LASSERT (list_empty(&the_lnet.ln_test_peers));
+        LASSERT (the_lnet.ln_refcount == 0);
+        LASSERT (list_empty(&the_lnet.ln_nis));
+        LASSERT (list_empty(&the_lnet.ln_zombie_nis));
+        LASSERT (the_lnet.ln_nzombie_nis == 0);
+               
+        for (idx = 0; idx < the_lnet.ln_nportals; idx++) {
+
+                LNetClearLazyPortal(idx);
+                LASSERT (list_empty(&the_lnet.ln_portals[idx].ptl_msgq));
+
+                while (!list_empty (&the_lnet.ln_portals[idx].ptl_ml)) {
+                        lnet_me_t *me = list_entry (the_lnet.ln_portals[idx].ptl_ml.next,
+                                                    lnet_me_t, me_list);
+
+                        CERROR ("Active me %p on exit\n", me);
+                        list_del (&me->me_list);
+                        lnet_me_free (me);
+                }
+        }
+
+        while (!list_empty (&the_lnet.ln_active_mds)) {
+                lnet_libmd_t *md = list_entry (the_lnet.ln_active_mds.next,
+                                               lnet_libmd_t, md_list);
+
+                CERROR ("Active md %p on exit\n", md);
+                list_del (&md->md_list);
+                lnet_md_free (md);
+        }
+
+        while (!list_empty (&the_lnet.ln_active_eqs)) {
+                lnet_eq_t *eq = list_entry (the_lnet.ln_active_eqs.next,
+                                            lnet_eq_t, eq_list);
+
+                CERROR ("Active eq %p on exit\n", eq);
+                list_del (&eq->eq_list);
+                lnet_eq_free (eq);
+        }
+
+        while (!list_empty (&the_lnet.ln_active_msgs)) {
+                lnet_msg_t *msg = list_entry (the_lnet.ln_active_msgs.next,
+                                              lnet_msg_t, msg_activelist);
+
+                CERROR ("Active msg %p on exit\n", msg);
+                LASSERT (msg->msg_onactivelist);
+                list_del (&msg->msg_activelist);
+                lnet_msg_free (msg);
+        }
+
+        LIBCFS_FREE(the_lnet.ln_portals,  
+                    the_lnet.ln_nportals * sizeof(*the_lnet.ln_portals));
+
+        lnet_free_rtrpools();
+        lnet_fini_finalizers();
+        lnet_destroy_peer_table();
+        lnet_cleanup_handle_hash();
+        lnet_descriptor_cleanup();
+
+        return (0);
 }
 
-int PtlInit(int *max_interfaces)
+lnet_ni_t  *
+lnet_net2ni_locked (__u32 net) /* find the local NI on 'net'; NB caller holds the LNET lock (uses _locked addref) */
+{
+        struct list_head *tmp;
+        lnet_ni_t        *ni;
+
+        list_for_each (tmp, &the_lnet.ln_nis) {
+                ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+                if (lnet_ptlcompat_matchnet(LNET_NIDNET(ni->ni_nid), net)) {
+                        lnet_ni_addref_locked(ni); /* returned NI carries a reference; caller must decref */
+                        return ni;
+                }
+        }
+        
+        return NULL; /* no local NI attached to this network */
+}
 
-        /* If this assertion fails, we need more bits in NI_HANDLE_MASK and
-         * to shift NI_HANDLE_MAGIC left appropriately */
-        LASSERT (NAL_MAX_NR < (NI_HANDLE_MASK + 1));
+int
+lnet_islocalnet (__u32 net) /* non-zero iff 'net' is served by a local NI */
+{
+        lnet_ni_t        *ni;
         
-        if (max_interfaces != NULL)
-                *max_interfaces = NAL_MAX_NR + 1;
+        LNET_LOCK();
+        ni = lnet_net2ni_locked(net);
+        if (ni != NULL)
+                lnet_ni_decref_locked(ni); /* only probing existence; drop the lookup's ref immediately */
+        LNET_UNLOCK();
 
-        ptl_mutex_enter();
+        return ni != NULL;
+}
 
-        if (!ptl_init) {
-                /* NULL pointers, clear flags */
-                memset(ptl_nal_table, 0, sizeof(ptl_nal_table));
-#ifndef __KERNEL__
-                /* Kernel NALs register themselves when their module loads,
-                 * and unregister themselves when their module is unloaded.
-                 * Userspace NALs, are plugged in explicitly here... */
-                {
-                        extern nal_t procapi_nal;
-
-                        /* XXX pretend it's socknal to keep liblustre happy... */
-                        ptl_nal_table[SOCKNAL] = &procapi_nal;
-                        LASSERT (procapi_nal.nal_refct == 0);
+lnet_ni_t  *
+lnet_nid2ni_locked (lnet_nid_t nid) /* find the local NI with this nid; NB caller holds the LNET lock */
+{
+        struct list_head *tmp;
+        lnet_ni_t        *ni;
+
+        list_for_each (tmp, &the_lnet.ln_nis) {
+                ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+                if (lnet_ptlcompat_matchnid(ni->ni_nid, nid)) {
+                        lnet_ni_addref_locked(ni); /* returned with a reference held; caller must decref */
+                        return ni;
                }
-#endif
-                ptl_init = 1;
        }
+        
+        return NULL;
+}
 
-        ptl_mutex_exit();
+int
+lnet_islocalnid (lnet_nid_t nid) /* non-zero iff 'nid' belongs to a local NI */
+{
+        lnet_ni_t     *ni;
         
-        return PTL_OK;
+        LNET_LOCK();
+        ni = lnet_nid2ni_locked(nid);
+        if (ni != NULL)
+                lnet_ni_decref_locked(ni); /* only probing existence; drop the lookup's ref immediately */
+        LNET_UNLOCK();
+
+        return ni != NULL;
+}
 
-void PtlFini(void)
+int
+lnet_count_acceptor_nis (lnet_ni_t **first_ni) /* count NIs whose LND uses the acceptor */
 {
-        nal_t  *nal;
-        int     i;
+        /* Return the # of NIs that need the acceptor.  Return the first one in
+         * *first_ni so the acceptor can pass it connections "blind" to retain
+         * binary compatibility. */
+        int                count = 0;
+#ifdef __KERNEL__
+        struct list_head  *tmp;
+        lnet_ni_t         *ni;
 
-        ptl_mutex_enter();
+        LNET_LOCK();
+        list_for_each (tmp, &the_lnet.ln_nis) {
+                ni = list_entry(tmp, lnet_ni_t, ni_list);
 
-        if (ptl_init) {
-                for (i = 0; i <= NAL_MAX_NR; i++) {
+                if (ni->ni_lnd->lnd_accept != NULL) {
+                        /* This LND uses the acceptor */
+                        if (count == 0 && first_ni != NULL) {
+                                lnet_ni_addref_locked(ni); /* *first_ni is returned with a ref; caller decrefs */
+                                *first_ni = ni;
+                        }
+                        count++;
+                }
+        }
+        
+        LNET_UNLOCK();
+#endif
+        return count; /* counting is kernel-only: userspace builds always return 0 */
+}
 
-                        nal = ptl_nal_table[i];
-                        if (nal == NULL)
-                                continue;
-                        
-                        if (nal->nal_refct != 0) {
-                                CWARN("NAL %x has outstanding refcount %d\n",
-                                      i, nal->nal_refct);
-                                nal->nal_ni_fini(nal);
+void
+lnet_shutdown_lndnis (void) /* tear down every NI: zombify, wait for them to drain, then free */
+{
+        int                i;
+        int                islo;
+        lnet_ni_t         *ni;
+
+        /* NB called holding the global mutex */
+
+        /* All quiet on the API front */
+        LASSERT (!the_lnet.ln_shutdown);
+        LASSERT (the_lnet.ln_refcount == 0);
+        LASSERT (list_empty(&the_lnet.ln_zombie_nis));
+        LASSERT (the_lnet.ln_nzombie_nis == 0);
+        LASSERT (list_empty(&the_lnet.ln_remote_nets));
+
+        LNET_LOCK();
+        the_lnet.ln_shutdown = 1;               /* flag shutdown */
+
+        /* Unlink NIs from the global table */
+        while (!list_empty(&the_lnet.ln_nis)) {
+                ni = list_entry(the_lnet.ln_nis.next,
+                                lnet_ni_t, ni_list);
+                list_del (&ni->ni_list);
+
+                the_lnet.ln_nzombie_nis++;
+                lnet_ni_decref_locked(ni); /* drop apini's ref */
+        }
+
+        /* Drop the cached eqwait NI. */
+        if (the_lnet.ln_eqwaitni != NULL) {
+                lnet_ni_decref_locked(the_lnet.ln_eqwaitni);
+                the_lnet.ln_eqwaitni = NULL;
+        }
+
+        /* Drop the cached loopback NI. */
+        if (the_lnet.ln_loni != NULL) {
+                lnet_ni_decref_locked(the_lnet.ln_loni);
+                the_lnet.ln_loni = NULL;
+        }
+
+        LNET_UNLOCK();
+        /* Clear the peer table and wait for all peers to go (they hold refs on
+         * their NIs) */
+
+        lnet_clear_peer_table();
+
+        LNET_LOCK();
+        /* Now wait for the NI's I just nuked to show up on apini_zombie_nis
+         * and shut them down in guaranteed thread context */
+        i = 2;
+        while (the_lnet.ln_nzombie_nis != 0) {
+
+                while (list_empty(&the_lnet.ln_zombie_nis)) {
+                        LNET_UNLOCK();
+                        ++i;
+                        if ((i & (-i)) == i) /* warn only when i is a power of 2: exponential backoff */
+                                CDEBUG(D_WARNING,"Waiting for %d zombie NIs\n",
+                                       the_lnet.ln_nzombie_nis);
+                        cfs_pause(cfs_time_seconds(1)); /* 1s poll interval while waiting for zombies */
+                        LNET_LOCK();
+                }
+
+                ni = list_entry(the_lnet.ln_zombie_nis.next,
+                                lnet_ni_t, ni_list);
+                list_del(&ni->ni_list);
+                ni->ni_lnd->lnd_refcount--;
+
+                LNET_UNLOCK();
+
+                islo = ni->ni_lnd->lnd_type == LOLND;
+
+                LASSERT (!in_interrupt ());
+                (ni->ni_lnd->lnd_shutdown)(ni); /* LND shutdown runs unlocked, in thread context */
+
+                /* can't deref lnd anymore now; it might have unregistered
+                 * itself...  */
+
+                if (!islo)
+                        CDEBUG(D_LNI, "Removed LNI %s\n",
+                               libcfs_nid2str(ni->ni_nid));
+
+                LIBCFS_FREE(ni, sizeof(*ni));
+
+                LNET_LOCK();
+                the_lnet.ln_nzombie_nis--;
+        }
+
+        the_lnet.ln_shutdown = 0; /* shutdown complete; allow a future startup */
+        LNET_UNLOCK();
+
+        if (the_lnet.ln_network_tokens != NULL) {
+                LIBCFS_FREE(the_lnet.ln_network_tokens,
+                            the_lnet.ln_network_tokens_nob);
+                the_lnet.ln_network_tokens = NULL;
+        }
+}
+
+int
+lnet_startup_lndnis (void)
+{
+        lnd_t             *lnd;
+        lnet_ni_t         *ni;
+        struct list_head   nilist;
+        int                rc = 0;
+        int                lnd_type;
+        int                nicount = 0;
+        char              *nets = lnet_get_networks();
+
+        INIT_LIST_HEAD(&nilist);
+
+        if (nets == NULL)
+                goto failed;
+
+        rc = lnet_parse_networks(&nilist, nets);
+        if (rc != 0)
+                goto failed;
+
+        while (!list_empty(&nilist)) {
+                ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+                lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid));
+
+                LASSERT (libcfs_isknown_lnd(lnd_type));
+
+                LNET_MUTEX_DOWN(&the_lnet.ln_lnd_mutex);
+                lnd = lnet_find_lnd_by_type(lnd_type);
+
+#ifdef __KERNEL__
+                if (lnd == NULL) {
+                        LNET_MUTEX_UP(&the_lnet.ln_lnd_mutex);
+                        rc = request_module(libcfs_lnd2modname(lnd_type));
+                        LNET_MUTEX_DOWN(&the_lnet.ln_lnd_mutex);
+
+                        lnd = lnet_find_lnd_by_type(lnd_type);
+                        if (lnd == NULL) {
+                                LNET_MUTEX_UP(&the_lnet.ln_lnd_mutex);
+                                CERROR("Can't load LND %s, module %s, rc=%d\n",
+                                       libcfs_lnd2str(lnd_type),
+                                       libcfs_lnd2modname(lnd_type), rc);
+#ifndef CONFIG_KMOD
+                                LCONSOLE_ERROR("Your kernel must be compiled "
+                                               "with CONFIG_KMOD set for "
+                                               "automatic module loading.");
+#endif
+                                goto failed;
                         }
-                        
-                        ptl_nal_table[i] = NULL;
+                }
+#else
+                if (lnd == NULL) {
+                        LNET_MUTEX_UP(&the_lnet.ln_lnd_mutex);
+                        CERROR("LND %s not supported\n",
+                               libcfs_lnd2str(lnd_type));
+                        goto failed;
+                }
+#endif
+
+                ni->ni_refcount = 1;
+
+                LNET_LOCK();
+                lnd->lnd_refcount++;
+                LNET_UNLOCK();
+
+                ni->ni_lnd = lnd;
+
+                rc = (lnd->lnd_startup)(ni);
+
+                LNET_MUTEX_UP(&the_lnet.ln_lnd_mutex);
+
+                if (rc != 0) {
+                        LCONSOLE_ERROR("Error %d starting up LNI %s\n",
+                                       rc, libcfs_lnd2str(lnd->lnd_type));
+                        LNET_LOCK();
+                        lnd->lnd_refcount--;
+                        LNET_UNLOCK();
+                        goto failed;
                 }
 
-                ptl_init = 0;
+                list_del(&ni->ni_list);
+
+                LNET_LOCK();
+                list_add_tail(&ni->ni_list, &the_lnet.ln_nis);
+                LNET_UNLOCK();
+
+                if (lnd->lnd_type == LOLND) {
+                        lnet_ni_addref(ni);
+                        LASSERT (the_lnet.ln_loni == NULL);
+                        the_lnet.ln_loni = ni;
+                        continue;
+                }
+
+#ifndef __KERNEL__
+                if (lnd->lnd_wait != NULL) {
+                        if (the_lnet.ln_eqwaitni == NULL) {
+                                lnet_ni_addref(ni);
+                                the_lnet.ln_eqwaitni = ni;
+                        }
+                } else {
+# if !HAVE_LIBPTHREAD
+                        LCONSOLE_ERROR("LND %s not supported in a "
+                                       "single-threaded runtime\n",
+                                       libcfs_lnd2str(lnd_type));
+                        goto failed;
+# endif
+                }
+#endif
+                if (ni->ni_peertxcredits == 0 ||
+                    ni->ni_maxtxcredits == 0) {
+                        LCONSOLE_ERROR("LNI %s has no %scredits\n",
+                                       libcfs_lnd2str(lnd->lnd_type),
+                                       ni->ni_peertxcredits == 0 ?
+                                       "" : "per-peer ");
+                        goto failed;
+                }
+
+                ni->ni_txcredits = ni->ni_mintxcredits = ni->ni_maxtxcredits;
+
+                CDEBUG(D_LNI, "Added LNI %s [%d/%d]\n",
+                       libcfs_nid2str(ni->ni_nid),
+                       ni->ni_peertxcredits, ni->ni_txcredits);
+
+                /* Handle nidstrings for network 0 just like this one */
+                if (the_lnet.ln_ptlcompat > 0) {
+                        if (nicount > 0) {
+                                LCONSOLE_ERROR("Can't run > 1 network when "
+                                               "portals_compatibility is set\n");
+                                goto failed;
+                        }
+                        libcfs_setnet0alias(lnd->lnd_type);
+                }
+                
+                nicount++;
         }
-        
-        ptl_mutex_exit();
+
+        if (the_lnet.ln_eqwaitni != NULL && nicount > 1) {
+                lnd_type = the_lnet.ln_eqwaitni->ni_lnd->lnd_type;
+                LCONSOLE_ERROR("LND %s can only run single-network\n",
+                               libcfs_lnd2str(lnd_type));
+                goto failed;
+        }
+
+        return 0;
+
+ failed:
+        lnet_shutdown_lndnis();
+
+        while (!list_empty(&nilist)) {
+                ni = list_entry(nilist.next, lnet_ni_t, ni_list);
+                list_del(&ni->ni_list);
+                LIBCFS_FREE(ni, sizeof(*ni));
+        }
+
+        return -ENETDOWN;
 }
 
-int PtlNIInit(ptl_interface_t interface, ptl_pid_t requested_pid,
-              ptl_ni_limits_t *desired_limits, ptl_ni_limits_t *actual_limits,
-              ptl_handle_ni_t *handle)
+int
+LNetInit(void) /* one-time module init: zero global state, set up locks, register built-in LNDs */
 {
-        nal_t *nal;
-        int    i;
         int    rc;
 
-        if (!ptl_init)
-                return PTL_NO_INIT;
+        lnet_assert_wire_constants ();
+        LASSERT (!the_lnet.ln_init); /* must not be called twice without LNetFini() */
 
-        ptl_mutex_enter ();
+        memset(&the_lnet, 0, sizeof(the_lnet));
 
-        if (interface == PTL_IFACE_DEFAULT) {
-                for (i = 0; i <= NAL_MAX_NR; i++)
-                        if (ptl_nal_table[i] != NULL) {
-                                interface = i;
-                                break;
+        rc = lnet_get_portals_compatibility();
+        if (rc < 0)
+                return rc;
+
+        lnet_init_locks();
+        CFS_INIT_LIST_HEAD(&the_lnet.ln_lnds);
+        the_lnet.ln_ptlcompat = rc; /* >0 enables portals-compatibility behaviour elsewhere */
+        the_lnet.ln_refcount = 0;
+        the_lnet.ln_init = 1;
+
+#ifdef __KERNEL__
+        /* All LNDs apart from the LOLND are in separate modules.  They
+         * register themselves when their module loads, and unregister
+         * themselves when their module is unloaded. */
+#else
+        /* Register LNDs
+         * NB the order here determines default 'networks=' order */
+# ifdef CRAY_XT3
+        LNET_REGISTER_ULND(the_ptllnd);
+# endif
+# if HAVE_LIBPTHREAD
+        LNET_REGISTER_ULND(the_tcplnd);
+# endif
+#endif
+        lnet_register_lnd(&the_lolnd); /* loopback LND is always built in */
+        return 0;
+}
+
+void
+LNetFini(void) /* undo LNetInit(): unregister remaining LNDs and tear down locks */
+{
+        LASSERT (the_lnet.ln_init);
+        LASSERT (the_lnet.ln_refcount == 0); /* no outstanding LNetNIInit()s */
+
+        while (!list_empty(&the_lnet.ln_lnds))
+                lnet_unregister_lnd(list_entry(the_lnet.ln_lnds.next,
+                                               lnd_t, lnd_list));
+        lnet_fini_locks();
+
+        the_lnet.ln_init = 0;
+}
+
+int
+LNetNIInit(lnet_pid_t requested_pid) /* refcounted bring-up of LNET; >=0 on success, negative errno on failure */
+{
+        int         im_a_router = 0;
+        int         rc;
+
+        LNET_MUTEX_DOWN(&the_lnet.ln_api_mutex); /* serialise init/fini against each other */
+
+        LASSERT (the_lnet.ln_init);
+        CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount);
+
+        if (the_lnet.ln_refcount > 0) {
+                rc = the_lnet.ln_refcount++; /* already up: just take another reference */
+                goto out;
+        }
+
+        if (requested_pid == LNET_PID_ANY) {
+                /* Don't instantiate LNET just for me */
+                rc = -ENETDOWN;
+                goto failed0;
+        }
+
+        rc = lnet_prepare(requested_pid);
+        if (rc != 0)
+                goto failed0;
+
+        rc = lnet_startup_lndnis();
+        if (rc != 0)
+                goto failed1;
+
+        rc = lnet_parse_routes(lnet_get_routes(), &im_a_router);
+        if (rc != 0)
+                goto failed2;
+
+        rc = lnet_check_routes();
+        if (rc != 0)
+                goto failed2;
+
+        rc = lnet_alloc_rtrpools(im_a_router);
+        if (rc != 0)
+                goto failed2;
+
+        rc = lnet_acceptor_start();
+        if (rc != 0)
+                goto failed2;
+
+        the_lnet.ln_refcount = 1;
+        /* Now I may use my own API functions... */
+
+        rc = lnet_router_checker_start();
+        if (rc != 0)
+                goto failed3;
+
+        rc = lnet_ping_target_init();
+        if (rc != 0)
+                goto failed4;
+
+        lnet_proc_init();
+        goto out;
+
+ failed4:
+        lnet_router_checker_stop();
+ failed3:
+        the_lnet.ln_refcount = 0; /* API functions unusable again beyond this point */
+        lnet_acceptor_stop();
+ failed2:
+        lnet_destroy_routes();
+        lnet_shutdown_lndnis();
+ failed1:
+        lnet_unprepare();
+ failed0:
+        LASSERT (rc < 0); /* every error path must carry a negative errno */
+ out:
+        LNET_MUTEX_UP(&the_lnet.ln_api_mutex);
+        return rc;
+}
+
+int
+LNetNIFini() /* drop one LNetNIInit() reference; full teardown when the last ref goes */
+{
+        LNET_MUTEX_DOWN(&the_lnet.ln_api_mutex);
+
+        LASSERT (the_lnet.ln_init);
+        LASSERT (the_lnet.ln_refcount > 0);
+
+        if (the_lnet.ln_refcount != 1) {
+                the_lnet.ln_refcount--; /* other users remain; nothing else to do */
+        } else {
+                LASSERT (!the_lnet.ln_niinit_self);
+
+                lnet_proc_fini();
+                lnet_ping_target_fini();
+                lnet_router_checker_stop();
+
+                /* Teardown fns that use my own API functions BEFORE here */
+                the_lnet.ln_refcount = 0;
+
+                lnet_acceptor_stop();
+                lnet_destroy_routes();
+                lnet_shutdown_lndnis();
+                lnet_unprepare();
+        }
+
+        LNET_MUTEX_UP(&the_lnet.ln_api_mutex);
+        return 0;
+}
+
+int
+LNetCtl(unsigned int cmd, void *arg)
+{
+        struct libcfs_ioctl_data *data = arg;
+        lnet_process_id_t         id;
+        lnet_ni_t                *ni;
+        int                       rc;
+
+        LASSERT (the_lnet.ln_init);
+        LASSERT (the_lnet.ln_refcount > 0);
+
+        switch (cmd) {
+        case IOC_LIBCFS_GET_NI:
+                rc = LNetGetId(data->ioc_count, &id);
+                data->ioc_nid = id.nid;
+                return rc;
+
+        case IOC_LIBCFS_FAIL_NID:
+                return lnet_fail_nid(data->ioc_nid, data->ioc_count);
+                
+        case IOC_LIBCFS_ADD_ROUTE:
+                rc = lnet_add_route(data->ioc_net, data->ioc_count, 
+                                    data->ioc_nid);
+                return (rc != 0) ? rc : lnet_check_routes();
+                
+        case IOC_LIBCFS_DEL_ROUTE:
+                return lnet_del_route(data->ioc_net, data->ioc_nid);
+
+        case IOC_LIBCFS_GET_ROUTE:
+                return lnet_get_route(data->ioc_count, 
+                                      &data->ioc_net, &data->ioc_count, 
+                                      &data->ioc_nid, &data->ioc_flags);
+        case IOC_LIBCFS_NOTIFY_ROUTER:
+                return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, 
+                                   (time_t)data->ioc_u64[0]);
+
+        case IOC_LIBCFS_PORTALS_COMPATIBILITY:
+                return the_lnet.ln_ptlcompat;
+
+        case IOC_LIBCFS_LNET_DIST:
+                rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]);
+                if (rc < 0 && rc != -EHOSTUNREACH)
+                        return rc;
+                
+                data->ioc_u32[0] = rc;
+                return 0;
+
+        case IOC_LIBCFS_TESTPROTOCOMPAT:
+                LNET_LOCK();
+                the_lnet.ln_testprotocompat = data->ioc_flags;
+                LNET_UNLOCK();
+                return 0;
+
+        case IOC_LIBCFS_PING:
+                rc = lnet_ping((lnet_process_id_t) {.nid = data->ioc_nid,
+                                                    .pid = data->ioc_u32[0]},
+                               data->ioc_u32[1], /* timeout */
+                               (lnet_process_id_t *)data->ioc_pbuf1,
+                               data->ioc_plen1/sizeof(lnet_process_id_t));
+                if (rc < 0)
+                        return rc;
+                data->ioc_count = rc;
+                return 0;
+
+        case IOC_LIBCFS_DEBUG_PEER: {
+                /* CAVEAT EMPTOR: this one designed for calling directly; not
+                 * via an ioctl */
+                lnet_process_id_t *id = arg;
+
+                lnet_debug_peer(id->nid);
+
+                ni = lnet_net2ni(LNET_NIDNET(id->nid));
+                if (ni == NULL) {
+                        CDEBUG(D_WARNING, "No NI for %s\n", libcfs_id2str(*id));
+                } else {
+                        if (ni->ni_lnd->lnd_ctl == NULL) {
+                                CDEBUG(D_WARNING, "No ctl for %s\n",
+                                       libcfs_id2str(*id));
+                        } else {
+                                (void)ni->ni_lnd->lnd_ctl(ni, cmd, arg);
                         }
-                /* NB if no interfaces are registered, 'interface' will
-                 * fail the valid test below */
+                        
+                        lnet_ni_decref(ni);
+                }
+                return 0;
         }
-        
-        if (interface < 0 || 
-            interface > NAL_MAX_NR ||
-            ptl_nal_table[interface] == NULL) {
-                GOTO(out, rc = PTL_IFACE_INVALID);
+                
+        default:
+                ni = lnet_net2ni(data->ioc_net);
+                if (ni == NULL)
+                        return -EINVAL;
+
+                if (ni->ni_lnd->lnd_ctl == NULL)
+                        rc = -EINVAL;
+                else
+                        rc = ni->ni_lnd->lnd_ctl(ni, cmd, arg);
+
+                lnet_ni_decref(ni);
+                return rc;
         }
+        /* not reached */
+}
 
-        nal = ptl_nal_table[interface];
-        nal->nal_handle.nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | interface;
-        nal->nal_handle.cookie = 0;
-        
-        CDEBUG(D_OTHER, "Starting up NAL (%x) refs %d\n", interface, nal->nal_refct);
-        rc = nal->nal_ni_init(nal, requested_pid, desired_limits, actual_limits);
+int
+LNetGetId(unsigned int index, lnet_process_id_t *id) /* id of local interface #index; -ENOENT when index is out of range */
+{
+        lnet_ni_t        *ni;
+        struct list_head *tmp;
+        int               rc = -ENOENT;
+
+        LASSERT (the_lnet.ln_init);
+        LASSERT (the_lnet.ln_refcount > 0);
+
+        LNET_LOCK();
+
+        list_for_each(tmp, &the_lnet.ln_nis) {
+                if (index-- != 0)
+                        continue;
+                
+                ni = list_entry(tmp, lnet_ni_t, ni_list);
 
-        if (rc != PTL_OK) {
-                CERROR("Error %d starting up NAL %x, refs %d\n", rc,
-                       interface, nal->nal_refct);
-                GOTO(out, rc);
+                id->nid = ni->ni_nid;
+                id->pid = the_lnet.ln_pid; /* every local NI reports the node-wide pid */
+                rc = 0;
+                break;
         }
+
+        LNET_UNLOCK();
+
+        return rc;
+}
+
+void
+LNetSnprintHandle(char *str, int len, lnet_handle_any_t h) /* render a handle's cookie as hex into str (bounded by len) */
+{
+        snprintf(str, len, LPX64, h.cookie);
+}
+
+
+int
+lnet_ping_target_init(void)
+{
+        lnet_handle_me_t  meh;
+        lnet_process_id_t id;
+        int               rc;
+        int               rc2;
+        int               n;
+        int               infosz;
+        int               i;
         
-        if (nal->nal_refct != 0) {
-                /* Caller gets to know if this was the first ref or not */
-                rc = PTL_IFACE_DUP;
+        for (n = 0; ; n++) {
+                rc = LNetGetId(n, &id);
+                if (rc == -ENOENT)
+                        break;
+
+                LASSERT (rc == 0);
+        }
+
+        infosz = offsetof(lnet_ping_info_t, pi_nid[n]);
+        LIBCFS_ALLOC(the_lnet.ln_ping_info, infosz);
+        if (the_lnet.ln_ping_info == NULL) {
+                CERROR("Can't allocate ping info[%d]\n", n);
+                return -ENOMEM;
+        }
+
+        the_lnet.ln_ping_info->pi_magic   = LNET_PROTO_PING_MAGIC;
+        the_lnet.ln_ping_info->pi_version = LNET_PROTO_PING_VERSION;
+        the_lnet.ln_ping_info->pi_pid     = the_lnet.ln_pid;
+        the_lnet.ln_ping_info->pi_nnids   = n;
+
+        for (i = 0; i < n; i++) {
+                rc = LNetGetId(i, &id);
+                LASSERT (rc == 0);
+                the_lnet.ln_ping_info->pi_nid[i] = id.nid;
         }
         
-        nal->nal_refct++;
-        *handle = nal->nal_handle;
+        /* We can have a tiny EQ since we only need to see the unlink event on
+         * teardown, which by definition is the last one! */
+        rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &the_lnet.ln_ping_target_eq);
+        if (rc != 0) {
+                CERROR("Can't allocate ping EQ: %d\n", rc);
+                goto failed_0;
+        }
 
- out:
-        ptl_mutex_exit ();
+        rc = LNetMEAttach(LNET_RESERVED_PORTAL,
+                          (lnet_process_id_t){.nid = LNET_NID_ANY,
+                                              .pid = LNET_PID_ANY},
+                          LNET_PROTO_PING_MATCHBITS, 0LL,
+                          LNET_UNLINK, LNET_INS_AFTER,
+                          &meh);
+        if (rc != 0) {
+                CERROR("Can't create ping ME: %d\n", rc);
+                goto failed_1;
+        }
+
+        rc = LNetMDAttach(meh,
+                          (lnet_md_t){.start = the_lnet.ln_ping_info,
+                                      .length = infosz,
+                                      .threshold = LNET_MD_THRESH_INF,
+                                      .options = (LNET_MD_OP_GET |
+                                                  LNET_MD_TRUNCATE |
+                                                  LNET_MD_MANAGE_REMOTE),
+                                      .eq_handle = the_lnet.ln_ping_target_eq},
+                          LNET_RETAIN,
+                          &the_lnet.ln_ping_target_md);
+        if (rc != 0) {
+                CERROR("Can't attach ping MD: %d\n", rc);
+                goto failed_2;
+        }
+
+        return 0;
+
+ failed_2:
+        rc2 = LNetMEUnlink(meh);
+        LASSERT (rc2 == 0);
+ failed_1:
+        rc2 = LNetEQFree(the_lnet.ln_ping_target_eq);
+        LASSERT (rc2 == 0);
+ failed_0:
+        LIBCFS_FREE(the_lnet.ln_ping_info, infosz);
 
         return rc;
 }
 
-int PtlNIFini(ptl_handle_ni_t ni)
+void
+lnet_ping_target_fini(void) /* unlink the ping MD, wait for its unlink event, then free EQ and ping info */
 {
-        nal_t *nal;
-        int    idx;
+        lnet_event_t    event;
+        int             rc;
+        int             which;
+        int             timeout_ms = 1000;
+        cfs_sigset_t    blocked = cfs_block_allsigs(); /* block signals so the wait can't be interrupted */
+
+        LNetMDUnlink(the_lnet.ln_ping_target_md);
+        /* NB md could be busy; this just starts the unlink */
+
+        for (;;) {
+                rc = LNetEQPoll(&the_lnet.ln_ping_target_eq, 1,
+                                timeout_ms, &event, &which);
+
+                /* I expect overflow... */
+                LASSERT (rc >= 0 || rc == -EOVERFLOW);
+
+                if (rc == 0) {
+                        /* timed out: provide a diagnostic */
+                        CWARN("Still waiting for ping MD to unlink\n");
+                        timeout_ms *= 2; /* exponential backoff between warnings */
+                        continue;
+                }
+
+                /* Got a valid event */
+                if (event.unlinked)
+                        break;
+        }
+
+        rc = LNetEQFree(the_lnet.ln_ping_target_eq);
+        LASSERT (rc == 0);
+
+        LIBCFS_FREE(the_lnet.ln_ping_info,
+                    offsetof(lnet_ping_info_t,
+                             pi_nid[the_lnet.ln_ping_info->pi_nnids]));
+
+        cfs_restore_sigs(blocked);
+}
+
+int
+lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_ids)
+{
+        lnet_handle_eq_t     eqh;
+        lnet_handle_md_t     mdh;
+        lnet_event_t         event;
+        int                  which;
+        int                  unlinked = 0;
+        int                  replied = 0;
+        const int            a_long_time = 60000; /* mS */
+        int                  infosz = offsetof(lnet_ping_info_t, pi_nid[n_ids]);
+        lnet_ping_info_t    *info;
+        lnet_process_id_t    tmpid;
+        int                  i;
+        int                  nob;
+        int                  rc;
+        int                  rc2;
+        cfs_sigset_t         blocked;
+
+        if (n_ids <= 0 ||
+            id.nid == LNET_NID_ANY ||
+            timeout_ms > 500000 ||              /* arbitrary limit! */
+            n_ids > 20)                         /* arbitrary limit! */
+                return -EINVAL;
+
+        if (id.pid == LNET_PID_ANY)
+                id.pid = LUSTRE_SRV_LNET_PID;
+
+        LIBCFS_ALLOC(info, infosz);
+        if (info == NULL)
+                return -ENOMEM;
+
+        /* NB 2 events max (including any unlink event) */
+        rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh);
+        if (rc != 0) {
+                CERROR("Can't allocate EQ: %d\n", rc);
+                goto out_0;
+        }
+
+        rc = LNetMDBind((lnet_md_t){.start = info,
+                                    .length = infosz,
+                                    .threshold = 2, /* GET/REPLY */
+                                    .options = LNET_MD_TRUNCATE,
+                                    .eq_handle = eqh},
+                        LNET_UNLINK,
+                        &mdh);
+        if (rc != 0) {
+                CERROR("Can't bind MD: %d\n", rc);
+                goto out_1;
+        }
+
+        rc = LNetGet(LNET_NID_ANY, mdh, id,
+                     LNET_RESERVED_PORTAL,
+                     LNET_PROTO_PING_MATCHBITS, 0);
+
+        if (rc != 0) {
+                /* Don't CERROR; this could be deliberate! */
+
+                rc2 = LNetMDUnlink(mdh);
+                LASSERT (rc2 == 0);
 
-        if (!ptl_init)
-                return PTL_NO_INIT;
+                /* NB must wait for the UNLINK event below... */
+                unlinked = 1;
+                timeout_ms = a_long_time;
+        }
+
+        do {
+                /* MUST block for unlink to complete */
+                if (unlinked)
+                        blocked = cfs_block_allsigs();
+
+                rc2 = LNetEQPoll(&eqh, 1, timeout_ms, &event, &which);
+
+                if (unlinked)
+                        cfs_restore_sigs(blocked);
 
-        ptl_mutex_enter ();
+                CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2,
+                       (rc2 <= 0) ? -1 : event.type,
+                       (rc2 <= 0) ? -1 : event.status,
+                       (rc2 > 0 && event.unlinked) ? " unlinked" : "");
+
+                LASSERT (rc2 != -EOVERFLOW);     /* can't miss anything */
+
+                if (rc2 <= 0 || event.status != 0) {
+                        /* timeout or error */
+                        if (!replied && rc == 0)
+                                rc = (rc2 < 0) ? rc2 :
+                                     (rc2 == 0) ? -ETIMEDOUT :
+                                     event.status;
+
+                        if (!unlinked) {
+                                /* Ensure completion in finite time... */
+                                LNetMDUnlink(mdh);
+                                /* No assertion (racing with network) */
+                                unlinked = 1;
+                                timeout_ms = a_long_time;
+                        } else if (rc2 == 0) {
+                                /* timed out waiting for unlink */
+                                CWARN("ping %s: late network completion\n",
+                                      libcfs_id2str(id));
+                        }
 
-        nal = ptl_hndl2nal (&ni);
-        if (nal == NULL) {
-                ptl_mutex_exit ();
-                return PTL_HANDLE_INVALID;
+                } else if (event.type == LNET_EVENT_REPLY) {
+                        replied = 1;
+                        rc = event.mlength;
+                }
+
+        } while (rc2 <= 0 || !event.unlinked);
+
+        if (!replied) {
+                if (rc >= 0)
+                        CWARN("%s: Unexpected rc >= 0 but no reply!\n",
+                              libcfs_id2str(id));
+                rc = -EIO;
+                goto out_1;
         }
 
-        idx = ni.nal_idx & NI_HANDLE_MASK;
+        nob = rc;
+        LASSERT (nob >= 0 && nob <= infosz);
 
-        LASSERT(nal->nal_refct > 0);
+        rc = -EPROTO;                           /* if I can't parse... */
 
-        nal->nal_refct--;
+        if (nob < 8) {
+                /* can't check magic/version */
+                CERROR("%s: ping info too short %d\n",
+                       libcfs_id2str(id), nob);
+                goto out_1;
+        }
 
-        /* nal_refct == 0 tells nal->shutdown to really shut down */
-        nal->nal_ni_fini(nal);
+        if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) {
+                /* NB I might be swabbing garbage until I check below, but it
+                 * doesn't matter */
+                __swab32s(&info->pi_version);
+                __swab32s(&info->pi_pid);
+                __swab32s(&info->pi_nnids);
+                for (i = 0; i < info->pi_nnids && i < n_ids; i++)
+                        __swab64s(&info->pi_nid[i]);
 
-        ptl_mutex_exit ();
-        return PTL_OK;
+        } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) {
+                CERROR("%s: Unexpected magic %08x\n", 
+                       libcfs_id2str(id), info->pi_magic);
+                goto out_1;
+        }
+
+        if (info->pi_version != LNET_PROTO_PING_VERSION) {
+                CERROR("%s: Unexpected version 0x%x\n",
+                       libcfs_id2str(id), info->pi_version);
+                goto out_1;
+        }
+
+        if (nob < offsetof(lnet_ping_info_t, pi_nid[0])) {
+                CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id), 
+                       nob, (int)offsetof(lnet_ping_info_t, pi_nid[0]));
+                goto out_1;
+        }
+
+        if (info->pi_nnids < n_ids)
+                n_ids = info->pi_nnids;
+
+        if (nob < offsetof(lnet_ping_info_t, pi_nid[n_ids])) {
+                CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id), 
+                       nob, (int)offsetof(lnet_ping_info_t, pi_nid[n_ids]));
+                goto out_1;
+        }
+
+        rc = -EFAULT;                           /* If I SEGV... */
+
+        for (i = 0; i < n_ids; i++) {
+                tmpid.pid = info->pi_pid;
+                tmpid.nid = info->pi_nid[i];
+#ifdef __KERNEL__
+                if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid)))
+                        goto out_1;
+#else
+                ids[i] = tmpid;
+#endif
+        }
+        rc = info->pi_nnids;
+
+ out_1:
+        rc2 = LNetEQFree(eqh);
+        if (rc2 != 0)
+                CERROR("rc2 %d\n", rc2);
+        LASSERT (rc2 == 0);
+
+ out_0:
+        LIBCFS_FREE(info, infosz);
+        return rc;
 }
diff --git a/lnet/lnet/api-wrap.c b/lnet/lnet/api-wrap.c
deleted file mode 100644 (file)
index 92f495e..0000000
+++ /dev/null
@@ -1,379 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * api/api-wrap.c
- * User-level wrappers that dispatch across the protection boundaries
- *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
- *
- *   This file is part of Lustre, http://www.sf.net/projects/lustre/
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define DEBUG_SUBSYSTEM S_PORTALS
-#include <portals/api-support.h>
-
-void PtlSnprintHandle(char *str, int len, ptl_handle_any_t h)
-{
-        snprintf(str, len, "0x%lx."LPX64, h.nal_idx, h.cookie);
-}
-
-int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t *ni_out)
-{
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        if (ptl_hndl2nal(&handle_in) == NULL)
-                return PTL_HANDLE_INVALID;
-
-        *ni_out = handle_in;
-        return PTL_OK;
-}
-
-int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id)
-{
-        nal_t     *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&ni_handle);
-        if (nal == NULL)
-                return PTL_NI_INVALID;
-
-        return nal->nal_get_id(nal, id);
-}
-
-int PtlGetUid(ptl_handle_ni_t ni_handle, ptl_uid_t *uid)
-{
-        nal_t     *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&ni_handle);
-        if (nal == NULL)
-                return PTL_NI_INVALID;
-
-        /* We don't support different uids yet */
-        *uid = 0;
-        return PTL_OK;
-}
-
-int PtlFailNid (ptl_handle_ni_t interface, ptl_nid_t nid, unsigned int threshold)
-{
-        nal_t     *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&interface);
-        if (nal == NULL)
-                return PTL_NI_INVALID;
-
-        return nal->nal_fail_nid(nal, nid, threshold);
-}
-
-int PtlLoopback (ptl_handle_ni_t interface, int set, int *enabled)
-{
-        nal_t     *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-        
-        nal = ptl_hndl2nal(&interface);
-        if (nal == NULL)
-                return PTL_NI_INVALID;
-        
-        return nal->nal_loopback(nal, set, enabled);
-}
-
-int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in,
-                ptl_sr_value_t *status_out)
-{
-        nal_t     *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&interface_in);
-        if (nal == NULL)
-                return PTL_NI_INVALID;
-
-        return nal->nal_ni_status(nal, register_in, status_out);
-}
-
-int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in,
-              unsigned long *distance_out)
-{
-        nal_t     *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&interface_in);
-        if (nal == NULL)
-                return PTL_NI_INVALID;
-
-        return nal->nal_ni_dist(nal, &process_in, distance_out);
-}
-
-int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in,
-                ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in,
-                ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in,
-                ptl_ins_pos_t pos_in, ptl_handle_me_t *handle_out)
-{
-        nal_t     *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&interface_in);
-        if (nal == NULL)
-                return PTL_NI_INVALID;
-
-        return nal->nal_me_attach(nal, index_in, match_id_in,
-                                  match_bits_in, ignore_bits_in,
-                                  unlink_in, pos_in, handle_out);
-}
-
-int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in,
-                ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in,
-                ptl_unlink_t unlink_in, ptl_ins_pos_t position_in,
-                ptl_handle_me_t * handle_out)
-{
-        nal_t     *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&current_in);
-        if (nal == NULL)
-                return PTL_ME_INVALID;
-
-        return nal->nal_me_insert(nal, &current_in, match_id_in,
-                                  match_bits_in, ignore_bits_in,
-                                  unlink_in, position_in, handle_out);
-}
-
-int PtlMEUnlink(ptl_handle_me_t current_in)
-{
-        nal_t     *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&current_in);
-        if (nal == NULL)
-                return PTL_ME_INVALID;
-
-        return nal->nal_me_unlink(nal, &current_in);
-}
-
-int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in,
-                ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out)
-{
-        nal_t     *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&me_in);
-        if (nal == NULL)
-                return PTL_ME_INVALID;
-
-        if (!PtlHandleIsEqual(md_in.eq_handle, PTL_EQ_NONE) &&
-            ptl_hndl2nal(&md_in.eq_handle) != nal)
-                return PTL_MD_ILLEGAL;
-
-        return (nal->nal_md_attach)(nal, &me_in, &md_in,
-                                    unlink_in, handle_out);
-}
-
-int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in,
-              ptl_unlink_t unlink_in, ptl_handle_md_t *handle_out)
-{
-        nal_t     *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&ni_in);
-        if (nal == NULL)
-                return PTL_NI_INVALID;
-
-        if (!PtlHandleIsEqual(md_in.eq_handle, PTL_EQ_NONE) &&
-            ptl_hndl2nal(&md_in.eq_handle) != nal)
-                return PTL_MD_ILLEGAL;
-
-        return (nal->nal_md_bind)(nal, &md_in, unlink_in, handle_out);
-}
-
-int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout,
-                ptl_md_t *new_inout, ptl_handle_eq_t testq_in)
-{
-        nal_t    *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&md_in);
-        if (nal == NULL)
-                return PTL_MD_INVALID;
-
-        if (!PtlHandleIsEqual(testq_in, PTL_EQ_NONE) &&
-            ptl_hndl2nal(&testq_in) != nal)
-                return PTL_EQ_INVALID;
-
-        return (nal->nal_md_update)(nal, &md_in,
-                                    old_inout, new_inout, &testq_in);
-}
-
-int PtlMDUnlink(ptl_handle_md_t md_in)
-{
-        nal_t    *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&md_in);
-        if (nal == NULL)
-                return PTL_MD_INVALID;
-
-        return (nal->nal_md_unlink)(nal, &md_in);
-}
-
-int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count,
-               ptl_eq_handler_t callback,
-               ptl_handle_eq_t *handle_out)
-{
-        nal_t    *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&interface);
-        if (nal == NULL)
-                return PTL_NI_INVALID;
-
-        return (nal->nal_eq_alloc)(nal, count, callback, handle_out);
-}
-
-int PtlEQFree(ptl_handle_eq_t eventq)
-{
-        nal_t       *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&eventq);
-        if (nal == NULL)
-                return PTL_EQ_INVALID;
-
-        return (nal->nal_eq_free)(nal, &eventq);
-}
-
-int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t *ev)
-{
-        int which;
-
-        return (PtlEQPoll (&eventq, 1, 0, ev, &which));
-}
-
-int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out)
-{
-        int which;
-
-        return (PtlEQPoll (&eventq_in, 1, PTL_TIME_FOREVER,
-                           event_out, &which));
-}
-
-int PtlEQPoll(ptl_handle_eq_t *eventqs_in, int neq_in, int timeout,
-              ptl_event_t *event_out, int *which_out)
-{
-        int           i;
-        nal_t        *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        if (neq_in < 1)
-                return PTL_EQ_INVALID;
-
-        nal = ptl_hndl2nal(&eventqs_in[0]);
-        if (nal == NULL)
-                return PTL_EQ_INVALID;
-
-        for (i = 1; i < neq_in; i++)
-                if (ptl_hndl2nal(&eventqs_in[i]) != nal)
-                        return PTL_EQ_INVALID;
-
-        return (nal->nal_eq_poll)(nal, eventqs_in, neq_in, timeout,
-                                  event_out, which_out);
-}
-
-
-int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in,
-               ptl_process_id_t match_id_in, ptl_pt_index_t portal_in)
-{
-        nal_t    *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&ni_in);
-        if (nal == NULL)
-                return PTL_NI_INVALID;
-
-        return (nal->nal_ace_entry)(nal, index_in, match_id_in, portal_in);
-}
-
-int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in,
-           ptl_process_id_t target_in, ptl_pt_index_t portal_in,
-           ptl_ac_index_t ac_in, ptl_match_bits_t match_bits_in,
-           ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in)
-{
-        nal_t    *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&md_in);
-        if (nal == NULL)
-                return PTL_MD_INVALID;
-
-        return (nal->nal_put)(nal, &md_in, ack_req_in,
-                              &target_in, portal_in, ac_in,
-                              match_bits_in, offset_in, hdr_data_in);
-}
-
-int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in,
-           ptl_pt_index_t portal_in, ptl_ac_index_t ac_in,
-           ptl_match_bits_t match_bits_in, ptl_size_t offset_in)
-{
-        nal_t  *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&md_in);
-        if (nal == NULL)
-                return PTL_MD_INVALID;
-
-        return (nal->nal_get)(nal, &md_in,
-                              &target_in, portal_in, ac_in,
-                              match_bits_in, offset_in);
-}
-
index bd05e93..9ce40fe 100644 (file)
@@ -1,32 +1,34 @@
-my_sources =    api-errno.c api-ni.c api-wrap.c \
-               lib-init.c lib-me.c lib-msg.c lib-eq.c \
-               lib-md.c lib-move.c lib-ni.c lib-pid.c
+my_sources =    api-errno.c api-ni.c config.c \
+               lib-me.c lib-msg.c lib-eq.c \
+               lib-md.c lib-move.c lo.c \
+               router.c router_proc.c \
+               acceptor.c peer.c
 
-if !CRAY_PORTALS
 
 if LIBLUSTRE
-noinst_LIBRARIES= libportals.a
-libportals_a_SOURCES= $(my_sources)
-libportals_a_CPPFLAGS = $(LLCPPFLAGS)
-libportals_a_CFLAGS = $(LLCFLAGS)
+noinst_LIBRARIES= liblnet.a
+liblnet_a_SOURCES= $(my_sources)
+liblnet_a_CPPFLAGS = $(LLCPPFLAGS)
+liblnet_a_CFLAGS = $(LLCFLAGS)
 endif
 
 if MODULES
 
 if LINUX
-modulenet_DATA = portals$(KMODEXT)
+modulenet_DATA = lnet$(KMODEXT)
 endif # LINUX
 
 if DARWIN
-macos_PROGRAMS := portals
+macos_PROGRAMS := lnet
 
-portals_SOURCES := api-errno.c api-ni.c api-wrap.c
-portals_SOURCES += lib-init.c lib-me.c lib-msg.c lib-eq.c lib-md.c
-portals_SOURCES += lib-move.c lib-ni.c lib-pid.c module.c
+lnet_SOURCES := api-errno.c api-ni.c config.c
+lnet_SOURCES += lib-me.c lib-msg.c lib-eq.c lib-md.c
+lnet_SOURCES += lib-move.c module.c lo.c router.c router_proc.c
+lnet_SOURCES += acceptor.c peer.c
 
-portals_CFLAGS := $(EXTRA_KCFLAGS)
-portals_LDFLAGS := $(EXTRA_KLDFLAGS)
-portals_LDADD := $(EXTRA_KLIBS)
+lnet_CFLAGS := $(EXTRA_KCFLAGS)
+lnet_LDFLAGS := $(EXTRA_KLDFLAGS)
+lnet_LDADD := $(EXTRA_KLIBS)
 
 plist_DATA := Info.plist
 
@@ -36,11 +38,9 @@ endif # DARWIN
 
 endif # MODULES
 
-endif # CRAY_PORTALS
-
 install-data-hook: $(install_data_hook)
 
 EXTRA_DIST := Info.plist
 
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@  portals
-DIST_SOURCES = $(portals-objs:%.o=%.c)
+MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ lnet
+DIST_SOURCES = $(lnet-objs:%.o=%.c)
diff --git a/lnet/lnet/config.c b/lnet/lnet/config.c
new file mode 100644 (file)
index 0000000..cd5e211
--- /dev/null
@@ -0,0 +1,1386 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (c) 2005 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.sf.net/projects/lustre/
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <lnet/lib-lnet.h>
+
+typedef struct {                                /* tmp struct for parsing routes */
+       struct list_head   ltb_list;            /* stash on lists */
+       int                ltb_size;            /* allocated size */
+       char               ltb_text[0];         /* text buffer */
+} lnet_text_buf_t;
+
+static int lnet_tbnob = 0;                     /* track text buf allocation */
+#define LNET_MAX_TEXTBUF_NOB     (64<<10)      /* bound allocation */
+#define LNET_SINGLE_TEXTBUF_NOB  (4<<10)
+
+typedef struct {
+        struct list_head   lre_list;            /* stash in a list */
+        int                lre_min;             /* min value */
+        int                lre_max;             /* max value */
+        int                lre_stride;          /* stride */
+} lnet_range_expr_t;
+
+static int lnet_re_alloc = 0;                   /* track expr allocation */
+
+void
+lnet_syntax(char *name, char *str, int offset, int width)
+{
+        static char dots[LNET_SINGLE_TEXTBUF_NOB];
+        static char dashes[LNET_SINGLE_TEXTBUF_NOB];
+        
+        memset(dots, '.', sizeof(dots));
+        dots[sizeof(dots)-1] = 0;
+        memset(dashes, '-', sizeof(dashes));
+        dashes[sizeof(dashes)-1] = 0;
+        
+       LCONSOLE_ERROR("Error parsing '%s=\"%s\"'\n", name, str);
+       LCONSOLE_ERROR("here...........%.*s..%.*s|%.*s|\n", 
+                       (int)strlen(name), dots, offset, dots,
+                       (width < 1) ? 0 : width - 1, dashes);
+}
+
+int
+lnet_issep (char c)
+{
+       switch (c) {
+       case '\n':
+       case '\r':
+       case ';':
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+int
+lnet_iswhite (char c)
+{
+       switch (c) {
+       case ' ':
+       case '\t':
+       case '\n':
+       case '\r':
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+char *
+lnet_trimwhite(char *str)
+{
+       char *end;
+       
+       while (lnet_iswhite(*str))
+               str++;
+       
+       end = str + strlen(str);
+       while (end > str) {
+               if (!lnet_iswhite(end[-1]))
+                       break;
+               end--;
+       }
+
+       *end = 0;
+       return str;
+}
+
+int
+lnet_net_unique(__u32 net, struct list_head *nilist)
+{
+        struct list_head *tmp;
+        lnet_ni_t        *ni;
+
+        list_for_each (tmp, nilist) {
+                ni = list_entry(tmp, lnet_ni_t, ni_list);
+
+                if (LNET_NIDNET(ni->ni_nid) == net)
+                        return 0;
+        }
+        
+        return 1;
+}
+
+lnet_ni_t *
+lnet_new_ni(__u32 net, struct list_head *nilist)
+{
+        lnet_ni_t *ni;
+
+        if (!lnet_net_unique(net, nilist)) {
+                LCONSOLE_ERROR("Duplicate network specified: %s\n",
+                               libcfs_net2str(net));
+                return NULL;
+        }
+        
+        LIBCFS_ALLOC(ni, sizeof(*ni));
+        if (ni == NULL) {
+                CERROR("Out of memory creating network %s\n",
+                       libcfs_net2str(net));
+                return NULL;
+        }
+        
+        /* zero counters/flags, NULL pointers... */
+        memset(ni, 0, sizeof(*ni));
+
+        /* LND will fill in the address part of the NID */
+        ni->ni_nid = LNET_MKNID(net, 0);
+        CFS_INIT_LIST_HEAD(&ni->ni_txq);
+
+        list_add_tail(&ni->ni_list, nilist);
+        return ni;
+}
+
+int
+lnet_parse_networks(struct list_head *nilist, char *networks)
+{
+       int        tokensize = strlen(networks) + 1;
+        char      *tokens;
+        char      *str;
+        lnet_ni_t *ni;
+        __u32      net;
+        int        nnets = 0;
+
+       if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) {
+               /* _WAY_ conservative */
+               LCONSOLE_ERROR("Can't parse networks: string too long\n");
+               return -EINVAL;
+       }
+
+        LIBCFS_ALLOC(tokens, tokensize);
+        if (tokens == NULL) {
+                CERROR("Can't allocate net tokens\n");
+               return -ENOMEM;
+        }
+
+        the_lnet.ln_network_tokens = tokens;
+        the_lnet.ln_network_tokens_nob = tokensize;
+        memcpy (tokens, networks, tokensize);
+       str = tokens;
+        
+        /* Add in the loopback network */
+        ni = lnet_new_ni(LNET_MKNET(LOLND, 0), nilist);
+        if (ni == NULL)
+                goto failed;
+        
+        while (str != NULL && *str != 0) {
+                char      *comma = strchr(str, ',');
+                char      *bracket = strchr(str, '(');
+                int        niface;
+               char      *iface;
+
+                /* NB we don't check interface conflicts here; it's the LNDs
+                 * responsibility (if it cares at all) */
+
+                if (bracket == NULL ||
+                   (comma != NULL && comma < bracket)) {
+
+                        /* no interface list specified */
+
+                       if (comma != NULL)
+                               *comma++ = 0;
+                       net = libcfs_str2net(lnet_trimwhite(str));
+                       
+                       if (net == LNET_NIDNET(LNET_NID_ANY)) {
+                                lnet_syntax("networks", networks,
+                                            str - tokens, strlen(str));
+                                LCONSOLE_ERROR("Unrecognised network type\n");
+                                goto failed;
+                        }
+
+                        if (LNET_NETTYP(net) != LOLND && /* loopback is implicit */
+                            lnet_new_ni(net, nilist) == NULL)
+                                goto failed;
+
+                       str = comma;
+                       continue;
+               }
+
+               *bracket = 0;
+               net = libcfs_str2net(lnet_trimwhite(str));
+               if (net == LNET_NIDNET(LNET_NID_ANY)) {
+                        lnet_syntax("networks", networks,
+                                    str - tokens, strlen(str));
+                        goto failed;
+                } 
+
+                if (nnets > 0 &&
+                    the_lnet.ln_ptlcompat > 0) {
+                        LCONSOLE_ERROR("Only 1 network supported when "
+                                       "'portals_compatible' is set\n");
+                        goto failed;
+                }
+
+                nnets++;
+                ni = lnet_new_ni(net, nilist);
+                if (ni == NULL)
+                        goto failed;
+
+                niface = 0;
+               iface = bracket + 1;
+
+               bracket = strchr(iface, ')');
+               if (bracket == NULL) {
+                        lnet_syntax("networks", networks,
+                                    iface - tokens, strlen(iface));
+                        goto failed;
+               }
+
+               *bracket = 0;
+               do {
+                       comma = strchr(iface, ',');
+                       if (comma != NULL)
+                               *comma++ = 0;
+                       
+                       iface = lnet_trimwhite(iface);
+                       if (*iface == 0) {
+                                lnet_syntax("networks", networks, 
+                                            iface - tokens, strlen(iface));
+                                goto failed;
+                        }
+
+                        if (niface == LNET_MAX_INTERFACES) {
+                                LCONSOLE_ERROR("Too many interfaces for net %s\n",
+                                               libcfs_net2str(net));
+                                goto failed;
+                        }
+
+                        ni->ni_interfaces[niface++] = iface;
+                       iface = comma;
+               } while (iface != NULL);
+
+               str = bracket + 1;
+               comma = strchr(bracket + 1, ',');
+               if (comma != NULL) {
+                       *comma = 0;
+                       str = lnet_trimwhite(str);
+                       if (*str != 0) {
+                                lnet_syntax("networks", networks,
+                                            str - tokens, strlen(str));
+                                goto failed;
+                        }
+                       str = comma + 1;
+                       continue;
+               }
+               
+               str = lnet_trimwhite(str);
+               if (*str != 0) {
+                        lnet_syntax("networks", networks,
+                                    str - tokens, strlen(str));
+                        goto failed;
+                }
+       }
+
+        LASSERT (!list_empty(nilist));
+        return 0;
+
+ failed:
+        while (!list_empty(nilist)) {
+                ni = list_entry(nilist->next, lnet_ni_t, ni_list);
+                
+                list_del(&ni->ni_list);
+                LIBCFS_FREE(ni, sizeof(*ni));
+        }
+       LIBCFS_FREE(tokens, tokensize);
+        the_lnet.ln_network_tokens = NULL;
+
+        return -EINVAL;
+}
+
+lnet_text_buf_t *
+lnet_new_text_buf (int str_len)
+{
+       lnet_text_buf_t *ltb;
+       int              nob;
+
+        /* NB allocate space for the terminating 0 */
+       nob = offsetof(lnet_text_buf_t, ltb_text[str_len + 1]);
+       if (nob > LNET_SINGLE_TEXTBUF_NOB) {
+               /* _way_ conservative for "route net gateway..." */
+               CERROR("text buffer too big\n");
+               return NULL;
+       }
+
+       if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) {
+               CERROR("Too many text buffers\n");
+               return NULL;
+       }
+       
+       LIBCFS_ALLOC(ltb, nob);
+       if (ltb == NULL)
+               return NULL;
+
+       ltb->ltb_size = nob;
+        ltb->ltb_text[0] = 0;
+       lnet_tbnob += nob;
+       return ltb;
+}
+
+void
+lnet_free_text_buf (lnet_text_buf_t *ltb)
+{
+       lnet_tbnob -= ltb->ltb_size;
+       LIBCFS_FREE(ltb, ltb->ltb_size);
+}
+
+void
+lnet_free_text_bufs(struct list_head *tbs)
+{
+       lnet_text_buf_t  *ltb;
+       
+       while (!list_empty(tbs)) {
+               ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list);
+               
+               list_del(&ltb->ltb_list);
+               lnet_free_text_buf(ltb);
+       }
+}
+
+void
+lnet_print_text_bufs(struct list_head *tbs)
+{
+       struct list_head  *tmp;
+       lnet_text_buf_t   *ltb;
+
+       list_for_each (tmp, tbs) {
+               ltb = list_entry(tmp, lnet_text_buf_t, ltb_list);
+
+               CDEBUG(D_WARNING, "%s\n", ltb->ltb_text);
+       }
+
+       CDEBUG(D_WARNING, "%d allocated\n", lnet_tbnob);
+}
+
+int
+lnet_str2tbs_sep (struct list_head *tbs, char *str)
+{
+       struct list_head  pending;
+       char             *sep;
+       int               nob;
+        int               i;
+       lnet_text_buf_t  *ltb;
+
+       INIT_LIST_HEAD(&pending);
+
+       /* Split 'str' into separate commands */
+       for (;;) {
+                /* skip leading whitespace */
+                while (lnet_iswhite(*str))
+                        str++;
+                
+               /* scan for separator or comment */
+               for (sep = str; *sep != 0; sep++)
+                       if (lnet_issep(*sep) || *sep == '#')
+                               break;
+
+               nob = sep - str;
+               if (nob > 0) {
+                       ltb = lnet_new_text_buf(nob);
+                       if (ltb == NULL) {
+                               lnet_free_text_bufs(&pending);
+                               return -1;
+                       }
+                       
+                        for (i = 0; i < nob; i++)
+                                if (lnet_iswhite(str[i]))
+                                        ltb->ltb_text[i] = ' ';
+                                else
+                                        ltb->ltb_text[i] = str[i];
+
+                       ltb->ltb_text[nob] = 0;
+
+                       list_add_tail(&ltb->ltb_list, &pending);
+               }
+
+               if (*sep == '#') {
+                       /* scan for separator */
+                       do {
+                               sep++;
+                       } while (*sep != 0 && !lnet_issep(*sep));
+               }
+               
+               if (*sep == 0)
+                       break;
+
+               str = sep + 1;
+       }
+
+       list_splice(&pending, tbs->prev);
+       return 0;
+}
+
+int
+lnet_expand1tb (struct list_head *list, 
+              char *str, char *sep1, char *sep2, 
+              char *item, int itemlen)
+{
+       int              len1 = sep1 - str;
+       int              len2 = strlen(sep2 + 1);
+       lnet_text_buf_t *ltb;
+
+       LASSERT (*sep1 == '[');
+       LASSERT (*sep2 == ']');
+
+       ltb = lnet_new_text_buf(len1 + itemlen + len2);
+       if (ltb == NULL)
+               return -ENOMEM;
+       
+       memcpy(ltb->ltb_text, str, len1);
+       memcpy(&ltb->ltb_text[len1], item, itemlen);
+       memcpy(&ltb->ltb_text[len1+itemlen], sep2 + 1, len2);
+       ltb->ltb_text[len1 + itemlen + len2] = 0;
+       
+       list_add_tail(&ltb->ltb_list, list);
+       return 0;
+}
+
+int
+lnet_str2tbs_expand (struct list_head *tbs, char *str)
+{
+       char              num[16];
+       struct list_head  pending;
+       char             *sep;
+       char             *sep2;
+       char             *parsed;
+       char             *enditem;
+       int               lo;
+       int               hi;
+       int               stride;
+       int               i;
+       int               nob;
+       int               scanned;
+
+       INIT_LIST_HEAD(&pending);
+       
+       sep = strchr(str, '[');
+       if (sep == NULL)                        /* nothing to expand */
+               return 0;
+
+        sep2 = strchr(sep, ']');
+        if (sep2 == NULL)
+                goto failed;
+
+       for (parsed = sep; parsed < sep2; parsed = enditem) {
+
+               enditem = ++parsed;
+               while (enditem < sep2 && *enditem != ',')
+                       enditem++;
+
+               if (enditem == parsed)          /* no empty items */
+                       goto failed;
+
+                if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, &stride, &scanned) < 3) {
+
+                       if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) {
+
+                               /* simple string enumeration */
+                               if (lnet_expand1tb(&pending, str, sep, sep2,
+                                                   parsed, enditem - parsed) != 0)
+                                       goto failed;
+                               
+                               continue;
+                       }
+                       
+                       stride = 1;
+               }
+
+               /* range expansion */
+
+               if (enditem != parsed + scanned) /* no trailing junk */
+                       goto failed;
+                        
+               if (hi < 0 || lo < 0 || stride < 0 || hi < lo || 
+                   (hi - lo) % stride != 0)
+                       goto failed;
+                        
+               for (i = lo; i <= hi; i += stride) {
+
+                       snprintf(num, sizeof(num), "%d", i);
+                       nob = strlen(num);
+                       if (nob + 1 == sizeof(num))
+                               goto failed;
+                       
+                       if (lnet_expand1tb(&pending, str, sep, sep2, 
+                                           num, nob) != 0)
+                               goto failed;
+               }
+       }
+               
+       list_splice(&pending, tbs->prev);
+       return 1;
+       
+ failed:
+       lnet_free_text_bufs(&pending);
+       return -1;
+}
+
+int
+lnet_parse_hops (char *str, unsigned int *hops)
+{
+        int     len = strlen(str);
+        int     nob = len;
+        
+        return (sscanf(str, "%u%n", hops, &nob) >= 1 &&
+                nob == len &&
+                *hops > 0 && *hops < 256);
+}
+
+
+int
+lnet_parse_route (char *str, int *im_a_router)
+{
+       /* static scratch buffer OK (single threaded) */
+       static char       cmd[LNET_SINGLE_TEXTBUF_NOB];
+
+       struct list_head  nets;
+       struct list_head  gateways;
+       struct list_head *tmp1;
+       struct list_head *tmp2;
+       __u32             net;
+       lnet_nid_t        nid;
+       lnet_text_buf_t  *ltb;
+       int               rc;
+       char             *sep;
+       char             *token = str;
+       int               ntokens = 0;
+        int               myrc = -1;
+        unsigned int      hops;
+        int               got_hops = 0;
+
+       CFS_INIT_LIST_HEAD(&gateways);
+       CFS_INIT_LIST_HEAD(&nets);
+
+       /* save a copy of the string for error messages */
+       strncpy(cmd, str, sizeof(cmd) - 1);
+       cmd[sizeof(cmd) - 1] = 0;
+
+       sep = str;
+       for (;;) {
+               /* scan for token start */
+               while (lnet_iswhite(*sep))
+                       sep++;
+               if (*sep == 0) {
+                       if (ntokens < (got_hops ? 3 : 2))
+                                goto token_error;
+                       break;
+               }
+
+               ntokens++;
+               token = sep++;
+
+               /* scan for token end */
+               while (*sep != 0 && !lnet_iswhite(*sep))
+                       sep++;
+               if (*sep != 0)
+                       *sep++ = 0;
+               
+               if (ntokens == 1) {
+                       tmp2 = &nets;           /* expanding nets */
+                } else if (ntokens == 2 &&
+                           lnet_parse_hops(token, &hops)) {
+                        got_hops = 1;           /* got a hop count */
+                        continue;
+                } else {
+                       tmp2 = &gateways;       /* expanding gateways */
+                }
+                
+               ltb = lnet_new_text_buf(strlen(token));
+               if (ltb == NULL)
+                       goto out;
+
+               strcpy(ltb->ltb_text, token);
+               tmp1 = &ltb->ltb_list;
+               list_add_tail(tmp1, tmp2);
+               
+               while (tmp1 != tmp2) {
+                       ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list);
+
+                       rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text);
+                       if (rc < 0)
+                               goto token_error;
+
+                       tmp1 = tmp1->next;
+                       
+                       if (rc > 0) {           /* expanded! */
+                               list_del(&ltb->ltb_list);
+                               lnet_free_text_buf(ltb);
+                               continue;
+                       }
+
+                       if (ntokens == 1) {
+                               net = libcfs_str2net(ltb->ltb_text);
+                               if (net == LNET_NIDNET(LNET_NID_ANY) ||
+                                    LNET_NETTYP(net) == LOLND)
+                                       goto token_error;
+                       } else {
+                               nid = libcfs_str2nid(ltb->ltb_text);
+                               if (nid == LNET_NID_ANY ||
+                                    LNET_NETTYP(LNET_NIDNET(nid)) == LOLND)
+                                       goto token_error;
+                       }
+               }
+       }
+
+        if (!got_hops)
+                hops = 1;
+
+       LASSERT (!list_empty(&nets));
+       LASSERT (!list_empty(&gateways));
+
+       list_for_each (tmp1, &nets) {
+               ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list);
+               net = libcfs_str2net(ltb->ltb_text);
+               LASSERT (net != LNET_NIDNET(LNET_NID_ANY));
+
+               list_for_each (tmp2, &gateways) {
+                       ltb = list_entry(tmp2, lnet_text_buf_t, ltb_list);
+                       nid = libcfs_str2nid(ltb->ltb_text);
+                       LASSERT (nid != LNET_NID_ANY);
+
+                        if (lnet_islocalnid(nid)) {
+                                *im_a_router = 1;
+                                continue;
+                        }
+                        
+                        rc = lnet_add_route (net, hops, nid);
+                        if (rc != 0) {
+                                CERROR("Can't create route "
+                                       "to %s via %s\n",
+                                       libcfs_net2str(net),
+                                       libcfs_nid2str(nid));
+                                goto out;
+                        }
+               }
+       }
+
+        myrc = 0;
+        goto out;
+        
+ token_error:
+       lnet_syntax("routes", cmd, token - str, strlen(token));
+ out:
+       lnet_free_text_bufs(&nets);
+       lnet_free_text_bufs(&gateways);
+       return myrc;
+}
+
+int
+lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router)
+{
+       lnet_text_buf_t   *ltb;
+
+       while (!list_empty(tbs)) {
+               ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list);
+
+               if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) {
+                       lnet_free_text_bufs(tbs);
+                       return -EINVAL;
+               }
+
+               list_del(&ltb->ltb_list);
+               lnet_free_text_buf(ltb);
+       }
+
+        return 0;
+}
+
+int
+lnet_parse_routes (char *routes, int *im_a_router)
+{
+       struct list_head  tbs;
+       int               rc = 0;
+
+        *im_a_router = 0;
+
+        if (the_lnet.ln_ptlcompat > 0 && 
+            routes[0] != 0) {
+                /* Can't route when running in compatibility mode */
+                LCONSOLE_ERROR("Route tables are not supported when "
+                               "'portals_compatible' is set\n");
+                return -EINVAL;
+        }
+        
+       CFS_INIT_LIST_HEAD(&tbs);
+
+       if (lnet_str2tbs_sep(&tbs, routes) < 0) {
+               CERROR("Error parsing routes\n");
+               rc = -EINVAL;
+       } else {
+                rc = lnet_parse_route_tbs(&tbs, im_a_router);
+        }
+
+       LASSERT (lnet_tbnob == 0);
+       return rc;
+}
+
+void
+lnet_print_range_exprs(struct list_head *exprs)
+{
+        struct list_head   *e;
+        lnet_range_expr_t *lre;
+        
+        list_for_each(e, exprs) {
+                lre = list_entry(e, lnet_range_expr_t, lre_list);
+                
+                CDEBUG(D_WARNING, "%d-%d/%d\n", 
+                       lre->lre_min, lre->lre_max, lre->lre_stride);
+        }
+        
+        CDEBUG(D_WARNING, "%d allocated\n", lnet_re_alloc);
+}
+
+int
+lnet_new_range_expr(struct list_head *exprs, int min, int max, int stride)
+{
+        lnet_range_expr_t *lre;
+
+        CDEBUG(D_NET, "%d-%d/%d\n", min, max, stride);
+
+        if (min < 0 || min > 255 || min > max || stride < 0)
+                return -EINVAL;
+
+        LIBCFS_ALLOC(lre, sizeof(*lre));
+        if (lre == NULL)
+                return -ENOMEM;
+
+        lnet_re_alloc++;
+
+        lre->lre_min = min;
+        lre->lre_max = max;
+        lre->lre_stride = stride;
+        
+        list_add(&lre->lre_list, exprs);
+        return 0;
+}
+
+void
+lnet_destroy_range_exprs(struct list_head *exprs)
+{
+        lnet_range_expr_t *lre;
+        
+        while (!list_empty(exprs)) {
+                lre = list_entry(exprs->next, lnet_range_expr_t, lre_list);
+                
+                list_del(&lre->lre_list);
+                LIBCFS_FREE(lre, sizeof(*lre));
+                lnet_re_alloc--;
+        }
+}
+
+int
+lnet_parse_range_expr(struct list_head *exprs, char *str)
+{
+        int                nob = strlen(str);
+        char              *sep;
+        int                n;
+        int                x;
+        int                y;
+        int                z;
+        int                rc;
+
+        if (nob == 0)
+                return -EINVAL;
+
+        if (!strcmp(str, "*"))                  /* match all */
+                return lnet_new_range_expr(exprs, 0, 255, 1);
+                
+        n = nob;
+        if (sscanf(str, "%u%n", &x, &n) >= 1 && n == nob) {
+                /* simple number */
+                return lnet_new_range_expr(exprs, x, x, 1);
+        }
+
+        /* Has to be an expansion */
+        if (!(str[0] == '[' && nob > 2 && str[nob-1] == ']'))
+                return -EINVAL;
+
+        nob -= 2;
+        str++;
+        str[nob] = 0;
+
+        do {
+                /* Comma separated list of expressions... */
+                sep = strchr(str, ',');
+                if (sep != NULL)
+                        *sep++ = 0;
+                
+                nob = strlen(str);
+                n = nob;
+                if (sscanf(str, "%u%n", &x, &n) >= 1 && n == nob) {
+                        /* simple number */
+                        rc = lnet_new_range_expr(exprs, x, x, 1);
+                        if (rc != 0)
+                                return rc;
+
+                        continue;
+                }
+
+                n = nob;
+                if (sscanf(str, "%u-%u%n", &x, &y, &n) >= 2 && n == nob) {
+                        /* simple range */
+                        rc = lnet_new_range_expr(exprs, x, y, 1);
+                        if (rc != 0)
+                                return rc;
+                        continue;
+                }
+                        
+                n = nob;
+                if (sscanf(str, "%u-%u/%u%n", &x, &y, &z, &n) >= 3 && n == nob) {
+                        /* strided range */
+                        rc = lnet_new_range_expr(exprs, x, y, z);
+                        if (rc != 0)
+                                return rc;
+                        continue;
+                }
+                
+                return -EINVAL;
+
+        } while ((str = sep) != NULL);
+
+        return 0;
+}
+
+int
+lnet_match_network_token(char *token, __u32 *ipaddrs, int nip)
+{
+        struct list_head   exprs[4];
+        struct list_head  *e;
+        lnet_range_expr_t *re;
+        char              *str;
+        int                i;
+        int                j;
+        __u32              ip;
+        int                n;
+        int                match;
+        int                rc;
+
+        for (i = 0; i < 4; i++)
+                CFS_INIT_LIST_HEAD(&exprs[i]);
+
+        for (i = 0; i < 4; i++) {
+                str = token;
+                if (i != 3) {
+                        token = strchr(token, '.');
+                        if (token == NULL) {
+                                rc = -EINVAL;
+                                goto out;
+                        }
+                        *token++ = 0;
+                }
+
+                rc = lnet_parse_range_expr(&exprs[i], str);
+                if (rc != 0) {
+                        LASSERT (rc < 0);
+                        goto out;
+                }
+        }
+
+        for (match = i = 0; !match && i < nip; i++) {
+                ip = ipaddrs[i];
+                
+                for (match = 1, j = 0; match && j < 4; j++) {
+                        n = (ip >> (8 * (3 - j))) & 0xff;
+                        match = 0;
+
+                        list_for_each(e, &exprs[j]) {
+                                re = list_entry(e, lnet_range_expr_t, lre_list);
+
+                                if (re->lre_min <= n &&
+                                    re->lre_max >= n &&
+                                    (n - re->lre_min) % re->lre_stride == 0) {
+                                        match = 1;
+                                        break;
+                                }
+                        }
+                }
+        }
+        
+        rc = match ? 1 : 0;
+
+ out:
+        for (i = 0; i < 4; i++)
+                lnet_destroy_range_exprs(&exprs[i]);
+        LASSERT (lnet_re_alloc == 0);
+        
+        return rc;
+}
+
+int
+lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip)
+{
+        static char tokens[LNET_SINGLE_TEXTBUF_NOB];
+
+        int   matched = 0;
+        int   ntokens = 0;
+        int   len;
+        char *net = NULL;
+        char *sep;
+        char *token;
+        int   rc;
+
+        LASSERT (strlen(net_entry) < sizeof(tokens));
+
+        /* work on a copy of the string */
+        strcpy(tokens, net_entry);
+        sep = tokens;
+        for (;;) {
+                /* scan for token start */
+                while (lnet_iswhite(*sep))
+                        sep++;
+                if (*sep == 0)
+                        break;
+                
+                token = sep++;
+                
+                /* scan for token end */
+                while (*sep != 0 && !lnet_iswhite(*sep))
+                        sep++;
+                if (*sep != 0)
+                        *sep++ = 0;
+                
+                if (ntokens++ == 0) {
+                        net = token;
+                        continue;
+                }
+
+                len = strlen(token);
+                
+                rc = lnet_match_network_token(token, ipaddrs, nip);
+                if (rc < 0) {
+                        lnet_syntax("ip2nets", net_entry,
+                                    token - tokens, len);
+                        return rc;
+                }
+
+                matched |= (rc != 0);
+        }
+        
+        if (!matched)
+                return 0;
+        
+        strcpy(net_entry, net);                 /* replace with matched net */
+        return 1;
+}
+
+__u32 
+lnet_netspec2net(char *netspec)
+{
+        char   *bracket = strchr(netspec, '(');
+        __u32   net;
+
+        if (bracket != NULL)
+                *bracket = 0;
+
+        net = libcfs_str2net(netspec);
+
+        if (bracket != NULL)
+                *bracket = '(';
+                
+        return net;
+}
+
+int
+lnet_splitnets(char *source, struct list_head *nets)
+{
+        int               offset = 0;
+        int               offset2;
+        int               len;
+        lnet_text_buf_t  *tb;
+        lnet_text_buf_t  *tb2;
+        struct list_head *t;
+        char             *sep;
+        char             *bracket;
+        __u32             net;
+
+        LASSERT (!list_empty(nets));
+        LASSERT (nets->next == nets->prev);     /* single entry */
+        
+        tb = list_entry(nets->next, lnet_text_buf_t, ltb_list);
+
+        for (;;) {
+                sep = strchr(tb->ltb_text, ',');
+                bracket = strchr(tb->ltb_text, '(');
+
+                if (sep != NULL && 
+                    bracket != NULL && 
+                    bracket < sep) {
+                        /* netspec lists interfaces... */
+
+                        offset2 = offset + (bracket - tb->ltb_text);
+                        len = strlen(bracket);
+
+                        bracket = strchr(bracket + 1, ')');
+
+                        if (bracket == NULL ||
+                            !(bracket[1] == ',' || bracket[1] == 0)) {
+                                lnet_syntax("ip2nets", source, offset2, len);
+                                return -EINVAL;
+                        }
+
+                        sep = (bracket[1] == 0) ? NULL : bracket + 1;
+                }
+
+                if (sep != NULL)
+                        *sep++ = 0;
+
+                net = lnet_netspec2net(tb->ltb_text);
+                if (net == LNET_NIDNET(LNET_NID_ANY)) {
+                        lnet_syntax("ip2nets", source, offset,
+                                    strlen(tb->ltb_text));
+                        return -EINVAL;
+                }
+
+                list_for_each(t, nets) {
+                        tb2 = list_entry(t, lnet_text_buf_t, ltb_list);
+
+                        if (tb2 == tb)
+                                continue;
+                        
+                        if (net == lnet_netspec2net(tb2->ltb_text)) {
+                                /* duplicate network */
+                                lnet_syntax("ip2nets", source, offset,
+                                            strlen(tb->ltb_text));
+                                return -EINVAL;
+                        }
+                }
+                
+                if (sep == NULL)
+                        return 0;
+
+                offset += sep - tb->ltb_text;
+                tb2 = lnet_new_text_buf(strlen(sep));
+                if (tb2 == NULL)
+                        return -ENOMEM;
+                        
+                strcpy(tb2->ltb_text, sep);
+                list_add_tail(&tb2->ltb_list, nets);
+
+                tb = tb2;
+        }
+}
+
+int
+lnet_match_networks (char **networksp, char *ip2nets, __u32 *ipaddrs, int nip)
+{
+        static char  networks[LNET_SINGLE_TEXTBUF_NOB];
+        static char  source[LNET_SINGLE_TEXTBUF_NOB];
+
+        struct list_head    raw_entries;
+        struct list_head    matched_nets;
+        struct list_head    current_nets;
+        struct list_head   *t;
+        struct list_head   *t2;
+        lnet_text_buf_t    *tb;
+        lnet_text_buf_t    *tb2;
+        __u32               net1;
+        __u32               net2;
+        int                 len;
+        int                 count;
+        int                 dup;
+        int                 rc;
+
+        CFS_INIT_LIST_HEAD(&raw_entries);
+        if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) {
+                CERROR("Error parsing ip2nets\n");
+                LASSERT (lnet_tbnob == 0);
+                return -EINVAL;
+        }
+
+        CFS_INIT_LIST_HEAD(&matched_nets);
+        CFS_INIT_LIST_HEAD(&current_nets);
+        networks[0] = 0;
+        count = 0;
+        len = 0;
+        rc = 0;
+
+        while (!list_empty(&raw_entries)) {
+                tb = list_entry(raw_entries.next, lnet_text_buf_t, ltb_list);
+
+                strncpy(source, tb->ltb_text, sizeof(source)-1);
+                source[sizeof(source)-1] = 0;
+
+                /* replace ltb_text with the network(s) add on match */
+                rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip);
+                if (rc < 0)
+                        break;
+
+                list_del(&tb->ltb_list);
+
+                if (rc == 0) {                  /* no match */
+                        lnet_free_text_buf(tb);
+                        continue;
+                }
+
+                /* split into separate networks */
+                CFS_INIT_LIST_HEAD(&current_nets);
+                list_add(&tb->ltb_list, &current_nets);
+                rc = lnet_splitnets(source, &current_nets);
+                if (rc < 0)
+                        break;
+
+                dup = 0;
+                list_for_each (t, &current_nets) {
+                        tb = list_entry(t, lnet_text_buf_t, ltb_list);
+                        net1 = lnet_netspec2net(tb->ltb_text);
+                        LASSERT (net1 != LNET_NIDNET(LNET_NID_ANY));
+
+                        list_for_each(t2, &matched_nets) {
+                                tb2 = list_entry(t2, lnet_text_buf_t, ltb_list);
+                                net2 = lnet_netspec2net(tb2->ltb_text);
+                                LASSERT (net2 != LNET_NIDNET(LNET_NID_ANY));
+
+                                if (net1 == net2) {
+                                        dup = 1;
+                                        break;
+                                }
+                        }
+
+                        if (dup)
+                                break;
+                }
+
+                if (dup) {
+                        lnet_free_text_bufs(&current_nets);
+                        continue;
+                }
+
+                list_for_each_safe(t, t2, &current_nets) {
+                        tb = list_entry(t, lnet_text_buf_t, ltb_list);
+                        
+                        list_del(&tb->ltb_list);
+                        list_add_tail(&tb->ltb_list, &matched_nets);
+
+                        len += snprintf(networks + len, sizeof(networks) - len,
+                                        "%s%s", (len == 0) ? "" : ",", 
+                                        tb->ltb_text);
+                
+                        if (len >= sizeof(networks)) {
+                                CERROR("Too many matched networks\n");
+                                rc = -E2BIG;
+                                goto out;
+                        }
+                }
+                
+                count++;
+        }
+
+ out:
+        lnet_free_text_bufs(&raw_entries);
+        lnet_free_text_bufs(&matched_nets);
+        lnet_free_text_bufs(&current_nets);
+        LASSERT (lnet_tbnob == 0);
+
+        if (rc < 0)
+                return rc;
+        
+        *networksp = networks;
+        return count;
+}
+
+#ifdef __KERNEL__
+void
+lnet_ipaddr_free_enumeration(__u32 *ipaddrs, int nip)
+{
+        LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs));
+}
+
+int
+lnet_ipaddr_enumerate (__u32 **ipaddrsp)
+{
+        int        up;
+        __u32      netmask;
+        __u32     *ipaddrs;
+        __u32     *ipaddrs2;
+        int        nip;
+        char     **ifnames;
+        int        nif = libcfs_ipif_enumerate(&ifnames);
+        int        i;
+        int        rc;
+
+        if (nif <= 0)
+                return nif;
+        
+        LIBCFS_ALLOC(ipaddrs, nif * sizeof(*ipaddrs));
+        if (ipaddrs == NULL) {
+                CERROR("Can't allocate ipaddrs[%d]\n", nif);
+                libcfs_ipif_free_enumeration(ifnames, nif);
+                return -ENOMEM;
+        }
+
+        for (i = nip = 0; i < nif; i++) {
+                if (!strcmp(ifnames[i], "lo"))
+                        continue;
+                
+                rc = libcfs_ipif_query(ifnames[i], &up, 
+                                       &ipaddrs[nip], &netmask);
+                if (rc != 0) {
+                        CWARN("Can't query interface %s: %d\n",
+                              ifnames[i], rc);
+                        continue;
+                }
+
+                if (!up) {
+                        CWARN("Ignoring interface %s: it's down\n",
+                              ifnames[i]);
+                        continue;
+                }
+
+                nip++;
+        }
+
+        libcfs_ipif_free_enumeration(ifnames, nif);
+
+        if (nip == nif) {
+                *ipaddrsp = ipaddrs;
+        } else {
+                if (nip > 0) {
+                        LIBCFS_ALLOC(ipaddrs2, nip * sizeof(*ipaddrs2));
+                        if (ipaddrs2 == NULL) {
+                                CERROR("Can't allocate ipaddrs[%d]\n", nip);
+                                nip = -ENOMEM;
+                        } else {
+                                memcpy(ipaddrs2, ipaddrs, 
+                                       nip * sizeof(*ipaddrs));
+                                *ipaddrsp = ipaddrs2;
+                                rc = nip;
+                        }
+                }
+                lnet_ipaddr_free_enumeration(ipaddrs, nif);
+        }
+        return nip;
+}
+
+int
+lnet_parse_ip2nets (char **networksp, char *ip2nets)
+{
+        __u32     *ipaddrs;
+        int        nip = lnet_ipaddr_enumerate(&ipaddrs);
+        int        rc;
+
+        if (nip < 0) {
+                LCONSOLE_ERROR("Error %d enumerating local IP interfaces "
+                               "for ip2nets to match\n", nip);
+                return nip;
+        }
+
+        if (nip == 0) {
+                LCONSOLE_ERROR("No local IP interfaces "
+                               "for ip2nets to match\n");
+                return -ENOENT;
+        }
+
+        rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip);
+        lnet_ipaddr_free_enumeration(ipaddrs, nip);
+
+        if (rc < 0) {
+                LCONSOLE_ERROR("Error %d parsing ip2nets\n", rc);
+                return rc;
+        }
+
+        if (rc == 0) {
+                LCONSOLE_ERROR("ip2nets does not match "
+                               "any local IP interfaces\n");
+                return -ENOENT;
+        }
+
+        return 0;
+}
+
+int
+lnet_set_ip_niaddr (lnet_ni_t *ni) 
+{
+        __u32  net = LNET_NIDNET(ni->ni_nid);
+        char **names;
+        int    n;
+        __u32  ip;
+        __u32  netmask;
+        int    up;
+        int    i;
+        int    rc;
+
+        /* Convenience for LNDs that use the IP address of a local interface as
+         * the local address part of their NID */
+
+        if (ni->ni_interfaces[0] != NULL) {
+
+                CLASSERT (LNET_MAX_INTERFACES > 1);
+
+                if (ni->ni_interfaces[1] != NULL) {
+                        CERROR("Net %s doesn't support multiple interfaces\n",
+                               libcfs_net2str(net));
+                        return -EPERM;
+                }
+                
+                rc = libcfs_ipif_query(ni->ni_interfaces[0],
+                                       &up, &ip, &netmask);
+                if (rc != 0) {
+                        CERROR("Net %s can't query interface %s: %d\n",
+                               libcfs_net2str(net), ni->ni_interfaces[0], rc);
+                        return -EPERM;
+                }
+
+                if (!up) {
+                        CERROR("Net %s can't use interface %s: it's down\n",
+                               libcfs_net2str(net), ni->ni_interfaces[0]);
+                        return -ENETDOWN;
+                }
+                
+                ni->ni_nid = LNET_MKNID(net, ip);
+                return 0;
+        }
+
+        n = libcfs_ipif_enumerate(&names);
+        if (n <= 0) {
+                CERROR("Net %s can't enumerate interfaces: %d\n", 
+                       libcfs_net2str(net), n);
+                return 0;
+        }
+
+        for (i = 0; i < n; i++) {
+                if (!strcmp(names[i], "lo")) /* skip the loopback IF */
+                        continue;
+                
+                rc = libcfs_ipif_query(names[i], &up, &ip, &netmask);
+                
+                if (rc != 0) {
+                        CWARN("Net %s can't query interface %s: %d\n",
+                              libcfs_net2str(net), names[i], rc);
+                        continue;
+                }
+                        
+                if (!up) {
+                        CWARN("Net %s ignoring interface %s (down)\n",
+                              libcfs_net2str(net), names[i]);
+                        continue;
+                }
+
+                libcfs_ipif_free_enumeration(names, n);
+                ni->ni_nid = LNET_MKNID(net, ip);
+                return 0;
+        }
+
+        CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net));
+        libcfs_ipif_free_enumeration(names, n);
+        return -ENOENT;
+}
+EXPORT_SYMBOL(lnet_set_ip_niaddr);
+
+#endif
index 4992fce..98adecc 100644 (file)
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#define DEBUG_SUBSYSTEM S_PORTALS
-#include <portals/lib-p30.h>
+#define DEBUG_SUBSYSTEM S_LNET
+#include <lnet/lib-lnet.h>
 
-int 
-lib_api_eq_alloc (nal_t *apinal, ptl_size_t count,
-                  ptl_eq_handler_t callback, 
-                  ptl_handle_eq_t *handle)
+int
+LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, 
+            lnet_handle_eq_t *handle)
 {
-        lib_nal_t     *nal = apinal->nal_data;
-        lib_eq_t      *eq;
-        unsigned long  flags;
-        int            rc;
+        lnet_eq_t     *eq;
 
+        LASSERT (the_lnet.ln_init);
+        LASSERT (the_lnet.ln_refcount > 0);
+        
         /* We need count to be a power of 2 so that when eq_{enq,deq}_seq
          * overflow, they don't skip entries, so the queue has the same
          * apparant capacity at all times */
@@ -48,36 +47,24 @@ lib_api_eq_alloc (nal_t *apinal, ptl_size_t count,
         }
 
         if (count == 0)        /* catch bad parameter / overflow on roundup */
-                return (PTL_VAL_FAILED);
+                return (-EINVAL);
         
-        eq = lib_eq_alloc (nal);
+        eq = lnet_eq_alloc();
         if (eq == NULL)
-                return (PTL_NO_SPACE);
+                return (-ENOMEM);
 
-        PORTAL_ALLOC(eq->eq_events, count * sizeof(ptl_event_t));
+        LIBCFS_ALLOC(eq->eq_events, count * sizeof(lnet_event_t));
         if (eq->eq_events == NULL) {
-                LIB_LOCK(nal, flags);
-                lib_eq_free (nal, eq);
-                LIB_UNLOCK(nal, flags);
-        }
+                LNET_LOCK();
+                lnet_eq_free (eq);
+                LNET_UNLOCK();
 
-        if (nal->libnal_map != NULL) {
-                struct iovec iov = {
-                        .iov_base = eq->eq_events,
-                        .iov_len = count * sizeof(ptl_event_t)};
-
-                rc = nal->libnal_map(nal, 1, &iov, &eq->eq_addrkey);
-                if (rc != PTL_OK) {
-                        LIB_LOCK(nal, flags);
-                        lib_eq_free (nal, eq);
-                        LIB_UNLOCK(nal, flags);
-                        return (rc);
-                }
+                return -ENOMEM;
         }
 
         /* NB this resets all event sequence numbers to 0, to be earlier
          * than eq_deq_seq */
-        memset(eq->eq_events, 0, count * sizeof(ptl_event_t));
+        memset(eq->eq_events, 0, count * sizeof(lnet_event_t));
 
         eq->eq_deq_seq = 1;
         eq->eq_enq_seq = 1;
@@ -85,77 +72,68 @@ lib_api_eq_alloc (nal_t *apinal, ptl_size_t count,
         eq->eq_refcount = 0;
         eq->eq_callback = callback;
 
-        LIB_LOCK(nal, flags);
+        LNET_LOCK();
 
-        lib_initialise_handle (nal, &eq->eq_lh, PTL_COOKIE_TYPE_EQ);
-        list_add (&eq->eq_list, &nal->libnal_ni.ni_active_eqs);
+        lnet_initialise_handle (&eq->eq_lh, LNET_COOKIE_TYPE_EQ);
+        list_add (&eq->eq_list, &the_lnet.ln_active_eqs);
 
-        LIB_UNLOCK(nal, flags);
+        LNET_UNLOCK();
 
-        ptl_eq2handle(handle, nal, eq);
-        return (PTL_OK);
+        lnet_eq2handle(handle, eq);
+        return (0);
 }
 
-int 
-lib_api_eq_free(nal_t *apinal, ptl_handle_eq_t *eqh)
+int
+LNetEQFree(lnet_handle_eq_t eqh)
 {
-        lib_nal_t     *nal = apinal->nal_data;
-        lib_eq_t      *eq;
+        lnet_eq_t     *eq;
         int            size;
-        ptl_event_t   *events;
-        void          *addrkey;
-        unsigned long  flags;
+        lnet_event_t  *events;
 
-        LIB_LOCK(nal, flags);
+        LASSERT (the_lnet.ln_init);
+        LASSERT (the_lnet.ln_refcount > 0);
+        
+        LNET_LOCK();
 
-        eq = ptl_handle2eq(eqh, nal);
+        eq = lnet_handle2eq(&eqh);
         if (eq == NULL) {
-                LIB_UNLOCK(nal, flags);
-                return (PTL_EQ_INVALID);
+                LNET_UNLOCK();
+                return (-ENOENT);
         }
 
         if (eq->eq_refcount != 0) {
-                LIB_UNLOCK(nal, flags);
-                return (PTL_EQ_IN_USE);
+                LNET_UNLOCK();
+                return (-EBUSY);
         }
 
         /* stash for free after lock dropped */
         events  = eq->eq_events;
         size    = eq->eq_size;
-        addrkey = eq->eq_addrkey;
 
-        lib_invalidate_handle (nal, &eq->eq_lh);
+        lnet_invalidate_handle (&eq->eq_lh);
         list_del (&eq->eq_list);
-        lib_eq_free (nal, eq);
-
-        LIB_UNLOCK(nal, flags);
+        lnet_eq_free (eq);
 
-        if (nal->libnal_unmap != NULL) {
-                struct iovec iov = {
-                        .iov_base = events,
-                        .iov_len = size * sizeof(ptl_event_t)};
-
-                nal->libnal_unmap(nal, 1, &iov, &addrkey);
-        }
+        LNET_UNLOCK();
 
-        PORTAL_FREE(events, size * sizeof (ptl_event_t));
+        LIBCFS_FREE(events, size * sizeof (lnet_event_t));
 
-        return (PTL_OK);
+        return 0;
 }
 
 int
-lib_get_event (lib_eq_t *eq, ptl_event_t *ev)
+lib_get_event (lnet_eq_t *eq, lnet_event_t *ev)
 {
-        int          new_index = eq->eq_deq_seq & (eq->eq_size - 1);
-        ptl_event_t *new_event = &eq->eq_events[new_index];
-        int          rc;
+        int           new_index = eq->eq_deq_seq & (eq->eq_size - 1);
+        lnet_event_t *new_event = &eq->eq_events[new_index];
+        int           rc;
         ENTRY;
 
         CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n",
                new_event, eq->eq_deq_seq, eq->eq_size);
 
-        if (PTL_SEQ_GT (eq->eq_deq_seq, new_event->sequence)) {
-                RETURN(PTL_EQ_EMPTY);
+        if (LNET_SEQ_GT (eq->eq_deq_seq, new_event->sequence)) {
+                RETURN(0);
         }
 
         /* We've got a new event... */
@@ -163,11 +141,13 @@ lib_get_event (lib_eq_t *eq, ptl_event_t *ev)
 
         /* ...but did it overwrite an event we've not seen yet? */
         if (eq->eq_deq_seq == new_event->sequence) {
-                rc = PTL_OK;
+                rc = 1;
         } else {
-                CERROR("Event Queue Overflow: eq seq %lu ev seq %lu\n",
+                /* don't complain with CERROR: some EQs are sized small
+                 * anyway; if it's important, the caller should complain */
+                CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n",
                        eq->eq_deq_seq, new_event->sequence);
-                rc = PTL_EQ_DROPPED;
+                rc = -EOVERFLOW;
         }
 
         eq->eq_deq_seq = new_event->sequence + 1;
@@ -176,13 +156,27 @@ lib_get_event (lib_eq_t *eq, ptl_event_t *ev)
 
 
 int
-lib_api_eq_poll (nal_t *apinal, 
-                 ptl_handle_eq_t *eventqs, int neq, int timeout_ms,
-                 ptl_event_t *event, int *which)
+LNetEQGet (lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+        int which;
+
+        return LNetEQPoll(&eventq, 1, 0, 
+                         event, &which);
+}
+
+int
+LNetEQWait (lnet_handle_eq_t eventq, lnet_event_t *event)
+{
+        int which;
+
+        return LNetEQPoll(&eventq, 1, LNET_TIME_FOREVER,
+                         event, &which);
+}
+
+int
+LNetEQPoll (lnet_handle_eq_t *eventqs, int neq, int timeout_ms,
+            lnet_event_t *event, int *which)
 {
-        lib_nal_t       *nal = apinal->nal_data;
-        lib_ni_t        *ni = &nal->libnal_ni;
-        unsigned long    flags;
         int              i;
         int              rc;
 #ifdef __KERNEL__
@@ -191,57 +185,112 @@ lib_api_eq_poll (nal_t *apinal,
 #else
         struct timeval   then;
         struct timeval   now;
+# if HAVE_LIBPTHREAD
         struct timespec  ts;
+# endif
+        lnet_ni_t       *eqwaitni = the_lnet.ln_eqwaitni;
 #endif
         ENTRY;
 
-        LIB_LOCK(nal, flags);
+        LASSERT (the_lnet.ln_init);
+        LASSERT (the_lnet.ln_refcount > 0);
+
+        if (neq < 1)
+                RETURN(-ENOENT);
+
+        LNET_LOCK();
 
         for (;;) {
                 for (i = 0; i < neq; i++) {
-                        lib_eq_t *eq = ptl_handle2eq(&eventqs[i], nal);
+                        lnet_eq_t *eq = lnet_handle2eq(&eventqs[i]);
+
+                        if (eq == NULL) {
+                                LNET_UNLOCK();
+                                RETURN(-ENOENT);
+                        }
 
                         rc = lib_get_event (eq, event);
-                        if (rc != PTL_EQ_EMPTY) {
-                                LIB_UNLOCK(nal, flags);
+                        if (rc != 0) {
+                                LNET_UNLOCK();
                                 *which = i;
                                 RETURN(rc);
                         }
                 }
                 
+#ifdef __KERNEL__
                 if (timeout_ms == 0) {
-                        LIB_UNLOCK (nal, flags);
-                        RETURN (PTL_EQ_EMPTY);
+                        LNET_UNLOCK ();
+                        RETURN (0);
                 }
 
-                /* Some architectures force us to do spin locking/unlocking
-                 * in the same stack frame, means we can abstract the
-                 * locking here */
-#ifdef __KERNEL__
                 cfs_waitlink_init(&wl);
                 set_current_state(TASK_INTERRUPTIBLE);
-                cfs_waitq_add(&ni->ni_waitq, &wl);
+                cfs_waitq_add(&the_lnet.ln_waitq, &wl);
 
-                LIB_UNLOCK(nal, flags);
+                LNET_UNLOCK();
 
                 if (timeout_ms < 0) {
-                        cfs_waitq_wait (&wl);
+                        cfs_waitq_wait (&wl, CFS_TASK_INTERRUPTIBLE);
                 } else { 
                         struct timeval tv;
 
                         now = cfs_time_current();
-                        cfs_waitq_timedwait(&wl, cfs_time_seconds(timeout_ms)/1000);
-                        cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), &tv); 
+                        cfs_waitq_timedwait(&wl, CFS_TASK_INTERRUPTIBLE,
+                                            cfs_time_seconds(timeout_ms)/1000);
+                        cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), 
+                                          &tv); 
                         timeout_ms -= tv.tv_sec * 1000 + tv.tv_usec / 1000;
                         if (timeout_ms < 0)
                                 timeout_ms = 0;
                 }
                 
-                LIB_LOCK(nal, flags);
-                cfs_waitq_del(&ni->ni_waitq, &wl);
+                LNET_LOCK();
+                cfs_waitq_del(&the_lnet.ln_waitq, &wl);
 #else
+                if (eqwaitni != NULL) {
+                        /* I have a single NI that I have to call into, to get
+                         * events queued, or to block. */
+                        lnet_ni_addref_locked(eqwaitni);
+                        LNET_UNLOCK();
+
+                        if (timeout_ms <= 0) {
+                                (eqwaitni->ni_lnd->lnd_wait)(eqwaitni, timeout_ms);
+                        } else {
+                                gettimeofday(&then, NULL);
+
+                                (eqwaitni->ni_lnd->lnd_wait)(eqwaitni, timeout_ms);
+                                
+                                gettimeofday(&now, NULL);
+                                timeout_ms -= (now.tv_sec - then.tv_sec) * 1000 +
+                                              (now.tv_usec - then.tv_usec) / 1000;
+                                if (timeout_ms < 0)
+                                        timeout_ms = 0;
+                        }
+
+                        LNET_LOCK();
+                        lnet_ni_decref_locked(eqwaitni);
+
+                        /* don't call into eqwaitni again if timeout has
+                         * expired */
+                        if (timeout_ms == 0)
+                                eqwaitni = NULL;
+
+                        continue;               /* go back and check for events */
+                }
+
+                if (timeout_ms == 0) {
+                        LNET_UNLOCK();
+                        RETURN (0);
+                }
+
+# if !HAVE_LIBPTHREAD
+                /* If I'm single-threaded, LNET fails at startup if it can't
+                 * set the_lnet.ln_eqwaitni correctly.  */
+                LBUG();
+# else
                 if (timeout_ms < 0) {
-                        pthread_cond_wait(&ni->ni_cond, &ni->ni_mutex);
+                        pthread_cond_wait(&the_lnet.ln_cond, 
+                                          &the_lnet.ln_lock);
                 } else {
                         gettimeofday(&then, NULL);
                         
@@ -253,8 +302,8 @@ lib_api_eq_poll (nal_t *apinal,
                                 ts.tv_nsec -= 1000000000;
                         }
                         
-                        pthread_cond_timedwait(&ni->ni_cond,
-                                               &ni->ni_mutex, &ts);
+                        pthread_cond_timedwait(&the_lnet.ln_cond,
+                                               &the_lnet.ln_lock, &ts);
                         
                         gettimeofday(&now, NULL);
                         timeout_ms -= (now.tv_sec - then.tv_sec) * 1000 +
@@ -263,6 +312,7 @@ lib_api_eq_poll (nal_t *apinal,
                         if (timeout_ms < 0)
                                 timeout_ms = 0;
                 }
+# endif
 #endif
         }
 }
diff --git a/lnet/lnet/lib-init.c b/lnet/lnet/lib-init.c
deleted file mode 100644 (file)
index 6d0099c..0000000
+++ /dev/null
@@ -1,433 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * lib/lib-init.c
- * Start up the internal library and clear all structures
- * Called by the NAL when it initializes.  Safe to call multiple times.
- *
- *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
- *
- *   This file is part of Lustre, http://www.lustre.org
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-# define DEBUG_SUBSYSTEM S_PORTALS
-#include <portals/lib-p30.h>
-
-#ifdef __KERNEL__
-# include <libcfs/kp30.h>
-#else
-# include <string.h>
-# include <sys/time.h>
-#endif
-
-#ifndef PTL_USE_LIB_FREELIST
-
-int
-kportal_descriptor_setup (lib_nal_t *nal,
-                          ptl_ni_limits_t *requested_limits,
-                          ptl_ni_limits_t *actual_limits)
-{
-        /* Ignore requested limits! */
-        actual_limits->max_mes = INT_MAX;
-        actual_limits->max_mds = INT_MAX;
-        actual_limits->max_eqs = INT_MAX;
-
-        return PTL_OK;
-}
-
-void
-kportal_descriptor_cleanup (lib_nal_t *nal)
-{
-}
-#else
-
-int
-lib_freelist_init (lib_nal_t *nal, lib_freelist_t *fl, int n, int size)
-{
-        char *space;
-
-        LASSERT (n > 0);
-
-        size += offsetof (lib_freeobj_t, fo_contents);
-
-        PORTAL_ALLOC(space, n * size);
-        if (space == NULL)
-                return (PTL_NO_SPACE);
-
-        CFS_INIT_LIST_HEAD (&fl->fl_list);
-        fl->fl_objs = space;
-        fl->fl_nobjs = n;
-        fl->fl_objsize = size;
-
-        do
-        {
-                memset (space, 0, size);
-                list_add ((struct list_head *)space, &fl->fl_list);
-                space += size;
-        } while (--n != 0);
-
-        return (PTL_OK);
-}
-
-void
-lib_freelist_fini (lib_nal_t *nal, lib_freelist_t *fl)
-{
-        struct list_head *el;
-        int               count;
-
-        if (fl->fl_nobjs == 0)
-                return;
-
-        count = 0;
-        for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next)
-                count++;
-
-        LASSERT (count == fl->fl_nobjs);
-
-        PORTAL_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize);
-        memset (fl, 0, sizeof (fl));
-}
-
-int
-kportal_descriptor_setup (lib_nal_t *nal,
-                          ptl_ni_limits_t *requested_limits,
-                          ptl_ni_limits_t *actual_limits)
-{
-        /* NB on failure caller must still call kportal_descriptor_cleanup */
-        /*               ******                                            */
-        lib_ni_t  *ni = &nal->libnal_ni;
-        int        rc;
-
-        memset (&ni->ni_free_mes,  0, sizeof (ni->ni_free_mes));
-        memset (&ni->ni_free_msgs, 0, sizeof (ni->ni_free_msgs));
-        memset (&ni->ni_free_mds,  0, sizeof (ni->ni_free_mds));
-        memset (&ni->ni_free_eqs,  0, sizeof (ni->ni_free_eqs));
-
-        /* Ignore requested limits! */
-        actual_limits->max_mes = MAX_MES;
-        actual_limits->max_mds = MAX_MDS;
-        actual_limits->max_eqs = MAX_EQS;
-        /* Hahahah what a load of bollocks.  There's nowhere to
-         * specify the max # messages in-flight */
-
-        rc = lib_freelist_init (nal, &ni->ni_free_mes,
-                                MAX_MES, sizeof (lib_me_t));
-        if (rc != PTL_OK)
-                return (rc);
-
-        rc = lib_freelist_init (nal, &ni->ni_free_msgs,
-                                MAX_MSGS, sizeof (lib_msg_t));
-        if (rc != PTL_OK)
-                return (rc);
-
-        rc = lib_freelist_init (nal, &ni->ni_free_mds,
-                                MAX_MDS, sizeof (lib_md_t));
-        if (rc != PTL_OK)
-                return (rc);
-
-        rc = lib_freelist_init (nal, &ni->ni_free_eqs,
-                                MAX_EQS, sizeof (lib_eq_t));
-        return (rc);
-}
-
-void
-kportal_descriptor_cleanup (lib_nal_t *nal)
-{
-        lib_ni_t   *ni = &nal->libnal_ni;
-        
-        lib_freelist_fini (nal, &ni->ni_free_mes);
-        lib_freelist_fini (nal, &ni->ni_free_msgs);
-        lib_freelist_fini (nal, &ni->ni_free_mds);
-        lib_freelist_fini (nal, &ni->ni_free_eqs);
-}
-
-#endif
-
-__u64
-lib_create_interface_cookie (lib_nal_t *nal)
-{
-        /* NB the interface cookie in wire handles guards against delayed
-         * replies and ACKs appearing valid in a new instance of the same
-         * interface.  Initialisation time, even if it's only implemented
-         * to millisecond resolution is probably easily good enough. */
-        struct timeval tv;
-        __u64          cookie;
-#ifndef __KERNEL__
-        int            rc = gettimeofday (&tv, NULL);
-        LASSERT (rc == 0);
-#else
-       do_gettimeofday(&tv);
-#endif
-        cookie = tv.tv_sec;
-        cookie *= 1000000;
-        cookie += tv.tv_usec;
-        return (cookie);
-}
-
-int
-lib_setup_handle_hash (lib_nal_t *nal) 
-{
-        lib_ni_t *ni = &nal->libnal_ni;
-        int       i;
-        
-        /* Arbitrary choice of hash table size */
-#ifdef __KERNEL__
-        ni->ni_lh_hash_size = PAGE_SIZE / sizeof (struct list_head);
-#else
-        ni->ni_lh_hash_size = (MAX_MES + MAX_MDS + MAX_EQS)/4;
-#endif
-        PORTAL_ALLOC(ni->ni_lh_hash_table,
-                     ni->ni_lh_hash_size * sizeof (struct list_head));
-        if (ni->ni_lh_hash_table == NULL)
-                return (PTL_NO_SPACE);
-        
-        for (i = 0; i < ni->ni_lh_hash_size; i++)
-                CFS_INIT_LIST_HEAD (&ni->ni_lh_hash_table[i]);
-
-        ni->ni_next_object_cookie = PTL_COOKIE_TYPES;
-        
-        return (PTL_OK);
-}
-
-void
-lib_cleanup_handle_hash (lib_nal_t *nal)
-{
-        lib_ni_t *ni = &nal->libnal_ni;
-
-        if (ni->ni_lh_hash_table == NULL)
-                return;
-        
-        PORTAL_FREE(ni->ni_lh_hash_table,
-                    ni->ni_lh_hash_size * sizeof (struct list_head));
-}
-
-lib_handle_t *
-lib_lookup_cookie (lib_nal_t *nal, __u64 cookie, int type) 
-{
-        /* ALWAYS called with statelock held */
-        lib_ni_t            *ni = &nal->libnal_ni;
-        struct list_head    *list;
-        struct list_head    *el;
-        unsigned int         hash;
-
-        if ((cookie & (PTL_COOKIE_TYPES - 1)) != type)
-                return (NULL);
-        
-        hash = ((unsigned int)cookie) % ni->ni_lh_hash_size;
-        list = &ni->ni_lh_hash_table[hash];
-        
-        list_for_each (el, list) {
-                lib_handle_t *lh = list_entry (el, lib_handle_t, lh_hash_chain);
-                
-                if (lh->lh_cookie == cookie)
-                        return (lh);
-        }
-        
-        return (NULL);
-}
-
-void
-lib_initialise_handle (lib_nal_t *nal, lib_handle_t *lh, int type) 
-{
-        /* ALWAYS called with statelock held */
-        lib_ni_t       *ni = &nal->libnal_ni;
-        unsigned int    hash;
-
-        LASSERT (type >= 0 && type < PTL_COOKIE_TYPES);
-        lh->lh_cookie = ni->ni_next_object_cookie | type;
-        ni->ni_next_object_cookie += PTL_COOKIE_TYPES;
-        
-        hash = ((unsigned int)lh->lh_cookie) % ni->ni_lh_hash_size;
-        list_add (&lh->lh_hash_chain, &ni->ni_lh_hash_table[hash]);
-}
-
-void
-lib_invalidate_handle (lib_nal_t *nal, lib_handle_t *lh)
-{
-        list_del (&lh->lh_hash_chain);
-}
-
-int
-lib_init(lib_nal_t *libnal, nal_t *apinal, 
-         ptl_process_id_t process_id,
-         ptl_ni_limits_t *requested_limits,
-         ptl_ni_limits_t *actual_limits)
-{
-        int       rc = PTL_OK;
-        lib_ni_t *ni = &libnal->libnal_ni;
-        int       ptl_size;
-        int       i;
-        ENTRY;
-
-        /* NB serialised in PtlNIInit() */
-
-        lib_assert_wire_constants ();
-
-        /* Setup the API nal with the lib API handling functions */
-        apinal->nal_get_id    = lib_api_get_id;
-        apinal->nal_ni_status = lib_api_ni_status;
-        apinal->nal_ni_dist   = lib_api_ni_dist;
-        apinal->nal_fail_nid  = lib_api_fail_nid;
-        apinal->nal_loopback  = lib_api_loopback;
-        apinal->nal_me_attach = lib_api_me_attach;
-        apinal->nal_me_insert = lib_api_me_insert;
-        apinal->nal_me_unlink = lib_api_me_unlink;
-        apinal->nal_md_attach = lib_api_md_attach;
-        apinal->nal_md_bind   = lib_api_md_bind;
-        apinal->nal_md_unlink = lib_api_md_unlink;
-        apinal->nal_md_update = lib_api_md_update;
-        apinal->nal_eq_alloc  = lib_api_eq_alloc;
-        apinal->nal_eq_free   = lib_api_eq_free;
-        apinal->nal_eq_poll   = lib_api_eq_poll;
-        apinal->nal_put       = lib_api_put;
-        apinal->nal_get       = lib_api_get;
-
-        apinal->nal_data      = libnal;
-        ni->ni_api            = apinal;
-
-        rc = kportal_descriptor_setup (libnal, requested_limits, 
-                                       &ni->ni_actual_limits);
-        if (rc != PTL_OK)
-                goto out;
-
-        memset(&ni->ni_counters, 0, sizeof(lib_counters_t));
-
-        CFS_INIT_LIST_HEAD (&ni->ni_active_msgs);
-        CFS_INIT_LIST_HEAD (&ni->ni_active_mds);
-        CFS_INIT_LIST_HEAD (&ni->ni_active_eqs);
-        CFS_INIT_LIST_HEAD (&ni->ni_test_peers);
-
-#ifdef __KERNEL__
-        spin_lock_init (&ni->ni_lock);
-        cfs_waitq_init (&ni->ni_waitq);
-#else
-        pthread_mutex_init(&ni->ni_mutex, NULL);
-        pthread_cond_init(&ni->ni_cond, NULL);
-#endif
-
-        ni->ni_interface_cookie = lib_create_interface_cookie (libnal);
-        ni->ni_next_object_cookie = 0;
-        rc = lib_setup_handle_hash (libnal);
-        if (rc != PTL_OK)
-                goto out;
-        
-        ni->ni_pid = process_id;
-
-        if (requested_limits != NULL)
-                ptl_size = requested_limits->max_pt_index + 1;
-        else
-                ptl_size = 64;
-
-        ni->ni_portals.size = ptl_size;
-        PORTAL_ALLOC(ni->ni_portals.tbl,
-                     ptl_size * sizeof(struct list_head));
-        if (ni->ni_portals.tbl == NULL) {
-                rc = PTL_NO_SPACE;
-                goto out;
-        }
-
-        for (i = 0; i < ptl_size; i++)
-                CFS_INIT_LIST_HEAD(&(ni->ni_portals.tbl[i]));
-
-        /* max_{mes,mds,eqs} set in kportal_descriptor_setup */
-
-        /* We don't have an access control table! */
-        ni->ni_actual_limits.max_ac_index = -1;
-
-        ni->ni_actual_limits.max_pt_index = ptl_size - 1;
-        ni->ni_actual_limits.max_md_iovecs = PTL_MD_MAX_IOV;
-        ni->ni_actual_limits.max_me_list = INT_MAX;
-
-        /* We don't support PtlGetPut! */
-        ni->ni_actual_limits.max_getput_md = 0;
-
-        if (actual_limits != NULL)
-                *actual_limits = ni->ni_actual_limits;
-
-        /* disable loopback optimisation by default */
-        ni->ni_loopback = 0;
-
- out:
-        if (rc != PTL_OK) {
-                lib_cleanup_handle_hash (libnal);
-                kportal_descriptor_cleanup (libnal);
-        }
-
-        RETURN (rc);
-}
-
-int
-lib_fini(lib_nal_t *nal)
-{
-        lib_ni_t *ni = &nal->libnal_ni;
-        int       idx;
-
-        /* NB no state_lock() since this is the last reference.  The NAL
-         * should have shut down already, so it should be safe to unlink
-         * and free all descriptors, even those that appear committed to a
-         * network op (eg MD with non-zero pending count)
-         */
-
-        for (idx = 0; idx < ni->ni_portals.size; idx++)
-                while (!list_empty (&ni->ni_portals.tbl[idx])) {
-                        lib_me_t *me = list_entry (ni->ni_portals.tbl[idx].next,
-                                                   lib_me_t, me_list);
-
-                        CERROR ("Active me %p on exit\n", me);
-                        list_del (&me->me_list);
-                        lib_me_free (nal, me);
-                }
-
-        while (!list_empty (&ni->ni_active_mds)) {
-                lib_md_t *md = list_entry (ni->ni_active_mds.next,
-                                           lib_md_t, md_list);
-
-                CERROR ("Active md %p on exit\n", md);
-                list_del (&md->md_list);
-                lib_md_free (nal, md);
-        }
-
-        while (!list_empty (&ni->ni_active_eqs)) {
-                lib_eq_t *eq = list_entry (ni->ni_active_eqs.next,
-                                           lib_eq_t, eq_list);
-
-                CERROR ("Active eq %p on exit\n", eq);
-                list_del (&eq->eq_list);
-                lib_eq_free (nal, eq);
-        }
-
-        while (!list_empty (&ni->ni_active_msgs)) {
-                lib_msg_t *msg = list_entry (ni->ni_active_msgs.next,
-                                             lib_msg_t, msg_list);
-
-                CERROR ("Active msg %p on exit\n", msg);
-                list_del (&msg->msg_list);
-                lib_msg_free (nal, msg);
-        }
-
-        PORTAL_FREE(ni->ni_portals.tbl,  
-                    ni->ni_portals.size * sizeof(struct list_head));
-
-        lib_cleanup_handle_hash (nal);
-        kportal_descriptor_cleanup (nal);
-
-#ifndef __KERNEL__
-        pthread_mutex_destroy(&ni->ni_mutex);
-        pthread_cond_destroy(&ni->ni_cond);
-#endif
-
-        return (PTL_OK);
-}
index f188e2a..0e8524c 100644 (file)
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 
-#ifndef __KERNEL__
-# include <stdio.h>
-#else
-# include <libcfs/kp30.h>
-#endif
+#include <lnet/lib-lnet.h>
 
-#include <portals/lib-p30.h>
-
-/* must be called with state lock held */
+/* must be called with LNET_LOCK held */
 void
-lib_md_unlink(lib_nal_t *nal, lib_md_t *md)
+lnet_md_unlink(lnet_libmd_t *md)
 {
-        if ((md->md_flags & PTL_MD_FLAG_ZOMBIE) == 0) {
+        if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) {
                 /* first unlink attempt... */
-                lib_me_t *me = md->me;
+                lnet_me_t *me = md->md_me;
 
-                md->md_flags |= PTL_MD_FLAG_ZOMBIE;
+                md->md_flags |= LNET_MD_FLAG_ZOMBIE;
 
                 /* Disassociate from ME (if any), and unlink it if it was created
-                 * with PTL_UNLINK */
+                 * with LNET_UNLINK */
                 if (me != NULL) {
-                        me->md = NULL;
-                        if (me->unlink == PTL_UNLINK)
-                                lib_me_unlink(nal, me);
+                        me->me_md = NULL;
+                        if (me->me_unlink == LNET_UNLINK)
+                                lnet_me_unlink(me);
                 }
 
                 /* emsure all future handle lookups fail */
-                lib_invalidate_handle(nal, &md->md_lh);
+                lnet_invalidate_handle(&md->md_lh);
         }
 
-        if (md->pending != 0) {
+        if (md->md_refcount != 0) {
                 CDEBUG(D_NET, "Queueing unlink of md %p\n", md);
                 return;
         }
 
         CDEBUG(D_NET, "Unlinking md %p\n", md);
 
-        if ((md->options & PTL_MD_KIOV) != 0) {
-                if (nal->libnal_unmap_pages != NULL)
-                        nal->libnal_unmap_pages (nal,
-                                                 md->md_niov,
-                                                 md->md_iov.kiov,
-                                                 &md->md_addrkey);
-        } else if (nal->libnal_unmap != NULL) {
-                nal->libnal_unmap (nal,
-                                   md->md_niov, md->md_iov.iov,
-                                   &md->md_addrkey);
-        }
-
-        if (md->eq != NULL) {
-                md->eq->eq_refcount--;
-                LASSERT (md->eq->eq_refcount >= 0);
+        if (md->md_eq != NULL) {
+                md->md_eq->eq_refcount--;
+                LASSERT (md->md_eq->eq_refcount >= 0);
         }
 
         list_del (&md->md_list);
-        lib_md_free(nal, md);
+        lnet_md_free(md);
 }
 
-/* must be called with state lock held */
+/* must be called with LNET_LOCK held */
 static int
-lib_md_build(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd, int unlink)
+lib_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink)
 {
-        lib_eq_t     *eq = NULL;
-        int           rc;
-        int           i;
-        int           niov;
-        int           total_length = 0;
+        lnet_eq_t   *eq = NULL;
+        int          i;
+        unsigned int niov;
+        int          total_length = 0;
 
         /* NB we are passed an allocated, but uninitialised/active md.
-         * if we return success, caller may lib_md_unlink() it.
-         * otherwise caller may only lib_md_free() it.
+         * if we return success, caller may lnet_md_unlink() it.
+         * otherwise caller may only lnet_md_free() it.
          */
 
-        if (!PtlHandleIsEqual (umd->eq_handle, PTL_EQ_NONE)) {
-                eq = ptl_handle2eq(&umd->eq_handle, nal);
+        if (!LNetHandleIsEqual (umd->eq_handle, LNET_EQ_NONE)) {
+                eq = lnet_handle2eq(&umd->eq_handle);
                 if (eq == NULL)
-                        return PTL_EQ_INVALID;
+                        return -ENOENT;
         }
 
         /* This implementation doesn't know how to create START events or
          * disable END events.  Best to LASSERT our caller is compliant so
          * we find out quickly...  */
-        LASSERT (eq == NULL ||
-                 ((umd->options & PTL_MD_EVENT_START_DISABLE) != 0 &&
-                  (umd->options & PTL_MD_EVENT_END_DISABLE) == 0));
-
-        lmd->me = NULL;
-        lmd->start = umd->start;
-        lmd->offset = 0;
-        lmd->max_size = umd->max_size;
-        lmd->options = umd->options;
-        lmd->user_ptr = umd->user_ptr;
-        lmd->eq = eq;
-        lmd->threshold = umd->threshold;
-        lmd->pending = 0;
-        lmd->md_flags = (unlink == PTL_UNLINK) ? PTL_MD_FLAG_AUTO_UNLINK : 0;
-
-        if ((umd->options & PTL_MD_IOVEC) != 0) {
-
-                if ((umd->options & PTL_MD_KIOV) != 0) /* Can't specify both */
-                        return PTL_MD_ILLEGAL;
+        /*  TODO - reevaluate what should be here in light of 
+         * the removal of the start and end events
+         * maybe there we shouldn't even allow LNET_EQ_NONE!)
+        LASSERT (eq == NULL);
+         */
+
+        lmd->md_me = NULL;
+        lmd->md_start = umd->start;
+        lmd->md_offset = 0;
+        lmd->md_max_size = umd->max_size;
+        lmd->md_options = umd->options;
+        lmd->md_user_ptr = umd->user_ptr;
+        lmd->md_eq = eq;
+        lmd->md_threshold = umd->threshold;
+        lmd->md_refcount = 0;
+        lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0;
+
+        if ((umd->options & LNET_MD_IOVEC) != 0) {
+
+                if ((umd->options & LNET_MD_KIOV) != 0) /* Can't specify both */
+                        return -EINVAL;
 
                 lmd->md_niov = niov = umd->length;
                 memcpy(lmd->md_iov.iov, umd->start,
@@ -133,33 +116,22 @@ lib_md_build(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd, int unlink)
                 for (i = 0; i < niov; i++) {
                         /* We take the base address on trust */
                         if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */
-                                return PTL_MD_ILLEGAL;
+                                return -EINVAL;
 
                         total_length += lmd->md_iov.iov[i].iov_len;
                 }
 
-                lmd->length = total_length;
+                lmd->md_length = total_length;
 
-                if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */
+                if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
                     (umd->max_size < 0 ||
                      umd->max_size > total_length)) // illegal max_size
-                        return PTL_MD_ILLEGAL;
+                        return -EINVAL;
 
-                if (nal->libnal_map != NULL) {
-                        rc = nal->libnal_map (nal, niov, lmd->md_iov.iov,
-                                              &lmd->md_addrkey);
-                        if (rc != PTL_OK)
-                                return (rc);
-                }
-        } else if ((umd->options & PTL_MD_KIOV) != 0) {
+        } else if ((umd->options & LNET_MD_KIOV) != 0) {
 #ifndef __KERNEL__
-                return PTL_MD_ILLEGAL;
+                return -EINVAL;
 #else
-                /* Trap attempt to use paged I/O if unsupported early. */
-                if (nal->libnal_send_pages == NULL ||
-                    nal->libnal_recv_pages == NULL)
-                        return PTL_MD_INVALID;
-
                 lmd->md_niov = niov = umd->length;
                 memcpy(lmd->md_iov.kiov, umd->start,
                        niov * sizeof (lmd->md_iov.kiov[0]));
@@ -167,260 +139,179 @@ lib_md_build(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd, int unlink)
                 for (i = 0; i < niov; i++) {
                         /* We take the page pointer on trust */
                         if (lmd->md_iov.kiov[i].kiov_offset +
-                            lmd->md_iov.kiov[i].kiov_len > PAGE_SIZE )
-                                return PTL_VAL_FAILED; /* invalid length */
+                            lmd->md_iov.kiov[i].kiov_len > CFS_PAGE_SIZE )
+                                return -EINVAL; /* invalid length */
 
                         total_length += lmd->md_iov.kiov[i].kiov_len;
                 }
 
-                lmd->length = total_length;
+                lmd->md_length = total_length;
 
-                if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */
+                if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
                     (umd->max_size < 0 ||
                      umd->max_size > total_length)) // illegal max_size
-                        return PTL_MD_ILLEGAL;
-
-                if (nal->libnal_map_pages != NULL) {
-                        rc = nal->libnal_map_pages (nal, niov, lmd->md_iov.kiov,
-                                                    &lmd->md_addrkey);
-                        if (rc != PTL_OK)
-                                return (rc);
-                }
+                        return -EINVAL;
 #endif
         } else {   /* contiguous */
-                lmd->length = umd->length;
+                lmd->md_length = umd->length;
                 lmd->md_niov = niov = 1;
                 lmd->md_iov.iov[0].iov_base = umd->start;
                 lmd->md_iov.iov[0].iov_len = umd->length;
 
-                if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */
+                if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */
                     (umd->max_size < 0 ||
                      umd->max_size > umd->length)) // illegal max_size
-                        return PTL_MD_ILLEGAL;
-
-                if (nal->libnal_map != NULL) {
-                        rc = nal->libnal_map (nal, niov, lmd->md_iov.iov,
-                                              &lmd->md_addrkey);
-                        if (rc != PTL_OK)
-                                return (rc);
-                }
+                        return -EINVAL;
         }
 
         if (eq != NULL)
                 eq->eq_refcount++;
 
         /* It's good; let handle2md succeed and add to active mds */
-        lib_initialise_handle (nal, &lmd->md_lh, PTL_COOKIE_TYPE_MD);
-        list_add (&lmd->md_list, &nal->libnal_ni.ni_active_mds);
+        lnet_initialise_handle (&lmd->md_lh, LNET_COOKIE_TYPE_MD);
+        list_add (&lmd->md_list, &the_lnet.ln_active_mds);
 
-        return PTL_OK;
+        return 0;
 }
 
-/* must be called with state lock held */
+/* must be called with LNET_LOCK held */
 void
-lib_md_deconstruct(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd)
+lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd)
 {
         /* NB this doesn't copy out all the iov entries so when a
          * discontiguous MD is copied out, the target gets to know the
          * original iov pointer (in start) and the number of entries it had
          * and that's all.
          */
-        umd->start = lmd->start;
-        umd->length = ((lmd->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) == 0) ?
-                      lmd->length : lmd->md_niov;
-        umd->threshold = lmd->threshold;
-        umd->max_size = lmd->max_size;
-        umd->options = lmd->options;
-        umd->user_ptr = lmd->user_ptr;
-        ptl_eq2handle(&umd->eq_handle, nal, lmd->eq);
+        umd->start = lmd->md_start;
+        umd->length = ((lmd->md_options & (LNET_MD_IOVEC | LNET_MD_KIOV)) == 0) ?
+                      lmd->md_length : lmd->md_niov;
+        umd->threshold = lmd->md_threshold;
+        umd->max_size = lmd->md_max_size;
+        umd->options = lmd->md_options;
+        umd->user_ptr = lmd->md_user_ptr;
+        lnet_eq2handle(&umd->eq_handle, lmd->md_eq);
 }
 
 int
-lib_api_md_attach(nal_t *apinal, ptl_handle_me_t *meh,
-                  ptl_md_t *umd, ptl_unlink_t unlink,
-                  ptl_handle_md_t *handle)
+LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd,
+             lnet_unlink_t unlink, lnet_handle_md_t *handle)
 {
-        lib_nal_t    *nal = apinal->nal_data;
-        lib_me_t     *me;
-        lib_md_t     *md;
-        unsigned long flags;
-        int           rc;
-
-        if ((umd->options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 &&
-            umd->length > PTL_MD_MAX_IOV) /* too many fragments */
-                return PTL_IOV_INVALID;
-
-        md = lib_md_alloc(nal, umd);
+        lnet_me_t     *me;
+        lnet_libmd_t  *md;
+        int            rc;
+
+        LASSERT (the_lnet.ln_init);
+        LASSERT (the_lnet.ln_refcount > 0);
+        
+        if ((umd.options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 &&
+            umd.length > LNET_MAX_IOV) /* too many fragments */
+                return -EINVAL;
+
+        md = lnet_md_alloc(&umd);
         if (md == NULL)
-                return PTL_NO_SPACE;
+                return -ENOMEM;
 
-        LIB_LOCK(nal, flags);
+        LNET_LOCK();
 
-        me = ptl_handle2me(meh, nal);
+        me = lnet_handle2me(&meh);
         if (me == NULL) {
-                rc = PTL_ME_INVALID;
-        } else if (me->md != NULL) {
-                rc = PTL_ME_IN_USE;
+                rc = -ENOENT;
+        } else if (me->me_md != NULL) {
+                rc = -EBUSY;
         } else {
-                rc = lib_md_build(nal, md, umd, unlink);
-                if (rc == PTL_OK) {
-                        me->md = md;
-                        md->me = me;
+                rc = lib_md_build(md, &umd, unlink);
+                if (rc == 0) {
+                        me->me_md = md;
+                        md->md_me = me;
+
+                        lnet_md2handle(handle, md);
 
-                        ptl_md2handle(handle, nal, md);
+                        /* check if this MD matches any blocked msgs */
+                        lnet_match_blocked_msg(md);   /* expects LNET_LOCK held */
 
-                        LIB_UNLOCK(nal, flags);
-                        return (PTL_OK);
+                        LNET_UNLOCK();
+                        return (0);
                 }
         }
 
-        lib_md_free (nal, md);
+        lnet_md_free (md);
 
-        LIB_UNLOCK(nal, flags);
+        LNET_UNLOCK();
         return (rc);
 }
 
 int
-lib_api_md_bind(nal_t *apinal,
-                ptl_md_t *umd, ptl_unlink_t unlink,
-                ptl_handle_md_t *handle)
+LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle)
 {
-        lib_nal_t    *nal = apinal->nal_data;
-        lib_md_t     *md;
-        unsigned long flags;
-        int           rc;
+        lnet_libmd_t  *md;
+        int            rc;
 
-        if ((umd->options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 &&
-            umd->length > PTL_MD_MAX_IOV) /* too many fragments */
-                return PTL_IOV_INVALID;
+        LASSERT (the_lnet.ln_init);
+        LASSERT (the_lnet.ln_refcount > 0);
+        
+        if ((umd.options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 &&
+            umd.length > LNET_MAX_IOV) /* too many fragments */
+                return -EINVAL;
 
-        md = lib_md_alloc(nal, umd);
+        md = lnet_md_alloc(&umd);
         if (md == NULL)
-                return PTL_NO_SPACE;
+                return -ENOMEM;
 
-        LIB_LOCK(nal, flags);
+        LNET_LOCK();
 
-        rc = lib_md_build(nal, md, umd, unlink);
+        rc = lib_md_build(md, &umd, unlink);
 
-        if (rc == PTL_OK) {
-                ptl_md2handle(handle, nal, md);
+        if (rc == 0) {
+                lnet_md2handle(handle, md);
 
-                LIB_UNLOCK(nal, flags);
-                return (PTL_OK);
+                LNET_UNLOCK();
+                return (0);
         }
 
-        lib_md_free (nal, md);
+        lnet_md_free (md);
 
-        LIB_UNLOCK(nal, flags);
+        LNET_UNLOCK();
         return (rc);
 }
 
 int
-lib_api_md_unlink (nal_t *apinal, ptl_handle_md_t *mdh)
+LNetMDUnlink (lnet_handle_md_t mdh)
 {
-        lib_nal_t       *nal = apinal->nal_data;
-        ptl_event_t      ev;
-        lib_md_t        *md;
-        unsigned long    flags;
+        lnet_event_t     ev;
+        lnet_libmd_t    *md;
 
-        LIB_LOCK(nal, flags);
+        LASSERT (the_lnet.ln_init);
+        LASSERT (the_lnet.ln_refcount > 0);
+        
+        LNET_LOCK();
 
-        md = ptl_handle2md(mdh, nal);
+        md = lnet_handle2md(&mdh);
         if (md == NULL) {
-                LIB_UNLOCK(nal, flags);
-                return PTL_MD_INVALID;
+                LNET_UNLOCK();
+                return -ENOENT;
         }
 
-        /* If the MD is busy, lib_md_unlink just marks it for deletion, and
+        /* If the MD is busy, lnet_md_unlink just marks it for deletion, and
          * when the NAL is done, the completion event flags that the MD was
          * unlinked.  Otherwise, we enqueue an event now... */
 
-        if (md->eq != NULL &&
-            md->pending == 0) {
+        if (md->md_eq != NULL &&
+            md->md_refcount == 0) {
                 memset(&ev, 0, sizeof(ev));
 
-                ev.type = PTL_EVENT_UNLINK;
-                ev.ni_fail_type = PTL_OK;
+                ev.type = LNET_EVENT_UNLINK;
+                ev.status = 0;
                 ev.unlinked = 1;
-                lib_md_deconstruct(nal, md, &ev.md);
-                ptl_md2handle(&ev.md_handle, nal, md);
+                lnet_md_deconstruct(md, &ev.md);
+                lnet_md2handle(&ev.md_handle, md);
 
-                lib_enq_event_locked(nal, NULL, md->eq, &ev);
+                lnet_enq_event_locked(md->md_eq, &ev);
         }
 
-        lib_md_unlink(nal, md);
+        lnet_md_unlink(md);
 
-        LIB_UNLOCK(nal, flags);
-        return PTL_OK;
+        LNET_UNLOCK();
+        return 0;
 }
 
-int
-lib_api_md_update (nal_t *apinal,
-                   ptl_handle_md_t *mdh,
-                   ptl_md_t *oldumd, ptl_md_t *newumd,
-                   ptl_handle_eq_t *testqh)
-{
-        lib_nal_t    *nal = apinal->nal_data;
-        lib_md_t     *md;
-        lib_eq_t     *test_eq = NULL;
-        unsigned long flags;
-        int           rc;
-
-        LIB_LOCK(nal, flags);
-
-        md = ptl_handle2md(mdh, nal);
-        if (md == NULL) {
-                 rc = PTL_MD_INVALID;
-                 goto out;
-        }
-
-        if (oldumd != NULL)
-                lib_md_deconstruct(nal, md, oldumd);
-
-        if (newumd == NULL) {
-                rc = PTL_OK;
-                goto out;
-        }
-
-        /* XXX fttb, the new MD must be the same "shape" wrt fragmentation,
-         * since we simply overwrite the old lib-md */
-        if ((((newumd->options ^ md->options) &
-              (PTL_MD_IOVEC | PTL_MD_KIOV)) != 0) ||
-            ((newumd->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) != 0 &&
-             newumd->length != md->md_niov)) {
-                rc = PTL_IOV_INVALID;
-                goto out;
-        }
-
-        if (!PtlHandleIsEqual (*testqh, PTL_EQ_NONE)) {
-                test_eq = ptl_handle2eq(testqh, nal);
-                if (test_eq == NULL) {
-                        rc = PTL_EQ_INVALID;
-                        goto out;
-                }
-        }
-
-        if (md->pending != 0) {
-                rc = PTL_MD_NO_UPDATE;
-                goto out;
-        }
-
-        if (test_eq == NULL ||
-            test_eq->eq_deq_seq == test_eq->eq_enq_seq) {
-                lib_me_t *me = md->me;
-                int       unlink = (md->md_flags & PTL_MD_FLAG_AUTO_UNLINK) ?
-                                   PTL_UNLINK : PTL_RETAIN;
-
-                // #warning this does not track eq refcounts properly
-                rc = lib_md_build(nal, md, newumd, unlink);
-
-                md->me = me;
-        } else {
-                rc = PTL_MD_NO_UPDATE;
-        }
-
- out:
-        LIB_UNLOCK(nal, flags);
-
-        return rc;
-}
index cbc7c53..fb72c6d 100644 (file)
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 
-#ifndef __KERNEL__
-# include <stdio.h>
-#else
-# include <libcfs/kp30.h>
-#endif
-
-#include <portals/lib-p30.h>
+#include <lnet/lib-lnet.h>
 
 int
-lib_api_me_attach(nal_t *apinal,
-                  ptl_pt_index_t portal,
-                  ptl_process_id_t match_id,
-                  ptl_match_bits_t match_bits,
-                  ptl_match_bits_t ignore_bits,
-                  ptl_unlink_t unlink, ptl_ins_pos_t pos,
-                  ptl_handle_me_t *handle)
+LNetMEAttach(unsigned int portal,
+             lnet_process_id_t match_id, 
+             __u64 match_bits, __u64 ignore_bits,
+             lnet_unlink_t unlink, lnet_ins_pos_t pos, 
+             lnet_handle_me_t *handle)
 {
-        lib_nal_t    *nal = apinal->nal_data;
-        lib_ni_t     *ni = &nal->libnal_ni;
-        lib_ptl_t    *tbl = &ni->ni_portals;
-        lib_me_t     *me;
-        unsigned long flags;
+        lnet_me_t     *me;
 
-        if (portal >= tbl->size)
-                return PTL_PT_INDEX_INVALID;
+        LASSERT (the_lnet.ln_init);
+        LASSERT (the_lnet.ln_refcount > 0);
+        
+        if (portal >= the_lnet.ln_nportals)
+                return -EINVAL;
 
-        me = lib_me_alloc (nal);
+        me = lnet_me_alloc();
         if (me == NULL)
-                return PTL_NO_SPACE;
+                return -ENOMEM;
 
-        LIB_LOCK(nal, flags);
+        LNET_LOCK();
 
-        me->match_id = match_id;
-        me->match_bits = match_bits;
-        me->ignore_bits = ignore_bits;
-        me->unlink = unlink;
-        me->md = NULL;
+        me->me_portal = portal;
+        me->me_match_id = match_id;
+        me->me_match_bits = match_bits;
+        me->me_ignore_bits = ignore_bits;
+        me->me_unlink = unlink;
+        me->me_md = NULL;
 
-        lib_initialise_handle (nal, &me->me_lh, PTL_COOKIE_TYPE_ME);
+        lnet_initialise_handle (&me->me_lh, LNET_COOKIE_TYPE_ME);
 
-        if (pos == PTL_INS_AFTER)
-                list_add_tail(&me->me_list, &(tbl->tbl[portal]));
+        if (pos == LNET_INS_AFTER)
+                list_add_tail(&me->me_list, &(the_lnet.ln_portals[portal].ptl_ml));
         else
-                list_add(&me->me_list, &(tbl->tbl[portal]));
+                list_add(&me->me_list, &(the_lnet.ln_portals[portal].ptl_ml));
 
-        ptl_me2handle(handle, nal, me);
+        lnet_me2handle(handle, me);
 
-        LIB_UNLOCK(nal, flags);
+        LNET_UNLOCK();
 
-        return PTL_OK;
+        return 0;
 }
 
-int
-lib_api_me_insert(nal_t *apinal,
-                  ptl_handle_me_t *current_meh,
-                  ptl_process_id_t match_id,
-                  ptl_match_bits_t match_bits,
-                  ptl_match_bits_t ignore_bits,
-                  ptl_unlink_t unlink, ptl_ins_pos_t pos,
-                  ptl_handle_me_t *handle)
+int 
+LNetMEInsert(lnet_handle_me_t current_meh, 
+             lnet_process_id_t match_id, 
+             __u64 match_bits, __u64 ignore_bits,
+             lnet_unlink_t unlink, lnet_ins_pos_t pos,
+             lnet_handle_me_t *handle)
 {
-        lib_nal_t    *nal = apinal->nal_data;
-        lib_me_t     *current_me;
-        lib_me_t     *new_me;
-        unsigned long flags;
+        lnet_me_t     *current_me;
+        lnet_me_t     *new_me;
 
-        new_me = lib_me_alloc (nal);
+        LASSERT (the_lnet.ln_init);        
+        LASSERT (the_lnet.ln_refcount > 0);
+        
+        new_me = lnet_me_alloc();
         if (new_me == NULL)
-                return PTL_NO_SPACE;
+                return -ENOMEM;
 
-        LIB_LOCK(nal, flags);
+        LNET_LOCK();
 
-        current_me = ptl_handle2me(current_meh, nal);
+        current_me = lnet_handle2me(&current_meh);
         if (current_me == NULL) {
-                lib_me_free (nal, new_me);
+                lnet_me_free (new_me);
 
-                LIB_UNLOCK(nal, flags);
-                return PTL_ME_INVALID;
+                LNET_UNLOCK();
+                return -ENOENT;
         }
 
-        new_me->match_id = match_id;
-        new_me->match_bits = match_bits;
-        new_me->ignore_bits = ignore_bits;
-        new_me->unlink = unlink;
-        new_me->md = NULL;
+        new_me->me_match_id = match_id;
+        new_me->me_match_bits = match_bits;
+        new_me->me_ignore_bits = ignore_bits;
+        new_me->me_unlink = unlink;
+        new_me->me_md = NULL;
 
-        lib_initialise_handle (nal, &new_me->me_lh, PTL_COOKIE_TYPE_ME);
+        lnet_initialise_handle (&new_me->me_lh, LNET_COOKIE_TYPE_ME);
 
-        if (pos == PTL_INS_AFTER)
+        if (pos == LNET_INS_AFTER)
                 list_add_tail(&new_me->me_list, &current_me->me_list);
         else
                 list_add(&new_me->me_list, &current_me->me_list);
 
-        ptl_me2handle(handle, nal, new_me);
+        lnet_me2handle(handle, new_me);
 
-        LIB_UNLOCK(nal, flags);
+        LNET_UNLOCK();
 
-        return PTL_OK;
+        return 0;
 }
 
 int
-lib_api_me_unlink (nal_t *apinal, ptl_handle_me_t *meh)
+LNetMEUnlink(lnet_handle_me_t meh)
 {
-        lib_nal_t    *nal = apinal->nal_data;
-        unsigned long flags;
-        lib_me_t     *me;
+        lnet_me_t     *me;
         int           rc;
 
-        LIB_LOCK(nal, flags);
+        LASSERT (the_lnet.ln_init);        
+        LASSERT (the_lnet.ln_refcount > 0);
+        
+        LNET_LOCK();
 
-        me = ptl_handle2me(meh, nal);
+        me = lnet_handle2me(&meh);
         if (me == NULL) {
-                rc = PTL_ME_INVALID;
+                rc = -ENOENT;
         } else {
-                lib_me_unlink(nal, me);
-                rc = PTL_OK;
+                lnet_me_unlink(me);
+                rc = 0;
         }
 
-        LIB_UNLOCK(nal, flags);
+        LNET_UNLOCK();
 
         return (rc);
 }
 
-/* call with state_lock please */
+/* call with LNET_LOCK please */
 void
-lib_me_unlink(lib_nal_t *nal, lib_me_t *me)
+lnet_me_unlink(lnet_me_t *me)
 {
         list_del (&me->me_list);
 
-        if (me->md) {
-                me->md->me = NULL;
-                lib_md_unlink(nal, me->md);
+        if (me->me_md) {
+                me->me_md->md_me = NULL;
+                lnet_md_unlink(me->me_md);
         }
 
-        lib_invalidate_handle (nal, &me->me_lh);
-        lib_me_free(nal, me);
+        lnet_invalidate_handle (&me->me_lh);
+        lnet_me_free(me);
 }
 
 #if 0
 static void
-lib_me_dump(lib_nal_t *nal, lib_me_t * me)
+lib_me_dump(lnet_me_t *me)
 {
         CWARN("Match Entry %p ("LPX64")\n", me,
               me->me_lh.lh_cookie);
 
         CWARN("\tMatch/Ignore\t= %016lx / %016lx\n",
-              me->match_bits, me->ignore_bits);
+              me->me_match_bits, me->me_ignore_bits);
 
         CWARN("\tMD\t= %p\n", me->md);
         CWARN("\tprev\t= %p\n",
-              list_entry(me->me_list.prev, lib_me_t, me_list));
+              list_entry(me->me_list.prev, lnet_me_t, me_list));
         CWARN("\tnext\t= %p\n",
-              list_entry(me->me_list.next, lib_me_t, me_list));
+              list_entry(me->me_list.next, lnet_me_t, me_list));
 }
 #endif
index 5339b6d..b7c6e51 100644 (file)
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 
-#ifndef __KERNEL__
-# include <stdio.h>
-#else
-# include <libcfs/kp30.h>
-#endif
-#include <portals/p30.h>
-#include <portals/lib-p30.h>
+#include <lnet/lib-lnet.h>
+
+static int local_nid_dist_zero = 1;
+CFS_MODULE_PARM(local_nid_dist_zero, "i", int, 0444,
+                "Reserved");
 
 /* forward ref */
-static void lib_commit_md (lib_nal_t *nal, lib_md_t *md, lib_msg_t *msg);
-static ptl_err_t do_lib_parse(lib_nal_t *nal, ptl_hdr_t *hdr, 
-                              void *private, int loopback);
-
-static lib_md_t *
-lib_match_md(lib_nal_t *nal, int index, int op_mask,
-             ptl_nid_t src_nid, ptl_pid_t src_pid,
-             ptl_size_t rlength, ptl_size_t roffset,
-             ptl_match_bits_t match_bits, lib_msg_t *msg,
-             ptl_size_t *mlength_out, ptl_size_t *offset_out)
+static void lnet_commit_md (lnet_libmd_t *md, lnet_msg_t *msg);
+static void lnet_drop_delayed_put(lnet_msg_t *msg, char *reason);
+
+#define LNET_MATCHMD_NONE     0   /* Didn't match */
+#define LNET_MATCHMD_OK       1   /* Matched OK */
+#define LNET_MATCHMD_DROP     2   /* Must be discarded */
+
+static int
+lnet_try_match_md (int index, int op_mask, lnet_process_id_t src,
+                   unsigned int rlength, unsigned int roffset, 
+                   __u64 match_bits, lnet_libmd_t *md, lnet_msg_t *msg,
+                   unsigned int *mlength_out, unsigned int *offset_out)
 {
-        lib_ni_t         *ni = &nal->libnal_ni;
-        struct list_head *match_list = &ni->ni_portals.tbl[index];
-        struct list_head *tmp;
-        lib_me_t         *me;
-        lib_md_t         *md;
-        ptl_size_t        mlength;
-        ptl_size_t        offset;
-        ENTRY;
+        /* ALWAYS called holding the LNET_LOCK, and can't LNET_UNLOCK;
+         * lnet_match_blocked_msg() relies on this to avoid races */
+        unsigned int  offset;
+        unsigned int  mlength;
+        lnet_me_t    *me = md->md_me;
 
-        CDEBUG (D_NET, "Request from "LPU64".%d of length %d into portal %d "
-                "MB="LPX64"\n", src_nid, src_pid, rlength, index, match_bits);
+        /* mismatched MD op */
+        if ((md->md_options & op_mask) == 0)
+                return LNET_MATCHMD_NONE;
 
-        if (index < 0 || index >= ni->ni_portals.size) {
-                CERROR("Invalid portal %d not in [0-%d]\n",
-                       index, ni->ni_portals.size);
-                goto failed;
-        }
+        /* MD exhausted */
+        if (lnet_md_exhausted(md))
+                return LNET_MATCHMD_NONE;
 
-        list_for_each (tmp, match_list) {
-                me = list_entry(tmp, lib_me_t, me_list);
-                md = me->md;
+        /* mismatched ME nid/pid? */
+        if (me->me_match_id.nid != LNET_NID_ANY &&
+            me->me_match_id.nid != src.nid)
+                return LNET_MATCHMD_NONE;
 
-                 /* ME attached but MD not attached yet */
-                if (md == NULL)
-                        continue;
+        if (me->me_match_id.pid != LNET_PID_ANY &&
+            me->me_match_id.pid != src.pid)
+                return LNET_MATCHMD_NONE;
 
-                LASSERT (me == md->me);
+        /* mismatched ME matchbits? */
+        if (((me->me_match_bits ^ match_bits) & ~me->me_ignore_bits) != 0)
+                return LNET_MATCHMD_NONE;
 
-                /* mismatched MD op */
-                if ((md->options & op_mask) == 0)
-                        continue;
+        /* Hurrah! This _is_ a match; check it out... */
 
-                /* MD exhausted */
-                if (lib_md_exhausted(md))
-                        continue;
+        if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0)
+                offset = md->md_offset;
+        else
+                offset = roffset;
 
-                /* mismatched ME nid/pid? */
-                if (me->match_id.nid != PTL_NID_ANY &&
-                    me->match_id.nid != src_nid)
-                        continue;
+        if ((md->md_options & LNET_MD_MAX_SIZE) != 0) {
+                mlength = md->md_max_size;
+                LASSERT (md->md_offset + mlength <= md->md_length);
+        } else {
+                mlength = md->md_length - offset;
+        }
 
-                CDEBUG(D_NET, "match_id.pid [%x], src_pid [%x]\n",
-                       me->match_id.pid, src_pid);
+        if (rlength <= mlength) {        /* fits in allowed space */
+                mlength = rlength;
+        } else if ((md->md_options & LNET_MD_TRUNCATE) == 0) {
+                /* this packet _really_ is too big */
+                CERROR("Matching packet from %s, match "LPU64
+                       " length %d too big: %d left, %d allowed\n", 
+                       libcfs_id2str(src), match_bits, rlength,
+                       md->md_length - offset, mlength);
 
-                if (me->match_id.pid != PTL_PID_ANY &&
-                    me->match_id.pid != src_pid)
-                        continue;
+                return LNET_MATCHMD_DROP;
+        }
 
-                /* mismatched ME matchbits? */
-                if (((me->match_bits ^ match_bits) & ~me->ignore_bits) != 0)
-                        continue;
+        /* Commit to this ME/MD */
+        CDEBUG(D_NET, "Incoming %s index %x from %s of "
+               "length %d/%d into md "LPX64" [%d] + %d\n",
+               (op_mask == LNET_MD_OP_PUT) ? "put" : "get",
+               index, libcfs_id2str(src), mlength, rlength,
+               md->md_lh.lh_cookie, md->md_niov, offset);
+
+        lnet_commit_md(md, msg);
+        md->md_offset = offset + mlength;
+
+        /* NB Caller will set ev.type and ev.hdr_data */
+        msg->msg_ev.initiator = src;
+        msg->msg_ev.pt_index = index;
+        msg->msg_ev.match_bits = match_bits;
+        msg->msg_ev.rlength = rlength;
+        msg->msg_ev.mlength = mlength;
+        msg->msg_ev.offset = offset;
+
+        lnet_md_deconstruct(md, &msg->msg_ev.md);
+        lnet_md2handle(&msg->msg_ev.md_handle, md);
+
+        *offset_out = offset;
+        *mlength_out = mlength;
+
+        /* Auto-unlink NOW, so the ME gets unlinked if required.
+         * We bumped md->md_refcount above so the MD just gets flagged
+         * for unlink when it is finalized. */
+        if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0 &&
+            lnet_md_exhausted(md)) {
+                lnet_md_unlink(md);
+        }
 
-                /* Hurrah! This _is_ a match; check it out... */
+        return LNET_MATCHMD_OK;
+}
 
-                if ((md->options & PTL_MD_MANAGE_REMOTE) == 0)
-                        offset = md->offset;
-                else
-                        offset = roffset;
+static int
+lnet_match_md(int index, int op_mask, lnet_process_id_t src,
+              unsigned int rlength, unsigned int roffset,
+              __u64 match_bits, lnet_msg_t *msg,
+              unsigned int *mlength_out, unsigned int *offset_out,
+              lnet_libmd_t **md_out)
+{
+        lnet_portal_t    *ptl = &the_lnet.ln_portals[index];
+        struct list_head *tmp;
+        lnet_me_t        *me;
+        lnet_libmd_t     *md;
+        int               rc;
 
-                if ((md->options & PTL_MD_MAX_SIZE) != 0) {
-                        mlength = md->max_size;
-                        LASSERT (md->offset + mlength <= md->length);
-                } else {
-                        mlength = md->length - offset;
-                }
+        CDEBUG (D_NET, "Request from %s of length %d into portal %d "
+                "MB="LPX64"\n", libcfs_id2str(src), rlength, index, match_bits);
 
-                if (rlength <= mlength) {        /* fits in allowed space */
-                        mlength = rlength;
-                } else if ((md->options & PTL_MD_TRUNCATE) == 0) {
-                        /* this packet _really_ is too big */
-                        CERROR("Matching packet %d too big: %d left, "
-                               "%d allowed\n", rlength, md->length - offset,
-                               mlength);
-                        goto failed;
+        if (index < 0 || index >= the_lnet.ln_nportals) {
+                CERROR("Invalid portal %d not in [0-%d]\n",
+                       index, the_lnet.ln_nportals);
+                return LNET_MATCHMD_DROP;
+        }
+
+        list_for_each (tmp, &ptl->ptl_ml) {
+                me = list_entry(tmp, lnet_me_t, me_list);
+                md = me->me_md;
+
+                 /* ME attached but MD not attached yet */
+                if (md == NULL)
+                        continue;
+
+                LASSERT (me == md->md_me);
+
+                rc = lnet_try_match_md(index, op_mask, src, rlength, 
+                                       roffset, match_bits, md, msg,
+                                       mlength_out, offset_out);
+                switch (rc) {
+                default:
+                        LBUG();
+                        
+                case LNET_MATCHMD_NONE:
+                        continue;
+                        
+                case LNET_MATCHMD_OK:
+                        *md_out = md;
+                        return LNET_MATCHMD_OK;
+                        
+                case LNET_MATCHMD_DROP:
+                        return LNET_MATCHMD_DROP;
                 }
+                /* not reached */
+        }
 
-                /* Commit to this ME/MD */
-                CDEBUG(D_NET, "Incoming %s index %x from "LPU64"/%u of "
-                       "length %d/%d into md "LPX64" [%d] + %d\n",
-                       (op_mask == PTL_MD_OP_PUT) ? "put" : "get",
-                       index, src_nid, src_pid, mlength, rlength,
-                       md->md_lh.lh_cookie, md->md_niov, offset);
-
-                lib_commit_md(nal, md, msg);
-                md->offset = offset + mlength;
-
-                /* NB Caller sets ev.type and ev.hdr_data */
-                msg->ev.initiator.nid = src_nid;
-                msg->ev.initiator.pid = src_pid;
-                msg->ev.pt_index = index;
-                msg->ev.match_bits = match_bits;
-                msg->ev.rlength = rlength;
-                msg->ev.mlength = mlength;
-                msg->ev.offset = offset;
-
-                lib_md_deconstruct(nal, md, &msg->ev.md);
-                ptl_md2handle(&msg->ev.md_handle, nal, md);
-
-                *offset_out = offset;
-                *mlength_out = mlength;
-
-                /* Auto-unlink NOW, so the ME gets unlinked if required.
-                 * We bumped md->pending above so the MD just gets flagged
-                 * for unlink when it is finalized. */
-                if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINK) != 0 &&
-                    lib_md_exhausted(md))
-                        lib_md_unlink(nal, md);
-
-                RETURN (md);
-        }
-
- failed:
-        CERROR (LPU64": Dropping %s from "LPU64".%d portal %d match "LPX64
-                " offset %d length %d: no match\n",
-                ni->ni_pid.nid, (op_mask == PTL_MD_OP_GET) ? "GET" : "PUT",
-                src_nid, src_pid, index, match_bits, roffset, rlength);
-        RETURN(NULL);
+        if (op_mask == LNET_MD_OP_GET ||
+            (ptl->ptl_options & LNET_PTL_LAZY) == 0)
+                return LNET_MATCHMD_DROP;
+        
+        return LNET_MATCHMD_NONE;
 }
 
-int lib_api_fail_nid (nal_t *apinal, ptl_nid_t nid, unsigned int threshold)
+int
+lnet_fail_nid (lnet_nid_t nid, unsigned int threshold)
 {
-        lib_nal_t         *nal = apinal->nal_data;
-        lib_test_peer_t   *tp;
-        unsigned long      flags;
+        lnet_test_peer_t   *tp;
         struct list_head  *el;
         struct list_head  *next;
         struct list_head   cull;
 
+        LASSERT (the_lnet.ln_init);
+        
         if (threshold != 0) {
                 /* Adding a new entry */
-                PORTAL_ALLOC(tp, sizeof(*tp));
+                LIBCFS_ALLOC(tp, sizeof(*tp));
                 if (tp == NULL)
-                        return PTL_NO_SPACE;
+                        return -ENOMEM;
 
                 tp->tp_nid = nid;
                 tp->tp_threshold = threshold;
 
-                LIB_LOCK(nal, flags);
-                list_add_tail (&tp->tp_list, &nal->libnal_ni.ni_test_peers);
-                LIB_UNLOCK(nal, flags);
-                return PTL_OK;
+                LNET_LOCK();
+                list_add_tail (&tp->tp_list, &the_lnet.ln_test_peers);
+                LNET_UNLOCK();
+                return 0;
         }
 
         /* removing entries */
         CFS_INIT_LIST_HEAD (&cull);
 
-        LIB_LOCK(nal, flags);
+        LNET_LOCK();
 
-        list_for_each_safe (el, next, &nal->libnal_ni.ni_test_peers) {
-                tp = list_entry (el, lib_test_peer_t, tp_list);
+        list_for_each_safe (el, next, &the_lnet.ln_test_peers) {
+                tp = list_entry (el, lnet_test_peer_t, tp_list);
 
                 if (tp->tp_threshold == 0 ||    /* needs culling anyway */
-                    nid == PTL_NID_ANY ||       /* removing all entries */
+                    nid == LNET_NID_ANY ||       /* removing all entries */
                     tp->tp_nid == nid)          /* matched this one */
                 {
                         list_del (&tp->tp_list);
@@ -204,46 +233,32 @@ int lib_api_fail_nid (nal_t *apinal, ptl_nid_t nid, unsigned int threshold)
                 }
         }
 
-        LIB_UNLOCK(nal, flags);
+        LNET_UNLOCK();
 
         while (!list_empty (&cull)) {
-                tp = list_entry (cull.next, lib_test_peer_t, tp_list);
+                tp = list_entry (cull.next, lnet_test_peer_t, tp_list);
 
                 list_del (&tp->tp_list);
-                PORTAL_FREE(tp, sizeof (*tp));
+                LIBCFS_FREE(tp, sizeof (*tp));
         }
-        return PTL_OK;
-}
-
-int 
-lib_api_loopback (nal_t *apinal, int set, int *enabled)
-{
-        lib_nal_t *nal = apinal->nal_data;
-
-        if (set)
-                nal->libnal_ni.ni_loopback = *enabled;
-        else
-                *enabled = nal->libnal_ni.ni_loopback;
-        
-        return PTL_OK;
+        return 0;
 }
 
 static int
-fail_peer (lib_nal_t *nal, ptl_nid_t nid, int outgoing)
+fail_peer (lnet_nid_t nid, int outgoing)
 {
-        lib_test_peer_t  *tp;
+        lnet_test_peer_t  *tp;
         struct list_head *el;
         struct list_head *next;
-        unsigned long     flags;
         struct list_head  cull;
         int               fail = 0;
 
         CFS_INIT_LIST_HEAD (&cull);
 
-        LIB_LOCK (nal, flags);
+        LNET_LOCK();
 
-        list_for_each_safe (el, next, &nal->libnal_ni.ni_test_peers) {
-                tp = list_entry (el, lib_test_peer_t, tp_list);
+        list_for_each_safe (el, next, &the_lnet.ln_test_peers) {
+                tp = list_entry (el, lnet_test_peer_t, tp_list);
 
                 if (tp->tp_threshold == 0) {
                         /* zombie entry */
@@ -257,11 +272,11 @@ fail_peer (lib_nal_t *nal, ptl_nid_t nid, int outgoing)
                         continue;
                 }
 
-                if (tp->tp_nid == PTL_NID_ANY || /* fail every peer */
+                if (tp->tp_nid == LNET_NID_ANY || /* fail every peer */
                     nid == tp->tp_nid) {        /* fail this peer */
                         fail = 1;
 
-                        if (tp->tp_threshold != PTL_MD_THRESH_INF) {
+                        if (tp->tp_threshold != LNET_MD_THRESH_INF) {
                                 tp->tp_threshold--;
                                 if (outgoing &&
                                     tp->tp_threshold == 0) {
@@ -274,22 +289,22 @@ fail_peer (lib_nal_t *nal, ptl_nid_t nid, int outgoing)
                 }
         }
 
-        LIB_UNLOCK (nal, flags);
+        LNET_UNLOCK ();
 
         while (!list_empty (&cull)) {
-                tp = list_entry (cull.next, lib_test_peer_t, tp_list);
+                tp = list_entry (cull.next, lnet_test_peer_t, tp_list);
                 list_del (&tp->tp_list);
 
-                PORTAL_FREE(tp, sizeof (*tp));
+                LIBCFS_FREE(tp, sizeof (*tp));
         }
 
         return (fail);
 }
 
-ptl_size_t
-lib_iov_nob (int niov, struct iovec *iov)
+unsigned int
+lnet_iov_nob (unsigned int niov, struct iovec *iov)
 {
-        ptl_size_t nob = 0;
+        unsigned int nob = 0;
 
         while (niov-- > 0)
                 nob += (iov++)->iov_len;
@@ -298,77 +313,73 @@ lib_iov_nob (int niov, struct iovec *iov)
 }
 
 void
-lib_copy_iov2buf (char *dest, int niov, struct iovec *iov,
-                  ptl_size_t offset, ptl_size_t len)
+lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, unsigned int doffset,
+                   unsigned int nsiov, struct iovec *siov, unsigned int soffset,
+                   unsigned int nob)
 {
-        ptl_size_t nob;
+        /* NB diov, siov are READ-ONLY */
+        unsigned int  this_nob;
 
-        if (len == 0)
+        if (nob == 0)
                 return;
 
-        /* skip complete frags before 'offset' */
-        LASSERT (niov > 0);
-        while (offset >= iov->iov_len) {
-                offset -= iov->iov_len;
-                iov++;
-                niov--;
-                LASSERT (niov > 0);
+        /* skip complete frags before 'doffset' */
+        LASSERT (ndiov > 0);
+        while (doffset >= diov->iov_len) {
+                doffset -= diov->iov_len;
+                diov++;
+                ndiov--;
+                LASSERT (ndiov > 0);
         }
-
-        do {
-                LASSERT (niov > 0);
-                nob = MIN (iov->iov_len - offset, len);
-                memcpy (dest, iov->iov_base + offset, nob);
-
-                len -= nob;
-                dest += nob;
-                niov--;
-                iov++;
-                offset = 0;
-        } while (len > 0);
-}
-
-void
-lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset,
-                  char *src, ptl_size_t len)
-{
-        ptl_size_t nob;
-
-        if (len == 0)
-                return;
-
-        /* skip complete frags before 'offset' */
-        LASSERT (niov > 0);
-        while (offset >= iov->iov_len) {
-                offset -= iov->iov_len;
-                iov++;
-                niov--;
-                LASSERT (niov > 0);
+        
+        /* skip complete frags before 'soffset' */
+        LASSERT (nsiov > 0);
+        while (soffset >= siov->iov_len) {
+                soffset -= siov->iov_len;
+                siov++;
+                nsiov--;
+                LASSERT (nsiov > 0);
         }
 
         do {
-                LASSERT (niov > 0);
-                nob = MIN (iov->iov_len - offset, len);
-                memcpy (iov->iov_base + offset, src, nob);
-
-                len -= nob;
-                src += nob;
-                niov--;
-                iov++;
-                offset = 0;
-        } while (len > 0);
+                LASSERT (ndiov > 0);
+                LASSERT (nsiov > 0);
+                this_nob = MIN(diov->iov_len - doffset,
+                               siov->iov_len - soffset);
+                this_nob = MIN(this_nob, nob);
+
+                memcpy ((char *)diov->iov_base + doffset,
+                        (char *)siov->iov_base + soffset, this_nob);
+                nob -= this_nob;
+
+                if (diov->iov_len > doffset + this_nob) {
+                        doffset += this_nob;
+                } else {
+                        diov++;
+                        ndiov--;
+                        doffset = 0;
+                }
+                
+                if (siov->iov_len > soffset + this_nob) {
+                        soffset += this_nob;
+                } else {
+                        siov++;
+                        nsiov--;
+                        soffset = 0;
+                }
+        } while (nob > 0);
 }
 
 int
-lib_extract_iov (int dst_niov, struct iovec *dst,
-                 int src_niov, struct iovec *src,
-                 ptl_size_t offset, ptl_size_t len)
+lnet_extract_iov (int dst_niov, struct iovec *dst,
+                  int src_niov, struct iovec *src,
+                  unsigned int offset, unsigned int len)
 {
         /* Initialise 'dst' to the subset of 'src' starting at 'offset',
          * for exactly 'len' bytes, and return the number of entries.
          * NB not destructive to 'src' */
-        ptl_size_t      frag_len;
-        int             niov;
+        unsigned int    frag_len;
+        unsigned int    niov;
 
         if (len == 0)                           /* no data => */
                 return (0);                     /* no frags */
@@ -406,58 +417,51 @@ lib_extract_iov (int dst_niov, struct iovec *dst,
 }
 
 #ifndef __KERNEL__
-ptl_size_t
-lib_kiov_nob (int niov, ptl_kiov_t *kiov)
+unsigned int
+lnet_kiov_nob (unsigned int niov, lnet_kiov_t *kiov)
 {
         LASSERT (0);
         return (0);
 }
 
 void
-lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov,
-                   ptl_size_t offset, ptl_size_t len)
+lnet_copy_kiov2kiov (unsigned int ndkiov, lnet_kiov_t *dkiov, unsigned int doffset,
+                     unsigned int nskiov, lnet_kiov_t *skiov, unsigned int soffset,
+                     unsigned int nob)
 {
         LASSERT (0);
 }
 
 void
-lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
-                   char *src, ptl_size_t len)
+lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+                    unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+                    unsigned int nob)
 {
         LASSERT (0);
 }
 
-int
-lib_extract_kiov (int dst_niov, ptl_kiov_t *dst,
-                  int src_niov, ptl_kiov_t *src,
-                  ptl_size_t offset, ptl_size_t len)
-{
-        LASSERT (0);
-}
-
-ptl_err_t
-lib_lo_rxkiov(lib_nal_t *nal, void *private, lib_msg_t *libmsg,
-              unsigned int niov, ptl_kiov_t *kiov,
-              size_t offset, size_t mlen, size_t rlen)
+void
+lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+                    unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+                    unsigned int nob)
 {
         LASSERT (0);
 }
 
-ptl_err_t
-lib_lo_txkiov (lib_nal_t *nal, void *private, lib_msg_t *libmsg,
-               ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
-               unsigned int payload_niov, ptl_kiov_t *payload_kiov,
-               size_t payload_offset, size_t payload_nob)
+int
+lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst,
+                   int src_niov, lnet_kiov_t *src,
+                   unsigned int offset, unsigned int len)
 {
         LASSERT (0);
 }
 
 #else /* __KERNEL__ */
 
-ptl_size_t
-lib_kiov_nob (int niov, ptl_kiov_t *kiov)
+unsigned int
+lnet_kiov_nob (unsigned int niov, lnet_kiov_t *kiov)
 {
-        ptl_size_t  nob = 0;
+        unsigned int  nob = 0;
 
         while (niov-- > 0)
                 nob += (kiov++)->kiov_len;
@@ -466,89 +470,233 @@ lib_kiov_nob (int niov, ptl_kiov_t *kiov)
 }
 
 void
-lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov,
-                   ptl_size_t offset, ptl_size_t len)
+lnet_copy_kiov2kiov (unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset,
+                     unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset,
+                     unsigned int nob)
 {
-        ptl_size_t  nob;
-        char       *addr;
+        /* NB diov, siov are READ-ONLY */
+        unsigned int    this_nob;
+        char           *daddr = NULL;
+        char           *saddr = NULL;
 
-        if (len == 0)
+        if (nob == 0)
+                return;
+
+        LASSERT (!in_interrupt ());
+
+        LASSERT (ndiov > 0);
+        while (doffset > diov->kiov_len) {
+                doffset -= diov->kiov_len;
+                diov++;
+                ndiov--;
+                LASSERT (ndiov > 0);
+        }
+
+        LASSERT (nsiov > 0);
+        while (soffset > siov->kiov_len) {
+                soffset -= siov->kiov_len;
+                siov++;
+                nsiov--;
+                LASSERT (nsiov > 0);
+        }
+
+        do {
+                LASSERT (ndiov > 0);
+                LASSERT (nsiov > 0);
+                this_nob = MIN(diov->kiov_len - doffset,
+                               siov->kiov_len - soffset);
+                this_nob = MIN(this_nob, nob);
+
+                if (daddr == NULL)
+                        daddr = ((char *)cfs_kmap(diov->kiov_page)) + 
+                                diov->kiov_offset + doffset;
+                if (saddr == NULL)
+                        saddr = ((char *)cfs_kmap(siov->kiov_page)) + 
+                                siov->kiov_offset + soffset;
+
+                /* Vanishing risk of kmap deadlock when mapping 2 pages.
+                 * However in practice at least one of the kiovs will be mapped
+                 * kernel pages and the map/unmap will be NOOPs */
+
+                memcpy (daddr, saddr, this_nob);
+                nob -= this_nob;
+
+                if (diov->kiov_len > doffset + this_nob) {
+                        daddr += this_nob;
+                        doffset += this_nob;
+                } else {
+                        cfs_kunmap(diov->kiov_page);
+                        daddr = NULL;
+                        diov++;
+                        ndiov--;
+                        doffset = 0;
+                }
+
+                if (siov->kiov_len > soffset + this_nob) {
+                        saddr += this_nob;
+                        soffset += this_nob;
+                } else {
+                        cfs_kunmap(siov->kiov_page);
+                        saddr = NULL;
+                        siov++;
+                        nsiov--;
+                        soffset = 0;
+                }
+        } while (nob > 0);
+
+        if (daddr != NULL)
+                cfs_kunmap(diov->kiov_page);
+        if (saddr != NULL)
+                cfs_kunmap(siov->kiov_page);
+}
+
+void
+lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+                    unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+                    unsigned int nob)
+{
+        /* NB iov, kiov are READ-ONLY */
+        unsigned int    this_nob;
+        char           *addr = NULL;
+
+        if (nob == 0)
                 return;
 
         LASSERT (!in_interrupt ());
 
         LASSERT (niov > 0);
-        while (offset > kiov->kiov_len) {
-                offset -= kiov->kiov_len;
-                kiov++;
+        while (iovoffset > iov->iov_len) {
+                iovoffset -= iov->iov_len;
+                iov++;
                 niov--;
                 LASSERT (niov > 0);
         }
 
+        LASSERT (nkiov > 0);
+        while (kiovoffset > kiov->kiov_len) {
+                kiovoffset -= kiov->kiov_len;
+                kiov++;
+                nkiov--;
+                LASSERT (nkiov > 0);
+        }
+
         do {
                 LASSERT (niov > 0);
-                nob = MIN (kiov->kiov_len - offset, len);
+                LASSERT (nkiov > 0);
+                this_nob = MIN(iov->iov_len - iovoffset,
+                               kiov->kiov_len - kiovoffset);
+                this_nob = MIN(this_nob, nob);
 
-                addr = ((char *)cfs_kmap(kiov->kiov_page)) + kiov->kiov_offset +
-                        offset;
-                memcpy (dest, addr, nob);
-                cfs_kunmap (kiov->kiov_page);
+                if (addr == NULL)
+                        addr = ((char *)cfs_kmap(kiov->kiov_page)) + 
+                                kiov->kiov_offset + kiovoffset;
 
-                len -= nob;
-                dest += nob;
-                niov--;
-                kiov++;
-                offset = 0;
-        } while (len > 0);
+                memcpy ((char *)iov->iov_base + iovoffset, addr, this_nob);
+                nob -= this_nob;
+
+                if (iov->iov_len > iovoffset + this_nob) {
+                        iovoffset += this_nob;
+                } else {
+                        iov++;
+                        niov--;
+                        iovoffset = 0;
+                }
+
+                if (kiov->kiov_len > kiovoffset + this_nob) {
+                        addr += this_nob;
+                        kiovoffset += this_nob;
+                } else {
+                        cfs_kunmap(kiov->kiov_page);
+                        addr = NULL;
+                        kiov++;
+                        nkiov--;
+                        kiovoffset = 0;
+                }
+
+        } while (nob > 0);
+
+        if (addr != NULL)
+                cfs_kunmap(kiov->kiov_page);
 }
 
 void
-lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset,
-                   char *src, ptl_size_t len)
+lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset,
+                    unsigned int niov, struct iovec *iov, unsigned int iovoffset,
+                    unsigned int nob)
 {
-        ptl_size_t  nob;
-        char       *addr;
+        /* NB kiov, iov are READ-ONLY */
+        unsigned int    this_nob;
+        char           *addr = NULL;
 
-        if (len == 0)
+        if (nob == 0)
                 return;
 
         LASSERT (!in_interrupt ());
 
-        LASSERT (niov > 0);
-        while (offset >= kiov->kiov_len) {
-                offset -= kiov->kiov_len;
+        LASSERT (nkiov > 0);
+        while (kiovoffset > kiov->kiov_len) {
+                kiovoffset -= kiov->kiov_len;
                 kiov++;
+                nkiov--;
+                LASSERT (nkiov > 0);
+        }
+
+        LASSERT (niov > 0);
+        while (iovoffset > iov->iov_len) {
+                iovoffset -= iov->iov_len;
+                iov++;
                 niov--;
                 LASSERT (niov > 0);
         }
 
         do {
+                LASSERT (nkiov > 0);
                 LASSERT (niov > 0);
-                nob = MIN (kiov->kiov_len - offset, len);
+                this_nob = MIN(kiov->kiov_len - kiovoffset,
+                               iov->iov_len - iovoffset);
+                this_nob = MIN(this_nob, nob);
 
-                addr = ((char *)cfs_kmap(kiov->kiov_page)) + kiov->kiov_offset +
-                        offset;
-                memcpy (addr, src, nob);
-                cfs_kunmap (kiov->kiov_page);
+                if (addr == NULL)
+                        addr = ((char *)cfs_kmap(kiov->kiov_page)) + 
+                                kiov->kiov_offset + kiovoffset;
 
-                len -= nob;
-                src += nob;
-                niov--;
-                kiov++;
-                offset = 0;
-        } while (len > 0);
+                memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob);
+                nob -= this_nob;
+
+                if (kiov->kiov_len > kiovoffset + this_nob) {
+                        addr += this_nob;
+                        kiovoffset += this_nob;
+                } else {
+                        cfs_kunmap(kiov->kiov_page);
+                        addr = NULL;
+                        kiov++;
+                        nkiov--;
+                        kiovoffset = 0;
+                }
+
+                if (iov->iov_len > iovoffset + this_nob) {
+                        iovoffset += this_nob;
+                } else {
+                        iov++;
+                        niov--;
+                        iovoffset = 0;
+                }
+        } while (nob > 0);
+
+        if (addr != NULL)
+                cfs_kunmap(kiov->kiov_page);
 }
 
 int
-lib_extract_kiov (int dst_niov, ptl_kiov_t *dst,
-                  int src_niov, ptl_kiov_t *src,
-                  ptl_size_t offset, ptl_size_t len)
+lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst,
+                   int src_niov, lnet_kiov_t *src,
+                   unsigned int offset, unsigned int len)
 {
         /* Initialise 'dst' to the subset of 'src' starting at 'offset',
          * for exactly 'len' bytes, and return the number of entries.
          * NB not destructive to 'src' */
-        ptl_size_t      frag_len;
-        int             niov;
+        unsigned int    frag_len;
+        unsigned int    niov;
 
         if (len == 0)                           /* no data => */
                 return (0);                     /* no frags */
@@ -572,12 +720,12 @@ lib_extract_kiov (int dst_niov, ptl_kiov_t *dst,
 
                 if (len <= frag_len) {
                         dst->kiov_len = len;
-                        LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE);
+                        LASSERT (dst->kiov_offset + dst->kiov_len <= CFS_PAGE_SIZE);
                         return (niov);
                 }
 
                 dst->kiov_len = frag_len;
-                LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE);
+                LASSERT (dst->kiov_offset + dst->kiov_len <= CFS_PAGE_SIZE);
 
                 len -= frag_len;
                 dst++;
@@ -587,442 +735,931 @@ lib_extract_kiov (int dst_niov, ptl_kiov_t *dst,
                 offset = 0;
         }
 }
-
-#ifndef __KERNEL__
-#if !defined(kmap)
-#define kmap(page) ((page)->addr)
-#endif
-#if !defined(kunmap)
-#define kunmap(page) do {} while(0)
-#endif
-#if !defined(page_address)
-#define page_address(page) ((page)->page_address)
-#endif
 #endif
 
-ptl_err_t
-lib_lo_rxkiov(lib_nal_t    *nal,
-              void         *private,
-              lib_msg_t    *libmsg,
-              unsigned int  niov,
-              ptl_kiov_t   *kiov,
-              size_t        offset,
-              size_t        mlen,
-              size_t        rlen)
+void
+lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
+             unsigned int offset, unsigned int mlen, unsigned int rlen)
 {
-        void          *srcaddr = NULL;
-        void          *dstaddr = NULL;
-        unsigned long  srcfrag = 0;
-        unsigned long  dstfrag = 0;
-        unsigned long  fraglen;
-        lo_desc_t     *lod = (lo_desc_t *)private;
+        unsigned int  niov = 0;
+        struct iovec *iov = NULL;
+        lnet_kiov_t  *kiov = NULL;
+        int           rc;
 
-        /* I only handle unmapped->unmapped matches */
-        LASSERT(lod->lod_type == LOD_KIOV);
+        LASSERT (!in_interrupt ());
+        LASSERT (mlen == 0 || msg != NULL);
+        
+        if (msg != NULL) {
+                LASSERT(msg->msg_receiving);
+                LASSERT(!msg->msg_sending);
+                LASSERT(rlen == msg->msg_len);
+                LASSERT(mlen <= msg->msg_len);
+
+                msg->msg_wanted = mlen;
+                msg->msg_offset = offset;
+                msg->msg_receiving = 0;
+
+                if (mlen != 0) {
+                        niov = msg->msg_niov;
+                        iov  = msg->msg_iov;
+                        kiov = msg->msg_kiov;
+                
+                        LASSERT (niov > 0);
+                        LASSERT ((iov == NULL) != (kiov == NULL));
+                }
+        }
+        
+        rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed,
+                                    niov, iov, kiov, offset, mlen, rlen);
+        if (rc < 0)
+                lnet_finalize(ni, msg, rc);
+}
 
-        if (mlen == 0)
-                return PTL_OK;
+int
+lnet_compare_routers(lnet_peer_t *p1, lnet_peer_t *p2)
+{
+        if (p1->lp_txqnob < p2->lp_txqnob)
+                return 1;
+        
+        if (p1->lp_txqnob > p2->lp_txqnob)
+                return -1;
+        
+        if (p1->lp_txcredits > p2->lp_txcredits)
+                return 1;
+        
+        if (p1->lp_txcredits < p2->lp_txcredits)
+                return -1;
+        
+        return 0;
+}
 
-        while (offset >= kiov->kiov_len) {
-                offset -= kiov->kiov_len;
-                kiov++;
-                niov--;
-                LASSERT(niov > 0);
-        }
 
-        while (lod->lod_offset >= lod->lod_iov.kiov->kiov_len) {
-                lod->lod_offset -= lod->lod_iov.kiov->kiov_len;
-                lod->lod_iov.kiov++;
-                lod->lod_niov--;
-                LASSERT(lod->lod_niov > 0);
-        }
+void
+lnet_setpayloadbuffer(lnet_msg_t *msg)
+{
+        lnet_libmd_t *md = msg->msg_md;
+
+        LASSERT (msg->msg_len > 0);
+        LASSERT (!msg->msg_routing);
+        LASSERT (md != NULL);
+        LASSERT (msg->msg_niov == 0);
+        LASSERT (msg->msg_iov == NULL);
+        LASSERT (msg->msg_kiov == NULL);
+
+        msg->msg_niov = md->md_niov;
+        if ((md->md_options & LNET_MD_KIOV) != 0)
+                msg->msg_kiov = md->md_iov.kiov;
+        else
+                msg->msg_iov = md->md_iov.iov;
+}
 
-        do {
-                /* CAVEAT EMPTOR:
-                 * I kmap 2 pages at once == slight risk of deadlock */
-                LASSERT(niov > 0);
-                if (dstaddr == NULL) {
-                        dstaddr = (void *)
-                                ((unsigned long)cfs_kmap(kiov->kiov_page) +
-                                 kiov->kiov_offset + offset);
-                        dstfrag = kiov->kiov_len -  offset;
-                }
+void
+lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target,
+               unsigned int offset, unsigned int len) 
+{
+        msg->msg_type = type;
+        msg->msg_target = target;
+        msg->msg_len = len;
+        msg->msg_offset = offset;
+
+        if (len != 0)
+                lnet_setpayloadbuffer(msg);
+
+        memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr));
+        msg->msg_hdr.type           = cpu_to_le32(type);
+        msg->msg_hdr.dest_nid       = cpu_to_le64(target.nid);
+        msg->msg_hdr.dest_pid       = cpu_to_le32(target.pid);
+        /* src_nid will be set later */
+        msg->msg_hdr.src_pid        = cpu_to_le32(the_lnet.ln_pid);
+        msg->msg_hdr.payload_length = cpu_to_le32(len);
+}
 
-                LASSERT(lod->lod_niov > 0);
-                if (srcaddr == NULL) {
-                        srcaddr = (void *)
-                         ((unsigned long)cfs_kmap(lod->lod_iov.kiov->kiov_page)+
-                          lod->lod_iov.kiov->kiov_offset + lod->lod_offset);
-                        srcfrag = lod->lod_iov.kiov->kiov_len - lod->lod_offset;
+void
+lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg) 
+{
+        void   *priv = msg->msg_private;
+        int     rc;
+
+        LASSERT (!in_interrupt ());
+        LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND ||
+                 (msg->msg_txcredit && msg->msg_peertxcredit));
+
+        rc = (ni->ni_lnd->lnd_send)(ni, priv, msg);
+        if (rc < 0)
+                lnet_finalize(ni, msg, rc);
+}
+
+int
+lnet_eager_recv_locked(lnet_msg_t *msg)
+{
+        lnet_peer_t *peer;
+        lnet_ni_t   *ni;
+        int          rc = 0;
+
+        LASSERT (!msg->msg_delayed);
+        msg->msg_delayed = 1;
+
+        LASSERT (msg->msg_receiving);
+        LASSERT (!msg->msg_sending);
+        
+        peer = msg->msg_rxpeer;
+        ni   = peer->lp_ni;
+
+        if (ni->ni_lnd->lnd_eager_recv != NULL) {
+                LNET_UNLOCK();
+                        
+                rc = (ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg, 
+                                                  &msg->msg_private);
+                if (rc != 0) {
+                        CERROR("recv from %s / send to %s aborted: "
+                               "eager_recv failed %d\n",
+                               libcfs_nid2str(peer->lp_nid),
+                               libcfs_id2str(msg->msg_target), rc);
+                        LASSERT (rc < 0); /* required by my callers */
                 }
 
-                fraglen = MIN(srcfrag, dstfrag);
-                if (fraglen > mlen)
-                        fraglen = mlen;
+                LNET_LOCK();
+        }
 
-                memcpy(dstaddr, srcaddr, fraglen);
+        return rc;
+}
 
-                if (fraglen < dstfrag) {
-                        dstfrag -= fraglen;
-                        dstaddr = (void *)((unsigned long)dstaddr + fraglen);
-                } else {
-                        cfs_kunmap(kiov->kiov_page);
-                        dstaddr = NULL;
-                        offset = 0;
-                        kiov++;
-                        niov--;
+int
+lnet_post_send_locked (lnet_msg_t *msg, int do_send)
+{
+        /* lnet_send is going to LNET_UNLOCK immediately after this, so it sets
+         * do_send FALSE and I don't do the unlock/send/lock bit.  I return
+         * EAGAIN if msg blocked and 0 if sent or OK to send */
+        lnet_peer_t *lp = msg->msg_txpeer;
+        lnet_ni_t   *ni = lp->lp_ni;
+
+        /* non-lnet_send() callers have checked before */
+        LASSERT (!do_send || msg->msg_delayed);
+        LASSERT (!msg->msg_receiving);
+
+        if (!msg->msg_peertxcredit) {
+                LASSERT ((lp->lp_txcredits < 0) == !list_empty(&lp->lp_txq));
+
+                msg->msg_peertxcredit = 1;
+                lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t);
+                lp->lp_txcredits--;
+
+                if (lp->lp_txcredits < lp->lp_mintxcredits)
+                        lp->lp_mintxcredits = lp->lp_txcredits;
+
+                if (lp->lp_txcredits < 0) {
+                        msg->msg_delayed = 1;
+                        list_add_tail (&msg->msg_list, &lp->lp_txq);
+                        return EAGAIN;
                 }
+        }
+        
+        if (!msg->msg_txcredit) {
+                LASSERT ((ni->ni_txcredits < 0) == !list_empty(&ni->ni_txq));
 
-                if (fraglen < srcfrag) {
-                        srcfrag -= fraglen;
-                        srcaddr = (void *)((unsigned long)srcaddr + fraglen);
-                } else {
-                        cfs_kunmap(lod->lod_iov.kiov->kiov_page);
-                        srcaddr = NULL;
-                        lod->lod_offset = 0;
-                        lod->lod_iov.kiov++;
-                        lod->lod_niov--;
+                msg->msg_txcredit = 1;
+                ni->ni_txcredits--;
+
+                if (ni->ni_txcredits < ni->ni_mintxcredits)
+                        ni->ni_mintxcredits = ni->ni_txcredits;
+
+                if (ni->ni_txcredits < 0) {
+                        msg->msg_delayed = 1;
+                        list_add_tail (&msg->msg_list, &ni->ni_txq);
+                        return EAGAIN;
                 }
+        }
 
-                mlen -= fraglen;
-        } while (mlen > 0);
+        if (do_send) {
+                LNET_UNLOCK();
+                lnet_ni_send(ni, msg);
+                LNET_LOCK();
+        }
+        return 0;
+}
 
-        if (dstaddr != NULL)
-                cfs_kunmap(kiov->kiov_page);
+#ifdef __KERNEL__
+static void
+lnet_commit_routedmsg (lnet_msg_t *msg)
+{
+        /* ALWAYS called holding the LNET_LOCK */
+        LASSERT (msg->msg_routing);
+        
+        the_lnet.ln_counters.msgs_alloc++;
+        if (the_lnet.ln_counters.msgs_alloc > 
+            the_lnet.ln_counters.msgs_max)
+                the_lnet.ln_counters.msgs_max = 
+                        the_lnet.ln_counters.msgs_alloc;
+
+        the_lnet.ln_counters.route_count++;
+        the_lnet.ln_counters.route_length += msg->msg_len;
+        
+        LASSERT (!msg->msg_onactivelist);
+        msg->msg_onactivelist = 1;
+        list_add (&msg->msg_activelist, &the_lnet.ln_active_msgs);
+}
 
-        if (srcaddr != NULL)
-                cfs_kunmap(lod->lod_iov.kiov->kiov_page);
+lnet_rtrbufpool_t *
+lnet_msg2bufpool(lnet_msg_t *msg) 
+{
+        lnet_rtrbufpool_t *rbp = &the_lnet.ln_rtrpools[0];
 
-        lib_finalize(nal, private, libmsg, PTL_OK);
-        return PTL_OK;
+        LASSERT (msg->msg_len <= LNET_MTU);
+        while (msg->msg_len > rbp->rbp_npages * CFS_PAGE_SIZE) {
+                rbp++;
+                LASSERT (rbp < &the_lnet.ln_rtrpools[LNET_NRBPOOLS]);
+        }
+
+        return rbp;
 }
 
-ptl_err_t
-lib_lo_txkiov (lib_nal_t    *nal,
-               void         *private,
-               lib_msg_t    *libmsg,
-               ptl_hdr_t    *hdr,
-               int           type,
-               ptl_nid_t     nid,
-               ptl_pid_t     pid,
-               unsigned int  payload_niov,
-               ptl_kiov_t   *payload_kiov,
-               size_t        payload_offset,
-               size_t        payload_nob)
+int
+lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv)
 {
-        lo_desc_t lod = {
-                .lod_type     = LOD_KIOV,
-                .lod_niov     = payload_niov,
-                .lod_offset   = payload_offset,
-                .lod_nob      = payload_nob,
-                .lod_iov      = { .kiov = payload_kiov } };
-        ptl_err_t   rc;
-
-        rc = do_lib_parse(nal, hdr, &lod, 1);
-        if (rc == PTL_OK)
-                lib_finalize(nal, private, libmsg, PTL_OK);
+        /* lnet_parse is going to LNET_UNLOCK immediately after this, so it
+         * sets do_recv FALSE and I don't do the unlock/send/lock bit.  I
+         * return EAGAIN if msg blocked and 0 if sent or OK to send */
+        lnet_peer_t         *lp = msg->msg_rxpeer;
+        lnet_rtrbufpool_t   *rbp;
+        lnet_rtrbuf_t       *rb;
+
+        LASSERT (msg->msg_iov == NULL);
+        LASSERT (msg->msg_kiov == NULL);
+        LASSERT (msg->msg_niov == 0);
+        LASSERT (msg->msg_routing);
+        LASSERT (msg->msg_receiving);
+        LASSERT (!msg->msg_sending);
+
+        /* non-lnet_parse callers only send delayed messages */
+        LASSERT (!do_recv || msg->msg_delayed);
+
+        if (!msg->msg_peerrtrcredit) {
+                LASSERT ((lp->lp_rtrcredits < 0) == !list_empty(&lp->lp_rtrq));
+                
+                msg->msg_peerrtrcredit = 1;
+                lp->lp_rtrcredits--;
+                if (lp->lp_rtrcredits < lp->lp_minrtrcredits)
+                        lp->lp_minrtrcredits = lp->lp_rtrcredits;
+                        
+                if (lp->lp_rtrcredits < 0) {
+                        /* must have checked eager_recv before here */
+                        LASSERT (msg->msg_delayed);
+                        list_add_tail(&msg->msg_list, &lp->lp_rtrq);
+                        return EAGAIN;
+                }
+        }
+        
+        rbp = lnet_msg2bufpool(msg);
 
-        return rc;
+        if (!msg->msg_rtrcredit) {
+                LASSERT ((rbp->rbp_credits < 0) == !list_empty(&rbp->rbp_msgs));
+
+                msg->msg_rtrcredit = 1;
+                rbp->rbp_credits--;
+                if (rbp->rbp_credits < rbp->rbp_mincredits)
+                        rbp->rbp_mincredits = rbp->rbp_credits;
+
+                if (rbp->rbp_credits < 0) {
+                        /* must have checked eager_recv before here */
+                        LASSERT (msg->msg_delayed);
+                        list_add_tail(&msg->msg_list, &rbp->rbp_msgs);
+                        return EAGAIN;
+                }
+        }
+        
+        LASSERT (!list_empty(&rbp->rbp_bufs));
+        rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list);
+        list_del(&rb->rb_list);
+        
+        msg->msg_niov = rbp->rbp_npages;
+        msg->msg_kiov = &rb->rb_kiov[0];
+
+        if (do_recv) {
+                LNET_UNLOCK();
+                lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1,
+                             0, msg->msg_len, msg->msg_len);
+                LNET_LOCK();
+        }
+        return 0;
 }
 #endif
 
-ptl_err_t
-lib_lo_rxiov(lib_nal_t    *nal,
-             void         *private,
-             lib_msg_t    *libmsg,
-             unsigned int  niov,
-             struct iovec *iov,
-             size_t        offset,
-             size_t        mlen,
-             size_t        rlen)
+void
+lnet_return_credits_locked (lnet_msg_t *msg)
 {
-        lo_desc_t *lod = (lo_desc_t *)private;
+        lnet_peer_t       *txpeer = msg->msg_txpeer;
+        lnet_peer_t       *rxpeer = msg->msg_rxpeer;
+        lnet_msg_t        *msg2;
+        lnet_ni_t         *ni;
 
-        /* I only handle mapped->mapped matches */
-        LASSERT(lod->lod_type == LOD_IOV);
-        LASSERT(mlen > 0);
+        if (msg->msg_txcredit) {
+                /* give back NI txcredits */
+                msg->msg_txcredit = 0;
+                ni = txpeer->lp_ni;
 
-        while (offset >= iov->iov_len) {
-                offset -= iov->iov_len;
-                iov++;
-                niov--;
-                LASSERT(niov > 0);
+                LASSERT((ni->ni_txcredits < 0) == !list_empty(&ni->ni_txq));
+
+                ni->ni_txcredits++;
+                if (ni->ni_txcredits <= 0) {
+                        msg2 = list_entry(ni->ni_txq.next, lnet_msg_t, msg_list);
+                        list_del(&msg2->msg_list);
+
+                        LASSERT(msg2->msg_txpeer->lp_ni == ni);
+                        LASSERT(msg2->msg_delayed);
+
+                        (void) lnet_post_send_locked(msg2, 1);
+                }
         }
 
-        while (lod->lod_offset >= lod->lod_iov.iov->iov_len) {
-                lod->lod_offset -= lod->lod_iov.iov->iov_len;
-                lod->lod_iov.iov++;
-                lod->lod_niov--;
-                LASSERT(lod->lod_niov > 0);
+        if (msg->msg_peertxcredit) {
+                /* give back peer txcredits */
+                msg->msg_peertxcredit = 0;
+
+                LASSERT((txpeer->lp_txcredits < 0) == !list_empty(&txpeer->lp_txq));
+
+                txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t);
+                LASSERT (txpeer->lp_txqnob >= 0);
+
+                txpeer->lp_txcredits++;
+                if (txpeer->lp_txcredits <= 0) {
+                        msg2 = list_entry(txpeer->lp_txq.next, 
+                                          lnet_msg_t, msg_list);
+                        list_del(&msg2->msg_list);
+
+                        LASSERT (msg2->msg_txpeer == txpeer);
+                        LASSERT (msg2->msg_delayed);
+
+                        (void) lnet_post_send_locked(msg2, 1);
+                }
         }
 
-        do {
-                int fraglen = MIN(iov->iov_len - offset,
-                                  lod->lod_iov.iov->iov_len - lod->lod_offset);
+        if (txpeer != NULL) {
+                msg->msg_txpeer = NULL;
+                lnet_peer_decref_locked(txpeer);
+        }
+
+#ifdef __KERNEL__        
+        if (msg->msg_rtrcredit) {
+                /* give back global router credits */
+                lnet_rtrbuf_t     *rb;
+                lnet_rtrbufpool_t *rbp;
+
+                /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays
+                 * there until it gets one allocated, or aborts the wait
+                 * itself */
+                LASSERT (msg->msg_kiov != NULL);
+                
+                rb = list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]);
+                rbp = rb->rb_pool;
+                LASSERT (rbp == lnet_msg2bufpool(msg));
+
+                msg->msg_kiov = NULL;
+                msg->msg_rtrcredit = 0;
+                
+                LASSERT((rbp->rbp_credits < 0) == !list_empty(&rbp->rbp_msgs));
+                LASSERT((rbp->rbp_credits > 0) == !list_empty(&rbp->rbp_bufs));
+
+                list_add(&rb->rb_list, &rbp->rbp_bufs);
+                rbp->rbp_credits++;
+                if (rbp->rbp_credits <= 0) {
+                        msg2 = list_entry(rbp->rbp_msgs.next, 
+                                          lnet_msg_t, msg_list);
+                        list_del(&msg2->msg_list);
+                        
+                        (void) lnet_post_routed_recv_locked(msg2, 1);
+                }
+        }
+        
+        if (msg->msg_peerrtrcredit) {
+                /* give back peer router credits */
+                msg->msg_peerrtrcredit = 0;
+                
+                LASSERT((rxpeer->lp_rtrcredits < 0) == !list_empty(&rxpeer->lp_rtrq));
+
+                rxpeer->lp_rtrcredits++;
+                if (rxpeer->lp_rtrcredits <= 0) {
+                        msg2 = list_entry(rxpeer->lp_rtrq.next,
+                                          lnet_msg_t, msg_list);
+                        list_del(&msg2->msg_list);
+                        
+                        (void) lnet_post_routed_recv_locked(msg2, 1);
+                }
+        }
+#else
+        LASSERT (!msg->msg_rtrcredit);
+        LASSERT (!msg->msg_peerrtrcredit);
+#endif
+        if (rxpeer != NULL) {
+                msg->msg_rxpeer = NULL;
+                lnet_peer_decref_locked(rxpeer);
+        }
+}
+
+int
+lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg)
+{
+        lnet_nid_t        dst_nid = msg->msg_target.nid;
+        lnet_ni_t        *src_ni;
+        lnet_ni_t        *local_ni;
+        lnet_remotenet_t *rnet;
+        lnet_route_t     *route;
+        lnet_route_t     *best_route;
+        struct list_head *tmp;
+        lnet_peer_t      *lp;
+        lnet_peer_t      *lp2;
+        int               rc;
 
-                LASSERT(niov > 0);
-                LASSERT(lod->lod_niov > 0);
+        LASSERT (msg->msg_txpeer == NULL);
+        LASSERT (!msg->msg_sending);
+        LASSERT (!msg->msg_target_is_router);
+        LASSERT (!msg->msg_receiving);
 
-                if (fraglen > mlen)
-                        fraglen = mlen;
+        msg->msg_sending = 1;
 
-                memcpy((void *)((unsigned long)iov->iov_base + offset),
-                       (void *)((unsigned long)lod->lod_iov.iov->iov_base +
-                                lod->lod_offset),
-                       fraglen);
+        /* NB! ni != NULL == interface pre-determined (ACK/REPLY) */
 
-                if (offset + fraglen < iov->iov_len) {
-                        offset += fraglen;
+        LNET_LOCK();
+
+        if (the_lnet.ln_shutdown) {
+                LNET_UNLOCK();
+                return -ESHUTDOWN;
+        }
+
+        if (src_nid == LNET_NID_ANY) {
+                src_ni = NULL;
+        } else {
+                src_ni = lnet_nid2ni_locked(src_nid);
+                if (src_ni == NULL) {
+                        LNET_UNLOCK();
+                        CERROR("Can't send to %s: src %s is not a local nid\n",
+                               libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid));
+                        return -EINVAL;
+                }
+                LASSERT (!msg->msg_routing);
+        }
+
+        /* Is this for someone on a local network? */ 
+        local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid));
+
+        if (local_ni != NULL) {
+                if (src_ni == NULL) {
+                        src_ni = local_ni;
+                        src_nid = src_ni->ni_nid;
+                } else if (src_ni == local_ni) {
+                        lnet_ni_decref_locked(local_ni);
                 } else {
-                        offset = 0;
-                        iov++;
-                        niov--;
+                        lnet_ni_decref_locked(local_ni);
+                        lnet_ni_decref_locked(src_ni);
+                        LNET_UNLOCK();
+                        CERROR("no route to %s via from %s\n",
+                               libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid));
+                        return -EINVAL;
                 }
 
-                if (lod->lod_offset + fraglen < lod->lod_iov.iov->iov_len ) {
-                        lod->lod_offset += fraglen;
+                LASSERT (src_nid != LNET_NID_ANY);
+
+                if (!msg->msg_routing) {
+                        src_nid = lnet_ptlcompat_srcnid(src_nid, dst_nid);
+                        msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+                }
+                
+                if (src_ni == the_lnet.ln_loni) {
+                        /* No send credit hassles with LOLND */
+                        LNET_UNLOCK();
+                        lnet_ni_send(src_ni, msg);
+                        lnet_ni_decref(src_ni);
+                        return 0;
+                }
+                
+                rc = lnet_nid2peer_locked(&lp, dst_nid);
+                lnet_ni_decref_locked(src_ni);  /* lp has ref on src_ni; lose mine */
+                if (rc != 0) {
+                        LNET_UNLOCK();
+                        CERROR("Error %d finding peer %s\n", rc,
+                               libcfs_nid2str(dst_nid));
+                        /* ENOMEM or shutting down */
+                        return rc;
+                }
+                LASSERT (lp->lp_ni == src_ni);
+        } else {
+                /* sending to a remote network */
+                rnet = lnet_find_net_locked(LNET_NIDNET(dst_nid));
+                if (rnet == NULL) {
+                        if (src_ni != NULL)
+                                lnet_ni_decref_locked(src_ni);
+                        LNET_UNLOCK();
+                        CERROR("No route to %s\n", libcfs_id2str(msg->msg_target));
+                        return -EHOSTUNREACH;
+                }
+
+                /* Find the best gateway I can use */
+                lp = NULL;
+                best_route = NULL;
+                list_for_each(tmp, &rnet->lrn_routes) {
+                        route = list_entry(tmp, lnet_route_t, lr_list);
+                        lp2 = route->lr_gateway;
+
+                        if (lp2->lp_alive &&
+                            (src_ni == NULL || lp2->lp_ni == src_ni) &&
+                            (lp == NULL || lnet_compare_routers(lp2, lp) > 0)) {
+                                best_route = route;
+                                lp = lp2;
+                        }
+                }
+
+                if (lp == NULL) {
+                        if (src_ni != NULL)
+                                lnet_ni_decref_locked(src_ni);
+                        LNET_UNLOCK();
+                        CERROR("No route to %s (all routers down)\n", 
+                               libcfs_id2str(msg->msg_target));
+                        return -EHOSTUNREACH;
+                }
+
+                /* Place selected route at the end of the route list to ensure
+                 * fairness; everything else being equal... */
+                list_del(&best_route->lr_list);
+                list_add_tail(&best_route->lr_list, &rnet->lrn_routes);
+
+                if (src_ni == NULL) {
+                        src_ni = lp->lp_ni;
+                        src_nid = src_ni->ni_nid;
                 } else {
-                        lod->lod_offset = 0;
-                        lod->lod_iov.iov++;
-                        lod->lod_niov--;
+                        LASSERT (src_ni == lp->lp_ni);
+                        lnet_ni_decref_locked(src_ni);
                 }
 
-                mlen -= fraglen;
-        } while (mlen > 0);
+                lnet_peer_addref_locked(lp);
 
-        lib_finalize(nal, private, libmsg, PTL_OK);
-        return PTL_OK;
-}
+                LASSERT (src_nid != LNET_NID_ANY);
 
-ptl_err_t
-lib_lo_txiov (lib_nal_t    *nal,
-              void         *private,
-              lib_msg_t    *libmsg,
-              ptl_hdr_t    *hdr,
-              int           type,
-              ptl_nid_t     nid,
-              ptl_pid_t     pid,
-              unsigned int  payload_niov,
-              struct iovec *payload_iov,
-              size_t        payload_offset,
-              size_t        payload_nob)
-{
-        lo_desc_t lod = {
-                .lod_type    = LOD_IOV,
-                .lod_niov    = payload_niov,
-                .lod_offset  = payload_offset,
-                .lod_nob     = payload_nob,
-                .lod_iov     = { .iov = payload_iov } };
-        ptl_err_t rc;
-
-        rc = do_lib_parse(nal, hdr, &lod, 1);
-        if (rc == PTL_OK)
-                lib_finalize(nal, private, libmsg, PTL_OK);
+                if (!msg->msg_routing) {
+                        /* I'm the source and now I know which NI to send on */
+                        src_nid = lnet_ptlcompat_srcnid(src_nid, dst_nid);
+                        msg->msg_hdr.src_nid = cpu_to_le64(src_nid);
+                }
 
-        return rc;
+                msg->msg_target_is_router = 1;
+                msg->msg_target.nid = lp->lp_nid;
+                msg->msg_target.pid = LUSTRE_SRV_LNET_PID;
+        }
+
+        /* 'lp' is our best choice of peer */
+
+        LASSERT (!msg->msg_peertxcredit);
+        LASSERT (!msg->msg_txcredit);
+        LASSERT (msg->msg_txpeer == NULL);
+
+        msg->msg_txpeer = lp;                   /* msg takes my ref on lp */
+
+        rc = lnet_post_send_locked(msg, 0);
+        LNET_UNLOCK();
+
+        if (rc == 0)
+                lnet_ni_send(src_ni, msg);
+
+        return 0;
 }
 
-ptl_err_t
-lib_lo_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
-             ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen)
+static void
+lnet_commit_md (lnet_libmd_t *md, lnet_msg_t *msg)
 {
-        if (mlen == 0) {
-                lib_finalize(nal, private, msg, PTL_OK);
-                return PTL_OK;
+        /* ALWAYS called holding the LNET_LOCK */
+        /* Here, we commit the MD to a network OP by marking it busy and
+         * decrementing its threshold.  Come what may, the network "owns"
+         * the MD until a call to lnet_finalize() signals completion. */
+        LASSERT (!msg->msg_routing);
+
+        msg->msg_md = md;
+
+        md->md_refcount++;
+        if (md->md_threshold != LNET_MD_THRESH_INF) {
+                LASSERT (md->md_threshold > 0);
+                md->md_threshold--;
         }
 
-        if ((md->options & PTL_MD_KIOV) == 0)
-                return lib_lo_rxiov(nal, private, msg,
-                                    md->md_niov, md->md_iov.iov,
-                                    offset, mlen, rlen);
+        the_lnet.ln_counters.msgs_alloc++;
+        if (the_lnet.ln_counters.msgs_alloc > 
+            the_lnet.ln_counters.msgs_max)
+                the_lnet.ln_counters.msgs_max = 
+                        the_lnet.ln_counters.msgs_alloc;
 
-        return lib_lo_rxkiov(nal, private, msg,
-                             md->md_niov, md->md_iov.kiov,
-                             offset, mlen, rlen);
+        LASSERT (!msg->msg_onactivelist);
+        msg->msg_onactivelist = 1;
+        list_add (&msg->msg_activelist, &the_lnet.ln_active_msgs);
 }
 
-ptl_err_t
-lib_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md,
-          ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen)
+static void
+lnet_drop_message (lnet_ni_t *ni, void *private, unsigned int nob)
 {
-        if (mlen == 0)
-                return (nal->libnal_recv(nal, private, msg,
-                                         0, NULL,
-                                         offset, mlen, rlen));
-
-        if ((md->options & PTL_MD_KIOV) == 0)
-                return (nal->libnal_recv(nal, private, msg,
-                                         md->md_niov, md->md_iov.iov,
-                                         offset, mlen, rlen));
-
-        return (nal->libnal_recv_pages(nal, private, msg,
-                                       md->md_niov, md->md_iov.kiov,
-                                       offset, mlen, rlen));
+        LNET_LOCK();
+        the_lnet.ln_counters.drop_count++;
+        the_lnet.ln_counters.drop_length += nob;
+        LNET_UNLOCK();
+        
+        lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob);
 }
 
-ptl_err_t
-lib_send (lib_nal_t *nal, void *private, lib_msg_t *msg,
-          ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
-          lib_md_t *md, ptl_size_t offset, ptl_size_t len)
+static void
+lnet_drop_delayed_put(lnet_msg_t *msg, char *reason) 
 {
-        int loopback = (nal->libnal_ni.ni_loopback &&
-                        (nid == nal->libnal_ni.ni_pid.nid));
-
-        if (len == 0) {
-                if (loopback)
-                        return lib_lo_txiov(nal, private, msg,
-                                            hdr, type, nid, pid,
-                                            0, NULL,
-                                            offset, len);
-                else
-                        return nal->libnal_send(nal, private, msg,
-                                                hdr, type, nid, pid,
-                                                0, NULL,
-                                                offset, len);
-        }
-
-        if ((md->options & PTL_MD_KIOV) == 0) {
-                if (loopback)
-                        return lib_lo_txiov(nal, private, msg,
-                                            hdr, type, nid, pid,
-                                            md->md_niov, md->md_iov.iov,
-                                            offset, len);
-                else
-                        return nal->libnal_send(nal, private, msg,
-                                                hdr, type, nid, pid,
-                                                md->md_niov, md->md_iov.iov,
-                                                offset, len);
-        }
-
-        if (loopback)
-                return lib_lo_txkiov(nal, private, msg,
-                                     hdr, type, nid, pid,
-                                     md->md_niov, md->md_iov.kiov,
-                                     offset, len);
-        else
-                return nal->libnal_send_pages(nal, private, msg,
-                                              hdr, type, nid, pid,
-                                              md->md_niov, md->md_iov.kiov,
-                                              offset, len);
+        LASSERT (msg->msg_md == NULL);
+        LASSERT (msg->msg_delayed);
+        LASSERT (msg->msg_rxpeer != NULL);
+        LASSERT (msg->msg_hdr.type == LNET_MSG_PUT);
+
+        CWARN("Dropping delayed PUT from %s portal %d match "LPU64
+              " offset %d length %d: %s\n", 
+              libcfs_id2str((lnet_process_id_t){        
+                      .nid = msg->msg_hdr.src_nid,
+                      .pid = msg->msg_hdr.src_pid}),
+              msg->msg_hdr.msg.put.ptl_index, 
+              msg->msg_hdr.msg.put.match_bits, 
+              msg->msg_hdr.msg.put.offset,
+              msg->msg_hdr.payload_length,
+              reason);
+
+        /* NB I can't drop msg's ref on msg_rxpeer until after I've
+         * called lnet_drop_message(), so I just hang onto msg as well
+         * until that's done */
+
+        lnet_drop_message(msg->msg_rxpeer->lp_ni, 
+                          msg->msg_private, msg->msg_len);
+
+        LNET_LOCK();
+
+        lnet_peer_decref_locked(msg->msg_rxpeer);
+        msg->msg_rxpeer = NULL;
+                
+        lnet_msg_free(msg);
+                
+        LNET_UNLOCK();
 }
 
-static void
-lib_commit_md (lib_nal_t *nal, lib_md_t *md, lib_msg_t *msg)
+int
+LNetSetLazyPortal(int portal)
 {
-        /* ALWAYS called holding the LIB_LOCK */
-        lib_counters_t *counters = &nal->libnal_ni.ni_counters;
+        lnet_portal_t *ptl = &the_lnet.ln_portals[portal];
 
-        /* Here, we commit the MD to a network OP by marking it busy and
-         * decrementing its threshold.  Come what may, the network "owns"
-         * the MD until a call to lib_finalize() signals completion. */
-        msg->md = md;
+        if (portal < 0 || portal >= the_lnet.ln_nportals)
+                return -EINVAL;
+
+        CDEBUG(D_NET, "Setting portal %d lazy\n", portal);
+
+        LNET_LOCK();
+
+        ptl->ptl_options |= LNET_PTL_LAZY;
+
+        LNET_UNLOCK();
+
+        return 0;
+}
+
+int
+LNetClearLazyPortal(int portal)
+{
+        struct list_head  zombies;
+        lnet_portal_t    *ptl = &the_lnet.ln_portals[portal];
+        lnet_msg_t       *msg;
+
+        if (portal < 0 || portal >= the_lnet.ln_nportals)
+                return -EINVAL;
+
+        LNET_LOCK();
 
-        md->pending++;
-        if (md->threshold != PTL_MD_THRESH_INF) {
-                LASSERT (md->threshold > 0);
-                md->threshold--;
+        if ((ptl->ptl_options & LNET_PTL_LAZY) == 0) {
+                LNET_UNLOCK();
+                return 0;
         }
 
-        counters->msgs_alloc++;
-        if (counters->msgs_alloc > counters->msgs_max)
-                counters->msgs_max = counters->msgs_alloc;
+        CDEBUG(D_NET, "clearing portal %d lazy\n", portal);
 
-        list_add (&msg->msg_list, &nal->libnal_ni.ni_active_msgs);
+        /* grab all the blocked messages atomically */
+        list_add(&zombies, &ptl->ptl_msgq);
+        list_del_init(&ptl->ptl_msgq);
+
+        ptl->ptl_msgq_version++;
+        ptl->ptl_options &= ~LNET_PTL_LAZY;
+
+        LNET_UNLOCK();
+        
+        while (!list_empty(&zombies)) {
+                msg = list_entry(zombies.next, lnet_msg_t, msg_list);
+                list_del(&msg->msg_list);
+
+                lnet_drop_delayed_put(msg, "Clearing lazy portal attr");
+        }
+
+        return 0;
 }
 
 static void
-lib_drop_message (lib_nal_t *nal, void *private, ptl_hdr_t *hdr, int loopback)
+lnet_recv_put(lnet_libmd_t *md, lnet_msg_t *msg, int delayed,
+              unsigned int offset, unsigned int mlength)
 {
-        unsigned long flags;
+        lnet_hdr_t       *hdr = &msg->msg_hdr;
+
+        LNET_LOCK();
+
+        the_lnet.ln_counters.recv_count++;
+        the_lnet.ln_counters.recv_length += mlength;
 
-        /* CAVEAT EMPTOR: this only drops messages that we've not committed
-         * to receive (init_msg() not called) and therefore can't cause an
-         * event. */
+        LNET_UNLOCK();
 
-        LIB_LOCK(nal, flags);
-        nal->libnal_ni.ni_counters.drop_count++;
-        nal->libnal_ni.ni_counters.drop_length += hdr->payload_length;
-        LIB_UNLOCK(nal, flags);
+        if (mlength != 0)
+                lnet_setpayloadbuffer(msg);
 
-        /* NULL msg => if NAL calls lib_finalize it will be a noop */
-        if (!loopback)
-                (void) lib_recv(nal, private, NULL, NULL, 0, 0,
-                                hdr->payload_length);
+        msg->msg_ev.type       = LNET_EVENT_PUT;
+        msg->msg_ev.target.pid = hdr->dest_pid;
+        msg->msg_ev.target.nid = hdr->dest_nid;
+        msg->msg_ev.hdr_data   = hdr->msg.put.hdr_data;
+
+        /* Must I ACK?  If so I'll grab the ack_wmd out of the header and put
+         * it back into the ACK during lnet_finalize() */
+        msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
+                        (md->md_options & LNET_MD_ACK_DISABLE) == 0);
+        
+        lnet_ni_recv(msg->msg_rxpeer->lp_ni, 
+                     msg->msg_private, 
+                     msg, delayed, offset, mlength, 
+                     hdr->payload_length);
 }
 
-/*
- * Incoming messages have a ptl_msg_t object associated with them
- * by the library.  This object encapsulates the state of the
- * message and allows the NAL to do non-blocking receives or sends
- * of long messages.
- *
- */
-static ptl_err_t
-parse_put(lib_nal_t *nal, ptl_hdr_t *hdr, void *private,
-          lib_msg_t *msg, int loopback)
+/* called with LNET_LOCK held */
+void
+lnet_match_blocked_msg(lnet_libmd_t *md)
 {
-        lib_ni_t        *ni = &nal->libnal_ni;
-        ptl_size_t       mlength = 0;
-        ptl_size_t       offset = 0;
-        ptl_err_t        rc;
-        lib_md_t        *md;
-        unsigned long    flags;
+        CFS_LIST_HEAD    (drops);
+        CFS_LIST_HEAD    (matches);
+        struct list_head *tmp;
+        struct list_head *entry;
+        lnet_msg_t       *msg;
+        lnet_me_t        *me  = md->md_me;
+        lnet_portal_t    *ptl = &the_lnet.ln_portals[me->me_portal];
 
-        /* Convert put fields to host byte order */
-        hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits);
-        hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index);
-        hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset);
+        LASSERT (me->me_portal < the_lnet.ln_nportals);
+
+        if ((ptl->ptl_options & LNET_PTL_LAZY) == 0) {
+                LASSERT (list_empty(&ptl->ptl_msgq));
+                return;
+        }
+
+        LASSERT (md->md_refcount == 0); /* a brand new MD */
+
+        list_for_each_safe (entry, tmp, &ptl->ptl_msgq) {
+                int               rc;
+                int               index;
+                unsigned int      mlength;
+                unsigned int      offset;
+                lnet_hdr_t       *hdr;
+                lnet_process_id_t src;
+
+                msg = list_entry(entry, lnet_msg_t, msg_list);
+
+                LASSERT (msg->msg_delayed);
+
+                hdr   = &msg->msg_hdr;
+                index = hdr->msg.put.ptl_index;
+
+                src.nid = hdr->src_nid;
+                src.pid = hdr->src_pid;
+
+                rc = lnet_try_match_md(index, LNET_MD_OP_PUT, src,
+                                       hdr->payload_length, 
+                                       hdr->msg.put.offset, 
+                                       hdr->msg.put.match_bits, 
+                                       md, msg, &mlength, &offset);
+
+                if (rc == LNET_MATCHMD_NONE)
+                        continue;
+                
+                /* Hurrah! This _is_ a match */
+                list_del(&msg->msg_list); 
+                ptl->ptl_msgq_version++;
+
+                if (rc == LNET_MATCHMD_OK) {
+                        list_add_tail(&msg->msg_list, &matches);
+
+                        CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d "
+                               "match "LPU64" offset %d length %d.\n",
+                               libcfs_id2str(src),
+                               hdr->msg.put.ptl_index, 
+                               hdr->msg.put.match_bits, 
+                               hdr->msg.put.offset,
+                               hdr->payload_length);
+                } else {
+                        LASSERT (rc == LNET_MATCHMD_DROP);
 
-        LIB_LOCK(nal, flags);
+                        list_add_tail(&msg->msg_list, &drops);
+                }
 
-        md = lib_match_md(nal, hdr->msg.put.ptl_index, PTL_MD_OP_PUT,
-                          hdr->src_nid, hdr->src_pid,
-                          hdr->payload_length, hdr->msg.put.offset,
-                          hdr->msg.put.match_bits, msg,
-                          &mlength, &offset);
-        if (md == NULL) {
-                LIB_UNLOCK(nal, flags);
-                return (PTL_FAIL);
+                if (lnet_md_exhausted(md))
+                        break;
         }
 
-        msg->ev.type = PTL_EVENT_PUT_END;
-        msg->ev.hdr_data = hdr->msg.put.hdr_data;
+        LNET_UNLOCK();
 
-        if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) &&
-            !(md->options & PTL_MD_ACK_DISABLE)) {
-                msg->ack_wmd = hdr->msg.put.ack_wmd;
+        list_for_each_safe (entry, tmp, &drops) {
+                msg = list_entry(entry, lnet_msg_t, msg_list);
+
+                list_del(&msg->msg_list); 
+
+                lnet_drop_delayed_put(msg, "Bad match");
         }
 
-        ni->ni_counters.recv_count++;
-        ni->ni_counters.recv_length += mlength;
+        list_for_each_safe (entry, tmp, &matches) {
+                msg = list_entry(entry, lnet_msg_t, msg_list);
 
-        LIB_UNLOCK(nal, flags);
+                list_del(&msg->msg_list); 
 
-        if (loopback)
-                rc = lib_lo_recv(nal, private, msg, md, offset, mlength,
-                                 hdr->payload_length);
-        else
-                rc = lib_recv(nal, private, msg, md, offset, mlength,
-                              hdr->payload_length);
+                /* md won't disappear under me, since each msg
+                 * holds a ref on it */
+                lnet_recv_put(md, msg, 1,
+                              msg->msg_ev.offset,
+                              msg->msg_ev.mlength);
+        }
 
-        if (rc != PTL_OK)
-                CERROR(LPU64": error on receiving PUT from "LPU64": %d\n",
-                       ni->ni_pid.nid, hdr->src_nid, rc);
+        LNET_LOCK();
+}
 
-        return (rc);
+static int
+lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg)
+{
+        int               rc;
+        int               index;
+        lnet_hdr_t       *hdr = &msg->msg_hdr;
+        unsigned int      rlength = hdr->payload_length;
+        unsigned int      mlength = 0;
+        unsigned int      offset = 0;
+        lnet_process_id_t src = {/* .nid = */ hdr->src_nid,
+                                 /* .pid = */ hdr->src_pid};
+        lnet_libmd_t     *md;
+
+        /* Convert put fields to host byte order */
+        hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits);
+        hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index);
+        hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset);
+
+        index = hdr->msg.put.ptl_index;
+
+        LNET_LOCK();
+
+        rc = lnet_match_md(index, LNET_MD_OP_PUT, src,
+                           rlength, hdr->msg.put.offset,
+                           hdr->msg.put.match_bits, msg,
+                           &mlength, &offset, &md);
+        switch (rc) {
+        default:
+                LBUG();
+                
+        case LNET_MATCHMD_OK:
+                LNET_UNLOCK();
+                lnet_recv_put(md, msg, 0, offset, mlength);
+                return 0;
+                
+        case LNET_MATCHMD_NONE:
+                rc = lnet_eager_recv_locked(msg);
+                if (rc == 0) {
+                        list_add_tail(&msg->msg_list, 
+                                      &the_lnet.ln_portals[index].ptl_msgq);
+
+                        the_lnet.ln_portals[index].ptl_msgq_version++;
+
+                        CDEBUG(D_NET, "Delaying PUT from %s portal %d match "
+                               LPU64" offset %d length %d: no match \n",
+                               libcfs_id2str(src), index, 
+                               hdr->msg.put.match_bits, 
+                               hdr->msg.put.offset, rlength);
+                        
+                        LNET_UNLOCK();
+                        return 0;
+                }
+                /* fall through */
+
+        case LNET_MATCHMD_DROP:
+                CWARN("Dropping PUT from %s portal %d match "LPU64
+                      " offset %d length %d: %d\n",
+                      libcfs_id2str(src), index, 
+                      hdr->msg.put.match_bits, 
+                      hdr->msg.put.offset, rlength, rc);
+                LNET_UNLOCK();
+
+                return ENOENT;          /* +ve: OK but no match */
+
+        }
 }
 
-static ptl_err_t
-parse_get(lib_nal_t *nal, ptl_hdr_t *hdr, void *private,
-          lib_msg_t *msg, int loopback)
+static int
+lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get)
 {
-        lib_ni_t        *ni = &nal->libnal_ni;
-        ptl_size_t       mlength = 0;
-        ptl_size_t       offset = 0;
-        lib_md_t        *md;
-        ptl_hdr_t        reply;
-        unsigned long    flags;
-        int              rc;
+        lnet_hdr_t        *hdr = &msg->msg_hdr;
+        unsigned int       mlength = 0;
+        unsigned int       offset = 0;
+        lnet_process_id_t  src = {/* .nid = */ hdr->src_nid,
+                                  /* .pid = */ hdr->src_pid};
+        lnet_handle_wire_t reply_wmd;
+        lnet_libmd_t      *md;
+        int                rc;
 
         /* Convert get fields to host byte order */
         hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits);
@@ -1030,221 +1667,228 @@ parse_get(lib_nal_t *nal, ptl_hdr_t *hdr, void *private,
         hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length);
         hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset);
 
-        LIB_LOCK(nal, flags);
-
-        md = lib_match_md(nal, hdr->msg.get.ptl_index, PTL_MD_OP_GET,
-                          hdr->src_nid, hdr->src_pid,
-                          hdr->msg.get.sink_length, hdr->msg.get.src_offset,
-                          hdr->msg.get.match_bits, msg,
-                          &mlength, &offset);
-        if (md == NULL) {
-                LIB_UNLOCK(nal, flags);
-                return (PTL_FAIL);
+        LNET_LOCK();
+
+        rc = lnet_match_md(hdr->msg.get.ptl_index, LNET_MD_OP_GET, src,
+                           hdr->msg.get.sink_length, hdr->msg.get.src_offset,
+                           hdr->msg.get.match_bits, msg,
+                           &mlength, &offset, &md);
+        if (rc == LNET_MATCHMD_DROP) {
+                CWARN("Dropping GET from %s portal %d match "LPU64
+                      " offset %d length %d\n",
+                      libcfs_id2str(src), 
+                      hdr->msg.get.ptl_index, 
+                      hdr->msg.get.match_bits, 
+                      hdr->msg.get.src_offset,
+                      hdr->msg.get.sink_length);
+                LNET_UNLOCK();
+                return ENOENT;                  /* +ve: OK but no match */
         }
 
-        msg->ev.type = PTL_EVENT_GET_END;
-        msg->ev.hdr_data = 0;
+        LASSERT (rc == LNET_MATCHMD_OK);
+                
+        the_lnet.ln_counters.send_count++;
+        the_lnet.ln_counters.send_length += mlength;
+
+        LNET_UNLOCK();
 
-        ni->ni_counters.send_count++;
-        ni->ni_counters.send_length += mlength;
+        reply_wmd = hdr->msg.get.return_wmd;
 
-        LIB_UNLOCK(nal, flags);
+        lnet_prep_send(msg, LNET_MSG_REPLY, src, offset, mlength);
 
-        memset (&reply, 0, sizeof (reply));
-        reply.type     = cpu_to_le32(PTL_MSG_REPLY);
-        reply.dest_nid = cpu_to_le64(hdr->src_nid);
-        reply.dest_pid = cpu_to_le32(hdr->src_pid);
-        reply.src_nid  = cpu_to_le64(ni->ni_pid.nid);
-        reply.src_pid  = cpu_to_le32(ni->ni_pid.pid);
-        reply.payload_length = cpu_to_le32(mlength);
+        msg->msg_hdr.msg.reply.dst_wmd = reply_wmd;
 
-        reply.msg.reply.dst_wmd = hdr->msg.get.return_wmd;
+        msg->msg_ev.type = LNET_EVENT_GET;
+        msg->msg_ev.target.pid = hdr->dest_pid;
+        msg->msg_ev.target.nid = hdr->dest_nid;
+        msg->msg_ev.hdr_data = 0;
 
-        /* NB call lib_send() _BEFORE_ lib_recv() completes the incoming
-         * message.  Some NALs _require_ this to implement optimized GET */
+        if (rdma_get) {
+                /* The LND completes the REPLY from her recv procedure */
+                lnet_ni_recv(ni, msg->msg_private, msg, 0,
+                             msg->msg_offset, msg->msg_len, msg->msg_len);
+                return 0;
+        }
 
-        rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY,
-                       hdr->src_nid, hdr->src_pid, md, offset, mlength);
-        if (rc != PTL_OK)
-                CERROR(LPU64": Unable to send REPLY for GET from "LPU64": %d\n",
-                       ni->ni_pid.nid, hdr->src_nid, rc);
+        lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0);
+        msg->msg_receiving = 0;
+                             
+        rc = lnet_send(ni->ni_nid, msg);
+        if (rc < 0) {
+                /* didn't get as far as lnet_ni_send() */
+                CERROR("%s: Unable to send REPLY for GET from %s: %d\n",
+                       libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), rc);
 
-        /* Discard any junk after the hdr */
-        if (!loopback)
-                (void) lib_recv(nal, private, NULL, NULL, 0, 0,
-                                hdr->payload_length);
+                lnet_finalize(ni, msg, rc);
+        }
 
-        return (rc);
+        return 0;
 }
 
-static ptl_err_t
-parse_reply(lib_nal_t *nal, ptl_hdr_t *hdr, void *private,
-            lib_msg_t *msg, int loopback)
+static int
+lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg)
 {
-        lib_ni_t        *ni = &nal->libnal_ni;
-        lib_md_t        *md;
-        int              rlength;
-        int              length;
-        unsigned long    flags;
-        ptl_err_t        rc;
+        void             *private = msg->msg_private;
+        lnet_hdr_t       *hdr = &msg->msg_hdr;
+        lnet_process_id_t src = {/* .nid = */ hdr->src_nid,
+                                 /* .pid = */ hdr->src_pid};
+        lnet_libmd_t     *md;
+        int               rlength;
+        int               mlength;
 
-        LIB_LOCK(nal, flags);
+        LNET_LOCK();
 
         /* NB handles only looked up by creator (no flips) */
-        md = ptl_wire_handle2md(&hdr->msg.reply.dst_wmd, nal);
-        if (md == NULL || md->threshold == 0) {
-                CERROR (LPU64": Dropping REPLY from "LPU64" for %s MD "LPX64"."LPX64"\n",
-                        ni->ni_pid.nid, hdr->src_nid,
-                        md == NULL ? "invalid" : "inactive",
-                        hdr->msg.reply.dst_wmd.wh_interface_cookie,
-                        hdr->msg.reply.dst_wmd.wh_object_cookie);
-
-                LIB_UNLOCK(nal, flags);
-                return (PTL_FAIL);
-        }
-
-        LASSERT (md->offset == 0);
-
-        length = rlength = hdr->payload_length;
-
-        if (length > md->length) {
-                if ((md->options & PTL_MD_TRUNCATE) == 0) {
-                        CERROR (LPU64": Dropping REPLY from "LPU64
-                                " length %d for MD "LPX64" would overflow (%d)\n",
-                                ni->ni_pid.nid, hdr->src_nid, length,
-                                hdr->msg.reply.dst_wmd.wh_object_cookie,
-                                md->length);
-                        LIB_UNLOCK(nal, flags);
-                        return (PTL_FAIL);
-                }
-                length = md->length;
+        md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd);
+        if (md == NULL || md->md_threshold == 0) {
+                CWARN("%s: Dropping REPLY from %s for %s "
+                      "MD "LPX64"."LPX64"\n", 
+                      libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+                      (md == NULL) ? "invalid" : "inactive",
+                      hdr->msg.reply.dst_wmd.wh_interface_cookie,
+                      hdr->msg.reply.dst_wmd.wh_object_cookie);
+
+                LNET_UNLOCK();
+                return ENOENT;                  /* +ve: OK but no match */
         }
 
-        CDEBUG(D_NET, "Reply from "LPU64" of length %d/%d into md "LPX64"\n",
-               hdr->src_nid, length, rlength,
-               hdr->msg.reply.dst_wmd.wh_object_cookie);
+        LASSERT (md->md_offset == 0);
 
-        lib_commit_md(nal, md, msg);
+        rlength = hdr->payload_length;
+        mlength = MIN(rlength, md->md_length);
 
-        msg->ev.type = PTL_EVENT_REPLY_END;
-        msg->ev.initiator.nid = hdr->src_nid;
-        msg->ev.initiator.pid = hdr->src_pid;
-        msg->ev.rlength = rlength;
-        msg->ev.mlength = length;
-        msg->ev.offset = 0;
+        if (mlength < rlength &&
+            (md->md_options & LNET_MD_TRUNCATE) == 0) {
+                CERROR ("%s: Dropping REPLY from %s length %d "
+                        "for MD "LPX64" would overflow (%d)\n",
+                        libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+                        rlength, hdr->msg.reply.dst_wmd.wh_object_cookie,
+                        mlength);
+                LNET_UNLOCK();
+                return ENOENT;          /* +ve: OK but no match */
+        }
 
-        lib_md_deconstruct(nal, md, &msg->ev.md);
-        ptl_md2handle(&msg->ev.md_handle, nal, md);
+        CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md "LPX64"\n",
+               libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), 
+               mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie);
 
-        ni->ni_counters.recv_count++;
-        ni->ni_counters.recv_length += length;
+        lnet_commit_md(md, msg);
 
-        LIB_UNLOCK(nal, flags);
+        if (mlength != 0)
+                lnet_setpayloadbuffer(msg);
 
-        if (loopback)
-                rc = lib_lo_recv(nal, private, msg, md, 0, length, rlength);
-        else
-                rc = lib_recv(nal, private, msg, md, 0, length, rlength);
+        msg->msg_ev.type = LNET_EVENT_REPLY;
+        msg->msg_ev.target.pid = hdr->dest_pid;
+        msg->msg_ev.target.nid = hdr->dest_nid;
+        msg->msg_ev.initiator = src;
+        msg->msg_ev.rlength = rlength;
+        msg->msg_ev.mlength = mlength;
+        msg->msg_ev.offset = 0;
 
-        if (rc != PTL_OK)
-                CERROR(LPU64": error on receiving REPLY from "LPU64": %d\n",
-                       ni->ni_pid.nid, hdr->src_nid, rc);
+        lnet_md_deconstruct(md, &msg->msg_ev.md);
+        lnet_md2handle(&msg->msg_ev.md_handle, md);
 
-        return (rc);
+        the_lnet.ln_counters.recv_count++;
+        the_lnet.ln_counters.recv_length += mlength;
+
+        LNET_UNLOCK();
+
+        lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength);
+        return 0;
 }
 
-static ptl_err_t
-parse_ack(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, 
-          lib_msg_t *msg, int loopback)
+static int
+lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg)
 {
-        lib_ni_t      *ni = &nal->libnal_ni;
-        lib_md_t      *md;
-        unsigned long  flags;
+        lnet_hdr_t       *hdr = &msg->msg_hdr;
+        lnet_process_id_t src = {/* .nid = */ hdr->src_nid,
+                                 /* .pid = */ hdr->src_pid};
+        lnet_libmd_t    *md;
 
         /* Convert ack fields to host byte order */
         hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits);
         hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength);
 
-        LIB_LOCK(nal, flags);
+        LNET_LOCK();
 
         /* NB handles only looked up by creator (no flips) */
-        md = ptl_wire_handle2md(&hdr->msg.ack.dst_wmd, nal);
-        if (md == NULL || md->threshold == 0) {
-                CDEBUG(D_INFO, LPU64": Dropping ACK from "LPU64" to %s MD "
-                       LPX64"."LPX64"\n", ni->ni_pid.nid, hdr->src_nid,
-                       (md == NULL) ? "invalid" : "inactive",
-                       hdr->msg.ack.dst_wmd.wh_interface_cookie,
-                       hdr->msg.ack.dst_wmd.wh_object_cookie);
-
-                LIB_UNLOCK(nal, flags);
-                return (PTL_FAIL);
+        md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd);
+        if (md == NULL || md->md_threshold == 0) {
+#if 0
+                /* Don't moan; this is expected */
+                CERROR ("%s: Dropping ACK from %s to %s MD "LPX64"."LPX64"\n",
+                        libcfs_nid2str(ni->ni_nid), libcfs_id2str(src),
+                        (md == NULL) ? "invalid" : "inactive",
+                        hdr->msg.ack.dst_wmd.wh_interface_cookie,
+                        hdr->msg.ack.dst_wmd.wh_object_cookie);
+#endif
+                LNET_UNLOCK();
+                return ENOENT;                  /* +ve! */
         }
 
-        CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n",
-               ni->ni_pid.nid, hdr->src_nid,
+        CDEBUG(D_NET, "%s: ACK from %s into md "LPX64"\n",
+               libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), 
                hdr->msg.ack.dst_wmd.wh_object_cookie);
 
-        lib_commit_md(nal, md, msg);
-
-        msg->ev.type = PTL_EVENT_ACK;
-        msg->ev.initiator.nid = hdr->src_nid;
-        msg->ev.initiator.pid = hdr->src_pid;
-        msg->ev.mlength = hdr->msg.ack.mlength;
-        msg->ev.match_bits = hdr->msg.ack.match_bits;
-
-        lib_md_deconstruct(nal, md, &msg->ev.md);
-        ptl_md2handle(&msg->ev.md_handle, nal, md);
+        lnet_commit_md(md, msg);
 
-        ni->ni_counters.recv_count++;
+        msg->msg_ev.type = LNET_EVENT_ACK;
+        msg->msg_ev.target.pid = hdr->dest_pid;
+        msg->msg_ev.target.nid = hdr->dest_nid;
+        msg->msg_ev.initiator = src;
+        msg->msg_ev.mlength = hdr->msg.ack.mlength;
+        msg->msg_ev.match_bits = hdr->msg.ack.match_bits;
 
-        LIB_UNLOCK(nal, flags);
+        lnet_md_deconstruct(md, &msg->msg_ev.md);
+        lnet_md2handle(&msg->msg_ev.md_handle, md);
 
-        /* We have received and matched up the ack OK, create the
-         * completion event now... */
-        lib_finalize(nal, private, msg, PTL_OK);
+        the_lnet.ln_counters.recv_count++;
 
-        /* ...and now discard any junk after the hdr */
-        if (!loopback)
-                (void) lib_recv(nal, private, NULL, NULL, 0, 0,
-                                hdr->payload_length);
+        LNET_UNLOCK();
 
-       return (PTL_OK);
+        lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len);
+        return 0;
 }
 
-static char *
-hdr_type_string (ptl_hdr_t *hdr)
+char *
+lnet_msgtyp2str (int type)
 {
-        switch (hdr->type) {
-        case PTL_MSG_ACK:
+        switch (type) {
+        case LNET_MSG_ACK:
                 return ("ACK");
-        case PTL_MSG_PUT:
+        case LNET_MSG_PUT:
                 return ("PUT");
-        case PTL_MSG_GET:
+        case LNET_MSG_GET:
                 return ("GET");
-        case PTL_MSG_REPLY:
+        case LNET_MSG_REPLY:
                 return ("REPLY");
-        case PTL_MSG_HELLO:
+        case LNET_MSG_HELLO:
                 return ("HELLO");
         default:
                 return ("<UNKNOWN>");
         }
 }
 
-void print_hdr(lib_nal_t *nal, ptl_hdr_t * hdr)
+void
+lnet_print_hdr(lnet_hdr_t * hdr)
 {
-        char *type_str = hdr_type_string (hdr);
+        lnet_process_id_t src = {/* .nid = */ hdr->src_nid,
+                                 /* .pid = */ hdr->src_pid};
+        lnet_process_id_t dst = {/* .nid = */ hdr->dest_nid,
+                                 /* .pid = */ hdr->dest_pid};
+        char *type_str = lnet_msgtyp2str (hdr->type);
 
         CWARN("P3 Header at %p of type %s\n", hdr, type_str);
-        CWARN("    From nid/pid "LPX64"/%u", hdr->src_nid, hdr->src_pid);
-        CWARN("    To nid/pid "LPX64"/%u\n", hdr->dest_nid, hdr->dest_pid);
+        CWARN("    From %s\n", libcfs_id2str(src));
+        CWARN("    To   %s\n", libcfs_id2str(dst));
 
         switch (hdr->type) {
         default:
                 break;
 
-        case PTL_MSG_PUT:
+        case LNET_MSG_PUT:
                 CWARN("    Ptl index %d, ack md "LPX64"."LPX64", "
-                      "match bits "LPX64"\n",
+                      "match bits "LPU64"\n",
                       hdr->msg.put.ptl_index,
                       hdr->msg.put.ack_wmd.wh_interface_cookie,
                       hdr->msg.put.ack_wmd.wh_object_cookie,
@@ -1254,9 +1898,9 @@ void print_hdr(lib_nal_t *nal, ptl_hdr_t * hdr)
                       hdr->msg.put.hdr_data);
                 break;
 
-        case PTL_MSG_GET:
+        case LNET_MSG_GET:
                 CWARN("    Ptl index %d, return md "LPX64"."LPX64", "
-                      "match bits "LPX64"\n", hdr->msg.get.ptl_index,
+                      "match bits "LPU64"\n", hdr->msg.get.ptl_index,
                       hdr->msg.get.return_wmd.wh_interface_cookie,
                       hdr->msg.get.return_wmd.wh_object_cookie,
                       hdr->msg.get.match_bits);
@@ -1265,7 +1909,7 @@ void print_hdr(lib_nal_t *nal, ptl_hdr_t * hdr)
                       hdr->msg.get.src_offset);
                 break;
 
-        case PTL_MSG_ACK:
+        case LNET_MSG_ACK:
                 CWARN("    dst md "LPX64"."LPX64", "
                       "manipulated length %d\n",
                       hdr->msg.ack.dst_wmd.wh_interface_cookie,
@@ -1273,7 +1917,7 @@ void print_hdr(lib_nal_t *nal, ptl_hdr_t * hdr)
                       hdr->msg.ack.mlength);
                 break;
 
-        case PTL_MSG_REPLY:
+        case LNET_MSG_REPLY:
                 CWARN("    dst md "LPX64"."LPX64", "
                       "length %d\n",
                       hdr->msg.reply.dst_wmd.wh_interface_cookie,
@@ -1281,481 +1925,573 @@ void print_hdr(lib_nal_t *nal, ptl_hdr_t * hdr)
                       hdr->payload_length);
         }
 
-}                               /* end of print_hdr() */
+}
 
 
-ptl_err_t
-lib_parse(lib_nal_t *nal, ptl_hdr_t *hdr, void *private)
+int
+lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, 
+           void *private, int rdma_req)
 {
-        return do_lib_parse(nal, hdr, private, 0);
-}
+        int            rc = 0;
+        int            for_me;
+        lnet_msg_t    *msg;
+        lnet_nid_t     dest_nid;
+        lnet_nid_t     src_nid;
+        __u32          payload_length;
+        __u32          type;
 
-ptl_err_t
-do_lib_parse(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, int loopback)
-{
-        unsigned long  flags;
-        ptl_err_t      rc;
-        lib_msg_t     *msg;
+        LASSERT (!in_interrupt ());
 
-        /* NB we return PTL_OK if we manage to parse the header and believe
-         * it looks OK.  Anything that goes wrong with receiving the
-         * message after that point is the responsibility of the NAL */
+        type = le32_to_cpu(hdr->type);
+        src_nid = le64_to_cpu(hdr->src_nid);
+        dest_nid = le64_to_cpu(hdr->dest_nid);
+        payload_length = le32_to_cpu(hdr->payload_length);
+
+        for_me = lnet_ptlcompat_matchnid(ni->ni_nid, dest_nid);
+
+        switch (type) {
+        case LNET_MSG_ACK:
+        case LNET_MSG_GET:
+                if (payload_length > 0) {
+                        CERROR("%s, src %s: bad %s payload %d (0 expected)\n",
+                               libcfs_nid2str(from_nid), 
+                               libcfs_nid2str(src_nid), 
+                               lnet_msgtyp2str(type), payload_length);
+                        return -EPROTO;
+                }
+                break;
+                               
+        case LNET_MSG_PUT:
+        case LNET_MSG_REPLY:
+                if (payload_length > (for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) {
+                        CERROR("%s, src %s: bad %s payload %d "
+                               "(%d max expected)\n", 
+                               libcfs_nid2str(from_nid),
+                               libcfs_nid2str(src_nid), 
+                               lnet_msgtyp2str(type),
+                               payload_length,
+                               for_me ? LNET_MAX_PAYLOAD : LNET_MTU);
+                        return -EPROTO;
+                }
+                break;
 
-        /* convert common fields to host byte order */
-        hdr->type = le32_to_cpu(hdr->type);
-        hdr->src_nid = le64_to_cpu(hdr->src_nid);
-        hdr->src_pid = le32_to_cpu(hdr->src_pid);
-        hdr->dest_pid = le32_to_cpu(hdr->dest_pid);
-        hdr->payload_length = le32_to_cpu(hdr->payload_length);
+        default:
+                CERROR("%s, src %s: Bad message type 0x%x\n",
+                       libcfs_nid2str(from_nid),
+                       libcfs_nid2str(src_nid), type);
+                return -EPROTO;
+        }
 
-        switch (hdr->type) {
-        case PTL_MSG_HELLO: {
-                /* dest_nid is really ptl_magicversion_t */
-                ptl_magicversion_t *mv = (ptl_magicversion_t *)&hdr->dest_nid;
-
-                mv->magic = le32_to_cpu(mv->magic);
-                mv->version_major = le16_to_cpu(mv->version_major);
-                mv->version_minor = le16_to_cpu(mv->version_minor);
-
-                if (mv->magic == PORTALS_PROTO_MAGIC &&
-                    mv->version_major == PORTALS_PROTO_VERSION_MAJOR &&
-                    mv->version_minor == PORTALS_PROTO_VERSION_MINOR) {
-                        CWARN (LPU64": Dropping unexpected HELLO message: "
-                               "magic %d, version %d.%d from "LPD64"\n",
-                               nal->libnal_ni.ni_pid.nid, mv->magic,
-                               mv->version_major, mv->version_minor,
-                               hdr->src_nid);
-
-                        /* it's good but we don't want it */
-                        lib_drop_message(nal, private, hdr, loopback);
-                        return PTL_OK;
+        /* Regard a bad destination NID as a protocol error.  Senders should
+         * know what they're doing; if they don't they're misconfigured, buggy
+         * or malicious so we chop them off at the knees :) */
+
+        if (!for_me) {
+                if (the_lnet.ln_ptlcompat > 0) {
+                        /* portals compatibility is single-network */
+                        CERROR ("%s, src %s: Bad dest nid %s "
+                                "(routing not supported)\n",
+                                libcfs_nid2str(from_nid),
+                                libcfs_nid2str(src_nid),
+                                libcfs_nid2str(dest_nid));
+                        return -EPROTO;
                 }
 
-                /* we got garbage */
-                CERROR (LPU64": Bad HELLO message: "
-                        "magic %d, version %d.%d from "LPD64"\n",
-                        nal->libnal_ni.ni_pid.nid, mv->magic,
-                        mv->version_major, mv->version_minor,
-                        hdr->src_nid);
-                return PTL_FAIL;
-        }
-
-        case PTL_MSG_ACK:
-        case PTL_MSG_PUT:
-        case PTL_MSG_GET:
-        case PTL_MSG_REPLY:
-                hdr->dest_nid = le64_to_cpu(hdr->dest_nid);
-                if (hdr->dest_nid != nal->libnal_ni.ni_pid.nid) {
-                        CERROR(LPU64": BAD dest NID in %s message from"
-                               LPU64" to "LPU64" (not me)\n",
-                               nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr),
-                               hdr->src_nid, hdr->dest_nid);
-                        return PTL_FAIL;
+                if (the_lnet.ln_ptlcompat == 0 &&
+                    LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) {
+                        /* should have gone direct */
+                        CERROR ("%s, src %s: Bad dest nid %s "
+                                "(should have been sent direct)\n",
+                                libcfs_nid2str(from_nid),
+                                libcfs_nid2str(src_nid),
+                                libcfs_nid2str(dest_nid));
+                        return -EPROTO;
                 }
-                break;
 
-        default:
-                CERROR(LPU64": Bad message type 0x%x from "LPU64"\n",
-                       nal->libnal_ni.ni_pid.nid, hdr->type, hdr->src_nid);
-                return PTL_FAIL;
+                if (the_lnet.ln_ptlcompat == 0 &&
+                    lnet_islocalnid(dest_nid)) {
+                        /* dest is another local NI; sender should have used
+                         * this node's NID on its own network */
+                        CERROR ("%s, src %s: Bad dest nid %s "
+                                "(it's my nid but on a different network)\n",
+                                libcfs_nid2str(from_nid),
+                                libcfs_nid2str(src_nid),
+                                libcfs_nid2str(dest_nid));
+                        return -EPROTO;
+                }
+
+                if (rdma_req && type == LNET_MSG_GET) {
+                        CERROR ("%s, src %s: Bad optimized GET for %s "
+                                "(final destination must be me)\n",
+                                libcfs_nid2str(from_nid),
+                                libcfs_nid2str(src_nid),
+                                libcfs_nid2str(dest_nid));
+                        return -EPROTO;
+                }
+                
+                if (!the_lnet.ln_routing) {
+                        CERROR ("%s, src %s: Dropping message for %s "
+                                "(routing not enabled)\n",
+                                libcfs_nid2str(from_nid),
+                                libcfs_nid2str(src_nid),
+                                libcfs_nid2str(dest_nid));
+                        goto drop;
+                }
         }
 
-        /* We've decided we're not receiving garbage since we can parse the
-         * header.  We will return PTL_OK come what may... */
+        /* Message looks OK; we're not going to return an error, so we MUST
+         * call back lnd_recv() come what may... */
 
-        if (!list_empty (&nal->libnal_ni.ni_test_peers) && /* normally we don't */
-            fail_peer (nal, hdr->src_nid, 0))      /* shall we now? */
+        if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+            fail_peer (src_nid, 0))             /* shall we now? */
         {
-                CERROR(LPU64": Dropping incoming %s from "LPU64
-                       ": simulated failure\n",
-                       nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr),
-                       hdr->src_nid);
-                lib_drop_message(nal, private, hdr, loopback);
-                return PTL_OK;
+                CERROR("%s, src %s: Dropping %s to simulate failure\n",
+                       libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+                       lnet_msgtyp2str(type));
+                goto drop;
         }
 
-        msg = lib_msg_alloc(nal);
+        msg = lnet_msg_alloc();
         if (msg == NULL) {
-                CERROR(LPU64": Dropping incoming %s from "LPU64
-                       ": can't allocate a lib_msg_t\n",
-                       nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr),
-                       hdr->src_nid);
-                lib_drop_message(nal, private, hdr, loopback);
-                return PTL_OK;
+                CERROR("%s, src %s: Dropping %s (out of memory)\n",
+                       libcfs_nid2str(from_nid), libcfs_nid2str(src_nid)
+                       , lnet_msgtyp2str(type));
+                goto drop;
         }
 
-        switch (hdr->type) {
-        case PTL_MSG_ACK:
-                rc = parse_ack(nal, hdr, private, msg, loopback);
+        /* msg zeroed in lnet_msg_alloc; i.e. flags all clear, pointers NULL etc */
+        
+        msg->msg_type = type;
+        msg->msg_private = private;
+        msg->msg_receiving = 1;
+        msg->msg_len = msg->msg_wanted = payload_length;
+        msg->msg_offset = 0;
+        msg->msg_hdr = *hdr;
+
+        LNET_LOCK();
+        rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid);
+        if (rc != 0) {
+                LNET_UNLOCK();
+                CERROR("%s, src %s: Dropping %s "
+                       "(error %d looking up sender)\n",
+                       libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
+                       lnet_msgtyp2str(type), rc);
+                goto free_drop;
+        }
+        LNET_UNLOCK();
+
+#ifndef __KERNEL__
+        LASSERT (for_me);
+#else
+        if (!for_me) {
+                msg->msg_target.pid = le32_to_cpu(hdr->dest_pid);
+                msg->msg_target.nid = dest_nid;
+                msg->msg_routing = 1;
+                msg->msg_offset = 0;
+
+                LNET_LOCK();
+                if (msg->msg_rxpeer->lp_rtrcredits <= 0 ||
+                    lnet_msg2bufpool(msg)->rbp_credits <= 0) {
+                        rc = lnet_eager_recv_locked(msg);
+                        if (rc != 0) {
+                                LNET_UNLOCK();
+                                goto free_drop;
+                        }
+                }
+                
+                lnet_commit_routedmsg(msg);
+                rc = lnet_post_routed_recv_locked(msg, 0);
+                LNET_UNLOCK();
+
+                if (rc == 0)
+                        lnet_ni_recv(ni, msg->msg_private, msg, 0,
+                                     0, payload_length, payload_length);
+                return 0;
+        }
+#endif
+        /* convert common msg->hdr fields to host byteorder */
+        msg->msg_hdr.type = type;
+        msg->msg_hdr.src_nid = src_nid;
+        msg->msg_hdr.src_pid = le32_to_cpu(msg->msg_hdr.src_pid);
+        msg->msg_hdr.dest_nid = dest_nid;
+        msg->msg_hdr.dest_pid = le32_to_cpu(msg->msg_hdr.dest_pid);
+        msg->msg_hdr.payload_length = payload_length;
+        
+        switch (type) {
+        case LNET_MSG_ACK:
+                rc = lnet_parse_ack(ni, msg);
                 break;
-        case PTL_MSG_PUT:
-                rc = parse_put(nal, hdr, private, msg, loopback);
+        case LNET_MSG_PUT:
+                rc = lnet_parse_put(ni, msg);
                 break;
-        case PTL_MSG_GET:
-                rc = parse_get(nal, hdr, private, msg, loopback);
+        case LNET_MSG_GET:
+                rc = lnet_parse_get(ni, msg, rdma_req);
                 break;
-        case PTL_MSG_REPLY:
-                rc = parse_reply(nal, hdr, private, msg, loopback);
+        case LNET_MSG_REPLY:
+                rc = lnet_parse_reply(ni, msg);
                 break;
         default:
                 LASSERT(0);
-                rc = PTL_FAIL;                  /* no compiler warning please */
-                break;
+                goto free_drop;  /* prevent an unused label if !kernel */
         }
 
-        if (rc != PTL_OK) {
-                if (msg->md != NULL) {
-                        /* committed... */
-                        lib_finalize(nal, private, msg, rc);
-                } else {
-                        LIB_LOCK(nal, flags);
-                        lib_msg_free(nal, msg); /* expects LIB_LOCK held */
-                        LIB_UNLOCK(nal, flags);
-
-                        lib_drop_message(nal, private, hdr, loopback);
-                }
+        if (rc == 0)
+                return 0;
+        
+        LASSERT (rc == ENOENT);
+
+ free_drop:
+        LASSERT (msg->msg_md == NULL);
+        LNET_LOCK();
+        if (msg->msg_rxpeer != NULL) {
+                lnet_peer_decref_locked(msg->msg_rxpeer);
+                msg->msg_rxpeer = NULL;
         }
+        lnet_msg_free(msg);                     /* expects LNET_LOCK held */
+        LNET_UNLOCK();
 
-        return PTL_OK;
-        /* That's "OK I can parse it", not "OK I like it" :) */
+ drop:
+        lnet_drop_message(ni, private, payload_length);
+        return 0;
 }
 
 int
-lib_api_put(nal_t *apinal, ptl_handle_md_t *mdh,
-            ptl_ack_req_t ack, ptl_process_id_t *id,
-            ptl_pt_index_t portal, ptl_ac_index_t ac,
-            ptl_match_bits_t match_bits,
-            ptl_size_t offset, ptl_hdr_data_t hdr_data)
+LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack,
+        lnet_process_id_t target, unsigned int portal,
+        __u64 match_bits, unsigned int offset, 
+        __u64 hdr_data)
 {
-        lib_nal_t        *nal = apinal->nal_data;
-        lib_ni_t         *ni = &nal->libnal_ni;
-        lib_msg_t        *msg;
-        ptl_hdr_t         hdr;
-        lib_md_t         *md;
-        unsigned long     flags;
+        lnet_msg_t       *msg;
+        lnet_libmd_t     *md;
         int               rc;
 
-        if (!list_empty (&ni->ni_test_peers) && /* normally we don't */
-            fail_peer (nal, id->nid, 1))           /* shall we now? */
+        LASSERT (the_lnet.ln_init);
+        LASSERT (the_lnet.ln_refcount > 0);
+        
+        if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+            fail_peer (target.nid, 1))          /* shall we now? */
         {
-                CERROR("Dropping PUT to "LPU64": simulated failure\n",
-                       id->nid);
-                return PTL_PROCESS_INVALID;
+                CERROR("Dropping PUT to %s: simulated failure\n",
+                       libcfs_id2str(target));
+                return -EIO;
         }
 
-        msg = lib_msg_alloc(nal);
+        msg = lnet_msg_alloc();
         if (msg == NULL) {
-                CERROR(LPU64": Dropping PUT to "LPU64": ENOMEM on lib_msg_t\n",
-                       ni->ni_pid.nid, id->nid);
-                return PTL_NO_SPACE;
+                CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n",
+                       libcfs_id2str(target));
+                return -ENOMEM;
         }
 
-        LIB_LOCK(nal, flags);
+        LNET_LOCK();
 
-        md = ptl_handle2md(mdh, nal);
-        if (md == NULL || md->threshold == 0) {
-                lib_msg_free(nal, msg);
-                LIB_UNLOCK(nal, flags);
+        md = lnet_handle2md(&mdh);
+        if (md == NULL || md->md_threshold == 0) {
+                lnet_msg_free(msg);
+                LNET_UNLOCK();
 
-                return PTL_MD_INVALID;
+                CERROR("Dropping PUT to %s: MD invalid\n", 
+                       libcfs_id2str(target));
+                return -ENOENT;
         }
 
-        CDEBUG(D_NET, "PtlPut -> "LPX64"\n", id->nid);
+        CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target));
 
-        memset (&hdr, 0, sizeof (hdr));
-        hdr.type     = cpu_to_le32(PTL_MSG_PUT);
-        hdr.dest_nid = cpu_to_le64(id->nid);
-        hdr.dest_pid = cpu_to_le32(id->pid);
-        hdr.src_nid  = cpu_to_le64(ni->ni_pid.nid);
-        hdr.src_pid  = cpu_to_le32(ni->ni_pid.pid);
-        hdr.payload_length = cpu_to_le32(md->length);
+        lnet_commit_md(md, msg);
+
+        lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length);
+
+        msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits);
+        msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal);
+        msg->msg_hdr.msg.put.offset = cpu_to_le32(offset);
+        msg->msg_hdr.msg.put.hdr_data = hdr_data;
 
         /* NB handles only looked up by creator (no flips) */
-        if (ack == PTL_ACK_REQ) {
-                hdr.msg.put.ack_wmd.wh_interface_cookie = ni->ni_interface_cookie;
-                hdr.msg.put.ack_wmd.wh_object_cookie = md->md_lh.lh_cookie;
+        if (ack == LNET_ACK_REQ) {
+                msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = 
+                        the_lnet.ln_interface_cookie;
+                msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = 
+                        md->md_lh.lh_cookie;
         } else {
-                hdr.msg.put.ack_wmd = PTL_WIRE_HANDLE_NONE;
+                msg->msg_hdr.msg.put.ack_wmd = LNET_WIRE_HANDLE_NONE;
         }
 
-        hdr.msg.put.match_bits = cpu_to_le64(match_bits);
-        hdr.msg.put.ptl_index = cpu_to_le32(portal);
-        hdr.msg.put.offset = cpu_to_le32(offset);
-        hdr.msg.put.hdr_data = hdr_data;
-
-        lib_commit_md(nal, md, msg);
-
-        msg->ev.type = PTL_EVENT_SEND_END;
-        msg->ev.initiator.nid = ni->ni_pid.nid;
-        msg->ev.initiator.pid = ni->ni_pid.pid;
-        msg->ev.pt_index = portal;
-        msg->ev.match_bits = match_bits;
-        msg->ev.rlength = md->length;
-        msg->ev.mlength = md->length;
-        msg->ev.offset = offset;
-        msg->ev.hdr_data = hdr_data;
-
-        lib_md_deconstruct(nal, md, &msg->ev.md);
-        ptl_md2handle(&msg->ev.md_handle, nal, md);
-
-        ni->ni_counters.send_count++;
-        ni->ni_counters.send_length += md->length;
-
-        LIB_UNLOCK(nal, flags);
-
-        rc = lib_send (nal, NULL, msg, &hdr, PTL_MSG_PUT,
-                       id->nid, id->pid, md, 0, md->length);
-        if (rc != PTL_OK) {
-                CERROR("Error sending PUT to "LPX64": %d\n",
-                       id->nid, rc);
-                lib_finalize (nal, NULL, msg, rc);
+        msg->msg_ev.type = LNET_EVENT_SEND;
+        msg->msg_ev.initiator.nid = LNET_NID_ANY;
+        msg->msg_ev.initiator.pid = the_lnet.ln_pid;
+        msg->msg_ev.target = target;
+        msg->msg_ev.pt_index = portal;
+        msg->msg_ev.match_bits = match_bits;
+        msg->msg_ev.rlength = md->md_length;
+        msg->msg_ev.mlength = md->md_length;
+        msg->msg_ev.offset = offset;
+        msg->msg_ev.hdr_data = hdr_data;
+
+        lnet_md_deconstruct(md, &msg->msg_ev.md);
+        lnet_md2handle(&msg->msg_ev.md_handle, md);
+
+        the_lnet.ln_counters.send_count++;
+        the_lnet.ln_counters.send_length += md->md_length;
+
+        LNET_UNLOCK();
+
+        rc = lnet_send(self, msg);
+        if (rc != 0) {
+                CERROR("Error sending PUT to %s: %d\n",
+                       libcfs_id2str(target), rc);
+                lnet_finalize (NULL, msg, rc);
         }
 
         /* completion will be signalled by an event */
-        return PTL_OK;
+        return 0;
 }
 
-lib_msg_t *
-lib_create_reply_msg (lib_nal_t *nal, ptl_nid_t peer_nid, lib_msg_t *getmsg)
+lnet_msg_t *
+lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg)
 {
-        /* The NAL can DMA direct to the GET md (i.e. no REPLY msg).  This
-         * returns a msg for the NAL to pass to lib_finalize() when the sink
+        /* The LND can DMA direct to the GET md (i.e. no REPLY msg).  This
+         * returns a msg for the LND to pass to lnet_finalize() when the sink
          * data has been received.
          *
          * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when
-         * lib_finalize() is called on it, so the NAL must call this first */
+         * lnet_finalize() is called on it, so the LND must call this first */
+
+        lnet_msg_t        *msg = lnet_msg_alloc();
+        lnet_libmd_t      *getmd = getmsg->msg_md;
+        lnet_process_id_t  peer_id = getmsg->msg_target;
 
-        lib_ni_t        *ni = &nal->libnal_ni;
-        lib_msg_t       *msg = lib_msg_alloc(nal);
-        lib_md_t        *getmd = getmsg->md;
-        unsigned long    flags;
+        LASSERT (!getmsg->msg_target_is_router);
+        LASSERT (!getmsg->msg_routing);
 
-        LIB_LOCK(nal, flags);
+        LNET_LOCK();
 
-        LASSERT (getmd->pending > 0);
+        LASSERT (getmd->md_refcount > 0);
 
         if (msg == NULL) {
-                CERROR ("Dropping REPLY from "LPU64": can't allocate msg\n",
-                        peer_nid);
+                CERROR ("%s: Dropping REPLY from %s: can't allocate msg\n",
+                        libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id));
                 goto drop;
         }
 
-        if (getmd->threshold == 0) {
-                CERROR ("Dropping REPLY from "LPU64" for inactive MD %p\n",
-                        peer_nid, getmd);
+        if (getmd->md_threshold == 0) {
+                CERROR ("%s: Dropping REPLY from %s for inactive MD %p\n",
+                        libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), 
+                        getmd);
                 goto drop_msg;
         }
 
-        LASSERT (getmd->offset == 0);
+        LASSERT (getmd->md_offset == 0);
 
-        CDEBUG(D_NET, "Reply from "LPU64" md %p\n", peer_nid, getmd);
+        CDEBUG(D_NET, "%s: Reply from %s md %p\n", 
+               libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd);
 
-        lib_commit_md (nal, getmd, msg);
+        lnet_commit_md (getmd, msg);
 
-        msg->ev.type = PTL_EVENT_REPLY_END;
-        msg->ev.initiator.nid = peer_nid;
-        msg->ev.initiator.pid = 0;      /* XXX FIXME!!! */
-        msg->ev.rlength = msg->ev.mlength = getmd->length;
-        msg->ev.offset = 0;
+        msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */
 
-        lib_md_deconstruct(nal, getmd, &msg->ev.md);
-        ptl_md2handle(&msg->ev.md_handle, nal, getmd);
+        msg->msg_ev.type = LNET_EVENT_REPLY;
+        msg->msg_ev.initiator = peer_id;
+        msg->msg_ev.rlength = msg->msg_ev.mlength = getmd->md_length;
+        msg->msg_ev.offset = 0;
 
-        ni->ni_counters.recv_count++;
-        ni->ni_counters.recv_length += getmd->length;
+        lnet_md_deconstruct(getmd, &msg->msg_ev.md);
+        lnet_md2handle(&msg->msg_ev.md_handle, getmd);
 
-        LIB_UNLOCK(nal, flags);
+        the_lnet.ln_counters.recv_count++;
+        the_lnet.ln_counters.recv_length += getmd->md_length;
+
+        LNET_UNLOCK();
 
         return msg;
 
  drop_msg:
-        lib_msg_free(nal, msg);
+        lnet_msg_free(msg);
  drop:
-        nal->libnal_ni.ni_counters.drop_count++;
-        nal->libnal_ni.ni_counters.drop_length += getmd->length;
+        the_lnet.ln_counters.drop_count++;
+        the_lnet.ln_counters.drop_length += getmd->md_length;
 
-        LIB_UNLOCK (nal, flags);
+        LNET_UNLOCK ();
 
         return NULL;
 }
 
+void
+lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len)
+{
+        /* Set the REPLY length, now the RDMA that elides the REPLY message has
+         * completed and I know it. */
+        LASSERT (reply != NULL);
+        LASSERT (reply->msg_type == LNET_MSG_GET);
+        LASSERT (reply->msg_ev.type == LNET_EVENT_REPLY);
+
+        /* NB I trusted my peer to RDMA.  If she tells me she's written beyond
+         * the end of my buffer, I might as well be dead. */
+        LASSERT (len <= reply->msg_ev.mlength);
+        
+        reply->msg_ev.mlength = len;
+}
+
 int
-lib_api_get(nal_t *apinal, ptl_handle_md_t *mdh, ptl_process_id_t *id,
-            ptl_pt_index_t portal, ptl_ac_index_t ac,
-            ptl_match_bits_t match_bits, ptl_size_t offset)
+LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, 
+        lnet_process_id_t target, unsigned int portal, 
+        __u64 match_bits, unsigned int offset)
 {
-        lib_nal_t        *nal = apinal->nal_data;
-        lib_ni_t         *ni = &nal->libnal_ni;
-        lib_msg_t        *msg;
-        ptl_hdr_t         hdr;
-        lib_md_t         *md;
-        unsigned long     flags;
+        lnet_msg_t       *msg;
+        lnet_libmd_t     *md;
         int               rc;
 
-        if (!list_empty (&ni->ni_test_peers) && /* normally we don't */
-            fail_peer (nal, id->nid, 1))           /* shall we now? */
+        LASSERT (the_lnet.ln_init);
+        LASSERT (the_lnet.ln_refcount > 0);
+        
+        if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */
+            fail_peer (target.nid, 1))          /* shall we now? */
         {
-                CERROR("Dropping PUT to "LPX64": simulated failure\n",
-                       id->nid);
-                return PTL_PROCESS_INVALID;
+                CERROR("Dropping GET to %s: simulated failure\n",
+                       libcfs_id2str(target));
+                return -EIO;
         }
 
-        msg = lib_msg_alloc(nal);
+        msg = lnet_msg_alloc();
         if (msg == NULL) {
-                CERROR("Dropping GET to "LPU64": ENOMEM on lib_msg_t\n",
-                       id->nid);
-                return PTL_NO_SPACE;
+                CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n",
+                       libcfs_id2str(target));
+                return -ENOMEM;
         }
 
-        LIB_LOCK(nal, flags);
+        LNET_LOCK();
 
-        md = ptl_handle2md(mdh, nal);
-        if (md == NULL || !md->threshold) {
-                lib_msg_free(nal, msg);
-                LIB_UNLOCK(nal, flags);
+        md = lnet_handle2md(&mdh);
+        if (md == NULL || md->md_threshold == 0) {
+                lnet_msg_free(msg);
+                LNET_UNLOCK();
 
-                return PTL_MD_INVALID;
+                CERROR("Dropping GET to %s: MD invalid\n",
+                       libcfs_id2str(target));
+                return -ENOENT;
         }
 
-        CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid,
-               (unsigned long)id->pid);
+        CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target));
 
-        memset (&hdr, 0, sizeof (hdr));
-        hdr.type     = cpu_to_le32(PTL_MSG_GET);
-        hdr.dest_nid = cpu_to_le64(id->nid);
-        hdr.dest_pid = cpu_to_le32(id->pid);
-        hdr.src_nid  = cpu_to_le64(ni->ni_pid.nid);
-        hdr.src_pid  = cpu_to_le32(ni->ni_pid.pid);
-        hdr.payload_length = 0;
+        lnet_commit_md(md, msg);
 
-        /* NB handles only looked up by creator (no flips) */
-        hdr.msg.get.return_wmd.wh_interface_cookie = ni->ni_interface_cookie;
-        hdr.msg.get.return_wmd.wh_object_cookie = md->md_lh.lh_cookie;
+        lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0);
 
-        hdr.msg.get.match_bits = cpu_to_le64(match_bits);
-        hdr.msg.get.ptl_index = cpu_to_le32(portal);
-        hdr.msg.get.src_offset = cpu_to_le32(offset);
-        hdr.msg.get.sink_length = cpu_to_le32(md->length);
+        msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits);
+        msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal);
+        msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset);
+        msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length);
 
-        lib_commit_md(nal, md, msg);
+        /* NB handles only looked up by creator (no flips) */
+        msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie = 
+                the_lnet.ln_interface_cookie;
+        msg->msg_hdr.msg.get.return_wmd.wh_object_cookie = 
+                md->md_lh.lh_cookie;
+
+        msg->msg_ev.type = LNET_EVENT_SEND;
+        msg->msg_ev.initiator.nid = LNET_NID_ANY;
+        msg->msg_ev.initiator.pid = the_lnet.ln_pid;
+        msg->msg_ev.target = target;
+        msg->msg_ev.pt_index = portal;
+        msg->msg_ev.match_bits = match_bits;
+        msg->msg_ev.rlength = md->md_length;
+        msg->msg_ev.mlength = md->md_length;
+        msg->msg_ev.offset = offset;
+        msg->msg_ev.hdr_data = 0;
+
+        lnet_md_deconstruct(md, &msg->msg_ev.md);
+        lnet_md2handle(&msg->msg_ev.md_handle, md);
+
+        the_lnet.ln_counters.send_count++;
+
+        LNET_UNLOCK();
+
+        rc = lnet_send(self, msg);
+        if (rc < 0) {
+                CERROR("error sending GET to %s: %d\n",
+                       libcfs_id2str(target), rc);
+                lnet_finalize (NULL, msg, rc);
+        }
+
+        /* completion will be signalled by an event */
+        return 0;
+}
 
-        msg->ev.type = PTL_EVENT_SEND_END;
-        msg->ev.initiator = ni->ni_pid;
-        msg->ev.pt_index = portal;
-        msg->ev.match_bits = match_bits;
-        msg->ev.rlength = md->length;
-        msg->ev.mlength = md->length;
-        msg->ev.offset = offset;
-        msg->ev.hdr_data = 0;
+int
+LNetDist (lnet_nid_t dstnid, lnet_nid_t *srcnidp, int *orderp)
+{
+       struct list_head *e;
+        lnet_ni_t        *ni;
+        lnet_route_t     *route;
+        lnet_remotenet_t *rnet;
+        __u32             dstnet = LNET_NIDNET(dstnid);
+        int               hops;
+        int               order = 2;
+
+        /* if !local_nid_dist_zero, I don't return a distance of 0 ever
+         * (when lustre sees a distance of 0, it substitutes 0@lo), so I
+         * keep order 0 free for 0@lo and order 1 free for a local NID
+         * match */
+
+        LASSERT (the_lnet.ln_init);
+        LASSERT (the_lnet.ln_refcount > 0);
+
+        LNET_LOCK();
+
+        list_for_each (e, &the_lnet.ln_nis) {
+                ni = list_entry(e, lnet_ni_t, ni_list);
+                
+                if (ni->ni_nid == dstnid ||
+                    (the_lnet.ln_ptlcompat > 0 &&
+                     LNET_NIDNET(dstnid) == 0 &&
+                     LNET_NIDADDR(dstnid) == LNET_NIDADDR(ni->ni_nid) &&
+                     LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) != LOLND)) {
+                        if (srcnidp != NULL)
+                                *srcnidp = dstnid;
+                        if (orderp != NULL) {
+                                if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND)
+                                        *orderp = 0;
+                                else
+                                        *orderp = 1;
+                        }
+                        LNET_UNLOCK();
 
-        lib_md_deconstruct(nal, md, &msg->ev.md);
-        ptl_md2handle(&msg->ev.md_handle, nal, md);
+                        return local_nid_dist_zero ? 0 : 1;
+                }
 
-        ni->ni_counters.send_count++;
+                if (LNET_NIDNET(ni->ni_nid) == dstnet ||
+                    (the_lnet.ln_ptlcompat > 0 &&
+                     dstnet == 0 &&
+                     LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) != LOLND)) {
+                        if (srcnidp != NULL)
+                                *srcnidp = ni->ni_nid;
+                        if (orderp != NULL)
+                                *orderp = order;
+                        LNET_UNLOCK();
+                        return 1;
+                }
 
-        LIB_UNLOCK(nal, flags);
+                order++;
+        }
 
-        rc = lib_send (nal, NULL, msg, &hdr, PTL_MSG_GET,
-                       id->nid, id->pid, NULL, 0, 0);
-        if (rc != PTL_OK) {
-                CERROR(LPU64": error sending GET to "LPU64": %d\n",
-                       ni->ni_pid.nid, id->nid, rc);
-                lib_finalize (nal, NULL, msg, rc);
+        list_for_each (e, &the_lnet.ln_remote_nets) {
+               rnet = list_entry(e, lnet_remotenet_t, lrn_list);
+
+                if (rnet->lrn_net == dstnet) {
+                        LASSERT (!list_empty(&rnet->lrn_routes));
+                        route = list_entry(rnet->lrn_routes.next,
+                                           lnet_route_t, lr_list);
+                        hops = rnet->lrn_hops;
+                        if (srcnidp != NULL)
+                                *srcnidp = route->lr_gateway->lp_ni->ni_nid;
+                        if (orderp != NULL)
+                                *orderp = order;
+                        LNET_UNLOCK();
+                        return hops + 1;
+                }
+                order++;
         }
 
-        /* completion will be signalled by an event */
-        return PTL_OK;
+        LNET_UNLOCK();
+        return -EHOSTUNREACH;
 }
 
-void lib_assert_wire_constants (void)
-{
-        /* Wire protocol assertions generated by 'wirecheck'
-         * running on Linux mdevi 2.4.21-p4smp-55chaos #1 SMP Tue Jun 8 14:38:44 PDT 2004 i686 i686 i
-         * with gcc version 3.2.3 20030502 (Red Hat Linux 3.2.3-34) */
-
-
-        /* Constants... */
-        LASSERT (PORTALS_PROTO_MAGIC == 0xeebc0ded);
-        LASSERT (PORTALS_PROTO_VERSION_MAJOR == 1);
-        LASSERT (PORTALS_PROTO_VERSION_MINOR == 0);
-        LASSERT (PTL_MSG_ACK == 0);
-        LASSERT (PTL_MSG_PUT == 1);
-        LASSERT (PTL_MSG_GET == 2);
-        LASSERT (PTL_MSG_REPLY == 3);
-        LASSERT (PTL_MSG_HELLO == 4);
-
-        /* Checks for struct ptl_handle_wire_t */
-        LASSERT ((int)sizeof(ptl_handle_wire_t) == 16);
-        LASSERT ((int)offsetof(ptl_handle_wire_t, wh_interface_cookie) == 0);
-        LASSERT ((int)sizeof(((ptl_handle_wire_t *)0)->wh_interface_cookie) == 8);
-        LASSERT ((int)offsetof(ptl_handle_wire_t, wh_object_cookie) == 8);
-        LASSERT ((int)sizeof(((ptl_handle_wire_t *)0)->wh_object_cookie) == 8);
-
-        /* Checks for struct ptl_magicversion_t */
-        LASSERT ((int)sizeof(ptl_magicversion_t) == 8);
-        LASSERT ((int)offsetof(ptl_magicversion_t, magic) == 0);
-        LASSERT ((int)sizeof(((ptl_magicversion_t *)0)->magic) == 4);
-        LASSERT ((int)offsetof(ptl_magicversion_t, version_major) == 4);
-        LASSERT ((int)sizeof(((ptl_magicversion_t *)0)->version_major) == 2);
-        LASSERT ((int)offsetof(ptl_magicversion_t, version_minor) == 6);
-        LASSERT ((int)sizeof(((ptl_magicversion_t *)0)->version_minor) == 2);
-
-        /* Checks for struct ptl_hdr_t */
-        LASSERT ((int)sizeof(ptl_hdr_t) == 72);
-        LASSERT ((int)offsetof(ptl_hdr_t, dest_nid) == 0);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->dest_nid) == 8);
-        LASSERT ((int)offsetof(ptl_hdr_t, src_nid) == 8);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->src_nid) == 8);
-        LASSERT ((int)offsetof(ptl_hdr_t, dest_pid) == 16);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->dest_pid) == 4);
-        LASSERT ((int)offsetof(ptl_hdr_t, src_pid) == 20);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->src_pid) == 4);
-        LASSERT ((int)offsetof(ptl_hdr_t, type) == 24);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->type) == 4);
-        LASSERT ((int)offsetof(ptl_hdr_t, payload_length) == 28);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->payload_length) == 4);
-        LASSERT ((int)offsetof(ptl_hdr_t, msg) == 32);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg) == 40);
-
-        /* Ack */
-        LASSERT ((int)offsetof(ptl_hdr_t, msg.ack.dst_wmd) == 32);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.ack.dst_wmd) == 16);
-        LASSERT ((int)offsetof(ptl_hdr_t, msg.ack.match_bits) == 48);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.ack.match_bits) == 8);
-        LASSERT ((int)offsetof(ptl_hdr_t, msg.ack.mlength) == 56);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.ack.mlength) == 4);
-
-        /* Put */
-        LASSERT ((int)offsetof(ptl_hdr_t, msg.put.ack_wmd) == 32);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.ack_wmd) == 16);
-        LASSERT ((int)offsetof(ptl_hdr_t, msg.put.match_bits) == 48);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.match_bits) == 8);
-        LASSERT ((int)offsetof(ptl_hdr_t, msg.put.hdr_data) == 56);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.hdr_data) == 8);
-        LASSERT ((int)offsetof(ptl_hdr_t, msg.put.ptl_index) == 64);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.ptl_index) == 4);
-        LASSERT ((int)offsetof(ptl_hdr_t, msg.put.offset) == 68);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.offset) == 4);
-
-        /* Get */
-        LASSERT ((int)offsetof(ptl_hdr_t, msg.get.return_wmd) == 32);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.return_wmd) == 16);
-        LASSERT ((int)offsetof(ptl_hdr_t, msg.get.match_bits) == 48);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.match_bits) == 8);
-        LASSERT ((int)offsetof(ptl_hdr_t, msg.get.ptl_index) == 56);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.ptl_index) == 4);
-        LASSERT ((int)offsetof(ptl_hdr_t, msg.get.src_offset) == 60);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.src_offset) == 4);
-        LASSERT ((int)offsetof(ptl_hdr_t, msg.get.sink_length) == 64);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.sink_length) == 4);
-
-        /* Reply */
-        LASSERT ((int)offsetof(ptl_hdr_t, msg.reply.dst_wmd) == 32);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.reply.dst_wmd) == 16);
-
-        /* Hello */
-        LASSERT ((int)offsetof(ptl_hdr_t, msg.hello.incarnation) == 32);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.hello.incarnation) == 8);
-        LASSERT ((int)offsetof(ptl_hdr_t, msg.hello.type) == 40);
-        LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.hello.type) == 4);
-}
index 38904c4..a9834b5 100644 (file)
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
 
-#ifndef __KERNEL__
-# include <stdio.h>
-#else
-# include <libcfs/kp30.h>
-#endif
-
-#include <portals/lib-p30.h>
+#include <lnet/lib-lnet.h>
 
 void
-lib_enq_event_locked (lib_nal_t *nal, void *private,
-                      lib_eq_t *eq, ptl_event_t *ev)
+lnet_enq_event_locked (lnet_eq_t *eq, lnet_event_t *ev)
 {
-        ptl_event_t  *eq_slot;
+        lnet_event_t  *eq_slot;
 
         /* Allocate the next queue slot */
-        ev->link = ev->sequence = eq->eq_enq_seq++;
-        /* NB we don't support START events yet and we don't create a separate
-         * UNLINK event unless an explicit unlink succeeds, so the link
-         * sequence is pretty useless */
-
-        /* We don't support different uid/jids yet */
-        ev->uid = 0;
-        ev->jid = 0;
+        ev->sequence = eq->eq_enq_seq++;
 
         /* size must be a power of 2 to handle sequence # overflow */
         LASSERT (eq->eq_size != 0 &&
@@ -54,7 +40,7 @@ lib_enq_event_locked (lib_nal_t *nal, void *private,
         eq_slot = eq->eq_events + (ev->sequence & (eq->eq_size - 1));
 
         /* There is no race since both event consumers and event producers
-         * take the LIB_LOCK(), so we don't screw around with memory
+         * take the LNET_LOCK, so we don't screw around with memory
          * barriers, setting the sequence number last or wierd structure
          * layout assertions. */
         *eq_slot = *ev;
@@ -63,85 +49,176 @@ lib_enq_event_locked (lib_nal_t *nal, void *private,
         if (eq->eq_callback != NULL)
                 eq->eq_callback (eq_slot);
 
-        /* Wake anyone sleeping for an event (see lib-eq.c) */
 #ifdef __KERNEL__
-        if (cfs_waitq_active(&nal->libnal_ni.ni_waitq))
-                cfs_waitq_broadcast(&nal->libnal_ni.ni_waitq);
+        /* Wake anyone waiting in LNetEQPoll() */
+        if (cfs_waitq_active(&the_lnet.ln_waitq))
+                cfs_waitq_broadcast(&the_lnet.ln_waitq);
 #else
-        pthread_cond_broadcast(&nal->libnal_ni.ni_cond);
+# if !HAVE_LIBPTHREAD
+        /* LNetEQPoll() calls into _the_ LND to wait for action */
+# else
+        /* Wake anyone waiting in LNetEQPoll() */
+        pthread_cond_broadcast(&the_lnet.ln_cond);
+# endif
 #endif
 }
 
 void
-lib_finalize (lib_nal_t *nal, void *private, lib_msg_t *msg, ptl_err_t status)
+lnet_complete_msg_locked(lnet_msg_t *msg)
+{
+        lnet_handle_wire_t ack_wmd;
+        int                rc;
+        int                status = msg->msg_ev.status;
+
+        LASSERT (msg->msg_onactivelist);
+
+        if (status == 0 && msg->msg_ack) {
+                /* Only send an ACK if the PUT completed successfully */
+
+                lnet_return_credits_locked(msg);
+
+                msg->msg_ack = 0;
+                LNET_UNLOCK();
+        
+                LASSERT(msg->msg_ev.type == LNET_EVENT_PUT);
+                LASSERT(!msg->msg_routing);
+
+                ack_wmd = msg->msg_hdr.msg.put.ack_wmd;
+                
+                lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0);
+
+                msg->msg_hdr.msg.ack.dst_wmd = ack_wmd;
+                msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits;
+                msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength);
+                
+                rc = lnet_send(msg->msg_ev.target.nid, msg);
+
+                LNET_LOCK();
+
+                if (rc == 0)
+                        return;
+        } else if (status == 0 &&               /* OK so far */
+                   (msg->msg_routing && !msg->msg_sending)) { /* not forwarded */
+                
+                LASSERT (!msg->msg_receiving);  /* called back recv already */
+        
+                LNET_UNLOCK();
+                
+                rc = lnet_send(LNET_NID_ANY, msg);
+
+                LNET_LOCK();
+
+                if (rc == 0)
+                        return;
+        }
+
+        lnet_return_credits_locked(msg);
+
+        LASSERT (msg->msg_onactivelist);
+        msg->msg_onactivelist = 0;
+        list_del (&msg->msg_activelist);
+        the_lnet.ln_counters.msgs_alloc--;
+        lnet_msg_free(msg);
+}
+
+
+void
+lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status)
 {
-        lib_md_t     *md;
-        int           unlink;
-        unsigned long flags;
-        int           rc;
-        ptl_hdr_t     ack;
+#ifdef __KERNEL__
+        int                i;
+        int                my_slot;
+#endif
+        lnet_libmd_t      *md;
+
+        LASSERT (!in_interrupt ());
 
         if (msg == NULL)
                 return;
+#if 0
+        CDEBUG(D_WARNING, "%s msg->%s Flags:%s%s%s%s%s%s%s%s%s%s%s txp %s rxp %s\n",
+               lnet_msgtyp2str(msg->msg_type), libcfs_id2str(msg->msg_target),
+               msg->msg_target_is_router ? "t" : "",
+               msg->msg_routing ? "X" : "",
+               msg->msg_ack ? "A" : "",
+               msg->msg_sending ? "S" : "",
+               msg->msg_receiving ? "R" : "",
+               msg->msg_delayed ? "d" : "",
+               msg->msg_txcredit ? "C" : "",
+               msg->msg_peertxcredit ? "c" : "",
+               msg->msg_rtrcredit ? "F" : "",
+               msg->msg_peerrtrcredit ? "f" : "",
+               msg->msg_onactivelist ? "!" : "",
+               msg->msg_txpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_txpeer->lp_nid),
+               msg->msg_rxpeer == NULL ? "<none>" : libcfs_nid2str(msg->msg_rxpeer->lp_nid));
+#endif
+        LNET_LOCK();
 
-        /* Only send an ACK if the PUT completed successfully */
-        if (status == PTL_OK &&
-            !ptl_is_wire_handle_none(&msg->ack_wmd)) {
-
-                LASSERT(msg->ev.type == PTL_EVENT_PUT_END);
-
-                memset (&ack, 0, sizeof (ack));
-                ack.type     = cpu_to_le32(PTL_MSG_ACK);
-                ack.dest_nid = cpu_to_le64(msg->ev.initiator.nid);
-                ack.dest_pid = cpu_to_le32(msg->ev.initiator.pid);
-                ack.src_nid  = cpu_to_le64(nal->libnal_ni.ni_pid.nid);
-                ack.src_pid  = cpu_to_le32(nal->libnal_ni.ni_pid.pid);
-                ack.payload_length = 0;
-
-                ack.msg.ack.dst_wmd = msg->ack_wmd;
-                ack.msg.ack.match_bits = msg->ev.match_bits;
-                ack.msg.ack.mlength = cpu_to_le32(msg->ev.mlength);
-
-                rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK,
-                               msg->ev.initiator.nid, msg->ev.initiator.pid,
-                               NULL, 0, 0);
-                if (rc != PTL_OK) {
-                        /* send failed: there's nothing else to clean up. */
-                        CERROR("Error %d sending ACK to "LPX64"\n",
-                               rc, msg->ev.initiator.nid);
-                }
-        }
+        LASSERT (msg->msg_onactivelist);
+
+        msg->msg_ev.status = status;
 
-        md = msg->md;
+        md = msg->msg_md;
+        if (md != NULL) {
+                int      unlink;
 
-        LIB_LOCK(nal, flags);
+                /* Now it's safe to drop my caller's ref */
+                md->md_refcount--;
+                LASSERT (md->md_refcount >= 0);
 
-        /* Now it's safe to drop my caller's ref */
-        md->pending--;
-        LASSERT (md->pending >= 0);
+                unlink = lnet_md_unlinkable(md);
+                
+                msg->msg_ev.unlinked = unlink;
+                
+                if (md->md_eq != NULL)
+                        lnet_enq_event_locked(md->md_eq, &msg->msg_ev);
+                
+                if (unlink)
+                        lnet_md_unlink(md);
+
+                msg->msg_md = NULL;
+        }
 
-        /* Should I unlink this MD? */
-        if (md->pending != 0)                   /* other refs */
-                unlink = 0;
-        else if ((md->md_flags & PTL_MD_FLAG_ZOMBIE) != 0)
-                unlink = 1;
-        else if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINK) == 0)
-                unlink = 0;
-        else
-                unlink = lib_md_exhausted(md);
+        list_add_tail (&msg->msg_list, &the_lnet.ln_finalizeq);
 
-        msg->ev.ni_fail_type = status;
-        msg->ev.unlinked = unlink;
+        /* Recursion breaker.  Don't complete the message here if I am (or
+         * enough other threads are) already completing messages */
 
-        if (md->eq != NULL)
-                lib_enq_event_locked(nal, private, md->eq, &msg->ev);
+#ifdef __KERNEL__
+        my_slot = -1;
+        for (i = 0; i < the_lnet.ln_nfinalizers; i++) {
+                if (the_lnet.ln_finalizers[i] == cfs_current())
+                        goto out;
+                if (my_slot < 0 && the_lnet.ln_finalizers[i] == NULL)
+                        my_slot = i;
+        }
+        if (my_slot < 0)
+                goto out;
 
-        if (unlink)
-                lib_md_unlink(nal, md);
+        the_lnet.ln_finalizers[my_slot] = cfs_current();
+#else
+        if (the_lnet.ln_finalizing)
+                goto out;
+#endif
 
-        list_del (&msg->msg_list);
-        nal->libnal_ni.ni_counters.msgs_alloc--;
-        lib_msg_free(nal, msg);
+        while (!list_empty(&the_lnet.ln_finalizeq)) {
+                msg = list_entry(the_lnet.ln_finalizeq.next,
+                                 lnet_msg_t, msg_list);
+                
+                list_del(&msg->msg_list);
 
-        LIB_UNLOCK(nal, flags);
+                /* NB drops and regains the lnet lock if it actually does
+                 * anything, so my finalizing friends can chomp along too */
+                lnet_complete_msg_locked(msg);
+        }
+
+#ifdef __KERNEL__
+        the_lnet.ln_finalizers[my_slot] = NULL;
+#else
+        the_lnet.ln_finalizing = 0;
+#endif
+
+ out:
+        LNET_UNLOCK();
 }
+
diff --git a/lnet/lnet/lib-ni.c b/lnet/lnet/lib-ni.c
deleted file mode 100644 (file)
index e45859a..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *   This file is part of Lustre, http://www.lustre.org
- *   This file is not subject to copyright protection.
- */
-
-#define DEBUG_SUBSYSTEM S_PORTALS
-#include <portals/lib-p30.h>
-
-int lib_api_ni_status (nal_t *apinal, ptl_sr_index_t sr_idx,
-                       ptl_sr_value_t *status)
-{
-        return PTL_FAIL;
-}
-
-
-int lib_api_ni_dist (nal_t *apinal, ptl_process_id_t *pid, unsigned long *dist)
-{
-        lib_nal_t *nal = apinal->nal_data;
-
-        if (nal->libnal_ni.ni_loopback &&
-            pid->nid == nal->libnal_ni.ni_pid.nid) {
-                *dist = 0;
-                return PTL_OK;
-        }
-        
-        return (nal->libnal_dist(nal, pid->nid, dist));
-}
diff --git a/lnet/lnet/lib-pid.c b/lnet/lnet/lib-pid.c
deleted file mode 100644 (file)
index 23d6dd3..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *   This file is part of Lustre, http://www.lustre.org
- *   This file is not subject to copyright protection.
- */
-
-/* This should be removed.  The NAL should have the PID information */
-#define DEBUG_SUBSYSTEM S_PORTALS
-
-#include <portals/lib-p30.h>
-
-int
-lib_api_get_id(nal_t *apinal, ptl_process_id_t *pid)
-{
-        lib_nal_t *nal = apinal->nal_data;
-        
-        *pid = nal->libnal_ni.ni_pid;
-        return PTL_OK;
-}
diff --git a/lnet/lnet/lo.c b/lnet/lnet/lo.c
new file mode 100644 (file)
index 0000000..e123b3d
--- /dev/null
@@ -0,0 +1,112 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *  Copyright (C) 2004 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <lnet/lib-lnet.h>
+
+int
+lolnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
+{
+        LASSERT (!lntmsg->msg_routing);
+        LASSERT (!lntmsg->msg_target_is_router);
+        
+        return lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg, 0);
+}
+
+int
+lolnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+            int delayed, unsigned int niov, 
+            struct iovec *iov, lnet_kiov_t *kiov,
+            unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+        lnet_msg_t *sendmsg = private;
+
+        if (lntmsg != NULL) {                   /* not discarding */
+                if (sendmsg->msg_iov != NULL) {
+                        if (iov != NULL)
+                                lnet_copy_iov2iov(niov, iov, offset,
+                                                  sendmsg->msg_niov,
+                                                  sendmsg->msg_iov,
+                                                  sendmsg->msg_offset, mlen);
+                        else
+                                lnet_copy_iov2kiov(niov, kiov, offset,
+                                                   sendmsg->msg_niov,
+                                                   sendmsg->msg_iov,
+                                                   sendmsg->msg_offset, mlen);
+                } else {
+                        if (iov != NULL)
+                                lnet_copy_kiov2iov(niov, iov, offset,
+                                                   sendmsg->msg_niov,
+                                                   sendmsg->msg_kiov,
+                                                   sendmsg->msg_offset, mlen);
+                        else
+                                lnet_copy_kiov2kiov(niov, kiov, offset,
+                                                    sendmsg->msg_niov,
+                                                    sendmsg->msg_kiov,
+                                                    sendmsg->msg_offset, mlen);
+                }
+
+                lnet_finalize(ni, lntmsg, 0);
+        }
+        
+        lnet_finalize(ni, sendmsg, 0);
+        return 0;
+}
+
+static int lolnd_instanced;
+
+void
+lolnd_shutdown(lnet_ni_t *ni)
+{
+       CDEBUG (D_NET, "shutdown\n");
+        LASSERT (lolnd_instanced);
+        
+        lolnd_instanced = 0;
+}
+
+int
+lolnd_startup (lnet_ni_t *ni)
+{
+       LASSERT (ni->ni_lnd == &the_lolnd);
+       LASSERT (!lolnd_instanced);
+        lolnd_instanced = 1;
+
+       return (0);
+}
+
+lnd_t the_lolnd = {
+        /* .lnd_list       = */ {&the_lolnd.lnd_list, &the_lolnd.lnd_list},
+        /* .lnd_refcount   = */ 0,
+        /* .lnd_type       = */ LOLND,
+        /* .lnd_startup    = */ lolnd_startup,
+        /* .lnd_shutdown   = */ lolnd_shutdown,
+        /* .lnt_ctl        = */ NULL, 
+        /* .lnd_send       = */ lolnd_send,
+        /* .lnd_recv       = */ lolnd_recv,
+        /* .lnd_eager_recv = */ NULL,
+        /* .lnd_notify     = */ NULL,
+#ifdef __KERNEL__
+        /* .lnd_accept     = */ NULL
+#else
+        /* .lnd_wait       = */ NULL
+#endif
+};
+
index 472175b..eff8daa 100644 (file)
 #ifndef EXPORT_SYMTAB
 # define EXPORT_SYMTAB
 #endif
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
+#include <lnet/lib-lnet.h>
 
-#include <portals/lib-p30.h>
-#include <portals/p30.h>
-#include <portals/nal.h>
-#include <libcfs/kp30.h>
-#include <portals/kpr.h>
+static int config_on_load = 0;
+CFS_MODULE_PARM(config_on_load, "i", int, 0444,
+                "configure network at module load");
 
-extern void (kping_client)(struct portal_ioctl_data *);
+static struct semaphore lnet_config_mutex;
 
-static int kportal_ioctl(struct portal_ioctl_data *data,
-                         unsigned int cmd, unsigned long arg)
+int
+lnet_configure (void *arg)
 {
-        int err;
-        char str[PTL_NALFMT_SIZE];
-        ENTRY;
+        /* 'arg' only there so I can be passed to cfs_kernel_thread() */
+        int    rc = 0;
 
-        switch (cmd) {
-        case IOC_PORTAL_PING: {
-                void (*ping)(struct portal_ioctl_data *);
-
-                CDEBUG(D_IOCTL, "doing %d pings to nid "LPX64" (%s)\n",
-                       data->ioc_count, data->ioc_nid,
-                       portals_nid2str(data->ioc_nal, data->ioc_nid, str));
-                ping = PORTAL_SYMBOL_GET(kping_client);
-                if (!ping)
-                        CERROR("PORTAL_SYMBOL_GET failed\n");
-                else {
-                        ping(data);
-                        PORTAL_SYMBOL_PUT(kping_client);
+        LNET_MUTEX_DOWN(&lnet_config_mutex);
+
+        if (!the_lnet.ln_niinit_self) {
+                rc = LNetNIInit(LUSTRE_SRV_LNET_PID);
+                if (rc >= 0) {
+                        the_lnet.ln_niinit_self = 1;
+                        rc = 0;
                 }
-                RETURN(0);
         }
 
-        case IOC_PORTAL_GET_NID: {
-                ptl_handle_ni_t    nih;
-                ptl_process_id_t   pid;
-
-                CDEBUG (D_IOCTL, "Getting nid for nal [%x]\n", data->ioc_nal);
-
-                err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL,
-                                NULL, &nih);
-                if (!(err == PTL_OK || err == PTL_IFACE_DUP))
-                        RETURN (-EINVAL);
-
-                err = PtlGetId (nih, &pid);
-                LASSERT (err == PTL_OK);
+        LNET_MUTEX_UP(&lnet_config_mutex);
+        return rc;
+}
 
-                PtlNIFini(nih);
+int
+lnet_unconfigure (void)
+{
+        int   refcount;
+        
+        LNET_MUTEX_DOWN(&lnet_config_mutex);
 
-                data->ioc_nid = pid.nid;
-                if (copy_to_user ((char *)arg, data, sizeof (*data)))
-                        RETURN (-EFAULT);
-                RETURN(0);
+        if (the_lnet.ln_niinit_self) {
+                the_lnet.ln_niinit_self = 0;
+                LNetNIFini();
         }
 
-        case IOC_PORTAL_FAIL_NID: {
-                ptl_handle_ni_t    nih;
-
-                CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n",
-                        data->ioc_nal, data->ioc_nid, data->ioc_count);
-
-                err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL,
-                                NULL, &nih);
-                if (!(err == PTL_OK || err == PTL_IFACE_DUP))
-                        return (-EINVAL);
-
-                if (err == PTL_OK) {
-                        /* There's no point in failing an interface that
-                         * came into existance just for this */
-                        err = -EINVAL;
-                } else {
-                        err = PtlFailNid (nih, data->ioc_nid, data->ioc_count);
-                        if (err != PTL_OK)
-                                err = -EINVAL;
-                }
+        LNET_MUTEX_DOWN(&the_lnet.ln_api_mutex);
+        refcount = the_lnet.ln_refcount;
+        LNET_MUTEX_UP(&the_lnet.ln_api_mutex);
 
-                PtlNIFini(nih);
-                RETURN (err);
-        }
+        LNET_MUTEX_UP(&lnet_config_mutex);
+        return (refcount == 0) ? 0 : -EBUSY;
+}
 
-        case IOC_PORTAL_LOOPBACK: {
-                ptl_handle_ni_t  nih;
-                int              enabled = data->ioc_flags;
-                int              set = data->ioc_misc;
-
-                CDEBUG (D_IOCTL, "loopback: [%d] %d %d\n",
-                        data->ioc_nal, enabled, set);
-
-                err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL,
-                                NULL, &nih);
-                if (!(err == PTL_OK || err == PTL_IFACE_DUP))
-                        return (-EINVAL);
-
-                if (err == PTL_OK) {
-                        /* There's no point in failing an interface that
-                         * came into existance just for this */
-                        err = -EINVAL;
-                } else {
-                        err = PtlLoopback (nih, set, &enabled);
-                        if (err != PTL_OK) {
-                                err = -EINVAL;
-                        } else {
-                                data->ioc_flags = enabled;
-                                if (copy_to_user ((char *)arg, data, 
-                                                  sizeof (*data)))
-                                        err = -EFAULT;
-                                else
-                                        err = 0;
-                        }
-                }
+int
+lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_data *data)
+{
+        int   rc;
 
-                PtlNIFini(nih);
-                RETURN (err);
-        }
+        switch (cmd) {
+        case IOC_LIBCFS_CONFIGURE:
+                return lnet_configure(NULL);
+
+        case IOC_LIBCFS_UNCONFIGURE:
+                return lnet_unconfigure();
+                
         default:
-                RETURN(-EINVAL);
+                /* Passing LNET_PID_ANY only gives me a ref if the net is up
+                 * already; I'll need it to ensure the net can't go down while
+                 * I'm called into it */
+                rc = LNetNIInit(LNET_PID_ANY);
+                if (rc >= 0) {
+                        rc = LNetCtl(cmd, data);
+                        LNetNIFini();
+                }
+                return rc;
         }
-        /* Not Reached */
 }
 
-DECLARE_IOCTL_HANDLER(kportal_ioctl_handler, kportal_ioctl);
-extern struct semaphore ptl_mutex;
+DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl);
 
-static int init_kportals_module(void)
+int
+init_lnet(void)
 {
-        int rc;
+        int                  rc;
         ENTRY;
 
-        init_mutex(&ptl_mutex);
-        rc = PtlInit(NULL);
-        if (rc) {
-                CERROR("PtlInit: error %d\n", rc);
+        init_mutex(&lnet_config_mutex);
+
+        rc = LNetInit();
+        if (rc != 0) {
+                CERROR("LNetInit: error %d\n", rc);
                 RETURN(rc);
         }
 
-        rc = libcfs_register_ioctl(&kportal_ioctl_handler);
+        rc = libcfs_register_ioctl(&lnet_ioctl_handler);
         LASSERT (rc == 0);
 
-        RETURN(rc);
+        if (config_on_load) {
+                /* Have to schedule a separate thread to avoid deadlocking
+                 * in modload */
+                (void) cfs_kernel_thread(lnet_configure, NULL, 0);
+        }
+
+        RETURN(0);
 }
 
-static void exit_kportals_module(void)
+void
+fini_lnet(void)
 {
         int rc;
 
-        rc = libcfs_deregister_ioctl(&kportal_ioctl_handler);
+        rc = libcfs_deregister_ioctl(&lnet_ioctl_handler);
         LASSERT (rc == 0);
 
-        PtlFini();
+        LNetFini();
 }
 
-EXPORT_SYMBOL(ptl_register_nal);
-EXPORT_SYMBOL(ptl_unregister_nal);
-
-EXPORT_SYMBOL(ptl_err_str);
-EXPORT_SYMBOL(PtlMEAttach);
-EXPORT_SYMBOL(PtlMEInsert);
-EXPORT_SYMBOL(PtlMEUnlink);
-EXPORT_SYMBOL(PtlEQAlloc);
-EXPORT_SYMBOL(PtlMDAttach);
-EXPORT_SYMBOL(PtlMDUnlink);
-EXPORT_SYMBOL(PtlNIInit);
-EXPORT_SYMBOL(PtlNIFini);
-EXPORT_SYMBOL(PtlInit);
-EXPORT_SYMBOL(PtlFini);
-EXPORT_SYMBOL(PtlSnprintHandle);
-EXPORT_SYMBOL(PtlPut);
-EXPORT_SYMBOL(PtlGet);
-EXPORT_SYMBOL(PtlEQWait);
-EXPORT_SYMBOL(PtlEQFree);
-EXPORT_SYMBOL(PtlEQGet);
-EXPORT_SYMBOL(PtlGetId);
-EXPORT_SYMBOL(PtlMDBind);
-EXPORT_SYMBOL(lib_iov_nob);
-EXPORT_SYMBOL(lib_copy_iov2buf);
-EXPORT_SYMBOL(lib_copy_buf2iov);
-EXPORT_SYMBOL(lib_extract_iov);
-EXPORT_SYMBOL(lib_kiov_nob);
-EXPORT_SYMBOL(lib_copy_kiov2buf);
-EXPORT_SYMBOL(lib_copy_buf2kiov);
-EXPORT_SYMBOL(lib_extract_kiov);
-EXPORT_SYMBOL(lib_finalize);
-EXPORT_SYMBOL(lib_parse);
-EXPORT_SYMBOL(lib_create_reply_msg);
-EXPORT_SYMBOL(lib_init);
-EXPORT_SYMBOL(lib_fini);
+EXPORT_SYMBOL(lnet_register_lnd);
+EXPORT_SYMBOL(lnet_unregister_lnd);
+
+EXPORT_SYMBOL(LNetMEAttach);
+EXPORT_SYMBOL(LNetMEInsert);
+EXPORT_SYMBOL(LNetMEUnlink);
+EXPORT_SYMBOL(LNetEQAlloc);
+EXPORT_SYMBOL(LNetMDAttach);
+EXPORT_SYMBOL(LNetMDUnlink);
+EXPORT_SYMBOL(LNetNIInit);
+EXPORT_SYMBOL(LNetNIFini);
+EXPORT_SYMBOL(LNetInit);
+EXPORT_SYMBOL(LNetFini);
+EXPORT_SYMBOL(LNetSnprintHandle);
+EXPORT_SYMBOL(LNetPut);
+EXPORT_SYMBOL(LNetGet);
+EXPORT_SYMBOL(LNetEQWait);
+EXPORT_SYMBOL(LNetEQFree);
+EXPORT_SYMBOL(LNetEQGet);
+EXPORT_SYMBOL(LNetGetId);
+EXPORT_SYMBOL(LNetMDBind);
+EXPORT_SYMBOL(LNetDist);
+EXPORT_SYMBOL(LNetCtl);
+EXPORT_SYMBOL(LNetSetLazyPortal);
+EXPORT_SYMBOL(LNetClearLazyPortal);
+EXPORT_SYMBOL(the_lnet);
+EXPORT_SYMBOL(lnet_iov_nob);
+EXPORT_SYMBOL(lnet_extract_iov);
+EXPORT_SYMBOL(lnet_kiov_nob);
+EXPORT_SYMBOL(lnet_extract_kiov);
+EXPORT_SYMBOL(lnet_copy_iov2iov);
+EXPORT_SYMBOL(lnet_copy_iov2kiov);
+EXPORT_SYMBOL(lnet_copy_kiov2iov);
+EXPORT_SYMBOL(lnet_copy_kiov2kiov);
+EXPORT_SYMBOL(lnet_finalize);
+EXPORT_SYMBOL(lnet_parse);
+EXPORT_SYMBOL(lnet_create_reply_msg);
+EXPORT_SYMBOL(lnet_set_reply_msg_len);
+EXPORT_SYMBOL(lnet_msgtyp2str);
+EXPORT_SYMBOL(lnet_net2ni_locked);
 
 MODULE_AUTHOR("Peter J. Braam <braam@clusterfs.com>");
 MODULE_DESCRIPTION("Portals v3.1");
 MODULE_LICENSE("GPL");
 
-cfs_module(portals, "1.0.0", init_kportals_module, exit_kportals_module);
+cfs_module(lnet, "1.0.0", init_lnet, fini_lnet);
diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c
new file mode 100644 (file)
index 0000000..6ac1d1e
--- /dev/null
@@ -0,0 +1,244 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * lib/lib-move.c
+ * Data movement routines
+ *
+ *  Copyright (c) 2001-2003 Cluster File Systems, Inc.
+ *
+ *   This file is part of Lustre, http://www.lustre.org
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <lnet/lib-lnet.h>
+
+int
+lnet_create_peer_table(void)
+{
+       struct list_head *hash;
+       int               i;
+
+       LASSERT (the_lnet.ln_peer_hash == NULL);
+       LIBCFS_ALLOC(hash, LNET_PEER_HASHSIZE * sizeof(struct list_head));
+       
+       if (hash == NULL) {
+               CERROR("Can't allocate peer hash table\n");
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < LNET_PEER_HASHSIZE; i++)
+               CFS_INIT_LIST_HEAD(&hash[i]);
+
+       the_lnet.ln_peer_hash = hash;
+       return 0;
+}
+
+void
+lnet_destroy_peer_table(void)
+{
+       int         i;
+
+        if (the_lnet.ln_peer_hash == NULL)
+                return;
+
+       for (i = 0; i < LNET_PEER_HASHSIZE; i++)
+               LASSERT (list_empty(&the_lnet.ln_peer_hash[i]));
+       
+       LIBCFS_FREE(the_lnet.ln_peer_hash,
+                   LNET_PEER_HASHSIZE * sizeof (struct list_head));
+        the_lnet.ln_peer_hash = NULL;
+}
+
+void
+lnet_clear_peer_table(void)
+{
+       int         i;
+
+        LASSERT (the_lnet.ln_shutdown);         /* i.e. no new peers */
+       
+       for (i = 0; i < LNET_PEER_HASHSIZE; i++) {
+               struct list_head *peers = &the_lnet.ln_peer_hash[i];
+
+               LNET_LOCK();
+               while (!list_empty(peers)) {
+                       lnet_peer_t *lp = list_entry(peers->next,
+                                                    lnet_peer_t, lp_hashlist);
+                       
+                       list_del(&lp->lp_hashlist);
+                        lnet_peer_decref_locked(lp);   /* lose hash table's ref */
+               }
+               LNET_UNLOCK();
+       }
+
+        LNET_LOCK();
+        for (i = 3; the_lnet.ln_npeers != 0;i++) {
+                LNET_UNLOCK();
+
+                if ((i & (i-1)) == 0)
+                        CDEBUG(D_WARNING,"Waiting for %d peers\n", 
+                               the_lnet.ln_npeers);
+                cfs_pause(cfs_time_seconds(1));
+
+                LNET_LOCK();
+        }
+        LNET_UNLOCK();
+}
+
+void
+lnet_destroy_peer_locked (lnet_peer_t *lp) 
+{
+        lnet_ni_decref_locked(lp->lp_ni);
+        LNET_UNLOCK();
+
+        LASSERT (lp->lp_refcount == 0);
+        LASSERT (lp->lp_rtr_refcount == 0);
+       LASSERT (list_empty(&lp->lp_txq));
+        LASSERT (lp->lp_txqnob == 0);
+
+       LIBCFS_FREE(lp, sizeof(*lp));
+
+        LNET_LOCK();
+
+        LASSERT(the_lnet.ln_npeers > 0);
+        the_lnet.ln_npeers--;
+}
+
+lnet_peer_t *
+lnet_find_peer_locked (lnet_nid_t nid)
+{
+       unsigned int      idx = LNET_NIDADDR(nid) % LNET_PEER_HASHSIZE;
+       struct list_head *peers = &the_lnet.ln_peer_hash[idx];
+       struct list_head *tmp;
+        lnet_peer_t      *lp;
+
+       if (the_lnet.ln_shutdown)
+                return NULL;
+
+       list_for_each (tmp, peers) {
+               lp = list_entry(tmp, lnet_peer_t, lp_hashlist);
+               
+               if (lp->lp_nid == nid) {
+                        lnet_peer_addref_locked(lp);
+                       return lp;
+                }
+       }
+        
+       return NULL;
+}
+
+int
+lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid)
+{
+       lnet_peer_t    *lp;
+       lnet_peer_t    *lp2;
+
+        lp = lnet_find_peer_locked(nid);
+        if (lp != NULL) {
+                *lpp = lp;
+                return 0;
+        }
+        
+        LNET_UNLOCK();
+       
+       LIBCFS_ALLOC(lp, sizeof(*lp));
+       if (lp == NULL) {
+                *lpp = NULL;
+                LNET_LOCK();
+                return -ENOMEM;
+        }
+
+        memset(lp, 0, sizeof(*lp));             /* zero counters etc */
+        
+       CFS_INIT_LIST_HEAD(&lp->lp_txq);
+        CFS_INIT_LIST_HEAD(&lp->lp_rtrq);
+       
+       lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */
+        lp->lp_notify = 0;
+        lp->lp_notifylnd = 0;
+        lp->lp_notifying = 0;
+        lp->lp_alive_count = 0;
+       lp->lp_timestamp = 0;
+        lp->lp_ping_timestamp = 0;
+       lp->lp_nid = nid;
+        lp->lp_refcount = 2;                    /* 1 for caller; 1 for hash */
+        lp->lp_rtr_refcount = 0;
+
+        LNET_LOCK();
+
+        lp2 = lnet_find_peer_locked(nid);
+        if (lp2 != NULL) {
+                LNET_UNLOCK();
+                LIBCFS_FREE(lp, sizeof(*lp));
+                LNET_LOCK();
+
+                *lpp = lp2;
+                return 0;
+        }
+                
+        lp->lp_ni = lnet_net2ni_locked(LNET_NIDNET(nid));
+        if (lp->lp_ni == NULL) {
+                LNET_UNLOCK();
+                LIBCFS_FREE(lp, sizeof(*lp));
+                LNET_LOCK();
+
+                *lpp = NULL;
+                return the_lnet.ln_shutdown ? -ESHUTDOWN : -EHOSTUNREACH;
+        }
+
+       lp->lp_txcredits = 
+                lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits;
+
+        /* As a first approximation; allow this peer the same number of router
+         * buffers as it is allowed outstanding sends */
+        lp->lp_rtrcredits = lp->lp_minrtrcredits = lp->lp_txcredits;
+
+        LASSERT (!the_lnet.ln_shutdown);
+        /* can't add peers after shutdown starts */
+
+        list_add_tail(&lp->lp_hashlist, lnet_nid2peerhash(nid));
+        the_lnet.ln_npeers++;
+        the_lnet.ln_peertable_version++;
+        *lpp = lp;
+        return 0;
+}
+
+void
+lnet_debug_peer(lnet_nid_t nid)
+{
+        int          rc;
+        lnet_peer_t *lp;
+
+        LNET_LOCK();
+        
+        rc = lnet_nid2peer_locked(&lp, nid);
+        if (rc != 0) {
+                LNET_UNLOCK();
+                CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid));
+                return;
+        }
+
+        CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n",
+               libcfs_nid2str(lp->lp_nid), lp->lp_refcount, 
+               lp->lp_alive ? "up" : "down",
+               lp->lp_ni->ni_peertxcredits, 
+               lp->lp_rtrcredits, lp->lp_minrtrcredits, 
+               lp->lp_txcredits, lp->lp_mintxcredits, lp->lp_txqnob);
+
+        lnet_peer_decref_locked(lp);
+
+        LNET_UNLOCK();
+}
diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c
new file mode 100644 (file)
index 0000000..aec3d06
--- /dev/null
@@ -0,0 +1,1135 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <lnet/lib-lnet.h>
+
+#if defined(__KERNEL__) && defined(LNET_ROUTER)
+
+static char *forwarding = "";
+CFS_MODULE_PARM(forwarding, "s", charp, 0444,
+                "Explicitly enable/disable forwarding between networks");
+
+static int tiny_router_buffers = 512;
+CFS_MODULE_PARM(tiny_router_buffers, "i", int, 0444,
+                "# of 0 payload messages to buffer in the router");
+static int small_router_buffers = 256;
+CFS_MODULE_PARM(small_router_buffers, "i", int, 0444,
+                "# of small (1 page) messages to buffer in the router");
+static int large_router_buffers = 32;
+CFS_MODULE_PARM(large_router_buffers, "i", int, 0444,
+                "# of large messages to buffer in the router");
+
+static int auto_down = 1;
+CFS_MODULE_PARM(auto_down, "i", int, 0444,
+                "Automatically mark peers down on comms error");
+
+static int check_routers_before_use = 0;
+CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444,
+                "Assume routers are down and ping them before use");
+
+static int dead_router_check_interval = 0;
+CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0444,
+                "Seconds between dead router health checks (<= 0 to disable)");
+
+static int live_router_check_interval = 0;
+CFS_MODULE_PARM(live_router_check_interval, "i", int, 0444,
+                "Seconds between live router health checks (<= 0 to disable)");
+
+static int router_ping_timeout = 50;
+CFS_MODULE_PARM(router_ping_timeout, "i", int, 0444,
+                "Seconds to wait for the reply to a router health query");
+
+typedef struct
+{
+        work_struct_t           kpru_tq;
+        lnet_nid_t              kpru_nid;
+        int                     kpru_alive;
+        time_t                  kpru_when;
+} kpr_upcall_t;
+
+void
+kpr_do_upcall (void *arg)
+{
+        kpr_upcall_t *u = (kpr_upcall_t *)arg;
+
+#ifndef __WINNT__
+
+        char          nidstr[36];
+        char          whenstr[36];
+        char         *argv[] = {
+                NULL,
+                "ROUTER_NOTIFY",
+                nidstr,
+                u->kpru_alive ? "up" : "down",
+                whenstr,
+                NULL};
+
+        snprintf (nidstr, sizeof(nidstr), "%s", libcfs_nid2str(u->kpru_nid));
+        snprintf (whenstr, sizeof(whenstr), "%ld", u->kpru_when);
+
+        libcfs_run_upcall (argv);
+
+#endif /* __WINNT__ */
+
+        LIBCFS_FREE(u, sizeof(*u));
+}
+
+void
+kpr_upcall (lnet_nid_t gw_nid, int alive, time_t when)
+{
+        /* May be in arbitrary context */
+        kpr_upcall_t  *u;
+
+        LIBCFS_ALLOC_ATOMIC(u, sizeof(*u));
+        if (u == NULL) {
+                CERROR ("Upcall out of memory: nid %s %s\n",
+                        libcfs_nid2str(gw_nid), alive ? "up" : "down");
+                return;
+        }
+
+        u->kpru_nid        = gw_nid;
+        u->kpru_alive      = alive;
+        u->kpru_when       = when;
+
+        prepare_work (&u->kpru_tq, kpr_do_upcall, u);
+        schedule_work (&u->kpru_tq);
+}
+
+int
+lnet_peers_start_down(void)
+{
+        return check_routers_before_use;
+}
+
+void
+lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, time_t when)
+{
+        if (when < lp->lp_timestamp) {          /* out of date information */
+                CDEBUG(D_NET, "Out of date\n");
+                return;
+        }
+
+        lp->lp_timestamp = when;                /* update timestamp */
+        lp->lp_ping_deadline = 0;               /* disable ping timeout */
+
+        if (lp->lp_alive_count != 0 &&          /* got old news */
+            (!lp->lp_alive) == (!alive)) {      /* new date for old news */
+                CDEBUG(D_NET, "Old news\n");
+                return;
+        }
+
+        /* Flag that notification is outstanding */
+
+        lp->lp_alive_count++;
+        lp->lp_alive = !(!alive);               /* 1 bit! */
+        lp->lp_notify = 1;
+        lp->lp_notifylnd = notifylnd;
+
+        CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive);
+}
+
+void
+lnet_do_notify (lnet_peer_t *lp) 
+{
+        lnet_ni_t *ni = lp->lp_ni;
+        int        alive;
+        time_t     when;
+        int        lnd;
+        
+        LNET_LOCK();
+                
+        /* Notify only in 1 thread at any time to ensure ordered notification.
+         * NB individual events can be missed; the only guarantee is that you
+         * always get the most recent news */
+
+        if (lp->lp_notifying) {
+                LNET_UNLOCK();
+                return;
+        }
+
+        lp->lp_notifying = 1;
+        
+        while (lp->lp_notify) {
+                alive = lp->lp_alive;
+                when  = lp->lp_timestamp;
+                lnd   = lp->lp_notifylnd;
+
+                lp->lp_notify = 0;
+
+                LNET_UNLOCK();
+
+                /* A new notification could happen now; I'll handle it when
+                 * control returns to me */
+                
+                if (!lnd) {
+                        CDEBUG(D_NET, "Upcall: NID %s is %s\n",
+                               libcfs_nid2str(lp->lp_nid),
+                               alive ? "alive" : "dead");
+                        kpr_upcall(lp->lp_nid, alive, when);
+                } else {
+                        if (ni->ni_lnd->lnd_notify != NULL)
+                                (ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive);
+                }
+
+                LNET_LOCK();
+        }
+
+        lp->lp_notifying = 0;
+
+        LNET_UNLOCK();
+}
+
+int
+lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when)
+{
+        lnet_peer_t         *lp = NULL;
+        time_t               now = cfs_time_current_sec();
+
+        LASSERT (!in_interrupt ());
+
+        CDEBUG (D_NET, "%s notifying %s: %s\n",
+                (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+                libcfs_nid2str(nid),
+                alive ? "up" : "down");
+
+        if (ni != NULL &&
+            LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) {
+                CWARN ("Ignoring notification of %s %s by %s (different net)\n",
+                        libcfs_nid2str(nid), alive ? "birth" : "death",
+                        libcfs_nid2str(ni->ni_nid));
+                return -EINVAL;
+        }
+
+        /* can't do predictions... */
+        if (when > now) {
+                CWARN ("Ignoring prediction from %s of %s %s "
+                       "%ld seconds in the future\n",
+                       (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid),
+                       libcfs_nid2str(nid), alive ? "up" : "down",
+                       when - now);
+                return -EINVAL;
+        }
+
+        if (ni != NULL && !alive &&             /* LND telling me she's down */
+            !auto_down) {                       /* auto-down disabled */
+                CDEBUG(D_NET, "Auto-down disabled\n");
+                return 0;
+        }
+        
+        LNET_LOCK();
+
+        lp = lnet_find_peer_locked(nid);
+        if (lp == NULL) {
+                /* nid not found */
+                LNET_UNLOCK();
+                CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid));
+                return 0;
+        }
+
+        lnet_notify_locked(lp, ni == NULL, alive, when);
+
+        LNET_UNLOCK();
+        
+        lnet_do_notify(lp);
+        
+        LNET_LOCK();
+
+        lnet_peer_decref_locked(lp);
+
+        LNET_UNLOCK();
+        return 0;
+}
+EXPORT_SYMBOL(lnet_notify);
+
+#else
+
+int
+lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when)
+{
+        return -EOPNOTSUPP;
+}
+
+#endif
+
+/* Take a router reference on peer 'lp'.  Caller must hold LNET_LOCK.
+ * On the 0->1 transition the peer is inserted into the global
+ * the_lnet.ln_routers list (kept sorted by NID, ascending) and an
+ * extra peer ref is taken on behalf of that list. */
+static void
+lnet_rtr_addref_locked(lnet_peer_t *lp)
+{
+        LASSERT (lp->lp_refcount > 0);
+        LASSERT (lp->lp_rtr_refcount >= 0);
+
+        lp->lp_rtr_refcount++;
+        if (lp->lp_rtr_refcount == 1) {
+                struct list_head *pos;
+
+                /* a simple insertion sort: walk backwards from the tail
+                 * until we find the entry we belong after (or reach the
+                 * head), then insert after it */
+                list_for_each_prev(pos, &the_lnet.ln_routers) {
+                        lnet_peer_t *rtr = list_entry(pos, lnet_peer_t, 
+                                                      lp_rtr_list);
+
+                        if (rtr->lp_nid < lp->lp_nid)
+                                break;
+                }
+
+                list_add(&lp->lp_rtr_list, pos);
+                /* addref for the_lnet.ln_routers */
+                lnet_peer_addref_locked(lp);
+                /* bump so the router checker rescans its list */
+                the_lnet.ln_routers_version++;
+        }
+}
+
+/* Drop a router reference on peer 'lp'.  Caller must hold LNET_LOCK.
+ * On the 1->0 transition the peer is removed from ln_routers and the
+ * list's peer ref is released. */
+static void
+lnet_rtr_decref_locked(lnet_peer_t *lp)
+{
+        LASSERT (lp->lp_refcount > 0);
+        LASSERT (lp->lp_rtr_refcount > 0);
+
+        lp->lp_rtr_refcount--;
+        if (lp->lp_rtr_refcount == 0) {
+                list_del(&lp->lp_rtr_list);
+                /* decref for the_lnet.ln_routers */
+                lnet_peer_decref_locked(lp);
+                /* bump so the router checker rescans its list */
+                the_lnet.ln_routers_version++;
+        }
+}
+
+/* Look up the remote-net descriptor for 'net' in ln_remote_nets.
+ * Caller must hold LNET_LOCK.  Returns NULL if no routes to 'net'
+ * are configured. */
+lnet_remotenet_t *
+lnet_find_net_locked (__u32 net)
+{
+        struct list_head *cursor;
+
+        LASSERT (!the_lnet.ln_shutdown);
+
+        list_for_each (cursor, &the_lnet.ln_remote_nets) {
+                lnet_remotenet_t *rnet =
+                        list_entry(cursor, lnet_remotenet_t, lrn_list);
+
+                if (rnet->lrn_net == net)
+                        return rnet;
+        }
+
+        return NULL;
+}
+
+/* Add a route to remote net 'net' via 'gateway' with hop count 'hops'.
+ * Returns 0 on success or when the entry is benignly ignored (route to
+ * a local net, or gateway unreachable), negative errno otherwise.
+ *
+ * Locking: route and remotenet memory is allocated before taking
+ * LNET_LOCK so nothing is allocated under the lock; the lock is
+ * dropped before calling out to the LND notify hook and before
+ * freeing memory. */
+int
+lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway)
+{
+        struct list_head     zombies;
+       struct list_head    *e;
+       lnet_remotenet_t    *rnet;
+       lnet_remotenet_t    *rnet2;
+       lnet_route_t        *route;
+       lnet_route_t        *route2;
+        lnet_ni_t           *ni;
+        int                  add_route;
+        int                  rc;
+
+        CDEBUG(D_NET, "Add route: net %s hops %u gw %s\n",
+               libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+
+        /* Sanity: real gateway NID, no loopback nets, real target net,
+         * gateway not itself on the target net, hop count in [1,255] */
+        if (gateway == LNET_NID_ANY ||
+            LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND ||
+            net == LNET_NIDNET(LNET_NID_ANY) ||
+            LNET_NETTYP(net) == LOLND ||
+            LNET_NIDNET(gateway) == net ||
+            hops < 1 || hops > 255)
+                return (-EINVAL);
+
+        if (lnet_islocalnet(net))               /* it's a local network */
+                return 0;                       /* ignore the route entry */
+
+        /* Assume net, route, all new */
+        LIBCFS_ALLOC(route, sizeof(*route));
+        LIBCFS_ALLOC(rnet, sizeof(*rnet));
+        if (route == NULL || rnet == NULL) {
+                CERROR("Out of memory creating route %s %d %s\n",
+                       libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+                if (route != NULL)
+                        LIBCFS_FREE(route, sizeof(*route));
+                if (rnet != NULL)
+                        LIBCFS_FREE(rnet, sizeof(*rnet));
+                return -ENOMEM;
+        }
+
+        INIT_LIST_HEAD(&rnet->lrn_routes);
+        rnet->lrn_net = net;
+        rnet->lrn_hops = hops;
+
+        LNET_LOCK();
+
+        /* takes a ref on the gateway peer (dropped below if the route
+         * is not kept) */
+        rc = lnet_nid2peer_locked(&route->lr_gateway, gateway);
+        if (rc != 0) {
+                LNET_UNLOCK();
+
+                LIBCFS_FREE(route, sizeof(*route));
+                LIBCFS_FREE(rnet, sizeof(*rnet));
+
+                if (rc == -EHOSTUNREACH)        /* gateway is not on a local net */
+                        return 0;               /* ignore the route entry */
+
+                CERROR("Error %d creating route %s %d %s\n", rc,
+                       libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+                return rc;
+        }
+
+        LASSERT (!the_lnet.ln_shutdown);
+        CFS_INIT_LIST_HEAD(&zombies);
+
+        rnet2 = lnet_find_net_locked(net);
+        if (rnet2 == NULL) {
+                /* new network */
+                list_add_tail(&rnet->lrn_list, &the_lnet.ln_remote_nets);
+                rnet2 = rnet;
+        }
+
+        if (hops > rnet2->lrn_hops) {
+                /* New route is longer; ignore it */
+                add_route = 0;
+        } else if (hops < rnet2->lrn_hops) {
+                /* new route supersedes all currently known routes to this
+                 * net: splice the existing routes onto 'zombies' to be
+                 * freed once the lock is dropped */
+                list_add(&zombies, &rnet2->lrn_routes);
+                list_del_init(&rnet2->lrn_routes);
+                add_route = 1;
+        } else {
+                add_route = 1;
+                /* New route has the same hopcount as existing routes; search
+                 * for a duplicate route (it's a NOOP if it is) */
+                list_for_each (e, &rnet2->lrn_routes) {
+                        route2 = list_entry(e, lnet_route_t, lr_list);
+
+                        if (route2->lr_gateway == route->lr_gateway) {
+                                add_route = 0;
+                                break;
+                        }
+
+                        /* our lookups must be true */
+                        LASSERT (route2->lr_gateway->lp_nid != gateway);
+                }
+        }
+        
+        if (add_route) {
+                /* hold the gateway's NI across the unlocked notify call */
+                ni = route->lr_gateway->lp_ni;
+                lnet_ni_addref_locked(ni);
+                
+                LASSERT (rc == 0);
+                list_add_tail(&route->lr_list, &rnet2->lrn_routes);
+                the_lnet.ln_remote_nets_version++;
+
+                lnet_rtr_addref_locked(route->lr_gateway);
+
+                LNET_UNLOCK();
+
+                /* XXX Assume alive */
+                if (ni->ni_lnd->lnd_notify != NULL)
+                        (ni->ni_lnd->lnd_notify)(ni, gateway, 1);
+
+                lnet_ni_decref(ni);
+        } else {
+                /* route not kept: release the gateway ref and the memory */
+                lnet_peer_decref_locked(route->lr_gateway);
+                LNET_UNLOCK();
+                LIBCFS_FREE(route, sizeof(*route));
+        }
+
+        /* rnet only consumed if it became the new network's descriptor */
+        if (rnet != rnet2)
+                LIBCFS_FREE(rnet, sizeof(*rnet));
+
+        /* free any superseded routes, re-taking the lock per decref */
+        while (!list_empty(&zombies)) {
+                route = list_entry(zombies.next, lnet_route_t, lr_list);
+                list_del(&route->lr_list);
+                
+                LNET_LOCK();
+                lnet_peer_decref_locked(route->lr_gateway);
+                LNET_UNLOCK();
+                LIBCFS_FREE(route, sizeof(*route));
+        }
+
+        return rc;
+}
+
+/* Sanity-check the route table: every route to a given remote net must
+ * go via a gateway on the same local NI.  Returns -EINVAL (with a
+ * console error naming the offending gateways) if two routes to the
+ * same net use different local interfaces, else 0. */
+int
+lnet_check_routes (void)
+{
+        lnet_remotenet_t    *rnet;
+        lnet_route_t        *route;
+        lnet_route_t        *route2;
+        struct list_head    *e1;
+        struct list_head    *e2;
+
+        LNET_LOCK();
+
+        list_for_each (e1, &the_lnet.ln_remote_nets) {
+                rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+                /* route2 remembers the first route seen for this net */
+                route2 = NULL;
+                list_for_each (e2, &rnet->lrn_routes) {
+                        route = list_entry(e2, lnet_route_t, lr_list);
+
+                        if (route2 == NULL)
+                                route2 = route;
+                        else if (route->lr_gateway->lp_ni !=
+                                 route2->lr_gateway->lp_ni) {
+                                /* drop the lock before console output */
+                                LNET_UNLOCK();
+                                
+                                CERROR("Routes to %s via %s and %s not supported\n",
+                                       libcfs_net2str(rnet->lrn_net),
+                                       libcfs_nid2str(route->lr_gateway->lp_nid),
+                                       libcfs_nid2str(route2->lr_gateway->lp_nid));
+                                return -EINVAL;
+                        }
+                }
+        }
+        
+        LNET_UNLOCK();
+        return 0;
+}
+
+/* Delete route(s).  'net' may be LNET_NIDNET(LNET_NID_ANY) to match
+ * all nets and 'gw_nid' may be LNET_NID_ANY to match all gateways.
+ * Returns 0 if at least one route was removed, -ENOENT otherwise. */
+int
+lnet_del_route (__u32 net, lnet_nid_t gw_nid)
+{
+        lnet_remotenet_t    *rnet;
+        lnet_route_t        *route;
+        struct list_head    *e1;
+        struct list_head    *e2;
+        int                  rc = -ENOENT;
+
+        CDEBUG(D_NET, "Del route: net %s : gw %s\n",
+               libcfs_net2str(net), libcfs_nid2str(gw_nid));
+
+        /* NB Caller may specify either all routes via the given gateway
+         * or a specific route entry (net and gateway given as actual
+         * NIDs) */
+
+        /* restart the scan after each deletion: the lock is dropped to
+         * free memory, so the lists may have changed under us */
+ again:
+        LNET_LOCK();
+
+        list_for_each (e1, &the_lnet.ln_remote_nets) {
+                rnet = list_entry(e1, lnet_remotenet_t, lrn_list);
+
+                if (!(net == LNET_NIDNET(LNET_NID_ANY) ||
+                      net == rnet->lrn_net))
+                        continue;
+
+                list_for_each (e2, &rnet->lrn_routes) {
+                        route = list_entry(e2, lnet_route_t, lr_list);
+
+                        if (!(gw_nid == LNET_NID_ANY ||
+                              gw_nid == route->lr_gateway->lp_nid))
+                                continue;
+
+                        list_del(&route->lr_list);
+                        the_lnet.ln_remote_nets_version++;
+
+                        /* free the remotenet too once its last route goes;
+                         * rnet is NULLed if it must NOT be freed below */
+                        if (list_empty(&rnet->lrn_routes))
+                                list_del(&rnet->lrn_list);
+                        else
+                                rnet = NULL;
+
+                        lnet_rtr_decref_locked(route->lr_gateway);
+                        lnet_peer_decref_locked(route->lr_gateway);
+                        LNET_UNLOCK();
+
+                        LIBCFS_FREE(route, sizeof (*route));
+
+                        if (rnet != NULL)
+                                LIBCFS_FREE(rnet, sizeof(*rnet));
+
+                        rc = 0;
+                        goto again;
+                }
+        }
+
+        LNET_UNLOCK();
+        return rc;
+}
+
+/* Remove every configured route: wildcard delete over all nets and
+ * all gateways. */
+void
+lnet_destroy_routes (void)
+{
+        lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY);
+}
+
+/* Fetch the 'idx'th route (counting across all remote nets, in table
+ * order) into *net/*hops/*gateway/*alive.  Returns 0 on success,
+ * -ENOENT when idx is beyond the last route. */
+int
+lnet_get_route (int idx, __u32 *net, __u32 *hops,
+               lnet_nid_t *gateway, __u32 *alive)
+{
+        struct list_head *net_cursor;
+        struct list_head *route_cursor;
+        lnet_remotenet_t *rnet;
+        lnet_route_t     *route;
+
+        LNET_LOCK();
+
+        list_for_each (net_cursor, &the_lnet.ln_remote_nets) {
+                rnet = list_entry(net_cursor, lnet_remotenet_t, lrn_list);
+
+                list_for_each (route_cursor, &rnet->lrn_routes) {
+                        if (idx-- != 0)
+                                continue;
+
+                        route = list_entry(route_cursor, lnet_route_t,
+                                           lr_list);
+                        *net     = rnet->lrn_net;
+                        *hops    = rnet->lrn_hops;
+                        *gateway = route->lr_gateway->lp_nid;
+                        *alive   = route->lr_gateway->lp_alive;
+                        LNET_UNLOCK();
+                        return 0;
+                }
+        }
+
+        LNET_UNLOCK();
+        return -ENOENT;
+}
+
+#if defined(__KERNEL__) && defined(LNET_ROUTER)
+/* EQ callback for the router checker's ping MD. */
+static void
+lnet_router_checker_event (lnet_event_t *event)
+{
+        /* CAVEAT EMPTOR: I'm called with LNET_LOCKed and I'm not allowed to
+         * drop it (that's how come I see _every_ event, even ones that would
+         * overflow my EQ) */
+        lnet_peer_t   *lp;
+        lnet_nid_t     nid;
+
+        if (event->unlinked) {
+                /* The router checker thread has unlinked the rc_md
+                 * and exited. */
+                LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_UNLINKING);
+                the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKED; 
+                mutex_up(&the_lnet.ln_rc_signal); 
+                return;
+        }
+
+        LASSERT (event->type == LNET_EVENT_SEND || 
+                 event->type == LNET_EVENT_REPLY);
+        
+        /* SEND events carry the ping target; REPLY events carry the
+         * responding router as initiator */
+        nid = (event->type == LNET_EVENT_SEND) ?
+              event->target.nid : event->initiator.nid;
+
+        lp = lnet_find_peer_locked(nid);
+        if (lp == NULL) {
+                /* router may have been removed */
+                CDEBUG(D_NET, "Router %s not found\n", libcfs_nid2str(nid));
+                return;
+        }
+
+        if (event->type == LNET_EVENT_SEND)     /* re-enable another ping */
+                lp->lp_ping_notsent = 0;
+
+        if (lnet_isrouter(lp) &&                /* ignore if no longer a router */
+            (event->status != 0 ||
+             event->type == LNET_EVENT_REPLY)) {
+                
+                /* A successful REPLY means the router is up.  If _any_ comms
+                 * to the router fail I assume it's down (this will happen if
+                 * we ping alive routers to try to detect router death before
+                 * apps get burned). */
+
+                lnet_notify_locked(lp, 1, (event->status == 0),
+                                   cfs_time_current_sec());
+
+                /* The router checker will wake up very shortly and do the
+                 * actual notification.  
+                 * XXX If 'lp' stops being a router before then, it will still
+                 * have the notification pending!!! */
+        }
+
+        /* This decref will NOT drop LNET_LOCK (it had to have 1 ref when it
+         * was in the peer table and I've not dropped the lock, so no-one else
+         * can have reduced the refcount) */
+        LASSERT(lp->lp_refcount > 1);
+
+        lnet_peer_decref_locked(lp);
+}
+
+/* Router checker kernel thread: binds a persistent ping MD, signals
+ * the starter via ln_rc_signal, then once a second pings each known
+ * router whose check interval has expired, marking routers dead when
+ * a ping deadline passes.  Exits when ln_rc_state leaves RUNNING;
+ * final teardown is completed by the unlink event in
+ * lnet_router_checker_event(). */
+static int
+lnet_router_checker(void *arg)
+{
+        static lnet_ping_info_t   pinginfo;
+
+        int                  rc;
+        lnet_handle_md_t     mdh;
+        lnet_peer_t         *rtr;
+        struct list_head    *entry;
+        time_t               now;
+        lnet_process_id_t    rtr_id;
+        int                  secs;
+
+       cfs_daemonize("router_checker");
+       cfs_block_allsigs();
+
+        rtr_id.pid = LUSTRE_SRV_LNET_PID;
+
+        LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+        /* one MD, reused for every ping; replies may be truncated */
+        rc = LNetMDBind((lnet_md_t){.start     = &pinginfo,
+                                    .length    = sizeof(pinginfo),
+                                    .threshold = LNET_MD_THRESH_INF,
+                                    .options   = LNET_MD_TRUNCATE,
+                                    .eq_handle = the_lnet.ln_rc_eqh},
+                        LNET_UNLINK,
+                        &mdh);
+
+        if (rc < 0) {
+                CERROR("Can't bind MD: %d\n", rc);
+                /* parent reads the error code out of ln_rc_state */
+                the_lnet.ln_rc_state = rc;
+                mutex_up(&the_lnet.ln_rc_signal);
+                return rc;
+        }
+
+        LASSERT (rc == 0);
+
+        the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING;
+        mutex_up(&the_lnet.ln_rc_signal);       /* let my parent go */
+
+       while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) {
+                __u64 version;
+
+                LNET_LOCK();
+rescan:
+                version = the_lnet.ln_routers_version;
+
+                list_for_each (entry, &the_lnet.ln_routers) {
+                        rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+                        /* hold rtr across the unlocked sections below */
+                        lnet_peer_addref_locked(rtr);
+
+                        now = cfs_time_current_sec();
+
+                        if (rtr->lp_ping_deadline != 0 && /* ping timed out? */
+                            now > rtr->lp_ping_deadline)
+                                lnet_notify_locked(rtr, 1, 0, now);
+
+                        LNET_UNLOCK();
+
+                        /* Run any outstanding notifications */
+                        lnet_do_notify(rtr);
+
+                        /* choose check interval by current liveness */
+                        if (rtr->lp_alive) {
+                                secs = live_router_check_interval;
+                        } else {
+                                secs = dead_router_check_interval;
+                        }
+                        if (secs <= 0)
+                                secs = 0;       /* interval disabled */
+                        
+                        if (secs != 0 &&
+                            !rtr->lp_ping_notsent &&
+                            now > rtr->lp_ping_timestamp + secs) {
+                                CDEBUG(D_NET, "Check: %s\n",
+                                       libcfs_nid2str(rtr->lp_nid));
+
+                                LNET_LOCK();
+                                rtr_id.nid = rtr->lp_nid;
+                                rtr->lp_ping_notsent = 1;
+                                rtr->lp_ping_timestamp = now;
+
+                                /* deadline for declaring the router dead */
+                                if (rtr->lp_ping_deadline == 0)
+                                        rtr->lp_ping_deadline = 
+                                                now + router_ping_timeout;
+
+                                LNET_UNLOCK();
+
+                                LNetGet(LNET_NID_ANY, mdh, rtr_id,
+                                        LNET_RESERVED_PORTAL,
+                                        LNET_PROTO_PING_MATCHBITS, 0);
+                        }
+                        
+                        LNET_LOCK();
+                        lnet_peer_decref_locked(rtr);
+
+                        if (version != the_lnet.ln_routers_version) {
+                                /* the routers list has changed */
+                                goto rescan;
+                        }
+                }
+
+                LNET_UNLOCK();
+
+                /* Call cfs_pause() here always adds 1 to load average 
+                 * because kernel counts # active tasks as nr_running 
+                 * + nr_uninterruptible. */
+                set_current_state(CFS_TASK_INTERRUPTIBLE);
+                cfs_schedule_timeout(CFS_TASK_INTERRUPTIBLE,
+                                     cfs_time_seconds(1));
+       }
+
+        LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_STOPTHREAD);
+        the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKING;
+        
+        rc = LNetMDUnlink(mdh);
+        LASSERT (rc == 0);
+
+        /* The unlink event callback will signal final completion */
+
+       return 0;
+}
+
+
+/* Block until every router in ln_routers has been heard from at least
+ * once (lp_alive_count != 0), polling once a second. */
+void
+lnet_wait_known_routerstate(void)
+{
+        struct list_head *entry;
+        lnet_peer_t      *rtr;
+        int               unknown;
+
+        for (;;) {
+                unknown = 0;
+
+                LNET_LOCK();
+                list_for_each (entry, &the_lnet.ln_routers) {
+                        rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+                        if (rtr->lp_alive_count == 0) {
+                                unknown = 1;
+                                break;
+                        }
+                }
+                LNET_UNLOCK();
+
+                if (!unknown)
+                        return;
+
+                cfs_pause(cfs_time_seconds(1));
+        }
+}
+
+/* Stop the router checker thread (no-op if it never started): request
+ * STOPTHREAD, wait for the MD-unlink event to signal completion, then
+ * free the EQ and return the state machine to SHUTDOWN. */
+void
+lnet_router_checker_stop(void)
+{
+        int       rc;
+
+        LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING ||
+                 the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+        if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN)
+                return;
+
+        the_lnet.ln_rc_state = LNET_RC_STATE_STOPTHREAD;
+       /* block until event callback signals exit */
+       mutex_down(&the_lnet.ln_rc_signal);
+
+        LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_UNLINKED);
+
+        rc = LNetEQFree(the_lnet.ln_rc_eqh);
+        LASSERT (rc == 0);
+        
+        the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+}
+
+/* Start the router checker thread if any check interval is enabled.
+ * Allocates the EQ, spawns the thread, then waits on ln_rc_signal for
+ * the thread to report success (RUNNING) or an error code via
+ * ln_rc_state.  Returns 0 on success or when checking is disabled. */
+int
+lnet_router_checker_start(void)
+{
+        int  rc;
+
+        LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+        if (check_routers_before_use &&
+            dead_router_check_interval <= 0) {
+                LCONSOLE_ERROR("'dead_router_check_interval' must be set if "
+                               "'check_routers_before_use' is set\n");
+                return -EINVAL;
+        }
+        
+        /* both intervals disabled: nothing to do */
+        if (live_router_check_interval <= 0 &&
+            dead_router_check_interval <= 0)
+                return 0;
+
+       init_mutex_locked(&the_lnet.ln_rc_signal);
+
+        /* EQ size doesn't matter; the callback is guaranteed to get every
+         * event */
+        rc = LNetEQAlloc(1, lnet_router_checker_event,
+                         &the_lnet.ln_rc_eqh);
+        if (rc != 0) {
+                CERROR("Can't allocate EQ: %d\n", rc);
+                return -ENOMEM;
+        }
+
+       rc = (int)cfs_kernel_thread(lnet_router_checker, NULL, 0);
+       if (rc < 0) {
+               CERROR("Can't start router checker thread: %d\n", rc);
+                goto failed;
+       }
+
+       mutex_down(&the_lnet.ln_rc_signal);     /* wait for checker to startup */
+
+        /* thread reports startup failure through ln_rc_state */
+        rc = the_lnet.ln_rc_state;
+        if (rc < 0) {
+                the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN;
+                goto failed;
+        }
+        
+        LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING);
+
+        if (check_routers_before_use) {
+                /* Note that a helpful side-effect of pinging all known routers
+                 * at startup is that it makes them drop stale connections they
+                 * may have to a previous instance of me. */
+                lnet_wait_known_routerstate();
+        }
+        
+        return 0;
+        
+ failed:
+        rc = LNetEQFree(the_lnet.ln_rc_eqh);
+        LASSERT (rc == 0);
+        return rc;
+}
+
+/* Free a router buffer: release the 'npages' pages attached to its
+ * kiov, then the descriptor itself. */
+void
+lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages)
+{
+        int size = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
+        int i;
+
+        for (i = npages - 1; i >= 0; i--)
+                cfs_free_page(rb->rb_kiov[i].kiov_page);
+
+        LIBCFS_FREE(rb, size);
+}
+
+/* Allocate one router buffer for pool 'rbp': the lnet_rtrbuf_t header
+ * plus rbp->rbp_npages zeroed pages mapped into rb_kiov[].  Returns
+ * NULL on allocation failure (any pages already taken are released). */
+lnet_rtrbuf_t *
+lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp)
+{
+        int            npages = rbp->rbp_npages;
+        int            sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]);
+        struct page   *page;
+        lnet_rtrbuf_t *rb;
+        int            i;
+
+        LIBCFS_ALLOC(rb, sz);
+        if (rb == NULL)         /* fix: was dereferenced without a check */
+                return NULL;
+
+        rb->rb_pool = rbp;
+
+        for (i = 0; i < npages; i++) {
+                page = cfs_alloc_page(CFS_ALLOC_ZERO | CFS_ALLOC_STD);
+                if (page == NULL) {
+                        /* unwind the pages allocated so far */
+                        while (--i >= 0)
+                                cfs_free_page(rb->rb_kiov[i].kiov_page);
+
+                        LIBCFS_FREE(rb, sz);
+                        return NULL;
+                }
+
+                rb->rb_kiov[i].kiov_len = CFS_PAGE_SIZE;
+                rb->rb_kiov[i].kiov_offset = 0;
+                rb->rb_kiov[i].kiov_page = page;
+        }
+
+        return rb;
+}
+
+/* Drain and free every buffer in pool 'rbp', checking the pool's
+ * bookkeeping (rbp_nbuffers/rbp_credits) against what was actually
+ * found, then zero the counters. */
+void
+lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp)
+{
+        lnet_rtrbuf_t *rb;
+        int            npages = rbp->rbp_npages;
+        int            count = 0;
+
+        /* no queued messages, and no buffers out on loan */
+        LASSERT (list_empty(&rbp->rbp_msgs));
+        LASSERT (rbp->rbp_credits == rbp->rbp_nbuffers);
+
+        while (!list_empty(&rbp->rbp_bufs)) {
+                LASSERT (rbp->rbp_credits > 0);
+
+                rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list);
+                list_del(&rb->rb_list);
+                lnet_destroy_rtrbuf(rb, npages);
+                count++;
+        }
+
+        /* every buffer the pool claimed was found on the free list */
+        LASSERT (rbp->rbp_nbuffers == count);
+        LASSERT (rbp->rbp_credits == count);
+
+        rbp->rbp_nbuffers = rbp->rbp_credits = 0;
+}
+
+/* Populate pool 'rbp' with 'nbufs' buffers, bumping the buffer/credit
+ * counters as each one is added.  Idempotent: if the pool is already
+ * populated it must hold exactly 'nbufs' buffers.  Returns 0 or
+ * -ENOMEM (partially-added buffers are left in the pool; the caller
+ * cleans up via lnet_free_rtrpools()). */
+int
+lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs)
+{
+        lnet_rtrbuf_t *rb;
+        int            i;
+
+        if (rbp->rbp_nbuffers != 0) {
+                LASSERT (rbp->rbp_nbuffers == nbufs);
+                return 0;
+        }
+        
+        for (i = 0; i < nbufs; i++) {
+                rb = lnet_new_rtrbuf(rbp);
+
+                if (rb == NULL) {
+                        CERROR("Failed to allocate %d router bufs of %d pages\n",
+                               nbufs, rbp->rbp_npages);
+                        return -ENOMEM;
+                }
+
+                rbp->rbp_nbuffers++;
+                rbp->rbp_credits++;
+                rbp->rbp_mincredits++;
+                list_add(&rb->rb_list, &rbp->rbp_bufs);
+
+                /* No allocation "under fire" */
+                /* Otherwise we'd need code to schedule blocked msgs etc */
+                LASSERT (!the_lnet.ln_routing);
+        }
+
+        LASSERT (rbp->rbp_credits == nbufs);
+        return 0;
+}
+
+/* Initialise an empty router buffer pool for buffers of 'npages'
+ * pages: no queued messages, no buffers, no credits issued. */
+void
+lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages)
+{
+        CFS_INIT_LIST_HEAD(&rbp->rbp_msgs);
+        CFS_INIT_LIST_HEAD(&rbp->rbp_bufs);
+
+        rbp->rbp_npages = npages;
+        rbp->rbp_credits = 0;
+        rbp->rbp_mincredits = 0;
+}
+
+/* Free the buffers of all three router pools (tiny, small, large). */
+void
+lnet_free_rtrpools(void)
+{
+        int i;
+
+        for (i = 0; i < 3; i++)
+                lnet_rtrpool_free_bufs(&the_lnet.ln_rtrpools[i]);
+}
+
+/* Initialise the three router pools: pool 0 for zero-payload messages,
+ * pool 1 for single-page payloads, pool 2 for payloads up to LNET_MTU. */
+void
+lnet_init_rtrpools(void)
+{
+        int large_pages = (LNET_MTU + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
+
+        lnet_rtrpool_init(&the_lnet.ln_rtrpools[0], 0);
+        lnet_rtrpool_init(&the_lnet.ln_rtrpools[1], 1);
+        lnet_rtrpool_init(&the_lnet.ln_rtrpools[2], large_pages);
+}
+
+
+/* Enable routing: validate the 'forwarding' and *_router_buffers module
+ * parameters, fill the tiny/small/large pools, and finally set
+ * the_lnet.ln_routing under LNET_LOCK.  'im_a_router' enables routing
+ * by default when 'forwarding' is unset.  On any failure all pools are
+ * freed and a negative errno is returned. */
+int
+lnet_alloc_rtrpools(int im_a_router)
+{
+        int       rc;
+        
+        if (!strcmp(forwarding, "")) {
+                /* not set either way */
+                if (!im_a_router)
+                        return 0;
+        } else if (!strcmp(forwarding, "disabled")) {
+                /* explicitly disabled */
+                return 0;
+        } else if (!strcmp(forwarding, "enabled")) {
+                /* explicitly enabled */
+        } else {
+                LCONSOLE_ERROR("'forwarding' not set to either "
+                               "'enabled' or 'disabled'\n");
+                return -EINVAL;
+        }
+        
+        if (tiny_router_buffers <= 0) {
+                LCONSOLE_ERROR("tiny_router_buffers=%d invalid when "
+                               "routing enabled\n", tiny_router_buffers);
+                rc = -EINVAL;
+                goto failed;
+        }
+
+        rc = lnet_rtrpool_alloc_bufs(&the_lnet.ln_rtrpools[0],
+                                     tiny_router_buffers);
+        if (rc != 0)
+                goto failed;
+
+        if (small_router_buffers <= 0) {
+                LCONSOLE_ERROR("small_router_buffers=%d invalid when "
+                               "routing enabled\n", small_router_buffers);
+                rc = -EINVAL;
+                goto failed;
+        }
+
+        rc = lnet_rtrpool_alloc_bufs(&the_lnet.ln_rtrpools[1],
+                                     small_router_buffers);
+        if (rc != 0)
+                goto failed;
+
+        if (large_router_buffers <= 0) {
+                LCONSOLE_ERROR("large_router_buffers=%d invalid when "
+                               "routing enabled\n", large_router_buffers);
+                rc = -EINVAL;
+                goto failed;
+        }
+
+        rc = lnet_rtrpool_alloc_bufs(&the_lnet.ln_rtrpools[2],
+                                     large_router_buffers);
+        if (rc != 0)
+                goto failed;
+
+        /* only now may messages be routed */
+        LNET_LOCK();
+        the_lnet.ln_routing = 1;
+        LNET_UNLOCK();
+        
+        return 0;
+
+ failed:
+        lnet_free_rtrpools();
+        return rc;
+}
+
+#else
+
+/* Non-router build: returns 0 — presumably "peers do not start in the
+ * down state"; kernel-router counterpart not visible here, confirm. */
+int
+lnet_peers_start_down(void)
+{
+        return 0;
+}
+
+/* Non-router build: no router checker thread to stop. */
+void
+lnet_router_checker_stop(void)
+{
+}
+
+/* Non-router build: no router checker thread to start; always succeeds. */
+int
+lnet_router_checker_start(void)
+{
+        return 0;
+}
+
+/* Non-router build: no router buffer pools to free. */
+void
+lnet_free_rtrpools (void)
+{
+}
+
+/* Non-router build: no router buffer pools to initialise. */
+void
+lnet_init_rtrpools (void)
+{
+}
+
+/* Non-router build: nothing to allocate; always succeeds.
+ * (Fixed parameter-name typo 'im_a_arouter' to match the kernel-router
+ * version's 'im_a_router'; C callers are unaffected.) */
+int
+lnet_alloc_rtrpools (int im_a_router)
+{
+        return 0;
+}
+
+#endif
diff --git a/lnet/lnet/router_proc.c b/lnet/lnet/router_proc.c
new file mode 100644 (file)
index 0000000..5be36b1
--- /dev/null
@@ -0,0 +1,1094 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *
+ *   This file is part of Portals
+ *   http://sourceforge.net/projects/sandiaportals/
+ *
+ *   Portals is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Portals is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Portals; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#include <libcfs/libcfs.h>
+#include <lnet/lib-lnet.h>
+
+#if defined(__KERNEL__) && defined(LNET_ROUTER)
+
+#include <linux/seq_file.h>
+#include <linux/lustre_compat25.h>
+
+/* this is really lnet_proc.c */
+
+#define LNET_PROC_STATS   "sys/lnet/stats"
+#define LNET_PROC_ROUTES  "sys/lnet/routes"
+#define LNET_PROC_ROUTERS "sys/lnet/routers"
+#define LNET_PROC_PEERS   "sys/lnet/peers"
+#define LNET_PROC_BUFFERS "sys/lnet/buffers"
+#define LNET_PROC_NIS     "sys/lnet/nis"
+
/* Read handler for /proc/sys/lnet/stats: emits one line of message/byte
 * counters.  Single-shot read: *eof is always set and any non-zero offset
 * returns 0 (the whole line must fit in one page). */
static int
lnet_router_proc_stats_read (char *page, char **start, off_t off,
                             int count, int *eof, void *data)
{
        lnet_counters_t *ctrs;
        int              rc;

        *start = page;
        *eof = 1;
        if (off != 0)
                return 0;

        /* heap copy of the counters -- presumably to keep this large struct
         * off the kernel stack; TODO confirm sizeof(lnet_counters_t) */
        LIBCFS_ALLOC(ctrs, sizeof(*ctrs));
        if (ctrs == NULL)
                return -ENOMEM;

        /* snapshot the global counters under the LNET lock */
        LNET_LOCK();
        *ctrs = the_lnet.ln_counters;
        LNET_UNLOCK();

        rc = sprintf(page,
                     "%u %u %u %u %u %u %u "LPU64" "LPU64" "LPU64" "LPU64"\n",
                     ctrs->msgs_alloc, ctrs->msgs_max,
                     ctrs->errors,
                     ctrs->send_count, ctrs->recv_count,
                     ctrs->route_count, ctrs->drop_count,
                     ctrs->send_length, ctrs->recv_length,
                     ctrs->route_length, ctrs->drop_length);

        LIBCFS_FREE(ctrs, sizeof(*ctrs));
        return rc;
}
+
+static int
+lnet_router_proc_stats_write(struct file *file, const char *ubuffer,
+                     unsigned long count, void *data)
+{
+        LNET_LOCK();
+        memset(&the_lnet.ln_counters, 0, sizeof(the_lnet.ln_counters));
+        LNET_UNLOCK();
+
+        return (count);
+}
+
/* seq_file iterator state for /proc/sys/lnet/routes.  lrsi_version snapshots
 * the_lnet.ln_remote_nets_version so table changes between calls can be
 * detected; lrsi_off is the offset this iterator last seeked to. */
typedef struct {
        __u64                lrsi_version;
        lnet_remotenet_t    *lrsi_net;
        lnet_route_t        *lrsi_route;
        loff_t               lrsi_off;
} lnet_route_seq_iterator_t;

/* Position 'lrsi' on the route at logical offset 'off'.  Offset 0 is the
 * synthetic header line; real routes start at 1.  Returns 0 on success,
 * -ESTALE if the route tables changed since the previous seek, and
 * -ENOENT when 'off' is past the last route. */
int
lnet_route_seq_seek (lnet_route_seq_iterator_t *lrsi, loff_t off)
{
        struct list_head  *n;
        struct list_head  *r;
        int                rc;
        loff_t             here;

        if (off == 0) {
                /* rewind to the header position; no lock needed */
                lrsi->lrsi_net = NULL;
                lrsi->lrsi_route = NULL;
                lrsi->lrsi_off = 0;
                return 0;
        }

        LNET_LOCK();

        if (lrsi->lrsi_net != NULL &&
            lrsi->lrsi_version != the_lnet.ln_remote_nets_version) {
                /* tables have changed */
                rc = -ESTALE;
                goto out;
        }

        if (lrsi->lrsi_net == NULL || lrsi->lrsi_off > off) {
                /* search from start */
                n = the_lnet.ln_remote_nets.next;
                r = NULL;
                here = 1;
        } else {
                /* continue search */
                n = &lrsi->lrsi_net->lrn_list;
                r = &lrsi->lrsi_route->lr_list;
                here = lrsi->lrsi_off;
        }

        lrsi->lrsi_version = the_lnet.ln_remote_nets_version;
        lrsi->lrsi_off        = off;

        /* walk the remote nets, and each net's route list, counting
         * logical positions until 'off' is reached */
        while (n != &the_lnet.ln_remote_nets) {
                lnet_remotenet_t *rnet =
                        list_entry(n, lnet_remotenet_t, lrn_list);

                if (r == NULL)
                        r = rnet->lrn_routes.next;

                while (r != &rnet->lrn_routes) {
                        lnet_route_t *re =
                                list_entry(r, lnet_route_t,
                                           lr_list);

                        if (here == off) {
                                lrsi->lrsi_net = rnet;
                                lrsi->lrsi_route = re;
                                rc = 0;
                                goto out;
                        }

                        r = r->next;
                        here++;
                }

                r = NULL;
                n = n->next;
        }

        /* ran off the end */
        lrsi->lrsi_net   = NULL;
        lrsi->lrsi_route = NULL;
        rc             = -ENOENT;
 out:
        LNET_UNLOCK();
        return rc;
}
+
+static void *
+lnet_route_seq_start (struct seq_file *s, loff_t *pos)
+{
+        lnet_route_seq_iterator_t *lrsi;
+        int                        rc;
+
+        LIBCFS_ALLOC(lrsi, sizeof(*lrsi));
+        if (lrsi == NULL)
+                return NULL;
+
+        lrsi->lrsi_net = NULL;
+        rc = lnet_route_seq_seek(lrsi, *pos);
+        if (rc == 0)
+                return lrsi;
+
+        LIBCFS_FREE(lrsi, sizeof(*lrsi));
+        return NULL;
+}
+
+static void
+lnet_route_seq_stop (struct seq_file *s, void *iter)
+{
+        lnet_route_seq_iterator_t  *lrsi = iter;
+
+        if (lrsi != NULL)
+                LIBCFS_FREE(lrsi, sizeof(*lrsi));
+}
+
+static void *
+lnet_route_seq_next (struct seq_file *s, void *iter, loff_t *pos)
+{
+        lnet_route_seq_iterator_t *lrsi = iter;
+        int                        rc;
+        loff_t                     next = *pos + 1;
+
+        rc = lnet_route_seq_seek(lrsi, next);
+        if (rc != 0) {
+                LIBCFS_FREE(lrsi, sizeof(*lrsi));
+                return NULL;
+        }
+
+        *pos = next;
+        return lrsi;
+}
+
/* seq_file .show for /proc/sys/lnet/routes: offset 0 prints the header
 * (routing on/off plus column titles), later offsets print one route.
 * All fields are copied under LNET_LOCK before formatting. */
static int
lnet_route_seq_show (struct seq_file *s, void *iter)
{
        lnet_route_seq_iterator_t *lrsi = iter;
        __u32                      net;
        unsigned int               hops;
        lnet_nid_t                 nid;
        int                        alive;

        if (lrsi->lrsi_off == 0) {
                seq_printf(s, "Routing %s\n",
                           the_lnet.ln_routing ? "enabled" : "disabled");
                seq_printf(s, "%-8s %4s %7s %s\n",
                           "net", "hops", "state", "router");
                return 0;
        }

        LASSERT (lrsi->lrsi_net != NULL);
        LASSERT (lrsi->lrsi_route != NULL);

        LNET_LOCK();

        /* bail out if the tables changed since the seek positioned us */
        if (lrsi->lrsi_version != the_lnet.ln_remote_nets_version) {
                LNET_UNLOCK();
                return -ESTALE;
        }

        net   = lrsi->lrsi_net->lrn_net;
        hops  = lrsi->lrsi_net->lrn_hops;
        nid   = lrsi->lrsi_route->lr_gateway->lp_nid;
        alive = lrsi->lrsi_route->lr_gateway->lp_alive;

        LNET_UNLOCK();

        seq_printf(s, "%-8s %4u %7s %s\n", libcfs_net2str(net), hops,
                   alive ? "up" : "down", libcfs_nid2str(nid));
        return 0;
}

/* seq_file operations for /proc/sys/lnet/routes */
static struct seq_operations lnet_routes_sops = {
        .start = lnet_route_seq_start,
        .stop  = lnet_route_seq_stop,
        .next  = lnet_route_seq_next,
        .show  = lnet_route_seq_show,
};

/* open: attach the routes seq_operations and the proc entry's data */
static int
lnet_route_seq_open(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *dp = PDE(inode);
        struct seq_file       *sf;
        int                    rc;

        rc = seq_open(file, &lnet_routes_sops);
        if (rc == 0) {
                sf = file->private_data;
                sf->private = dp->data;
        }

        return rc;
}

/* file operations for /proc/sys/lnet/routes */
static struct file_operations lnet_routes_fops = {
        .owner   = THIS_MODULE,
        .open    = lnet_route_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
+
/* seq_file iterator state for /proc/sys/lnet/routers.  lrtrsi_version
 * snapshots the_lnet.ln_routers_version to detect table changes. */
typedef struct {
        __u64                lrtrsi_version;
        lnet_peer_t          *lrtrsi_router;
        loff_t               lrtrsi_off;
} lnet_router_seq_iterator_t;

/* Position 'lrtrsi' on the router at logical offset 'off' (0 = header
 * line).  Returns 0 on success, -ESTALE if the router table changed since
 * the previous seek, -ENOENT past the end of the list. */
int
lnet_router_seq_seek (lnet_router_seq_iterator_t *lrtrsi, loff_t off)
{
        struct list_head  *r;
        lnet_peer_t       *lp;
        int                rc;
        loff_t             here;

        if (off == 0) {
                /* rewind to the header position; no lock needed */
                lrtrsi->lrtrsi_router = NULL;
                lrtrsi->lrtrsi_off = 0;
                return 0;
        }

        LNET_LOCK();

        lp = lrtrsi->lrtrsi_router;

        if (lp != NULL &&
            lrtrsi->lrtrsi_version != the_lnet.ln_routers_version) {
                /* tables have changed */
                rc = -ESTALE;
                goto out;
        }

        if (lp == NULL || lrtrsi->lrtrsi_off > off) {
                /* search from start */
                r = the_lnet.ln_routers.next;
                here = 1;
        } else {
                /* continue search */
                r = &lp->lp_rtr_list;
                here = lrtrsi->lrtrsi_off;
        }

        lrtrsi->lrtrsi_version = the_lnet.ln_routers_version;
        lrtrsi->lrtrsi_off        = off;

        /* walk the router list counting logical positions */
        while (r != &the_lnet.ln_routers) {
                lnet_peer_t *rtr = list_entry(r, 
                                              lnet_peer_t,
                                              lp_rtr_list);

                if (here == off) {
                        lrtrsi->lrtrsi_router = rtr;
                        rc = 0;
                        goto out;
                }

                r = r->next;
                here++;
        }

        /* ran off the end */
        lrtrsi->lrtrsi_router = NULL;
        rc             = -ENOENT;
 out:
        LNET_UNLOCK();
        return rc;
}
+
+static void *
+lnet_router_seq_start (struct seq_file *s, loff_t *pos)
+{
+        lnet_router_seq_iterator_t *lrtrsi;
+        int                        rc;
+
+        LIBCFS_ALLOC(lrtrsi, sizeof(*lrtrsi));
+        if (lrtrsi == NULL)
+                return NULL;
+
+        lrtrsi->lrtrsi_router = NULL;
+        rc = lnet_router_seq_seek(lrtrsi, *pos);
+        if (rc == 0)
+                return lrtrsi;
+
+        LIBCFS_FREE(lrtrsi, sizeof(*lrtrsi));
+        return NULL;
+}
+
+static void
+lnet_router_seq_stop (struct seq_file *s, void *iter)
+{
+        lnet_router_seq_iterator_t  *lrtrsi = iter;
+
+        if (lrtrsi != NULL)
+                LIBCFS_FREE(lrtrsi, sizeof(*lrtrsi));
+}
+
+static void *
+lnet_router_seq_next (struct seq_file *s, void *iter, loff_t *pos)
+{
+        lnet_router_seq_iterator_t *lrtrsi = iter;
+        int                        rc;
+        loff_t                     next = *pos + 1;
+
+        rc = lnet_router_seq_seek(lrtrsi, next);
+        if (rc != 0) {
+                LIBCFS_FREE(lrtrsi, sizeof(*lrtrsi));
+                return NULL;
+        }
+
+        *pos = next;
+        return lrtrsi;
+}
+
+static int
+lnet_router_seq_show (struct seq_file *s, void *iter)
+{
+        lnet_router_seq_iterator_t *lrtrsi = iter;
+        lnet_peer_t *lp;
+        lnet_nid_t  nid;
+        int         alive;
+        int         nrefs;
+        int         nrtrrefs;
+
+        if (lrtrsi->lrtrsi_off == 0) {
+                seq_printf(s, "%-4s %7s %9s %6s %12s %s\n",
+                           "ref", "rtr_ref", "alive_cnt", "state", "last_ping", "router");
+                return 0;
+        }
+
+        lp = lrtrsi->lrtrsi_router;
+        LASSERT (lp != NULL);
+
+        LNET_LOCK();
+
+        if (lrtrsi->lrtrsi_version != the_lnet.ln_routers_version) {
+                LNET_UNLOCK();
+                return -ESTALE;
+        }
+
+        nrefs = lp->lp_refcount;
+        nrtrrefs = lp->lp_rtr_refcount;
+        nid   = lp->lp_nid;
+        alive = lp->lp_alive;
+
+        LNET_UNLOCK();
+
+        seq_printf(s, 
+                   "%-4d %7d %9d %6s %12lu %s\n", 
+                   nrefs, nrtrrefs,
+                   lp->lp_alive_count,
+                   alive ? "up" : "down", 
+                   lp->lp_ping_timestamp,
+                   libcfs_nid2str(nid));
+        return 0;
+}
+
/* seq_file operations for /proc/sys/lnet/routers */
static struct seq_operations lnet_routers_sops = {
        .start = lnet_router_seq_start,
        .stop  = lnet_router_seq_stop,
        .next  = lnet_router_seq_next,
        .show  = lnet_router_seq_show,
};

/* open: attach the routers seq_operations and the proc entry's data */
static int
lnet_router_seq_open(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *dp = PDE(inode);
        struct seq_file       *sf;
        int                    rc;

        rc = seq_open(file, &lnet_routers_sops);
        if (rc == 0) {
                sf = file->private_data;
                sf->private = dp->data;
        }

        return rc;
}

/* file operations for /proc/sys/lnet/routers */
static struct file_operations lnet_routers_fops = {
        .owner   = THIS_MODULE,
        .open    = lnet_router_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
+
/* seq_file iterator state for /proc/sys/lnet/peers.  lpsi_idx is the peer
 * hash bucket, lpsi_peer the position within it.
 * NOTE(review): lpsi_version is 'unsigned long long' while the other
 * iterators here use __u64 -- same width, but __u64 would be consistent. */
typedef struct {
        unsigned long long   lpsi_version;
        int                  lpsi_idx;
        lnet_peer_t         *lpsi_peer;
        loff_t               lpsi_off;
} lnet_peer_seq_iterator_t;

/* Position 'lpsi' on the peer at logical offset 'off' (0 = header line)
 * by walking the peer hash table bucket by bucket.  Returns 0 on success,
 * -ESTALE if the peer table changed since the previous seek, -ENOENT past
 * the last peer. */
int
lnet_peer_seq_seek (lnet_peer_seq_iterator_t *lpsi, loff_t off)
{
        int                idx;
        struct list_head  *p;
        loff_t             here;
        int                rc;

        if (off == 0) {
                /* rewind to the header position; no lock needed */
                lpsi->lpsi_idx = 0;
                lpsi->lpsi_peer = NULL;
                lpsi->lpsi_off = 0;
                return 0;
        }

        LNET_LOCK();

        if (lpsi->lpsi_peer != NULL &&
            lpsi->lpsi_version != the_lnet.ln_peertable_version) {
                /* tables have changed */
                rc = -ESTALE;
                goto out;
        }

        if (lpsi->lpsi_peer == NULL ||
            lpsi->lpsi_off > off) {
                /* search from start */
                idx = 0;
                p = NULL;
                here = 1;
        } else {
                /* continue search */
                idx = lpsi->lpsi_idx;
                p = &lpsi->lpsi_peer->lp_hashlist;
                here = lpsi->lpsi_off;
        }

        lpsi->lpsi_version = the_lnet.ln_peertable_version;
        lpsi->lpsi_off     = off;

        /* walk each hash bucket in turn, counting logical positions */
        while (idx < LNET_PEER_HASHSIZE) {
                if (p == NULL)
                        p = the_lnet.ln_peer_hash[idx].next;

                while (p != &the_lnet.ln_peer_hash[idx]) {
                        lnet_peer_t *lp = list_entry(p, lnet_peer_t,
                                                     lp_hashlist);

                        if (here == off) {
                                lpsi->lpsi_idx = idx;
                                lpsi->lpsi_peer = lp;
                                rc = 0;
                                goto out;
                        }

                        here++;
                        p = lp->lp_hashlist.next;
                }

                p = NULL;
                idx++;
        }

        /* ran off the end of the table */
        lpsi->lpsi_idx  = 0;
        lpsi->lpsi_peer = NULL;
        rc              = -ENOENT;
 out:
        LNET_UNLOCK();
        return rc;
}
+
+static void *
+lnet_peer_seq_start (struct seq_file *s, loff_t *pos)
+{
+        lnet_peer_seq_iterator_t *lpsi;
+        int                        rc;
+
+        LIBCFS_ALLOC(lpsi, sizeof(*lpsi));
+        if (lpsi == NULL)
+                return NULL;
+
+        lpsi->lpsi_idx = 0;
+        lpsi->lpsi_peer = NULL;
+        rc = lnet_peer_seq_seek(lpsi, *pos);
+        if (rc == 0)
+                return lpsi;
+
+        LIBCFS_FREE(lpsi, sizeof(*lpsi));
+        return NULL;
+}
+
+static void
+lnet_peer_seq_stop (struct seq_file *s, void *iter)
+{
+        lnet_peer_seq_iterator_t  *lpsi = iter;
+
+        if (lpsi != NULL)
+                LIBCFS_FREE(lpsi, sizeof(*lpsi));
+}
+
+static void *
+lnet_peer_seq_next (struct seq_file *s, void *iter, loff_t *pos)
+{
+        lnet_peer_seq_iterator_t *lpsi = iter;
+        int                       rc;
+        loff_t                    next = *pos + 1;
+
+        rc = lnet_peer_seq_seek(lpsi, next);
+        if (rc != 0) {
+                LIBCFS_FREE(lpsi, sizeof(*lpsi));
+                return NULL;
+        }
+
+        *pos = next;
+        return lpsi;
+}
+
/* seq_file .show for /proc/sys/lnet/peers: offset 0 prints the column
 * header, later offsets print one peer's credits/state.  All fields are
 * copied under LNET_LOCK before formatting. */
static int
lnet_peer_seq_show (struct seq_file *s, void *iter)
{
        lnet_peer_seq_iterator_t *lpsi = iter;
        lnet_peer_t              *lp;
        lnet_nid_t                nid;
        int                       maxcr;
        int                       mintxcr;
        int                       txcr;
        int                       minrtrcr;
        int                       rtrcr;
        int                       alive;
        int                       txqnob;
        int                       nrefs;

        if (lpsi->lpsi_off == 0) {
                seq_printf(s, "%-24s %4s %5s %5s %5s %5s %5s %5s %s\n",
                           "nid", "refs", "state", "max",
                           "rtr", "min", "tx", "min", "queue");
                return 0;
        }

        LASSERT (lpsi->lpsi_peer != NULL);

        LNET_LOCK();

        /* bail out if the peer table changed since the seek */
        if (lpsi->lpsi_version != the_lnet.ln_peertable_version) {
                LNET_UNLOCK();
                return -ESTALE;
        }

        lp = lpsi->lpsi_peer;

        /* snapshot everything under the lock */
        nid      = lp->lp_nid;
        maxcr    = lp->lp_ni->ni_peertxcredits;
        txcr     = lp->lp_txcredits;
        mintxcr  = lp->lp_mintxcredits;
        rtrcr    = lp->lp_rtrcredits;
        minrtrcr = lp->lp_minrtrcredits;
        alive    = lp->lp_alive;
        txqnob   = lp->lp_txqnob;
        nrefs    = lp->lp_refcount;

        LNET_UNLOCK();

        seq_printf(s, "%-24s %4d %5s %5d %5d %5d %5d %5d %d\n",
                   libcfs_nid2str(nid), nrefs, alive ? "up" : "down",
                   maxcr, rtrcr, minrtrcr, txcr, mintxcr, txqnob);
        return 0;
}

/* seq_file operations for /proc/sys/lnet/peers */
static struct seq_operations lnet_peer_sops = {
        .start = lnet_peer_seq_start,
        .stop  = lnet_peer_seq_stop,
        .next  = lnet_peer_seq_next,
        .show  = lnet_peer_seq_show,
};

/* open: attach the peers seq_operations and the proc entry's data */
static int
lnet_peer_seq_open(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *dp = PDE(inode);
        struct seq_file       *sf;
        int                    rc;

        rc = seq_open(file, &lnet_peer_sops);
        if (rc == 0) {
                sf = file->private_data;
                sf->private = dp->data;
        }

        return rc;
}

/* file operations for /proc/sys/lnet/peers */
static struct file_operations lnet_peer_fops = {
        .owner   = THIS_MODULE,
        .open    = lnet_peer_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
+
/* seq_file iterator state for /proc/sys/lnet/buffers.  No version field:
 * ln_rtrpools is a fixed-size array, so there is no staleness to detect.
 * lbsi_idx == -1 means "not positioned". */
typedef struct {
        int                  lbsi_idx;
        loff_t               lbsi_off;
} lnet_buffer_seq_iterator_t;

/* Position 'lbsi' on the router buffer pool at logical offset 'off'
 * (0 = header line, 1..LNET_NRBPOOLS = pool index + 1).  Returns 0 on
 * success, -ENOENT past the last pool. */
int
lnet_buffer_seq_seek (lnet_buffer_seq_iterator_t *lbsi, loff_t off)
{
        int                idx;
        loff_t             here;
        int                rc;

        if (off == 0) {
                /* rewind to the header position; no lock needed */
                lbsi->lbsi_idx = -1;
                lbsi->lbsi_off = 0;
                return 0;
        }

        LNET_LOCK();

        if (lbsi->lbsi_idx < 0 ||
            lbsi->lbsi_off > off) {
                /* search from start */
                idx = 0;
                here = 1;
        } else {
                /* continue search */
                idx = lbsi->lbsi_idx;
                here = lbsi->lbsi_off;
        }

        lbsi->lbsi_off     = off;

        while (idx < LNET_NRBPOOLS) {
                if (here == off) {
                        lbsi->lbsi_idx = idx;
                        rc = 0;
                        goto out;
                }
                here++;
                idx++;
        }

        /* ran off the end */
        lbsi->lbsi_idx  = -1;
        rc              = -ENOENT;
 out:
        LNET_UNLOCK();
        return rc;
}
+
+static void *
+lnet_buffer_seq_start (struct seq_file *s, loff_t *pos)
+{
+        lnet_buffer_seq_iterator_t *lbsi;
+        int                        rc;
+
+        LIBCFS_ALLOC(lbsi, sizeof(*lbsi));
+        if (lbsi == NULL)
+                return NULL;
+
+        lbsi->lbsi_idx = -1;
+        rc = lnet_buffer_seq_seek(lbsi, *pos);
+        if (rc == 0)
+                return lbsi;
+
+        LIBCFS_FREE(lbsi, sizeof(*lbsi));
+        return NULL;
+}
+
+static void
+lnet_buffer_seq_stop (struct seq_file *s, void *iter)
+{
+        lnet_buffer_seq_iterator_t  *lbsi = iter;
+
+        if (lbsi != NULL)
+                LIBCFS_FREE(lbsi, sizeof(*lbsi));
+}
+
+static void *
+lnet_buffer_seq_next (struct seq_file *s, void *iter, loff_t *pos)
+{
+        lnet_buffer_seq_iterator_t *lbsi = iter;
+        int                         rc;
+        loff_t                      next = *pos + 1;
+
+        rc = lnet_buffer_seq_seek(lbsi, next);
+        if (rc != 0) {
+                LIBCFS_FREE(lbsi, sizeof(*lbsi));
+                return NULL;
+        }
+
+        *pos = next;
+        return lbsi;
+}
+
/* seq_file .show for /proc/sys/lnet/buffers: offset 0 prints the column
 * header, later offsets print one router buffer pool.  Fields are copied
 * under LNET_LOCK; no version check since the pool array is fixed. */
static int
lnet_buffer_seq_show (struct seq_file *s, void *iter)
{
        lnet_buffer_seq_iterator_t *lbsi = iter;
        lnet_rtrbufpool_t          *rbp;
        int                         npages;
        int                         nbuf;
        int                         cr;
        int                         mincr;

        if (lbsi->lbsi_off == 0) {
                seq_printf(s, "%5s %5s %7s %7s\n",
                           "pages", "count", "credits", "min");
                return 0;
        }

        LASSERT (lbsi->lbsi_idx >= 0 && lbsi->lbsi_idx < LNET_NRBPOOLS);

        LNET_LOCK();

        rbp = &the_lnet.ln_rtrpools[lbsi->lbsi_idx];

        /* snapshot everything under the lock */
        npages = rbp->rbp_npages;
        nbuf   = rbp->rbp_nbuffers;
        cr     = rbp->rbp_credits;
        mincr  = rbp->rbp_mincredits;

        LNET_UNLOCK();

        seq_printf(s, "%5d %5d %7d %7d\n",
                   npages, nbuf, cr, mincr);
        return 0;
}

/* seq_file operations for /proc/sys/lnet/buffers */
static struct seq_operations lnet_buffer_sops = {
        .start = lnet_buffer_seq_start,
        .stop  = lnet_buffer_seq_stop,
        .next  = lnet_buffer_seq_next,
        .show  = lnet_buffer_seq_show,
};

/* open: attach the buffers seq_operations and the proc entry's data */
static int
lnet_buffer_seq_open(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *dp = PDE(inode);
        struct seq_file       *sf;
        int                    rc;

        rc = seq_open(file, &lnet_buffer_sops);
        if (rc == 0) {
                sf = file->private_data;
                sf->private = dp->data;
        }

        return rc;
}

/* file operations for /proc/sys/lnet/buffers */
static struct file_operations lnet_buffers_fops = {
        .owner   = THIS_MODULE,
        .open    = lnet_buffer_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
+
/* seq_file iterator state for /proc/sys/lnet/nis.  No version field:
 * unlike the route/peer tables, ln_nis has no generation counter visible
 * here, so staleness is not detected. */
typedef struct {
        lnet_ni_t           *lnsi_ni;
        loff_t               lnsi_off;
} lnet_ni_seq_iterator_t;

/* Position 'lnsi' on the network interface at logical offset 'off'
 * (0 = header line; NIs start at 1).  Returns 0 on success, -ENOENT past
 * the last NI. */
int
lnet_ni_seq_seek (lnet_ni_seq_iterator_t *lnsi, loff_t off)
{
        struct list_head  *n;
        loff_t             here;
        int                rc;

        if (off == 0) {
                /* rewind to the header position; no lock needed */
                lnsi->lnsi_ni = NULL;
                lnsi->lnsi_off = 0;
                return 0;
        }

        LNET_LOCK();

        if (lnsi->lnsi_ni == NULL ||
            lnsi->lnsi_off > off) {
                /* search from start */
                n = NULL;
                here = 1;
        } else {
                /* continue search */
                n = &lnsi->lnsi_ni->ni_list;
                here = lnsi->lnsi_off;
        }

        lnsi->lnsi_off = off;

        if (n == NULL)
                n = the_lnet.ln_nis.next;

        /* walk the NI list counting logical positions */
        while (n != &the_lnet.ln_nis) {
                if (here == off) {
                        lnsi->lnsi_ni = list_entry(n, lnet_ni_t, ni_list);
                        rc = 0;
                        goto out;
                }
                here++;
                n = n->next;
        }

        /* ran off the end */
        lnsi->lnsi_ni  = NULL;
        rc             = -ENOENT;
 out:
        LNET_UNLOCK();
        return rc;
}
+
+static void *
+lnet_ni_seq_start (struct seq_file *s, loff_t *pos)
+{
+        lnet_ni_seq_iterator_t *lnsi;
+        int                     rc;
+
+        LIBCFS_ALLOC(lnsi, sizeof(*lnsi));
+        if (lnsi == NULL)
+                return NULL;
+
+        lnsi->lnsi_ni = NULL;
+        rc = lnet_ni_seq_seek(lnsi, *pos);
+        if (rc == 0)
+                return lnsi;
+
+        LIBCFS_FREE(lnsi, sizeof(*lnsi));
+        return NULL;
+}
+
+static void
+lnet_ni_seq_stop (struct seq_file *s, void *iter)
+{
+        lnet_ni_seq_iterator_t  *lnsi = iter;
+
+        if (lnsi != NULL)
+                LIBCFS_FREE(lnsi, sizeof(*lnsi));
+}
+
+static void *
+lnet_ni_seq_next (struct seq_file *s, void *iter, loff_t *pos)
+{
+        lnet_ni_seq_iterator_t *lnsi = iter;
+        int                     rc;
+        loff_t                  next = *pos + 1;
+
+        rc = lnet_ni_seq_seek(lnsi, next);
+        if (rc != 0) {
+                LIBCFS_FREE(lnsi, sizeof(*lnsi));
+                return NULL;
+        }
+
+        *pos = next;
+        return lnsi;
+}
+
/* seq_file .show for /proc/sys/lnet/nis: offset 0 prints the column
 * header, later offsets print one NI's credits.  All fields are copied
 * under LNET_LOCK before formatting. */
static int
lnet_ni_seq_show (struct seq_file *s, void *iter)
{
        lnet_ni_seq_iterator_t *lnsi = iter;
        lnet_ni_t              *ni;
        int                     maxtxcr;
        int                     txcr;
        int                     mintxcr;
        int                     npeertxcr;
        lnet_nid_t              nid;
        int                     nref;

        if (lnsi->lnsi_off == 0) {
                seq_printf(s, "%-24s %4s %4s %5s %5s %5s\n",
                           "nid", "refs", "peer", "max", "tx", "min");
                return 0;
        }

        LASSERT (lnsi->lnsi_ni != NULL);

        LNET_LOCK();

        ni = lnsi->lnsi_ni;

        /* snapshot everything under the lock */
        maxtxcr   = ni->ni_maxtxcredits;
        txcr      = ni->ni_txcredits;
        mintxcr   = ni->ni_mintxcredits;
        npeertxcr = ni->ni_peertxcredits;
        nid       = ni->ni_nid;
        nref      = ni->ni_refcount;

        LNET_UNLOCK();

        seq_printf(s, "%-24s %4d %4d %5d %5d %5d\n",
                   libcfs_nid2str(nid), nref,
                   npeertxcr, maxtxcr, txcr, mintxcr);
        return 0;
}

/* seq_file operations for /proc/sys/lnet/nis */
static struct seq_operations lnet_ni_sops = {
        .start = lnet_ni_seq_start,
        .stop  = lnet_ni_seq_stop,
        .next  = lnet_ni_seq_next,
        .show  = lnet_ni_seq_show,
};

/* open: attach the nis seq_operations and the proc entry's data */
static int
lnet_ni_seq_open(struct inode *inode, struct file *file)
{
        struct proc_dir_entry *dp = PDE(inode);
        struct seq_file       *sf;
        int                    rc;

        rc = seq_open(file, &lnet_ni_sops);
        if (rc == 0) {
                sf = file->private_data;
                sf->private = dp->data;
        }

        return rc;
}

/* file operations for /proc/sys/lnet/nis */
static struct file_operations lnet_ni_fops = {
        .owner   = THIS_MODULE,
        .open    = lnet_ni_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
+
+void
+lnet_proc_init(void)
+{
+        struct proc_dir_entry *stats;
+        struct proc_dir_entry *routes;
+        struct proc_dir_entry *routers;
+        struct proc_dir_entry *peers;
+
+        /* Initialize LNET_PROC_STATS */
+        stats = create_proc_entry (LNET_PROC_STATS, 0644, NULL);
+        if (stats == NULL) {
+                CERROR("couldn't create proc entry %s\n", LNET_PROC_STATS);
+                return;
+        }
+
+        stats->data = NULL;
+        stats->read_proc = lnet_router_proc_stats_read;
+        stats->write_proc = lnet_router_proc_stats_write;
+
+        /* Initialize LNET_PROC_ROUTES */
+        routes = create_proc_entry (LNET_PROC_ROUTES, 0444, NULL);
+        if (routes == NULL) {
+                CERROR("couldn't create proc entry %s\n", LNET_PROC_ROUTES);
+                return;
+        }
+
+        routes->proc_fops = &lnet_routes_fops;
+        routes->data = NULL;
+
+        /* Initialize LNET_PROC_ROUTERS */
+        routers = create_proc_entry (LNET_PROC_ROUTERS, 0444, NULL);
+        if (routers == NULL) {
+                CERROR("couldn't create proc entry %s\n", LNET_PROC_ROUTERS);
+                return;
+        }
+
+        routers->proc_fops = &lnet_routers_fops;
+        routers->data = NULL;
+
+        /* Initialize LNET_PROC_PEERS */
+        peers = create_proc_entry (LNET_PROC_PEERS, 0444, NULL);
+        if (peers == NULL) {
+                CERROR("couldn't create proc entry %s\n", LNET_PROC_PEERS);
+                return;
+        }
+
+        peers->proc_fops = &lnet_peer_fops;
+        peers->data = NULL;
+
+        /* Initialize LNET_PROC_BUFFERS */
+        peers = create_proc_entry (LNET_PROC_BUFFERS, 0444, NULL);
+        if (peers == NULL) {
+                CERROR("couldn't create proc entry %s\n", LNET_PROC_BUFFERS);
+                return;
+        }
+
+        peers->proc_fops = &lnet_buffers_fops;
+        peers->data = NULL;
+
+        /* Initialize LNET_PROC_NIS */
+        peers = create_proc_entry (LNET_PROC_NIS, 0444, NULL);
+        if (peers == NULL) {
+                CERROR("couldn't create proc entry %s\n", LNET_PROC_NIS);
+                return;
+        }
+
+        peers->proc_fops = &lnet_ni_fops;
+        peers->data = NULL;
+}
+
+void
+lnet_proc_fini(void)
+{
+        remove_proc_entry(LNET_PROC_STATS, 0);
+        remove_proc_entry(LNET_PROC_ROUTES, 0);
+        remove_proc_entry(LNET_PROC_ROUTERS, 0);
+        remove_proc_entry(LNET_PROC_PEERS, 0);
+        remove_proc_entry(LNET_PROC_BUFFERS, 0);
+        remove_proc_entry(LNET_PROC_NIS, 0);
+}
+
+#else
+
/* No-op stubs: without __KERNEL__ && LNET_ROUTER there are no /proc
 * entries to create or remove. */
void
lnet_proc_init(void)
{
}

void
lnet_proc_fini(void)
{
}
+
+#endif
diff --git a/lnet/router/Makefile.in b/lnet/router/Makefile.in
deleted file mode 100644 (file)
index 3bb6cf7..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-MODULES := kptlrouter
-kptlrouter-objs := router.o proc.o
-
-@INCLUDE_RULES@
diff --git a/lnet/router/proc.c b/lnet/router/proc.c
deleted file mode 100644 (file)
index 61b6880..0000000
+++ /dev/null
@@ -1,242 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Copyright (C) 2002 Cluster File Systems, Inc.
- *
- *   This file is part of Portals
- *   http://sourceforge.net/projects/sandiaportals/
- *
- *   Portals is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Portals is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Portals; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include "router.h"
-
-#define KPR_PROC_ROUTER "sys/portals/router"
-#define KPR_PROC_ROUTES "sys/portals/routes"
-
-/* Used for multi-page route list book keeping */
-struct proc_route_data {
-        struct list_head *curr;
-        unsigned int generation;
-        off_t skip;
-        rwlock_t proc_route_rwlock;
-} kpr_read_routes_data;
-
-/* nal2name support re-used from utils/portals.c */
-struct name2num {
-        char *name;
-        int   num;
-} nalnames[] = {
-        { "any",         0},
-        { "elan",        QSWNAL},
-        { "tcp",         SOCKNAL},
-        { "gm",          GMNAL},
-        { "ib",          OPENIBNAL},
-        { "iib",         IIBNAL},
-        { "lo",          LONAL},
-        { NULL,          -1}
-};
-
-static struct name2num *name2num_lookup_num(struct name2num *table, int num)
-{
-        while (table->name != NULL)
-                if (num == table->num)
-                        return (table);
-                else
-                        table++;
-        return (NULL);
-}
-
-static char *nal2name(int nal)
-{
-        struct name2num *e = name2num_lookup_num(nalnames, nal);
-        return ((e == NULL) ? "???" : e->name);
-}
-
-
-static int kpr_proc_router_read(char *page, char **start, off_t off,
-                                int count, int *eof, void *data)
-{
-        unsigned long long bytes = kpr_fwd_bytes;
-        unsigned long      packets = kpr_fwd_packets;
-        unsigned long      errors = kpr_fwd_errors;
-        unsigned int       qdepth = atomic_read (&kpr_queue_depth);
-        int                len;
-
-        *eof = 1;
-        if (off != 0)
-                return (0);
-
-        len = sprintf(page, "%Ld %ld %ld %d\n", bytes, packets, errors, qdepth);
-
-        *start = page;
-        return (len);
-}
-
-static int kpr_proc_router_write(struct file *file, const char *ubuffer,
-                                 unsigned long count, void *data)
-{
-        /* Ignore what we've been asked to write, and just zero the stats */
-        kpr_fwd_bytes = 0;
-        kpr_fwd_packets = 0;
-        kpr_fwd_errors = 0;
-
-        return (count);
-}
-
-static int kpr_proc_routes_read(char *page, char **start, off_t off,
-                                int count, int *eof, void *data)
-{
-        struct proc_route_data  *prd = data;
-        kpr_route_entry_t       *re;
-        kpr_gateway_entry_t     *ge;
-        int                     chunk_len = 0;
-        int                     line_len = 0;
-        int                     user_len = 0;
-        int                     rc = 0;
-
-        *eof = 1;
-        *start = page;
-
-        write_lock(&(prd->proc_route_rwlock));
-
-        if (prd->curr == NULL) {
-                if (off != 0)
-                        goto routes_read_exit;
-
-                /* First pass, initialize our private data */
-                prd->curr = kpr_routes.next;
-                prd->generation = kpr_routes_generation;
-                prd->skip = 0;
-        } else {
-                /* Abort route list generation change */
-                if (prd->generation != kpr_routes_generation) {
-                        prd->curr = NULL;
-                        rc = sprintf(page, "\nError: Routes Changed\n");
-                        goto routes_read_exit;
-                }
-
-                /* All the routes have been walked */
-                if (prd->curr == &kpr_routes) {
-                        prd->curr = NULL;
-                        goto routes_read_exit;
-                }
-        }
-
-        read_lock(&kpr_rwlock);
-        *start = page + prd->skip;
-        user_len = -prd->skip;
-
-        while ((prd->curr != NULL) && (prd->curr != &kpr_routes)) {
-                re = list_entry(prd->curr, kpr_route_entry_t, kpre_list);
-                ge = re->kpre_gateway;
-
-                line_len = sprintf(page + chunk_len,
-                        "%12s  "LPX64" : "LPX64" - "LPX64", %s\n",
-                        nal2name(ge->kpge_nalid), ge->kpge_nid,
-                        re->kpre_lo_nid, re->kpre_hi_nid,
-                        ge->kpge_alive ? "up" : "down");
-                chunk_len += line_len;
-                user_len += line_len;
-
-                /* Abort the route list changed */
-                if (prd->curr->next == NULL) {
-                        prd->curr = NULL;
-                        read_unlock(&kpr_rwlock);
-                        rc = sprintf(page, "\nError: Routes Changed\n");
-                        goto routes_read_exit;
-                }
-
-                prd->curr = prd->curr->next;
-
-                /* The route table will exceed one page, break the while loop
-                 * so the function can be re-called with a new page.
-                 */
-                if ((chunk_len > (PAGE_SIZE - 80)) || (user_len > count))
-                        break;
-        }
-
-        *eof = 0;
-
-        /* Caller received only a portion of the last entry, the
-         * remaining will be delivered in the next page if asked for.
-         */
-        if (user_len > count) {
-                prd->curr = prd->curr->prev;
-                prd->skip = line_len - (user_len - count);
-                read_unlock(&kpr_rwlock);
-                rc = count;
-                goto routes_read_exit;
-        }
-
-        /* Not enough data to entirely satify callers request */
-        prd->skip = 0;
-        read_unlock(&kpr_rwlock);
-        rc = user_len;
-
-routes_read_exit:
-        write_unlock(&(prd->proc_route_rwlock));
-        return rc;
-}
-
-static int kpr_proc_routes_write(struct file *file, const char *ubuffer,
-                                 unsigned long count, void *data)
-{
-        /* no-op; lctl should be used to adjust the routes */
-        return (count);
-}
-
-void kpr_proc_init(void)
-{
-        struct proc_dir_entry *router_entry;
-        struct proc_dir_entry *routes_entry;
-
-        /* Initialize KPR_PROC_ROUTER */
-        router_entry = create_proc_entry (KPR_PROC_ROUTER,
-                S_IFREG | S_IRUGO | S_IWUSR, NULL);
-
-        if (router_entry == NULL) {
-                CERROR("couldn't create proc entry %s\n", KPR_PROC_ROUTER);
-                return;
-        }
-
-        router_entry->data = NULL;
-        router_entry->read_proc = kpr_proc_router_read;
-        router_entry->write_proc = kpr_proc_router_write;
-
-        /* Initialize KPR_PROC_ROUTES */
-        routes_entry = create_proc_entry (KPR_PROC_ROUTES,
-                S_IFREG | S_IRUGO | S_IWUSR, NULL);
-
-        if (routes_entry == NULL) {
-                CERROR("couldn't create proc entry %s\n", KPR_PROC_ROUTES);
-                return;
-        }
-
-        kpr_read_routes_data.curr = NULL;
-        kpr_read_routes_data.generation = 0;
-        kpr_read_routes_data.skip = 0;
-        kpr_read_routes_data.proc_route_rwlock = RW_LOCK_UNLOCKED;
-
-        routes_entry->data = &kpr_read_routes_data;
-        routes_entry->read_proc = kpr_proc_routes_read;
-        routes_entry->write_proc = kpr_proc_routes_write;
-}
-
-void kpr_proc_fini(void)
-{
-        remove_proc_entry(KPR_PROC_ROUTER, 0);
-        remove_proc_entry(KPR_PROC_ROUTES, 0);
-}
diff --git a/lnet/router/router.c b/lnet/router/router.c
deleted file mode 100644 (file)
index 849563b..0000000
+++ /dev/null
@@ -1,824 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Copyright (C) 2002 Cluster File Systems, Inc.
- *
- *   This file is part of Portals
- *   http://sourceforge.net/projects/sandiaportals/
- *
- *   Portals is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Portals is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Portals; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include "router.h"
-
-LIST_HEAD(kpr_routes);
-LIST_HEAD(kpr_gateways);
-LIST_HEAD(kpr_nals);
-
-unsigned int       kpr_routes_generation;
-unsigned long long kpr_fwd_bytes;
-unsigned long      kpr_fwd_packets;
-unsigned long      kpr_fwd_errors;
-atomic_t           kpr_queue_depth;
-
-/* Mostly the tables are read-only (thread and interrupt context)
- *
- * Once in a blue moon we register/deregister NALs and add/remove routing
- * entries (thread context only)... */
-rwlock_t         kpr_rwlock = RW_LOCK_UNLOCKED;
-
-kpr_router_interface_t kpr_router_interface = {
-       kprri_register:         kpr_register_nal,
-       kprri_lookup:           kpr_lookup_target,
-       kprri_fwd_start:        kpr_forward_packet,
-       kprri_fwd_done:         kpr_complete_packet,
-        kprri_notify:           kpr_nal_notify,
-       kprri_shutdown:         kpr_shutdown_nal,
-       kprri_deregister:       kpr_deregister_nal,
-};
-
-int
-kpr_register_nal (kpr_nal_interface_t *nalif, void **argp)
-{
-       unsigned long      flags;
-       struct list_head  *e;
-       kpr_nal_entry_t   *ne;
-
-        CDEBUG (D_NET, "Registering NAL %x\n", nalif->kprni_nalid);
-
-       PORTAL_ALLOC (ne, sizeof (*ne));
-       if (ne == NULL)
-               return (-ENOMEM);
-
-       memset (ne, 0, sizeof (*ne));
-        memcpy ((void *)&ne->kpne_interface, (void *)nalif, sizeof (*nalif));
-
-       LASSERT (!in_interrupt());
-       write_lock_irqsave (&kpr_rwlock, flags);
-
-       for (e = kpr_nals.next; e != &kpr_nals; e = e->next)
-       {
-               kpr_nal_entry_t *ne2 = list_entry (e, kpr_nal_entry_t, kpne_list);
-
-               if (ne2->kpne_interface.kprni_nalid == ne->kpne_interface.kprni_nalid)
-               {
-                       write_unlock_irqrestore (&kpr_rwlock, flags);
-
-                       CERROR ("Attempt to register same NAL %x twice\n", ne->kpne_interface.kprni_nalid);
-
-                       PORTAL_FREE (ne, sizeof (*ne));
-                       return (-EEXIST);
-               }
-       }
-
-        list_add (&ne->kpne_list, &kpr_nals);
-
-       write_unlock_irqrestore (&kpr_rwlock, flags);
-
-       *argp = ne;
-       PORTAL_MODULE_USE;
-        return (0);
-}
-
-void
-kpr_do_upcall (void *arg)
-{
-        kpr_upcall_t *u = (kpr_upcall_t *)arg;
-        char          nalstr[10];
-        char          nidstr[36];
-        char          whenstr[36];
-        char         *argv[] = {
-                NULL,
-                "ROUTER_NOTIFY",
-                nalstr,
-                nidstr,
-                u->kpru_alive ? "up" : "down",
-                whenstr,
-                NULL};
-        
-        snprintf (nalstr, sizeof(nalstr), "%d", u->kpru_nal_id);
-        snprintf (nidstr, sizeof(nidstr), LPX64, u->kpru_nid);
-        snprintf (whenstr, sizeof(whenstr), "%ld", u->kpru_when);
-
-        portals_run_upcall (argv);
-
-        kfree (u);
-}
-
-void
-kpr_upcall (int gw_nalid, ptl_nid_t gw_nid, int alive, time_t when)
-{
-        char str[PTL_NALFMT_SIZE];
-        
-        /* May be in arbitrary context */
-        kpr_upcall_t  *u = kmalloc (sizeof (kpr_upcall_t), GFP_ATOMIC);
-
-        if (u == NULL) {
-                CERROR ("Upcall out of memory: nal %x nid "LPX64" (%s) %s\n",
-                        gw_nalid, gw_nid,
-                        portals_nid2str(gw_nalid, gw_nid, str),
-                        alive ? "up" : "down");
-                return;
-        }
-
-        u->kpru_nal_id     = gw_nalid;
-        u->kpru_nid        = gw_nid;
-        u->kpru_alive      = alive;
-        u->kpru_when       = when;
-
-        prepare_work (&u->kpru_tq, kpr_do_upcall, u);
-        schedule_work (&u->kpru_tq);
-}
-
-int
-kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid,
-               int alive, time_t when)
-{
-       unsigned long        flags;
-        int                  found;
-        kpr_nal_entry_t     *ne = NULL;
-        kpr_gateway_entry_t *ge = NULL;
-        struct timeval       now;
-       struct list_head    *e;
-       struct list_head    *n;
-        char                 str[PTL_NALFMT_SIZE];
-
-        CDEBUG (D_NET, "%s notifying [%x] "LPX64": %s\n", 
-                byNal ? "NAL" : "userspace", 
-                gateway_nalid, gateway_nid, alive ? "up" : "down");
-
-        /* can't do predictions... */
-        do_gettimeofday (&now);
-        if (when > now.tv_sec) {
-                CWARN ("Ignoring prediction from %s of [%x] "LPX64" %s "
-                       "%ld seconds in the future\n", 
-                       byNal ? "NAL" : "userspace", 
-                       gateway_nalid, gateway_nid, 
-                       alive ? "up" : "down",
-                       when - now.tv_sec);
-                return (EINVAL);
-        }
-
-        LASSERT (when <= now.tv_sec);
-
-        /* Serialise with lookups (i.e. write lock) */
-       write_lock_irqsave(&kpr_rwlock, flags);
-
-        found = 0;
-        list_for_each_safe (e, n, &kpr_gateways) {
-
-                ge = list_entry(e, kpr_gateway_entry_t, kpge_list);
-                if ((gateway_nalid != 0 &&
-                     ge->kpge_nalid != gateway_nalid) ||
-                    ge->kpge_nid != gateway_nid)
-                        continue;
-
-                found = 1;
-                break;
-        }
-
-        if (!found) {
-                /* gateway not found */
-                write_unlock_irqrestore(&kpr_rwlock, flags);
-                CDEBUG (D_NET, "Gateway not found\n");
-                return (0);
-        }
-        
-        if (when < ge->kpge_timestamp) {
-                /* out of date information */
-                write_unlock_irqrestore (&kpr_rwlock, flags);
-                CDEBUG (D_NET, "Out of date\n");
-                return (0);
-        }
-
-        /* update timestamp */
-        ge->kpge_timestamp = when;
-
-        if ((!ge->kpge_alive) == (!alive)) {
-                /* new date for old news */
-                write_unlock_irqrestore (&kpr_rwlock, flags);
-                CDEBUG (D_NET, "Old news\n");
-                return (0);
-        }
-
-        ge->kpge_alive = alive;
-        CDEBUG(D_NET, "set "LPX64" [%p] %d\n", gateway_nid, ge, alive);
-
-        if (alive) {
-                /* Reset all gateway weights so the newly-enabled gateway
-                 * doesn't have to play catch-up */
-                list_for_each_safe (e, n, &kpr_gateways) {
-                        kpr_gateway_entry_t *ge = list_entry(e, kpr_gateway_entry_t,
-                                                             kpge_list);
-                        atomic_set (&ge->kpge_weight, 0);
-                }
-        }
-
-        found = 0;
-        if (!byNal) {
-                /* userland notified me: notify NAL? */
-                ne = kpr_find_nal_entry_locked (ge->kpge_nalid);
-                if (ne != NULL) {
-                        if (!ne->kpne_shutdown &&
-                            ne->kpne_interface.kprni_notify != NULL) {
-                                /* take a ref on this NAL until notifying
-                                 * it has completed... */
-                                atomic_inc (&ne->kpne_refcount);
-                                found = 1;
-                        }
-                }
-        }
-
-        write_unlock_irqrestore(&kpr_rwlock, flags);
-
-        if (found) {
-                ne->kpne_interface.kprni_notify (ne->kpne_interface.kprni_arg,
-                                                 gateway_nid, alive);
-                /* 'ne' can disappear now... */
-                atomic_dec (&ne->kpne_refcount);
-        }
-        
-        if (byNal) {
-                /* It wasn't userland that notified me... */
-                CWARN ("Upcall: NAL %x NID "LPX64" (%s) is %s\n",
-                       gateway_nalid, gateway_nid,
-                       portals_nid2str(gateway_nalid, gateway_nid, str),
-                       alive ? "alive" : "dead");
-                kpr_upcall (gateway_nalid, gateway_nid, alive, when);
-        } else {
-                CDEBUG (D_NET, " NOT Doing upcall\n");
-        }
-        
-        return (0);
-}
-
-void
-kpr_nal_notify (void *arg, ptl_nid_t peer, int alive, time_t when)
-{
-        kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
-        
-        kpr_do_notify (1, ne->kpne_interface.kprni_nalid, peer, alive, when);
-}
-
-void
-kpr_shutdown_nal (void *arg)
-{
-       unsigned long    flags;
-       kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
-
-        CDEBUG (D_NET, "Shutting down NAL %x\n", ne->kpne_interface.kprni_nalid);
-
-       LASSERT (!ne->kpne_shutdown);
-       LASSERT (!in_interrupt());
-
-       write_lock_irqsave (&kpr_rwlock, flags);
-       ne->kpne_shutdown = 1;
-       write_unlock_irqrestore (&kpr_rwlock, flags);
-}
-
-void
-kpr_deregister_nal (void *arg)
-{
-       unsigned long     flags;
-       kpr_nal_entry_t  *ne = (kpr_nal_entry_t *)arg;
-
-        CDEBUG (D_NET, "Deregister NAL %x\n", ne->kpne_interface.kprni_nalid);
-
-       LASSERT (ne->kpne_shutdown);            /* caller must have issued shutdown already */
-       LASSERT (!in_interrupt());
-
-       write_lock_irqsave (&kpr_rwlock, flags);
-       list_del (&ne->kpne_list);
-       write_unlock_irqrestore (&kpr_rwlock, flags);
-
-        /* Wait until all outstanding messages/notifications have completed */
-       while (atomic_read (&ne->kpne_refcount) != 0)
-       {
-               CDEBUG (D_NET, "Waiting for refcount on NAL %x to reach zero (%d)\n",
-                       ne->kpne_interface.kprni_nalid, atomic_read (&ne->kpne_refcount));
-
-               set_current_state (TASK_UNINTERRUPTIBLE);
-               schedule_timeout (HZ);
-       }
-
-       PORTAL_FREE (ne, sizeof (*ne));
-        PORTAL_MODULE_UNUSE;
-}
-
-int
-kpr_ge_isbetter (kpr_gateway_entry_t *ge1, kpr_gateway_entry_t *ge2)
-{
-        const int significant_bits = 0x00ffffff;
-        /* We use atomic_t to record/compare route weights for
-         * load-balancing.  Here we limit ourselves to only using
-         * 'significant_bits' when we do an 'after' comparison */
-
-        int    diff = (atomic_read (&ge1->kpge_weight) -
-                       atomic_read (&ge2->kpge_weight)) & significant_bits;
-        int    rc = (diff > (significant_bits >> 1));
-
-        CDEBUG(D_INFO, "[%p]"LPX64"=%d %s [%p]"LPX64"=%d\n",
-               ge1, ge1->kpge_nid, atomic_read (&ge1->kpge_weight),
-               rc ? ">" : "<",
-               ge2, ge2->kpge_nid, atomic_read (&ge2->kpge_weight));
-
-        return (rc);
-}
-
-void
-kpr_update_weight (kpr_gateway_entry_t *ge, int nob)
-{
-        int weight = 1 + (nob + sizeof (ptl_hdr_t)/2)/sizeof (ptl_hdr_t);
-
-        /* We've chosen this route entry (i.e. gateway) to forward payload
-         * of length 'nob'; update the route's weight to make it less
-         * favoured.  Note that the weight is 1 plus the payload size
-         * rounded and scaled to the portals header size, so we get better
-         * use of the significant bits in kpge_weight. */
-
-        CDEBUG(D_INFO, "gateway [%p]"LPX64" += %d\n", ge,
-               ge->kpge_nid, weight);
-        
-        atomic_add (weight, &ge->kpge_weight);
-}
-
-int
-kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob,
-                   ptl_nid_t *gateway_nidp)
-{
-       kpr_nal_entry_t     *ne = (kpr_nal_entry_t *)arg;
-       struct list_head    *e;
-        kpr_route_entry_t   *re;
-        kpr_gateway_entry_t *ge = NULL;
-       int                  rc = -ENOENT;
-
-        /* Caller wants to know if 'target_nid' can be reached via a gateway
-         * ON HER OWN NETWORK */
-
-        CDEBUG (D_INFO, "lookup "LPX64" from NAL %x\n", target_nid, 
-                ne->kpne_interface.kprni_nalid);
-        LASSERT (!in_interrupt());
-
-       read_lock (&kpr_rwlock);
-
-       if (ne->kpne_shutdown) {        /* caller is shutting down */
-                read_unlock (&kpr_rwlock);
-               return (-ENOENT);
-        }
-
-       /* Search routes for one that has a gateway to target_nid on the callers network */
-
-        list_for_each (e, &kpr_routes) {
-               re = list_entry (e, kpr_route_entry_t, kpre_list);
-
-               if (re->kpre_lo_nid > target_nid ||
-                    re->kpre_hi_nid < target_nid)
-                       continue;
-
-               /* found table entry */
-
-               if (re->kpre_gateway->kpge_nalid != ne->kpne_interface.kprni_nalid ||
-                    !re->kpre_gateway->kpge_alive) {
-                        /* different NAL or gateway down */
-                        rc = -EHOSTUNREACH;
-                        continue;
-                }
-                
-                if (ge == NULL ||
-                    kpr_ge_isbetter (re->kpre_gateway, ge))
-                    ge = re->kpre_gateway;
-       }
-
-        if (ge != NULL) {
-                kpr_update_weight (ge, nob);
-                *gateway_nidp = ge->kpge_nid;
-                rc = 0;
-        }
-        
-       read_unlock (&kpr_rwlock);
-
-        /* NB can't deref 're' now; it might have been removed! */
-
-        CDEBUG (D_NET, "lookup "LPX64" from NAL %x: %d ("LPX64")\n",
-                target_nid, ne->kpne_interface.kprni_nalid, rc,
-                (rc == 0) ? *gateway_nidp : (ptl_nid_t)0);
-       return (rc);
-}
-
-kpr_nal_entry_t *
-kpr_find_nal_entry_locked (int nal_id)
-{
-        struct list_head    *e;
-        
-        /* Called with kpr_rwlock held */
-
-        list_for_each (e, &kpr_nals) {
-                kpr_nal_entry_t *ne = list_entry (e, kpr_nal_entry_t, kpne_list);
-
-                if (nal_id != ne->kpne_interface.kprni_nalid) /* no match */
-                        continue;
-
-                return (ne);
-        }
-        
-        return (NULL);
-}
-
-void
-kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd)
-{
-       kpr_nal_entry_t     *src_ne = (kpr_nal_entry_t *)arg;
-       ptl_nid_t            target_nid = fwd->kprfd_target_nid;
-        int                  nob = fwd->kprfd_nob;
-        kpr_gateway_entry_t *ge = NULL;
-        kpr_nal_entry_t     *dst_ne = NULL;
-       struct list_head    *e;
-        kpr_route_entry_t   *re;
-        kpr_nal_entry_t     *tmp_ne;
-        int                  rc;
-
-        CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %x\n", fwd,
-                target_nid, src_ne->kpne_interface.kprni_nalid);
-
-        LASSERT (nob == lib_kiov_nob (fwd->kprfd_niov, fwd->kprfd_kiov));
-        LASSERT (!in_interrupt());
-
-       read_lock (&kpr_rwlock);
-
-        kpr_fwd_packets++;                   /* (loose) stats accounting */
-        kpr_fwd_bytes += nob + sizeof(ptl_hdr_t);
-
-       if (src_ne->kpne_shutdown) {         /* caller is shutting down */
-                rc = -ESHUTDOWN;
-               goto out;
-        }
-
-       fwd->kprfd_router_arg = src_ne;      /* stash caller's nal entry */
-
-       /* Search routes for one that has a gateway to target_nid NOT on the caller's network */
-
-        list_for_each (e, &kpr_routes) {
-               re = list_entry (e, kpr_route_entry_t, kpre_list);
-
-               if (re->kpre_lo_nid > target_nid || /* no match */
-                    re->kpre_hi_nid < target_nid)
-                       continue;
-
-               if (re->kpre_gateway->kpge_nalid == src_ne->kpne_interface.kprni_nalid)
-                       continue;               /* don't route to same NAL */
-
-                if (!re->kpre_gateway->kpge_alive)
-                        continue;               /* gateway is dead */
-                
-                tmp_ne = kpr_find_nal_entry_locked (re->kpre_gateway->kpge_nalid);
-
-                if (tmp_ne == NULL ||
-                    tmp_ne->kpne_shutdown) {
-                        /* NAL must be registered and not shutting down */
-                        continue;
-                }
-
-                if (ge == NULL ||
-                    kpr_ge_isbetter (re->kpre_gateway, ge)) {
-                        ge = re->kpre_gateway;
-                        dst_ne = tmp_ne;
-                }
-        }
-        
-        if (ge != NULL) {
-                LASSERT (dst_ne != NULL);
-                
-                kpr_update_weight (ge, nob);
-
-                fwd->kprfd_gateway_nid = ge->kpge_nid;
-                atomic_inc (&src_ne->kpne_refcount); /* source and dest nals are */
-                atomic_inc (&dst_ne->kpne_refcount); /* busy until fwd completes */
-                atomic_inc (&kpr_queue_depth);
-
-                read_unlock (&kpr_rwlock);
-
-                CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %x: "
-                        "to "LPX64" on NAL %x\n", 
-                        fwd, target_nid, src_ne->kpne_interface.kprni_nalid,
-                        fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid);
-
-                dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd);
-                return;
-       }
-
-        rc = -EHOSTUNREACH;
- out:
-        kpr_fwd_errors++;
-
-        CDEBUG (D_NET, "Failed to forward [%p] "LPX64" from NAL %x: %d\n", 
-                fwd, target_nid, src_ne->kpne_interface.kprni_nalid, rc);
-
-       (fwd->kprfd_callback)(fwd->kprfd_callback_arg, rc);
-
-        read_unlock (&kpr_rwlock);
-}
-
-void
-kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error)
-{
-       kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg;
-       kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg;
-
-        CDEBUG (D_NET, "complete(1) [%p] from NAL %x to NAL %x: %d\n", fwd,
-                src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error);
-
-       atomic_dec (&dst_ne->kpne_refcount);    /* CAVEAT EMPTOR dst_ne can disappear now!!! */
-
-       (fwd->kprfd_callback)(fwd->kprfd_callback_arg, error);
-
-        CDEBUG (D_NET, "complete(2) [%p] from NAL %x: %d\n", fwd,
-                src_ne->kpne_interface.kprni_nalid, error);
-
-        atomic_dec (&kpr_queue_depth);
-       atomic_dec (&src_ne->kpne_refcount);    /* CAVEAT EMPTOR src_ne can disappear now!!! */
-}
-
-int
-kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, 
-               ptl_nid_t lo_nid, ptl_nid_t hi_nid)
-{
-       unsigned long        flags;
-       struct list_head    *e;
-       kpr_route_entry_t   *re;
-        kpr_gateway_entry_t *ge;
-        int                  dup = 0;
-
-        CDEBUG(D_NET, "Add route: %x "LPX64" : "LPX64" - "LPX64"\n",
-               gateway_nalid, gateway_nid, lo_nid, hi_nid);
-
-        if (gateway_nalid == PTL_NID_ANY ||
-            lo_nid == PTL_NID_ANY ||
-            hi_nid == PTL_NID_ANY ||
-            lo_nid > hi_nid)
-                return (-EINVAL);
-
-        PORTAL_ALLOC (ge, sizeof (*ge));
-        if (ge == NULL)
-                return (-ENOMEM);
-
-        ge->kpge_nalid = gateway_nalid;
-        ge->kpge_nid   = gateway_nid;
-        ge->kpge_alive = 1;
-        ge->kpge_timestamp = 0;
-        ge->kpge_refcount = 0;
-        atomic_set (&ge->kpge_weight, 0);
-
-        PORTAL_ALLOC (re, sizeof (*re));
-        if (re == NULL) {
-                PORTAL_FREE (ge, sizeof (*ge));
-                return (-ENOMEM);
-        }
-
-        re->kpre_lo_nid = lo_nid;
-        re->kpre_hi_nid = hi_nid;
-
-        LASSERT(!in_interrupt());
-       write_lock_irqsave (&kpr_rwlock, flags);
-
-        list_for_each (e, &kpr_gateways) {
-                kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t,
-                                                      kpge_list);
-
-                if (ge2->kpge_nalid == gateway_nalid &&
-                    ge2->kpge_nid == gateway_nid) {
-                        PORTAL_FREE (ge, sizeof (*ge));
-                        ge = ge2;
-                        dup = 1;
-                        break;
-                }
-        }
-
-        if (!dup) {
-                /* Adding a new gateway... */
-                list_add (&ge->kpge_list, &kpr_gateways);
-
-                /* ...zero all gateway weights so this one doesn't have to
-                 * play catch-up */
-
-                list_for_each (e, &kpr_gateways) {
-                        kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t,
-                                                              kpge_list);
-                        atomic_set (&ge2->kpge_weight, 0);
-                }
-        }
-
-        re->kpre_gateway = ge;
-        ge->kpge_refcount++;
-        list_add (&re->kpre_list, &kpr_routes);
-        kpr_routes_generation++;
-
-        write_unlock_irqrestore (&kpr_rwlock, flags);
-        return (0);
-}
-
-int
-kpr_sys_notify (int gateway_nalid, ptl_nid_t gateway_nid,
-                int alive, time_t when)
-{
-        return (kpr_do_notify (0, gateway_nalid, gateway_nid, alive, when));
-}
-
-int
-kpr_del_route (int gw_nalid, ptl_nid_t gw_nid,
-               ptl_nid_t lo, ptl_nid_t hi)
-{
-        int                specific = (lo != PTL_NID_ANY);
-        unsigned long      flags;
-        int                rc = -ENOENT;
-        struct list_head  *e;
-        struct list_head  *n;
-
-        CDEBUG(D_NET, "Del route [%x] "LPX64" : "LPX64" - "LPX64"\n",
-               gw_nalid, gw_nid, lo, hi);
-
-        LASSERT(!in_interrupt());
-
-        /* NB Caller may specify either all routes via the given gateway
-         * (lo/hi == PTL_NID_ANY) or a specific route entry (lo/hi are
-         * actual NIDs) */
-        if (specific ? (hi == PTL_NID_ANY || hi < lo) : (hi != PTL_NID_ANY))
-                return (-EINVAL);
-
-        write_lock_irqsave(&kpr_rwlock, flags);
-
-        list_for_each_safe (e, n, &kpr_routes) {
-                kpr_route_entry_t   *re = list_entry(e, kpr_route_entry_t,
-                                                   kpre_list);
-                kpr_gateway_entry_t *ge = re->kpre_gateway;
-
-                if (ge->kpge_nalid != gw_nalid ||
-                    ge->kpge_nid != gw_nid ||
-                    (specific &&
-                     (lo != re->kpre_lo_nid || hi != re->kpre_hi_nid)))
-                        continue;
-
-                rc = 0;
-
-                if (--ge->kpge_refcount == 0) {
-                        list_del (&ge->kpge_list);
-                        PORTAL_FREE (ge, sizeof (*ge));
-                }
-
-                list_del (&re->kpre_list);
-                PORTAL_FREE(re, sizeof (*re));
-
-                if (specific)
-                        break;
-        }
-
-        kpr_routes_generation++;
-        write_unlock_irqrestore(&kpr_rwlock, flags);
-
-        return (rc);
-}
-
-int
-kpr_get_route (int idx, __u32 *gateway_nalid, ptl_nid_t *gateway_nid,
-               ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, __u32 *alive)
-{
-       struct list_head  *e;
-
-        LASSERT (!in_interrupt());
-       read_lock(&kpr_rwlock);
-
-        for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
-                kpr_route_entry_t   *re = list_entry(e, kpr_route_entry_t,
-                                                     kpre_list);
-                kpr_gateway_entry_t *ge = re->kpre_gateway;
-                
-                if (idx-- == 0) {
-                        *gateway_nalid = ge->kpge_nalid;
-                        *gateway_nid = ge->kpge_nid;
-                        *alive = ge->kpge_alive;
-                        *lo_nid = re->kpre_lo_nid;
-                        *hi_nid = re->kpre_hi_nid;
-
-                        read_unlock(&kpr_rwlock);
-                        return (0);
-                }
-        }
-
-        read_unlock (&kpr_rwlock);
-        return (-ENOENT);
-}
-
-static int 
-kpr_nal_cmd(struct portals_cfg *pcfg, void * private)
-{
-        int err = -EINVAL;
-        ENTRY;
-
-        switch(pcfg->pcfg_command) {
-        default:
-                CDEBUG(D_IOCTL, "Inappropriate cmd: %d\n", pcfg->pcfg_command);
-                break;
-                
-        case NAL_CMD_ADD_ROUTE:
-                CDEBUG(D_IOCTL, "Adding route: [%x] "LPU64" : "LPU64" - "LPU64"\n",
-                       pcfg->pcfg_nal, pcfg->pcfg_nid, 
-                       pcfg->pcfg_nid2, pcfg->pcfg_nid3);
-                err = kpr_add_route(pcfg->pcfg_gw_nal, pcfg->pcfg_nid,
-                                    pcfg->pcfg_nid2, pcfg->pcfg_nid3);
-                break;
-
-        case NAL_CMD_DEL_ROUTE:
-                CDEBUG (D_IOCTL, "Removing routes via [%x] "LPU64" : "LPU64" - "LPU64"\n",
-                        pcfg->pcfg_gw_nal, pcfg->pcfg_nid, 
-                        pcfg->pcfg_nid2, pcfg->pcfg_nid3);
-                err = kpr_del_route (pcfg->pcfg_gw_nal, pcfg->pcfg_nid,
-                                     pcfg->pcfg_nid2, pcfg->pcfg_nid3);
-                break;
-
-        case NAL_CMD_NOTIFY_ROUTER: {
-                CDEBUG (D_IOCTL, "Notifying peer [%x] "LPU64" %s @ %ld\n",
-                        pcfg->pcfg_gw_nal, pcfg->pcfg_nid,
-                        pcfg->pcfg_flags ? "Enabling" : "Disabling",
-                        (time_t)pcfg->pcfg_nid3);
-                
-                err = kpr_sys_notify (pcfg->pcfg_gw_nal, pcfg->pcfg_nid,
-                                      pcfg->pcfg_flags, (time_t)pcfg->pcfg_nid3);
-                break;
-        }
-                
-        case NAL_CMD_GET_ROUTE:
-                CDEBUG (D_IOCTL, "Getting route [%d]\n", pcfg->pcfg_count);
-                err = kpr_get_route(pcfg->pcfg_count, &pcfg->pcfg_gw_nal,
-                                    &pcfg->pcfg_nid, 
-                                    &pcfg->pcfg_nid2, &pcfg->pcfg_nid3,
-                                    &pcfg->pcfg_flags);
-                break;
-        }
-        RETURN(err);
-}
-
-
-static void /*__exit*/
-kpr_finalise (void)
-{
-        LASSERT (list_empty (&kpr_nals));
-
-        libcfs_nal_cmd_unregister(ROUTER);
-
-        PORTAL_SYMBOL_UNREGISTER(kpr_router_interface);
-
-        kpr_proc_fini();
-
-        while (!list_empty (&kpr_routes)) {
-                kpr_route_entry_t *re = list_entry(kpr_routes.next,
-                                                   kpr_route_entry_t,
-                                                   kpre_list);
-
-                list_del(&re->kpre_list);
-                PORTAL_FREE(re, sizeof (*re));
-        }
-
-        CDEBUG(D_MALLOC, "kpr_finalise: kmem back to %d\n",
-               atomic_read(&portal_kmemory));
-}
-
-static int __init
-kpr_initialise (void)
-{
-        int     rc;
-        
-        CDEBUG(D_MALLOC, "kpr_initialise: kmem %d\n",
-               atomic_read(&portal_kmemory));
-
-        kpr_routes_generation = 0;
-        kpr_proc_init();
-
-        rc = libcfs_nal_cmd_register(ROUTER, kpr_nal_cmd, NULL);
-        if (rc != 0) {
-                CERROR("Can't register nal cmd handler\n");
-                return (rc);
-        }
-        
-        PORTAL_SYMBOL_REGISTER(kpr_router_interface);
-        return (0);
-}
-
-MODULE_AUTHOR("Eric Barton");
-MODULE_DESCRIPTION("Kernel Portals Router v0.01");
-MODULE_LICENSE("GPL");
-
-module_init (kpr_initialise);
-module_exit (kpr_finalise);
-
-EXPORT_SYMBOL (kpr_router_interface);
diff --git a/lnet/router/router.h b/lnet/router/router.h
deleted file mode 100644 (file)
index 44f307a..0000000
+++ /dev/null
@@ -1,102 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *   This file is part of Lustre, http://www.lustre.org
- *
- *   Portals is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Portals is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Portals; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#ifndef _KPTLROUTER_H
-#define _KPTLROUTER_H
-#ifndef EXPORT_SYMTAB
-# define EXPORT_SYMTAB
-#endif
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/proc_fs.h>
-#include <linux/init.h>
-
-#define DEBUG_SUBSYSTEM S_PTLROUTER
-
-#include <libcfs/kp30.h>
-#include <portals/kpr.h>
-#include <portals/p30.h>
-#include <portals/lib-p30.h>
-
-typedef struct
-{
-       struct list_head        kpne_list;
-       kpr_nal_interface_t     kpne_interface;
-       atomic_t                kpne_refcount;
-       int                     kpne_shutdown;
-} kpr_nal_entry_t;
-
-typedef struct
-{
-        struct list_head        kpge_list;
-        atomic_t                kpge_weight;
-        time_t                  kpge_timestamp;
-        int                     kpge_alive;
-        int                     kpge_nalid;
-        int                     kpge_refcount;
-        ptl_nid_t               kpge_nid;
-} kpr_gateway_entry_t;
-
-typedef struct
-{
-       struct list_head        kpre_list;
-        kpr_gateway_entry_t    *kpre_gateway;
-       ptl_nid_t               kpre_lo_nid;
-        ptl_nid_t               kpre_hi_nid;
-} kpr_route_entry_t;
-
-typedef struct
-{
-        work_struct_t           kpru_tq;
-        int                     kpru_nal_id;
-        ptl_nid_t               kpru_nid;
-        int                     kpru_alive;
-        time_t                  kpru_when;
-} kpr_upcall_t;
-
-extern int kpr_register_nal (kpr_nal_interface_t *nalif, void **argp);
-extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob, 
-                              ptl_nid_t *gateway_nidp);
-extern kpr_nal_entry_t *kpr_find_nal_entry_locked (int nal_id);
-extern void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd);
-extern void kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error);
-extern void kpr_nal_notify (void *arg, ptl_nid_t peer,
-                            int alive, time_t when);
-extern void kpr_shutdown_nal (void *arg);
-extern void kpr_deregister_nal (void *arg);
-
-extern void kpr_proc_init (void);
-extern void kpr_proc_fini (void);
-
-extern unsigned int       kpr_routes_generation;
-extern unsigned long long kpr_fwd_bytes;
-extern unsigned long      kpr_fwd_packets;
-extern unsigned long      kpr_fwd_errors;
-extern atomic_t           kpr_queue_depth;
-
-extern struct list_head   kpr_routes;
-extern rwlock_t           kpr_rwlock;
-
-#endif /* _KPLROUTER_H */
index c309db0..5860c3e 100644 (file)
@@ -1,16 +1,14 @@
-MODULES := pingsrv pingcli spingsrv spingcli
+MODULES := pingsrv pingcli
+#utcli utsrv
 pingsrv-objs := ping_srv.o
 
 ifeq ($(PATCHLEVEL),6)
 pingcli-objs := ping_cli.o
-spingsrv-objs := sping_srv.o
-spingcli-objs := sping_cli.o
+#utcli-objs := ut_cli.o
+#utsrv-objs := ut_srv.o
 else
 ping%.c: ping_%.c
        ln -sf $< $@
-
-sping%.c: sping_%.c
-       ln -sf $< $@
 endif
 
 @INCLUDE_RULES@
index f611868..f187255 100644 (file)
@@ -4,17 +4,39 @@
 # See the file COPYING in this distribution
 
 if MODULES
-if !CRAY_PORTALS
 if TESTS
 
 if LINUX
 noinst_DATA := pingsrv$(KMODEXT) pingcli$(KMODEXT)
-noinst_DATA += spingsrv$(KMODEXT) spingcli$(KMODEXT)
+#noinst_DATA += utsrv$(KMODEXT) utcli$(KMODEXT)
 endif
 
-endif
-endif
-endif
+if DARWIN
+macos_PROGRAMS := pingcli
+#macos_PROGRAMS := pingsrv
+
+pingcli_SOURCES := ping_cli.c
+
+pingcli_CFLAGS := $(EXTRA_KCFLAGS)
+pingcli_LDFLAGS := $(EXTRA_KLDFLAGS)
+pingcli_LDADD := $(EXTRA_KLIBS)
+
+#pingsrv_SOURCES := ping_srv.c
+
+#pingsrv_CFLAGS := $(EXTRA_KCFLAGS)
+#pingsrv_LDFLAGS := $(EXTRA_KLDFLAGS)
+#pingsrv_LDADD := $(EXTRA_KLIBS)
+
+plist_DATA := ping_cli/Info.plist
+#plist_DATA := ping_srv/Info.plist
+
+install_data_hook := fix-kext-ownership
+endif # Darwin
+
+endif # TEST 
+endif # MODULE
+install-data-hook: $(install_data_hook)
 
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@  pingsrv.c pingcli.c spingsrv.c spingcli.c
-DIST_SOURCES = ping_srv.c ping_cli.c sping_srv.c sping_cli.c ping.h
+MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ pingsrv.c pingcli.c 
+DIST_SOURCES = ping_srv.c ping_cli.c ping.h 
+#ut_cli.c ut_srv.c ut.h
index ef937af..1dde8bc 100644 (file)
@@ -2,7 +2,7 @@
 #define _KPING_INCLUDED
 
 #include <libcfs/portals_utils.h>
-#include <portals/p30.h>
+#include <lnet/lnet.h>
 
 
 #define PTL_PING_IN_SIZE               256     // n packets per buffer
@@ -28,7 +28,7 @@
        (((index + 1) >= PTL_PING_IN_BUFFERS) ? 0 : (index + 1))
 
 #define PDEBUG(str, err)                       \
-       CERROR ("%s: error=%s (%d)\n", str, ptl_err_str[err], err)
+       CERROR ("%s: error=(%d)\n", str, err)
 
 
 /* Ping data to be passed via the ioctl to kernel space */
 #if __KERNEL__
 
 struct pingsrv_data {
-        
-        ptl_handle_ni_t         ni;
-        ptl_handle_me_t         me;
-        ptl_handle_eq_t         eq;
-        void                   *in_buf;
-        ptl_process_id_t        my_id;
-        ptl_process_id_t        id_local;
-        ptl_md_t                mdin;
-        ptl_md_t                mdout;
-        ptl_handle_md_t         mdin_h;
-        ptl_handle_md_t         mdout_h;
-        ptl_event_t             evnt;
+       lnet_handle_me_t         me;
+        lnet_handle_eq_t         eq;
+        void                    *in_buf;
+        lnet_process_id_t        my_id;
+        lnet_process_id_t        id_local;
+        lnet_md_t                mdin;
+        lnet_md_t                mdout;
+        lnet_handle_md_t         mdin_h;
+        lnet_handle_md_t         mdout_h;
+        lnet_event_t             evnt;
         cfs_task_t             *tsk;
 }; /* struct pingsrv_data */
  
 struct pingcli_data {
         
-        struct portal_ioctl_data *args;
-        ptl_handle_me_t        me;
-        ptl_handle_eq_t                eq;
+       int                     count;
+       int                     size;
+       lnet_nid_t              nid;
+       int                     timeout;
+        lnet_handle_me_t       me;
+        lnet_handle_eq_t       eq;
         char                          *inbuf;    
         char                   *outbuf;   
-        ptl_process_id_t       myid; 
-        ptl_process_id_t       id_local; 
-        ptl_process_id_t       id_remote;
-        ptl_md_t               md_in_head;
-        ptl_md_t               md_out_head;
-        ptl_handle_md_t        md_in_head_h;
-        ptl_handle_md_t        md_out_head_h;
-        ptl_event_t            ev;
+        lnet_process_id_t      myid; 
+        lnet_process_id_t      id_local; 
+        lnet_process_id_t      id_remote;
+        lnet_md_t              md_in_head;
+        lnet_md_t              md_out_head;
+        lnet_handle_md_t       md_in_head_h;
+        lnet_handle_md_t       md_out_head_h;
+        lnet_event_t           ev;
         cfs_task_t             *tsk;
 }; /* struct pingcli_data */
 
index 2995b46..eaf83c0 100644 (file)
@@ -26,9 +26,9 @@
 #define DEBUG_SUBSYSTEM S_PINGER
 
 #include <libcfs/kp30.h>
-#include <portals/p30.h>
+#include <lnet/lnet.h>
 #include "ping.h"
-/* int portal_debug = D_PING_CLI;  */
+/* int libcfs_debug = D_PING_CLI;  */
 
 
 #define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval))
@@ -42,9 +42,8 @@ static struct pingcli_data *client = NULL;
 static int count = 0;
 
 static void
-pingcli_shutdown(ptl_handle_ni_t nih, int err)
+pingcli_shutdown(int err)
 {
-        struct portal_ioctl_data *args = client->args;
         int rc;
 
         /* Yes, we are intentionally allowing us to fall through each
@@ -54,32 +53,32 @@ pingcli_shutdown(ptl_handle_ni_t nih, int err)
         switch (err) {
                 case 1:
                         /* Unlink any memory descriptors we may have used */
-                        if ((rc = PtlMDUnlink (client->md_out_head_h)))
-                                PDEBUG ("PtlMDUnlink", rc);
+                        if ((rc = LNetMDUnlink (client->md_out_head_h)))
+                                PDEBUG ("LNetMDUnlink", rc);
                 case 2:
-                        if ((rc = PtlMDUnlink (client->md_in_head_h)))
-                                PDEBUG ("PtlMDUnlink", rc);
+                        if ((rc = LNetMDUnlink (client->md_in_head_h)))
+                                PDEBUG ("LNetMDUnlink", rc);
 
                         /* Free the event queue */
-                        if ((rc = PtlEQFree (client->eq)))
-                                PDEBUG ("PtlEQFree", rc);
+                        if ((rc = LNetEQFree (client->eq)))
+                                PDEBUG ("LNetEQFree", rc);
 
-                        if ((rc = PtlMEUnlink (client->me)))
-                                PDEBUG ("PtlMEUnlink", rc);
+                        if ((rc = LNetMEUnlink (client->me)))
+                                PDEBUG ("LNetMEUnlink", rc);
                 case 3:
-                        PtlNIFini(nih);
+                        LNetNIFini();
 
                 case 4:
                         /* Free our buffers */
                         if (client->outbuf != NULL)
-                                PORTAL_FREE (client->outbuf, STDSIZE + args->ioc_size);
+                                LIBCFS_FREE (client->outbuf, STDSIZE + client->size);
 
                         if (client->inbuf != NULL)
-                                PORTAL_FREE (client->inbuf,
-                                             (args->ioc_size + STDSIZE) * args->ioc_count);
+                                LIBCFS_FREE (client->inbuf,
+                                             (client->size + STDSIZE) * client->count);
 
                         if (client != NULL)
-                                PORTAL_FREE (client,
+                                LIBCFS_FREE (client,
                                                 sizeof(struct pingcli_data));
         }
 
@@ -87,12 +86,12 @@ pingcli_shutdown(ptl_handle_ni_t nih, int err)
         CDEBUG (D_OTHER, "ping client released resources\n");
 } /* pingcli_shutdown() */
 
-static void pingcli_callback(ptl_event_t *ev)
+static void pingcli_callback(lnet_event_t *ev)
 {
         int i;
         unsigned magic;
-        i = __le32_to_cpu(*(int *)(ev->md.start + ev->offset + sizeof(unsigned)));
-        magic = __le32_to_cpu(*(int *)(ev->md.start + ev->offset));
+        i = __le32_to_cpu(*(int *)((char *)ev->md.start + ev->offset + sizeof(unsigned)));
+        magic = __le32_to_cpu(*(int *)((char *)ev->md.start + ev->offset));
 
         if(magic != 0xcafebabe) {
                 CERROR("Unexpected response %x\n", magic);
@@ -105,122 +104,121 @@ static void pingcli_callback(ptl_event_t *ev)
 }
 
 
-static struct pingcli_data *
-pingcli_start(struct portal_ioctl_data *args)
+static void
+pingcli_start(struct libcfs_ioctl_data *args)
 {
-        ptl_handle_ni_t nih = PTL_INVALID_HANDLE;
         unsigned ping_head_magic = __cpu_to_le32(PING_HEADER_MAGIC);
         int rc;
         struct timeval tv1, tv2;
-        char str[PTL_NALFMT_SIZE];
         
         client->tsk = cfs_current();
-        client->args = args;
-        CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64" (%s),  \
-                        nal %x, size %u, count: %u, timeout: %u\n",
-                        args->ioc_nid,
-                        portals_nid2str(args->ioc_nal, args->ioc_nid, str),
-                        args->ioc_nal, args->ioc_size,
-                        args->ioc_count, args->ioc_timeout);
+        client->nid = args->ioc_nid;
+        client->count = args->ioc_count;
+        client->size = args->ioc_u32[0];
+        client->timeout = args->ioc_u32[1];
+        
+       CDEBUG (D_OTHER, "pingcli_setup args: nid %s (%s),  \
+                        size %u, count: %u, timeout: %u\n",
+                        libcfs_nid2str(client->nid),
+                        libcfs_nid2str(client->nid),
+                        client->size, client->count, client->timeout);
 
 
-        PORTAL_ALLOC (client->outbuf, STDSIZE + args->ioc_size) ;
+        LIBCFS_ALLOC (client->outbuf, STDSIZE + client->size) ;
         if (client->outbuf == NULL)
         {
                 CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE);
-                pingcli_shutdown (nih, 4);
-                return (NULL);
+                pingcli_shutdown (4);
+                return;
         }
 
-        PORTAL_ALLOC (client->inbuf,
-                        (args->ioc_size + STDSIZE) * args->ioc_count);
+        LIBCFS_ALLOC (client->inbuf,
+                        (client->size + STDSIZE) * client->count);
         if (client->inbuf == NULL)
         {
                 CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE);
-                pingcli_shutdown (nih, 4);
-                return (NULL);
+                pingcli_shutdown (4);
+                return;
         }
 
-        /* Aquire and initialize the proper nal for portals. */
-        rc = PtlNIInit(args->ioc_nal, 0, NULL, NULL, &nih);
-        if (rc != PTL_OK && rc != PTL_IFACE_DUP)
+        rc = LNetNIInit(0);
+        if (rc != 0 && rc != 1)
         {
-                CERROR ("NAL %x not loaded\n", args->ioc_nal);
-                pingcli_shutdown (nih, 4);
-                return (NULL);
+                CERROR ("LNetNIInit: error %d\n", rc);
+                pingcli_shutdown (4);
+                return;
         }
 
         /* Based on the initialization aquire our unique portal ID. */
-        if ((rc = PtlGetId (nih, &client->myid)))
+        if ((rc = LNetGetId (1, &client->myid)))
         {
-                CERROR ("PtlGetId error %d\n", rc);
-                pingcli_shutdown (nih, 2);
-                return (NULL);
+                CERROR ("LNetGetId error %d\n", rc);
+                pingcli_shutdown (2);
+                return;
         }
 
         /* Setup the local match entries */
-        client->id_local.nid = PTL_NID_ANY;
-        client->id_local.pid = PTL_PID_ANY;
+        client->id_local.nid = LNET_NID_ANY;
+        client->id_local.pid = LNET_PID_ANY;
 
         /* Setup the remote match entries */
-        client->id_remote.nid = args->ioc_nid;
+        client->id_remote.nid = client->nid;
         client->id_remote.pid = 0;
 
-        if ((rc = PtlMEAttach (nih, PTL_PING_CLIENT,
-                   client->id_local, 0, ~0, PTL_RETAIN,
-                   PTL_INS_AFTER, &client->me)))
+        if ((rc = LNetMEAttach (PTL_PING_CLIENT,
+                   client->id_local, 0, ~0, LNET_RETAIN,
+                   LNET_INS_AFTER, &client->me)))
         {
-                CERROR ("PtlMEAttach error %d\n", rc);
-                pingcli_shutdown (nih, 2);
-                return (NULL);
+                CERROR ("LNetMEAttach error %d\n", rc);
+                pingcli_shutdown (2);
+                return;
         }
 
         /* Allocate the event queue for this network interface */
-        if ((rc = PtlEQAlloc (nih, 64, pingcli_callback, &client->eq)))
+        if ((rc = LNetEQAlloc (64, pingcli_callback, &client->eq)))
         {
-                CERROR ("PtlEQAlloc error %d\n", rc);
-                pingcli_shutdown (nih, 2);
-                return (NULL);
+                CERROR ("LNetEQAlloc error %d\n", rc);
+                pingcli_shutdown (2);
+                return;
         }
 
-        count = args->ioc_count;
+        count = client->count;
 
         client->md_in_head.start     = client->inbuf;
-        client->md_in_head.length    = (args->ioc_size + STDSIZE)
-                                                * count;
-        client->md_in_head.threshold = PTL_MD_THRESH_INF;
-        client->md_in_head.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
+        client->md_in_head.length    = (client->size + STDSIZE) * count;
+        client->md_in_head.threshold = LNET_MD_THRESH_INF;
+        client->md_in_head.options   = LNET_MD_OP_PUT;
         client->md_in_head.user_ptr  = NULL;
         client->md_in_head.eq_handle = client->eq;
-        memset (client->inbuf, 0, (args->ioc_size + STDSIZE) * count);
+        memset (client->inbuf, 0, (client->size + STDSIZE) * count);
 
         /* Attach the incoming buffer */
-        if ((rc = PtlMDAttach (client->me, client->md_in_head,
-                              PTL_UNLINK, &client->md_in_head_h))) {
-                CERROR ("PtlMDAttach error %d\n", rc);
-                pingcli_shutdown (nih, 1);
-                return (NULL);
+        if ((rc = LNetMDAttach (client->me, client->md_in_head,
+                              LNET_UNLINK, &client->md_in_head_h))) {
+                CERROR ("LNetMDAttach error %d\n", rc);
+                pingcli_shutdown (1);
+                return;
         }
         /* Setup the outgoing ping header */
         client->md_out_head.start     = client->outbuf;
-        client->md_out_head.length    = STDSIZE + args->ioc_size;
-        client->md_out_head.threshold = args->ioc_count;
-        client->md_out_head.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
+        client->md_out_head.length    = STDSIZE + client->size;
+        client->md_out_head.threshold = client->count;
+        client->md_out_head.options   = LNET_MD_OP_PUT;
         client->md_out_head.user_ptr  = NULL;
-        client->md_out_head.eq_handle = PTL_EQ_NONE;
+        client->md_out_head.eq_handle = LNET_EQ_NONE;
 
         memcpy (client->outbuf, &ping_head_magic, sizeof(ping_head_magic));
 
         count = 0;
 
         /* Bind the outgoing ping header */
-        if ((rc=PtlMDBind (nih, client->md_out_head,
-                           PTL_UNLINK, &client->md_out_head_h))) {
-                CERROR ("PtlMDBind error %d\n", rc);
-                pingcli_shutdown (nih, 1);
-                return NULL;
+        if ((rc=LNetMDBind (client->md_out_head,
+                           LNET_UNLINK, &client->md_out_head_h))) {
+                CERROR ("LNetMDBind error %d\n", rc);
+                pingcli_shutdown (1);
+                return;
         }
-        while ((args->ioc_count - count)) {
+        while ((client->count - count)) {
                 unsigned __count;
                 __count = __cpu_to_le32(count);
 
@@ -232,16 +230,19 @@ pingcli_start(struct portal_ioctl_data *args)
                 memcpy(client->outbuf+sizeof(unsigned)+sizeof(unsigned),&tv1,
                        sizeof(struct timeval));
 
-                if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ,
-                          client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) {
-                         PDEBUG ("PtlPut (header)", rc);
-                         pingcli_shutdown (nih, 1);
-                         return NULL;
+                if((rc = LNetPut (LNET_NID_ANY, client->md_out_head_h, 
+                                  LNET_NOACK_REQ,
+                                  client->id_remote, PTL_PING_SERVER, 
+                                  0, 0, 0))) {
+                         PDEBUG ("LNetPut (header)", rc);
+                         pingcli_shutdown (1);
+                         return;
                 }
                 CWARN ("Lustre: sent msg no %d.\n", count);
 
-                set_current_state (TASK_INTERRUPTIBLE);
-                rc = schedule_timeout (cfs_time_seconds(args->ioc_timeout));
+                set_current_state (CFS_TASK_INTERRUPTIBLE);
+                rc = cfs_schedule_timeout (CFS_TASK_INTERRUPTIBLE,
+                                           cfs_time_seconds(client->timeout));
                 if (rc == 0) {
                         CERROR ("timeout .....\n");
                 } else {
@@ -253,18 +254,16 @@ pingcli_start(struct portal_ioctl_data *args)
                 count++;
         }
 
-        pingcli_shutdown (nih, 2);
+        pingcli_shutdown (2);
 
-        /* Success! */
-        return NULL;
 } /* pingcli_setup() */
 
 
 
 /* called by the portals_ioctl for ping requests */
-int kping_client(struct portal_ioctl_data *args)
+int kping_client(struct libcfs_ioctl_data *args)
 {
-        PORTAL_ALLOC (client, sizeof(struct pingcli_data));
+        LIBCFS_ALLOC (client, sizeof(struct pingcli_data));
         if (client == NULL)
         {
                 CERROR ("Unable to allocate client structure\n");
index 2dfc37b..4ecee0f 100644 (file)
@@ -5,11 +5,11 @@
        <key>CFBundleDevelopmentRegion</key>
        <string>English</string>
        <key>CFBundleExecutable</key>
-       <string>ping_cli</string>
+       <string>pingcli</string>
        <key>CFBundleIconFile</key>
        <string></string>
        <key>CFBundleIdentifier</key>
-       <string>com.clusterfs.lustre.portals.tests.ping_cli</string>
+       <string>com.clusterfs.lustre.pingcli</string>
        <key>CFBundleInfoDictionaryVersion</key>
        <string>6.0</string>
        <key>CFBundlePackageType</key>
        <key>CFBundleSignature</key>
        <string>????</string>
        <key>CFBundleVersion</key>
-       <string>1.0.0d1</string>
+       <string>1.0.1</string>
+       <key>OSBundleCompatibleVersion</key>
+       <string>1.0.0</string>
        <key>OSBundleLibraries</key>
        <dict>
-                <key>com.apple.kernel.bsd</key>
-                <string>1.1</string>
-                <key>com.apple.kernel.iokit</key>
-                <string>1.0.0b1</string>
-                <key>com.apple.kernel.mach</key>
-                <string>1.0.0b1</string>
-                <key>com.clusterfs.lustre.portals.libcfs</key>
+               <key>com.apple.kpi.bsd</key>
+               <string>8.0.0b1</string>
+               <key>com.apple.kpi.libkern</key>
+               <string>8.0.0b1</string>
+               <key>com.apple.kpi.mach</key>
+               <string>8.0.0b1</string>
+               <key>com.apple.kpi.unsupported</key>
+               <string>8.0.0b1</string>
+                <key>com.clusterfs.lustre.libcfs</key>
                 <string>1.0.0</string>
-                <key>com.clusterfs.lustre.portals.portals</key>
-                <string>1.0.0</string>
-                <key>com.clusterfs.lustre.portals.knals.ksocknal</key>
+                <key>com.clusterfs.lustre.lnet</key>
                 <string>1.0.0</string>
        </dict>
 </dict>
diff --git a/lnet/tests/ping_cli/winnt-pingcli.c b/lnet/tests/ping_cli/winnt-pingcli.c
new file mode 100644 (file)
index 0000000..7c9a1a1
--- /dev/null
@@ -0,0 +1,634 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Matt Wu <mattwu@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+/*
+ *  Included Headers 
+ */
+
+
+#include <libcfs/libcfs.h>
+
+
+/* libcfs module init/exit routines */
+DECLARE_INIT(init_libcfs_module);
+DECLARE_EXIT(exit_libcfs_module);
+
+/* portal module init/exit routines */
+DECLARE_INIT(init_lnet);
+DECLARE_EXIT(fini_lnet);
+
+/* tdinal module init/exit routines */
+DECLARE_INIT(ksocknal_module_init);
+DECLARE_EXIT(ksocknal_module_fini);
+
+/* pingcli module init/exit routines */
+DECLARE_INIT(pingcli_init);
+DECLARE_EXIT(pingcli_cleanup);
+
+
+/* pingsrv module init/exit routines */
+DECLARE_INIT(pingsrv_init);
+DECLARE_EXIT(pingsrv_cleanup);
+
+/*
+ * structure definitions
+ */
+
+
+#define LUSTRE_PING_VERSION   0x00010000               /* ping srv/cli version: 0001.0000 */
+
+#define LUSTRE_PING_DEVICE    L"\\Device\\LNET"     /* device object name */
+#define LUSTRE_PING_SYMLNK    L"\\DosDevices\\LNET" /* user-visible name for the device*/
+
+typedef struct _DEVICE_EXTENSION
+{
+    BOOLEAN    bProcFS;
+
+} DEVICE_EXTENSION, *PDEVICE_EXTENSION;
+
+
+/*
+ *  global definitions
+ */
+
+PDEVICE_OBJECT  PingObject = NULL;  /* ping device object */
+PDEVICE_OBJECT  ProcObject = NULL;  /* procfs emulator device */
+
+
+/*
+ *  common routines
+ */
+
+
+//
+// complete Irp request ...
+//
+
+NTSTATUS
+UTCompleteIrp(
+    PIRP        Irp,
+    NTSTATUS    Status,
+    ULONG       Info
+    )
+{
+    Irp->IoStatus.Status = Status;
+    Irp->IoStatus.Information = Info;
+    IoCompleteRequest(Irp,IO_NO_INCREMENT);
+
+    return Status;
+}
+
+//
+//  Open/Create Device ...
+//
+
+NTSTATUS
+UTCreate(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    KdPrint(("UTCreate: DeviceCreate ...\n"));
+
+    return UTCompleteIrp(Irp,STATUS_SUCCESS,0);
+}
+
+//
+// Close Devcie ...
+//
+
+NTSTATUS
+UTClose(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp)
+{
+    KdPrint(("UTClose: Device Closed.\n"));
+
+    return UTCompleteIrp(Irp, STATUS_SUCCESS, 0);
+
+    UNREFERENCED_PARAMETER(DeviceObject);
+}
+
+
+
+NTSTATUS
+UTShutdown(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    KdPrint(("UTShutdown: shuting TdiSock ...\n"));
+
+    return UTCompleteIrp(Irp, STATUS_SUCCESS, 0);
+
+    UNREFERENCED_PARAMETER(DeviceObject);
+}
+
+//
+// driver frame Routines ...
+//
+
+
+NTSTATUS
+UTDeviceControl(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    NTSTATUS            Status = STATUS_INVALID_DEVICE_REQUEST;
+    PIO_STACK_LOCATION  IrpSp;
+
+    ULONG               ControlCode;
+    ULONG               InputLength;
+    ULONG               OutputLength;
+
+    PVOID               lpvInBuffer;
+
+    KdPrint(("UTDeviceControl: Device Ioctl ...\n"));
+
+    Irp->IoStatus.Information = 0;
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+    ControlCode  = IrpSp->Parameters.DeviceIoControl.IoControlCode;
+    InputLength  = IrpSp->Parameters.DeviceIoControl.InputBufferLength;
+    OutputLength = IrpSp->Parameters.DeviceIoControl.OutputBufferLength;
+    lpvInBuffer  = Irp->AssociatedIrp.SystemBuffer;
+
+    ASSERT (IrpSp->MajorFunction == IRP_MJ_DEVICE_CONTROL);
+
+    switch (ControlCode)
+    {
+        case IOCTL_LIBCFS_VERSION:
+
+            *((ULONG *)lpvInBuffer) = (ULONG)(LUSTRE_PING_VERSION);
+            Irp->IoStatus.Information = sizeof(ULONG);
+            Status = STATUS_SUCCESS;
+            break;
+
+        default:
+            break;
+    }
+
+    Irp->IoStatus.Status = Status;
+
+    IoCompleteRequest(Irp, IO_NO_INCREMENT);
+
+    KdPrint(("UTDeviceControl: Device Ioctl returned.\n"));
+
+    return Status;
+}
+
+NTSTATUS
+ProcCreate(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    NTSTATUS                    Status;
+    PIO_STACK_LOCATION          IrpSp;
+
+    FILE_FULL_EA_INFORMATION *  ea;
+    cfs_file_t *                fp;
+
+    KdPrint(("ProcCreate: Proc device is being opened ...\n"));
+
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+    ea = (PFILE_FULL_EA_INFORMATION) Irp->AssociatedIrp.SystemBuffer;
+
+    if (!ea) {
+        Status = STATUS_INVALID_PARAMETER;
+    } else {
+        fp = lustre_open_file(&ea->EaName[0]);
+        if (!fp) {
+            Status = STATUS_OBJECT_NAME_NOT_FOUND;
+        } else {
+            IrpSp->FileObject->FsContext = fp;
+            IrpSp->FileObject->FsContext2 = fp->private_data;
+            Status = STATUS_SUCCESS;
+        }
+    }
+
+    return UTCompleteIrp(Irp, Status, 0);
+}
+
+//
+// Close Devcie ...
+//
+
+NTSTATUS
+ProcClose(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp)
+{
+    PIO_STACK_LOCATION          IrpSp;
+
+    cfs_file_t *                fp;
+
+    KdPrint(("ProcClose: Proc device object is to be closed.\n"));
+
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+    fp = (cfs_file_t *) IrpSp->FileObject->FsContext;
+
+    ASSERT(fp != NULL);
+    ASSERT(IrpSp->FileObject->FsContext2 == fp->private_data);
+
+    lustre_close_file(fp);
+
+    return UTCompleteIrp(Irp, STATUS_SUCCESS, 0);
+
+    UNREFERENCED_PARAMETER(DeviceObject);
+}
+
+/*
+ * proc frame routines
+ */
+
+NTSTATUS
+ProcDeviceControl(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    NTSTATUS            Status = STATUS_INVALID_DEVICE_REQUEST;
+    PIO_STACK_LOCATION  IrpSp;
+
+    ULONG               ControlCode;
+    ULONG               InputLength;
+    ULONG               OutputLength;
+
+    PVOID               lpvInBuffer;
+
+    KdPrint(("ProcDeviceControl: Proc device ioctling ...\n"));
+
+    Irp->IoStatus.Information = 0;
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+    ControlCode  = IrpSp->Parameters.DeviceIoControl.IoControlCode;
+    InputLength  = IrpSp->Parameters.DeviceIoControl.InputBufferLength;
+    OutputLength = IrpSp->Parameters.DeviceIoControl.OutputBufferLength;
+    lpvInBuffer  = Irp->AssociatedIrp.SystemBuffer;
+
+    ASSERT (IrpSp->MajorFunction == IRP_MJ_DEVICE_CONTROL);
+
+    switch (ControlCode)
+    {
+        case IOCTL_LIBCFS_VERSION:
+
+            *((ULONG *)lpvInBuffer) = (ULONG)(LUSTRE_PING_VERSION);
+            Irp->IoStatus.Information = sizeof(ULONG);
+
+            Status = STATUS_SUCCESS;
+
+            break;
+
+        case IOCTL_LIBCFS_ENTRY:
+        {
+            int rc = 0;
+            cfs_file_t * fp;
+
+            fp = (cfs_file_t *) IrpSp->FileObject->FsContext;
+
+            if (!fp) {
+                rc = -EINVAL;
+            } else {
+                rc = lustre_ioctl_file(fp, (PCFS_PROC_IOCTL) (lpvInBuffer));
+            }
+
+            if (rc == 0) {
+                Irp->IoStatus.Information = InputLength;
+                Status = STATUS_SUCCESS;
+            }
+        }    
+    }
+
+    Irp->IoStatus.Status = Status;
+
+    IoCompleteRequest(Irp, IO_NO_INCREMENT);
+
+    KdPrint(("ProcDeviceControl: Proc device ioctl returned with status = %xh.\n", Status));
+
+    return Status;
+}
+
+
+
+NTSTATUS
+ProcReadWrite (PDEVICE_OBJECT DeviceObject, PIRP Irp)
+{
+    PIO_STACK_LOCATION  IrpSp;
+    NTSTATUS            Status;
+
+    cfs_file_t *        fp;
+    int                 rc;
+    PCHAR               buf;
+
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+    if (Irp->MdlAddress) {
+        buf = MmGetSystemAddressForMdlSafe(
+                        Irp->MdlAddress,
+                        NormalPagePriority);
+    } else {
+        buf = Irp->AssociatedIrp.SystemBuffer;
+    }
+
+    if (buf == NULL) {
+        Status = STATUS_SUCCESS;
+        rc = 0;
+    } else {
+        fp = (cfs_file_t *) IrpSp->FileObject->FsContext;
+
+        if (!fp) {
+            Status = STATUS_INVALID_PARAMETER;
+            goto errorout;
+        }
+
+        if (IrpSp->MajorFunction == IRP_MJ_READ) {
+            rc = lustre_read_file(
+                    fp, IrpSp->Parameters.Read.ByteOffset.LowPart,
+                    IrpSp->Parameters.Read.Length, buf);
+        } else {
+            rc = lustre_write_file(
+                    fp, IrpSp->Parameters.Write.ByteOffset.LowPart,
+                    IrpSp->Parameters.Write.Length, buf);
+        }
+        if (rc < 0) {
+            cfs_enter_debugger();
+            Status = STATUS_UNSUCCESSFUL;
+        } else {
+            Status = STATUS_SUCCESS;
+        }
+    }
+
+errorout:
+    return UTCompleteIrp(Irp, Status, rc);
+}
+
+
+//
+//  common dispatch routines
+//
+
+NTSTATUS
+UTDispatchRequest(
+    IN PDEVICE_OBJECT DeviceObject,
+    IN PIRP           Irp
+    )
+{
+    NTSTATUS            Status;
+    PIO_STACK_LOCATION  IrpSp;
+
+    Status = STATUS_INVALID_DEVICE_REQUEST;
+
+    __try {
+
+        IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+        switch (IrpSp->MajorFunction) {
+
+            case IRP_MJ_CREATE:
+                if (DeviceObject == PingObject) {
+                    Status = UTCreate(DeviceObject, Irp);
+                } else if (DeviceObject == ProcObject) {
+                    Status = ProcCreate(DeviceObject, Irp);
+                }
+                break;
+        
+            case IRP_MJ_CLOSE:
+                if (DeviceObject == PingObject) {
+                    Status = UTClose(DeviceObject, Irp);
+                } else if (DeviceObject == ProcObject) {
+                    Status = ProcClose(DeviceObject, Irp);
+                }
+                break;
+
+            case IRP_MJ_READ:
+            case IRP_MJ_WRITE:
+                if (DeviceObject == ProcObject) {
+                    Status = ProcReadWrite(DeviceObject, Irp);
+                }
+                break;
+        
+            case IRP_MJ_DEVICE_CONTROL:
+                if (DeviceObject == PingObject) {
+                    Status = UTDeviceControl(DeviceObject, Irp);
+                } else if (DeviceObject == ProcObject) {
+                    Status = ProcDeviceControl(DeviceObject, Irp);
+                }
+                break;
+
+            case IRP_MJ_SHUTDOWN:
+                Status = UTShutdown(DeviceObject, Irp);
+                break;
+
+            default:
+
+                KdPrint(("UTDispatchRequest: Major Function: %xh is not supported.\n",
+                           IrpSp->MajorFunction));
+                UTCompleteIrp(Irp, Status, 0);
+                break;
+        }
+    }
+
+    __finally {
+    }
+
+    return Status;
+}
+
+//
+// create a device object and a dosdevice symbol link
+//
+
+PDEVICE_OBJECT
+CreateDevice(
+    IN PDRIVER_OBJECT   DriverObject,
+    IN PWCHAR           DeviceName,
+    IN PWCHAR           SymlnkName,
+    IN BOOLEAN          bProcFS
+    )
+{
+    NTSTATUS            Status;
+
+    UNICODE_STRING      NtDevName;
+    UNICODE_STRING      Win32DevName;
+
+    PDEVICE_EXTENSION   DeviceExtension;
+    PDEVICE_OBJECT      DeviceObject;
+
+    /* create the device object with the specified name */
+
+    RtlInitUnicodeString(&NtDevName, DeviceName);
+    
+    Status = IoCreateDevice(
+                    DriverObject,
+                    sizeof(DEVICE_EXTENSION),
+                    &NtDevName,
+                    FILE_DEVICE_UNKNOWN,
+                    0,
+                    FALSE,
+                    &DeviceObject );
+        
+    if (!NT_SUCCESS(Status)) {
+
+        cfs_enter_debugger();
+        return NULL;
+    }
+
+    /* create the symlink to make the device visible to user */
+
+    RtlInitUnicodeString(&Win32DevName, SymlnkName);
+        
+    Status = IoCreateSymbolicLink(&Win32DevName, &NtDevName);
+
+    if (!NT_SUCCESS(Status)) {
+
+        IoDeleteDevice(DeviceObject);
+        return NULL;
+    }
+
+    DeviceExtension = (PDEVICE_EXTENSION)DeviceObject->DeviceObjectExtension;
+    DeviceExtension->bProcFS = bProcFS;
+
+    DeviceObject->Flags |= DO_BUFFERED_IO;
+    DeviceObject->Flags &= ~DO_DEVICE_INITIALIZING;
+
+    return DeviceObject;
+}
+
+
+//
+// DriverEntry
+//
+
+NTSTATUS DriverEntry(
+    IN PDRIVER_OBJECT  DriverObject,
+    IN PUNICODE_STRING RegistryPath 
+    )
+{
+    KdPrint(("Lustre ping test: Build Time: " __DATE__ " " __TIME__ "\n"));
+    KdPrint(("Lustre ping test: DriverEntry ... \n"));
+
+    /* initialize libcfs module */
+    if (module_init_libcfs_module() != 0) {
+        KdPrint(("ping: error initialize module: libcfs ...\n"));
+        goto errorout;
+    }
+
+    /* initialize lnet module */
+    if (module_init_lnet() != 0) {
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initialize module: lnet ...\n"));
+        goto errorout;
+    }
+
+    /* initialize tdinal module */
+    if (module_ksocknal_module_init() != 0) {
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initialize module: tdilnd ...\n"));
+        goto errorout;
+    }
+
+#if defined(LUSTRE_PING_CLI)
+    /* initialize pingcli module */
+    if (module_pingcli_init() != 0) {
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initialize module: pingcli ...\n"));
+        goto errorout;
+    }
+#endif
+
+#if defined(LUSTRE_PING_SRV)
+    /* initialize pingsrv module */
+    if (module_pingsrv_init() != 0) {
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initialize module: pingsrv ...\n"));
+        goto errorout;
+    }
+#endif
+
+    /* create the ping device object */
+    PingObject = CreateDevice(
+                        DriverObject,
+                        LUSTRE_PING_DEVICE,
+                        LUSTRE_PING_SYMLNK,
+                        FALSE );
+    if (!PingObject) {
+#if defined(LUSTRE_PING_CLI)
+        module_pingcli_cleanup();
+#endif
+#if defined(LUSTRE_PING_SRV)
+        module_pingsrv_cleanup();
+#endif
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+
+        return STATUS_INSUFFICIENT_RESOURCES;
+    }
+
+    /* create the libcfs proc fs emultor device object */
+    ProcObject  = CreateDevice(
+                        DriverObject,
+                        LUSTRE_PROC_DEVICE,
+                        LUSTRE_PROC_SYMLNK,
+                        TRUE );
+    if (!ProcObject) {
+
+        IoDeleteDevice(PingObject);
+#if defined(LUSTRE_PING_CLI)
+        module_pingcli_cleanup();
+#endif
+#if defined(LUSTRE_PING_SRV)
+        module_pingsrv_cleanup();
+#endif
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        return STATUS_INSUFFICIENT_RESOURCES;
+    }
+
+    /* initialize the driver callback routines */
+
+    DriverObject->MajorFunction[IRP_MJ_CREATE]          = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_CLOSE]           = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_READ]            = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_WRITE]           = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_SHUTDOWN]        = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_DEVICE_CONTROL]  = UTDispatchRequest;
+
+    return STATUS_SUCCESS;
+
+errorout:
+
+    cfs_enter_debugger();
+
+    return STATUS_UNSUCCESSFUL;
+}
index ae0d722..22eefbf 100644 (file)
@@ -26,7 +26,7 @@
 #define DEBUG_SUBSYSTEM S_PINGER
 
 #include <libcfs/kp30.h>
-#include <portals/p30.h>
+#include <lnet/lnet.h>
 #include "ping.h"
 
 #define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval))
@@ -34,7 +34,6 @@
 
 static unsigned ping_head_magic;
 static unsigned ping_bulk_magic;
-static int nal  = SOCKNAL;                            // Your NAL,
 static unsigned long packets_valid = 0;         // Valid packets 
 static int running = 1;
 atomic_t pkt;
@@ -52,28 +51,28 @@ static void *pingsrv_shutdown(int err)
         switch (err) {
                 case 1:
                         /* Unlink any memory descriptors we may have used */
-                        if ((rc = PtlMDUnlink (server->mdin_h)))
-                                PDEBUG ("PtlMDUnlink (out head buffer)", rc);
+                        if ((rc = LNetMDUnlink (server->mdin_h)))
+                                PDEBUG ("LNetMDUnlink (out head buffer)", rc);
                 case 2:
                         /* Free the event queue */
-                        if ((rc = PtlEQFree (server->eq)))
-                                PDEBUG ("PtlEQFree", rc);
+                        if ((rc = LNetEQFree (server->eq)))
+                                PDEBUG ("LNetEQFree", rc);
 
                         /* Unlink the client portal from the ME list */
-                        if ((rc = PtlMEUnlink (server->me)))
-                                        PDEBUG ("PtlMEUnlink", rc);
+                        if ((rc = LNetMEUnlink (server->me)))
+                                        PDEBUG ("LNetMEUnlink", rc);
 
                 case 3:
-                        PtlNIFini (server->ni);
+                        LNetNIFini ();
 
                 case 4:
                         
                 case 5:
                         if (server->in_buf != NULL)
-                                PORTAL_FREE (server->in_buf, MAXSIZE);
+                                LIBCFS_FREE (server->in_buf, MAXSIZE);
                         
                         if (server != NULL)
-                                PORTAL_FREE (server, 
+                                LIBCFS_FREE (server, 
                                              sizeof (struct pingsrv_data));
                         
         }
@@ -89,17 +88,18 @@ int pingsrv_thread(void *arg)
         unsigned long magic;
         unsigned long ping_bulk_magic = __cpu_to_le32(0xcafebabe);
         
-        kportal_daemonize ("pingsrv");
+        cfs_daemonize ("pingsrv");
         server->tsk =  cfs_current();
         
         while (running) {
-                set_current_state (TASK_INTERRUPTIBLE);
+                set_current_state (CFS_TASK_INTERRUPTIBLE);
                 if (atomic_read (&pkt) == 0) {
-                        schedule_timeout (MAX_SCHEDULE_TIMEOUT);
+                        cfs_schedule_timeout (CFS_TASK_INTERRUPTIBLE, 
+                                              MAX_SCHEDULE_TIMEOUT);
                         continue;
                 }
                
-                magic =  __le32_to_cpu(*((int *)(server->evnt.md.start 
+                magic =  __le32_to_cpu(*((int *)((char *)server->evnt.md.start 
                                         + server->evnt.offset)));
                 
                 
@@ -112,14 +112,14 @@ int pingsrv_thread(void *arg)
                 server->mdout.length    = server->evnt.rlength;
                 server->mdout.start     = server->in_buf;
                 server->mdout.threshold = 1; 
-                server->mdout.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
+                server->mdout.options   = LNET_MD_OP_PUT;
                 server->mdout.user_ptr  = NULL;
-                server->mdout.eq_handle = PTL_EQ_NONE;
+                server->mdout.eq_handle = LNET_EQ_NONE;
        
                 /* Bind the outgoing buffer */
-                if ((rc = PtlMDBind (server->ni, server->mdout, 
-                                     PTL_UNLINK, &server->mdout_h))) {
-                         PDEBUG ("PtlMDBind", rc);
+                if ((rc = LNetMDBind (server->mdout, 
+                                     LNET_UNLINK, &server->mdout_h))) {
+                         PDEBUG ("LNetMDBind", rc);
                          pingsrv_shutdown (1);
                          return 1;
                }
@@ -128,19 +128,21 @@ int pingsrv_thread(void *arg)
                 server->mdin.start     = server->in_buf;
                 server->mdin.length    = MAXSIZE;
                 server->mdin.threshold = 1; 
-                server->mdin.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
+                server->mdin.options   = LNET_MD_OP_PUT;
                 server->mdin.user_ptr  = NULL;
                 server->mdin.eq_handle = server->eq;
         
-                if ((rc = PtlMDAttach (server->me, server->mdin,
-                        PTL_UNLINK, &server->mdin_h))) {
-                        PDEBUG ("PtlMDAttach (bulk)", rc);
+                if ((rc = LNetMDAttach (server->me, server->mdin,
+                        LNET_UNLINK, &server->mdin_h))) {
+                        PDEBUG ("LNetMDAttach (bulk)", rc);
                         CDEBUG (D_OTHER, "ping server resources allocated\n");
                 }
                 
-                if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ,
-                         server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0)))
-                         PDEBUG ("PtlPut", rc);
+                if ((rc = LNetPut (server->evnt.target.nid, server->mdout_h, 
+                                   LNET_NOACK_REQ,
+                                   server->evnt.initiator, PTL_PING_CLIENT, 
+                                   0, 0, 0)))
+                         PDEBUG ("LNetPut", rc);
                 
                 atomic_dec (&pkt);
                 
@@ -150,13 +152,13 @@ int pingsrv_thread(void *arg)
         return 0;    
 }
 
-static void pingsrv_packet(ptl_event_t *ev)
+static void pingsrv_packet(lnet_event_t *ev)
 {
         atomic_inc (&pkt);
         wake_up_process (server->tsk);
 } /* pingsrv_head() */
 
-static void pingsrv_callback(ptl_event_t *ev)
+static void pingsrv_callback(lnet_event_t *ev)
 {
         
         if (ev == NULL) {
@@ -165,12 +167,13 @@ static void pingsrv_callback(ptl_event_t *ev)
         }
         server->evnt = *ev;
         
-        CWARN ("received ping from nid "LPX64" "
+        CWARN ("received ping from nid %s "
                "(off=%u rlen=%u mlen=%u head=%x seq=%d size=%d)\n",
-               ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
-               __le32_to_cpu(*((int *)(ev->md.start + ev->offset))),
-               __le32_to_cpu(*((int *)(ev->md.start + ev->offset + sizeof(unsigned)))),
-               __le32_to_cpu(*((int *)(ev->md.start + ev->offset + 2 * 
+               libcfs_nid2str(ev->initiator.nid), 
+               ev->offset, ev->rlength, ev->mlength,
+               __le32_to_cpu(*((int *)((char *)ev->md.start + ev->offset))),
+               __le32_to_cpu(*((int *)((char *)ev->md.start + ev->offset + sizeof(unsigned)))),
+               __le32_to_cpu(*((int *)((char *)ev->md.start + ev->offset + 2 * 
                                sizeof(unsigned)))));
         
         packets_valid++;
@@ -184,41 +187,38 @@ static struct pingsrv_data *pingsrv_setup(void)
 {
         int rc;
 
-        server->ni = PTL_INVALID_HANDLE;
-
-       /* Aquire and initialize the proper nal for portals. */
-        rc = PtlNIInit(nal, 0, NULL, NULL, &server->ni);
-        if (!(rc == PTL_OK || rc == PTL_IFACE_DUP)) {
-                CDEBUG (D_OTHER, "NAL %x not loaded\n", nal);
+        /* Aquire and initialize the proper nal for portals. */
+        rc = LNetNIInit(0);
+        if (!(rc == 0 || rc == 1)) {
+                CDEBUG (D_OTHER, "LNetNIInit: error %d\n", rc);
                 return pingsrv_shutdown (4);
         }
 
 
         /* Based on the initialization aquire our unique portal ID. */
-        if ((rc = PtlGetId (server->ni, &server->my_id))) {
-                PDEBUG ("PtlGetId", rc);
+        if ((rc = LNetGetId (1, &server->my_id))) {
+                PDEBUG ("LNetGetId", rc);
                 return pingsrv_shutdown (2);
         }
 
-        server->id_local.nid = PTL_NID_ANY;
-        server->id_local.pid = PTL_PID_ANY;
+        server->id_local.nid = LNET_NID_ANY;
+        server->id_local.pid = LNET_PID_ANY;
 
         /* Attach a match entries for header packets */
-        if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER,
+        if ((rc = LNetMEAttach (PTL_PING_SERVER,
             server->id_local,0, ~0,
-            PTL_RETAIN, PTL_INS_AFTER, &server->me))) {
-                PDEBUG ("PtlMEAttach", rc);
+            LNET_RETAIN, LNET_INS_AFTER, &server->me))) {
+                PDEBUG ("LNetMEAttach", rc);
                 return pingsrv_shutdown (2);
         }
 
 
-        if ((rc = PtlEQAlloc (server->ni, 1024, &pingsrv_callback,
-                                        &server->eq))) {
-                PDEBUG ("PtlEQAlloc (callback)", rc);
+        if ((rc = LNetEQAlloc (1024, &pingsrv_callback, &server->eq))) {
+                PDEBUG ("LNetEQAlloc (callback)", rc);
                 return pingsrv_shutdown (2);
         }
         
-        PORTAL_ALLOC (server->in_buf, MAXSIZE);
+        LIBCFS_ALLOC (server->in_buf, MAXSIZE);
         if(!server->in_buf){
                 CDEBUG (D_OTHER,"Allocation error\n");
                 return pingsrv_shutdown(2);
@@ -228,29 +228,36 @@ static struct pingsrv_data *pingsrv_setup(void)
         server->mdin.start     = server->in_buf;
         server->mdin.length    = MAXSIZE;
         server->mdin.threshold = 1; 
-        server->mdin.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
+        server->mdin.options   = LNET_MD_OP_PUT;
         server->mdin.user_ptr  = NULL;
         server->mdin.eq_handle = server->eq;
         memset (server->in_buf, 0, STDSIZE);
-        
-        if ((rc = PtlMDAttach (server->me, server->mdin,
-                PTL_UNLINK, &server->mdin_h))) {
-                    PDEBUG ("PtlMDAttach (bulk)", rc);
+
+        if ((rc = LNetMDAttach (server->me, server->mdin,
+                LNET_UNLINK, &server->mdin_h))) {
+                    PDEBUG ("LNetMDAttach (bulk)", rc);
                 CDEBUG (D_OTHER, "ping server resources allocated\n");
        }
+
         /* Success! */
-        return server; 
+        return server;
 } /* pingsrv_setup() */
 
-static int pingsrv_start(void) 
+static int pingsrv_start(void)
 {
+        long pid;
+
         /* Setup our server */
         if (!pingsrv_setup()) {
                 CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n");
                 return -ENOMEM;
         }
-        cfs_kernel_thread (pingsrv_thread,NULL,0);
+        pid = cfs_kernel_thread (pingsrv_thread,NULL,0);
+        if (pid < 0) {
+                CERROR("Can't start pingsrv thread: rc = %ld\n", pid);
+                return (int)pid;
+        }
+
         return 0;
 } /* pingsrv_start() */
 
@@ -258,7 +265,7 @@ static int __init pingsrv_init(void)
 {
         ping_head_magic = __cpu_to_le32(PING_HEADER_MAGIC);
         ping_bulk_magic = __cpu_to_le32(PING_BULK_MAGIC);
-        PORTAL_ALLOC (server, sizeof(struct pingsrv_data));  
+        LIBCFS_ALLOC (server, sizeof(struct pingsrv_data));  
         atomic_set(&pkt, 0);
         return pingsrv_start ();
 } /* pingsrv_init() */
@@ -270,17 +277,13 @@ static void /*__exit*/ pingsrv_cleanup(void)
         running = 0;
         wake_up_process (server->tsk);
         while (running != 1) {
-                set_current_state (TASK_UNINTERRUPTIBLE);
-                schedule_timeout (cfs_time_seconds(1));
+                set_current_state (CFS_TASK_UNINT);
+                cfs_schedule_timeout (CFS_TASK_UNINT, cfs_time_seconds(1));
         }
         
 } /* pingsrv_cleanup() */
 
 
-MODULE_PARM(nal, "i");
-MODULE_PARM_DESC(nal, "Use the specified NAL "
-                "(2-ksocknal, 1-kqswnal)");
 MODULE_AUTHOR("Brian Behlendorf (LLNL)");
 MODULE_DESCRIPTION("A kernel space ping server for portals testing");
 MODULE_LICENSE("GPL");
index 21024f0..b08212c 100644 (file)
@@ -2,36 +2,39 @@
 <!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
 <plist version="1.0">
 <dict>
-       <key>CFBundleDevelopmentRegion</key>
-       <string>English</string>
-       <key>CFBundleExecutable</key>
-       <string>ping_srv</string>
-       <key>CFBundleIconFile</key>
-       <string></string>
-       <key>CFBundleIdentifier</key>
-       <string>com.clusterfs.lustre.portals.tests.ping_srv</string>
-       <key>CFBundleInfoDictionaryVersion</key>
-       <string>6.0</string>
-       <key>CFBundlePackageType</key>
-       <string>KEXT</string>
-       <key>CFBundleSignature</key>
-       <string>????</string>
-       <key>CFBundleVersion</key>
-       <string>1.0.0d1</string>
-       <key>OSBundleLibraries</key>
-       <dict> 
-               <key>com.apple.kernel.bsd</key> 
-               <string>1.1</string> 
-               <key>com.apple.kernel.iokit</key> 
-               <string>1.0.0b1</string> 
-               <key>com.apple.kernel.mach</key> 
-               <string>1.0.0b1</string> 
-               <key>com.clusterfs.lustre.portals.libcfs</key> 
-               <string>1.0.0</string> 
-               <key>com.clusterfs.lustre.portals.portals</key> 
-               <string>1.0.0</string>
-               <key>com.clusterfs.lustre.portals.knals.ksocknal</key> 
-               <string>1.0.0</string>
-       </dict>
+        <key>CFBundleDevelopmentRegion</key>
+        <string>English</string>
+        <key>CFBundleExecutable</key>
+        <string>pingsrv</string>
+        <key>CFBundleIconFile</key>
+        <string></string>
+        <key>CFBundleIdentifier</key>
+        <string>com.clusterfs.lustre.pingsrv</string>
+        <key>CFBundleInfoDictionaryVersion</key>
+        <string>6.0</string>
+        <key>CFBundlePackageType</key>
+        <string>KEXT</string>
+        <key>CFBundleSignature</key>
+        <string>????</string>
+        <key>CFBundleVersion</key>
+        <string>1.0.1</string>
+        <key>OSBundleCompatibleVersion</key>
+        <string>1.0.0</string>
+        <key>OSBundleLibraries</key>
+        <dict>
+                <key>com.apple.kpi.bsd</key>
+                <string>8.0.0b1</string>
+                <key>com.apple.kpi.libkern</key>
+                <string>8.0.0b1</string>
+                <key>com.apple.kpi.mach</key>
+                <string>8.0.0b1</string>
+                <key>com.apple.kpi.unsupported</key>
+                <string>8.0.0b1</string>
+                <key>com.clusterfs.lustre.libcfs</key>
+                <string>1.0.0</string>
+                <key>com.clusterfs.lustre.lnet</key>
+                <string>1.0.0</string>
+        </dict>
 </dict>
 </plist>
+
diff --git a/lnet/tests/ping_srv/winnt-pingsrv.c b/lnet/tests/ping_srv/winnt-pingsrv.c
new file mode 100644 (file)
index 0000000..7c9a1a1
--- /dev/null
@@ -0,0 +1,634 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ *   Author: Matt Wu <mattwu@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+/*
+ *  Included Headers 
+ */
+
+
+#include <libcfs/libcfs.h>
+
+
+/* libcfs module init/exit routines */
+DECLARE_INIT(init_libcfs_module);
+DECLARE_EXIT(exit_libcfs_module);
+
+/* portal module init/exit routines */
+DECLARE_INIT(init_lnet);
+DECLARE_EXIT(fini_lnet);
+
+/* tdinal module init/exit routines */
+DECLARE_INIT(ksocknal_module_init);
+DECLARE_EXIT(ksocknal_module_fini);
+
+/* pingcli module init/exit routines */
+DECLARE_INIT(pingcli_init);
+DECLARE_EXIT(pingcli_cleanup);
+
+
+/* pingsrv module init/exit routines */
+DECLARE_INIT(pingsrv_init);
+DECLARE_EXIT(pingsrv_cleanup);
+
+/*
+ * structure definitions
+ */
+
+
+#define LUSTRE_PING_VERSION   0x00010000               /* ping srv/cli version: 0001.0000 */
+
+#define LUSTRE_PING_DEVICE    L"\\Device\\LNET"     /* device object name */
+#define LUSTRE_PING_SYMLNK    L"\\DosDevices\\LNET" /* user-visible name for the device*/
+
+typedef struct _DEVICE_EXTENSION
+{
+    BOOLEAN    bProcFS;
+
+} DEVICE_EXTENSION, *PDEVICE_EXTENSION;
+
+
+/*
+ *  global definitions
+ */
+
+PDEVICE_OBJECT  PingObject = NULL;  /* ping device object */
+PDEVICE_OBJECT  ProcObject = NULL;  /* procfs emulator device */
+
+
+/*
+ *  common routines
+ */
+
+
+//
+// complete Irp request ...
+//
+
+NTSTATUS
+UTCompleteIrp(
+    PIRP        Irp,
+    NTSTATUS    Status,
+    ULONG       Info
+    )
+{
+    Irp->IoStatus.Status = Status;
+    Irp->IoStatus.Information = Info;
+    IoCompleteRequest(Irp,IO_NO_INCREMENT);
+
+    return Status;
+}
+
+//
+//  Open/Create Device ...
+//
+
+NTSTATUS
+UTCreate(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    KdPrint(("UTCreate: DeviceCreate ...\n"));
+
+    return UTCompleteIrp(Irp,STATUS_SUCCESS,0);
+}
+
+//
+// Close Device ...
+//
+
+NTSTATUS
+UTClose(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp)
+{
+    KdPrint(("UTClose: Device Closed.\n"));
+
+    return UTCompleteIrp(Irp, STATUS_SUCCESS, 0);
+
+    UNREFERENCED_PARAMETER(DeviceObject);
+}
+
+
+
+NTSTATUS
+UTShutdown(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    KdPrint(("UTShutdown: shuting TdiSock ...\n"));
+
+    return UTCompleteIrp(Irp, STATUS_SUCCESS, 0);
+
+    UNREFERENCED_PARAMETER(DeviceObject);
+}
+
+//
+// driver frame Routines ...
+//
+
+
+NTSTATUS
+UTDeviceControl(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    NTSTATUS            Status = STATUS_INVALID_DEVICE_REQUEST;
+    PIO_STACK_LOCATION  IrpSp;
+
+    ULONG               ControlCode;
+    ULONG               InputLength;
+    ULONG               OutputLength;
+
+    PVOID               lpvInBuffer;
+
+    KdPrint(("UTDeviceControl: Device Ioctl ...\n"));
+
+    Irp->IoStatus.Information = 0;
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+    ControlCode  = IrpSp->Parameters.DeviceIoControl.IoControlCode;
+    InputLength  = IrpSp->Parameters.DeviceIoControl.InputBufferLength;
+    OutputLength = IrpSp->Parameters.DeviceIoControl.OutputBufferLength;
+    lpvInBuffer  = Irp->AssociatedIrp.SystemBuffer;
+
+    ASSERT (IrpSp->MajorFunction == IRP_MJ_DEVICE_CONTROL);
+
+    switch (ControlCode)
+    {
+        case IOCTL_LIBCFS_VERSION:
+
+            *((ULONG *)lpvInBuffer) = (ULONG)(LUSTRE_PING_VERSION);
+            Irp->IoStatus.Information = sizeof(ULONG);
+            Status = STATUS_SUCCESS;
+            break;
+
+        default:
+            break;
+    }
+
+    Irp->IoStatus.Status = Status;
+
+    IoCompleteRequest(Irp, IO_NO_INCREMENT);
+
+    KdPrint(("UTDeviceControl: Device Ioctl returned.\n"));
+
+    return Status;
+}
+
+NTSTATUS
+ProcCreate(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    NTSTATUS                    Status;
+    PIO_STACK_LOCATION          IrpSp;
+
+    FILE_FULL_EA_INFORMATION *  ea;
+    cfs_file_t *                fp;
+
+    KdPrint(("ProcCreate: Proc device is being opened ...\n"));
+
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+    ea = (PFILE_FULL_EA_INFORMATION) Irp->AssociatedIrp.SystemBuffer;
+
+    if (!ea) {
+        Status = STATUS_INVALID_PARAMETER;
+    } else {
+        fp = lustre_open_file(&ea->EaName[0]);
+        if (!fp) {
+            Status = STATUS_OBJECT_NAME_NOT_FOUND;
+        } else {
+            IrpSp->FileObject->FsContext = fp;
+            IrpSp->FileObject->FsContext2 = fp->private_data;
+            Status = STATUS_SUCCESS;
+        }
+    }
+
+    return UTCompleteIrp(Irp, Status, 0);
+}
+
+//
+// Close Device ...
+//
+
+NTSTATUS
+ProcClose(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp)
+{
+    PIO_STACK_LOCATION          IrpSp;
+
+    cfs_file_t *                fp;
+
+    KdPrint(("ProcClose: Proc device object is to be closed.\n"));
+
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+    fp = (cfs_file_t *) IrpSp->FileObject->FsContext;
+
+    ASSERT(fp != NULL);
+    ASSERT(IrpSp->FileObject->FsContext2 == fp->private_data);
+
+    lustre_close_file(fp);
+
+    return UTCompleteIrp(Irp, STATUS_SUCCESS, 0);
+
+    UNREFERENCED_PARAMETER(DeviceObject);
+}
+
+/*
+ * proc frame routines
+ */
+
+NTSTATUS
+ProcDeviceControl(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    NTSTATUS            Status = STATUS_INVALID_DEVICE_REQUEST;
+    PIO_STACK_LOCATION  IrpSp;
+
+    ULONG               ControlCode;
+    ULONG               InputLength;
+    ULONG               OutputLength;
+
+    PVOID               lpvInBuffer;
+
+    KdPrint(("ProcDeviceControl: Proc device ioctling ...\n"));
+
+    Irp->IoStatus.Information = 0;
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+    ControlCode  = IrpSp->Parameters.DeviceIoControl.IoControlCode;
+    InputLength  = IrpSp->Parameters.DeviceIoControl.InputBufferLength;
+    OutputLength = IrpSp->Parameters.DeviceIoControl.OutputBufferLength;
+    lpvInBuffer  = Irp->AssociatedIrp.SystemBuffer;
+
+    ASSERT (IrpSp->MajorFunction == IRP_MJ_DEVICE_CONTROL);
+
+    switch (ControlCode)
+    {
+        case IOCTL_LIBCFS_VERSION:
+
+            *((ULONG *)lpvInBuffer) = (ULONG)(LUSTRE_PING_VERSION);
+            Irp->IoStatus.Information = sizeof(ULONG);
+
+            Status = STATUS_SUCCESS;
+
+            break;
+
+        case IOCTL_LIBCFS_ENTRY:
+        {
+            int rc = 0;
+            cfs_file_t * fp;
+
+            fp = (cfs_file_t *) IrpSp->FileObject->FsContext;
+
+            if (!fp) {
+                rc = -EINVAL;
+            } else {
+                rc = lustre_ioctl_file(fp, (PCFS_PROC_IOCTL) (lpvInBuffer));
+            }
+
+            if (rc == 0) {
+                Irp->IoStatus.Information = InputLength;
+                Status = STATUS_SUCCESS;
+            }
+        }    
+    }
+
+    Irp->IoStatus.Status = Status;
+
+    IoCompleteRequest(Irp, IO_NO_INCREMENT);
+
+    KdPrint(("ProcDeviceControl: Proc device ioctl returned with status = %xh.\n", Status));
+
+    return Status;
+}
+
+
+
+NTSTATUS
+ProcReadWrite (PDEVICE_OBJECT DeviceObject, PIRP Irp)
+{
+    PIO_STACK_LOCATION  IrpSp;
+    NTSTATUS            Status;
+
+    cfs_file_t *        fp;
+    int                 rc;
+    PCHAR               buf;
+
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+    if (Irp->MdlAddress) {
+        buf = MmGetSystemAddressForMdlSafe(
+                        Irp->MdlAddress,
+                        NormalPagePriority);
+    } else {
+        buf = Irp->AssociatedIrp.SystemBuffer;
+    }
+
+    if (buf == NULL) {
+        Status = STATUS_SUCCESS;
+        rc = 0;
+    } else {
+        fp = (cfs_file_t *) IrpSp->FileObject->FsContext;
+
+        if (!fp) {
+            Status = STATUS_INVALID_PARAMETER;
+            goto errorout;
+        }
+
+        if (IrpSp->MajorFunction == IRP_MJ_READ) {
+            rc = lustre_read_file(
+                    fp, IrpSp->Parameters.Read.ByteOffset.LowPart,
+                    IrpSp->Parameters.Read.Length, buf);
+        } else {
+            rc = lustre_write_file(
+                    fp, IrpSp->Parameters.Write.ByteOffset.LowPart,
+                    IrpSp->Parameters.Write.Length, buf);
+        }
+        if (rc < 0) {
+            cfs_enter_debugger();
+            Status = STATUS_UNSUCCESSFUL;
+        } else {
+            Status = STATUS_SUCCESS;
+        }
+    }
+
+errorout:
+    return UTCompleteIrp(Irp, Status, rc);
+}
+
+
+//
+//  common dispatch routines
+//
+
+NTSTATUS
+UTDispatchRequest(
+    IN PDEVICE_OBJECT DeviceObject,
+    IN PIRP           Irp
+    )
+{
+    NTSTATUS            Status;
+    PIO_STACK_LOCATION  IrpSp;
+
+    Status = STATUS_INVALID_DEVICE_REQUEST;
+
+    __try {
+
+        IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+        switch (IrpSp->MajorFunction) {
+
+            case IRP_MJ_CREATE:
+                if (DeviceObject == PingObject) {
+                    Status = UTCreate(DeviceObject, Irp);
+                } else if (DeviceObject == ProcObject) {
+                    Status = ProcCreate(DeviceObject, Irp);
+                }
+                break;
+        
+            case IRP_MJ_CLOSE:
+                if (DeviceObject == PingObject) {
+                    Status = UTClose(DeviceObject, Irp);
+                } else if (DeviceObject == ProcObject) {
+                    Status = ProcClose(DeviceObject, Irp);
+                }
+                break;
+
+            case IRP_MJ_READ:
+            case IRP_MJ_WRITE:
+                if (DeviceObject == ProcObject) {
+                    Status = ProcReadWrite(DeviceObject, Irp);
+                }
+                break;
+        
+            case IRP_MJ_DEVICE_CONTROL:
+                if (DeviceObject == PingObject) {
+                    Status = UTDeviceControl(DeviceObject, Irp);
+                } else if (DeviceObject == ProcObject) {
+                    Status = ProcDeviceControl(DeviceObject, Irp);
+                }
+                break;
+
+            case IRP_MJ_SHUTDOWN:
+                Status = UTShutdown(DeviceObject, Irp);
+                break;
+
+            default:
+
+                KdPrint(("UTDispatchRequest: Major Function: %xh is not supported.\n",
+                           IrpSp->MajorFunction));
+                UTCompleteIrp(Irp, Status, 0);
+                break;
+        }
+    }
+
+    __finally {
+    }
+
+    return Status;
+}
+
+//
+// create a device object and a dosdevice symbol link
+//
+
+PDEVICE_OBJECT
+CreateDevice(
+    IN PDRIVER_OBJECT   DriverObject,
+    IN PWCHAR           DeviceName,
+    IN PWCHAR           SymlnkName,
+    IN BOOLEAN          bProcFS
+    )
+{
+    NTSTATUS            Status;
+
+    UNICODE_STRING      NtDevName;
+    UNICODE_STRING      Win32DevName;
+
+    PDEVICE_EXTENSION   DeviceExtension;
+    PDEVICE_OBJECT      DeviceObject;
+
+    /* create the device object with the specified name */
+
+    RtlInitUnicodeString(&NtDevName, DeviceName);
+    
+    Status = IoCreateDevice(
+                    DriverObject,
+                    sizeof(DEVICE_EXTENSION),
+                    &NtDevName,
+                    FILE_DEVICE_UNKNOWN,
+                    0,
+                    FALSE,
+                    &DeviceObject );
+        
+    if (!NT_SUCCESS(Status)) {
+
+        cfs_enter_debugger();
+        return NULL;
+    }
+
+    /* create the symlink to make the device visible to user */
+
+    RtlInitUnicodeString(&Win32DevName, SymlnkName);
+        
+    Status = IoCreateSymbolicLink(&Win32DevName, &NtDevName);
+
+    if (!NT_SUCCESS(Status)) {
+
+        IoDeleteDevice(DeviceObject);
+        return NULL;
+    }
+
+    DeviceExtension = (PDEVICE_EXTENSION)DeviceObject->DeviceObjectExtension;
+    DeviceExtension->bProcFS = bProcFS;
+
+    DeviceObject->Flags |= DO_BUFFERED_IO;
+    DeviceObject->Flags &= ~DO_DEVICE_INITIALIZING;
+
+    return DeviceObject;
+}
+
+
+//
+// DriverEntry
+//
+
+NTSTATUS DriverEntry(
+    IN PDRIVER_OBJECT  DriverObject,
+    IN PUNICODE_STRING RegistryPath 
+    )
+{
+    KdPrint(("Lustre ping test: Build Time: " __DATE__ " " __TIME__ "\n"));
+    KdPrint(("Lustre ping test: DriverEntry ... \n"));
+
+    /* initialize libcfs module */
+    if (module_init_libcfs_module() != 0) {
+        KdPrint(("ping: error initialize module: libcfs ...\n"));
+        goto errorout;
+    }
+
+    /* initialize lnet module */
+    if (module_init_lnet() != 0) {
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initialize module: lnet ...\n"));
+        goto errorout;
+    }
+
+    /* initialize tdinal module */
+    if (module_ksocknal_module_init() != 0) {
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initialize module: tdilnd ...\n"));
+        goto errorout;
+    }
+
+#if defined(LUSTRE_PING_CLI)
+    /* initialize pingcli module */
+    if (module_pingcli_init() != 0) {
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initialize module: pingcli ...\n"));
+        goto errorout;
+    }
+#endif
+
+#if defined(LUSTRE_PING_SRV)
+    /* initialize pingsrv module */
+    if (module_pingsrv_init() != 0) {
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initialize module: pingsrv ...\n"));
+        goto errorout;
+    }
+#endif
+
+    /* create the ping device object */
+    PingObject = CreateDevice(
+                        DriverObject,
+                        LUSTRE_PING_DEVICE,
+                        LUSTRE_PING_SYMLNK,
+                        FALSE );
+    if (!PingObject) {
+#if defined(LUSTRE_PING_CLI)
+        module_pingcli_cleanup();
+#endif
+#if defined(LUSTRE_PING_SRV)
+        module_pingsrv_cleanup();
+#endif
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+
+        return STATUS_INSUFFICIENT_RESOURCES;
+    }
+
+    /* create the libcfs proc fs emulator device object */
+    ProcObject  = CreateDevice(
+                        DriverObject,
+                        LUSTRE_PROC_DEVICE,
+                        LUSTRE_PROC_SYMLNK,
+                        TRUE );
+    if (!ProcObject) {
+
+        IoDeleteDevice(PingObject);
+#if defined(LUSTRE_PING_CLI)
+        module_pingcli_cleanup();
+#endif
+#if defined(LUSTRE_PING_SRV)
+        module_pingsrv_cleanup();
+#endif
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        return STATUS_INSUFFICIENT_RESOURCES;
+    }
+
+    /* initialize the driver callback routines */
+
+    DriverObject->MajorFunction[IRP_MJ_CREATE]          = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_CLOSE]           = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_READ]            = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_WRITE]           = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_SHUTDOWN]        = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_DEVICE_CONTROL]  = UTDispatchRequest;
+
+    return STATUS_SUCCESS;
+
+errorout:
+
+    cfs_enter_debugger();
+
+    return STATUS_UNSUCCESSFUL;
+}
diff --git a/lnet/tests/sping_cli.c b/lnet/tests/sping_cli.c
deleted file mode 100644 (file)
index 71a2a98..0000000
+++ /dev/null
@@ -1,279 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
- * Author: Brian Behlendorf <behlendorf1@llnl.gov>
- *         Kedar Sovani (kedar@calsoftinc.com)
- *         Amey Inamdar (amey@calsoftinc.com)
- *
- * This file is part of Portals, http://www.sf.net/projects/lustre/
- *
- * Portals is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * Portals is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Portals; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-/* This is a striped down version of pinger. It follows a single
- * request-response protocol. Doesn't do Bulk data pinging. Also doesn't
- * send multiple packets in a single ioctl.
- */
-
-
-#define DEBUG_SUBSYSTEM S_PINGER
-
-#include <libcfs/kp30.h>
-#include <portals/p30.h>
-#include <linux/module.h>
-#include <linux/proc_fs.h>
-#include <linux/init.h>
-#include <linux/poll.h>
-#include "ping.h"
-/* int portal_debug = D_PING_CLI;  */
-
-
-#define STDSIZE (sizeof(int) + sizeof(int) + 4) /* The data is 4 bytes
-                                                   assumed */
-
-/* This should be enclosed in a structure */
-
-static struct pingcli_data *client = NULL;
-
-static int count = 0;
-
-static void
-pingcli_shutdown(ptl_handle_ni_t nih, int err)
-{
-        int rc;
-
-        /* Yes, we are intentionally allowing us to fall through each
-         * case in to the next.  This allows us to pass an error
-         * code to just clean up the right stuff.
-         */
-        switch (err) {
-                case 1:
-                        /* Unlink any memory descriptors we may have used */
-                        if ((rc = PtlMDUnlink (client->md_out_head_h)))
-                                PDEBUG ("PtlMDUnlink", rc);
-                case 2:
-                        /* Free the event queue */
-                        if ((rc = PtlEQFree (client->eq)))
-                                PDEBUG ("PtlEQFree", rc);
-
-                        if ((rc = PtlMEUnlink (client->me)))
-                                PDEBUG ("PtlMEUnlink", rc);
-                case 3:
-                        PtlNIFini (nih);
-
-                case 4:
-                        /* Free our buffers */
-                        if (client->outbuf != NULL)
-                                PORTAL_FREE (client->outbuf, STDSIZE);
-
-                        if (client->inbuf != NULL)
-                                PORTAL_FREE (client->inbuf, STDSIZE);
-
-
-                        if (client != NULL)
-                                PORTAL_FREE (client,
-                                                sizeof(struct pingcli_data));
-        }
-
-
-        CDEBUG (D_OTHER, "ping client released resources\n");
-} /* pingcli_shutdown() */
-
-static void pingcli_callback(ptl_event_t *ev)
-{
-        wake_up_process (client->tsk);
-}
-
-
-static struct pingcli_data *
-pingcli_start(struct portal_ioctl_data *args)
-{
-        ptl_handle_ni_t nih = PTL_INVALID_HANDLE;
-        unsigned ping_head_magic = PING_HEADER_MAGIC;
-        char str[PTL_NALFMT_SIZE];
-        int rc;
-
-        client->tsk = current;
-        client->args = args;
-
-        CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64" (%s),  \
-                        nal %x, size %u, count: %u, timeout: %u\n",
-                        args->ioc_nid,
-                        portals_nid2str(args->ioc_nid, args->ioc_nal, str),
-                        args->ioc_nal, args->ioc_size,
-                        args->ioc_count, args->ioc_timeout);
-
-
-        PORTAL_ALLOC (client->outbuf, STDSIZE) ;
-        if (client->outbuf == NULL)
-        {
-                CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE);
-                pingcli_shutdown (nih, 4);
-                return (NULL);
-        }
-
-        PORTAL_ALLOC (client->inbuf,  STDSIZE);
-
-        if (client->inbuf == NULL)
-        {
-                CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE);
-                pingcli_shutdown (nih, 4);
-                return (NULL);
-        }
-
-        /* Aquire and initialize the proper nal for portals. */
-        rc = PtlNIInit(args->ioc_nal, 0, NULL, NULL, &nih);
-        if (rc != PTL_OK && rc != PTL_IFACE_DUP)
-        {
-                CERROR ("NAL %x not loaded.\n", args->ioc_nal);
-                pingcli_shutdown (nih, 4);
-                return (NULL);
-        }
-
-        /* Based on the initialization aquire our unique portal ID. */
-        if ((rc = PtlGetId (nih, &client->myid)))
-        {
-                CERROR ("PtlGetId error %d\n", rc);
-                pingcli_shutdown (nih, 2);
-                return (NULL);
-        }
-
-        /* Setup the local match entries */
-        client->id_local.nid = PTL_NID_ANY;
-        client->id_local.pid = PTL_PID_ANY;
-
-        /* Setup the remote match entries */
-        client->id_remote.nid = args->ioc_nid;
-        client->id_remote.pid = 0;
-
-        if ((rc = PtlMEAttach (nih, PTL_PING_CLIENT,
-                   client->id_local, 0, ~0, PTL_RETAIN,
-                   PTL_INS_AFTER, &client->me)))
-        {
-                CERROR ("PtlMEAttach error %d\n", rc);
-                pingcli_shutdown (nih, 2);
-                return (NULL);
-        }
-
-        /* Allocate the event queue for this network interface */
-        if ((rc = PtlEQAlloc (nih, 64, pingcli_callback, &client->eq)))
-        {
-                CERROR ("PtlEQAlloc error %d\n", rc);
-                pingcli_shutdown (nih, 2);
-                return (NULL);
-        }
-
-
-        client->md_in_head.start     = client->inbuf;
-        client->md_in_head.length    = STDSIZE;
-        client->md_in_head.threshold = 1;
-        client->md_in_head.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
-        client->md_in_head.user_ptr  = NULL;
-        client->md_in_head.eq_handle = client->eq;
-        memset (client->inbuf, 0, STDSIZE);
-
-        /* Attach the incoming buffer */
-        if ((rc = PtlMDAttach (client->me, client->md_in_head,
-                              PTL_UNLINK, &client->md_in_head_h))) {
-                CERROR ("PtlMDAttach error %d\n", rc);
-                pingcli_shutdown (nih, 1);
-                return (NULL);
-        }
-
-        /* Setup the outgoing ping header */
-        client->md_out_head.start     = client->outbuf;
-        client->md_out_head.length    = STDSIZE;
-        client->md_out_head.threshold = 1;
-        client->md_out_head.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
-        client->md_out_head.user_ptr  = NULL;
-        client->md_out_head.eq_handle = PTL_EQ_NONE;
-
-        memcpy (client->outbuf, &ping_head_magic, sizeof(ping_head_magic));
-
-        /* Bind the outgoing ping header */
-        if ((rc=PtlMDBind (nih, client->md_out_head,
-                           PTL_UNLINK, &client->md_out_head_h))) {
-                CERROR ("PtlMDBind error %d\n", rc);
-                pingcli_shutdown (nih, 1);
-                return (NULL);
-        }
-        /* Put the ping packet */
-        if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ,
-                         client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) {
-                PDEBUG ("PtlPut (header)", rc);
-                pingcli_shutdown (nih, 1);
-                return NULL;
-        }
-
-        count = 0;
-        set_current_state (TASK_INTERRUPTIBLE);
-        rc = schedule_timeout (20 * args->ioc_timeout);
-        if (rc == 0) {
-                CERROR ("Time out on the server\n");
-                pingcli_shutdown (nih, 2);
-                return NULL;
-        } else {
-                CWARN("Received respose from the server \n");
-        }
-
-        pingcli_shutdown (nih, 2);
-
-        /* Success! */
-        return NULL;
-} /* pingcli_setup() */
-
-
-
-/* called by the portals_ioctl for ping requests */
-int kping_client(struct portal_ioctl_data *args)
-{
-
-        PORTAL_ALLOC (client, sizeof(struct pingcli_data));
-        memset (client, 0, sizeof(struct pingcli_data));
-        if (client == NULL)
-        {
-                CERROR ("Unable to allocate client structure\n");
-                return (0);
-        }
-        pingcli_start (args);
-
-        return 0;
-} /* kping_client() */
-
-
-static int __init pingcli_init(void)
-{
-        PORTAL_SYMBOL_REGISTER(kping_client);
-        return 0;
-} /* pingcli_init() */
-
-
-static void /*__exit*/ pingcli_cleanup(void)
-{
-        PORTAL_SYMBOL_UNREGISTER (kping_client);
-} /* pingcli_cleanup() */
-
-
-MODULE_AUTHOR("Brian Behlendorf (LLNL)");
-MODULE_DESCRIPTION("A simple kernel space ping client for portals testing");
-MODULE_LICENSE("GPL");
-
-module_init(pingcli_init);
-module_exit(pingcli_cleanup);
-
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-EXPORT_SYMBOL (kping_client);
-#endif
diff --git a/lnet/tests/sping_srv.c b/lnet/tests/sping_srv.c
deleted file mode 100644 (file)
index 30f158c..0000000
+++ /dev/null
@@ -1,294 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
- * Author: Brian Behlendorf <behlendorf1@llnl.gov>
- *        Amey Inamdar     <amey@calsoftinc.com>
- *        Kedar Sovani     <kedar@calsoftinc.com>
- *
- *
- * This file is part of Portals, http://www.sf.net/projects/lustre/
- *
- * Portals is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * Portals is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Portals; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/* This is a striped down version of pinger. It follows a single
- * request-response protocol. Doesn't do Bulk data pinging. Also doesn't 
- * send multiple packets in a single ioctl.
- */
-
-#define DEBUG_SUBSYSTEM S_PINGER
-
-#include <libcfs/kp30.h>
-#include <portals/p30.h>
-#include "ping.h"
-
-#include <linux/module.h>
-#include <linux/proc_fs.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/version.h>
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-#include <linux/workqueue.h>
-#else
-#include <linux/tqueue.h>
-#endif
-#include <linux/wait.h>
-#include <linux/smp_lock.h>
-
-#include <asm/unistd.h>
-#include <asm/semaphore.h>
-
-#define STDSIZE (sizeof(int) + sizeof(int) + 4)
-
-static int nal  = PTL_IFACE_DEFAULT;            // Your NAL,
-static unsigned long packets_valid = 0;         // Valid packets 
-static int running = 1;
-atomic_t pkt;
-       
-static struct pingsrv_data *server=NULL;             // Our ping server
-
-#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
-#endif
-
-static void *pingsrv_shutdown(int err)
-{
-        int rc;
-
-        /* Yes, we are intentionally allowing us to fall through each
-         * case in to the next.  This allows us to pass an error
-         * code to just clean up the right stuff.
-         */
-        switch (err) {
-                case 1:
-                        /* Unlink any memory descriptors we may have used */
-                        if ((rc = PtlMDUnlink (server->mdin_h)))
-                                PDEBUG ("PtlMDUnlink (out head buffer)", rc);
-                case 2:
-                        /* Free the event queue */
-                        if ((rc = PtlEQFree (server->eq)))
-                                PDEBUG ("PtlEQFree", rc);
-
-                        /* Unlink the client portal from the ME list */
-                        if ((rc = PtlMEUnlink (server->me)))
-                                        PDEBUG ("PtlMEUnlink", rc);
-
-                case 3:
-                        PtlNIFini(server->ni);
-
-                case 4:
-                        
-                        if (server->in_buf != NULL)
-                                PORTAL_FREE (server->in_buf, STDSIZE);
-                        
-                        if (server != NULL)
-                                PORTAL_FREE (server, 
-                                             sizeof (struct pingsrv_data));
-                        
-        }
-
-        CDEBUG (D_OTHER, "ping sever resources released\n");
-        return NULL;
-} /* pingsrv_shutdown() */
-
-
-int pingsrv_thread(void *arg)
-{
-        int rc;
-        
-        kportal_daemonize ("pingsrv");
-        server->tsk = current;
-        
-        while (running) {
-                set_current_state (TASK_INTERRUPTIBLE);
-                if (atomic_read (&pkt) == 0) {
-                        schedule_timeout (MAX_SCHEDULE_TIMEOUT);
-                        continue;
-                }
-                               
-                server->mdout.start     = server->in_buf;
-                server->mdout.length    = STDSIZE;
-                server->mdout.threshold = 1; 
-                server->mdout.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
-                server->mdout.user_ptr  = NULL;
-                server->mdout.eq_handle = PTL_EQ_NONE;
-       
-                /* Bind the outgoing buffer */
-                if ((rc = PtlMDBind (server->ni, server->mdout, 
-                                     PTL_UNLINK, &server->mdout_h))) {
-                         PDEBUG ("PtlMDBind", rc);
-                         pingsrv_shutdown (1);
-                         return 1;
-               }
-         
-                
-                server->mdin.start     = server->in_buf;
-                server->mdin.length    = STDSIZE;
-                server->mdin.threshold = 1; 
-                server->mdin.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
-                server->mdin.user_ptr  = NULL;
-                server->mdin.eq_handle = server->eq;
-        
-                if ((rc = PtlMDAttach (server->me, server->mdin,
-                        PTL_UNLINK, &server->mdin_h))) {
-                        PDEBUG ("PtlMDAttach (bulk)", rc);
-                        CDEBUG (D_OTHER, "ping server resources allocated\n");
-                }
-                
-                if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ,
-                         server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0)))
-                         PDEBUG ("PtlPut", rc);
-                
-                atomic_dec (&pkt);
-                
-        }
-        pingsrv_shutdown (1);
-        running = 1;
-        return 0;    
-}
-
-static void pingsrv_packet(ptl_event_t *ev)
-{
-        atomic_inc (&pkt);
-        wake_up_process (server->tsk);
-} /* pingsrv_head() */
-
-static void pingsrv_callback(ptl_event_t *ev)
-{
-        
-        if (ev == NULL) {
-                CERROR ("null in callback, ev=%p\n", ev);
-                return;
-        }
-        server->evnt = *ev;
-        
-        CWARN("Lustre: received ping from nid "LPX64" "
-              "(off=%u rlen=%u mlen=%u head=%x)\n",
-              ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
-              *((int *)(ev->md.start + ev->offset)));
-        
-        packets_valid++;
-
-        pingsrv_packet(ev);
-        
-} /* pingsrv_callback() */
-
-
-static struct pingsrv_data *pingsrv_setup(void)
-{
-        int rc;
-
-       /* Aquire and initialize the proper nal for portals. */
-        server->ni = PTL_INVALID_HANDLE;
-
-        rc = PtlNIInit(nal, 0, NULL, NULL, &server->ni);
-        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
-                CDEBUG (D_OTHER, "Nal %x not loaded.\n", nal);
-                return pingsrv_shutdown (4);
-        }
-
-        /* Based on the initialization aquire our unique portal ID. */
-        if ((rc = PtlGetId (server->ni, &server->my_id))) {
-                PDEBUG ("PtlGetId", rc);
-                return pingsrv_shutdown (2);
-        }
-
-        server->id_local.nid = PTL_NID_ANY;
-        server->id_local.pid = PTL_PID_ANY;
-
-        /* Attach a match entries for header packets */
-        if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER,
-            server->id_local,0, ~0,
-            PTL_RETAIN, PTL_INS_AFTER, &server->me))) {
-                PDEBUG ("PtlMEAttach", rc);
-                return pingsrv_shutdown (2);
-        }
-
-
-        if ((rc = PtlEQAlloc (server->ni, 64, pingsrv_callback,
-                                        &server->eq))) {
-                PDEBUG ("PtlEQAlloc (callback)", rc);
-                return pingsrv_shutdown (2);
-        }
-        
-        PORTAL_ALLOC (server->in_buf, STDSIZE);
-        if(!server->in_buf){
-                CDEBUG (D_OTHER,"Allocation error\n");
-                return pingsrv_shutdown(2);
-        }
-        
-        /* Setup the incoming buffer */
-        server->mdin.start     = server->in_buf;
-        server->mdin.length    = STDSIZE;
-        server->mdin.threshold = 1; 
-        server->mdin.options   = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
-        server->mdin.user_ptr  = NULL;
-        server->mdin.eq_handle = server->eq;
-        memset (server->in_buf, 0, STDSIZE);
-        
-        if ((rc = PtlMDAttach (server->me, server->mdin,
-                PTL_UNLINK, &server->mdin_h))) {
-                    PDEBUG ("PtlMDAttach (bulk)", rc);
-                CDEBUG (D_OTHER, "ping server resources allocated\n");
-       }
-        /* Success! */
-        return server; 
-} /* pingsrv_setup() */
-
-static int pingsrv_start(void)
-{
-        /* Setup our server */
-        if (!pingsrv_setup()) {
-                CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n");
-                return -ENOMEM;
-        }
-        kernel_thread (pingsrv_thread,NULL,0);
-        return 0;
-} /* pingsrv_start() */
-
-
-
-static int __init pingsrv_init(void)
-{
-        PORTAL_ALLOC (server, sizeof(struct pingsrv_data));  
-        return pingsrv_start ();
-} /* pingsrv_init() */
-
-
-static void /*__exit*/ pingsrv_cleanup(void)
-{
-        remove_proc_entry ("net/pingsrv", NULL);
-        
-        running = 0;
-        wake_up_process (server->tsk);
-        while (running != 1) {
-                set_current_state (TASK_UNINTERRUPTIBLE);
-                schedule_timeout (HZ);
-        }
-        
-} /* pingsrv_cleanup() */
-
-
-MODULE_PARM(nal, "i");
-MODULE_PARM_DESC(nal, "Use the specified NAL "
-                "(2-ksocknal, 1-kqswnal)");
-MODULE_AUTHOR("Brian Behlendorf (LLNL)");
-MODULE_DESCRIPTION("A kernel space ping server for portals testing");
-MODULE_LICENSE("GPL");
-
-module_init(pingsrv_init);
-module_exit(pingsrv_cleanup);
index be60509..2a30a01 100644 (file)
@@ -1,37 +1,10 @@
 #!/bin/sh
 
-SIMPLE=${SIMPLE:-0}
-
-if [ $SIMPLE -eq 0 ]; then
-       PING=pingcli.o
-else
-       PING=spingcli.o
-fi
+case `uname -r` in
+    2.6.*) ext=.ko;;
+    2.4.*) ext=.o;;
+    *)     echo unknown OS version; return 1;;
+esac
 
-case "$1" in
-       tcp)
-               /sbin/insmod  ../oslib/portals.o
-               /sbin/insmod ../socknal/ksocknal.o
-               /sbin/insmod ./$PING 
-               echo ksocknal > /tmp/nal
-       ;;
-       
-       elan)
-               /sbin/insmod  ../oslib/portals.o
-               /sbin/insmod ../qswnal/kqswnal.o
-               /sbin/insmod ./$PING
-               echo kqswnal > /tmp/nal
-       ;;
+insmod pingcli$ext
 
-       gm)
-               /sbin/insmod  portals
-               /sbin/insmod kgmnal
-               /sbin/insmod ./$PING
-               echo kgmnal > /tmp/nal
-       ;;
-       
-       *)
-               echo "Usage : ${0} < tcp | elan | gm>"
-               exit 1;
-esac
-exit 0;
index 9b5ccf6..355a8ae 100644 (file)
@@ -1,38 +1,9 @@
 #!/bin/sh
 
-SIMPLE=${SIMPLE:-0}
-
-if [ $SIMPLE -eq 0 ]; then
-       PING=pingsrv.o
-else
-       PING=spingsrv.o
-fi
-
-case "$1" in
-       tcp)
-               /sbin/insmod  ../oslib/portals.o
-               /sbin/insmod ../socknal/ksocknal.o
-               /sbin/insmod ./$PING nal=2
-               echo ksocknal > /tmp/nal
-       ;;
-       
-       elan)
-               /sbin/insmod  ../oslib/portals.o
-               /sbin/insmod ../qswnal/kqswnal.o
-               /sbin/insmod ./$PING nal=4
-               echo kqswnal > /tmp/nal
-       ;;
-
-       gm)
-               /sbin/insmod  portals
-               /sbin/insmod kgmnal
-               /sbin/insmod ./$PING nal=3
-               echo kgmnal > /tmp/nal
-       ;;
-       
-       *)
-               echo "Usage : ${0} < tcp | elan | gm>"
-               exit 1;
+case `uname -r` in
+    2.6.*) ext=.ko;;
+    2.4.*) ext=.o;;
+    *)     echo unknown OS version; return 1;;
 esac
-../utils/acceptor 9999&
-exit 0;
+
+insmod pingsrv$ext
index f7e3aa1..276d374 100644 (file)
@@ -1,14 +1,3 @@
 #!/bin/sh
 
-SIMPLE=${SIMPLE:-1}
-
-if [ $SIMPLE -eq 0 ]; then
-       PING=spingcli
-else
-       PING=pingcli
-fi
-
-rmmod $PING
-NAL=`cat /tmp/nal`;
-rmmod $NAL
-rmmod portals
+rmmod pingcli
index 3e81831..829afc6 100644 (file)
@@ -1,16 +1,3 @@
 #!/bin/sh
 
-SIMPLE=${SIMPLE:-1}
-
-if [ $SIMPLE -eq 0 ]; then
-       PING=spingsrv
-else
-       PING=pingsrv
-fi
-
-rmmod $PING
-NAL=`cat /tmp/nal`;
-rmmod $NAL
-killall -9 acceptor
-rm -f /var/run/acceptor-9999.pid
-rmmod portals
+rmmod pingsrv
diff --git a/lnet/tests/ut.README b/lnet/tests/ut.README
new file mode 100644 (file)
index 0000000..ef70b2f
--- /dev/null
@@ -0,0 +1,43 @@
+The utcli (unit test client) and utsrv (unit test server) are very simple
+unit test tools, for sending and receiving single gets/puts of a specific
+size, using the LNET API set.
+
+Test Setup
+uml1 ip=192.168.2.1
+uml2 ip=192.168.2.2
+
+--------------------------------------------------------------------------------
+Example Test #1 - small get operation
+
+1) Setup server for listening
+uml2 $ insmod utsvr.ko   
+
+2) Do the get operation. The NID must be specified, but all other parameters
+have defaults, which results in a 300 byte get op
+uml1 $ insmod utcli.ko nid=192.168.2.2@tcp
+
+3) Unload the utsvr, because currently it only supports a single operation;
+buffers are not reposted after they are consumed
+*** FIX THIS LIMITATION ***
+uml2 $ rmmod utsvr
+
+--------------------------------------------------------------------------------
+Example Test #2 - small put operation
+(The setup and cleanup of the server are left out, because they are the
+same as above)
+
+1) The addition of the "put=1" parameter causes a put rather than a get.  The
+default size of 300 is still used.
+uml1 $ insmod utcli.ko nid=192.168.2.2@tcp put=1
+
+--------------------------------------------------------------------------------
+Example Test #3 - large get operation
+
+1) Setup server for listening.  The size must be specified on the server or else
+the default of 300 bytes will be used.
+uml2 $ insmod utsvr.ko  pkt_size=5000 
+
+2) Do the large get operation with pkt_size=5000.  put=0 is a get operation;
+it is equivalent to just not having that parameter.
+uml1 $ insmod utcli.ko nid=192.168.2.2@tcp put=0 pkt_size=5000
+
diff --git a/lnet/tests/ut.h b/lnet/tests/ut.h
new file mode 100644 (file)
index 0000000..96ccb34
--- /dev/null
@@ -0,0 +1,45 @@
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include <libcfs/kp30.h>
+#include <lnet/lnet.h>
+
+#define UT_PORTAL       42
+
+#define PJK_UT_MSG(fmt...) do{printk("<1>" UT_MSG_MODULE_NAME ":%-30s:",__FUNCTION__);printk(fmt);}while(0)
+
+#define DO_TYPE(x) case x: return #x;
+
+const char *get_ev_type_string(int evtype)
+{
+        switch(evtype)
+        {
+                DO_TYPE(LNET_EVENT_GET);
+                DO_TYPE(LNET_EVENT_PUT);
+                DO_TYPE(LNET_EVENT_REPLY);
+                DO_TYPE(LNET_EVENT_ACK);
+                DO_TYPE(LNET_EVENT_SEND);
+                DO_TYPE(LNET_EVENT_UNLINK);
+        default:
+                return "";
+        }
+}
+
+static volatile int seen = 0;
+static volatile int seen_unlink = 0;
+
+static inline void handler(lnet_event_t *ev)
+{
+        PJK_UT_MSG("-------- EVENT START ------------\n");
+        PJK_UT_MSG("type=%d %s\n",ev->type,get_ev_type_string(ev->type));
+        PJK_UT_MSG("portal=%d\n",ev->pt_index);
+        PJK_UT_MSG("matchbits="LPX64"\n",ev->match_bits);
+        PJK_UT_MSG("request length=%d\n",ev->rlength);
+        PJK_UT_MSG("manipulated length=%d\n",ev->mlength);
+        PJK_UT_MSG("offset=%d\n",ev->offset);
+        PJK_UT_MSG("status=%d\n",ev->status);
+        PJK_UT_MSG("unlinked=%d\n",ev->unlinked);
+        PJK_UT_MSG("md.user_ptr=%p\n",ev->md.user_ptr);
+        PJK_UT_MSG("-------- EVENT END --------------\n");
+        ++seen;
+        if(ev->unlinked)++seen_unlink;
+}
diff --git a/lnet/tests/ut_cli.c b/lnet/tests/ut_cli.c
new file mode 100644 (file)
index 0000000..3a6e255
--- /dev/null
@@ -0,0 +1,211 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+
+ #define UT_MSG_MODULE_NAME "utcli "
+ #include "ut.h"
+
+int pkt_size = 300;
+module_param(pkt_size,int,S_IRUGO);
+int get=0;
+module_param(get,int,S_IRUGO);
+int put=0;
+module_param(put,int,S_IRUGO);
+int auto_unlink=1;
+module_param(auto_unlink,int,S_IRUGO);
+char* nid=0;
+module_param(nid,charp,S_IRUGO);
+
+static int __init utcli_init(void)
+{
+        lnet_handle_md_t        mdh;
+        lnet_process_id_t       target;
+        lnet_process_id_t       mypid;
+        lnet_handle_eq_t        eqh;
+        lnet_md_t               md;
+        int                     rc,i;
+        char* buffer            = 0;
+        /*
+         * Put and get really control the same thing
+         */
+        if(put)get=0;
+        /* Default to get */
+        if(!put && !get)get=1;
+
+        PJK_UT_MSG("utcli_init %s\n",get==0?"PUT":"GET");
+        PJK_UT_MSG("pkt_size=%d\n",pkt_size);
+        PJK_UT_MSG("auto_unlink=%d\n",auto_unlink);
+        PJK_UT_MSG("nid=%s\n",nid);
+        if(nid == 0)
+        {
+                CERROR("NID Must be specified\n");
+                return -EINVAL;
+        }
+
+        PJK_UT_MSG("LIBCFS_ALLOC\n");
+        LIBCFS_ALLOC (buffer, pkt_size);
+        if (buffer == NULL)
+        {
+                CERROR ("Unable to allocate out_buf (%d bytes)\n", pkt_size);
+                return -ENOMEM;
+        }
+
+        PJK_UT_MSG("LNetNiInit()\n");
+        rc = LNetNIInit(0);
+        if (rc < 0)
+        {
+                CERROR ("LNetNIInit: error %d\n", rc);
+                goto exit0;
+        }
+
+
+        LNetGetId(0,&mypid);
+        PJK_UT_MSG("my.nid="LPX64"\n",mypid.nid);
+        PJK_UT_MSG("my.pid=0x%x\n",mypid.pid);
+
+
+        PJK_UT_MSG("LNetEQAlloc\n");
+        rc = LNetEQAlloc(
+                64,      /* max number of envents why 64? */
+                handler, /* handler callback */
+                &eqh);   /* output handle */
+        if(rc != 0) {
+                CERROR("LNetEQAlloc failed %d\n",rc);
+                goto exit1;
+        }
+
+        md.start = buffer;
+        md.length = pkt_size;
+        md.threshold = auto_unlink ? (get ? 2 : 1) : 15;
+        md.max_size = 0;
+        md.options = 0;
+        if(get){
+                md.options |= LNET_MD_OP_GET;
+        }else{
+                md.options |= LNET_MD_OP_PUT;
+                md.options |= LNET_MD_ACK_DISABLE;
+        }
+        md.user_ptr = 0;
+        md.eq_handle = eqh;
+
+        PJK_UT_MSG("LNetMDBind()\n");
+        if ((rc=LNetMDBind (
+                     md,
+                     LNET_UNLINK,
+                     &mdh)))               /* out handle */
+        {
+                CERROR ("LNetMDBind error %d\n", rc);
+                goto exit4;
+        }
+
+        target.pid = 0;
+        target.nid = libcfs_str2nid(nid);
+
+        PJK_UT_MSG("target.nid="LPX64"\n",target.nid);
+
+        for(i=0;i<1;i++)
+        {
+                if(get){
+                        PJK_UT_MSG("LNetGet()\n");
+                        if((rc = LNetGet (
+                                    LNET_ID_ANY,
+                                    mdh,
+                                    target,       /* peer "address" */
+                                    UT_PORTAL,    /* portal */
+                                    i,            /* match bits */
+                                    0)))          /* header data */
+                        {
+                                CERROR("LNetGet %d error %d\n",i, rc);
+                                goto exit5;
+                        }
+                }else{
+
+                        PJK_UT_MSG("LNetPut()\n");
+                        if((rc = LNetPut (
+                                    LNET_ID_ANY,
+                                    mdh,
+                                    LNET_ACK_REQ, /* we want ack */
+                                    target,       /* peer "address" */
+                                    UT_PORTAL,    /* portal */
+                                    i,            /* match bits */
+                                    0,            /* offset */
+                                    0)))          /* header data */
+                        {
+                                CERROR("LNetPut %d error %d\n",i, rc);
+                                goto exit5;
+                        }
+                }
+        }
+
+
+        PJK_UT_MSG("------------Waiting for SEND_END()------------\n");
+        i=0;
+        while(i++ < 10 && seen == 0)
+                cfs_pause(cfs_time_seconds(1));
+        if(seen == 0)
+                PJK_UT_MSG("------------------TIMEDOUT--------------------\n");
+        else{
+                int good;
+                if(get){
+                        PJK_UT_MSG("------------Waiting for REPLY()------------\n");
+                        i=0;
+                        while(i++ < 10 && seen == 1)
+                                cfs_pause(cfs_time_seconds(1));
+                        good = (seen != 1);
+                }else{
+                        good = 1;
+                }
+
+                if(good)
+                        PJK_UT_MSG("------------------COMPLETE--------------------\n");
+                else
+                        PJK_UT_MSG("------------------TIMEDOUT--------------------\n");
+        }
+
+
+
+        /*
+        PJK_UT_MSG("LNetEQWait()\n");
+        rc = LNetEQWait(eqh,&ev);
+        if(rc != 0)
+                goto exit5;
+        */
+
+exit5:
+        PJK_UT_MSG("LNetMDUnlink()\n");
+        LNetMDUnlink(mdh);
+
+        if(!seen_unlink){
+                PJK_UT_MSG("------------Waiting for UNLINK ------------\n");
+                i=0;
+                while(i++ < 120 && seen_unlink == 0)
+                        cfs_pause(cfs_time_seconds(1));
+        }
+
+        cfs_pause(cfs_time_seconds(1));
+exit4:
+        PJK_UT_MSG("LNetEQFree()\n");
+        LNetEQFree(eqh);
+exit1:
+        PJK_UT_MSG("LNetNiFini()\n");
+        LNetNIFini();
+exit0:
+        if(buffer)
+                LIBCFS_FREE(buffer,pkt_size);
+
+        return -1;
+} /* utcli_init() */
+
+
+static void /*__exit*/ utcli_cleanup(void)
+{
+        PJK_UT_MSG(">>>\n");
+        PJK_UT_MSG("<<<\n");
+} /* utcli_cleanup() */
+
+
+MODULE_AUTHOR("PJ Kirner (CFS)");
+MODULE_DESCRIPTION("A simple LNET Unit Test module");
+MODULE_LICENSE("GPL");
+
+cfs_module(ut_cli, "1.0.0", utcli_init, utcli_cleanup);
diff --git a/lnet/tests/ut_srv.c b/lnet/tests/ut_srv.c
new file mode 100644 (file)
index 0000000..3ffbac6
--- /dev/null
@@ -0,0 +1,144 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+
+
+#define UT_MSG_MODULE_NAME "utsrv "
+#include "ut.h"
+
+
+int pkt_size = 300;
+module_param(pkt_size,int,S_IRUGO);
+int auto_unlink=1;
+module_param(auto_unlink,int,S_IRUGO);
+
+char                   *buffer = 0;
+lnet_handle_eq_t        eqh;
+lnet_handle_me_t        meh;
+lnet_handle_md_t        mdh;
+
+static int __init utsrv_init(void)
+{
+        int                     rc;
+        lnet_process_id_t       anypid;
+        lnet_process_id_t       mypid;
+        lnet_md_t               md;
+
+        PJK_UT_MSG(">>>\n");
+        PJK_UT_MSG("pkt_size=%d\n",pkt_size);
+        PJK_UT_MSG("auto_unlink=%d\n",auto_unlink);
+
+        PJK_UT_MSG("LIBCFS_ALLOC\n");
+        LIBCFS_ALLOC (buffer, pkt_size);
+        if (buffer == NULL)
+        {
+                CERROR ("Unable to allocate out_buf (%d bytes)\n", pkt_size);
+                rc = -ENOMEM;
+                goto exit0;
+        }
+
+        PJK_UT_MSG("LNetNiInit()\n");
+        rc = LNetNIInit(0);
+        if (rc < 0)
+        {
+                CERROR ("LNetNIInit: error %d\n", rc);
+                goto exit1;
+        }
+
+        LNetGetId(0,&mypid);
+        PJK_UT_MSG("my.nid="LPX64"\n",mypid.nid);
+        PJK_UT_MSG("my.pid=0x%x\n",mypid.pid);
+
+        PJK_UT_MSG("LNetEQAlloc\n");
+        rc = LNetEQAlloc(
+                64,      /* max number of envents why 64? */
+                handler, /* handler callback */
+                &eqh);   /* output handle */
+        if(rc != 0) {
+                CERROR("LNetEQAlloc failed %d\n",rc);
+                goto exit2;
+        }
+
+        anypid.nid = LNET_NID_ANY;
+        anypid.pid = LNET_PID_ANY;
+
+
+        PJK_UT_MSG("LNetMEAttach\n");
+        rc = LNetMEAttach(
+                UT_PORTAL,    /* ptl index*/
+                anypid,       /* pid - in this case allow any*/
+                0,            /*matchbits*/
+                0x0FFFF,      /*ignorebits - ignore botton 16-bits*/
+                LNET_UNLINK,  /* unlik vs LNET_RETAIN*/
+                LNET_INS_BEFORE,
+                &meh);
+        if(rc != 0) {
+                CERROR("LNetMeAttach failed %d\n",rc);
+                goto exit3;
+        }
+
+        md.start = buffer;
+        md.length = pkt_size;
+        md.threshold = auto_unlink ? 1 : 100;
+        md.max_size = 0;
+        md.options = 0;
+        md.options |= LNET_MD_OP_GET;
+        md.options |= LNET_MD_OP_PUT;
+        md.options |= LNET_MD_ACK_DISABLE;
+        md.user_ptr= 0;
+        md.eq_handle = eqh;
+
+        PJK_UT_MSG("LNetMDAttach\n");
+        rc = LNetMDAttach(
+                meh,
+                md,
+                LNET_UNLINK,
+                &mdh);
+        if(rc != 0){
+                CERROR("LNetMDAttach failed %d\n",rc);
+                goto exit4;
+        }
+
+        rc = 0;
+        goto exit0;
+
+exit4:
+        PJK_UT_MSG("LNetMEUnlink()\n");
+        LNetMEUnlink(meh);
+exit3:
+        PJK_UT_MSG("LNetEQFree()\n");
+        LNetEQFree(eqh);
+exit2:
+        PJK_UT_MSG("LNetNiFini()\n");
+        LNetNIFini();
+exit1:
+        LIBCFS_FREE(buffer,pkt_size);
+exit0:
+        PJK_UT_MSG("<<< rc=%d\n",rc);
+        return rc;
+
+} /* utsrv_init() */
+
+
+static void /*__exit*/ utsrv_cleanup(void)
+{
+        PJK_UT_MSG(">>>\n");
+        PJK_UT_MSG("LNetMDUnlink()\n");
+        LNetMDUnlink(mdh);
+        PJK_UT_MSG("LNetMEUnlink()\n");
+        LNetMEUnlink(meh);
+        PJK_UT_MSG("LNetEQFree()\n");
+        LNetEQFree(eqh);
+        PJK_UT_MSG("LNetNiFini()\n");
+        LNetNIFini();
+        LIBCFS_FREE(buffer,pkt_size);
+        PJK_UT_MSG("<<<\n");
+} /* utsrv_cleanup() */
+
+
+MODULE_AUTHOR("PJ Kirner (CFS)");
+MODULE_DESCRIPTION("A simple LNET Unit Test module");
+MODULE_LICENSE("GPL");
+
+cfs_module(utsvr, "1.0.0", utsrv_init, utsrv_cleanup);
+
index e995588..2711a44 100644 (file)
@@ -1,3 +1,4 @@
 .deps
 Makefile
-Makefile.in
+autoMakefile
+autoMakefile.in
diff --git a/lnet/ulnds/Makefile.am b/lnet/ulnds/Makefile.am
deleted file mode 100644 (file)
index 3437d39..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-if LIBLUSTRE
-if !CRAY_PORTALS
-noinst_LIBRARIES = libtcpnal.a
-endif
-endif
-
-noinst_HEADERS =  pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
-libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
-libtcpnal_a_CPPFLAGS = $(LLCPPFLAGS)
-libtcpnal_a_CFLAGS = $(LLCFLAGS)
diff --git a/lnet/ulnds/Makefile.in b/lnet/ulnds/Makefile.in
new file mode 100644 (file)
index 0000000..78432ee
--- /dev/null
@@ -0,0 +1,5 @@
+@BUILD_USOCKLND_TRUE@subdir-m += socklnd
+@BUILD_UPTLLND_TRUE@subdir-m += ptllnd
+
+@INCLUDE_RULES@
+
diff --git a/lnet/ulnds/README b/lnet/ulnds/README
deleted file mode 100644 (file)
index 6cb93d9..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-This library implements two NAL interfaces, both running over IP.
-The first, tcpnal, creates TCP connections between participating
-processes in order to transport the portals requests. The second,
-ernal, provides a simple transport protocol which runs over
-UDP datagrams.
-
-The interface functions return both of these values in host order for
-convenience and readability. However this means that addresses
-exchanged in messages between hosts of different orderings will not
-function properly.
-
-Both NALs use the same support functions in order to schedule events
-and communicate with the generic portals implementation.
-
-            -------------------------
-            |         api           |
-            |_______________________|
-            |         lib           |
-            |_______________________|
-            | ernal  |   |tcpnal    |
-            |--------|   |----------|
-            | udpsock|   |connection|
-            |-----------------------|
-            |     timer/select      |
-            -------------------------
-
-
-  These NALs uses the framework from fdnal of a pipe between the api
-and library sides. This is wrapped up in the select on the library
-side, and blocks on the api side. Performance could be severely
-enhanced by collapsing this aritificial barrier, by using shared
-memory queues, or by wiring the api layer directly to the library.
-
-
-nid is defined as the low order 24-bits of the IP address of the
-physical node left shifted by 8 plus a virtual node number of 0
-through 255 (really only 239).  The virtual node number of a tcpnal
-application should be specified using the environment variable
-PTL_VIRTNODE.  pid is now a completely arbitrary number in the
-range of 0 to 255.  The IP interface used can be overridden by
-specifying the appropriate hostid by setting the PTL_HOSTID
-environment variable.  The value can be either dotted decimal
-(n.n.n.n) or hex starting with "0x".
-TCPNAL:
-  As the NAL needs to try to send to a particular nid/pid pair, it
-  will open up connections on demand. Because the port associated with
-  the connecting socket is different from the bound port, two
-  connections will normally be established between a pair of peers, with
-  data flowing from the anonymous connect (active) port to the advertised
-  or well-known bound (passive) port of each peer.
-
-  Should the connection fail to open, an error is reported to the
-  library component, which causes the api request to fail.
diff --git a/lnet/ulnds/address.c b/lnet/ulnds/address.c
deleted file mode 100644 (file)
index 07b4249..0000000
+++ /dev/null
@@ -1,147 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/* address.c:
- * this file provides functions to aquire the IP address of the node
- * and translate them into a NID/PID pair which supports a static
- * mapping of virtual nodes into the port range of an IP socket.
-*/
-
-#define DEBUG_SUBSYSTEM S_NAL
-
-#include <stdlib.h>
-#include <netdb.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <portals/p30.h>
-#include <bridge.h>
-#include <ipmap.h>
-
-
-/* Function:  get_node_id
- * Returns: a 32 bit id for this node, actually a big-endian IP address
- *
- * get_node_id() determines the host name and uses the resolver to
- *  find out its ip address. This is fairly fragile and inflexible, but
- *  explicitly asking about interfaces and their addresses is very
- *  complicated and nonportable.
- */
-static unsigned int get_node_id(void)
-{
-    char buffer[255];
-    unsigned int x;
-    struct hostent *he;
-    char * host_envp;
-
-    if (!(host_envp = getenv("PTL_HOSTID")))
-        {
-            gethostname(buffer,sizeof(buffer));
-            he=gethostbyname(buffer);
-            if (he)
-                    x=*(unsigned int *)he->h_addr_list[0];
-            else
-                    x = 0;
-            return(ntohl(x));
-        }
-    else
-        {
-            if (host_envp[1] != 'x')
-                {
-                    int a, b, c, d;
-                    sscanf(host_envp, "%d.%d.%d.%d", &a, &b, &c, &d);
-                    return ((a<<24) | (b<<16) | (c<<8) | d);
-                }
-            else
-                {
-                    long long hostid = strtoll(host_envp, 0, 0);
-                    return((unsigned int) hostid);
-                }
-        }
-}
-
-
-/* Function:  set_address
- * Arugments: t: a procnal structure to populate with the request
- *
- * set_address performs the bit manipulations to set the nid, pid, and
- *    iptop8 fields of the procnal structures.
- *
- * TODO: fix pidrequest to try to do dynamic binding if PTL_ID_ANY
- */
-
-#ifdef DIRECT_IP_MODE
-void set_address(bridge t,ptl_pid_t pidrequest)
-{
-    int port;
-    if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0;
-    else port=pidrequest;
-    t->lib_nal->libnal_ni.ni_pid.nid=get_node_id();
-    t->lib_nal->libnal_ni.ni_pid.pid=port;
-}
-#else
-
-void set_address(bridge t,ptl_pid_t pidrequest)
-{
-    int virtnode, in_addr, port;
-    ptl_pid_t pid;
-
-    /* get and remember my node id*/
-    if (!getenv("PTL_VIRTNODE"))
-        virtnode = 0;
-    else
-        {
-            int maxvnode = PNAL_VNODE_MASK - (PNAL_BASE_PORT
-                                              >> PNAL_VNODE_SHIFT);
-            virtnode = atoi(getenv("PTL_VIRTNODE"));
-            if (virtnode > maxvnode)
-                {
-                    fprintf(stderr, "PTL_VIRTNODE of %d is too large - max %d\n",
-                            virtnode, maxvnode);
-                    return;
-                }
-        }
-
-    in_addr = get_node_id();
-
-    t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */
-    t->lib_nal->libnal_ni.ni_pid.nid = ((in_addr & PNAL_HOSTID_MASK)
-                                        << PNAL_VNODE_SHIFT)
-                                       + virtnode;
-    pid=pidrequest;
-    /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */
-#ifdef notyet
-    if (pid==(unsigned short)PTL_PID_ANY) port = 0;
-#endif
-    if (pid==(unsigned short)PTL_PID_ANY)
-        {
-            fprintf(stderr, "portal pid PTL_ID_ANY is not currently supported\n");
-            return;
-        }
-    else if (pid > PNAL_PID_MASK)
-        {
-            fprintf(stderr, "portal pid of %d is too large - max %d\n",
-                    pid, PNAL_PID_MASK);
-            return;
-        }
-    else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT;
-    t->lib_nal->libnal_ni.ni_pid.pid=pid;
-}
-#endif
diff --git a/lnet/ulnds/autoMakefile.am b/lnet/ulnds/autoMakefile.am
new file mode 100644 (file)
index 0000000..0e7fa4c
--- /dev/null
@@ -0,0 +1,6 @@
+# Copyright (C) 2001  Cluster File Systems, Inc.
+#
+# This code is issued under the GNU General Public License.
+# See the file COPYING in this distribution
+
+SUBDIRS = socklnd ptllnd
diff --git a/lnet/ulnds/bridge.h b/lnet/ulnds/bridge.h
deleted file mode 100644 (file)
index d2f0f2c..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *
- *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- */
-
-#ifndef TCPNAL_PROCBRIDGE_H
-#define TCPNAL_PROCBRIDGE_H
-
-#include <portals/lib-p30.h>
-#include <portals/nal.h>
-
-#define PTL_IFACE_TCP 1
-#define PTL_IFACE_ER 2
-#define PTL_IFACE_SS 3
-#define PTL_IFACE_MAX 4
-
-typedef struct bridge {
-    int alive;
-    lib_nal_t *lib_nal;
-    void *lower;
-    void *local;
-    void (*shutdown)(struct bridge *);
-    /* this doesn't really belong here */
-    unsigned char iptop8;
-} *bridge;
-
-
-typedef int (*nal_initialize)(bridge);
-extern nal_initialize nal_table[PTL_IFACE_MAX];
-
-#endif
diff --git a/lnet/ulnds/connection.c b/lnet/ulnds/connection.c
deleted file mode 100644 (file)
index 49cca96..0000000
+++ /dev/null
@@ -1,507 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/* connection.c:
-   This file provides a simple stateful connection manager which
-   builds tcp connections on demand and leaves them open for
-   future use. It also provides the machinery to allow peers
-   to connect to it
-*/
-
-#include <stdlib.h>
-#include <pqtimer.h>
-#include <dispatch.h>
-#include <table.h>
-#include <stdio.h>
-#include <stdarg.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <netinet/tcp.h>
-#include <portals/types.h>
-#include <portals/lib-types.h>
-#include <portals/socknal.h>
-#include <libcfs/kp30.h>
-#include <connection.h>
-#include <pthread.h>
-#include <errno.h>
-#ifndef __CYGWIN__
-#include <syscall.h>
-#endif
-
-/* global variable: acceptor port */
-unsigned short tcpnal_acceptor_port = 988;
-
-
-/* Function:  compare_connection
- * Arguments: connection c:      a connection in the hash table
- *            ptl_process_id_t:  an id to verify  agains
- * Returns: 1 if the connection is the one requested, 0 otherwise
- *
- *    compare_connection() tests for collisions in the hash table
- */
-static int compare_connection(void *arg1, void *arg2)
-{
-    connection c = arg1;
-    unsigned int * id = arg2;
-#if 0
-    return((c->ip==id[0]) && (c->port==id[1]));
-#else
-    /* CFS specific hacking */
-    return (c->ip == id[0]);
-#endif
-}
-
-
-/* Function:  connection_key
- * Arguments: ptl_process_id_t id:  an id to hash
- * Returns: a not-particularily-well-distributed hash
- *          of the id
- */
-static unsigned int connection_key(unsigned int *id)
-{
-#if 0
-    return(id[0]^id[1]);
-#else
-    /* CFS specific hacking */
-    return (unsigned int) id[0];
-#endif
-}
-
-
-/* Function:  remove_connection
- * Arguments: c: the connection to remove
- */
-void remove_connection(void *arg)
-{
-        connection c = arg;
-        unsigned int id[2];
-        
-        id[0]=c->ip;
-        id[1]=c->port;
-        hash_table_remove(c->m->connections,id);
-        close(c->fd);
-        free(c);
-}
-
-
-/* Function:  read_connection: 
- * Arguments: c:    the connection to read from 
- *            dest: the buffer to read into
- *            len:  the number of bytes to read   
- * Returns: success as 1, or failure as 0
- *
- *   read_connection() reads data from the connection, continuing
- *   to read partial results until the request is satisfied or
- *   it errors. TODO: this read should be covered by signal protection.
- */
-int read_connection(connection c,
-                    unsigned char *dest,
-                    int len)
-{
-    int offset = 0,rc;
-
-    if (len) {
-        do {
-#ifndef __CYGWIN__
-            rc = syscall(SYS_read, c->fd, dest+offset, len-offset);
-#else
-            rc = recv(c->fd, dest+offset, len-offset, 0);
-#endif
-            if (rc <= 0) {
-                if (errno == EINTR) {
-                    rc = 0;
-                } else {
-                    remove_connection(c);
-                    return (0);
-                }
-            }
-            offset += rc;
-        } while (offset < len);
-    }
-    return (1);
-}
-
-static int connection_input(void *d)
-{
-        connection c = d;
-        return((*c->m->handler)(c->m->handler_arg,c));
-}
-
-
-/* Function:  allocate_connection
- * Arguments: t:    tcpnal the allocation is occuring in the context of
- *            dest: portal endpoint address for this connection
- *            fd:   open file descriptor for the socket
- * Returns: an allocated connection structure
- *
- * just encompasses the action common to active and passive
- *  connections of allocation and placement in the global table
- */
-static connection allocate_connection(manager m,
-                               unsigned int ip,
-                               unsigned short port,
-                               int fd)
-{
-    connection c=malloc(sizeof(struct connection));
-    unsigned int id[2];
-    c->m=m;
-    c->fd=fd;
-    c->ip=ip;
-    c->port=port;
-    id[0]=ip;
-    id[1]=port;
-    register_io_handler(fd,READ_HANDLER,connection_input,c);
-    hash_table_insert(m->connections,c,id);
-    return(c);
-}
-
-
-/* Function:  new_connection
- * Arguments: t: opaque argument holding the tcpname
- * Returns: 1 in order to reregister for new connection requests
- *
- *  called when the bound service socket recieves
- *     a new connection request, it always accepts and
- *     installs a new connection
- */
-static int new_connection(void *z)
-{
-    manager m=z;
-    struct sockaddr_in s;
-    int len=sizeof(struct sockaddr_in);
-    int fd=accept(m->bound,(struct sockaddr *)&s,&len);
-    unsigned int nid=*((unsigned int *)&s.sin_addr);
-    /* cfs specific hack */
-    //unsigned short pid=s.sin_port;
-    pthread_mutex_lock(&m->conn_lock);
-    allocate_connection(m,htonl(nid),0/*pid*/,fd);
-    pthread_mutex_unlock(&m->conn_lock);
-    return(1);
-}
-
-extern ptl_nid_t tcpnal_mynid;
-
-int
-tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation)
-{
-        int                 rc;
-        int                 nob;
-        ptl_hdr_t           hdr;
-        ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid;
-
-        LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
-
-        memset (&hdr, 0, sizeof (hdr));
-        hmv->magic         = cpu_to_le32(PORTALS_PROTO_MAGIC);
-        hmv->version_major = cpu_to_le32(PORTALS_PROTO_VERSION_MAJOR);
-        hmv->version_minor = cpu_to_le32(PORTALS_PROTO_VERSION_MINOR);
-        
-        hdr.src_nid = cpu_to_le64(tcpnal_mynid);
-        hdr.type    = cpu_to_le32(PTL_MSG_HELLO);
-
-        hdr.msg.hello.type = cpu_to_le32(type);
-        hdr.msg.hello.incarnation = cpu_to_le64(incarnation);
-
-        /* I don't send any interface info */
-
-        /* Assume sufficient socket buffering for this message */
-        rc = syscall(SYS_write, sockfd, &hdr, sizeof(hdr));
-        if (rc <= 0) {
-                CERROR ("Error %d sending HELLO to "LPX64"\n", rc, *nid);
-                return (rc);
-        }
-
-        rc = syscall(SYS_read, sockfd, hmv, sizeof(*hmv));
-        if (rc <= 0) {
-                CERROR ("Error %d reading HELLO from "LPX64"\n", rc, *nid);
-                return (rc);
-        }
-        
-        if (hmv->magic != le32_to_cpu(PORTALS_PROTO_MAGIC)) {
-                CERROR ("Bad magic %#08x (%#08x expected) from "LPX64"\n",
-                        cpu_to_le32(hmv->magic), PORTALS_PROTO_MAGIC, *nid);
-                return (-EPROTO);
-        }
-
-        if (hmv->version_major != cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) ||
-            hmv->version_minor != cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) {
-                CERROR ("Incompatible protocol version %d.%d (%d.%d expected)"
-                        " from "LPX64"\n",
-                        le16_to_cpu (hmv->version_major),
-                        le16_to_cpu (hmv->version_minor),
-                        PORTALS_PROTO_VERSION_MAJOR,
-                        PORTALS_PROTO_VERSION_MINOR,
-                        *nid);
-                return (-EPROTO);
-        }
-
-#if (PORTALS_PROTO_VERSION_MAJOR != 1)
-# error "This code only understands protocol version 1.x"
-#endif
-        /* version 1 sends magic/version as the dest_nid of a 'hello' header,
-         * so read the rest of it in now... */
-
-        rc = syscall(SYS_read, sockfd, hmv + 1, sizeof(hdr) - sizeof(*hmv));
-        if (rc <= 0) {
-                CERROR ("Error %d reading rest of HELLO hdr from "LPX64"\n",
-                        rc, *nid);
-                return (rc);
-        }
-
-        /* ...and check we got what we expected */
-        if (hdr.type != cpu_to_le32 (PTL_MSG_HELLO)) {
-                CERROR ("Expecting a HELLO hdr "
-                        " but got type %d with %d payload from "LPX64"\n",
-                        le32_to_cpu (hdr.type),
-                        le32_to_cpu (hdr.payload_length), *nid);
-                return (-EPROTO);
-        }
-
-        if (le64_to_cpu(hdr.src_nid) == PTL_NID_ANY) {
-                CERROR("Expecting a HELLO hdr with a NID, but got PTL_NID_ANY\n");
-                return (-EPROTO);
-        }
-
-        if (*nid == PTL_NID_ANY) {              /* don't know peer's nid yet */
-                *nid = le64_to_cpu(hdr.src_nid);
-        } else if (*nid != le64_to_cpu (hdr.src_nid)) {
-                CERROR ("Connected to nid "LPX64", but expecting "LPX64"\n",
-                        le64_to_cpu (hdr.src_nid), *nid);
-                return (-EPROTO);
-        }
-
-        /* Ignore any interface info in the payload */
-        nob = le32_to_cpu(hdr.payload_length);
-        if (nob > getpagesize()) {
-                CERROR("Unexpected HELLO payload %d from "LPX64"\n",
-                       nob, *nid);
-                return (-EPROTO);
-        }
-        if (nob > 0) {
-                char *space = (char *)malloc(nob);
-                
-                if (space == NULL) {
-                        CERROR("Can't allocate scratch buffer %d\n", nob);
-                        return (-ENOMEM);
-                }
-                
-                rc = syscall(SYS_read, sockfd, space, nob);
-                if (rc <= 0) {
-                        CERROR("Error %d skipping HELLO payload from "
-                               LPX64"\n", rc, *nid);
-                        return (rc);
-                }
-        }
-
-        return (0);
-}
-
-/* Function:  force_tcp_connection
- * Arguments: t: tcpnal
- *            dest: portals endpoint for the connection
- * Returns: an allocated connection structure, either
- *          a pre-existing one, or a new connection
- */
-connection force_tcp_connection(manager m,
-                                unsigned int ip,
-                                unsigned short port,
-                                procbridge pb)
-{
-    connection conn;
-    struct sockaddr_in addr;
-    struct sockaddr_in locaddr; 
-    unsigned int id[2];
-    struct timeval tv;
-    __u64 incarnation;
-
-    int fd;
-    int option;
-    int rc;
-    int rport;
-    ptl_nid_t peernid = PTL_NID_ANY;
-
-    port = tcpnal_acceptor_port;
-
-    id[0] = ip;
-    id[1] = port;
-
-    pthread_mutex_lock(&m->conn_lock);
-
-    conn = hash_table_find(m->connections, id);
-    if (conn)
-            goto out;
-
-    memset(&addr, 0, sizeof(addr));
-    addr.sin_family      = AF_INET;
-    addr.sin_addr.s_addr = htonl(ip);
-    addr.sin_port        = htons(port);
-
-    memset(&locaddr, 0, sizeof(locaddr)); 
-    locaddr.sin_family = AF_INET; 
-    locaddr.sin_addr.s_addr = INADDR_ANY;
-
-    for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) {
-            fd = socket(AF_INET, SOCK_STREAM, 0);
-            if (fd < 0) {
-                    perror("tcpnal socket failed");
-                    goto out;
-            } 
-            
-            option = 1;
-            rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 
-                            &option, sizeof(option));
-            if (rc != 0) {
-                    perror ("Can't set SO_REUSEADDR for socket"); 
-                    close(fd);
-                    goto out;
-            } 
-
-            locaddr.sin_port = htons(rport);
-            rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr));
-            if (rc == 0 || errno == EACCES) {
-                    rc = connect(fd, (struct sockaddr *)&addr,
-                                 sizeof(struct sockaddr_in));
-                    if (rc == 0) {
-                            break;
-                    } else if (errno != EADDRINUSE && errno != EADDRNOTAVAIL) {
-                            perror("Error connecting to remote host");
-                            close(fd);
-                            goto out;
-                    }
-            } else if (errno != EADDRINUSE) {
-                    perror("Error binding to privileged port");
-                    close(fd);
-                    goto out;
-            }
-            close(fd);
-    }
-    
-    if (rport == IPPORT_RESERVED / 2) {
-            fprintf(stderr, "Out of ports trying to bind to a reserved port\n");
-            goto out;
-    }
-    
-#if 1
-    option = 1;
-    setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option));
-    option = 1<<20;
-    setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option));
-    option = 1<<20;
-    setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
-#endif
-   
-    gettimeofday(&tv, NULL);
-    incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
-
-    /* say hello */
-    if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation))
-            exit(-1);
-    
-    conn = allocate_connection(m, ip, port, fd);
-    
-    /* let nal thread know this event right away */
-    if (conn)
-            procbridge_wakeup_nal(pb);
-
-out:
-    pthread_mutex_unlock(&m->conn_lock);
-    return (conn);
-}
-
-
-/* Function:  bind_socket
- * Arguments: t: the nal state for this interface
- *            port: the port to attempt to bind to
- * Returns: 1 on success, or 0 on error
- *
- * bind_socket() attempts to allocate and bind a socket to the requested
- *  port, or dynamically assign one from the kernel should the port be
- *  zero. Sets the bound and bound_handler elements of m.
- *
- *  TODO: The port should be an explicitly sized type.
- */
-static int bind_socket(manager m,unsigned short port)
-{
-    struct sockaddr_in addr;
-    int alen=sizeof(struct sockaddr_in);
-    
-    if ((m->bound = socket(AF_INET, SOCK_STREAM, 0)) < 0)  
-        return(0);
-    
-    bzero((char *) &addr, sizeof(addr));
-    addr.sin_family      = AF_INET;
-    addr.sin_addr.s_addr = 0;
-    addr.sin_port        = htons(port);
-
-    if (bind(m->bound,(struct sockaddr *)&addr,alen)<0){
-        fprintf(stderr, "tcpnal bind: %s port %u\n", strerror(errno), port); 
-        return(0);
-    }
-    
-    getsockname(m->bound,(struct sockaddr *)&addr, &alen);
-
-    m->bound_handler=register_io_handler(m->bound,READ_HANDLER,
-                                         new_connection,m);
-    listen(m->bound,5); 
-    m->port=addr.sin_port;
-    return(1);
-}
-
-
-/* Function:  shutdown_connections
- * Arguments: m: the manager structure
- *
- * close all connections and reclaim resources
- */
-void shutdown_connections(manager m)
-{
-    close(m->bound);
-    remove_io_handler(m->bound_handler);
-    hash_destroy_table(m->connections,remove_connection);
-    free(m);
-}
-
-
-/* Function:  init_connections
- * Arguments: t: the nal state for this interface
- *            port: the port to attempt to bind to
- * Returns: a newly allocated manager structure, or
- *          zero if the fixed port could not be bound
- */
-manager init_connections(unsigned short pid,
-                         int (*input)(void *, void *),
-                         void *a)
-{
-    manager m = (manager)malloc(sizeof(struct manager));
-    m->connections = hash_create_table(compare_connection,connection_key);
-    m->handler = input;
-    m->handler_arg = a;
-    pthread_mutex_init(&m->conn_lock, 0);
-
-    if (bind_socket(m,pid))
-        return(m);
-
-    free(m);
-    return(0);
-}
diff --git a/lnet/ulnds/connection.h b/lnet/ulnds/connection.h
deleted file mode 100644 (file)
index 343ffa6..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *
- *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- */
-
-#include <table.h>
-#include <procbridge.h>
-
-typedef struct manager {
-    table connections;
-    pthread_mutex_t conn_lock; /* protect connections table */
-    int bound;
-    io_handler bound_handler;
-    int (*handler)(void *, void *);
-    void *handler_arg;
-    unsigned short port;
-} *manager;
-
-
-typedef struct connection {
-    unsigned int ip;
-    unsigned short port;
-    int fd;
-    manager m;
-} *connection;
-
-connection force_tcp_connection(manager m, unsigned int ip, unsigned int short,
-                                procbridge pb);
-manager init_connections(unsigned short, int (*f)(void *, void *), void *);
-void remove_connection(void *arg);
-void shutdown_connections(manager m);
-int read_connection(connection c, unsigned char *dest, int len);
diff --git a/lnet/ulnds/debug.c b/lnet/ulnds/debug.c
deleted file mode 100644 (file)
index b82bb2f..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Copyright (C) 2002 Cluster File Systems, Inc.
- *   Author: Phil Schwan <phil@clusterfs.com>
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <stdio.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <stdarg.h>
-#include <sys/time.h>
-
-int smp_processor_id = 1;
-char debug_file_path[1024] = "/tmp/lustre-log";
-char debug_file_name[1024];
-FILE *debug_file_fd;
-
-int portals_do_debug_dumplog(void *arg)
-{
-        printf("Look in %s\n", debug_file_name);
-        return 0;
-}
-
-
-void portals_debug_print(void)
-{
-        return;
-}
-
-
-void portals_debug_dumplog(void)
-{
-        printf("Look in %s\n", debug_file_name);
-        return;
-}
-
-
-int portals_debug_init(unsigned long bufsize)
-{ 
-        debug_file_fd = stdout;
-        return 0;
-}
-
-int portals_debug_cleanup(void)
-{
-        return 0; //close(portals_debug_fd);
-}
-
-int portals_debug_clear_buffer(void)
-{
-        return 0;
-}
-
-int portals_debug_mark_buffer(char *text)
-{
-
-        fprintf(debug_file_fd, "*******************************************************************************\n");
-        fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text);
-        fprintf(debug_file_fd, "*******************************************************************************\n");
-
-        return 0;
-}
-
-int portals_debug_copy_to_user(char *buf, unsigned long len)
-{
-        return 0;
-}
-
-/* FIXME: I'm not very smart; someone smarter should make this better. */
-void
-portals_debug_msg (int subsys, int mask, char *file, const char *fn, 
-                   const int line, const char *format, ...)
-{
-        va_list       ap;
-        unsigned long flags;
-        struct timeval tv;
-        int nob;
-
-
-        /* NB since we pass a non-zero sized buffer (at least) on the first
-         * print, we can be assured that by the end of all the snprinting,
-         * we _do_ have a terminated buffer, even if our message got truncated.
-         */
-
-        gettimeofday(&tv, NULL);
-
-        nob += fprintf(debug_file_fd,
-                              "%02x:%06x:%d:%lu.%06lu ",
-                              subsys >> 24, mask, smp_processor_id,
-                              tv.tv_sec, tv.tv_usec);
-
-        nob += fprintf(debug_file_fd,
-                            "(%s:%d:%s() %d+%ld): ",
-                            file, line, fn, 0,
-                            8192 - ((unsigned long)&flags & 8191UL));
-
-        va_start (ap, format);
-        nob += fprintf(debug_file_fd, format, ap);
-        va_end (ap);
-
-
-}
-
diff --git a/lnet/ulnds/dispatch.h b/lnet/ulnds/dispatch.h
deleted file mode 100644 (file)
index a8f916d..0000000
+++ /dev/null
@@ -1,46 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *  Copyright (c) 2002 Eric Hoffman
- *
- *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- */
-
-/* this file is only called dispatch.h to prevent it
-   from colliding with /usr/include/sys/select.h */
-
-typedef struct io_handler *io_handler;
-
-struct io_handler{
-  io_handler *last;
-  io_handler next;
-  int fd;
-  int type;
-  int (*function)(void *);
-  void *argument;
-  int disabled;
-};
-
-
-#define READ_HANDLER 1
-#define WRITE_HANDLER 2
-#define EXCEPTION_HANDLER 4
-#define ALL_HANDLER (READ_HANDLER | WRITE_HANDLER | EXCEPTION_HANDLER)
-
-io_handler register_io_handler(int fd,
-                               int type,
-                               int (*function)(void *),
-                               void *arg);
-
-void remove_io_handler (io_handler i);
-void init_unix_timer(void);
-void select_timer_block(when until);
-when now(void);
-
-/*
- * hacking for CFS internal MPI testing
- */ 
-#if !CRAY_PORTALS
-#define ENABLE_SELECT_DISPATCH
-#endif
diff --git a/lnet/ulnds/ipmap.h b/lnet/ulnds/ipmap.h
deleted file mode 100644 (file)
index 85b1e18..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *
- *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- */
-
-#define DIRECT_IP_MODE
-#ifdef DIRECT_IP_MODE
-#define PNAL_NID(in_addr, port) (in_addr)
-#define PNAL_PID(pid) (pid)
-#define PNAL_IP(in_addr, port) (in_addr)
-#define PNAL_PORT(nid, pid) (pid)
-#else
-
-#define PNAL_BASE_PORT 4096
-#define PNAL_HOSTID_SHIFT 24
-#define PNAL_HOSTID_MASK ((1 << PNAL_HOSTID_SHIFT) - 1)
-#define PNAL_VNODE_SHIFT 8
-#define PNAL_VNODE_MASK ((1 << PNAL_VNODE_SHIFT) - 1)
-#define PNAL_PID_SHIFT 8
-#define PNAL_PID_MASK ((1 << PNAL_PID_SHIFT) - 1)
-
-#define PNAL_NID(in_addr, port) (((ntohl(in_addr) & PNAL_HOSTID_MASK) \
-                                    << PNAL_VNODE_SHIFT) \
-                                   | (((ntohs(port)-PNAL_BASE_PORT) >>\
-                                       PNAL_PID_SHIFT)))
-#define PNAL_PID(port) ((ntohs(port) - PNAL_BASE_PORT)  & PNAL_PID_MASK)
-
-#define PNAL_IP(nid,t)  (htonl((((unsigned)(nid))\
-                                >> PNAL_VNODE_SHIFT)\
-                               | (t->iptop8 << PNAL_HOSTID_SHIFT)))
-#define PNAL_PORT(nid, pid) (htons(((((nid) & PNAL_VNODE_MASK) \
-                                 << PNAL_VNODE_SHIFT) \
-                                | ((pid) & PNAL_PID_MASK)) \
-                               + PNAL_BASE_PORT))
-#endif
diff --git a/lnet/ulnds/pqtimer.c b/lnet/ulnds/pqtimer.c
deleted file mode 100644 (file)
index 98c48eb..0000000
+++ /dev/null
@@ -1,226 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *  Copyright (c) 2002 Eric Hoffman
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/* timer.c:
- *   this file implements a simple priority-queue based timer system. when
- * combined with a file which implements now() and block(), it can
- * be used to provide course-grained time-based callbacks.
- */
-
-#include <pqtimer.h>
-#include <stdlib.h>
-#include <string.h>
-
-struct timer {
-  void (*function)(void *);
-  void *arg;
-  when w;
-  int interval;
-  int disable;
-};
-
-typedef struct thunk *thunk;
-struct thunk {
-    void (*f)(void *);
-    void *a;
-    thunk next;
-};
-
-extern when now(void);
-
-static thunk thunks;
-static int internal;
-static void (*block_function)(when);
-static int number_of_timers;
-static int size_of_pqueue;
-static timer *timers;
-
-
-static void heal(int where)
-{
-    int left=(where<<1);
-    int right=(where<<1)+1;
-    int min=where;
-    timer temp;
-  
-    if (left <= number_of_timers)
-       if (timers[left]->w < timers[min]->w) min=left;
-    if (right <= number_of_timers)
-       if (timers[right]->w < timers[min]->w) min=right;
-    if (min != where){
-       temp=timers[where];
-       timers[where]=timers[min];
-       timers[min]=temp;
-       heal(min);
-    }
-}
-
-static void add_pqueue(int i)
-{
-    timer temp;
-    int parent=(i>>1);
-    if ((i>1) && (timers[i]->w< timers[parent]->w)){
-       temp=timers[i];
-       timers[i]=timers[parent];
-       timers[parent]=temp;
-       add_pqueue(parent);
-    }
-}
-
-static void add_timer(timer t)
-{
-    if (size_of_pqueue<(number_of_timers+2)){
-       int oldsize=size_of_pqueue;
-       timer *new=(void *)malloc(sizeof(struct timer)*(size_of_pqueue+=10));
-       memcpy(new,timers,sizeof(timer)*oldsize);
-       timers=new;
-    }
-    timers[++number_of_timers]=t;
-    add_pqueue(number_of_timers);
-}
-
-/* Function: register_timer
- * Arguments: interval: the time interval from the current time when
- *                      the timer function should be called
- *            function: the function to call when the time has expired
- *            argument: the argument to call it with.
- * Returns: a pointer to a timer structure
- */
-timer register_timer(when interval,
-                    void (*function)(void *),
-                    void *argument)
-{
-    timer t=(timer)malloc(sizeof(struct timer));
-
-    t->arg=argument;
-    t->function=function;
-    t->interval=interval;
-    t->disable=0;
-    t->w=now()+interval;
-    add_timer(t);
-    if (!internal && (number_of_timers==1))
-        block_function(t->w);
-    return(t);
-}
-
-/* Function: remove_timer
- * Arguments: t: 
- * Returns: nothing
- *
- * remove_timer removes a timer from the system, insuring
- * that it will never be called. It does not actually
- * free the timer due to reentrancy issues.
- */
-
-void remove_timer(timer t)
-{
-    t->disable=1;
-}
-
-
-
-void timer_fire()
-{
-    timer current;
-
-    current=timers[1];
-    timers[1]=timers[number_of_timers--];
-    heal(1);
-    if (!current->disable) {
-        (*current->function)(current->arg);
-    }
-    free(current);
-}
-
-when next_timer(void)
-{
-    when here=now();
-
-    while (number_of_timers && (timers[1]->w <= here)) timer_fire();
-    if (number_of_timers) return(timers[1]->w);
-    return(0);
-}
-
-/* Function: timer_loop
- * Arguments: none
- * Returns: never
- * 
- * timer_loop() is the blocking dispatch function for the timer.
- * Is calls the block() function registered with init_timer,
- * and handles associated with timers that have been registered.
- */
-void timer_loop()
-{
-    when here;
-
-    while (1){
-       thunk z;
-       here=now();
-
-       for (z=thunks;z;z=z->next) (*z->f)(z->a);
-
-       if (number_of_timers){
-           if (timers[1]->w > here){
-               (*block_function)(timers[1]->w);
-           } else {
-                timer_fire();
-           }
-       } else {
-           thunk z;
-           for (z=thunks;z;z=z->next) (*z->f)(z->a);
-           (*block_function)(0);
-       }
-    }
-}
-
-
-/* Function: register_thunk
- * Arguments: f: the function to call
- *            a: the single argument to call it with
- *
- * Thunk functions get called at irregular intervals, they
- * should not assume when, or take a particularily long
- * amount of time. Thunks are for background cleanup tasks.
- */
-void register_thunk(void (*f)(void *),void *a)
-{
-    thunk t=(void *)malloc(sizeof(struct thunk));
-    t->f=f;
-    t->a=a;
-    t->next=thunks;
-    thunks=t;
-}
-
-/* Function: initialize_timer
- * Arguments: block: the function to call to block for the specified interval 
- *
- * initialize_timer() must be called before any other timer function,
- * including timer_loop.
- */
-void initialize_timer(void (*block)(when))
-{
-    block_function=block;
-    number_of_timers=0;
-    size_of_pqueue=10;
-    timers=(timer *)malloc(sizeof(timer)*size_of_pqueue);
-    thunks=0;
-}
diff --git a/lnet/ulnds/pqtimer.h b/lnet/ulnds/pqtimer.h
deleted file mode 100644 (file)
index 11efb0e..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *  Copyright (c) 2002 Eric Hoffman
- *
- *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- */
-
-typedef unsigned long long when;
-when now(void);
-typedef struct timer *timer;
-timer register_timer(when interval,
-                    void (*function)(void *),
-                    void *argument);
-timer register_timer_wait(void);
-void remove_timer(timer);
-void timer_loop(void);
-void initialize_timer(void (*block)(when));
-void timer_fire(void);
-
-
-#define HZ 0x100000000ull
-
-
diff --git a/lnet/ulnds/procapi.c b/lnet/ulnds/procapi.c
deleted file mode 100644 (file)
index 6b471c0..0000000
+++ /dev/null
@@ -1,196 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *  Copyright (c) 2003 Cluster File Systems, Inc.
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/* api.c:
- *  This file provides the 'api' side for the process-based nals.
- *  it is responsible for creating the 'library' side thread,
- *  and passing wrapped portals transactions to it.
- *
- *  Along with initialization, shutdown, and transport to the library
- *  side, this file contains some stubs to satisfy the nal definition.
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <string.h>
-#ifndef __CYGWIN__
-#include <syscall.h>
-#endif
-#include <sys/socket.h>
-#include <procbridge.h>
-#include <pqtimer.h>
-#include <dispatch.h>
-#include <errno.h>
-
-
-/* XXX CFS workaround, to give a chance to let nal thread wake up
- * from waiting in select
- */
-static int procbridge_notifier_handler(void *arg)
-{
-    static char buf[8];
-    procbridge p = (procbridge) arg;
-
-    syscall(SYS_read, p->notifier[1], buf, sizeof(buf));
-    return 1;
-}
-
-void procbridge_wakeup_nal(procbridge p)
-{
-    static char buf[8];
-    syscall(SYS_write, p->notifier[0], buf, sizeof(buf));
-}
-
-/* Function: shutdown
- * Arguments: nal: a pointer to my top side nal structure
- *            ni: my network interface index
- *
- * cleanup nal state, reclaim the lower side thread and
- *   its state using PTL_FINI codepoint
- */
-static void procbridge_shutdown(nal_t *n)
-{
-    lib_nal_t *nal = n->nal_data;
-    bridge b=(bridge)nal->libnal_data;
-    procbridge p=(procbridge)b->local;
-
-    p->nal_flags |= NAL_FLAG_STOPPING;
-    procbridge_wakeup_nal(p);
-
-    do {
-        pthread_mutex_lock(&p->mutex);
-        if (p->nal_flags & NAL_FLAG_STOPPED) {
-                pthread_mutex_unlock(&p->mutex);
-                break;
-        }
-        pthread_cond_wait(&p->cond, &p->mutex);
-        pthread_mutex_unlock(&p->mutex);
-    } while (1);
-
-    free(p);
-}
-
-
-/* forward decl */
-extern int procbridge_startup (nal_t *, ptl_pid_t,
-                               ptl_ni_limits_t *, ptl_ni_limits_t *);
-
-/* api_nal
- *  the interface vector to allow the generic code to access
- *  this nal. this is seperate from the library side lib_nal.
- *  TODO: should be dyanmically allocated
- */
-nal_t procapi_nal = {
-    nal_data: NULL,
-    nal_ni_init: procbridge_startup,
-    nal_ni_fini: procbridge_shutdown,
-};
-
-ptl_nid_t tcpnal_mynid;
-
-#ifdef ENABLE_SELECT_DISPATCH
-procbridge __global_procbridge = NULL;
-#endif
-
-/* Function: procbridge_startup
- *
- * Arguments:  pid: requested process id (port offset)
- *                  PTL_ID_ANY not supported.
- *             desired: limits passed from the application
- *                      and effectively ignored
- *             actual:  limits actually allocated and returned
- *
- * Returns: portals rc
- *
- * initializes the tcp nal. we define unix_failure as an
- * error wrapper to cut down clutter.
- */
-int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid,
-                        ptl_ni_limits_t *requested_limits,
-                        ptl_ni_limits_t *actual_limits)
-{
-    nal_init_args_t args;
-
-    procbridge p;
-    bridge b;
-    /* XXX nal_type is purely private to tcpnal here */
-    int nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */
-
-    LASSERT(nal == &procapi_nal);
-
-    init_unix_timer();
-
-    b=(bridge)malloc(sizeof(struct bridge));
-    p=(procbridge)malloc(sizeof(struct procbridge));
-    b->local=p;
-
-    args.nia_requested_pid = requested_pid;
-    args.nia_requested_limits = requested_limits;
-    args.nia_actual_limits = actual_limits;
-    args.nia_nal_type = nal_type;
-    args.nia_bridge = b;
-    args.nia_apinal = nal;
-
-    /* init procbridge */
-    pthread_mutex_init(&p->mutex,0);
-    pthread_cond_init(&p->cond, 0);
-    p->nal_flags = 0;
-
-    /* initialize notifier */
-    if (socketpair(AF_UNIX, SOCK_STREAM, 0, p->notifier)) {
-        perror("socketpair failed");
-        return PTL_FAIL;
-    }
-
-    if (!register_io_handler(p->notifier[1], READ_HANDLER,
-                procbridge_notifier_handler, p)) {
-        perror("fail to register notifier handler");
-        return PTL_FAIL;
-    }
-
-#ifdef ENABLE_SELECT_DISPATCH
-    __global_procbridge = p;
-#endif
-
-    /* create nal thread */
-    if (pthread_create(&p->t, NULL, nal_thread, &args)) {
-        perror("nal_init: pthread_create");
-        return PTL_FAIL;
-    }
-
-    do {
-        pthread_mutex_lock(&p->mutex);
-        if (p->nal_flags & (NAL_FLAG_RUNNING | NAL_FLAG_STOPPED)) {
-                pthread_mutex_unlock(&p->mutex);
-                break;
-        }
-        pthread_cond_wait(&p->cond, &p->mutex);
-        pthread_mutex_unlock(&p->mutex);
-    } while (1);
-
-    if (p->nal_flags & NAL_FLAG_STOPPED)
-        return PTL_FAIL;
-
-    b->lib_nal->libnal_ni.ni_pid.nid = tcpnal_mynid;
-
-    return PTL_OK;
-}
diff --git a/lnet/ulnds/procbridge.h b/lnet/ulnds/procbridge.h
deleted file mode 100644 (file)
index 1f91ced..0000000
+++ /dev/null
@@ -1,56 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *  Copyright (c) 2003 Cluster File Systems, Inc.
- *
- *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- */
-
-#ifndef _PROCBRIDGE_H_
-#define _PROCBRIDGE_H_
-
-#include <pthread.h>
-#include <bridge.h>
-#include <ipmap.h>
-
-
-#define NAL_FLAG_RUNNING        1
-#define NAL_FLAG_STOPPING       2
-#define NAL_FLAG_STOPPED        4
-
-typedef struct procbridge {
-    /* sync between user threads and nal thread */
-    pthread_t t;
-    pthread_cond_t cond;
-    pthread_mutex_t mutex;
-
-    /* socket pair used to notify nal thread */
-    int notifier[2];
-
-    int nal_flags;
-
-} *procbridge;
-
-typedef struct nal_init_args {
-    ptl_pid_t        nia_requested_pid;
-    ptl_ni_limits_t *nia_requested_limits;
-    ptl_ni_limits_t *nia_actual_limits;
-    int              nia_nal_type;
-    bridge           nia_bridge;
-    nal_t           *nia_apinal;
-} nal_init_args_t;
-
-extern void *nal_thread(void *);
-
-
-#define PTL_INIT        (LIB_MAX_DISPATCH+1)
-#define PTL_FINI        (LIB_MAX_DISPATCH+2)
-
-#define MAX_ACLS        1
-#define MAX_PTLS        128
-
-extern void set_address(bridge t,ptl_pid_t pidrequest);
-extern void procbridge_wakeup_nal(procbridge p);
-
-#endif
diff --git a/lnet/ulnds/proclib.c b/lnet/ulnds/proclib.c
deleted file mode 100644 (file)
index 7ee7c71..0000000
+++ /dev/null
@@ -1,137 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *  Copyright (c) 2003 Cluster File Systems, Inc.
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/* lib.c:
- *  This file provides the 'library' side for the process-based nals.
- *  it is responsible for communication with the 'api' side and
- *  providing service to the generic portals 'library'
- *  implementation. 'library' might be better termed 'communication'
- *  or 'kernel'.
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdarg.h>
-#include <unistd.h>
-#include <procbridge.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <netdb.h>
-#include <errno.h>
-#include <timer.h>
-#include <dispatch.h>
-
-/* the following functions are stubs to satisfy the nal definition
-   without doing anything particularily useful*/
-
-static int nal_dist(lib_nal_t *nal,
-                    ptl_nid_t nid,
-                    unsigned long *dist)
-{
-    return 0;
-}
-
-static void check_stopping(void *z)
-{
-    bridge b = z;
-    procbridge p = b->local;
-
-    if ((p->nal_flags & NAL_FLAG_STOPPING) == 0)
-            return;
-    
-    pthread_mutex_lock(&p->mutex);
-    p->nal_flags |= NAL_FLAG_STOPPED;
-    pthread_cond_broadcast(&p->cond);
-    pthread_mutex_unlock(&p->mutex);
-
-    pthread_exit(0);
-}
-
-
-/* Function:  nal_thread
- * Arguments: z: an opaque reference to a nal control structure
- *               allocated and partially populated by the api level code
- * Returns: nothing, and only on error or explicit shutdown
- *
- *  This function is the entry point of the pthread initiated on 
- *  the api side of the interface. This thread is used to handle
- *  asynchronous delivery to the application.
- * 
- *  We define a limit macro to place a ceiling on limits
- *   for syntactic convenience
- */
-extern int tcpnal_init(bridge);
-
-nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0};
-
-void *nal_thread(void *z)
-{
-    nal_init_args_t *args = (nal_init_args_t *) z;
-    bridge b = args->nia_bridge;
-    procbridge p=b->local;
-    int rc;
-    ptl_process_id_t process_id;
-    int nal_type;
-    
-    b->lib_nal=(lib_nal_t *)malloc(sizeof(lib_nal_t));
-    b->lib_nal->libnal_data=b;
-    b->lib_nal->libnal_map=NULL;
-    b->lib_nal->libnal_unmap=NULL;
-    b->lib_nal->libnal_dist=nal_dist;
-
-    nal_type = args->nia_nal_type;
-
-    /* Wierd, but this sets b->lib_nal->libnal_ni.ni_pid.{nid,pid}, which
-     * lib_init() is about to do from the process_id passed to it...*/
-    set_address(b,args->nia_requested_pid);
-
-    process_id = b->lib_nal->libnal_ni.ni_pid;
-    
-    if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b);
-    /* initialize the generic 'library' level code */
-
-    rc = lib_init(b->lib_nal, args->nia_apinal, 
-                  process_id, 
-                  args->nia_requested_limits, 
-                  args->nia_actual_limits);
-
-    /*
-     * Whatever the initialization returned is passed back to the
-     * user level code for further interpretation.  We just exit if
-     * it is non-zero since something went wrong.
-     */
-    /* this should perform error checking */
-    pthread_mutex_lock(&p->mutex);
-    p->nal_flags |= (rc != PTL_OK) ? NAL_FLAG_STOPPED : NAL_FLAG_RUNNING;
-    pthread_cond_broadcast(&p->cond);
-    pthread_mutex_unlock(&p->mutex);
-
-    if (rc == PTL_OK) {
-        /* the thunk function is called each time the timer loop
-           performs an operation and returns to blocking mode. we
-           overload this function to inform the api side that
-           it may be interested in looking at the event queue */
-        register_thunk(check_stopping,b);
-        timer_loop();
-    }
-    return(0);
-}
diff --git a/lnet/ulnds/ptllnd/.cvsignore b/lnet/ulnds/ptllnd/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
diff --git a/lnet/ulnds/ptllnd/Makefile.am b/lnet/ulnds/ptllnd/Makefile.am
new file mode 100644 (file)
index 0000000..e48cb85
--- /dev/null
@@ -0,0 +1,12 @@
+
+if BUILD_UPTLLND
+if LIBLUSTRE 
+noinst_LIBRARIES = libptllnd.a
+noinst_HEADERS =  ptllnd.h
+libptllnd_a_SOURCES = ptllnd.h ptllnd.c ptllnd_cb.c
+libptllnd_a_CPPFLAGS= $(LLCPPFLAGS)
+# I need $(PTLNDCPPLFLAGS) to be AFTER $(CPPFLAGS)
+# Adding them into $(AM_CFLAGS) seems wrong, but lets me get on..
+libptllnd_a_CFLAGS= $(PTLLNDCPPFLAGS) $(LLCFLAGS)
+endif
+endif
diff --git a/lnet/ulnds/ptllnd/ptllnd.c b/lnet/ulnds/ptllnd/ptllnd.c
new file mode 100644 (file)
index 0000000..b13f520
--- /dev/null
@@ -0,0 +1,629 @@
+
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
+ *   Author: Eric Barton <eeb@bartonsoftware.com>
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   This file is confidential source code owned by Cluster File Systems.
+ *   No viewing, modification, compilation, redistribution, or any other
+ *   form of use is permitted except through a signed license agreement.
+ *
+ *   If you have not signed such an agreement, then you have no rights to
+ *   this file.  Please destroy it immediately and contact CFS.
+ *
+ */
+
+#include "ptllnd.h"
+
+lnd_t               the_ptllnd = {
+        .lnd_type       = PTLLND,
+        .lnd_startup    = ptllnd_startup,
+        .lnd_shutdown   = ptllnd_shutdown,
+       .lnd_ctl        = ptllnd_ctl,
+        .lnd_send       = ptllnd_send,
+        .lnd_recv       = ptllnd_recv,
+        .lnd_eager_recv = ptllnd_eager_recv,
+        .lnd_notify     = ptllnd_notify,
+        .lnd_wait       = ptllnd_wait,
+};
+
+static int ptllnd_ni_count = 0;
+
+void 
+ptllnd_assert_wire_constants (void)
+{
+        /* Wire protocol assertions generated by 'wirecheck'
+         * running on Linux fedora 2.6.11-co-0.6.4 #1 Mon Jun 19 05:36:13 UTC 2006 i686 i686 i386 GNU
+         * with gcc version 4.1.1 20060525 (Red Hat 4.1.1-1) */
+
+
+        /* Constants... */
+        CLASSERT (PTL_RESERVED_MATCHBITS == 0x100);
+        CLASSERT (LNET_MSG_MATCHBITS == 0);
+        CLASSERT (PTLLND_MSG_MAGIC == 0x50746C4E);
+        CLASSERT (PTLLND_MSG_VERSION == 0x04);
+        CLASSERT (PTLLND_RDMA_OK == 0x00);
+        CLASSERT (PTLLND_RDMA_FAIL == 0x01);
+        CLASSERT (PTLLND_MSG_TYPE_INVALID == 0x00);
+        CLASSERT (PTLLND_MSG_TYPE_PUT == 0x01);
+        CLASSERT (PTLLND_MSG_TYPE_GET == 0x02);
+        CLASSERT (PTLLND_MSG_TYPE_IMMEDIATE == 0x03);
+        CLASSERT (PTLLND_MSG_TYPE_NOOP == 0x04);
+        CLASSERT (PTLLND_MSG_TYPE_HELLO == 0x05);
+        CLASSERT (PTLLND_MSG_TYPE_NAK == 0x06);
+
+        /* Checks for struct kptl_msg_t */
+        CLASSERT ((int)sizeof(kptl_msg_t) == 136);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_magic) == 0);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_magic) == 4);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_version) == 4);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_version) == 2);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_type) == 6);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_type) == 1);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_credits) == 7);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_credits) == 1);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_nob) == 8);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_nob) == 4);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_cksum) == 12);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_cksum) == 4);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_srcnid) == 16);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_srcnid) == 8);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_srcstamp) == 24);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_srcstamp) == 8);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_dstnid) == 32);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_dstnid) == 8);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_dststamp) == 40);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_dststamp) == 8);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_srcpid) == 48);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_srcpid) == 4);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_dstpid) == 52);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_dstpid) == 4);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_u.immediate) == 56);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_u.immediate) == 72);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_u.rdma) == 56);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_u.rdma) == 80);
+        CLASSERT ((int)offsetof(kptl_msg_t, ptlm_u.hello) == 56);
+        CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_u.hello) == 12);
+
+        /* Checks for struct kptl_immediate_msg_t */
+        CLASSERT ((int)sizeof(kptl_immediate_msg_t) == 72);
+        CLASSERT ((int)offsetof(kptl_immediate_msg_t, kptlim_hdr) == 0);
+        CLASSERT ((int)sizeof(((kptl_immediate_msg_t *)0)->kptlim_hdr) == 72);
+        CLASSERT ((int)offsetof(kptl_immediate_msg_t, kptlim_payload[13]) == 85);
+        CLASSERT ((int)sizeof(((kptl_immediate_msg_t *)0)->kptlim_payload[13]) == 1);
+
+        /* Checks for struct kptl_rdma_msg_t */
+        CLASSERT ((int)sizeof(kptl_rdma_msg_t) == 80);
+        CLASSERT ((int)offsetof(kptl_rdma_msg_t, kptlrm_hdr) == 0);
+        CLASSERT ((int)sizeof(((kptl_rdma_msg_t *)0)->kptlrm_hdr) == 72);
+        CLASSERT ((int)offsetof(kptl_rdma_msg_t, kptlrm_matchbits) == 72);
+        CLASSERT ((int)sizeof(((kptl_rdma_msg_t *)0)->kptlrm_matchbits) == 8);
+
+        /* Checks for struct kptl_hello_msg_t */
+        CLASSERT ((int)sizeof(kptl_hello_msg_t) == 12);
+        CLASSERT ((int)offsetof(kptl_hello_msg_t, kptlhm_matchbits) == 0);
+        CLASSERT ((int)sizeof(((kptl_hello_msg_t *)0)->kptlhm_matchbits) == 8);
+        CLASSERT ((int)offsetof(kptl_hello_msg_t, kptlhm_max_msg_size) == 8);
+        CLASSERT ((int)sizeof(((kptl_hello_msg_t *)0)->kptlhm_max_msg_size) == 4);
+}
+
+int
+ptllnd_parse_int_tunable(int *value, char *name, int dflt)
+{
+        char    *env = getenv(name);
+        char    *end;
+
+        if (env == NULL) {
+                *value = dflt;
+                return 0;
+        }
+
+        *value = strtoull(env, &end, 0);
+        if (*end == 0)
+                return 0;
+
+        CERROR("Can't parse tunable %s=%s\n", name, env);
+        return -EINVAL;
+}
+
+int
+ptllnd_get_tunables(lnet_ni_t *ni)
+{
+        ptllnd_ni_t *plni = ni->ni_data;
+        int          max_msg_size;
+        int          msgs_per_buffer;
+        int          rc;
+        int          temp;
+
+        rc = ptllnd_parse_int_tunable(&plni->plni_portal,
+                                      "PTLLND_PORTAL", PTLLND_PORTAL);
+        if (rc != 0)
+                return rc;
+
+        rc = ptllnd_parse_int_tunable(&temp,
+                                      "PTLLND_PID", PTLLND_PID);
+        if (rc != 0)
+                return rc;
+        plni->plni_ptllnd_pid = (ptl_pid_t)temp;
+
+        rc = ptllnd_parse_int_tunable(&plni->plni_peer_credits,
+                                      "PTLLND_PEERCREDITS", PTLLND_PEERCREDITS);
+        if (rc != 0)
+                return rc;
+
+        rc = ptllnd_parse_int_tunable(&max_msg_size,
+                                      "PTLLND_MAX_MSG_SIZE",
+                                      PTLLND_MAX_MSG_SIZE);
+        if (rc != 0)
+                return rc;
+
+        rc = ptllnd_parse_int_tunable(&msgs_per_buffer,
+                                      "PTLLND_MSGS_PER_BUFFER",
+                                      PTLLND_MSGS_PER_BUFFER);
+        if (rc != 0)
+                return rc;
+
+        rc = ptllnd_parse_int_tunable(&plni->plni_msgs_spare,
+                                      "PTLLND_MSGS_SPARE",
+                                      PTLLND_MSGS_SPARE);
+        if (rc != 0)
+                return rc;
+
+        rc = ptllnd_parse_int_tunable(&plni->plni_peer_hash_size,
+                                      "PTLLND_PEER_HASH_SIZE",
+                                      PTLLND_PEER_HASH_SIZE);
+        if (rc != 0)
+                return rc;
+
+
+        rc = ptllnd_parse_int_tunable(&plni->plni_eq_size,
+                                      "PTLLND_EQ_SIZE", PTLLND_EQ_SIZE);
+        if (rc != 0)
+                return rc;
+
+       rc = ptllnd_parse_int_tunable(&plni->plni_checksum,
+                                     "PTLLND_CHECKSUM", 0);
+       if (rc != 0)
+               return rc;
+
+       rc = ptllnd_parse_int_tunable(&plni->plni_max_tx_history,
+                                     "PTLLND_TX_HISTORY", PTLLND_TX_HISTORY);
+       if (rc != 0)
+               return rc;
+
+       rc = ptllnd_parse_int_tunable(&plni->plni_abort_on_nak,
+                                     "PTLLND_ABORT_ON_NAK",
+                                     PTLLND_ABORT_ON_NAK);
+       if (rc != 0)
+               return rc;
+
+        plni->plni_max_msg_size = max_msg_size & ~7;
+        if (plni->plni_max_msg_size < sizeof(kptl_msg_t))
+                plni->plni_max_msg_size = (sizeof(kptl_msg_t) + 7) & ~7;
+
+        plni->plni_buffer_size = plni->plni_max_msg_size * msgs_per_buffer;
+
+        CDEBUG(D_NET, "portal          = %d\n",plni->plni_portal);
+        CDEBUG(D_NET, "ptllnd_pid      = %d\n",plni->plni_ptllnd_pid);
+        CDEBUG(D_NET, "max_msg_size    = %d\n",max_msg_size);
+        CDEBUG(D_NET, "msgs_per_buffer = %d\n",msgs_per_buffer);
+        CDEBUG(D_NET, "msgs_spare      = %d\n",plni->plni_msgs_spare);
+        CDEBUG(D_NET, "peer_hash_size  = %d\n",plni->plni_peer_hash_size);
+        CDEBUG(D_NET, "eq_size         = %d\n",plni->plni_eq_size);
+        CDEBUG(D_NET, "max_msg_size    = %d\n",plni->plni_max_msg_size);
+        CDEBUG(D_NET, "buffer_size     = %d\n",plni->plni_buffer_size);
+
+        return 0;
+}
+
+ptllnd_buffer_t *
+ptllnd_create_buffer (lnet_ni_t *ni)
+{
+        ptllnd_ni_t     *plni = ni->ni_data;
+        ptllnd_buffer_t *buf;
+
+        LIBCFS_ALLOC(buf, sizeof(*buf));
+        if (buf == NULL) {
+                CERROR("Can't allocate buffer descriptor\n");
+                return NULL;
+        }
+
+        buf->plb_ni = ni;
+        buf->plb_posted = 0;
+        CFS_INIT_LIST_HEAD(&buf->plb_list);
+
+        LIBCFS_ALLOC(buf->plb_buffer, plni->plni_buffer_size);
+        if (buf->plb_buffer == NULL) {
+                CERROR("Can't allocate buffer size %d\n",
+                       plni->plni_buffer_size);
+                LIBCFS_FREE(buf, sizeof(*buf));
+                return NULL;
+        }
+
+        list_add(&buf->plb_list, &plni->plni_buffers);
+        plni->plni_nbuffers++;
+
+        return buf;
+}
+
+void
+ptllnd_destroy_buffer (ptllnd_buffer_t *buf)
+{
+        ptllnd_ni_t     *plni = buf->plb_ni->ni_data;
+
+        LASSERT (!buf->plb_posted);
+
+        plni->plni_nbuffers--;
+        list_del(&buf->plb_list);
+        LIBCFS_FREE(buf->plb_buffer, plni->plni_buffer_size);
+        LIBCFS_FREE(buf, sizeof(*buf));
+}
+
+int
+ptllnd_grow_buffers (lnet_ni_t *ni)
+{
+        ptllnd_ni_t     *plni = ni->ni_data;
+        ptllnd_buffer_t *buf;
+        int              nmsgs;
+        int              nbufs;
+        int              rc;
+
+        CDEBUG(D_NET, "nposted_buffers = %d (before)\n",plni->plni_nposted_buffers);
+        CDEBUG(D_NET, "nbuffers = %d (before)\n",plni->plni_nbuffers);
+
+        nmsgs = plni->plni_npeers * plni->plni_peer_credits +
+                plni->plni_msgs_spare;
+
+        nbufs = (nmsgs * plni->plni_max_msg_size + plni->plni_buffer_size - 1) /
+                plni->plni_buffer_size;
+
+        while (nbufs > plni->plni_nbuffers) {
+                buf = ptllnd_create_buffer(ni);
+
+                if (buf == NULL)
+                        return -ENOMEM;
+
+                rc = ptllnd_post_buffer(buf);
+                if (rc != 0){
+                        /* TODO - this path seems to orpahn the buffer
+                         * in a state where its not posted and will never be
+                         * However it does not leak the buffer as it's
+                         * already been put onto the global buffer list
+                         * and will be cleaned up
+                         */
+                        return rc;
+                }
+        }
+
+        CDEBUG(D_NET, "nposted_buffers = %d (after)\n",plni->plni_nposted_buffers);
+        CDEBUG(D_NET, "nbuffers = %d (after)\n",plni->plni_nbuffers);
+        return 0;
+}
+
+void
+ptllnd_destroy_buffers (lnet_ni_t *ni)
+{
+        ptllnd_ni_t       *plni = ni->ni_data;
+        ptllnd_buffer_t   *buf;
+        struct list_head  *tmp;
+        struct list_head  *nxt;
+
+        CDEBUG(D_NET, "nposted_buffers = %d (before)\n",plni->plni_nposted_buffers);
+        CDEBUG(D_NET, "nbuffers = %d (before)\n",plni->plni_nbuffers);
+
+        list_for_each_safe(tmp, nxt, &plni->plni_buffers) {
+                buf = list_entry(tmp, ptllnd_buffer_t, plb_list);
+
+                //CDEBUG(D_NET, "buf=%p posted=%d\n",buf,buf->plb_posted);
+
+                LASSERT (plni->plni_nbuffers > 0);
+                if (buf->plb_posted) {
+                       time_t   start = cfs_time_current_sec();
+                       int      w = PTLLND_WARN_LONG_WAIT;
+                       
+                        LASSERT (plni->plni_nposted_buffers > 0);
+
+#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
+                        (void) PtlMDUnlink(buf->plb_md);
+
+                       while (buf->plb_posted) {
+                               if (cfs_time_current_sec() > start + w) {
+                                       CWARN("Waited %ds to unlink buffer\n", w);
+                                       w *= 2;
+                               }
+                               ptllnd_wait(ni, w*1000);
+                       }
+#else
+                        while (buf->plb_posted) {
+                                rc = PtlMDUnlink(buf->plb_md);
+                                if (rc == PTL_OK) {
+                                        buf->plb_posted = 0;
+                                        plni->plni_nposted_buffers--;
+                                        break;
+                                }
+                                LASSERT (rc == PTL_MD_IN_USE);
+                               if (cfs_time_current_sec() > start + w) {
+                                       CWARN("Waited %ds to unlink buffer\n", w);
+                                       w *= 2;
+                               }
+                               ptllnd_wait(ni, w*1000);
+                        }
+#endif
+                }
+                ptllnd_destroy_buffer(buf);
+        }
+
+        CDEBUG(D_NET, "nposted_buffers = %d (after)\n",plni->plni_nposted_buffers);
+        CDEBUG(D_NET, "nbuffers = %d (after)\n",plni->plni_nbuffers);
+
+        LASSERT (plni->plni_nposted_buffers == 0);
+        LASSERT (plni->plni_nbuffers == 0);
+}
+
+int
+ptllnd_create_peer_hash (lnet_ni_t *ni)
+{
+        ptllnd_ni_t *plni = ni->ni_data;
+        int          i;
+
+        plni->plni_npeers = 0;
+
+        LIBCFS_ALLOC(plni->plni_peer_hash,
+                     plni->plni_peer_hash_size * sizeof(*plni->plni_peer_hash));
+        if (plni->plni_peer_hash == NULL) {
+                CERROR("Can't allocate ptllnd peer hash (size %d)\n",
+                       plni->plni_peer_hash_size);
+                return -ENOMEM;
+        }
+
+        for (i = 0; i < plni->plni_peer_hash_size; i++)
+                CFS_INIT_LIST_HEAD(&plni->plni_peer_hash[i]);
+
+        return 0;
+}
+
+void
+ptllnd_destroy_peer_hash (lnet_ni_t *ni)
+{
+        ptllnd_ni_t    *plni = ni->ni_data;
+        int             i;
+
+        LASSERT( plni->plni_npeers == 0);
+
+        for (i = 0; i < plni->plni_peer_hash_size; i++)
+                LASSERT (list_empty(&plni->plni_peer_hash[i]));
+
+        LIBCFS_FREE(plni->plni_peer_hash,
+                    plni->plni_peer_hash_size * sizeof(*plni->plni_peer_hash));
+}
+
+void
+ptllnd_close_peers (lnet_ni_t *ni)
+{
+        ptllnd_ni_t    *plni = ni->ni_data;
+        ptllnd_peer_t  *plp;
+        int             i;
+
+        for (i = 0; i < plni->plni_peer_hash_size; i++)
+                while (!list_empty(&plni->plni_peer_hash[i])) {
+                        plp = list_entry(plni->plni_peer_hash[i].next,
+                                         ptllnd_peer_t, plp_list);
+
+                        ptllnd_close_peer(plp, 0);
+                }
+}
+
+int
+ptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+       switch (cmd) {
+       case IOC_LIBCFS_DEBUG_PEER:
+               ptllnd_debug_peer(ni, *((lnet_process_id_t *)arg));
+               return 0;
+               
+       default:
+               return -EINVAL;
+       }
+}
+
+__u64
+ptllnd_get_timestamp(void)
+{
+        struct timeval  tv;
+        int             rc = gettimeofday(&tv, NULL);
+
+        LASSERT (rc == 0);
+        return ((__u64)tv.tv_sec) * 1000000 + tv.tv_usec;
+}
+
+void
+ptllnd_shutdown (lnet_ni_t *ni)
+{
+        ptllnd_ni_t *plni = ni->ni_data;
+        int          rc;
+       time_t       start = cfs_time_current_sec();
+       int          w = PTLLND_WARN_LONG_WAIT;
+
+        LASSERT (ptllnd_ni_count == 1);
+       plni->plni_max_tx_history = 0;
+
+       ptllnd_cull_tx_history(plni);
+
+        ptllnd_destroy_buffers(ni);
+        ptllnd_close_peers(ni);
+
+        while (plni->plni_npeers > 0) {
+               if (cfs_time_current_sec() > start + w) {
+                       CWARN("Waited %ds for peers to shutdown\n", w);
+                       w *= 2;
+               }
+                ptllnd_wait(ni, w*1000);
+       }
+
+        LASSERT (plni->plni_ntxs == 0);
+        LASSERT (plni->plni_nrxs == 0);
+
+        rc = PtlEQFree(plni->plni_eqh);
+        LASSERT (rc == PTL_OK);
+
+        rc = PtlNIFini(plni->plni_nih);
+        LASSERT (rc == PTL_OK);
+
+        ptllnd_destroy_peer_hash(ni);
+        LIBCFS_FREE(plni, sizeof(*plni));
+        ptllnd_ni_count--;
+}
+
+int
+ptllnd_startup (lnet_ni_t *ni)
+{
+        ptllnd_ni_t *plni;
+        int          rc;
+
+       /* could get limits from portals I guess... */
+       ni->ni_maxtxcredits =
+       ni->ni_peertxcredits = 1000;
+
+        if (ptllnd_ni_count != 0) {
+                CERROR("Can't have > 1 instance of ptllnd\n");
+                return -EPERM;
+        }
+
+        ptllnd_ni_count++;
+
+        LIBCFS_ALLOC(plni, sizeof(*plni));
+        if (plni == NULL) {
+                CERROR("Can't allocate ptllnd state\n");
+                rc = -ENOMEM;
+                goto failed0;
+        }
+
+        ni->ni_data = plni;
+
+        plni->plni_stamp = ptllnd_get_timestamp();
+        plni->plni_nrxs = 0;
+        plni->plni_ntxs = 0;
+       plni->plni_ntx_history = 0;
+        CFS_INIT_LIST_HEAD(&plni->plni_zombie_txs);
+        CFS_INIT_LIST_HEAD(&plni->plni_tx_history);
+
+        /*
+         *  Initilize buffer related data structures
+         */
+        CFS_INIT_LIST_HEAD(&plni->plni_buffers);
+        plni->plni_nbuffers = 0;
+        plni->plni_nposted_buffers = 0;
+
+        rc = ptllnd_get_tunables(ni);
+        if (rc != 0)
+                goto failed1;
+
+        rc = ptllnd_create_peer_hash(ni);
+        if (rc != 0)
+                goto failed1;
+
+        /* NB I most probably won't get the PID I requested here.  It doesn't
+         * matter because I don't need a fixed PID (only connection acceptors
+         * need a "well known" PID). */
+
+        rc = PtlNIInit(PTL_IFACE_DEFAULT, plni->plni_ptllnd_pid,
+                       NULL, NULL, &plni->plni_nih);
+        if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
+                CERROR("PtlNIInit failed: %d\n", rc);
+                rc = -ENODEV;
+                goto failed2;
+        }
+
+        rc = PtlEQAlloc(plni->plni_nih, plni->plni_eq_size,
+                        PTL_EQ_HANDLER_NONE, &plni->plni_eqh);
+        if (rc != PTL_OK) {
+                CERROR("PtlEQAlloc failed: %d\n", rc);
+                rc = -ENODEV;
+                goto failed3;
+        }
+
+        /*
+         * Fetch the Portals NID
+         */
+        if(rc != PtlGetId(plni->plni_nih,&plni->plni_portals_id)){
+                CERROR ("PtlGetID failed : %d\n", rc);
+                rc = -EINVAL;
+                goto failed4;
+        }
+
+        CDEBUG(D_NET, "lnet nid=" LPX64 " (passed in)\n",ni->ni_nid);
+
+        /*
+         * Create the new NID.  Based on the LND network type
+         * and the lower ni's address data.
+         */
+        ni->ni_nid = ptllnd_ptl2lnetnid(ni, plni->plni_portals_id.nid);
+
+        CDEBUG(D_NET, "ptl id  =%s\n", ptllnd_ptlid2str(plni->plni_portals_id));
+        CDEBUG(D_NET, "lnet id =%s (passed back)\n",
+               libcfs_id2str((lnet_process_id_t) {
+                       .nid = ni->ni_nid, .pid = the_lnet.ln_pid}));
+
+        rc = ptllnd_grow_buffers(ni);
+        if (rc != 0)
+                goto failed4;
+
+       return 0;
+
+ failed4:
+        ptllnd_destroy_buffers(ni);
+        PtlEQFree(plni->plni_eqh);
+ failed3:
+        PtlNIFini(plni->plni_nih);
+ failed2:
+        ptllnd_destroy_peer_hash(ni);
+ failed1:
+        LIBCFS_FREE(plni, sizeof(*plni));
+ failed0:
+        ptllnd_ni_count--;
+        CDEBUG(D_NET, "<<< rc=%d\n",rc);
+        return rc;
+}
+
+const char *ptllnd_evtype2str(int type)
+{
+#define DO_TYPE(x) case x: return #x;
+        switch(type)
+        {
+                DO_TYPE(PTL_EVENT_GET_START);
+                DO_TYPE(PTL_EVENT_GET_END);
+                DO_TYPE(PTL_EVENT_PUT_START);
+                DO_TYPE(PTL_EVENT_PUT_END);
+                DO_TYPE(PTL_EVENT_REPLY_START);
+                DO_TYPE(PTL_EVENT_REPLY_END);
+                DO_TYPE(PTL_EVENT_ACK);
+                DO_TYPE(PTL_EVENT_SEND_START);
+                DO_TYPE(PTL_EVENT_SEND_END);
+                DO_TYPE(PTL_EVENT_UNLINK);
+        default:
+                return "";
+        }
+#undef DO_TYPE
+}
+
+const char *ptllnd_msgtype2str(int type)
+{
+#define DO_TYPE(x) case x: return #x;
+        switch(type)
+        {
+                DO_TYPE(PTLLND_MSG_TYPE_INVALID);
+                DO_TYPE(PTLLND_MSG_TYPE_PUT);
+                DO_TYPE(PTLLND_MSG_TYPE_GET);
+                DO_TYPE(PTLLND_MSG_TYPE_IMMEDIATE);
+                DO_TYPE(PTLLND_MSG_TYPE_HELLO);
+                DO_TYPE(PTLLND_MSG_TYPE_NOOP);
+                DO_TYPE(PTLLND_MSG_TYPE_NAK);
+        default:
+                return "";
+        }
+#undef DO_TYPE
+}
diff --git a/lnet/ulnds/ptllnd/ptllnd.h b/lnet/ulnds/ptllnd/ptllnd.h
new file mode 100644 (file)
index 0000000..f637c7d
--- /dev/null
@@ -0,0 +1,262 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
+ *   Author: Eric Barton <eeb@bartonsoftware.com>
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   This file is confidential source code owned by Cluster File Systems.
+ *   No viewing, modification, compilation, redistribution, or any other
+ *   form of use is permitted except through a signed license agreement.
+ *
+ *   If you have not signed such an agreement, then you have no rights to
+ *   this file.  Please destroy it immediately and contact CFS.
+ *
+ */
+
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include <lnet/lib-lnet.h>
+#include <lnet/ptllnd_wire.h>
+
+#include <portals/p30.h>
+#include <lnet/ptllnd.h>           /* Depends on portals/p30.h */
+
+/* Compile-time default tunables; runtime copies live in ptllnd_ni_t
+ * (plni_* fields). */
+#define PTLLND_DEBUG_TIMING 0
+
+#define PTLLND_MSGS_PER_BUFFER     64
+#define PTLLND_MSGS_SPARE          256
+#define PTLLND_PEER_HASH_SIZE      101
+#define PTLLND_EQ_SIZE             1024
+#if PTLLND_DEBUG_TIMING
+# define PTLLND_TX_HISTORY         1024
+#else
+# define PTLLND_TX_HISTORY         0
+#endif
+#define PTLLND_WARN_LONG_WAIT      5 /* seconds */
+#define PTLLND_ABORT_ON_NAK        1 /* abort app on protocol version mismatch */
+
+/* MD options common to every MD this LND creates */
+#define PTLLND_MD_OPTIONS        (PTL_MD_LUSTRE_COMPLETION_SEMANTICS |\
+                                  PTL_MD_EVENT_START_DISABLE)
+/* Per-interface state for the userspace ptllnd; hangs off
+ * lnet_ni_t::ni_data. */
+typedef struct
+{
+        int                        plni_portal;
+        ptl_pid_t                  plni_ptllnd_pid; /* Portals PID of peers I may connect to */
+        int                        plni_peer_credits;
+        int                        plni_max_msg_size;
+        int                        plni_buffer_size;
+        int                        plni_msgs_spare;
+        int                        plni_peer_hash_size;
+        int                        plni_eq_size;
+        int                        plni_checksum;
+        int                        plni_max_tx_history; /* cap for plni_tx_history */
+        int                        plni_abort_on_nak;
+
+        __u64                      plni_stamp;
+        struct list_head           plni_active_txs;
+        struct list_head           plni_zombie_txs; /* txs failed by ptllnd_abort_txs() */
+        int                        plni_ntxs;
+        int                        plni_nrxs;
+
+        ptl_handle_ni_t            plni_nih;
+        ptl_handle_eq_t            plni_eqh;
+        ptl_process_id_t           plni_portals_id;   /* Portals ID of interface */
+
+        struct list_head          *plni_peer_hash;   /* plni_peer_hash_size chains, keyed on NID */
+        int                        plni_npeers;
+
+        struct list_head           plni_tx_history;  /* completed txs kept for debugging */
+        int                        plni_ntx_history;
+
+        struct list_head           plni_buffers;     /* receive buffers */
+        int                        plni_nbuffers;
+        int                        plni_nposted_buffers;
+} ptllnd_ni_t;
+
+/* Once this many credits are owed to a peer, queue a NOOP to return them
+ * (see ptllnd_check_sends()) */
+#define PTLLND_CREDIT_HIGHWATER(plni) ((plni)->plni_peer_credits - 1)
+
+/* Per-peer connection state, chained on plni_peer_hash. */
+typedef struct
+{
+        struct list_head           plp_list;      /* chain on peer hash */
+        lnet_ni_t                 *plp_ni;
+        lnet_process_id_t          plp_id;        /* LNET identity */
+        ptl_process_id_t           plp_ptlid;     /* Portals identity */
+        int                        plp_credits;
+        int                        plp_max_credits;
+        int                        plp_outstanding_credits; /* credits owed back to peer */
+        int                        plp_max_msg_size;
+        int                        plp_refcount;  /* see ptllnd_peer_addref/decref */
+        int                        plp_recvd_hello:1; /* handshake completed */
+        int                        plp_closing:1;     /* being torn down */
+        __u64                      plp_match;
+        __u64                      plp_stamp;
+        struct list_head           plp_txq;       /* queued, awaiting credits */
+        struct list_head           plp_activeq;   /* sent, awaiting completion */
+} ptllnd_peer_t;
+
+/* A posted receive buffer holding up to plni_buffer_size bytes of
+ * incoming messages (see ptllnd_post_buffer()). */
+typedef struct
+{
+        struct list_head           plb_list;   /* chain on plni_buffers */
+        lnet_ni_t                 *plb_ni;
+        int                        plb_posted; /* currently attached as an MD? */
+        ptl_handle_md_t            plb_md;
+        char                      *plb_buffer;
+} ptllnd_buffer_t;
+
+/* Descriptor for one received message. */
+typedef struct
+{
+        ptllnd_peer_t             *rx_peer; /* sender */
+        kptl_msg_t                *rx_msg;  /* the message itself */
+        int                        rx_nob;  /* # bytes received */
+} ptllnd_rx_t;
+
+/* A transmit descriptor; allocated with tx_msg sized to the message
+ * being sent (see ptllnd_new_tx()). */
+typedef struct
+{
+        struct list_head           tx_list;      /* on txq/activeq/zombie/history */
+        int                        tx_type;      /* PTLLND_MSG_TYPE_* or RDMA pseudo-type */
+        int                        tx_status;    /* completion status */
+        ptllnd_peer_t             *tx_peer;      /* destination (refcounted) */
+        lnet_msg_t                *tx_lnetmsg;
+        lnet_msg_t                *tx_lnetreplymsg; /* REPLY for a GET, if any */
+        unsigned int               tx_niov;
+        ptl_md_iovec_t            *tx_iov;       /* payload frags (see ptllnd_set_txiov()) */
+        ptl_handle_md_t            tx_bulkmdh;
+        ptl_handle_md_t            tx_reqmdh;
+#if PTLLND_DEBUG_TIMING
+        struct timeval             tx_bulk_posted;
+        struct timeval             tx_bulk_done;
+        struct timeval             tx_req_posted;
+        struct timeval             tx_req_done;
+#endif
+        int                        tx_completing; /* someone already completing */
+        int                        tx_msgsize;  /* # bytes in tx_msg */
+        kptl_msg_t                 tx_msg;      /* message to send */
+} ptllnd_tx_t;
+
+/* Pseudo tx types for RDMA transfers; outside the wire-protocol
+ * PTLLND_MSG_TYPE_* space since no message goes on the wire. */
+#define PTLLND_RDMA_WRITE           0x100       /* pseudo message type */
+#define PTLLND_RDMA_READ            0x101       /* (no msg actually sent) */
+
+/* Hack to extract object type from event's user_ptr relies on (and checks)
+ * that structs are somewhat aligned. */
+#define PTLLND_EVENTARG_TYPE_TX     0x1
+#define PTLLND_EVENTARG_TYPE_BUF    0x2
+#define PTLLND_EVENTARG_TYPE_MASK   0x3
+
+/* Tag @obj's low bits with @type so the event handler can tell tx's
+ * from buffers via a single user_ptr. */
+static inline void *
+ptllnd_obj2eventarg (void *obj, int type)
+{
+        unsigned long addr = (unsigned long)obj;
+
+        /* object must be aligned enough to leave the tag bits free,
+         * and the tag must fit entirely within them */
+        LASSERT ((addr & PTLLND_EVENTARG_TYPE_MASK) == 0);
+        LASSERT ((type & ~PTLLND_EVENTARG_TYPE_MASK) == 0);
+
+        return (void *)(addr | (unsigned long)type);
+}
+
+/* Recover the type tag from an event's user_ptr. */
+static inline int
+ptllnd_eventarg2type (void *arg)
+{
+        return (int)((unsigned long)arg & PTLLND_EVENTARG_TYPE_MASK);
+}
+
+/* Strip the type tag from an event's user_ptr, recovering the object. */
+static inline void *
+ptllnd_eventarg2obj (void *arg)
+{
+        unsigned long tagged = (unsigned long)arg;
+
+        return (void *)(tagged & ~(unsigned long)PTLLND_EVENTARG_TYPE_MASK);
+}
+
+/* Wall-clock stamping of tx progress; compiled away (and the format
+ * collapses to "-") unless PTLLND_DEBUG_TIMING is set.  NB DBGT_ARGS()
+ * expands with its own leading comma, so call sites omit one. */
+#if PTLLND_DEBUG_TIMING
+# define PTLLND_DBGT_INIT(tv)  memset(&(tv), 0, sizeof(tv))
+# define PTLLND_DBGT_STAMP(tv) gettimeofday(&(tv), NULL)
+# define DBGT_FMT              "%ld.%06ld"
+# define DBGT_ARGS(tv)         , (long)((tv).tv_sec), (long)((tv).tv_usec)
+#else
+# define PTLLND_DBGT_INIT(tv)
+# define PTLLND_DBGT_STAMP(tv)
+# define DBGT_FMT              "-"
+# define DBGT_ARGS(tv)
+#endif
+
+void ptllnd_cull_tx_history(ptllnd_ni_t *plni);
+int ptllnd_startup(lnet_ni_t *ni);
+void ptllnd_shutdown(lnet_ni_t *ni);
+int ptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int ptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *msg);
+int ptllnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg,
+                int delayed, unsigned int niov,
+                struct iovec *iov, lnet_kiov_t *kiov,
+                unsigned int offset, unsigned int mlen, unsigned int rlen);
+int ptllnd_eager_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg,
+                      void **new_privatep);
+
+ptllnd_tx_t *ptllnd_new_tx(ptllnd_peer_t *peer, int type, int payload_nob);
+void ptllnd_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive);
+void ptllnd_wait(lnet_ni_t *ni, int milliseconds);
+void ptllnd_check_sends(ptllnd_peer_t *peer);
+void ptllnd_debug_peer(lnet_ni_t *ni, lnet_process_id_t id);
+void ptllnd_destroy_peer(ptllnd_peer_t *peer);
+void ptllnd_close_peer(ptllnd_peer_t *peer, int error);
+int ptllnd_post_buffer(ptllnd_buffer_t *buf);
+int ptllnd_grow_buffers (lnet_ni_t *ni);
+const char *ptllnd_evtype2str(int type);
+const char *ptllnd_msgtype2str(int type);
+char *ptllnd_ptlid2str(ptl_process_id_t id);
+
+/* Take another reference on @peer; the caller must already hold one. */
+static inline void
+ptllnd_peer_addref (ptllnd_peer_t *peer)
+{
+        LASSERT (peer->plp_refcount > 0);
+        peer->plp_refcount += 1;
+}
+
+/* Drop a reference on @peer, destroying it when the last one goes. */
+static inline void
+ptllnd_peer_decref (ptllnd_peer_t *peer)
+{
+        LASSERT (peer->plp_refcount > 0);
+        if (--peer->plp_refcount == 0)
+                ptllnd_destroy_peer(peer);
+}
+
+/* Queue @tx on its peer's send queue and poke the sender. */
+static inline void
+ptllnd_post_tx(ptllnd_tx_t *tx)
+{
+        ptllnd_peer_t *peer = tx->tx_peer;
+        LASSERT(tx->tx_peer != NULL);
+        list_add_tail(&tx->tx_list, &peer->plp_txq);
+        ptllnd_check_sends(peer);
+}
+
+/* Translate a Portals NID into an LNET NID on this interface's network. */
+static inline lnet_nid_t
+ptllnd_ptl2lnetnid(lnet_ni_t *ni, ptl_nid_t portals_nid)
+{
+       return LNET_MKNID(LNET_NIDNET(ni->ni_nid), portals_nid);
+}
+
+/* Translate an LNET NID into a Portals NID (the address part only). */
+static inline ptl_nid_t
+ptllnd_lnet2ptlnid(lnet_nid_t lnet_nid)
+{
+       return LNET_NIDADDR(lnet_nid);
+}
+
+/*
+ * A note about lprintf():
+ *  Normally printf() is redirected to stdout of the console
+ *  from which yod launched the catamount application.  However
+ *  there is a lot of initialization code that runs before this
+ *  redirection is hooked up, and printf() seems to go to the bit bucket.
+ *
+ *  To get any kind of debug output at init time, lprintf() can
+ *  be used to output to the console from which bookqk was used to
+ *  boot the catamount node.  This works for debugging some simple
+ *  cases.
+ */
+
+
diff --git a/lnet/ulnds/ptllnd/ptllnd_cb.c b/lnet/ulnds/ptllnd/ptllnd_cb.c
new file mode 100644 (file)
index 0000000..0114c42
--- /dev/null
@@ -0,0 +1,1684 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
+ *   Author: Eric Barton <eeb@bartonsoftware.com>
+ *
+ *   This file is part of the Lustre file system, http://www.lustre.org
+ *   Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ *   This file is confidential source code owned by Cluster File Systems.
+ *   No viewing, modification, compilation, redistribution, or any other
+ *   form of use is permitted except through a signed license agreement.
+ *
+ *   If you have not signed such an agreement, then you have no rights to
+ *   this file.  Please destroy it immediately and contact CFS.
+ *
+ */
+
+#include "ptllnd.h"
+
+/* Format a Portals process ID.  Results cycle through a small ring of
+ * static buffers so several may be alive at once (e.g. one CDEBUG);
+ * NB not reentrant. */
+char *
+ptllnd_ptlid2str(ptl_process_id_t id)
+{
+        static char bufs[8][32];
+        static int  next = 0;
+
+        char *s = bufs[next];
+
+        next = (next + 1) % (int)(sizeof(bufs)/sizeof(bufs[0]));
+
+        snprintf(s, sizeof(bufs[0]), FMT_PTLID, id.pid, id.nid);
+        return s;
+}
+
+/* Final teardown of a peer: drop it from the NI's peer count and free it.
+ * Reached via ptllnd_peer_decref() when the last reference goes; the peer
+ * must already be closing with nothing queued or active. */
+void
+ptllnd_destroy_peer(ptllnd_peer_t *peer)
+{
+        lnet_ni_t         *ni = peer->plp_ni;
+        ptllnd_ni_t       *plni = ni->ni_data;
+
+        LASSERT (peer->plp_closing);
+        LASSERT (plni->plni_npeers > 0);
+        LASSERT (list_empty(&peer->plp_txq));
+        LASSERT (list_empty(&peer->plp_activeq));
+        plni->plni_npeers--;
+        LIBCFS_FREE(peer, sizeof(*peer));
+}
+
+/* Fail every tx on @q with -ESHUTDOWN, moving each onto the NI's zombie
+ * list for later completion. */
+void
+ptllnd_abort_txs(ptllnd_ni_t *plni, struct list_head *q)
+{
+        while (!list_empty(q)) {
+                ptllnd_tx_t *tx = list_entry(q->next, ptllnd_tx_t, tx_list);
+
+                tx->tx_status = -ESHUTDOWN;
+                list_del(&tx->tx_list);
+                list_add_tail(&tx->tx_list, &plni->plni_zombie_txs);
+        }
+}
+
+/* Begin tearing @peer down (idempotent via plp_closing): dump its state
+ * if anything was outstanding or an error occurred, fail all queued and
+ * active tx's, unhash it and drop the hash-table reference.  The peer is
+ * actually freed when its last reference goes. */
+void
+ptllnd_close_peer(ptllnd_peer_t *peer, int error)
+{
+        lnet_ni_t   *ni = peer->plp_ni;
+        ptllnd_ni_t *plni = ni->ni_data;
+
+        if (peer->plp_closing)
+                return;
+
+        peer->plp_closing = 1;
+
+        if (!list_empty(&peer->plp_txq) ||
+            !list_empty(&peer->plp_activeq) ||
+            error != 0) {
+                CERROR("Closing %s\n", libcfs_id2str(peer->plp_id));
+                ptllnd_debug_peer(ni, peer->plp_id);
+        }
+        
+        ptllnd_abort_txs(plni, &peer->plp_txq);
+        ptllnd_abort_txs(plni, &peer->plp_activeq);
+
+        list_del(&peer->plp_list);
+        ptllnd_peer_decref(peer);
+}
+
+/* Look up peer @id in the NI's peer hash and return it with a reference
+ * held.  If not found and @create is set, allocate it, grow the receive
+ * buffer pool to cover the extra peer, hash it and post an initial HELLO.
+ * Returns NULL on lookup miss (!create) or on any creation failure. */
+ptllnd_peer_t *
+ptllnd_find_peer(lnet_ni_t *ni, lnet_process_id_t id, int create)
+{
+        ptllnd_ni_t       *plni = ni->ni_data;
+        unsigned int       hash = LNET_NIDADDR(id.nid) % plni->plni_peer_hash_size;
+        struct list_head  *tmp;
+        ptllnd_peer_t     *plp;
+        ptllnd_tx_t       *tx;
+        int                rc;
+
+        LASSERT (LNET_NIDNET(id.nid) == LNET_NIDNET(ni->ni_nid));
+
+        list_for_each(tmp, &plni->plni_peer_hash[hash]) {
+                plp = list_entry(tmp, ptllnd_peer_t, plp_list);
+
+                if (plp->plp_id.nid == id.nid &&
+                    plp->plp_id.pid == id.pid) {
+                        ptllnd_peer_addref(plp);
+                        return plp;
+                }
+        }
+
+        if (!create)
+                return NULL;
+
+        /* New peer: check first for enough posted buffers */
+        plni->plni_npeers++;
+        rc = ptllnd_grow_buffers(ni);
+        if (rc != 0) {
+                plni->plni_npeers--;
+                return NULL;
+        }
+
+        LIBCFS_ALLOC(plp, sizeof(*plp));
+        if (plp == NULL) {
+                CERROR("Can't allocate new peer %s\n", libcfs_id2str(id));
+                plni->plni_npeers--;
+                return NULL;
+        }
+
+        CDEBUG(D_NET, "new peer=%p\n",plp);
+
+        plp->plp_ni = ni;
+        plp->plp_id = id;
+        plp->plp_ptlid.nid = LNET_NIDADDR(id.nid);
+        plp->plp_ptlid.pid = plni->plni_ptllnd_pid;
+        plp->plp_max_credits =
+        plp->plp_credits = 1; /* add more later when she gives me credits */
+        plp->plp_max_msg_size = plni->plni_max_msg_size; /* until I hear from her */
+        plp->plp_outstanding_credits = plni->plni_peer_credits - 1;
+        plp->plp_match = 0;
+        plp->plp_stamp = 0;
+        plp->plp_recvd_hello = 0;
+        plp->plp_closing = 0;
+        plp->plp_refcount = 1;
+        CFS_INIT_LIST_HEAD(&plp->plp_list);
+        CFS_INIT_LIST_HEAD(&plp->plp_txq);
+        CFS_INIT_LIST_HEAD(&plp->plp_activeq);
+
+        /* +1 ref for the caller; the hash table keeps the initial ref */
+        ptllnd_peer_addref(plp);
+        list_add_tail(&plp->plp_list, &plni->plni_peer_hash[hash]);
+
+        tx = ptllnd_new_tx(plp, PTLLND_MSG_TYPE_HELLO, 0);
+        if (tx == NULL) {
+                CERROR("Can't send HELLO to %s\n", libcfs_id2str(id));
+                ptllnd_close_peer(plp, -ENOMEM);
+                ptllnd_peer_decref(plp);
+                return NULL;
+        }
+
+        tx->tx_msg.ptlm_u.hello.kptlhm_matchbits = PTL_RESERVED_MATCHBITS;
+        tx->tx_msg.ptlm_u.hello.kptlhm_max_msg_size = plni->plni_max_msg_size;
+
+        ptllnd_post_tx(tx);
+
+        return plp;
+}
+
+/* Return the number of entries on list @q. */
+int
+ptllnd_count_q(struct list_head *q)
+{
+        int               count = 0;
+        struct list_head *pos;
+
+        list_for_each(pos, q)
+                count++;
+
+        return count;
+}
+
+/* Short human-readable name for a tx type (wire or RDMA pseudo-type). */
+const char *
+ptllnd_tx_typestr(int type)
+{
+        switch (type) {
+        case PTLLND_RDMA_WRITE:         return "rdma_write";
+        case PTLLND_RDMA_READ:          return "rdma_read";
+        case PTLLND_MSG_TYPE_PUT:       return "put_req";
+        case PTLLND_MSG_TYPE_GET:       return "get_req";
+        case PTLLND_MSG_TYPE_IMMEDIATE: return "immediate";
+        case PTLLND_MSG_TYPE_NOOP:      return "noop";
+        case PTLLND_MSG_TYPE_HELLO:     return "hello";
+        default:                        return "<unknown>";
+        }
+}
+
+/* Dump one tx at D_WARNING: type, peer, bulk/request timestamps (real
+ * values only when PTLLND_DEBUG_TIMING; "-" otherwise) and status.
+ * NB DBGT_ARGS() supplies its own leading comma, which is why none
+ * follows libcfs_id2str() below. */
+void
+ptllnd_debug_tx(ptllnd_tx_t *tx) 
+{
+        CDEBUG(D_WARNING, "%s %s b "DBGT_FMT"/"DBGT_FMT
+               " r "DBGT_FMT"/"DBGT_FMT" status %d\n",
+               ptllnd_tx_typestr(tx->tx_type),
+               libcfs_id2str(tx->tx_peer->plp_id)
+               DBGT_ARGS(tx->tx_bulk_posted) DBGT_ARGS(tx->tx_bulk_done)
+               DBGT_ARGS(tx->tx_req_posted) DBGT_ARGS(tx->tx_req_done),
+               tx->tx_status);
+}
+
+/* Dump a peer's state and every tx attributable to it — queued, active,
+ * zombie and historical — at D_WARNING.  Takes and drops a peer ref. */
+void
+ptllnd_debug_peer(lnet_ni_t *ni, lnet_process_id_t id)
+{
+        ptllnd_peer_t    *plp = ptllnd_find_peer(ni, id, 0);
+        struct list_head *tmp;
+        ptllnd_ni_t      *plni = ni->ni_data;
+        ptllnd_tx_t      *tx;
+        
+        if (plp == NULL) {
+                CDEBUG(D_WARNING, "No peer %s\n", libcfs_id2str(id));
+                return;
+        }
+        
+        CDEBUG(D_WARNING, "%s %s%s [%d] "LPD64".%06d m "LPD64" q %d/%d c %d/%d(%d)\n",
+               libcfs_id2str(id), 
+               plp->plp_recvd_hello ? "H" : "_",
+               plp->plp_closing     ? "C" : "_",
+               plp->plp_refcount,
+               plp->plp_stamp / 1000000, (int)(plp->plp_stamp % 1000000),
+               plp->plp_match,
+               ptllnd_count_q(&plp->plp_txq),
+               ptllnd_count_q(&plp->plp_activeq),
+               plp->plp_credits, plp->plp_outstanding_credits, plp->plp_max_credits);
+
+        CDEBUG(D_WARNING, "txq:\n");
+        list_for_each (tmp, &plp->plp_txq) {
+                tx = list_entry(tmp, ptllnd_tx_t, tx_list);
+                
+                ptllnd_debug_tx(tx);
+        }
+
+        CDEBUG(D_WARNING, "activeq:\n");
+        list_for_each (tmp, &plp->plp_activeq) {
+                tx = list_entry(tmp, ptllnd_tx_t, tx_list);
+                
+                ptllnd_debug_tx(tx);
+        }
+
+        /* zombie and history lists are NI-wide; filter on this peer's id */
+        CDEBUG(D_WARNING, "zombies:\n");
+        list_for_each (tmp, &plni->plni_zombie_txs) {
+                tx = list_entry(tmp, ptllnd_tx_t, tx_list);
+                
+                if (tx->tx_peer->plp_id.nid == id.nid &&
+                    tx->tx_peer->plp_id.pid == id.pid)
+                        ptllnd_debug_tx(tx);
+        }
+        
+        CDEBUG(D_WARNING, "history:\n");
+        list_for_each (tmp, &plni->plni_tx_history) {
+                tx = list_entry(tmp, ptllnd_tx_t, tx_list);
+                
+                if (tx->tx_peer->plp_id.nid == id.nid &&
+                    tx->tx_peer->plp_id.pid == id.pid)
+                        ptllnd_debug_tx(tx);
+        }
+        
+        ptllnd_peer_decref(plp);
+}
+
+/* LND notify() upcall.  Only ever used to bring up a connection to a
+ * router at startup, so @alive must be set.  Creates the peer if needed
+ * and spins — warning with a doubling period — until its HELLO arrives. */
+void
+ptllnd_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive)
+{
+        lnet_process_id_t  id;
+        ptllnd_peer_t     *peer;
+        time_t             start = cfs_time_current_sec();
+        int                w = PTLLND_WARN_LONG_WAIT;
+        
+        /* This is only actually used to connect to routers at startup! */
+        if (!alive) {
+                LBUG();
+                return;
+        }
+
+        id.nid = nid;
+        id.pid = LUSTRE_SRV_LNET_PID;
+        
+        peer = ptllnd_find_peer(ni, id, 1);
+        if (peer == NULL)
+                return;
+
+        /* wait for the peer to reply */
+        while (!peer->plp_recvd_hello) {
+                if (cfs_time_current_sec() > start + w) {
+                        CWARN("Waited %ds to connect to %s\n",
+                              w, libcfs_id2str(id));
+                        w *= 2;
+                }
+                
+                ptllnd_wait(ni, w*1000);
+        }
+        
+        ptllnd_peer_decref(peer);
+}
+
+/* Byte-wise rotate-and-add checksum over @nob bytes at @ptr.  Never
+ * returns 0, since 0 means "no checksum". */
+__u32
+ptllnd_cksum (void *ptr, int nob)
+{
+        char  *p = ptr;
+        __u32  sum = 0;
+
+        for (; nob > 0; nob--, p++)
+                sum = ((sum << 1) | (sum >> 31)) + *p;
+
+        return (sum == 0) ? 1 : sum;
+}
+
+/* Allocate and initialise a tx of @type for @peer, sized for
+ * @payload_nob bytes of immediate payload (wire messages only: the RDMA
+ * pseudo-types carry no message and get msgsize 0).  Takes a reference
+ * on @peer (dropped when the tx is culled from history).  Returns NULL
+ * on allocation failure. */
+ptllnd_tx_t *
+ptllnd_new_tx(ptllnd_peer_t *peer, int type, int payload_nob)
+{
+        lnet_ni_t   *ni = peer->plp_ni;
+        ptllnd_ni_t *plni = ni->ni_data;
+        ptllnd_tx_t *tx;
+        int          msgsize;
+
+        CDEBUG(D_NET, "peer=%p type=%d payload=%d\n",peer,type,payload_nob);
+
+        switch (type) {
+        default:
+                LBUG();
+
+        case PTLLND_RDMA_WRITE:
+        case PTLLND_RDMA_READ:
+                LASSERT (payload_nob == 0);
+                msgsize = 0;
+                break;
+
+        case PTLLND_MSG_TYPE_PUT:
+        case PTLLND_MSG_TYPE_GET:
+                LASSERT (payload_nob == 0);
+                msgsize = offsetof(kptl_msg_t, ptlm_u) + 
+                          sizeof(kptl_rdma_msg_t);
+                break;
+
+        case PTLLND_MSG_TYPE_IMMEDIATE:
+                msgsize = offsetof(kptl_msg_t,
+                                   ptlm_u.immediate.kptlim_payload[payload_nob]);
+                break;
+
+        case PTLLND_MSG_TYPE_NOOP:
+                LASSERT (payload_nob == 0);
+                msgsize = offsetof(kptl_msg_t, ptlm_u);
+                break;
+
+        case PTLLND_MSG_TYPE_HELLO:
+                LASSERT (payload_nob == 0);
+                msgsize = offsetof(kptl_msg_t, ptlm_u) +
+                          sizeof(kptl_hello_msg_t);
+                break;
+        }
+
+        /* round message size up to an 8-byte boundary */
+        msgsize = (msgsize + 7) & ~7;
+        LASSERT (msgsize <= peer->plp_max_msg_size);
+
+        CDEBUG(D_NET, "msgsize=%d\n",msgsize);
+
+        /* tx and its message are allocated contiguously */
+        LIBCFS_ALLOC(tx, offsetof(ptllnd_tx_t, tx_msg) + msgsize);
+
+        if (tx == NULL) {
+                CERROR("Can't allocate msg type %d for %s\n",
+                       type, libcfs_id2str(peer->plp_id));
+                return NULL;
+        }
+
+        CFS_INIT_LIST_HEAD(&tx->tx_list);
+        tx->tx_peer = peer;
+        tx->tx_type = type;
+        tx->tx_lnetmsg = tx->tx_lnetreplymsg = NULL;
+        tx->tx_niov = 0;
+        tx->tx_iov = NULL;
+        tx->tx_reqmdh = PTL_INVALID_HANDLE;
+        tx->tx_bulkmdh = PTL_INVALID_HANDLE;
+        tx->tx_msgsize = msgsize;
+        tx->tx_completing = 0;
+        tx->tx_status = 0;
+
+        PTLLND_DBGT_INIT(tx->tx_bulk_posted);
+        PTLLND_DBGT_INIT(tx->tx_bulk_done);
+        PTLLND_DBGT_INIT(tx->tx_req_posted);
+        PTLLND_DBGT_INIT(tx->tx_req_done);
+
+        /* RDMA pseudo-tx's (msgsize 0) have no wire message to set up */
+        if (msgsize != 0) {
+                tx->tx_msg.ptlm_magic = PTLLND_MSG_MAGIC;
+                tx->tx_msg.ptlm_version = PTLLND_MSG_VERSION;
+                tx->tx_msg.ptlm_type = type;
+                tx->tx_msg.ptlm_credits = 0;
+                tx->tx_msg.ptlm_nob = msgsize;
+                tx->tx_msg.ptlm_cksum = 0;
+                tx->tx_msg.ptlm_srcnid = ni->ni_nid;
+                tx->tx_msg.ptlm_srcstamp = plni->plni_stamp;
+                tx->tx_msg.ptlm_dstnid = peer->plp_id.nid;
+                tx->tx_msg.ptlm_dststamp = peer->plp_stamp;
+                tx->tx_msg.ptlm_srcpid = the_lnet.ln_pid;
+                tx->tx_msg.ptlm_dstpid = peer->plp_id.pid;
+        }
+
+        ptllnd_peer_addref(peer);
+        plni->plni_ntxs++;
+
+        CDEBUG(D_NET, "tx=%p\n",tx);
+
+        return tx;
+}
+
+/* Unlink @*mdh if it is still valid.  Without Lustre unlink semantics a
+ * successful PtlMDUnlink() generates no event, so we return at once;
+ * otherwise (PTL_MD_IN_USE, or with Lustre semantics) keep polling until
+ * the event handler invalidates the handle, warning with a doubling
+ * period. */
+void
+ptllnd_abort_tx(ptllnd_tx_t *tx, ptl_handle_md_t *mdh)
+{
+        ptllnd_peer_t   *peer = tx->tx_peer;
+        lnet_ni_t       *ni = peer->plp_ni;
+        int              rc;
+        time_t           start = cfs_time_current_sec();
+        int              w = PTLLND_WARN_LONG_WAIT;
+
+        while (!PtlHandleIsEqual(*mdh, PTL_INVALID_HANDLE)) {
+                rc = PtlMDUnlink(*mdh);
+#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS
+                if (rc == PTL_OK) /* unlink successful => no unlinked event */
+                        return;
+                LASSERT (rc == PTL_MD_IN_USE);
+#endif
+                if (cfs_time_current_sec() > start + w) {
+                        CWARN("Waited %ds to abort tx to %s\n",
+                              w, libcfs_id2str(peer->plp_id));
+                        w *= 2;
+                }
+                /* Wait for ptllnd_tx_event() to invalidate */
+                ptllnd_wait(ni, w*1000);
+        }
+}
+
+/* Trim the completed-tx history down to plni_max_tx_history entries,
+ * freeing the oldest tx's and dropping the peer reference each one
+ * still holds. */
+void
+ptllnd_cull_tx_history(ptllnd_ni_t *plni)
+{
+        int max = plni->plni_max_tx_history;
+
+        while (plni->plni_ntx_history > max) {
+                ptllnd_tx_t *tx = list_entry(plni->plni_tx_history.next, 
+                                             ptllnd_tx_t, tx_list);
+                list_del(&tx->tx_list);
+
+                ptllnd_peer_decref(tx->tx_peer);
+
+                /* tx and its message were allocated contiguously */
+                LIBCFS_FREE(tx, offsetof(ptllnd_tx_t, tx_msg) + tx->tx_msgsize);
+
+                LASSERT (plni->plni_ntxs > 0);
+                plni->plni_ntxs--;
+                plni->plni_ntx_history--;
+        }
+}
+
+/* Complete @tx: detach it from whatever list it is on, abort any live
+ * MDs, free its iovec, finalize the associated lnet message(s) and park
+ * it on the tx history (culled later, which also drops its peer ref).
+ * On error the peer is closed too. */
+void
+ptllnd_tx_done(ptllnd_tx_t *tx)
+{
+        ptllnd_peer_t   *peer = tx->tx_peer;
+        lnet_ni_t       *ni = peer->plp_ni;
+        ptllnd_ni_t     *plni = ni->ni_data;
+
+        /* CAVEAT EMPTOR: If this tx is being aborted, I'll continue to get
+         * events for this tx until it's unlinked.  So I set tx_completing to
+         * flag the tx is getting handled */
+
+        if (tx->tx_completing)
+                return;
+
+        tx->tx_completing = 1;
+
+        if (!list_empty(&tx->tx_list))
+                list_del_init(&tx->tx_list);
+
+        if (tx->tx_status != 0) {
+                CERROR("Completing tx with error\n");
+                ptllnd_debug_tx(tx);
+                ptllnd_close_peer(peer, tx->tx_status);
+        }
+        
+        ptllnd_abort_tx(tx, &tx->tx_reqmdh);
+        ptllnd_abort_tx(tx, &tx->tx_bulkmdh);
+
+        if (tx->tx_niov > 0) {
+                LIBCFS_FREE(tx->tx_iov, tx->tx_niov * sizeof(*tx->tx_iov));
+                tx->tx_niov = 0;
+        }
+
+        if (tx->tx_lnetreplymsg != NULL) {
+                LASSERT (tx->tx_type == PTLLND_MSG_TYPE_GET);
+                LASSERT (tx->tx_lnetmsg != NULL);
+                /* Simulate GET success always  */
+                lnet_finalize(ni, tx->tx_lnetmsg, 0);
+                CDEBUG(D_NET, "lnet_finalize(tx_lnetreplymsg=%p)\n",tx->tx_lnetreplymsg);
+                lnet_finalize(ni, tx->tx_lnetreplymsg, tx->tx_status);
+        } else if (tx->tx_lnetmsg != NULL) {
+                lnet_finalize(ni, tx->tx_lnetmsg, tx->tx_status);
+        }
+
+        plni->plni_ntx_history++;
+        list_add_tail(&tx->tx_list, &plni->plni_tx_history);
+        
+        ptllnd_cull_tx_history(plni);
+}
+
+/* Build tx->tx_iov: a ptl_md_iovec_t array describing @len bytes of
+ * @iov starting at @offset.  Returns 0 on success, -ENOMEM on
+ * allocation failure.  The outer loop retries with the exact fragment
+ * count when the first (over-sized) allocation wasn't fully used. */
+int
+ptllnd_set_txiov(ptllnd_tx_t *tx,
+                 unsigned int niov, struct iovec *iov,
+                 unsigned int offset, unsigned int len)
+{
+        ptl_md_iovec_t *piov;
+        int             npiov;
+
+        if (len == 0) {
+                tx->tx_niov = 0;
+                return 0;
+        }
+
+        CDEBUG(D_NET, "niov  =%d\n",niov);
+        CDEBUG(D_NET, "offset=%d\n",offset);
+        CDEBUG(D_NET, "len   =%d\n",len);
+
+
+        /*
+         * Remove iovec's at the beginning that
+         * are skipped because of the offset.
+         * Adjust the offset accordingly
+         */
+        for (;;) {
+                LASSERT (niov > 0);
+                if (offset < iov->iov_len)
+                        break;
+                offset -= iov->iov_len;
+                niov--;
+                iov++;
+        }
+
+        CDEBUG(D_NET, "niov  =%d (after)\n",niov);
+        CDEBUG(D_NET, "offset=%d (after)\n",offset);
+        CDEBUG(D_NET, "len   =%d (after)\n",len);
+
+        for (;;) {
+                int temp_offset = offset;
+                int resid = len;
+                LIBCFS_ALLOC(piov, niov * sizeof(*piov));
+                if (piov == NULL)
+                        return -ENOMEM;
+
+                for (npiov = 0;; npiov++) {
+                        CDEBUG(D_NET, "npiov=%d\n",npiov);
+                        CDEBUG(D_NET, "offset=%d\n",temp_offset);
+                        CDEBUG(D_NET, "len=%d\n",resid);
+                        CDEBUG(D_NET, "iov[npiov].iov_len=%d\n",iov[npiov].iov_len);
+
+                        LASSERT (npiov < niov);
+                        /* NOTE(review): 'iov->iov_len' below looks like it
+                         * means 'iov[npiov].iov_len'; harmless since
+                         * temp_offset is 0 for all but the first fragment —
+                         * confirm. */
+                        LASSERT (iov->iov_len >= temp_offset);
+
+                        piov[npiov].iov_base = iov[npiov].iov_base + temp_offset;
+                        piov[npiov].iov_len = iov[npiov].iov_len - temp_offset;
+                        
+                        if (piov[npiov].iov_len >= resid) {
+                                /* last fragment: clamp to remaining length */
+                                piov[npiov].iov_len = resid;
+                                npiov++;
+                                break;
+                        }
+                        resid -= piov[npiov].iov_len;
+                        temp_offset = 0;
+                }
+
+                if (npiov == niov) {
+                        tx->tx_niov = niov;
+                        tx->tx_iov = piov;
+                        CDEBUG(D_NET, "tx->tx_iov=%p\n",tx->tx_iov);
+                        CDEBUG(D_NET, "tx->tx_niov=%d\n",tx->tx_niov);
+                        return 0;
+                }
+
+                /* Dang! The piov I allocated was too big and it's a drag to
+                 * have to maintain separate 'allocated' and 'used' sizes, so
+                 * I'll just do it again; NB this doesn't happen normally... */
+                LIBCFS_FREE(piov, niov * sizeof(*piov));
+                niov = npiov;
+        }
+}
+
+/* Point @md at the buffer(s) described by @tx's iovec: a plain
+ * pointer/length for zero or one fragments, PTL_MD_IOVEC otherwise. */
+void
+ptllnd_set_md_buffer(ptl_md_t *md, ptllnd_tx_t *tx)
+{
+        LASSERT ((md->options & PTL_MD_IOVEC) == 0);
+
+        if (tx->tx_niov == 0) {
+                md->start = NULL;
+                md->length = 0;
+        } else if (tx->tx_niov == 1) {
+                md->start = tx->tx_iov[0].iov_base;
+                md->length = tx->tx_iov[0].iov_len;
+        } else {
+                md->options |= PTL_MD_IOVEC;
+                md->start = tx->tx_iov;
+                md->length = tx->tx_niov;
+        }
+}
+
+/* Post receive buffer @buf: attach a wildcard match entry on the ptllnd
+ * portal plus an MD that accepts max_size-limited PUTs from any peer.
+ * Returns 0 on success or -ENOMEM (any Portals failure is mapped to
+ * -ENOMEM), undoing the ME on MD attach failure. */
+int
+ptllnd_post_buffer(ptllnd_buffer_t *buf)
+{
+        lnet_ni_t        *ni = buf->plb_ni;
+        ptllnd_ni_t      *plni = ni->ni_data;
+        ptl_process_id_t  anyid = {
+                .nid       = PTL_NID_ANY,
+                .pid       = PTL_PID_ANY};
+        ptl_md_t          md = {
+                .start     = buf->plb_buffer,
+                .length    = plni->plni_buffer_size,
+                .threshold = PTL_MD_THRESH_INF,
+                .max_size  = plni->plni_max_msg_size,
+                .options   = (PTLLND_MD_OPTIONS |
+                              PTL_MD_OP_PUT | PTL_MD_MAX_SIZE | 
+                              PTL_MD_LOCAL_ALIGN8),
+                .user_ptr  = ptllnd_obj2eventarg(buf, PTLLND_EVENTARG_TYPE_BUF),
+                .eq_handle = plni->plni_eqh};
+        ptl_handle_me_t meh;
+        int             rc;
+
+        LASSERT (!buf->plb_posted);
+
+        rc = PtlMEAttach(plni->plni_nih, plni->plni_portal,
+                         anyid, LNET_MSG_MATCHBITS, 0,
+                         PTL_UNLINK, PTL_INS_AFTER, &meh);
+        if (rc != PTL_OK) {
+                CERROR("PtlMEAttach failed: %d\n", rc);
+                return -ENOMEM;
+        }
+
+        /* mark posted before attaching: events may arrive immediately */
+        buf->plb_posted = 1;
+        plni->plni_nposted_buffers++;
+
+        rc = PtlMDAttach(meh, md, LNET_UNLINK, &buf->plb_md);
+        if (rc == PTL_OK)
+                return 0;
+
+        CERROR("PtlMDAttach failed: %d\n", rc);
+
+        buf->plb_posted = 0;
+        plni->plni_nposted_buffers--;
+
+        rc = PtlMEUnlink(meh);
+        LASSERT (rc == PTL_OK);
+
+        return -ENOMEM;
+}
+
+/* Push queued messages to 'peer', consuming send credits.
+ *
+ * If the peer has accumulated outstanding (returnable) credits at or
+ * above the high-water mark and nothing is queued, a NOOP tx is queued
+ * purely to carry the credits back.  The txq is then drained while send
+ * credits remain; the last send credit is held in reserve unless this
+ * message itself returns credits (plp_outstanding_credits != 0), so the
+ * peer can always get a credit-return through. */
+void
+ptllnd_check_sends(ptllnd_peer_t *peer)
+{
+        lnet_ni_t      *ni = peer->plp_ni;
+        ptllnd_ni_t    *plni = ni->ni_data;
+        ptllnd_tx_t    *tx;
+        ptl_md_t        md;
+        ptl_handle_md_t mdh;
+        int             rc;
+
+        CDEBUG(D_NET, "plp_outstanding_credits=%d\n",peer->plp_outstanding_credits);
+
+        /* Nothing queued but many credits owed: queue a NOOP to return them */
+        if (list_empty(&peer->plp_txq) &&
+            peer->plp_outstanding_credits >=
+            PTLLND_CREDIT_HIGHWATER(plni)) {
+
+                tx = ptllnd_new_tx(peer, PTLLND_MSG_TYPE_NOOP, 0);
+                CDEBUG(D_NET, "NOOP tx=%p\n",tx);
+                if (tx == NULL) {
+                        CERROR("Can't return credits to %s\n",
+                               libcfs_id2str(peer->plp_id));
+                } else {
+                        list_add_tail(&tx->tx_list, &peer->plp_txq);
+                }
+        }
+
+        while (!list_empty(&peer->plp_txq)) {
+                tx = list_entry(peer->plp_txq.next, ptllnd_tx_t, tx_list);
+
+                CDEBUG(D_NET, "Looking at TX=%p\n",tx);
+                CDEBUG(D_NET, "plp_credits=%d\n",peer->plp_credits);
+                CDEBUG(D_NET, "plp_outstanding_credits=%d\n",peer->plp_outstanding_credits);
+
+                LASSERT (tx->tx_msgsize > 0);
+
+                LASSERT (peer->plp_outstanding_credits >= 0);
+                LASSERT (peer->plp_outstanding_credits <=
+                         plni->plni_peer_credits);
+                LASSERT (peer->plp_credits >= 0);
+                LASSERT (peer->plp_credits <= peer->plp_max_credits);
+
+                if (peer->plp_credits == 0)     /* no credits */
+                        break;
+
+                if (peer->plp_credits == 1 &&   /* last credit reserved for */
+                    peer->plp_outstanding_credits == 0) /* returning credits */
+                        break;
+
+                /* Committed to sending: move tx onto the active queue */
+                list_del(&tx->tx_list);
+                list_add_tail(&tx->tx_list, &peer->plp_activeq);
+
+                CDEBUG(D_NET, "Sending at TX=%p type=%s (%d)\n",tx,
+                        ptllnd_msgtype2str(tx->tx_type),tx->tx_type);
+
+                /* Drop a NOOP that has become pointless: either real traffic
+                 * is now queued behind it, or not enough credits are owed */
+                if (tx->tx_type == PTLLND_MSG_TYPE_NOOP &&
+                    (!list_empty(&peer->plp_txq) ||
+                     peer->plp_outstanding_credits <
+                     PTLLND_CREDIT_HIGHWATER(plni))) {
+                        /* redundant NOOP */
+                        ptllnd_tx_done(tx);
+                        continue;
+                }
+
+                /* Set stamp at the last minute; on a new peer, I don't know it
+                 * until I receive the HELLO back */
+                tx->tx_msg.ptlm_dststamp = peer->plp_stamp;
+
+                CDEBUG(D_NET, "Returning %d to peer\n",peer->plp_outstanding_credits);
+
+                /*
+                 * Return all the credits we have
+                 */
+                tx->tx_msg.ptlm_credits = peer->plp_outstanding_credits;
+                peer->plp_outstanding_credits = 0;
+
+                /*
+                 * One less credit
+                 */
+                peer->plp_credits--;
+
+                /* Checksum covers the fixed header only (up to ptlm_u) */
+                if (plni->plni_checksum)
+                        tx->tx_msg.ptlm_cksum = 
+                                ptllnd_cksum(&tx->tx_msg,
+                                             offsetof(kptl_msg_t, ptlm_u));
+
+                md.user_ptr = ptllnd_obj2eventarg(tx, PTLLND_EVENTARG_TYPE_TX);
+                md.eq_handle = plni->plni_eqh;
+                md.threshold = 1;
+                md.options = PTLLND_MD_OPTIONS;
+                md.start = &tx->tx_msg;
+                md.length = tx->tx_msgsize;
+
+                rc = PtlMDBind(plni->plni_nih, md, LNET_UNLINK, &mdh);
+                if (rc != PTL_OK) {
+                        CERROR("PtlMDBind for %s failed: %d\n",
+                               libcfs_id2str(peer->plp_id), rc);
+                        tx->tx_status = -EIO;
+                        ptllnd_tx_done(tx);
+                        break;
+                }
+
+                tx->tx_reqmdh = mdh;
+                PTLLND_DBGT_STAMP(tx->tx_req_posted);
+
+                rc = PtlPut(mdh, PTL_NOACK_REQ, peer->plp_ptlid,
+                            plni->plni_portal, 0, LNET_MSG_MATCHBITS, 0, 0);
+                if (rc != PTL_OK) {
+                        CERROR("PtlPut for %s failed: %d\n",
+                               libcfs_id2str(peer->plp_id), rc);
+                        tx->tx_status = -EIO;
+                        ptllnd_tx_done(tx);
+                        break;
+                }
+        }
+}
+
+/* Set up the passive side of an RDMA for a GET or PUT of 'msg'.
+ *
+ * Exposes the local buffer described by (niov, iov, offset, len) via a
+ * fresh ME/MD at new matchbits, then posts a small request message
+ * carrying msg's header and those matchbits; the peer completes the
+ * transfer actively (writing for GET, reading for PUT).  Blocks until
+ * the peer's HELLO has been received, since plp_match/plp_stamp are
+ * only valid after that.
+ *
+ * Returns 0 on success or a -ve errno; on failure the tx is finalized
+ * via ptllnd_tx_done(). */
+int
+ptllnd_passive_rdma(ptllnd_peer_t *peer, int type, lnet_msg_t *msg,
+                    unsigned int niov, struct iovec *iov,
+                    unsigned int offset, unsigned int len)
+{
+        lnet_ni_t      *ni = peer->plp_ni;
+        ptllnd_ni_t    *plni = ni->ni_data;
+        ptllnd_tx_t    *tx = ptllnd_new_tx(peer, type, 0);
+        __u64           matchbits;
+        ptl_md_t        md;
+        ptl_handle_md_t mdh;
+        ptl_handle_me_t meh;
+        int             rc;
+        int             rc2;
+        time_t          start;
+        int             w;
+
+        CDEBUG(D_NET, "niov=%d offset=%d len=%d\n",niov,offset,len);
+
+        LASSERT (type == PTLLND_MSG_TYPE_GET ||
+                 type == PTLLND_MSG_TYPE_PUT);
+
+        if (tx == NULL) {
+                CERROR("Can't allocate %s tx for %s\n",
+                       type == PTLLND_MSG_TYPE_GET ? "GET" : "PUT/REPLY",
+                       libcfs_id2str(peer->plp_id));
+                return -ENOMEM;
+        }
+
+        rc = ptllnd_set_txiov(tx, niov, iov, offset, len);
+        if (rc != 0) {
+                CERROR ("Can't allocate iov %d for %s\n",
+                        niov, libcfs_id2str(peer->plp_id));
+                rc = -ENOMEM;
+                goto failed;
+        }
+
+        md.user_ptr = ptllnd_obj2eventarg(tx, PTLLND_EVENTARG_TYPE_TX);
+        md.eq_handle = plni->plni_eqh;
+        md.threshold = 1;
+        md.max_size = 0;
+        md.options = PTLLND_MD_OPTIONS;
+        /* GET: peer PUTs the reply data into us; PUT: peer GETs from us */
+        if(type == PTLLND_MSG_TYPE_GET)
+                md.options |= PTL_MD_OP_PUT | PTL_MD_ACK_DISABLE;
+        else
+                md.options |= PTL_MD_OP_GET;
+        ptllnd_set_md_buffer(&md, tx);
+
+        start = cfs_time_current_sec();
+        w = PTLLND_WARN_LONG_WAIT;
+
+        /* Warn with doubling interval while waiting for the HELLO */
+        while (!peer->plp_recvd_hello) {        /* wait to validate plp_match */
+                if (peer->plp_closing) {
+                        rc = -EIO;
+                        goto failed;
+                }
+                if (cfs_time_current_sec() > start + w) {
+                        CWARN("Waited %ds to connect to %s\n",
+                              w, libcfs_id2str(peer->plp_id));
+                        w *= 2;
+                }
+                ptllnd_wait(ni, w*1000);
+        }
+
+        /* Never hand out matchbits in the reserved range */
+        if (peer->plp_match < PTL_RESERVED_MATCHBITS)
+                peer->plp_match = PTL_RESERVED_MATCHBITS;
+        matchbits = peer->plp_match++;
+        CDEBUG(D_NET, "matchbits " LPX64 " %s\n", matchbits,
+               ptllnd_ptlid2str(peer->plp_ptlid));
+
+        rc = PtlMEAttach(plni->plni_nih, plni->plni_portal, peer->plp_ptlid,
+                         matchbits, 0, PTL_UNLINK, PTL_INS_BEFORE, &meh);
+        if (rc != PTL_OK) {
+                CERROR("PtlMEAttach for %s failed: %d\n",
+                       libcfs_id2str(peer->plp_id), rc);
+                rc = -EIO;
+                goto failed;
+        }
+
+        CDEBUG(D_NET, "md.start=%p\n",md.start);
+        CDEBUG(D_NET, "md.length=%d\n",md.length);
+        CDEBUG(D_NET, "md.threshold=%d\n",md.threshold);
+        CDEBUG(D_NET, "md.max_size=%d\n",md.max_size);
+        CDEBUG(D_NET, "md.options=0x%x\n",md.options);
+        CDEBUG(D_NET, "md.user_ptr=%p\n",md.user_ptr);
+
+        PTLLND_DBGT_STAMP(tx->tx_bulk_posted);
+
+        rc = PtlMDAttach(meh, md, LNET_UNLINK, &mdh);
+        if (rc != PTL_OK) {
+                CERROR("PtlMDAttach for %s failed: %d\n",
+                       libcfs_id2str(peer->plp_id), rc);
+                /* back out the ME we just attached */
+                rc2 = PtlMEUnlink(meh);
+                LASSERT (rc2 == PTL_OK);
+                rc = -EIO;
+                goto failed;
+        }
+        tx->tx_bulkmdh = mdh;
+
+        /*
+         * We need to set the stamp here because it
+         * we could have received a HELLO above that set
+         * peer->plp_stamp
+         */
+        tx->tx_msg.ptlm_dststamp = peer->plp_stamp;
+
+        tx->tx_msg.ptlm_u.rdma.kptlrm_hdr = msg->msg_hdr;
+        tx->tx_msg.ptlm_u.rdma.kptlrm_matchbits = matchbits;
+
+        /* GET needs a REPLY message created up front for lnet to finalize */
+        if (type == PTLLND_MSG_TYPE_GET) {
+                tx->tx_lnetreplymsg = lnet_create_reply_msg(ni, msg);
+                if (tx->tx_lnetreplymsg == NULL) {
+                        CERROR("Can't create reply for GET to %s\n",
+                               libcfs_id2str(msg->msg_target));
+                        rc = -ENOMEM;
+                        goto failed;
+                }
+        }
+
+        tx->tx_lnetmsg = msg;
+        ptllnd_post_tx(tx);
+        return 0;
+
+ failed:
+        ptllnd_tx_done(tx);
+        return rc;
+}
+
+/* Perform the active side of an RDMA against the peer's exposed buffer
+ * at 'matchbits': PtlGet for PTLLND_RDMA_READ, PtlPut for
+ * PTLLND_RDMA_WRITE.  'msg' is the lnet message to finalize on
+ * completion; for a WRITE, msg == NULL signals failure to the peer via
+ * hdr_data (PTLLND_RDMA_FAIL vs PTLLND_RDMA_OK).
+ *
+ * Returns 0 on success or a -ve errno; on failure ptllnd_tx_done()
+ * finalizes the tx (and, per the trailing comment, closes the peer). */
+int
+ptllnd_active_rdma(ptllnd_peer_t *peer, int type,
+                   lnet_msg_t *msg, __u64 matchbits,
+                   unsigned int niov, struct iovec *iov,
+                   unsigned int offset, unsigned int len)
+{
+        lnet_ni_t       *ni = peer->plp_ni;
+        ptllnd_ni_t     *plni = ni->ni_data;
+        ptllnd_tx_t     *tx = ptllnd_new_tx(peer, type, 0);
+        ptl_md_t         md;
+        ptl_handle_md_t  mdh;
+        int              rc;
+
+        LASSERT (type == PTLLND_RDMA_READ ||
+                 type == PTLLND_RDMA_WRITE);
+
+        if (tx == NULL) {
+                CERROR("Can't allocate tx for RDMA %s with %s\n",
+                       (type == PTLLND_RDMA_WRITE) ? "write" : "read",
+                       libcfs_id2str(peer->plp_id));
+                ptllnd_close_peer(peer, -ENOMEM);
+                return -ENOMEM;
+        }
+
+        rc = ptllnd_set_txiov(tx, niov, iov, offset, len);
+        if (rc != 0) {
+                CERROR ("Can't allocate iov %d for %s\n",
+                        niov, libcfs_id2str(peer->plp_id));
+                rc = -ENOMEM;
+                goto failed;
+        }
+
+        md.user_ptr = ptllnd_obj2eventarg(tx, PTLLND_EVENTARG_TYPE_TX);
+        md.eq_handle = plni->plni_eqh;
+        md.max_size = 0;
+        md.options = PTLLND_MD_OPTIONS;
+        /* READ uses threshold 2 — presumably one event for the outgoing
+         * GET and one for the incoming reply data; confirm against the
+         * Portals event model */
+        md.threshold = (type == PTLLND_RDMA_READ) ? 2 : 1;
+
+        ptllnd_set_md_buffer(&md, tx);
+
+        rc = PtlMDBind(plni->plni_nih, md, LNET_UNLINK, &mdh);
+        if (rc != PTL_OK) {
+                CERROR("PtlMDBind for %s failed: %d\n",
+                       libcfs_id2str(peer->plp_id), rc);
+                rc = -EIO;
+                goto failed;
+        }
+
+        tx->tx_bulkmdh = mdh;
+        tx->tx_lnetmsg = msg;
+
+        list_add_tail(&tx->tx_list, &peer->plp_activeq);
+        PTLLND_DBGT_STAMP(tx->tx_bulk_posted);
+
+        if (type == PTLLND_RDMA_READ)
+                rc = PtlGet(mdh, peer->plp_ptlid,
+                            plni->plni_portal, 0, matchbits, 0);
+        else
+                rc = PtlPut(mdh, PTL_NOACK_REQ, peer->plp_ptlid,
+                            plni->plni_portal, 0, matchbits, 0, 
+                            (msg == NULL) ? PTLLND_RDMA_FAIL : PTLLND_RDMA_OK);
+
+        if (rc == PTL_OK)
+                return 0;
+
+        CERROR("Can't initiate RDMA with %s: %d\n",
+               libcfs_id2str(peer->plp_id), rc);
+
+        /* don't let tx_done finalize msg twice; lnet still owns it */
+        tx->tx_lnetmsg = NULL;
+ failed:
+        tx->tx_status = rc;
+        ptllnd_tx_done(tx);    /* this will close peer */
+        return rc;
+}
+
+/* lnd_send entry point: transmit 'msg' to its target.
+ *
+ * Small payloads (and ACKs, and GETs via a router) are copied inline
+ * into a single IMMEDIATE message; larger GET/PUT/REPLY payloads go
+ * through ptllnd_passive_rdma() so the peer transfers the bulk data
+ * directly.  Only iovec payloads are supported (no kiov), and only
+ * kernel-space peers.  Returns 0 or a -ve errno. */
+int
+ptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *msg)
+{
+        ptllnd_ni_t    *plni = ni->ni_data;
+        ptllnd_peer_t  *plp;
+        ptllnd_tx_t    *tx;
+        int             nob;
+        int             rc;
+
+        LASSERT (!msg->msg_routing);
+        LASSERT (msg->msg_kiov == NULL);
+
+        LASSERT (msg->msg_niov <= PTL_MD_MAX_IOV); /* !!! */
+
+        CDEBUG(D_NET, "%s [%d]+%d,%d -> %s%s\n", 
+               lnet_msgtyp2str(msg->msg_type),
+               msg->msg_niov, msg->msg_offset, msg->msg_len,
+               libcfs_nid2str(msg->msg_target.nid),
+               msg->msg_target_is_router ? "(rtr)" : "");
+
+        /* userspace pids are flagged; this LND only talks to kernel peers */
+        if ((msg->msg_target.pid & LNET_PID_USERFLAG) != 0) {
+                CERROR("Can't send to non-kernel peer %s\n",
+                       libcfs_id2str(msg->msg_target));
+                return -EHOSTUNREACH;
+        }
+        
+        plp = ptllnd_find_peer(ni, msg->msg_target, 1);
+        if (plp == NULL)
+                return -ENOMEM;
+
+        switch (msg->msg_type) {
+        default:
+                LBUG();
+
+        case LNET_MSG_ACK:
+                CDEBUG(D_NET, "LNET_MSG_ACK\n");
+
+                LASSERT (msg->msg_len == 0);
+                break;                          /* send IMMEDIATE */
+
+        case LNET_MSG_GET:
+                CDEBUG(D_NET, "LNET_MSG_GET nob=%d\n",msg->msg_md->md_length);
+
+                if (msg->msg_target_is_router)
+                        break;                  /* send IMMEDIATE */
+
+                /* size the reply as if it came back inline */
+                nob = msg->msg_md->md_length;
+                nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[nob]);
+                if (nob <= plni->plni_max_msg_size)
+                        break;
+
+                LASSERT ((msg->msg_md->md_options & LNET_MD_KIOV) == 0);
+                rc = ptllnd_passive_rdma(plp, PTLLND_MSG_TYPE_GET, msg,
+                                         msg->msg_md->md_niov,
+                                         msg->msg_md->md_iov.iov,
+                                         0, msg->msg_md->md_length);
+                ptllnd_peer_decref(plp);
+                return rc;
+
+        case LNET_MSG_REPLY:
+        case LNET_MSG_PUT:
+                CDEBUG(D_NET, "LNET_MSG_PUT nob=%d\n",msg->msg_len);
+                nob = msg->msg_len;
+                nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[nob]);
+                CDEBUG(D_NET, "msg_size=%d max=%d\n",msg->msg_len,plp->plp_max_msg_size);
+                if (nob <= plp->plp_max_msg_size)
+                        break;                  /* send IMMEDIATE */
+
+                rc = ptllnd_passive_rdma(plp, PTLLND_MSG_TYPE_PUT, msg,
+                                         msg->msg_niov, msg->msg_iov,
+                                         msg->msg_offset, msg->msg_len);
+                ptllnd_peer_decref(plp);
+                return rc;
+        }
+
+        /* send IMMEDIATE
+         * NB copy the payload so we don't have to do a fragmented send */
+
+        CDEBUG(D_NET, "IMMEDIATE len=%d\n", msg->msg_len);
+        tx = ptllnd_new_tx(plp, PTLLND_MSG_TYPE_IMMEDIATE, msg->msg_len);
+        if (tx == NULL) {
+                CERROR("Can't allocate tx for lnet type %d to %s\n",
+                       msg->msg_type, libcfs_id2str(msg->msg_target));
+                ptllnd_peer_decref(plp);
+                return -ENOMEM;
+        }
+
+        lnet_copy_iov2flat(tx->tx_msgsize, &tx->tx_msg,
+                           offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload),
+                           msg->msg_niov, msg->msg_iov, msg->msg_offset,
+                           msg->msg_len);
+        tx->tx_msg.ptlm_u.immediate.kptlim_hdr = msg->msg_hdr;
+
+        tx->tx_lnetmsg = msg;
+        ptllnd_post_tx(tx);
+        ptllnd_peer_decref(plp);
+        return 0;
+}
+
+/* Finish handling a receive: the rx buffer slot becomes a credit owed
+ * back to the peer, so bump plp_outstanding_credits and give
+ * ptllnd_check_sends() a chance to return it (possibly via a NOOP).
+ * Also drops the NI's count of in-flight rxs. */
+void
+ptllnd_rx_done(ptllnd_rx_t *rx)
+{
+        ptllnd_peer_t *plp = rx->rx_peer;
+        lnet_ni_t     *ni = plp->plp_ni;
+        ptllnd_ni_t   *plni = ni->ni_data;
+
+        CDEBUG(D_NET, "rx=%p\n", rx);
+
+        plp->plp_outstanding_credits++;
+        ptllnd_check_sends(rx->rx_peer);
+
+        LASSERT (plni->plni_nrxs > 0);
+        plni->plni_nrxs--;
+}
+
+/* lnd_eager_recv entry point: never expected to be called for this LND
+ * (eager recv only applies when recvs can block on router buffers), so
+ * hitting it is a bug. */
+int
+ptllnd_eager_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg,
+                  void **new_privatep)
+{
+        /* Shouldn't get here; recvs only block for router buffers */
+        LBUG();
+        return 0;
+}
+
+/* lnd_recv entry point: deliver an incoming message ('private' is the
+ * ptllnd_rx_t set up by ptllnd_parse_request) into 'msg'.
+ *
+ * IMMEDIATE payloads are copied straight out of the rx buffer and
+ * finalized here; PUT triggers an active RDMA READ of the sender's
+ * exposed buffer; GET triggers an active RDMA WRITE of the reply data
+ * (or a zero-length failure WRITE when msg == NULL, i.e. lnet matched
+ * nothing).  The rx is always retired via ptllnd_rx_done().
+ * Returns 0 or a -ve errno. */
+int
+ptllnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg,
+            int delayed, unsigned int niov,
+            struct iovec *iov, lnet_kiov_t *kiov,
+            unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+        ptllnd_rx_t    *rx = private;
+        int             rc = 0;
+        int             nob;
+
+        LASSERT (kiov == NULL);
+        LASSERT (niov <= PTL_MD_MAX_IOV);       /* !!! */
+
+        switch (rx->rx_msg->ptlm_type) {
+        default:
+                LBUG();
+
+        case PTLLND_MSG_TYPE_IMMEDIATE:
+                /* payload must fit inside what was actually received */
+                nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[mlen]);
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_IMMEDIATE nob=%d\n",nob);
+                if (nob > rx->rx_nob) {
+                        CERROR("Immediate message from %s too big: %d(%d)\n",
+                               libcfs_id2str(rx->rx_peer->plp_id),
+                               nob, rx->rx_nob);
+                        rc = -EPROTO;
+                        break;
+                }
+                lnet_copy_flat2iov(niov, iov, offset,
+                                   rx->rx_nob, rx->rx_msg,
+                                   offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload),
+                                   mlen);
+                lnet_finalize(ni, msg, 0);
+                break;
+
+        case PTLLND_MSG_TYPE_PUT:
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_PUT offset=%d mlen=%d\n",offset,mlen);
+                rc = ptllnd_active_rdma(rx->rx_peer, PTLLND_RDMA_READ, msg,
+                                        rx->rx_msg->ptlm_u.rdma.kptlrm_matchbits,
+                                        niov, iov, offset, mlen);
+                break;
+
+        case PTLLND_MSG_TYPE_GET:
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_GET\n");
+                if (msg != NULL)
+                        rc = ptllnd_active_rdma(rx->rx_peer, PTLLND_RDMA_WRITE, msg,
+                                                rx->rx_msg->ptlm_u.rdma.kptlrm_matchbits,
+                                                msg->msg_niov, msg->msg_iov,
+                                                msg->msg_offset, msg->msg_len);
+                else
+                        /* no matching MD: complete the peer's GET with failure */
+                        rc = ptllnd_active_rdma(rx->rx_peer, PTLLND_RDMA_WRITE, NULL,
+                                                rx->rx_msg->ptlm_u.rdma.kptlrm_matchbits,
+                                                0, NULL, 0, 0);
+                break;
+        }
+
+        ptllnd_rx_done(rx);
+        return rc;
+}
+
+/* Debug aid: if the NI was configured with plni_abort_on_nak, abort
+ * the process on a protocol NAK / version mismatch so the failure is
+ * caught immediately instead of being logged and ignored. */
+void
+ptllnd_abort_on_nak(lnet_ni_t *ni)
+{
+        ptllnd_ni_t      *plni = ni->ni_data;
+
+        if (plni->plni_abort_on_nak)
+                abort();
+}
+
+/* Validate and dispatch a raw message received from 'initiator'.
+ *
+ * Performs, in order: minimum-length / magic / version checks (with
+ * byte-swapping if the peer has opposite endianness), checksum
+ * verification, source/destination id and stamp validation, per-type
+ * length checks, peer lookup (creating one only for HELLO), HELLO
+ * state handling, and credit accounting.  Valid PUT/GET/IMMEDIATE
+ * messages are handed to lnet_parse() with a stack-allocated rx as
+ * 'private'; anything invalid is logged and dropped (this function
+ * returns nothing). */
+void
+ptllnd_parse_request(lnet_ni_t *ni, ptl_process_id_t initiator,
+                     kptl_msg_t *msg, unsigned int nob)
+{
+        ptllnd_ni_t      *plni = ni->ni_data;
+        const int         basenob = offsetof(kptl_msg_t, ptlm_u);
+        lnet_process_id_t srcid;
+        ptllnd_rx_t       rx;
+        int               flip;
+        __u16             msg_version;
+        __u32             msg_cksum;
+        ptllnd_peer_t    *plp;
+        int               rc;
+
+        /* need at least magic (4) + version (2) to say anything at all */
+        if (nob < 6) {
+                CERROR("Very short receive from %s\n",
+                       ptllnd_ptlid2str(initiator));
+                return;
+        }
+
+        /* I can at least read MAGIC/VERSION */
+
+        /* flip != 0 => peer has opposite byte order */
+        flip = msg->ptlm_magic == __swab32(PTLLND_MSG_MAGIC);
+        if (!flip && msg->ptlm_magic != PTLLND_MSG_MAGIC) {
+                CERROR("Bad protocol magic %08x from %s\n", 
+                       msg->ptlm_magic, ptllnd_ptlid2str(initiator));
+                return;
+        }
+
+        msg_version = flip ? __swab16(msg->ptlm_version) : msg->ptlm_version;
+
+        if (msg_version != PTLLND_MSG_VERSION) {
+                CERROR("Bad protocol version %04x from %s\n", 
+                       (__u32)msg_version, ptllnd_ptlid2str(initiator));
+                ptllnd_abort_on_nak(ni);
+                return;
+        }
+
+        if (nob < basenob) {
+                CERROR("Short receive from %s: got %d, wanted at least %d\n",
+                       ptllnd_ptlid2str(initiator), nob, basenob);
+                return;
+        }
+
+        /* checksum must be computed with
+         * 1) ptlm_cksum zero and
+         * 2) BEFORE anything gets modified/flipped
+         */
+        msg_cksum = flip ? __swab32(msg->ptlm_cksum) : msg->ptlm_cksum;
+        msg->ptlm_cksum = 0;
+        if (msg_cksum != 0 &&
+            msg_cksum != ptllnd_cksum(msg, offsetof(kptl_msg_t, ptlm_u))) {
+                CERROR("Bad checksum from %s\n", ptllnd_ptlid2str(initiator));
+                return;
+        }
+
+        msg->ptlm_version = msg_version;
+        msg->ptlm_cksum = msg_cksum;
+        
+        if (flip) {
+                /* NB stamps are opaque cookies */
+                __swab32s(&msg->ptlm_nob);
+                __swab64s(&msg->ptlm_srcnid);
+                __swab64s(&msg->ptlm_dstnid);
+                __swab32s(&msg->ptlm_srcpid);
+                __swab32s(&msg->ptlm_dstpid);
+        }
+        
+        srcid.nid = msg->ptlm_srcnid;
+        srcid.pid = msg->ptlm_srcpid;
+
+        /* sender must be on my network */
+        if (LNET_NIDNET(msg->ptlm_srcnid) != LNET_NIDNET(ni->ni_nid)) {
+                CERROR("Bad source id %s from %s\n",
+                       libcfs_id2str(srcid),
+                       ptllnd_ptlid2str(initiator));
+                return;
+        }
+
+        if (msg->ptlm_type == PTLLND_MSG_TYPE_NAK) {
+                CERROR("NAK from %s (%s)\n", 
+                       libcfs_id2str(srcid),
+                       ptllnd_ptlid2str(initiator));
+                ptllnd_abort_on_nak(ni);
+                return;
+        }
+        
+        /* message must really be addressed to me */
+        if (msg->ptlm_dstnid != ni->ni_nid ||
+            msg->ptlm_dstpid != the_lnet.ln_pid) {
+                CERROR("Bad dstid %s (%s expected) from %s\n",
+                       libcfs_id2str((lnet_process_id_t) {
+                               .nid = msg->ptlm_dstnid,
+                               .pid = msg->ptlm_dstpid}),
+                       libcfs_id2str((lnet_process_id_t) {
+                               .nid = ni->ni_nid,
+                               .pid = the_lnet.ln_pid}),
+                       libcfs_id2str(srcid));
+                return;
+        }
+
+        /* stale dststamp => peer is talking to a previous incarnation */
+        if (msg->ptlm_dststamp != plni->plni_stamp) {
+                CERROR("Bad dststamp "LPX64"("LPX64" expected) from %s\n",
+                       msg->ptlm_dststamp, plni->plni_stamp,
+                       libcfs_id2str(srcid));
+                return;
+        }
+       
+        /* per-type payload length checks and payload byte-swapping */
+        switch (msg->ptlm_type) {
+        case PTLLND_MSG_TYPE_PUT:
+        case PTLLND_MSG_TYPE_GET:
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_%s\n",
+                        msg->ptlm_type==PTLLND_MSG_TYPE_PUT ? "PUT" : "GET");
+                if (nob < basenob + sizeof(kptl_rdma_msg_t)) {
+                        CERROR("Short rdma request from %s(%s)\n",
+                               libcfs_id2str(srcid),
+                               ptllnd_ptlid2str(initiator));
+                        return;
+                }
+                if (flip)
+                        __swab64s(&msg->ptlm_u.rdma.kptlrm_matchbits);
+                break;
+
+        case PTLLND_MSG_TYPE_IMMEDIATE:
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_IMMEDIATE\n");
+                if (nob < offsetof(kptl_msg_t,
+                                   ptlm_u.immediate.kptlim_payload)) {
+                        CERROR("Short immediate from %s(%s)\n",
+                               libcfs_id2str(srcid),
+                               ptllnd_ptlid2str(initiator));
+                        return;
+                }
+                break;
+
+        case PTLLND_MSG_TYPE_HELLO:
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_HELLO from %s(%s)\n",
+                               libcfs_id2str(srcid),
+                               ptllnd_ptlid2str(initiator));
+                if (nob < basenob + sizeof(kptl_hello_msg_t)) {
+                        CERROR("Short hello from %s(%s)\n",
+                               libcfs_id2str(srcid),
+                               ptllnd_ptlid2str(initiator));
+                        return;
+                }
+                if(flip){
+                        __swab64s(&msg->ptlm_u.hello.kptlhm_matchbits);
+                        __swab32s(&msg->ptlm_u.hello.kptlhm_max_msg_size);
+                }
+                break;
+                
+        case PTLLND_MSG_TYPE_NOOP:
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_NOOP from %s(%s)\n",
+                               libcfs_id2str(srcid),
+                               ptllnd_ptlid2str(initiator));        
+                break;
+
+        default:
+                CERROR("Bad message type %d from %s(%s)\n", msg->ptlm_type,
+                       libcfs_id2str(srcid),
+                       ptllnd_ptlid2str(initiator));
+                return;
+        }
+
+        /* only a HELLO may create a new peer */
+        plp = ptllnd_find_peer(ni, srcid,
+                               msg->ptlm_type == PTLLND_MSG_TYPE_HELLO);
+        if (plp == NULL) {
+                CERROR("Can't find peer %s\n", libcfs_id2str(srcid));
+                return;
+        }
+
+        if (msg->ptlm_type == PTLLND_MSG_TYPE_HELLO) {
+                if (plp->plp_recvd_hello) {
+                        CERROR("Unexpected HELLO from %s\n",
+                               libcfs_id2str(srcid));
+                        ptllnd_peer_decref(plp);
+                        return;
+                }
+
+                CDEBUG(D_NET, "maxsz %d match "LPX64" stamp "LPX64"\n",
+                       msg->ptlm_u.hello.kptlhm_max_msg_size,
+                       msg->ptlm_u.hello.kptlhm_matchbits,
+                       msg->ptlm_srcstamp);
+
+                /* record the peer's negotiated parameters */
+                plp->plp_max_msg_size = MAX(plni->plni_max_msg_size,
+                        msg->ptlm_u.hello.kptlhm_max_msg_size);
+                plp->plp_match = msg->ptlm_u.hello.kptlhm_matchbits;
+                plp->plp_stamp = msg->ptlm_srcstamp;
+                plp->plp_max_credits += msg->ptlm_credits;
+                plp->plp_recvd_hello = 1;
+
+                CDEBUG(D_NET, "plp_max_msg_size=%d\n",plp->plp_max_msg_size);
+
+        } else if (!plp->plp_recvd_hello) {
+
+                CERROR("Bad message type %d (HELLO expected) from %s\n",
+                       msg->ptlm_type, libcfs_id2str(srcid));
+                ptllnd_peer_decref(plp);
+                return;
+
+        } else if (msg->ptlm_srcstamp != plp->plp_stamp) {
+
+                CERROR("Bad srcstamp "LPX64"("LPX64" expected) from %s\n",
+                       msg->ptlm_srcstamp, plp->plp_stamp,
+                       libcfs_id2str(srcid));
+                ptllnd_peer_decref(plp);
+                return;
+        }
+
+        /* absorb returned credits, clamped at plp_max_credits */
+        if (msg->ptlm_credits > 0) {
+                CDEBUG(D_NET, "Getting back %d credits from peer\n",msg->ptlm_credits);
+                if (plp->plp_credits + msg->ptlm_credits >
+                    plp->plp_max_credits) {
+                        CWARN("Too many credits from %s: %d + %d > %d\n",
+                              libcfs_id2str(srcid),
+                              plp->plp_credits, msg->ptlm_credits,
+                              plp->plp_max_credits);
+                        plp->plp_credits = plp->plp_max_credits;
+                } else {
+                        plp->plp_credits += msg->ptlm_credits;
+                }
+                ptllnd_check_sends(plp);
+        }
+
+        /* All OK so far; assume the message is good... */
+
+        /* NB rx lives on this stack frame; lnet_parse()/ptllnd_recv()
+         * presumably consume it before this function returns — confirm */
+        rx.rx_peer      = plp;
+        rx.rx_msg       = msg;
+        rx.rx_nob       = nob;
+        plni->plni_nrxs++;
+
+        CDEBUG(D_NET, "rx=%p type=%d\n",&rx,msg->ptlm_type);
+
+        switch (msg->ptlm_type) {
+        default: /* message types have been checked already */
+                ptllnd_rx_done(&rx);
+                break;
+
+        case PTLLND_MSG_TYPE_PUT:
+        case PTLLND_MSG_TYPE_GET:
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_%s\n",
+                        msg->ptlm_type==PTLLND_MSG_TYPE_PUT ? "PUT" : "GET");
+                rc = lnet_parse(ni, &msg->ptlm_u.rdma.kptlrm_hdr,
+                                msg->ptlm_srcnid, &rx, 1);
+                CDEBUG(D_NET, "lnet_parse rc=%d\n",rc);
+                if (rc < 0)
+                        ptllnd_rx_done(&rx);
+                break;
+
+        case PTLLND_MSG_TYPE_IMMEDIATE:
+                CDEBUG(D_NET, "PTLLND_MSG_TYPE_IMMEDIATE\n");
+                rc = lnet_parse(ni, &msg->ptlm_u.immediate.kptlim_hdr,
+                                msg->ptlm_srcnid, &rx, 0);
+                CDEBUG(D_NET, "lnet_parse rc=%d\n",rc);
+                if (rc < 0)
+                        ptllnd_rx_done(&rx);
+                break;
+        }
+
+        ptllnd_peer_decref(plp);
+}
+
+/* Event handler for receive-buffer MDs (PUT_END / UNLINK).
+ *
+ * On a successful PUT_END, the incoming message (at event->offset
+ * within the buffer) is handed to ptllnd_parse_request().  Bookkeeping
+ * then tracks whether the MD was unlinked and whether the buffer must
+ * be reposted; the two cases differ depending on whether this Portals
+ * implementation delivers explicit UNLINK events
+ * (LUSTRE_PORTALS_UNLINK_SEMANTICS) or signals unlink implicitly. */
+void
+ptllnd_buf_event (lnet_ni_t *ni, ptl_event_t *event)
+{
+        ptllnd_buffer_t *buf = ptllnd_eventarg2obj(event->md.user_ptr);
+        ptllnd_ni_t     *plni = ni->ni_data;
+        char            *msg = &buf->plb_buffer[event->offset];
+        int              repost;
+        int              unlinked = event->type == PTL_EVENT_UNLINK;
+
+        LASSERT (buf->plb_ni == ni);
+        LASSERT (event->type == PTL_EVENT_PUT_END ||
+                 event->type == PTL_EVENT_UNLINK);
+
+        CDEBUG(D_NET, "buf=%p event=%d\n",buf,event->type);
+
+        if (event->ni_fail_type != PTL_NI_OK) {
+
+                CERROR("event type %d, status %d from %s\n",
+                       event->type, event->ni_fail_type,
+                       ptllnd_ptlid2str(event->initiator));
+
+        } else if (event->type == PTL_EVENT_PUT_END) {
+#if (PTL_MD_LOCAL_ALIGN8 == 0)
+                /* Portals can't force message alignment - someone sending an
+                 * odd-length message could misalign subsequent messages */
+                if ((event->mlength & 7) != 0) {
+                        CERROR("Message from %s has odd length %d: "
+                               "probable version incompatibility\n",
+                               ptllnd_ptlid2str(event->initiator),
+                               event->mlength);
+                        LBUG();
+                }
+#endif
+                LASSERT ((event->offset & 7) == 0);
+
+                ptllnd_parse_request(ni, event->initiator,
+                                     (kptl_msg_t *)msg, event->mlength);
+        }
+
+#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
+        /* UNLINK event only on explicit unlink */
+        repost = (event->unlinked && event->type != PTL_EVENT_UNLINK);
+        if (event->unlinked)
+                unlinked = 1;
+#else
+        /* UNLINK event only on implicit unlink */
+        repost = (event->type == PTL_EVENT_UNLINK);
+#endif
+
+        CDEBUG(D_NET, "repost=%d unlinked=%d\n",repost,unlinked);
+
+        /* MD gone: buffer is no longer posted */
+        if (unlinked) {
+                LASSERT(buf->plb_posted);
+                buf->plb_posted = 0;
+                plni->plni_nposted_buffers--;
+        }
+
+        if (repost)
+                (void) ptllnd_post_buffer(buf);
+}
+
+void
+ptllnd_tx_event (lnet_ni_t *ni, ptl_event_t *event)
+{
+        ptllnd_ni_t *plni = ni->ni_data;
+        ptllnd_tx_t *tx = ptllnd_eventarg2obj(event->md.user_ptr);
+        int          error = (event->ni_fail_type != PTL_NI_OK);
+        int          isreq;
+        int          isbulk;
+#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
+        int          unlinked = event->unlinked;
+#else
+        int          unlinked = (event->type == PTL_EVENT_UNLINK);
+#endif
+
+        if (error)
+                CERROR("Error event type %d for %s for %s\n",
+                       event->type, ptllnd_msgtype2str(tx->tx_type),
+                       libcfs_id2str(tx->tx_peer->plp_id));
+
+        LASSERT (!PtlHandleIsEqual(event->md_handle, PTL_INVALID_HANDLE));
+
+        CDEBUG(D_NET, "tx=%p type=%s (%d)\n",tx,
+                ptllnd_msgtype2str(tx->tx_type),tx->tx_type);
+        CDEBUG(D_NET, "unlinked=%d\n",unlinked);
+        CDEBUG(D_NET, "error=%d\n",error);
+
+        isreq = PtlHandleIsEqual(event->md_handle, tx->tx_reqmdh);
+        CDEBUG(D_NET, "isreq=%d\n",isreq);
+        if (isreq) {
+                LASSERT (event->md.start == (void *)&tx->tx_msg);
+                if (unlinked) {
+                        tx->tx_reqmdh = PTL_INVALID_HANDLE;
+                        PTLLND_DBGT_STAMP(tx->tx_req_done);
+                }
+        }
+
+        isbulk = PtlHandleIsEqual(event->md_handle, tx->tx_bulkmdh);
+        CDEBUG(D_NET, "isbulk=%d\n",isbulk);
+        if ( isbulk && unlinked ) {
+                tx->tx_bulkmdh = PTL_INVALID_HANDLE;
+                PTLLND_DBGT_STAMP(tx->tx_bulk_done);
+        }
+
+        LASSERT (!isreq != !isbulk);            /* always one and only 1 match */
+
+        switch (tx->tx_type) {
+        default:
+                LBUG();
+
+        case PTLLND_MSG_TYPE_NOOP:
+        case PTLLND_MSG_TYPE_HELLO:
+        case PTLLND_MSG_TYPE_IMMEDIATE:
+                LASSERT (event->type == PTL_EVENT_UNLINK ||
+                         event->type == PTL_EVENT_SEND_END);
+                LASSERT (isreq);
+                break;
+
+        case PTLLND_MSG_TYPE_GET:
+                LASSERT (event->type == PTL_EVENT_UNLINK ||
+                         (isreq && event->type == PTL_EVENT_SEND_END) ||
+                         (isbulk && event->type == PTL_EVENT_PUT_END));
+
+                if (isbulk && !error && event->type == PTL_EVENT_PUT_END) {
+                        /* Check GET matched */
+                        if (event->hdr_data == PTLLND_RDMA_OK) {
+                                lnet_set_reply_msg_len(ni, 
+                                                       tx->tx_lnetreplymsg,
+                                                       event->mlength);
+                        } else {
+                                CERROR ("Unmatched GET with %s\n",
+                                        libcfs_id2str(tx->tx_peer->plp_id));
+                                tx->tx_status = -EIO;
+                        }
+                }
+                break;
+
+        case PTLLND_MSG_TYPE_PUT:
+                LASSERT (event->type == PTL_EVENT_UNLINK ||
+                         (isreq && event->type == PTL_EVENT_SEND_END) ||
+                         (isbulk && event->type == PTL_EVENT_GET_END));
+                break;
+
+        case PTLLND_RDMA_READ:
+                LASSERT (event->type == PTL_EVENT_UNLINK ||
+                         event->type == PTL_EVENT_SEND_END ||
+                         event->type == PTL_EVENT_REPLY_END);
+                LASSERT (isbulk);
+                break;
+
+        case PTLLND_RDMA_WRITE:
+                LASSERT (event->type == PTL_EVENT_UNLINK ||
+                         event->type == PTL_EVENT_SEND_END);
+                LASSERT (isbulk);
+        }
+
+        /* Schedule ptllnd_tx_done() on error or last completion event */
+        if (error ||
+            (PtlHandleIsEqual(tx->tx_bulkmdh, PTL_INVALID_HANDLE) &&
+             PtlHandleIsEqual(tx->tx_reqmdh, PTL_INVALID_HANDLE))) {
+                if (error)
+                        tx->tx_status = -EIO;
+                list_del(&tx->tx_list);
+                list_add_tail(&tx->tx_list, &plni->plni_zombie_txs);
+                CDEBUG(D_NET, "tx=%p ONTO ZOMBIE LIST\n",tx);
+        }
+}
+
+void
+ptllnd_wait (lnet_ni_t *ni, int milliseconds)
+{
+        ptllnd_ni_t   *plni = ni->ni_data;
+        ptllnd_tx_t   *tx;
+        ptl_event_t    event;
+        int            which;
+        int            rc;
+        int            blocked = 0;
+        int            found = 0;
+        int            timeout = 0;
+
+        /* Handle any currently queued events, returning immediately if any.
+         * Otherwise block for the timeout and handle all events queued
+         * then. */
+
+        for (;;) {
+                time_t  then = cfs_time_current_sec();
+
+                CDEBUG(D_NET, "Poll(%d)\n", timeout);
+                
+                rc = PtlEQPoll(&plni->plni_eqh, 1,
+                               (timeout < 0) ? PTL_TIME_FOREVER : timeout,
+                               &event, &which);
+
+                if (timeout >= 0 &&
+                    (cfs_time_current_sec() - then)*1000 > timeout + 1000) {
+                        /* 1000 mS grace.............................^ */
+                        CERROR("SLOW PtlEQPoll(%d): %d seconds\n", timeout,
+                               (int)(cfs_time_current_sec() - then));
+                }
+                
+                CDEBUG(D_NET, "PtlEQPoll rc=%d\n",rc);
+                timeout = 0;
+
+                if (rc == PTL_EQ_EMPTY) {
+                        if (found ||            /* handled some events */
+                            milliseconds == 0 || /* just checking */
+                            blocked)            /* blocked already */
+                                break;
+
+                        blocked = 1;
+                        timeout = milliseconds;
+                        continue;
+                }
+
+                LASSERT (rc == PTL_OK || rc == PTL_EQ_DROPPED);
+
+                if (rc == PTL_EQ_DROPPED)
+                        CERROR("Event queue: size %d is too small\n",
+                               plni->plni_eq_size);
+
+                CDEBUG(D_NET, "event.type=%s(%d)\n",
+                       ptllnd_evtype2str(event.type),event.type);
+
+                found = 1;
+                switch (ptllnd_eventarg2type(event.md.user_ptr)) {
+                default:
+                        LBUG();
+
+                case PTLLND_EVENTARG_TYPE_TX:
+                        ptllnd_tx_event(ni, &event);
+                        break;
+
+                case PTLLND_EVENTARG_TYPE_BUF:
+                        ptllnd_buf_event(ni, &event);
+                        break;
+                }
+        }
+
+        while (!list_empty(&plni->plni_zombie_txs)) {
+                tx = list_entry(plni->plni_zombie_txs.next,
+                                ptllnd_tx_t, tx_list);
+                CDEBUG(D_NET, "Process ZOMBIE tx=%p\n",tx);
+                ptllnd_tx_done(tx);
+        }
+}
diff --git a/lnet/ulnds/select.c b/lnet/ulnds/select.c
deleted file mode 100644 (file)
index 49c770f..0000000
+++ /dev/null
@@ -1,421 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *  Copyright (c) 2002 Eric Hoffman
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/* select.c:
- *  Provides a general mechanism for registering and dispatching
- *  io events through the select system call.
- */
-
-#define DEBUG_SUBSYSTEM S_NAL
-
-#ifdef sun
-#include <sys/filio.h>
-#else
-#include <sys/ioctl.h>
-#endif
-
-#include <sys/time.h>
-#include <sys/types.h>
-#include <stdlib.h>
-#include <syscall.h>
-#include <pthread.h>
-#include <errno.h>
-#include <pqtimer.h>
-#include <dispatch.h>
-#include <procbridge.h>
-
-
-static struct timeval beginning_of_epoch;
-static io_handler io_handlers;
-
-/* Function: now
- *
- * Return: the current time in canonical units: a 64 bit number
- *   where the most significant 32 bits contains the number
- *   of seconds, and the least significant a count of (1/(2^32))ths
- *   of a second.
- */
-when now()
-{
-    struct timeval result;
-
-    gettimeofday(&result,0);
-    return((((unsigned long long)result.tv_sec)<<32)|
-           (((unsigned long long)result.tv_usec)<<32)/1000000);
-}
-
-
-/* Function: register_io_handler
- * Arguments: fd: the file descriptor of interest
- *            type: a mask of READ_HANDLER, WRITE_HANDLER, EXCEPTION_HANDLER
- *            function: a function to call when io is available on fd
- *            arg: an opaque correlator to return to the handler
- * Returns: a pointer to the io_handler structure
- */
-io_handler register_io_handler(int fd,
-                               int type,
-                               int (*function)(void *),
-                               void *arg)
-{
-    io_handler i=(io_handler)malloc(sizeof(struct io_handler));
-    if ((i->fd=fd)>=0){
-        i->type=type;
-        i->function=function;
-        i->argument=arg;
-        i->disabled=0;
-        i->last=&io_handlers;
-        if ((i->next=io_handlers)) i->next->last=&i->next;
-        io_handlers=i;
-    }
-    return(i);
-}
-
-/* Function: remove_io_handler
- * Arguments: i: a pointer to the handler to stop servicing
- *
- * remove_io_handler() doesn't actually free the handler, due
- * to reentrancy problems. it just marks the handler for
- * later cleanup by the blocking function.
- */
-void remove_io_handler (io_handler i)
-{
-    i->disabled=1;
-}
-
-static void set_flag(io_handler n,fd_set *r, fd_set *w, fd_set *e)
-{
-    if (n->type & READ_HANDLER) FD_SET(n->fd, r);
-    if (n->type & WRITE_HANDLER) FD_SET(n->fd, w);
-    if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, e);
-}
-
-static int prepare_fd_sets(fd_set *r, fd_set *w, fd_set *e)
-{
-    io_handler j;
-    io_handler *k;
-    int max = 0;
-
-    FD_ZERO(r);
-    FD_ZERO(w);
-    FD_ZERO(e);
-    for (k=&io_handlers;*k;){
-        if ((*k)->disabled){
-            j=*k;
-            *k=(*k)->next;
-            free(j);
-        }
-        if (*k) {
-           set_flag(*k,r,w,e);
-            if ((*k)->fd > max)
-                max = (*k)->fd;
-           k=&(*k)->next;
-       }
-    }
-    return max + 1;
-}
-
-static int execute_callbacks(fd_set *r, fd_set *w, fd_set *e)
-{
-    io_handler j;
-    int n = 0, t;
-
-    for (j = io_handlers; j; j = j->next) {
-        if (j->disabled)
-            continue;
-
-        t = 0;
-        if (FD_ISSET(j->fd, r) && (j->type & READ_HANDLER)) {
-            FD_CLR(j->fd, r);
-            t++;
-        }
-        if (FD_ISSET(j->fd, w) && (j->type & WRITE_HANDLER)) {
-            FD_CLR(j->fd, w);
-            t++;
-        }
-        if (FD_ISSET(j->fd, e) && (j->type & EXCEPTION_HANDLER)) {
-            FD_CLR(j->fd, e);
-            t++;
-        }
-        if (t == 0)
-            continue;
-
-        if (!(*j->function)(j->argument))
-            j->disabled = 1;
-
-        n += t;
-    }
-
-    return n;
-}
-
-#ifdef ENABLE_SELECT_DISPATCH
-
-static struct {
-    pthread_mutex_t mutex;
-    pthread_cond_t  cond;
-    int             submitted;
-    int             nready;
-    int             maxfd;
-    fd_set         *rset;
-    fd_set         *wset;
-    fd_set         *eset;
-    struct timeval *timeout;
-    struct timeval  submit_time;
-} fd_extra = {
-    PTHREAD_MUTEX_INITIALIZER,
-    PTHREAD_COND_INITIALIZER,
-    0, 0, 0,
-    NULL, NULL, NULL, NULL,
-};
-
-extern int liblustre_wait_event(int timeout);
-extern procbridge __global_procbridge;
-
-/*
- * this will intercept syscall select() of user apps
- * such as MPI libs.
- */
-int select(int n, fd_set *rset, fd_set *wset, fd_set *eset,
-           struct timeval *timeout)
-{
-    LASSERT(fd_extra.submitted == 0);
-
-    fd_extra.nready = 0;
-    fd_extra.maxfd = n;
-    fd_extra.rset = rset;
-    fd_extra.wset = wset;
-    fd_extra.eset = eset;
-    fd_extra.timeout = timeout;
-
-    liblustre_wait_event(0);
-    pthread_mutex_lock(&fd_extra.mutex);
-    gettimeofday(&fd_extra.submit_time, NULL);
-    fd_extra.submitted = 1;
-    LASSERT(__global_procbridge);
-    procbridge_wakeup_nal(__global_procbridge);
-
-again:
-    if (fd_extra.submitted)
-        pthread_cond_wait(&fd_extra.cond, &fd_extra.mutex);
-    pthread_mutex_unlock(&fd_extra.mutex);
-
-    liblustre_wait_event(0);
-
-    pthread_mutex_lock(&fd_extra.mutex);
-    if (fd_extra.submitted)
-        goto again;
-    pthread_mutex_unlock(&fd_extra.mutex);
-
-    LASSERT(fd_extra.nready >= 0);
-    LASSERT(fd_extra.submitted == 0);
-    return fd_extra.nready;
-}
-
-static int merge_fds(int max, fd_set *rset, fd_set *wset, fd_set *eset)
-{
-    int i;
-
-    LASSERT(rset);
-    LASSERT(wset);
-    LASSERT(eset);
-
-    for (i = 0; i < __FD_SETSIZE/__NFDBITS; i++) {
-        LASSERT(!fd_extra.rset ||
-                !(__FDS_BITS(rset)[i] & __FDS_BITS(fd_extra.rset)[i]));
-        LASSERT(!fd_extra.wset ||
-                !(__FDS_BITS(wset)[i] & __FDS_BITS(fd_extra.wset)[i]));
-        LASSERT(!fd_extra.eset ||
-                !(__FDS_BITS(eset)[i] & __FDS_BITS(fd_extra.eset)[i]));
-
-        if (fd_extra.rset && __FDS_BITS(fd_extra.rset)[i])
-            __FDS_BITS(rset)[i] |= __FDS_BITS(fd_extra.rset)[i];
-        if (fd_extra.wset && __FDS_BITS(fd_extra.wset)[i])
-            __FDS_BITS(wset)[i] |= __FDS_BITS(fd_extra.wset)[i];
-        if (fd_extra.eset && __FDS_BITS(fd_extra.eset)[i])
-            __FDS_BITS(eset)[i] |= __FDS_BITS(fd_extra.eset)[i];
-    }
-
-    return (fd_extra.maxfd > max ? fd_extra.maxfd : max);
-}
-
-static inline
-int timeval_ge(struct timeval *tv1, struct timeval *tv2)
-{
-    LASSERT(tv1 && tv2);
-    return ((tv1->tv_sec - tv2->tv_sec) * 1000000 +
-            (tv1->tv_usec - tv2->tv_usec) >= 0);
-}
-
-/*
- * choose the most recent timeout value
- */
-static struct timeval *choose_timeout(struct timeval *tv1,
-                                      struct timeval *tv2)
-{
-    if (!tv1)
-        return tv2;
-    else if (!tv2)
-        return tv1;
-
-    if (timeval_ge(tv1, tv2))
-        return tv2;
-    else
-        return tv1;
-}
-
-/* Function: select_timer_block
- * Arguments: until: an absolute time when the select should return
- *
- *   This function dispatches the various file descriptors' handler
- *   functions, if the kernel indicates there is io available.
- */
-void select_timer_block(when until)
-{
-    fd_set fds[3];
-    struct timeval timeout;
-    struct timeval *timeout_pointer, *select_timeout;
-    int max, nready, nexec;
-    int fd_handling;
-
-again:
-    if (until) {
-        when interval;
-
-        interval = until - now();
-        timeout.tv_sec = (interval >> 32);
-        timeout.tv_usec = ((interval << 32) / 1000000) >> 32;
-        timeout_pointer = &timeout;
-    } else
-        timeout_pointer = NULL;
-
-    fd_handling = 0;
-    max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]);
-    select_timeout = timeout_pointer;
-
-    pthread_mutex_lock(&fd_extra.mutex);
-    fd_handling = fd_extra.submitted;
-    pthread_mutex_unlock(&fd_extra.mutex);
-    if (fd_handling) {
-        max = merge_fds(max, &fds[0], &fds[1], &fds[2]);
-        select_timeout = choose_timeout(timeout_pointer, fd_extra.timeout);
-    }
-
-    /* XXX only compile for linux */
-#if __WORDSIZE == 64
-    nready = syscall(SYS_select, max, &fds[0], &fds[1], &fds[2],
-                     select_timeout);
-#else
-    nready = syscall(SYS__newselect, max, &fds[0], &fds[1], &fds[2],
-                     select_timeout);
-#endif
-    if (nready < 0) {
-        CERROR("select return err %d, errno %d\n", nready, errno);
-        return;
-    }
-
-    if (nready) {
-        nexec = execute_callbacks(&fds[0], &fds[1], &fds[2]);
-        nready -= nexec;
-    } else
-        nexec = 0;
-
-    /* even both nready & nexec are 0, we still need try to wakeup
-     * upper thread since it may have timed out
-     */
-    if (fd_handling) {
-        LASSERT(nready >= 0);
-
-        pthread_mutex_lock(&fd_extra.mutex);
-        if (nready) {
-            if (fd_extra.rset)
-                *fd_extra.rset = fds[0];
-            if (fd_extra.wset)
-                *fd_extra.wset = fds[1];
-            if (fd_extra.eset)
-                *fd_extra.eset = fds[2];
-            fd_extra.nready = nready;
-            fd_extra.submitted = 0;
-        } else {
-            struct timeval t;
-
-            fd_extra.nready = 0;
-            if (fd_extra.timeout) {
-                gettimeofday(&t, NULL);
-                if (timeval_ge(&t, &fd_extra.submit_time))
-                    fd_extra.submitted = 0;
-            }
-        }
-
-        pthread_cond_signal(&fd_extra.cond);
-        pthread_mutex_unlock(&fd_extra.mutex);
-    }
-
-    /* haven't found portals event, go back to loop if time
-     * is not expired */
-    if (!nexec) {
-        if (timeout_pointer == NULL || now() >= until)
-            goto again;
-    }
-}
-
-#else /* !ENABLE_SELECT_DISPATCH */
-
-/* Function: select_timer_block
- * Arguments: until: an absolute time when the select should return
- *
- *   This function dispatches the various file descriptors' handler
- *   functions, if the kernel indicates there is io available.
- */
-void select_timer_block(when until)
-{
-    fd_set fds[3];
-    struct timeval timeout;
-    struct timeval *timeout_pointer;
-    int max, nready;
-
-again:
-    if (until) {
-        when interval;
-        interval = until - now();
-        timeout.tv_sec = (interval >> 32);
-        timeout.tv_usec = ((interval << 32) / 1000000) >> 32;
-        timeout_pointer = &timeout;
-    } else
-        timeout_pointer = NULL;
-
-    max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]);
-
-    nready = select(max, &fds[0], &fds[1], &fds[2], timeout_pointer);
-    if (nready > 0)
-        execute_callbacks(&fds[0], &fds[1], &fds[2]);
-}
-#endif /* ENABLE_SELECT_DISPATCH */
-
-/* Function: init_unix_timer()
- *   is called to initialize the library
- */
-void init_unix_timer()
-{
-    io_handlers=0;
-    gettimeofday(&beginning_of_epoch, 0);
-    initialize_timer(select_timer_block);
-}
diff --git a/lnet/ulnds/socklnd/.cvsignore b/lnet/ulnds/socklnd/.cvsignore
new file mode 100644 (file)
index 0000000..e995588
--- /dev/null
@@ -0,0 +1,3 @@
+.deps
+Makefile
+Makefile.in
index 3437d39..f970be9 100644 (file)
@@ -1,10 +1,13 @@
 if LIBLUSTRE
-if !CRAY_PORTALS
-noinst_LIBRARIES = libtcpnal.a
+if BUILD_USOCKLND
+noinst_LIBRARIES = libsocklnd.a
 endif
 endif
 
-noinst_HEADERS =  pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
-libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
-libtcpnal_a_CPPFLAGS = $(LLCPPFLAGS)
-libtcpnal_a_CFLAGS = $(LLCFLAGS)
+noinst_HEADERS =  pqtimer.h dispatch.h table.h timer.h \
+                 connection.h bridge.h procbridge.h
+libsocklnd_a_SOURCES = pqtimer.c select.c table.c pqtimer.h \
+                     dispatch.h table.h timer.h procapi.c proclib.c \
+                     connection.c tcplnd.c connection.h
+libsocklnd_a_CPPFLAGS = $(LLCPPFLAGS)
+libsocklnd_a_CFLAGS = $(LLCFLAGS)
diff --git a/lnet/ulnds/socklnd/address.c b/lnet/ulnds/socklnd/address.c
deleted file mode 100644 (file)
index 07b4249..0000000
+++ /dev/null
@@ -1,147 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/* address.c:
- * this file provides functions to acquire the IP address of the node
- * and translate them into a NID/PID pair which supports a static
- * mapping of virtual nodes into the port range of an IP socket.
-*/
-
-#define DEBUG_SUBSYSTEM S_NAL
-
-#include <stdlib.h>
-#include <netdb.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <portals/p30.h>
-#include <bridge.h>
-#include <ipmap.h>
-
-
-/* Function:  get_node_id
- * Returns: a 32 bit id for this node, actually a big-endian IP address
- *
- * get_node_id() determines the host name and uses the resolver to
- *  find out its ip address. This is fairly fragile and inflexible, but
- *  explicitly asking about interfaces and their addresses is very
- *  complicated and nonportable.
- */
-static unsigned int get_node_id(void)
-{
-    char buffer[255];
-    unsigned int x;
-    struct hostent *he;
-    char * host_envp;
-
-    if (!(host_envp = getenv("PTL_HOSTID")))
-        {
-            gethostname(buffer,sizeof(buffer));
-            he=gethostbyname(buffer);
-            if (he)
-                    x=*(unsigned int *)he->h_addr_list[0];
-            else
-                    x = 0;
-            return(ntohl(x));
-        }
-    else
-        {
-            if (host_envp[1] != 'x')
-                {
-                    int a, b, c, d;
-                    sscanf(host_envp, "%d.%d.%d.%d", &a, &b, &c, &d);
-                    return ((a<<24) | (b<<16) | (c<<8) | d);
-                }
-            else
-                {
-                    long long hostid = strtoll(host_envp, 0, 0);
-                    return((unsigned int) hostid);
-                }
-        }
-}
-
-
-/* Function:  set_address
- * Arguments: t: a procnal structure to populate with the request
- *
- * set_address performs the bit manipulations to set the nid, pid, and
- *    iptop8 fields of the procnal structures.
- *
- * TODO: fix pidrequest to try to do dynamic binding if PTL_ID_ANY
- */
-
-#ifdef DIRECT_IP_MODE
-void set_address(bridge t,ptl_pid_t pidrequest)
-{
-    int port;
-    if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0;
-    else port=pidrequest;
-    t->lib_nal->libnal_ni.ni_pid.nid=get_node_id();
-    t->lib_nal->libnal_ni.ni_pid.pid=port;
-}
-#else
-
-void set_address(bridge t,ptl_pid_t pidrequest)
-{
-    int virtnode, in_addr, port;
-    ptl_pid_t pid;
-
-    /* get and remember my node id*/
-    if (!getenv("PTL_VIRTNODE"))
-        virtnode = 0;
-    else
-        {
-            int maxvnode = PNAL_VNODE_MASK - (PNAL_BASE_PORT
-                                              >> PNAL_VNODE_SHIFT);
-            virtnode = atoi(getenv("PTL_VIRTNODE"));
-            if (virtnode > maxvnode)
-                {
-                    fprintf(stderr, "PTL_VIRTNODE of %d is too large - max %d\n",
-                            virtnode, maxvnode);
-                    return;
-                }
-        }
-
-    in_addr = get_node_id();
-
-    t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */
-    t->lib_nal->libnal_ni.ni_pid.nid = ((in_addr & PNAL_HOSTID_MASK)
-                                        << PNAL_VNODE_SHIFT)
-                                       + virtnode;
-    pid=pidrequest;
-    /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */
-#ifdef notyet
-    if (pid==(unsigned short)PTL_PID_ANY) port = 0;
-#endif
-    if (pid==(unsigned short)PTL_PID_ANY)
-        {
-            fprintf(stderr, "portal pid PTL_ID_ANY is not currently supported\n");
-            return;
-        }
-    else if (pid > PNAL_PID_MASK)
-        {
-            fprintf(stderr, "portal pid of %d is too large - max %d\n",
-                    pid, PNAL_PID_MASK);
-            return;
-        }
-    else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT;
-    t->lib_nal->libnal_ni.ni_pid.pid=pid;
-}
-#endif
index d2f0f2c..a46cb13 100644 (file)
@@ -9,26 +9,15 @@
 #ifndef TCPNAL_PROCBRIDGE_H
 #define TCPNAL_PROCBRIDGE_H
 
-#include <portals/lib-p30.h>
-#include <portals/nal.h>
-
-#define PTL_IFACE_TCP 1
-#define PTL_IFACE_ER 2
-#define PTL_IFACE_SS 3
-#define PTL_IFACE_MAX 4
+#include <lnet/lib-lnet.h>
 
 typedef struct bridge {
     int alive;
-    lib_nal_t *lib_nal;
+    lnet_ni_t *b_ni;
     void *lower;
     void *local;
-    void (*shutdown)(struct bridge *);
     /* this doesn't really belong here */
     unsigned char iptop8;
 } *bridge;
 
-
-typedef int (*nal_initialize)(bridge);
-extern nal_initialize nal_table[PTL_IFACE_MAX];
-
 #endif
index 49cca96..51aa535 100644 (file)
@@ -22,8 +22,7 @@
 /* connection.c:
    This file provides a simple stateful connection manager which
    builds tcp connections on demand and leaves them open for
-   future use. It also provides the machinery to allow peers
-   to connect to it
+   future use. 
 */
 
 #include <stdlib.h>
@@ -38,9 +37,9 @@
 #include <sys/socket.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
-#include <portals/types.h>
-#include <portals/lib-types.h>
-#include <portals/socknal.h>
+#include <lnet/types.h>
+#include <lnet/lib-types.h>
+#include <lnet/socklnd.h>
 #include <libcfs/kp30.h>
 #include <connection.h>
 #include <pthread.h>
 #include <syscall.h>
 #endif
 
-/* global variable: acceptor port */
-unsigned short tcpnal_acceptor_port = 988;
+/* tunables (via environment) */
+int tcpnal_acceptor_port = 988;
+int tcpnal_buffer_size   = 0;
+int tcpnal_nagle         = 0;
 
+int
+tcpnal_env_param (char *name, int *val)
+{
+        char   *env = getenv(name);
+        int     n;
+
+        if (env == NULL)
+                return 1;
+
+        n = strlen(env);                /* scanf may not assign on EOS */
+        if (sscanf(env, "%i%n", val, &n) >= 1 && n == strlen(env)) {
+                CDEBUG(D_INFO, "Environment variable %s set to %d\n",
+                       name, *val);
+                return 1;
+        }
+
+        CERROR("Can't parse environment variable '%s=%s'\n",
+               name, env);
+        return 0;
+}
+
+int
+tcpnal_set_global_params (void)
+{
+        return  tcpnal_env_param("TCPNAL_PORT",
+                                &tcpnal_acceptor_port) &&
+                tcpnal_env_param("TCPLND_PORT",
+                                &tcpnal_acceptor_port) &&
+                tcpnal_env_param("TCPNAL_BUFFER_SIZE",
+                                 &tcpnal_buffer_size) &&
+                tcpnal_env_param("TCPLND_BUFFER_SIZE",
+                                 &tcpnal_buffer_size) &&
+                tcpnal_env_param("TCPNAL_NAGLE",
+                                 &tcpnal_nagle) &&
+                tcpnal_env_param("TCPLND_NAGLE",
+                                 &tcpnal_nagle);
+}
 
 /* Function:  compare_connection
  * Arguments: connection c:      a connection in the hash table
- *            ptl_process_id_t:  an id to verify  agains
+ *            lnet_process_id_t:  an id to verify against
  * Returns: 1 if the connection is the one requested, 0 otherwise
  *
  *    compare_connection() tests for collisions in the hash table
  */
 static int compare_connection(void *arg1, void *arg2)
 {
-    connection c = arg1;
-    unsigned int * id = arg2;
-#if 0
-    return((c->ip==id[0]) && (c->port==id[1]));
-#else
-    /* CFS specific hacking */
-    return (c->ip == id[0]);
-#endif
-}
+    connection  c = arg1;
+    lnet_nid_t *nid = arg2;
 
+    return (c->peer_nid == *nid);
+}
 
 /* Function:  connection_key
- * Arguments: ptl_process_id_t id:  an id to hash
+ * Arguments: lnet_process_id_t id:  an id to hash
 * Returns: a not-particularly-well-distributed hash
  *          of the id
  */
-static unsigned int connection_key(unsigned int *id)
+static unsigned int connection_key(void *arg)
 {
-#if 0
-    return(id[0]^id[1]);
-#else
-    /* CFS specific hacking */
-    return (unsigned int) id[0];
-#endif
+        lnet_nid_t *nid = arg;
+        
+        return (unsigned int)(*nid);
 }
 
+void
+close_connection(void *arg)
+{
+        connection c = arg;
+        
+        close(c->fd);
+        free(c);
+}
 
 /* Function:  remove_connection
  * Arguments: c: the connection to remove
@@ -95,13 +133,9 @@ static unsigned int connection_key(unsigned int *id)
 void remove_connection(void *arg)
 {
         connection c = arg;
-        unsigned int id[2];
         
-        id[0]=c->ip;
-        id[1]=c->port;
-        hash_table_remove(c->m->connections,id);
-        close(c->fd);
-        free(c);
+        hash_table_remove(c->m->connections,&c->peer_nid);
+        close_connection(c);
 }
 
 
@@ -149,172 +183,179 @@ static int connection_input(void *d)
 }
 
 
-/* Function:  allocate_connection
- * Arguments: t:    tcpnal the allocation is occuring in the context of
- *            dest: portal endpoint address for this connection
- *            fd:   open file descriptor for the socket
- * Returns: an allocated connection structure
- *
- * just encompasses the action common to active and passive
- *  connections of allocation and placement in the global table
- */
-static connection allocate_connection(manager m,
-                               unsigned int ip,
-                               unsigned short port,
-                               int fd)
+static connection 
+allocate_connection(manager        m,
+                    lnet_nid_t     nid,
+                    int            fd)
 {
     connection c=malloc(sizeof(struct connection));
-    unsigned int id[2];
+
     c->m=m;
     c->fd=fd;
-    c->ip=ip;
-    c->port=port;
-    id[0]=ip;
-    id[1]=port;
+    c->peer_nid = nid;
+
     register_io_handler(fd,READ_HANDLER,connection_input,c);
-    hash_table_insert(m->connections,c,id);
+    hash_table_insert(m->connections,c,&nid);
     return(c);
 }
 
-
-/* Function:  new_connection
- * Arguments: t: opaque argument holding the tcpname
- * Returns: 1 in order to reregister for new connection requests
- *
- *  called when the bound service socket recieves
- *     a new connection request, it always accepts and
- *     installs a new connection
- */
-static int new_connection(void *z)
+int
+tcpnal_write(lnet_nid_t nid, int sockfd, void *buffer, int nob)
 {
-    manager m=z;
-    struct sockaddr_in s;
-    int len=sizeof(struct sockaddr_in);
-    int fd=accept(m->bound,(struct sockaddr *)&s,&len);
-    unsigned int nid=*((unsigned int *)&s.sin_addr);
-    /* cfs specific hack */
-    //unsigned short pid=s.sin_port;
-    pthread_mutex_lock(&m->conn_lock);
-    allocate_connection(m,htonl(nid),0/*pid*/,fd);
-    pthread_mutex_unlock(&m->conn_lock);
-    return(1);
+        int rc = syscall(SYS_write, sockfd, buffer, nob);
+        
+        /* NB called on an 'empty' socket with huge buffering! */
+        if (rc == nob)
+                return 0;
+
+        if (rc < 0) {
+                CERROR("Failed to send to %s: %s\n",
+                       libcfs_nid2str(nid), strerror(errno));
+                return -1;
+        }
+        
+        CERROR("Short send to %s: %d/%d\n",
+               libcfs_nid2str(nid), rc, nob);
+        return -1;
 }
 
-extern ptl_nid_t tcpnal_mynid;
-
 int
-tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation)
+tcpnal_read(lnet_nid_t nid, int sockfd, void *buffer, int nob) 
 {
-        int                 rc;
-        int                 nob;
-        ptl_hdr_t           hdr;
-        ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid;
+        int       rc;
+
+        while (nob > 0) {
+                rc = syscall(SYS_read, sockfd, buffer, nob);
+                
+                if (rc == 0) {
+                        CERROR("Unexpected EOF from %s\n",
+                               libcfs_nid2str(nid));
+                        return -1;
+                }
+
+                if (rc < 0) {
+                        CERROR("Failed to receive from %s: %s\n",
+                               libcfs_nid2str(nid), strerror(errno));
+                        return -1;
+                }
+
+                nob -= rc;
+        }
+        return 0;
+}
 
-        LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
+int
+tcpnal_hello (int sockfd, lnet_nid_t nid)
+{
+        struct timeval          tv;
+        __u64                   incarnation;
+        int                     rc;
+        int                     nob;
+        lnet_acceptor_connreq_t cr;
+        lnet_hdr_t              hdr;
+        lnet_magicversion_t     hmv;
+
+        gettimeofday(&tv, NULL);
+        incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
+        memset(&cr, 0, sizeof(cr));
+        cr.acr_magic   = LNET_PROTO_ACCEPTOR_MAGIC;
+        cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+        cr.acr_nid     = nid;
+
+        /* hmv is initialised separately and then copied into hdr; when this
+         * was done in-place the compiler mis-optimised it, apparently confused
+         * by hmv and hdr aliasing the same memory. */
+        hmv.magic         = cpu_to_le32(LNET_PROTO_TCP_MAGIC);
+        hmv.version_major = cpu_to_le32(LNET_PROTO_TCP_VERSION_MAJOR);
+        hmv.version_minor = cpu_to_le32(LNET_PROTO_TCP_VERSION_MINOR);
 
         memset (&hdr, 0, sizeof (hdr));
-        hmv->magic         = cpu_to_le32(PORTALS_PROTO_MAGIC);
-        hmv->version_major = cpu_to_le32(PORTALS_PROTO_VERSION_MAJOR);
-        hmv->version_minor = cpu_to_le32(PORTALS_PROTO_VERSION_MINOR);
-        
-        hdr.src_nid = cpu_to_le64(tcpnal_mynid);
-        hdr.type    = cpu_to_le32(PTL_MSG_HELLO);
 
-        hdr.msg.hello.type = cpu_to_le32(type);
+        CLASSERT (sizeof (hmv) == sizeof (hdr.dest_nid));
+        memcpy(&hdr.dest_nid, &hmv, sizeof(hmv));
+
+        /* hdr.src_nid/src_pid are ignored at dest */
+
+        hdr.type    = cpu_to_le32(LNET_MSG_HELLO);
+        hdr.msg.hello.type = cpu_to_le32(SOCKLND_CONN_ANY);
         hdr.msg.hello.incarnation = cpu_to_le64(incarnation);
 
         /* I don't send any interface info */
 
-        /* Assume sufficient socket buffering for this message */
-        rc = syscall(SYS_write, sockfd, &hdr, sizeof(hdr));
-        if (rc <= 0) {
-                CERROR ("Error %d sending HELLO to "LPX64"\n", rc, *nid);
-                return (rc);
-        }
+        /* Assume sufficient socket buffering for these messages... */
+        rc = tcpnal_write(nid, sockfd, &cr, sizeof(cr));
+        if (rc != 0)
+                return -1;
 
-        rc = syscall(SYS_read, sockfd, hmv, sizeof(*hmv));
-        if (rc <= 0) {
-                CERROR ("Error %d reading HELLO from "LPX64"\n", rc, *nid);
-                return (rc);
-        }
+        rc = tcpnal_write(nid, sockfd, &hdr, sizeof(hdr));
+        if (rc != 0)
+                return -1;
+
+        rc = tcpnal_read(nid, sockfd, &hmv, sizeof(hmv));
+        if (rc != 0)
+                return -1;
         
-        if (hmv->magic != le32_to_cpu(PORTALS_PROTO_MAGIC)) {
-                CERROR ("Bad magic %#08x (%#08x expected) from "LPX64"\n",
-                        cpu_to_le32(hmv->magic), PORTALS_PROTO_MAGIC, *nid);
-                return (-EPROTO);
+        if (hmv.magic != le32_to_cpu(LNET_PROTO_TCP_MAGIC)) {
+                CERROR ("Bad magic %#08x (%#08x expected) from %s\n",
+                        cpu_to_le32(hmv.magic), LNET_PROTO_TCP_MAGIC, 
+                        libcfs_nid2str(nid));
+                return -1;
         }
 
-        if (hmv->version_major != cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) ||
-            hmv->version_minor != cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) {
+        if (hmv.version_major != cpu_to_le16 (LNET_PROTO_TCP_VERSION_MAJOR) ||
+            hmv.version_minor != cpu_to_le16 (LNET_PROTO_TCP_VERSION_MINOR)) {
                 CERROR ("Incompatible protocol version %d.%d (%d.%d expected)"
-                        " from "LPX64"\n",
-                        le16_to_cpu (hmv->version_major),
-                        le16_to_cpu (hmv->version_minor),
-                        PORTALS_PROTO_VERSION_MAJOR,
-                        PORTALS_PROTO_VERSION_MINOR,
-                        *nid);
-                return (-EPROTO);
+                        " from %s\n",
+                        le16_to_cpu (hmv.version_major),
+                        le16_to_cpu (hmv.version_minor),
+                        LNET_PROTO_TCP_VERSION_MAJOR,
+                        LNET_PROTO_TCP_VERSION_MINOR,
+                        libcfs_nid2str(nid));
+                return -1;
         }
 
-#if (PORTALS_PROTO_VERSION_MAJOR != 1)
+#if (LNET_PROTO_TCP_VERSION_MAJOR != 1)
 # error "This code only understands protocol version 1.x"
 #endif
         /* version 1 sends magic/version as the dest_nid of a 'hello' header,
          * so read the rest of it in now... */
 
-        rc = syscall(SYS_read, sockfd, hmv + 1, sizeof(hdr) - sizeof(*hmv));
-        if (rc <= 0) {
-                CERROR ("Error %d reading rest of HELLO hdr from "LPX64"\n",
-                        rc, *nid);
-                return (rc);
-        }
+        rc = tcpnal_read(nid, sockfd, ((char *)&hdr) + sizeof (hmv),
+                         sizeof(hdr) - sizeof(hmv));
+        if (rc != 0)
+                return -1;
 
         /* ...and check we got what we expected */
-        if (hdr.type != cpu_to_le32 (PTL_MSG_HELLO)) {
+        if (hdr.type != cpu_to_le32 (LNET_MSG_HELLO)) {
                 CERROR ("Expecting a HELLO hdr "
-                        " but got type %d with %d payload from "LPX64"\n",
+                        " but got type %d with %d payload from %s\n",
                         le32_to_cpu (hdr.type),
-                        le32_to_cpu (hdr.payload_length), *nid);
-                return (-EPROTO);
+                        le32_to_cpu (hdr.payload_length), libcfs_nid2str(nid));
+                return -1;
         }
 
-        if (le64_to_cpu(hdr.src_nid) == PTL_NID_ANY) {
-                CERROR("Expecting a HELLO hdr with a NID, but got PTL_NID_ANY\n");
-                return (-EPROTO);
+        if (le64_to_cpu(hdr.src_nid) == LNET_NID_ANY) {
+                CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY\n");
+                return -1;
         }
 
-        if (*nid == PTL_NID_ANY) {              /* don't know peer's nid yet */
-                *nid = le64_to_cpu(hdr.src_nid);
-        } else if (*nid != le64_to_cpu (hdr.src_nid)) {
-                CERROR ("Connected to nid "LPX64", but expecting "LPX64"\n",
-                        le64_to_cpu (hdr.src_nid), *nid);
-                return (-EPROTO);
+        if (nid != le64_to_cpu (hdr.src_nid)) {
+                CERROR ("Connected to %s, but expecting %s\n",
+                        libcfs_nid2str(le64_to_cpu (hdr.src_nid)), 
+                        libcfs_nid2str(nid));
+                return -1;
         }
 
         /* Ignore any interface info in the payload */
         nob = le32_to_cpu(hdr.payload_length);
-        if (nob > getpagesize()) {
-                CERROR("Unexpected HELLO payload %d from "LPX64"\n",
-                       nob, *nid);
-                return (-EPROTO);
-        }
-        if (nob > 0) {
-                char *space = (char *)malloc(nob);
-                
-                if (space == NULL) {
-                        CERROR("Can't allocate scratch buffer %d\n", nob);
-                        return (-ENOMEM);
-                }
-                
-                rc = syscall(SYS_read, sockfd, space, nob);
-                if (rc <= 0) {
-                        CERROR("Error %d skipping HELLO payload from "
-                               LPX64"\n", rc, *nid);
-                        return (rc);
-                }
+        if (nob != 0) {
+                CERROR("Unexpected HELLO payload %d from %s\n",
+                       nob, libcfs_nid2str(nid));
+                return -1;
         }
 
-        return (0);
+        return 0;
 }
 
 /* Function:  force_tcp_connection
@@ -323,44 +364,81 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation)
  * Returns: an allocated connection structure, either
  *          a pre-existing one, or a new connection
  */
-connection force_tcp_connection(manager m,
-                                unsigned int ip,
-                                unsigned short port,
+connection force_tcp_connection(manager    m,
+                                lnet_nid_t nid,
                                 procbridge pb)
 {
-    connection conn;
+    unsigned int       ip = LNET_NIDADDR(nid);
+    connection         conn;
     struct sockaddr_in addr;
     struct sockaddr_in locaddr; 
-    unsigned int id[2];
-    struct timeval tv;
-    __u64 incarnation;
-
-    int fd;
-    int option;
-    int rc;
-    int rport;
-    ptl_nid_t peernid = PTL_NID_ANY;
-
-    port = tcpnal_acceptor_port;
-
-    id[0] = ip;
-    id[1] = port;
+    int                fd;
+    int                option;
+    int                rc;
+    int                sz;
 
     pthread_mutex_lock(&m->conn_lock);
 
-    conn = hash_table_find(m->connections, id);
+    conn = hash_table_find(m->connections, &nid);
     if (conn)
             goto out;
 
     memset(&addr, 0, sizeof(addr));
     addr.sin_family      = AF_INET;
     addr.sin_addr.s_addr = htonl(ip);
-    addr.sin_port        = htons(port);
+    addr.sin_port        = htons(tcpnal_acceptor_port);
 
     memset(&locaddr, 0, sizeof(locaddr)); 
     locaddr.sin_family = AF_INET; 
     locaddr.sin_addr.s_addr = INADDR_ANY;
+    locaddr.sin_port = htons(m->port);
+
+#if 1 /* tcpnal connects from a non-privileged port */
+    fd = socket(AF_INET, SOCK_STREAM, 0);
+    if (fd < 0) {
+            perror("tcpnal socket failed");
+            goto out;
+    } 
+
+    option = 1;
+    rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 
+                    &option, sizeof(option));
+    if (rc != 0) {
+            perror ("Can't set SO_REUSEADDR for socket"); 
+            close(fd);
+            goto out;
+    } 
+
+    if (m->port != 0) {
+            /* Bind all subsequent connections to the same port */
+            rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr));
+            if (rc != 0) {
+                    perror("Error binding port");
+                    close(fd);
+                    goto out;
+            }
+    }
+    
+    rc = connect(fd, (struct sockaddr *)&addr,
+                 sizeof(struct sockaddr_in));
+    if (rc != 0) {
+            perror("Error connecting to remote host");
+            close(fd);
+            goto out;
+    }
+
+    sz = sizeof(locaddr);
+    rc = getsockname(fd, (struct sockaddr *)&locaddr, &sz);
+    if (rc != 0) {
+            perror ("Error on getsockname");
+            close(fd);
+            goto out;
+    }
 
+    if (m->port == 0)
+            m->port = ntohs(locaddr.sin_port);
+    
+#else
     for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) {
             fd = socket(AF_INET, SOCK_STREAM, 0);
             if (fd < 0) {
@@ -401,24 +479,22 @@ connection force_tcp_connection(manager m,
             fprintf(stderr, "Out of ports trying to bind to a reserved port\n");
             goto out;
     }
+#endif
     
-#if 1
-    option = 1;
+    option = tcpnal_nagle ? 0 : 1;
     setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option));
-    option = 1<<20;
-    setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option));
-    option = 1<<20;
-    setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
-#endif
-   
-    gettimeofday(&tv, NULL);
-    incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
-
+    option = tcpnal_buffer_size;
+    if (option != 0) {
+            setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option));
+            option = tcpnal_buffer_size;
+            setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option));
+    }
+    
     /* say hello */
-    if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation))
-            exit(-1);
+    if (tcpnal_hello(fd, nid))
+            goto out;
     
-    conn = allocate_connection(m, ip, port, fd);
+    conn = allocate_connection(m, nid, fd);
     
     /* let nal thread know this event right away */
     if (conn)
@@ -430,6 +506,30 @@ out:
 }
 
 
+#if 0                                           /* we don't accept connections */
+/* Function:  new_connection
+ * Arguments: t: opaque argument holding the tcpnal state
+ * Returns: 1 in order to reregister for new connection requests
+ *
+ *  called when the bound service socket receives
+ *     a new connection request, it always accepts and
+ *     installs a new connection
+ */
+static int new_connection(void *z)
+{
+    manager m=z;
+    struct sockaddr_in s;
+    int len=sizeof(struct sockaddr_in);
+    int fd=accept(m->bound,(struct sockaddr *)&s,&len);
+    unsigned int nid=*((unsigned int *)&s.sin_addr);
+    /* cfs specific hack */
+    //unsigned short pid=s.sin_port;
+    pthread_mutex_lock(&m->conn_lock);
+    allocate_connection(m,htonl(nid),0/*pid*/,fd);
+    pthread_mutex_unlock(&m->conn_lock);
+    return(1);
+}
+
 /* Function:  bind_socket
  * Arguments: t: the nal state for this interface
  *            port: the port to attempt to bind to
@@ -455,7 +555,7 @@ static int bind_socket(manager m,unsigned short port)
     addr.sin_port        = htons(port);
 
     if (bind(m->bound,(struct sockaddr *)&addr,alen)<0){
-        fprintf(stderr, "tcpnal bind: %s port %u\n", strerror(errno), port); 
+        perror ("tcpnal bind"); 
         return(0);
     }
     
@@ -467,6 +567,7 @@ static int bind_socket(manager m,unsigned short port)
     m->port=addr.sin_port;
     return(1);
 }
+#endif
 
 
 /* Function:  shutdown_connections
@@ -476,32 +577,37 @@ static int bind_socket(manager m,unsigned short port)
  */
 void shutdown_connections(manager m)
 {
-    close(m->bound);
-    remove_io_handler(m->bound_handler);
-    hash_destroy_table(m->connections,remove_connection);
-    free(m);
+#if 0
+        /* we don't accept connections */
+        close(m->bound);
+        remove_io_handler(m->bound_handler);
+#endif
+        hash_destroy_table(m->connections,close_connection);
+        free(m);
 }
 
 
 /* Function:  init_connections
  * Arguments: t: the nal state for this interface
- *            port: the port to attempt to bind to
  * Returns: a newly allocated manager structure, or
  *          zero if the fixed port could not be bound
  */
-manager init_connections(unsigned short pid,
-                         int (*input)(void *, void *),
-                         void *a)
+manager init_connections(int (*input)(void *, void *), void *a)
 {
     manager m = (manager)malloc(sizeof(struct manager));
+
     m->connections = hash_create_table(compare_connection,connection_key);
     m->handler = input;
     m->handler_arg = a;
+    m->port = 0;                                /* set on first connection */
     pthread_mutex_init(&m->conn_lock, 0);
 
+    return m;
+#if 0
     if (bind_socket(m,pid))
         return(m);
 
     free(m);
     return(0);
+#endif
 }
index 343ffa6..0c4718e 100644 (file)
 #include <procbridge.h>
 
 typedef struct manager {
-    table connections;
+    table           connections;
     pthread_mutex_t conn_lock; /* protect connections table */
-    int bound;
-    io_handler bound_handler;
-    int (*handler)(void *, void *);
-    void *handler_arg;
-    unsigned short port;
+#if 0                          /* we don't accept connections */
+    int             bound;
+    io_handler      bound_handler;
+#endif
+    int           (*handler)(void *, void *);
+    void           *handler_arg;
+    int             port;
 } *manager;
 
 
 typedef struct connection {
-    unsigned int ip;
-    unsigned short port;
-    int fd;
-    manager m;
+        lnet_nid_t      peer_nid;
+        int            fd;
+        manager        m;
 } *connection;
 
-connection force_tcp_connection(manager m, unsigned int ip, unsigned int short,
-                                procbridge pb);
-manager init_connections(unsigned short, int (*f)(void *, void *), void *);
+connection force_tcp_connection(manager m, lnet_nid_t nid, procbridge pb);
+manager init_connections(int (*f)(void *, void *), void *);
 void remove_connection(void *arg);
 void shutdown_connections(manager m);
 int read_connection(connection c, unsigned char *dest, int len);
diff --git a/lnet/ulnds/socklnd/debug.c b/lnet/ulnds/socklnd/debug.c
deleted file mode 100644 (file)
index b82bb2f..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Copyright (C) 2002 Cluster File Systems, Inc.
- *   Author: Phil Schwan <phil@clusterfs.com>
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <stdio.h>
-#include <fcntl.h>
-#include <errno.h>
-#include <stdarg.h>
-#include <sys/time.h>
-
-int smp_processor_id = 1;
-char debug_file_path[1024] = "/tmp/lustre-log";
-char debug_file_name[1024];
-FILE *debug_file_fd;
-
-int portals_do_debug_dumplog(void *arg)
-{
-        printf("Look in %s\n", debug_file_name);
-        return 0;
-}
-
-
-void portals_debug_print(void)
-{
-        return;
-}
-
-
-void portals_debug_dumplog(void)
-{
-        printf("Look in %s\n", debug_file_name);
-        return;
-}
-
-
-int portals_debug_init(unsigned long bufsize)
-{ 
-        debug_file_fd = stdout;
-        return 0;
-}
-
-int portals_debug_cleanup(void)
-{
-        return 0; //close(portals_debug_fd);
-}
-
-int portals_debug_clear_buffer(void)
-{
-        return 0;
-}
-
-int portals_debug_mark_buffer(char *text)
-{
-
-        fprintf(debug_file_fd, "*******************************************************************************\n");
-        fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text);
-        fprintf(debug_file_fd, "*******************************************************************************\n");
-
-        return 0;
-}
-
-int portals_debug_copy_to_user(char *buf, unsigned long len)
-{
-        return 0;
-}
-
-/* FIXME: I'm not very smart; someone smarter should make this better. */
-void
-portals_debug_msg (int subsys, int mask, char *file, const char *fn, 
-                   const int line, const char *format, ...)
-{
-        va_list       ap;
-        unsigned long flags;
-        struct timeval tv;
-        int nob;
-
-
-        /* NB since we pass a non-zero sized buffer (at least) on the first
-         * print, we can be assured that by the end of all the snprinting,
-         * we _do_ have a terminated buffer, even if our message got truncated.
-         */
-
-        gettimeofday(&tv, NULL);
-
-        nob += fprintf(debug_file_fd,
-                              "%02x:%06x:%d:%lu.%06lu ",
-                              subsys >> 24, mask, smp_processor_id,
-                              tv.tv_sec, tv.tv_usec);
-
-        nob += fprintf(debug_file_fd,
-                            "(%s:%d:%s() %d+%ld): ",
-                            file, line, fn, 0,
-                            8192 - ((unsigned long)&flags & 8191UL));
-
-        va_start (ap, format);
-        nob += fprintf(debug_file_fd, format, ap);
-        va_end (ap);
-
-
-}
-
index a8f916d..300f33b 100644 (file)
@@ -41,6 +41,4 @@ when now(void);
 /*
  * hacking for CFS internal MPI testing
  */ 
-#if !CRAY_PORTALS
 #define ENABLE_SELECT_DISPATCH
-#endif
diff --git a/lnet/ulnds/socklnd/ipmap.h b/lnet/ulnds/socklnd/ipmap.h
deleted file mode 100644 (file)
index 85b1e18..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *
- *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- */
-
-#define DIRECT_IP_MODE
-#ifdef DIRECT_IP_MODE
-#define PNAL_NID(in_addr, port) (in_addr)
-#define PNAL_PID(pid) (pid)
-#define PNAL_IP(in_addr, port) (in_addr)
-#define PNAL_PORT(nid, pid) (pid)
-#else
-
-#define PNAL_BASE_PORT 4096
-#define PNAL_HOSTID_SHIFT 24
-#define PNAL_HOSTID_MASK ((1 << PNAL_HOSTID_SHIFT) - 1)
-#define PNAL_VNODE_SHIFT 8
-#define PNAL_VNODE_MASK ((1 << PNAL_VNODE_SHIFT) - 1)
-#define PNAL_PID_SHIFT 8
-#define PNAL_PID_MASK ((1 << PNAL_PID_SHIFT) - 1)
-
-#define PNAL_NID(in_addr, port) (((ntohl(in_addr) & PNAL_HOSTID_MASK) \
-                                    << PNAL_VNODE_SHIFT) \
-                                   | (((ntohs(port)-PNAL_BASE_PORT) >>\
-                                       PNAL_PID_SHIFT)))
-#define PNAL_PID(port) ((ntohs(port) - PNAL_BASE_PORT)  & PNAL_PID_MASK)
-
-#define PNAL_IP(nid,t)  (htonl((((unsigned)(nid))\
-                                >> PNAL_VNODE_SHIFT)\
-                               | (t->iptop8 << PNAL_HOSTID_SHIFT)))
-#define PNAL_PORT(nid, pid) (htons(((((nid) & PNAL_VNODE_MASK) \
-                                 << PNAL_VNODE_SHIFT) \
-                                | ((pid) & PNAL_PID_MASK)) \
-                               + PNAL_BASE_PORT))
-#endif
index 6b471c0..5fd5f46 100644 (file)
 #include <unistd.h>
 #include <string.h>
 #ifndef __CYGWIN__
-#include <syscall.h>
+# include <syscall.h>
 #endif
+#include <netdb.h>
 #include <sys/socket.h>
+#include <netinet/in.h>
 #include <procbridge.h>
 #include <pqtimer.h>
 #include <dispatch.h>
 #include <errno.h>
+#ifdef HAVE_GETHOSTBYNAME
+# include <sys/utsname.h>
+#endif
 
+#if !HAVE_LIBPTHREAD
+# error "This LND requires a multi-threaded runtime"
+#endif
 
 /* XXX CFS workaround, to give a chance to let nal thread wake up
  * from waiting in select
@@ -60,17 +68,26 @@ void procbridge_wakeup_nal(procbridge p)
     syscall(SYS_write, p->notifier[0], buf, sizeof(buf));
 }
 
+lnd_t the_tcplnd = {
+        .lnd_type      = SOCKLND,
+        .lnd_startup   = procbridge_startup,
+        .lnd_shutdown  = procbridge_shutdown,
+        .lnd_send      = tcpnal_send,
+        .lnd_recv      = tcpnal_recv,
+        .lnd_notify    = tcpnal_notify,
+};
+int       tcpnal_running;
+
 /* Function: shutdown
- * Arguments: nal: a pointer to my top side nal structure
- *            ni: my network interface index
+ * Arguments: ni: the instance of me
  *
  * cleanup nal state, reclaim the lower side thread and
  *   its state using PTL_FINI codepoint
  */
-static void procbridge_shutdown(nal_t *n)
+void
+procbridge_shutdown(lnet_ni_t *ni)
 {
-    lib_nal_t *nal = n->nal_data;
-    bridge b=(bridge)nal->libnal_data;
+    bridge b=(bridge)ni->ni_data;
     procbridge p=(procbridge)b->local;
 
     p->nal_flags |= NAL_FLAG_STOPPING;
@@ -87,68 +104,51 @@ static void procbridge_shutdown(nal_t *n)
     } while (1);
 
     free(p);
+    tcpnal_running = 0;
 }
 
-
-/* forward decl */
-extern int procbridge_startup (nal_t *, ptl_pid_t,
-                               ptl_ni_limits_t *, ptl_ni_limits_t *);
-
-/* api_nal
- *  the interface vector to allow the generic code to access
- *  this nal. this is seperate from the library side lib_nal.
- *  TODO: should be dyanmically allocated
- */
-nal_t procapi_nal = {
-    nal_data: NULL,
-    nal_ni_init: procbridge_startup,
-    nal_ni_fini: procbridge_shutdown,
-};
-
-ptl_nid_t tcpnal_mynid;
-
 #ifdef ENABLE_SELECT_DISPATCH
 procbridge __global_procbridge = NULL;
 #endif
 
 /* Function: procbridge_startup
  *
- * Arguments:  pid: requested process id (port offset)
- *                  PTL_ID_ANY not supported.
- *             desired: limits passed from the application
- *                      and effectively ignored
- *             actual:  limits actually allocated and returned
+ * Arguments:  ni:          the instance of me
+ *             interfaces:  ignored
  *
  * Returns: portals rc
  *
  * initializes the tcp nal. we define unix_failure as an
  * error wrapper to cut down clutter.
  */
-int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid,
-                        ptl_ni_limits_t *requested_limits,
-                        ptl_ni_limits_t *actual_limits)
+int
+procbridge_startup (lnet_ni_t *ni)
 {
-    nal_init_args_t args;
-
     procbridge p;
-    bridge b;
-    /* XXX nal_type is purely private to tcpnal here */
-    int nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */
-
-    LASSERT(nal == &procapi_nal);
-
+    bridge     b;
+    int        rc;
+
+    /* NB The local NID is not assigned.  We only ever connect to the socknal,
+     * which assigns the src nid/pid on incoming non-privileged connections
+     * (i.e. us), and we don't accept connections. */
+
+    LASSERT (ni->ni_lnd == &the_tcplnd);
+    LASSERT (!tcpnal_running);                  /* only single instance supported */
+    LASSERT (ni->ni_interfaces[0] == NULL);     /* explicit interface(s) not supported */
+
+    /* The credit settings here are pretty irrelevant.  Userspace tcplnd has no
+     * tx descriptor pool to exhaust and does a blocking send; that's the real
+     * limit on send concurrency. */
+    ni->ni_maxtxcredits = 1000;
+    ni->ni_peertxcredits = 1000;
+    
     init_unix_timer();
 
     b=(bridge)malloc(sizeof(struct bridge));
     p=(procbridge)malloc(sizeof(struct procbridge));
     b->local=p;
-
-    args.nia_requested_pid = requested_pid;
-    args.nia_requested_limits = requested_limits;
-    args.nia_actual_limits = actual_limits;
-    args.nia_nal_type = nal_type;
-    args.nia_bridge = b;
-    args.nia_apinal = nal;
+    b->b_ni = ni;
+    ni->ni_data = b;
 
     /* init procbridge */
     pthread_mutex_init(&p->mutex,0);
@@ -158,13 +158,14 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid,
     /* initialize notifier */
     if (socketpair(AF_UNIX, SOCK_STREAM, 0, p->notifier)) {
         perror("socketpair failed");
-        return PTL_FAIL;
+        rc = -errno;
+        return rc;
     }
 
     if (!register_io_handler(p->notifier[1], READ_HANDLER,
                 procbridge_notifier_handler, p)) {
         perror("fail to register notifier handler");
-        return PTL_FAIL;
+        return -ENOMEM;
     }
 
 #ifdef ENABLE_SELECT_DISPATCH
@@ -172,9 +173,10 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid,
 #endif
 
     /* create nal thread */
-    if (pthread_create(&p->t, NULL, nal_thread, &args)) {
+    rc = pthread_create(&p->t, NULL, nal_thread, b);
+    if (rc != 0) {
         perror("nal_init: pthread_create");
-        return PTL_FAIL;
+        return -ESRCH;
     }
 
     do {
@@ -188,9 +190,9 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid,
     } while (1);
 
     if (p->nal_flags & NAL_FLAG_STOPPED)
-        return PTL_FAIL;
+        return -ENETDOWN;
 
-    b->lib_nal->libnal_ni.ni_pid.nid = tcpnal_mynid;
+    tcpnal_running = 1;
 
-    return PTL_OK;
+    return 0;
 }
index 1f91ced..2dd534b 100644 (file)
@@ -12,7 +12,6 @@
 
 #include <pthread.h>
 #include <bridge.h>
-#include <ipmap.h>
 
 
 #define NAL_FLAG_RUNNING        1
@@ -33,24 +32,27 @@ typedef struct procbridge {
 } *procbridge;
 
 typedef struct nal_init_args {
-    ptl_pid_t        nia_requested_pid;
-    ptl_ni_limits_t *nia_requested_limits;
-    ptl_ni_limits_t *nia_actual_limits;
-    int              nia_nal_type;
+    lnet_pid_t        nia_requested_pid;
     bridge           nia_bridge;
-    nal_t           *nia_apinal;
 } nal_init_args_t;
 
 extern void *nal_thread(void *);
 
+extern void procbridge_wakeup_nal(procbridge p);
+
+extern int procbridge_startup (lnet_ni_t *);
+extern void procbridge_shutdown (lnet_ni_t *);
+
+extern void tcpnal_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive);
+
+extern int tcpnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int tcpnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *cookie,
+                int delayed, unsigned int niov,
+                struct iovec *iov, lnet_kiov_t *kiov,
+                unsigned int offset, unsigned int mlen, unsigned int rlen);
+extern int tcpnal_set_global_params();
 
-#define PTL_INIT        (LIB_MAX_DISPATCH+1)
-#define PTL_FINI        (LIB_MAX_DISPATCH+2)
 
-#define MAX_ACLS        1
-#define MAX_PTLS        128
 
-extern void set_address(bridge t,ptl_pid_t pidrequest);
-extern void procbridge_wakeup_nal(procbridge p);
 
 #endif
index 7ee7c71..01faf05 100644 (file)
 
 /* the following functions are stubs to satisfy the nal definition
    without doing anything particularily useful*/
-
-static int nal_dist(lib_nal_t *nal,
-                    ptl_nid_t nid,
-                    unsigned long *dist)
-{
-    return 0;
-}
+extern int tcpnal_init(bridge);
+extern void tcpnal_shutdown(bridge);
 
 static void check_stopping(void *z)
 {
@@ -58,6 +53,8 @@ static void check_stopping(void *z)
     if ((p->nal_flags & NAL_FLAG_STOPPING) == 0)
             return;
     
+    tcpnal_shutdown(b);
+
     pthread_mutex_lock(&p->mutex);
     p->nal_flags |= NAL_FLAG_STOPPED;
     pthread_cond_broadcast(&p->cond);
@@ -79,53 +76,27 @@ static void check_stopping(void *z)
  *  We define a limit macro to place a ceiling on limits
  *   for syntactic convenience
  */
-extern int tcpnal_init(bridge);
-
-nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0};
 
 void *nal_thread(void *z)
 {
-    nal_init_args_t *args = (nal_init_args_t *) z;
-    bridge b = args->nia_bridge;
+    bridge b = (bridge) z;
     procbridge p=b->local;
     int rc;
-    ptl_process_id_t process_id;
-    int nal_type;
     
-    b->lib_nal=(lib_nal_t *)malloc(sizeof(lib_nal_t));
-    b->lib_nal->libnal_data=b;
-    b->lib_nal->libnal_map=NULL;
-    b->lib_nal->libnal_unmap=NULL;
-    b->lib_nal->libnal_dist=nal_dist;
-
-    nal_type = args->nia_nal_type;
-
-    /* Wierd, but this sets b->lib_nal->libnal_ni.ni_pid.{nid,pid}, which
-     * lib_init() is about to do from the process_id passed to it...*/
-    set_address(b,args->nia_requested_pid);
-
-    process_id = b->lib_nal->libnal_ni.ni_pid;
-    
-    if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b);
-    /* initialize the generic 'library' level code */
-
-    rc = lib_init(b->lib_nal, args->nia_apinal, 
-                  process_id, 
-                  args->nia_requested_limits, 
-                  args->nia_actual_limits);
+    rc = tcpnal_init(b);
 
     /*
      * Whatever the initialization returned is passed back to the
      * user level code for further interpretation.  We just exit if
      * it is non-zero since something went wrong.
      */
-    /* this should perform error checking */
+
     pthread_mutex_lock(&p->mutex);
-    p->nal_flags |= (rc != PTL_OK) ? NAL_FLAG_STOPPED : NAL_FLAG_RUNNING;
+    p->nal_flags |= (rc != 0) ? NAL_FLAG_STOPPED : NAL_FLAG_RUNNING;
     pthread_cond_broadcast(&p->cond);
     pthread_mutex_unlock(&p->mutex);
 
-    if (rc == PTL_OK) {
+    if (rc == 0) {
         /* the thunk function is called each time the timer loop
            performs an operation and returns to blocking mode. we
            overload this function to inform the api side that
index 49c770f..42c9bc1 100644 (file)
@@ -25,7 +25,7 @@
  *  io events through the select system call.
  */
 
-#define DEBUG_SUBSYSTEM S_NAL
+#define DEBUG_SUBSYSTEM S_LND
 
 #ifdef sun
 #include <sys/filio.h>
@@ -320,7 +320,7 @@ again:
     }
 
     /* XXX only compile for linux */
-#if __WORDSIZE == 64
+#if (__WORDSIZE == 64) && !defined(__mips64__)
     nready = syscall(SYS_select, max, &fds[0], &fds[1], &fds[2],
                      select_timeout);
 #else
index 662775a..eb390c4 100644 (file)
@@ -110,7 +110,7 @@ unsigned int key_from_string(char *s)
  * Returns: a pointer to the new table
  */
 table hash_create_table (int (*compare_function)(void *, void *),
-                    unsigned int (*key_function)(unsigned int *))
+                    unsigned int (*key_function)(void *))
 {
     table new=(table)malloc(sizeof(struct table));
     memset(new, 0, sizeof(struct table));
index 7fab586..0cb9669 100644 (file)
@@ -22,13 +22,14 @@ typedef struct table {
   int number_of_entries;
   table_entry *entries;
   int (*compare_function)(void *, void *);
-  unsigned int (*key_function)(unsigned int *);
+  unsigned int (*key_function)(void *);
 } *table;
 
 /* table.c */
 unsigned int key_from_int(int i);
 unsigned int key_from_string(char *s);
-table hash_create_table(int (*compare_function)(void *, void *), unsigned int (*key_function)(unsigned int *));
+table hash_create_table(int (*compare_function)(void *, void *), 
+                        unsigned int (*key_function)(void *));
 void *hash_table_find(table t, void *comparator);
 void hash_table_insert(table t, void *value, void *comparator);
 void hash_table_remove(table t, void *comparator);
index abb6d01..bd73fb2 100644 (file)
 #include <netinet/in.h>
 #include <pqtimer.h>
 #include <dispatch.h>
-#include <bridge.h>
-#include <ipmap.h>
+#include <procbridge.h>
 #include <connection.h>
-#include <pthread.h>
 #include <errno.h>
+
 #ifndef __CYGWIN__
 #include <syscall.h>
 #endif
 
-/* Function:  tcpnal_send
- * Arguments: nal:     pointer to my nal control block
- *            private: unused
- *            cookie:  passed back to the portals library
- *            hdr:     pointer to the portals header
- *            nid:     destination node
- *            pid:     destination process
- *            data:    body of the message
- *            len:     length of the body
- * Returns: zero on success
- *
+void
+tcpnal_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive)
+{
+        bridge     b = (bridge)ni->ni_data;
+        connection c;
+
+        if (!alive) {
+                LBUG();
+        }
+
+        c = force_tcp_connection((manager)b->lower, nid, b->local);
+        if (c == NULL)
+                CERROR("Can't create connection to %s\n",
+                       libcfs_nid2str(nid));
+}
+
+/*
  * sends a packet to the peer, after insuring that a connection exists
  */
-ptl_err_t tcpnal_send(lib_nal_t *n,
-                      void *private,
-                      lib_msg_t *cookie,
-                      ptl_hdr_t *hdr,
-                      int type,
-                      ptl_nid_t nid,
-                      ptl_pid_t pid,
-                      unsigned int niov,
-                      struct iovec *iov,
-                      size_t offset,
-                      size_t len)
+int tcpnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg)
 {
-    connection c;
-    bridge b=(bridge)n->libnal_data;
-    struct iovec tiov[257];
-    static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER;
-    ptl_err_t rc = PTL_OK;
-    int   sysrc;
-    int   total;
-    int   ntiov;
-    int i;
-
-    if (!(c=force_tcp_connection((manager)b->lower,
-                                 PNAL_IP(nid,b),
-                                 PNAL_PORT(nid,pid),
-                                 b->local)))
-        return(PTL_FAIL);
-
-    /* TODO: these results should be checked. furthermore, provision
-       must be made for the SIGPIPE which is delivered when
-       writing on a tcp socket which has closed underneath
-       the application. there is a linux flag in the sendmsg
-       call which turns off the signally behaviour, but its
-       nonstandard */
-
-    LASSERT (niov <= 256);
-
-    tiov[0].iov_base = hdr;
-    tiov[0].iov_len = sizeof(ptl_hdr_t);
-    ntiov = 1 + lib_extract_iov(256, &tiov[1], niov, iov, offset, len);
-
-    pthread_mutex_lock(&send_lock);
+        lnet_hdr_t        *hdr = &lntmsg->msg_hdr;
+        lnet_process_id_t  target = lntmsg->msg_target;
+        unsigned int       niov = lntmsg->msg_niov;
+        struct iovec      *iov = lntmsg->msg_iov;
+        unsigned int       offset = lntmsg->msg_offset;
+        unsigned int       len = lntmsg->msg_len;
+
+        connection c;
+        bridge b = (bridge)ni->ni_data;
+        struct iovec tiov[257];
+        static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER;
+        int rc = 0;
+        int   sysrc;
+        int   total;
+        int   ntiov;
+        int i;
+
+        if (!(c = force_tcp_connection((manager)b->lower, target.nid,
+                                       b->local)))
+                return(-EIO);
+
+        /* TODO: these results should be checked. furthermore, provision
+           must be made for the SIGPIPE which is delivered when
+           writing on a tcp socket which has closed underneath
+           the application. there is a linux flag in the sendmsg
+           call which turns off the signally behaviour, but its
+           nonstandard */
+
+        LASSERT (niov <= 256);
+        LASSERT (len == 0 || iov != NULL);      /* I don't understand kiovs */
+
+        tiov[0].iov_base = hdr;
+        tiov[0].iov_len = sizeof(lnet_hdr_t);
+        ntiov = 1 + lnet_extract_iov(256, &tiov[1], niov, iov, offset, len);
+
+        pthread_mutex_lock(&send_lock);
 #if 1
-    for (i = total = 0; i < ntiov; i++)
-            total += tiov[i].iov_len;
-    
-    sysrc = syscall(SYS_writev, c->fd, tiov, ntiov);
-    if (sysrc != total) {
-            fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
-                     rc, total, errno);
-            rc = PTL_FAIL;
-    }
+        for (i = total = 0; i < ntiov; i++)
+                total += tiov[i].iov_len;
+
+        sysrc = syscall(SYS_writev, c->fd, tiov, ntiov);
+        if (sysrc != total) {
+                fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
+                         rc, total, errno);
+                rc = -errno;
+        }
 #else
-    for (i = total = 0; i <= ntiov; i++) {
-            rc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0);
-            
-            if (rc != tiov[i].iov_len) {
-                    fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
-                             rc, tiov[i].iov_len, errno);
-                    rc = PTL_FAIL;
-                    break;
-            }
-            total += rc;
-    }
+        for (i = total = 0; i <= ntiov; i++) {
+                rc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0);
+
+                if (rc != tiov[i].iov_len) {
+                        fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
+                                 rc, tiov[i].iov_len, errno);
+                        rc = -errno;
+                        break;
+                }
+                total += rc;
+        }
 #endif
 #if 0
-    fprintf (stderr, "sent %s total %d in %d frags\n", 
-             hdr->type == PTL_MSG_ACK ? "ACK" :
-             hdr->type == PTL_MSG_PUT ? "PUT" :
-             hdr->type == PTL_MSG_GET ? "GET" :
-             hdr->type == PTL_MSG_REPLY ? "REPLY" :
-             hdr->type == PTL_MSG_HELLO ? "HELLO" : "UNKNOWN",
-             total, niov + 1);
+        fprintf (stderr, "sent %s total %d in %d frags\n",
+                 hdr->type == LNET_MSG_ACK ? "ACK" :
+                 hdr->type == LNET_MSG_PUT ? "PUT" :
+                 hdr->type == LNET_MSG_GET ? "GET" :
+                 hdr->type == LNET_MSG_REPLY ? "REPLY" :
+                 hdr->type == LNET_MSG_HELLO ? "HELLO" : "UNKNOWN",
+                 total, niov + 1);
 #endif
-    pthread_mutex_unlock(&send_lock);
+        pthread_mutex_unlock(&send_lock);
 
-    if (rc == PTL_OK) {
-            /* NB the NAL only calls lib_finalize() if it returns PTL_OK
-             * from cb_send() */
-            lib_finalize(n, private, cookie, PTL_OK);
-    }
+        if (rc == 0) {
+                /* NB the NAL only calls lnet_finalize() if it returns 0
+                 * from cb_send() */
+                lnet_finalize(ni, lntmsg, 0);
+        }
 
-    return(rc);
+        return(rc);
 }
 
 
-/* Function:  tcpnal_recv
- * Arguments: lib_nal_t *nal:    pointer to my nal control block
- *            void *private:     connection pointer passed through
- *                               lib_parse()
- *            lib_msg_t *cookie: passed back to portals library
- *            user_ptr data:     pointer to the destination buffer
- *            size_t mlen:       length of the body
- *            size_t rlen:       length of data in the network
- * Returns: zero on success
- *
- * blocking read of the requested data. must drain out the
- * difference of mainpulated and requested lengths from the network
- */
-ptl_err_t tcpnal_recv(lib_nal_t *n,
-                      void *private,
-                      lib_msg_t *cookie,
-                      unsigned int niov,
-                      struct iovec *iov,
-                      size_t offset,
-                      size_t mlen,
-                      size_t rlen)
-
+int tcpnal_recv(lnet_ni_t     *ni,
+                void         *private,
+                lnet_msg_t   *cookie,
+                int           delayed,
+                unsigned int  niov,
+                struct iovec *iov,
+                lnet_kiov_t  *kiov,
+                unsigned int  offset,
+                unsigned int  mlen,
+                unsigned int  rlen)
 {
-    struct iovec tiov[256];
-    int ntiov;
-    int i;
-
-    if (!niov)
-            goto finalize;
-
-    LASSERT(mlen);
-    LASSERT(rlen);
-    LASSERT(rlen >= mlen);
-
-    ntiov = lib_extract_iov(256, tiov, niov, iov, offset, mlen);
-    
-    /* FIXME
-     * 1. Is this effecient enough? change to use readv() directly?
-     * 2. need check return from read_connection()
-     * - MeiJia
-     */
-    for (i = 0; i < ntiov; i++)
-        read_connection(private, tiov[i].iov_base, tiov[i].iov_len);
+        struct iovec tiov[256];
+        int ntiov;
+        int i;
+
+        if (mlen == 0)
+                goto finalize;
+
+        LASSERT(iov != NULL);           /* I don't understand kiovs */
+
+        ntiov = lnet_extract_iov(256, tiov, niov, iov, offset, mlen);
+
+        /* FIXME
+         * 1. Is this effecient enough? change to use readv() directly?
+         * 2. need check return from read_connection()
+         * - MeiJia
+         */
+        for (i = 0; i < ntiov; i++)
+                read_connection(private, tiov[i].iov_base, tiov[i].iov_len);
 
 finalize:
-    /* FIXME; we always assume success here... */
-    lib_finalize(n, private, cookie, PTL_OK);
-
-    if (mlen!=rlen){
-        char *trash=malloc(rlen-mlen);
-        
-        /*TODO: check error status*/
-        read_connection(private,trash,rlen-mlen);
-        free(trash);
-    }
-
-    return(PTL_OK);
+        /* FIXME; we always assume success here... */
+        lnet_finalize(ni, cookie, 0);
+
+        LASSERT(rlen >= mlen);
+
+        if (mlen != rlen){
+                char *trash=malloc(rlen - mlen);
+
+                /*TODO: check error status*/
+                read_connection(private, trash, rlen - mlen);
+                free(trash);
+        }
+
+        return(0);
 }
 
 
-/* Function:  from_connection: 
- * Arguments: c: the connection to read from 
+/* Function:  from_connection:
+ * Arguments: c: the connection to read from
  * Returns: whether or not to continue reading from this connection,
  *          expressed as a 1 to continue, and a 0 to not
  *
- *  from_connection() is called from the select loop when i/o is 
- *  available. It attempts to read the portals header and 
+ *  from_connection() is called from the select loop when i/o is
+ *  available. It attempts to read the portals header and
  *  pass it to the generic library for processing.
  */
 static int from_connection(void *a, void *d)
 {
-    connection c = d;
-    bridge b = a;
-    ptl_hdr_t hdr;
-
-    if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){
-        lib_parse(b->lib_nal, &hdr, c);
-        /*TODO: check error status*/
-        return(1);
-    }
-    return(0);
+        connection c = d;
+        bridge     b = a;
+        lnet_hdr_t hdr;
+        int  rc;
+
+        if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))) {
+                /* replace dest_nid,pid (socknal sets its own) */
+                hdr.dest_nid = cpu_to_le64(b->b_ni->ni_nid);
+                hdr.dest_pid = cpu_to_le32(the_lnet.ln_pid);
+
+                rc = lnet_parse(b->b_ni, &hdr, c->peer_nid, c, 0);
+                if (rc < 0) {
+                        CERROR("Error %d from lnet_parse\n", rc);
+                        return 0;
+                }
+
+                return(1);
+        }
+        return(0);
 }
 
 
-static void tcpnal_shutdown(bridge b)
+void tcpnal_shutdown(bridge b)
 {
-    shutdown_connections(b->lower);
+        shutdown_connections(b->lower);
 }
 
 /* Function:  PTL_IFACE_TCP
@@ -238,19 +236,14 @@ static void tcpnal_shutdown(bridge b)
  */
 int tcpnal_init(bridge b)
 {
-    manager m;
-        
-    b->lib_nal->libnal_send=tcpnal_send;
-    b->lib_nal->libnal_recv=tcpnal_recv;
-    b->shutdown=tcpnal_shutdown;
-    
-    if (!(m=init_connections(PNAL_PORT(b->lib_nal->libnal_ni.ni_pid.nid,
-                                       b->lib_nal->libnal_ni.ni_pid.pid),
-                             from_connection,b))){
-        /* TODO: this needs to shut down the
-           newly created junk */
-        return(PTL_NAL_FAILED);
-    }
-    b->lower=m;
-    return(PTL_OK);
+        manager m;
+
+        tcpnal_set_global_params();
+
+        if (!(m = init_connections(from_connection, b))) {
+                /* TODO: this needs to shut down the newly created junk */
+                return(-ENXIO);
+        }
+        b->lower = m;
+        return(0);
 }
diff --git a/lnet/ulnds/socklnd/utypes.h b/lnet/ulnds/socklnd/utypes.h
deleted file mode 100644 (file)
index 7eca959..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *
- *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- */
-
-typedef unsigned short uint16;
-typedef unsigned long uint32;
-typedef unsigned long long uint64;
-typedef unsigned char uint8;
diff --git a/lnet/ulnds/table.c b/lnet/ulnds/table.c
deleted file mode 100644 (file)
index 662775a..0000000
+++ /dev/null
@@ -1,264 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *  Copyright (c) 2002 Eric Hoffman
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <table.h>
-#include <stdlib.h>
-#include <string.h>
-
-
-/* table.c:
- * a very simple hash table implementation with paramerterizable 
- * comparison and key generation functions. it does resize
- * in order to accomidate more entries, but never collapses 
- * the table 
- */
-
-static table_entry *table_lookup (table t,void *comparator,
-                                  unsigned int k,
-                                  int (*compare_function)(void *, void *),
-                                  int *success)
-{
-    unsigned int key=k%t->size;
-    table_entry *i;
-
-    for (i=&(t->entries[key]);*i;i=&((*i)->next)){
-        if (compare_function && ((*i)->key==k))
-            if ((*t->compare_function)((*i)->value,comparator)){
-                *success=1;
-                return(i);
-            }
-    }
-    *success=0;
-    return(&(t->entries[key]));
-}
-
-
-static void resize_table(table t, int size)
-{
-    int old_size=t->size;
-    table_entry *old_entries=t->entries;
-    int i; 
-    table_entry j,n;
-    table_entry *position;
-    int success;
-  
-    t->size=size;
-    t->entries=(table_entry *)malloc(sizeof(table_entry)*t->size);
-    memset(t->entries,0,sizeof(table_entry)*t->size);
-
-    for (i=0;i<old_size;i++)
-        for (j=old_entries[i];j;j=n){
-            n=j->next;
-            position=table_lookup(t,0,j->key,0,&success);
-            j->next= *position;
-            *position=j;
-        }
-    free(old_entries);
-}
-
-
-/* Function: key_from_int
- * Arguments: int i: value to compute the key of
- * Returns: the key 
- */
-unsigned int key_from_int(int i)
-{
-    return(i);
-}
-
-
-/* Function: key_from_string
- * Arguments: char *s: the null terminated string
- *                     to compute the key of
- * Returns: the key 
- */
-unsigned int key_from_string(char *s)
-{
-    unsigned int result=0;
-    unsigned char *n;
-    int i;
-    if (!s) return(1);
-    for (n=s,i=0;*n;n++,i++) result^=(*n*57)^*n*i;
-    return(result);
-}
-
-
-/* Function: hash_create_table
- * Arguments: compare_function: a function to compare
- *                              a table instance with a correlator
- *            key_function: a function to generate a 32 bit 
- *                          hash key from a correlator
- * Returns: a pointer to the new table
- */
-table hash_create_table (int (*compare_function)(void *, void *),
-                    unsigned int (*key_function)(unsigned int *))
-{
-    table new=(table)malloc(sizeof(struct table));
-    memset(new, 0, sizeof(struct table));
-
-    new->compare_function=compare_function;
-    new->key_function=key_function;
-    new->number_of_entries=0;
-    new->size=4;
-    new->entries=(table_entry *)malloc(sizeof(table_entry)*new->size);
-    memset(new->entries,0,sizeof(table_entry)*new->size);
-    return(new);
-}
-
-
-/* Function: hash_table_find
- * Arguments: t: a table to look in
- *            comparator: a value to access the table entry
- * Returns: the element references to by comparator, or null
- */
-void *hash_table_find (table t, void *comparator)
-{
-    int success;
-    table_entry* entry=table_lookup(t,comparator,
-                                    (*t->key_function)(comparator),
-                                    t->compare_function,
-                                    &success);
-    if (success)  return((*entry)->value);
-    return(0);
-}
-
-
-/* Function: hash_table_insert
- * Arguments: t: a table to insert the object
- *            value: the object to put in the table
- *            comparator: the value by which the object 
- *                        will be addressed
- * Returns: nothing
- */
-void hash_table_insert (table t, void *value, void *comparator)
-{
-    int success;
-    unsigned int k=(*t->key_function)(comparator);
-    table_entry *position=table_lookup(t,comparator,k,
-                                       t->compare_function,&success);
-    table_entry entry;
-
-    if (success) {
-        entry = *position;
-    } else {
-        entry = (table_entry)malloc(sizeof(struct table_entry));
-        memset(entry, 0, sizeof(struct table_entry));
-        entry->next= *position;
-        *position=entry;
-        t->number_of_entries++;
-    }
-    entry->value=value;
-    entry->key=k;
-    if (t->number_of_entries > t->size) resize_table(t,t->size*2);
-}
-
-/* Function: hash_table_remove
- * Arguments: t: the table to remove the object from
- *            comparator: the index value of the object to remove
- * Returns: 
- */
-void hash_table_remove (table t, void *comparator)
-{
-    int success;
-    table_entry temp;
-    table_entry *position=table_lookup(t,comparator,
-                                       (*t->key_function)(comparator),
-                                       t->compare_function,&success);
-    if(success) {
-        temp=*position;
-        *position=(*position)->next;
-        free(temp); /* the value? */
-        t->number_of_entries--;
-    }
-}
-
-/* Function: hash_iterate_table_entries
- * Arguments: t: the table to iterate over
- *            handler: a function to call with each element
- *                     of the table, along with arg
- *            arg: the opaque object to pass to handler
- * Returns: nothing
- */
-void hash_iterate_table_entries(table t,
-                           void (*handler)(void *,void *), 
-                           void *arg)
-{
-    int i;
-    table_entry *j,*next;
-  
-    for (i=0;i<t->size;i++)
-        for (j=t->entries+i;*j;j=next){
-            next=&((*j)->next);
-            (*handler)(arg,(*j)->value);
-        }
-}
-
-/* Function: hash_filter_table_entries
- * Arguments: t: the table to iterate over
- *            handler: a function to call with each element
- *                     of the table, along with arg
- *            arg: the opaque object to pass to handler
- * Returns: nothing
- * Notes: operations on the table inside handler are not safe
- *
- * filter_table_entires() calls the handler function for each
- *   item in the table, passing it and arg. The handler function
- *   returns 1 if it is to be retained in the table, and 0
- *   if it is to be removed.
- */
-void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg)
-{
-    int i;
-    table_entry *j,*next,v;
-  
-    for (i=0;i<t->size;i++)
-        for (j=t->entries+i;*j;j=next){
-            next=&((*j)->next);
-            if (!(*handler)(arg,(*j)->value)){
-                next=j;
-                v=*j;
-                *j=(*j)->next;
-                free(v);
-                t->number_of_entries--;
-            }
-        }
-}
-
-/* Function: destroy_table
- * Arguments: t: the table to free
- *            thunk: a function to call with each element,
- *                   most likely free()
- * Returns: nothing
- */
-void hash_destroy_table(table t,void (*thunk)(void *))
-{
-    table_entry j,next;
-    int i;
-    for (i=0;i<t->size;i++)
-        for (j=t->entries[i];j;j=next){
-            next=j->next;
-            if (thunk) (*thunk)(j->value);
-            free(j);
-        }
-    free(t->entries);
-    free(t);
-}
diff --git a/lnet/ulnds/table.h b/lnet/ulnds/table.h
deleted file mode 100644 (file)
index 7fab586..0000000
+++ /dev/null
@@ -1,39 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *  Copyright (c) 2002 Eric Hoffman
- *
- *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- */
-
-#ifndef E_TABLE
-#define E_TABLE
-
-typedef struct table_entry {
-  unsigned int key;
-  void *value;
-  struct table_entry *next;
-} *table_entry;
-
-
-typedef struct table {
-  unsigned int size;
-  int number_of_entries;
-  table_entry *entries;
-  int (*compare_function)(void *, void *);
-  unsigned int (*key_function)(unsigned int *);
-} *table;
-
-/* table.c */
-unsigned int key_from_int(int i);
-unsigned int key_from_string(char *s);
-table hash_create_table(int (*compare_function)(void *, void *), unsigned int (*key_function)(unsigned int *));
-void *hash_table_find(table t, void *comparator);
-void hash_table_insert(table t, void *value, void *comparator);
-void hash_table_remove(table t, void *comparator);
-void hash_iterate_table_entries(table t, void (*handler)(void *, void *), void *arg);
-void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg);
-void hash_destroy_table(table t, void (*thunk)(void *));
-
-#endif
diff --git a/lnet/ulnds/tcplnd.c b/lnet/ulnds/tcplnd.c
deleted file mode 100644 (file)
index abb6d01..0000000
+++ /dev/null
@@ -1,256 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *  Copyright (c) 2003 Cluster File Systems, Inc.
- *
- *   This file is part of Lustre, http://www.lustre.org.
- *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
- *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
- *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/* tcpnal.c:
-   This file implements the TCP-based nal by providing glue
-   between the connection service and the generic NAL implementation */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdarg.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <pqtimer.h>
-#include <dispatch.h>
-#include <bridge.h>
-#include <ipmap.h>
-#include <connection.h>
-#include <pthread.h>
-#include <errno.h>
-#ifndef __CYGWIN__
-#include <syscall.h>
-#endif
-
-/* Function:  tcpnal_send
- * Arguments: nal:     pointer to my nal control block
- *            private: unused
- *            cookie:  passed back to the portals library
- *            hdr:     pointer to the portals header
- *            nid:     destination node
- *            pid:     destination process
- *            data:    body of the message
- *            len:     length of the body
- * Returns: zero on success
- *
- * sends a packet to the peer, after insuring that a connection exists
- */
-ptl_err_t tcpnal_send(lib_nal_t *n,
-                      void *private,
-                      lib_msg_t *cookie,
-                      ptl_hdr_t *hdr,
-                      int type,
-                      ptl_nid_t nid,
-                      ptl_pid_t pid,
-                      unsigned int niov,
-                      struct iovec *iov,
-                      size_t offset,
-                      size_t len)
-{
-    connection c;
-    bridge b=(bridge)n->libnal_data;
-    struct iovec tiov[257];
-    static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER;
-    ptl_err_t rc = PTL_OK;
-    int   sysrc;
-    int   total;
-    int   ntiov;
-    int i;
-
-    if (!(c=force_tcp_connection((manager)b->lower,
-                                 PNAL_IP(nid,b),
-                                 PNAL_PORT(nid,pid),
-                                 b->local)))
-        return(PTL_FAIL);
-
-    /* TODO: these results should be checked. furthermore, provision
-       must be made for the SIGPIPE which is delivered when
-       writing on a tcp socket which has closed underneath
-       the application. there is a linux flag in the sendmsg
-       call which turns off the signally behaviour, but its
-       nonstandard */
-
-    LASSERT (niov <= 256);
-
-    tiov[0].iov_base = hdr;
-    tiov[0].iov_len = sizeof(ptl_hdr_t);
-    ntiov = 1 + lib_extract_iov(256, &tiov[1], niov, iov, offset, len);
-
-    pthread_mutex_lock(&send_lock);
-#if 1
-    for (i = total = 0; i < ntiov; i++)
-            total += tiov[i].iov_len;
-    
-    sysrc = syscall(SYS_writev, c->fd, tiov, ntiov);
-    if (sysrc != total) {
-            fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
-                     rc, total, errno);
-            rc = PTL_FAIL;
-    }
-#else
-    for (i = total = 0; i <= ntiov; i++) {
-            rc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0);
-            
-            if (rc != tiov[i].iov_len) {
-                    fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
-                             rc, tiov[i].iov_len, errno);
-                    rc = PTL_FAIL;
-                    break;
-            }
-            total += rc;
-    }
-#endif
-#if 0
-    fprintf (stderr, "sent %s total %d in %d frags\n", 
-             hdr->type == PTL_MSG_ACK ? "ACK" :
-             hdr->type == PTL_MSG_PUT ? "PUT" :
-             hdr->type == PTL_MSG_GET ? "GET" :
-             hdr->type == PTL_MSG_REPLY ? "REPLY" :
-             hdr->type == PTL_MSG_HELLO ? "HELLO" : "UNKNOWN",
-             total, niov + 1);
-#endif
-    pthread_mutex_unlock(&send_lock);
-
-    if (rc == PTL_OK) {
-            /* NB the NAL only calls lib_finalize() if it returns PTL_OK
-             * from cb_send() */
-            lib_finalize(n, private, cookie, PTL_OK);
-    }
-
-    return(rc);
-}
-
-
-/* Function:  tcpnal_recv
- * Arguments: lib_nal_t *nal:    pointer to my nal control block
- *            void *private:     connection pointer passed through
- *                               lib_parse()
- *            lib_msg_t *cookie: passed back to portals library
- *            user_ptr data:     pointer to the destination buffer
- *            size_t mlen:       length of the body
- *            size_t rlen:       length of data in the network
- * Returns: zero on success
- *
- * blocking read of the requested data. must drain out the
- * difference of mainpulated and requested lengths from the network
- */
-ptl_err_t tcpnal_recv(lib_nal_t *n,
-                      void *private,
-                      lib_msg_t *cookie,
-                      unsigned int niov,
-                      struct iovec *iov,
-                      size_t offset,
-                      size_t mlen,
-                      size_t rlen)
-
-{
-    struct iovec tiov[256];
-    int ntiov;
-    int i;
-
-    if (!niov)
-            goto finalize;
-
-    LASSERT(mlen);
-    LASSERT(rlen);
-    LASSERT(rlen >= mlen);
-
-    ntiov = lib_extract_iov(256, tiov, niov, iov, offset, mlen);
-    
-    /* FIXME
-     * 1. Is this effecient enough? change to use readv() directly?
-     * 2. need check return from read_connection()
-     * - MeiJia
-     */
-    for (i = 0; i < ntiov; i++)
-        read_connection(private, tiov[i].iov_base, tiov[i].iov_len);
-
-finalize:
-    /* FIXME; we always assume success here... */
-    lib_finalize(n, private, cookie, PTL_OK);
-
-    if (mlen!=rlen){
-        char *trash=malloc(rlen-mlen);
-        
-        /*TODO: check error status*/
-        read_connection(private,trash,rlen-mlen);
-        free(trash);
-    }
-
-    return(PTL_OK);
-}
-
-
-/* Function:  from_connection: 
- * Arguments: c: the connection to read from 
- * Returns: whether or not to continue reading from this connection,
- *          expressed as a 1 to continue, and a 0 to not
- *
- *  from_connection() is called from the select loop when i/o is 
- *  available. It attempts to read the portals header and 
- *  pass it to the generic library for processing.
- */
-static int from_connection(void *a, void *d)
-{
-    connection c = d;
-    bridge b = a;
-    ptl_hdr_t hdr;
-
-    if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){
-        lib_parse(b->lib_nal, &hdr, c);
-        /*TODO: check error status*/
-        return(1);
-    }
-    return(0);
-}
-
-
-static void tcpnal_shutdown(bridge b)
-{
-    shutdown_connections(b->lower);
-}
-
-/* Function:  PTL_IFACE_TCP
- * Arguments: pid_request: desired port number to bind to
- *            desired: passed NAL limits structure
- *            actual: returned NAL limits structure
- * Returns: a nal structure on success, or null on failure
- */
-int tcpnal_init(bridge b)
-{
-    manager m;
-        
-    b->lib_nal->libnal_send=tcpnal_send;
-    b->lib_nal->libnal_recv=tcpnal_recv;
-    b->shutdown=tcpnal_shutdown;
-    
-    if (!(m=init_connections(PNAL_PORT(b->lib_nal->libnal_ni.ni_pid.nid,
-                                       b->lib_nal->libnal_ni.ni_pid.pid),
-                             from_connection,b))){
-        /* TODO: this needs to shut down the
-           newly created junk */
-        return(PTL_NAL_FAILED);
-    }
-    b->lower=m;
-    return(PTL_OK);
-}
diff --git a/lnet/ulnds/timer.h b/lnet/ulnds/timer.h
deleted file mode 100644 (file)
index aaf39d2..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *  Copyright (c) 2002 Eric Hoffman
- *
- *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- */
-
-/* TODO: make this an explicit type when they become available */
-typedef unsigned long long when;
-
-typedef struct timer {
-  void (*function)(void *);
-  void *arg;
-  when w;
-  int interval;
-  int disable;
-} *timer;
-
-timer register_timer(when, void (*f)(void *), void *a);
-void remove_timer(timer t);
-void timer_loop(void);
-void initialize_timer(void);
-void register_thunk(void (*f)(void *),void *a);
-
-
-#define HZ 0x100000000ull
-
-
diff --git a/lnet/ulnds/utypes.h b/lnet/ulnds/utypes.h
deleted file mode 100644 (file)
index 7eca959..0000000
+++ /dev/null
@@ -1,12 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- *  Copyright (c) 2002 Cray Inc.
- *
- *   This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- */
-
-typedef unsigned short uint16;
-typedef unsigned long uint32;
-typedef unsigned long long uint64;
-typedef unsigned char uint8;
index e2a0d44..13c2683 100644 (file)
@@ -6,5 +6,5 @@ ptlctl
 .deps
 routerstat
 wirecheck
-gmnalnid
+gmlndnid
 .*.cmd
index 70a9ad8..9cd3f25 100644 (file)
@@ -11,34 +11,29 @@ if LIBLUSTRE
 noinst_LIBRARIES = libuptlctl.a
 endif
 
-libuptlctl_a_SOURCES = portals.c debug.c l_ioctl.c
-libuptlctl_a_CPPFLAGS = $(LLCPPFLAGS) -DLUSTRE_UTILS=1
-libuptlctl_a_CFLAGS = $(LLCFLAGS)
+libuptlctl_a_SOURCES = portals.c nidstrings.c debug.c l_ioctl.c
+libuptlctl_a_CPPFLAGS = $(LLCPPFLAGS)
+libuptlctl_a_CFLAGS = $(LLCFLAGS) -DLUSTRE_UTILS=1
 
 sbin_PROGRAMS = debugctl
 
 lib_LIBRARIES = libptlctl.a
 
-libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h
+libptlctl_a_SOURCES = portals.c nidstrings.c debug.c l_ioctl.c parser.c parser.h
 
 if UTILS
-if !CRAY_PORTALS
-sbin_PROGRAMS += acceptor ptlctl routerstat wirecheck 
+sbin_PROGRAMS += ptlctl routerstat wirecheck 
+if BUILD_GMLND
+sbin_PROGRAMS += gmlndnid
 endif
-if BUILD_GMNAL
-sbin_PROGRAMS += gmnalnid
 endif
-endif
-
-acceptor_SOURCES = acceptor.c
-acceptor_LDADD = $(LIBWRAP)
 
 wirecheck_SOURCES = wirecheck.c
 
-gmnalnid_SOURCES = gmnalnid.c
-gmnalnid_CFLAGS = $(GMCPPFLAGS)
-gmnalnid_LDFLAGS = -static
-gmnalnid_LDADD = $(GMLIBS) -lgm
+gmlndnid_SOURCES = gmlndnid.c
+gmlndnid_CFLAGS = $(GMCPPFLAGS)
+gmlndnid_LDFLAGS = -static
+gmlndnid_LDADD = $(GMLIBS) -lgm
 
 ptlctl_SOURCES = ptlctl.c
 ptlctl_LDADD =  -L. -lptlctl $(LIBREADLINE) $(LIBEFENCE)
@@ -49,3 +44,6 @@ routerstat_SOURCES = routerstat.c
 debugctl_SOURCES = debugctl.c
 debugctl_LDADD = -L. -lptlctl $(LIBREADLINE) $(LIBEFENCE)
 debugctl_DEPENDENCIES = libptlctl.a
+
+nidstrings.c: @top_srcdir@/lnet/libcfs/nidstrings.c
+       ln -sf $< $@
diff --git a/lnet/utils/acceptor.c b/lnet/utils/acceptor.c
deleted file mode 100644 (file)
index a270ad2..0000000
+++ /dev/null
@@ -1,363 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- */
-#include <stdio.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <netinet/tcp.h>
-#include <netdb.h>
-#include <stdlib.h>
-#include <string.h>
-#include <fcntl.h>
-#include <sys/ioctl.h>
-#include <unistd.h>
-#include <syslog.h>
-#include <stdarg.h>
-#include <signal.h>
-#include <errno.h>
-#ifdef HAVE_LIBWRAP
-#include <arpa/inet.h>
-#include <netinet/in.h>
-#include <tcpd.h>
-#endif
-
-#include <libcfs/portals_utils.h>
-#include <portals/api-support.h>
-#include <portals/lib-types.h>
-#include <portals/socknal.h>
-
-/* should get this from autoconf somehow */
-#ifndef PIDFILE_DIR
-#define PIDFILE_DIR "/var/run"
-#endif
-
-char progname[] = "acceptor";
-char name_port[40];             /* for signal handler */
-
-#ifdef HAVE_LIBWRAP
-/* needed because libwrap declares these as externs */
-int allow_severity = LOG_INFO;
-int deny_severity = LOG_WARNING;
-#endif
-
-void usage(char *progname)
-{
-        fprintf(stderr, "usage: %s [-N nal_id] [-p] [-l] port\n\n"
-                " -l\tKeep stdin/stdout open\n"
-                " -p\tAllow connections from non-privileged ports\n", progname);
-        exit (1);
-}
-
-void errlog(int level, const char *fmt, ...)
-{
-        va_list arg;
-        FILE *out;
-
-        switch (level) {
-        case LOG_DEBUG:
-        case LOG_INFO:
-        case LOG_NOTICE:
-                out = stdout;
-                break;
-        default:
-                out = stderr;
-                break;
-        }
-        va_start(arg, fmt);
-        fprintf(out, "%s: ", name_port);
-        vfprintf(out, fmt, arg);
-        va_end(arg);
-        va_start(arg, fmt);
-        vsyslog(level, fmt, arg);
-        va_end(arg);
-}
-
-char *pidfile_name(char *name_port)
-{
-        static char pidfile[1024];
-
-        snprintf(pidfile, sizeof(pidfile), "%s/%s.pid", PIDFILE_DIR, name_port);
-
-        return pidfile;
-}
-
-int pidfile_create(char *name_port)
-{
-        char *pidfile = pidfile_name(name_port);
-        int fd, rc;
-
-        if ((fd = open(pidfile, O_CREAT | O_WRONLY)) >= 0) {
-                char pid[16];
-                int size = snprintf(pid, sizeof(pid), "%u\n", getpid());
-                if (write(fd, pid, size) != size) {
-                        /* hard error or short write */
-                        rc = errno ? : EIO;
-                } else {
-                        rc = 0;
-                }
-                close(fd);
-        } else {
-                rc = errno;
-        }
-
-        if (rc)
-                errlog(LOG_ERR, " error creating %s: %s\n",
-                       pidfile, strerror(rc));
-
-        return rc;
-}
-
-int pidfile_cleanup(char *name_port)
-{
-        char *pidfile = pidfile_name(name_port);
-        int rc;
-
-        rc = unlink(pidfile);
-        if (rc && errno != -ENOENT)
-                fprintf(stderr, "%s: error removing %s: %s\n",
-                        progname, pidfile, strerror(errno));
-        errlog(LOG_NOTICE, "exiting\n");
-
-        return errno;
-}
-
-int pidfile_exists(char *name_port)
-{
-        char *pidfile = pidfile_name(name_port);
-        FILE *fpid;
-        int pid, rc;
-
-        fpid = fopen(pidfile, "r+");
-        if (fpid == NULL) {
-                if (errno == ENOENT)
-                        return 0;
-
-                fprintf(stderr, "%s: error opening %s: %s.\n",
-                        progname, pidfile, strerror(errno));
-                return (1);
-        }
-
-        rc = fscanf(fpid, "%i", &pid);
-        fclose(fpid);
-        if (rc != 1) {
-                fprintf(stderr,"%s: %s didn't contain a valid pid, removing.\n",
-                        progname, pidfile);
-                goto stale;
-        }
-
-        if (kill(pid, 0) == 0) {
-                fprintf(stderr, "%s: %s exists, acceptor pid %d running.\n",
-                        progname, pidfile, pid);
-                return (1);
-        }
-
-        fprintf(stderr, "%s: stale %s exists, pid %d doesn't, removing.\n",
-                progname, pidfile, pid);
-stale:
-        pidfile_cleanup(name_port);
-        return (0);
-}
-
-void handler(int sig)
-{
-        exit(sig);
-}
-
-void atexit_handler(void)
-{
-        pidfile_cleanup(name_port);
-}
-
-void show_connection(int fd, __u32 net_ip)
-{
-        static long last_time;
-        static __u32 host_ip;
-        long now = time(0);
-        struct hostent *h;
-        int  len;
-        char host[1024];
-
-        /* Don't show repeats for same host, it adds no value */
-        if (host_ip == ntohl(net_ip) && (now - last_time) < 5)
-                return;
-
-        h = gethostbyaddr((char *)&net_ip, sizeof(net_ip), AF_INET);
-        last_time = now;
-        host_ip = ntohl(net_ip);
-
-        if (h == NULL)
-                snprintf(host, sizeof(host), "%d.%d.%d.%d",
-                         (host_ip >> 24) & 0xff, (host_ip >> 16) & 0xff,
-                         (host_ip >> 8)  & 0xff, host_ip & 0xff);
-        else
-                snprintf(host, sizeof(host), "%s", h->h_name);
-
-        syslog(LOG_INFO, "accepted host: %s\n", host);
-}
-
-int main(int argc, char **argv)
-{
-        int o, fd, rc, port, pfd;
-        struct sockaddr_in srvaddr;
-        int c;
-        int noclose = 0;
-        int nal = SOCKNAL;
-        int rport;
-        int require_privports = 1;
-
-        while ((c = getopt (argc, argv, "N:lp")) != -1) {
-                switch (c) {
-                case 'N':
-                        if (sscanf(optarg, "%d", &nal) != 1 ||
-                            nal < 0 || nal > NAL_MAX_NR)
-                                usage(argv[0]);
-                        break;
-                case 'l':
-                        noclose = 1;
-                        break;
-                case 'p':
-                        require_privports = 0;
-                        break;
-                default:
-                        usage (argv[0]);
-                        break;
-                }
-        }
-
-        if (optind >= argc)
-                usage (argv[0]);
-
-        port = atol(argv[optind++]);
-
-        snprintf(name_port, sizeof(name_port) - 1, "%s-%d", progname, port);
-        if (pidfile_exists(name_port))
-                return(EEXIST);
-        openlog(name_port, LOG_PID, LOG_DAEMON);
-
-        memset(&srvaddr, 0, sizeof(srvaddr));
-        srvaddr.sin_family = AF_INET;
-        srvaddr.sin_port = htons(port);
-        srvaddr.sin_addr.s_addr = INADDR_ANY;
-
-        fd = socket(PF_INET, SOCK_STREAM, 0);
-        if (fd < 0) {
-                rc = errno;
-                errlog(LOG_ERR, "error opening socket: %s\n", strerror(errno));
-                return(rc);
-        }
-
-        o = 1;
-        if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &o, sizeof(o))) {
-                rc = errno;
-                errlog(LOG_ERR, "cannot set REUSEADDR socket opt: %s\n",
-                       strerror(errno));
-                return(rc);
-        }
-
-        rc = bind(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
-        if (rc == -1) {
-                rc = errno;
-                errlog(LOG_ERR, "error binding to socket: %s\n",
-                       strerror(errno));
-                return(rc);
-        }
-
-        if (listen(fd, 127)) {
-                rc = errno;
-                perror("listen: ");
-                return(rc);
-        }
-        printf("listening on port %d\n", port);
-
-        pfd = open("/dev/portals", O_RDWR);
-        if (pfd < 0) {
-                rc = errno;
-                errlog(LOG_ERR, "opening portals device: %s\n",strerror(errno));
-                return(rc);
-        }
-
-        rc = daemon(0, noclose);
-        if (rc < 0) {
-                rc = errno;
-                errlog(LOG_ERR, "error daemonizing: %s\n", strerror(errno));
-                return(rc);
-        }
-
-        signal(SIGHUP, SIG_IGN);
-        signal(SIGINT, handler);
-        signal(SIGQUIT, handler);
-        signal(SIGTERM, handler);
-
-        errlog(LOG_NOTICE, "started, listening on port %d\n", port);
-        if (pidfile_create(name_port) == 0)
-                atexit(atexit_handler);
-
-        while (1) {
-                struct sockaddr_in clntaddr;
-                int len = sizeof(clntaddr);
-                int cfd;
-                struct portal_ioctl_data data;
-                struct portals_cfg pcfg;
-#ifdef HAVE_LIBWRAP
-                struct request_info request;
-#endif
-                char addrstr[INET_ADDRSTRLEN];
-
-                cfd = accept(fd, (struct sockaddr *)&clntaddr, &len);
-                if (cfd < 0) {
-                        errlog(LOG_ERR, "error accepting connection: %s\n",
-                               strerror(errno));
-                        break;
-                        //continue;
-                }
-
-                inet_ntop(AF_INET, &clntaddr.sin_addr, addrstr,INET_ADDRSTRLEN);
-#ifdef HAVE_LIBWRAP
-                /* libwrap access control */
-                request_init(&request, RQ_DAEMON, "lustre", RQ_FILE, cfd, 0);
-                sock_host(&request);
-                if (!hosts_access(&request)) {
-                        errlog(LOG_WARNING, "unauthorized access from %s:%hd\n",
-                               addrstr, ntohs(clntaddr.sin_port));
-                        close (cfd);
-                        continue;
-                }
-#endif
-
-                if (require_privports &&
-                    ntohs(clntaddr.sin_port) >= IPPORT_RESERVED) {
-                        errlog(LOG_ERR,
-                               "closing non-privileged connection from %s:%d\n",
-                               addrstr, ntohs(clntaddr.sin_port));
-                        rc = close(cfd);
-                        if (rc)
-                                perror ("close un-privileged client failed");
-                        continue;
-                }
-
-                show_connection (cfd, clntaddr.sin_addr.s_addr);
-
-                PCFG_INIT(pcfg, NAL_CMD_REGISTER_PEER_FD);
-                pcfg.pcfg_nal = nal;
-                pcfg.pcfg_fd = cfd;
-                pcfg.pcfg_misc = SOCKNAL_CONN_NONE; /* == incoming connection */
-
-                PORTAL_IOC_INIT(data);
-                data.ioc_pbuf1 = (char*)&pcfg;
-                data.ioc_plen1 = sizeof(pcfg);
-
-                if (ioctl(pfd, IOC_PORTAL_NAL_CMD, &data) < 0) {
-                        errlog(LOG_ERR, "portals ioctl failed for %s: %s\n",
-                               addrstr, strerror(errno));
-                } else {
-                        errlog(LOG_DEBUG, "client %s registered\n", addrstr);
-                }
-                rc = close(cfd);
-                if (rc)
-                        perror("close failed");
-        }
-
-        closelog();
-
-        return (0);
-}
index 857be97..6dec5b8 100644 (file)
@@ -3,19 +3,19 @@
  *
  * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
  *
- *   This file is part of Portals, http://www.sf.net/projects/lustre/
+ *   This file is part of Lustre Networking, http://www.lustre.org.
  *
- *   Portals is free software; you can redistribute it and/or
+ *   LNET is free software; you can redistribute it and/or
  *   modify it under the terms of version 2 of the GNU General Public
  *   License as published by the Free Software Foundation.
  *
- *   Portals is distributed in the hope that it will be useful,
+ *   LNET is distributed in the hope that it will be useful,
  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *   GNU General Public License for more details.
  *
  *   You should have received a copy of the GNU General Public License
- *   along with Portals; if not, write to the Free Software
+ *   along with LNET; if not, write to the Free Software
  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
  * Some day I'll split all of this functionality into a cfs_debug module
@@ -41,6 +41,7 @@
 #include <fcntl.h>
 #include <errno.h>
 #include <unistd.h>
+#include <assert.h>
 
 #include <sys/types.h>
 #include <sys/socket.h>
@@ -49,8 +50,8 @@
 #include <sys/mman.h>
 #include <sys/utsname.h>
 
-#include <portals/api-support.h>
-#include <portals/ptlctl.h>
+#include <lnet/api-support.h>
+#include <lnet/lnetctl.h>
 #include <libcfs/portals_utils.h>
 #include "parser.h"
 
@@ -63,22 +64,23 @@ static int max = 8192;
 static int subsystem_mask = ~0;
 static int debug_mask = ~0;
 
-#define MAX_MARK_SIZE 100
+#define MAX_MARK_SIZE 256
 
-static const char *portal_debug_subsystems[] =
-        {"undefined", "mdc", "mds", "osc", 
+static const char *libcfs_debug_subsystems[] =
+        {"undefined", "mdc", "mds", "osc",
          "ost", "class", "log", "llite",
-         "rpc", "mgmt", "portals", "nal", 
-         "pinger", "filter", "ptlbd", "echo", 
-         "ldlm", "lov", "router", "cobd", 
-         "sm", "asobd", "confobd", "lmv", 
-         "cmobd", "sec", NULL};
-static const char *portal_debug_masks[] =
-        {"trace", "inode", "super", "ext2", 
+         "rpc", "", "lnet", "lnd",
+         "pinger", "filter", "", "echo",
+         "ldlm", "lov", "", "",
+         "", "", "", "lmv",
+         "", "sec", "gss", "", "mgc", "mgs",
+         "fid", "fld", NULL};
+static const char *libcfs_debug_masks[] =
+        {"trace", "inode", "super", "ext2",
          "malloc", "cache", "info", "ioctl",
-         "blocks", "net", "warning", "buffs", 
-         "other", "dentry", "portals", "page", 
-         "dlmtrace", "error", "emerg", "ha", 
+         "blocks", "net", "warning", "buffs",
+         "other", "dentry", "lnet", "page",
+         "dlmtrace", "error", "emerg", "ha",
          "rpctrace", "vfstrace", "reada", "mmap",
          "config", "console", "quota", "sec", NULL};
 
@@ -87,22 +89,101 @@ struct debug_daemon_cmd {
         unsigned int cmdv;
 };
 
-static const struct debug_daemon_cmd portal_debug_daemon_cmd[] = {
+static const struct debug_daemon_cmd libcfs_debug_daemon_cmd[] = {
         {"start", DEBUG_DAEMON_START},
         {"stop", DEBUG_DAEMON_STOP},
         {0, 0}
 };
 
+#ifdef __linux__
+
+#define DAEMON_CTL_NAME         "/proc/sys/lnet/daemon_file"
+#define SUBSYS_DEBUG_CTL_NAME   "/proc/sys/lnet/subsystem_debug"
+#define DEBUG_CTL_NAME          "/proc/sys/lnet/debug"
+#define DUMP_KERNEL_CTL_NAME    "/proc/sys/lnet/dump_kernel"
+
+static int
+dbg_open_ctlhandle(const char *str)
+{
+        int fd;
+        fd = open(str, O_WRONLY);
+        if (fd < 0) {
+                fprintf(stderr, "open %s failed: %s\n", str,
+                        strerror(errno));
+                return -1;
+        }
+        return fd;
+}
+
+static void
+dbg_close_ctlhandle(int fd)
+{
+        close(fd);
+}
+
+static int
+dbg_write_cmd(int fd, char *str, int len)
+{
+        int    rc  = write(fd, str, len);
+
+        return (rc == len ? 0 : 1);
+}
+
+#elif defined(__DARWIN__)
+
+#define DAEMON_CTL_NAME         "lnet.trace_daemon"
+#define SUBSYS_DEBUG_CTL_NAME   "lnet.subsystem_debug"
+#define DEBUG_CTL_NAME          "lnet.debug"
+#define DUMP_KERNEL_CTL_NAME    "lnet.trace_dumpkernel"
+
+static char     sysctl_name[128];
+static int
+dbg_open_ctlhandle(const char *str)
+{
+
+        if (strlen(str)+1 > 128) {
+                fprintf(stderr, "sysctl name is too long: %s.\n", str);
+                return -1;
+        }
+        strcpy(sysctl_name, str);
+
+        return 0;
+}
+
+static void
+dbg_close_ctlhandle(int fd)
+{
+        sysctl_name[0] = '\0';
+        return;
+}
+
+static int
+dbg_write_cmd(int fd, char *str, int len)
+{
+        int     rc;
+
+        rc = sysctlbyname(sysctl_name, NULL, NULL, str, len+1);
+        if (rc != 0) {
+                fprintf(stderr, "sysctl %s with cmd (%s) error: %d\n",
+                        sysctl_name, str, errno);
+        }
+        return (rc == 0 ? 0: 1);
+}
+
+#else
+#error - Unknown sysctl convention.
+#endif
+
 static int do_debug_mask(char *name, int enable)
 {
         int found = 0, i;
 
-        for (i = 0; portal_debug_subsystems[i] != NULL; i++) {
-                if (strcasecmp(name, portal_debug_subsystems[i]) == 0 ||
+        for (i = 0; libcfs_debug_subsystems[i] != NULL; i++) {
+                if (strcasecmp(name, libcfs_debug_subsystems[i]) == 0 ||
                     strcasecmp(name, "all_subs") == 0) {
                         printf("%s output from subsystem \"%s\"\n",
                                 enable ? "Enabling" : "Disabling",
-                                portal_debug_subsystems[i]);
+                                libcfs_debug_subsystems[i]);
                         if (enable)
                                 subsystem_mask |= (1 << i);
                         else
@@ -110,12 +191,12 @@ static int do_debug_mask(char *name, int enable)
                         found = 1;
                 }
         }
-        for (i = 0; portal_debug_masks[i] != NULL; i++) {
-                if (strcasecmp(name, portal_debug_masks[i]) == 0 ||
+        for (i = 0; libcfs_debug_masks[i] != NULL; i++) {
+                if (strcasecmp(name, libcfs_debug_masks[i]) == 0 ||
                     strcasecmp(name, "all_types") == 0) {
                         printf("%s output of type \"%s\"\n",
                                 enable ? "Enabling" : "Disabling",
-                                portal_debug_masks[i]);
+                                libcfs_debug_masks[i]);
                         if (enable)
                                 debug_mask |= (1 << i);
                         else
@@ -173,38 +254,38 @@ static int applymask(char* procpath, int value)
         char buf[64];
         int len = snprintf(buf, 64, "%d", value);
 
-        int fd = open(procpath, O_WRONLY);
+        int fd = dbg_open_ctlhandle(procpath);
         if (fd == -1) {
                 fprintf(stderr, "Unable to open %s: %s\n",
                         procpath, strerror(errno));
                 return fd;
         }
-        rc = write(fd, buf, len+1);
-        if (rc<0) {
+        rc = dbg_write_cmd(fd, buf, len+1);
+        if (rc != 0) {
                 fprintf(stderr, "Write to %s failed: %s\n",
                         procpath, strerror(errno));
                 return rc;
         }
-        close(fd);
+        dbg_close_ctlhandle(fd);
         return 0;
 }
 
 static void applymask_all(unsigned int subs_mask, unsigned int debug_mask)
 {
         if (!dump_filename) {
-                applymask("/proc/sys/portals/subsystem_debug", subs_mask);
-                applymask("/proc/sys/portals/debug", debug_mask);
+                applymask(SUBSYS_DEBUG_CTL_NAME, subs_mask);
+                applymask(DEBUG_CTL_NAME, debug_mask);
         } else {
-                struct portals_debug_ioctl_data data;
+                struct libcfs_debug_ioctl_data data;
 
                 data.hdr.ioc_len = sizeof(data);
                 data.hdr.ioc_version = 0;
                 data.subs = subs_mask;
                 data.debug = debug_mask;
 
-                dump(OBD_DEV_ID, PTL_IOC_DEBUG_MASK, &data);
+                dump(OBD_DEV_ID, LIBCFS_IOC_DEBUG_MASK, &data);
         }
-        printf("Applied subsystem_debug=%d, debug=%d to /proc/sys/portals\n",
+        printf("Applied subsystem_debug=%d, debug=%d to /proc/sys/lnet\n",
                subs_mask, debug_mask);
 }
 
@@ -219,13 +300,14 @@ int jt_dbg_list(int argc, char **argv)
 
         if (strcasecmp(argv[1], "subs") == 0) {
                 printf("Subsystems: all_subs");
-                for (i = 0; portal_debug_subsystems[i] != NULL; i++)
-                        printf(", %s", portal_debug_subsystems[i]);
+                for (i = 0; libcfs_debug_subsystems[i] != NULL; i++)
+                        if (libcfs_debug_subsystems[i][0])
+                                printf(", %s", libcfs_debug_subsystems[i]);
                 printf("\n");
         } else if (strcasecmp(argv[1], "types") == 0) {
                 printf("Types: all_types");
-                for (i = 0; portal_debug_masks[i] != NULL; i++)
-                        printf(", %s", portal_debug_masks[i]);
+                for (i = 0; libcfs_debug_masks[i] != NULL; i++)
+                        printf(", %s", libcfs_debug_masks[i]);
                 printf("\n");
         } else if (strcasecmp(argv[1], "applymasks") == 0) {
                 applymask_all(subsystem_mask, debug_mask);
@@ -275,7 +357,7 @@ static void print_saved_records(struct list_head *list, FILE *out)
                 list_del(&line->chain);
 
                 hdr = line->hdr;
-                fprintf(out, "%07x:%06x:%u:%u.%06Lu:%u:%u:%u:(%s:%u:%s()) %s",
+                fprintf(out, "%08x:%08x:%u:%u.%06llu:%u:%u:%u:(%s:%u:%s()) %s",
                         hdr->ph_subsys, hdr->ph_mask, hdr->ph_cpu_id,
                         hdr->ph_sec, (unsigned long long)hdr->ph_usec,
                         hdr->ph_stack, hdr->ph_pid, hdr->ph_extern_pid,
@@ -297,7 +379,7 @@ static int parse_buffer(FILE *in, FILE *out)
         CFS_INIT_LIST_HEAD(&chunk_list);
 
         while (1) {
-                rc = fread(buf, sizeof(hdr->ph_len), 1, in);
+                rc = fread(buf, sizeof(hdr->ph_len) + sizeof(hdr->ph_flags), 1, in);
                 if (rc <= 0)
                         break;
 
@@ -316,8 +398,8 @@ static int parse_buffer(FILE *in, FILE *out)
                         assert(list_empty(&chunk_list));
                 }
 
-                rc = fread(buf + sizeof(hdr->ph_len), 1,
-                           hdr->ph_len - sizeof(hdr->ph_len), in);
+                rc = fread(buf + sizeof(hdr->ph_len) + sizeof(hdr->ph_flags), 1,
+                           hdr->ph_len - sizeof(hdr->ph_len) - sizeof(hdr->ph_flags), in);
                 if (rc <= 0)
                         break;
 
@@ -337,6 +419,7 @@ static int parse_buffer(FILE *in, FILE *out)
 
                 line->hdr = malloc(hdr->ph_len + 1);
                 if (line->hdr == NULL) {
+                        free(line);
                         fprintf(stderr, "malloc failed; printing accumulated "
                                 "records and exiting.\n");
                         break;
@@ -394,21 +477,21 @@ int jt_dbg_debug_kernel(int argc, char **argv)
         if (stat(filename, &st) == 0 && S_ISREG(st.st_mode))
                 unlink(filename);
 
-        fd = open("/proc/sys/portals/dump_kernel", O_WRONLY);
+        fd = dbg_open_ctlhandle(DUMP_KERNEL_CTL_NAME);
         if (fd < 0) {
                 fprintf(stderr, "open(dump_kernel) failed: %s\n",
                         strerror(errno));
                 return 1;
         }
 
-        rc = write(fd, filename, strlen(filename));
-        if (rc != strlen(filename)) {
+        rc = dbg_write_cmd(fd, filename, strlen(filename));
+        if (rc != 0) {
                 fprintf(stderr, "write(%s) failed: %s\n", filename,
                         strerror(errno));
                 close(fd);
                 return 1;
         }
-        close(fd);
+        dbg_close_ctlhandle(fd);
 
         if (raw)
                 return 0;
@@ -476,8 +559,8 @@ int jt_dbg_debug_file(int argc, char **argv)
                 return 1;
         }
         if (argc > 2) {
-                fdout = open(argv[2], 
-                             O_CREAT | O_TRUNC | O_WRONLY | O_LARGEFILE, 
+                fdout = open(argv[2],
+                             O_CREAT | O_TRUNC | O_WRONLY | O_LARGEFILE,
                              0600);
                 if (fdout == -1) {
                         fprintf(stderr, "open(%s) failed: %s\n", argv[2],
@@ -504,17 +587,8 @@ int jt_dbg_debug_file(int argc, char **argv)
         return rc;
 }
 
-static int
-dbg_write_cmd(int fd, char *str)
-{
-        int    len = strlen(str);
-        int    rc  = write(fd, str, len);
-        
-        return (rc == len ? 0 : 1);
-}
-
 const char debug_daemon_usage[] = "usage: %s {start file [MB]|stop}\n";
-#define DAEMON_FILE "/proc/sys/portals/daemon_file"
+
 int jt_dbg_debug_daemon(int argc, char **argv)
 {
         int  rc;
@@ -525,13 +599,10 @@ int jt_dbg_debug_daemon(int argc, char **argv)
                 return 1;
         }
 
-        fd = open(DAEMON_FILE, O_WRONLY);
-        if (fd < 0) {
-                fprintf(stderr, "open %s failed: %s\n", DAEMON_FILE,
-                        strerror(errno));
+        fd = dbg_open_ctlhandle(DAEMON_CTL_NAME);
+        if (fd < 0)
                 return -1;
-        }
-        
+
         rc = -1;
         if (strcasecmp(argv[1], "start") == 0) {
              if (argc < 3 || argc > 4 ||
@@ -556,7 +627,7 @@ int jt_dbg_debug_daemon(int argc, char **argv)
                                 goto out;
                         }
                         snprintf(buf, sizeof(buf), "size=%ld", size);
-                        rc = dbg_write_cmd(fd, buf);
+                        rc = dbg_write_cmd(fd, buf, strlen(buf));
 
                         if (rc != 0) {
                                 fprintf(stderr, "set %s failed: %s\n",
@@ -565,7 +636,7 @@ int jt_dbg_debug_daemon(int argc, char **argv)
                         }
                 }
 
-                rc = dbg_write_cmd(fd, argv[2]);
+                rc = dbg_write_cmd(fd, argv[2], strlen(argv[2]));
                 if (rc != 0) {
                         fprintf(stderr, "start debug_daemon on %s failed: %s\n",
                                 argv[2], strerror(errno));
@@ -575,7 +646,7 @@ int jt_dbg_debug_daemon(int argc, char **argv)
                 goto out;
         }
         if (strcasecmp(argv[1], "stop") == 0) {
-                rc = dbg_write_cmd(fd, "stop");
+                rc = dbg_write_cmd(fd, "stop", 4);
                 if (rc != 0) {
                         fprintf(stderr, "stopping debug_daemon failed: %s\n",
                                 strerror(errno));
@@ -589,14 +660,14 @@ int jt_dbg_debug_daemon(int argc, char **argv)
         fprintf(stderr, debug_daemon_usage, argv[0]);
         rc = -1;
 out:
-        close(fd);
+        dbg_close_ctlhandle(fd);
         return rc;
 }
 
 int jt_dbg_clear_debug_buf(int argc, char **argv)
 {
         int rc;
-        struct portal_ioctl_data data;
+        struct libcfs_ioctl_data data;
 
         if (argc != 1) {
                 fprintf(stderr, "usage: %s\n", argv[0]);
@@ -604,14 +675,14 @@ int jt_dbg_clear_debug_buf(int argc, char **argv)
         }
 
         memset(&data, 0, sizeof(data));
-        if (portal_ioctl_pack(&data, &buf, max) != 0) {
-                fprintf(stderr, "portal_ioctl_pack failed.\n");
+        if (libcfs_ioctl_pack(&data, &buf, max) != 0) {
+                fprintf(stderr, "libcfs_ioctl_pack failed.\n");
                 return -1;
         }
 
-        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_CLEAR_DEBUG, buf);
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_CLEAR_DEBUG, buf);
         if (rc) {
-                fprintf(stderr, "IOC_PORTAL_CLEAR_DEBUG failed: %s\n",
+                fprintf(stderr, "IOC_LIBCFS_CLEAR_DEBUG failed: %s\n",
                         strerror(errno));
                 return -1;
         }
@@ -620,41 +691,37 @@ int jt_dbg_clear_debug_buf(int argc, char **argv)
 
 int jt_dbg_mark_debug_buf(int argc, char **argv)
 {
+        static char scratch[MAX_MARK_SIZE] = { '\0' };
         int rc, max_size = MAX_MARK_SIZE-1;
-        struct portal_ioctl_data data;
+        struct libcfs_ioctl_data data = { 0 };
         char *text;
         time_t now = time(NULL);
 
         if (argc > 1) {
-                int counter;
-                text = malloc(MAX_MARK_SIZE);
+                int count;
+                text = scratch;
                 strncpy(text, argv[1], max_size);
                 max_size-=strlen(argv[1]);
-                for(counter = 2; (counter < argc) && (max_size > 0) ; counter++){
-                        strncat(text, " ", 1);
-                        max_size-=1;
-                        strncat(text, argv[counter], max_size);
-                        max_size-=strlen(argv[counter]);
+                for (count = 2; (count < argc) && (max_size > 0); count++){
+                        strncat(text, " ", max_size);
+                        max_size -= 1;
+                        strncat(text, argv[count], max_size);
+                        max_size -= strlen(argv[count]);
                 }
         } else {
                 text = ctime(&now);
-                text[strlen(text) - 1] = '\0'; /* stupid \n */
-        }
-        if (!max_size) {
-                text[MAX_MARK_SIZE - 1] = '\0';
         }
 
-        memset(&data, 0, sizeof(data));
         data.ioc_inllen1 = strlen(text) + 1;
         data.ioc_inlbuf1 = text;
-        if (portal_ioctl_pack(&data, &buf, max) != 0) {
-                fprintf(stderr, "portal_ioctl_pack failed.\n");
+        if (libcfs_ioctl_pack(&data, &buf, max) != 0) {
+                fprintf(stderr, "libcfs_ioctl_pack failed.\n");
                 return -1;
         }
 
-        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_MARK_DEBUG, buf);
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_MARK_DEBUG, buf);
         if (rc) {
-                fprintf(stderr, "IOC_PORTAL_MARK_DEBUG failed: %s\n",
+                fprintf(stderr, "IOC_LIBCFS_MARK_DEBUG failed: %s\n",
                         strerror(errno));
                 return -1;
         }
@@ -664,46 +731,56 @@ int jt_dbg_mark_debug_buf(int argc, char **argv)
 static struct mod_paths {
         char *name, *path;
 } mod_paths[] = {
-        {"libcfs", "portals/libcfs"},
-        {"portals", "portals/portals"},
-        {"ksocknal", "portals/knals/socknal"},
-        {"kptlrouter", "portals/router"},
+        {"libcfs", "lnet/libcfs"},
+        {"lnet", "lnet/lnet"},
+        {"kciblnd", "lnet/klnds/ciblnd"},
+        {"kgmlnd", "lnet/klnds/gmlnd"},
+        {"kmxlnd", "lnet/klnds/mxlnd"},
+        {"kiiblnd", "lnet/klnds/iiblnd"},
+        {"ko2iblnd", "lnet/klnds/o2iblnd"},
+        {"kopeniblnd", "lnet/klnds/openiblnd"},
+        {"kptllnd", "lnet/klnds/ptllnd"},
+        {"kqswlnd", "lnet/klnds/qswlnd"},
+        {"kralnd", "lnet/klnds/ralnd"},
+        {"ksocklnd", "lnet/klnds/socklnd"},
+        {"ktdilnd", "lnet/klnds/tdilnd"},
+        {"kviblnd", "lnet/klnds/viblnd"},
         {"lvfs", "lustre/lvfs"},
         {"obdclass", "lustre/obdclass"},
         {"llog_test", "lustre/obdclass"},
-        {"ptlrpcs", "lustre/sec"},
-        {"ptlrpcs_gss", "lustre/sec/gss"},
+        {"ptlrpc_gss", "lustre/ptlrpc/gss"},
+        {"ptlrpc", "lustre/ptlrpc"},
         {"gks", "lustre/sec/gks"},
         {"gkc", "lustre/sec/gks"},
-        {"ptlrpc", "lustre/ptlrpc"},
-        {"obdext2", "lustre/obdext2"},
         {"ost", "lustre/ost"},
         {"osc", "lustre/osc"},
         {"mds", "lustre/mds"},
         {"mdc", "lustre/mdc"},
         {"llite", "lustre/llite"},
+        {"lustre", "lustre/llite"},
         {"ldiskfs", "lustre/ldiskfs"},
         {"smfs", "lustre/smfs"},
         {"obdecho", "lustre/obdecho"},
         {"ldlm", "lustre/ldlm"},
         {"obdfilter", "lustre/obdfilter"},
-        {"extN", "lustre/extN"},
         {"lov", "lustre/lov"},
         {"lmv", "lustre/lmv"},
         {"fsfilt_ext3", "lustre/lvfs"},
-        {"fsfilt_extN", "lustre/lvfs"},
         {"fsfilt_reiserfs", "lustre/lvfs"},
         {"fsfilt_smfs", "lustre/lvfs"},
         {"fsfilt_ldiskfs", "lustre/lvfs"},
-        {"mds_ext2", "lustre/mds"},
         {"mds_ext3", "lustre/mds"},
-        {"mds_extN", "lustre/mds"},
-        {"ptlbd", "lustre/ptlbd"},
-        {"mgmt_svc", "lustre/mgmt"},
-        {"mgmt_cli", "lustre/mgmt"},
         {"cobd", "lustre/cobd"},
         {"cmobd", "lustre/cmobd"},
-        {"confobd", "lustre/obdclass"},
+        {"lquota", "lustre/quota"},
+        {"mgs", "lustre/mgs"},
+        {"mgc", "lustre/mgc"},
+        {"mdt", "lustre/mdt"},
+        {"mdd", "lustre/mdd"},
+        {"osd", "lustre/osd"},
+        {"cmm", "lustre/cmm"},
+        {"fid", "lustre/fid"},
+        {"fld", "lustre/fld"},
         {NULL, NULL}
 };
 
@@ -724,7 +801,6 @@ static int jt_dbg_modules_2_4(int argc, char **argv)
                 return 0;
         }
 
-        printf("dir\n");
         for (mp = mod_paths; mp->name != NULL; mp++) {
                 struct module_info info;
                 int rc;
@@ -742,8 +818,6 @@ static int jt_dbg_modules_2_4(int argc, char **argv)
                         printf("add-symbol-file %s%s%s/%s.o 0x%0lx\n", path,
                                path[0] ? "/" : "", mp->path, mp->name,
                                info.addr + sizeof(struct module));
-                        printf("dir %s%s%s\n", path,
-                               path[0] ? "/" : "", mp->path);
                 }
         }
 
@@ -759,7 +833,7 @@ static int jt_dbg_modules_2_5(int argc, char **argv)
         char *path = "";
         char *kernel = "linux";
         const char *proc = "/proc/modules";
-        char modname[128], others[128];
+        char modname[128], others[4096];
         long modaddr;
         int rc;
         FILE *file;
@@ -779,7 +853,6 @@ static int jt_dbg_modules_2_5(int argc, char **argv)
                 return 0;
         }
 
-        printf("dir\n");
         while ((rc = fscanf(file, "%s %s %s %s %s %lx\n",
                 modname, others, others, others, others, &modaddr)) == 6) {
                 for (mp = mod_paths; mp->name != NULL; mp++) {
@@ -789,11 +862,10 @@ static int jt_dbg_modules_2_5(int argc, char **argv)
                 if (mp->name) {
                         printf("add-symbol-file %s%s%s/%s.o 0x%0lx\n", path,
                                path[0] ? "/" : "", mp->path, mp->name, modaddr);
-                        printf("dir %s%s%s\n", path,
-                               path[0] ? "/" : "", mp->path);
                 }
         }
 
+        fclose(file);
         return 0;
 }
 
@@ -820,7 +892,7 @@ int jt_dbg_modules(int argc, char **argv)
 int jt_dbg_panic(int argc, char **argv)
 {
         int rc;
-        struct portal_ioctl_data data;
+        struct libcfs_ioctl_data data;
 
         if (argc != 1) {
                 fprintf(stderr, "usage: %s\n", argv[0]);
@@ -828,14 +900,14 @@ int jt_dbg_panic(int argc, char **argv)
         }
 
         memset(&data, 0, sizeof(data));
-        if (portal_ioctl_pack(&data, &buf, max) != 0) {
-                fprintf(stderr, "portal_ioctl_pack failed.\n");
+        if (libcfs_ioctl_pack(&data, &buf, max) != 0) {
+                fprintf(stderr, "libcfs_ioctl_pack failed.\n");
                 return -1;
         }
 
-        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PANIC, buf);
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_PANIC, buf);
         if (rc) {
-                fprintf(stderr, "IOC_PORTAL_PANIC failed: %s\n",
+                fprintf(stderr, "IOC_LIBCFS_PANIC failed: %s\n",
                         strerror(errno));
                 return -1;
         }
index 1b6cd96..cf70fd8 100644 (file)
@@ -25,8 +25,8 @@
 
 #include <stdio.h>
 #include <stdlib.h>
-#include <portals/api-support.h>
-#include <portals/ptlctl.h>
+#include <lnet/api-support.h>
+#include <lnet/lnetctl.h>
 #include "parser.h"
 
 
@@ -53,7 +53,8 @@ int main(int argc, char **argv)
         if (dbg_initialize(argc, argv) < 0)
                 exit(2);
 
-        register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH);
+        register_ioc_dev(LNET_DEV_ID, LNET_DEV_PATH, 
+                         LNET_DEV_MAJOR, LNET_DEV_MINOR);
 
         Parser_init("debugctl > ", list);
         if (argc > 1)
@@ -61,6 +62,6 @@ int main(int argc, char **argv)
 
         Parser_commands();
 
-        unregister_ioc_dev(PORTALS_DEV_ID);
+        unregister_ioc_dev(LNET_DEV_ID);
         return 0;
 }
index f7e5250..ce5cb14 100644 (file)
 #include <syslog.h>
 #include <errno.h>
 
-#include <portals/api-support.h>
-#include <portals/lib-types.h>
+#include <lnet/api-support.h>
+#include <lnet/lib-types.h>
 
 #include <gm.h>
 
-#define GMNAL_IOC_GET_GNID 1
 /*
  *      portals always uses unit 0
  *      Can this be configurable?
 void
 usage(char *prg, int h)
 {
-        fprintf(stderr, "usage %s -n hostname | -l | -h\n", prg);
-        if (h) {
-                printf("\nGet Myrinet Global network ids for specified host\n"
-                       "-l gets network id for local host\n");
-        }
+        fprintf(stderr,
+                "usage %s -h\n"
+                "      %s [-l] [-n hostname] [-L] [hostnames]\n", prg, prg);
+
+        if (h)
+                printf("Print Myrinet Global network ids for specified hosts\n"
+                       "-l                    print local host's ID\n"
+                       "-n hostname           print given host's ID\n"
+                       "-L                    print Myrinet local net ID too\n"
+                       "[hostnames]           print ids of given hosts (local if none)\n");
 }
 
-unsigned
-u_getgmnid(char *name, int get_local_id)
+gm_status_t
+print_gmid(char *name, int name_fieldlen, int show_local_id)
 {
         struct gm_port *gm_port;
-        int             gm_port_id = 2;
-        gm_status_t     gm_status = GM_SUCCESS;
-        unsigned        global_nid = 0, local_nid = 0; /* gm ids never 0 */
+        int             gm_port_id;
+        gm_status_t     gm_status;
+        unsigned int    local_id;
+        unsigned int    global_id;
 
         gm_status = gm_init();
         if (gm_status != GM_SUCCESS) {
                 fprintf(stderr, "gm_init: %s\n", gm_strerror(gm_status));
-                return(0);
+                return gm_status;
         }
 
+        gm_port_id = 2;
         gm_status = gm_open(&gm_port, GM_UNIT, gm_port_id, "gmnalnid",
                             GM_API_VERSION);
         if (gm_status != GM_SUCCESS) {
@@ -83,77 +89,96 @@ u_getgmnid(char *name, int get_local_id)
 
                 if (gm_status != GM_SUCCESS) {
                         fprintf(stderr, "gm_open: %s\n",gm_strerror(gm_status));
-                        gm_finalize();
-                        return(0);
+                        goto out_0;
                 }
         }
 
-        if (get_local_id) {
-                local_nid = 1;
+        if (name == NULL) {
+                local_id = 1;
+                name = "<local>";
         } else {
                 gm_status = gm_host_name_to_node_id_ex(gm_port, 1000000, name,
-                                                       &local_nid);
+                                                       &local_id);
                 if (gm_status != GM_SUCCESS) {
-                        fprintf(stderr, "gm_host_name_to_node_id_ex: %s\n",
-                                gm_strerror(gm_status));
-                        gm_close(gm_port);
-                        gm_finalize();
-                        return(0);
+                        fprintf(stderr, "gm_host_name_to_node_id_ex(%s): %s\n",
+                                name, gm_strerror(gm_status));
+                        goto out_1;
                 }
         }
 
-        gm_status = gm_node_id_to_global_id(gm_port, local_nid, &global_nid) ;
+        gm_status = gm_node_id_to_global_id(gm_port, local_id, &global_id) ;
         if (gm_status != GM_SUCCESS) {
-                fprintf(stderr, "gm_node_id_to_global_id: %s\n",
-                        gm_strerror(gm_status));
-                gm_close(gm_port);
-                gm_finalize();
-                return(0);
+                fprintf(stderr, "gm_node_id_to_global_id(%s:%d): %s\n",
+                        name, local_id, gm_strerror(gm_status));
+                goto out_1;
         }
+
+        if (name_fieldlen > 0)
+                printf ("%*s ", name_fieldlen, name);
+
+        if (!show_local_id)
+                printf("0x%x\n", global_id);
+        else
+                printf("local 0x%x global 0x%x\n", local_id, global_id);
+
+ out_1:
         gm_close(gm_port);
+ out_0:
         gm_finalize();
-        return(global_nid);
+
+        return gm_status;
 }
 
-int main(int argc, char **argv)
+int
+main (int argc, char **argv)
 {
-        unsigned int        nid = 0;
-        char               *name = NULL;
         int                 c;
-        int                 get_local_id = 0;
+        gm_status_t         gmrc;
+        int                 rc;
+        int                 max_namelen = 0;
+        int                 show_local_id = 0;
 
-        while ((c = getopt(argc, argv, "n:lh")) != -1) {
+        while ((c = getopt(argc, argv, "n:lLh")) != -1)
                 switch(c) {
-                case('n'):
-                        if (get_local_id) {
-                                usage(argv[0], 0);
-                                exit(-1);
-                        }
-                        name = optarg;
-                        break;
-                case('h'):
+                case 'h':
                         usage(argv[0], 1);
-                        exit(-1);
-                        break;
-                case('l'):
-                        if (name) {
-                                usage(argv[0], 0);
-                                exit(-1);
-                        }
-                        get_local_id = 1;
+                        return 0;
+
+                case 'L':
+                        show_local_id = 1;
                         break;
+
+                case 'n':
+                        gmrc = print_gmid(optarg, 0, show_local_id);
+                        return (gmrc == GM_SUCCESS) ? 0 : 1;
+
+                case 'l':
+                        gmrc = print_gmid(NULL, 0, show_local_id);
+                        return (gmrc == GM_SUCCESS) ? 0 : 1;
+
                 default:
                         usage(argv[0], 0);
-                        exit(-1);
+                        return 2;
                 }
+
+        if (optind == argc) {
+                gmrc = print_gmid(NULL, 0, show_local_id);
+                return (gmrc == GM_SUCCESS) ? 0 : 1;
         }
 
-        if (!name && !get_local_id) {
-                usage(argv[0], 0);
-                exit(-1);
+        if (optind != argc - 1)
+                for (c = optind; c < argc; c++)
+                        if (strlen(argv[c]) > max_namelen)
+                                max_namelen = strlen(argv[c]);
+
+        rc = 0;
+
+        for (c = optind; c < argc; c++) {
+                gmrc = print_gmid(argv[c], max_namelen, show_local_id);
+
+                if (gmrc != GM_SUCCESS)
+                        rc = 1;
         }
 
-        nid = u_getgmnid(name, get_local_id);
-        printf("%u\n", nid);
-        exit(0);
+        return rc;
 }
index 01dccb1..0bdb782 100644 (file)
 #include <errno.h>
 #include <unistd.h>
 
-#include <portals/api-support.h>
-#include <portals/ptlctl.h>
+#include <lnet/api-support.h>
+#include <lnet/lnetctl.h>
 #include <libcfs/portals_utils.h>
 
+
 static ioc_handler_t  do_ioctl;                 /* forward ref */
 static ioc_handler_t *current_ioc_handler = &do_ioctl;
 
 struct ioc_dev {
-       const char * dev_name;
-       int dev_fd;
+        const char * dev_name;
+        int dev_fd;
+        int dev_major;
+        int dev_minor;
 };
 
 static struct ioc_dev ioc_dev_list[10];
 
 struct dump_hdr {
-       int magic;
-       int dev_id;
+        int magic;
+        int dev_id;
         unsigned int opc;
 };
 
@@ -64,60 +67,78 @@ set_ioc_handler (ioc_handler_t *handler)
                 current_ioc_handler = handler;
 }
 
+/* Catamount has no <linux/kdev_t.h>, so just define it here */
+#ifndef MKDEV
+# define MKDEV(a,b) (((a) << 8) | (b))
+#endif
+
 static int
 open_ioc_dev(int dev_id) 
 {
-       const char * dev_name;
+        const char * dev_name;
 
-       if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list))
-               return -EINVAL;
+        if (dev_id < 0 || 
+            dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0]))
+                return -EINVAL;
 
-       dev_name = ioc_dev_list[dev_id].dev_name;
-       if (dev_name == NULL) {
+        dev_name = ioc_dev_list[dev_id].dev_name;
+        if (dev_name == NULL) {
                 fprintf(stderr, "unknown device id: %d\n", dev_id);
-               return -EINVAL;
-       }
-
-       if (ioc_dev_list[dev_id].dev_fd < 0) {
-               int fd = open(dev_name, O_RDWR);
-               
-               if (fd < 0) {
-                       fprintf(stderr, "opening %s failed: %s\n"
-                               "hint: the kernel modules may not be loaded\n",
-                               dev_name, strerror(errno));
-                       return fd;
-               }
-               ioc_dev_list[dev_id].dev_fd = fd;
-       }
-
-       return ioc_dev_list[dev_id].dev_fd;
+                return -EINVAL;
+        }
+
+        if (ioc_dev_list[dev_id].dev_fd < 0) {
+                int fd = open(dev_name, O_RDWR);
+
+                /* Make the /dev/ node if we need to */
+                if (fd < 0 && errno == ENOENT) {
+                        if (mknod(dev_name, 
+                                  S_IFCHR|S_IWUSR|S_IRUSR,
+                                  MKDEV(ioc_dev_list[dev_id].dev_major,
+                                        ioc_dev_list[dev_id].dev_minor)) == 0)
+                                fd = open(dev_name, O_RDWR);
+                        else
+                                fprintf(stderr, "mknod %s failed: %s\n",
+                                        dev_name, strerror(errno));
+                }
+
+                if (fd < 0) {
+                        fprintf(stderr, "opening %s failed: %s\n"
+                                "hint: the kernel modules may not be loaded\n",
+                                dev_name, strerror(errno));
+                        return fd;
+                }
+                ioc_dev_list[dev_id].dev_fd = fd;
+        }
+
+        return ioc_dev_list[dev_id].dev_fd;
 }
 
 
 static int 
 do_ioctl(int dev_id, unsigned int opc, void *buf)
 {
-       int fd, rc;
-       
-       fd = open_ioc_dev(dev_id);
-       if (fd < 0) 
-               return fd;
-
-       rc = ioctl(fd, opc, buf);
-       return rc;
-       
+        int fd, rc;
+        
+        fd = open_ioc_dev(dev_id);
+        if (fd < 0) 
+                return fd;
+
+        rc = ioctl(fd, opc, buf);
+        return rc;
+        
 }
 
 static FILE *
 get_dump_file() 
 {
-       FILE *fp = NULL;
-       
-       if (!dump_filename) {
-               fprintf(stderr, "no dump filename\n");
-       } else 
-               fp = fopen(dump_filename, "a");
-       return fp;
+        FILE *fp = NULL;
+        
+        if (!dump_filename) {
+                fprintf(stderr, "no dump filename\n");
+        } else 
+                fp = fopen(dump_filename, "a");
+        return fp;
 }
 
 /*
@@ -127,25 +148,25 @@ get_dump_file()
 int 
 dump(int dev_id, unsigned int opc, void *buf)
 {
-       FILE *fp;
-       struct dump_hdr dump_hdr;
-        struct portal_ioctl_hdr * ioc_hdr = (struct  portal_ioctl_hdr *) buf;
-       int rc;
-       
-       printf("dumping opc %x to %s\n", opc, dump_filename);
-       
-
-       dump_hdr.magic = 0xdeadbeef;
-       dump_hdr.dev_id = dev_id;
-       dump_hdr.opc = opc;
-
-       fp = get_dump_file();
-       if (fp == NULL) {
-               fprintf(stderr, "%s: %s\n", dump_filename, 
-                       strerror(errno));
-               return -EINVAL;
-       }
-       
+        FILE *fp;
+        struct dump_hdr dump_hdr;
+        struct libcfs_ioctl_hdr * ioc_hdr = (struct  libcfs_ioctl_hdr *) buf;
+        int rc;
+        
+        printf("dumping opc %x to %s\n", opc, dump_filename);
+        
+
+        dump_hdr.magic = 0xdeadbeef;
+        dump_hdr.dev_id = dev_id;
+        dump_hdr.opc = opc;
+
+        fp = get_dump_file();
+        if (fp == NULL) {
+                fprintf(stderr, "%s: %s\n", dump_filename, 
+                        strerror(errno));
+                return -EINVAL;
+        }
+        
         rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp);
         if (rc == 1)
                 rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp);
@@ -161,32 +182,36 @@ dump(int dev_id, unsigned int opc, void *buf)
 
 /* register a device to send ioctls to.  */
 int 
-register_ioc_dev(int dev_id, const char * dev_name) 
+register_ioc_dev(int dev_id, const char * dev_name, int major, int minor)
 {
 
-       if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list))
-               return -EINVAL;
-
-       unregister_ioc_dev(dev_id);
+        if (dev_id < 0 || 
+            dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0]))
+                return -EINVAL;
 
-       ioc_dev_list[dev_id].dev_name = dev_name;
-       ioc_dev_list[dev_id].dev_fd = -1;
+        unregister_ioc_dev(dev_id);
 
-       return dev_id;
+        ioc_dev_list[dev_id].dev_name = dev_name;
+        ioc_dev_list[dev_id].dev_fd = -1;
+        ioc_dev_list[dev_id].dev_major = major;
+        ioc_dev_list[dev_id].dev_minor = minor;
+        return dev_id;
 }
 
 void
 unregister_ioc_dev(int dev_id) 
 {
 
-       if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list))
-               return;
-       if (ioc_dev_list[dev_id].dev_name != NULL &&
-           ioc_dev_list[dev_id].dev_fd >= 0) 
-               close(ioc_dev_list[dev_id].dev_fd);
+        if (dev_id < 0 || 
+            dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0]))
+                return;
+        if (ioc_dev_list[dev_id].dev_name != NULL &&
+            ioc_dev_list[dev_id].dev_fd >= 0) 
+                close(ioc_dev_list[dev_id].dev_fd);
 
-       ioc_dev_list[dev_id].dev_name = NULL;
-       ioc_dev_list[dev_id].dev_fd = -1;
+        ioc_dev_list[dev_id].dev_name = NULL;
+        ioc_dev_list[dev_id].dev_fd = -1;
 }
 
 /* If this file is set, then all ioctl buffers will be 
@@ -194,15 +219,15 @@ unregister_ioc_dev(int dev_id)
 int
 set_ioctl_dump(char * file)
 {
-       if (dump_filename)
-               free(dump_filename);
-       
-       dump_filename = strdup(file);
+        if (dump_filename)
+                free(dump_filename);
+        
+        dump_filename = strdup(file);
         if (dump_filename == NULL)
                 abort();
 
         set_ioc_handler(&dump);
-       return 0;
+        return 0;
 }
 
 int
@@ -222,69 +247,69 @@ l_ioctl(int dev_id, unsigned int opc, void *buf)
 int 
 parse_dump(char * dump_file, ioc_handler_t ioc_func)
 {
-       int line =0;
-       struct stat st;
-       char *start, *buf, *end;
+        int line =0;
+        struct stat st;
+        char *start, *buf, *end;
 #ifndef __CYGWIN__
         int fd;
 #else
         HANDLE fd, hmap;
         DWORD size;
 #endif
-       
+        
 #ifndef __CYGWIN__
-       fd = syscall(SYS_open, dump_file, O_RDONLY);
+        fd = syscall(SYS_open, dump_file, O_RDONLY);
         if (fd < 0) {
                 fprintf(stderr, "couldn't open %s: %s\n", dump_file, 
                         strerror(errno));
                 exit(1);
         }
 
-       if (fstat(fd, &st)) { 
-               perror("stat fails");
-               exit(1);
-       }
+        if (fstat(fd, &st)) { 
+                perror("stat fails");
+                exit(1);
+        }
 
-       if (st.st_size < 1) {
-               fprintf(stderr, "KML is empty\n");
-               exit(1);
-       }
+        if (st.st_size < 1) {
+                fprintf(stderr, "KML is empty\n");
+                exit(1);
+        }
 
-       start = buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0);
-       end = start + st.st_size;
-       close(fd);
+        start = buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0);
+        end = start + st.st_size;
+        close(fd);
         if (start == MAP_FAILED) {
-               fprintf(stderr, "can't create file mapping\n");
-               exit(1);
+                fprintf(stderr, "can't create file mapping\n");
+                exit(1);
         }
 #else
         fd = CreateFile(dump_file, GENERIC_READ, FILE_SHARE_READ, NULL,
                         OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
         size = GetFileSize(fd, NULL);
         if (size < 1) {
-               fprintf(stderr, "KML is empty\n");
-               exit(1);
-       }
+                fprintf(stderr, "KML is empty\n");
+                exit(1);
+        }
 
         hmap = CreateFileMapping(fd, NULL, PAGE_READONLY, 0,0, NULL);
         start = buf = MapViewOfFile(hmap, FILE_MAP_READ, 0, 0, 0);
         end = buf + size;
         CloseHandle(fd);
         if (start == NULL) {
-               fprintf(stderr, "can't create file mapping\n");
-               exit(1);
+                fprintf(stderr, "can't create file mapping\n");
+                exit(1);
         }
 #endif /* __CYGWIN__ */
 
-       while (buf < end) {
+        while (buf < end) {
                 struct dump_hdr *dump_hdr = (struct dump_hdr *) buf;
-                struct portal_ioctl_hdr * data;
+                struct libcfs_ioctl_hdr * data;
                 char tmp[8096];
                 int rc;
 
                 line++;
 
-                data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr));
+                data = (struct libcfs_ioctl_hdr *) (buf + sizeof(*dump_hdr));
                 if (buf + data->ioc_len > end ) {
                         fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf,
                                 data->ioc_len, end);
@@ -307,7 +332,7 @@ parse_dump(char * dump_file, ioc_handler_t ioc_func)
                 }
 
                 buf += data->ioc_len + sizeof(*dump_hdr);
-       }
+        }
 
 #ifndef __CYGWIN__
         munmap(start, end - start);
@@ -316,7 +341,7 @@ parse_dump(char * dump_file, ioc_handler_t ioc_func)
         CloseHandle(hmap);
 #endif
 
-       return 0;
+        return 0;
 }
 
 int 
@@ -326,8 +351,8 @@ jt_ioc_dump(int argc, char **argv)
                 fprintf(stderr, "usage: %s [hostname]\n", argv[0]);
                 return 0;
         }
-       printf("setting dumpfile to: %s\n", argv[1]);
-       
-       set_ioctl_dump(argv[1]);
-       return 0;
+        printf("setting dumpfile to: %s\n", argv[1]);
+        
+        set_ioctl_dump(argv[1]);
+        return 0;
 }
diff --git a/lnet/utils/lbstats b/lnet/utils/lbstats
new file mode 100755 (executable)
index 0000000..a8f0857
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+echo "=== Router Buffers ======="
+test -e /proc/sys/lnet/buffers && cat /proc/sys/lnet/buffers
+echo
+echo "=== NIs ============================================"
+test -e /proc/sys/lnet/nis && cat /proc/sys/lnet/nis
+echo
+echo "=== Peers ============================================================="
+test -e /proc/sys/lnet/peers && cat /proc/sys/lnet/peers
+echo
index b91295b..2f740c1 100644 (file)
 #include <unistd.h>
 #include <sys/param.h>
 #include <assert.h>
-
-#ifdef HAVE_LIBREADLINE
-#define        READLINE_LIBRARY
-#include <readline/readline.h>
-
-/* completion_matches() is #if 0-ed out in modern glibc */
-#ifndef completion_matches
-#  define completion_matches rl_completion_matches
-#endif
-#endif
-
-extern void using_history(void);
-extern void stifle_history(int);
-extern void add_history(char *);
+#include <lnet/api-support.h>
 
 #include "parser.h"
 
index 692342f..671e78c 100644 (file)
 #ifdef HAVE_ENDIAN_H
 #include <endian.h>
 #endif
-#if CRAY_PORTALS
-#ifdef REDSTORM
-#define __QK__
-#endif
-#include <portals/ipmap.h>
-#endif
 
 #include <libcfs/portals_utils.h>
-#include <portals/api-support.h>
-#include <portals/ptlctl.h>
-#include <portals/lib-types.h>
-#include <portals/socknal.h>
+#include <lnet/api-support.h>
+#include <lnet/lnetctl.h>
+#include <lnet/socklnd.h>
 #include "parser.h"
 
-unsigned int portal_debug;
-unsigned int portal_printk;
-
-static unsigned int g_nal = 0;
-
-typedef struct
-{
-        char *name;
-        int   num;
-} name2num_t;
-
-static name2num_t nalnames[] = {
-        {"any",         0},
-#if !CRAY_PORTALS
-        {"tcp",                SOCKNAL},
-        {"elan",       QSWNAL},
-        {"gm",         GMNAL},
-        {"openib",      OPENIBNAL},
-        {"iib",         IIBNAL},
-        {"vib",         VIBNAL},
-        {"lo",          LONAL},
-        {"ra",          RANAL},
-#else
-        {"cray_kern_nal", CRAY_KERN_NAL},
-        {"cray_user_nal", CRAY_USER_NAL},
-        {"cray_qk_nal",   CRAY_QK_NAL},
-#endif
-        {NULL,         -1}
-};
+unsigned int libcfs_debug;
+unsigned int libcfs_printk;
 
-static cfg_record_cb_t g_record_cb;
+static int   g_net_set;
+static __u32 g_net;
 
 /* Convert a string boolean to an int; "enable" -> 1 */
-int ptl_parse_bool (int *b, char *str) {
+int 
+lnet_parse_bool (int *b, char *str) 
+{
         if (!strcasecmp (str, "no") ||
             !strcasecmp (str, "n") ||
             !strcasecmp (str, "off") ||
@@ -116,116 +85,18 @@ int ptl_parse_bool (int *b, char *str) {
         return (-1);
 }
 
-/* Convert human readable size string to and int; "1k" -> 1000 */
-int ptl_parse_size (int *sizep, char *str) {
-        int size;
-        char mod[32];
-
-        switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) {
-        default:
-                return (-1);
-
-        case 1:
-                *sizep = size;
-                return (0);
-
-        case 2:
-                switch (*mod) {
-                case 'g':
-                case 'G':
-                        *sizep = size << 30;
-                        return (0);
-
-                case 'm':
-                case 'M':
-                        *sizep = size << 20;
-                        return (0);
-
-                case 'k':
-                case 'K':
-                        *sizep = size << 10;
-                        return (0);
-
-                default:
-                        *sizep = size;
-                        return (0);
-                }
-        }
-}
-
-int 
-ptl_set_cfg_record_cb(cfg_record_cb_t cb)
-{
-        g_record_cb = cb;
-        return 0;
-}
-
-int 
-pcfg_ioctl(struct portals_cfg *pcfg) 
-{
-        int rc;
-
-        if (pcfg->pcfg_nal ==0)
-                pcfg->pcfg_nal    = g_nal;
-
-        if (g_record_cb) {
-                rc = g_record_cb(PORTALS_CFG_TYPE, sizeof(*pcfg), pcfg);
-        } else {
-                struct portal_ioctl_data data;
-                PORTAL_IOC_INIT (data);
-                data.ioc_pbuf1   = (char*)pcfg;
-                data.ioc_plen1   = sizeof(*pcfg);
-                /* XXX liblustre hack XXX */
-                data.ioc_nal_cmd = pcfg->pcfg_command;
-                data.ioc_nid = pcfg->pcfg_nid;
-
-                rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
-
-                if (rc == 0 && pcfg->pcfg_version != PORTALS_CFG_VERSION)
-                        return -EINVAL;
-        }
-
-        return (rc);
-}
-
-
-
-static name2num_t *
-name2num_lookup_name (name2num_t *table, char *str)
-{
-        while (table->name != NULL)
-                if (!strcmp (str, table->name))
-                        return (table);
-                else
-                        table++;
-        return (NULL);
-}
-
-static name2num_t *
-name2num_lookup_num (name2num_t *table, int num)
-{
-        while (table->name != NULL)
-                if (num == table->num)
-                        return (table);
-                else
-                        table++;
-        return (NULL);
-}
-
 int
-ptl_name2nal (char *str)
-{
-        name2num_t *e = name2num_lookup_name (nalnames, str);
-
-        return ((e == NULL) ? -1 : e->num);
-}
-
-static char *
-nal2name (int nal)
+lnet_parse_port (int *port, char *str)
 {
-        name2num_t *e = name2num_lookup_num (nalnames, nal);
+        char      *end;
+        
+        *port = strtol (str, &end, 0);
 
-        return ((e == NULL) ? "???" : e->name);
+        if (*end == 0 &&                        /* parsed whole string */
+            *port > 0 && *port < 65536)         /* minimal sanity check */
+                return (0);
+        
+        return (-1);
 }
 
 #ifdef HAVE_GETHOSTBYNAME
@@ -252,50 +123,7 @@ ptl_gethostbyname(char * hname) {
 #endif
 
 int
-ptl_parse_port (int *port, char *str)
-{
-        char      *end;
-        
-        *port = strtol (str, &end, 0);
-
-        if (*end == 0 &&                        /* parsed whole string */
-            *port > 0 && *port < 65536)         /* minimal sanity check */
-                return (0);
-        
-        return (-1);
-}
-
-int
-ptl_parse_time (time_t *t, char *str) 
-{
-        char          *end;
-        int            n;
-        struct tm      tm;
-        
-        *t = strtol (str, &end, 0);
-        if (*end == 0) /* parsed whole string */
-                return (0);
-        
-        memset (&tm, 0, sizeof (tm));
-        n = sscanf (str, "%d-%d-%d-%d:%d:%d",
-                    &tm.tm_year, &tm.tm_mon, &tm.tm_mday, 
-                    &tm.tm_hour, &tm.tm_min, &tm.tm_sec);
-        if (n != 6)
-                return (-1);
-        
-        tm.tm_mon--;                    /* convert to 0 == Jan */
-        tm.tm_year -= 1900;             /* y2k quirk */
-        tm.tm_isdst = -1;               /* dunno if it's daylight savings... */
-        
-        *t = mktime (&tm);
-        if (*t == (time_t)-1)
-                return (-1);
-                        
-        return (0);
-}
-
-int
-ptl_parse_ipquad (__u32 *ipaddrp, char *str)
+lnet_parse_ipquad (__u32 *ipaddrp, char *str)
 {
         int             a;
         int             b;
@@ -314,7 +142,7 @@ ptl_parse_ipquad (__u32 *ipaddrp, char *str)
 }
 
 int
-ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
+lnet_parse_ipaddr (__u32 *ipaddrp, char *str)
 {
 #ifdef HAVE_GETHOSTBYNAME
         struct hostent *he;
@@ -325,7 +153,7 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
                 return (0);
         }
 
-        if (ptl_parse_ipquad(ipaddrp, str) == 0)
+        if (lnet_parse_ipquad(ipaddrp, str) == 0)
                 return (0);
 
 #ifdef HAVE_GETHOSTBYNAME
@@ -366,226 +194,272 @@ ptl_ipaddr_2_str (__u32 ipaddr, char *str, int lookup)
 }
 
 int
-ptl_parse_nid (ptl_nid_t *nidp, char *str)
+lnet_parse_time (time_t *t, char *str) 
 {
-        __u32               ipaddr;
-        char               *end;
-        unsigned long long  ullval;
+        char          *end;
+        int            n;
+        struct tm      tm;
         
-        if (ptl_parse_ipaddr (&ipaddr, str) == 0) {
-#if !CRAY_PORTALS
-                *nidp = (ptl_nid_t)ipaddr;
-#else
-                *nidp = (((ptl_nid_t)ipaddr & PNAL_HOSTID_MASK) << PNAL_VNODE_SHIFT);
-#endif
-                return (0);
-        }
-
-        ullval = strtoull(str, &end, 0);
-        if (end != str && *end == 0) {
-                /* parsed whole non-empty string */
-                *nidp = (ptl_nid_t)ullval;
+        *t = strtol (str, &end, 0);
+        if (*end == 0) /* parsed whole string */
                 return (0);
-        }
+        
+        memset (&tm, 0, sizeof (tm));
+        n = sscanf (str, "%d-%d-%d-%d:%d:%d",
+                    &tm.tm_year, &tm.tm_mon, &tm.tm_mday, 
+                    &tm.tm_hour, &tm.tm_min, &tm.tm_sec);
+        if (n != 6)
+                return (-1);
+        
+        tm.tm_mon--;                    /* convert to 0 == Jan */
+        tm.tm_year -= 1900;             /* y2k quirk */
+        tm.tm_isdst = -1;               /* dunno if it's daylight savings... */
+        
+        *t = mktime (&tm);
+        if (*t == (time_t)-1)
+                return (-1);
+                        
+        return (0);
+}
 
-        return (-1);
+int g_net_is_set (char *cmd) 
+{
+        if (g_net_set)
+                return 1;
+        
+        if (cmd != NULL)
+                fprintf(stderr, 
+                        "You must run the 'network' command before '%s'.\n",
+                        cmd);
+        return 0;
 }
 
-int
-ptl_parse_anynid (ptl_nid_t *nidp, char *str)
+int g_net_is_compatible (char *cmd, ...)
 {
-        if (!strcmp (str, "_all_")) {
-                *nidp = PTL_NID_ANY;
+        va_list       ap;
+        int           nal;
+
+        if (!g_net_is_set(cmd))
                 return 0;
-        }
 
-        return ptl_parse_nid(nidp, str);
+        va_start(ap, cmd);
+
+        do {
+                nal = va_arg (ap, int);
+                if (nal == LNET_NETTYP(g_net)) {
+                        va_end (ap);
+                        return 1;
+                }
+        } while (nal != 0);
+        
+        va_end (ap);
+        
+        if (cmd != NULL)
+                fprintf (stderr, 
+                         "Command %s not compatible with %s NAL\n",
+                         cmd, 
+                         libcfs_lnd2str(LNET_NETTYP(g_net)));
+        return 0;
 }
 
-__u64 ptl_nid2u64(ptl_nid_t nid)
+int ptl_initialize(int argc, char **argv) 
 {
-        switch (sizeof (nid)) {
-        case 8:
-                return (nid);
-        case 4:
-                return ((__u32)nid);
-        default:
-                fprintf(stderr, "Unexpected sizeof(ptl_nid_t) == %u\n",
-                        (int)sizeof(nid));
-                abort();
-                /* notreached */
-                return (-1);
-        }
+        register_ioc_dev(LNET_DEV_ID, LNET_DEV_PATH,
+                         LNET_DEV_MAJOR, LNET_DEV_MINOR);
+        return 0;
 }
 
-char *
-ptl_nid2str (char *buffer, ptl_nid_t nid)
-{
-        __u64           nid64 = ptl_nid2u64(nid);
-#ifdef HAVE_GETHOSTBYNAME
-        struct hostent *he = 0;
 
-        /* Don't try to resolve NIDs that are e.g. Elan host IDs.  Assume
-         * TCP addresses in the 0.x.x.x subnet are not in use.  This can
-         * happen on routers and slows things down a _lot_.  Bug 3442. */
-        if (nid & 0xff000000) {
-                __u32 addr = htonl((__u32)nid); /* back to NETWORK byte order */
+int jt_ptl_network(int argc, char **argv)
+{
+        struct libcfs_ioctl_data data;
+        __u32                    net = LNET_NIDNET(LNET_NID_ANY);
+        int                      rc;
 
-                he = gethostbyaddr((const char *)&addr, sizeof(addr), AF_INET);
+        if (argc < 2) {
+                fprintf(stderr, "usage: %s <net>|up|down\n", argv[0]);
+                return 0;
         }
 
-        if (he != NULL)
-                sprintf(buffer, "%#x:%s", (int)(nid64 >> 32), he->h_name);
-        else
-#endif /* HAVE_GETHOSTBYNAME */
-                sprintf(buffer, LPX64, nid64);
+        if (!strcmp(argv[1], "unconfigure") ||
+            !strcmp(argv[1], "down")) {
+                LIBCFS_IOC_INIT(data);
+                rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_UNCONFIGURE, &data);
 
-        return (buffer);
-}
+                if (rc == 0) {
+                        printf ("LNET ready to unload\n");
+                        return 0;
+                }
 
-int g_nal_is_set () 
-{
-        if (g_nal == 0) {
-                fprintf (stderr, "Error: you must run the 'network' command first.\n");
-                return (0);
+                if (errno == EBUSY)
+                        fprintf(stderr, "LNET busy\n");
+                else
+                        fprintf(stderr, "LNET unconfigure error %d: %s\n",
+                                errno, strerror(errno));
+                return -1;
         }
 
-        return (1);
-}
-
-int g_nal_is_compatible (char *cmd, ...)
-{
-        va_list       ap;
-        int           nal;
-
-        if (!g_nal_is_set ())
-                return (0);
+        if (!strcmp(argv[1], "configure") ||
+            !strcmp(argv[1], "up")) {
+                LIBCFS_IOC_INIT(data);
+                rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_CONFIGURE, &data);
 
-        va_start (ap, cmd);
+                if (rc == 0) {
+                        printf ("LNET configured\n");
+                        return 0;
+                }
 
-        do {
-                nal = va_arg (ap, int);
-        } while (nal != 0 && nal != g_nal);
-        
-        va_end (ap);
-        
-        if (g_nal == nal)
-                return (1);
+                fprintf(stderr, "LNET configure error %d: %s\n",
+                        errno, strerror(errno));
+                return -1;
+        }
 
-        if (cmd != NULL) {
-                /* Don't complain verbosely if we've not been passed a command
-                 * name to complain about! */
-                fprintf (stderr, "Command %s not compatible with nal %s\n",
-                         cmd, nal2name (g_nal));
+        net = libcfs_str2net(argv[1]);
+        if (net == LNET_NIDNET(LNET_NID_ANY)) {
+                fprintf(stderr, "Can't parse net %s\n", argv[1]);
+                return -1;
         }
-        return (0);
+
+        g_net_set = 1;
+        g_net = net;
+        return 0;
 }
 
 int
-sock_write (int cfd, void *buffer, int nob)
+jt_ptl_list_nids(int argc, char **argv)
 {
-        while (nob > 0)
-        {
-                int rc = write (cfd, buffer, nob);
+        struct libcfs_ioctl_data data;
+        int                      all = 0, return_nid = 0;
+        int                      count;
+        int                      rc;
 
-                if (rc < 0)
-                {
-                        if (errno == EINTR)
-                                continue;
-                        
-                        return (rc);
-                }
+        all = (argc == 2) && (strcmp(argv[1], "all") == 0);
+        /* Hack to pass back value */
+        return_nid = (argc == 2) && (argv[1][0] == 1);
 
-                if (rc == 0)
-                {
-                        fprintf (stderr, "Unexpected zero sock_write\n");
-                        abort();
+        if ((argc > 2) && !(all || return_nid)) {
+                fprintf(stderr, "usage: %s [all]\n", argv[0]);
+                return 0;
+        }
+
+        for (count = 0;; count++) {
+                LIBCFS_IOC_INIT (data);
+                data.ioc_count = count;
+                rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_NI, &data);
+                
+                if (rc < 0) {
+                        if ((count > 0) && (errno == ENOENT))
+                                /* We found them all */
+                                break;
+                        fprintf(stderr,"IOC_LIBCFS_GET_NI error %d: %s\n",
+                                errno, strerror(errno));
+                        return -1;
                 }
 
-                nob -= rc;
-                buffer = (char *)buffer + nob;
+                if (all || (LNET_NETTYP(LNET_NIDNET(data.ioc_nid)) != LOLND)) {
+                        printf("%s\n", libcfs_nid2str(data.ioc_nid));
+                        if (return_nid) { 
+                                *(__u64 *)(argv[1]) = data.ioc_nid;
+                                return_nid--;
+                        }
+                }
         }
-        
-        return (0);
+
+        return 0;
 }
 
 int
-sock_read (int cfd, void *buffer, int nob)
+jt_ptl_which_nid (int argc, char **argv)
 {
-        while (nob > 0)
-        {
-                int rc = read (cfd, buffer, nob);
+        struct libcfs_ioctl_data data;
+        int          best_dist = 0;
+        int          best_order = 0;
+        lnet_nid_t   best_nid = LNET_NID_ANY;
+        int          dist;
+        int          order;
+        lnet_nid_t   nid;
+        char        *nidstr;
+        int          rc;
+        int          i;
+
+        if (argc < 2) {
+                fprintf(stderr, "usage: %s NID [NID...]\n", argv[0]);
+                return 0;
+        }
+        
+        for (i = 1; i < argc; i++) {
+                nidstr = argv[i];
+                nid = libcfs_str2nid(nidstr);
+                if (nid == LNET_NID_ANY) {
+                        fprintf(stderr, "Can't parse NID %s\n", nidstr);
+                        return -1;
+                }
                 
-                if (rc < 0)
-                {
-                        if (errno == EINTR)
-                                continue;
-                        
-                        return (rc);
+                LIBCFS_IOC_INIT(data);
+                data.ioc_nid = nid;
+                
+                rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_LNET_DIST, &data);
+                if (rc != 0) {
+                        fprintf(stderr, "Can't get distance to %s: %s\n",
+                                nidstr, strerror(errno));
+                        return -1;
                 }
+
+                dist = data.ioc_u32[0];
+                order = data.ioc_u32[1];
+
+                if (dist < 0) {
+                        if (dist == -EHOSTUNREACH)
+                                continue;
                 
-                if (rc == 0)                    /* EOF */
-                {
-                        errno = ECONNABORTED;
-                        return (-1);
+                        fprintf(stderr, "Unexpected distance to %s: %d\n",
+                                nidstr, dist);
+                        return -1;
                 }
                 
-                nob -= rc;
-                buffer = (char *)buffer + nob;
+                if (best_nid == LNET_NID_ANY ||
+                    dist < best_dist ||
+                    (dist == best_dist && order < best_order)) {
+                        best_dist = dist;
+                        best_order = order;
+                        best_nid = nid;
+                }
         }
-        
-        return (0);
-}
 
-int ptl_initialize(int argc, char **argv) 
-{
-        register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH);
-        return 0;
-}
-
-
-int jt_ptl_network(int argc, char **argv)
-{
-        name2num_t *entry;
-        int         nal;
-        
-        if (argc == 2 &&
-            (nal = ptl_name2nal (argv[1])) >= 0) {
-                g_nal = nal;
-                return (0);
+        if (best_nid == LNET_NID_ANY) {
+                fprintf(stderr, "No reachable NID\n");
+                return -1;
         }
-                
-        fprintf(stderr, "usage: %s \n", argv[0]);
-        for (entry = nalnames; entry->name != NULL; entry++)
-                fprintf (stderr, "%s%s", entry == nalnames ? "<" : "|", entry->name);
-        fprintf(stderr, ">\n");
-        return (-1);
+        
+        printf("%s\n", libcfs_nid2str(best_nid));
+        return 0;
 }
 
 int
 jt_ptl_print_interfaces (int argc, char **argv)
 {
-        struct portals_cfg       pcfg;
+        struct libcfs_ioctl_data data;
         char                     buffer[3][64];
         int                      index;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, 0))
+        if (!g_net_is_compatible (argv[0], SOCKLND, 0))
                 return -1;
 
         for (index = 0;;index++) {
-                PCFG_INIT (pcfg, NAL_CMD_GET_INTERFACE);
-                pcfg.pcfg_count = index;
-
-                rc = pcfg_ioctl (&pcfg);
+                LIBCFS_IOC_INIT(data);
+                data.ioc_net   = g_net;
+                data.ioc_count = index;
+                
+                rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_INTERFACE, &data);
                 if (rc != 0)
                         break;
 
                 printf ("%s: (%s/%s) npeer %d nroute %d\n",
-                        ptl_ipaddr_2_str(pcfg.pcfg_id, buffer[2], 1),
-                        ptl_ipaddr_2_str(pcfg.pcfg_id, buffer[0], 0),
-                        ptl_ipaddr_2_str(pcfg.pcfg_misc, buffer[1], 0),
-                        pcfg.pcfg_fd, pcfg.pcfg_count);
+                        ptl_ipaddr_2_str(data.ioc_u32[0], buffer[2], 1),
+                        ptl_ipaddr_2_str(data.ioc_u32[0], buffer[0], 0),
+                        ptl_ipaddr_2_str(data.ioc_u32[1], buffer[1], 0),
+                        data.ioc_u32[2], data.ioc_u32[3]);
         }
 
         if (index == 0) {
@@ -604,7 +478,7 @@ jt_ptl_print_interfaces (int argc, char **argv)
 int
 jt_ptl_add_interface (int argc, char **argv)
 {
-        struct portals_cfg       pcfg;
+        struct libcfs_ioctl_data data;
         __u32                    ipaddr;
         int                      rc;
         __u32                    netmask = 0xffffff00;
@@ -617,10 +491,10 @@ jt_ptl_add_interface (int argc, char **argv)
                 return 0;
         }
 
-        if (!g_nal_is_compatible(argv[0], SOCKNAL, 0))
+        if (!g_net_is_compatible(argv[0], SOCKLND, 0))
                 return -1;
 
-        if (ptl_parse_ipaddr(&ipaddr, argv[1]) != 0) {
+        if (lnet_parse_ipaddr(&ipaddr, argv[1]) != 0) {
                 fprintf (stderr, "Can't parse ip: %s\n", argv[1]);
                 return -1;
         }
@@ -631,17 +505,18 @@ jt_ptl_add_interface (int argc, char **argv)
                         netmask = 0;
                         for (i = count; i > 0; i--)
                                 netmask = netmask|(1<<(32-i));
-                } else if (ptl_parse_ipquad(&netmask, argv[2]) != 0) {
+                } else if (lnet_parse_ipquad(&netmask, argv[2]) != 0) {
                         fprintf (stderr, "Can't parse netmask: %s\n", argv[2]);
                         return -1;
                 }
         }
 
-        PCFG_INIT(pcfg, NAL_CMD_ADD_INTERFACE);
-        pcfg.pcfg_id     = ipaddr;
-        pcfg.pcfg_misc   = netmask;
+        LIBCFS_IOC_INIT(data);
+        data.ioc_net    = g_net;
+        data.ioc_u32[0] = ipaddr;
+        data.ioc_u32[1] = netmask;
 
-        rc = pcfg_ioctl (&pcfg);
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_ADD_INTERFACE, &data);
         if (rc != 0) {
                 fprintf (stderr, "failed to add interface: %s\n",
                          strerror (errno));
@@ -654,7 +529,7 @@ jt_ptl_add_interface (int argc, char **argv)
 int
 jt_ptl_del_interface (int argc, char **argv)
 {
-        struct portals_cfg       pcfg;
+        struct libcfs_ioctl_data data;
         int                      rc;
         __u32                    ipaddr = 0;
 
@@ -663,19 +538,20 @@ jt_ptl_del_interface (int argc, char **argv)
                 return 0;
         }
 
-        if (!g_nal_is_compatible(argv[0], SOCKNAL, 0))
+        if (!g_net_is_compatible(argv[0], SOCKLND, 0))
                 return -1;
 
         if (argc == 2 &&
-            ptl_parse_ipaddr(&ipaddr, argv[1]) != 0) {
+            lnet_parse_ipaddr(&ipaddr, argv[1]) != 0) {
                 fprintf (stderr, "Can't parse ip: %s\n", argv[1]);
                 return -1;
         }
         
-        PCFG_INIT(pcfg, NAL_CMD_DEL_INTERFACE);
-        pcfg.pcfg_id = ipaddr;
+        LIBCFS_IOC_INIT(data);
+        data.ioc_net    = g_net;
+        data.ioc_u32[0] = ipaddr;
 
-        rc = pcfg_ioctl (&pcfg);
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_DEL_INTERFACE, &data);
         if (rc != 0) {
                 fprintf (stderr, "failed to delete interface: %s\n",
                          strerror (errno));
@@ -688,37 +564,64 @@ jt_ptl_del_interface (int argc, char **argv)
 int
 jt_ptl_print_peers (int argc, char **argv)
 {
-        struct portals_cfg       pcfg;
+        struct libcfs_ioctl_data data;
+        lnet_process_id_t        id;
         char                     buffer[2][64];
         int                      index;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, RANAL, 
-                                  OPENIBNAL, IIBNAL, VIBNAL, 0))
+        if (!g_net_is_compatible (argv[0], SOCKLND, RALND, PTLLND,
+                                  OPENIBLND, CIBLND, IIBLND, VIBLND, O2IBLND, 0))
                 return -1;
 
         for (index = 0;;index++) {
-                PCFG_INIT (pcfg, NAL_CMD_GET_PEER);
-                pcfg.pcfg_count   = index;
-
-                rc = pcfg_ioctl (&pcfg);
+                LIBCFS_IOC_INIT(data);
+                data.ioc_net     = g_net;
+                data.ioc_count   = index;
+                
+                rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_PEER, &data);
                 if (rc != 0)
                         break;
 
-                if (g_nal_is_compatible(NULL, SOCKNAL, 0))
-                        printf (LPX64"[%d]%s@%s:%d #%d\n",
-                                pcfg.pcfg_nid, pcfg.pcfg_wait,
-                                ptl_ipaddr_2_str (pcfg.pcfg_size, buffer[0], 1),
-                                ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1),
-                                pcfg.pcfg_misc, pcfg.pcfg_count);
-                else if (g_nal_is_compatible(NULL, RANAL, OPENIBNAL, VIBNAL, 0))
-                        printf (LPX64"[%d]@%s:%d\n",
-                                pcfg.pcfg_nid, pcfg.pcfg_wait,
-                                ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1),
-                                pcfg.pcfg_misc);
-                else
-                        printf (LPX64"[%d]\n",
-                                pcfg.pcfg_nid, pcfg.pcfg_wait);
+                if (g_net_is_compatible(NULL, SOCKLND, 0)) {
+                        id.nid = data.ioc_nid;
+                        id.pid = data.ioc_u32[4];
+                        printf ("%-20s [%d]%s->%s:%d #%d\n",
+                                libcfs_id2str(id), 
+                                data.ioc_count, /* persistence */
+                                ptl_ipaddr_2_str (data.ioc_u32[2], buffer[0], 1), /* my ip */
+                                ptl_ipaddr_2_str (data.ioc_u32[0], buffer[1], 1), /* peer ip */
+                                data.ioc_u32[1], /* peer port */
+                                data.ioc_u32[3]); /* conn_count */
+                } else if (g_net_is_compatible(NULL, PTLLND, 0)) {
+                        id.nid = data.ioc_nid;
+                        id.pid = data.ioc_u32[4];
+                        printf ("%-20s s %d%s [%d] "LPD64".%06d"
+                                " m "LPD64"/"LPD64" q %d/%d c %d/%d\n",
+                                libcfs_id2str(id),
+                                data.ioc_net,   /* state */
+                                data.ioc_flags ? "" : " ~!h", /* sent_hello */
+                                data.ioc_count, /* refcount */
+                                data.ioc_u64[0]/1000000, /* incarnation secs */
+                                (int)(data.ioc_u64[0]%1000000), /* incarnation usecs */
+                                (((__u64)data.ioc_u32[1])<<32) |
+                                ((__u64)data.ioc_u32[0]), /* next_matchbits */
+                                (((__u64)data.ioc_u32[3])<<32) |
+                                ((__u64)data.ioc_u32[2]), /* last_matchbits_seen */
+                                data.ioc_u32[5] >> 16, /* nsendq */
+                                data.ioc_u32[5] & 0xffff, /* nactiveq */
+                                data.ioc_u32[6] >> 16, /* credits */
+                                data.ioc_u32[6] & 0xffff); /* outstanding_credits */
+                } else if (g_net_is_compatible(NULL, RALND, OPENIBLND, CIBLND, VIBLND, 0)) {
+                        printf ("%-20s [%d]@%s:%d\n",
+                                libcfs_nid2str(data.ioc_nid), /* peer nid */
+                                data.ioc_count,   /* peer persistence */
+                                ptl_ipaddr_2_str (data.ioc_u32[0], buffer[1], 1), /* peer ip */
+                                data.ioc_u32[1]); /* peer port */
+                } else {
+                        printf ("%-20s [%d]\n",
+                                libcfs_nid2str(data.ioc_nid), data.ioc_count);
+                }
         }
 
         if (index == 0) {
@@ -736,23 +639,23 @@ jt_ptl_print_peers (int argc, char **argv)
 int 
 jt_ptl_add_peer (int argc, char **argv)
 {
-        struct portals_cfg       pcfg;
-        ptl_nid_t                nid;
+        struct libcfs_ioctl_data data;
+        lnet_nid_t               nid;
         __u32                    ip = 0;
         int                      port = 0;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, RANAL
-                                  OPENIBNAL, IIBNAL, VIBNAL, 0))
+        if (!g_net_is_compatible (argv[0], SOCKLND, RALND,
+                                  OPENIBLND, CIBLND, IIBLND, VIBLND, 0))
                 return -1;
 
-        if (g_nal_is_compatible(NULL, SOCKNAL, OPENIBNAL, RANAL, 0)) {
+        if (g_net_is_compatible(NULL, SOCKLND, OPENIBLND, CIBLND, RALND, 0)) {
                 if (argc != 4) {
-                        fprintf (stderr, "usage(tcp,openib,ra): %s nid ipaddr port\n", 
+                        fprintf (stderr, "usage(tcp,openib,cib,ra): %s nid ipaddr port\n", 
                                  argv[0]);
                         return 0;
                 }
-        } else if (g_nal_is_compatible(NULL, VIBNAL, 0)) {
+        } else if (g_net_is_compatible(NULL, VIBLND, 0)) {
                 if (argc != 3) {
                         fprintf (stderr, "usage(vib): %s nid ipaddr\n", 
                                  argv[0]);
@@ -763,30 +666,31 @@ jt_ptl_add_peer (int argc, char **argv)
                 return 0;
         }
 
-        if (ptl_parse_nid (&nid, argv[1]) != 0 ||
-                nid == PTL_NID_ANY) {
+        nid = libcfs_str2nid(argv[1]);
+        if (nid == LNET_NID_ANY) {
                 fprintf (stderr, "Can't parse NID: %s\n", argv[1]);
                 return -1;
         }
 
-        if (g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, VIBNAL, RANAL, 0) &&
-            ptl_parse_ipaddr (&ip, argv[2]) != 0) {
+        if (g_net_is_compatible (NULL, SOCKLND, OPENIBLND, CIBLND, VIBLND, RALND, 0) &&
+            lnet_parse_ipaddr (&ip, argv[2]) != 0) {
                 fprintf (stderr, "Can't parse ip addr: %s\n", argv[2]);
                 return -1;
         }
 
-        if (g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, RANAL, 0) &&
-            ptl_parse_port (&port, argv[3]) != 0) {
+        if (g_net_is_compatible (NULL, SOCKLND, OPENIBLND, CIBLND, RALND, 0) &&
+            lnet_parse_port (&port, argv[3]) != 0) {
                 fprintf (stderr, "Can't parse port: %s\n", argv[3]);
                 return -1;
         }
 
-        PCFG_INIT(pcfg, NAL_CMD_ADD_PEER);
-        pcfg.pcfg_nid     = nid;
-        pcfg.pcfg_id      = ip;
-        pcfg.pcfg_misc    = port;
+        LIBCFS_IOC_INIT(data);
+        data.ioc_net    = g_net;
+        data.ioc_nid    = nid;
+        data.ioc_u32[0] = ip;
+        data.ioc_u32[1] = port;
 
-        rc = pcfg_ioctl (&pcfg);
+        rc = l_ioctl (LNET_DEV_ID, IOC_LIBCFS_ADD_PEER, &data);
         if (rc != 0) {
                 fprintf (stderr, "failed to add peer: %s\n",
                          strerror (errno));
@@ -799,60 +703,65 @@ jt_ptl_add_peer (int argc, char **argv)
 int 
 jt_ptl_del_peer (int argc, char **argv)
 {
-        struct portals_cfg       pcfg;
-        ptl_nid_t                nid = PTL_NID_ANY;
+        struct libcfs_ioctl_data data;
+        lnet_nid_t               nid = LNET_NID_ANY;
+        lnet_pid_t               pid = LNET_PID_ANY;
         __u32                    ip = 0;
-        int                      single_share = 0;
-        int                      argidx;
+        char                    *end;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, RANAL, 
-                                  OPENIBNAL, IIBNAL, VIBNAL, 0))
+        if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, PTLLND,
+                                  OPENIBLND, CIBLND, IIBLND, VIBLND, O2IBLND, 0))
                 return -1;
 
-        if (g_nal_is_compatible(NULL, SOCKNAL, 0)) {
-                if (argc > 4) {
-                        fprintf (stderr, "usage: %s [nid] [ipaddr] [single_share]\n",
+        if (g_net_is_compatible(NULL, SOCKLND, 0)) {
+                if (argc > 3) {
+                        fprintf (stderr, "usage: %s [nid] [ipaddr]\n",
                                  argv[0]);
                         return 0;
                 }
-        } else if (argc > 3) {
-                fprintf (stderr, "usage: %s [nid] [single_share]\n", argv[0]);
+        } else if (g_net_is_compatible(NULL, PTLLND, 0)) {
+                if (argc > 3) {
+                        fprintf (stderr, "usage: %s [nid] [pid]\n",
+                                 argv[0]);
+                        return 0;
+                }
+        } else if (argc > 2) {
+                fprintf (stderr, "usage: %s [nid]\n", argv[0]);
                 return 0;
         }
                 
         if (argc > 1 &&
-            ptl_parse_anynid (&nid, argv[1]) != 0) {
+            !libcfs_str2anynid(&nid, argv[1])) {
                 fprintf (stderr, "Can't parse nid: %s\n", argv[1]);
                 return -1;
         }
 
-        argidx = 2;
-        if (g_nal_is_compatible(NULL, SOCKNAL, 0)) {
-                if (argc > argidx &&
-                    ptl_parse_ipaddr (&ip, argv[argidx]) != 0) {
+        if (g_net_is_compatible(NULL, SOCKLND, 0)) {
+                if (argc > 2 &&
+                    lnet_parse_ipaddr (&ip, argv[2]) != 0) {
                         fprintf (stderr, "Can't parse ip addr: %s\n",
-                                 argv[argidx]);
+                                 argv[2]);
                         return -1;
                 }
-                argidx++;
-        }
-        
-        if (argc > argidx) {
-                if (!strcmp (argv[argidx], "single_share")) {
-                        single_share = 1;
-                } else {
-                        fprintf (stderr, "Unrecognised arg %s'\n", argv[3]);
-                        return -1;
+        } else if (g_net_is_compatible(NULL, PTLLND, 0)) {
+                if (argc > 2) {
+                        pid = strtol(argv[2], &end, 0);
+                        if (end == argv[2] || *end != 0) {
+                                fprintf(stderr, "Can't parse pid %s\n",
+                                        argv[2]);
+                                return -1;
+                        }
                 }
         }
+                   
+        LIBCFS_IOC_INIT(data);
+        data.ioc_net    = g_net;
+        data.ioc_nid    = nid;
+        data.ioc_u32[0] = ip;
+        data.ioc_u32[1] = pid;
 
-        PCFG_INIT(pcfg, NAL_CMD_DEL_PEER);
-        pcfg.pcfg_nid = nid;
-        pcfg.pcfg_id = ip;
-        pcfg.pcfg_flags = single_share;
-
-        rc = pcfg_ioctl (&pcfg);
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_DEL_PEER, &data);
         if (rc != 0) {
                 fprintf (stderr, "failed to remove peer: %s\n",
                          strerror (errno));
@@ -865,44 +774,48 @@ jt_ptl_del_peer (int argc, char **argv)
 int 
 jt_ptl_print_connections (int argc, char **argv)
 {
-        struct portals_cfg       pcfg;
+        struct libcfs_ioctl_data data;
+        lnet_process_id_t        id;
         char                     buffer[2][64];
         int                      index;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, RANAL
-                                  OPENIBNAL, IIBNAL, VIBNAL, 0))
+        if (!g_net_is_compatible (argv[0], SOCKLND, RALND,
+                                  OPENIBLND, CIBLND, IIBLND, VIBLND, O2IBLND, 0))
                 return -1;
 
         for (index = 0; ; index++) {
-                PCFG_INIT (pcfg,  NAL_CMD_GET_CONN);
-                pcfg.pcfg_count   = index;
-                
-                rc = pcfg_ioctl (&pcfg);
+                LIBCFS_IOC_INIT(data);
+                data.ioc_net     = g_net;
+                data.ioc_count   = index;
+
+                rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_CONN, &data);
                 if (rc != 0)
                         break;
 
-                if (g_nal_is_compatible (NULL, SOCKNAL, 0))
-                        printf ("[%d]%s:"LPX64"@%s:%d:%s %d/%d %s\n",
-                                pcfg.pcfg_gw_nal,       /* scheduler */
-                                ptl_ipaddr_2_str (pcfg.pcfg_fd, buffer[0], 1), /* local IP addr */
-                                pcfg.pcfg_nid, 
-                                ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1), /* remote IP addr */
-                                pcfg.pcfg_misc,         /* remote port */
-                                (pcfg.pcfg_flags == SOCKNAL_CONN_ANY) ? "A" :
-                                (pcfg.pcfg_flags == SOCKNAL_CONN_CONTROL) ? "C" :
-                                (pcfg.pcfg_flags == SOCKNAL_CONN_BULK_IN) ? "I" :
-                                (pcfg.pcfg_flags == SOCKNAL_CONN_BULK_OUT) ? "O" : "?",
-                                pcfg.pcfg_count,        /* tx buffer size */
-                                pcfg.pcfg_size,         /* rx buffer size */
-                                pcfg.pcfg_wait ? "nagle" : "nonagle");
-                else if (g_nal_is_compatible (NULL, RANAL, 0))
-                        printf ("[%d]"LPX64"\n",
-                                pcfg.pcfg_id,       /* device id */
-                                pcfg.pcfg_nid);
-                else
-                        printf (LPX64"\n",
-                                pcfg.pcfg_nid);
+                if (g_net_is_compatible (NULL, SOCKLND, 0)) {
+                        id.nid = data.ioc_nid;
+                        id.pid = data.ioc_u32[6];
+                        printf ("%-20s %s[%d]%s->%s:%d %d/%d %s\n",
+                                libcfs_id2str(id),
+                                (data.ioc_u32[3] == SOCKLND_CONN_ANY) ? "A" :
+                                (data.ioc_u32[3] == SOCKLND_CONN_CONTROL) ? "C" :
+                                (data.ioc_u32[3] == SOCKLND_CONN_BULK_IN) ? "I" :
+                                (data.ioc_u32[3] == SOCKLND_CONN_BULK_OUT) ? "O" : "?",
+                                data.ioc_u32[4], /* scheduler */
+                                ptl_ipaddr_2_str (data.ioc_u32[2], buffer[0], 1), /* local IP addr */
+                                ptl_ipaddr_2_str (data.ioc_u32[0], buffer[1], 1), /* remote IP addr */
+                                data.ioc_u32[1],         /* remote port */
+                                data.ioc_count, /* tx buffer size */
+                                data.ioc_u32[5], /* rx buffer size */
+                                data.ioc_flags ? "nagle" : "nonagle");
+                } else if (g_net_is_compatible (NULL, RALND, 0)) {
+                        printf ("%-20s [%d]\n",
+                                libcfs_nid2str(data.ioc_nid),
+                                data.ioc_u32[0] /* device id */);
+                } else {
+                        printf ("%s\n", libcfs_nid2str(data.ioc_nid));
+                }
         }
 
         if (index == 0) {
@@ -917,154 +830,10 @@ jt_ptl_print_connections (int argc, char **argv)
         return 0;
 }
 
-int jt_ptl_connect(int argc, char **argv)
-{
-#ifndef HAVE_CONNECT
-        /* no connect() support */
-        return -1;
-#else /* HAVE_CONNECT */
-        struct portals_cfg pcfg;
-        struct sockaddr_in srvaddr;
-        struct sockaddr_in locaddr;
-        __u32 ipaddr;
-        char *flag;
-        int fd, rc;
-        int type = SOCKNAL_CONN_ANY;
-        int port, rport;
-        int o;
-
-        if (argc < 3) {
-                fprintf(stderr, "usage: %s ip port [type]\n", argv[0]);
-                return 0;
-        }
-
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, 0))
-                return -1;
-        
-        rc = ptl_parse_ipaddr (&ipaddr, argv[1]);
-        if (rc != 0) {
-                fprintf(stderr, "Can't parse hostname: %s\n", argv[1]);
-                return -1;
-        }
-
-        if (ptl_parse_port (&port, argv[2]) != 0) {
-                fprintf (stderr, "Can't parse port: %s\n", argv[2]);
-                return -1;
-        }
-
-        if (argc > 3)
-                for (flag = argv[3]; *flag != 0; flag++)
-                        switch (*flag)
-                        {
-                        case 'I':
-                                if (type != SOCKNAL_CONN_ANY) {
-                                        fprintf(stderr, "Can't flag type twice\n");
-                                        return -1;
-                                }
-                                type = SOCKNAL_CONN_BULK_IN;
-                                break;
-
-                        case 'O':
-                                if (type != SOCKNAL_CONN_ANY) {
-                                        fprintf(stderr, "Can't flag type twice\n");
-                                        return -1;
-                                }
-                                type = SOCKNAL_CONN_BULK_OUT;
-                                break;
-
-                        case 'C':
-                                if (type != SOCKNAL_CONN_ANY) {
-                                        fprintf(stderr, "Can't flag type twice\n");
-                                        return -1;
-                                }
-                                type = SOCKNAL_CONN_CONTROL;
-                                break;
-                                
-                        default:
-                                fprintf (stderr, "unrecognised flag '%c'\n",
-                                         *flag);
-                                return (-1);
-                        }
-
-        memset(&locaddr, 0, sizeof(locaddr)); 
-        locaddr.sin_family = AF_INET; 
-        locaddr.sin_addr.s_addr = INADDR_ANY;
-
-        memset(&srvaddr, 0, sizeof(srvaddr));
-        srvaddr.sin_family = AF_INET;
-        srvaddr.sin_port = htons(port);
-        srvaddr.sin_addr.s_addr = htonl(ipaddr);
-
-
-        for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) {
-                fd = socket(PF_INET, SOCK_STREAM, 0); 
-                if ( fd < 0 ) { 
-                        fprintf(stderr, "socket() failed: %s\n", strerror(errno)); 
-                        return -1; 
-                }
-
-                o = 1;
-                rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 
-                                &o, sizeof(o));
-                
-                locaddr.sin_port = htons(rport);
-                rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr)); 
-                if (rc == 0 || errno == EACCES) {
-                        rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr));
-                        if (rc == 0) {
-                                break;
-                        } else if (errno != EADDRINUSE) {
-                                fprintf(stderr, "Error connecting to host: %s\n", strerror(errno));
-                                close(fd);
-                                return -1;
-                        }
-                } else if (errno != EADDRINUSE) {
-                        fprintf(stderr, "Error binding to port %d: %d: %s\n", port, errno, strerror(errno));
-                        close(fd);
-                        return -1;
-                }
-        }
-
-        if (rport == IPPORT_RESERVED / 2) {
-                fprintf(stderr,
-                        "Warning: all privileged ports are in use.\n"); 
-                return -1;
-        }
-
-        printf("Connected host: %s type: %s\n", 
-               argv[1],
-               (type == SOCKNAL_CONN_ANY) ? "A" :
-               (type == SOCKNAL_CONN_CONTROL) ? "C" :
-               (type == SOCKNAL_CONN_BULK_IN) ? "I" :
-               (type == SOCKNAL_CONN_BULK_OUT) ? "O" : "?");
-
-        PCFG_INIT(pcfg, NAL_CMD_REGISTER_PEER_FD);
-        pcfg.pcfg_nal = g_nal;
-        pcfg.pcfg_fd = fd;
-        pcfg.pcfg_misc = type;
-        
-        rc = pcfg_ioctl(&pcfg);
-        if (rc) {
-                fprintf(stderr, "failed to register fd with portals: %s\n", 
-                        strerror(errno));
-                close (fd);
-                return -1;
-        }
-
-        printf("Connection to %s registered with socknal\n", argv[1]);
-
-        rc = close(fd);
-        if (rc)
-                fprintf(stderr, "close failed: %d\n", rc);
-
-        return 0;
-#endif /* HAVE_CONNECT */
-}
-
 int jt_ptl_disconnect(int argc, char **argv)
 {
-        struct portals_cfg       pcfg;
-        ptl_nid_t                nid = PTL_NID_ANY;
+        struct libcfs_ioctl_data data;
+        lnet_nid_t               nid = LNET_NID_ANY;
         __u32                    ipaddr = 0;
         int                      rc;
 
@@ -1073,29 +842,30 @@ int jt_ptl_disconnect(int argc, char **argv)
                 return 0;
         }
 
-        if (!g_nal_is_compatible (NULL, SOCKNAL, RANAL, 
-                                  OPENIBNAL, IIBNAL, VIBNAL, 0))
+        if (!g_net_is_compatible (NULL, SOCKLND, RALND, MXLND,
+                                  OPENIBLND, CIBLND, IIBLND, VIBLND, O2IBLND, 0))
                 return 0;
 
         if (argc >= 2 &&
-            ptl_parse_anynid (&nid, argv[1]) != 0) {
+            !libcfs_str2anynid(&nid, argv[1])) {
                 fprintf (stderr, "Can't parse nid %s\n", argv[1]);
                 return -1;
         }
 
-        if (g_nal_is_compatible (NULL, SOCKNAL, 0) &&
+        if (g_net_is_compatible (NULL, SOCKLND, 0) &&
             argc >= 3 &&
-            ptl_parse_ipaddr (&ipaddr, argv[2]) != 0) {
+            lnet_parse_ipaddr (&ipaddr, argv[2]) != 0) {
                 fprintf (stderr, "Can't parse ip addr %s\n", argv[2]);
                 return -1;
         }
 
-        PCFG_INIT(pcfg, NAL_CMD_CLOSE_CONNECTION);
-        pcfg.pcfg_nid     = nid;
-        pcfg.pcfg_id      = ipaddr;
+        LIBCFS_IOC_INIT(data);
+        data.ioc_net     = g_net;
+        data.ioc_nid     = nid;
+        data.ioc_u32[0]  = ipaddr;
         
-        rc = pcfg_ioctl(&pcfg);
-        if (rc) {
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_CLOSE_CONNECTION, &data);
+        if (rc != 0) {
                 fprintf(stderr, "failed to remove connection: %s\n",
                         strerror(errno));
                 return -1;
@@ -1106,36 +876,30 @@ int jt_ptl_disconnect(int argc, char **argv)
 
 int jt_ptl_push_connection (int argc, char **argv)
 {
-        struct portals_cfg       pcfg;
+        struct libcfs_ioctl_data data;
         int                      rc;
-        ptl_nid_t                nid = PTL_NID_ANY;
-        __u32                    ipaddr = 0;
+        lnet_nid_t               nid = LNET_NID_ANY;
 
-        if (argc > 3) {
-                fprintf(stderr, "usage: %s [nid] [ip]\n", argv[0]);
+        if (argc > 2) {
+                fprintf(stderr, "usage: %s [nid]\n", argv[0]);
                 return 0;
         }
 
-        if (!g_nal_is_compatible (argv[0], SOCKNAL, 0))
+        if (!g_net_is_compatible (argv[0], SOCKLND, 0))
                 return -1;
         
         if (argc > 1 &&
-            ptl_parse_anynid (&nid, argv[1]) != 0) {
+            !libcfs_str2anynid(&nid, argv[1])) {
                 fprintf(stderr, "Can't parse nid: %s\n", argv[1]);
                 return -1;
         }
                         
-        if (argc > 2 &&
-            ptl_parse_ipaddr (&ipaddr, argv[2]) != 0) {
-                fprintf(stderr, "Can't parse ipaddr: %s\n", argv[2]);
-        }
-
-        PCFG_INIT(pcfg, NAL_CMD_PUSH_CONNECTION);
-        pcfg.pcfg_nid     = nid;
-        pcfg.pcfg_id      = ipaddr;
+        LIBCFS_IOC_INIT(data);
+        data.ioc_net     = g_net;
+        data.ioc_nid     = nid;
         
-        rc = pcfg_ioctl(&pcfg);
-        if (rc) {
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_PUSH_CONNECTION, &data);
+        if (rc != 0) {
                 fprintf(stderr, "failed to push connection: %s\n",
                         strerror(errno));
                 return -1;
@@ -1147,33 +911,32 @@ int jt_ptl_push_connection (int argc, char **argv)
 int 
 jt_ptl_print_active_txs (int argc, char **argv)
 {
-        struct portals_cfg       pcfg;
+        struct libcfs_ioctl_data data;
         int                      index;
         int                      rc;
 
-        if (!g_nal_is_compatible (argv[0], QSWNAL, 0))
+        if (!g_net_is_compatible (argv[0], QSWLND, 0))
                 return -1;
 
         for (index = 0;;index++) {
-                PCFG_INIT(pcfg, NAL_CMD_GET_TXDESC);
-                pcfg.pcfg_count   = index;
-        
-                rc = pcfg_ioctl(&pcfg);
+                LIBCFS_IOC_INIT(data);
+                data.ioc_net   = g_net;
+                data.ioc_count = index;
+
+                rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_TXDESC, &data);
                 if (rc != 0)
                         break;
 
-                printf ("%5s payload %6d bytes to "LPX64" via "LPX64" by pid %6d: %s, %s, state %d\n",
-                        pcfg.pcfg_count == PTL_MSG_ACK ? "ACK" :
-                        pcfg.pcfg_count == PTL_MSG_PUT ? "PUT" :
-                        pcfg.pcfg_count == PTL_MSG_GET ? "GET" :
-                        pcfg.pcfg_count == PTL_MSG_REPLY ? "REPLY" : "<weird message>",
-                        pcfg.pcfg_size,
-                        pcfg.pcfg_nid,
-                        pcfg.pcfg_nid2,
-                        pcfg.pcfg_misc,
-                        (pcfg.pcfg_flags & 1) ? "delayed" : "immediate",
-                        (pcfg.pcfg_flags & 2) ? "nblk"    : "normal",
-                        pcfg.pcfg_flags >> 2);
+                printf ("type %u payload %6d to %s via %s by pid %6d: "
+                        "%s, %s, state %d\n",
+                        data.ioc_u32[0],
+                        data.ioc_count,
+                        libcfs_nid2str(data.ioc_nid),
+                        libcfs_nid2str(data.ioc_u64[0]),
+                        data.ioc_u32[1],
+                        (data.ioc_flags & 1) ? "delayed" : "immediate",
+                        (data.ioc_flags & 2) ? "nblk"    : "normal",
+                        data.ioc_flags >> 2);
         }
 
         if (index == 0) {
@@ -1188,25 +951,22 @@ jt_ptl_print_active_txs (int argc, char **argv)
         return 0;
 }
 
-int jt_ptl_ping(int argc, char **argv)
+int jt_ptl_ping_test(int argc, char **argv)
 {
         int       rc;
-        ptl_nid_t nid;
+        lnet_nid_t nid;
         long      count   = 1;
         long      size    = 4;
         long      timeout = 1;
-        struct portal_ioctl_data data;
+        struct libcfs_ioctl_data data;
 
         if (argc < 2) {
                 fprintf(stderr, "usage: %s nid [count] [size] [timeout (secs)]\n", argv[0]);
                 return 0;
         }
 
-        if (!g_nal_is_set())
-                return -1;
-
-        if (ptl_parse_nid (&nid, argv[1]) != 0)
-        {
+        nid = libcfs_str2nid(argv[1]);
+        if (nid == LNET_NID_ANY) {
                 fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]);
                 return (-1);
         }
@@ -1228,14 +988,13 @@ int jt_ptl_ping(int argc, char **argv)
         if (argc > 4)
                 timeout = atol (argv[4]);
         
-        PORTAL_IOC_INIT (data);
+        LIBCFS_IOC_INIT (data);
         data.ioc_count   = count;
-        data.ioc_size    = size;
         data.ioc_nid     = nid;
-        data.ioc_nal     = g_nal;
-        data.ioc_timeout = timeout;
+        data.ioc_u32[0]  = size;
+        data.ioc_u32[1]  = timeout;
         
-        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PING, &data);
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_PING_TEST, &data);
         if (rc) {
                 fprintf(stderr, "failed to start pinger: %s\n",
                         strerror(errno));
@@ -1244,73 +1003,90 @@ int jt_ptl_ping(int argc, char **argv)
         return 0;
 }
 
-int jt_ptl_shownid(int argc, char **argv)
+int jt_ptl_ping(int argc, char **argv)
 {
-        struct portal_ioctl_data data;
         int                      rc;
-        
-        if (argc > 1) {
-                fprintf(stderr, "usage: %s\n", argv[0]);
+        int                      timeout;
+        lnet_process_id_t        id;
+        lnet_process_id_t        ids[16];
+        int                      maxids = sizeof(ids)/sizeof(ids[0]);
+        struct libcfs_ioctl_data data;
+        int                      i;
+
+        if (argc < 2) {
+                fprintf(stderr, "usage: %s nid [timeout (secs)] [pid]\n", argv[0]);
                 return 0;
         }
-        
-        if (!g_nal_is_set())
+
+        id.nid = libcfs_str2nid(argv[1]);
+        if (id.nid == LNET_NID_ANY) {
+                fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]);
                 return -1;
+        }
 
-        PORTAL_IOC_INIT (data);
-        data.ioc_nal = g_nal;
-        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data);
-        if (rc < 0)
-                fprintf(stderr, "getting my NID failed: %s\n",
-                        strerror (errno));
+        if (argc > 2)
+                timeout = 1000 * atol(argv[2]);
         else
-                printf(LPX64"\n", data.ioc_nid);
+                timeout = 1000;                 /* default 1 second timeout */
+
+        if (argc > 3)
+                id.pid = atol(argv[3]);
+        else
+                id.pid = LNET_PID_ANY;
+
+        LIBCFS_IOC_INIT (data);
+        data.ioc_nid     = id.nid;
+        data.ioc_u32[0]  = id.pid;
+        data.ioc_u32[1]  = timeout;
+        data.ioc_plen1   = sizeof(ids);
+        data.ioc_pbuf1   = (char *)ids;
+
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_PING, &data);
+        if (rc != 0) {
+                fprintf(stderr, "failed to ping %s: %s\n",
+                        id.pid == LNET_PID_ANY ?
+                        libcfs_nid2str(id.nid) : libcfs_id2str(id),
+                        strerror(errno));
+                return -1;
+        }
+
+        for (i = 0; i < data.ioc_count && i < maxids; i++)
+                printf("%s\n", libcfs_id2str(ids[i]));
+
+        if (data.ioc_count > maxids)
+                printf("%d out of %d ids listed\n", maxids, data.ioc_count);
+
         return 0;
 }
 
 int jt_ptl_mynid(int argc, char **argv)
 {
+        struct libcfs_ioctl_data data;
+        lnet_nid_t               nid;
         int rc;
-        char hostname[1024];
-        char *nidstr;
-        struct portals_cfg pcfg;
-        ptl_nid_t mynid;
 
-        if (argc > 2) {
-                fprintf(stderr, "usage: %s [NID]\n", argv[0]);
-                fprintf(stderr, "NID defaults to the primary IP address of the machine.\n");
+        if (argc != 2) {
+                fprintf(stderr, "usage: %s NID\n", argv[0]);
                 return 0;
         }
 
-        if (!g_nal_is_set())
-                return -1;
-
-        if (argc >= 2)
-                nidstr = argv[1];
-        else if (gethostname(hostname, sizeof(hostname)) != 0) {
-                fprintf(stderr, "gethostname failed: %s\n",
-                        strerror(errno));
+        nid = libcfs_str2nid(argv[1]);
+        if (nid == LNET_NID_ANY) {
+                fprintf(stderr, "Can't parse NID '%s'\n", argv[1]);
                 return -1;
         }
-        else
-                nidstr = hostname;
 
-        rc = ptl_parse_nid (&mynid, nidstr);
-        if (rc != 0) {
-                fprintf (stderr, "Can't convert '%s' into a NID\n", nidstr);
-                return -1;
-        }
-        
-        PCFG_INIT(pcfg, NAL_CMD_REGISTER_MYNID);
-        pcfg.pcfg_nid = mynid;
+        LIBCFS_IOC_INIT(data);
+        data.ioc_net = LNET_NIDNET(nid);
+        data.ioc_nid = nid;
 
-        rc = pcfg_ioctl(&pcfg);
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_REGISTER_MYNID, &data);
         if (rc < 0)
                 fprintf(stderr, "setting my NID failed: %s\n",
                        strerror(errno));
         else
-                printf("registered my nid "LPX64" (%s)\n", 
-                       ptl_nid2u64(mynid), hostname);
+                printf("registered my nid %s\n", libcfs_nid2str(nid));
+
         return 0;
 }
 
@@ -1318,42 +1094,36 @@ int
 jt_ptl_fail_nid (int argc, char **argv)
 {
         int                      rc;
-        ptl_nid_t                nid;
+        lnet_nid_t               nid;
         unsigned int             threshold;
-        struct portal_ioctl_data data;
+        struct libcfs_ioctl_data data;
 
         if (argc < 2 || argc > 3)
         {
-                fprintf (stderr, "usage: %s nid|\"_all_\" [count (0 == mend)]\n", argv[0]);
+                fprintf (stderr, "usage: %s nid|\"*\" [count (0 == mend)]\n", argv[0]);
                 return (0);
         }
         
-        if (!g_nal_is_set())
-                return (-1);
-
-        if (!strcmp (argv[1], "_all_"))
-                nid = PTL_NID_ANY;
-        else if (ptl_parse_anynid (&nid, argv[1]) != 0)
+        if (!libcfs_str2anynid(&nid, argv[1]))
         {
                 fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]);
                 return (-1);
         }
 
-        if (argc < 3)
-                threshold = PTL_MD_THRESH_INF;
-        else if (sscanf (argv[2], "%i", &threshold) != 1) {
+        if (argc < 3) {
+                threshold = LNET_MD_THRESH_INF;
+        } else if (sscanf (argv[2], "%i", &threshold) != 1) {
                 fprintf (stderr, "Can't parse count \"%s\"\n", argv[2]);
                 return (-1);
         }
         
-        PORTAL_IOC_INIT (data);
-        data.ioc_nal = g_nal;
+        LIBCFS_IOC_INIT (data);
         data.ioc_nid = nid;
         data.ioc_count = threshold;
         
-        rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_FAIL_NID, &data);
+        rc = l_ioctl (LNET_DEV_ID, IOC_LIBCFS_FAIL_NID, &data);
         if (rc < 0)
-                fprintf (stderr, "IOC_PORTAL_FAIL_NID failed: %s\n",
+                fprintf (stderr, "IOC_LIBCFS_FAIL_NID failed: %s\n",
                          strerror (errno));
         else
                 printf ("%s %s\n", threshold == 0 ? "Unfailing" : "Failing", argv[1]);
@@ -1362,92 +1132,45 @@ jt_ptl_fail_nid (int argc, char **argv)
 }
 
 int
-jt_ptl_loopback (int argc, char **argv)
-{
-        int                      rc;
-        int                      set;
-        int                      enable;
-        struct portal_ioctl_data data;
-
-        if (argc > 2)
-        {
-                fprintf (stderr, "usage: %s [on|off]\n", argv[0]);
-                return (0);
-        }
-        
-        if (!g_nal_is_set())
-                return (-1);
-
-        set = argc > 1;
-        if (set && ptl_parse_bool (&enable, argv[1]) != 0) {
-                fprintf (stderr, "Can't parse boolean %s\n", argv[1]);
-                return (-1);
-        }
-
-        PORTAL_IOC_INIT (data);
-        data.ioc_nal = g_nal;
-        data.ioc_flags = enable;
-        data.ioc_misc = set;
-        
-        rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_LOOPBACK, &data);
-        if (rc < 0)
-                fprintf (stderr, "IOC_PORTAL_FAIL_NID failed: %s\n",
-                         strerror (errno));
-        else
-                printf ("loopback %s\n", data.ioc_flags ? "enabled" : "disabled");
-        
-        return (0);
-}
-
-int
 jt_ptl_add_route (int argc, char **argv)
 {
-        struct portals_cfg       pcfg;
-        ptl_nid_t                nid1;
-        ptl_nid_t                nid2;
-        ptl_nid_t                gateway_nid;
+        struct libcfs_ioctl_data data;
+        lnet_nid_t               gateway_nid;
+        unsigned int             hops = 1;
+        char                    *end;
         int                      rc;
         
-        if (argc < 3)
+        if (argc < 2 || argc > 3)
         {
-                fprintf (stderr, "usage: %s gateway target [target]\n", argv[0]);
+                fprintf (stderr, "usage: %s gateway [hopcount]\n", argv[0]);
                 return (0);
         }
 
-        if (!g_nal_is_set())
+        if (!g_net_is_set(argv[0]))
                 return (-1);
 
-        if (ptl_parse_nid (&gateway_nid, argv[1]) != 0)
-        {
+        gateway_nid = libcfs_str2nid(argv[1]);
+        if (gateway_nid == LNET_NID_ANY) {
                 fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]);
                 return (-1);
         }
 
-        if (ptl_parse_nid (&nid1, argv[2]) != 0)
-        {
-                fprintf (stderr, "Can't parse first target NID \"%s\"\n", argv[2]);
-                return (-1);
-        }
-
-        if (argc < 4)
-                nid2 = nid1;
-        else if (ptl_parse_nid (&nid2, argv[3]) != 0)
-        {
-                fprintf (stderr, "Can't parse second target NID \"%s\"\n", argv[4]);
-                return (-1);
+        if (argc == 3) {
+                hops = strtoul(argv[2], &end, 0);
+                if (hops >= 256 || *end != 0) {
+                        fprintf (stderr, "Can't parse hopcount \"%s\"\n", argv[2]);
+                        return -1;
+                }
         }
+        
+        LIBCFS_IOC_INIT(data);
+        data.ioc_net = g_net;
+        data.ioc_count = hops;
+        data.ioc_nid = gateway_nid;
 
-        PCFG_INIT(pcfg, NAL_CMD_ADD_ROUTE);
-        pcfg.pcfg_nid = gateway_nid;
-        pcfg.pcfg_nal = ROUTER;
-        pcfg.pcfg_gw_nal = g_nal;
-        pcfg.pcfg_nid2 = MIN (nid1, nid2);
-        pcfg.pcfg_nid3 = MAX (nid1, nid2);
-
-        rc = pcfg_ioctl(&pcfg);
-        if (rc != 0) 
-        {
-                fprintf (stderr, "NAL_CMD_ADD_ROUTE failed: %s\n", strerror (errno));
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_ADD_ROUTE, &data);
+        if (rc != 0) {
+                fprintf (stderr, "IOC_LIBCFS_ADD_ROUTE failed: %s\n", strerror (errno));
                 return (-1);
         }
         
@@ -1457,62 +1180,29 @@ jt_ptl_add_route (int argc, char **argv)
 int
 jt_ptl_del_route (int argc, char **argv)
 {
-        struct portals_cfg       pcfg;
-        ptl_nid_t                nid;
-        ptl_nid_t                nid1 = PTL_NID_ANY;
-        ptl_nid_t                nid2 = PTL_NID_ANY;
+        struct libcfs_ioctl_data data;
+        lnet_nid_t               nid;
         int                      rc;
         
-        if (argc < 2)
-        {
-                fprintf (stderr, "usage: %s targetNID\n", argv[0]);
+        if (argc != 2) {
+                fprintf (stderr, "usage: %s gatewayNID\n", argv[0]);
                 return (0);
         }
 
-        if (!g_nal_is_set())
-                return (-1);
-
-        if (ptl_parse_nid (&nid, argv[1]) != 0)
-        {
-                fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]);
-                return (-1);
-        }
-
-        if (argc >= 3 &&
-            ptl_parse_nid (&nid1, argv[2]) != 0)
-        {
-                fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[2]);
-                return (-1);
+        if (!libcfs_str2anynid(&nid, argv[1])) {
+                fprintf (stderr, "Can't parse gateway NID "
+                         "\"%s\"\n", argv[1]);
+                return -1;
         }
 
-        if (argc < 4) {
-                nid2 = nid1;
-        } else {
-                if (ptl_parse_nid (&nid2, argv[3]) != 0) {
-                        fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[3]);
-                        return (-1);
-                }
+        LIBCFS_IOC_INIT(data);
+        data.ioc_net = g_net_set ? g_net : LNET_NIDNET(LNET_NID_ANY);
+        data.ioc_nid = nid;
 
-                if (nid1 > nid2) {
-                        ptl_nid_t tmp = nid1;
-                        
-                        nid1 = nid2;
-                        nid2 = tmp;
-                }
-        }
-        
-        PCFG_INIT(pcfg, NAL_CMD_DEL_ROUTE);
-        pcfg.pcfg_nal = ROUTER;
-        pcfg.pcfg_gw_nal = g_nal;
-        pcfg.pcfg_nid = nid;
-        pcfg.pcfg_nid2 = nid1;
-        pcfg.pcfg_nid3 = nid2;
-
-        rc = pcfg_ioctl(&pcfg);
-        if (rc != 0) 
-        {
-                fprintf (stderr, "NAL_CMD_DEL_ROUTE ("LPX64") failed: %s\n", 
-                         ptl_nid2u64(nid), strerror (errno));
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_DEL_ROUTE, &data);
+        if (rc != 0) {
+                fprintf (stderr, "IOC_LIBCFS_DEL_ROUTE (%s) failed: %s\n", 
+                         libcfs_nid2str(nid), strerror (errno));
                 return (-1);
         }
         
@@ -1522,9 +1212,9 @@ jt_ptl_del_route (int argc, char **argv)
 int
 jt_ptl_notify_router (int argc, char **argv)
 {
-        struct portals_cfg       pcfg;
+        struct libcfs_ioctl_data data;
         int                      enable;
-        ptl_nid_t                nid;
+        lnet_nid_t               nid;
         int                      rc;
         struct timeval           now;
         time_t                   when;
@@ -1536,13 +1226,13 @@ jt_ptl_notify_router (int argc, char **argv)
                 return (0);
         }
 
-        if (ptl_parse_nid (&nid, argv[1]) != 0)
-        {
+        nid = libcfs_str2nid(argv[1]);
+        if (nid == LNET_NID_ANY) {
                 fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]);
                 return (-1);
         }
 
-        if (ptl_parse_bool (&enable, argv[2]) != 0) {
+        if (lnet_parse_bool (&enable, argv[2]) != 0) {
                 fprintf (stderr, "Can't parse boolean %s\n", argv[2]);
                 return (-1);
         }
@@ -1551,7 +1241,7 @@ jt_ptl_notify_router (int argc, char **argv)
         
         if (argc < 4) {
                 when = now.tv_sec;
-        } else if (ptl_parse_time (&when, argv[3]) != 0) {
+        } else if (lnet_parse_time (&when, argv[3]) != 0) {
                 fprintf(stderr, "Can't parse time %s\n"
                         "Please specify either 'YYYY-MM-DD-HH:MM:SS'\n"
                         "or an absolute unix time in seconds\n", argv[3]);
@@ -1562,19 +1252,16 @@ jt_ptl_notify_router (int argc, char **argv)
                 return (-1);
         }
 
-        PCFG_INIT(pcfg, NAL_CMD_NOTIFY_ROUTER);
-        pcfg.pcfg_nal = ROUTER;
-        pcfg.pcfg_gw_nal = g_nal;
-        pcfg.pcfg_nid = nid;
-        pcfg.pcfg_flags = enable;
+        LIBCFS_IOC_INIT(data);
+        data.ioc_nid = nid;
+        data.ioc_flags = enable;
         /* Yeuch; 'cept I need a __u64 on 64 bit machines... */
-        pcfg.pcfg_nid3 = (__u64)when;
+        data.ioc_u64[0] = (__u64)when;
         
-        rc = pcfg_ioctl(&pcfg);
-        if (rc != 0) 
-        {
-                fprintf (stderr, "NAL_CMD_NOTIFY_ROUTER ("LPX64") failed: %s\n",
-                         ptl_nid2u64(nid), strerror (errno));
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_NOTIFY_ROUTER, &data);
+        if (rc != 0) {
+                fprintf (stderr, "IOC_LIBCFS_NOTIFY_ROUTER (%s) failed: %s\n",
+                         libcfs_nid2str(nid), strerror (errno));
                 return (-1);
         }
         
@@ -1584,105 +1271,96 @@ jt_ptl_notify_router (int argc, char **argv)
 int
 jt_ptl_print_routes (int argc, char **argv)
 {
-        char                      buffer[3][128];
-        struct portals_cfg        pcfg;
+        struct libcfs_ioctl_data  data;
         int                       rc;
         int                       index;
-        int                      gateway_nal;
-        ptl_nid_t                gateway_nid;
-        ptl_nid_t                nid1;
-        ptl_nid_t                nid2;
+        __u32                     net;
+        lnet_nid_t                nid;
+        unsigned int              hops;
         int                       alive;
 
         for (index = 0;;index++)
         {
-                PCFG_INIT(pcfg, NAL_CMD_GET_ROUTE);
-                pcfg.pcfg_nal = ROUTER;
-                pcfg.pcfg_count = index;
+                LIBCFS_IOC_INIT(data);
+                data.ioc_count = index;
                 
-                rc = pcfg_ioctl(&pcfg);
+                rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_ROUTE, &data);
                 if (rc != 0)
                         break;
 
-                gateway_nal = pcfg.pcfg_gw_nal;
-                gateway_nid = pcfg.pcfg_nid;
-                nid1 = pcfg.pcfg_nid2;
-                nid2 = pcfg.pcfg_nid3;
-                alive = pcfg.pcfg_flags;
+                net     = data.ioc_net;
+                hops    = data.ioc_count;
+                nid     = data.ioc_nid;
+                alive   = data.ioc_flags;
 
-                printf ("%8s %18s : %s - %s, %s\n", 
-                        nal2name (gateway_nal), 
-                        ptl_nid2str (buffer[0], gateway_nid),
-                        ptl_nid2str (buffer[1], nid1),
-                        ptl_nid2str (buffer[2], nid2),
-                        alive ? "up" : "down");
+                printf ("net %18s hops %u gw %32s %s\n", 
+                        libcfs_net2str(net), hops,
+                        libcfs_nid2str(nid), alive ? "up" : "down");
         }
 
-        if (index == 0 && errno != ENOENT) {
+        if (errno != ENOENT)
                 fprintf(stderr, "Error getting routes: %s: check dmesg.\n",
                         strerror(errno));
-        }
+
         return (0);
 }
 
 static int
 lwt_control(int enable, int clear)
 {
-        struct portal_ioctl_data data;
+        struct libcfs_ioctl_data data;
         int                      rc;
 
-        PORTAL_IOC_INIT(data);
-        data.ioc_flags = enable;
-        data.ioc_misc = clear;
+        LIBCFS_IOC_INIT(data);
+        data.ioc_flags = (enable ? 1 : 0) | (clear ? 2 : 0);
 
-        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_CONTROL, &data);
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_LWT_CONTROL, &data);
         if (rc == 0)
                 return (0);
 
-        fprintf(stderr, "IOC_PORTAL_LWT_CONTROL failed: %s\n",
+        fprintf(stderr, "IOC_LIBCFS_LWT_CONTROL failed: %s\n",
                 strerror(errno));
         return (-1);
 }
 
 static int
-lwt_snapshot(cycles_t *now, int *ncpu, int *totalsize,
+lwt_snapshot(cycles_t *now, int *ncpu, int *totalsize, 
              lwt_event_t *events, int size)
 {
-        struct portal_ioctl_data data;
+        struct libcfs_ioctl_data data;
         int                      rc;
 
-        PORTAL_IOC_INIT(data);
+        LIBCFS_IOC_INIT(data);
         data.ioc_pbuf1 = (char *)events;
         data.ioc_plen1 = size;
 
-        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_SNAPSHOT, &data);
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_LWT_SNAPSHOT, &data);
         if (rc != 0) {
-                fprintf(stderr, "IOC_PORTAL_LWT_SNAPSHOT failed: %s\n",
+                fprintf(stderr, "IOC_LIBCFS_LWT_SNAPSHOT failed: %s\n",
                         strerror(errno));
                 return (-1);
         }
 
         /* crappy overloads */
-        if (data.ioc_nid2 != sizeof(lwt_event_t) ||
-            data.ioc_nid3 != offsetof(lwt_event_t, lwte_where)) {
+        if (data.ioc_u32[2] != sizeof(lwt_event_t) ||
+            data.ioc_u32[3] != offsetof(lwt_event_t, lwte_where)) {
                 fprintf(stderr,"kernel/user LWT event mismatch %d(%d),%d(%d)\n",
-                        (int)data.ioc_nid2, (int)sizeof(lwt_event_t),
-                        (int)data.ioc_nid3,
+                        (int)data.ioc_u32[2], (int)sizeof(lwt_event_t),
+                        (int)data.ioc_u32[3],
                         (int)offsetof(lwt_event_t, lwte_where));
                 return (-1);
         }
 
-        LASSERT (data.ioc_count != 0);
-        LASSERT (data.ioc_misc != 0);
-
         if (now != NULL)
-                *now = data.ioc_nid;
+                *now = data.ioc_u64[0];
 
+        LASSERT (data.ioc_u32[0] != 0);
         if (ncpu != NULL)
-                *ncpu = data.ioc_count;
+                *ncpu = data.ioc_u32[0];
 
+        LASSERT (data.ioc_u32[1] != 0);
         if (totalsize != NULL)
-                *totalsize = data.ioc_misc;
+                *totalsize = data.ioc_u32[1];
 
         return (0);
 }
@@ -1691,22 +1369,22 @@ static char *
 lwt_get_string(char *kstr)
 {
         char                     *ustr;
-        struct portal_ioctl_data  data;
+        struct libcfs_ioctl_data  data;
         int                       size;
         int                       rc;
 
         /* FIXME: this could maintain a symbol table since we expect to be
          * looking up the same strings all the time... */
 
-        PORTAL_IOC_INIT(data);
+        LIBCFS_IOC_INIT(data);
         data.ioc_pbuf1 = kstr;
         data.ioc_plen1 = 1;        /* non-zero just to fool portal_ioctl_is_invalid() */
         data.ioc_pbuf2 = NULL;
         data.ioc_plen2 = 0;
 
-        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_LOOKUP_STRING, &data);
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_LWT_LOOKUP_STRING, &data);
         if (rc != 0) {
-                fprintf(stderr, "IOC_PORTAL_LWT_LOOKUP_STRING failed: %s\n",
+                fprintf(stderr, "IOC_LIBCFS_LWT_LOOKUP_STRING failed: %s\n",
                         strerror(errno));
                 return (NULL);
         }
@@ -1719,15 +1397,15 @@ lwt_get_string(char *kstr)
                 return (NULL);
         }
 
-        PORTAL_IOC_INIT(data);
+        LIBCFS_IOC_INIT(data);
         data.ioc_pbuf1 = kstr;
         data.ioc_plen1 = 1;        /* non-zero just to fool portal_ioctl_is_invalid() */
         data.ioc_pbuf2 = ustr;
         data.ioc_plen2 = size;
 
-        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_LOOKUP_STRING, &data);
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_LWT_LOOKUP_STRING, &data);
         if (rc != 0) {
-                fprintf(stderr, "IOC_PORTAL_LWT_LOOKUP_STRING failed: %s\n",
+                fprintf(stderr, "IOC_LIBCFS_LWT_LOOKUP_STRING failed: %s\n",
                         strerror(errno));
                 return (NULL);
         }
@@ -1994,7 +1672,7 @@ int jt_ptl_memhog(int argc, char **argv)
 {
         static int                gfp = 0;        /* sticky! */
 
-        struct portal_ioctl_data  data;
+        struct libcfs_ioctl_data  data;
         int                       rc;
         int                       count;
         char                     *end;
@@ -2019,10 +1697,10 @@ int jt_ptl_memhog(int argc, char **argv)
                 gfp = rc;
         }
         
-        PORTAL_IOC_INIT(data);
+        LIBCFS_IOC_INIT(data);
         data.ioc_count = count;
         data.ioc_flags = gfp;
-        rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_MEMHOG, &data);
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_MEMHOG, &data);
 
         if (rc != 0) {
                 fprintf(stderr, "memhog %d failed: %s\n", count, strerror(errno));
@@ -2033,3 +1711,36 @@ int jt_ptl_memhog(int argc, char **argv)
         return 0;
 }
 
+int jt_ptl_testprotocompat(int argc, char **argv)
+{
+        struct libcfs_ioctl_data  data;
+        int                       rc;
+        int                       flags;
+        char                     *end;
+
+        if (argc < 2)  {
+                fprintf(stderr, "usage: %s <number>\n", argv[0]);
+                return 0;
+        }
+
+        flags = strtol(argv[1], &end, 0);
+        if (flags < 0 || *end != 0) {
+                fprintf(stderr, "Can't parse flags '%s'\n", argv[1]);
+                return -1;
+        }
+
+        LIBCFS_IOC_INIT(data);
+        data.ioc_flags = flags;
+        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_TESTPROTOCOMPAT, &data);
+
+        if (rc != 0) {
+                fprintf(stderr, "test proto compat %x failed: %s\n",
+                        flags, strerror(errno));
+                return -1;
+        }
+
+        printf("test proto compat %x OK\n", flags);
+        return 0;
+}
+
+
index 3089211..c3ab2b7 100644 (file)
 
 #include <stdio.h>
 #include <stdlib.h>
-#include <portals/api-support.h>
-#include <portals/ptlctl.h>
+#include <lnet/api-support.h>
+#include <lnet/lnetctl.h>
 
 #include "parser.h"
 
 
 command_t list[] = {
-        {"network", jt_ptl_network, 0,"setup the NAL (args: nal name)"},
+        {"network", jt_ptl_network, 0,"select/configure network (args: up|down|LND name)"},
+        {"net", jt_ptl_network, 0,"select/configure network (args: up|down|LND name)"},
+        {"list_nids", jt_ptl_list_nids, 0,"list local NIDs"},
+        {"which_nid", jt_ptl_which_nid, 0,"select the closest NID"},
         {"print_interfaces", jt_ptl_print_interfaces, 0, "print interface entries (no args)"},
         {"add_interface", jt_ptl_add_interface, 0, "add interface entry (args: ip [netmask])"},
         {"del_interface", jt_ptl_del_interface, 0, "delete interface entries (args: [ip])"},
@@ -37,12 +40,11 @@ command_t list[] = {
         {"add_peer", jt_ptl_add_peer, 0, "add peer entry (args: nid host port)"},
         {"del_peer", jt_ptl_del_peer, 0, "delete peer entry (args: [nid] [host])"},
         {"print_conns", jt_ptl_print_connections, 0, "print connections (no args)"},
-        {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: host port [iIOC])"},
         {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid (args: [nid] [host]"},
         {"push", jt_ptl_push_connection, 0, "flush connection to a remote nid (args: [nid]"},
         {"active_tx", jt_ptl_print_active_txs, 0, "print active transmits (no args)"},
-        {"ping", jt_ptl_ping, 0, "do a ping test (args: nid [count] [size] [timeout])"},
-        {"shownid", jt_ptl_shownid, 0, "print the local NID"},
+        {"testping", jt_ptl_ping_test, 0, "do a ping test (args: nid [count] [size] [timeout])"},
+        {"ping", jt_ptl_ping, 0, "ping (args: nid [timeout] [pid])"},
         {"mynid", jt_ptl_mynid, 0, "inform the socknal of the local NID (args: [hostname])"},
         {"add_route", jt_ptl_add_route, 0, 
          "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"},
@@ -53,7 +55,7 @@ command_t list[] = {
         {"print_routes", jt_ptl_print_routes, 0, "print the routing table (args: none)"},
         {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"},
         {"fail", jt_ptl_fail_nid, 0, "usage: fail nid|_all_ [count]"},
-        {"loopback", jt_ptl_loopback, 0, "usage: loopback [on|off]"},
+        {"testprotocompat", jt_ptl_testprotocompat, 0, "usage: testprotocompat flags"},
         {"help", Parser_help, 0, "help"},
         {"exit", Parser_quit, 0, "quit"},
         {"quit", Parser_quit, 0, "quit"},
index 99bc59b..febe89a 100644 (file)
@@ -16,20 +16,56 @@ timenow ()
    return (tv.tv_sec + tv.tv_usec / 1000000.0);
 }
 
+typedef struct {
+        unsigned long        msgs_alloc;
+        unsigned long        msgs_max;
+        unsigned long        errors;
+        unsigned long        send_count;
+        unsigned long        recv_count;
+        unsigned long        route_count;
+        unsigned long        drop_count;
+        unsigned long long   send_length;
+        unsigned long long   recv_length;
+        unsigned long long   route_length;
+        unsigned long long   drop_length;
+} counters_t;
+
+unsigned long long subull(unsigned long long a, unsigned long long b)
+{
+       if (a < b)
+               return -1ULL - b + a + 1;
+       
+       return a - b;
+}
+
+unsigned long long subul(unsigned long a, unsigned long b)
+{
+       if (a < b)
+               return -1UL - b + a + 1;
+       
+       return a - b;
+}
+
+double rul(unsigned long a, double secs)
+{
+       return (double)a/secs;
+}
+
+double rull(unsigned long long a, double secs)
+{
+       return (double)a/secs;
+}
+
 void
 do_stat (int fd)
 {
    static char  buffer[1024];
    static double last = 0.0;
-   static unsigned long long old_bytes;
-   static unsigned long      old_packets;
-   static unsigned long      old_errors;
+   static counters_t old_counter;
    double now;
    double t;
-   unsigned long long new_bytes, bytes;
-   unsigned long      new_packets, packets;
-   unsigned long      new_errors, errors;
-   unsigned long      depth;
+   counters_t new_counter;
+   counters_t counter;
    int    n;
    
    lseek (fd, 0, SEEK_SET);
@@ -42,51 +78,53 @@ do_stat (int fd)
    }    
    buffer[n] = 0;
    
-   n = sscanf (buffer, "%Lu %lu %lu %lu",
-              &new_bytes, &new_packets, &new_errors, &depth);
-   
-   if (n < 3)
+   n = sscanf (buffer, "%lu %lu %lu %lu %lu %lu %lu %Lu %Lu %Lu %Lu",
+              &new_counter.msgs_alloc, &new_counter.msgs_max,
+              &new_counter.errors, 
+              &new_counter.send_count, &new_counter.recv_count,
+              &new_counter.route_count, &new_counter.drop_count,
+              &new_counter.send_length, &new_counter.recv_length,
+              &new_counter.route_length, &new_counter.drop_length);
+   if (n < 11)
    {
       fprintf (stderr, "Can't parse statfile\n");
       exit (1);
    }
    
-   if (last == 0.0)
-      printf ("%llu bytes, %lu packets (sz %lld), %lu errors", 
-             new_bytes, new_packets,
-             (long long)((new_packets == 0) ? 0LL : new_bytes/new_packets),
-             new_errors);
-   else
-   {
-      t = now - last;
+   if (last == 0.0) {
+          printf ("M %lu(%lu) E %lu S %lu/%llu R %lu/%llu F %lu/%llu D %lu/%llu\n", 
+                  new_counter.msgs_alloc, new_counter.msgs_max,
+                  new_counter.errors, 
+                  new_counter.send_count, new_counter.send_length,
+                  new_counter.recv_count, new_counter.recv_length,
+                  new_counter.route_count, new_counter.route_length, 
+                  new_counter.drop_count, new_counter.drop_length);
+   } else {
+          t = now - last;
 
-      if (new_bytes < old_bytes)
-         bytes = -1ULL - old_bytes + new_bytes + 1;
-      else
-         bytes = new_bytes - old_bytes;
-      if (new_packets < old_packets)
-         packets = -1UL - old_packets + new_packets + 1;
-      else
-         packets = new_packets - old_packets;
-      if (new_errors < old_errors)
-         errors = -1UL - old_errors + new_errors + 1;
-      else
-         errors = new_errors - old_errors;
-      
-      printf ("%9llu bytes (%7.2fMb/s), %7lu packets (sz %5lld, %5ld/s), %lu errors (%ld/s)", 
-             bytes, ((double)bytes)/((1<<20) * t),
-             packets, (long long)((packets == 0) ? 0LL : bytes/packets), (long)(packets/t),
-             errors, (long)(errors/t));
-   }
-   old_bytes = new_bytes;
-   old_packets = new_packets;
-   old_errors = new_errors;
+          counter.msgs_alloc = new_counter.msgs_alloc;
+          counter.msgs_max = new_counter.msgs_max;
+          
+          counter.errors = subul(new_counter.errors, old_counter.errors);
+          counter.send_count = subul(new_counter.send_count, old_counter.send_count);
+          counter.recv_count = subul(new_counter.recv_count, old_counter.recv_count);
+          counter.route_count = subul(new_counter.route_count, old_counter.route_count);
+          counter.drop_count = subul(new_counter.drop_count, old_counter.drop_count);
+          counter.send_length = subull(new_counter.send_length, old_counter.send_length);
+          counter.recv_length = subull(new_counter.recv_length, old_counter.recv_length);
+          counter.route_length = subull(new_counter.route_length, old_counter.route_length);
+          counter.drop_length = subull(new_counter.drop_length, old_counter.drop_length);
 
-   if (n == 4)
-      printf (", depth (%ld)\n", depth);
-   else
-      printf ("\n");
+          printf ("M %3lu(%3lu) E %0.0f S %7.2f/%6.0f R %7.2f/%6.0f F %7.2f/%6.0f D %4.2f/%0.0f\n",
+                  counter.msgs_alloc, counter.msgs_max,
+                  rul(counter.errors,t),
+                  rull(counter.send_length,t*1024.0*1024.0), rul(counter.send_count, t),
+                  rull(counter.recv_length,t*1024.0*1024.0), rul(counter.recv_count, t),
+                  rull(counter.route_length,t*1024.0*1024.0), rul(counter.route_count, t),
+                  rull(counter.drop_length,t*1024.0*1024.0), rul(counter.drop_count, t));
+   }
 
+   old_counter = new_counter;
    fflush (stdout);
    
    lseek (fd, 0, SEEK_SET);
@@ -101,7 +139,7 @@ int main (int argc, char **argv)
    if (argc > 1)
       interval = atoi (argv[1]);
 
-   fd = open ("/proc/sys/portals/router", O_RDONLY);
+   fd = open ("/proc/sys/lnet/stats", O_RDONLY);
    if (fd < 0)
    {
       fprintf (stderr, "Can't open stat: %s\n", strerror (errno));
index 986d081..9590b8b 100644 (file)
@@ -4,8 +4,7 @@
 #include <stdio.h>
 #include <sys/types.h>
 #include <sys/wait.h>
-#include <portals/api-support.h>
-#include <portals/lib-types.h>
+#include <lnet/lib-lnet.h>
 
 #include <string.h>
 
@@ -25,14 +24,14 @@ do {                                            \
 
 #define STRINGIFY(a) #a
 
-#define CHECK_DEFINE(a)                                         \
-do {                                                            \
-        printf ("        LASSERT ("#a" == "STRINGIFY(a)");\n"); \
+#define CHECK_DEFINE(a)                                                 \
+do {                                                                    \
+        printf ("        CLASSERT ("#a" == "STRINGIFY(a)");\n");        \
 } while (0)
 
 #define CHECK_VALUE(a)                                  \
 do {                                                    \
-        printf ("        LASSERT ("#a" == %d);\n", a);  \
+        printf ("        CLASSERT ("#a" == %d);\n", a); \
 } while (0)
 
 #define CHECK_MEMBER_OFFSET(s,m)                \
@@ -59,64 +58,64 @@ do {                                            \
 } while (0)
 
 void
-check_ptl_handle_wire (void)
+check_lnet_handle_wire (void)
 {
-        CHECK_STRUCT (ptl_handle_wire_t);
-        CHECK_MEMBER (ptl_handle_wire_t, wh_interface_cookie);
-        CHECK_MEMBER (ptl_handle_wire_t, wh_object_cookie);
+        CHECK_STRUCT (lnet_handle_wire_t);
+        CHECK_MEMBER (lnet_handle_wire_t, wh_interface_cookie);
+        CHECK_MEMBER (lnet_handle_wire_t, wh_object_cookie);
 }
 
 void
-check_ptl_magicversion (void)
+check_lnet_magicversion (void)
 {
-        CHECK_STRUCT (ptl_magicversion_t);
-        CHECK_MEMBER (ptl_magicversion_t, magic);
-        CHECK_MEMBER (ptl_magicversion_t, version_major);
-        CHECK_MEMBER (ptl_magicversion_t, version_minor);
+        CHECK_STRUCT (lnet_magicversion_t);
+        CHECK_MEMBER (lnet_magicversion_t, magic);
+        CHECK_MEMBER (lnet_magicversion_t, version_major);
+        CHECK_MEMBER (lnet_magicversion_t, version_minor);
 }
 
 void
-check_ptl_hdr (void)
+check_lnet_hdr (void)
 {
-        CHECK_STRUCT (ptl_hdr_t);
-        CHECK_MEMBER (ptl_hdr_t, dest_nid);
-        CHECK_MEMBER (ptl_hdr_t, src_nid);
-        CHECK_MEMBER (ptl_hdr_t, dest_pid);
-        CHECK_MEMBER (ptl_hdr_t, src_pid);
-        CHECK_MEMBER (ptl_hdr_t, type);
-        CHECK_MEMBER (ptl_hdr_t, payload_length);
-        CHECK_MEMBER (ptl_hdr_t, msg);
+        CHECK_STRUCT (lnet_hdr_t);
+        CHECK_MEMBER (lnet_hdr_t, dest_nid);
+        CHECK_MEMBER (lnet_hdr_t, src_nid);
+        CHECK_MEMBER (lnet_hdr_t, dest_pid);
+        CHECK_MEMBER (lnet_hdr_t, src_pid);
+        CHECK_MEMBER (lnet_hdr_t, type);
+        CHECK_MEMBER (lnet_hdr_t, payload_length);
+        CHECK_MEMBER (lnet_hdr_t, msg);
 
         BLANK_LINE ();
         COMMENT ("Ack");
-        CHECK_MEMBER (ptl_hdr_t, msg.ack.dst_wmd);
-        CHECK_MEMBER (ptl_hdr_t, msg.ack.match_bits);
-        CHECK_MEMBER (ptl_hdr_t, msg.ack.mlength);
+        CHECK_MEMBER (lnet_hdr_t, msg.ack.dst_wmd);
+        CHECK_MEMBER (lnet_hdr_t, msg.ack.match_bits);
+        CHECK_MEMBER (lnet_hdr_t, msg.ack.mlength);
 
         BLANK_LINE ();
         COMMENT ("Put");
-        CHECK_MEMBER (ptl_hdr_t, msg.put.ack_wmd);
-        CHECK_MEMBER (ptl_hdr_t, msg.put.match_bits);
-        CHECK_MEMBER (ptl_hdr_t, msg.put.hdr_data);
-        CHECK_MEMBER (ptl_hdr_t, msg.put.ptl_index);
-        CHECK_MEMBER (ptl_hdr_t, msg.put.offset);
+        CHECK_MEMBER (lnet_hdr_t, msg.put.ack_wmd);
+        CHECK_MEMBER (lnet_hdr_t, msg.put.match_bits);
+        CHECK_MEMBER (lnet_hdr_t, msg.put.hdr_data);
+        CHECK_MEMBER (lnet_hdr_t, msg.put.ptl_index);
+        CHECK_MEMBER (lnet_hdr_t, msg.put.offset);
 
         BLANK_LINE ();
         COMMENT ("Get");
-        CHECK_MEMBER (ptl_hdr_t, msg.get.return_wmd);
-        CHECK_MEMBER (ptl_hdr_t, msg.get.match_bits);
-        CHECK_MEMBER (ptl_hdr_t, msg.get.ptl_index);
-        CHECK_MEMBER (ptl_hdr_t, msg.get.src_offset);
-        CHECK_MEMBER (ptl_hdr_t, msg.get.sink_length);
+        CHECK_MEMBER (lnet_hdr_t, msg.get.return_wmd);
+        CHECK_MEMBER (lnet_hdr_t, msg.get.match_bits);
+        CHECK_MEMBER (lnet_hdr_t, msg.get.ptl_index);
+        CHECK_MEMBER (lnet_hdr_t, msg.get.src_offset);
+        CHECK_MEMBER (lnet_hdr_t, msg.get.sink_length);
 
         BLANK_LINE ();
         COMMENT ("Reply");
-        CHECK_MEMBER (ptl_hdr_t, msg.reply.dst_wmd);
+        CHECK_MEMBER (lnet_hdr_t, msg.reply.dst_wmd);
 
         BLANK_LINE ();
         COMMENT ("Hello");
-        CHECK_MEMBER (ptl_hdr_t, msg.hello.incarnation);
-        CHECK_MEMBER (ptl_hdr_t, msg.hello.type);
+        CHECK_MEMBER (lnet_hdr_t, msg.hello.incarnation);
+        CHECK_MEMBER (lnet_hdr_t, msg.hello.type);
 }
 
 void
@@ -174,13 +173,13 @@ system_string (char *cmdline, char *str, int len)
 int
 main (int argc, char **argv)
 {
-        char unameinfo[80];
-        char gccinfo[80];
+        char unameinfo[256];
+        char gccinfo[256];
 
         system_string("uname -a", unameinfo, sizeof(unameinfo));
         system_string("gcc -v 2>&1 | tail -1", gccinfo, sizeof(gccinfo));
 
-        printf ("void lib_assert_wire_constants (void)\n"
+        printf ("void lnet_assert_wire_constants (void)\n"
                 "{\n"
                 "        /* Wire protocol assertions generated by 'wirecheck'\n"
                 "         * running on %s\n"
@@ -190,19 +189,23 @@ main (int argc, char **argv)
         BLANK_LINE ();
 
         COMMENT ("Constants...");
-        CHECK_DEFINE (PORTALS_PROTO_MAGIC);
-        CHECK_DEFINE (PORTALS_PROTO_VERSION_MAJOR);
-        CHECK_DEFINE (PORTALS_PROTO_VERSION_MINOR);
-
-        CHECK_VALUE (PTL_MSG_ACK);
-        CHECK_VALUE (PTL_MSG_PUT);
-        CHECK_VALUE (PTL_MSG_GET);
-        CHECK_VALUE (PTL_MSG_REPLY);
-        CHECK_VALUE (PTL_MSG_HELLO);
-
-        check_ptl_handle_wire ();
-        check_ptl_magicversion ();
-        check_ptl_hdr ();
+
+        CHECK_DEFINE (LNET_PROTO_OPENIB_MAGIC);
+        CHECK_DEFINE (LNET_PROTO_RA_MAGIC);
+
+        CHECK_DEFINE (LNET_PROTO_TCP_MAGIC);
+        CHECK_DEFINE (LNET_PROTO_TCP_VERSION_MAJOR);
+        CHECK_DEFINE (LNET_PROTO_TCP_VERSION_MINOR);
+
+        CHECK_VALUE (LNET_MSG_ACK);
+        CHECK_VALUE (LNET_MSG_PUT);
+        CHECK_VALUE (LNET_MSG_GET);
+        CHECK_VALUE (LNET_MSG_REPLY);
+        CHECK_VALUE (LNET_MSG_HELLO);
+
+        check_lnet_handle_wire ();
+        check_lnet_magicversion ();
+        check_lnet_hdr ();
 
         printf ("}\n\n");