From ed88907a96ba81d3558e71ade9def98bdc785169 Mon Sep 17 00:00:00 2001
From: nathan
Date: Sat, 10 Feb 2007 00:05:05 +0000
Subject: [PATCH] Landing b_hd_newconfig on HEAD

---
 lnet/ChangeLog | 212 +-
 lnet/Kernelenv.in | 4 +-
 lnet/Kernelenv.mk | 2 +-
 lnet/Makefile.in | 9 +-
 lnet/autoMakefile.am | 2 +-
 lnet/autoconf/Makefile.am | 2 +-
 lnet/autoconf/lustre-lnet.m4 | 991 ++-
 lnet/include/Makefile.am | 2 +-
 lnet/include/libcfs/Makefile.am | 4 +-
 lnet/include/libcfs/curproc.h | 2 +
 lnet/include/libcfs/darwin/Makefile.am | 2 +-
 lnet/include/libcfs/darwin/darwin-fs.h | 162 +-
 lnet/include/libcfs/darwin/darwin-lock.h | 52 +-
 lnet/include/libcfs/darwin/darwin-mem.h | 116 +-
 lnet/include/libcfs/darwin/darwin-prim.h | 377 +-
 lnet/include/libcfs/darwin/darwin-sync.h | 98 +-
 lnet/include/libcfs/darwin/darwin-tcpip.h | 90 +
 lnet/include/libcfs/darwin/darwin-time.h | 91 +-
 lnet/include/libcfs/darwin/darwin-types.h | 38 +-
 lnet/include/libcfs/darwin/darwin-utils.h | 11 +-
 lnet/include/libcfs/darwin/kp30.h | 39 +-
 lnet/include/libcfs/darwin/libcfs.h | 28 +-
 lnet/include/libcfs/darwin/lltrace.h | 4 +-
 lnet/include/libcfs/kp30.h | 598 +-
 lnet/include/libcfs/libcfs.h | 557 +-
 lnet/include/libcfs/linux/Makefile.am | 4 +-
 lnet/include/libcfs/linux/kp30.h | 72 +-
 lnet/include/libcfs/linux/libcfs.h | 88 +-
 lnet/include/libcfs/linux/linux-fs.h | 37 +-
 lnet/include/libcfs/linux/linux-lock.h | 18 +-
 lnet/include/libcfs/linux/linux-mem.h | 33 +-
 lnet/include/libcfs/linux/linux-prim.h | 74 +-
 lnet/include/libcfs/linux/linux-tcpip.h | 62 +
 lnet/include/libcfs/linux/linux-time.h | 82 +-
 lnet/include/libcfs/linux/lltrace.h | 4 +-
 lnet/include/libcfs/linux/portals_compat25.h | 33 +-
 lnet/include/libcfs/linux/portals_utils.h | 10 +-
 lnet/include/libcfs/list.h | 321 +-
 lnet/include/libcfs/lltrace.h | 27 +-
 lnet/include/libcfs/portals_lib.h | 97 -
 lnet/include/libcfs/portals_utils.h | 2 +
 lnet/include/libcfs/types.h | 17 +
 lnet/include/libcfs/user-lock.h | 52 +-
 lnet/include/libcfs/user-prim.h | 169 +-
 lnet/include/libcfs/user-time.h | 27 +-
 lnet/include/libcfs/winnt/kp30.h | 156 +
 lnet/include/libcfs/winnt/libcfs.h | 126 +
 .../{darwin/portals_lib.h => winnt/lltrace.h} | 15 +-
 .../portals_lib.h => winnt/portals_compat25.h} | 20 +-
 lnet/include/libcfs/winnt/portals_utils.h | 168 +
 lnet/include/libcfs/winnt/winnt-fs.h | 280 +
 lnet/include/libcfs/winnt/winnt-lock.h | 686 ++
 lnet/include/libcfs/winnt/winnt-mem.h | 133 +
 lnet/include/libcfs/winnt/winnt-prim.h | 1082 ++++
 lnet/include/libcfs/winnt/winnt-tcpip.h | 660 ++
 lnet/include/libcfs/winnt/winnt-time.h | 315 +
 lnet/include/libcfs/winnt/winnt-types.h | 647 ++
 lnet/include/lnet/Makefile.am | 9 +-
 lnet/include/lnet/api-support.h | 30 +-
 lnet/include/lnet/api.h | 181 +-
 lnet/include/lnet/build_check.h | 8 -
 lnet/include/lnet/darwin/Makefile.am | 2 +-
 lnet/include/lnet/darwin/api-support.h | 27 +
 lnet/include/lnet/darwin/lib-lnet.h | 10 +-
 lnet/include/lnet/darwin/lib-p30.h | 14 -
 lnet/include/lnet/darwin/lib-types.h | 20 +-
 lnet/include/lnet/darwin/lnet.h | 10 +-
 lnet/include/lnet/darwin/p30.h | 20 -
 lnet/include/lnet/errno.h | 53 -
 lnet/include/lnet/internal.h | 16 -
 lnet/include/lnet/kpr.h | 176 -
 lnet/include/lnet/lib-lnet.h | 703 +-
 lnet/include/lnet/lib-p30.h | 466 --
 lnet/include/lnet/lib-types.h | 726 ++-
 lnet/include/lnet/linux/Makefile.am | 2 +-
 lnet/include/lnet/linux/api-support.h | 39 +
 lnet/include/lnet/linux/lib-lnet.h | 43 +-
 lnet/include/lnet/linux/lib-p30.h | 20 -
 lnet/include/lnet/linux/lib-types.h | 10 +-
 lnet/include/lnet/linux/lnet.h | 10 +-
 lnet/include/lnet/linux/p30.h | 25 -
 lnet/include/lnet/lnet.h | 18 +-
 lnet/include/lnet/lnetctl.h | 29 +-
 lnet/include/lnet/myrnal.h | 23 -
 lnet/include/lnet/nal.h | 88 -
 lnet/include/lnet/nalids.h | 2 -
 lnet/include/lnet/p30.h | 25 -
 lnet/include/lnet/ptlctl.h | 96 -
 lnet/include/lnet/ptllnd.h | 77 +
 lnet/include/lnet/ptllnd_wire.h | 93 +
 lnet/include/lnet/socklnd.h | 53 +-
 lnet/include/lnet/stringtab.h | 3 -
 lnet/include/lnet/types.h | 252 +-
 lnet/include/lnet/winnt/api-support.h | 9 +
 lnet/include/lnet/winnt/lib-lnet.h | 25 +
 lnet/include/lnet/winnt/lib-types.h | 55 +
 lnet/include/lnet/winnt/lnet.h | 511 ++
 lnet/klnds/Makefile.in | 19 +-
 lnet/klnds/autoMakefile.am | 2 +-
 lnet/klnds/{lolnd => ciblnd}/.cvsignore | 0
 lnet/klnds/ciblnd/Makefile.in | 8 +
 lnet/klnds/{lolnd => ciblnd}/autoMakefile.am | 11 +-
 lnet/klnds/ciblnd/ciblnd.c | 1 +
 lnet/klnds/ciblnd/ciblnd_cb.c | 1 +
 lnet/klnds/ciblnd/ciblnd_modparams.c | 1 +
 lnet/klnds/gmlnd/Makefile.in | 4 +-
 lnet/klnds/gmlnd/README | 73 +
 lnet/klnds/gmlnd/autoMakefile.am | 8 +-
 lnet/klnds/gmlnd/gm-reg-phys.patch | 107 +
 lnet/klnds/gmlnd/gmlnd.h | 228 +-
 lnet/klnds/gmlnd/gmlnd_api.c | 353 +-
 lnet/klnds/gmlnd/gmlnd_cb.c | 307 +-
 lnet/klnds/gmlnd/gmlnd_comm.c | 577 +-
 lnet/klnds/gmlnd/gmlnd_module.c | 97 +-
 lnet/klnds/gmlnd/gmlnd_utils.c | 616 +-
 lnet/klnds/iiblnd/Makefile.in | 4 +-
 lnet/klnds/iiblnd/autoMakefile.am | 10 +-
 lnet/klnds/iiblnd/iiblnd.c | 2040 +++---
 lnet/klnds/iiblnd/iiblnd.h | 970 ++-
 lnet/klnds/iiblnd/iiblnd_cb.c | 4672 +++++++------
 lnet/klnds/iiblnd/iiblnd_modparams.c | 179 +
 lnet/klnds/lolnd/Makefile.in | 4 -
 lnet/klnds/lolnd/lolnd.c | 164 -
 lnet/klnds/lolnd/lolnd.h | 72 -
 lnet/klnds/lolnd/lolnd_cb.c | 267 -
 lnet/klnds/mxlnd/.cvsignore | 11 +
 lnet/klnds/mxlnd/Makefile.in | 6 +
 lnet/klnds/mxlnd/README | 190 +
 lnet/{router => klnds/mxlnd}/autoMakefile.am | 12 +-
 lnet/klnds/mxlnd/mxlnd.c | 920 +++
 lnet/klnds/mxlnd/mxlnd.h | 415 ++
 lnet/klnds/mxlnd/mxlnd_cb.c | 3437 ++++++++++
 lnet/klnds/mxlnd/mxlnd_modparams.c | 73 +
 lnet/klnds/mxlnd/mxlnd_wire.h | 95 +
 lnet/klnds/o2iblnd/.cvsignore | 11 +
 lnet/klnds/o2iblnd/Makefile.in | 6 +
 lnet/klnds/o2iblnd/autoMakefile.am | 13 +
 lnet/klnds/o2iblnd/o2iblnd.c | 1710 +++++
 lnet/klnds/o2iblnd/o2iblnd.h | 630 ++
 lnet/klnds/o2iblnd/o2iblnd_cb.c | 3159 +++++++++
 lnet/klnds/o2iblnd/o2iblnd_modparams.c | 218 +
 lnet/klnds/openiblnd/Makefile.in | 4 +-
 lnet/klnds/openiblnd/autoMakefile.am | 10 +-
 lnet/klnds/openiblnd/openiblnd.c | 1486 ++---
 lnet/klnds/openiblnd/openiblnd.h | 382 +-
 lnet/klnds/openiblnd/openiblnd_cb.c | 1450 +++--
 lnet/klnds/openiblnd/openiblnd_modparams.c | 149 +
 lnet/klnds/ptllnd/.cvsignore | 11 +
 lnet/klnds/ptllnd/Makefile.in | 13 +
 lnet/klnds/ptllnd/README | 47 +
 lnet/klnds/ptllnd/autoMakefile.am | 8 +
 lnet/klnds/ptllnd/ptllnd.c | 836 +++
 lnet/klnds/ptllnd/ptllnd.h | 538 ++
 lnet/klnds/ptllnd/ptllnd_cb.c | 760 +++
 lnet/klnds/ptllnd/ptllnd_modparams.c | 217 +
 lnet/klnds/ptllnd/ptllnd_peer.c | 1209 ++++
 lnet/klnds/ptllnd/ptllnd_ptltrace.c | 172 +
 lnet/klnds/ptllnd/ptllnd_rx_buf.c | 720 +++
 lnet/klnds/ptllnd/ptllnd_tx.c | 494 ++
 lnet/klnds/ptllnd/wirecheck.c | 206 +
 lnet/klnds/qswlnd/Makefile.in | 4 +-
 lnet/klnds/qswlnd/autoMakefile.am | 10 +-
 lnet/klnds/qswlnd/qswlnd.c | 564 +-
 lnet/klnds/qswlnd/qswlnd.h | 294 +-
 lnet/klnds/qswlnd/qswlnd_cb.c | 1932 +++---
 lnet/klnds/qswlnd/qswlnd_modparams.c | 149 +
 lnet/klnds/ralnd/Makefile.in | 4 +-
 lnet/klnds/ralnd/autoMakefile.am | 10 +-
 lnet/klnds/ralnd/ralnd.c | 1286 ++--
 lnet/klnds/ralnd/ralnd.h | 140 +-
 lnet/klnds/ralnd/ralnd_cb.c | 684 +-
 lnet/klnds/ralnd/ralnd_modparams.c | 135 +
 lnet/klnds/socklnd/Info.plist | 20 +-
 lnet/klnds/socklnd/Makefile.in | 8 +-
 lnet/klnds/socklnd/autoMakefile.am | 20 +-
 lnet/klnds/socklnd/socklnd.c | 1776 +++---
 lnet/klnds/socklnd/socklnd.h | 486 +-
 lnet/klnds/socklnd/socklnd_cb.c | 2821 ++++----
 lnet/klnds/socklnd/socklnd_lib-darwin.c | 963 +--
 lnet/klnds/socklnd/socklnd_lib-darwin.h | 13 +-
 lnet/klnds/socklnd/socklnd_lib-linux.c | 646 +-
 lnet/klnds/socklnd/socklnd_lib-linux.h | 45 +-
 lnet/klnds/socklnd/socklnd_lib-winnt.c | 832 +++
 lnet/klnds/socklnd/socklnd_lib-winnt.h | 42 +
 lnet/klnds/socklnd/socklnd_modparams.c | 156 +
 lnet/klnds/viblnd/Makefile.in | 4 +-
 lnet/klnds/viblnd/autoMakefile.am | 10 +-
 lnet/klnds/viblnd/viblnd.c | 852 +--
 lnet/klnds/viblnd/viblnd.h | 246 +-
 lnet/klnds/viblnd/viblnd_cb.c | 1858 +++---
 lnet/klnds/viblnd/viblnd_modparams.c | 237 +
 lnet/klnds/viblnd/viblnd_wire.h | 21 +-
 lnet/klnds/viblnd/wirecheck.c | 18 +-
 lnet/libcfs/Info.plist | 14 +-
 lnet/libcfs/Makefile.in | 6 +-
 lnet/libcfs/autoMakefile.am | 24 +-
 lnet/libcfs/darwin/Makefile.am | 3 +-
 lnet/libcfs/darwin/darwin-curproc.c | 46 +-
 lnet/libcfs/darwin/darwin-debug.c | 70 +-
 lnet/libcfs/darwin/darwin-fs.c | 264 +-
 lnet/libcfs/darwin/darwin-internal.h | 22 +
 lnet/libcfs/darwin/darwin-mem.c | 465 +-
 lnet/libcfs/darwin/darwin-module.c | 200 +-
 lnet/libcfs/darwin/darwin-prim.c | 361 +-
 lnet/libcfs/darwin/darwin-proc.c | 327 +-
 lnet/libcfs/darwin/darwin-sync.c | 369 +-
 lnet/libcfs/darwin/darwin-tcpip.c | 1339 ++++
 lnet/libcfs/darwin/darwin-tracefile.c | 275 +-
 lnet/libcfs/darwin/darwin-utils.c | 138 +-
 lnet/libcfs/debug.c | 731 ++-
 lnet/libcfs/linux/Makefile.am | 2 +-
 lnet/libcfs/linux/linux-curproc.c | 2 +-
 lnet/libcfs/linux/linux-debug.c | 119 +-
 lnet/libcfs/linux/linux-fs.c | 94 +-
 lnet/libcfs/linux/linux-lock.c | 2 +-
 lnet/libcfs/linux/linux-lwt.c | 2 +-
 lnet/libcfs/linux/linux-mem.c | 112 +-
 lnet/libcfs/linux/linux-module.c | 129 +-
 lnet/libcfs/linux/linux-prim.c | 143 +-
 lnet/libcfs/linux/linux-proc.c | 289 +-
 lnet/libcfs/linux/linux-sync.c | 2 +-
 lnet/libcfs/linux/linux-tcpip.c | 687 ++
 lnet/libcfs/linux/linux-tracefile.c | 411 +-
 lnet/libcfs/linux/linux-utils.c | 21 +-
 lnet/libcfs/lwt.c | 16 +-
 lnet/libcfs/misc.c | 53 +
 lnet/libcfs/module.c | 350 +-
 lnet/libcfs/nidstrings.c | 533 ++
 lnet/libcfs/tracefile.c | 586 +-
 lnet/libcfs/tracefile.h | 132 +-
 lnet/libcfs/user-lock.c | 21 +-
 lnet/libcfs/user-prim.c | 202 +-
 lnet/libcfs/watchdog.c | 203 +-
 lnet/libcfs/winnt/winnt-curproc.c | 453 ++
 lnet/libcfs/winnt/winnt-debug.c | 1057 +++
 lnet/libcfs/winnt/winnt-fs.c | 541 ++
 lnet/libcfs/winnt/winnt-lock.c | 353 ++
 lnet/libcfs/winnt/winnt-lwt.c | 20 +
 lnet/libcfs/winnt/winnt-mem.c | 332 +
 lnet/libcfs/winnt/winnt-module.c | 160 +
 lnet/libcfs/winnt/winnt-prim.c | 650 ++
 lnet/libcfs/winnt/winnt-proc.c | 1990 ++++++
 lnet/libcfs/winnt/winnt-sync.c | 449 ++
 lnet/libcfs/winnt/winnt-tcpip.c | 6706 ++++++++++++++++++++
 lnet/libcfs/winnt/winnt-tracefile.c | 300 +
 lnet/libcfs/winnt/winnt-usr.c | 85 +
 lnet/libcfs/winnt/winnt-utils.c | 158 +
 lnet/lnet/Info.plist | 18 +-
 lnet/lnet/Makefile.in | 12 +-
 lnet/lnet/acceptor.c | 537 ++
 lnet/lnet/api-errno.c | 38 -
 lnet/lnet/api-ni.c | 1767 +++++-
 lnet/lnet/api-wrap.c | 379 --
 lnet/lnet/autoMakefile.am | 40 +-
 lnet/lnet/config.c | 1386 ++++
 lnet/lnet/lib-eq.c | 248 +-
 lnet/lnet/lib-init.c | 433 --
 lnet/lnet/lib-md.c | 405 +-
 lnet/lnet/lib-me.c | 166 +-
 lnet/lnet/lib-move.c | 3034 +++++----
 lnet/lnet/lib-msg.c | 239 +-
 lnet/lnet/lib-ni.c | 29 -
 lnet/lnet/lib-pid.c | 20 -
 lnet/lnet/lo.c | 112 +
 lnet/lnet/module.c | 268 +-
 lnet/lnet/peer.c | 244 +
 lnet/lnet/router.c | 1135 ++++
 lnet/lnet/router_proc.c | 1094 ++++
 lnet/router/Makefile.in | 4 -
 lnet/router/proc.c | 242 -
 lnet/router/router.c | 824 ---
 lnet/router/router.h | 102 -
 lnet/tests/Makefile.in | 10 +-
 lnet/tests/autoMakefile.am | 36 +-
 lnet/tests/ping.h | 51 +-
 lnet/tests/ping_cli.c | 187 +-
 lnet/tests/ping_cli/Info.plist | 28 +-
 lnet/tests/ping_cli/winnt-pingcli.c | 634 ++
 lnet/tests/ping_srv.c | 137 +-
 lnet/tests/ping_srv/Info.plist | 65 +-
 lnet/tests/ping_srv/winnt-pingsrv.c | 634 ++
 lnet/tests/sping_cli.c | 279 -
 lnet/tests/sping_srv.c | 294 -
 lnet/tests/startclient.sh | 39 +-
 lnet/tests/startserver.sh | 41 +-
 lnet/tests/stopclient.sh | 13 +-
 lnet/tests/stopserver.sh | 15 +-
 lnet/tests/ut.README | 43 +
 lnet/tests/ut.h | 45 +
 lnet/tests/ut_cli.c | 211 +
 lnet/tests/ut_srv.c | 144 +
 lnet/ulnds/.cvsignore | 3 +-
 lnet/ulnds/Makefile.am | 10 -
 lnet/ulnds/Makefile.in | 5 +
 lnet/ulnds/README | 53 -
 lnet/ulnds/address.c | 147 -
 lnet/ulnds/autoMakefile.am | 6 +
 lnet/ulnds/bridge.h | 34 -
 lnet/ulnds/connection.c | 507 --
 lnet/ulnds/connection.h | 35 -
 lnet/ulnds/debug.c | 119 -
 lnet/ulnds/dispatch.h | 46 -
 lnet/ulnds/ipmap.h | 38 -
 lnet/ulnds/pqtimer.c | 226 -
 lnet/ulnds/pqtimer.h | 25 -
 lnet/ulnds/procapi.c | 196 -
 lnet/ulnds/procbridge.h | 56 -
 lnet/ulnds/proclib.c | 137 -
 lnet/ulnds/ptllnd/.cvsignore | 3 +
 lnet/ulnds/ptllnd/Makefile.am | 12 +
 lnet/ulnds/ptllnd/ptllnd.c | 629 ++
 lnet/ulnds/ptllnd/ptllnd.h | 262 +
 lnet/ulnds/ptllnd/ptllnd_cb.c | 1684 +++++
 lnet/ulnds/select.c | 421 --
 lnet/ulnds/socklnd/.cvsignore | 3 +
 lnet/ulnds/socklnd/Makefile.am | 15 +-
 lnet/ulnds/socklnd/address.c | 147 -
 lnet/ulnds/socklnd/bridge.h | 15 +-
 lnet/ulnds/socklnd/connection.c | 492 +-
 lnet/ulnds/socklnd/connection.h | 26 +-
 lnet/ulnds/socklnd/debug.c | 119 -
 lnet/ulnds/socklnd/dispatch.h | 2 -
 lnet/ulnds/socklnd/ipmap.h | 38 -
 lnet/ulnds/socklnd/procapi.c | 110 +-
 lnet/ulnds/socklnd/procbridge.h | 26 +-
 lnet/ulnds/socklnd/proclib.c | 47 +-
 lnet/ulnds/socklnd/select.c | 4 +-
 lnet/ulnds/socklnd/table.c | 2 +-
 lnet/ulnds/socklnd/table.h | 5 +-
 lnet/ulnds/socklnd/tcplnd.c | 341 +-
 lnet/ulnds/socklnd/utypes.h | 12 -
 lnet/ulnds/table.c | 264 -
 lnet/ulnds/table.h | 39 -
 lnet/ulnds/tcplnd.c | 256 -
 lnet/ulnds/timer.h | 30 -
 lnet/ulnds/utypes.h | 12 -
 lnet/utils/.cvsignore | 2 +-
 lnet/utils/Makefile.am | 30 +-
 lnet/utils/acceptor.c | 363 --
 lnet/utils/debug.c | 310 +-
 lnet/utils/debugctl.c | 9 +-
 lnet/utils/gmlndnid.c | 143 +-
 lnet/utils/l_ioctl.c | 253 +-
 lnet/utils/lbstats | 11 +
 lnet/utils/parser.c | 15 +-
 lnet/utils/portals.c | 1541 ++--
 lnet/utils/ptlctl.c | 16 +-
 lnet/utils/routerstat.c | 128 +-
 lnet/utils/wirecheck.c | 115 +-
 349 files changed, 77103 insertions(+), 29881 deletions(-)
 create mode 100644 lnet/include/libcfs/darwin/darwin-tcpip.h
 create mode 100644 lnet/include/libcfs/linux/linux-tcpip.h
 delete mode 100644 lnet/include/libcfs/portals_lib.h
 create mode 100755 lnet/include/libcfs/types.h
 create mode 100644 lnet/include/libcfs/winnt/kp30.h
 create mode 100644 lnet/include/libcfs/winnt/libcfs.h
 rename lnet/include/libcfs/{darwin/portals_lib.h => winnt/lltrace.h} (69%)
 rename lnet/include/libcfs/{linux/portals_lib.h => winnt/portals_compat25.h} (65%)
 create mode 100644 lnet/include/libcfs/winnt/portals_utils.h
 create mode 100644 lnet/include/libcfs/winnt/winnt-fs.h
 create mode 100644 lnet/include/libcfs/winnt/winnt-lock.h
 create mode 100644 lnet/include/libcfs/winnt/winnt-mem.h
 create mode 100644 lnet/include/libcfs/winnt/winnt-prim.h
 create mode 100644 lnet/include/libcfs/winnt/winnt-tcpip.h
 create mode 100644 lnet/include/libcfs/winnt/winnt-time.h
 create mode 100644 lnet/include/libcfs/winnt/winnt-types.h
 delete mode 100644 lnet/include/lnet/build_check.h
 create mode 100644 lnet/include/lnet/darwin/api-support.h
 delete mode 100644 lnet/include/lnet/darwin/lib-p30.h
 delete mode 100644 lnet/include/lnet/darwin/p30.h
 delete mode 100644 lnet/include/lnet/errno.h
 delete mode 100644 lnet/include/lnet/internal.h
 delete mode 100644 lnet/include/lnet/kpr.h
 delete mode 100644 lnet/include/lnet/lib-p30.h
 create mode 100644 lnet/include/lnet/linux/api-support.h
 delete mode 100644 lnet/include/lnet/linux/lib-p30.h
 delete mode 100644 lnet/include/lnet/linux/p30.h
 delete mode 100644 lnet/include/lnet/myrnal.h
 delete mode 100644 lnet/include/lnet/nal.h
 delete mode 100644 lnet/include/lnet/nalids.h
 delete mode 100644 lnet/include/lnet/p30.h
 delete mode 100644 lnet/include/lnet/ptlctl.h
 create mode 100755 lnet/include/lnet/ptllnd.h
 create mode 100644 lnet/include/lnet/ptllnd_wire.h
 delete mode 100644 lnet/include/lnet/stringtab.h
 create mode 100644 lnet/include/lnet/winnt/api-support.h
 create mode 100644 lnet/include/lnet/winnt/lib-lnet.h
 create mode 100644 lnet/include/lnet/winnt/lib-types.h
 create mode 100644 lnet/include/lnet/winnt/lnet.h
 rename lnet/klnds/{lolnd => ciblnd}/.cvsignore (100%)
 create mode 100644 lnet/klnds/ciblnd/Makefile.in
 rename lnet/klnds/{lolnd => ciblnd}/autoMakefile.am (54%)
 create mode 100644 lnet/klnds/ciblnd/ciblnd.c
 create mode 100644 lnet/klnds/ciblnd/ciblnd_cb.c
 create mode 100644 lnet/klnds/ciblnd/ciblnd_modparams.c
 create mode 100644 lnet/klnds/gmlnd/README
 create mode 100644 lnet/klnds/gmlnd/gm-reg-phys.patch
 create mode 100644 lnet/klnds/iiblnd/iiblnd_modparams.c
 delete mode 100644 lnet/klnds/lolnd/Makefile.in
 delete mode 100644 lnet/klnds/lolnd/lolnd.c
 delete mode 100644 lnet/klnds/lolnd/lolnd.h
 delete mode 100644 lnet/klnds/lolnd/lolnd_cb.c
 create mode 100644 lnet/klnds/mxlnd/.cvsignore
 create mode 100644 lnet/klnds/mxlnd/Makefile.in
 create mode 100644 lnet/klnds/mxlnd/README
 rename lnet/{router => klnds/mxlnd}/autoMakefile.am (52%)
 create mode 100644 lnet/klnds/mxlnd/mxlnd.c
 create mode 100644 lnet/klnds/mxlnd/mxlnd.h
 create mode 100644 lnet/klnds/mxlnd/mxlnd_cb.c
 create mode 100644 lnet/klnds/mxlnd/mxlnd_modparams.c
 create mode 100644 lnet/klnds/mxlnd/mxlnd_wire.h
 create mode 100644 lnet/klnds/o2iblnd/.cvsignore
 create mode 100644 lnet/klnds/o2iblnd/Makefile.in
 create mode 100644 lnet/klnds/o2iblnd/autoMakefile.am
 create mode 100644 lnet/klnds/o2iblnd/o2iblnd.c
 create mode 100644 lnet/klnds/o2iblnd/o2iblnd.h
 create mode 100644 lnet/klnds/o2iblnd/o2iblnd_cb.c
 create mode 100644 lnet/klnds/o2iblnd/o2iblnd_modparams.c
 create mode 100644 lnet/klnds/openiblnd/openiblnd_modparams.c
 create mode 100644 lnet/klnds/ptllnd/.cvsignore
 create mode 100755 lnet/klnds/ptllnd/Makefile.in
 create mode 100644 lnet/klnds/ptllnd/README
 create mode 100755 lnet/klnds/ptllnd/autoMakefile.am
 create mode 100755 lnet/klnds/ptllnd/ptllnd.c
 create mode 100755 lnet/klnds/ptllnd/ptllnd.h
 create mode 100644 lnet/klnds/ptllnd/ptllnd_cb.c
 create mode 100644 lnet/klnds/ptllnd/ptllnd_modparams.c
 create mode 100644 lnet/klnds/ptllnd/ptllnd_peer.c
 create mode 100644 lnet/klnds/ptllnd/ptllnd_ptltrace.c
 create mode 100644 lnet/klnds/ptllnd/ptllnd_rx_buf.c
 create mode 100644 lnet/klnds/ptllnd/ptllnd_tx.c
 create mode 100644 lnet/klnds/ptllnd/wirecheck.c
 create mode 100644 lnet/klnds/qswlnd/qswlnd_modparams.c
 create mode 100644 lnet/klnds/ralnd/ralnd_modparams.c
 create mode 100755 lnet/klnds/socklnd/socklnd_lib-winnt.c
 create mode 100755 lnet/klnds/socklnd/socklnd_lib-winnt.h
 create mode 100644 lnet/klnds/socklnd/socklnd_modparams.c
 create mode 100644 lnet/klnds/viblnd/viblnd_modparams.c
 create mode 100644 lnet/libcfs/darwin/darwin-internal.h
 create mode 100644 lnet/libcfs/darwin/darwin-tcpip.c
 create mode 100644 lnet/libcfs/linux/linux-tcpip.c
 create mode 100644 lnet/libcfs/misc.c
 create mode 100644 lnet/libcfs/nidstrings.c
 create mode 100644 lnet/libcfs/winnt/winnt-curproc.c
 create mode 100644 lnet/libcfs/winnt/winnt-debug.c
 create mode 100644 lnet/libcfs/winnt/winnt-fs.c
 create mode 100644 lnet/libcfs/winnt/winnt-lock.c
 create mode 100644 lnet/libcfs/winnt/winnt-lwt.c
 create mode 100644 lnet/libcfs/winnt/winnt-mem.c
 create mode 100644 lnet/libcfs/winnt/winnt-module.c
 create mode 100644 lnet/libcfs/winnt/winnt-prim.c
 create mode 100644 lnet/libcfs/winnt/winnt-proc.c
 create mode 100644 lnet/libcfs/winnt/winnt-sync.c
 create mode 100644 lnet/libcfs/winnt/winnt-tcpip.c
 create mode 100644 lnet/libcfs/winnt/winnt-tracefile.c
 create mode 100644 lnet/libcfs/winnt/winnt-usr.c
 create mode 100644 lnet/libcfs/winnt/winnt-utils.c
 create mode 100644 lnet/lnet/acceptor.c
 delete mode 100644 lnet/lnet/api-wrap.c
 create mode 100644 lnet/lnet/config.c
 delete mode 100644 lnet/lnet/lib-init.c
 delete mode 100644 lnet/lnet/lib-ni.c
 delete mode 100644 lnet/lnet/lib-pid.c
 create mode 100644 lnet/lnet/lo.c
 create mode 100644 lnet/lnet/peer.c
 create mode 100644 lnet/lnet/router.c
 create mode 100644 lnet/lnet/router_proc.c
 delete mode 100644 lnet/router/Makefile.in
 delete mode 100644 lnet/router/proc.c
 delete mode 100644 lnet/router/router.c
 delete mode 100644 lnet/router/router.h
 create mode 100644 lnet/tests/ping_cli/winnt-pingcli.c
 create mode 100644 lnet/tests/ping_srv/winnt-pingsrv.c
 delete mode 100644 lnet/tests/sping_cli.c
 delete mode 100644 lnet/tests/sping_srv.c
 create mode 100644 lnet/tests/ut.README
 create mode 100644 lnet/tests/ut.h
 create mode 100644 lnet/tests/ut_cli.c
 create mode 100644 lnet/tests/ut_srv.c
 delete mode 100644 lnet/ulnds/Makefile.am
 create mode 100644 lnet/ulnds/Makefile.in
 delete mode 100644 lnet/ulnds/README
 delete mode 100644 lnet/ulnds/address.c
 create mode 100644 lnet/ulnds/autoMakefile.am
 delete mode 100644 lnet/ulnds/bridge.h
 delete mode 100644 lnet/ulnds/connection.c
 delete mode 100644 lnet/ulnds/connection.h
 delete mode 100644 lnet/ulnds/debug.c
 delete mode 100644 lnet/ulnds/dispatch.h
 delete mode 100644 lnet/ulnds/ipmap.h
 delete mode 100644 lnet/ulnds/pqtimer.c
 delete mode 100644 lnet/ulnds/pqtimer.h
 delete mode 100644 lnet/ulnds/procapi.c
 delete mode 100644 lnet/ulnds/procbridge.h
 delete mode 100644 lnet/ulnds/proclib.c
 create mode 100644 lnet/ulnds/ptllnd/.cvsignore
 create mode 100644 lnet/ulnds/ptllnd/Makefile.am
 create mode 100644 lnet/ulnds/ptllnd/ptllnd.c
 create mode 100644 lnet/ulnds/ptllnd/ptllnd.h
 create mode 100644 lnet/ulnds/ptllnd/ptllnd_cb.c
 delete mode 100644 lnet/ulnds/select.c
 create mode 100644 lnet/ulnds/socklnd/.cvsignore
 delete mode 100644 lnet/ulnds/socklnd/address.c
 delete mode 100644 lnet/ulnds/socklnd/debug.c
 delete mode 100644 lnet/ulnds/socklnd/ipmap.h
 delete mode 100644 lnet/ulnds/socklnd/utypes.h
 delete mode 100644 lnet/ulnds/table.c
 delete mode 100644 lnet/ulnds/table.h
 delete mode 100644 lnet/ulnds/tcplnd.c
 delete mode 100644 lnet/ulnds/timer.h
 delete mode 100644 lnet/ulnds/utypes.h
 delete mode 100644 lnet/utils/acceptor.c
 create mode 100755 lnet/utils/lbstats

diff --git a/lnet/ChangeLog b/lnet/ChangeLog
index fed4790..79ca961 100644
--- a/lnet/ChangeLog
+++ b/lnet/ChangeLog
@@ -1,4 +1,204 @@
-tba  Cluster File Systems, Inc.
+TBD         Cluster File Systems, Inc.
+       * version 1.4.10
+       * Support for networks:
+         socklnd - kernels up to 2.6.15 (I believe this is accurate, SLES10)
+         qswlnd - Qsnet kernel modules 5.20 and later
+         openiblnd - IbGold 1.8.2
+         o2iblnd - OFED 1.1
+         viblnd - Voltaire ibhost 3.4.5 and later
+         ciblnd - Topspin 3.2.0
+         iiblnd - Infiniserv 3.3 + PathBits patch
+         gmlnd - GM 2.1.22 and later
+         mxlnd - MX 1.2.1 or later
+         ptllnd - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x
+       * bug fixes
+
+Severity   : major
+Frequency  : rare
+Bugzilla   : 11616
+Description: o2iblnd handle early RDMA_CM_EVENT_DISCONNECTED.
+Details    : If the fabric is lossy, an RDMA_CM_EVENT_DISCONNECTED
+             callback can occur before a connection has actually been
+             established.  This caused an assertion failure previously.
+
+Severity   : enhancement
+Bugzilla   : 11094
+Description: Multiple instances for o2iblnd
+Details    : Allow multiple instances of o2iblnd to enable networking over
+             multiple HCAs and routing between them.
+
+Severity   : major
+Bugzilla   : 11201
+Description: lnet deadlock in router_checker
+Details    : turned ksnd_connd_lock, ksnd_reaper_lock, and ksock_net_t:ksnd_lock
+             into BH locks to eliminate potential deadlock caused by
+             ksocknal_data_ready() preempting code holding these locks.
+
+Severity   : major
+Bugzilla   : 11126
+Description: Millions of failed socklnd connection attempts cause a very slow FS
+Details    : added a new route flag ksnr_scheduled to distinguish from
+             ksnr_connecting, so that a peer connection request is only turned
+             down for race concerns when an active connection to the same peer
+             is under progress (instead of just being scheduled).
+
+------------------------------------------------------------------------------
+
+2007-02-09  Cluster File Systems, Inc.
+       * version 1.4.9
+       * Support for networks:
+         socklnd - kernels up to 2.6.15 (I believe this is accurate, SLES10)
+         qswlnd - Qsnet kernel modules 5.20 and later
+         openiblnd - IbGold 1.8.2
+         o2iblnd - OFED 1.1
+         viblnd - Voltaire ibhost 3.4.5 and later
+         ciblnd - Topspin 3.2.0
+         iiblnd - Infiniserv 3.3 + PathBits patch
+         gmlnd - GM 2.1.22 and later
+         mxlnd - MX 1.2.1 or later
+         ptllnd - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x
+       * bug fixes
+
+Severity   : major on XT3
+Bugzilla   : none
+Description: libcfs overwrites /proc/sys/portals
+Details    : libcfs created a symlink from /proc/sys/portals to
+             /proc/sys/lnet for backwards compatibility.  This is no
+             longer required and makes the Cray portals /proc variables
+             inaccessible.
+
+Severity   : minor
+Bugzilla   : 11312
+Description: OFED FMR API change
+Details    : This changes parameter usage to reflect a change in
+             ib_fmr_pool_map_phys() between OFED 1.0 and OFED 1.1.  Note
+             that FMR support is only used in experimental versions of the
+             o2iblnd - this change does not affect standard usage at all.
+
+Severity   : enhancement
+Bugzilla   : 11245
+Description: new ko2iblnd module parameter: ib_mtu
+Details    : the default IB MTU of 2048 performs badly on 23108 Tavor
+             HCAs.  You can avoid this problem by setting the MTU to 1024
+             using this module parameter.
+
+Severity   : enhancement
+Bugzilla   : 11118/11620
+Description: ptllnd small request message buffer alignment fix
+Details    : Set the PTL_MD_LOCAL_ALIGN8 option on small message receives.
+             Round up small message size on sends in case this option
+             is not supported.  11620 was a defect in the initial
+             implementation which effectively asserted all peers had to be
+             running the correct protocol version which was fixed by always
+             NAK-ing such requests and handling any misalignments they
+             introduce.
+
+Severity   : minor
+Frequency  : rarely
+Description: When kib(nal|lnd)_del_peer() is called upon a peer whose
+             ibp_tx_queue is not empty, kib(nal|lnd)_destroy_peer()'s
+             'LASSERT(list_empty(&peer->ibp_tx_queue))' will fail.
+
+Severity   : enhancement
+Bugzilla   : 11250
+Description: Patchless ZC(zero copy) socklnd
+Details    : New protocol for socklnd, socklnd can support zero copy without
+             kernel patch, it's compatible with old socklnd.  Checksum is
+             moved from tunables to modparams.
+
+Severity   : minor
+Frequency  : rarely
+Description: When ksocknal_del_peer() is called upon a peer whose
+             ksnp_tx_queue is not empty, ksocknal_destroy_peer()'s
+             'LASSERT(list_empty(&peer->ksnp_tx_queue))' will fail.
+
+Severity   : normal
+Frequency  : when ptlrpc is under heavy use and runs out of request buffer
+Bugzilla   : 11318
+Description: In lnet_match_blocked_msg(), md can be used without holding a
+             ref on it.
+
+Severity   : minor
+Frequency  : very rarely
+Bugzilla   : 10727
+Description: If ksocknal_lib_setup_sock() fails, a ref on peer is lost.
+             If connd connects a route which has been closed by
+             ksocknal_shutdown(), ksocknal_create_routes() may create new
+             routes which hold references on the peer, causing shutdown
+             process to wait for peer to disappear forever.
+
+Severity   : enhancement
+Bugzilla   : 11234
+Description: Dump XT3 portals traces on kptllnd timeout
+Details    : Set the kptllnd module parameter "ptltrace_on_timeout=1" to
+             dump Cray portals debug traces to a file.  The kptllnd module
+             parameter "ptltrace_basename", default "/tmp/lnet-ptltrace",
+             is the basename of the dump file.
+
+Severity   : major
+Frequency  : infrequent
+Bugzilla   : 11308
+Description: kernel ptllnd fix bug in connection re-establishment
+Details    : Kernel ptllnd could produce protocol errors e.g. illegal
+             matchbits and/or violate the credit flow protocol when trying
+             to re-establish a connection with a peer after an error or
+             timeout.
+
+Severity   : enhancement
+Bugzilla   : 10316
+Description: Allow /proc/sys/lnet/debug to be set symbolically
+Details    : Allow debug and subsystem debug values to be read/set by name
+             in addition to numerically, for ease of use.
+
+Severity   : normal
+Frequency  : only in configurations with LNET routers
+Bugzilla   : 10316
+Description: routes automatically marked down and recovered
+Details    : In configurations with LNET routers if a router fails routers
+             now actively try to recover routes that are down, unless they
+             are marked down by an administrator.
+
+------------------------------------------------------------------------------
+
+2006-07-31  Cluster File Systems, Inc.
+       * version 1.4.7
+       - rework CDEBUG messages rate-limiting mechanism b=10375
+       - add per-socket tunables for socklnd if the kernel is patched b=10327
+
+------------------------------------------------------------------------------
+
+2006-02-15  Cluster File Systems, Inc.
+       * version 1.4.6
+       - fix use of portals/lnet pid to avoid dropping RPCs b=10074
+       - iiblnd wasn't mapping all memory, resulting in comms errors b=9776
+       - quiet LNET startup LNI message for liblustre b=10128
+       - Better console error messages if 'ip2nets' can't match an IP address
+       - Fixed overflow/use-before-set bugs in linux-time.h
+       - Fixed ptllnd bug that wasn't initialising rx descriptors completely
+       - LNET teardown failed an assertion about the route table being empty
+       - Fixed a crash in LNetEQPoll()
+       - Future protocol compatibility work (b_rls146_lnetprotovrsn)
+       - improve debug message for liblustre/Catamount nodes (b=10116)
+
+2005-10-10  Cluster File Systems, Inc.
+       * Configuration change for the XT3
+         The PTLLND is now used to run Lustre over Portals on the XT3.
+         The configure option(s) --with-cray-portals are no longer
+         used.  Rather --with-portals=<path> is
+         used to enable building on the XT3.  In addition to enable
+         XT3 specific features the option --enable-cray-xt3 must be
+         used.
+
+2005-10-10  Cluster File Systems, Inc.
+       * Portals has been removed, replaced by LNET.
+         LNET is new networking infrastructure for Lustre, it includes a
+         reorganized network configuration mode (see the user
+         documentation for full details) as well as support for routing
+         between different network fabrics.  Lustre Networking Devices
+         (LNDS) for the supported network fabrics have also been created
+         for this new infrastructure.
+
+2005-08-08  Cluster File Systems, Inc.
 * version 1.4.4
 * bug fixes
 
@@ -6,9 +206,9 @@ Severity : major
 Frequency : rare (large Voltaire clusters only)
 Bugzilla : 6993
 Description: the default number of reserved transmit descriptors was too low
-        for some large clusters
+        for some large clusters
 Details : As a workaround, the number was increased. A proper fix includes
-        a run-time tunable.
+        a run-time tunable.
 
 2005-06-02 Cluster File Systems, Inc.
 * version 1.4.3
@@ -18,14 +218,14 @@ Severity : major
 Frequency : occasional (large-scale events, cluster reboot, network failure)
 Bugzilla : 6411
 Description: too many error messages on console obscure actual problem and
-        can slow down/panic server, or cause recovery to fail repeatedly
+        can slow down/panic server, or cause recovery to fail repeatedly
 Details : enable rate-limiting of console error messages, and some messages
-        that were console errors now only go to the kernel log
+        that were console errors now only go to the kernel log
 
 Severity : enhancement
 Bugzilla : 1693
 Description: add /proc/sys/portals/catastrophe entry which will report if
-        that node has previously LBUGged
+        that node has previously LBUGged
 
 2005-04-06 Cluster File Systems, Inc.
 * bugs
diff --git a/lnet/Kernelenv.in b/lnet/Kernelenv.in
index 7a48c58..59eda30 100644
--- a/lnet/Kernelenv.in
+++ b/lnet/Kernelenv.in
@@ -1,5 +1,5 @@
-EXTRA_CFLAGS := -Ifs/lustre/include -Ifs/lustre/portals/include
-# portals/utils/debug.c wants from userspace. sigh.
+EXTRA_CFLAGS := -Ifs/lustre/include -Ifs/lustre/lnet/include
+# lnet/utils/debug.c wants from userspace. sigh.
HOSTCFLAGS := -I@LINUX@/include $(EXTRA_CFLAGS) LIBREADLINE := @LIBREADLINE@ # 2.5's makefiles aren't nice to cross dir libraries in host programs diff --git a/lnet/Kernelenv.mk b/lnet/Kernelenv.mk index 7c66dfa..d973e5d 100644 --- a/lnet/Kernelenv.mk +++ b/lnet/Kernelenv.mk @@ -1,4 +1,4 @@ -EXTRA_CFLAGS := -Ifs/lustre/include -Ifs/lustre/portals/include +EXTRA_CFLAGS := -Ifs/lustre/include -Ifs/lustre/lnet/include HOSTCFLAGS := $(EXTRA_CFLAGS) # the kernel doesn't want us to build archives for host binaries :/ PTLCTLOBJS := debug.o l_ioctl.o parser.o portals.o diff --git a/lnet/Makefile.in b/lnet/Makefile.in index 71d0dc8..553578c 100644 --- a/lnet/Makefile.in +++ b/lnet/Makefile.in @@ -1,9 +1,8 @@ subdir-m += libcfs -cray-subdirs += portals -cray-subdirs += knals -cray-subdirs += router -cray-subdirs += tests -@CRAY_PORTALS_FALSE@subdir-m += $(cray-subdirs) +lnet-subdirs += lnet +lnet-subdirs += klnds +lnet-subdirs += tests +subdir-m += $(lnet-subdirs) @INCLUDE_RULES@ diff --git a/lnet/autoMakefile.am b/lnet/autoMakefile.am index b49b8d4..27a60a8 100644 --- a/lnet/autoMakefile.am +++ b/lnet/autoMakefile.am @@ -3,7 +3,7 @@ # This code is issued under the GNU General Public License. # See the file COPYING in this distribution -SUBDIRS = libcfs portals knals unals router tests doc utils include \ +SUBDIRS = libcfs lnet klnds ulnds tests doc utils include \ autoconf sources: diff --git a/lnet/autoconf/Makefile.am b/lnet/autoconf/Makefile.am index f65d2c0..171634a 100644 --- a/lnet/autoconf/Makefile.am +++ b/lnet/autoconf/Makefile.am @@ -1 +1 @@ -EXTRA_DIST := lustre-portals.m4 +EXTRA_DIST := lustre-lnet.m4 diff --git a/lnet/autoconf/lustre-lnet.m4 b/lnet/autoconf/lustre-lnet.m4 index 9897290..479a1f5 100644 --- a/lnet/autoconf/lustre-lnet.m4 +++ b/lnet/autoconf/lustre-lnet.m4 @@ -1,9 +1,31 @@ # -# LP_CHECK_GCC_VERSION +# LN_CONFIG_MAX_PAYLOAD +# +# configure maximum payload +# +AC_DEFUN([LN_CONFIG_MAX_PAYLOAD], +[AC_MSG_CHECKING([for non-default maximum LNET payload]) +AC_ARG_WITH([max-payload-mb], + AC_HELP_STRING([--with-max-payload-mb=MBytes], + [set maximum lnet payload in MBytes]), + [ + AC_MSG_RESULT([$with_max_payload_mb]) + LNET_MAX_PAYLOAD_MB=$with_max_payload_mb + LNET_MAX_PAYLOAD="(($with_max_payload_mb)<<20)" + ], [ + AC_MSG_RESULT([no]) + LNET_MAX_PAYLOAD="LNET_MTU" + ]) + AC_DEFINE_UNQUOTED(LNET_MAX_PAYLOAD, $LNET_MAX_PAYLOAD, + [Max LNET payload]) +]) + +# +# LN_CHECK_GCC_VERSION # # Check compiler version # -AC_DEFUN([LP_CHECK_GCC_VERSION], +AC_DEFUN([LN_CHECK_GCC_VERSION], [AC_MSG_CHECKING([compiler version]) PTL_CC_VERSION=`$CC --version | awk '/^gcc/{print $ 3}'` PTL_MIN_CC_VERSION="3.2.2" @@ -20,35 +42,58 @@ fi ]) # -# LP_CONFIG_ZEROCOPY +# LN_CONFIG_ZEROCOPY # # check if zerocopy is available/wanted # -AC_DEFUN([LP_CONFIG_ZEROCOPY], -[AC_MSG_CHECKING([for zero-copy TCP support]) -AC_ARG_ENABLE([zerocopy], +AC_DEFUN([LN_CONFIG_ZEROCOPY], +[AC_ARG_ENABLE([zerocopy], AC_HELP_STRING([--disable-zerocopy], - [disable socknal zerocopy]), + [disable socklnd zerocopy]), [],[enable_zerocopy='yes']) +AC_MSG_CHECKING([for zero-copy TCP support]) if test x$enable_zerocopy = xno ; then AC_MSG_RESULT([no (by request)]) else ZCCD="`grep -c zccd $LINUX/include/linux/skbuff.h`" - if test "$ZCCD" != 0 ; then - AC_DEFINE(SOCKNAL_ZC, 1, [use zero-copy TCP]) - AC_MSG_RESULT(yes) - else + if test "$ZCCD" = 0 ; then AC_MSG_RESULT([no (no kernel support)]) + else + AC_MSG_RESULT([yes]) + AC_MSG_CHECKING([for up-to-date tcp zero-copy patch]) + LB_LINUX_TRY_COMPILE([ + #include + #include + 
#include + #include + #include + #include + #include + #include + #include + #include + #include + ],[ + struct zccd zc = {0}; + + return atomic_read(&zc.zccd_refcount); + ],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(SOCKNAL_ZC, 1, [enable zero-copy support]) + ],[ + AC_MSG_RESULT([no]) + AC_MSG_ERROR([old TCP zero-copy in kernel (bug 10889) - use --disable-zerocopy to continue ]) + ]) fi fi ]) # -# LP_CONFIG_AFFINITY +# LN_CONFIG_AFFINITY # # check if cpu affinity is available/wanted # -AC_DEFUN([LP_CONFIG_AFFINITY], +AC_DEFUN([LN_CONFIG_AFFINITY], [AC_ARG_ENABLE([affinity], AC_HELP_STRING([--disable-affinity], [disable process/irq affinity]), @@ -62,11 +107,11 @@ else #include ],[ struct task_struct t; - #ifdef CPU_ARRAY_SIZE - cpumask_t m; - #else - unsigned long m; - #endif + #if HAVE_CPUMASK_T + cpumask_t m; + #else + unsigned long m; + #endif set_cpus_allowed(&t, m); ],[ AC_DEFINE(CPU_AFFINITY, 1, [kernel has cpu affinity support]) @@ -78,11 +123,162 @@ fi ]) # -# LP_CONFIG_QUADRICS +# LN_CONFIG_PORTALS +# +# configure support for Portals +# +AC_DEFUN([LN_CONFIG_PORTALS], +[AC_MSG_CHECKING([for portals]) +AC_ARG_WITH([portals], + AC_HELP_STRING([--with-portals=path], + [set path to portals]), + [ + case $with_portals in + no) ENABLEPORTALS=0 + ;; + *) PORTALS="${with_portals}" + ENABLEPORTALS=1 + ;; + esac + + ], [ + ENABLEPORTALS=0 + ]) +PTLLNDCPPFLAGS="" +if test $ENABLEPORTALS -eq 0; then + AC_MSG_RESULT([no]) +elif test ! \( -f ${PORTALS}/include/portals/p30.h \); then + AC_MSG_RESULT([no]) + AC_MSG_ERROR([bad --with-portals path]) +else + AC_MSG_RESULT([$PORTALS]) + PTLLNDCPPFLAGS="-I${PORTALS}/include" +fi +AC_SUBST(PTLLNDCPPFLAGS) +]) + +# +# LN_CONFIG_BACKOFF +# +# check if tunable tcp backoff is available/wanted +# +AC_DEFUN([LN_CONFIG_BACKOFF], +[AC_MSG_CHECKING([for tunable backoff TCP support]) +AC_ARG_ENABLE([backoff], + AC_HELP_STRING([--disable-backoff], + [disable socknal tunable backoff]), + [],[enable_backoff='yes']) +if test x$enable_backoff = xno ; then + AC_MSG_RESULT([no (by request)]) +else + BOCD="`grep -c TCP_BACKOFF $LINUX/include/linux/tcp.h`" + if test "$BOCD" != 0 ; then + AC_DEFINE(SOCKNAL_BACKOFF, 1, [use tunable backoff TCP]) + AC_MSG_RESULT(yes) + else + AC_MSG_RESULT([no (no kernel support)]) + fi +fi +]) + +# +# LN_CONFIG_PANIC_DUMPLOG +# +# check if tunable panic_dumplog is wanted +# +AC_DEFUN([LN_CONFIG_PANIC_DUMPLOG], +[AC_MSG_CHECKING([for tunable panic_dumplog support]) +AC_ARG_ENABLE([panic_dumplog], + AC_HELP_STRING([--enable-panic_dumplog], + [enable panic_dumplog]), + [],[enable_panic_dumplog='no']) +if test x$enable_panic_dumplog = xyes ; then + AC_DEFINE(LNET_DUMP_ON_PANIC, 1, [use dumplog on panic]) + AC_MSG_RESULT([yes (by request)]) +else + AC_MSG_RESULT([no]) +fi +]) + +# +# LN_CONFIG_PTLLND +# +# configure support for Portals LND +# +AC_DEFUN([LN_CONFIG_PTLLND], +[ +if test -z "$ENABLEPORTALS"; then + LN_CONFIG_PORTALS +fi + +AC_MSG_CHECKING([whether to build the kernel portals LND]) + +PTLLND="" +if test $ENABLEPORTALS -ne 0; then + AC_MSG_RESULT([yes]) + PTLLND="ptllnd" +else + AC_MSG_RESULT([no]) +fi +AC_SUBST(PTLLND) +]) + +# +# LN_CONFIG_UPTLLND +# +# configure support for Portals LND +# +AC_DEFUN([LN_CONFIG_UPTLLND], +[ +if test -z "$ENABLEPORTALS"; then + LN_CONFIG_PORTALS +fi + +AC_MSG_CHECKING([whether to build the userspace portals LND]) + +UPTLLND="" +if test $ENABLEPORTALS -ne 0; then + AC_MSG_RESULT([yes]) + UPTLLND="ptllnd" +else + AC_MSG_RESULT([no]) +fi +AC_SUBST(UPTLLND) +]) + +# +# LN_CONFIG_USOCKLND +# +# configure 
support for userspace TCP/IP LND +# +AC_DEFUN([LN_CONFIG_USOCKLND], +[AC_MSG_CHECKING([whether to build usocklnd]) +AC_ARG_ENABLE([usocklnd], + AC_HELP_STRING([--disable-usocklnd], + [disable usocklnd]), + [],[enable_usocklnd='yes']) + +if test x$enable_usocklnd = xyes ; then + if test "$ENABLE_LIBPTHREAD" = "yes" ; then + AC_MSG_RESULT([yes]) + USOCKLND="usocklnd" + else + AC_MSG_RESULT([no (libpthread not present or disabled)]) + USOCKLND="" + fi +else + AC_MSG_RESULT([no (disabled explicitly)]) + USOCKLND="" +fi +AC_SUBST(USOCKLND) +]) + +# +# LN_CONFIG_QUADRICS # # check if quadrics support is in this kernel # -AC_DEFUN([LP_CONFIG_QUADRICS], +AC_DEFUN([LN_CONFIG_QUADRICS], [AC_MSG_CHECKING([for QsNet sources]) AC_ARG_WITH([qsnet], AC_HELP_STRING([--with-qsnet=path], @@ -94,78 +290,304 @@ AC_MSG_RESULT([$QSNET]) AC_MSG_CHECKING([if quadrics kernel headers are present]) if test -d $QSNET/drivers/net/qsnet ; then AC_MSG_RESULT([yes]) - QSWNAL="qswnal" + QSWLND="qswlnd" AC_MSG_CHECKING([for multirail EKC]) if test -f $QSNET/include/elan/epcomms.h; then AC_MSG_RESULT([supported]) QSWCPPFLAGS="-I$QSNET/include -DMULTIRAIL_EKC=1" else AC_MSG_RESULT([not supported]) - if test -d $QSNET/drivers/net/qsnet/include; then - QSWCPPFLAGS="-I$QSNET/drivers/net/qsnet/include" - else - QSWCPPFLAGS="-I$QSNET/include/linux" - fi + AC_MSG_ERROR([Need multirail EKC]) fi if test x$QSNET = x$LINUX ; then LB_LINUX_CONFIG([QSNET],[],[ LB_LINUX_CONFIG([QSNET_MODULE],[],[ - AC_MSG_WARN([QSNET is not enabled in this kernel; not building qswnal.]) - QSWNAL="" + AC_MSG_WARN([QSNET is not enabled in this kernel; not building qswlnd.]) + QSWLND="" QSWCPPFLAGS="" ]) ]) fi else AC_MSG_RESULT([no]) - QSWNAL="" + QSWLND="" QSWCPPFLAGS="" fi AC_SUBST(QSWCPPFLAGS) -AC_SUBST(QSWNAL) +AC_SUBST(QSWLND) ]) # -# LP_CONFIG_GM +# LN_CONFIG_GM # # check if GM support is available # -AC_DEFUN([LP_CONFIG_GM], -[LB_ARG_LIBS_INCLUDES([Myrinet],[gm]) -if test x$gm_includes != x ; then - GMCPPFLAGS="-I$gm_includes" - if test -d "$gm/drivers" ; then - GMCPPFLAGS="$GMCPPFLAGS -I$gm/drivers -I$gm/drivers/linux/gm" - fi +AC_DEFUN([LN_CONFIG_GM],[ +AC_MSG_CHECKING([whether to enable GM support]) +AC_ARG_WITH([gm], + AC_HELP_STRING([--with-gm=path-to-gm-source-tree], + [build gmlnd against path]), + [ + case $with_gm in + no) ENABLE_GM=0 + ;; + *) ENABLE_GM=1 + GM_SRC="$with_gm" + ;; + esac + ],[ + ENABLE_GM=0 + ]) +AC_ARG_WITH([gm-install], + AC_HELP_STRING([--with-gm-install=path-to-gm-install-tree], + [say where GM has been installed]), + [ + GM_INSTALL=$with_gm_install + ],[ + GM_INSTALL="/opt/gm" + ]) +if test $ENABLE_GM -eq 0; then + AC_MSG_RESULT([no]) +else + AC_MSG_RESULT([yes]) + + GMLND="gmlnd" + GMCPPFLAGS="-I$GM_SRC/include -I$GM_SRC/drivers -I$GM_SRC/drivers/linux/gm" + + if test -f $GM_INSTALL/lib/libgm.a -o \ + -f $GM_INSTALL/lib64/libgm.a; then + GMLIBS="-L$GM_INSTALL/lib -L$GM_INSTALL/lib64" + else + AC_MSG_ERROR([Cant find GM libraries under $GM_INSTALL]) + fi + + EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="$GMCPPFLAGS -DGM_KERNEL $EXTRA_KCFLAGS" + + AC_MSG_CHECKING([that code using GM compiles with given path]) + LB_LINUX_TRY_COMPILE([ + #define GM_STRONG_TYPES 1 + #ifdef VERSION + #undef VERSION + #endif + #include "gm.h" + #include "gm_internal.h" + ],[ + struct gm_port *port = NULL; + gm_recv_event_t *rxevent = gm_blocking_receive_no_spin(port); + return 0; + ],[ + AC_MSG_RESULT([yes]) + ],[ + AC_MSG_RESULT([no]) + AC_MSG_ERROR([Bad --with-gm path]) + ]) + + AC_MSG_CHECKING([that GM has 
gm_register_memory_ex_phys()]) + LB_LINUX_TRY_COMPILE([ + #define GM_STRONG_TYPES 1 + #ifdef VERSION + #undef VERSION + #endif + #include "gm.h" + #include "gm_internal.h" + ],[ + gm_status_t gmrc; + struct gm_port *port = NULL; + gm_u64_t phys = 0; + gm_up_t pvma = 0; + + gmrc = gm_register_memory_ex_phys(port, phys, 100, pvma); + return 0; + ],[ + AC_MSG_RESULT([yes]) + ],[ + AC_MSG_RESULT([no. +Please patch the GM sources as follows... + cd $GM_SRC + patch -p0 < $PWD/lnet/klnds/gmlnd/gm-reg-phys.patch +...then rebuild and re-install them]) + AC_MSG_ERROR([Can't build GM without gm_register_memory_ex_phys()]) + ]) + + EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" fi AC_SUBST(GMCPPFLAGS) +AC_SUBST(GMLIBS) +AC_SUBST(GMLND) +]) + -if test x$gm_libs != x ; then - GMLIBS="-L$gm_libs" +# +# LN_CONFIG_MX +# +AC_DEFUN([LN_CONFIG_MX], +[AC_MSG_CHECKING([whether to enable Myrinet MX support]) +# set default +MXPATH="/opt/mx" +AC_ARG_WITH([mx], + AC_HELP_STRING([--with-mx=path], + [build mxlnd against path]), + [ + case $with_mx in + yes) ENABLEMX=2 + ;; + no) ENABLEMX=0 + ;; + *) MXPATH=$with_mx + ENABLEMX=3 + ;; + esac + ],[ + ENABLEMX=1 + ]) +if test $ENABLEMX -eq 0; then + AC_MSG_RESULT([disabled]) +elif test ! \( -f ${MXPATH}/include/myriexpress.h -a \ + -f ${MXPATH}/include/mx_kernel_api.h -a \ + -f ${MXPATH}/include/mx_pin.h \); then + AC_MSG_RESULT([no]) + case $ENABLEMX in + 1) ;; + 2) AC_MSG_ERROR([Myrinet MX kernel headers not present]);; + 3) AC_MSG_ERROR([bad --with-mx path]);; + *) AC_MSG_ERROR([internal error]);; + esac +else + MXCPPFLAGS="-I$MXPATH/include" + EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="$EXTRA_KCFLAGS $MXCPPFLAGS" + MXLIBS="-L$MXPATH/lib" + LB_LINUX_TRY_COMPILE([ + #define MX_KERNEL 1 + #include + #include + ],[ + mx_endpoint_t end; + mx_status_t status; + mx_request_t request; + int result; + + mx_init(); + mx_open_endpoint(MX_ANY_NIC, MX_ANY_ENDPOINT, 0, NULL, 0, &end); + mx_register_unexp_handler(end, (mx_unexp_handler_t) NULL, NULL); + mx_wait_any(end, MX_INFINITE, 0LL, 0LL, &status, &result); + mx_iconnect(end, 0LL, 0, 0, 0, NULL, &request); + return 0; + ],[ + AC_MSG_RESULT([yes]) + MXLND="mxlnd" + ],[ + AC_MSG_RESULT([no]) + case $ENABLEMX in + 1) ;; + 2) AC_MSG_ERROR([can't compile with Myrinet MX kernel headers]);; + 3) AC_MSG_ERROR([can't compile with Myrinet MX headers under $MXPATH]);; + *) AC_MSG_ERROR([internal error]);; + esac + MXLND="" + MXCPPFLAGS="" + ]) + EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" fi -AC_SUBST(GMLIBS) +AC_SUBST(MXCPPFLAGS) +AC_SUBST(MXLIBS) +AC_SUBST(MXLND) +]) + + + +# +# LN_CONFIG_O2IB +# +AC_DEFUN([LN_CONFIG_O2IB],[ +AC_MSG_CHECKING([whether to enable OpenIB gen2 support]) +# set default +O2IBPATH="$LINUX/drivers/infiniband" +AC_ARG_WITH([o2ib], + AC_HELP_STRING([--with-o2ib=path], + [build o2iblnd against path]), + [ + case $with_o2ib in + yes) ENABLEO2IB=2 + ;; + no) ENABLEO2IB=0 + ;; + *) O2IBPATH=$with_o2ib + ENABLEO2IB=3 + ;; + esac + ],[ + ENABLEO2IB=1 + ]) +if test $ENABLEO2IB -eq 0; then + AC_MSG_RESULT([disabled]) +elif test ! 
\( -f ${O2IBPATH}/include/rdma/rdma_cm.h -a \ + -f ${O2IBPATH}/include/rdma/ib_cm.h -a\ + -f ${O2IBPATH}/include/rdma/ib_verbs.h -a\ + -f ${O2IBPATH}/include/rdma/ib_fmr_pool.h \); then + AC_MSG_RESULT([no]) + case $ENABLEO2IB in + 1) ;; + 2) AC_MSG_ERROR([kernel OpenIB gen2 headers not present]);; + 3) AC_MSG_ERROR([bad --with-o2ib path]);; + *) AC_MSG_ERROR([internal error]);; + esac +else + O2IBCPPFLAGS="-I$O2IBPATH/include" + EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="$EXTRA_KCFLAGS $O2IBCPPFLAGS" + LB_LINUX_TRY_COMPILE([ + #include + #if !HAVE_GFP_T + typedef int gfp_t; + #endif + #include + #include + #include + #include + ],[ + struct rdma_cm_id *cm_id; + struct rdma_conn_param conn_param; + struct ib_device_attr device_attr; + struct ib_qp_attr qp_attr; + struct ib_pool_fmr pool_fmr; + enum ib_cm_rej_reason rej_reason; -ENABLE_GM=0 -if test x$gm != x ; then - GMNAL="gmnal" - ENABLE_GM=1 + cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP); + return PTR_ERR(cm_id); + ],[ + AC_MSG_RESULT([yes]) + O2IBLND="o2iblnd" + ],[ + AC_MSG_RESULT([no]) + case $ENABLEO2IB in + 1) ;; + 2) AC_MSG_ERROR([can't compile with kernel OpenIB gen2 headers]);; + 3) AC_MSG_ERROR([can't compile with OpenIB gen2 headers under $O2IBPATH]);; + *) AC_MSG_ERROR([internal error]);; + esac + O2IBLND="" + O2IBCPPFLAGS="" + ]) + EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" fi -AC_SUBST(GMNAL) -AC_SUBST(ENABLE_GM) +AC_SUBST(O2IBCPPFLAGS) +AC_SUBST(O2IBLND) ]) # -# LP_CONFIG_OPENIB +# LN_CONFIG_OPENIB # # check for OpenIB in the kernel -AC_DEFUN([LP_CONFIG_OPENIB],[ +AC_DEFUN([LN_CONFIG_OPENIB],[ AC_MSG_CHECKING([whether to enable OpenIB support]) # set default OPENIBPATH="$LINUX/drivers/infiniband" AC_ARG_WITH([openib], AC_HELP_STRING([--with-openib=path], - [build openibnal against path]), + [build openiblnd against path]), [ case $with_openib in yes) ENABLEOPENIB=2 @@ -198,6 +620,7 @@ else *) AC_MSG_RESULT([no]) AC_MSG_ERROR([internal error]);; esac + OPENIBCPPFLAGS="$OPENIBCPPFLAGS -DIB_NTXRXPARAMS=4" EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" EXTRA_KCFLAGS="$EXTRA_KCFLAGS $OPENIBCPPFLAGS" LB_LINUX_TRY_COMPILE([ @@ -215,7 +638,7 @@ else return 0; ],[ AC_MSG_RESULT([yes]) - OPENIBNAL="openibnal" + OPENIBLND="openiblnd" ],[ AC_MSG_RESULT([no]) case $ENABLEOPENIB in @@ -224,32 +647,82 @@ else 3) AC_MSG_ERROR([can't compile with OpenIB headers under $OPENIBPATH]);; *) AC_MSG_ERROR([internal error]);; esac - OPENIBNAL="" + OPENIBLND="" OPENIBCPPFLAGS="" ]) EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" fi AC_SUBST(OPENIBCPPFLAGS) -AC_SUBST(OPENIBNAL) +AC_SUBST(OPENIBLND) ]) # -# LP_CONFIG_IIB -# -# check for infinicon infiniband support +# LN_CONFIG_CIBLND # +AC_DEFUN([LN_CONFIG_CIB],[ +AC_MSG_CHECKING([whether to enable Cisco/TopSpin IB support]) +# set default +CIBPATH="" +CIBLND="" +AC_ARG_WITH([cib], + AC_HELP_STRING([--with-cib=path], + [build ciblnd against path]), + [ + case $with_cib in + no) AC_MSG_RESULT([no]);; + *) CIBPATH="$with_cib" + if test -d "$CIBPATH"; then + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + AC_MSG_ERROR([No directory $CIBPATH]) + fi;; + esac + ],[ + AC_MSG_RESULT([no]) + ]) +if test -n "$CIBPATH"; then + CIBCPPFLAGS="-I${CIBPATH}/ib/ts_api_ng/include -I${CIBPATH}/all/kernel_services/include -DUSING_TSAPI" + CIBCPPFLAGS="$CIBCPPFLAGS -DIB_NTXRXPARAMS=3" + EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="$EXTRA_KCFLAGS $CIBCPPFLAGS" + LB_LINUX_TRY_COMPILE([ + #include + #include + #include + ],[ + struct ib_device_properties dev_props; + struct ib_cm_active_param cm_active_params; + 
tTS_IB_CLIENT_QUERY_TID tid; + int enum1 = TS_IB_QP_ATTRIBUTE_STATE; + int enum2 = TS_IB_ACCESS_LOCAL_WRITE; + int enum3 = TS_IB_CQ_CALLBACK_INTERRUPT; + int enum4 = TS_IB_CQ_PROVIDER_REARM; + return 0; + ],[ + CIBLND="ciblnd" + ],[ + AC_MSG_ERROR([can't compile ciblnd with given path]) + CIBCPPFLAGS="" + ]) + EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" +fi +AC_SUBST(CIBCPPFLAGS) +AC_SUBST(CIBLND) +]) + # -# LP_CONFIG_IIB +# LN_CONFIG_IIB # # check for infinicon infiniband support # -AC_DEFUN([LP_CONFIG_IIB],[ +AC_DEFUN([LN_CONFIG_IIB],[ AC_MSG_CHECKING([whether to enable Infinicon support]) # set default IIBPATH="/usr/include" AC_ARG_WITH([iib], AC_HELP_STRING([--with-iib=path], - [build iibnal against path]), + [build iiblnd against path]), [ case $with_iib in yes) ENABLEIIB=2 @@ -293,7 +766,7 @@ else return rc == FSUCCESS ? 0 : 1; ],[ AC_MSG_RESULT([yes]) - IIBNAL="iibnal" + IIBLND="iiblnd" ],[ AC_MSG_RESULT([no]) case $ENABLEIIB in @@ -302,26 +775,26 @@ else 3) AC_MSG_ERROR([can't compile with Infinicon headers under $IIBPATH]);; *) AC_MSG_ERROR([internal error]);; esac - IIBNAL="" + IIBLND="" IIBCPPFLAGS="" ]) EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" fi AC_SUBST(IIBCPPFLAGS) -AC_SUBST(IIBNAL) +AC_SUBST(IIBLND) ]) # -# LP_CONFIG_VIB +# LN_CONFIG_VIB # # check for Voltaire infiniband support # -AC_DEFUN([LP_CONFIG_VIB], +AC_DEFUN([LN_CONFIG_VIB], [AC_MSG_CHECKING([whether to enable Voltaire IB support]) VIBPATH="" AC_ARG_WITH([vib], AC_HELP_STRING([--with-vib=path], - [build vibnal against path]), + [build viblnd against path]), [ case $with_vib in no) AC_MSG_RESULT([no]);; @@ -337,7 +810,7 @@ AC_ARG_WITH([vib], AC_MSG_RESULT([no]) ]) if test -z "$VIBPATH"; then - VIBNAL="" + VIBLND="" else VIBCPPFLAGS="-I${VIBPATH}/include -I${VIBPATH}/cm" EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" @@ -371,13 +844,13 @@ else NULL, 0); return 0; ],[ - VIBNAL="vibnal" + VIBLND="viblnd" ],[ - AC_MSG_ERROR([can't compile vibnal with given path]) + AC_MSG_ERROR([can't compile viblnd with given path]) ]) EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" fi -if test -n "$VIBNAL"; then +if test -n "$VIBLND"; then EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" EXTRA_KCFLAGS="$EXTRA_KCFLAGS $VIBCPPFLAGS" AC_MSG_CHECKING([if Voltaire still uses void * sg addresses]) @@ -405,39 +878,18 @@ if test -n "$VIBNAL"; then ],[ AC_MSG_RESULT([no]) ]) - AC_MSG_CHECKING([if page_to_phys() must avoid sign extension]) - LB_LINUX_TRY_COMPILE([ - #include - #include - #include - #include - #include - ],[ - struct page p; - - switch (42) { - case 0: - case (sizeof(typeof(page_to_phys(&p))) < 8): - break; - } - ],[ - AC_MSG_RESULT([yes]) - VIBCPPFLAGS="$VIBCPPFLAGS -DIBNAL_32BIT_PAGE2PHYS=1" - ],[ - AC_MSG_RESULT([no]) - ]) EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" fi AC_SUBST(VIBCPPFLAGS) -AC_SUBST(VIBNAL) +AC_SUBST(VIBLND) ]) # -# LP_CONFIG_RANAL +# LN_CONFIG_RALND # -# check whether to use the RapidArray nal +# check whether to use the RapidArray lnd # -AC_DEFUN([LP_CONFIG_RANAL], +AC_DEFUN([LN_CONFIG_RALND], [#### Rapid Array AC_MSG_CHECKING([if RapidArray kernel headers are present]) # placeholder @@ -456,23 +908,23 @@ LB_LINUX_TRY_COMPILE([ return rc == RAP_SUCCESS ? 
0 : 1; ],[ AC_MSG_RESULT([yes]) - RANAL="ranal" + RALND="ralnd" ],[ AC_MSG_RESULT([no]) - RANAL="" + RALND="" RACPPFLAGS="" ]) EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" AC_SUBST(RACPPFLAGS) -AC_SUBST(RANAL) +AC_SUBST(RALND) ]) # -# LP_STRUCT_PAGE_LIST +# LN_STRUCT_PAGE_LIST # # 2.6.4 no longer has page->list # -AC_DEFUN([LP_STRUCT_PAGE_LIST], +AC_DEFUN([LN_STRUCT_PAGE_LIST], [AC_MSG_CHECKING([if struct page has a list field]) LB_LINUX_TRY_COMPILE([ #include @@ -488,11 +940,11 @@ LB_LINUX_TRY_COMPILE([ ]) # -# LP_STRUCT_SIGHAND +# LN_STRUCT_SIGHAND # # red hat 2.4 adds sighand to struct task_struct # -AC_DEFUN([LP_STRUCT_SIGHAND], +AC_DEFUN([LN_STRUCT_SIGHAND], [AC_MSG_CHECKING([if task_struct has a sighand field]) LB_LINUX_TRY_COMPILE([ #include @@ -508,11 +960,11 @@ LB_LINUX_TRY_COMPILE([ ]) # -# LP_FUNC_CPU_ONLINE +# LN_FUNC_CPU_ONLINE # # cpu_online is different in rh 2.4, vanilla 2.4, and 2.6 # -AC_DEFUN([LP_FUNC_CPU_ONLINE], +AC_DEFUN([LN_FUNC_CPU_ONLINE], [AC_MSG_CHECKING([if kernel defines cpu_online()]) LB_LINUX_TRY_COMPILE([ #include @@ -527,11 +979,30 @@ LB_LINUX_TRY_COMPILE([ ]) # -# LP_TYPE_CPUMASK_T +# LN_TYPE_GFP_T +# +# check if gfp_t is typedef-ed +# +AC_DEFUN([LN_TYPE_GFP_T], +[AC_MSG_CHECKING([if kernel defines gfp_t]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + return sizeof(gfp_t); +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_GFP_T, 1, [gfp_t found]) +],[ + AC_MSG_RESULT([no]) +]) +]) + +# +# LN_TYPE_CPUMASK_T # # same goes for cpumask_t # -AC_DEFUN([LP_TYPE_CPUMASK_T], +AC_DEFUN([LN_TYPE_CPUMASK_T], [AC_MSG_CHECKING([if kernel defines cpumask_t]) LB_LINUX_TRY_COMPILE([ #include @@ -546,11 +1017,11 @@ LB_LINUX_TRY_COMPILE([ ]) # -# LP_FUNC_SHOW_TASK +# LN_FUNC_SHOW_TASK # # we export show_task(), but not all kernels have it (yet) # -AC_DEFUN([LP_FUNC_SHOW_TASK], +AC_DEFUN([LN_FUNC_SHOW_TASK], [AC_MSG_CHECKING([if kernel exports show_task]) have_show_task=0 for file in ksyms sched ; do @@ -568,57 +1039,120 @@ else fi ]) +# LN_TASKLIST_LOCK +# 2.6.18 remove tasklist_lock export +AC_DEFUN([LN_TASKLIST_LOCK], +[AC_MSG_CHECKING([kernel export tasklist_lock]) + if grep -q "EXPORT_SYMBOL(tasklist_lock)" \ + "$LINUX/kernel/fork.c" 2>/dev/null ; then + AC_DEFINE(HAVE_TASKLIST_LOCK, 1, + [tasklist_lock exported]) + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + fi +]) + +# 2.6.19 API changes +# kmem_cache_destroy(cachep) return void instead of +# int +AC_DEFUN([LN_KMEM_CACHE_DESTROY_INT], +[AC_MSG_CHECKING([kmem_cache_destroy(cachep) return int]) +LB_LINUX_TRY_COMPILE([ + #include +],[ + int i = kmem_cache_destroy(NULL); +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_KMEM_CACHE_DESTROY_INT, 1, + [kmem_cache_destroy(cachep) return int]) +],[ + AC_MSG_RESULT(NO) +]) +]) + +#2.6.19 API change +#panic_notifier_list use atomic_notifier operations +# +AC_DEFUN([LN_ATOMIC_PANIC_NOTIFIER], +[AC_MSG_CHECKING([panic_notifier_list is atomic]) +LB_LINUX_TRY_COMPILE([ + #include + #include +],[ + struct atomic_notifier_head panic_notifier_list; +],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_ATOMIC_PANIC_NOTIFIER, 1, + [panic_notifier_list is atomic_notifier_head]) +],[ + AC_MSG_RESULT(NO) +]) +]) + # -# LP_PROG_LINUX +# LN_PROG_LINUX # -# Portals linux kernel checks +# LNet linux kernel checks # -AC_DEFUN([LP_PROG_LINUX], -[LP_CONFIG_ZEROCOPY -LP_CONFIG_AFFINITY -LP_CONFIG_QUADRICS -LP_CONFIG_GM -LP_CONFIG_OPENIB -LP_CONFIG_VIB -LP_CONFIG_IIB -LP_CONFIG_RANAL +AC_DEFUN([LN_PROG_LINUX], +[LN_CONFIG_ZEROCOPY +LN_FUNC_CPU_ONLINE +LN_TYPE_GFP_T +LN_TYPE_CPUMASK_T +LN_CONFIG_AFFINITY +LN_CONFIG_BACKOFF 
+LN_CONFIG_PANIC_DUMPLOG +LN_CONFIG_QUADRICS +LN_CONFIG_GM +LN_CONFIG_OPENIB +LN_CONFIG_CIB +LN_CONFIG_VIB +LN_CONFIG_IIB +LN_CONFIG_O2IB +LN_CONFIG_RALND +LN_CONFIG_PTLLND +LN_CONFIG_MX -LP_STRUCT_PAGE_LIST -LP_STRUCT_SIGHAND -LP_FUNC_CPU_ONLINE -LP_TYPE_CPUMASK_T -LP_FUNC_SHOW_TASK +LN_STRUCT_PAGE_LIST +LN_STRUCT_SIGHAND +LN_FUNC_SHOW_TASK +# 2.6.18 +LN_TASKLIST_LOCK +# 2.6.19 +LN_KMEM_CACHE_DESTROY_INT +LN_ATOMIC_PANIC_NOTIFIER ]) # -# LP_PROG_DARWIN +# LN_PROG_DARWIN # # Darwin checks # -AC_DEFUN([LP_PROG_DARWIN], +AC_DEFUN([LN_PROG_DARWIN], [LB_DARWIN_CHECK_FUNCS([get_preemption_level]) ]) # -# LP_PATH_DEFAULTS +# LN_PATH_DEFAULTS # # default paths for installed files # -AC_DEFUN([LP_PATH_DEFAULTS], +AC_DEFUN([LN_PATH_DEFAULTS], [ ]) # -# LP_CONFIGURE +# LN_CONFIGURE # # other configure checks # -AC_DEFUN([LP_CONFIGURE], -[# portals/utils/portals.c +AC_DEFUN([LN_CONFIGURE], +[# lnet/utils/portals.c AC_CHECK_HEADERS([netdb.h netinet/tcp.h asm/types.h endian.h sys/ioctl.h]) AC_CHECK_FUNCS([gethostbyname socket connect]) -# portals/utils/debug.c +# lnet/utils/debug.c AC_CHECK_HEADERS([linux/version.h]) AC_CHECK_TYPE([spinlock_t], @@ -626,20 +1160,37 @@ AC_CHECK_TYPE([spinlock_t], [], [#include ]) -# portals/utils/wirecheck.c +# lnet/utils/wirecheck.c AC_CHECK_FUNCS([strnlen]) # -------- Check for required packages -------------- -LIBS_save="$LIBS" -LIBS="-lncurses $LIBS" -AC_CHECK_LIB([readline],[readline],[ +# +# LC_CONFIG_READLINE +# +# Build with readline +# +AC_MSG_CHECKING([whether to enable readline support]) +AC_ARG_ENABLE(readline, + AC_HELP_STRING([--disable-readline], + [disable readline support]), + [],[enable_readline='yes']) +AC_MSG_RESULT([$enable_readline]) + +# -------- check for readline if enabled ---- +if test x$enable_readline = xyes ; then + LIBS_save="$LIBS" + LIBS="-lncurses $LIBS" + AC_CHECK_LIB([readline],[readline],[ LIBREADLINE="-lreadline -lncurses" AC_DEFINE(HAVE_LIBREADLINE, 1, [readline library is available]) -],[ + ],[ LIBREADLINE="" -]) -LIBS="$LIBS_save" + ]) + LIBS="$LIBS_save" +else + LIBREADLINE="" +fi AC_SUBST(LIBREADLINE) AC_MSG_CHECKING([if efence debugging support is requested]) @@ -651,7 +1202,7 @@ AC_MSG_RESULT([$enable_efence]) if test "$enable_efence" = "yes" ; then LIBEFENCE="-lefence" AC_DEFINE(HAVE_LIBEFENCE, 1, [libefence support is requested]) -else +else LIBEFENCE="" fi AC_SUBST(LIBEFENCE) @@ -674,6 +1225,31 @@ else fi AC_SUBST(LIBWRAP) +# -------- check for -lpthread support ---- +AC_MSG_CHECKING([whether to use libpthread for lnet library]) +AC_ARG_ENABLE([libpthread], + AC_HELP_STRING([--disable-libpthread], + [disable libpthread]), + [],[enable_libpthread=yes]) +if test "$enable_libpthread" = "yes" ; then + AC_CHECK_LIB([pthread], [pthread_create], + [ENABLE_LIBPTHREAD="yes"], + [ENABLE_LIBPTHREAD="no"]) + if test "$ENABLE_LIBPTHREAD" = "yes" ; then + AC_MSG_RESULT([$ENABLE_LIBPTHREAD]) + PTHREAD_LIBS="-lpthread" + AC_DEFINE([HAVE_LIBPTHREAD], 1, [use libpthread]) + else + PTHREAD_LIBS="" + AC_MSG_RESULT([no libpthread is found]) + fi + AC_SUBST(PTHREAD_LIBS) +else + AC_MSG_RESULT([no (disabled explicitly)]) + ENABLE_LIBPTHREAD="no" +fi +AC_SUBST(ENABLE_LIBPTHREAD) + # ---------------------------------------- # some tests for catamount-like systems # ---------------------------------------- @@ -697,92 +1273,105 @@ if test x$enable_urandom != xno ; then AC_DEFINE([LIBLUSTRE_USE_URANDOM], 1, [use /dev/urandom for random data]) fi -# -------- check for -lcap and -lpthread ---- +# -------- check for -lcap support ---- if test 
x$enable_liblustre = xyes ; then AC_CHECK_LIB([cap], [cap_get_proc], [ CAP_LIBS="-lcap" AC_DEFINE([HAVE_LIBCAP], 1, [use libcap]) ], - [CAP_LIBS=""]) - AC_SUBST(CAP_LIBS) - AC_CHECK_LIB([pthread], [pthread_create], [ - PTHREAD_LIBS="-lpthread" - AC_DEFINE([HAVE_LIBPTHREAD], 1, [use libpthread]) - ], - [PTHREAD_LIBS=""]) - AC_SUBST(PTHREAD_LIBS) + CAP_LIBS="" + ]) + AC_SUBST(CAP_LIBS) + fi + +LN_CONFIG_MAX_PAYLOAD +LN_CONFIG_UPTLLND +LN_CONFIG_USOCKLND ]) # -# LP_CONDITIONALS +# LN_CONDITIONALS # -# AM_CONDITOINAL defines for portals +# AM_CONDITOINAL defines for lnet # -AC_DEFUN([LP_CONDITIONALS], -[AM_CONDITIONAL(BUILD_QSWNAL, test x$QSWNAL = "xqswnal") -AM_CONDITIONAL(BUILD_GMNAL, test x$GMNAL = "xgmnal") -AM_CONDITIONAL(BUILD_OPENIBNAL, test x$OPENIBNAL = "xopenibnal") -AM_CONDITIONAL(BUILD_IIBNAL, test x$IIBNAL = "xiibnal") -AM_CONDITIONAL(BUILD_VIBNAL, test x$VIBNAL = "xvibnal") -AM_CONDITIONAL(BUILD_RANAL, test x$RANAL = "xranal") +AC_DEFUN([LN_CONDITIONALS], +[AM_CONDITIONAL(BUILD_QSWLND, test x$QSWLND = "xqswlnd") +AM_CONDITIONAL(BUILD_GMLND, test x$GMLND = "xgmlnd") +AM_CONDITIONAL(BUILD_MXLND, test x$MXLND = "xmxlnd") +AM_CONDITIONAL(BUILD_O2IBLND, test x$O2IBLND = "xo2iblnd") +AM_CONDITIONAL(BUILD_OPENIBLND, test x$OPENIBLND = "xopeniblnd") +AM_CONDITIONAL(BUILD_CIBLND, test x$CIBLND = "xciblnd") +AM_CONDITIONAL(BUILD_IIBLND, test x$IIBLND = "xiiblnd") +AM_CONDITIONAL(BUILD_VIBLND, test x$VIBLND = "xviblnd") +AM_CONDITIONAL(BUILD_RALND, test x$RALND = "xralnd") +AM_CONDITIONAL(BUILD_PTLLND, test x$PTLLND = "xptllnd") +AM_CONDITIONAL(BUILD_UPTLLND, test x$UPTLLND = "xptllnd") +AM_CONDITIONAL(BUILD_USOCKLND, test x$USOCKLND = "xusocklnd") ]) # -# LP_CONFIG_FILES +# LN_CONFIG_FILES # # files that should be generated with AC_OUTPUT # -AC_DEFUN([LP_CONFIG_FILES], +AC_DEFUN([LN_CONFIG_FILES], [AC_CONFIG_FILES([ -portals/Kernelenv -portals/Makefile -portals/autoMakefile -portals/autoconf/Makefile -portals/doc/Makefile -portals/include/Makefile -portals/include/libcfs/Makefile -portals/include/libcfs/linux/Makefile -portals/include/portals/Makefile -portals/include/portals/linux/Makefile -portals/knals/Makefile -portals/knals/autoMakefile -portals/knals/gmnal/Makefile -portals/knals/gmnal/autoMakefile -portals/knals/openibnal/Makefile -portals/knals/openibnal/autoMakefile -portals/knals/iibnal/Makefile -portals/knals/iibnal/autoMakefile -portals/knals/vibnal/Makefile -portals/knals/vibnal/autoMakefile -portals/knals/lonal/Makefile -portals/knals/lonal/autoMakefile -portals/knals/qswnal/Makefile -portals/knals/qswnal/autoMakefile -portals/knals/ranal/Makefile -portals/knals/ranal/autoMakefile -portals/knals/socknal/Makefile -portals/knals/socknal/autoMakefile -portals/libcfs/Makefile -portals/libcfs/autoMakefile -portals/libcfs/linux/Makefile -portals/portals/Makefile -portals/portals/autoMakefile -portals/router/Makefile -portals/router/autoMakefile -portals/tests/Makefile -portals/tests/autoMakefile -portals/unals/Makefile -portals/utils/Makefile +lnet/Kernelenv +lnet/Makefile +lnet/autoMakefile +lnet/autoconf/Makefile +lnet/doc/Makefile +lnet/include/Makefile +lnet/include/libcfs/Makefile +lnet/include/libcfs/linux/Makefile +lnet/include/lnet/Makefile +lnet/include/lnet/linux/Makefile +lnet/klnds/Makefile +lnet/klnds/autoMakefile +lnet/klnds/gmlnd/Makefile +lnet/klnds/mxlnd/autoMakefile +lnet/klnds/mxlnd/Makefile +lnet/klnds/gmlnd/autoMakefile +lnet/klnds/openiblnd/Makefile +lnet/klnds/openiblnd/autoMakefile +lnet/klnds/o2iblnd/Makefile +lnet/klnds/o2iblnd/autoMakefile 
+lnet/klnds/ciblnd/Makefile +lnet/klnds/ciblnd/autoMakefile +lnet/klnds/iiblnd/Makefile +lnet/klnds/iiblnd/autoMakefile +lnet/klnds/viblnd/Makefile +lnet/klnds/viblnd/autoMakefile +lnet/klnds/qswlnd/Makefile +lnet/klnds/qswlnd/autoMakefile +lnet/klnds/ralnd/Makefile +lnet/klnds/ralnd/autoMakefile +lnet/klnds/socklnd/Makefile +lnet/klnds/socklnd/autoMakefile +lnet/klnds/ptllnd/Makefile +lnet/klnds/ptllnd/autoMakefile +lnet/libcfs/Makefile +lnet/libcfs/autoMakefile +lnet/libcfs/linux/Makefile +lnet/lnet/Makefile +lnet/lnet/autoMakefile +lnet/tests/Makefile +lnet/tests/autoMakefile +lnet/ulnds/Makefile +lnet/ulnds/autoMakefile +lnet/ulnds/socklnd/Makefile +lnet/ulnds/ptllnd/Makefile +lnet/utils/Makefile ]) case $lb_target_os in darwin) AC_CONFIG_FILES([ -portals/include/libcfs/darwin/Makefile -portals/include/portals/darwin/Makefile -portals/libcfs/darwin/Makefile +lnet/include/libcfs/darwin/Makefile +lnet/include/lnet/darwin/Makefile +lnet/libcfs/darwin/Makefile ]) ;; esac diff --git a/lnet/include/Makefile.am b/lnet/include/Makefile.am index dd6db1d..006180b 100644 --- a/lnet/include/Makefile.am +++ b/lnet/include/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = libcfs portals +SUBDIRS = libcfs lnet EXTRA_DIST = cygwin-ioctl.h diff --git a/lnet/include/libcfs/Makefile.am b/lnet/include/libcfs/Makefile.am index 50e377a..2874a52 100644 --- a/lnet/include/libcfs/Makefile.am +++ b/lnet/include/libcfs/Makefile.am @@ -4,5 +4,5 @@ SUBDIRS += darwin endif DIST_SUBDIRS := $(SUBDIRS) -EXTRA_DIST := curproc.h kp30.h libcfs.h list.h lltrace.h portals_lib.h \ - portals_utils.h user-lock.h user-prim.h user-time.h +EXTRA_DIST := curproc.h kp30.h libcfs.h list.h lltrace.h \ + portals_utils.h types.h user-lock.h user-prim.h user-time.h diff --git a/lnet/include/libcfs/curproc.h b/lnet/include/libcfs/curproc.h index 630912d..6495c66 100644 --- a/lnet/include/libcfs/curproc.h +++ b/lnet/include/libcfs/curproc.h @@ -20,6 +20,7 @@ #ifndef __LIBCFS_CURPROC_H__ #define __LIBCFS_CURPROC_H__ +#ifdef __KERNEL__ /* * Portable API to access common characteristics of "current" UNIX process. * @@ -48,6 +49,7 @@ char *cfs_curproc_comm(void); */ cfs_kernel_cap_t cfs_curproc_cap_get(void); void cfs_curproc_cap_set(cfs_kernel_cap_t cap); +#endif /* __LIBCFS_CURPROC_H__ */ #endif diff --git a/lnet/include/libcfs/darwin/Makefile.am b/lnet/include/libcfs/darwin/Makefile.am index 4ff2072..f2f217a 100644 --- a/lnet/include/libcfs/darwin/Makefile.am +++ b/lnet/include/libcfs/darwin/Makefile.am @@ -1,3 +1,3 @@ EXTRA_DIST := darwin-mem.h darwin-types.h libcfs.h portals_utils.h \ darwin-fs.h darwin-prim.h darwin-utils.h lltrace.h \ - darwin-lock.h darwin-sync.h kp30.h portals_lib.h + darwin-lock.h darwin-sync.h darwin-tcpip.h kp30.h diff --git a/lnet/include/libcfs/darwin/darwin-fs.h b/lnet/include/libcfs/darwin/darwin-fs.h index 32244e7..5eed9ef 100644 --- a/lnet/include/libcfs/darwin/darwin-fs.h +++ b/lnet/include/libcfs/darwin/darwin-fs.h @@ -1,5 +1,24 @@ -#ifndef __LIBCFS_DARWIN_CFS_FS_H__ -#define __LIBCFS_DARWIN_CFS_FS_H__ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Implementation of standard file system interfaces for XNU kernel. + * + * Copyright (c) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or modify it under + * the terms of version 2 of the GNU General Public License as published by + * the Free Software Foundation. 
Lustre is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. You should have received a + * copy of the GNU General Public License along with Lustre; if not, write + * to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, + * USA. + */ +#ifndef __LIBCFS_DARWIN_FS_H__ +#define __LIBCFS_DARWIN_FS_H__ #ifndef __LIBCFS_LIBCFS_H__ #error Do not #include this file directly. #include instead @@ -9,25 +28,13 @@ #include #include -/* - * __APPLE_API_PRIVATE is defined before include user.h - * Doing this way to get the define of uthread, it's not good - * but I do need to know what's inside uthread. - */ -#ifndef __APPLE_API_PRIVATE -#define __APPLE_API_PRIVATE -#include -#undef __APPLE_API_PRIVATE -#else -#include -#endif #include #include #include #include -#include #include +#include #include #include #include @@ -37,7 +44,6 @@ #include #include -#include #include #include #include @@ -51,28 +57,42 @@ /* * File operating APIs in kernel */ +#ifdef __DARWIN8__ +/* + * Kernel file descriptor + */ +typedef struct cfs_kern_file { + int f_flags; + vnode_t f_vp; + vfs_context_t f_ctxt; +} cfs_file_t; + +#else + typedef struct file cfs_file_t; -int filp_node_size(cfs_file_t *fp, off_t *size); +#endif + +int kern_file_size(cfs_file_t *fp, off_t *size); #define cfs_filp_size(fp) \ ({ \ off_t __size; \ - filp_node_size((fp), &__size); \ + kern_file_size((fp), &__size); \ __size; \ }) #define cfs_filp_poff(fp) (NULL) -cfs_file_t *filp_open(const char *name, int flags, int mode, int *err); -int filp_close(cfs_file_t *fp); -int filp_read(cfs_file_t *fp, void *buf, size_t nbytes, off_t *pos); -int filp_write(cfs_file_t *fp, void *buf, size_t nbytes, off_t *pos); -int filp_fsync(cfs_file_t *fp); +cfs_file_t *kern_file_open(const char *name, int flags, int mode, int *err); +int kern_file_close(cfs_file_t *fp); +int kern_file_read(cfs_file_t *fp, void *buf, size_t nbytes, off_t *pos); +int kern_file_write(cfs_file_t *fp, void *buf, size_t nbytes, off_t *pos); +int kern_file_sync(cfs_file_t *fp); -#define cfs_filp_open(n, f, m, e) filp_open(n, f, m, e) -#define cfs_filp_close(f) filp_close(f) -#define cfs_filp_read(f, b, n, p) filp_read(f, b, n, p) -#define cfs_filp_write(f, b, n, p) filp_write(f, b, n, p) -#define cfs_filp_fsync(f) filp_fsync(f) +#define cfs_filp_open(n, f, m, e) kern_file_open(n, f, m, e) +#define cfs_filp_close(f) kern_file_close(f) +#define cfs_filp_read(f, b, n, p) kern_file_read(f, b, n, p) +#define cfs_filp_write(f, b, n, p) kern_file_write(f, b, n, p) +#define cfs_filp_fsync(f) kern_file_sync(f) int ref_file(cfs_file_t *fp); int rele_file(cfs_file_t *fp); @@ -85,25 +105,25 @@ int file_count(cfs_file_t *fp); #define CFS_OFFSET_MAX CFS_INT_LIMIT(loff_t) typedef struct flock cfs_flock_t; -#define CFS_FLOCK_TYPE(fl) ((fl)->l_type) -#define CFS_FLOCK_SET_TYPE(fl, type) do { (fl)->l_type = (type); } while(0) -#define CFS_FLOCK_PID(fl) ((fl)->l_pid) -#define CFS_FLOCK_SET_PID(fl, pid) do { (fl)->l_pid = (pid); } while(0) -#define CFS_FLOCK_START(fl) ((fl)->l_start) -#define CFS_FLOCK_SET_START(fl, start) do { (fl)->l_start = (start); } while(0) -#define CFS_FLOCK_END(fl) ((fl)->l_len == 0? 
CFS_OFFSET_MAX: ((fl)->l_start + (fl)->l_en)) -#define CFS_FLOCK_SET_END(fl, end) \ - do { \ - if (end == CFS_OFFSET_MAX) \ - (fl)->l_len = 0; \ - else \ - (fl)->l_len = (end) - (fl)->l_start;\ - } while(0) - -typedef struct { - void *d; -} cfs_dentry_t; -typedef unsigned short umode_t; +#define cfs_flock_type(fl) ((fl)->l_type) +#define cfs_flock_set_type(fl, type) do { (fl)->l_type = (type); } while(0) +#define cfs_flock_pid(fl) ((fl)->l_pid) +#define cfs_flock_set_pid(fl, pid) do { (fl)->l_pid = (pid); } while(0) +#define cfs_flock_start(fl) ((fl)->l_start) +#define cfs_flock_set_start(fl, start) do { (fl)->l_start = (start); } while(0) + +static inline loff_t cfs_flock_end(cfs_flock_t *fl) +{ + return (fl->l_len == 0 ? CFS_OFFSET_MAX: (fl->l_start + fl->l_len)); +} + +static inline void cfs_flock_set_end(cfs_flock_t *fl, loff_t end) +{ + if (end == CFS_OFFSET_MAX) + fl->l_len = 0; + else + fl->l_len = end - fl->l_start; +} #define ATTR_MODE 0x0001 #define ATTR_UID 0x0002 @@ -119,13 +139,59 @@ typedef unsigned short umode_t; #define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ #define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ #define ATTR_CTIME_SET 0x2000 +#define ATTR_BLOCKS 0x4000 #define in_group_p(x) (0) -#endif +struct posix_acl_entry { + short e_tag; + unsigned short e_perm; + unsigned int e_id; +}; + +struct posix_acl { + atomic_t a_refcount; + unsigned int a_count; + struct posix_acl_entry a_entries[0]; +}; + +struct posix_acl *posix_acl_alloc(int count, int flags); +static inline struct posix_acl *posix_acl_from_xattr(const void *value, + size_t size) +{ + return posix_acl_alloc(0, 0); +} +static inline void posix_acl_release(struct posix_acl *acl) {}; +static inline int posix_acl_valid(const struct posix_acl *acl) { return 0; } +static inline struct posix_acl * posix_acl_dup(struct posix_acl *acl) +{ + return acl; +} + +/* + * portable UNIX device file identification. + */ + +typedef dev_t cfs_rdev_t; + +#else /* !__KERNEL__ */ + +typedef struct file cfs_file_t; +#endif /* END __KERNEL__ */ + +typedef struct { + void *d; +} cfs_dentry_t; + +#ifndef O_SYNC #define O_SYNC 0 +#endif +#ifndef O_DIRECTORY #define O_DIRECTORY 0 +#endif +#ifndef O_LARGEFILE #define O_LARGEFILE 0 +#endif #endif diff --git a/lnet/include/libcfs/darwin/darwin-lock.h b/lnet/include/libcfs/darwin/darwin-lock.h index da16418..f826fef 100644 --- a/lnet/include/libcfs/darwin/darwin-lock.h +++ b/lnet/include/libcfs/darwin/darwin-lock.h @@ -9,10 +9,6 @@ #include #include #include -#include - -/* spin lock types and operations */ -#include #include #include @@ -56,12 +52,18 @@ static inline int spin_trylock(spinlock_t *lock) return kspin_trylock(&lock->spin); } +static inline void spin_lock_done(spinlock_t *lock) +{ + kspin_done(&lock->spin); +} + +#error "does this lock out timer callbacks?" 
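The darwin-fs.h hunk above converts the CFS_FLOCK_END and CFS_FLOCK_SET_END macros into static inline functions. The macro form expanded its `fl` argument more than once and silently accepted any type; the inline form evaluates it exactly once and is checked against a real prototype. A minimal sketch of the same conversion; the `ex_`-prefixed names are hypothetical and a 64-bit off_t is assumed:

    #include <stdint.h>
    #include <sys/types.h>

    struct ex_flock {
            off_t l_start;
            off_t l_len;            /* 0 means "lock to end of file" */
    };

    #define EX_OFFSET_MAX ((off_t)INT64_MAX)

    /* Evaluates fl once; the old macro expanded it twice. */
    static inline off_t ex_flock_end(const struct ex_flock *fl)
    {
            return fl->l_len == 0 ? EX_OFFSET_MAX
                                  : fl->l_start + fl->l_len;
    }

    static inline void ex_flock_set_end(struct ex_flock *fl, off_t end)
    {
            if (end == EX_OFFSET_MAX)
                    fl->l_len = 0;                  /* to EOF */
            else
                    fl->l_len = end - fl->l_start;
    }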
#define spin_lock_bh(x) spin_lock(x) #define spin_unlock_bh(x) spin_unlock(x) #define spin_lock_bh_init(x) spin_lock_init(x) extern boolean_t ml_set_interrupts_enabled(boolean_t enable); -#define __disable_irq() (spl_t) ml_set_interrupts_enabled(FALSE) +#define __disable_irq() ml_set_interrupts_enabled(FALSE) #define __enable_irq(x) (void) ml_set_interrupts_enabled(x) #define spin_lock_irqsave(s, f) do{ \ @@ -165,6 +167,11 @@ static inline void init_rwsem(struct rw_semaphore *s) krw_sem_init(&s->s); } +static inline void fini_rwsem(struct rw_semaphore *s) +{ + krw_sem_done(&s->s); +} + static inline void down_read(struct rw_semaphore *s) { krw_sem_down_r(&s->s); @@ -173,7 +180,7 @@ static inline void down_read(struct rw_semaphore *s) static inline int down_read_trylock(struct rw_semaphore *s) { int ret = krw_sem_down_r_try(&s->s); - return ret == 0? 1: 0; + return ret == 0; } static inline void down_write(struct rw_semaphore *s) @@ -184,7 +191,7 @@ static inline void down_write(struct rw_semaphore *s) static inline int down_write_trylock(struct rw_semaphore *s) { int ret = krw_sem_down_w_try(&s->s); - return ret == 0? 1: 0; + return ret == 0; } static inline void up_read(struct rw_semaphore *s) @@ -199,7 +206,6 @@ static inline void up_write(struct rw_semaphore *s) /* * read-write lock : Need to be investigated more!! - * XXX nikita: for now, let rwlock_t to be identical to rw_semaphore * * - DECLARE_RWLOCK(l) * - rwlock_init(x) @@ -208,14 +214,14 @@ static inline void up_write(struct rw_semaphore *s) * - write_lock(x) * - write_unlock(x) */ -typedef struct rw_semaphore rwlock_t; +typedef struct krw_spin rwlock_t; -#define rwlock_init(pl) init_rwsem(pl) +#define rwlock_init(pl) krw_spin_init(pl) -#define read_lock(l) down_read(l) -#define read_unlock(l) up_read(l) -#define write_lock(l) down_write(l) -#define write_unlock(l) up_write(l) +#define read_lock(l) krw_spin_down_r(l) +#define read_unlock(l) krw_spin_up_r(l) +#define write_lock(l) krw_spin_down_w(l) +#define write_unlock(l) krw_spin_up_w(l) #define write_lock_irqsave(l, f) do{ \ f = __disable_irq(); \ @@ -232,12 +238,23 @@ typedef struct rw_semaphore rwlock_t; #define read_unlock_irqrestore(l, f) do{ \ read_unlock(l); \ __enable_irq(f);}while(0) - /* * Funnel: * * Safe funnel in/out */ +#ifdef __DARWIN8__ + +#define CFS_DECL_FUNNEL_DATA +#define CFS_DECL_CONE_DATA DECLARE_FUNNEL_DATA +#define CFS_DECL_NET_DATA DECLARE_FUNNEL_DATA +#define CFS_CONE_IN do {} while(0) +#define CFS_CONE_EX do {} while(0) + +#define CFS_NET_IN do {} while(0) +#define CFS_NET_EX do {} while(0) + +#else #define CFS_DECL_FUNNEL_DATA \ boolean_t __funnel_state = FALSE; \ @@ -257,8 +274,11 @@ void lustre_net_ex(boolean_t state, funnel_t *cone); #define CFS_NET_IN lustre_net_in(&__funnel_state, &__funnel) #define CFS_NET_EX lustre_net_ex(__funnel_state, __funnel) -/* __KERNEL__ */ #endif +#else +#include +#endif /* __KERNEL__ */ + /* __XNU_CFS_LOCK_H */ #endif diff --git a/lnet/include/libcfs/darwin/darwin-mem.h b/lnet/include/libcfs/darwin/darwin-mem.h index 922a1b8..5ffcd4e 100644 --- a/lnet/include/libcfs/darwin/darwin-mem.h +++ b/lnet/include/libcfs/darwin/darwin-mem.h @@ -33,26 +33,21 @@ #include /* - * Page of OSX - * - * There is no page in OSX, however, we need page in lustre. 
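The darwin-mem.h hunk that begins below drops the ad-hoc PAGE_MASK/_ALIGN_UP/PAGE_ALIGN definitions and rebuilds CFS_PAGE_SIZE and CFS_PAGE_MASK from CFS_PAGE_SHIFT. Both versions rest on the standard power-of-two alignment arithmetic; a self-contained sketch with hypothetical EX_ names:

    #include <stdint.h>
    #include <stdio.h>

    #define EX_PAGE_SHIFT 12
    #define EX_PAGE_SIZE  (1UL << EX_PAGE_SHIFT)    /* 4096 */
    #define EX_PAGE_MASK  (~(EX_PAGE_SIZE - 1))     /* clears the offset bits */

    /* Round addr up to the next page boundary (size is a power of two). */
    static inline uintptr_t ex_page_align(uintptr_t addr)
    {
            return (addr + EX_PAGE_SIZE - 1) & EX_PAGE_MASK;
    }

    int main(void)
    {
            /* 0x1234 rounds up to 0x2000 */
            printf("%#lx\n", (unsigned long)ex_page_align(0x1234));
            return 0;
    }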
- */ -#define PAGE_MASK (~(PAGE_SIZE-1)) -#define _ALIGN_UP(addr,size) (((addr)+((size)-1))&(~((size)-1))) -#define _ALIGN(addr,size) _ALIGN_UP(addr,size) -#define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE) - -/* * Basic xnu_page struct, should be binary compatibility with * all page types in xnu (we have only xnu_raw_page, xll_page now) */ /* Variable sized pages are not supported */ +#ifdef PAGE_SHIFT +#define CFS_PAGE_SHIFT PAGE_SHIFT +#else #define CFS_PAGE_SHIFT 12 -#define CFS_PAGE_SIZE (1 << CFS_PAGE_SHIFT) -#define PAGE_CACHE_SIZE CFS_PAGE_SIZE -#define CFS_PAGE_MASK (~(CFS_PAGE_SIZE - 1)) +#endif + +#define CFS_PAGE_SIZE (1UL << CFS_PAGE_SHIFT) + +#define CFS_PAGE_MASK (~((__u64)CFS_PAGE_SIZE - 1)) enum { XNU_PAGE_RAW, @@ -98,20 +93,16 @@ void xnu_page_ops_unregister(int type); * raw page, no cache object, just like buffer */ struct xnu_raw_page { - struct xnu_page header; - vm_address_t virtual; - upl_t upl; - int order; - atomic_t count; - void *private; + struct xnu_page header; + void *virtual; + atomic_t count; + struct list_head link; }; /* * Public interface to lustre * - * - cfs_alloc_pages(f, o) * - cfs_alloc_page(f) - * - cfs_free_pages(p, o) * - cfs_free_page(p) * - cfs_kmap(p) * - cfs_kunmap(p) @@ -124,14 +115,12 @@ struct xnu_raw_page { * pages only. */ -cfs_page_t *cfs_alloc_pages(u_int32_t flags, u_int32_t order); cfs_page_t *cfs_alloc_page(u_int32_t flags); -void cfs_free_pages(cfs_page_t *pages, int order); void cfs_free_page(cfs_page_t *page); void cfs_get_page(cfs_page_t *page); int cfs_put_page_testzero(cfs_page_t *page); int cfs_page_count(cfs_page_t *page); -void cfs_set_page_count(cfs_page_t *page, int v); +#define cfs_page_index(pg) (0) void *cfs_page_address(cfs_page_t *pg); void *cfs_kmap(cfs_page_t *pg); @@ -141,48 +130,84 @@ void cfs_kunmap(cfs_page_t *pg); * Memory allocator */ -extern void *cfs_alloc(size_t nr_bytes, u_int32_t flags); -extern void cfs_free(void *addr); +void *cfs_alloc(size_t nr_bytes, u_int32_t flags); +void cfs_free(void *addr); + +void *cfs_alloc_large(size_t nr_bytes); +void cfs_free_large(void *addr); -extern void *cfs_alloc_large(size_t nr_bytes); -extern void cfs_free_large(void *addr); +extern int get_preemption_level(void); + +#define CFS_ALLOC_ATOMIC_TRY \ + (get_preemption_level() != 0 ? CFS_ALLOC_ATOMIC : 0) /* * Slab: * - * No slab in OSX, use zone allocator to fake slab + * No slab in OSX, use zone allocator to simulate slab */ #define SLAB_HWCACHE_ALIGN 0 +#ifdef __DARWIN8__ +/* + * In Darwin8, we cannot use zalloc_noblock(not exported by kernel), + * also, direct using of zone allocator is not recommended. 
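CFS_ALLOC_ATOMIC_TRY above derives the allocation flag from context: when get_preemption_level() reports a non-zero preemption-disable depth the caller must not sleep, so the atomic (non-blocking) flag is requested. The same decision as a stand-alone userspace sketch; ex_preemption_level() and the flag value are stand-ins for the kernel facilities:

    #include <stdlib.h>

    #define EX_ALLOC_ATOMIC 0x1     /* hypothetical "may not sleep" flag */

    /* Stub: in the kernel this would report the preemption-disable
     * depth, where 0 means the caller is allowed to sleep. */
    static int ex_preemption_level(void) { return 0; }

    static void *ex_backend_alloc(size_t nbytes, unsigned int flags)
    {
            (void)flags;            /* a real allocator would honour it */
            return malloc(nbytes);
    }

    static void *ex_alloc_auto(size_t nbytes)
    {
            unsigned int flags = 0;

            if (ex_preemption_level() != 0)
                    flags |= EX_ALLOC_ATOMIC;   /* cannot block here */
            return ex_backend_alloc(nbytes, flags);
    }

    int main(void)
    {
            void *p = ex_alloc_auto(64);

            free(p);
            return 0;
    }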
+ */ +#define CFS_INDIVIDUAL_ZONE (0) + +#if !CFS_INDIVIDUAL_ZONE +#include +typedef OSMallocTag mem_cache_t; +#else +typedef void* zone_t; +typedef zone_t mem_cache_t; +#endif + +#else /* !__DARWIN8__ */ + +#define CFS_INDIVIDUAL_ZONE (1) + +typedef zone_t mem_cache_t; + +#endif /* !__DARWIN8__ */ + +#define MC_NAME_MAX_LEN 64 + typedef struct cfs_mem_cache { - struct list_head link; - zone_t zone; - int size; - char name [ZONE_NAME_MAX_LEN]; + int mc_size; + mem_cache_t mc_cache; + struct list_head mc_link; + char mc_name [MC_NAME_MAX_LEN]; } cfs_mem_cache_t; #define KMEM_CACHE_MAX_COUNT 64 #define KMEM_MAX_ZONE 8192 -extern cfs_mem_cache_t * cfs_mem_cache_create (const char *, size_t, size_t, unsigned long, - void (*)(void *, cfs_mem_cache_t *, unsigned long), - void (*)(void *, cfs_mem_cache_t *, unsigned long)); -extern int cfs_mem_cache_destroy ( cfs_mem_cache_t * ); -extern void *cfs_mem_cache_alloc ( cfs_mem_cache_t *, int); -extern void cfs_mem_cache_free ( cfs_mem_cache_t *, void *); +cfs_mem_cache_t * cfs_mem_cache_create (const char *, size_t, size_t, unsigned long); +int cfs_mem_cache_destroy ( cfs_mem_cache_t * ); +void *cfs_mem_cache_alloc ( cfs_mem_cache_t *, int); +void cfs_mem_cache_free ( cfs_mem_cache_t *, void *); /* * Misc */ -/* XXX fix me */ +/* XXX Liang: num_physpages... fix me */ #define num_physpages (64 * 1024) #define CFS_DECL_MMSPACE #define CFS_MMSPACE_OPEN do {} while(0) #define CFS_MMSPACE_CLOSE do {} while(0) -#define copy_from_user(kaddr, uaddr, size) copyin((caddr_t)uaddr, (caddr_t)kaddr, size) -#define copy_to_user(uaddr, kaddr, size) copyout((caddr_t)kaddr, (caddr_t)uaddr, size) +#define copy_from_user(kaddr, uaddr, size) copyin(CAST_USER_ADDR_T(uaddr), (caddr_t)kaddr, size) +#define copy_to_user(uaddr, kaddr, size) copyout((caddr_t)kaddr, CAST_USER_ADDR_T(uaddr), size) + +#if 0 +static inline int strncpy_from_user(char *kaddr, char *uaddr, int size) +{ + size_t count; + return copyinstr((const user_addr_t)uaddr, (void *)kaddr, size, &count); +} +#endif #if defined (__ppc__) #define mb() __asm__ __volatile__ ("sync" : : : "memory") @@ -198,9 +223,10 @@ extern void cfs_mem_cache_free ( cfs_mem_cache_t *, void *); #else /* !__KERNEL__ */ -typedef struct cfs_page{ - void *foo; -} cfs_page_t; +#define CFS_CACHE_SHIFT 12 +#define PAGE_CACHE_SIZE (1 << CFS_CACHE_SHIFT) +#include + #endif /* __KERNEL__ */ #endif /* __XNU_CFS_MEM_H__ */ diff --git a/lnet/include/libcfs/darwin/darwin-prim.h b/lnet/include/libcfs/darwin/darwin-prim.h index ec9be59..00fbeed 100644 --- a/lnet/include/libcfs/darwin/darwin-prim.h +++ b/lnet/include/libcfs/darwin/darwin-prim.h @@ -9,25 +9,29 @@ #include #include -#ifndef __APPLE_API_PRIVATE -#define __APPLE_API_PRIVATE -#include -#undef __APPLE_API_PRIVATE -#else -#include -#endif +#ifndef __DARWIN8__ +# ifndef __APPLE_API_PRIVATE +# define __APPLE_API_PRIVATE +# include +# undef __APPLE_API_PRIVATE +# else +# include +# endif +# include +# include +# include +#endif /* !__DARWIN8__ */ #include #include #include -#include -#include #include #include #include #include #include +#include #include #include #include @@ -63,17 +67,19 @@ extern kern_return_t cfs_symbol_put(const char *); * User can register/unregister a list of sysctl_oids * sysctl_oid is data struct of osx's sysctl-entry */ +#define CONFIG_SYSCTL 1 + typedef struct sysctl_oid * cfs_sysctl_table_t; typedef cfs_sysctl_table_t cfs_sysctl_table_header_t; -cfs_sysctl_table_header_t *register_cfs_sysctl_table (cfs_sysctl_table_t *table, int arg); -void 
unregister_cfs_sysctl_table (cfs_sysctl_table_header_t *table); +cfs_sysctl_table_header_t *cfs_register_sysctl_table (cfs_sysctl_table_t *table, int arg); +void cfs_unregister_sysctl_table (cfs_sysctl_table_header_t *table); /* * Proc file system APIs, no /proc fs support in OSX */ -typedef struct cfs_proc_dir_entry{ +typedef struct cfs_proc_dir_entry { void *data; -}cfs_proc_dir_entry_t; +} cfs_proc_dir_entry_t; cfs_proc_dir_entry_t * cfs_create_proc_entry(char *name, int mod, cfs_proc_dir_entry_t *parent); @@ -111,12 +117,23 @@ extern kern_return_t cfs_psdev_deregister(cfs_psdev_t *); extern boolean_t assert_wait_possible(void); extern void *get_bsdtask_info(task_t); +#ifdef __DARWIN8__ + +typedef struct {} cfs_task_t; +#define cfs_current() ((cfs_task_t *)current_thread()) +#else /* !__DARWIN8__ */ + typedef struct uthread cfs_task_t; + #define current_uthread() ((struct uthread *)get_bsdthread_info(current_act())) #define cfs_current() current_uthread() +#endif /* !__DARWIN8__ */ + +#define cfs_task_lock(t) do {;} while (0) +#define cfs_task_unlock(t) do {;} while (0) + #define set_current_state(s) do {;} while (0) -#define reparent_to_init() do {;} while (0) #define CFS_DECL_JOURNAL_DATA #define CFS_PUSH_JOURNAL do {;} while(0) @@ -128,109 +145,12 @@ typedef struct uthread cfs_task_t; * * OSX kernel thread can not be created with args, * so we have to implement new APIs to create thread with args - * - * All requests to create kernel thread will create a new - * thread instance of cfs_thread_agent, one by one. - * cfs_thread_agent will call the caller's thread function - * with argument supplied by caller. */ typedef int (*cfs_thread_t)(void *); extern task_t kernel_task; -struct kernel_thread_arg -{ - spinlock_t lock; - atomic_t inuse; - cfs_thread_t func; - void *arg; -}; - -extern struct kernel_thread_arg cfs_thread_arg; -extern void cfs_thread_agent(void); - -#define THREAD_ARG_FREE 0 -#define THREAD_ARG_HOLD 1 -#define THREAD_ARG_RECV 2 - -#define set_targ_stat(a, v) atomic_set(&(a)->inuse, v) -#define get_targ_stat(a) atomic_read(&(a)->inuse) - -/* - * Hold the thread argument and set the status of thread_status - * to THREAD_ARG_HOLD, if the thread argument is held by other - * threads (It's THREAD_ARG_HOLD already), current-thread has to wait. - */ -#define thread_arg_hold(pta, _func, _arg) \ - do { \ - spin_lock(&(pta)->lock); \ - if (get_targ_stat(pta) == THREAD_ARG_FREE) { \ - set_targ_stat((pta), THREAD_ARG_HOLD); \ - (pta)->arg = (void *)_arg; \ - (pta)->func = _func; \ - spin_unlock(&(pta)->lock); \ - break; \ - } \ - spin_unlock(&(pta)->lock); \ - schedule(); \ - } while(1); \ - -/* - * Release the thread argument if the thread argument has been - * received by the child-thread (Status of thread_args is - * THREAD_ARG_RECV), otherwise current-thread has to wait. - * After release, the thread_args' status will be set to - * THREAD_ARG_FREE, and others can re-use the thread_args to - * create new kernel_thread. - */ -#define thread_arg_release(pta) \ - do { \ - spin_lock(&(pta)->lock); \ - if (get_targ_stat(pta) == THREAD_ARG_RECV) { \ - (pta)->arg = NULL; \ - (pta)->func = NULL; \ - set_targ_stat(pta, THREAD_ARG_FREE); \ - spin_unlock(&(pta)->lock); \ - break; \ - } \ - spin_unlock(&(pta)->lock); \ - schedule(); \ - } while(1) - -/* - * Receive thread argument (Used in child thread), set the status - * of thread_args to THREAD_ARG_RECV. 
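The block being deleted in this hunk passed a new thread its start function and argument through one global slot (cfs_thread_arg), serialised by a spinlock and the THREAD_ARG_FREE/HOLD/RECV handshake, because the native thread-creation call takes no argument. A common alternative is to heap-allocate one argument package per thread and transfer its ownership to the child, which removes the global rendezvous entirely. A sketch of that pattern with POSIX threads, purely for illustration; kernel code would use its own primitives:

    #include <pthread.h>
    #include <stdlib.h>

    /* One package per thread: no shared slot, no handshake. */
    struct ex_thread_arg {
            int  (*func)(void *);
            void  *arg;
    };

    static void *ex_trampoline(void *p)
    {
            struct ex_thread_arg ta = *(struct ex_thread_arg *)p;

            free(p);                /* the child owns the package */
            (void)ta.func(ta.arg);
            return NULL;
    }

    static int ex_kernel_thread(int (*func)(void *), void *arg)
    {
            struct ex_thread_arg *ta = malloc(sizeof(*ta));
            pthread_t t;

            if (ta == NULL)
                    return -1;
            ta->func = func;
            ta->arg  = arg;
            if (pthread_create(&t, NULL, ex_trampoline, ta) != 0) {
                    free(ta);
                    return -1;
            }
            return pthread_detach(t);
    }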
- */ -#define __thread_arg_recv_fin(pta, _func, _arg, fin) \ - do { \ - spin_lock(&(pta)->lock); \ - if (get_targ_stat(pta) == THREAD_ARG_HOLD) { \ - if (fin) \ - set_targ_stat(pta, THREAD_ARG_RECV);\ - _arg = (pta)->arg; \ - _func = (pta)->func; \ - spin_unlock(&(pta)->lock); \ - break; \ - } \ - spin_unlock(&(pta)->lock); \ - schedule(); \ - } while (1); \ - -/* - * Just set the thread_args' status to THREAD_ARG_RECV - */ -#define thread_arg_fin(pta) \ - do { \ - spin_lock(&(pta)->lock); \ - assert( get_targ_stat(pta) == THREAD_ARG_HOLD); \ - set_targ_stat(pta, THREAD_ARG_RECV); \ - spin_unlock(&(pta)->lock); \ - } while(0) - -#define thread_arg_recv(pta, f, a) __thread_arg_recv_fin(pta, f, a, 1) -#define thread_arg_keep(pta, f, a) __thread_arg_recv_fin(pta, f, a, 0) - /* * cloning flags, no use in OSX, just copy them from Linux */ @@ -265,11 +185,16 @@ typedef struct cfs_waitlink { struct ksleep_link wl_ksleep_link; } cfs_waitlink_t; +typedef int cfs_task_state_t; + +#define CFS_TASK_INTERRUPTIBLE THREAD_ABORTSAFE +#define CFS_TASK_UNINT THREAD_UNINT + void cfs_waitq_init(struct cfs_waitq *waitq); void cfs_waitlink_init(struct cfs_waitlink *link); void cfs_waitq_add(struct cfs_waitq *waitq, struct cfs_waitlink *link); -void cfs_waitq_add_exclusive(struct cfs_waitq *waitq, +void cfs_waitq_add_exclusive(struct cfs_waitq *waitq, struct cfs_waitlink *link); void cfs_waitq_forward(struct cfs_waitlink *link, struct cfs_waitq *waitq); void cfs_waitq_del(struct cfs_waitq *waitq, struct cfs_waitlink *link); @@ -279,29 +204,37 @@ void cfs_waitq_signal(struct cfs_waitq *waitq); void cfs_waitq_signal_nr(struct cfs_waitq *waitq, int nr); void cfs_waitq_broadcast(struct cfs_waitq *waitq); -void cfs_waitq_wait(struct cfs_waitlink *link); -cfs_duration_t cfs_waitq_timedwait(struct cfs_waitlink *link, +void cfs_waitq_wait(struct cfs_waitlink *link, cfs_task_state_t state); +cfs_duration_t cfs_waitq_timedwait(struct cfs_waitlink *link, + cfs_task_state_t state, cfs_duration_t timeout); /* * Thread schedule APIs. 
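In the declarations above, cfs_waitq_wait() and cfs_waitq_timedwait() now take an explicit cfs_task_state_t, so each sleep site chooses between an interruptible wait (CFS_TASK_INTERRUPTIBLE, mapped to THREAD_ABORTSAFE, where a signal aborts the sleep) and an uninterruptible one (CFS_TASK_UNINT). The wait-event loops built on top of them follow the classic enqueue, re-check, sleep pattern. The same shape expressed with POSIX primitives, for illustration only:

    #include <pthread.h>
    #include <stdbool.h>

    struct ex_waitq {
            pthread_mutex_t lock;
            pthread_cond_t  cond;
            bool            ready;  /* the awaited condition */
    };

    static void ex_wait_event(struct ex_waitq *wq)
    {
            pthread_mutex_lock(&wq->lock);
            /* Re-check in a loop: wakeups may be spurious or stale. */
            while (!wq->ready)
                    pthread_cond_wait(&wq->cond, &wq->lock);
            pthread_mutex_unlock(&wq->lock);
    }

    static void ex_wake_up(struct ex_waitq *wq)
    {
            pthread_mutex_lock(&wq->lock);
            wq->ready = true;
            pthread_cond_broadcast(&wq->cond);
            pthread_mutex_unlock(&wq->lock);
    }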
*/ #define MAX_SCHEDULE_TIMEOUT ((long)(~0UL>>12)) +extern void thread_set_timer_deadline(uint64_t deadline); +extern void thread_cancel_timer(void); -static inline int schedule_timeout(int64_t timeout) +static inline int cfs_schedule_timeout(int state, int64_t timeout) { int result; - AbsoluteTime clock_current; - AbsoluteTime clock_delay; - result = assert_wait((event_t)current_uthread(), THREAD_UNINT); - clock_get_uptime(&clock_current); - nanoseconds_to_absolutetime(timeout, &clock_delay); - ADD_ABSOLUTETIME(&clock_current, &clock_delay); - thread_set_timer_deadline(clock_current); +#ifdef __DARWIN8__ + result = assert_wait((event_t)current_thread(), state); +#else + result = assert_wait((event_t)current_uthread(), state); +#endif + if (timeout > 0) { + uint64_t expire; + nanoseconds_to_absolutetime(timeout, &expire); + clock_absolutetime_interval_to_deadline(expire, &expire); + thread_set_timer_deadline(expire); + } if (result == THREAD_WAITING) result = thread_block(THREAD_CONTINUE_NULL); - thread_cancel_timer(); + if (timeout > 0) + thread_cancel_timer(); if (result == THREAD_TIMED_OUT) result = 0; else @@ -309,47 +242,80 @@ static inline int schedule_timeout(int64_t timeout) return result; } -#define schedule() \ - do { \ - if (assert_wait_possible()) \ - schedule_timeout(1); \ - else \ - schedule_timeout(0); \ - } while (0) +#define cfs_schedule() cfs_schedule_timeout(CFS_TASK_UNINT, CFS_TICK) +#define cfs_pause(tick) cfs_schedule_timeout(CFS_TASK_UNINT, tick) + +#define __wait_event(wq, condition) \ +do { \ + struct cfs_waitlink __wait; \ + \ + cfs_waitlink_init(&__wait); \ + for (;;) { \ + cfs_waitq_add(&wq, &__wait); \ + if (condition) \ + break; \ + cfs_waitq_wait(&__wait, CFS_TASK_UNINT); \ + cfs_waitq_del(&wq, &__wait); \ + } \ + cfs_waitq_del(&wq, &__wait); \ +} while (0) -#define __wait_event(wq, condition) \ -do { \ - struct cfs_waitlink __wait; \ - \ - cfs_waitlink_init(&__wait); \ - for (;;) { \ - cfs_waitq_add(&wq, &__wait); \ - if (condition) \ - break; \ - cfs_waitq_wait(&__wait); \ - cfs_waitq_del(&wq, &__wait); \ - } \ - cfs_waitq_del(&wq, &__wait); \ +#define wait_event(wq, condition) \ +do { \ + if (condition) \ + break; \ + __wait_event(wq, condition); \ } while (0) -#define wait_event(wq, condition) \ -do { \ - if (condition) \ - break; \ - __wait_event(wq, condition); \ +#define __wait_event_interruptible(wq, condition, ex, ret) \ +do { \ + struct cfs_waitlink __wait; \ + \ + cfs_waitlink_init(&__wait); \ + for (;;) { \ + if (ex == 0) \ + cfs_waitq_add(&wq, &__wait); \ + else \ + cfs_waitq_add_exclusive(&wq, &__wait); \ + if (condition) \ + break; \ + if (!cfs_signal_pending()) { \ + cfs_waitq_wait(&__wait, \ + CFS_TASK_INTERRUPTIBLE); \ + cfs_waitq_del(&wq, &__wait); \ + continue; \ + } \ + ret = -ERESTARTSYS; \ + break; \ + } \ + cfs_waitq_del(&wq, &__wait); \ } while (0) -#define wait_event_interruptible(wq, condition) \ -({ \ - wait_event(wq, condition); \ - 0; \ +#define wait_event_interruptible(wq, condition) \ +({ \ + int __ret = 0; \ + if (!condition) \ + __wait_event_interruptible(wq, condition, \ + 0, __ret); \ + __ret; \ }) +#define wait_event_interruptible_exclusive(wq, condition) \ +({ \ + int __ret = 0; \ + if (!condition) \ + __wait_event_interruptible(wq, condition, \ + 1, __ret); \ + __ret; \ +}) + +#ifndef __DARWIN8__ extern void wakeup_one __P((void * chan)); +#endif /* only used in tests */ -#define wake_up_process(p) \ - do { \ - wakeup_one(p); \ +#define wake_up_process(p) \ + do { \ + wakeup_one((caddr_t)p); \ } while (0) /* used in 
couple of places */ @@ -359,48 +325,19 @@ static inline void sleep_on(cfs_waitq_t *waitq) cfs_waitlink_init(&link); cfs_waitq_add(waitq, &link); - cfs_waitq_wait(&link); + cfs_waitq_wait(&link, CFS_TASK_UNINT); cfs_waitq_del(waitq, &link); } /* - * XXX * Signal */ -#define cfs_sigmask_lock(t, f) do { f = 0; } while(0) -#define cfs_sigmask_unlock(t, f) do { f = 0; } while(0) -#define cfs_signal_pending(t) (0) - -#define cfs_siginitset(pmask, sigs) \ - do { \ - sigset_t __sigs = sigs & (~sigcantmask); \ - *(pmask) = __sigs; \ - } while(0) - -#define cfs_siginitsetinv(pmask, sigs) \ - do { \ - sigset_t __sigs = ~(sigs | sigcantmask); \ - *(pmask) = __sigs; \ - } while(0) - -#define cfs_recalc_sigpending(ut) \ - do { \ - (ut)->uu_siglist = (ut)->uu_siglist & ~(ut)->uu_sigmask;\ - } while (0) -#define cfs_sigfillset(s) \ - do { \ - memset((s), -1, sizeof(sigset_t)); \ - } while(0) - -#define cfs_set_sig_blocked(ut, b) do {(ut)->uu_sigmask = b;} while(0) -#define cfs_get_sig_blocked(ut) (&(ut)->uu_sigmask) +typedef sigset_t cfs_sigset_t; #define SIGNAL_MASK_ASSERT() - /* * Timer */ - typedef struct cfs_timer { struct ktimer t; } cfs_timer_t; @@ -434,20 +371,27 @@ cfs_time_t cfs_timer_deadline(struct cfs_timer *t); /* * CPU */ -#include /* Run in PowerG5 who is PPC64 */ #define SMP_CACHE_BYTES 128 #define __cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) -/* XXX How to get the value of NCPUS from xnu ? */ #define NR_CPUS 2 -#define smp_processor_id() cpu_number() -#define smp_num_cpus NR_CPUS + +/* + * XXX Liang: patch xnu and export current_processor()? + * + * #define smp_processor_id() current_processor() + */ +#define smp_processor_id() 0 /* XXX smp_call_function is not supported in xnu */ #define smp_call_function(f, a, n, w) do {} while(0) +int cfs_online_cpus(void); +#define smp_num_cpus cfs_online_cpus() /* * Misc */ +extern int is_suser(void); + #ifndef likely #define likely(exp) (exp) #endif @@ -458,11 +402,9 @@ cfs_time_t cfs_timer_deadline(struct cfs_timer *t); #define lock_kernel() do {} while(0) #define unlock_kernel() do {} while(0) -#define exit_mm(t) do {} while(0) -#define exit_files(t) do {} while(0) - -#define CAP_SYS_ADMIN 0 -#define capable(a) suser(current_proc()->p_ucred, &(current_proc()->p_acflag)) +#define CAP_SYS_BOOT 0 +#define CAP_SYS_ADMIN 1 +#define capable(a) ((a) == CAP_SYS_BOOT ? 
is_suser(): is_suser1()) #define USERMODEHELPER(path, argv, envp) (0) @@ -499,6 +441,11 @@ struct __dummy_ ## name ## _struct {} #define inter_module_get(n) cfs_symbol_get(n) #define inter_module_put(n) cfs_symbol_put(n) +static inline int request_module(char *name) +{ + return (-EINVAL); +} + #ifndef __exit #define __exit #endif @@ -517,7 +464,7 @@ struct __dummy_ ## name ## _struct {} #define LINUX_VERSION_CODE KERNEL_VERSION(2,5,0) #define NR_IRQS 512 -#define in_interrupt() (0) +#define in_interrupt() ml_at_interrupt_context() #define KERN_EMERG "<0>" /* system is unusable */ #define KERN_ALERT "<1>" /* action must be taken immediately */ @@ -534,21 +481,47 @@ static inline long PTR_ERR(const void *ptr) } #define ERR_PTR(err) ((void *)err) +#define IS_ERR(p) ((unsigned long)(p) + 1000 < 1000) + +#else /* !__KERNEL__ */ -/* XXX */ -#define IS_ERR(p) (0) +typedef struct cfs_proc_dir_entry { + void *data; +} cfs_proc_dir_entry_t; + +#include +#define __WORDSIZE 32 +#endif /* END __KERNEL__ */ /* * Error number */ +#ifndef EPROTO +#define EPROTO EPROTOTYPE +#endif +#ifndef EBADR #define EBADR EBADRPC -#define ERESTARTSYS ERESTART +#endif +#ifndef ERESTARTSYS +#define ERESTARTSYS 512 +#endif +#ifndef EDEADLOCK #define EDEADLOCK EDEADLK +#endif +#ifndef ECOMM #define ECOMM EINVAL +#endif +#ifndef ENODATA #define ENODATA EINVAL +#endif +#ifndef ENOTSUPP +#define ENOTSUPP EINVAL +#endif +#if BYTE_ORDER == BIG_ENDIAN +# define __BIG_ENDIAN #else -#define __WORDSIZE 32 -#endif /* __KERNEL__ */ +# define __LITTLE_ENDIAN +#endif -#endif /* __LINUX__ */ +#endif /* __LIBCFS_DARWIN_CFS_PRIM_H__ */ diff --git a/lnet/include/libcfs/darwin/darwin-sync.h b/lnet/include/libcfs/darwin/darwin-sync.h index 3374f43..5a3fabd 100644 --- a/lnet/include/libcfs/darwin/darwin-sync.h +++ b/lnet/include/libcfs/darwin/darwin-sync.h @@ -1,7 +1,8 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Lustre Light Super operations + * Implementation of standard libcfs synchronization primitives for XNU + * kernel. * * Copyright (c) 2004 Cluster File Systems, Inc. * @@ -33,7 +34,7 @@ #error Do not #include this file directly. #include instead #endif -#define XNU_SYNC_DEBUG (0) +#define XNU_SYNC_DEBUG (1) #if XNU_SYNC_DEBUG #define ON_SYNC_DEBUG(e) e @@ -48,6 +49,7 @@ enum { KCOND_MAGIC = 0xb01dface, KRW_MAGIC = 0xdabb1edd, KSPIN_MAGIC = 0xca11ab1e, + KRW_SPIN_MAGIC = 0xbabeface, KSLEEP_CHAN_MAGIC = 0x0debac1e, KSLEEP_LINK_MAGIC = 0xacc01ade, KTIMER_MAGIC = 0xbefadd1e @@ -60,25 +62,63 @@ enum { */ #define SMP (1) +#include + +#ifdef __DARWIN8__ + +#include +#include +#include +#include + +/* + * hw_lock is not available in Darwin8 (hw_lock_* are not exported at all), + * so use lck_spin_t. we can hack out lck_spin_t easily, it's the only + * hacking in Darwin8.x. We did so because it'll take a lot of time to + * add lock_done for all locks, maybe it should be done in the future. + * If lock_done for all locks were added, we can: + * + * typedef lck_spin_t *xnu_spin_t; + */ +#if defined (__ppc__) +typedef struct { + unsigned int opaque[3]; +} xnu_spin_t; +#elif defined (__i386__) +typedef struct { + unsigned int opaque[10]; +} xnu_spin_t; +#endif + +/* + * wait_queue is not available in Darwin8 (wait_queue_* are not exported), + * use assert_wait/wakeup/wake_one (wait_queue in kernel hash). 
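The xnu_spin_t definitions above deal with Darwin8 no longer exporting the lck_spin_t layout: the header reserves an opaque blob whose size matches the private type on each architecture, and the implementation casts at the call sites. Where the real definition is visible, the two can be pinned together with a compile-time size assertion; a sketch in which hidden_lock stands in for the private kernel type:

    #include <stddef.h>

    /* Pretend this is the real, normally hidden, lock layout. */
    struct hidden_lock {
            unsigned int word[3];
    };

    /* The opaque stand-in published in the portable header. */
    typedef struct {
            unsigned int opaque[3];
    } ex_spin_t;

    /* Compile-time check (C89-compatible negative-array trick): the
     * build breaks here if the stand-in is ever too small. */
    typedef char ex_spin_size_check[
            sizeof(ex_spin_t) >= sizeof(struct hidden_lock) ? 1 : -1];

The patched headers cannot apply such a check themselves because the real layout is hidden from them, which is exactly why the opaque sizes are hard-coded per architecture.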
+ */ +typedef void * xnu_wait_queue_t; + +/* DARWIN8 */ +#else + +#include +#include #include -#include +typedef hw_lock_data_t xnu_spin_t; +typedef struct wait_queue xnu_wait_queue_t; + +/* DARWIN8 */ +#endif struct kspin { #if SMP - hw_lock_data_t lock; + xnu_spin_t lock; #endif #if XNU_SYNC_DEBUG - unsigned magic; - thread_t owner; + unsigned magic; + thread_t owner; #endif }; -/* - * XXX nikita: we cannot use simple_* functions, because bsd/sys/lock.h - * redefines them to nothing. Use low-level hw_lock_* instead. - */ - void kspin_init(struct kspin *spin); void kspin_done(struct kspin *spin); void kspin_lock(struct kspin *spin); @@ -98,11 +138,27 @@ int kspin_isnotlocked(struct kspin *spin); #define kspin_isnotlocked(s) (1) #endif +/* ------------------------- rw spinlock ----------------------- */ +struct krw_spin { + struct kspin guard; + int count; +#if XNU_SYNC_DEBUG + unsigned magic; +#endif +}; + +void krw_spin_init(struct krw_spin *sem); +void krw_spin_done(struct krw_spin *sem); +void krw_spin_down_r(struct krw_spin *sem); +void krw_spin_down_w(struct krw_spin *sem); +void krw_spin_up_r(struct krw_spin *sem); +void krw_spin_up_w(struct krw_spin *sem); + /* ------------------------- semaphore ------------------------- */ struct ksem { struct kspin guard; - struct wait_queue q; + xnu_wait_queue_t q; int value; #if XNU_SYNC_DEBUG unsigned magic; @@ -225,20 +281,20 @@ void ksleep_link_done(struct ksleep_link *link); void ksleep_add(struct ksleep_chan *chan, struct ksleep_link *link); void ksleep_del(struct ksleep_chan *chan, struct ksleep_link *link); -void ksleep_wait(struct ksleep_chan *chan); -int64_t ksleep_timedwait(struct ksleep_chan *chan, uint64_t timeout); +void ksleep_wait(struct ksleep_chan *chan, int state); +int64_t ksleep_timedwait(struct ksleep_chan *chan, int state, uint64_t timeout); void ksleep_wake(struct ksleep_chan *chan); void ksleep_wake_all(struct ksleep_chan *chan); void ksleep_wake_nr(struct ksleep_chan *chan, int nr); -#define KSLEEP_LINK_DECLARE(name) \ -{ \ - .flags = 0, \ - .event = 0, \ - .hits = 0, \ - .linkage = PTL_LIST_HEAD_INIT(name.linkage), \ - .magic = KSLEEP_LINK_MAGIC \ +#define KSLEEP_LINK_DECLARE(name) \ +{ \ + .flags = 0, \ + .event = 0, \ + .hits = 0, \ + .linkage = CFS_LIST_HEAD(name.linkage), \ + .magic = KSLEEP_LINK_MAGIC \ } /* ------------------------- timer ------------------------- */ diff --git a/lnet/include/libcfs/darwin/darwin-tcpip.h b/lnet/include/libcfs/darwin/darwin-tcpip.h new file mode 100644 index 0000000..1a73891 --- /dev/null +++ b/lnet/include/libcfs/darwin/darwin-tcpip.h @@ -0,0 +1,90 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Basic library routines. 
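struct krw_spin just above implements reader/writer exclusion as a small guarded counter: readers take the guard, bump the count, and drop the guard, while a writer waits for the count to drain. An equivalent shape using C11 atomics, as a userspace illustration only; like any scheme this simple it can starve writers:

    #include <stdatomic.h>
    #include <sched.h>

    /* count > 0: that many readers; count == -1: one writer; 0: free. */
    struct ex_rw_spin {
            atomic_int count;
    };

    static void ex_rw_down_r(struct ex_rw_spin *rw)
    {
            for (;;) {
                    int c = atomic_load(&rw->count);

                    if (c >= 0 &&
                        atomic_compare_exchange_weak(&rw->count, &c, c + 1))
                            return;
                    sched_yield();  /* spin politely in userspace */
            }
    }

    static void ex_rw_up_r(struct ex_rw_spin *rw)
    {
            atomic_fetch_sub(&rw->count, 1);
    }

    static void ex_rw_down_w(struct ex_rw_spin *rw)
    {
            for (;;) {
                    int expected = 0;

                    if (atomic_compare_exchange_weak(&rw->count,
                                                     &expected, -1))
                            return;
                    sched_yield();
            }
    }

    static void ex_rw_up_w(struct ex_rw_spin *rw)
    {
            atomic_store(&rw->count, 0);
    }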
+ * + */ + +#ifndef __LIBCFS_DARWIN_TCPIP_H__ +#define __LIBCFS_DARWIN_TCPIP_H__ + +#ifdef __KERNEL__ +#include + +#ifdef __DARWIN8__ + +struct socket; + +typedef void (*so_upcall)(socket_t sock, void* arg, int waitf); + +#define CFS_SOCK_UPCALL 0x1 +#define CFS_SOCK_DOWN 0x2 + +#define CFS_SOCK_MAGIC 0xbabeface + +typedef struct cfs_socket { + socket_t s_so; + int s_magic; + int s_flags; + so_upcall s_upcall; + void *s_upcallarg; +} cfs_socket_t; + + +/* cfs_socket_t to bsd socket */ +#define C2B_SOCK(s) ((s)->s_so) + +static inline int get_sock_intopt(socket_t so, int opt) +{ + int val, len; + int rc; + + /* + * sock_getsockopt will take a lock(mutex) for socket, + * so it can be blocked. So be careful while using + * them. + */ + len = sizeof(val); + rc = sock_getsockopt(so, SOL_SOCKET, opt, &val, &len); + assert(rc == 0); + return val; +} + +#define SOCK_ERROR(s) get_sock_intopt(C2B_SOCK(s), SO_ERROR) +/* #define SOCK_WMEM_QUEUED(s) (0) */ +#define SOCK_WMEM_QUEUED(s) get_sock_intopt(C2B_SOCK(s), SO_NWRITE) +/* XXX Liang: no reliable way to get it in Darwin8.x */ +#define SOCK_TEST_NOSPACE(s) (0) + +void libcfs_sock_set_cb(cfs_socket_t *sock, so_upcall callback, void *arg); +void libcfs_sock_reset_cb(cfs_socket_t *sock); + +#else /* !__DARWIN8__ */ + +#define SOCK_WMEM_QUEUED(so) ((so)->so_snd.sb_cc) +#define SOCK_ERROR(so) ((so)->so_error) + +#define SOCK_TEST_NOSPACE(so) (sbspace(&(so)->so_snd) < (so)->so_snd.sb_lowat) + +#endif /* !__DARWIN8__ */ + +#endif /* __KERNEL END */ + +#endif /* __XNU_CFS_TYPES_H__ */ diff --git a/lnet/include/libcfs/darwin/darwin-time.h b/lnet/include/libcfs/darwin/darwin-time.h index d6230ad..43ad274 100644 --- a/lnet/include/libcfs/darwin/darwin-time.h +++ b/lnet/include/libcfs/darwin/darwin-time.h @@ -64,48 +64,46 @@ * int cfs_fs_time_before (cfs_fs_time_t *, cfs_fs_time_t *); * int cfs_fs_time_beforeq(cfs_fs_time_t *, cfs_fs_time_t *); * - * cfs_duration_t cfs_time_minimal_timeout(void) - * * CFS_TIME_FORMAT * CFS_DURATION_FORMAT * */ #define ONE_BILLION ((u_int64_t)1000000000) -#define ONE_MILLION ((u_int64_t) 1000000) +#define ONE_MILLION 1000000 #ifdef __KERNEL__ #include #include -#ifndef __APPLE_API_PRIVATE -#define __APPLE_API_PRIVATE -#include -#undef __APPLE_API_PRIVATE -#else -#include -#endif - #include -#include #include -#include -#include #include -#include -#include -#include -#include #include -#include #include -#include #include #include #include +/* + * There are three way to measure time in OS X: + * 1. nanoseconds + * 2. absolute time (abstime unit equal to the length of one bus cycle), + * schedule of thread/timer are counted by absolute time, but abstime + * in different mac can be different also, so we wouldn't use it. + * 3. clock interval (1sec = 100hz). But clock interval only taken by KPI + * like tsleep(). + * + * We use nanoseconds (uptime, not calendar time) + * + * clock_get_uptime() :get absolute time since bootup. 
+ * nanouptime() :get nanoseconds since bootup + * microuptime() :get microseonds since bootup + * nanotime() :get nanoseconds since epoch + * microtime() :get microseconds since epoch + */ typedef u_int64_t cfs_time_t; /* nanoseconds */ typedef int64_t cfs_duration_t; @@ -118,15 +116,15 @@ static inline cfs_time_t cfs_time_current(void) { struct timespec instant; - nanotime(&instant); - return ((u_int64_t)instant.tv_sec) * ONE_BILLION + instant.tv_nsec; + nanouptime(&instant); + return ((u_int64_t)instant.tv_sec) * NSEC_PER_SEC + instant.tv_nsec; } static inline time_t cfs_time_current_sec(void) { struct timespec instant; - nanotime(&instant); + nanouptime(&instant); return instant.tv_sec; } @@ -152,7 +150,7 @@ static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2) static inline void cfs_fs_time_current(cfs_fs_time_t *t) { - *t = time; + microtime((struct timeval *)t); } static inline time_t cfs_fs_time_sec(cfs_fs_time_t *t) @@ -160,12 +158,6 @@ static inline time_t cfs_fs_time_sec(cfs_fs_time_t *t) return t->tv_sec; } -static inline cfs_duration_t cfs_duration_build(int64_t nano) -{ - return nano; -} - - static inline void cfs_fs_time_usec(cfs_fs_time_t *t, struct timeval *v) { *v = *t; @@ -174,17 +166,12 @@ static inline void cfs_fs_time_usec(cfs_fs_time_t *t, struct timeval *v) static inline void cfs_fs_time_nsec(cfs_fs_time_t *t, struct timespec *s) { s->tv_sec = t->tv_sec; - s->tv_nsec = t->tv_usec * 1000; + s->tv_nsec = t->tv_usec * NSEC_PER_USEC; } static inline cfs_duration_t cfs_time_seconds(int seconds) { - return cfs_duration_build(ONE_BILLION * (int64_t)seconds); -} - -static inline cfs_time_t cfs_time_shift(int seconds) -{ - return cfs_time_add(cfs_time_current(), cfs_time_seconds(seconds)); + return (NSEC_PER_SEC * (int64_t)seconds); } /* @@ -192,7 +179,7 @@ static inline cfs_time_t cfs_time_shift(int seconds) */ static inline int64_t __cfs_fs_time_flat(cfs_fs_time_t *t) { - return ((int64_t)t->tv_sec) * ONE_BILLION + t->tv_usec; + return ((int64_t)t->tv_sec)*NSEC_PER_SEC + t->tv_usec*NSEC_PER_USEC; } static inline int cfs_fs_time_before(cfs_fs_time_t *t1, cfs_fs_time_t *t2) @@ -207,29 +194,33 @@ static inline int cfs_fs_time_beforeq(cfs_fs_time_t *t1, cfs_fs_time_t *t2) static inline time_t cfs_duration_sec(cfs_duration_t d) { - return d / ONE_BILLION; + return d / NSEC_PER_SEC; } static inline void cfs_duration_usec(cfs_duration_t d, struct timeval *s) { - s->tv_sec = d / ONE_BILLION; - s->tv_usec = (d - s->tv_sec * ONE_BILLION) / 1000; + s->tv_sec = d / NSEC_PER_SEC; + s->tv_usec = (d - ((int64_t)s->tv_sec) * NSEC_PER_SEC) / NSEC_PER_USEC; } static inline void cfs_duration_nsec(cfs_duration_t d, struct timespec *s) { - s->tv_sec = d / ONE_BILLION; - s->tv_nsec = d - ((int64_t)s->tv_sec) * ONE_BILLION; + s->tv_sec = d / NSEC_PER_SEC; + s->tv_nsec = d - ((int64_t)s->tv_sec) * NSEC_PER_SEC; } -static inline cfs_duration_t cfs_time_minimal_timeout(void) -{ - return ONE_BILLION / (u_int64_t)hz; -} +#define cfs_time_current_64 cfs_time_current +#define cfs_time_add_64 cfs_time_add +#define cfs_time_shift_64 cfs_time_shift +#define cfs_time_before_64 cfs_time_before -/* inline function cfs_time_minimal_timeout() can not be used to - * initiallize static variable */ -#define CFS_MIN_DELAY (ONE_BILLION / (u_int64_t)100) +/* + * One jiffy (in nanoseconds) + * + * osfmk/kern/sched_prim.c + * #define DEFAULT_PREEMPTION_RATE 100 + */ +#define CFS_TICK (NSEC_PER_SEC / (u_int64_t)100) #define LTIME_S(t) (t) diff --git a/lnet/include/libcfs/darwin/darwin-types.h 
b/lnet/include/libcfs/darwin/darwin-types.h index b2762c0..0fd2966 100644 --- a/lnet/include/libcfs/darwin/darwin-types.h +++ b/lnet/include/libcfs/darwin/darwin-types.h @@ -32,21 +32,23 @@ #include #include -typedef u_int8_t __u8; -typedef u_int16_t __u16; -typedef u_int32_t __u32; -typedef u_int64_t __u64; +#ifndef _BLKID_TYPES_H +#define _BLKID_TYPES_H +#endif + +typedef u_int8_t __u8; +typedef u_int16_t __u16; +typedef u_int32_t __u32; +typedef u_int64_t __u64; +typedef int8_t __s8; +typedef int16_t __s16; +typedef int32_t __s32; +typedef int64_t __s64; #ifdef __KERNEL__ #include -#ifndef __s32 -typedef __signed__ int __s32; -#endif -#ifndef __s64 -typedef __signed__ long long __s64; -#endif typedef struct { int e; } event_chan_t; typedef dev_t kdev_t; @@ -61,22 +63,30 @@ typedef struct { volatile uint32_t counter; } atomic_t; #define ATOMIC_INIT(i) { (i) } #define atomic_read(a) ((a)->counter) #define atomic_set(a, v) (((a)->counter) = (v)) +#ifdef __DARWIN8__ +#define atomic_add(v, a) OSAddAtomic(v, (SInt32 *)&((a)->counter)) +#define atomic_sub(v, a) OSAddAtomic(-(v), (SInt32 *)&((a)->counter)) +#define atomic_inc(a) OSIncrementAtomic((SInt32 *)&((a)->counter)) +#define atomic_dec(a) OSDecrementAtomic((SInt32 *)&((a)->counter)) +#else /* !__DARWIN8__ */ #define atomic_add(v, a) hw_atomic_add((uint32_t *)&((a)->counter), v) #define atomic_sub(v, a) hw_atomic_sub((uint32_t *)&((a)->counter), v) #define atomic_inc(a) atomic_add(1, a) #define atomic_dec(a) atomic_sub(1, a) -#define atomic_sub_and_test(v, a) ( atomic_sub(v, a) == 0 ) -#define atomic_dec_and_test(a) ( atomic_dec(a) == 0 ) +#endif /* !__DARWIN8__ */ +#define atomic_sub_and_test(v, a) ( atomic_sub(v, a) == -(a) ) +#define atomic_dec_and_test(a) ( atomic_dec(a) == 1 ) #include -typedef uint64_t loff_t; +typedef off_t loff_t; #else /* !__KERNEL__ */ #include -typedef uint64_t loff_t; +typedef off_t loff_t; #endif /* __KERNEL END */ +typedef unsigned short umode_t; #endif /* __XNU_CFS_TYPES_H__ */ diff --git a/lnet/include/libcfs/darwin/darwin-utils.h b/lnet/include/libcfs/darwin/darwin-utils.h index 4e91db9..0f808a2 100644 --- a/lnet/include/libcfs/darwin/darwin-utils.h +++ b/lnet/include/libcfs/darwin/darwin-utils.h @@ -1,5 +1,5 @@ -#ifndef __LIBCFS_DARWIN_XNU_UTILS_H__ -#define __LIBCFS_DARWIN_XNU_UTILS_H__ +#ifndef __LIBCFS_DARWIN_UTILS_H__ +#define __LIBCFS_DARWIN_UTILS_H__ #ifndef __LIBCFS_LIBCFS_H__ #error Do not #include this file directly. 
#include instead @@ -57,4 +57,11 @@ char * ul2dstr(unsigned long address, char *buf, int len); #define HIPQUAD NIPQUAD +#ifndef LIST_CIRCLE +#define LIST_CIRCLE(elm, field) \ + do { \ + (elm)->field.le_prev = &(elm)->field.le_next; \ + } while (0) +#endif + #endif /* __XNU_UTILS_H__ */ diff --git a/lnet/include/libcfs/darwin/kp30.h b/lnet/include/libcfs/darwin/kp30.h index 5c1acc4..f9e94b1 100644 --- a/lnet/include/libcfs/darwin/kp30.h +++ b/lnet/include/libcfs/darwin/kp30.h @@ -22,17 +22,20 @@ #include #include -#include +#include -#define our_cond_resched() schedule_timeout(1); +#define our_cond_resched() cfs_schedule_timeout(CFS_TASK_INTERRUPTIBLE, 1) #ifdef CONFIG_SMP #define LASSERT_SPIN_LOCKED(lock) do {} while(0) /* XXX */ #else #define LASSERT_SPIN_LOCKED(lock) do {} while(0) #endif +#define LASSERT_SEM_LOCKED(sem) do {} while(0) /* XXX */ -#define LBUG_WITH_LOC(file, func, line) portals_catastrophe = 1 +#define LIBCFS_PANIC(msg) panic(msg) +#error libcfs_register_panic_notifier() missing +#error libcfs_unregister_panic_notifier() missing /* --------------------------------------------------------------------- */ @@ -45,7 +48,14 @@ #define PORTAL_MODULE_USE do{int i = 0; i++;}while(0) #define PORTAL_MODULE_UNUSE do{int i = 0; i--;}while(0) -#define printk(format, args...) printf(format, ## args) +#define num_online_cpus() cfs_online_cpus() + +/******************************************************************************/ +/* XXX Liang: There is no module parameter supporting in OSX */ +#define CFS_MODULE_PARM(name, t, type, perm, desc) + +#define CFS_SYSFS_MODULE_PARM 0 /* no sysfs access to module parameters */ +/******************************************************************************/ #else /* !__KERNEL__ */ # include @@ -57,30 +67,31 @@ # include #endif +#define BITS_PER_LONG LONG_BIT /******************************************************************************/ /* Light-weight trace * Support for temporary event tracing with minimal Heisenberg effect. 
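The lwt_event_t record declared just below is the unit of this trace: one timestamp, one call-site pointer and four words of payload, written into a preallocated ring so that logging an event costs only a handful of stores. A single-threaded sketch of such a ring with hypothetical ex_ names; the real implementation keeps one ring per CPU to stay lock-free:

    /* Fixed-size trace records in a power-of-two ring. */
    struct ex_lwt_event {
            long long   when;       /* e.g. a cycle counter */
            const char *where;      /* call-site identification */
            long        p1, p2, p3, p4;
    };

    #define EX_LWT_EVENTS 1024      /* power of two: cheap index wrap */

    static struct ex_lwt_event ex_lwt_ring[EX_LWT_EVENTS];
    static unsigned int ex_lwt_next;

    static inline void ex_lwt_event(long long now, const char *where,
                                    long p1, long p2, long p3, long p4)
    {
            struct ex_lwt_event *e =
                    &ex_lwt_ring[ex_lwt_next++ & (EX_LWT_EVENTS - 1)];

            e->when  = now;
            e->where = where;
            e->p1 = p1;
            e->p2 = p2;
            e->p3 = p3;
            e->p4 = p4;
    }

Old records are silently overwritten once the ring wraps, which is the point: the buffer holds the most recent window of activity, not a complete log.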
*/ #define LWT_SUPPORT 0 -typedef struct { - long long lwte_when; - char *lwte_where; - void *lwte_task; - long lwte_p1; - long lwte_p2; - long lwte_p3; - long lwte_p4; +typedef struct { + long long lwte_when; + char *lwte_where; + void *lwte_task; + long lwte_p1; + long lwte_p2; + long lwte_p3; + long lwte_p4; } lwt_event_t; # define LWT_EVENT(p1,p2,p3,p4) /* no lwt implementation yet */ /* -------------------------------------------------------------------------- */ -#define IOCTL_PORTAL_TYPE struct portal_ioctl_data +#define IOCTL_LIBCFS_TYPE struct libcfs_ioctl_data #define LPU64 "%llu" #define LPD64 "%lld" -#define LPX64 "%llx" +#define LPX64 "%#llx" #define LPSZ "%lu" #define LPSSZ "%ld" # define LI_POISON ((int)0x5a5a5a5a) diff --git a/lnet/include/libcfs/darwin/libcfs.h b/lnet/include/libcfs/darwin/libcfs.h index 8e4eb89..eb4d8f3 100644 --- a/lnet/include/libcfs/darwin/libcfs.h +++ b/lnet/include/libcfs/darwin/libcfs.h @@ -9,6 +9,7 @@ #endif #include +#include #include #include #include @@ -16,6 +17,7 @@ #include #include #include +#include #ifdef __KERNEL__ # include @@ -75,8 +77,8 @@ struct ptldebug_header { #endif #define LUSTRE_TRACE_SIZE (THREAD_SIZE >> 5) -#define CHECK_STACK(stack) do { } while(0) -#define CDEBUG_STACK (0L) +#define CHECK_STACK() do { } while(0) +#define CDEBUG_STACK() (0L) /* Darwin has defined RETURN, so we have to undef it in lustre */ #ifdef RETURN @@ -155,7 +157,7 @@ __entry_nesting(&__cdd); /* ENTRY_NESTING_SUPPORT */ #endif -#define LUSTRE_PTL_PID 12345 +#define LUSTRE_LNET_PID 12345 #define _XNU_LIBCFS_H @@ -164,10 +166,28 @@ __entry_nesting(&__cdd); * * Implementation is in darwin-curproc.c */ -#define CFS_CURPROC_COMM_MAX (sizeof ((struct proc *)0)->p_comm) +#define CFS_CURPROC_COMM_MAX MAXCOMLEN /* * XNU has no capabilities */ typedef int cfs_kernel_cap_t; +#ifdef __KERNEL__ +enum { + /* if you change this, update darwin-util.c:cfs_stack_trace_fill() */ + CFS_STACK_TRACE_DEPTH = 16 +}; + +struct cfs_stack_trace { + void *frame[CFS_STACK_TRACE_DEPTH]; +}; + +#define printk(format, args...) printf(format, ## args) + +#ifdef WITH_WATCHDOG +#undef WITH_WATCHDOG +#endif + +#endif /* __KERNEL__ */ + #endif /* _XNU_LIBCFS_H */ diff --git a/lnet/include/libcfs/darwin/lltrace.h b/lnet/include/libcfs/darwin/lltrace.h index bb0dc91..31d6e17 100644 --- a/lnet/include/libcfs/darwin/lltrace.h +++ b/lnet/include/libcfs/darwin/lltrace.h @@ -18,9 +18,9 @@ #include #include #include -#include +#include #include #include -#include +#include #endif diff --git a/lnet/include/libcfs/kp30.h b/lnet/include/libcfs/kp30.h index 971df1b..19ee200 100644 --- a/lnet/include/libcfs/kp30.h +++ b/lnet/include/libcfs/kp30.h @@ -4,197 +4,197 @@ #ifndef __LIBCFS_KP30_H__ #define __LIBCFS_KP30_H__ -#define PORTAL_DEBUG +#define LIBCFS_DEBUG #include -#include +#include #if defined(__linux__) #include #elif defined(__APPLE__) #include +#elif defined(__WINNT__) +#include #else #error Unsupported operating system #endif -#include +#ifndef DEBUG_SUBSYSTEM +# define DEBUG_SUBSYSTEM S_UNDEFINED +#endif #ifdef __KERNEL__ -# ifndef DEBUG_SUBSYSTEM -# define DEBUG_SUBSYSTEM S_UNDEFINED -# endif +#ifdef LIBCFS_DEBUG -#ifdef PORTAL_DEBUG -extern void kportal_assertion_failed(char *expr, char *file, const char *func, - const int line); -#define LASSERT(e) ((e) ? 0 : kportal_assertion_failed( #e , __FILE__, \ - __FUNCTION__, __LINE__)) -#define LASSERTF(cond, fmt...) 
\ - do { \ - if (unlikely(!(cond))) { \ - portals_debug_msg(DEBUG_SUBSYSTEM, D_EMERG, __FILE__,\ - __FUNCTION__,__LINE__, CDEBUG_STACK,\ - "ASSERTION(" #cond ") failed:" fmt);\ - LBUG(); \ - } \ - } while (0) +/* + * When this is on, LASSERT macro includes check for assignment used instead + * of equality check, but doesn't have unlikely(). Turn this on from time to + * time to make test-builds. This shouldn't be on for production release. + */ +#define LASSERT_CHECKED (0) +#if LASSERT_CHECKED +/* + * Assertion. + * + * Strange construction with empty "then" clause is used to trigger compiler + * warnings on the assertions of the form LASSERT(a = b); + * + * "warning: suggest parentheses around assignment used as truth value" + * + * requires -Wall. Unfortunately this rules out use of likely/unlikely. + */ +#define LASSERT(cond) \ +({ \ + if (cond) \ + ; \ + else \ + libcfs_assertion_failed( #cond , __FILE__, \ + __FUNCTION__, __LINE__); \ +}) + +#define LASSERTF(cond, fmt, a...) \ +({ \ + if (cond) \ + ; \ + else { \ + libcfs_debug_msg(NULL, DEBUG_SUBSYSTEM, D_EMERG, \ + __FILE__, __FUNCTION__,__LINE__, \ + "ASSERTION(" #cond ") failed:" fmt, \ + ## a); \ + LBUG(); \ + } \ +}) + +/* LASSERT_CHECKED */ #else -#define LASSERT(e) -#define LASSERTF(cond, fmt...) do { } while (0) + +#define LASSERT(cond) \ +({ \ + if (unlikely(!(cond))) \ + libcfs_assertion_failed(#cond , __FILE__, \ + __FUNCTION__, __LINE__); \ +}) + +#define LASSERTF(cond, fmt, a...) \ +({ \ + if (unlikely(!(cond))) { \ + libcfs_debug_msg(NULL, DEBUG_SUBSYSTEM, D_EMERG, \ + __FILE__, __FUNCTION__,__LINE__, \ + "ASSERTION(" #cond ") failed:" fmt, \ + ## a); \ + LBUG(); \ + } \ +}) + +/* LASSERT_CHECKED */ #endif -/* LBUG_WITH_LOC defined in portals//kp30.h */ -#define LBUG() LBUG_WITH_LOC(__FILE__, __FUNCTION__, __LINE__) +/* LIBCFS_DEBUG */ +#else +#define LASSERT(e) ((void)(0)) +#define LASSERTF(cond, fmt...) 
((void)(0)) +#endif /* LIBCFS_DEBUG */ + +void lbug_with_loc(char *file, const char *func, const int line) + __attribute__((noreturn)); + +#define LBUG() lbug_with_loc(__FILE__, __FUNCTION__, __LINE__) +extern atomic_t libcfs_kmemory; /* * Memory */ -#ifdef PORTAL_DEBUG -extern atomic_t portal_kmemory; +#ifdef LIBCFS_DEBUG -# define portal_kmem_inc(ptr, size) \ -do { \ - atomic_add(size, &portal_kmemory); \ +# define libcfs_kmem_inc(ptr, size) \ +do { \ + atomic_add(size, &libcfs_kmemory); \ } while (0) -# define portal_kmem_dec(ptr, size) do { \ - atomic_sub(size, &portal_kmemory); \ +# define libcfs_kmem_dec(ptr, size) do { \ + atomic_sub(size, &libcfs_kmemory); \ } while (0) #else -# define portal_kmem_inc(ptr, size) do {} while (0) -# define portal_kmem_dec(ptr, size) do {} while (0) -#endif /* PORTAL_DEBUG */ +# define libcfs_kmem_inc(ptr, size) do {} while (0) +# define libcfs_kmem_dec(ptr, size) do {} while (0) +#endif /* LIBCFS_DEBUG */ -#define PORTAL_VMALLOC_SIZE 16384 +#define LIBCFS_VMALLOC_SIZE 16384 -#define PORTAL_ALLOC_GFP(ptr, size, mask) \ +#define LIBCFS_ALLOC_GFP(ptr, size, mask) \ do { \ LASSERT(!in_interrupt() || \ - (size <= PORTAL_VMALLOC_SIZE && mask == CFS_ALLOC_ATOMIC));\ - if ((size) > PORTAL_VMALLOC_SIZE) \ + (size <= LIBCFS_VMALLOC_SIZE && mask == CFS_ALLOC_ATOMIC));\ + if (unlikely((size) > LIBCFS_VMALLOC_SIZE)) \ (ptr) = cfs_alloc_large(size); \ else \ (ptr) = cfs_alloc((size), (mask)); \ - if ((ptr) == NULL) { \ - CERROR("PORTALS: out of memory at %s:%d (tried to alloc '"\ + if (unlikely((ptr) == NULL)) { \ + CERROR("LNET: out of memory at %s:%d (tried to alloc '" \ #ptr "' = %d)\n", __FILE__, __LINE__, (int)(size));\ - CERROR("PORTALS: %d total bytes allocated by portals\n", \ - atomic_read(&portal_kmemory)); \ + CERROR("LNET: %d total bytes allocated by lnet\n", \ + atomic_read(&libcfs_kmemory)); \ } else { \ - portal_kmem_inc((ptr), (size)); \ + libcfs_kmem_inc((ptr), (size)); \ if (!((mask) & CFS_ALLOC_ZERO)) \ memset((ptr), 0, (size)); \ } \ CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %d at %p (tot %d).\n", \ - (int)(size), (ptr), atomic_read (&portal_kmemory)); \ + (int)(size), (ptr), atomic_read (&libcfs_kmemory)); \ } while (0) -#define PORTAL_ALLOC(ptr, size) \ - PORTAL_ALLOC_GFP(ptr, size, CFS_ALLOC_IO) +#define LIBCFS_ALLOC(ptr, size) \ + LIBCFS_ALLOC_GFP(ptr, size, CFS_ALLOC_IO) -#define PORTAL_ALLOC_ATOMIC(ptr, size) \ - PORTAL_ALLOC_GFP(ptr, size, CFS_ALLOC_ATOMIC) +#define LIBCFS_ALLOC_ATOMIC(ptr, size) \ + LIBCFS_ALLOC_GFP(ptr, size, CFS_ALLOC_ATOMIC) -#define PORTAL_FREE(ptr, size) \ +#define LIBCFS_FREE(ptr, size) \ do { \ int s = (size); \ - if ((ptr) == NULL) { \ - CERROR("PORTALS: free NULL '" #ptr "' (%d bytes) at " \ + if (unlikely((ptr) == NULL)) { \ + CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at " \ "%s:%d\n", s, __FILE__, __LINE__); \ break; \ } \ - if (s > PORTAL_VMALLOC_SIZE) \ + if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ cfs_free_large(ptr); \ else \ cfs_free(ptr); \ - portal_kmem_dec((ptr), s); \ + libcfs_kmem_dec((ptr), s); \ CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n", \ - s, (ptr), atomic_read(&portal_kmemory)); \ + s, (ptr), atomic_read(&libcfs_kmemory)); \ } while (0) /******************************************************************************/ -#ifdef PORTALS_PROFILING -#define prof_enum(FOO) PROF__##FOO -enum { - prof_enum(our_recvmsg), - prof_enum(our_sendmsg), - prof_enum(socknal_recv), - prof_enum(lib_parse), - prof_enum(conn_list_walk), - prof_enum(memcpy), - prof_enum(lib_finalize), - 
prof_enum(pingcli_time), - prof_enum(gmnal_send), - prof_enum(gmnal_recv), - MAX_PROFS -}; - -struct prof_ent { - char *str; - /* hrmph. wrap-tastic. */ - u32 starts; - u32 finishes; - cycles_t total_cycles; - cycles_t start; - cycles_t end; -}; +/* htonl hack - either this, or compile with -O2. Stupid byteorder/generic.h */ +#if defined(__GNUC__) && (__GNUC__ >= 2) && !defined(__OPTIMIZE__) +#define ___htonl(x) __cpu_to_be32(x) +#define ___htons(x) __cpu_to_be16(x) +#define ___ntohl(x) __be32_to_cpu(x) +#define ___ntohs(x) __be16_to_cpu(x) +#define htonl(x) ___htonl(x) +#define ntohl(x) ___ntohl(x) +#define htons(x) ___htons(x) +#define ntohs(x) ___ntohs(x) +#endif -extern struct prof_ent prof_ents[MAX_PROFS]; - -#define PROF_START(FOO) \ - do { \ - struct prof_ent *pe = &prof_ents[PROF__##FOO]; \ - pe->starts++; \ - pe->start = get_cycles(); \ - } while (0) - -#define PROF_FINISH(FOO) \ - do { \ - struct prof_ent *pe = &prof_ents[PROF__##FOO]; \ - pe->finishes++; \ - pe->end = get_cycles(); \ - pe->total_cycles += (pe->end - pe->start); \ - } while (0) -#else /* !PORTALS_PROFILING */ -#define PROF_START(FOO) do {} while(0) -#define PROF_FINISH(FOO) do {} while(0) -#endif /* PORTALS_PROFILING */ - -/* debug.c */ -extern spinlock_t stack_backtrace_lock; - -void portals_debug_dumpstack(cfs_task_t *tsk); -void portals_run_upcall(char **argv); -void portals_run_lbug_upcall(char * file, const char *fn, const int line); -void portals_debug_dumplog(void); -int portals_debug_init(unsigned long bufsize); -int portals_debug_cleanup(void); -int portals_debug_clear_buffer(void); -int portals_debug_mark_buffer(char *text); -int portals_debug_set_daemon(unsigned int cmd, unsigned int length, - char *file, unsigned int size); -__s32 portals_debug_copy_to_user(char *buf, unsigned long len); -/* Use the special GNU C __attribute__ hack to have the compiler check the - * printf style argument string against the actual argument count and - * types. - */ -void portals_debug_msg(int subsys, int mask, char *file, const char *fn, - const int line, unsigned long stack, - char *format, ...) - __attribute__ ((format (printf, 7, 8))); -void portals_debug_set_level(unsigned int debug_level); +void libcfs_debug_dumpstack(cfs_task_t *tsk); +void libcfs_run_upcall(char **argv); +void libcfs_run_lbug_upcall(char * file, const char *fn, const int line); +void libcfs_debug_dumplog(void); +int libcfs_debug_init(unsigned long bufsize); +int libcfs_debug_cleanup(void); +int libcfs_debug_clear_buffer(void); +int libcfs_debug_mark_buffer(char *text); -extern void kportal_daemonize (char *name); -extern void kportal_blockallsigs (void); +void libcfs_debug_set_level(unsigned int debug_level); #else /* !__KERNEL__ */ -# ifndef DEBUG_SUBSYSTEM -# define DEBUG_SUBSYSTEM S_UNDEFINED -# endif -# ifdef PORTAL_DEBUG +# ifdef LIBCFS_DEBUG # undef NDEBUG # include # define LASSERT(e) assert(e) @@ -204,23 +204,38 @@ do { \ CERROR(args); \ assert(cond); \ } while (0) +# define LBUG() assert(0) # else -# define LASSERT(e) +# define LASSERT(e) ((void)(0)) # define LASSERTF(cond, args...) do { } while (0) -# endif +# define LBUG() ((void)(0)) +# endif /* LIBCFS_DEBUG */ # define printk(format, args...) printf (format, ## args) -# define PORTAL_ALLOC(ptr, size) do { (ptr) = malloc(size); } while (0); -# define PORTAL_FREE(a, b) do { free(a); } while (0); -void portals_debug_dumplog(void); -# define portals_debug_msg(subsys, mask, file, fn, line, stack, format, a...) \ - printf("%02x:%06x (@%lu %s:%s,l. 
%d %d %lu): " format, \ - (subsys), (mask), (long)time(0), file, fn, line, \ - getpid(), (unsigned long)stack, ## a); - -#undef CWARN -#undef CERROR -#define CWARN(format, a...) CDEBUG(D_WARNING, format, ## a) -#define CERROR(format, a...) CDEBUG(D_ERROR, format, ## a) +# ifdef CRAY_XT3 /* buggy calloc! */ +# define LIBCFS_ALLOC(ptr, size) \ + do { \ + (ptr) = malloc(size); \ + memset(ptr, 0, size); \ + } while (0); +# else +# define LIBCFS_ALLOC(ptr, size) do { (ptr) = calloc(1,size); } while (0); +# endif +# define LIBCFS_FREE(a, b) do { free(a); } while (0); + +void libcfs_debug_dumplog(void); +int libcfs_debug_init(unsigned long bufsize); +int libcfs_debug_cleanup(void); + +/* + * Generic compiler-dependent macros required for kernel + * build go below this comment. Actual compiler/compiler version + * specific implementations come from the above header files + */ + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +/* !__KERNEL__ */ #endif /* @@ -240,8 +255,31 @@ void portals_debug_dumplog(void); #define CLASSERT(cond) ({ switch(42) { case (cond): case 0: break; } }) /* support decl needed both by kernel and liblustre */ -char *portals_nid2str(int nal, ptl_nid_t nid, char *str); -char *portals_id2str(int nal, ptl_process_id_t nid, char *str); +int libcfs_isknown_lnd(int type); +char *libcfs_lnd2modname(int type); +char *libcfs_lnd2str(int type); +int libcfs_str2lnd(char *str); +char *libcfs_net2str(__u32 net); +char *libcfs_nid2str(lnet_nid_t nid); +__u32 libcfs_str2net(char *str); +lnet_nid_t libcfs_str2nid(char *str); +int libcfs_str2anynid(lnet_nid_t *nid, char *str); +char *libcfs_id2str(lnet_process_id_t id); +void libcfs_setnet0alias(int type); + +/* how an LNET NID encodes net:address */ +#define LNET_NIDADDR(nid) ((__u32)((nid) & 0xffffffff)) +#define LNET_NIDNET(nid) ((__u32)(((nid) >> 32)) & 0xffffffff) +#define LNET_MKNID(net,addr) ((((__u64)(net))<<32)|((__u64)(addr))) +/* how net encodes type:number */ +#define LNET_NETNUM(net) ((net) & 0xffff) +#define LNET_NETTYP(net) (((net) >> 16) & 0xffff) +#define LNET_MKNET(typ,num) ((((__u32)(typ))<<16)|((__u32)(num))) + +/* implication */ +#define ergo(a, b) (!(a) || (b)) +/* logical equivalence */ +#define equi(a, b) (!!(a) == !!(b)) #ifndef CURRENT_TIME # define CURRENT_TIME time(0) @@ -253,45 +291,132 @@ char *portals_id2str(int nal, ptl_process_id_t nid, char *str); * All stuff about lwt are put in arch/kp30.h * -------------------------------------------------------------------- */ -struct portals_device_userstate +struct libcfs_device_userstate { - int pdu_memhog_pages; - cfs_page_t *pdu_memhog_root_page; + int ldu_memhog_pages; + cfs_page_t *ldu_memhog_root_page; }; -#include +/* what used to be in portals_lib.h */ +#ifndef MIN +# define MIN(a,b) (((a)<(b)) ? (a): (b)) +#endif +#ifndef MAX +# define MAX(a,b) (((a)>(b)) ? (a): (b)) +#endif + +#define MKSTR(ptr) ((ptr))? 
(ptr) : "" + +static inline int size_round4 (int val) +{ + return (val + 3) & (~0x3); +} + +static inline int size_round (int val) +{ + return (val + 7) & (~0x7); +} + +static inline int size_round16(int val) +{ + return (val + 0xf) & (~0xf); +} + +static inline int size_round32(int val) +{ + return (val + 0x1f) & (~0x1f); +} + +static inline int size_round0(int val) +{ + if (!val) + return 0; + return (val + 1 + 7) & (~0x7); +} + +static inline size_t round_strlen(char *fset) +{ + return (size_t)size_round((int)strlen(fset) + 1); +} + +#define LOGL(var,len,ptr) \ +do { \ + if (var) \ + memcpy((char *)ptr, (const char *)var, len); \ + ptr += size_round(len); \ +} while (0) + +#define LOGU(var,len,ptr) \ +do { \ + if (var) \ + memcpy((char *)var, (const char *)ptr, len); \ + ptr += size_round(len); \ +} while (0) + +#define LOGL0(var,len,ptr) \ +do { \ + if (!len) \ + break; \ + memcpy((char *)ptr, (const char *)var, len); \ + *((char *)(ptr) + len) = 0; \ + ptr += size_round(len + 1); \ +} while (0) /* * USER LEVEL STUFF BELOW */ -#define PORTAL_IOCTL_VERSION 0x00010008 -#define PING_SYNC 0 -#define PING_ASYNC 1 +#define LIBCFS_IOCTL_VERSION 0x0001000a + +struct libcfs_ioctl_data { + __u32 ioc_len; + __u32 ioc_version; + + __u64 ioc_nid; + __u64 ioc_u64[1]; + + __u32 ioc_flags; + __u32 ioc_count; + __u32 ioc_net; + __u32 ioc_u32[7]; + + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char *ioc_inlbuf2; -struct portal_ioctl_hdr { + __u32 ioc_plen1; /* buffers in userspace */ + char *ioc_pbuf1; + __u32 ioc_plen2; /* buffers in userspace */ + char *ioc_pbuf2; + + char ioc_bulk[0]; +}; + + +struct libcfs_ioctl_hdr { __u32 ioc_len; __u32 ioc_version; }; -struct portals_debug_ioctl_data +struct libcfs_debug_ioctl_data { - struct portal_ioctl_hdr hdr; + struct libcfs_ioctl_hdr hdr; unsigned int subs; unsigned int debug; }; -#define PORTAL_IOC_INIT(data) \ +#define LIBCFS_IOC_INIT(data) \ do { \ memset(&data, 0, sizeof(data)); \ - data.ioc_version = PORTAL_IOCTL_VERSION; \ + data.ioc_version = LIBCFS_IOCTL_VERSION; \ data.ioc_len = sizeof(data); \ } while (0) /* FIXME check conflict with lustre_lib.h */ -#define PTL_IOC_DEBUG_MASK _IOWR('f', 250, long) +#define LIBCFS_IOC_DEBUG_MASK _IOWR('f', 250, long) -static inline int portal_ioctl_packlen(struct portal_ioctl_data *data) +static inline int libcfs_ioctl_packlen(struct libcfs_ioctl_data *data) { int len = sizeof(*data); len += size_round(data->ioc_inllen1); @@ -299,79 +424,79 @@ static inline int portal_ioctl_packlen(struct portal_ioctl_data *data) return len; } -static inline int portal_ioctl_is_invalid(struct portal_ioctl_data *data) +static inline int libcfs_ioctl_is_invalid(struct libcfs_ioctl_data *data) { if (data->ioc_len > (1<<30)) { - CERROR ("PORTALS ioctl: ioc_len larger than 1<<30\n"); + CERROR ("LIBCFS ioctl: ioc_len larger than 1<<30\n"); return 1; } if (data->ioc_inllen1 > (1<<30)) { - CERROR ("PORTALS ioctl: ioc_inllen1 larger than 1<<30\n"); + CERROR ("LIBCFS ioctl: ioc_inllen1 larger than 1<<30\n"); return 1; } if (data->ioc_inllen2 > (1<<30)) { - CERROR ("PORTALS ioctl: ioc_inllen2 larger than 1<<30\n"); + CERROR ("LIBCFS ioctl: ioc_inllen2 larger than 1<<30\n"); return 1; } if (data->ioc_inlbuf1 && !data->ioc_inllen1) { - CERROR ("PORTALS ioctl: inlbuf1 pointer but 0 length\n"); + CERROR ("LIBCFS ioctl: inlbuf1 pointer but 0 length\n"); return 1; } if (data->ioc_inlbuf2 && !data->ioc_inllen2) { - CERROR ("PORTALS ioctl: inlbuf2 pointer but 0 length\n"); + CERROR ("LIBCFS ioctl: inlbuf2 pointer but 0 length\n"); 
return 1; } if (data->ioc_pbuf1 && !data->ioc_plen1) { - CERROR ("PORTALS ioctl: pbuf1 pointer but 0 length\n"); + CERROR ("LIBCFS ioctl: pbuf1 pointer but 0 length\n"); return 1; } if (data->ioc_pbuf2 && !data->ioc_plen2) { - CERROR ("PORTALS ioctl: pbuf2 pointer but 0 length\n"); + CERROR ("LIBCFS ioctl: pbuf2 pointer but 0 length\n"); return 1; } if (data->ioc_plen1 && !data->ioc_pbuf1) { - CERROR ("PORTALS ioctl: plen1 nonzero but no pbuf1 pointer\n"); + CERROR ("LIBCFS ioctl: plen1 nonzero but no pbuf1 pointer\n"); return 1; } if (data->ioc_plen2 && !data->ioc_pbuf2) { - CERROR ("PORTALS ioctl: plen2 nonzero but no pbuf2 pointer\n"); + CERROR ("LIBCFS ioctl: plen2 nonzero but no pbuf2 pointer\n"); return 1; } - if (portal_ioctl_packlen(data) != data->ioc_len ) { - CERROR ("PORTALS ioctl: packlen != ioc_len\n"); + if ((__u32)libcfs_ioctl_packlen(data) != data->ioc_len ) { + CERROR ("LIBCFS ioctl: packlen != ioc_len\n"); return 1; } if (data->ioc_inllen1 && data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') { - CERROR ("PORTALS ioctl: inlbuf1 not 0 terminated\n"); + CERROR ("LIBCFS ioctl: inlbuf1 not 0 terminated\n"); return 1; } if (data->ioc_inllen2 && data->ioc_bulk[size_round(data->ioc_inllen1) + data->ioc_inllen2 - 1] != '\0') { - CERROR ("PORTALS ioctl: inlbuf2 not 0 terminated\n"); + CERROR ("LIBCFS ioctl: inlbuf2 not 0 terminated\n"); return 1; } return 0; } #ifndef __KERNEL__ -static inline int portal_ioctl_pack(struct portal_ioctl_data *data, char **pbuf, +static inline int libcfs_ioctl_pack(struct libcfs_ioctl_data *data, char **pbuf, int max) { char *ptr; - struct portal_ioctl_data *overlay; - data->ioc_len = portal_ioctl_packlen(data); - data->ioc_version = PORTAL_IOCTL_VERSION; + struct libcfs_ioctl_data *overlay; + data->ioc_len = libcfs_ioctl_packlen(data); + data->ioc_version = LIBCFS_IOCTL_VERSION; - if (*pbuf && portal_ioctl_packlen(data) > max) + if (*pbuf && libcfs_ioctl_packlen(data) > max) return 1; if (*pbuf == NULL) { *pbuf = malloc(data->ioc_len); } if (!*pbuf) return 1; - overlay = (struct portal_ioctl_data *)*pbuf; + overlay = (struct libcfs_ioctl_data *)*pbuf; memcpy(*pbuf, data, sizeof(*data)); ptr = overlay->ioc_bulk; @@ -379,7 +504,7 @@ static inline int portal_ioctl_pack(struct portal_ioctl_data *data, char **pbuf, LOGL(data->ioc_inlbuf1, data->ioc_inllen1, ptr); if (data->ioc_inlbuf2) LOGL(data->ioc_inlbuf2, data->ioc_inllen2, ptr); - if (portal_ioctl_is_invalid(overlay)) + if (libcfs_ioctl_is_invalid(overlay)) return 1; return 0; @@ -387,70 +512,71 @@ static inline int portal_ioctl_pack(struct portal_ioctl_data *data, char **pbuf, #else -extern int portal_ioctl_getdata(char *buf, char *end, void *arg); +extern int libcfs_ioctl_getdata(char *buf, char *end, void *arg); +extern int libcfs_ioctl_popdata(void *arg, void *buf, int size); #endif /* ioctls for manipulating snapshots 30- */ -#define IOC_PORTAL_TYPE 'e' -#define IOC_PORTAL_MIN_NR 30 - -#define IOC_PORTAL_PING _IOWR('e', 30, IOCTL_PORTAL_TYPE) - -#define IOC_PORTAL_CLEAR_DEBUG _IOWR('e', 32, IOCTL_PORTAL_TYPE) -#define IOC_PORTAL_MARK_DEBUG _IOWR('e', 33, IOCTL_PORTAL_TYPE) -#define IOC_PORTAL_PANIC _IOWR('e', 34, IOCTL_PORTAL_TYPE) -#define IOC_PORTAL_NAL_CMD _IOWR('e', 35, IOCTL_PORTAL_TYPE) -#define IOC_PORTAL_GET_NID _IOWR('e', 36, IOCTL_PORTAL_TYPE) -#define IOC_PORTAL_FAIL_NID _IOWR('e', 37, IOCTL_PORTAL_TYPE) -#define IOC_PORTAL_LOOPBACK _IOWR('e', 38, IOCTL_PORTAL_TYPE) -#define IOC_PORTAL_LWT_CONTROL _IOWR('e', 39, IOCTL_PORTAL_TYPE) -#define IOC_PORTAL_LWT_SNAPSHOT _IOWR('e', 40, 
IOCTL_PORTAL_TYPE) -#define IOC_PORTAL_LWT_LOOKUP_STRING _IOWR('e', 41, IOCTL_PORTAL_TYPE) -#define IOC_PORTAL_MEMHOG _IOWR('e', 42, IOCTL_PORTAL_TYPE) -#define IOC_PORTAL_DMSG _IOWR('e', 43, IOCTL_PORTAL_TYPE) -#define IOC_PORTAL_MAX_NR 43 +#define IOC_LIBCFS_TYPE 'e' +#define IOC_LIBCFS_MIN_NR 30 +/* libcfs ioctls */ +#define IOC_LIBCFS_PANIC _IOWR('e', 30, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_CLEAR_DEBUG _IOWR('e', 31, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_MARK_DEBUG _IOWR('e', 32, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_LWT_CONTROL _IOWR('e', 33, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_LWT_SNAPSHOT _IOWR('e', 34, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_LWT_LOOKUP_STRING _IOWR('e', 35, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_MEMHOG _IOWR('e', 36, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PING_TEST _IOWR('e', 37, IOCTL_LIBCFS_TYPE) +/* lnet ioctls */ +#define IOC_LIBCFS_GET_NI _IOWR('e', 50, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_FAIL_NID _IOWR('e', 51, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_ADD_ROUTE _IOWR('e', 52, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_DEL_ROUTE _IOWR('e', 53, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_ROUTE _IOWR('e', 54, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_NOTIFY_ROUTER _IOWR('e', 55, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_UNCONFIGURE _IOWR('e', 56, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PORTALS_COMPATIBILITY _IOWR('e', 57, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_LNET_DIST _IOWR('e', 58, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_CONFIGURE _IOWR('e', 59, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_TESTPROTOCOMPAT _IOWR('e', 60, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PING _IOWR('e', 61, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_DEBUG_PEER _IOWR('e', 62, IOCTL_LIBCFS_TYPE) +/* lnd ioctls */ +#define IOC_LIBCFS_REGISTER_MYNID _IOWR('e', 70, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_CLOSE_CONNECTION _IOWR('e', 71, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_PUSH_CONNECTION _IOWR('e', 72, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_CONN _IOWR('e', 73, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_DEL_PEER _IOWR('e', 74, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_ADD_PEER _IOWR('e', 75, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_PEER _IOWR('e', 76, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_TXDESC _IOWR('e', 77, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_ADD_INTERFACE _IOWR('e', 78, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_DEL_INTERFACE _IOWR('e', 79, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_INTERFACE _IOWR('e', 80, IOCTL_LIBCFS_TYPE) +#define IOC_LIBCFS_GET_GMID _IOWR('e', 81, IOCTL_LIBCFS_TYPE) + +#define IOC_LIBCFS_MAX_NR 81 + enum { - QSWNAL = 1, - SOCKNAL = 2, - GMNAL = 3, - /* 4 unused */ - TCPNAL = 5, - ROUTER = 6, - OPENIBNAL = 7, - IIBNAL = 8, - LONAL = 9, - RANAL = 10, - VIBNAL = 11, - NAL_ENUM_END_MARKER + /* Only add to these values (i.e. don't ever change or redefine them): + * network addresses depend on them... 
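(Illustration, not part of the patch: with the LNET_MKNET()/LNET_MKNID()
macros defined earlier in this header, the LND type below is baked into
every NID on the wire -- ip_addr here is hypothetical:

        __u32      net = LNET_MKNET(SOCKLND, 0);        // the "tcp0" network
        lnet_nid_t nid = LNET_MKNID(net, ip_addr);
        LASSERT(LNET_NETTYP(LNET_NIDNET(nid)) == SOCKLND);

so renumbering an existing constant would silently change the identity of
every node already using that LND.)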
*/ + QSWLND = 1, + SOCKLND = 2, + GMLND = 3, + PTLLND = 4, + O2IBLND = 5, + CIBLND = 6, + OPENIBLND = 7, + IIBLND = 8, + LOLND = 9, + RALND = 10, + VIBLND = 11, + MXLND = 12, }; -#define PTL_NALFMT_SIZE 32 /* %u:%u.%u.%u.%u,%u (10+4+4+4+3+5+1) */ -#ifndef CRAY_PORTALS -#define NALID_FROM_IFACE(nal) (nal) -#endif - -#define NAL_MAX_NR (NAL_ENUM_END_MARKER - 1) - -#define NAL_CMD_REGISTER_PEER_FD 100 -#define NAL_CMD_CLOSE_CONNECTION 101 -#define NAL_CMD_REGISTER_MYNID 102 -#define NAL_CMD_PUSH_CONNECTION 103 -#define NAL_CMD_GET_CONN 104 -#define NAL_CMD_DEL_PEER 105 -#define NAL_CMD_ADD_PEER 106 -#define NAL_CMD_GET_PEER 107 -#define NAL_CMD_GET_TXDESC 108 -#define NAL_CMD_ADD_ROUTE 109 -#define NAL_CMD_DEL_ROUTE 110 -#define NAL_CMD_GET_ROUTE 111 -#define NAL_CMD_NOTIFY_ROUTER 112 -#define NAL_CMD_ADD_INTERFACE 113 -#define NAL_CMD_DEL_INTERFACE 114 -#define NAL_CMD_GET_INTERFACE 115 - - enum { DEBUG_DAEMON_START = 1, DEBUG_DAEMON_STOP = 2, diff --git a/lnet/include/libcfs/libcfs.h b/lnet/include/libcfs/libcfs.h index 9e7ea85..2e11e7c 100644 --- a/lnet/include/libcfs/libcfs.h +++ b/lnet/include/libcfs/libcfs.h @@ -12,6 +12,8 @@ #include #elif defined(__APPLE__) #include +#elif defined(__WINNT__) +#include #else #error Unsupported operating system. #endif @@ -22,24 +24,57 @@ #include #endif -#define PORTAL_DEBUG +#define LIBCFS_DEBUG #ifndef offsetof # define offsetof(typ,memb) ((unsigned long)((char *)&(((typ *)0)->memb))) #endif +/* cardinality of array */ +#define sizeof_array(a) ((sizeof (a)) / (sizeof ((a)[0]))) + +#if !defined(container_of) +/* given a pointer @ptr to the field @member embedded into type (usually + * struct) @type, return pointer to the embedding instance of @type. */ +#define container_of(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) +#endif + +#define container_of0(ptr, type, member) \ +({ \ + typeof(ptr) __ptr = (ptr); \ + __ptr ? container_of(__ptr, type, member) : NULL; \ +}) + +/* + * true iff @i is power-of-2 + */ +#define IS_PO2(i) \ +({ \ + typeof(i) __i; \ + \ + __i = (i); \ + !(__i & (__i - 1)); \ +}) + #define LOWEST_BIT_SET(x) ((x) & ~((x) - 1)) /* * Debugging */ -extern unsigned int portal_subsystem_debug; -extern unsigned int portal_stack; -extern unsigned int portal_debug; -extern unsigned int portal_printk; +extern unsigned int libcfs_subsystem_debug; +extern unsigned int libcfs_stack; +extern unsigned int libcfs_debug; +extern unsigned int libcfs_printk; +extern unsigned int libcfs_console_ratelimit; +extern unsigned int libcfs_debug_binary; +extern char debug_file_path[1024]; + +int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys); +int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys); /* Has there been an LBUG? 
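(An aside, not part of the patch: this flag is raised on the LBUG path --
the removed LBUG_WITH_LOC below did the same for portals_catastrophe -- so
late-running code can refuse new work after an assertion failure, e.g.:

        if (libcfs_catastrophe)
                return -ENODEV;         // hypothetical health check
)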
*/ -extern unsigned int portals_catastrophe; +extern unsigned int libcfs_catastrophe; /* * struct ptldebug_header is defined in libcfs//libcfs.h @@ -48,6 +83,7 @@ extern unsigned int portals_catastrophe; #define PH_FLAG_FIRST_RECORD 1 /* Debugging subsystems (32 bits, non-overlapping) */ +/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */ #define S_UNDEFINED 0x00000001 #define S_MDC 0x00000002 #define S_MDS 0x00000004 @@ -58,30 +94,33 @@ extern unsigned int portals_catastrophe; #define S_LLITE 0x00000080 #define S_RPC 0x00000100 #define S_MGMT 0x00000200 -#define S_PORTALS 0x00000400 -#define S_NAL 0x00000800 /* ALL NALs */ +/* unused */ +#define S_LNET 0x00000400 +#define S_LND 0x00000800 /* ALL LNDs */ #define S_PINGER 0x00001000 #define S_FILTER 0x00002000 -#define S_PTLBD 0x00004000 +/* unused */ #define S_ECHO 0x00008000 #define S_LDLM 0x00010000 #define S_LOV 0x00020000 -#define S_PTLROUTER 0x00040000 -#define S_COBD 0x00080000 -#define S_SM 0x00100000 -#define S_ASOBD 0x00200000 -#define S_CONFOBD 0x00400000 -#define S_LMV 0x00800000 -#define S_CMOBD 0x01000000 -#define S_SEC 0x02000000 -#define S_GSS 0x04000000 -#define S_GKS 0x08000000 -/* If you change these values, please keep these files up to date... - * portals/utils/debug.c - * utils/lconf - */ +/* unused */ +/* unused */ +/* unused */ +/* unused */ +/* unused */ +#define S_LMV 0x00800000 /* b_new_cmd */ +/* unused */ +#define S_SEC 0x02000000 /* upcall cache */ +#define S_GSS 0x04000000 /* b_new_cmd */ +/* unused */ +#define S_MGC 0x10000000 +#define S_MGS 0x20000000 +#define S_FID 0x40000000 /* b_new_cmd */ +#define S_FLD 0x80000000 /* b_new_cmd */ +/* keep these in sync with lnet/utils/debug.c and lnet/libcfs/debug.c */ /* Debugging masks (32 bits, non-overlapping) */ +/* keep these in sync with lnet/utils/debug.c */ #define D_TRACE 0x00000001 /* ENTRY/EXIT markers */ #define D_INODE 0x00000002 #define D_SUPER 0x00000004 @@ -90,13 +129,13 @@ extern unsigned int portals_catastrophe; #define D_CACHE 0x00000020 /* cache-related items */ #define D_INFO 0x00000040 /* general information */ #define D_IOCTL 0x00000080 /* ioctl related information */ -#define D_BLOCKS 0x00000100 /* ext2 block allocation */ +#define D_NETERROR 0x00000100 /* network errors */ #define D_NET 0x00000200 /* network communications */ #define D_WARNING 0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */ #define D_BUFFS 0x00000800 #define D_OTHER 0x00001000 #define D_DENTRY 0x00002000 -#define D_PORTALS 0x00004000 /* ENTRY/EXIT markers */ +/* unused: keep in sync with lnet/utils/debug.c */ #define D_PAGE 0x00008000 /* bulk page handling */ #define D_DLMTRACE 0x00010000 #define D_ERROR 0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */ @@ -110,86 +149,75 @@ extern unsigned int portals_catastrophe; #define D_CONSOLE 0x02000000 #define D_QUOTA 0x04000000 #define D_SEC 0x08000000 -/* If you change these values, please keep these files up to date... - * portals/utils/debug.c - * utils/lconf - */ +/* keep these in sync with lnet/utils/debug.c */ + +#define D_CANTMASK (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE) #ifndef DEBUG_SUBSYSTEM # define DEBUG_SUBSYSTEM S_UNDEFINED #endif +#define CDEBUG_MAX_LIMIT 600 +typedef struct { + cfs_time_t cdls_next; + int cdls_count; + cfs_duration_t cdls_delay; +} cfs_debug_limit_state_t; + +#define CDEBUG_ENABLED (1) + #ifdef __KERNEL__ -#define CDEBUG(mask, format, a...) 
\ -do { \ - CHECK_STACK(CDEBUG_STACK); \ - if (((mask) & (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE)) || \ - (portal_debug & (mask) && \ - portal_subsystem_debug & DEBUG_SUBSYSTEM)) \ - portals_debug_msg(DEBUG_SUBSYSTEM, mask, \ - __FILE__, __FUNCTION__, __LINE__, \ - CDEBUG_STACK, format, ## a); \ + +#if CDEBUG_ENABLED +#define __CDEBUG(cdls, mask, format, a...) \ +do { \ + CHECK_STACK(); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + libcfs_debug_msg(cdls, DEBUG_SUBSYSTEM, mask, \ + __FILE__, __FUNCTION__, __LINE__, \ + format, ## a); \ } while (0) -#define CDEBUG_MAX_LIMIT 600 -#define CDEBUG_LIMIT(cdebug_mask, cdebug_format, a...) \ -do { \ - static cfs_time_t cdebug_next = 0; \ - static int cdebug_count = 0; \ - static cfs_duration_t cdebug_delay = CFS_MIN_DELAY; \ - \ - CHECK_STACK(CDEBUG_STACK); \ - if (cfs_time_after(cfs_time_current(), cdebug_next)) { \ - portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask, __FILE__, \ - __FUNCTION__, __LINE__, CDEBUG_STACK, \ - cdebug_format, ## a); \ - if (cdebug_count) { \ - portals_debug_msg(DEBUG_SUBSYSTEM, cdebug_mask, \ - __FILE__, __FUNCTION__, __LINE__,0, \ - "previously skipped %d similar " \ - "messages\n", cdebug_count); \ - cdebug_count = 0; \ - } \ - if (cfs_time_after(cfs_time_current(), \ - cdebug_next + \ - cfs_time_seconds(CDEBUG_MAX_LIMIT+10))) \ - cdebug_delay = cdebug_delay > (8 * CFS_MIN_DELAY)? \ - cdebug_delay/8 : CFS_MIN_DELAY; \ - else \ - cdebug_delay = cdebug_delay*2 >= cfs_time_seconds(CDEBUG_MAX_LIMIT)?\ - cfs_time_seconds(CDEBUG_MAX_LIMIT) : \ - cdebug_delay*2; \ - cdebug_next = cfs_time_current() + cdebug_delay; \ - } else { \ - portals_debug_msg(DEBUG_SUBSYSTEM, \ - portal_debug & \ - ~(D_EMERG|D_ERROR|D_WARNING|D_CONSOLE), \ - __FILE__, __FUNCTION__, __LINE__, \ - CDEBUG_STACK, cdebug_format, ## a); \ - cdebug_count++; \ - } \ +#define CDEBUG(mask, format, a...) __CDEBUG(NULL, mask, format, ## a) + +#define CDEBUG_LIMIT(mask, format, a...) \ +do { \ + static cfs_debug_limit_state_t cdls; \ + \ + __CDEBUG(&cdls, mask, format, ## a); \ } while (0) -#elif defined(LUSTRE_UTILS) +#else /* CDEBUG_ENABLED */ +#define CDEBUG(mask, format, a...) (void)(0) +#define CDEBUG_LIMIT(mask, format, a...) (void)(0) +#endif + +#elif defined(__arch_lib__) && !defined(LUSTRE_UTILS) #define CDEBUG(mask, format, a...) \ do { \ - if ((mask) & (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE)) \ - fprintf(stderr, "(%s:%d:%s()) " format, \ - __FILE__, __LINE__, __FUNCTION__, ## a); \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + libcfs_debug_msg(NULL, DEBUG_SUBSYSTEM, mask, \ + __FILE__, __FUNCTION__, __LINE__, \ + format, ## a); \ } while (0) + #define CDEBUG_LIMIT CDEBUG -#else /* !__KERNEL__ && !LUSTRE_UTILS*/ +#else #define CDEBUG(mask, format, a...) \ do { \ - if (((mask) & (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE)) || \ - (portal_debug & (mask) && \ - portal_subsystem_debug & DEBUG_SUBSYSTEM)) \ + if (((mask) & D_CANTMASK) != 0) \ fprintf(stderr, "(%s:%d:%s()) " format, \ __FILE__, __LINE__, __FUNCTION__, ## a); \ } while (0) + #define CDEBUG_LIMIT CDEBUG #endif /* !__KERNEL__ */ @@ -204,6 +232,8 @@ do { \ #define LCONSOLE_ERROR(format, a...) CDEBUG_LIMIT(D_CONSOLE | D_ERROR, format, ## a) #define LCONSOLE_EMERG(format, a...) 
CDEBUG(D_CONSOLE | D_EMERG, format, ## a) +#if CDEBUG_ENABLED + #define GOTO(label, rc) \ do { \ long GOTO__ret = (long)(rc); \ @@ -212,8 +242,11 @@ do { \ (signed long)GOTO__ret); \ goto label; \ } while (0) +#else +#define GOTO(label, rc) do { ((void)(rc)); goto label; } while (0) +#endif -#define CDEBUG_ENTRY_EXIT 1 +#define CDEBUG_ENTRY_EXIT (1) #if CDEBUG_ENTRY_EXIT /* @@ -248,143 +281,65 @@ do { \ #endif /* !CDEBUG_ENTRY_EXIT */ - -#define LUSTRE_SRV_PTL_PID LUSTRE_PTL_PID - /* - * eeb cfg - * ecf6 - * ecfG + * Some (nomina odiosa sunt) platforms define NULL as naked 0. This confuses + * Lustre RETURN(NULL) macro. */ -#define PORTALS_CFG_VERSION 0xecf60001 - -struct portals_cfg { - __u32 pcfg_version; - __u32 pcfg_command; - - __u32 pcfg_nal; - __u32 pcfg_flags; - - __u32 pcfg_gw_nal; - __u32 pcfg_padding1; - - __u64 pcfg_nid; - __u64 pcfg_nid2; - __u64 pcfg_nid3; - __u32 pcfg_id; - __u32 pcfg_misc; - __u32 pcfg_fd; - __u32 pcfg_count; - __u32 pcfg_size; - __u32 pcfg_wait; - - __u32 pcfg_plen1; /* buffers in userspace */ - __u32 pcfg_plen2; /* buffers in userspace */ - __u32 pcfg_alloc_size; /* size of this allocated portals_cfg */ - char pcfg_pbuf[0]; -}; - -#define PCFG_INIT(pcfg, cmd) \ -do { \ - memset(&(pcfg), 0, sizeof((pcfg))); \ - (pcfg).pcfg_version = PORTALS_CFG_VERSION; \ - (pcfg).pcfg_command = (cmd); \ - \ -} while (0) +#if defined(NULL) +#undef NULL +#endif -#define PCFG_INIT_PBUF(pcfg, cmd, plen1, plen2) \ - do { \ - int bufsize = size_round(sizeof(*(pcfg))); \ - bufsize += size_round(plen1) + size_round(plen2); \ - PORTAL_ALLOC((pcfg), bufsize); \ - if ((pcfg)) { \ - memset((pcfg), 0, bufsize); \ - (pcfg)->pcfg_version = PORTALS_CFG_VERSION; \ - (pcfg)->pcfg_command = (cmd); \ - (pcfg)->pcfg_plen1 = (plen1); \ - (pcfg)->pcfg_plen2 = (plen2); \ - (pcfg)->pcfg_alloc_size = bufsize; \ - } \ - } while (0) - -#define PCFG_FREE_PBUF(pcfg) PORTAL_FREE((pcfg), (pcfg)->pcfg_alloc_size) - -#define PCFG_PBUF(pcfg, idx) \ - (0 == (idx) \ - ? ((char *)(pcfg) + size_round(sizeof(*(pcfg)))) \ - : (1 == (idx) \ - ? 
((char *)(pcfg) + size_round(sizeof(*(pcfg))) + size_round(pcfg->pcfg_plen1)) \ - : (NULL))) - -typedef int (nal_cmd_handler_fn)(struct portals_cfg *, void *); -int libcfs_nal_cmd_register(int nal, nal_cmd_handler_fn *handler, void *arg); -int libcfs_nal_cmd(struct portals_cfg *pcfg); -void libcfs_nal_cmd_unregister(int nal); - -struct portal_ioctl_data { - __u32 ioc_len; - __u32 ioc_version; - __u64 ioc_nid; - __u64 ioc_nid2; - __u64 ioc_nid3; - __u32 ioc_count; - __u32 ioc_nal; - __u32 ioc_nal_cmd; - __u32 ioc_fd; - __u32 ioc_id; - - __u32 ioc_flags; - __u32 ioc_size; - - __u32 ioc_wait; - __u32 ioc_timeout; - __u32 ioc_misc; - - __u32 ioc_inllen1; - char *ioc_inlbuf1; - __u32 ioc_inllen2; - char *ioc_inlbuf2; - - __u32 ioc_plen1; /* buffers in userspace */ - char *ioc_pbuf1; - __u32 ioc_plen2; /* buffers in userspace */ - char *ioc_pbuf2; - - char ioc_bulk[0]; -}; +#define NULL ((void *)0) +#define LUSTRE_SRV_LNET_PID LUSTRE_LNET_PID #ifdef __KERNEL__ #include +struct libcfs_ioctl_data; /* forward ref */ + struct libcfs_ioctl_handler { struct list_head item; - int (*handle_ioctl)(struct portal_ioctl_data *data, - unsigned int cmd, unsigned long args); + int (*handle_ioctl)(unsigned int cmd, struct libcfs_ioctl_data *data); }; -#define DECLARE_IOCTL_HANDLER(ident, func) \ - struct libcfs_ioctl_handler ident = { \ - .item = CFS_LIST_HEAD_INIT(ident.item), \ - .handle_ioctl = func \ +#define DECLARE_IOCTL_HANDLER(ident, func) \ + struct libcfs_ioctl_handler ident = { \ + /* .item = */ CFS_LIST_HEAD_INIT(ident.item), \ + /* .handle_ioctl = */ func \ } int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand); int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand); +/* libcfs tcpip */ +#define LNET_ACCEPTOR_MIN_RESERVED_PORT 512 +#define LNET_ACCEPTOR_MAX_RESERVED_PORT 1023 + +int libcfs_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask); +int libcfs_ipif_enumerate(char ***names); +void libcfs_ipif_free_enumeration(char **names, int n); +int libcfs_sock_listen(cfs_socket_t **sockp, __u32 ip, int port, int backlog); +int libcfs_sock_accept(cfs_socket_t **newsockp, cfs_socket_t *sock); +void libcfs_sock_abort_accept(cfs_socket_t *sock); +int libcfs_sock_connect(cfs_socket_t **sockp, int *fatal, + __u32 local_ip, int local_port, + __u32 peer_ip, int peer_port); +int libcfs_sock_setbuf(cfs_socket_t *socket, int txbufsize, int rxbufsize); +int libcfs_sock_getbuf(cfs_socket_t *socket, int *txbufsize, int *rxbufsize); +int libcfs_sock_getaddr(cfs_socket_t *socket, int remote, __u32 *ip, int *port); +int libcfs_sock_write(cfs_socket_t *sock, void *buffer, int nob, int timeout); +int libcfs_sock_read(cfs_socket_t *sock, void *buffer, int nob, int timeout); +void libcfs_sock_release(cfs_socket_t *sock); + /* libcfs watchdogs */ struct lc_watchdog; -/* Just use the default handler (dumplog) */ -#define LC_WATCHDOG_DEFAULT_CB NULL - /* Add a watchdog which fires after "time" milliseconds of delay. You have to * touch it once to enable it. */ struct lc_watchdog *lc_watchdog_add(int time, - void (*cb)(struct lc_watchdog *, - struct task_struct *, - void *), + void (*cb)(pid_t pid, void *), void *data); /* Enables a watchdog and resets its timer. 
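(An aside, not part of the patch: the expected lifecycle with the new
pid-based callback signature; the 30s period is arbitrary, and the
lc_watchdog_touch() name is assumed -- its declaration is outside this hunk:

        struct lc_watchdog *wd;

        wd = lc_watchdog_add(30000, lc_watchdog_dumplog, NULL);  // ms
        lc_watchdog_touch(wd);          // enable + rearm before each item
        do_request();                   // hypothetical slow work
        lc_watchdog_disable(wd);        // quiesce between items
        lc_watchdog_delete(wd);
)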
*/ @@ -397,9 +352,7 @@ void lc_watchdog_disable(struct lc_watchdog *lcw); void lc_watchdog_delete(struct lc_watchdog *lcw); /* Dump a debug log */ -void lc_watchdog_dumplog(struct lc_watchdog *lcw, - struct task_struct *tsk, - void *data); +void lc_watchdog_dumplog(pid_t pid, void *data); /* __KERNEL__ */ #endif @@ -452,7 +405,25 @@ static inline time_t cfs_unix_seconds(void) cfs_fs_time_t t; cfs_fs_time_current(&t); - return cfs_fs_time_sec(&t); + return (time_t)cfs_fs_time_sec(&t); +} + +static inline cfs_time_t cfs_time_shift(int seconds) +{ + return cfs_time_add(cfs_time_current(), cfs_time_seconds(seconds)); +} + +static inline long cfs_timeval_sub(struct timeval *large, struct timeval *small, + struct timeval *result) +{ + long r = (long) ( + (large->tv_sec - small->tv_sec) * ONE_MILLION + + (large->tv_usec - small->tv_usec)); + if (result != NULL) { + result->tv_usec = r / ONE_MILLION; + result->tv_sec = r; + } + return r; } #define CFS_RATELIMIT(seconds) \ @@ -472,10 +443,57 @@ static inline time_t cfs_unix_seconds(void) result; \ }) -extern void portals_debug_msg(int subsys, int mask, char *file, const char *fn, - const int line, unsigned long stack, - char *format, ...) - __attribute__ ((format (printf, 7, 8))); +struct libcfs_debug_msg_data { + cfs_debug_limit_state_t *msg_cdls; + int msg_subsys; + const char *msg_file; + const char *msg_fn; + int msg_line; +}; + +#define DEBUG_MSG_DATA_INIT(cdls, subsystem, file, func, ln ) { \ + .msg_cdls = (cdls), \ + .msg_subsys = (subsystem), \ + .msg_file = (file), \ + .msg_fn = (func), \ + .msg_line = (ln) \ + } + + +extern int libcfs_debug_vmsg2(cfs_debug_limit_state_t *cdls, + int subsys, int mask, + const char *file, const char *fn, const int line, + const char *format1, va_list args, + const char *format2, ...); + +#define libcfs_debug_vmsg(cdls, subsys, mask, file, fn, line, format, args) \ + libcfs_debug_vmsg2(cdls, subsys, mask, file, fn, line, format, args, NULL, NULL) + +#define libcfs_debug_msg(cdls, subsys, mask, file, fn, line, format, a...) \ + libcfs_debug_vmsg2(cdls, subsys, mask, file, fn, line, NULL, NULL, format, ##a) + +#define cdebug_va(cdls, mask, file, func, line, fmt, args) do { \ + CHECK_STACK(); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + libcfs_debug_vmsg(cdls, DEBUG_SUBSYSTEM, (mask), \ + (file), (func), (line), fmt, args); \ +} while(0); + +#define cdebug(cdls, mask, file, func, line, fmt, a...) 
do { \ + CHECK_STACK(); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + libcfs_debug_msg(cdls, DEBUG_SUBSYSTEM, (mask), \ + (file), (func), (line), fmt, ## a); \ +} while(0); + +extern void libcfs_assertion_failed(const char *expr, const char *file, + const char *fn, const int line); static inline void cfs_slow_warning(cfs_time_t now, int seconds, char *msg) { @@ -490,10 +508,10 @@ static inline void cfs_slow_warning(cfs_time_t now, int seconds, char *msg) */ static inline void cfs_fs_timeval(struct timeval *tv) { - cfs_fs_time_t time; + cfs_fs_time_t time; - cfs_fs_time_current(&time); - cfs_fs_time_usec(&time, tv); + cfs_fs_time_current(&time); + cfs_fs_time_usec(&time, tv); } /* @@ -502,13 +520,13 @@ static inline void cfs_fs_timeval(struct timeval *tv) */ static inline cfs_duration_t cfs_timeout_cap(cfs_duration_t timeout) { - if (timeout < cfs_time_minimal_timeout()) - timeout = cfs_time_minimal_timeout(); - return timeout; + if (timeout < CFS_TICK) + timeout = CFS_TICK; + return timeout; } /* - * Portable memory allocator API (draft) + * Universal memory allocator API */ enum cfs_alloc_flags { /* allocation is not allowed to block */ @@ -522,27 +540,124 @@ enum cfs_alloc_flags { CFS_ALLOC_FS = (1 << 3), /* allocation is allowed to do io to free/clean memory */ CFS_ALLOC_IO = (1 << 4), + /* don't report allocation failure to the console */ + CFS_ALLOC_NOWARN = (1 << 5), /* standard allocator flag combination */ CFS_ALLOC_STD = CFS_ALLOC_FS | CFS_ALLOC_IO, CFS_ALLOC_USER = CFS_ALLOC_WAIT | CFS_ALLOC_FS | CFS_ALLOC_IO, }; -#define CFS_SLAB_ATOMIC CFS_ALLOC_ATOMIC -#define CFS_SLAB_WAIT CFS_ALLOC_WAIT -#define CFS_SLAB_ZERO CFS_ALLOC_ZERO -#define CFS_SLAB_FS CFS_ALLOC_FS -#define CFS_SLAB_IO CFS_ALLOC_IO -#define CFS_SLAB_STD CFS_ALLOC_STD -#define CFS_SLAB_USER CFS_ALLOC_USER - /* flags for cfs_page_alloc() in addition to enum cfs_alloc_flags */ -enum cfs_page_alloc_flags { +enum cfs_alloc_page_flags { /* allow to return page beyond KVM. It has to be mapped into KVM by * cfs_page_map(); */ CFS_ALLOC_HIGH = (1 << 5), CFS_ALLOC_HIGHUSER = CFS_ALLOC_WAIT | CFS_ALLOC_FS | CFS_ALLOC_IO | CFS_ALLOC_HIGH, }; +/* + * portable UNIX device file identification. (This is not _very_ + * portable. Probably makes no sense for Windows.) + */ +/* + * Platform defines + * + * cfs_rdev_t + */ + +typedef unsigned int cfs_major_nr_t; +typedef unsigned int cfs_minor_nr_t; + +/* + * Defined by platform. + */ +cfs_rdev_t cfs_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor); +cfs_major_nr_t cfs_rdev_major(cfs_rdev_t rdev); +cfs_minor_nr_t cfs_rdev_minor(cfs_rdev_t rdev); + +/* + * Generic on-wire rdev format. + */ + +typedef __u32 cfs_wire_rdev_t; + +cfs_wire_rdev_t cfs_wire_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor); +cfs_major_nr_t cfs_wire_rdev_major(cfs_wire_rdev_t rdev); +cfs_minor_nr_t cfs_wire_rdev_minor(cfs_wire_rdev_t rdev); + +/* + * Drop into debugger, if possible. Implementation is provided by platform. 
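(An aside, not part of the patch: the cdebug()/cdebug_va() wrappers above
exist for callers that captured a code location earlier, e.g. via
DEBUG_MSG_DATA_INIT(); a hypothetical use, with "elapsed" assumed:

        struct libcfs_debug_msg_data m =
                DEBUG_MSG_DATA_INIT(NULL, DEBUG_SUBSYSTEM,
                                    __FILE__, __FUNCTION__, __LINE__);

        cdebug(m.msg_cdls, D_WARNING, m.msg_file, m.msg_fn, m.msg_line,
               "request took %lu seconds\n", elapsed);
)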
+ */ + +void cfs_enter_debugger(void); + +/* + * Defined by platform + */ +void cfs_daemonize(char *str); +int cfs_daemonize_ctxt(char *str); +cfs_sigset_t cfs_get_blocked_sigs(void); +cfs_sigset_t cfs_block_allsigs(void); +cfs_sigset_t cfs_block_sigs(cfs_sigset_t bits); +void cfs_restore_sigs(cfs_sigset_t); +int cfs_signal_pending(void); +void cfs_clear_sigpending(void); +/* + * XXX Liang: + * these macros should be removed in the future, + * we keep them just for keeping libcfs compatible + * with other branches. + */ +#define libcfs_daemonize(s) cfs_daemonize(s) +#define cfs_sigmask_lock(f) do { f= 0; } while (0) +#define cfs_sigmask_unlock(f) do { f= 0; } while (0) + +int convert_server_error(__u64 ecode); +int convert_client_oflag(int cflag, int *result); + +/* + * Stack-tracing filling. + */ + +/* + * Platform-dependent data-type to hold stack frames. + */ +struct cfs_stack_trace; + +/* + * Fill @trace with current back-trace. + */ +void cfs_stack_trace_fill(struct cfs_stack_trace *trace); + +/* + * Return instruction pointer for frame @frame_no. NULL if @frame_no is + * invalid. + */ +void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no); + +/* + * Universal open flags. + */ +#define CFS_O_ACCMODE 0003 +#define CFS_O_CREAT 0100 +#define CFS_O_EXCL 0200 +#define CFS_O_NOCTTY 0400 +#define CFS_O_TRUNC 01000 +#define CFS_O_APPEND 02000 +#define CFS_O_NONBLOCK 04000 +#define CFS_O_NDELAY CFS_O_NONBLOCK +#define CFS_O_SYNC 010000 +#define CFS_O_ASYNC 020000 +#define CFS_O_DIRECT 040000 +#define CFS_O_LARGEFILE 0100000 +#define CFS_O_DIRECTORY 0200000 +#define CFS_O_NOFOLLOW 0400000 +#define CFS_O_NOATIME 01000000 + +/* convert local open flags to universal open flags */ +int cfs_oflags2univ(int flags); +/* convert universal open flags to local open flags */ +int cfs_univ2oflags(int flags); #define _LIBCFS_H diff --git a/lnet/include/libcfs/linux/Makefile.am b/lnet/include/libcfs/linux/Makefile.am index 159cf57..072a7ad 100644 --- a/lnet/include/libcfs/linux/Makefile.am +++ b/lnet/include/libcfs/linux/Makefile.am @@ -1,3 +1,3 @@ EXTRA_DIST := kp30.h libcfs.h linux-fs.h linux-lock.h linux-mem.h \ - linux-prim.h linux-time.h lltrace.h portals_compat25.h \ - portals_lib.h portals_utils.h + linux-prim.h linux-time.h linux-tcpip.h lltrace.h \ + portals_compat25.h portals_utils.h diff --git a/lnet/include/libcfs/linux/kp30.h b/lnet/include/libcfs/linux/kp30.h index d2329ba..be2cd34 100644 --- a/lnet/include/libcfs/linux/kp30.h +++ b/lnet/include/libcfs/linux/kp30.h @@ -9,7 +9,9 @@ #endif #ifdef __KERNEL__ +#ifdef HAVE_KERNEL_CONFIG_H # include +#endif # include # include # include @@ -30,7 +32,7 @@ # include # include # include -# include +# include # include # include # include @@ -39,11 +41,13 @@ # include # include # include +# include # ifdef HAVE_MM_INLINE # include # endif # if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) # include +# include # endif #include @@ -88,28 +92,9 @@ static inline void our_cond_resched(void) #else #define LASSERT_SPIN_LOCKED(lock) do {} while(0) #endif +#define LASSERT_SEM_LOCKED(sem) LASSERT(down_trylock(sem) != 0) -#ifdef __arch_um__ -#define LBUG_WITH_LOC(file, func, line) \ -do { \ - CEMERG("LBUG - trying to dump log to /tmp/lustre-log\n"); \ - portals_catastrophe = 1; \ - portals_debug_dumplog(); \ - portals_run_lbug_upcall(file, func, line); \ - panic("LBUG"); \ -} while (0) -#else -#define LBUG_WITH_LOC(file, func, line) \ -do { \ - CEMERG("LBUG\n"); \ - portals_catastrophe = 1; \ - portals_debug_dumpstack(NULL); \ - 
portals_debug_dumplog(); \ - portals_run_lbug_upcall(file, func, line); \ - set_task_state(current, TASK_UNINTERRUPTIBLE); \ - schedule(); \ -} while (0) -#endif /* __arch_um__ */ +#define LIBCFS_PANIC(msg) panic(msg) /* ------------------------------------------------------------------- */ @@ -137,6 +122,24 @@ do { \ #endif /******************************************************************************/ +/* Module parameter support */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +# define CFS_MODULE_PARM(name, t, type, perm, desc) \ + MODULE_PARM(name, t);\ + MODULE_PARM_DESC(name, desc) + +#else +# define CFS_MODULE_PARM(name, t, type, perm, desc) \ + module_param(name, type, perm);\ + MODULE_PARM_DESC(name, desc) +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9)) +# define CFS_SYSFS_MODULE_PARM 0 /* no sysfs module parameters */ +#else +# define CFS_SYSFS_MODULE_PARM 1 /* module parameters accessible via sysfs */ +#endif +/******************************************************************************/ #if (__GNUC__) /* Use the special GNU C __attribute__ hack to have the compiler check the @@ -157,14 +160,22 @@ do { \ #else /* !__KERNEL__ */ # include # include -#ifndef __CYGWIN__ -# include -#else +#ifdef CRAY_XT3 +# include +#elif defined(__CYGWIN__) # include +#else +# include #endif # include # include # include +# include +# include /* for _IOWR */ + +# define CFS_MODULE_PARM(name, t, type, perm, desc) +#define PORTAL_SYMBOL_GET(x) inter_module_get(#x) +#define PORTAL_SYMBOL_PUT(x) inter_module_put(#x) #endif /* End of !__KERNEL__ */ @@ -175,7 +186,7 @@ do { \ #define LWT_MEMORY (16<<20) -#if !KLWT_SUPPORT +#ifndef KLWT_SUPPORT # if defined(__KERNEL__) # if !defined(BITS_PER_LONG) # error "BITS_PER_LONG not defined" @@ -225,7 +236,7 @@ extern lwt_cpu_t lwt_cpus[]; #define LWTSTR(n) #n #define LWTWHERE(f,l) f ":" LWTSTR(l) -#define LWT_EVENTS_PER_PAGE (PAGE_SIZE / sizeof (lwt_event_t)) +#define LWT_EVENTS_PER_PAGE (CFS_PAGE_SIZE / sizeof (lwt_event_t)) #define LWT_EVENT(p1, p2, p3, p4) \ do { \ @@ -276,7 +287,7 @@ extern int lwt_snapshot (cycles_t *now, int *ncpu, int *total_size, /* ------------------------------------------------------------------ */ -#define IOCTL_PORTAL_TYPE long +#define IOCTL_LIBCFS_TYPE long #ifdef __CYGWIN__ # ifndef BITS_PER_LONG @@ -298,23 +309,26 @@ extern int lwt_snapshot (cycles_t *now, int *ncpu, int *total_size, # define LP_POISON ((void *)(long)0x5a5a5a5a) #endif -#if defined(__x86_64__) && defined(__KERNEL__) +#if (defined(__x86_64__) && defined(__KERNEL__)) /* x86_64 defines __u64 as "long" in userspace, but "long long" in the kernel */ # define LPU64 "%Lu" # define LPD64 "%Ld" # define LPX64 "%#Lx" +# define LPF64 "L" # define LPSZ "%lu" # define LPSSZ "%ld" #elif (BITS_PER_LONG == 32 || __WORDSIZE == 32) # define LPU64 "%Lu" # define LPD64 "%Ld" # define LPX64 "%#Lx" +# define LPF64 "L" # define LPSZ "%u" # define LPSSZ "%d" #elif (BITS_PER_LONG == 64 || __WORDSIZE == 64) # define LPU64 "%lu" # define LPD64 "%ld" # define LPX64 "%#lx" +# define LPF64 "l" # define LPSZ "%lu" # define LPSSZ "%ld" #endif diff --git a/lnet/include/libcfs/linux/libcfs.h b/lnet/include/libcfs/linux/libcfs.h index 1e82343..0aac919 100644 --- a/lnet/include/libcfs/linux/libcfs.h +++ b/lnet/include/libcfs/linux/libcfs.h @@ -8,28 +8,18 @@ #error Do not #include this file directly. 
#include instead #endif +#include #include #include #include #include #include +#include #ifdef HAVE_ASM_TYPES_H #include #else -/* this is actually coming from within lustre, a layering violation. - * we may not even need it, as libuptlctl (the dependency for which it - * is needed in liblustre building on catamount, bug 6923) shows no - * apparent need to be included in liblustre AFAICS. The change of - * include to lustre/types.h only makes this explicit instead of implicit. - * To be resolved. For now, make it CRAY_PORTALS only, to avoid breaking - * non-b1_4 branches that don't have this file. - */ -# if CRAY_PORTALS -# include -# else -# include "types.h" -# endif +#include #endif @@ -99,49 +89,40 @@ struct ptldebug_header { #define LUSTRE_TRACE_SIZE (THREAD_SIZE >> 5) -#ifdef __KERNEL__ +#if defined(__KERNEL__) && !defined(__x86_64__) # ifdef __ia64__ -# define CDEBUG_STACK (THREAD_SIZE - \ - ((unsigned long)__builtin_dwarf_cfa() & \ - (THREAD_SIZE - 1))) +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_dwarf_cfa() & \ + (THREAD_SIZE - 1))) # else -# define CDEBUG_STACK (THREAD_SIZE - \ - ((unsigned long)__builtin_frame_address(0) & \ - (THREAD_SIZE - 1))) +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((unsigned long)__builtin_frame_address(0) & \ + (THREAD_SIZE - 1))) # endif /* __ia64__ */ -#define CHECK_STACK(stack) \ - do { \ - if ((stack) > 3*THREAD_SIZE/4 && (stack) > portal_stack) { \ - portals_debug_msg(DEBUG_SUBSYSTEM, D_WARNING, \ - __FILE__, __FUNCTION__, __LINE__, \ - (stack),"maximum lustre stack %u\n",\ - portal_stack = (stack)); \ - /*panic("LBUG");*/ \ - } \ - } while (0) +#define __CHECK_STACK(file, func, line) \ +do { \ + unsigned long _stack = CDEBUG_STACK(); \ + \ + if (_stack > 3*THREAD_SIZE/4 && _stack > libcfs_stack) { \ + libcfs_stack = _stack; \ + libcfs_debug_msg(NULL, DEBUG_SUBSYSTEM, D_WARNING, \ + file, func, line, \ + "maximum lustre stack %lu\n", _stack); \ + /*panic("LBUG");*/ \ + } \ +} while (0) +#define CHECK_STACK() __CHECK_STACK(__FILE__, __func__, __LINE__) #else /* !__KERNEL__ */ -#define CHECK_STACK(stack) do { } while(0) -#define CDEBUG_STACK (0L) +#define __CHECK_STACK(X, Y, Z) do { } while(0) +#define CHECK_STACK() do { } while(0) +#define CDEBUG_STACK() (0L) #endif /* __KERNEL__ */ /* initial pid */ -# if CRAY_PORTALS -/* - * 1) ptl_pid_t in cray portals is only 16 bits, not 32 bits, therefore this - * is too big. - * - * 2) the implementation of ernal in cray portals further restricts the pid - * space that may be used to 0 <= pid <= 255 (an 8 bit value). Returns - * an error at nal init time for any pid outside this range. Other nals - * in cray portals don't have this restriction. - * */ -#define LUSTRE_PTL_PID 9 -# else -#define LUSTRE_PTL_PID 12345 -# endif +#define LUSTRE_LNET_PID 12345 -#define ENTRY_NESTING_SUPPORT (0) +#define ENTRY_NESTING_SUPPORT (1) #define ENTRY_NESTING do {;} while (0) #define EXIT_NESTING do {;} while (0) #define __current_nesting_level() (0) @@ -160,4 +141,17 @@ typedef kernel_cap_t cfs_kernel_cap_t; typedef __u32 cfs_kernel_cap_t; #endif +#if defined(__KERNEL__) +/* + * No stack-back-tracing in Linux for now. 
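(Worked example of the reworked stack check above, not part of the patch:
with THREAD_SIZE = 8192 and a frame 0x600 bytes above the stack base,

        CDEBUG_STACK() = 8192 - 0x600 = 6656 bytes in use

6656 exceeds 3*THREAD_SIZE/4 = 6144, so __CHECK_STACK() records the new
maximum in libcfs_stack and logs "maximum lustre stack"; note the check is
now compiled out on x86_64 as well as in userspace.)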
+ */ +struct cfs_stack_trace { +}; + +#ifndef WITH_WATCHDOG +#define WITH_WATCHDOG +#endif + +#endif + #endif /* _LINUX_LIBCFS_H */ diff --git a/lnet/include/libcfs/linux/linux-fs.h b/lnet/include/libcfs/linux/linux-fs.h index 9530360..3ba5461 100644 --- a/lnet/include/libcfs/linux/linux-fs.h +++ b/lnet/include/libcfs/linux/linux-fs.h @@ -33,7 +33,16 @@ #include #include #include -#endif +#else /* !__KERNEL__ */ +#include +#include +#include +#include +#include +#include +#include +#include +#endif /* __KERNEL__ */ typedef struct file cfs_file_t; typedef struct dentry cfs_dentry_t; @@ -55,15 +64,23 @@ cfs_file_t *cfs_filp_open (const char *name, int flags, int mode, int *err); #define cfs_put_file(f) fput(f) #define cfs_file_count(f) file_count(f) -typedef struct file_lock cfs_flock_t; -#define CFS_FLOCK_TYPE(fl) ((fl)->fl_type) -#define CFS_FLOCK_SET_TYPE(fl, type) do { (fl)->fl_type = (type); } while(0) -#define CFS_FLOCK_PID(fl) ((fl)->fl_pid) -#define CFS_FLOCK_SET_PID(fl, pid) do { (fl)->fl_pid = (pid); } while(0) -#define CFS_FLOCK_START(fl) ((fl)->fl_start) -#define CFS_FLOCK_SET_START(fl, start) do { (fl)->fl_start = (start); } while(0) -#define CFS_FLOCK_END(fl) ((fl)->fl_end) -#define CFS_FLOCK_SET_END(fl, end) do { (fl)->fl_end = (end); } while(0) +typedef struct file_lock cfs_flock_t; +#define cfs_flock_type(fl) ((fl)->fl_type) +#define cfs_flock_set_type(fl, type) do { (fl)->fl_type = (type); } while(0) +#define cfs_flock_pid(fl) ((fl)->fl_pid) +#define cfs_flock_set_pid(fl, pid) do { (fl)->fl_pid = (pid); } while(0) +#define cfs_flock_start(fl) ((fl)->fl_start) +#define cfs_flock_set_start(fl, start) do { (fl)->fl_start = (start); } while(0) +#define cfs_flock_end(fl) ((fl)->fl_end) +#define cfs_flock_set_end(fl, end) do { (fl)->fl_end = (end); } while(0) + +ssize_t cfs_user_write (cfs_file_t *filp, const char *buf, size_t count, loff_t *offset); + +/* + * portable UNIX device file identification. 
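(On Linux this is plain dev_t, so the generic helpers declared in
libcfs.h are expected to round-trip -- an assumed sketch, device numbers
arbitrary:

        cfs_rdev_t rdev = cfs_rdev_build(8, 1);   // e.g. block dev 8:1
        LASSERT(cfs_rdev_major(rdev) == 8);
        LASSERT(cfs_rdev_minor(rdev) == 1);
)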
+ */ + +typedef dev_t cfs_rdev_t; #endif diff --git a/lnet/include/libcfs/linux/linux-lock.h b/lnet/include/libcfs/linux/linux-lock.h index ce097e9..f419c9b 100644 --- a/lnet/include/libcfs/linux/linux-lock.h +++ b/lnet/include/libcfs/linux/linux-lock.h @@ -74,7 +74,7 @@ */ /* - * mutex_t: + * mutex: * * - init_mutex(x) * - init_mutex_locked(x) @@ -94,24 +94,10 @@ * - wait_for_completion(c) */ -/* - * OSX funnels: - * - * No funnels needed in Linux - */ -#define CFS_DECL_FUNNEL_DATA -#define CFS_DECL_CONE_DATA DECLARE_FUNNEL_DATA -#define CFS_DECL_NET_DATA DECLARE_FUNNEL_DATA -#define CFS_CONE_IN do {} while(0) -#define CFS_CONE_EX do {} while(0) - -#define CFS_NET_IN do {} while(0) -#define CFS_NET_EX do {} while(0) - /* __KERNEL__ */ #else -//#include "../user-lock.h" +#include "../user-lock.h" /* __KERNEL__ */ #endif diff --git a/lnet/include/libcfs/linux/linux-mem.h b/lnet/include/libcfs/linux/linux-mem.h index 94b764f..7591213 100644 --- a/lnet/include/libcfs/linux/linux-mem.h +++ b/lnet/include/libcfs/linux/linux-mem.h @@ -41,15 +41,17 @@ typedef struct page cfs_page_t; #define CFS_PAGE_SIZE PAGE_CACHE_SIZE #define CFS_PAGE_SHIFT PAGE_CACHE_SHIFT -#define CFS_PAGE_MASK PAGE_CACHE_MASK +#define CFS_PAGE_MASK (~((__u64)CFS_PAGE_SIZE-1)) -cfs_page_t *cfs_alloc_pages(unsigned int flags, unsigned int order); -#define cfs_alloc_page(f) cfs_alloc_pages(f, 0) -#define cfs_free_pages(p, o) __free_pages(p, o) +cfs_page_t *cfs_alloc_page(unsigned int flags); #define cfs_free_page(p) __free_pages(p, 0) static inline void *cfs_page_address(cfs_page_t *page) { + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ return page_address(page); } @@ -73,13 +75,11 @@ static inline int cfs_page_count(cfs_page_t *page) return page_count(page); } -static inline void cfs_set_page_count(cfs_page_t *page, int v) -{ - set_page_count(page, v); -} +#define cfs_page_index(p) ((p)->index) /* * Memory allocator + * XXX Liang: move these declare to public file */ extern void *cfs_alloc(size_t nr_bytes, u_int32_t flags); extern void cfs_free(void *addr); @@ -88,12 +88,17 @@ extern void *cfs_alloc_large(size_t nr_bytes); extern void cfs_free_large(void *addr); /* + * In Linux there is no way to determine whether current execution context is + * blockable. 
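(Hence the define below simply falls back to CFS_ALLOC_ATOMIC; a platform
that can detect a blockable context could map the same flag to a friendlier
allocation mode. Illustration, not part of the patch:

        // hypothetical caller that may run from interrupt context
        buf = cfs_alloc(size, CFS_ALLOC_ATOMIC_TRY);
        if (buf == NULL)
                return -ENOMEM;
)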
+ */ +#define CFS_ALLOC_ATOMIC_TRY CFS_ALLOC_ATOMIC + +/* * SLAB allocator + * XXX Liang: move these declare to public file */ typedef kmem_cache_t cfs_mem_cache_t; -extern cfs_mem_cache_t * cfs_mem_cache_create (const char *, size_t, size_t, unsigned long, - void (*)(void *, cfs_mem_cache_t *, unsigned long), - void (*)(void *, cfs_mem_cache_t *, unsigned long)); +extern cfs_mem_cache_t * cfs_mem_cache_create (const char *, size_t, size_t, unsigned long); extern int cfs_mem_cache_destroy ( cfs_mem_cache_t * ); extern void *cfs_mem_cache_alloc ( cfs_mem_cache_t *, int); extern void cfs_mem_cache_free ( cfs_mem_cache_t *, void *); @@ -104,6 +109,12 @@ extern void cfs_mem_cache_free ( cfs_mem_cache_t *, void *); #define CFS_MMSPACE_OPEN do { __oldfs = get_fs(); set_fs(get_ds());} while(0) #define CFS_MMSPACE_CLOSE set_fs(__oldfs) +#else /* !__KERNEL__ */ +#ifdef HAVE_ASM_PAGE_H +#include /* needed for PAGE_SIZE - rread */ +#endif + +#include /* __KERNEL__ */ #endif diff --git a/lnet/include/libcfs/linux/linux-prim.h b/lnet/include/libcfs/linux/linux-prim.h index 69bda36..41eeb8a 100644 --- a/lnet/include/libcfs/linux/linux-prim.h +++ b/lnet/include/libcfs/linux/linux-prim.h @@ -30,7 +30,9 @@ #endif #ifdef __KERNEL__ +#ifdef HAVE_KERNEL_CONFIG_H #include +#endif #include #include #include @@ -57,8 +59,27 @@ typedef struct miscdevice cfs_psdev_t; typedef struct ctl_table cfs_sysctl_table_t; typedef struct ctl_table_header cfs_sysctl_table_header_t; -#define register_cfs_sysctl_table(t, a) register_sysctl_table(t, a) -#define unregister_cfs_sysctl_table(t) unregister_sysctl_table(t, a) +#define cfs_register_sysctl_table(t, a) register_sysctl_table(t, a) +#define cfs_unregister_sysctl_table(t) unregister_sysctl_table(t, a) + +/* + * Symbol register + */ +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#define cfs_symbol_register(s, p) inter_module_register(s, THIS_MODULE, p) +#define cfs_symbol_unregister(s) inter_module_unregister(s) +#define cfs_symbol_get(s) inter_module_get(s) +#define cfs_symbol_put(s) inter_module_put(s) +#define cfs_module_get() MOD_INC_USE_COUNT +#define cfs_module_put() MOD_DEC_USE_COUNT +#else +#define cfs_symbol_register(s, p) do {} while(0) +#define cfs_symbol_unregister(s) do {} while(0) +#define cfs_symbol_get(s) symbol_get(s) +#define cfs_symbol_put(s) symbol_put(s) +#define cfs_module_get() try_module_get(THIS_MODULE) +#define cfs_module_put() module_put(THIS_MODULE) +#endif /* * Proc file system APIs @@ -73,21 +94,28 @@ typedef struct proc_dir_entry cfs_proc_dir_entry_t; /* * Wait Queue */ +#define CFS_TASK_INTERRUPTIBLE TASK_INTERRUPTIBLE +#define CFS_TASK_UNINT TASK_UNINTERRUPTIBLE + typedef wait_queue_t cfs_waitlink_t; typedef wait_queue_head_t cfs_waitq_t; -#define cfs_waitq_init(w) init_waitqueue_head(w) -#define cfs_waitlink_init(l) init_waitqueue_entry(l, current) -#define cfs_waitq_add(w, l) add_wait_queue(w, l) -#define cfs_waitq_add_exclusive(w, l) add_wait_queue_exclusive(w, l) +typedef long cfs_task_state_t; + +#define cfs_waitq_init(w) init_waitqueue_head(w) +#define cfs_waitlink_init(l) init_waitqueue_entry(l, current) +#define cfs_waitq_add(w, l) add_wait_queue(w, l) +#define cfs_waitq_add_exclusive(w, l) add_wait_queue_exclusive(w, l) #define cfs_waitq_forward(l, w) do {} while(0) -#define cfs_waitq_del(w, l) remove_wait_queue(w, l) -#define cfs_waitq_active(w) waitqueue_active(w) -#define cfs_waitq_signal(w) wake_up(w) -#define cfs_waitq_signal_nr(w,n) wake_up_nr(w, n) -#define cfs_waitq_broadcast(w) wake_up_all(w) -#define cfs_waitq_wait(l) 
schedule() -#define cfs_waitq_timedwait(l, t) schedule_timeout(t) +#define cfs_waitq_del(w, l) remove_wait_queue(w, l) +#define cfs_waitq_active(w) waitqueue_active(w) +#define cfs_waitq_signal(w) wake_up(w) +#define cfs_waitq_signal_nr(w,n) wake_up_nr(w, n) +#define cfs_waitq_broadcast(w) wake_up_all(w) +#define cfs_waitq_wait(l, s) schedule() +#define cfs_waitq_timedwait(l, s, t) schedule_timeout(t) +#define cfs_schedule_timeout(s, t) schedule_timeout(t) +#define cfs_schedule() schedule() /* Kernel thread */ typedef int (*cfs_thread_t)(void *); @@ -98,6 +126,8 @@ typedef int (*cfs_thread_t)(void *); */ typedef struct task_struct cfs_task_t; #define cfs_current() current +#define cfs_task_lock(t) task_lock(t) +#define cfs_task_unlock(t) task_unlock(t) #define CFS_DECL_JOURNAL_DATA void *journal_info #define CFS_PUSH_JOURNAL do { \ journal_info = current->journal_info; \ @@ -115,14 +145,7 @@ module_exit(fini) /* * Signal */ -#define cfs_sigmask_lock(t, f) SIGNAL_MASK_LOCK(t, f) -#define cfs_sigmask_unlock(t, f) SIGNAL_MASK_UNLOCK(t, f) -#define cfs_recalc_sigpending(t) RECALC_SIGPENDING -#define cfs_signal_pending(t) signal_pending(t) -#define cfs_sigfillset(s) sigfillset(s) - -#define cfs_set_sig_blocked(t, b) do { (t)->blocked = b; } while(0) -#define cfs_get_sig_blocked(t) (&(t)->blocked) +typedef sigset_t cfs_sigset_t; /* * Timer @@ -164,8 +187,17 @@ static inline cfs_time_t cfs_timer_deadline(cfs_timer_t *t) return t->expires; } + +/* deschedule for a bit... */ +static inline void cfs_pause(cfs_duration_t ticks) +{ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(ticks); +} + #else /* !__KERNEL__ */ +typedef struct proc_dir_entry cfs_proc_dir_entry_t; #include "../user-prim.h" #endif /* __KERNEL__ */ diff --git a/lnet/include/libcfs/linux/linux-tcpip.h b/lnet/include/libcfs/linux/linux-tcpip.h new file mode 100644 index 0000000..2d14904 --- /dev/null +++ b/lnet/include/libcfs/linux/linux-tcpip.h @@ -0,0 +1,62 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Basic library routines. + * + */ + +#ifndef __LIBCFS_LINUX_CFS_TCP_H__ +#define __LIBCFS_LINUX_CFS_TCP_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. 
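cfs_pause() above gives kernel code a portable "deschedule for a bit" primitive. A hedged sketch of a polling loop built on it (my_wait_for() and the flag are invented):

static void my_wait_for(volatile int *flag)
{
        while (!*flag)                          /* poll ~10 times a second */
                cfs_pause(cfs_time_seconds(1) / 10);
}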
#include instead +#endif + +#ifdef __KERNEL__ +#include + +typedef struct socket cfs_socket_t; + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72)) +# define sk_allocation allocation +# define sk_data_ready data_ready +# define sk_write_space write_space +# define sk_user_data user_data +# define sk_prot prot +# define sk_sndbuf sndbuf +# define sk_rcvbuf rcvbuf +# define sk_socket socket +# define sk_sleep sleep +#endif + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) +# define sk_wmem_queued wmem_queued +# define sk_err err +# define sk_route_caps route_caps +#endif + +#define SOCK_SNDBUF(so) ((so)->sk->sk_sndbuf) +#define SOCK_WMEM_QUEUED(so) ((so)->sk->sk_wmem_queued) +#define SOCK_ERROR(so) ((so)->sk->sk_err) +#define SOCK_TEST_NOSPACE(so) test_bit(SOCK_NOSPACE, &(so)->flags) + +#endif + +#endif diff --git a/lnet/include/libcfs/linux/linux-time.h b/lnet/include/libcfs/linux/linux-time.h index f18e7d9..7135218 100644 --- a/lnet/include/libcfs/linux/linux-time.h +++ b/lnet/include/libcfs/linux/linux-time.h @@ -64,19 +64,18 @@ * int cfs_fs_time_before (cfs_fs_time_t *, cfs_fs_time_t *); * int cfs_fs_time_beforeq(cfs_fs_time_t *, cfs_fs_time_t *); * - * cfs_duration_t cfs_time_minimal_timeout(void) - * * CFS_TIME_FORMAT * CFS_DURATION_FORMAT * */ #define ONE_BILLION ((u_int64_t)1000000000) -#define ONE_MILLION ((u_int64_t) 1000000) +#define ONE_MILLION 1000000 #ifdef __KERNEL__ - +#ifdef HAVE_KERNEL_CONFIG_H #include +#endif #include #include #include @@ -106,15 +105,15 @@ static inline void cfs_fs_time_nsec(cfs_fs_time_t *t, struct timespec *s) /* * internal helper function used by cfs_fs_time_before*() */ -static inline unsigned long __cfs_fs_time_flat(cfs_fs_time_t *t) +static inline unsigned long long __cfs_fs_time_flat(cfs_fs_time_t *t) { - return ((unsigned long)t->tv_sec) * ONE_MILLION + t->tv_usec * 1000; + return (unsigned long long)t->tv_sec * ONE_MILLION + t->tv_usec; } #define CURRENT_KERN_TIME xtime -/* (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) */ #else +/* (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) */ /* * post 2.5 kernels. 
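The sk_* aliases above let a single body of socket code compile against 2.4 and pre-2.5.72 kernels as well as newer ones. A sketch of how the SOCK_* accessors combine (my_sock_has_space() is illustrative only):

static int my_sock_has_space(cfs_socket_t *sock, int nob)
{
        /* refuse if the queued bytes plus this write would pass the limit */
        if (SOCK_WMEM_QUEUED(sock) + nob > SOCK_SNDBUF(sock))
                return 0;
        /* SOCK_NOSPACE means the socket has already reported no room */
        return !SOCK_TEST_NOSPACE(sock);
}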
@@ -138,9 +137,9 @@ static inline void cfs_fs_time_nsec(cfs_fs_time_t *t, struct timespec *s) /* * internal helper function used by cfs_fs_time_before*() */ -static inline unsigned long __cfs_fs_time_flat(cfs_fs_time_t *t) +static inline unsigned long long __cfs_fs_time_flat(cfs_fs_time_t *t) { - return ((unsigned long)t->tv_sec) * ONE_BILLION + t->tv_nsec; + return (unsigned long long)t->tv_sec * ONE_BILLION + t->tv_nsec; } #define CURRENT_KERN_TIME CURRENT_TIME @@ -198,12 +197,12 @@ static inline time_t cfs_fs_time_sec(cfs_fs_time_t *t) static inline int cfs_fs_time_before(cfs_fs_time_t *t1, cfs_fs_time_t *t2) { - return time_before(__cfs_fs_time_flat(t1), __cfs_fs_time_flat(t2)); + return __cfs_fs_time_flat(t1) < __cfs_fs_time_flat(t2); } static inline int cfs_fs_time_beforeq(cfs_fs_time_t *t1, cfs_fs_time_t *t2) { - return time_before_eq(__cfs_fs_time_flat(t1), __cfs_fs_time_flat(t2)); + return __cfs_fs_time_flat(t1) <= __cfs_fs_time_flat(t2); } #if 0 @@ -224,12 +223,7 @@ static inline cfs_duration_t cfs_duration_build(int64_t nano) static inline cfs_duration_t cfs_time_seconds(int seconds) { - return seconds * HZ; -} - -static inline cfs_time_t cfs_time_shift(int seconds) -{ - return jiffies + seconds * HZ; + return ((cfs_duration_t)seconds) * HZ; } static inline time_t cfs_duration_sec(cfs_duration_t d) @@ -239,34 +233,64 @@ static inline time_t cfs_duration_sec(cfs_duration_t d) static inline void cfs_duration_usec(cfs_duration_t d, struct timeval *s) { -#if (BITS_PER_LONG == 32) - uint64_t t = (d - s->tv_sec * HZ) * ONE_MILLION; +#if (BITS_PER_LONG == 32) && (HZ > 4096) + uint64_t t; + + s->tv_sec = d / HZ; + t = (d - (cfs_duration_t)s->tv_sec * HZ) * ONE_MILLION; s->tv_usec = do_div (t, HZ); #else - s->tv_usec = (d - s->tv_sec * HZ) * ONE_MILLION / HZ; -#endif s->tv_sec = d / HZ; + s->tv_usec = ((d - (cfs_duration_t)s->tv_sec * HZ) * ONE_MILLION) / HZ; +#endif } static inline void cfs_duration_nsec(cfs_duration_t d, struct timespec *s) { #if (BITS_PER_LONG == 32) - uint64_t t = (d - s->tv_sec * HZ) * ONE_BILLION; + uint64_t t; + + s->tv_sec = d / HZ; + t = (d - s->tv_sec * HZ) * ONE_BILLION; s->tv_nsec = do_div (t, HZ); #else - s->tv_nsec = (d - s->tv_sec * HZ) * ONE_BILLION / HZ; -#endif s->tv_sec = d / HZ; + s->tv_nsec = ((d - s->tv_sec * HZ) * ONE_BILLION) / HZ; +#endif +} + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) + +#define cfs_time_current_64 get_jiffies_64 + +static inline __u64 cfs_time_add_64(__u64 t, __u64 d) +{ + return t + d; +} + +static inline __u64 cfs_time_shift_64(int seconds) +{ + return cfs_time_add_64(cfs_time_current_64(), + cfs_time_seconds(seconds)); } -static inline cfs_duration_t cfs_time_minimal_timeout(void) +static inline int cfs_time_before_64(__u64 t1, __u64 t2) { - return 1; + return (__s64)t2 - (__s64)t1 > 0; } -/* inline function cfs_time_minimal_timeout() can not be used - * to initiallize static variable */ -#define CFS_MIN_DELAY (1) +#else +#define cfs_time_current_64 cfs_time_current +#define cfs_time_add_64 cfs_time_add +#define cfs_time_shift_64 cfs_time_shift +#define cfs_time_before_64 cfs_time_before + +#endif + +/* + * One jiffy + */ +#define CFS_TICK (1) #define CFS_TIME_T "%lu" #define CFS_DURATION_T "%ld" diff --git a/lnet/include/libcfs/linux/lltrace.h b/lnet/include/libcfs/linux/lltrace.h index 5050abc..1ddd03d 100644 --- a/lnet/include/libcfs/linux/lltrace.h +++ b/lnet/include/libcfs/linux/lltrace.h @@ -18,9 +18,9 @@ #include #include #include -#include +#include #include -#include +#include #include #include #include diff 
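The (BITS_PER_LONG == 32 && HZ > 4096) branch above exists because the scaled remainder can overflow 32 bits: with HZ = 8192, up to 8191 leftover ticks times ONE_MILLION is roughly 8.2e9, so the product is kept in a uint64_t and split with do_div(). Note, though, that this branch assigns do_div()'s return value (the remainder of t / HZ) to tv_usec, while the else branch computes the quotient; that asymmetry may deserve a second look. A worked call under the else branch:

cfs_duration_t d = 3 * HZ + HZ / 2;     /* 3.5 seconds in ticks */
struct timeval  tv;

cfs_duration_usec(d, &tv);
/* tv.tv_sec == 3, tv.tv_usec == (HZ/2) * ONE_MILLION / HZ == 500000 */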
--git a/lnet/include/libcfs/linux/portals_compat25.h b/lnet/include/libcfs/linux/portals_compat25.h index 31658d5..657c011 100644 --- a/lnet/include/libcfs/linux/portals_compat25.h +++ b/lnet/include/libcfs/linux/portals_compat25.h @@ -5,7 +5,7 @@ #define __LIBCFS_LINUX_PORTALS_COMPAT_H__ // XXX BUG 1511 -- remove this stanza and all callers when bug 1511 is resolved -#if SPINLOCK_DEBUG +#if defined(SPINLOCK_DEBUG) && SPINLOCK_DEBUG # if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) || defined(CONFIG_RH_2_4_20) # define SIGNAL_MASK_ASSERT() \ LASSERT(current->sighand->siglock.magic == SPINLOCK_MAGIC) @@ -44,6 +44,8 @@ # define RECALC_SIGPENDING recalc_sigpending() # define CLEAR_SIGPENDING (current->sigpending = 0) # define CURRENT_SECONDS CURRENT_TIME +# define wait_event_interruptible_exclusive(wq, condition) \ + wait_event_interruptible(wq, condition) #else /* 2.4.x */ @@ -56,6 +58,8 @@ # define RECALC_SIGPENDING recalc_sigpending(current) # define CLEAR_SIGPENDING (current->sigpending = 0) # define CURRENT_SECONDS CURRENT_TIME +# define wait_event_interruptible_exclusive(wq, condition) \ + wait_event_interruptible(wq, condition) #endif @@ -86,11 +90,34 @@ #endif #ifndef HAVE_CPU_ONLINE -#define cpu_online(cpu) test_bit(cpu, &(cpu_online_map)) +#define cpu_online(cpu) ((1< #include #include - + #include #include @@ -29,16 +29,16 @@ #include #include - + #ifdef HAVE_LINUX_VERSION_H # include - + # if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) # define BUG() /* workaround for module.h includes */ # include # endif #endif /* !HAVE_LINUX_VERSION_H */ - + #ifndef __CYGWIN__ # include #else /* __CYGWIN__ */ @@ -47,5 +47,5 @@ # include #endif /* __CYGWIN__ */ -#endif /* !__KERNEL__ */ +#endif /* !__KERNEL__ */ #endif diff --git a/lnet/include/libcfs/list.h b/lnet/include/libcfs/list.h index 5520f75..5c27071 100644 --- a/lnet/include/libcfs/list.h +++ b/lnet/include/libcfs/list.h @@ -9,6 +9,13 @@ #define CFS_LIST_HEAD(n) LIST_HEAD(n) #define CFS_INIT_LIST_HEAD(p) INIT_LIST_HEAD(p) +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +#define CFS_HLIST_HEAD_INIT HLIST_HEAD_INIT +#define CFS_HLIST_HEAD(n) HLIST_HEAD(n) +#define CFS_INIT_HLIST_HEAD(p) INIT_HLIST_HEAD(p) +#define CFS_INIT_HLIST_NODE(p) INIT_HLIST_NODE(p) +#endif + #else /* !defined (__linux__) || !defined(__KERNEL__) */ /* @@ -21,7 +28,11 @@ * using the generic single-entry routines. */ +#ifndef __WINNT__ #define prefetch(a) ((void)a) +#else +#define prefetch(a) ((void *)a) +#endif struct list_head { struct list_head *next, *prev; @@ -124,6 +135,8 @@ static inline void list_del_init(struct list_head *entry) * list_move - delete from one list and add as another's head * @list: the entry to move * @head: the head that will precede our entry + * + * This is not safe to use if @list is already on the same list as @head. */ static inline void list_move(struct list_head *list, struct list_head *head) { @@ -135,6 +148,8 @@ static inline void list_move(struct list_head *list, struct list_head *head) * list_move_tail - delete from one list and add as another's tail * @list: the entry to move * @head: the head that will follow our entry + * + * This is not safe to use if @list is already on the same list as @head. */ static inline void list_move_tail(struct list_head *list, struct list_head *head) @@ -221,7 +236,164 @@ static inline void list_splice_init(struct list_head *list, for (pos = (head)->next, n = pos->next; pos != (head); \ pos = n, n = pos->next) -#endif /* __linux__*/ +/* + * Double linked lists with a single pointer list head. 
+ * Mostly useful for hash tables where the two pointer list head is + * too wasteful. + * You lose the ability to access the tail in O(1). + */ + +struct hlist_head { + struct hlist_node *first; +}; + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +/* + * "NULL" might not be defined at this point + */ +#ifdef NULL +#define NULL_P NULL +#else +#define NULL_P ((void *)0) +#endif + +#define CFS_HLIST_HEAD_INIT { .first = NULL_P } +#define CFS_HLIST_HEAD(name) struct hlist_head name = { .first = NULL_P } +#define CFS_INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL_P) +#define CFS_INIT_HLIST_NODE(ptr) ((ptr)->next = NULL_P, (ptr)->pprev = NULL_P) + +#define HLIST_HEAD_INIT CFS_HLIST_HEAD_INIT +#define HLIST_HEAD(n) CFS_HLIST_HEAD(n) +#define INIT_HLIST_HEAD(p) CFS_INIT_HLIST_HEAD(p) +#define INIT_HLIST_NODE(p) CFS_INIT_HLIST_NODE(p) + +static inline int hlist_unhashed(const struct hlist_node *h) +{ + return !h->pprev; +} + +static inline int hlist_empty(const struct hlist_head *h) +{ + return !h->first; +} + +static inline void __hlist_del(struct hlist_node *n) +{ + struct hlist_node *next = n->next; + struct hlist_node **pprev = n->pprev; + *pprev = next; + if (next) + next->pprev = pprev; +} + +static inline void hlist_del(struct hlist_node *n) +{ + __hlist_del(n); +} + +static inline void hlist_del_init(struct hlist_node *n) +{ + if (n->pprev) { + __hlist_del(n); + INIT_HLIST_NODE(n); + } +} + +static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) +{ + struct hlist_node *first = h->first; + n->next = first; + if (first) + first->pprev = &n->next; + h->first = n; + n->pprev = &h->first; +} + +/* next must be != NULL */ +static inline void hlist_add_before(struct hlist_node *n, + struct hlist_node *next) +{ + n->pprev = next->pprev; + n->next = next; + next->pprev = &n->next; + *(n->pprev) = n; +} + +static inline void hlist_add_after(struct hlist_node *n, + struct hlist_node *next) +{ + next->next = n->next; + n->next = next; + next->pprev = &n->next; + + if(next->next) + next->next->pprev = &next->next; +} + +#define hlist_entry(ptr, type, member) container_of(ptr,type,member) + +#define hlist_for_each(pos, head) \ + for (pos = (head)->first; pos && ({ prefetch(pos->next); 1; }); \ + pos = pos->next) + +#define hlist_for_each_safe(pos, n, head) \ + for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ + pos = n) + +/** + * hlist_for_each_entry - iterate over list of given type + * @tpos: the type * to use as a loop counter. + * @pos: the &struct hlist_node to use as a loop counter. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry(tpos, pos, head, member) \ + for (pos = (head)->first; \ + pos && ({ prefetch(pos->next); 1;}) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * hlist_for_each_entry_continue - iterate over a hlist continuing after existing point + * @tpos: the type * to use as a loop counter. + * @pos: the &struct hlist_node to use as a loop counter. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_continue(tpos, pos, member) \ + for (pos = (pos)->next; \ + pos && ({ prefetch(pos->next); 1;}) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * hlist_for_each_entry_from - iterate over a hlist continuing from existing point + * @tpos: the type * to use as a loop counter. + * @pos: the &struct hlist_node to use as a loop counter. 
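A minimal hash-table sketch over the hlist above (struct my_obj and the table size are invented). The pprev field points at the previous node's next pointer, which is what lets deletion work without a reference to the list head:

struct my_obj {
        int               key;
        struct hlist_node hash;
};

static struct hlist_head my_table[64];

static void my_obj_insert(struct my_obj *o)
{
        hlist_add_head(&o->hash, &my_table[o->key & 63]);
}

static void my_obj_remove(struct my_obj *o)
{
        hlist_del_init(&o->hash);       /* no head pointer needed */
}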
+ * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_from(tpos, pos, member) \ + for (; pos && ({ prefetch(pos->next); 1;}) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @tpos: the type * to use as a loop counter. + * @pos: the &struct hlist_node to use as a loop counter. + * @n: another &struct hlist_node to use as temporary storage + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_safe(tpos, pos, n, head, member) \ + for (pos = (head)->first; \ + pos && ({ n = pos->next; 1; }) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = n) + +#endif /* __linux__ && __KERNEL__ */ #ifndef list_for_each_prev /** @@ -250,6 +422,19 @@ static inline void list_splice_init(struct list_head *list, prefetch(pos->member.next)) #endif /* list_for_each_entry */ +#ifndef list_for_each_entry_reverse +/** + * list_for_each_entry_reverse - iterate backwards over list of given type. + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member); \ + prefetch(pos->member.prev), &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) +#endif /* list_for_each_entry_reverse */ + #ifndef list_for_each_entry_safe /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry @@ -265,138 +450,4 @@ static inline void list_splice_init(struct list_head *list, pos = n, n = list_entry(n->member.next, typeof(*n), member)) #endif /* list_for_each_entry_safe */ -#ifndef list_for_each_entry_reverse -/** - * list_for_each_entry_reverse - iterate backwards over list of given type. - * @pos: the type * to use as a loop counter. - * @head: the head for your list. - * @member: the name of the list_struct within the struct. - */ -#define list_for_each_entry_reverse(pos, head, member) \ - for (pos = list_entry((head)->prev, typeof(*pos), member), \ - prefetch(pos->member.prev); \ - &pos->member != (head); \ - pos = list_entry(pos->member.prev, typeof(*pos), member), \ - prefetch(pos->member.prev)) -#endif - -#ifndef NULL -#define NULL ((void *)0) -#endif - -/* hlist stuff */ -#ifndef __KERNEL__ -#define HLIST_HEAD_INIT { .first = NULL } -#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } -#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) -#define INIT_HLIST_NODE(ptr) ((ptr)->next = NULL, (ptr)->pprev = NULL) - -#define hlist_entry(ptr, type, member) container_of(ptr,type,member) - -#ifndef hlist_for_each -#define hlist_for_each(pos, head) \ - for (pos = (head)->first; pos && ({ prefetch(pos->next); 1; }); \ - pos = pos->next) -#endif - -#ifndef hlist_for_each_entry_safe -/** - * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry - * @tpos: the type * to use as a loop counter. - * @pos: the &struct hlist_node to use as a loop counter. - * @n: another &struct hlist_node to use as temporary storage - * @head: the head for your list. - * @member: the name of the hlist_node within the struct. 
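Continuing that sketch, removal during traversal requires the _safe variant, which caches the next pointer before the loop body runs:

static void my_obj_purge(struct hlist_head *head, int key)
{
        struct my_obj     *o;
        struct hlist_node *pos, *n;

        hlist_for_each_entry_safe(o, pos, n, head, hash) {
                if (o->key == key)
                        hlist_del(pos);
        }
}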
- */ -#define hlist_for_each_entry_safe(tpos, pos, n, head, member) \ - for (pos = (head)->first; \ - pos && ({ n = pos->next; 1; }) && \ - ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ - pos = n) -#endif - -#ifndef hlist_for_each_safe -#define hlist_for_each_safe(pos, n, head) \ - for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ - pos = n) -#endif - -#ifndef hlist_for_each_entry -/** - * hlist_for_each_entry - iterate over list of given type - * @tpos: the type * to use as a loop counter. - * @pos: the &struct hlist_node to use as a loop counter. - * @head: the head for your list. - * @member: the name of the hlist_node within the struct. - */ -#define hlist_for_each_entry(tpos, pos, head, member) \ - for (pos = (head)->first; \ - pos && ({ prefetch(pos->next); 1;}) && \ - ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ - pos = pos->next) -#endif - -/* - * These are non-NULL pointers that will result in page faults - * under normal circumstances, used to verify that nobody uses - * non-initialized list entries. - */ -#define LIST_POISON1 ((void *) 0x00100100) -#define LIST_POISON2 ((void *) 0x00200200) - -#ifndef __KERNEL__ -struct hlist_head { - struct hlist_node *first; -}; - -struct hlist_node { - struct hlist_node *next, **pprev; -}; - -static inline int hlist_unhashed(const struct hlist_node *h) -{ - return !h->pprev; -} - -static inline int hlist_empty(const struct hlist_head *h) -{ - return !h->first; -} - -static inline void __hlist_del(struct hlist_node *n) -{ - struct hlist_node *next = n->next; - struct hlist_node **pprev = n->pprev; - *pprev = next; - if (next) - next->pprev = pprev; -} - -static inline void hlist_del(struct hlist_node *n) -{ - __hlist_del(n); - n->next = LIST_POISON1; - n->pprev = LIST_POISON2; -} - -static inline void hlist_del_init(struct hlist_node *n) -{ - if (n->pprev) { - __hlist_del(n); - INIT_HLIST_NODE(n); - } -} - -static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) -{ - struct hlist_node *first = h->first; - n->next = first; - if (first) - first->pprev = &n->next; - h->first = n; - n->pprev = &h->first; -} -#endif /* __KERNEL__ */ -#endif /* HLIST_HEAD */ - #endif /* __LIBCFS_LUSTRE_LIST_H__ */ diff --git a/lnet/include/libcfs/lltrace.h b/lnet/include/libcfs/lltrace.h index 4f386c5..dbeae91 100644 --- a/lnet/include/libcfs/lltrace.h +++ b/lnet/include/libcfs/lltrace.h @@ -11,6 +11,8 @@ #include #elif defined(__APPLE__) #include +#elif defined(__WINNT__) +#include #else #error Unsupported Operating System #endif @@ -83,8 +85,9 @@ static inline int ltrace_start() { int rc = 0; dbg_initialize(0, NULL); -#ifdef PORTALS_DEV_ID - rc = register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); +#ifdef LNET_DEV_ID + rc = register_ioc_dev(LNET_DEV_ID, LNET_DEV_PATH, + LNET_DEV_MAJOR, LNET_DEV_MINOR); #endif ltrace_filter("class"); ltrace_filter("nal"); @@ -105,8 +108,8 @@ static inline int ltrace_start() static inline void ltrace_stop() { -#ifdef PORTALS_DEV_ID - unregister_ioc_dev(PORTALS_DEV_ID); +#ifdef LNET_DEV_ID + unregister_ioc_dev(LNET_DEV_ID); #endif } @@ -117,14 +120,14 @@ static inline int not_uml() * 1 when run on host * <0 when lookup failed */ - struct stat buf; - int rc = stat("/dev/ubd", &buf); - rc = ((rc<0) && (errno == ENOENT)) ? 1 : rc; - if (rc<0) { - fprintf(stderr, "Cannot stat /dev/ubd: %s\n", strerror(errno)); - rc = 1; /* Assume host */ - } - return rc; + struct stat buf; + int rc = stat("/dev/ubd", &buf); + rc = ((rc<0) && (errno == ENOENT)) ? 
1 : rc; + if (rc<0) { + fprintf(stderr, "Cannot stat /dev/ubd: %s\n", strerror(errno)); + rc = 1; /* Assume host */ + } + return rc; } #define LTRACE_MAX_NOB 256 diff --git a/lnet/include/libcfs/portals_lib.h b/lnet/include/libcfs/portals_lib.h deleted file mode 100644 index 8be849b..0000000 --- a/lnet/include/libcfs/portals_lib.h +++ /dev/null @@ -1,97 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2001 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Basic library routines. - * - */ - -#ifndef __LIBCFS_PORTALS_LIB_H__ -#define __LIBCFS_PORTALS_LIB_H__ - -#if defined(__linux__) -#include -#elif defined(__APPLE__) -#include -#else -#error Unsupported Operating System -#endif - -#undef MIN -#define MIN(a,b) (((a)<(b)) ? (a): (b)) -#undef MAX -#define MAX(a,b) (((a)>(b)) ? (a): (b)) -#define MKSTR(ptr) ((ptr))? (ptr) : "" - -static inline int size_round4 (int val) -{ - return (val + 3) & (~0x3); -} - -static inline int size_round (int val) -{ - return (val + 7) & (~0x7); -} - -static inline int size_round16(int val) -{ - return (val + 0xf) & (~0xf); -} - -static inline int size_round32(int val) -{ - return (val + 0x1f) & (~0x1f); -} - -static inline int size_round0(int val) -{ - if (!val) - return 0; - return (val + 1 + 7) & (~0x7); -} - -static inline size_t round_strlen(char *fset) -{ - return size_round(strlen(fset) + 1); -} - -#define LOGL(var,len,ptr) \ -do { \ - if (var) \ - memcpy((char *)ptr, (const char *)var, len); \ - ptr += size_round(len); \ -} while (0) - -#define LOGU(var,len,ptr) \ -do { \ - if (var) \ - memcpy((char *)var, (const char *)ptr, len); \ - ptr += size_round(len); \ -} while (0) - -#define LOGL0(var,len,ptr) \ -do { \ - if (!len) \ - break; \ - memcpy((char *)ptr, (const char *)var, len); \ - *((char *)(ptr) + len) = 0; \ - ptr += size_round(len + 1); \ -} while (0) - -#endif /* _PORTALS_LIB_H */ diff --git a/lnet/include/libcfs/portals_utils.h b/lnet/include/libcfs/portals_utils.h index 932caaf..b79eb7e 100644 --- a/lnet/include/libcfs/portals_utils.h +++ b/lnet/include/libcfs/portals_utils.h @@ -12,6 +12,8 @@ #include #elif defined(__APPLE__) #include +#elif defined(__WINNT__) +#include #else #error Unsupported Operating System #endif diff --git a/lnet/include/libcfs/types.h b/lnet/include/libcfs/types.h new file mode 100755 index 0000000..71dd7fb --- /dev/null +++ b/lnet/include/libcfs/types.h @@ -0,0 +1,17 @@ +#ifndef _LIBCFS_TYPES_H +#define _LIBCFS_TYPES_H + +/* + * This file was inttroduced to resolve XT3 (Catamount) build issues. + * The orignal idea was to move here however at + * the time of this writing + * it's unclear what external dependencies are tied + * to that file (It's not just some source file #including it) + * there is some build/packaging infrastructure that includes it. 
+ * Hopefully that will be resolved shortly, that file will + * be removed, its contents copied here and this comment can be deleted. + */ + +#include + +#endif diff --git a/lnet/include/libcfs/user-lock.h b/lnet/include/libcfs/user-lock.h index e57200f..cea7a6d 100644 --- a/lnet/include/libcfs/user-lock.h +++ b/lnet/include/libcfs/user-lock.h @@ -34,10 +34,17 @@ /* * liblustre is single-threaded, so most "synchronization" APIs are trivial. + * + * XXX Liang: There are several branches share lnet with b_hd_newconfig, + * if we define lock APIs at here, there will be conflict with liblustre + * in other branches. */ #ifndef __KERNEL__ +#include +#include +#if 0 /* * Optional debugging (magic stamping and checking ownership) can be added. */ @@ -55,10 +62,13 @@ * * No-op implementation. */ -struct spin_lock {}; +struct spin_lock {int foo;}; typedef struct spin_lock spinlock_t; +#define SPIN_LOCK_UNLOCKED (spinlock_t) { } +#define LASSERT_SPIN_LOCKED(lock) do {} while(0) + void spin_lock_init(spinlock_t *lock); void spin_lock(spinlock_t *lock); void spin_unlock(spinlock_t *lock); @@ -66,9 +76,10 @@ int spin_trylock(spinlock_t *lock); void spin_lock_bh_init(spinlock_t *lock); void spin_lock_bh(spinlock_t *lock); void spin_unlock_bh(spinlock_t *lock); +static inline int spin_is_locked(spinlock_t *l) {return 1;} -#define spin_lock_irqsave(l, flags) ({ spin_lock(l); (void)flags; }) -#define spin_unlock_irqrestore(l, flags) ({ spin_unlock(l); (void)flags; }) +static inline void spin_lock_irqsave(spinlock_t *l, unsigned long f){} +static inline void spin_unlock_irqrestore(spinlock_t *l, unsigned long f){} /* * Semaphore @@ -77,7 +88,9 @@ void spin_unlock_bh(spinlock_t *lock); * - __down(x) * - __up(x) */ -struct semaphore {}; +typedef struct semaphore { + int foo; +} mutex_t; void sema_init(struct semaphore *s, int val); void __down(struct semaphore *s); @@ -104,11 +117,13 @@ void __up(struct semaphore *s); * - complete(c) * - wait_for_completion(c) */ +#if 0 struct completion {}; void init_completion(struct completion *c); void complete(struct completion *c); void wait_for_completion(struct completion *c); +#endif /* * rw_semaphore: @@ -149,11 +164,32 @@ typedef struct rw_semaphore rwlock_t; #define write_lock(l) down_write(l) #define write_unlock(l) up_write(l) -#define write_lock_irqsave(l, f) write_lock(l) -#define write_unlock_irqrestore(l, f) write_unlock(l) +static inline void +write_lock_irqsave(rwlock_t *l, unsigned long f) { write_lock(l); } +static inline void +write_unlock_irqrestore(rwlock_t *l, unsigned long f) { write_unlock(l); } -#define read_lock_irqsave(l, f) read_lock(l) -#define read_unlock_irqrestore(l, f) read_unlock(l) +static inline void +read_lock_irqsave(rwlock_t *l, unsigned long f) { read_lock(l); } +static inline void +read_unlock_irqrestore(rwlock_t *l, unsigned long f) { read_unlock(l); } + +/* + * Atomic for user-space + * Copied from liblustre + */ +typedef struct { volatile int counter; } atomic_t; + +#define ATOMIC_INIT(i) { (i) } +#define atomic_read(a) ((a)->counter) +#define atomic_set(a,b) do {(a)->counter = b; } while (0) +#define atomic_dec_and_test(a) ((--((a)->counter)) == 0) +#define atomic_inc(a) (((a)->counter)++) +#define atomic_dec(a) do { (a)->counter--; } while (0) +#define atomic_add(b,a) do {(a)->counter += b;} while (0) +#define atomic_sub(b,a) do {(a)->counter -= b;} while (0) + +#endif /* !__KERNEL__ */ #endif diff --git a/lnet/include/libcfs/user-prim.h b/lnet/include/libcfs/user-prim.h index 6c3410b..54f7832 100644 --- 
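Because liblustre is single-threaded, the atomic_t above is a plain volatile counter with no barriers or locked instructions, but the usual kernel refcount idiom still reads the same. A sketch (my_buf_t is invented; free() comes from stdlib):

typedef struct {
        atomic_t mb_ref;
        char     mb_data[64];
} my_buf_t;

static void my_buf_put(my_buf_t *b)
{
        /* safe only because there is exactly one thread */
        if (atomic_dec_and_test(&b->mb_ref))
                free(b);
}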
a/lnet/include/libcfs/user-prim.h +++ b/lnet/include/libcfs/user-prim.h @@ -38,14 +38,27 @@ #ifndef __KERNEL__ +#include +#include +#include +#include #include +#include +#include +#include /* * Wait Queue. No-op implementation. */ -typedef struct cfs_waitlink {} cfs_waitlink_t; -typedef struct cfs_waitq {} cfs_waitq_t; +typedef struct cfs_waitlink { + struct list_head sleeping; + void *process; +} cfs_waitlink_t; + +typedef struct cfs_waitq { + struct list_head sleepers; +} cfs_waitq_t; void cfs_waitq_init(struct cfs_waitq *waitq); void cfs_waitlink_init(struct cfs_waitlink *link); @@ -57,13 +70,17 @@ void cfs_waitq_del(struct cfs_waitq *waitq, struct cfs_waitlink *link); int cfs_waitq_active(struct cfs_waitq *waitq); void cfs_waitq_signal(struct cfs_waitq *waitq); void cfs_waitq_signal_nr(struct cfs_waitq *waitq, int nr); -void cfs_waitq_broadcast(struct cfs_waitq *waitq); +void cfs_waitq_broadcast(struct cfs_waitq *waitq, int state); void cfs_waitq_wait(struct cfs_waitlink *link); -int64_t cfs_waitq_timedwait(struct cfs_waitlink *link, int64_t timeout); +int64_t cfs_waitq_timedwait(struct cfs_waitlink *link, int state, int64_t timeout); +#define cfs_schedule_timeout(s, t) \ + do { \ + cfs_waitlink_t l; \ + cfs_waitq_timedwait(&l, s, t); \ + } while (0) -/* - * Allocator - */ +#define CFS_TASK_INTERRUPTIBLE (0) +#define CFS_TASK_UNINT (0) /* 2.4 defines */ @@ -88,31 +105,40 @@ struct page { typedef struct page cfs_page_t; -#define CFS_PAGE_SIZE PAGE_CACHE_SIZE -#define CFS_PAGE_SHIFT PAGE_CACHE_SHIFT -#define CFS_PAGE_MASK PAGE_CACHE_MASK - -cfs_page_t *cfs_alloc_pages(unsigned int flags, unsigned int order); -void cfs_free_pages(struct page *pg, int what); +#define CFS_PAGE_SIZE PAGE_SIZE +#define CFS_PAGE_SHIFT PAGE_SHIFT +#define CFS_PAGE_MASK (~((__u64)CFS_PAGE_SIZE-1)) cfs_page_t *cfs_alloc_page(unsigned int flags); -void cfs_free_page(cfs_page_t *pg, int what); +void cfs_free_page(cfs_page_t *pg); void *cfs_page_address(cfs_page_t *pg); void *cfs_kmap(cfs_page_t *pg); void cfs_kunmap(cfs_page_t *pg); #define cfs_get_page(p) __I_should_not_be_called__(at_all) #define cfs_page_count(p) __I_should_not_be_called__(at_all) -#define cfs_set_page_count(p, v) __I_should_not_be_called__(at_all) +#define cfs_page_index(p) ((p)->index) /* * Memory allocator + * Inline function, so utils can use them without linking of libcfs */ -void *cfs_alloc(size_t nr_bytes, u_int32_t flags); -void cfs_free(void *addr); -void *cfs_alloc_large(size_t nr_bytes); -void cfs_free_large(void *addr); +#define __ALLOC_ZERO (1 << 2) +static inline void *cfs_alloc(size_t nr_bytes, u_int32_t flags) +{ + void *result; + + result = malloc(nr_bytes); + if (result != NULL && (flags & __ALLOC_ZERO)) + memset(result, 0, nr_bytes); + return result; +} + +#define cfs_free(addr) free(addr) +#define cfs_alloc_large(nr_bytes) cfs_alloc(nr_bytes, 0) +#define cfs_free_large(addr) cfs_free(addr) +#define CFS_ALLOC_ATOMIC_TRY (0) /* * SLAB allocator */ @@ -121,11 +147,11 @@ typedef struct { } cfs_mem_cache_t; #define SLAB_HWCACHE_ALIGN 0 +#define SLAB_KERNEL 0 +#define SLAB_NOFS 0 cfs_mem_cache_t * -cfs_mem_cache_create(const char *, size_t, size_t, unsigned long, - void (*)(void *, cfs_mem_cache_t *, unsigned long), - void (*)(void *, cfs_mem_cache_t *, unsigned long)); +cfs_mem_cache_create(const char *, size_t, size_t, unsigned long); int cfs_mem_cache_destroy(cfs_mem_cache_t *c); void *cfs_mem_cache_alloc(cfs_mem_cache_t *c, int gfp); void cfs_mem_cache_free(cfs_mem_cache_t *c, void *addr); @@ -138,10 +164,61 @@ typedef int 
(cfs_write_proc_t)(struct file *file, const char *buffer, unsigned long count, void *data); /* + * Signal + */ +typedef sigset_t cfs_sigset_t; + +/* * Timer */ +#include + +typedef struct { + struct list_head tl_list; + void (*function)(unsigned long unused); + unsigned long data; + long expires; +} cfs_timer_t; + +#define cfs_init_timer(t) do {} while(0) +#define cfs_jiffies \ +({ \ + unsigned long _ret = 0; \ + struct timeval tv; \ + if (gettimeofday(&tv, NULL) == 0) \ + _ret = tv.tv_sec; \ + _ret; \ +}) + +static inline int cfs_timer_init(cfs_timer_t *l, void (* func)(unsigned long), void *arg) +{ + CFS_INIT_LIST_HEAD(&l->tl_list); + l->function = func; + l->data = (unsigned long)arg; + return 0; +} + +static inline int cfs_timer_is_armed(cfs_timer_t *l) +{ + if (cfs_time_before(cfs_jiffies, l->expires)) + return 1; + else + return 0; +} + +static inline void cfs_timer_arm(cfs_timer_t *l, int thetime) +{ + l->expires = thetime; +} + +static inline void cfs_timer_disarm(cfs_timer_t *l) +{ +} -typedef struct cfs_timer {} cfs_timer_t; +static inline long cfs_timer_deadline(cfs_timer_t *l) +{ + return l->expires; +} #if 0 #define cfs_init_timer(t) do {} while(0) @@ -154,6 +231,16 @@ int cfs_timer_is_armed(struct cfs_timer *t); cfs_time_t cfs_timer_deadline(struct cfs_timer *t); #endif +#define in_interrupt() (0) + +static inline void cfs_pause(cfs_duration_t d) +{ + struct timespec s; + + cfs_duration_nsec(d, &s); + nanosleep(&s, NULL); +} + typedef void cfs_psdev_t; static inline int cfs_psdev_register(cfs_psdev_t *foo) @@ -166,6 +253,42 @@ static inline int cfs_psdev_deregister(cfs_psdev_t *foo) return 0; } +/* + * portable UNIX device file identification. + */ + +typedef unsigned int cfs_rdev_t; +// typedef unsigned long long kdev_t; +/* + */ +#define cfs_lock_kernel() do {} while (0) +#define cfs_sigfillset(l) do {} while (0) +#define cfs_recalc_sigpending(l) do {} while (0) +#define cfs_kernel_thread(l,m,n) LBUG() + +// static inline void local_irq_save(unsigned long flag) {return;} +// static inline void local_irq_restore(unsigned long flag) {return;} + +enum { + CFS_STACK_TRACE_DEPTH = 16 +}; + +struct cfs_stack_trace { + void *frame[CFS_STACK_TRACE_DEPTH]; +}; + +/* + * arithmetic + */ +#define do_div(a,b) \ + ({ \ + unsigned long remainder;\ + remainder = (a) % (b); \ + (a) = (a) / (b); \ + (remainder); \ + }) + + /* !__KERNEL__ */ #endif diff --git a/lnet/include/libcfs/user-time.h b/lnet/include/libcfs/user-time.h index 7abc9e8..86cbc2d 100644 --- a/lnet/include/libcfs/user-time.h +++ b/lnet/include/libcfs/user-time.h @@ -64,18 +64,16 @@ * int cfs_fs_time_before (cfs_fs_time_t *, cfs_fs_time_t *); * int cfs_fs_time_beforeq(cfs_fs_time_t *, cfs_fs_time_t *); * - * cfs_duration_t cfs_time_minimal_timeout(void) - * * CFS_TIME_FORMAT * CFS_DURATION_FORMAT * */ -#define ONE_BILLION ((u_int64_t)1000000000) -#define ONE_MILLION ((u_int64_t) 1000000) - #ifndef __KERNEL__ +#define ONE_BILLION ((u_int64_t)1000000000) +#define ONE_MILLION 1000000 + /* * Liblustre. time(2) based implementation. 
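The user-space do_div() above mirrors the kernel macro's contract: it divides its first argument in place and evaluates to the remainder. For example:

static void my_split_ns(void)
{
        uint64_t ns  = 2500000000ULL;           /* 2.5 s in nanoseconds */
        uint32_t rem = do_div(ns, 1000000000);

        /* now ns == 2 (quotient) and rem == 500000000 (remainder) */
        (void)ns; (void)rem;
}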
*/ @@ -98,6 +96,11 @@ static inline cfs_duration_t cfs_time_seconds(int seconds) return seconds; } +static inline time_t cfs_time_current_sec(void) +{ + return cfs_time_seconds(cfs_time_current()); +} + static inline int cfs_time_before(cfs_time_t t1, cfs_time_t t2) { return t1 < t2; @@ -110,7 +113,7 @@ static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2) static inline cfs_duration_t cfs_duration_build(int64_t nano) { - return nano / ONE_BILLION; + return (cfs_duration_t) (nano / ONE_BILLION); } static inline time_t cfs_duration_sec(cfs_duration_t d) @@ -162,12 +165,7 @@ static inline int cfs_fs_time_beforeq(cfs_fs_time_t *t1, cfs_fs_time_t *t2) return *t1 <= *t2; } -static inline cfs_duration_t cfs_time_minimal_timeout(void) -{ - return 1; -} - -#define CFS_MIN_DELAY (1) +#define CFS_TICK (1) static inline cfs_time_t cfs_time_add(cfs_time_t t, cfs_duration_t d) { @@ -179,6 +177,11 @@ static inline cfs_duration_t cfs_time_sub(cfs_time_t t1, cfs_time_t t2) return t1 - t2; } +#define cfs_time_current_64 cfs_time_current +#define cfs_time_add_64 cfs_time_add +#define cfs_time_shift_64 cfs_time_shift +#define cfs_time_before_64 cfs_time_before + #define CFS_TIME_T "%lu" #define CFS_DURATION_T "%ld" diff --git a/lnet/include/libcfs/winnt/kp30.h b/lnet/include/libcfs/winnt/kp30.h new file mode 100644 index 0000000..e494a9f --- /dev/null +++ b/lnet/include/libcfs/winnt/kp30.h @@ -0,0 +1,156 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or modify it under the + * terms of version 2 of the GNU General Public License as published by the + * Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with Lustre; if not, write to the Free Software Foundation, Inc., 675 Mass + * Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef __LIBCFS_WINNT_KP30_H__ +#define __LIBCFS_WINNT_KP30_H__ + +#ifndef __LIBCFS_KP30_H__ +#error Do not #include this file directly. 
#include instead +#endif + +#include +#include + +#ifdef __KERNEL__ + +/* Module parameter support */ +#define CFS_MODULE_PARM(name, t, type, perm, desc) + +#define CFS_SYSFS_MODULE_PARM 0 /* no sysfs access to module parameters */ + + +static inline void our_cond_resched() +{ + schedule_timeout(1i64); +} + +#ifdef CONFIG_SMP +#define LASSERT_SPIN_LOCKED(lock) do {} while(0) /* XXX */ +#else +#define LASSERT_SPIN_LOCKED(lock) do {} while(0) +#endif + +#error Need a winnt version of panic() +#define LIBCFS_PANIC(msg) KeBugCheckEx(msg, (ULONG_PTR)NULL, (ULONG_PTR)NULL, (ULONG_PTR)NULL, (ULONG_PTR)NULL) +#error libcfs_register_panic_notifier() missing +#error libcfs_unregister_panic_notifier() missing + +#define cfs_work_struct_t WORK_QUEUE_ITEM +#define cfs_prepare_work(tq, routine, contex) +#define cfs_schedule_work(tq) + +/* ------------------------------------------------------------------- */ + +#define PORTAL_SYMBOL_REGISTER(x) cfs_symbol_register(#x, &x) +#define PORTAL_SYMBOL_UNREGISTER(x) cfs_symbol_unregister(#x) + +#define PORTAL_SYMBOL_GET(x) (cfs_symbol_get(#x)) +#define PORTAL_SYMBOL_PUT(x) cfs_symbol_put(#x) + +#define PORTAL_MODULE_USE do{}while(0) +#define PORTAL_MODULE_UNUSE do{}while(0) + +#define printk DbgPrint +#define ptintf DbgPrint + +#else /* !__KERNEL__ */ + +# include +# include +#ifdef __CYGWIN__ +# include +#endif +# include + +#endif /* End of !__KERNEL__ */ + +/******************************************************************************/ +/* Light-weight trace + * Support for temporary event tracing with minimal Heisenberg effect. */ +#define LWT_SUPPORT 0 + +/* kernel hasn't defined this? */ +typedef struct { + __s64 lwte_when; + char *lwte_where; + void *lwte_task; + long_ptr lwte_p1; + long_ptr lwte_p2; + long_ptr lwte_p3; + long_ptr lwte_p4; +# if BITS_PER_LONG > 32 + long_ptr lwte_pad; +# endif +} lwt_event_t; + + +# define LWT_EVENT(p1,p2,p3,p4) + + +/* ------------------------------------------------------------------ */ + +#define IOCTL_LIBCFS_TYPE long_ptr + +#ifdef __CYGWIN__ +# ifndef BITS_PER_LONG +# if (~0UL) == 0xffffffffUL +# define BITS_PER_LONG 32 +# else +# define BITS_PER_LONG 64 +# endif +# endif +#endif + +#if BITS_PER_LONG > 32 +# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a) +# define LL_POISON ((long_ptr)0x5a5a5a5a5a5a5a5a) +# define LP_POISON ((char *)(long_ptr)0x5a5a5a5a5a5a5a5a) +#else +# define LI_POISON ((int)0x5a5a5a5a) +# define LL_POISON ((long_ptr)0x5a5a5a5a) +# define LP_POISON ((char *)(long_ptr)0x5a5a5a5a) +#endif + +#if defined(__x86_64__) +# define LPU64 "%I64u" +# define LPD64 "%I64d" +# define LPX64 "%I64x" +# define LPSZ "%lu" +# define LPSSZ "%ld" +#elif (BITS_PER_LONG == 32 || __WORDSIZE == 32) +# define LPU64 "%I64u" +# define LPD64 "%I64d" +# define LPX64 "%I64x" +# define LPSZ "%u" +# define LPSSZ "%d" +#elif (BITS_PER_LONG == 64 || __WORDSIZE == 64) +# define LPU64 "%I64u" +# define LPD64 "%I64d" +# define LPX64 "%I64x" +# define LPSZ "%u" +# define LPSSZ "%d" +#endif +#ifndef LPU64 +# error "No word size defined" +#endif + +#endif diff --git a/lnet/include/libcfs/winnt/libcfs.h b/lnet/include/libcfs/winnt/libcfs.h new file mode 100644 index 0000000..386eb5f --- /dev/null +++ b/lnet/include/libcfs/winnt/libcfs.h @@ -0,0 +1,126 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. 
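The LP*64 macros above give printf-style formats for 64-bit values that are stable across word sizes; on winnt they all map to MSVC's %I64 forms. Typical use relies on string pasting (the variable is invented):

__u64 matchbits = 0x123456789ULL;

printf("matchbits="LPX64" size="LPU64"\n",
       matchbits, (__u64)sizeof(matchbits));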
+ * + * Lustre is free software; you can redistribute it and/or modify it under the + * terms of version 2 of the GNU General Public License as published by the + * Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with Lustre; if not, write to the Free Software Foundation, Inc., 675 Mass + * Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef __LIBCFS_WINNT_LIBCFS_H__ +#define __LIBCFS_WINNT_LIBCFS_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +/* workgroud for VC compiler */ +#ifndef __FUNCTION__ +#define __FUNCTION__ "generic" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +struct ptldebug_header { + __u32 ph_len; + __u32 ph_flags; + __u32 ph_subsys; + __u32 ph_mask; + __u32 ph_cpu_id; + __u32 ph_sec; + __u64 ph_usec; + __u32 ph_stack; + __u32 ph_pid; + __u32 ph_extern_pid; + __u32 ph_line_num; +} __attribute__((packed)); + +#ifdef __KERNEL__ + +enum { + /* if you change this, update darwin-util.c:cfs_stack_trace_fill() */ + CFS_STACK_TRACE_DEPTH = 16 +}; + +struct cfs_stack_trace { + void *frame[CFS_STACK_TRACE_DEPTH]; +}; + +static inline __u32 query_stack_size() +{ + ULONG LowLimit, HighLimit; + + IoGetStackLimits(&LowLimit, &HighLimit); + ASSERT(HighLimit > LowLimit); + + return (__u32) (HighLimit - LowLimit); +} +#else +static inline __u32 query_stack_size() +{ + return 4096; +} +#endif + + +#ifndef THREAD_SIZE +# define THREAD_SIZE query_stack_size() +#endif + +#define LUSTRE_TRACE_SIZE (THREAD_SIZE >> 5) + +#ifdef __KERNEL__ +# ifdef __ia64__ +# define CDEBUG_STACK() (THREAD_SIZE - \ + ((ulong_ptr)__builtin_dwarf_cfa() & \ + (THREAD_SIZE - 1))) +# else +# define CDEBUG_STACK (IoGetRemainingStackSize()) +# error "This doesn't seem right; CDEBUG_STACK should grow with the stack" +# endif /* __ia64__ */ + +#define CHECK_STACK() \ +do { \ + unsigned long _stack = CDEBUG_STACK(); \ + \ + if (_stack > 3*THREAD_SIZE/4 && _stack > libcfs_stack) { \ + libcfs_stack = _stack; \ + libcfs_debug_msg(NULL, DEBUG_SUBSYSTEM, D_WARNING, \ + __FILE__, NULL, __LINE__, \ + "maximum lustre stack %lu\n", _stack); \ + } \ +} while (0) +#else /* !__KERNEL__ */ +#define CHECK_STACK() do { } while(0) +#define CDEBUG_STACK() (0L) +#endif /* __KERNEL__ */ + +/* initial pid */ +#define LUSTRE_LNET_PID 12345 + +#define ENTRY_NESTING_SUPPORT (0) +#define ENTRY_NESTING do {;} while (0) +#define EXIT_NESTING do {;} while (0) +#define __current_nesting_level() (0) + +#endif /* _WINNT_LIBCFS_H */ diff --git a/lnet/include/libcfs/darwin/portals_lib.h b/lnet/include/libcfs/winnt/lltrace.h similarity index 69% rename from lnet/include/libcfs/darwin/portals_lib.h rename to lnet/include/libcfs/winnt/lltrace.h index dde962a..9615e94 100644 --- a/lnet/include/libcfs/darwin/portals_lib.h +++ b/lnet/include/libcfs/winnt/lltrace.h @@ -1,5 +1,5 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: * * Copyright (C) 2001 Cluster File Systems, Inc. 
* @@ -18,17 +18,16 @@ * along with Lustre; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * - * Basic library routines. + * Basic library routines. * */ -#ifndef __LIBCFS_DARWIN_PORTALS_LIB_H__ -#define __LIBCFS_DARWIN_PORTALS_LIB_H__ +#ifndef __LIBCFS_WINNT_LLTRACE_H__ +#define __LIBCFS_WINNT_LLTRACE_H__ -#ifndef __LIBCFS_PORTALS_LIB_H__ -#error Do not #include this file directly. #include instead +#ifndef __LIBCFS_LLTRACE_H__ +#error Do not #include this file directly. #include instead #endif -#include #endif diff --git a/lnet/include/libcfs/linux/portals_lib.h b/lnet/include/libcfs/winnt/portals_compat25.h similarity index 65% rename from lnet/include/libcfs/linux/portals_lib.h rename to lnet/include/libcfs/winnt/portals_compat25.h index 99fd1bd..579b795 100644 --- a/lnet/include/libcfs/linux/portals_lib.h +++ b/lnet/include/libcfs/winnt/portals_compat25.h @@ -1,5 +1,5 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: * * Copyright (C) 2001 Cluster File Systems, Inc. * @@ -18,21 +18,11 @@ * along with Lustre; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * - * Basic library routines. - * */ -#ifndef __LIBCFS_LINUX_PORTALS_LIB_H__ -#define __LIBCFS_LINUX_PORTALS_LIB_H__ +#ifndef __LIBCFS_WINNT_PORTALS_COMPAT_H__ +#define __LIBCFS_WINNT_PORTALS_COMPAT_H__ -#ifndef __LIBCFS_PORTALS_LIB_H__ -#error Do not #include this file directly. #include instead -#endif -#ifndef __KERNEL__ -# include -#else -# include -#endif -#endif +#endif /* _PORTALS_COMPAT_H */ diff --git a/lnet/include/libcfs/winnt/portals_utils.h b/lnet/include/libcfs/winnt/portals_utils.h new file mode 100644 index 0000000..ec80692 --- /dev/null +++ b/lnet/include/libcfs/winnt/portals_utils.h @@ -0,0 +1,168 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Basic library routines. + * + */ + +#ifndef __LIBCFS_WINNT_PORTALS_UTILS_H__ +#define __LIBCFS_WINNT_PORTALS_UTILS_H__ + +#ifndef __LIBCFS_PORTALS_UTILS_H__ +#error Do not #include this file directly. 
#include instead +#endif + +#ifndef cfs_is_flag_set +#define cfs_is_flag_set(x,f) (((x)&(f))==(f)) +#endif + +#ifndef cfs_set_flag +#define cfs_set_flag(x,f) ((x) |= (f)) +#endif + +#ifndef cfs_clear_flag +#define cfs_clear_flag(x,f) ((x) &= ~(f)) +#endif + + +static inline __u32 __do_div(__u32 * n, __u32 b) +{ + __u32 mod; + + mod = *n % b; + *n = *n / b; + return mod; +} + +#define do_div(n,base) __do_div((__u32 *)&(n), (__u32) (base)) + +#ifdef __KERNEL__ + +#include +#include + +char * strsep(char **s, const char *ct); +static inline size_t strnlen(const char * s, size_t count) { + size_t len = 0; + while(len < count && s[len++]); + return len; +} +char * ul2dstr(ulong_ptr address, char *buf, int len); + +#define simple_strtol(a1, a2, a3) strtol(a1, a2, a3) +#define simple_strtoll(a1, a2, a3) (__s64)strtoull(a1, a2, a3) +#define simple_strtoull(a1, a2, a3) strtoull(a1, a2, a3) + +unsigned long simple_strtoul(const char *cp,char **endp, unsigned int base); + +static inline int test_bit(int nr, void * addr) +{ + return ((1UL << (nr & 31)) & (((volatile ULONG *) addr)[nr >> 5])) != 0; +} + +static inline void clear_bit(int nr, void * addr) +{ + (((volatile ULONG *) addr)[nr >> 5]) &= (~(1UL << (nr & 31))); +} + + +static inline void set_bit(int nr, void * addr) +{ + (((volatile ULONG *) addr)[nr >> 5]) |= (1UL << (nr & 31)); +} + +static inline void read_random(char *buf, int len) +{ + ULONG Seed = (ULONG) buf; + Seed = RtlRandom(&Seed); + while (len >0) { + if (len > sizeof(ULONG)) { + memcpy(buf, &Seed, sizeof(ULONG)); + len -= sizeof(ULONG); + buf += sizeof(ULONG); + } else { + memcpy(buf, &Seed, len); + len = 0; + break; + } + } +} +#define get_random_bytes(buf, len) read_random(buf, len) + +/* do NOT use function or expression as parameters ... */ + +#ifndef min_t +#define min_t(type,x,y) (type)(x) < (type)(y) ? (x): (y) +#endif + +#ifndef max_t +#define max_t(type,x,y) (type)(x) < (type)(y) ? (y): (x) +#endif + + +#define NIPQUAD(addr) \ + ((unsigned char *)&addr)[0], \ + ((unsigned char *)&addr)[1], \ + ((unsigned char *)&addr)[2], \ + ((unsigned char *)&addr)[3] + +#define HIPQUAD(addr) \ + ((unsigned char *)&addr)[3], \ + ((unsigned char *)&addr)[2], \ + ((unsigned char *)&addr)[1], \ + ((unsigned char *)&addr)[0] + +static int copy_from_user(void *to, void *from, int c) +{ + memcpy(to, from, c); + return 0; +} + +static int copy_to_user(void *to, void *from, int c) +{ + memcpy(to, from, c); + return 0; +} + + +#define put_user(x, ptr) \ +( \ + *(ptr) = x, \ + 0 \ +) + + +#define get_user(x,ptr) \ +( \ + x = *(ptr), \ + 0 \ +) + +#define num_physpages (64 * 1024) + +#define snprintf _snprintf +#define vsnprintf _vsnprintf + + +#endif /* !__KERNEL__ */ + +int cfs_error_code(NTSTATUS); + +#endif diff --git a/lnet/include/libcfs/winnt/winnt-fs.h b/lnet/include/libcfs/winnt/winnt-fs.h new file mode 100644 index 0000000..6280b93 --- /dev/null +++ b/lnet/include/libcfs/winnt/winnt-fs.h @@ -0,0 +1,280 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
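The bit helpers above treat memory as an array of 32-bit ULONGs, so bit n lives in word n >> 5 at position n & 31. A small sketch:

static void my_bitmap_demo(void)
{
        ULONG map[4] = { 0 };           /* 128 bits, 32 per word */

        set_bit(37, map);               /* word 1, bit 5 */
        if (test_bit(37, map))
                clear_bit(37, map);
}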
+ * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * File operations & routines. + * + */ + +#ifndef __LIBCFS_WINNT_CFS_FS_H__ +#define __LIBCFS_WINNT_CFS_FS_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + + +/* + * Platform defines + * + * cfs_rdev_t + */ + +typedef unsigned short cfs_rdev_t; + +typedef unsigned int cfs_major_nr_t; +typedef unsigned int cfs_minor_nr_t; + + +#define MINORBITS 8 +#define MINORMASK ((1U << MINORBITS) - 1) + +#define MAJOR(dev) ((unsigned int) ((dev) >> MINORBITS)) +#define MINOR(dev) ((unsigned int) ((dev) & MINORMASK)) +#define NODEV 0 +#define MKDEV(ma,mi) (((ma) << MINORBITS) | (mi)) + + +static inline cfs_rdev_t cfs_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor) +{ + return MKDEV(major, minor); +} + +static inline cfs_major_nr_t cfs_rdev_major(cfs_rdev_t rdev) +{ + return MAJOR(rdev); +} + +static inline cfs_minor_nr_t cfs_rdev_minor(cfs_rdev_t rdev) +{ + return MINOR(rdev); +} + + +#ifdef __KERNEL__ + +struct file_operations +{ + loff_t (*lseek)(struct file * file, loff_t offset, int origin); + ssize_t (*read) (struct file * file, char * buf, size_t nbytes, loff_t *ppos); + ssize_t (*write)(struct file * file, const char * buffer, + size_t count, loff_t *ppos); + int (*ioctl) (struct file *, unsigned int, ulong_ptr); + int (*open) (struct file *); + int (*release) (struct file *); +}; + +struct file { + + cfs_handle_t f_handle; + unsigned int f_flags; + mode_t f_mode; + ulong_ptr f_count; + + //struct list_head f_list; + //struct dentry * f_dentry; + + cfs_proc_entry_t * proc_dentry; + cfs_file_operations_t * f_op; + + size_t f_size; + loff_t f_pos; + unsigned int f_uid, f_gid; + int f_error; + + ulong_ptr f_version; + + void * private_data; + + char f_name[1]; + +}; + +#define cfs_filp_size(f) ((f)->f_size) +#define cfs_filp_poff(f) (&(f)->f_pos) + +cfs_file_t *cfs_filp_open(const char *name, int flags, int mode, int *err); +int cfs_filp_close(cfs_file_t *fp); +int cfs_filp_read(cfs_file_t *fp, void *buf, size_t nbytes, loff_t *pos); +int cfs_filp_write(cfs_file_t *fp, void *buf, size_t nbytes, loff_t *pos); +int cfs_filp_fsync(cfs_file_t *fp); +int cfs_get_file(cfs_file_t *fp); +int cfs_put_file(cfs_file_t *fp); +int cfs_file_count(cfs_file_t *fp); + + + +/* + * CFS_FLOCK routines + */ + +typedef struct file_lock{ + int fl_type; + pid_t fl_pid; + size_t fl_len; + off_t fl_start; + off_t fl_end; +} cfs_flock_t; + +#define CFS_INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) +#define CFS_OFFSET_MAX CFS_INT_LIMIT(loff_t) + +#define cfs_flock_type(fl) ((fl)->fl_type) +#define cfs_flock_set_type(fl, type) do { (fl)->fl_type = (type); } while(0) +#define cfs_flock_pid(fl) ((fl)->fl_pid) +#define cfs_flock_set_pid(fl, pid) do { (fl)->fl_pid = (pid); } while(0) +#define cfs_flock_start(fl) ((fl)->fl_start) +#define cfs_flock_set_start(fl, start) do { (fl)->fl_start = (start); } while(0) +#define cfs_flock_end(fl) ((fl)->fl_end) +#define cfs_flock_set_end(fl, end) do { (fl)->fl_end = (end); } while(0) + +#define ATTR_MODE 0x0001 +#define ATTR_UID 0x0002 +#define ATTR_GID 0x0004 +#define ATTR_SIZE 0x0008 +#define 
ATTR_ATIME 0x0010 +#define ATTR_MTIME 0x0020 +#define ATTR_CTIME 0x0040 +#define ATTR_ATIME_SET 0x0080 +#define ATTR_MTIME_SET 0x0100 +#define ATTR_FORCE 0x0200 /* Not a change, but a change it */ +#define ATTR_ATTR_FLAG 0x0400 +#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */ +#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */ +//#define ATTR_CTIME_SET 0x2000 +#define ATTR_BLOCKS 0x4000 + +#define in_group_p(x) (0) + +/* + * proc fs routines + */ + +int proc_init_fs(); +void proc_destroy_fs(); + + +/* + * misc + */ + +static inline void *ERR_PTR(long_ptr error) +{ + return (void *) error; +} + +static inline long_ptr PTR_ERR(const void *ptr) +{ + return (long_ptr) ptr; +} + +static inline long_ptr IS_ERR(const void *ptr) +{ + return (ulong_ptr)ptr > (ulong_ptr)-1000L; +} + +#else /* !__KERNEL__ */ + +#define CREATE_NEW 1 +#define CREATE_ALWAYS 2 +#define OPEN_EXISTING 3 +#define OPEN_ALWAYS 4 +#define TRUNCATE_EXISTING 5 + +#define SECTION_QUERY 0x0001 +#define SECTION_MAP_WRITE 0x0002 +#define SECTION_MAP_READ 0x0004 +#define SECTION_MAP_EXECUTE 0x0008 +#define SECTION_EXTEND_SIZE 0x0010 + +#define FILE_MAP_COPY SECTION_QUERY +#define FILE_MAP_WRITE SECTION_MAP_WRITE +#define FILE_MAP_READ SECTION_MAP_READ +#define FILE_MAP_ALL_ACCESS SECTION_ALL_ACCESS + + +NTSYSAPI +HANDLE +NTAPI +CreateFileA( + IN LPCSTR lpFileName, + IN DWORD dwDesiredAccess, + IN DWORD dwShareMode, + IN PVOID lpSecurityAttributes, + IN DWORD dwCreationDisposition, + IN DWORD dwFlagsAndAttributes, + IN HANDLE hTemplateFile + ); + +#define CreateFile CreateFileA + +NTSYSAPI +BOOL +NTAPI +CloseHandle( + IN OUT HANDLE hObject + ); + +NTSYSAPI +HANDLE +NTAPI +CreateFileMappingA( + IN HANDLE hFile, + IN PVOID lpFileMappingAttributes, + IN DWORD flProtect, + IN DWORD dwMaximumSizeHigh, + IN DWORD dwMaximumSizeLow, + IN LPCSTR lpName + ); +#define CreateFileMapping CreateFileMappingA + +NTSYSAPI +DWORD +NTAPI +GetFileSize( + IN HANDLE hFile, + OUT DWORD * lpFileSizeHigh + ); + +NTSYSAPI +PVOID +NTAPI +MapViewOfFile( + IN HANDLE hFileMappingObject, + IN DWORD dwDesiredAccess, + IN DWORD dwFileOffsetHigh, + IN DWORD dwFileOffsetLow, + IN SIZE_T dwNumberOfBytesToMap + ); + +NTSYSAPI +BOOL +NTAPI +UnmapViewOfFile( + IN PVOID lpBaseAddress + ); + +#endif /* __KERNEL__ */ + +typedef struct { + void *d; +} cfs_dentry_t; + + +#endif /* __LIBCFS_WINNT_CFS_FS_H__*/ diff --git a/lnet/include/libcfs/winnt/winnt-lock.h b/lnet/include/libcfs/winnt/winnt-lock.h new file mode 100644 index 0000000..e0b9393 --- /dev/null +++ b/lnet/include/libcfs/winnt/winnt-lock.h @@ -0,0 +1,686 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Basic library routines. 
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_CFS_LOCK_H__
+#define __LIBCFS_WINNT_CFS_LOCK_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly. #include <libcfs/libcfs.h> instead
+#endif
+
+#ifdef __KERNEL__
+
+/*
+ * nt specific part ...
+ */
+
+/* atomic */
+
+typedef struct { volatile int counter; } atomic_t;
+
+#define ATOMIC_INIT(i)   { i }
+
+#define atomic_read(v)   ((v)->counter)
+#define atomic_set(v,i)  (((v)->counter) = (i))
+
+void FASTCALL atomic_add(int i, atomic_t *v);
+void FASTCALL atomic_sub(int i, atomic_t *v);
+
+int FASTCALL atomic_sub_and_test(int i, atomic_t *v);
+
+void FASTCALL atomic_inc(atomic_t *v);
+void FASTCALL atomic_dec(atomic_t *v);
+
+int FASTCALL atomic_dec_and_test(atomic_t *v);
+int FASTCALL atomic_inc_and_test(atomic_t *v);
+
+/* event */
+
+typedef KEVENT event_t;
+
+/*
+ * cfs_init_event
+ *   To initialize the event object
+ *
+ * Arguments:
+ *   event:  pointer to the event object
+ *   type:   non-zero: SynchronizationEvent
+ *           zero:     NotificationEvent
+ *   status: the initial state of the event
+ *           non-zero: signaled
+ *           zero:     non-signaled
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+static inline void
+cfs_init_event(event_t *event, int type, int status)
+{
+    KeInitializeEvent(
+            event,
+            (type) ? SynchronizationEvent : NotificationEvent,
+            (status) ? TRUE : FALSE
+            );
+}
+
+/*
+ * cfs_wait_event
+ *   To wait on an event to synchronize the process
+ *
+ * Arguments:
+ *   event:   pointer to the event object
+ *   timeout: the timeout for the wait; 0 means an infinite wait
+ *
+ * Return Value:
+ *   Zero:     the wait timed out
+ *   Non-zero: the event was signaled
+ *
+ * Notes:
+ *   N/A
+ */
+static inline int64_t
+cfs_wait_event(event_t *event, int64_t timeout)
+{
+    NTSTATUS        Status;
+    LARGE_INTEGER   TimeOut;
+
+    TimeOut.QuadPart = -1 * (10000000 / HZ) * timeout;
+
+    Status = KeWaitForSingleObject(
+                 event,
+                 Executive,
+                 KernelMode,
+                 FALSE,
+                 (timeout != 0) ? (&TimeOut) : (NULL)
+                 );
+
+    if (Status == STATUS_TIMEOUT)  {
+        return 0;
+    }
+
+    return TRUE; /* signaled case */
+}
+
+/*
+ * cfs_wake_event
+ *   To signal the event object
+ *
+ * Arguments:
+ *   event: pointer to the event object
+ *
+ * Return Value:
+ *   the previous signal state: non-zero if the event was already signaled
+ *
+ * Notes:
+ *   N/A
+ */
+static inline int
+cfs_wake_event(event_t *event)
+{
+    return (KeSetEvent(event, 0, FALSE) != 0);
+}
+
+/*
+ * cfs_clear_event
+ *   To clear/reset the status of the event object
+ *
+ * Arguments:
+ *   event: pointer to the event object
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+static inline void
+cfs_clear_event(event_t *event)
+{
+    KeResetEvent(event);
+}
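To make the event wrappers concrete, here is a minimal usage sketch (illustrative only, not part of the patch; the helper names and the 100-tick timeout are invented for the example):

/* Illustrative sketch only -- not part of the original patch. */
static event_t example_event;

static void example_setup(void)
{
    /* auto-reset (SynchronizationEvent), initially non-signaled */
    cfs_init_event(&example_event, 1, 0);
}

static void example_waiter(void)
{
    /* returns 0 on timeout (100 ticks here), non-zero when signaled */
    if (cfs_wait_event(&example_event, 100) == 0) {
        /* timed out: nobody called cfs_wake_event() in time */
    }
}

static void example_signaller(void)
{
    cfs_wake_event(&example_event);  /* wakes one waiter (auto-reset) */
}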
+
+/*
+ * IMPORTANT !!!!!!!!
+ *
+ * Lock declarations are NOT guaranteed to be initialized,
+ * although some of them are initialized in Linux.  All locks
+ * declared by CFS_DECL_* must be initialized explicitly.
+ */
+
+/*
+ * spin lock definitions / routines
+ */
+
+/*
+ * Warning:
+ *
+ * For spinlock operations, a nested acquisition of a spinlock
+ * will deadlock on an MP system, and will silently overwrite the
+ * saved irql on a UP system.  (A UP build tolerates nested spin
+ * acquisition only because it does not spin at all: it merely
+ * raises the irql.)
+ */
+
+typedef struct spin_lock {
+
+    KSPIN_LOCK lock;
+    KIRQL      irql;
+
+} spinlock_t;
+
+#define CFS_DECL_SPIN(name)         spinlock_t name;
+#define CFS_DECL_SPIN_EXTERN(name)  extern spinlock_t name;
+
+static inline void spin_lock_init(spinlock_t *lock)
+{
+    KeInitializeSpinLock(&(lock->lock));
+}
+
+static inline void spin_lock(spinlock_t *lock)
+{
+    KeAcquireSpinLock(&(lock->lock), &(lock->irql));
+}
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+    KIRQL irql = lock->irql;
+    KeReleaseSpinLock(&(lock->lock), irql);
+}
+
+#define spin_lock_irqsave(lock, flags)       do { (flags) = 0; spin_lock(lock); } while (0)
+#define spin_unlock_irqrestore(lock, flags)  do { spin_unlock(lock); } while (0)
+
+/* There is no corresponding routine in the windows kernel, so we
+   must implement a lightweight one of our own.  There is also no
+   way to tell at runtime whether the system is an MP or a UP
+   build, so we use a workaround for that. */
+
+extern int MPSystem;
+
+static int spin_trylock(spinlock_t *lock)
+{
+    KIRQL   Irql;
+    int     rc = 0;
+
+    ASSERT(lock != NULL);
+
+    KeRaiseIrql(DISPATCH_LEVEL, &Irql);
+
+    if (MPSystem) {
+        if (0 == (ulong_ptr)lock->lock) {
+#if _X86_
+            __asm {
+                mov  edx, dword ptr [ebp + 8]
+                lock bts dword ptr [edx], 0
+                jb   lock_failed
+                mov  rc, TRUE
+lock_failed:
+            }
+#else
+            KdBreakPoint();
+#endif
+
+        }
+    } else {
+        rc = TRUE;
+    }
+
+    if (rc) {
+        lock->irql = Irql;
+    } else {
+        KeLowerIrql(Irql);
+    }
+
+    return rc;
+}
+
+/* synchronization between cpus: it disables all DPCs and the
+   kernel task scheduler on the current CPU */
+#define spin_lock_bh(x)       spin_lock(x)
+#define spin_unlock_bh(x)     spin_unlock(x)
+#define spin_lock_bh_init(x)  spin_lock_init(x)
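A short sketch of the intended usage (illustrative only): the flags argument of spin_lock_irqsave() exists purely for Linux source compatibility and is ignored here, because the saved KIRQL lives inside the lock itself.

/* Illustrative sketch only -- not part of the original patch. */
static spinlock_t example_lock;     /* spin_lock_init() it once at setup */
static int        example_counter;

static void example_increment(void)
{
    ulong_ptr flags = 0;    /* unused on winnt, kept for portability */

    spin_lock_irqsave(&example_lock, flags);
    example_counter++;      /* critical section: keep it short */
    spin_unlock_irqrestore(&example_lock, flags);
}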
+
+/*
+ * rw_semaphore (using ERESOURCE)
+ */
+
+typedef struct rw_semaphore {
+    ERESOURCE rwsem;
+} rw_semaphore_t;
+
+#define CFS_DECL_RWSEM(name)         rw_semaphore_t name
+#define CFS_DECL_RWSEM_EXTERN(name)  extern rw_semaphore_t name
+
+/*
+ * init_rwsem
+ *   To initialize the rw_semaphore_t structure
+ *
+ * Arguments:
+ *   rwsem: pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+static inline void init_rwsem(rw_semaphore_t *s)
+{
+    ExInitializeResourceLite(&s->rwsem);
+}
+
+/*
+ * fini_rwsem
+ *   To finalize/destroy the rw_semaphore_t structure
+ *
+ * Arguments:
+ *   rwsem: pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   On winnt we need this routine to delete the ERESOURCE.
+ *   It is just defined as a no-op on other systems.
+ */
+static inline void fini_rwsem(rw_semaphore_t *s)
+{
+    ExDeleteResourceLite(&s->rwsem);
+}
+
+/*
+ * down_read
+ *   To acquire the read lock of the rw_semaphore
+ *
+ * Arguments:
+ *   rwsem: pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+static inline void down_read(struct rw_semaphore *s)
+{
+    ExAcquireResourceSharedLite(&s->rwsem, TRUE);
+}
+
+/*
+ * down_read_trylock
+ *   To acquire the read lock of the rw_semaphore without blocking
+ *
+ * Arguments:
+ *   rwsem: pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   Zero:     failed to acquire the read lock
+ *   Non-zero: succeeded in acquiring the read lock
+ *
+ * Notes:
+ *   This routine returns immediately without waiting.
+ */
+static inline int down_read_trylock(struct rw_semaphore *s)
+{
+    return ExAcquireResourceSharedLite(&s->rwsem, FALSE);
+}
+
+/*
+ * down_write
+ *   To acquire the write lock of the rw_semaphore
+ *
+ * Arguments:
+ *   rwsem: pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+static inline void down_write(struct rw_semaphore *s)
+{
+    ExAcquireResourceExclusiveLite(&(s->rwsem), TRUE);
+}
+
+/*
+ * down_write_trylock
+ *   To acquire the write lock of the rw_semaphore without blocking
+ *
+ * Arguments:
+ *   rwsem: pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   Zero:     failed to acquire the write lock
+ *   Non-zero: succeeded in acquiring the write lock
+ *
+ * Notes:
+ *   This routine returns immediately without waiting.
+ */
+static inline int down_write_trylock(struct rw_semaphore *s)
+{
+    return ExAcquireResourceExclusiveLite(&(s->rwsem), FALSE);
+}
+
+/*
+ * up_read
+ *   To release the read lock of the rw_semaphore
+ *
+ * Arguments:
+ *   rwsem: pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+static inline void up_read(struct rw_semaphore *s)
+{
+    ExReleaseResourceForThreadLite(
+            &(s->rwsem),
+            ExGetCurrentResourceThread());
+}
+
+/*
+ * up_write
+ *   To release the write lock of the rw_semaphore
+ *
+ * Arguments:
+ *   rwsem: pointer to the rw_semaphore_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+static inline void up_write(struct rw_semaphore *s)
+{
+    ExReleaseResourceForThreadLite(
+            &(s->rwsem),
+            ExGetCurrentResourceThread());
+}
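A reader/writer usage sketch (illustrative only). One difference from Linux worth remembering: on winnt the semaphore wraps an ERESOURCE, so it must be torn down with fini_rwsem() when no longer needed.

/* Illustrative sketch only -- not part of the original patch. */
static rw_semaphore_t example_sem;

static void example_rwsem(void)
{
    init_rwsem(&example_sem);

    down_read(&example_sem);        /* shared (read) access */
    up_read(&example_sem);

    down_write(&example_sem);       /* exclusive (write) access */
    up_write(&example_sem);

    fini_rwsem(&example_sem);       /* deletes the ERESOURCE */
}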
+
+/*
+ * rwlock_t (using semaphore)
+ *
+ * - rwlock_init(x)
+ * - read_lock(x)
+ * - read_unlock(x)
+ * - write_lock(x)
+ * - write_unlock(x)
+ */
+
+typedef struct {
+    spinlock_t guard;
+    int        count;
+} rwlock_t;
+
+void rwlock_init(rwlock_t *rwlock);
+void rwlock_fini(rwlock_t *rwlock);
+
+void read_lock(rwlock_t *rwlock);
+void read_unlock(rwlock_t *rwlock);
+void write_lock(rwlock_t *rwlock);
+void write_unlock(rwlock_t *rwlock);
+
+#define write_lock_irqsave(l, f)       do { (f) = 0; write_lock(l); } while (0)
+#define write_unlock_irqrestore(l, f)  do { write_unlock(l); } while (0)
+#define read_lock_irqsave(l, f)        do { (f) = 0; read_lock(l); } while (0)
+#define read_unlock_irqrestore(l, f)   do { read_unlock(l); } while (0)
+
+/*
+ * Semaphore
+ *
+ * - sema_init(x, v)
+ * - __down(x)
+ * - __up(x)
+ */
+
+typedef struct semaphore {
+    KSEMAPHORE sem;
+} mutex_t;
+
+static inline void sema_init(struct semaphore *s, int val)
+{
+    KeInitializeSemaphore(&s->sem, val, val);
+}
+
+static inline void __down(struct semaphore *s)
+{
+    KeWaitForSingleObject( &(s->sem), Executive,
+                           KernelMode, FALSE, NULL );
+}
+
+static inline void __up(struct semaphore *s)
+{
+    KeReleaseSemaphore(&s->sem, 0, 1, FALSE);
+}
+
+/*
+ * mutex_t:
+ *
+ * - init_mutex(x)
+ * - init_mutex_locked(x)
+ * - mutex_up(x)
+ * - mutex_down(x)
+ */
+
+/*
+ * init_mutex
+ *   To initialize a mutex_t structure
+ *
+ * Arguments:
+ *   mutex: pointer to the mutex_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+static inline void init_mutex(mutex_t *mutex)
+{
+    sema_init(mutex, 1);
+}
+
+/*
+ * mutex_down
+ *   To acquire the mutex lock
+ *
+ * Arguments:
+ *   mutex: pointer to the mutex_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+static inline void mutex_down(mutex_t *mutex)
+{
+    __down(mutex);
+}
+
+/*
+ * mutex_up
+ *   To release the mutex lock (already acquired)
+ *
+ * Arguments:
+ *   mutex: pointer to the mutex_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+static inline void mutex_up(mutex_t *mutex)
+{
+    __up(mutex);
+}
+
+/*
+ * init_mutex_locked
+ *   To initialize the mutex in the acquired state
+ *
+ * Arguments:
+ *   mutex: pointer to the mutex_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+static inline void init_mutex_locked(mutex_t *mutex)
+{
+    init_mutex(mutex);
+    mutex_down(mutex);
+}
+
+/*
+ * completion
+ *
+ * - init_completion(c)
+ * - complete(c)
+ * - wait_for_completion(c)
+ */
+
+struct completion {
+    event_t event;
+};
+
+/*
+ * init_completion
+ *   To initialize the completion object
+ *
+ * Arguments:
+ *   c: pointer to the completion structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+static inline void init_completion(struct completion *c)
+{
+    cfs_init_event(&(c->event), 1, FALSE);
+}
+
+/*
+ * complete
+ *   To complete/signal the completion object
+ *
+ * Arguments:
+ *   c: pointer to the completion structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+static inline void complete(struct completion *c)
+{
+    cfs_wake_event(&(c->event));
+}
+
+/*
+ * wait_for_completion
+ *   To wait on the completion object.  If the event is signaled,
+ *   this function returns to the caller with the event un-signaled.
+ *
+ * Arguments:
+ *   c: pointer to the completion structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+static inline void wait_for_completion(struct completion *c)
+{
+    cfs_wait_event(&(c->event), 0);
+}
+
+/* __KERNEL__ */
+#else
+
+#include "../user-lock.h"
+
+/* __KERNEL__ */
+#endif
+#endif
diff --git a/lnet/include/libcfs/winnt/winnt-mem.h b/lnet/include/libcfs/winnt/winnt-mem.h
new file mode 100644
index 0000000..b7f00a4
--- /dev/null
+++ b/lnet/include/libcfs/winnt/winnt-mem.h
@@ -0,0 +1,133 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2001 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic library routines for memory manipulation.
+ *
+ */
+
+#ifndef __LIBCFS_WINNT_CFS_MEM_H__
+#define __LIBCFS_WINNT_CFS_MEM_H__
+
+#ifndef __LIBCFS_LIBCFS_H__
+#error Do not #include this file directly.
#include instead +#endif + +#ifdef __KERNEL__ + +#define CFS_PAGE_SIZE PAGE_SIZE +#define CFS_PAGE_SHIFT PAGE_SHIFT +#define CFS_PAGE_MASK (~(PAGE_SIZE - 1)) + +typedef struct cfs_page { + void * addr; + atomic_t count; +} cfs_page_t; + + +cfs_page_t *cfs_alloc_page(int flags); +void cfs_free_page(cfs_page_t *pg); + +static inline void *cfs_page_address(cfs_page_t *page) +{ + return page->addr; +} + +static inline void *cfs_kmap(cfs_page_t *page) +{ + return page->addr; +} + +static inline void cfs_kunmap(cfs_page_t *page) +{ + return; +} + +static inline void cfs_get_page(cfs_page_t *page) +{ + atomic_inc(&page->count); +} + +static inline void cfs_put_page(cfs_page_t *page) +{ + atomic_dec(&page->count); +} + +static inline int cfs_page_count(cfs_page_t *page) +{ + return atomic_read(&page->count); +} + +/* + * Memory allocator + */ + +#define CFS_ALLOC_ATOMIC_TRY (0) + +extern void *cfs_alloc(size_t nr_bytes, u_int32_t flags); +extern void cfs_free(void *addr); + +extern void *cfs_alloc_large(size_t nr_bytes); +extern void cfs_free_large(void *addr); + +/* + * SLAB allocator + */ + +#define SLAB_HWCACHE_ALIGN 0 + +/* The cache name is limited to 20 chars */ + +typedef struct cfs_mem_cache { + + char name[20]; + ulong_ptr flags; + NPAGED_LOOKASIDE_LIST npll; + +} cfs_mem_cache_t; + + +extern cfs_mem_cache_t * cfs_mem_cache_create (const char *, size_t, size_t, ulong_ptr); +extern int cfs_mem_cache_destroy ( cfs_mem_cache_t * ); +extern void *cfs_mem_cache_alloc ( cfs_mem_cache_t *, int); +extern void cfs_mem_cache_free ( cfs_mem_cache_t *, void *); + + +/* + * Page allocator slabs + */ + +extern cfs_mem_cache_t *cfs_page_t_slab; +extern cfs_mem_cache_t *cfs_page_p_slab; + + +#define CFS_DECL_MMSPACE +#define CFS_MMSPACE_OPEN do {} while(0) +#define CFS_MMSPACE_CLOSE do {} while(0) + + +#define mb() do {} while(0) +#define rmb() mb() +#define wmb() mb() + + +/* __KERNEL__ */ +#endif + +#endif /* __WINNT_CFS_MEM_H__ */ diff --git a/lnet/include/libcfs/winnt/winnt-prim.h b/lnet/include/libcfs/winnt/winnt-prim.h new file mode 100644 index 0000000..3c8560b --- /dev/null +++ b/lnet/include/libcfs/winnt/winnt-prim.h @@ -0,0 +1,1082 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Basic library routines. + * + */ + +#ifndef __LIBCFS_WINNT_CFS_PRIM_H__ +#define __LIBCFS_WINNT_CFS_PRIM_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. 
#include instead
+#endif
+
+/*
+ * libcfs proc device object
+ */
+
+#define LUSTRE_PROC_DEVICE  L"\\Device\\lproc"      /* proc fs emulator device object */
+#define LUSTRE_PROC_SYMLNK  L"\\DosDevices\\lproc"  /* proc fs user-visible device */
+
+/*
+ * Device IO Control Code Definitions
+ */
+
+#define FILE_DEVICE_LIBCFS      ('LC')
+
+#define FUNC_LIBCFS_VERSION     0x101  // get version of current libcfs
+#define FUNC_LIBCFS_IOCTL       0x102  // Device i/o control to proc fs
+
+#define IOCTL_LIBCFS_VERSION \
+    CTL_CODE (FILE_DEVICE_LIBCFS, FUNC_LIBCFS_VERSION, METHOD_BUFFERED, FILE_ANY_ACCESS)
+#define IOCTL_LIBCFS_ENTRY   \
+    CTL_CODE(FILE_DEVICE_LIBCFS, FUNC_LIBCFS_IOCTL, METHOD_BUFFERED, FILE_ANY_ACCESS)
+
+#pragma pack(4)
+
+typedef struct _CFS_PROC_IOCTL {
+
+    ULONG           cmd;    // ioctl command identifier
+    ULONG           len;    // length of data
+
+    // UCHAR data[];        // content of the real ioctl
+
+} CFS_PROC_IOCTL, *PCFS_PROC_IOCTL;
+
+#pragma pack()
+
+#ifdef __KERNEL__
+
+#include <libcfs/list.h>
+
+/*
+ * Symbol functions for libcfs
+ *
+ * Windows has no native facility for a module to register symbols,
+ * so we have to implement it ourselves.
+ */
+#define CFS_SYMBOL_LEN     64
+
+struct  cfs_symbol {
+    char             name[CFS_SYMBOL_LEN];
+    void            *value;
+    int              ref;
+    struct list_head sym_list;
+};
+
+extern int      cfs_symbol_register(const char *, const void *);
+extern void     cfs_symbol_unregister(const char *);
+extern void *   cfs_symbol_get(const char *);
+extern void     cfs_symbol_put(const char *);
+extern void     cfs_symbol_clean();
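The symbol table stands in for the kernel's module symbol facility; a registration/lookup sketch (illustrative only, with invented names):

/* Illustrative sketch only -- not part of the original patch. */
static int example_hello(void)
{
    return 0;
}

static void example_symbol_use(void)
{
    int (*fn)(void);

    cfs_symbol_register("example_hello", (const void *)example_hello);

    fn = (int (*)(void)) cfs_symbol_get("example_hello"); /* takes a ref */
    if (fn != NULL) {
        (void) fn();
        cfs_symbol_put("example_hello");                  /* drops the ref */
    }
    cfs_symbol_unregister("example_hello");
}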
+
+typedef struct file_operations cfs_file_operations_t;
+typedef struct file cfs_file_t;
+
+/*
+ * Pseudo device register
+ */
+
+typedef struct
+{
+    int                     minor;
+    const char             *name;
+    cfs_file_operations_t  *fops;
+} cfs_psdev_t;
+
+int cfs_psdev_register(cfs_psdev_t *psdev);
+int cfs_psdev_deregister(cfs_psdev_t *psdev);
+
+/*
+ * Proc emulator file system APIs
+ */
+
+typedef int cfs_read_proc_t(char *page, char **start, off_t off,
+                            int count, int *eof, void *data);
+typedef int cfs_write_proc_t(struct file *file, const char *buffer,
+                             ulong_ptr count, void *data);
+
+#define CFS_PROC_ENTRY_MAGIC 'CPEM'
+
+#define CFS_PROC_FLAG_DIRECTORY    0x00000001 // directory node
+#define CFS_PROC_FLAG_ATTACHED     0x00000002 // node is attached to proc
+#define CFS_PROC_FLAG_MISCDEV      0x00000004 // miscellaneous device
+
+typedef struct cfs_proc_entry
+{
+    ULONG                    magic;     // Magic
+    ULONG                    flags;     // Flags
+
+    struct _dir_entry {                 // proc directory entry
+        PRTL_SPLAY_LINKS     root;
+    };
+
+    struct _file_entry {                // proc file / leaf entry
+        cfs_read_proc_t    * read_proc;
+        cfs_write_proc_t   * write_proc;
+    };
+
+    mode_t                   mode;
+    unsigned short           nlink;
+
+    struct file_operations * proc_fops;
+    void                   * data;
+
+    // proc_dir_entry ended.
+
+    RTL_SPLAY_LINKS          s_link;    // splay link
+
+    //
+    // Maximum length of proc entry name is 0x20
+    //
+
+    char                     name[0x20];
+
+} cfs_proc_entry_t;
+
+typedef cfs_proc_entry_t cfs_proc_dir_entry_t;
+
+#define PROC_BLOCK_SIZE PAGE_SIZE
+
+/*
+ * Sysctl register
+ */
+
+typedef struct ctl_table        cfs_sysctl_table_t;
+typedef struct ctl_table_header cfs_sysctl_table_header_t;
+
+typedef int ctl_handler (
+            cfs_sysctl_table_t *table,
+            int *name,  int nlen,
+            void *oldval, size_t *oldlenp,
+            void *newval, size_t newlen,
+            void **context );
+
+typedef int proc_handler (
+            cfs_sysctl_table_t *ctl,
+            int write, struct file *filp,
+            void *buffer, size_t *lenp );
+
+int proc_dointvec(cfs_sysctl_table_t *table, int write, struct file *filp,
+                  void *buffer, size_t *lenp);
+
+int proc_dostring(cfs_sysctl_table_t *table, int write, struct file *filp,
+                  void *buffer, size_t *lenp);
+
+int sysctl_string(cfs_sysctl_table_t *table, int *name, int nlen,
+                  void *oldval, size_t *oldlenp,
+                  void *newval, size_t newlen, void **context);
+
+/*
+ * System io control definitions
+ */
+
+#define CTL_MAXNAME 10
+
+#define CTL_ANY     -1  /* Matches any name */
+#define CTL_NONE    0
+
+enum
+{
+    CTL_KERN=1,   /* General kernel info and control */
+    CTL_VM=2,     /* VM management */
+    CTL_NET=3,    /* Networking */
+    CTL_PROC=4,   /* Process info */
+    CTL_FS=5,     /* Filesystems */
+    CTL_DEBUG=6,  /* Debugging */
+    CTL_DEV=7,    /* Devices */
+    CTL_BUS=8,    /* Busses */
+    CTL_ABI=9,    /* Binary emulation */
+    CTL_CPU=10    /* CPU stuff (speed scaling, etc) */
+};
+
+/* sysctl table definitions */
+struct ctl_table
+{
+    int                  ctl_name;
+    char                *procname;
+    void                *data;
+    int                  maxlen;
+    mode_t               mode;
+    cfs_sysctl_table_t  *child;
+    proc_handler        *proc_handler; /* text formatting callback */
+    ctl_handler         *strategy;     /* read / write callback functions */
+    cfs_proc_entry_t    *de;           /* proc entry block */
+    void                *extra1;
+    void                *extra2;
+};
+
+/* the maintainer of the cfs_sysctl_table trees */
+struct ctl_table_header
+{
+    cfs_sysctl_table_t  *ctl_table;
+    struct list_head     ctl_entry;
+};
+
+cfs_proc_entry_t * create_proc_entry(char *name, mode_t mod,
+                                     cfs_proc_entry_t *parent);
+void proc_free_entry(cfs_proc_entry_t *de);
+void remove_proc_entry(char *name, cfs_proc_entry_t *entry);
+cfs_proc_entry_t * search_proc_entry(char *name,
+                                     cfs_proc_entry_t *root);
+
+#define cfs_create_proc_entry create_proc_entry
+#define cfs_free_proc_entry   proc_free_entry
+#define cfs_remove_proc_entry remove_proc_entry
+
+#define register_cfs_sysctl_table(t, a)  register_sysctl_table(t, a)
+#define unregister_cfs_sysctl_table(t)   unregister_sysctl_table(t)
+
+/*
+ * declaration of proc kernel process routines
+ */
+
+cfs_file_t *
+lustre_open_file(char *filename);
+
+int
+lustre_close_file(cfs_file_t *fh);
+
+int
+lustre_do_ioctl( cfs_file_t *fh,
+                 unsigned long cmd,
+                 ulong_ptr arg );
+
+int
+lustre_ioctl_file( cfs_file_t *fh,
+                   PCFS_PROC_IOCTL devctl);
+
+size_t
+lustre_read_file( cfs_file_t *fh,
+                  loff_t off,
+                  size_t size,
+                  char *buf );
+
+size_t
+lustre_write_file( cfs_file_t *fh,
+                   loff_t off,
+                   size_t size,
+                   char *buf );
+
+/*
+ * Wait Queue
+ */
+
+typedef int cfs_task_state_t;
+
+#define CFS_TASK_INTERRUPTIBLE  0x00000001
+#define CFS_TASK_UNINT          0x00000002
+
+#define CFS_WAITQ_MAGIC     'CWQM'
+#define CFS_WAITLINK_MAGIC  'CWLM'
+
+typedef struct cfs_waitq {
+
+    unsigned int        magic;
+    unsigned int        flags;
+
+    spinlock_t          guard;
+    struct list_head    waiters;
+
+} cfs_waitq_t;
+
+typedef struct cfs_waitlink cfs_waitlink_t;
+
+#define CFS_WAITQ_CHANNELS     (2)
+
+#define CFS_WAITQ_CHAN_NORMAL  (0)
+#define CFS_WAITQ_CHAN_FORWARD (1)
+
+typedef struct cfs_waitlink_channel {
+    struct list_head    link;
+    cfs_waitq_t        *waitq;
+    cfs_waitlink_t     *waitl;
+} cfs_waitlink_channel_t;
+
+struct cfs_waitlink {
+
+    unsigned int            magic;
+    int                     flags;
+    event_t                *event;
+    atomic_t               *hits;
+
+    cfs_waitlink_channel_t  waitq[CFS_WAITQ_CHANNELS];
+};
+
+enum {
+    CFS_WAITQ_EXCLUSIVE = 1
+};
+
+#define CFS_DECL_WAITQ(name) cfs_waitq_t name
+
+void cfs_waitq_init(struct cfs_waitq *waitq);
+void cfs_waitlink_init(struct cfs_waitlink *link);
+
+void cfs_waitq_add(struct cfs_waitq *waitq, struct cfs_waitlink *link);
+void cfs_waitq_add_exclusive(struct cfs_waitq *waitq,
+                             struct cfs_waitlink *link);
+void cfs_waitq_forward(struct cfs_waitlink *link, struct cfs_waitq *waitq);
+void cfs_waitq_del(struct cfs_waitq *waitq, struct cfs_waitlink *link);
+int  cfs_waitq_active(struct cfs_waitq *waitq);
+
+void cfs_waitq_signal(struct cfs_waitq *waitq);
+void cfs_waitq_signal_nr(struct cfs_waitq *waitq, int nr);
+void cfs_waitq_broadcast(struct cfs_waitq *waitq);
+
+void cfs_waitq_wait(struct cfs_waitlink *link, cfs_task_state_t state);
+cfs_duration_t cfs_waitq_timedwait(struct cfs_waitlink *link,
+                                   cfs_task_state_t state, cfs_duration_t timeout);
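The wait-queue primitives mirror their Linux counterparts; a minimal sleep/wake sketch (illustrative only; the condition flag is an invented example):

/* Illustrative sketch only -- not part of the original patch. */
static cfs_waitq_t example_waitq;   /* cfs_waitq_init() it once at setup */
static int         example_ready;

static void example_sleep_until_ready(void)
{
    cfs_waitlink_t link;

    cfs_waitlink_init(&link);
    cfs_waitq_add(&example_waitq, &link);
    while (!example_ready)
        cfs_waitq_wait(&link, CFS_TASK_INTERRUPTIBLE);
    cfs_waitq_del(&example_waitq, &link);
}

static void example_mark_ready(void)
{
    example_ready = 1;
    cfs_waitq_signal(&example_waitq);   /* wake one waiter */
}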
+
+/* Kernel thread */
+
+typedef int (*cfs_thread_t)(void *arg);
+
+typedef struct _cfs_thread_context {
+    cfs_thread_t    func;
+    void           *arg;
+} cfs_thread_context_t;
+
+int cfs_kernel_thread(int (*func)(void *), void *arg, int flag);
+
+/*
+ * thread creation flags from Linux, not used in winnt
+ */
+#define CSIGNAL         0x000000ff      /* signal mask to be sent at exit */
+#define CLONE_VM        0x00000100      /* set if VM shared between processes */
+#define CLONE_FS        0x00000200      /* set if fs info shared between processes */
+#define CLONE_FILES     0x00000400      /* set if open files shared between processes */
+#define CLONE_SIGHAND   0x00000800      /* set if signal handlers and blocked signals shared */
+#define CLONE_PID       0x00001000      /* set if pid shared */
+#define CLONE_PTRACE    0x00002000      /* set if we want to let tracing continue on the child too */
+#define CLONE_VFORK     0x00004000      /* set if the parent wants the child to wake it up on mm_release */
+#define CLONE_PARENT    0x00008000      /* set if we want to have the same parent as the cloner */
+#define CLONE_THREAD    0x00010000      /* Same thread group? */
+#define CLONE_NEWNS     0x00020000      /* New namespace group? */
+
+#define CLONE_SIGNAL    (CLONE_SIGHAND | CLONE_THREAD)
+
+/*
+ * sigset ...
+ */
+
+typedef sigset_t cfs_sigset_t;
+
+/*
+ * Task struct
+ */
+
+#define MAX_SCHEDULE_TIMEOUT ((long_ptr)(~0UL>>12))
+
+#define NGROUPS 1
+#define CFS_CURPROC_COMM_MAX (16)
+typedef struct task_struct {
+    mode_t umask;
+
+    pid_t pid;
+    pid_t pgrp;
+
+    uid_t uid,euid,suid,fsuid;
+    gid_t gid,egid,sgid,fsgid;
+
+    int ngroups;
+    gid_t groups[NGROUPS];
+    cfs_kernel_cap_t cap_effective,
+                     cap_inheritable,
+                     cap_permitted;
+
+    char comm[CFS_CURPROC_COMM_MAX];
+    void *journal_info;
+} cfs_task_t;
+
+/*
+ * Linux task_struct emulator ...
+ */ + +#define TASKMAN_MAGIC 'TMAN' /* Task Manager */ +#define TASKSLT_MAGIC 'TSLT' /* Task Slot */ + +typedef struct _TASK_MAN { + + ULONG Magic; /* Magic and Flags */ + ULONG Flags; + + spinlock_t Lock; /* Protection lock */ + + cfs_mem_cache_t * slab; /* Memory slab for task slot */ + + ULONG NumOfTasks; /* Total tasks (threads) */ + LIST_ENTRY TaskList; /* List of task slots */ + +} TASK_MAN, *PTASK_MAN; + +typedef struct _TASK_SLOT { + + ULONG Magic; /* Magic and Flags */ + ULONG Flags; + + LIST_ENTRY Link; /* To be linked to TaskMan */ + + event_t Event; /* Schedule event */ + + HANDLE Pid; /* Process id */ + HANDLE Tid; /* Thread id */ + PETHREAD Tet; /* Pointer to ethread */ + + atomic_t count; /* refer count */ + atomic_t hits; /* times of waken event singaled */ + + KIRQL irql; /* irql for rwlock ... */ + + cfs_task_t task; /* linux task part */ + +} TASK_SLOT, *PTASK_SLOT; + + +#define current cfs_current() +#define set_current_state(s) do {;} while (0) + +#define wait_event(wq, condition) \ +do { \ + cfs_waitlink_t __wait; \ + \ + cfs_waitlink_init(&__wait); \ + while (TRUE) { \ + cfs_waitq_add(&wq, &__wait); \ + if (condition) { \ + break; \ + } \ + cfs_waitq_wait(&__wait, CFS_TASK_INTERRUPTIBLE); \ + cfs_waitq_del(&wq, &__wait); \ + } \ + cfs_waitq_del(&wq, &__wait); \ +} while(0) + +#define wait_event_interruptible(wq, condition, __ret) \ +do { \ + cfs_waitlink_t __wait; \ + \ + __ret = 0; \ + cfs_waitlink_init(&__wait); \ + while (TRUE) { \ + cfs_waitq_add(&wq, &__wait); \ + if (condition) { \ + break; \ + } \ + cfs_waitq_wait(&__wait, CFS_TASK_INTERRUPTIBLE); \ + cfs_waitq_del(&wq, &__wait); \ + } \ + cfs_waitq_del(&wq, &__wait); \ +} while(0) + + +int init_task_manager(); +void cleanup_task_manager(); +cfs_task_t * cfs_current(); +int schedule_timeout(int64_t time); +int schedule(); +int wake_up_process(cfs_task_t * task); +#define cfs_schedule_timeout(state, time) schedule_timeout(time) +void sleep_on(cfs_waitq_t *waitq); + +#define CFS_DECL_JOURNAL_DATA +#define CFS_PUSH_JOURNAL do {;} while(0) +#define CFS_POP_JOURNAL do {;} while(0) + + +/* module related definitions */ + +#ifndef __exit +#define __exit +#endif +#ifndef __init +#define __init +#endif + +#define request_module(x) (0) + +#define EXPORT_SYMBOL(s) +#define MODULE_AUTHOR(s) +#define MODULE_DESCRIPTION(s) +#define MODULE_LICENSE(s) +#define MODULE_PARM(a, b) +#define MODULE_PARM_DESC(a, b) + +#define module_init(X) int __init module_##X() {return X();} +#define module_exit(X) void __exit module_##X() {X();} + +#define DECLARE_INIT(X) extern int __init module_##X(void) +#define DECLARE_EXIT(X) extern void __exit module_##X(void) + +#define MODULE_INIT(X) do { int rc = module_##X(); \ + if (rc) goto errorout; \ + } while(0) + +#define MODULE_EXIT(X) do { module_##X(); } while(0) + + +/* Module interfaces */ +#define cfs_module(name, version, init, fini) \ +module_init(init); \ +module_exit(fini) + + +/* + * Linux kernel version definition + */ + +#define KERNEL_VERSION(a,b,c) ((a)*100+(b)*10+c) +#define LINUX_VERSION_CODE (2*100+6*10+7) + + +/* + * Signal + */ +#define SIGNAL_MASK_ASSERT() + +/* + * Timer + */ + +#define CFS_TIMER_FLAG_INITED 0x00000001 // Initialized already +#define CFS_TIMER_FLAG_TIMERED 0x00000002 // KeSetTimer is called + +typedef struct cfs_timer { + + KSPIN_LOCK Lock; + + ULONG Flags; + + KDPC Dpc; + KTIMER Timer; + + cfs_time_t deadline; + + void (*proc)(ulong_ptr); + void * arg; + +} cfs_timer_t; + + +typedef void (*timer_func_t)(ulong_ptr); + +#define cfs_init_timer(t) + +void 
cfs_timer_init(cfs_timer_t *timer, void (*func)(ulong_ptr), void *arg); +void cfs_timer_done(cfs_timer_t *t); +void cfs_timer_arm(cfs_timer_t *t, cfs_time_t deadline); +void cfs_timer_disarm(cfs_timer_t *t); +int cfs_timer_is_armed(cfs_timer_t *t); +cfs_time_t cfs_timer_deadline(cfs_timer_t *t); + + +/* deschedule for a bit... */ +static inline void cfs_pause(cfs_duration_t ticks) +{ + cfs_schedule_timeout(TASK_UNINTERRUPTIBLE, ticks); +} + + +static inline void cfs_enter_debugger(void) +{ +#if _X86_ + __asm int 3; +#else + KdBreakPoint(); +#endif +} + +/* + * libcfs globals initialization/cleanup + */ + +int +libcfs_arch_init(void); + +void +libcfs_arch_cleanup(void); + +/* + * SMP ... + */ + +#define SMP_CACHE_BYTES 128 +#define __cacheline_aligned +#define NR_CPUS (2) +#define smp_processor_id() KeGetCurrentProcessorNumber() +#define smp_num_cpus NR_CPUS +#define num_online_cpus() smp_num_cpus +#define smp_call_function(f, a, n, w) do {} while(0) + +/* + * Irp related + */ + +#define NR_IRQS 512 +#define in_interrupt() (0) + +/* + * printk flags + */ + +#define KERN_EMERG "<0>" /* system is unusable */ +#define KERN_ALERT "<1>" /* action must be taken immediately */ +#define KERN_CRIT "<2>" /* critical conditions */ +#define KERN_ERR "<3>" /* error conditions */ +#define KERN_WARNING "<4>" /* warning conditions */ +#define KERN_NOTICE "<5>" /* normal but significant condition */ +#define KERN_INFO "<6>" /* informational */ +#define KERN_DEBUG "<7>" /* debug-level messages */ + +/* + * Misc + */ + + +#define inter_module_get(n) cfs_symbol_get(n) +#define inter_module_put(n) cfs_symbol_put(n) + +#ifndef likely +#define likely(exp) (exp) +#endif +#ifndef unlikely +#define unlikely(exp) (exp) +#endif + +#define lock_kernel() do {} while(0) +#define unlock_kernel() do {} while(0) + +#define CAP_SYS_ADMIN 0 +#define CAP_SYS_ROOT 1 + +#define capable(a) (TRUE) + +#define USERMODEHELPER(path, argv, envp) (0) + + +#define local_irq_save(x) +#define local_irq_restore(x) + +#define cfs_assert ASSERT + +#define THREAD_NAME + +#else /* !__KERNEL__ */ + +#define PAGE_CACHE_SIZE PAGE_SIZE +#define PAGE_CACHE_MASK PAGE_MASK + +#define getpagesize() (PAGE_SIZE) + + +typedef struct { + int foo; +} pthread_mutex_t; + +typedef struct { + int foo; +} pthread_cond_t; + +#define pthread_mutex_init(x, y) do {} while(0) +#define pthread_cond_init(x, y) do {} while(0) + +#define pthread_mutex_lock(x) do {} while(0) +#define pthread_mutex_unlock(x) do {} while(0) + +#define pthread_cond_wait(x,y) do {} while(0) +#define pthread_cond_broadcast(x) do {} while(0) + +typedef struct file { + int foo; +} cfs_file_t; + +typedef struct cfs_proc_dir_entry{ + void *data; +}cfs_proc_dir_entry_t; + + + +#include "../user-prim.h" + +#include +#include + +#define strcasecmp strcmp +#define strncasecmp strncmp +#define snprintf _snprintf +#define getpid() (0) + + +#define getpwuid(x) (NULL) +#define getgrgid(x) (NULL) + +int cfs_proc_mknod(const char *path, mode_t mode, dev_t dev); + +int gethostname(char * name, int namelen); + +#define setlinebuf(x) do {} while(0) + + +NTSYSAPI VOID NTAPI DebugBreak(); + + +static inline void cfs_enter_debugger(void) +{ +#if _X86_ + __asm int 3; +#else + DebugBreak(); +#endif +} + +/* Maximum EA Information Length */ +#define EA_MAX_LENGTH (sizeof(FILE_FULL_EA_INFORMATION) + 15) + + +/* + * proc user mode routines + */ + +HANDLE cfs_proc_open (char * filename, int oflag); +int cfs_proc_close(HANDLE handle); +int cfs_proc_read(HANDLE handle, void *buffer, unsigned int count); +int 
cfs_proc_write(HANDLE handle, void *buffer, unsigned int count); +int cfs_proc_ioctl(HANDLE handle, int cmd, void *buffer); + + +/* + * Native API definitions + */ + +// +// Disk I/O Routines +// + +NTSYSAPI +NTSTATUS +NTAPI +NtReadFile(HANDLE FileHandle, + HANDLE Event OPTIONAL, + PIO_APC_ROUTINE ApcRoutine OPTIONAL, + PVOID ApcContext OPTIONAL, + PIO_STATUS_BLOCK IoStatusBlock, + PVOID Buffer, + ULONG Length, + PLARGE_INTEGER ByteOffset OPTIONAL, + PULONG Key OPTIONAL); + +NTSYSAPI +NTSTATUS +NTAPI +NtWriteFile(HANDLE FileHandle, + HANDLE Event OPTIONAL, + PIO_APC_ROUTINE ApcRoutine OPTIONAL, + PVOID ApcContext OPTIONAL, + PIO_STATUS_BLOCK IoStatusBlock, + PVOID Buffer, + ULONG Length, + PLARGE_INTEGER ByteOffset OPTIONAL, + PULONG Key OPTIONAL); + +NTSYSAPI +NTSTATUS +NTAPI +NtClose(HANDLE Handle); + +NTSYSAPI +NTSTATUS +NTAPI +NtCreateFile(PHANDLE FileHandle, + ACCESS_MASK DesiredAccess, + POBJECT_ATTRIBUTES ObjectAttributes, + PIO_STATUS_BLOCK IoStatusBlock, + PLARGE_INTEGER AllocationSize OPTIONAL, + ULONG FileAttributes, + ULONG ShareAccess, + ULONG CreateDisposition, + ULONG CreateOptions, + PVOID EaBuffer OPTIONAL, + ULONG EaLength); + + +NTSYSAPI +NTSTATUS +NTAPI +NtDeviceIoControlFile( + IN HANDLE FileHandle, + IN HANDLE Event, + IN PIO_APC_ROUTINE ApcRoutine, + IN PVOID ApcContext, + OUT PIO_STATUS_BLOCK IoStatusBlock, + IN ULONG IoControlCode, + IN PVOID InputBuffer, + IN ULONG InputBufferLength, + OUT PVOID OutputBuffer, + OUT ULONG OutputBufferLength + ); + +NTSYSAPI +NTSTATUS +NTAPI +NtFsControlFile( + IN HANDLE FileHandle, + IN HANDLE Event OPTIONAL, + IN PIO_APC_ROUTINE ApcRoutine OPTIONAL, + IN PVOID ApcContext OPTIONAL, + OUT PIO_STATUS_BLOCK IoStatusBlock, + IN ULONG FsControlCode, + IN PVOID InputBuffer OPTIONAL, + IN ULONG InputBufferLength, + OUT PVOID OutputBuffer OPTIONAL, + IN ULONG OutputBufferLength +); + + +NTSYSAPI +NTSTATUS +NTAPI +NtQueryInformationFile( + IN HANDLE FileHandle, + OUT PIO_STATUS_BLOCK IoStatusBlock, + OUT PVOID FileInformation, + IN ULONG Length, + IN FILE_INFORMATION_CLASS FileInformationClass + ); + +// +// Random routines ... +// + +NTSYSAPI +ULONG +NTAPI +RtlRandom( + IN OUT PULONG Seed + ); + +#endif /* __KERNEL__ */ + + +// +// Inode flags (Linux uses octad number, but why ? strange!!!) 
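// An illustrative aside (not part of the original patch): these macros
// are used exactly like their POSIX counterparts, e.g.:
//
//     if (S_ISDIR(mode))              /* is it a directory?  */
//         ...
//     if (mode & S_IRUSR)             /* owner-readable?     */
//         ...
//     mode = S_IFREG | S_IRUGO;       /* world-readable file */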
+// + +#undef S_IFMT +#undef S_IFDIR +#undef S_IFCHR +#undef S_IFREG +#undef S_IREAD +#undef S_IWRITE +#undef S_IEXEC + +#define S_IFMT 0x0F000 /* 017 0000 */ +#define S_IFSOCK 0x0C000 /* 014 0000 */ +#define S_IFLNK 0x0A000 /* 012 0000 */ +#define S_IFREG 0x08000 /* 010 0000 */ +#define S_IFBLK 0x06000 /* 006 0000 */ +#define S_IFDIR 0x04000 /* 004 0000 */ +#define S_IFCHR 0x02000 /* 002 0000 */ +#define S_IFIFO 0x01000 /* 001 0000 */ +#define S_ISUID 0x00800 /* 000 4000 */ +#define S_ISGID 0x00400 /* 000 2000 */ +#define S_ISVTX 0x00200 /* 000 1000 */ + +#define S_ISREG(m) (((m) & S_IFMT) == S_IFREG) +#define S_ISSOCK(m) (((m) & S_IFMT) == S_IFSOCK) +#define S_ISLNK(m) (((m) & S_IFMT) == S_IFLNK) +#define S_ISFIL(m) (((m) & S_IFMT) == S_IFFIL) +#define S_ISBLK(m) (((m) & S_IFMT) == S_IFBLK) +#define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR) +#define S_ISCHR(m) (((m) & S_IFMT) == S_IFCHR) +#define S_ISFIFO(m) (((m) & S_IFMT) == S_IFIFO) + +#define S_IPERMISSION_MASK 0x1FF /* */ + +#define S_IRWXU 0x1C0 /* 0 0700 */ +#define S_IRUSR 0x100 /* 0 0400 */ +#define S_IWUSR 0x080 /* 0 0200 */ +#define S_IXUSR 0x040 /* 0 0100 */ + +#define S_IRWXG 0x038 /* 0 0070 */ +#define S_IRGRP 0x020 /* 0 0040 */ +#define S_IWGRP 0x010 /* 0 0020 */ +#define S_IXGRP 0x008 /* 0 0010 */ + +#define S_IRWXO 0x007 /* 0 0007 */ +#define S_IROTH 0x004 /* 0 0004 */ +#define S_IWOTH 0x002 /* 0 0002 */ +#define S_IXOTH 0x001 /* 0 0001 */ + +#define S_IRWXUGO (S_IRWXU|S_IRWXG|S_IRWXO) +#define S_IALLUGO (S_ISUID|S_ISGID|S_ISVTX|S_IRWXUGO) +#define S_IRUGO (S_IRUSR|S_IRGRP|S_IROTH) +#define S_IWUGO (S_IWUSR|S_IWGRP|S_IWOTH) +#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH) + +/* + * linux ioctl coding definitions + */ + +#define _IOC_NRBITS 8 +#define _IOC_TYPEBITS 8 +#define _IOC_SIZEBITS 14 +#define _IOC_DIRBITS 2 + +#define _IOC_NRMASK ((1 << _IOC_NRBITS)-1) +#define _IOC_TYPEMASK ((1 << _IOC_TYPEBITS)-1) +#define _IOC_SIZEMASK ((1 << _IOC_SIZEBITS)-1) +#define _IOC_DIRMASK ((1 << _IOC_DIRBITS)-1) + +#define _IOC_NRSHIFT 0 +#define _IOC_TYPESHIFT (_IOC_NRSHIFT+_IOC_NRBITS) +#define _IOC_SIZESHIFT (_IOC_TYPESHIFT+_IOC_TYPEBITS) +#define _IOC_DIRSHIFT (_IOC_SIZESHIFT+_IOC_SIZEBITS) + +/* + * Direction bits. + */ +#define _IOC_NONE 0U +#define _IOC_WRITE 1U +#define _IOC_READ 2U + +#define _IOC(dir,type,nr,size) \ + (((dir) << _IOC_DIRSHIFT) | \ + ((type) << _IOC_TYPESHIFT) | \ + ((nr) << _IOC_NRSHIFT) | \ + ((size) << _IOC_SIZESHIFT)) + +/* used to create numbers */ +#define _IO(type,nr) _IOC(_IOC_NONE,(type),(nr),0) +#define _IOR(type,nr,size) _IOC(_IOC_READ,(type),(nr),sizeof(size)) +#define _IOW(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),sizeof(size)) +#define _IOWR(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size)) + +/* used to decode ioctl numbers.. */ +#define _IOC_DIR(nr) (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK) +#define _IOC_TYPE(nr) (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK) +#define _IOC_NR(nr) (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK) +#define _IOC_SIZE(nr) (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK) + +/* + * Io vector ... + */ + +struct iovec +{ + void *iov_base; + size_t iov_len; +}; + + +#define ULONG_LONG_MAX ((__u64)(0xFFFFFFFFFFFFFFFF)) +/* + * Convert a string to an unsigned long long integer. + * + * Ignores `locale' stuff. Assumes that the upper and lower case + * alphabets and digits are each contiguous. 
+ */ +static inline __u64 +strtoull( + char *nptr, + char **endptr, + int base) +{ + char *s = nptr; + __u64 acc, cutoff; + int c, neg = 0, any, cutlim; + + /* + * See strtol for comments as to the logic used. + */ + do { + c = *s++; + } while (isspace(c)); + if (c == '-') { + neg = 1; + c = *s++; + } else if (c == '+') + c = *s++; + if ((base == 0 || base == 16) && + c == '0' && (*s == 'x' || *s == 'X')) { + c = s[1]; + s += 2; + base = 16; + } + if (base == 0) + base = c == '0' ? 8 : 10; + cutoff = (__u64)ULONG_LONG_MAX / (__u64)base; + cutlim = (int)((__u64)ULONG_LONG_MAX % (__u64)base); + for (acc = 0, any = 0;; c = *s++) { + if (isdigit(c)) + c -= '0'; + else if (isalpha(c)) + c -= isupper(c) ? 'A' - 10 : 'a' - 10; + else + break; + if (c >= base) + break; + if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim)) + any = -1; + else { + any = 1; + acc *= base; + acc += c; + } + } + if (any < 0) { + acc = ULONG_LONG_MAX; + } else if (neg) + acc = 0 - acc; + if (endptr != 0) + *endptr = (char *) (any ? s - 1 : nptr); + return (acc); +} + +#endif diff --git a/lnet/include/libcfs/winnt/winnt-tcpip.h b/lnet/include/libcfs/winnt/winnt-tcpip.h new file mode 100644 index 0000000..a988247 --- /dev/null +++ b/lnet/include/libcfs/winnt/winnt-tcpip.h @@ -0,0 +1,660 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or modify it under the + * terms of version 2 of the GNU General Public License as published by the + * Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along + * with Lustre; if not, write to the Free Software Foundation, Inc., 675 Mass + * Ave, Cambridge, MA 02139, USA. + * + * Implementation of portable time API for Winnt (kernel and user-level). + * + */ + +#ifndef __LIBCFS_WINNT_TCPIP_H__ +#define __LIBCFS_WINNT_TCPIP_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + + +#ifdef __KERNEL__ + +// +// ks definitions +// + +// iovec is defined in libcfs: winnt_prim.h +// lnetkiov_t is defined in lnet/types.h + +typedef struct socket ksock_tconn_t; +typedef struct socket cfs_socket_t; + +// completion notification callback routine + +typedef VOID (*ksock_schedule_cb)(struct socket*, int, void *, ulong_ptr); + +/* completion routine to update tx structure for async sending */ +typedef PVOID (*ksock_update_tx)(struct socket*, PVOID tx, ulong_ptr); + +// +// tdinal definitions +// + + +#if TDI_LIBCFS_DBG +#define KsPrint(X) KsPrintf X +#else +#define KsPrint(X) +#endif + + +// +// Socket Addresses Related ... 
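// An illustrative note (not part of the original patch): as on Linux,
// these special addresses are plain host-order constants; a listener
// typically binds to INADDR_ANY (0.0.0.0) rather than to one specific
// local interface.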
+// + +#define INADDR_ANY (ULONG)0x00000000 +#define INADDR_LOOPBACK (ULONG)0x7f000001 +#define INADDR_BROADCAST (ULONG)0xffffffff +#define INADDR_NONE (ULONG)0xffffffff + +/* + * TCP / IP options + */ + +#define SOL_TCP 6 +#define SOL_UDP 17 + + +#define TL_INSTANCE 0 + +#define TCP_SOCKET_NODELAY 1 // disabling "Nagle" +#define TCP_SOCKET_KEEPALIVE 2 +#define TCP_SOCKET_OOBINLINE 3 +#define TCP_SOCKET_BSDURGENT 4 +#define TCP_SOCKET_ATMARK 5 +#define TCP_SOCKET_WINDOW 6 + + +/* Flags we can use with send/ and recv. + Added those for 1003.1g not all are supported yet + */ + +#define MSG_OOB 1 +#define MSG_PEEK 2 +#define MSG_DONTROUTE 4 +#define MSG_TRYHARD 4 /* Synonym for MSG_DONTROUTE for DECnet */ +#define MSG_CTRUNC 8 +#define MSG_PROBE 0x10 /* Do not send. Only probe path f.e. for MTU */ +#define MSG_TRUNC 0x20 +#define MSG_DONTWAIT 0x40 /* Nonblocking io */ +#define MSG_EOR 0x80 /* End of record */ +#define MSG_WAITALL 0x100 /* Wait for a full request */ +#define MSG_FIN 0x200 +#define MSG_SYN 0x400 +#define MSG_CONFIRM 0x800 /* Confirm path validity */ +#define MSG_RST 0x1000 +#define MSG_ERRQUEUE 0x2000 /* Fetch message from error queue */ +#define MSG_NOSIGNAL 0x4000 /* Do not generate SIGPIPE */ +#define MSG_MORE 0x8000 /* Sender will send more */ + +#define MSG_EOF MSG_FIN + + +// +// Maximum TRANSPORT_ADDRESS Length +// +// it must >= FIELD_OFFSET(TRANSPORT_ADDRESS, Address->Address) +// + TDI_ADDRESS_LENGTH_IP +// +// I define it a little large and 16 bytes aligned to avoid possible overflow. +// + +#define MAX_ADDRESS_LENGTH (0x30) + + +// +// Maximum Listers Children Sockets +// + +#define MAX_CHILD_LISTENERS (4) + +// +// Maximum EA Information Length +// + +#define EA_MAX_LENGTH ( sizeof(FILE_FULL_EA_INFORMATION) - 1 + \ + TDI_TRANSPORT_ADDRESS_LENGTH + 1 + \ + MAX_ADDRESS_LENGTH ) + + +#define UDP_DEVICE_NAME L"\\Device\\Udp" +#define TCP_DEVICE_NAME L"\\Device\\Tcp" + + +/* + * TSDU definitions + */ + +#define TDINAL_TSDU_DEFAULT_SIZE (0x10000) + +#define KS_TSDU_MAGIC 'KSTD' + +#define KS_TSDU_ATTACHED 0x00000001 // Attached to the socket receive tsdu list + +typedef struct _KS_TSDU { + + ULONG Magic; + ULONG Flags; + + struct list_head Link; + + ULONG TotalLength; // Total size of KS_TSDU + + ULONG StartOffset; // Start offset of the first Tsdu unit + ULONG LastOffset; // End offset of the last Tsdu unit + +/* + union { + KS_TSDU_DAT[]; + KS_TSDU_BUF[]; + KS_TSDU_MDL[]; + } +*/ + +} KS_TSDU, *PKS_TSDU; + +#define TSDU_TYPE_BUF ((USHORT)0x5401) +#define TSDU_TYPE_DAT ((USHORT)0x5402) +#define TSDU_TYPE_MDL ((USHORT)0x5403) + +#define KS_TSDU_BUF_RECEIVING 0x0001 +typedef struct _KS_TSDU_BUF { + + USHORT TsduType; + USHORT TsduFlags; + + ULONG DataLength; + ULONG StartOffset; + + PVOID UserBuffer; + +} KS_TSDU_BUF, *PKS_TSDU_BUF; + +#define KS_TSDU_DAT_RECEIVING 0x0001 + +typedef struct _KS_TSDU_DAT { + + USHORT TsduType; + USHORT TsduFlags; + + ULONG DataLength; + ULONG StartOffset; + + ULONG TotalLength; + + UCHAR Data[1]; + +} KS_TSDU_DAT, *PKS_TSDU_DAT; + +#define KS_DWORD_ALIGN(x) (((x) + 0x03) & (~(0x03))) +#define KS_TSDU_STRU_SIZE(Len) (KS_DWORD_ALIGN((Len) + FIELD_OFFSET(KS_TSDU_DAT, Data))) + +typedef struct _KS_TSDU_MDL { + + USHORT TsduType; + USHORT TsduFlags; + + ULONG DataLength; + ULONG StartOffset; + + PMDL Mdl; + PVOID Descriptor; + +} KS_TSDU_MDL, *PKS_TSDU_MDL; + + +typedef struct _KS_TSDUMGR { + + struct list_head TsduList; + ULONG NumOfTsdu; + ULONG TotalBytes; + KEVENT Event; + +} KS_TSDUMGR, *PKS_TSDUMGR; + + +typedef struct _KS_CHAIN { + + 
KS_TSDUMGR Normal; + KS_TSDUMGR Expedited; + +} KS_CHAIN, *PKS_CHAIN; + + +#define TDINAL_SCHED_FACTOR (1) +#define CAN_BE_SCHED(Len, Limit) (Len >= ((Limit) >> TDINAL_SCHED_FACTOR)) + +// +// Handler Settings Indictor +// + +#define TDI_EVENT_MAXIMUM_HANDLER (TDI_EVENT_ERROR_EX + 1) + + +typedef struct _KS_EVENT_HANDLERS { + BOOLEAN IsActive[TDI_EVENT_MAXIMUM_HANDLER]; + PVOID Handler [TDI_EVENT_MAXIMUM_HANDLER]; +} KS_EVENT_HANDLERS, *PKS_EVENT_HANDLERS; + +#define SetEventHandler(ha, ht, hr) do { \ + ha.IsActive[ht] = TRUE; \ + ha.Handler[ht] = (PVOID) (hr); \ + } while(0) + +// +// KSock Internal Structures +// + +typedef struct _KS_ADDRESS { + + union { + TRANSPORT_ADDRESS Tdi; + UCHAR Pading[MAX_ADDRESS_LENGTH]; + }; + + HANDLE Handle; + PFILE_OBJECT FileObject; + +} KS_ADDRESS, *PKS_ADDRESS; + +// +// Structures for Disconnect Workitem +// + +typedef struct _KS_DISCONNECT_WORKITEM { + + WORK_QUEUE_ITEM WorkItem; // Workitem to perform disconnection + ksock_tconn_t * tconn; // tdi connecton + ULONG Flags; // connection broken/discnnection flags + KEVENT Event; // sync event + +} KS_DISCONNECT_WORKITEM, *PKS_DISCONNECT_WORKITEM; + + +typedef struct _KS_CONNECTION { + + HANDLE Handle; // Handle of the tdi connection + PFILE_OBJECT FileObject; // FileObject if the conn object + + PTRANSPORT_ADDRESS Remote; // the ConnectionInfo of this connection + PTDI_CONNECTION_INFORMATION ConnectionInfo; + + ULONG nagle; // Tcp options + +} KS_CONNECTION, *PKS_CONNECTION; + + +// +// type definitions +// + +typedef MDL ksock_mdl_t; +typedef UNICODE_STRING ksock_unicode_name_t; +typedef WORK_QUEUE_ITEM ksock_workitem_t; + + +typedef KS_CHAIN ksock_chain_t; +typedef KS_ADDRESS ksock_tdi_addr_t; +typedef KS_CONNECTION ksock_tconn_info_t; +typedef KS_DISCONNECT_WORKITEM ksock_disconnect_workitem_t; + + +// +// Structures for transmission done Workitem +// + +typedef struct _KS_TCPX_FINILIZE { + ksock_workitem_t item; + void * tx; +} ksock_tcpx_fini_t; + + +typedef struct ksock_backlogs { + + struct list_head list; /* list to link the backlog connections */ + int num; /* number of backlogs in the list */ + +} ksock_backlogs_t; + + +typedef struct ksock_daemon { + + ksock_tconn_t * tconn; /* the listener connection object */ + unsigned short nbacklogs; /* number of listening backlog conns */ + unsigned short port; /* listening port number */ + int shutdown; /* daemon threads is to exit */ + struct list_head list; /* to be attached into ksock_nal_data_t*/ + +} ksock_daemon_t ; + + +typedef enum { + + kstt_sender = 0, // normal sending connection type, it's active connection, while + // child tconn is for passive connection. + + kstt_listener, // listener daemon type, it just acts as a daemon, and it does + // not have real connection. It manages children tcons to accept + // or refuse the connecting request from remote peers. + + kstt_child, // accepted child connection type, it's parent must be Listener + kstt_lasttype +} ksock_tconn_type; + +typedef enum { + + ksts_uninited = 0, // tconn is just allocated (zero values), not initialized yet + + ksts_inited, // tconn structure initialized: so it now can be identified as + // a sender, listener or a child + + ksts_bind, // tconn is bound: the local address object (ip/port) is created. + // after being bound, we must call ksocknal_put_tconn to release + // the tconn objects, it's not safe just to free the memory of tconn. + + ksts_associated, // the connection object is created and associated with the address + // object. so it's ready for connection. 
only for child and sender. + + ksts_connecting, // only used by child tconn: in the ConnectEvent handler routine, + // it indicts the child tconn is busy to be connected to the peer. + + ksts_connected, // the connection is built already: for sender and child + + ksts_listening, // listener daemon is working, only for listener tconn + + ksts_disconnected, // disconnected by user + ksts_aborted, // un-exptected broken status + + ksts_last // total number of tconn statuses +} ksock_tconn_state; + +#define KS_TCONN_MAGIC 'KSTM' + +#define KS_TCONN_HANDLERS_SET 0x00000001 // Conection handlers are set. +#define KS_TCONN_DISCONNECT_BUSY 0x00010000 // Disconnect Workitem is queued ... +#define KS_TCONN_DESTROY_BUSY 0x00020000 // Destory Workitem is queued ... + +#define KS_TCONN_DAEMON_STARTED 0x00100000 // indict the daemon is started, + // only valid for listener + +struct socket { + + ulong_ptr kstc_magic; /* Magic & Flags */ + ulong_ptr kstc_flags; + + spinlock_t kstc_lock; /* serialise lock*/ + void * kstc_conn; /* ksock_conn_t */ + + ksock_tconn_type kstc_type; /* tdi connection Type */ + ksock_tconn_state kstc_state; /* tdi connection state flag */ + + ksock_unicode_name_t kstc_dev; /* tcp transport device name */ + + ksock_tdi_addr_t kstc_addr; /* local address handlers / Objects */ + + atomic_t kstc_refcount; /* reference count of ksock_tconn */ + + struct list_head kstc_list; /* linked to global ksocknal_data */ + + union { + + struct { + int nbacklog; /* total number of backlog tdi connections */ + ksock_backlogs_t kstc_listening; /* listeing backlog child connections */ + ksock_backlogs_t kstc_accepted; /* connected backlog child connections */ + event_t kstc_accept_event; /* Signaled by AcceptedHander, + ksocknal_wait_accpeted_conns waits on */ + event_t kstc_destroy_event; /* Signaled when accepted child is released */ + } listener; + + struct { + ksock_tconn_info_t kstc_info; /* Connection Info if Connected */ + ksock_chain_t kstc_recv; /* tsdu engine for data receiving */ + ksock_chain_t kstc_send; /* tsdu engine for data sending */ + + int kstc_queued; /* Attached to Parent->ChildList ... */ + int kstc_queueno; /* 0: Attached to Listening list + 1: Attached to Accepted list */ + + int kstc_busy; /* referred by ConnectEventCallback ? */ + int kstc_accepted; /* the connection is built ready ? 
*/ + + struct list_head kstc_link; /* linked to parent tdi connection */ + ksock_tconn_t * kstc_parent; /* pointers to it's listener parent */ + } child; + + struct { + ksock_tconn_info_t kstc_info; /* Connection Info if Connected */ + ksock_chain_t kstc_recv; /* tsdu engine for data receiving */ + ksock_chain_t kstc_send; /* tsdu engine for data sending */ + } sender; + }; + + ulong_ptr kstc_snd_wnd; /* Sending window size */ + ulong_ptr kstc_rcv_wnd; /* Recving window size */ + + ksock_workitem_t kstc_destroy; /* tconn destruction workitem */ + ksock_disconnect_workitem_t kstc_disconnect; /* connection disconnect workitem */ + + ksock_schedule_cb kstc_sched_cb; /* notification callback routine of completion */ + ksock_update_tx kstc_update_tx; /* aync sending callback to update tx */ +}; + +#define SOCK_WMEM_QUEUED(sock) (0) + +#define TDINAL_WINDOW_DEFAULT_SIZE (0x100000) + + +struct _KS_UDP_COMPLETION_CONTEXT; +struct _KS_TCP_COMPLETION_CONTEXT; + + +typedef +NTSTATUS +(*PKS_UDP_COMPLETION_ROUTINE) ( + IN PIRP Irp, + IN struct _KS_UDP_COMPLETION_CONTEXT + *UdpContext + ); + + +typedef +NTSTATUS +(*PKS_TCP_COMPLETION_ROUTINE) ( + IN PIRP Irp, + IN struct _KS_TCP_COMPLETION_CONTEXT + *TcpContext + ); + +// +// Udp Irp Completion Context +// + +typedef struct _KS_UDP_COMPLETION_CONTEXT { + + PKEVENT Event; + union { + PFILE_OBJECT AddressObject; + ksock_tconn_t * tconn; + }; + + PKS_UDP_COMPLETION_ROUTINE CompletionRoutine; + PVOID CompletionContext; + +} KS_UDP_COMPLETION_CONTEXT, *PKS_UDP_COMPLETION_CONTEXT; + + +// +// Tcp Irp Completion Context (used by tcp data recv/send) +// + +typedef struct _KS_TCP_COMPLETION_CONTEXT { + + PKEVENT Event; // Event to be waited on by Irp caller ... + + ksock_tconn_t * tconn; // the tdi connection + + PKS_TCP_COMPLETION_ROUTINE CompletionRoutine; + PVOID CompletionContext; + PVOID CompletionContext2; + + PKS_TSDUMGR KsTsduMgr; // Tsdu buffer manager + + // + // These tow new members are for NON_BLOCKING transmission + // + + BOOLEAN bCounted; // To indict needing refcount to + // execute CompetionRoutine + ULONG ReferCount; // Refer count of this structure + +} KS_TCP_COMPLETION_CONTEXT, *PKS_TCP_COMPLETION_CONTEXT; + +typedef KS_TCP_COMPLETION_CONTEXT ksock_tdi_tx_t, ksock_tdi_rx_t; + + +/* + * tdi extensions + */ + +#define IOCTL_TCP_QUERY_INFORMATION_EX \ + CTL_CODE(FILE_DEVICE_NETWORK, 0, METHOD_NEITHER, FILE_ANY_ACCESS) +#define IOCTL_TCP_SET_INFORMATION_EX \ + CTL_CODE(FILE_DEVICE_NETWORK, 1, METHOD_BUFFERED, FILE_WRITE_ACCESS) + + +#define TcpBuildSetInformationEx(Irp, DevObj, FileObj, CompRoutine, Contxt, Buffer, BufferLen)\ + { \ + PIO_STACK_LOCATION _IRPSP; \ + if ( CompRoutine != NULL) { \ + IoSetCompletionRoutine( Irp, CompRoutine, Contxt, TRUE, TRUE, TRUE);\ + } else { \ + IoSetCompletionRoutine( Irp, NULL, NULL, FALSE, FALSE, FALSE); \ + } \ + _IRPSP = IoGetNextIrpStackLocation (Irp); \ + _IRPSP->MajorFunction = IRP_MJ_DEVICE_CONTROL; \ + _IRPSP->DeviceObject = DevObj; \ + _IRPSP->FileObject = FileObj; \ + _IRPSP->Parameters.DeviceIoControl.OutputBufferLength = 0; \ + _IRPSP->Parameters.DeviceIoControl.InputBufferLength = BufferLen; \ + _IRPSP->Parameters.DeviceIoControl.IoControlCode = IOCTL_TCP_SET_INFORMATION_EX; \ + Irp->AssociatedIrp.SystemBuffer = Buffer; \ + } + + +#define TcpBuildQueryInformationEx(Irp, DevObj, FileObj, CompRoutine, Contxt, InBuffer, InLength, OutBuffer, OutLength)\ + { \ + PIO_STACK_LOCATION _IRPSP; \ + if ( CompRoutine != NULL) { \ + IoSetCompletionRoutine( Irp, CompRoutine, Contxt, TRUE, TRUE, TRUE);\ + } 
else { \ + IoSetCompletionRoutine( Irp, NULL, NULL, FALSE, FALSE, FALSE); \ + } \ + _IRPSP = IoGetNextIrpStackLocation (Irp); \ + _IRPSP->MajorFunction = IRP_MJ_DEVICE_CONTROL; \ + _IRPSP->DeviceObject = DevObj; \ + _IRPSP->FileObject = FileObj; \ + _IRPSP->Parameters.DeviceIoControl.OutputBufferLength = OutLength; \ + _IRPSP->Parameters.DeviceIoControl.InputBufferLength = InLength; \ + _IRPSP->Parameters.DeviceIoControl.IoControlCode = IOCTL_TCP_QUERY_INFORMATION_EX; \ + _IRPSP->Parameters.DeviceIoControl.Type3InputBuffer = InBuffer; \ + Irp->UserBuffer = OutBuffer; \ + } + + +typedef struct ks_addr_slot { + LIST_ENTRY link; + int up; + char iface[40]; + __u32 ip_addr; + __u32 netmask; + UNICODE_STRING devname; + WCHAR buffer[1]; +} ks_addr_slot_t; + +typedef struct { + + /* + * Tdi client information + */ + + UNICODE_STRING ksnd_client_name; /* tdi client module name */ + HANDLE ksnd_pnp_handle; /* the handle for pnp changes */ + + spinlock_t ksnd_addrs_lock; /* serialize ip address list access */ + LIST_ENTRY ksnd_addrs_list; /* list of the ip addresses */ + int ksnd_naddrs; /* number of the ip addresses */ + + /* + * Tdilnd internal defintions + */ + + int ksnd_init; /* initialisation state */ + + TDI_PROVIDER_INFO ksnd_provider; /* tdi tcp/ip provider's information */ + + spinlock_t ksnd_tconn_lock; /* tdi connections access serialise */ + + int ksnd_ntconns; /* number of tconns attached in list */ + struct list_head ksnd_tconns; /* tdi connections list */ + cfs_mem_cache_t * ksnd_tconn_slab; /* slabs for ksock_tconn_t allocations */ + event_t ksnd_tconn_exit; /* exit event to be signaled by the last tconn */ + + spinlock_t ksnd_tsdu_lock; /* tsdu access serialise */ + + int ksnd_ntsdus; /* number of tsdu buffers allocated */ + ulong_ptr ksnd_tsdu_size; /* the size of a signel tsdu buffer */ + cfs_mem_cache_t * ksnd_tsdu_slab; /* slab cache for tsdu buffer allocation */ + + int ksnd_nfreetsdus; /* number of tsdu buffers in the freed list */ + struct list_head ksnd_freetsdus; /* List of the freed Tsdu buffer. */ + + spinlock_t ksnd_daemon_lock; /* stabilize daemon ops */ + int ksnd_ndaemons; /* number of listening daemons */ + struct list_head ksnd_daemons; /* listening daemon list */ + event_t ksnd_daemon_exit; /* the last daemon quiting should singal it */ + +} ks_data_t; + +int +ks_init_tdi_data(); + +void +ks_fini_tdi_data(); + + +#endif /* __KERNEL__ */ +#endif /* __LIBCFS_WINNT_TCPIP_H__ */ + +/* + * Local variables: + * c-indentation-style: "K&R" + * c-basic-offset: 8 + * tab-width: 8 + * fill-column: 80 + * scroll-step: 1 + * End: + */ diff --git a/lnet/include/libcfs/winnt/winnt-time.h b/lnet/include/libcfs/winnt/winnt-time.h new file mode 100644 index 0000000..d31f854 --- /dev/null +++ b/lnet/include/libcfs/winnt/winnt-time.h @@ -0,0 +1,315 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or modify it under the + * terms of version 2 of the GNU General Public License as published by the + * Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. 
+ * + * You should have received a copy of the GNU General Public License along + * with Lustre; if not, write to the Free Software Foundation, Inc., 675 Mass + * Ave, Cambridge, MA 02139, USA. + * + * Implementation of portable time API for Winnt (kernel and user-level). + * + */ + +#ifndef __LIBCFS_WINNT_LINUX_TIME_H__ +#define __LIBCFS_WINNT_LINUX_TIME_H__ + +#ifndef __LIBCFS_LIBCFS_H__ +#error Do not #include this file directly. #include instead +#endif + +/* Portable time API */ + +/* + * Platform provides three opaque data-types: + * + * cfs_time_t represents point in time. This is internal kernel + * time rather than "wall clock". This time bears no + * relation to gettimeofday(). + * + * cfs_duration_t represents time interval with resolution of internal + * platform clock + * + * cfs_fs_time_t represents instance in world-visible time. This is + * used in file-system time-stamps + * + * cfs_time_t cfs_time_current(void); + * cfs_time_t cfs_time_add (cfs_time_t, cfs_duration_t); + * cfs_duration_t cfs_time_sub (cfs_time_t, cfs_time_t); + * int cfs_time_before (cfs_time_t, cfs_time_t); + * int cfs_time_beforeq(cfs_time_t, cfs_time_t); + * + * cfs_duration_t cfs_duration_build(int64_t); + * + * time_t cfs_duration_sec (cfs_duration_t); + * void cfs_duration_usec(cfs_duration_t, struct timeval *); + * void cfs_duration_nsec(cfs_duration_t, struct timespec *); + * + * void cfs_fs_time_current(cfs_fs_time_t *); + * time_t cfs_fs_time_sec (cfs_fs_time_t *); + * void cfs_fs_time_usec (cfs_fs_time_t *, struct timeval *); + * void cfs_fs_time_nsec (cfs_fs_time_t *, struct timespec *); + * int cfs_fs_time_before (cfs_fs_time_t *, cfs_fs_time_t *); + * int cfs_fs_time_beforeq(cfs_fs_time_t *, cfs_fs_time_t *); + * + * CFS_TIME_FORMAT + * CFS_DURATION_FORMAT + * + */ + +#define ONE_BILLION ((u_int64_t)1000000000) +#define ONE_MILLION ((u_int64_t) 1000000) + +#define HZ (100) + +struct timeval { + time_t tv_sec; /* seconds */ + suseconds_t tv_usec; /* microseconds */ +}; + +struct timespec { + ulong_ptr tv_sec; + ulong_ptr tv_nsec; +}; + +#ifdef __KERNEL__ + +#include + +/* + * Generic kernel stuff + */ + +typedef struct timeval cfs_fs_time_t; + +typedef u_int64_t cfs_time_t; +typedef int64_t cfs_duration_t; + +static inline void do_gettimeofday(struct timeval *tv) +{ + LARGE_INTEGER Time; + + KeQuerySystemTime(&Time); + + tv->tv_sec = (long_ptr) (Time.QuadPart / 10000000); + tv->tv_usec = (long_ptr) (Time.QuadPart % 10000000) / 10; +} + +static inline cfs_time_t JIFFIES() +{ + LARGE_INTEGER Tick; + LARGE_INTEGER Elapse; + + KeQueryTickCount(&Tick); + + Elapse.QuadPart = Tick.QuadPart * KeQueryTimeIncrement(); + Elapse.QuadPart /= (10000000 / HZ); + + return Elapse.QuadPart; +} + +static inline cfs_time_t cfs_time_current(void) +{ + return JIFFIES(); +} + +static inline cfs_time_t cfs_time_current_sec(void) +{ + return (JIFFIES() / HZ); +} + +static inline cfs_time_t cfs_time_add(cfs_time_t t, cfs_duration_t d) +{ + return (t + d); +} + +static inline cfs_duration_t cfs_time_sub(cfs_time_t t1, cfs_time_t t2) +{ + return (t1 - t2); +} + +static inline int cfs_time_before(cfs_time_t t1, cfs_time_t t2) +{ + return ((int64_t)t1 - (int64_t)t2) < 0; +} + +static inline int cfs_time_beforeq(cfs_time_t t1, cfs_time_t t2) +{ + return ((int64_t)t1 - (int64_t)t2) <= 0; +} + +static inline void cfs_fs_time_current(cfs_fs_time_t *t) +{ + ULONG Linux; + LARGE_INTEGER Sys; + + KeQuerySystemTime(&Sys); + + RtlTimeToSecondsSince1970(&Sys, &Linux); + + t->tv_sec = Linux; + t->tv_usec = (Sys.LowPart % 10000000) / 
10;
+}
+
+static inline cfs_time_t cfs_fs_time_sec(cfs_fs_time_t *t)
+{
+    return t->tv_sec;
+}
+
+static inline u_int64_t __cfs_fs_time_flat(cfs_fs_time_t *t)
+{
+    return ((u_int64_t)t->tv_sec) * ONE_MILLION + t->tv_usec;
+}
+
+static inline int cfs_fs_time_before(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
+{
+    return (__cfs_fs_time_flat(t1) < __cfs_fs_time_flat(t2));
+}
+
+static inline int cfs_fs_time_beforeq(cfs_fs_time_t *t1, cfs_fs_time_t *t2)
+{
+    return (__cfs_fs_time_flat(t1) <= __cfs_fs_time_flat(t2));
+}
+
+static inline cfs_duration_t cfs_time_seconds(int seconds)
+{
+    return (cfs_duration_t)seconds * HZ;
+}
+
+static inline cfs_time_t cfs_duration_sec(cfs_duration_t d)
+{
+    return d / HZ;
+}
+
+static inline void cfs_duration_usec(cfs_duration_t d, struct timeval *s)
+{
+    s->tv_sec  = (time_t) (d / HZ);
+    s->tv_usec = (suseconds_t)((d - (cfs_duration_t)s->tv_sec * HZ) *
+                               ONE_MILLION / HZ);
+}
+
+static inline void cfs_duration_nsec(cfs_duration_t d, struct timespec *s)
+{
+    s->tv_sec  = (ulong_ptr) (d / HZ);
+    s->tv_nsec = (ulong_ptr)((d - (cfs_duration_t)s->tv_sec * HZ) *
+                             ONE_BILLION / HZ);
+}
+
+static inline void cfs_fs_time_usec(cfs_fs_time_t *t, struct timeval *v)
+{
+    *v = *t;
+}
+
+static inline void cfs_fs_time_nsec(cfs_fs_time_t *t, struct timespec *s)
+{
+    s->tv_sec  = t->tv_sec;
+    s->tv_nsec = t->tv_usec * 1000;
+}
+
+#define cfs_time_current_64 cfs_time_current
+#define cfs_time_add_64     cfs_time_add
+#define cfs_time_shift_64   cfs_time_shift
+#define cfs_time_before_64  cfs_time_before
+
+/*
+ * One jiffy
+ */
+#define CFS_TICK            (1)
+
+#define LTIME_S(t)          (t)
+
+#define CFS_TIME_T          "%I64u"
+#define CFS_DURATION_T      "%I64d"
+
+#else   /* !__KERNEL__ */
+
+/*
+ * Liblustre. time(2) based implementation.
+ */
+#include
+
+
+//
+// Time routines ...
+//
+
+NTSYSAPI
+CCHAR
+NTAPI
+NtQuerySystemTime(
+    OUT PLARGE_INTEGER  CurrentTime
+    );
+
+
+NTSYSAPI
+BOOLEAN
+NTAPI
+RtlTimeToSecondsSince1970(
+    IN PLARGE_INTEGER   Time,
+    OUT PULONG          ElapsedSeconds
+    );
+
+
+NTSYSAPI
+VOID
+NTAPI
+RtlSecondsSince1970ToTime(
+    IN ULONG            ElapsedSeconds,
+    OUT PLARGE_INTEGER  Time
+    );
+
+NTSYSAPI
+VOID
+NTAPI
+Sleep(
+    DWORD dwMilliseconds   // sleep time in milliseconds
+);
+
+
+static inline void sleep(int time)
+{
+    DWORD Time = 1000 * time;
+    Sleep(Time);
+}
+
+
+static inline void do_gettimeofday(struct timeval *tv)
+{
+    LARGE_INTEGER Time;
+
+    NtQuerySystemTime(&Time);
+
+    tv->tv_sec  = (long_ptr) (Time.QuadPart / 10000000);
+    tv->tv_usec = (long_ptr) (Time.QuadPart % 10000000) / 10;
+}
+
+static inline int gettimeofday(struct timeval *tv, void * tz)
+{
+    do_gettimeofday(tv);
+    return 0;
+}
+
+#endif /* __KERNEL__ */
+
+#endif /* __LIBCFS_WINNT_LINUX_TIME_H__ */
+/*
+ * Local variables:
+ * c-indentation-style: "K&R"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * fill-column: 80
+ * scroll-step: 1
+ * End:
+ */
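Note that because cfs_time_t above counts emulated ticks (HZ = 100) rather than wall-clock time, timeout arithmetic should stay in ticks end to end. A minimal illustration, using only the primitives defined above (the function name is invented, not part of the patch):

/* Returns non-zero once 'seconds' have elapsed past 'start'. */
static inline int ks_interval_elapsed(cfs_time_t start, int seconds)
{
    cfs_time_t deadline = cfs_time_add(start, cfs_time_seconds(seconds));

    return cfs_time_beforeq(deadline, cfs_time_current());
}

The comparison helpers subtract as signed 64-bit values before testing the sign, so such deadline tests remain correct even if the tick counter wraps.

diff --git a/lnet/include/libcfs/winnt/winnt-types.h b/lnet/include/libcfs/winnt/winnt-types.h
new file mode 100644
index 0000000..6478730
--- /dev/null
+++ b/lnet/include/libcfs/winnt/winnt-types.h
@@ -0,0 +1,647 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2001 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.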
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Basic types definitions
+ *
+ */
+
+#ifndef _WINNT_TYPE_H
+#define _WINNT_TYPE_H
+
+#ifdef __KERNEL__
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#else
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#endif
+
+
+#define __LITTLE_ENDIAN
+
+#define inline     __inline
+#define __inline__ __inline
+
+typedef unsigned __int8     __u8;
+typedef signed   __int8     __s8;
+
+typedef signed   __int16    __s16;
+typedef unsigned __int16    __u16;
+
+typedef signed   __int32    __s32;
+typedef unsigned __int32    __u32;
+
+typedef signed   __int64    __s64;
+typedef unsigned __int64    __u64;
+
+typedef unsigned long ULONG;
+
+
+#if defined(_WIN64)
+    #define long_ptr    __int64
+    #define ulong_ptr   unsigned __int64
+    #define BITS_PER_LONG   (64)
+#else
+    #define long_ptr    long
+    #define ulong_ptr   unsigned long
+    #define BITS_PER_LONG   (32)
+
+#endif
+
+/* bsd */
+typedef unsigned char       u_char;
+typedef unsigned short      u_short;
+typedef unsigned int        u_int;
+typedef unsigned long       u_long;
+
+/* sysv */
+typedef unsigned char       unchar;
+typedef unsigned short      ushort;
+typedef unsigned int        uint;
+typedef unsigned long       ulong;
+
+#ifndef __BIT_TYPES_DEFINED__
+#define __BIT_TYPES_DEFINED__
+
+typedef __u8    u_int8_t;
+typedef __s8    int8_t;
+typedef __u16   u_int16_t;
+typedef __s16   int16_t;
+typedef __u32   u_int32_t;
+typedef __s32   int32_t;
+
+#endif /* !(__BIT_TYPES_DEFINED__) */
+
+typedef __u8    uint8_t;
+typedef __u16   uint16_t;
+typedef __u32   uint32_t;
+
+typedef __u64   uint64_t;
+typedef __u64   u_int64_t;
+typedef __s64   int64_t;
+
+typedef long        ssize_t;
+
+typedef __u32       suseconds_t;
+
+typedef __u32       pid_t, tid_t;
+
+typedef __u16       uid_t, gid_t;
+
+typedef __u16       mode_t;
+typedef __u16       umode_t;
+
+typedef ulong_ptr   sigset_t;
+
+typedef uint64_t    loff_t;
+typedef HANDLE      cfs_handle_t;
+typedef uint64_t    cycles_t;
+
+#ifndef INVALID_HANDLE_VALUE
+#define INVALID_HANDLE_VALUE ((HANDLE)-1)
+#endif
+
+
+#ifdef __KERNEL__ /* kernel */
+
+typedef __u32           off_t;
+typedef __u32           time_t;
+
+typedef unsigned short  kdev_t;
+
+#else  /* !__KERNEL__ */
+
+typedef int     BOOL;
+typedef __u8    BYTE;
+typedef __u16   WORD;
+typedef __u32   DWORD;
+
+#endif /* __KERNEL__ */
+
+/*
+ * Constants suffix
+ */
+
+#define ULL     i64
+#define ull     i64
+
+/*
+ * Winnt kernel has no capabilities.
+ */
+
+typedef __u32 cfs_kernel_cap_t;
+
+#define INT_MAX     ((int)(~0U>>1))
+#define INT_MIN     (-INT_MAX - 1)
+#define UINT_MAX    (~0U)
+
+#endif /* _WINNT_TYPE_H */
+
+
+/*
+ * Byte order
+ */
+
+//
+// Byte order swapping routines
+//
+
+
+#define ___swab16(x)    RtlUshortByteSwap(x)
+#define ___swab32(x)    RtlUlongByteSwap(x)
+#define ___swab64(x)    RtlUlonglongByteSwap(x)
+
+#define ___constant_swab16(x) \
+        ((__u16)( \
+                (((__u16)(x) & (__u16)0x00ffU) << 8) | \
+                (((__u16)(x) & (__u16)0xff00U) >> 8) ))
+
+#define ___constant_swab32(x) \
+        ((__u32)( \
+                (((__u32)(x) & (__u32)0x000000ffUL) << 24) | \
+                (((__u32)(x) & (__u32)0x0000ff00UL) <<  8) | \
+                (((__u32)(x) & (__u32)0x00ff0000UL) >>  8) | \
+                (((__u32)(x) & (__u32)0xff000000UL) >> 24) ))
+
+#define ___constant_swab64(x) \
+        ((__u64)( \
+                (__u64)(((__u64)(x) & (__u64)0x00000000000000ffUL) << 56) | \
+                (__u64)(((__u64)(x) & (__u64)0x000000000000ff00UL) << 40) | \
+                (__u64)(((__u64)(x) & (__u64)0x0000000000ff0000UL) << 24) | \
+                (__u64)(((__u64)(x) & (__u64)0x00000000ff000000UL) <<  8) | \
+                (__u64)(((__u64)(x) & (__u64)0x000000ff00000000UL) >>  8) | \
+                (__u64)(((__u64)(x) & (__u64)0x0000ff0000000000UL) >> 24) | \
+                (__u64)(((__u64)(x) & (__u64)0x00ff000000000000UL) >> 40) | \
+                (__u64)(((__u64)(x) & (__u64)0xff00000000000000UL) >> 56) ))
+
+
+#define __swab16(x) ___constant_swab16(x)
+#define __swab32(x) ___constant_swab32(x)
+#define __swab64(x) ___constant_swab64(x)
+
+#define __swab16s(x)  do { *(x) = __swab16((USHORT)(*(x)));}    while(0)
+#define __swab32s(x)  do { *(x) = __swab32((ULONG)(*(x)));}     while(0)
+#define __swab64s(x)  do { *(x) = __swab64((ULONGLONG)(*(x)));} while(0)
+
+#define __constant_htonl(x)         ___constant_swab32((x))
+#define __constant_ntohl(x)         ___constant_swab32((x))
+#define __constant_htons(x)         ___constant_swab16((x))
+#define __constant_ntohs(x)         ___constant_swab16((x))
+#define __constant_cpu_to_le64(x)   ((__u64)(x))
+#define __constant_le64_to_cpu(x)   ((__u64)(x))
+#define __constant_cpu_to_le32(x)   ((__u32)(x))
+#define __constant_le32_to_cpu(x)   ((__u32)(x))
+#define __constant_cpu_to_le16(x)   ((__u16)(x))
+#define __constant_le16_to_cpu(x)   ((__u16)(x))
+#define __constant_cpu_to_be64(x)   ___constant_swab64((x))
+#define __constant_be64_to_cpu(x)   ___constant_swab64((x))
+#define __constant_cpu_to_be32(x)   ___constant_swab32((x))
+#define __constant_be32_to_cpu(x)   ___constant_swab32((x))
+#define __constant_cpu_to_be16(x)   ___constant_swab16((x))
+#define __constant_be16_to_cpu(x)   ___constant_swab16((x))
+#define __cpu_to_le64(x)            ((__u64)(x))
+#define __le64_to_cpu(x)            ((__u64)(x))
+#define __cpu_to_le32(x)            ((__u32)(x))
+#define __le32_to_cpu(x)            ((__u32)(x))
+#define __cpu_to_le16(x)            ((__u16)(x))
+#define __le16_to_cpu(x)            ((__u16)(x))
+#define __cpu_to_be64(x)            __swab64((x))
+#define __be64_to_cpu(x)            __swab64((x))
+#define __cpu_to_be32(x)            __swab32((x))
+#define __be32_to_cpu(x)            __swab32((x))
+#define __cpu_to_be16(x)            __swab16((x))
+#define __be16_to_cpu(x)            __swab16((x))
+#define __cpu_to_le64p(x)           (*(__u64*)(x))
+#define __le64_to_cpup(x)           (*(__u64*)(x))
+#define __cpu_to_le32p(x)           (*(__u32*)(x))
+#define __le32_to_cpup(x)           (*(__u32*)(x))
+#define __cpu_to_le16p(x)           (*(__u16*)(x))
+#define __le16_to_cpup(x)           (*(__u16*)(x))
+#define __cpu_to_be64p(x)           __swab64p((x))
+#define __be64_to_cpup(x)           __swab64p((x))
+#define __cpu_to_be32p(x)           __swab32p((x))
+#define __be32_to_cpup(x)           __swab32p((x))
+#define __cpu_to_be16p(x)           __swab16p((x))
+#define __be16_to_cpup(x)           __swab16p((x))
+#define __cpu_to_le64s(x)           do {} while (0)
+#define __le64_to_cpus(x)
do {} while (0) +#define __cpu_to_le32s(x) do {} while (0) +#define __le32_to_cpus(x) do {} while (0) +#define __cpu_to_le16s(x) do {} while (0) +#define __le16_to_cpus(x) do {} while (0) +#define __cpu_to_be64s(x) __swab64s((x)) +#define __be64_to_cpus(x) __swab64s((x)) +#define __cpu_to_be32s(x) __swab32s((x)) +#define __be32_to_cpus(x) __swab32s((x)) +#define __cpu_to_be16s(x) __swab16s((x)) +#define __be16_to_cpus(x) __swab16s((x)) + +#ifndef cpu_to_le64 +#define cpu_to_le64 __cpu_to_le64 +#define le64_to_cpu __le64_to_cpu +#define cpu_to_le32 __cpu_to_le32 +#define le32_to_cpu __le32_to_cpu +#define cpu_to_le16 __cpu_to_le16 +#define le16_to_cpu __le16_to_cpu +#endif + +#define cpu_to_be64 __cpu_to_be64 +#define be64_to_cpu __be64_to_cpu +#define cpu_to_be32 __cpu_to_be32 +#define be32_to_cpu __be32_to_cpu +#define cpu_to_be16 __cpu_to_be16 +#define be16_to_cpu __be16_to_cpu +#define cpu_to_le64p __cpu_to_le64p +#define le64_to_cpup __le64_to_cpup +#define cpu_to_le32p __cpu_to_le32p +#define le32_to_cpup __le32_to_cpup +#define cpu_to_le16p __cpu_to_le16p +#define le16_to_cpup __le16_to_cpup +#define cpu_to_be64p __cpu_to_be64p +#define be64_to_cpup __be64_to_cpup +#define cpu_to_be32p __cpu_to_be32p +#define be32_to_cpup __be32_to_cpup +#define cpu_to_be16p __cpu_to_be16p +#define be16_to_cpup __be16_to_cpup +#define cpu_to_le64s __cpu_to_le64s +#define le64_to_cpus __le64_to_cpus +#define cpu_to_le32s __cpu_to_le32s +#define le32_to_cpus __le32_to_cpus +#define cpu_to_le16s __cpu_to_le16s +#define le16_to_cpus __le16_to_cpus +#define cpu_to_be64s __cpu_to_be64s +#define be64_to_cpus __be64_to_cpus +#define cpu_to_be32s __cpu_to_be32s +#define be32_to_cpus __be32_to_cpus +#define cpu_to_be16s __cpu_to_be16s +#define be16_to_cpus __be16_to_cpus + + +// +// Network to host byte swap functions +// + +#define ntohl(x) ( ( ( ( x ) & 0x000000ff ) << 24 ) | \ + ( ( ( x ) & 0x0000ff00 ) << 8 ) | \ + ( ( ( x ) & 0x00ff0000 ) >> 8 ) | \ + ( ( ( x ) & 0xff000000 ) >> 24 ) ) + +#define ntohs(x) ( ( ( ( x ) & 0xff00 ) >> 8 ) | \ + ( ( ( x ) & 0x00ff ) << 8 ) ) + + +#define htonl(x) ntohl(x) +#define htons(x) ntohs(x) + + + +#ifndef _I386_ERRNO_H +#define _I386_ERRNO_H + +#define EPERM 1 /* Operation not permitted */ +#define ENOENT 2 /* No such file or directory */ +#define ESRCH 3 /* No such process */ +#define EINTR 4 /* Interrupted system call */ +#define EIO 5 /* I/O error */ +#define ENXIO 6 /* No such device or address */ +#define E2BIG 7 /* Arg list too long */ +#define ENOEXEC 8 /* Exec format error */ +#define EBADF 9 /* Bad file number */ +#define ECHILD 10 /* No child processes */ +#define EAGAIN 11 /* Try again */ +#define ENOMEM 12 /* Out of memory */ +#define EACCES 13 /* Permission denied */ +#define EFAULT 14 /* Bad address */ +#define ENOTBLK 15 /* Block device required */ +#define EBUSY 16 /* Device or resource busy */ +#define EEXIST 17 /* File exists */ +#define EXDEV 18 /* Cross-device link */ +#define ENODEV 19 /* No such device */ +#define ENOTDIR 20 /* Not a directory */ +#define EISDIR 21 /* Is a directory */ +#define EINVAL 22 /* Invalid argument */ +#define ENFILE 23 /* File table overflow */ +#define EMFILE 24 /* Too many open files */ +#define ENOTTY 25 /* Not a typewriter */ +#define ETXTBSY 26 /* Text file busy */ +#define EFBIG 27 /* File too large */ +#define ENOSPC 28 /* No space left on device */ +#define ESPIPE 29 /* Illegal seek */ +#define EROFS 30 /* Read-only file system */ +#define EMLINK 31 /* Too many links */ +#define EPIPE 32 /* Broken pipe */ 
+#define EDOM 33 /* Math argument out of domain of func */ +#define ERANGE 34 /* Math result not representable */ +#undef EDEADLK +#define EDEADLK 35 /* Resource deadlock would occur */ +#undef ENAMETOOLONG +#define ENAMETOOLONG 36 /* File name too long */ +#undef ENOLCK +#define ENOLCK 37 /* No record locks available */ +#undef ENOSYS +#define ENOSYS 38 /* Function not implemented */ +#undef ENOTEMPTY +#define ENOTEMPTY 39 /* Directory not empty */ +#define ELOOP 40 /* Too many symbolic links encountered */ +#define EWOULDBLOCK EAGAIN /* Operation would block */ +#define ENOMSG 42 /* No message of desired type */ +#define EIDRM 43 /* Identifier removed */ +#define ECHRNG 44 /* Channel number out of range */ +#define EL2NSYNC 45 /* Level 2 not synchronized */ +#define EL3HLT 46 /* Level 3 halted */ +#define EL3RST 47 /* Level 3 reset */ +#define ELNRNG 48 /* Link number out of range */ +#define EUNATCH 49 /* Protocol driver not attached */ +#define ENOCSI 50 /* No CSI structure available */ +#define EL2HLT 51 /* Level 2 halted */ +#define EBADE 52 /* Invalid exchange */ +#define EBADR 53 /* Invalid request descriptor */ +#define EXFULL 54 /* Exchange full */ +#define ENOANO 55 /* No anode */ +#define EBADRQC 56 /* Invalid request code */ +#define EBADSLT 57 /* Invalid slot */ + +#define EDEADLOCK EDEADLK + +#define EBFONT 59 /* Bad font file format */ +#define ENOSTR 60 /* Device not a stream */ +#define ENODATA 61 /* No data available */ +#define ETIME 62 /* Timer expired */ +#define ENOSR 63 /* Out of streams resources */ +#define ENONET 64 /* Machine is not on the network */ +#define ENOPKG 65 /* Package not installed */ +#define EREMOTE 66 /* Object is remote */ +#define ENOLINK 67 /* Link has been severed */ +#define EADV 68 /* Advertise error */ +#define ESRMNT 69 /* Srmount error */ +#define ECOMM 70 /* Communication error on send */ +#define EPROTO 71 /* Protocol error */ +#define EMULTIHOP 72 /* Multihop attempted */ +#define EDOTDOT 73 /* RFS specific error */ +#define EBADMSG 74 /* Not a data message */ +#define EOVERFLOW 75 /* Value too large for defined data type */ +#define ENOTUNIQ 76 /* Name not unique on network */ +#define EBADFD 77 /* File descriptor in bad state */ +#define EREMCHG 78 /* Remote address changed */ +#define ELIBACC 79 /* Can not access a needed shared library */ +#define ELIBBAD 80 /* Accessing a corrupted shared library */ +#define ELIBSCN 81 /* .lib section in a.out corrupted */ +#define ELIBMAX 82 /* Attempting to link in too many shared libraries */ +#define ELIBEXEC 83 /* Cannot exec a shared library directly */ +#undef EILSEQ +#define EILSEQ 84 /* Illegal byte sequence */ +#define ERESTART 85 /* Interrupted system call should be restarted */ +#define ESTRPIPE 86 /* Streams pipe error */ +#define EUSERS 87 /* Too many users */ +#define ENOTSOCK 88 /* Socket operation on non-socket */ +#define EDESTADDRREQ 89 /* Destination address required */ +#define EMSGSIZE 90 /* Message too long */ +#define EPROTOTYPE 91 /* Protocol wrong type for socket */ +#define ENOPROTOOPT 92 /* Protocol not available */ +#define EPROTONOSUPPORT 93 /* Protocol not supported */ +#define ESOCKTNOSUPPORT 94 /* Socket type not supported */ +#define EOPNOTSUPP 95 /* Operation not supported on transport endpoint */ +#define EPFNOSUPPORT 96 /* Protocol family not supported */ +#define EAFNOSUPPORT 97 /* Address family not supported by protocol */ +#define EADDRINUSE 98 /* Address already in use */ +#define EADDRNOTAVAIL 99 /* Cannot assign requested address */ +#define ENETDOWN 100 
/* Network is down */ +#define ENETUNREACH 101 /* Network is unreachable */ +#define ENETRESET 102 /* Network dropped connection because of reset */ +#define ECONNABORTED 103 /* Software caused connection abort */ +#define ECONNRESET 104 /* Connection reset by peer */ +#define ENOBUFS 105 /* No buffer space available */ +#define EISCONN 106 /* Transport endpoint is already connected */ +#define ENOTCONN 107 /* Transport endpoint is not connected */ +#define ESHUTDOWN 108 /* Cannot send after transport endpoint shutdown */ +#define ETOOMANYREFS 109 /* Too many references: cannot splice */ +#define ETIMEDOUT 110 /* Connection timed out */ +#define ECONNREFUSED 111 /* Connection refused */ +#define EHOSTDOWN 112 /* Host is down */ +#define EHOSTUNREACH 113 /* No route to host */ +#define EALREADY 114 /* Operation already in progress */ +#define EINPROGRESS 115 /* Operation now in progress */ +#define ESTALE 116 /* Stale NFS file handle */ +#define EUCLEAN 117 /* Structure needs cleaning */ +#define ENOTNAM 118 /* Not a XENIX named type file */ +#define ENAVAIL 119 /* No XENIX semaphores available */ +#define EISNAM 120 /* Is a named type file */ +#define EREMOTEIO 121 /* Remote I/O error */ +#define EDQUOT 122 /* Quota exceeded */ + +#define ENOMEDIUM 123 /* No medium found */ +#define EMEDIUMTYPE 124 /* Wrong medium type */ + +/* Should never be seen by user programs */ +#define ERESTARTSYS 512 +#define ERESTARTNOINTR 513 +#define ERESTARTNOHAND 514 /* restart if no handler.. */ +#define ENOIOCTLCMD 515 /* No ioctl command */ + +/* Defined for the NFSv3 protocol */ +#define EBADHANDLE 521 /* Illegal NFS file handle */ +#define ENOTSYNC 522 /* Update synchronization mismatch */ +#define EBADCOOKIE 523 /* Cookie is stale */ +#define ENOTSUPP 524 /* Operation is not supported */ +#define ETOOSMALL 525 /* Buffer or request is too small */ +#define ESERVERFAULT 526 /* An untranslatable error occurred */ +#define EBADTYPE 527 /* Type not supported by server */ +#define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */ + + + +/* open/fcntl - O_SYNC is only implemented on blocks devices and on files + located on an ext2 file system */ +#define O_ACCMODE 0003 +#define O_RDONLY 00 +#define O_WRONLY 01 +#define O_RDWR 02 +#define O_CREAT 0100 /* not fcntl */ +#define O_EXCL 0200 /* not fcntl */ +#define O_NOCTTY 0400 /* not fcntl */ +#define O_TRUNC 01000 /* not fcntl */ +#define O_APPEND 02000 +#define O_NONBLOCK 04000 +#define O_NDELAY O_NONBLOCK +#define O_SYNC 010000 +#define FASYNC 020000 /* fcntl, for BSD compatibility */ +#define O_DIRECT 040000 /* direct disk access hint */ +#define O_LARGEFILE 0100000 +#define O_DIRECTORY 0200000 /* must be a directory */ +#define O_NOFOLLOW 0400000 /* don't follow links */ + +#define F_DUPFD 0 /* dup */ +#define F_GETFD 1 /* get close_on_exec */ +#define F_SETFD 2 /* set/clear close_on_exec */ +#define F_GETFL 3 /* get file->f_flags */ +#define F_SETFL 4 /* set file->f_flags */ +#define F_GETLK 5 +#define F_SETLK 6 +#define F_SETLKW 7 + +#define F_SETOWN 8 /* for sockets. */ +#define F_GETOWN 9 /* for sockets. */ +#define F_SETSIG 10 /* for sockets. */ +#define F_GETSIG 11 /* for sockets. 
*/ + +#define F_GETLK64 12 /* using 'struct flock64' */ +#define F_SETLK64 13 +#define F_SETLKW64 14 + +/* for F_[GET|SET]FL */ +#define FD_CLOEXEC 1 /* actually anything with low bit set goes */ + +/* for posix fcntl() and lockf() */ +#define F_RDLCK 0 +#define F_WRLCK 1 +#define F_UNLCK 2 + +/* for old implementation of bsd flock () */ +#define F_EXLCK 4 /* or 3 */ +#define F_SHLCK 8 /* or 4 */ + +/* for leases */ +#define F_INPROGRESS 16 + +/* operations for bsd flock(), also used by the kernel implementation */ +#define LOCK_SH 1 /* shared lock */ +#define LOCK_EX 2 /* exclusive lock */ +#define LOCK_NB 4 /* or'd with one of the above to prevent + blocking */ +#define LOCK_UN 8 /* remove lock */ + +#define LOCK_MAND 32 /* This is a mandatory flock */ +#define LOCK_READ 64 /* ... Which allows concurrent read operations */ +#define LOCK_WRITE 128 /* ... Which allows concurrent write operations */ +#define LOCK_RW 192 /* ... Which allows concurrent read & write ops */ + +#endif + + +#ifndef LIBCFS_SIGNAL_H +#define LIBCFS_SIGNAL_H + +/* + * signal values ... + */ + +#define SIGHUP 1 +#define SIGINT 2 +#define SIGQUIT 3 +#define SIGILL 4 +#define SIGTRAP 5 +#define SIGABRT 6 +#define SIGIOT 6 +#define SIGBUS 7 +#define SIGFPE 8 +#define SIGKILL 9 +#define SIGUSR1 10 +#define SIGSEGV 11 +#define SIGUSR2 12 +#define SIGPIPE 13 +#define SIGALRM 14 +#define SIGTERM 15 +#define SIGSTKFLT 16 +#define SIGCHLD 17 +#define SIGCONT 18 +#define SIGSTOP 19 +#define SIGTSTP 20 +#define SIGTTIN 21 +#define SIGTTOU 22 +#define SIGURG 23 +#define SIGXCPU 24 +#define SIGXFSZ 25 +#define SIGVTALRM 26 +#define SIGPROF 27 +#define SIGWINCH 28 +#define SIGIO 29 +#define SIGPOLL SIGIO +/* +#define SIGLOST 29 +*/ +#define SIGPWR 30 +#define SIGSYS 31 +#define SIGUNUSED 31 + +/* These should not be considered constants from userland. */ +#define SIGRTMIN 32 +#define SIGRTMAX (_NSIG-1) + +/* + * SA_FLAGS values: + * + * SA_ONSTACK indicates that a registered stack_t will be used. + * SA_INTERRUPT is a no-op, but left due to historical reasons. Use the + * SA_RESTART flag to get restarting signals (which were the default long ago) + * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. + * SA_RESETHAND clears the handler when the signal is delivered. + * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. + * SA_NODEFER prevents the current signal from being masked in the handler. + * + * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single + * Unix names RESETHAND and NODEFER respectively. 
+ */ +#define SA_NOCLDSTOP 0x00000001 +#define SA_NOCLDWAIT 0x00000002 /* not supported yet */ +#define SA_SIGINFO 0x00000004 +#define SA_ONSTACK 0x08000000 +#define SA_RESTART 0x10000000 +#define SA_NODEFER 0x40000000 +#define SA_RESETHAND 0x80000000 + +#define SA_NOMASK SA_NODEFER +#define SA_ONESHOT SA_RESETHAND +#define SA_INTERRUPT 0x20000000 /* dummy -- ignored */ + +#define SA_RESTORER 0x04000000 + +/* + * sigaltstack controls + */ +#define SS_ONSTACK 1 +#define SS_DISABLE 2 + +#define MINSIGSTKSZ 2048 +#define SIGSTKSZ 8192 + + +#define sigmask(sig) ((__u32)1 << ((sig) - 1)) + +#endif // LIBCFS_SIGNAL_H \ No newline at end of file diff --git a/lnet/include/lnet/Makefile.am b/lnet/include/lnet/Makefile.am index 3df0f2b..a6e5159 100644 --- a/lnet/include/lnet/Makefile.am +++ b/lnet/include/lnet/Makefile.am @@ -1,4 +1,4 @@ -portalsdir=$(includedir)/portals +lnetdir=$(includedir)/lnet SUBDIRS := linux if DARWIN @@ -6,7 +6,6 @@ SUBDIRS += darwin endif DIST_SUBDIRS := $(SUBDIRS) -EXTRA_DIST = api.h api-support.h build_check.h errno.h \ - internal.h kpr.h lib-p30.h lib-types.h \ - myrnal.h nal.h nalids.h p30.h ptlctl.h \ - socknal.h stringtab.h types.h +EXTRA_DIST = api.h api-support.h \ + lib-lnet.h lib-types.h lnet.h lnetctl.h types.h \ + socklnd.h ptllnd.h ptllnd_wire.h diff --git a/lnet/include/lnet/api-support.h b/lnet/include/lnet/api-support.h index 848cf40..717559f 100644 --- a/lnet/include/lnet/api-support.h +++ b/lnet/include/lnet/api-support.h @@ -1,24 +1,18 @@ -#ifndef __API_SUPPORT_H__ -#define __API_SUPPORT_H__ -#include "build_check.h" +#ifndef __LNET_API_SUPPORT_H__ +#define __LNET_API_SUPPORT_H__ -#ifndef __KERNEL__ -# include -# include -# include -# include - -/* Lots of POSIX dependencies to support PtlEQWait_timeout */ -# include -# include -# include +#if defined(__linux__) +#include +#elif defined(__APPLE__) +#include +#elif defined(__WINNT__) +#include +#else +#error Unsupported Operating System #endif -#include +#include #include -#include - -#include -#include +#include #endif diff --git a/lnet/include/lnet/api.h b/lnet/include/lnet/api.h index 2d3a8f6..481a0fd 100644 --- a/lnet/include/lnet/api.h +++ b/lnet/include/lnet/api.h @@ -1,146 +1,99 @@ -#ifndef P30_API_H -#define P30_API_H +#ifndef __LNET_API_H__ +#define __LNET_API_H__ -#include "build_check.h" +#include -#include +int LNetInit(void); +void LNetFini(void); -int PtlInit(int *); -void PtlFini(void); +int LNetNIInit(lnet_pid_t requested_pid); +int LNetNIFini(void); -int PtlNIInit(ptl_interface_t interface, ptl_pid_t requested_pid, - ptl_ni_limits_t *desired_limits, ptl_ni_limits_t *actual_limits, - ptl_handle_ni_t *interface_out); - -int PtlNIInitialized(ptl_interface_t); - -int PtlNIFini(ptl_handle_ni_t interface_in); - -int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id); - -int PtlGetUid(ptl_handle_ni_t ni_handle, ptl_uid_t *uid); - - -/* - * Network interfaces - */ - -int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in, - ptl_sr_value_t * status_out); - -int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in, - unsigned long *distance_out); - -int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t * interface_out); - - -/* - * PtlFailNid - * - * Not an official Portals 3 API call. It provides a way of simulating - * communications failures to all (nid == PTL_NID_ANY), or specific peers - * (via multiple calls), either until further notice (threshold == -1), or - * for a specific number of messages. 
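The declarations above are the whole client-visible bring-up surface of the renamed API: initialise the library, attach network interfaces, query the local identity, and tear down in reverse order. A hypothetical user-space caller might bracket its work as in the sketch below; the function name, the choice of LNET_PID_ANY as the requested pid, and the abbreviated error handling are illustrative assumptions, not mandated by this patch.

int lnet_bringup_example(void)
{
        lnet_process_id_t id;
        int               rc;

        rc = LNetInit();                 /* one-time library setup */
        if (rc != 0)
                return rc;

        rc = LNetNIInit(LNET_PID_ANY);   /* bring up configured NIs */
        if (rc == 0) {
                if (LNetGetId(0, &id) == 0) {
                        /* id.nid / id.pid identify this node on the fabric */
                }
                LNetNIFini();            /* release the NI refcount */
        }

        LNetFini();
        return rc;
}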
Passing a threshold of zero, "heals" - * the given peer. - */ -int PtlFailNid (ptl_handle_ni_t ni, ptl_nid_t nid, unsigned int threshold); - -/* - * PtlLoopback - * - * Not an official Portals 3 API call. It provides a way of enabling or - * disabling loopback optimisation, or getting its current state. - */ -int PtlLoopback (ptl_handle_ni_t ni, int set, int *enabled); +int LNetGetId(unsigned int index, lnet_process_id_t *id); +int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, int *order); +int LNetCtl(unsigned int cmd, void *arg); +void LNetSnprintHandle (char *str, int str_len, lnet_handle_any_t handle); /* - * PtlSnprintHandle: - * - * This is not an official Portals 3 API call. It is provided - * so that an application can print an opaque handle. + * Portals */ -void PtlSnprintHandle (char *str, int str_len, ptl_handle_any_t handle); +int LNetSetLazyPortal(int portal); +int LNetClearLazyPortal(int portal); /* * Match entries */ - -int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in, - ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in, - ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in, - ptl_ins_pos_t pos_in, ptl_handle_me_t * handle_out); - -int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in, - ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in, - ptl_unlink_t unlink_in, ptl_ins_pos_t position_in, - ptl_handle_me_t * handle_out); - -int PtlMEUnlink(ptl_handle_me_t current_in); - -int PtlMEUnlinkList(ptl_handle_me_t current_in); - - +int LNetMEAttach(unsigned int portal, + lnet_process_id_t match_id_in, + __u64 match_bits_in, + __u64 ignore_bits_in, + lnet_unlink_t unlink_in, + lnet_ins_pos_t pos_in, + lnet_handle_me_t *handle_out); + +int LNetMEInsert(lnet_handle_me_t current_in, + lnet_process_id_t match_id_in, + __u64 match_bits_in, + __u64 ignore_bits_in, + lnet_unlink_t unlink_in, + lnet_ins_pos_t position_in, + lnet_handle_me_t *handle_out); + +int LNetMEUnlink(lnet_handle_me_t current_in); /* * Memory descriptors */ +int LNetMDAttach(lnet_handle_me_t current_in, + lnet_md_t md_in, + lnet_unlink_t unlink_in, + lnet_handle_md_t *handle_out); -int PtlMDAttach(ptl_handle_me_t current_in, ptl_md_t md_in, - ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out); - -int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in, - ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out); - -int PtlMDUnlink(ptl_handle_md_t md_in); - -int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t * old_inout, - ptl_md_t * new_inout, ptl_handle_eq_t testq_in); - - -/* These should not be called by users */ -int PtlMDUpdate_internal(ptl_handle_md_t md_in, ptl_md_t * old_inout, - ptl_md_t * new_inout, ptl_handle_eq_t testq_in, - ptl_seq_t sequence_in); - - +int LNetMDBind(lnet_md_t md_in, + lnet_unlink_t unlink_in, + lnet_handle_md_t *handle_out); +int LNetMDUnlink(lnet_handle_md_t md_in); /* * Event queues */ -int PtlEQAlloc(ptl_handle_ni_t ni_in, ptl_size_t count_in, - ptl_eq_handler_t handler, - ptl_handle_eq_t *handle_out); -int PtlEQFree(ptl_handle_eq_t eventq_in); - -int PtlEQGet(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); +int LNetEQAlloc(unsigned int count_in, + lnet_eq_handler_t handler, + lnet_handle_eq_t *handle_out); +int LNetEQFree(lnet_handle_eq_t eventq_in); -int PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t * event_out); +int LNetEQGet(lnet_handle_eq_t eventq_in, + lnet_event_t *event_out); -int PtlEQPoll(ptl_handle_eq_t *eventqs_in, int neq_in, int timeout, - ptl_event_t *event_out, int *which_out); -/* - * Access Control 
Table - */ -int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in, - ptl_process_id_t match_id_in, ptl_pt_index_t portal_in); +int LNetEQWait(lnet_handle_eq_t eventq_in, + lnet_event_t *event_out); +int LNetEQPoll(lnet_handle_eq_t *eventqs_in, + int neq_in, + int timeout_ms, + lnet_event_t *event_out, + int *which_eq_out); /* * Data movement */ - -int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in, - ptl_process_id_t target_in, ptl_pt_index_t portal_in, - ptl_ac_index_t cookie_in, ptl_match_bits_t match_bits_in, - ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in); - -int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in, - ptl_pt_index_t portal_in, ptl_ac_index_t cookie_in, - ptl_match_bits_t match_bits_in, ptl_size_t offset_in); - - +int LNetPut(lnet_nid_t self, + lnet_handle_md_t md_in, + lnet_ack_req_t ack_req_in, + lnet_process_id_t target_in, + unsigned int portal_in, + __u64 match_bits_in, + unsigned int offset_in, + __u64 hdr_data_in); + +int LNetGet(lnet_nid_t self, + lnet_handle_md_t md_in, + lnet_process_id_t target_in, + unsigned int portal_in, + __u64 match_bits_in, + unsigned int offset_in); #endif diff --git a/lnet/include/lnet/build_check.h b/lnet/include/lnet/build_check.h deleted file mode 100644 index c219d2a..0000000 --- a/lnet/include/lnet/build_check.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _BUILD_CHECK_H -#define _BUILD_CHECK_H - -#if CRAY_PORTALS -#error "an application got to me instead of cray's includes" -#endif - -#endif diff --git a/lnet/include/lnet/darwin/Makefile.am b/lnet/include/lnet/darwin/Makefile.am index b6e7daf..409e159 100644 --- a/lnet/include/lnet/darwin/Makefile.am +++ b/lnet/include/lnet/darwin/Makefile.am @@ -1 +1 @@ -EXTRA_DIST := lib-p30.h lib-types.h p30.h +EXTRA_DIST := lib-lnet.h lib-types.h lnet.h api-support.h diff --git a/lnet/include/lnet/darwin/api-support.h b/lnet/include/lnet/darwin/api-support.h new file mode 100644 index 0000000..c411f17 --- /dev/null +++ b/lnet/include/lnet/darwin/api-support.h @@ -0,0 +1,27 @@ +#ifndef __DARWIN_API_SUPPORT_H__ +#define __DARWIN_API_SUPPORT_H__ + +#ifndef __LNET_API_SUPPORT_H__ +#error Do not #include this file directly. #include instead +#endif + +#ifndef __KERNEL__ +# include +# include +# include +# include + +/* Lots of POSIX dependencies to support PtlEQWait_timeout */ +# include +# include +# include + +# ifdef HAVE_LIBREADLINE +# include +typedef VFunction rl_vintfunc_t; +typedef VFunction rl_voidfunc_t; +# endif +#endif + + +#endif diff --git a/lnet/include/lnet/darwin/lib-lnet.h b/lnet/include/lnet/darwin/lib-lnet.h index d3b1ba9..af4bc5d 100644 --- a/lnet/include/lnet/darwin/lib-lnet.h +++ b/lnet/include/lnet/darwin/lib-lnet.h @@ -1,14 +1,16 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: */ -#ifndef __PORTALS_DARWIN_LIB_P30_H__ -#define __PORTALS_DARWIN_LIB_P30_H__ +#ifndef __LNET_DARWIN_LIB_LNET_H__ +#define __LNET_DARWIN_LIB_LNET_H__ -#ifndef __PORTALS_LIB_P30_H__ -#error Do not #include this file directly. #include instead +#ifndef __LNET_LIB_LNET_H__ +#error Do not #include this file directly. 
#include instead
 #endif
 
 #include
 #include
 
+#undef LNET_ROUTER
+
 #endif
diff --git a/lnet/include/lnet/darwin/lib-p30.h b/lnet/include/lnet/darwin/lib-p30.h
deleted file mode 100644
index d3b1ba9..0000000
--- a/lnet/include/lnet/darwin/lib-p30.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- */
-#ifndef __PORTALS_DARWIN_LIB_P30_H__
-#define __PORTALS_DARWIN_LIB_P30_H__
-
-#ifndef __PORTALS_LIB_P30_H__
-#error Do not #include this file directly. #include instead
-#endif
-
-#include
-#include
-
-#endif
diff --git a/lnet/include/lnet/darwin/lib-types.h b/lnet/include/lnet/darwin/lib-types.h
index 744e566..f1552fb 100644
--- a/lnet/include/lnet/darwin/lib-types.h
+++ b/lnet/include/lnet/darwin/lib-types.h
@@ -1,15 +1,27 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  */
-#ifndef __PORTALS_DARWIN_LIB_TYPES_H__
-#define __PORTALS_DARWIN_LIB_TYPES_H__
+#ifndef __LNET_DARWIN_LIB_TYPES_H__
+#define __LNET_DARWIN_LIB_TYPES_H__
 
-#ifndef __PORTALS_LIB_TYPES_H__
-#error Do not #include this file directly. #include instead
+#ifndef __LNET_LIB_TYPES_H__
+#error Do not #include this file directly. #include instead
 #endif
 
 #include
 #include
 #include
 
+/*
+ * XXX Liang:
+ *
+ * Temporary fix: lnet_me_free()->cfs_free->FREE() can block in xnu while
+ * we hold LNET_LOCK(), which is a spinlock.  By using
+ * LNET_USE_LIB_FREELIST we avoid calling FREE() there.
+ *
+ * A better solution is to move lnet_me_free() out from under LNET_LOCK;
+ * that is not hard, but it needs great care and will take some time.
+ */
+#define LNET_USE_LIB_FREELIST
+
 #endif
diff --git a/lnet/include/lnet/darwin/lnet.h b/lnet/include/lnet/darwin/lnet.h
index e619fa7..82a6127 100644
--- a/lnet/include/lnet/darwin/lnet.h
+++ b/lnet/include/lnet/darwin/lnet.h
@@ -1,15 +1,15 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  */
-#ifndef _PORTALS_DARWIN_P30_H_
-#define _PORTALS_DARWIN_P30_H_
+#ifndef __LNET_DARWIN_LNET_H__
+#define __LNET_DARWIN_LNET_H__
 
-#ifndef __PORTALS_P30_H__
-#error Do not #include this file directly. #include instead
+#ifndef __LNET_H__
+#error Do not #include this file directly. #include instead
 #endif
 
 /*
- * p30.h
+ * lnet.h
  *
  * User application interface file
  */
diff --git a/lnet/include/lnet/darwin/p30.h b/lnet/include/lnet/darwin/p30.h
deleted file mode 100644
index e619fa7..0000000
--- a/lnet/include/lnet/darwin/p30.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- */
-#ifndef _PORTALS_DARWIN_P30_H_
-#define _PORTALS_DARWIN_P30_H_
-
-#ifndef __PORTALS_P30_H__
-#error Do not #include this file directly.
#include instead -#endif - -/* - * p30.h - * - * User application interface file - */ - -#include -#include - -#endif diff --git a/lnet/include/lnet/errno.h b/lnet/include/lnet/errno.h deleted file mode 100644 index 42f2626..0000000 --- a/lnet/include/lnet/errno.h +++ /dev/null @@ -1,53 +0,0 @@ -#ifndef _P30_ERRNO_H_ -#define _P30_ERRNO_H_ - -#include "build_check.h" -/* - * include/portals/errno.h - * - * Shared error number lists - */ - -/* If you change these, you must update the string table in api-errno.c */ -typedef enum { - PTL_OK = 0, - PTL_SEGV = 1, - - PTL_NO_SPACE = 2, - PTL_ME_IN_USE = 3, - PTL_VAL_FAILED = 4, - - PTL_NAL_FAILED = 5, - PTL_NO_INIT = 6, - PTL_IFACE_DUP = 7, - PTL_IFACE_INVALID = 8, - - PTL_HANDLE_INVALID = 9, - PTL_MD_INVALID = 10, - PTL_ME_INVALID = 11, -/* If you change these, you must update the string table in api-errno.c */ - PTL_PROCESS_INVALID = 12, - PTL_PT_INDEX_INVALID = 13, - - PTL_SR_INDEX_INVALID = 14, - PTL_EQ_INVALID = 15, - PTL_EQ_DROPPED = 16, - - PTL_EQ_EMPTY = 17, - PTL_MD_NO_UPDATE = 18, - PTL_FAIL = 19, - - PTL_IOV_INVALID = 20, - - PTL_EQ_IN_USE = 21, - - PTL_NI_INVALID = 22, - PTL_MD_ILLEGAL = 23, - - PTL_MAX_ERRNO = 24 -} ptl_err_t; -/* If you change these, you must update the string table in api-errno.c */ - -extern const char *ptl_err_str[]; - -#endif diff --git a/lnet/include/lnet/internal.h b/lnet/include/lnet/internal.h deleted file mode 100644 index eae00a0..0000000 --- a/lnet/include/lnet/internal.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _P30_INTERNAL_H_ -#define _P30_INTERNAL_H_ - -#include "build_check.h" -/* - * p30/internal.h - * - * Internals for the API level library that are not needed - * by the user application - */ - -#include - -extern int ptl_init; /* Has the library been initialized */ - -#endif diff --git a/lnet/include/lnet/kpr.h b/lnet/include/lnet/kpr.h deleted file mode 100644 index 23d6e7c..0000000 --- a/lnet/include/lnet/kpr.h +++ /dev/null @@ -1,176 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - */ -#ifndef __PORTALS_KPR_H__ -#define __PORTALS_KPR_H__ - -# include /* for ptl_hdr_t */ - -/******************************************************************************/ -/* Kernel Portals Router interface */ - -typedef void (*kpr_fwd_callback_t)(void *arg, int error); // completion callback - -/* space for routing targets to stash "stuff" in a forwarded packet */ -typedef union { - long long _alignment; - void *_space[16]; /* scale with CPU arch */ -} kprfd_scratch_t; - -/* Kernel Portals Routing Forwarded message Descriptor */ -typedef struct { - struct list_head kprfd_list; /* stash in queues (routing target can use) */ - ptl_nid_t kprfd_target_nid; /* final destination NID */ - ptl_nid_t kprfd_gateway_nid; /* gateway NID */ - ptl_hdr_t *kprfd_hdr; /* header in wire byte order */ - int kprfd_nob; /* # payload bytes */ - int kprfd_niov; /* # payload frags */ - ptl_kiov_t *kprfd_kiov; /* payload fragments */ - void *kprfd_router_arg; /* originating NAL's router arg */ - kpr_fwd_callback_t kprfd_callback; /* completion callback */ - void *kprfd_callback_arg; /* completion callback arg */ - kprfd_scratch_t kprfd_scratch; /* scratchpad for routing targets */ -} kpr_fwd_desc_t; - -typedef void (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd); -typedef void (*kpr_notify_t)(void *arg, ptl_nid_t peer, int alive); - -/* NAL's routing interface (Kernel Portals Routing Nal Interface) */ -typedef const struct { - int kprni_nalid; /* NAL's id */ - void 
*kprni_arg; /* Arg to pass when calling into NAL */ - kpr_fwd_t kprni_fwd; /* NAL's forwarding entrypoint */ - kpr_notify_t kprni_notify; /* NAL's notification entrypoint */ -} kpr_nal_interface_t; - -/* Router's routing interface (Kernel Portals Routing Router Interface) */ -typedef const struct { - /* register the calling NAL with the router and get back the handle for - * subsequent calls */ - int (*kprri_register) (kpr_nal_interface_t *nal_interface, - void **router_arg); - - /* ask the router to find a gateway that forwards to 'nid' and is a - * peer of the calling NAL; assume caller will send 'nob' bytes of - * payload there */ - int (*kprri_lookup) (void *router_arg, ptl_nid_t nid, int nob, - ptl_nid_t *gateway_nid); - - /* hand a packet over to the router for forwarding */ - kpr_fwd_t kprri_fwd_start; - - /* hand a packet back to the router for completion */ - void (*kprri_fwd_done) (void *router_arg, kpr_fwd_desc_t *fwd, - int error); - - /* notify the router about peer state */ - void (*kprri_notify) (void *router_arg, ptl_nid_t peer, - int alive, time_t when); - - /* the calling NAL is shutting down */ - void (*kprri_shutdown) (void *router_arg); - - /* deregister the calling NAL with the router */ - void (*kprri_deregister) (void *router_arg); - -} kpr_router_interface_t; - -/* Convenient struct for NAL to stash router interface/args */ -typedef struct { - kpr_router_interface_t *kpr_interface; - void *kpr_arg; -} kpr_router_t; - -extern kpr_router_interface_t kpr_router_interface; - -static inline int -kpr_register (kpr_router_t *router, kpr_nal_interface_t *nalif) -{ - int rc; - - router->kpr_interface = PORTAL_SYMBOL_GET (kpr_router_interface); - if (router->kpr_interface == NULL) - return (-ENOENT); - - rc = (router->kpr_interface)->kprri_register (nalif, &router->kpr_arg); - if (rc != 0) - router->kpr_interface = NULL; - - PORTAL_SYMBOL_PUT (kpr_router_interface); - return (rc); -} - -static inline int -kpr_routing (kpr_router_t *router) -{ - return (router->kpr_interface != NULL); -} - -static inline int -kpr_lookup (kpr_router_t *router, ptl_nid_t nid, int nob, ptl_nid_t *gateway_nid) -{ - if (!kpr_routing (router)) - return (-ENETUNREACH); - - return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid, nob, - gateway_nid)); -} - -static inline void -kpr_fwd_init (kpr_fwd_desc_t *fwd, ptl_nid_t nid, ptl_hdr_t *hdr, - int nob, int niov, ptl_kiov_t *kiov, - kpr_fwd_callback_t callback, void *callback_arg) -{ - fwd->kprfd_target_nid = nid; - fwd->kprfd_gateway_nid = nid; - fwd->kprfd_hdr = hdr; - fwd->kprfd_nob = nob; - fwd->kprfd_niov = niov; - fwd->kprfd_kiov = kiov; - fwd->kprfd_callback = callback; - fwd->kprfd_callback_arg = callback_arg; -} - -static inline void -kpr_fwd_start (kpr_router_t *router, kpr_fwd_desc_t *fwd) -{ - if (!kpr_routing (router)) - fwd->kprfd_callback (fwd->kprfd_callback_arg, -ENETUNREACH); - else - router->kpr_interface->kprri_fwd_start (router->kpr_arg, fwd); -} - -static inline void -kpr_fwd_done (kpr_router_t *router, kpr_fwd_desc_t *fwd, int error) -{ - LASSERT (kpr_routing (router)); - router->kpr_interface->kprri_fwd_done (router->kpr_arg, fwd, error); -} - -static inline void -kpr_notify (kpr_router_t *router, - ptl_nid_t peer, int alive, time_t when) -{ - if (!kpr_routing (router)) - return; - - router->kpr_interface->kprri_notify(router->kpr_arg, peer, alive, when); -} - -static inline void -kpr_shutdown (kpr_router_t *router) -{ - if (kpr_routing (router)) - router->kpr_interface->kprri_shutdown (router->kpr_arg); -} - -static 
inline void -kpr_deregister (kpr_router_t *router) -{ - if (!kpr_routing (router)) - return; - router->kpr_interface->kprri_deregister (router->kpr_arg); - router->kpr_interface = NULL; -} - -#endif /* _KPR_H */ diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index f56206b..25ab308 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -1,213 +1,254 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * lib-p30.h + * lib-lnet.h * * Top level include for library side routines */ -#ifndef __PORTALS_LIB_P30_H__ -#define __PORTALS_LIB_P30_H__ - -#include "build_check.h" +#ifndef __LNET_LIB_LNET_H__ +#define __LNET_LIB_LNET_H__ #if defined(__linux__) -#include +#include #elif defined(__APPLE__) -#include +#include +#elif defined(__WINNT__) +#include #else #error Unsupported Operating System #endif -#include +#include #include -#include -#include -#include +#include +#include + +extern lnet_t the_lnet; /* THE network */ + +static inline int lnet_is_wire_handle_none (lnet_handle_wire_t *wh) +{ + return (wh->wh_interface_cookie == LNET_WIRE_HANDLE_NONE.wh_interface_cookie && + wh->wh_object_cookie == LNET_WIRE_HANDLE_NONE.wh_object_cookie); +} + +static inline int lnet_md_exhausted (lnet_libmd_t *md) +{ + return (md->md_threshold == 0 || + ((md->md_options & LNET_MD_MAX_SIZE) != 0 && + md->md_offset + md->md_max_size > md->md_length)); +} -static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh) +static inline int lnet_md_unlinkable (lnet_libmd_t *md) { - return (wh->wh_interface_cookie == PTL_WIRE_HANDLE_NONE.wh_interface_cookie && - wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie); + /* Should unlink md when its refcount is 0 and either: + * - md has been flagged for deletion (by auto unlink or LNetM[DE]Unlink, + * in the latter case md may not be exhausted). + * - auto unlink is on and md is exhausted. 
+ */ + if (md->md_refcount != 0) + return 0; + + if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) != 0) + return 1; + + return ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0 && + lnet_md_exhausted(md)); } #ifdef __KERNEL__ -#define LIB_LOCK(nal,flags) \ - spin_lock_irqsave(&(nal)->libnal_ni.ni_lock, flags) -#define LIB_UNLOCK(nal,flags) \ - spin_unlock_irqrestore(&(nal)->libnal_ni.ni_lock, flags) +#define LNET_LOCK() spin_lock(&the_lnet.ln_lock) +#define LNET_UNLOCK() spin_unlock(&the_lnet.ln_lock) +#define LNET_MUTEX_DOWN(m) mutex_down(m) +#define LNET_MUTEX_UP(m) mutex_up(m) #else -#define LIB_LOCK(nal,flags) \ - (pthread_mutex_lock(&(nal)->libnal_ni.ni_mutex), (flags) = 0) -#define LIB_UNLOCK(nal,flags) \ - pthread_mutex_unlock(&(nal)->libnal_ni.ni_mutex) +# if !HAVE_LIBPTHREAD +#define LNET_SINGLE_THREADED_LOCK(l) \ +do { \ + LASSERT ((l) == 0); \ + (l) = 1; \ +} while (0) + +#define LNET_SINGLE_THREADED_UNLOCK(l) \ +do { \ + LASSERT ((l) == 1); \ + (l) = 0; \ +} while (0) + +#define LNET_LOCK() LNET_SINGLE_THREADED_LOCK(the_lnet.ln_lock) +#define LNET_UNLOCK() LNET_SINGLE_THREADED_UNLOCK(the_lnet.ln_lock) +#define LNET_MUTEX_DOWN(m) LNET_SINGLE_THREADED_LOCK(*(m)) +#define LNET_MUTEX_UP(m) LNET_SINGLE_THREADED_UNLOCK(*(m)) +# else +#define LNET_LOCK() pthread_mutex_lock(&the_lnet.ln_lock) +#define LNET_UNLOCK() pthread_mutex_unlock(&the_lnet.ln_lock) +#define LNET_MUTEX_DOWN(m) pthread_mutex_lock(m) +#define LNET_MUTEX_UP(m) pthread_mutex_unlock(m) +# endif #endif +#define MAX_PORTALS 64 -#ifdef PTL_USE_LIB_FREELIST +#ifdef LNET_USE_LIB_FREELIST #define MAX_MES 2048 #define MAX_MDS 2048 #define MAX_MSGS 2048 /* Outstanding messages */ #define MAX_EQS 512 -extern int lib_freelist_init (lib_nal_t *nal, lib_freelist_t *fl, int nobj, int objsize); -extern void lib_freelist_fini (lib_nal_t *nal, lib_freelist_t *fl); - static inline void * -lib_freelist_alloc (lib_freelist_t *fl) +lnet_freelist_alloc (lnet_freelist_t *fl) { /* ALWAYS called with liblock held */ - lib_freeobj_t *o; + lnet_freeobj_t *o; if (list_empty (&fl->fl_list)) return (NULL); - o = list_entry (fl->fl_list.next, lib_freeobj_t, fo_list); + o = list_entry (fl->fl_list.next, lnet_freeobj_t, fo_list); list_del (&o->fo_list); return ((void *)&o->fo_contents); } static inline void -lib_freelist_free (lib_freelist_t *fl, void *obj) +lnet_freelist_free (lnet_freelist_t *fl, void *obj) { /* ALWAYS called with liblock held */ - lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents); + lnet_freeobj_t *o = list_entry (obj, lnet_freeobj_t, fo_contents); list_add (&o->fo_list, &fl->fl_list); } -static inline lib_eq_t * -lib_eq_alloc (lib_nal_t *nal) +static inline lnet_eq_t * +lnet_eq_alloc (void) { /* NEVER called with liblock held */ - unsigned long flags; - lib_eq_t *eq; + lnet_eq_t *eq; - LIB_LOCK (nal, flags); - eq = (lib_eq_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_eqs); - LIB_UNLOCK (nal, flags); + LNET_LOCK(); + eq = (lnet_eq_t *)lnet_freelist_alloc(&the_lnet.ln_free_eqs); + LNET_UNLOCK(); return (eq); } static inline void -lib_eq_free (lib_nal_t *nal, lib_eq_t *eq) +lnet_eq_free (lnet_eq_t *eq) { /* ALWAYS called with liblock held */ - lib_freelist_free (&nal->libnal_ni.ni_free_eqs, eq); + lnet_freelist_free(&the_lnet.ln_free_eqs, eq); } -static inline lib_md_t * -lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd) +static inline lnet_libmd_t * +lnet_md_alloc (lnet_md_t *umd) { /* NEVER called with liblock held */ - unsigned long flags; - lib_md_t *md; + lnet_libmd_t *md; - LIB_LOCK (nal, flags); - md = (lib_md_t 
*)lib_freelist_alloc (&nal->libnal_ni.ni_free_mds); - LIB_UNLOCK (nal, flags); + LNET_LOCK(); + md = (lnet_libmd_t *)lnet_freelist_alloc(&the_lnet.ln_free_mds); + LNET_UNLOCK(); return (md); } static inline void -lib_md_free (lib_nal_t *nal, lib_md_t *md) +lnet_md_free (lnet_libmd_t *md) { /* ALWAYS called with liblock held */ - lib_freelist_free (&nal->libnal_ni.ni_free_mds, md); + lnet_freelist_free (&the_lnet.ln_free_mds, md); } -static inline lib_me_t * -lib_me_alloc (lib_nal_t *nal) +static inline lnet_me_t * +lnet_me_alloc (void) { /* NEVER called with liblock held */ - unsigned long flags; - lib_me_t *me; + lnet_me_t *me; - LIB_LOCK (nal, flags); - me = (lib_me_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mes); - LIB_UNLOCK (nal, flags); + LNET_LOCK(); + me = (lnet_me_t *)lnet_freelist_alloc(&the_lnet.ln_free_mes); + LNET_UNLOCK(); return (me); } static inline void -lib_me_free (lib_nal_t *nal, lib_me_t *me) +lnet_me_free (lnet_me_t *me) { /* ALWAYS called with liblock held */ - lib_freelist_free (&nal->libnal_ni.ni_free_mes, me); + lnet_freelist_free (&the_lnet.ln_free_mes, me); } -static inline lib_msg_t * -lib_msg_alloc (lib_nal_t *nal) +static inline lnet_msg_t * +lnet_msg_alloc (void) { /* NEVER called with liblock held */ - unsigned long flags; - lib_msg_t *msg; + lnet_msg_t *msg; - LIB_LOCK (nal, flags); - msg = (lib_msg_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_msgs); - LIB_UNLOCK (nal, flags); + LNET_LOCK(); + msg = (lnet_msg_t *)lnet_freelist_alloc(&the_lnet.ln_free_msgs); + LNET_UNLOCK(); if (msg != NULL) { /* NULL pointers, clear flags etc */ memset (msg, 0, sizeof (*msg)); - msg->ack_wmd = PTL_WIRE_HANDLE_NONE; +#ifdef CRAY_XT3 + msg->msg_ev.uid = LNET_UID_ANY; +#endif } return(msg); } static inline void -lib_msg_free (lib_nal_t *nal, lib_msg_t *msg) +lnet_msg_free (lnet_msg_t *msg) { /* ALWAYS called with liblock held */ - lib_freelist_free (&nal->libnal_ni.ni_free_msgs, msg); + LASSERT (!msg->msg_onactivelist); + lnet_freelist_free(&the_lnet.ln_free_msgs, msg); } #else -static inline lib_eq_t * -lib_eq_alloc (lib_nal_t *nal) +static inline lnet_eq_t * +lnet_eq_alloc (void) { /* NEVER called with liblock held */ - lib_eq_t *eq; + lnet_eq_t *eq; - PORTAL_ALLOC(eq, sizeof(*eq)); + LIBCFS_ALLOC(eq, sizeof(*eq)); return (eq); } static inline void -lib_eq_free (lib_nal_t *nal, lib_eq_t *eq) +lnet_eq_free (lnet_eq_t *eq) { /* ALWAYS called with liblock held */ - PORTAL_FREE(eq, sizeof(*eq)); + LIBCFS_FREE(eq, sizeof(*eq)); } -static inline lib_md_t * -lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd) +static inline lnet_libmd_t * +lnet_md_alloc (lnet_md_t *umd) { /* NEVER called with liblock held */ - lib_md_t *md; - int size; - int niov; + lnet_libmd_t *md; + int size; + unsigned int niov; - if ((umd->options & PTL_MD_KIOV) != 0) { + if ((umd->options & LNET_MD_KIOV) != 0) { niov = umd->length; - size = offsetof(lib_md_t, md_iov.kiov[niov]); + size = offsetof(lnet_libmd_t, md_iov.kiov[niov]); } else { - niov = ((umd->options & PTL_MD_IOVEC) != 0) ? + niov = ((umd->options & LNET_MD_IOVEC) != 0) ? 
umd->length : 1; - size = offsetof(lib_md_t, md_iov.iov[niov]); + size = offsetof(lnet_libmd_t, md_iov.iov[niov]); } - PORTAL_ALLOC(md, size); + LIBCFS_ALLOC(md, size); if (md != NULL) { /* Set here in case of early free */ - md->options = umd->options; + md->md_options = umd->options; md->md_niov = niov; } @@ -215,252 +256,424 @@ lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd) } static inline void -lib_md_free (lib_nal_t *nal, lib_md_t *md) +lnet_md_free (lnet_libmd_t *md) { /* ALWAYS called with liblock held */ int size; - if ((md->options & PTL_MD_KIOV) != 0) - size = offsetof(lib_md_t, md_iov.kiov[md->md_niov]); + if ((md->md_options & LNET_MD_KIOV) != 0) + size = offsetof(lnet_libmd_t, md_iov.kiov[md->md_niov]); else - size = offsetof(lib_md_t, md_iov.iov[md->md_niov]); + size = offsetof(lnet_libmd_t, md_iov.iov[md->md_niov]); - PORTAL_FREE(md, size); + LIBCFS_FREE(md, size); } -static inline lib_me_t * -lib_me_alloc (lib_nal_t *nal) +static inline lnet_me_t * +lnet_me_alloc (void) { /* NEVER called with liblock held */ - lib_me_t *me; + lnet_me_t *me; - PORTAL_ALLOC(me, sizeof(*me)); + LIBCFS_ALLOC(me, sizeof(*me)); return (me); } static inline void -lib_me_free(lib_nal_t *nal, lib_me_t *me) +lnet_me_free(lnet_me_t *me) { /* ALWAYS called with liblock held */ - PORTAL_FREE(me, sizeof(*me)); + LIBCFS_FREE(me, sizeof(*me)); } -static inline lib_msg_t * -lib_msg_alloc(lib_nal_t *nal) +static inline lnet_msg_t * +lnet_msg_alloc(void) { - /* NEVER called with liblock held; may be in interrupt... */ - lib_msg_t *msg; + /* NEVER called with liblock held */ + lnet_msg_t *msg; - if (in_interrupt()) - PORTAL_ALLOC_ATOMIC(msg, sizeof(*msg)); - else - PORTAL_ALLOC(msg, sizeof(*msg)); + LIBCFS_ALLOC(msg, sizeof(*msg)); if (msg != NULL) { /* NULL pointers, clear flags etc */ memset (msg, 0, sizeof (*msg)); - msg->ack_wmd = PTL_WIRE_HANDLE_NONE; +#ifdef CRAY_XT3 + msg->msg_ev.uid = LNET_UID_ANY; +#endif } return (msg); } static inline void -lib_msg_free(lib_nal_t *nal, lib_msg_t *msg) +lnet_msg_free(lnet_msg_t *msg) { /* ALWAYS called with liblock held */ - PORTAL_FREE(msg, sizeof(*msg)); + LASSERT (!msg->msg_onactivelist); + LIBCFS_FREE(msg, sizeof(*msg)); } #endif -extern lib_handle_t *lib_lookup_cookie (lib_nal_t *nal, __u64 cookie, int type); -extern void lib_initialise_handle (lib_nal_t *nal, lib_handle_t *lh, int type); -extern void lib_invalidate_handle (lib_nal_t *nal, lib_handle_t *lh); +extern lnet_libhandle_t *lnet_lookup_cookie (__u64 cookie, int type); +extern void lnet_initialise_handle (lnet_libhandle_t *lh, int type); +extern void lnet_invalidate_handle (lnet_libhandle_t *lh); static inline void -ptl_eq2handle (ptl_handle_eq_t *handle, lib_nal_t *nal, lib_eq_t *eq) +lnet_eq2handle (lnet_handle_eq_t *handle, lnet_eq_t *eq) { if (eq == NULL) { - *handle = PTL_EQ_NONE; + *handle = LNET_EQ_NONE; return; } - handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; handle->cookie = eq->eq_lh.lh_cookie; } -static inline lib_eq_t * -ptl_handle2eq (ptl_handle_eq_t *handle, lib_nal_t *nal) +static inline lnet_eq_t * +lnet_handle2eq (lnet_handle_eq_t *handle) { /* ALWAYS called with liblock held */ - lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, - PTL_COOKIE_TYPE_EQ); + lnet_libhandle_t *lh = lnet_lookup_cookie(handle->cookie, + LNET_COOKIE_TYPE_EQ); if (lh == NULL) return (NULL); - return (lh_entry (lh, lib_eq_t, eq_lh)); + return (lh_entry (lh, lnet_eq_t, eq_lh)); } static inline void -ptl_md2handle (ptl_handle_md_t *handle, lib_nal_t *nal, lib_md_t *md) +lnet_md2handle 
(lnet_handle_md_t *handle, lnet_libmd_t *md) { - handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; handle->cookie = md->md_lh.lh_cookie; } -static inline lib_md_t * -ptl_handle2md (ptl_handle_md_t *handle, lib_nal_t *nal) +static inline lnet_libmd_t * +lnet_handle2md (lnet_handle_md_t *handle) { /* ALWAYS called with liblock held */ - lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, - PTL_COOKIE_TYPE_MD); + lnet_libhandle_t *lh = lnet_lookup_cookie(handle->cookie, + LNET_COOKIE_TYPE_MD); if (lh == NULL) return (NULL); - return (lh_entry (lh, lib_md_t, md_lh)); + return (lh_entry (lh, lnet_libmd_t, md_lh)); } -static inline lib_md_t * -ptl_wire_handle2md (ptl_handle_wire_t *wh, lib_nal_t *nal) +static inline lnet_libmd_t * +lnet_wire_handle2md (lnet_handle_wire_t *wh) { /* ALWAYS called with liblock held */ - lib_handle_t *lh; + lnet_libhandle_t *lh; - if (wh->wh_interface_cookie != nal->libnal_ni.ni_interface_cookie) + if (wh->wh_interface_cookie != the_lnet.ln_interface_cookie) return (NULL); - lh = lib_lookup_cookie (nal, wh->wh_object_cookie, - PTL_COOKIE_TYPE_MD); + lh = lnet_lookup_cookie(wh->wh_object_cookie, + LNET_COOKIE_TYPE_MD); if (lh == NULL) return (NULL); - return (lh_entry (lh, lib_md_t, md_lh)); + return (lh_entry (lh, lnet_libmd_t, md_lh)); } static inline void -ptl_me2handle (ptl_handle_me_t *handle, lib_nal_t *nal, lib_me_t *me) +lnet_me2handle (lnet_handle_me_t *handle, lnet_me_t *me) { - handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; handle->cookie = me->me_lh.lh_cookie; } -static inline lib_me_t * -ptl_handle2me (ptl_handle_me_t *handle, lib_nal_t *nal) +static inline lnet_me_t * +lnet_handle2me (lnet_handle_me_t *handle) { /* ALWAYS called with liblock held */ - lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, - PTL_COOKIE_TYPE_ME); + lnet_libhandle_t *lh = lnet_lookup_cookie(handle->cookie, + LNET_COOKIE_TYPE_ME); if (lh == NULL) return (NULL); - return (lh_entry (lh, lib_me_t, me_lh)); + return (lh_entry (lh, lnet_me_t, me_lh)); } -extern int lib_init(lib_nal_t *libnal, nal_t *apinal, - ptl_process_id_t pid, - ptl_ni_limits_t *desired_limits, - ptl_ni_limits_t *actual_limits); -extern int lib_fini(lib_nal_t *libnal); +static inline void +lnet_peer_addref_locked(lnet_peer_t *lp) +{ + LASSERT (lp->lp_refcount > 0); + lp->lp_refcount++; +} -/* - * When the NAL detects an incoming message header, it should call - * lib_parse() decode it. If the message header is garbage, lib_parse() - * returns immediately with failure, otherwise the NAL callbacks will be - * called to receive the message body. They are handed the private cookie - * as a way for the NAL to maintain state about which transaction is being - * processed. An extra parameter, lib_msg contains the lib-level message - * state for passing to lib_finalize() when the message body has been - * received. 
- */ -extern void lib_enq_event_locked (lib_nal_t *nal, void *private, - lib_eq_t *eq, ptl_event_t *ev); -extern void lib_finalize (lib_nal_t *nal, void *private, lib_msg_t *msg, - ptl_ni_fail_t ni_fail_type); -extern ptl_err_t lib_parse (lib_nal_t *nal, ptl_hdr_t *hdr, void *private); -extern lib_msg_t *lib_create_reply_msg (lib_nal_t *nal, ptl_nid_t peer_nid, - lib_msg_t *get_msg); -extern void print_hdr (lib_nal_t * nal, ptl_hdr_t * hdr); - - -extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov); -extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, - ptl_size_t offset, ptl_size_t len); -extern void lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset, - char *src, ptl_size_t len); -extern int lib_extract_iov (int dst_niov, struct iovec *dst, - int src_niov, struct iovec *src, - ptl_size_t offset, ptl_size_t len); - -extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov); -extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, - ptl_size_t offset, ptl_size_t len); -extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset, - char *src, ptl_size_t len); -extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, - int src_niov, ptl_kiov_t *src, - ptl_size_t offset, ptl_size_t len); - -extern void lib_assert_wire_constants (void); - -extern ptl_err_t lib_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, - ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); -extern ptl_err_t lib_send (lib_nal_t *nal, void *private, lib_msg_t *msg, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - lib_md_t *md, ptl_size_t offset, ptl_size_t len); - -extern int lib_api_ni_status (nal_t *nal, ptl_sr_index_t sr_idx, - ptl_sr_value_t *status); -extern int lib_api_ni_dist (nal_t *nal, ptl_process_id_t *pid, - unsigned long *dist); - -extern int lib_api_eq_alloc (nal_t *nal, ptl_size_t count, - ptl_eq_handler_t callback, - ptl_handle_eq_t *handle); -extern int lib_api_eq_free(nal_t *nal, ptl_handle_eq_t *eqh); -extern int lib_api_eq_poll (nal_t *nal, - ptl_handle_eq_t *eventqs, int neq, int timeout_ms, - ptl_event_t *event, int *which); - -extern int lib_api_me_attach(nal_t *nal, - ptl_pt_index_t portal, - ptl_process_id_t match_id, - ptl_match_bits_t match_bits, - ptl_match_bits_t ignore_bits, - ptl_unlink_t unlink, ptl_ins_pos_t pos, - ptl_handle_me_t *handle); -extern int lib_api_me_insert(nal_t *nal, - ptl_handle_me_t *current_meh, - ptl_process_id_t match_id, - ptl_match_bits_t match_bits, - ptl_match_bits_t ignore_bits, - ptl_unlink_t unlink, ptl_ins_pos_t pos, - ptl_handle_me_t *handle); -extern int lib_api_me_unlink (nal_t *nal, ptl_handle_me_t *meh); -extern void lib_me_unlink(lib_nal_t *nal, lib_me_t *me); - -extern int lib_api_get_id(nal_t *nal, ptl_process_id_t *pid); - -extern void lib_md_unlink(lib_nal_t *nal, lib_md_t *md); -extern void lib_md_deconstruct(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd); -extern int lib_api_md_attach(nal_t *nal, ptl_handle_me_t *meh, - ptl_md_t *umd, ptl_unlink_t unlink, - ptl_handle_md_t *handle); -extern int lib_api_md_bind(nal_t *nal, ptl_md_t *umd, ptl_unlink_t unlink, - ptl_handle_md_t *handle); -extern int lib_api_md_unlink (nal_t *nal, ptl_handle_md_t *mdh); -extern int lib_api_md_update (nal_t *nal, ptl_handle_md_t *mdh, - ptl_md_t *oldumd, ptl_md_t *newumd, - ptl_handle_eq_t *testqh); - -extern int lib_api_get(nal_t *apinal, ptl_handle_md_t *mdh, - ptl_process_id_t *id, - ptl_pt_index_t portal, ptl_ac_index_t ac, - ptl_match_bits_t match_bits, ptl_size_t 
offset); -extern int lib_api_put(nal_t *apinal, ptl_handle_md_t *mdh, - ptl_ack_req_t ack, ptl_process_id_t *id, - ptl_pt_index_t portal, ptl_ac_index_t ac, - ptl_match_bits_t match_bits, - ptl_size_t offset, ptl_hdr_data_t hdr_data); -extern int lib_api_fail_nid(nal_t *apinal, ptl_nid_t nid, unsigned int threshold); -extern int lib_api_loopback(nal_t *apinal, int set, int *enabled); +extern void lnet_destroy_peer_locked(lnet_peer_t *lp); + +static inline void +lnet_peer_decref_locked(lnet_peer_t *lp) +{ + LASSERT (lp->lp_refcount > 0); + lp->lp_refcount--; + if (lp->lp_refcount == 0) + lnet_destroy_peer_locked(lp); +} + +static inline int +lnet_isrouter(lnet_peer_t *lp) +{ + return lp->lp_rtr_refcount != 0; +} + +static inline void +lnet_ni_addref_locked(lnet_ni_t *ni) +{ + LASSERT (ni->ni_refcount > 0); + ni->ni_refcount++; +} + +static inline void +lnet_ni_addref(lnet_ni_t *ni) +{ + LNET_LOCK(); + lnet_ni_addref_locked(ni); + LNET_UNLOCK(); +} + +static inline void +lnet_ni_decref_locked(lnet_ni_t *ni) +{ + LASSERT (ni->ni_refcount > 0); + ni->ni_refcount--; + if (ni->ni_refcount == 0) + list_add_tail(&ni->ni_list, &the_lnet.ln_zombie_nis); +} + +static inline void +lnet_ni_decref(lnet_ni_t *ni) +{ + LNET_LOCK(); + lnet_ni_decref_locked(ni); + LNET_UNLOCK(); +} + +static inline lnet_nid_t +lnet_ptlcompat_srcnid(lnet_nid_t src, lnet_nid_t dst) +{ + /* Give myself a portals srcnid if I'm sending to portals */ + if (the_lnet.ln_ptlcompat > 0 && + LNET_NIDNET(dst) == 0) + return LNET_MKNID(0, LNET_NIDADDR(src)); + + return src; +} + +static inline int +lnet_ptlcompat_matchnid(lnet_nid_t lnet_nid, lnet_nid_t ptl_nid) +{ + return ((ptl_nid == lnet_nid) || + (the_lnet.ln_ptlcompat > 0 && + LNET_NIDNET(ptl_nid) == 0 && + LNET_NETTYP(LNET_NIDNET(lnet_nid)) != LOLND && + LNET_NIDADDR(ptl_nid) == LNET_NIDADDR(lnet_nid))); +} + +static inline int +lnet_ptlcompat_matchnet(__u32 lnet_net, __u32 ptl_net) +{ + return ((ptl_net == lnet_net) || + (the_lnet.ln_ptlcompat > 0 && + ptl_net == 0 && + LNET_NETTYP(lnet_net) != LOLND)); +} + +static inline struct list_head * +lnet_nid2peerhash (lnet_nid_t nid) +{ + unsigned int idx = LNET_NIDADDR(nid) % LNET_PEER_HASHSIZE; + + return &the_lnet.ln_peer_hash[idx]; +} + +extern lnd_t the_lolnd; + +#ifndef __KERNEL__ +/* unconditional registration */ +#define LNET_REGISTER_ULND(lnd) \ +do { \ + extern lnd_t lnd; \ + \ + lnet_register_lnd(&(lnd)); \ +} while (0) + +/* conditional registration */ +#define LNET_REGISTER_ULND_IF_PRESENT(lnd) \ +do { \ + extern lnd_t lnd __attribute__ ((weak, alias("the_lolnd"))); \ + \ + if (&(lnd) != &the_lolnd) \ + lnet_register_lnd(&(lnd)); \ +} while (0) +#endif + +#ifdef CRAY_XT3 +inline static void +lnet_set_msg_uid(lnet_ni_t *ni, lnet_msg_t *msg, lnet_uid_t uid) +{ + LASSERT (msg->msg_ev.uid == LNET_UID_ANY); + msg->msg_ev.uid = uid; +} +#endif + +extern lnet_ni_t *lnet_nid2ni_locked (lnet_nid_t nid); +extern lnet_ni_t *lnet_net2ni_locked (__u32 net); +static inline lnet_ni_t * +lnet_net2ni (__u32 net) +{ + lnet_ni_t *ni; + + LNET_LOCK(); + ni = lnet_net2ni_locked(net); + LNET_UNLOCK(); + + return ni; +} + +int lnet_notify(lnet_ni_t *ni, lnet_nid_t peer, int alive, time_t when); +int lnet_add_route(__u32 net, unsigned int hops, lnet_nid_t gateway_nid); +int lnet_check_routes(void); +int lnet_del_route(__u32 net, lnet_nid_t gw_nid); +void lnet_destroy_routes(void); +int lnet_get_route(int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive); +void lnet_proc_init(void); +void lnet_proc_fini(void); +void 
lnet_init_rtrpools(void); +int lnet_alloc_rtrpools(int im_a_router); +void lnet_free_rtrpools(void); +lnet_remotenet_t *lnet_find_net_locked (__u32 net); + +int lnet_islocalnid(lnet_nid_t nid); +int lnet_islocalnet(__u32 net); + +void lnet_enq_event_locked(lnet_eq_t *eq, lnet_event_t *ev); +void lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, + unsigned int offset, unsigned int len); +int lnet_send(lnet_nid_t nid, lnet_msg_t *msg); +void lnet_return_credits_locked (lnet_msg_t *msg); +void lnet_match_blocked_msg(lnet_libmd_t *md); +int lnet_parse (lnet_ni_t *ni, lnet_hdr_t *hdr, + lnet_nid_t fromnid, void *private, int rdma_req); +void lnet_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, + unsigned int offset, unsigned int mlen, unsigned int rlen); +lnet_msg_t *lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *get_msg); +void lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *msg, unsigned int len); +void lnet_finalize(lnet_ni_t *ni, lnet_msg_t *msg, int rc); + +char *lnet_msgtyp2str (int type); +void lnet_print_hdr (lnet_hdr_t * hdr); +int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold); + +unsigned int lnet_iov_nob (unsigned int niov, struct iovec *iov); +int lnet_extract_iov (int dst_niov, struct iovec *dst, + int src_niov, struct iovec *src, + unsigned int offset, unsigned int len); + +unsigned int lnet_kiov_nob (unsigned int niov, lnet_kiov_t *iov); +int lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst, + int src_niov, lnet_kiov_t *src, + unsigned int offset, unsigned int len); + +void lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, + unsigned int doffset, + unsigned int nsiov, struct iovec *siov, + unsigned int soffset, unsigned int nob); +void lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, + unsigned int iovoffset, + unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, unsigned int nob); +void lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, + unsigned int kiovoffset, + unsigned int niov, struct iovec *iov, + unsigned int iovoffset, unsigned int nob); +void lnet_copy_kiov2kiov (unsigned int ndkiov, lnet_kiov_t *dkiov, + unsigned int doffset, + unsigned int nskiov, lnet_kiov_t *skiov, + unsigned int soffset, unsigned int nob); + +static inline void +lnet_copy_iov2flat(int dlen, void *dest, unsigned int doffset, + unsigned int nsiov, struct iovec *siov, unsigned int soffset, + unsigned int nob) +{ + struct iovec diov = {/*.iov_base = */ dest, /*.iov_len = */ dlen}; + + lnet_copy_iov2iov(1, &diov, doffset, + nsiov, siov, soffset, nob); +} + +static inline void +lnet_copy_kiov2flat(int dlen, void *dest, unsigned int doffset, + unsigned int nsiov, lnet_kiov_t *skiov, unsigned int soffset, + unsigned int nob) +{ + struct iovec diov = {/* .iov_base = */ dest, /* .iov_len = */ dlen}; + + lnet_copy_kiov2iov(1, &diov, doffset, + nsiov, skiov, soffset, nob); +} + +static inline void +lnet_copy_flat2iov(unsigned int ndiov, struct iovec *diov, unsigned int doffset, + int slen, void *src, unsigned int soffset, unsigned int nob) +{ + struct iovec siov = {/*.iov_base = */ src, /*.iov_len = */slen}; + lnet_copy_iov2iov(ndiov, diov, doffset, + 1, &siov, soffset, nob); +} + +static inline void +lnet_copy_flat2kiov(unsigned int ndiov, lnet_kiov_t *dkiov, unsigned int doffset, + int slen, void *src, unsigned int soffset, unsigned int nob) +{ + struct iovec siov = {/* .iov_base = */ src, /* .iov_len = */ slen}; + lnet_copy_iov2kiov(ndiov, dkiov, doffset, + 1, &siov, soffset, nob); +} + +void lnet_me_unlink(lnet_me_t 
*me); + +void lnet_md_unlink(lnet_libmd_t *md); +void lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd); + +void lnet_register_lnd(lnd_t *lnd); +void lnet_unregister_lnd(lnd_t *lnd); +int lnet_set_ip_niaddr (lnet_ni_t *ni); + +#ifdef __KERNEL__ +int lnet_connect(cfs_socket_t **sockp, lnet_nid_t peer_nid, + __u32 local_ip, __u32 peer_ip, int peer_port); +void lnet_connect_console_error(int rc, lnet_nid_t peer_nid, + __u32 peer_ip, int port); +int lnet_count_acceptor_nis(lnet_ni_t **first_ni); +int lnet_accept(lnet_ni_t *blind_ni, cfs_socket_t *sock, __u32 magic); +int lnet_acceptor_timeout(void); +int lnet_acceptor_port(void); +#endif + +int lnet_acceptor_start(void); +void lnet_acceptor_stop(void); + +int lnet_peers_start_down(void); +int lnet_router_checker_start(void); +void lnet_router_checker_stop(void); + +int lnet_ping_target_init(void); +void lnet_ping_target_fini(void); +int lnet_ping(lnet_process_id_t id, int timeout_ms, + lnet_process_id_t *ids, int n_ids); + +int lnet_parse_ip2nets (char **networksp, char *ip2nets); +int lnet_parse_routes (char *route_str, int *im_a_router); +int lnet_parse_networks (struct list_head *nilist, char *networks); + +int lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid); +lnet_peer_t *lnet_find_peer_locked (lnet_nid_t nid); +void lnet_clear_peer_table(void); +void lnet_destroy_peer_table(void); +int lnet_create_peer_table(void); +void lnet_debug_peer(lnet_nid_t nid); #endif diff --git a/lnet/include/lnet/lib-p30.h b/lnet/include/lnet/lib-p30.h deleted file mode 100644 index f56206b..0000000 --- a/lnet/include/lnet/lib-p30.h +++ /dev/null @@ -1,466 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * lib-p30.h - * - * Top level include for library side routines - */ - -#ifndef __PORTALS_LIB_P30_H__ -#define __PORTALS_LIB_P30_H__ - -#include "build_check.h" - -#if defined(__linux__) -#include -#elif defined(__APPLE__) -#include -#else -#error Unsupported Operating System -#endif - -#include -#include -#include -#include -#include - -static inline int ptl_is_wire_handle_none (ptl_handle_wire_t *wh) -{ - return (wh->wh_interface_cookie == PTL_WIRE_HANDLE_NONE.wh_interface_cookie && - wh->wh_object_cookie == PTL_WIRE_HANDLE_NONE.wh_object_cookie); -} - -#ifdef __KERNEL__ -#define LIB_LOCK(nal,flags) \ - spin_lock_irqsave(&(nal)->libnal_ni.ni_lock, flags) -#define LIB_UNLOCK(nal,flags) \ - spin_unlock_irqrestore(&(nal)->libnal_ni.ni_lock, flags) -#else -#define LIB_LOCK(nal,flags) \ - (pthread_mutex_lock(&(nal)->libnal_ni.ni_mutex), (flags) = 0) -#define LIB_UNLOCK(nal,flags) \ - pthread_mutex_unlock(&(nal)->libnal_ni.ni_mutex) -#endif - - -#ifdef PTL_USE_LIB_FREELIST - -#define MAX_MES 2048 -#define MAX_MDS 2048 -#define MAX_MSGS 2048 /* Outstanding messages */ -#define MAX_EQS 512 - -extern int lib_freelist_init (lib_nal_t *nal, lib_freelist_t *fl, int nobj, int objsize); -extern void lib_freelist_fini (lib_nal_t *nal, lib_freelist_t *fl); - -static inline void * -lib_freelist_alloc (lib_freelist_t *fl) -{ - /* ALWAYS called with liblock held */ - lib_freeobj_t *o; - - if (list_empty (&fl->fl_list)) - return (NULL); - - o = list_entry (fl->fl_list.next, lib_freeobj_t, fo_list); - list_del (&o->fo_list); - return ((void *)&o->fo_contents); -} - -static inline void -lib_freelist_free (lib_freelist_t *fl, void *obj) -{ - /* ALWAYS called with liblock held */ - lib_freeobj_t *o = list_entry (obj, lib_freeobj_t, fo_contents); - - list_add (&o->fo_list, &fl->fl_list); -} - - -static 
inline lib_eq_t * -lib_eq_alloc (lib_nal_t *nal) -{ - /* NEVER called with liblock held */ - unsigned long flags; - lib_eq_t *eq; - - LIB_LOCK (nal, flags); - eq = (lib_eq_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_eqs); - LIB_UNLOCK (nal, flags); - - return (eq); -} - -static inline void -lib_eq_free (lib_nal_t *nal, lib_eq_t *eq) -{ - /* ALWAYS called with liblock held */ - lib_freelist_free (&nal->libnal_ni.ni_free_eqs, eq); -} - -static inline lib_md_t * -lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd) -{ - /* NEVER called with liblock held */ - unsigned long flags; - lib_md_t *md; - - LIB_LOCK (nal, flags); - md = (lib_md_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mds); - LIB_UNLOCK (nal, flags); - - return (md); -} - -static inline void -lib_md_free (lib_nal_t *nal, lib_md_t *md) -{ - /* ALWAYS called with liblock held */ - lib_freelist_free (&nal->libnal_ni.ni_free_mds, md); -} - -static inline lib_me_t * -lib_me_alloc (lib_nal_t *nal) -{ - /* NEVER called with liblock held */ - unsigned long flags; - lib_me_t *me; - - LIB_LOCK (nal, flags); - me = (lib_me_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_mes); - LIB_UNLOCK (nal, flags); - - return (me); -} - -static inline void -lib_me_free (lib_nal_t *nal, lib_me_t *me) -{ - /* ALWAYS called with liblock held */ - lib_freelist_free (&nal->libnal_ni.ni_free_mes, me); -} - -static inline lib_msg_t * -lib_msg_alloc (lib_nal_t *nal) -{ - /* NEVER called with liblock held */ - unsigned long flags; - lib_msg_t *msg; - - LIB_LOCK (nal, flags); - msg = (lib_msg_t *)lib_freelist_alloc (&nal->libnal_ni.ni_free_msgs); - LIB_UNLOCK (nal, flags); - - if (msg != NULL) { - /* NULL pointers, clear flags etc */ - memset (msg, 0, sizeof (*msg)); - msg->ack_wmd = PTL_WIRE_HANDLE_NONE; - } - return(msg); -} - -static inline void -lib_msg_free (lib_nal_t *nal, lib_msg_t *msg) -{ - /* ALWAYS called with liblock held */ - lib_freelist_free (&nal->libnal_ni.ni_free_msgs, msg); -} - -#else - -static inline lib_eq_t * -lib_eq_alloc (lib_nal_t *nal) -{ - /* NEVER called with liblock held */ - lib_eq_t *eq; - - PORTAL_ALLOC(eq, sizeof(*eq)); - return (eq); -} - -static inline void -lib_eq_free (lib_nal_t *nal, lib_eq_t *eq) -{ - /* ALWAYS called with liblock held */ - PORTAL_FREE(eq, sizeof(*eq)); -} - -static inline lib_md_t * -lib_md_alloc (lib_nal_t *nal, ptl_md_t *umd) -{ - /* NEVER called with liblock held */ - lib_md_t *md; - int size; - int niov; - - if ((umd->options & PTL_MD_KIOV) != 0) { - niov = umd->length; - size = offsetof(lib_md_t, md_iov.kiov[niov]); - } else { - niov = ((umd->options & PTL_MD_IOVEC) != 0) ? 
- umd->length : 1; - size = offsetof(lib_md_t, md_iov.iov[niov]); - } - - PORTAL_ALLOC(md, size); - - if (md != NULL) { - /* Set here in case of early free */ - md->options = umd->options; - md->md_niov = niov; - } - - return (md); -} - -static inline void -lib_md_free (lib_nal_t *nal, lib_md_t *md) -{ - /* ALWAYS called with liblock held */ - int size; - - if ((md->options & PTL_MD_KIOV) != 0) - size = offsetof(lib_md_t, md_iov.kiov[md->md_niov]); - else - size = offsetof(lib_md_t, md_iov.iov[md->md_niov]); - - PORTAL_FREE(md, size); -} - -static inline lib_me_t * -lib_me_alloc (lib_nal_t *nal) -{ - /* NEVER called with liblock held */ - lib_me_t *me; - - PORTAL_ALLOC(me, sizeof(*me)); - return (me); -} - -static inline void -lib_me_free(lib_nal_t *nal, lib_me_t *me) -{ - /* ALWAYS called with liblock held */ - PORTAL_FREE(me, sizeof(*me)); -} - -static inline lib_msg_t * -lib_msg_alloc(lib_nal_t *nal) -{ - /* NEVER called with liblock held; may be in interrupt... */ - lib_msg_t *msg; - - if (in_interrupt()) - PORTAL_ALLOC_ATOMIC(msg, sizeof(*msg)); - else - PORTAL_ALLOC(msg, sizeof(*msg)); - - if (msg != NULL) { - /* NULL pointers, clear flags etc */ - memset (msg, 0, sizeof (*msg)); - msg->ack_wmd = PTL_WIRE_HANDLE_NONE; - } - return (msg); -} - -static inline void -lib_msg_free(lib_nal_t *nal, lib_msg_t *msg) -{ - /* ALWAYS called with liblock held */ - PORTAL_FREE(msg, sizeof(*msg)); -} -#endif - -extern lib_handle_t *lib_lookup_cookie (lib_nal_t *nal, __u64 cookie, int type); -extern void lib_initialise_handle (lib_nal_t *nal, lib_handle_t *lh, int type); -extern void lib_invalidate_handle (lib_nal_t *nal, lib_handle_t *lh); - -static inline void -ptl_eq2handle (ptl_handle_eq_t *handle, lib_nal_t *nal, lib_eq_t *eq) -{ - if (eq == NULL) { - *handle = PTL_EQ_NONE; - return; - } - - handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; - handle->cookie = eq->eq_lh.lh_cookie; -} - -static inline lib_eq_t * -ptl_handle2eq (ptl_handle_eq_t *handle, lib_nal_t *nal) -{ - /* ALWAYS called with liblock held */ - lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, - PTL_COOKIE_TYPE_EQ); - if (lh == NULL) - return (NULL); - - return (lh_entry (lh, lib_eq_t, eq_lh)); -} - -static inline void -ptl_md2handle (ptl_handle_md_t *handle, lib_nal_t *nal, lib_md_t *md) -{ - handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; - handle->cookie = md->md_lh.lh_cookie; -} - -static inline lib_md_t * -ptl_handle2md (ptl_handle_md_t *handle, lib_nal_t *nal) -{ - /* ALWAYS called with liblock held */ - lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, - PTL_COOKIE_TYPE_MD); - if (lh == NULL) - return (NULL); - - return (lh_entry (lh, lib_md_t, md_lh)); -} - -static inline lib_md_t * -ptl_wire_handle2md (ptl_handle_wire_t *wh, lib_nal_t *nal) -{ - /* ALWAYS called with liblock held */ - lib_handle_t *lh; - - if (wh->wh_interface_cookie != nal->libnal_ni.ni_interface_cookie) - return (NULL); - - lh = lib_lookup_cookie (nal, wh->wh_object_cookie, - PTL_COOKIE_TYPE_MD); - if (lh == NULL) - return (NULL); - - return (lh_entry (lh, lib_md_t, md_lh)); -} - -static inline void -ptl_me2handle (ptl_handle_me_t *handle, lib_nal_t *nal, lib_me_t *me) -{ - handle->nal_idx = nal->libnal_ni.ni_api->nal_handle.nal_idx; - handle->cookie = me->me_lh.lh_cookie; -} - -static inline lib_me_t * -ptl_handle2me (ptl_handle_me_t *handle, lib_nal_t *nal) -{ - /* ALWAYS called with liblock held */ - lib_handle_t *lh = lib_lookup_cookie (nal, handle->cookie, - PTL_COOKIE_TYPE_ME); - if (lh == NULL) - return 
(NULL); - - return (lh_entry (lh, lib_me_t, me_lh)); -} - -extern int lib_init(lib_nal_t *libnal, nal_t *apinal, - ptl_process_id_t pid, - ptl_ni_limits_t *desired_limits, - ptl_ni_limits_t *actual_limits); -extern int lib_fini(lib_nal_t *libnal); - -/* - * When the NAL detects an incoming message header, it should call - * lib_parse() decode it. If the message header is garbage, lib_parse() - * returns immediately with failure, otherwise the NAL callbacks will be - * called to receive the message body. They are handed the private cookie - * as a way for the NAL to maintain state about which transaction is being - * processed. An extra parameter, lib_msg contains the lib-level message - * state for passing to lib_finalize() when the message body has been - * received. - */ -extern void lib_enq_event_locked (lib_nal_t *nal, void *private, - lib_eq_t *eq, ptl_event_t *ev); -extern void lib_finalize (lib_nal_t *nal, void *private, lib_msg_t *msg, - ptl_ni_fail_t ni_fail_type); -extern ptl_err_t lib_parse (lib_nal_t *nal, ptl_hdr_t *hdr, void *private); -extern lib_msg_t *lib_create_reply_msg (lib_nal_t *nal, ptl_nid_t peer_nid, - lib_msg_t *get_msg); -extern void print_hdr (lib_nal_t * nal, ptl_hdr_t * hdr); - - -extern ptl_size_t lib_iov_nob (int niov, struct iovec *iov); -extern void lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, - ptl_size_t offset, ptl_size_t len); -extern void lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset, - char *src, ptl_size_t len); -extern int lib_extract_iov (int dst_niov, struct iovec *dst, - int src_niov, struct iovec *src, - ptl_size_t offset, ptl_size_t len); - -extern ptl_size_t lib_kiov_nob (int niov, ptl_kiov_t *iov); -extern void lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, - ptl_size_t offset, ptl_size_t len); -extern void lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset, - char *src, ptl_size_t len); -extern int lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, - int src_niov, ptl_kiov_t *src, - ptl_size_t offset, ptl_size_t len); - -extern void lib_assert_wire_constants (void); - -extern ptl_err_t lib_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, - ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen); -extern ptl_err_t lib_send (lib_nal_t *nal, void *private, lib_msg_t *msg, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - lib_md_t *md, ptl_size_t offset, ptl_size_t len); - -extern int lib_api_ni_status (nal_t *nal, ptl_sr_index_t sr_idx, - ptl_sr_value_t *status); -extern int lib_api_ni_dist (nal_t *nal, ptl_process_id_t *pid, - unsigned long *dist); - -extern int lib_api_eq_alloc (nal_t *nal, ptl_size_t count, - ptl_eq_handler_t callback, - ptl_handle_eq_t *handle); -extern int lib_api_eq_free(nal_t *nal, ptl_handle_eq_t *eqh); -extern int lib_api_eq_poll (nal_t *nal, - ptl_handle_eq_t *eventqs, int neq, int timeout_ms, - ptl_event_t *event, int *which); - -extern int lib_api_me_attach(nal_t *nal, - ptl_pt_index_t portal, - ptl_process_id_t match_id, - ptl_match_bits_t match_bits, - ptl_match_bits_t ignore_bits, - ptl_unlink_t unlink, ptl_ins_pos_t pos, - ptl_handle_me_t *handle); -extern int lib_api_me_insert(nal_t *nal, - ptl_handle_me_t *current_meh, - ptl_process_id_t match_id, - ptl_match_bits_t match_bits, - ptl_match_bits_t ignore_bits, - ptl_unlink_t unlink, ptl_ins_pos_t pos, - ptl_handle_me_t *handle); -extern int lib_api_me_unlink (nal_t *nal, ptl_handle_me_t *meh); -extern void lib_me_unlink(lib_nal_t *nal, lib_me_t *me); - -extern int 
lib_api_get_id(nal_t *nal, ptl_process_id_t *pid); - -extern void lib_md_unlink(lib_nal_t *nal, lib_md_t *md); -extern void lib_md_deconstruct(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd); -extern int lib_api_md_attach(nal_t *nal, ptl_handle_me_t *meh, - ptl_md_t *umd, ptl_unlink_t unlink, - ptl_handle_md_t *handle); -extern int lib_api_md_bind(nal_t *nal, ptl_md_t *umd, ptl_unlink_t unlink, - ptl_handle_md_t *handle); -extern int lib_api_md_unlink (nal_t *nal, ptl_handle_md_t *mdh); -extern int lib_api_md_update (nal_t *nal, ptl_handle_md_t *mdh, - ptl_md_t *oldumd, ptl_md_t *newumd, - ptl_handle_eq_t *testqh); - -extern int lib_api_get(nal_t *apinal, ptl_handle_md_t *mdh, - ptl_process_id_t *id, - ptl_pt_index_t portal, ptl_ac_index_t ac, - ptl_match_bits_t match_bits, ptl_size_t offset); -extern int lib_api_put(nal_t *apinal, ptl_handle_md_t *mdh, - ptl_ack_req_t ack, ptl_process_id_t *id, - ptl_pt_index_t portal, ptl_ac_index_t ac, - ptl_match_bits_t match_bits, - ptl_size_t offset, ptl_hdr_data_t hdr_data); -extern int lib_api_fail_nid(nal_t *apinal, ptl_nid_t nid, unsigned int threshold); -extern int lib_api_loopback(nal_t *apinal, int set, int *enabled); - -#endif diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 608b1e2..2227c6a 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -7,31 +7,22 @@ * exposed to the user application */ -#ifndef __PORTALS_LIB_TYPES_H__ -#define __PORTALS_LIB_TYPES_H__ - -#include "build_check.h" +#ifndef __LNET_LIB_TYPES_H__ +#define __LNET_LIB_TYPES_H__ #if defined(__linux__) -#include +#include #elif defined(__APPLE__) -#include +#include +#elif defined(__WINNT__) +#include #else #error Unsupported Operating System #endif #include #include -#include -#include - -typedef char *user_ptr; -typedef struct lib_msg_t lib_msg_t; -typedef struct lib_ptl_t lib_ptl_t; -typedef struct lib_ac_t lib_ac_t; -typedef struct lib_me_t lib_me_t; -typedef struct lib_md_t lib_md_t; -typedef struct lib_eq_t lib_eq_t; +#include #define WIRE_ATTR __attribute__((packed)) @@ -42,334 +33,517 @@ typedef struct lib_eq_t lib_eq_t; typedef struct { __u64 wh_interface_cookie; __u64 wh_object_cookie; -} WIRE_ATTR ptl_handle_wire_t; +} WIRE_ATTR lnet_handle_wire_t; /* byte-flip insensitive! */ -#define PTL_WIRE_HANDLE_NONE \ -((const ptl_handle_wire_t) {.wh_interface_cookie = -1, .wh_object_cookie = -1}) +#define LNET_WIRE_HANDLE_NONE \ +((const lnet_handle_wire_t) {.wh_interface_cookie = -1, .wh_object_cookie = -1}) typedef enum { - PTL_MSG_ACK = 0, - PTL_MSG_PUT, - PTL_MSG_GET, - PTL_MSG_REPLY, - PTL_MSG_HELLO, -} ptl_msg_type_t; + LNET_MSG_ACK = 0, + LNET_MSG_PUT, + LNET_MSG_GET, + LNET_MSG_REPLY, + LNET_MSG_HELLO, +} lnet_msg_type_t; /* The variant fields of the portals message header are aligned on an 8 * byte boundary in the message header. Note that all types used in these * wire structs MUST be fixed size and the smaller types are placed at the * end. 
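To make the fixed-size requirement above concrete, a compile-time sanity check can verify that a WIRE_ATTR struct really packs to its expected wire size. This is only an illustrative sketch; DEMO_CLASSERT and demo_check_wire_constants are invented names, not part of this patch, though the switch/case trick is a common kernel idiom for constant assertions:

#define DEMO_CLASSERT(cond) do { switch (0) { case 0: case (cond): ; } } while (0)

static inline void
demo_check_wire_constants(void)
{
        /* lnet_handle_wire_t is two __u64 cookies, so it must pack to 16 bytes */
        DEMO_CLASSERT(sizeof(lnet_handle_wire_t) == 16);
        /* only fixed-size types may appear in wire structs */
        DEMO_CLASSERT(sizeof(__u64) == 8);
}

If a condition is false, the duplicate case 0 label fails the build, catching a wire-format change at compile time rather than on the wire.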
*/ -typedef struct ptl_ack { - ptl_handle_wire_t dst_wmd; - ptl_match_bits_t match_bits; - ptl_size_t mlength; -} WIRE_ATTR ptl_ack_t; - -typedef struct ptl_put { - ptl_handle_wire_t ack_wmd; - ptl_match_bits_t match_bits; - ptl_hdr_data_t hdr_data; - ptl_pt_index_t ptl_index; - ptl_size_t offset; -} WIRE_ATTR ptl_put_t; - -typedef struct ptl_get { - ptl_handle_wire_t return_wmd; - ptl_match_bits_t match_bits; - ptl_pt_index_t ptl_index; - ptl_size_t src_offset; - ptl_size_t sink_length; -} WIRE_ATTR ptl_get_t; - -typedef struct ptl_reply { - ptl_handle_wire_t dst_wmd; -} WIRE_ATTR ptl_reply_t; - -typedef struct ptl_hello { +typedef struct lnet_ack { + lnet_handle_wire_t dst_wmd; + __u64 match_bits; + __u32 mlength; +} WIRE_ATTR lnet_ack_t; + +typedef struct lnet_put { + lnet_handle_wire_t ack_wmd; + __u64 match_bits; + __u64 hdr_data; + __u32 ptl_index; + __u32 offset; +} WIRE_ATTR lnet_put_t; + +typedef struct lnet_get { + lnet_handle_wire_t return_wmd; + __u64 match_bits; + __u32 ptl_index; + __u32 src_offset; + __u32 sink_length; +} WIRE_ATTR lnet_get_t; + +typedef struct lnet_reply { + lnet_handle_wire_t dst_wmd; +} WIRE_ATTR lnet_reply_t; + +typedef struct lnet_hello { __u64 incarnation; __u32 type; -} WIRE_ATTR ptl_hello_t; +} WIRE_ATTR lnet_hello_t; typedef struct { - ptl_nid_t dest_nid; - ptl_nid_t src_nid; - ptl_pid_t dest_pid; - ptl_pid_t src_pid; - __u32 type; /* ptl_msg_type_t */ + lnet_nid_t dest_nid; + lnet_nid_t src_nid; + lnet_pid_t dest_pid; + lnet_pid_t src_pid; + __u32 type; /* lnet_msg_type_t */ __u32 payload_length; /* payload data to follow */ /*<------__u64 aligned------->*/ union { - ptl_ack_t ack; - ptl_put_t put; - ptl_get_t get; - ptl_reply_t reply; - ptl_hello_t hello; + lnet_ack_t ack; + lnet_put_t put; + lnet_get_t get; + lnet_reply_t reply; + lnet_hello_t hello; } msg; -} WIRE_ATTR ptl_hdr_t; +} WIRE_ATTR lnet_hdr_t; -/* A HELLO message contains the portals magic number and protocol version +/* A HELLO message contains a magic number and protocol version * code in the header's dest_nid, the peer's NID in the src_nid, and - * PTL_MSG_HELLO in the type field. All other common fields are zero + * LNET_MSG_HELLO in the type field. All other common fields are zero * (including payload_size; i.e. no payload). - * This is for use by byte-stream NALs (e.g. TCP/IP) to check the peer is - * running the same protocol and to find out its NID, so that hosts with - * multiple IP interfaces can have a single NID. These NALs should exchange - * HELLO messages when a connection is first established. - * Individual NALs can put whatever else they fancy in ptl_hdr_t::msg. + * This is for use by byte-stream LNDs (e.g. TCP/IP) to check the peer is + * running the same protocol and to find out its NID. These LNDs should + * exchange HELLO messages when a connection is first established. Individual + * LNDs can put whatever else they fancy in lnet_hdr_t::msg. 
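As a hedged sketch of the HELLO exchange just described, a byte-stream LND might format the header as below; demo_pack_hello and its incarnation argument are assumptions for illustration, and a real LND would also deal with byte order and with overlaying the magic/version onto dest_nid:

static void
demo_pack_hello(lnet_hdr_t *hdr, lnet_nid_t my_nid, __u64 incarnation)
{
        memset(hdr, 0, sizeof(*hdr));             /* all common fields zero */
        hdr->src_nid = my_nid;                    /* the peer learns my NID here */
        hdr->type = LNET_MSG_HELLO;               /* payload_length stays 0 */
        hdr->msg.hello.incarnation = incarnation;
        /* dest_nid then carries the magic number and protocol version */
}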
 */ typedef struct { - __u32 magic; /* PORTALS_PROTO_MAGIC */ + __u32 magic; /* LNET_PROTO_TCP_MAGIC */ __u16 version_major; /* increment on incompatible change */ __u16 version_minor; /* increment on compatible change */ -} WIRE_ATTR ptl_magicversion_t; - -#define PORTALS_PROTO_MAGIC 0xeebc0ded - -#define PORTALS_PROTO_VERSION_MAJOR 1 -#define PORTALS_PROTO_VERSION_MINOR 0 - -typedef struct { - long recv_count, recv_length, send_count, send_length, drop_count, - drop_length, msgs_alloc, msgs_max; -} lib_counters_t; - -/* temporary expedient: limit number of entries in discontiguous MDs */ -#define PTL_MTU (1<<20) -#define PTL_MD_MAX_IOV 256 - -struct lib_msg_t { - struct list_head msg_list; - lib_md_t *md; - ptl_handle_wire_t ack_wmd; - ptl_event_t ev; -}; - -struct lib_ptl_t { - ptl_pt_index_t size; - struct list_head *tbl; -}; - -struct lib_ac_t { - int next_free; -}; - +} WIRE_ATTR lnet_magicversion_t; + +/* PROTO MAGIC for LNDs */ +#define LNET_PROTO_IB_MAGIC 0x0be91b91 +#define LNET_PROTO_OPENIB_MAGIC LNET_PROTO_IB_MAGIC +#define LNET_PROTO_IIB_MAGIC LNET_PROTO_IB_MAGIC +#define LNET_PROTO_VIB_MAGIC LNET_PROTO_IB_MAGIC +#define LNET_PROTO_RA_MAGIC 0x0be91b92 +#define LNET_PROTO_QSW_MAGIC 0x0be91b93 +#define LNET_PROTO_TCP_MAGIC 0xeebc0ded +#define LNET_PROTO_PTL_MAGIC 0x50746C4E /* 'PtlN' unique magic */ +#define LNET_PROTO_GM_MAGIC 0x6d797269 /* 'myri'! */ +#define LNET_PROTO_MX_MAGIC 0x4d583130 /* 'MX10'! */ +#define LNET_PROTO_ACCEPTOR_MAGIC 0xacce7100 +#define LNET_PROTO_PING_MAGIC 0x70696E67 /* 'ping' */ + +/* Placeholder for a future "unified" protocol across all LNDs */ +/* Current LNDs that receive a request with this magic will respond with a + * "stub" reply using their current protocol */ +#define LNET_PROTO_MAGIC 0x45726963 /* ! */ + + +#define LNET_PROTO_TCP_VERSION_MAJOR 1 +#define LNET_PROTO_TCP_VERSION_MINOR 0 + +/* Acceptor connection request */ typedef struct { + __u32 acr_magic; /* LNET_PROTO_ACCEPTOR_MAGIC */ + __u32 acr_version; /* protocol version */ + __u64 acr_nid; /* target NID */ +} WIRE_ATTR lnet_acceptor_connreq_t; + +#define LNET_PROTO_ACCEPTOR_VERSION 1 + +/* forward refs */ +struct lnet_libmd; + +typedef struct lnet_msg { + struct list_head msg_activelist; + struct list_head msg_list; /* Q for credits/MD */ + + lnet_process_id_t msg_target; + __u32 msg_type; + + unsigned int msg_target_is_router:1; /* sending to a router */ + unsigned int msg_routing:1; /* being forwarded */ + unsigned int msg_ack:1; /* ack on finalize (PUT) */ + unsigned int msg_sending:1; /* outgoing message */ + unsigned int msg_receiving:1; /* being received */ + unsigned int msg_delayed:1; /* had to Q for buffer or tx credit */ + unsigned int msg_txcredit:1; /* taken an NI send credit */ + unsigned int msg_peertxcredit:1; /* taken a peer send credit */ + unsigned int msg_rtrcredit:1; /* taken a global router credit */ + unsigned int msg_peerrtrcredit:1; /* taken a peer router credit */ + unsigned int msg_onactivelist:1; /* on the activelist */ + + struct lnet_peer *msg_txpeer; /* peer I'm sending to */ + struct lnet_peer *msg_rxpeer; /* peer I received from */ + + void *msg_private; + struct lnet_libmd *msg_md; + + unsigned int msg_len; + unsigned int msg_wanted; + unsigned int msg_offset; + unsigned int msg_niov; + struct iovec *msg_iov; + lnet_kiov_t *msg_kiov; + + lnet_event_t msg_ev; + lnet_hdr_t msg_hdr; +} lnet_msg_t; + + +typedef struct lnet_libhandle { struct list_head lh_hash_chain; __u64 lh_cookie; -} lib_handle_t; +} lnet_libhandle_t; #define lh_entry(ptr, type, member) \
((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) -struct lib_eq_t { +typedef struct lnet_eq { struct list_head eq_list; - lib_handle_t eq_lh; - ptl_seq_t eq_enq_seq; - ptl_seq_t eq_deq_seq; - ptl_size_t eq_size; - ptl_event_t *eq_events; + lnet_libhandle_t eq_lh; + lnet_seq_t eq_enq_seq; + lnet_seq_t eq_deq_seq; + unsigned int eq_size; + lnet_event_t *eq_events; int eq_refcount; - ptl_eq_handler_t eq_callback; - void *eq_addrkey; -}; - -struct lib_me_t { - struct list_head me_list; - lib_handle_t me_lh; - ptl_process_id_t match_id; - ptl_match_bits_t match_bits, ignore_bits; - ptl_unlink_t unlink; - lib_md_t *md; -}; - -struct lib_md_t { + lnet_eq_handler_t eq_callback; +} lnet_eq_t; + +typedef struct lnet_me { + struct list_head me_list; + lnet_libhandle_t me_lh; + lnet_process_id_t me_match_id; + unsigned int me_portal; + __u64 me_match_bits; + __u64 me_ignore_bits; + lnet_unlink_t me_unlink; + struct lnet_libmd *me_md; +} lnet_me_t; + +typedef struct lnet_libmd { struct list_head md_list; - lib_handle_t md_lh; - lib_me_t *me; - user_ptr start; - ptl_size_t offset; - ptl_size_t length; - ptl_size_t max_size; - int threshold; - int pending; - unsigned int options; + lnet_libhandle_t md_lh; + lnet_me_t *md_me; + char *md_start; + unsigned int md_offset; + unsigned int md_length; + unsigned int md_max_size; + int md_threshold; + int md_refcount; + unsigned int md_options; unsigned int md_flags; - void *user_ptr; - lib_eq_t *eq; + void *md_user_ptr; + lnet_eq_t *md_eq; void *md_addrkey; unsigned int md_niov; /* # frags */ union { - struct iovec iov[PTL_MD_MAX_IOV]; - ptl_kiov_t kiov[PTL_MD_MAX_IOV]; + struct iovec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; } md_iov; -}; +} lnet_libmd_t; -#define PTL_MD_FLAG_ZOMBIE (1 << 0) -#define PTL_MD_FLAG_AUTO_UNLINK (1 << 1) +#define LNET_MD_FLAG_ZOMBIE (1 << 0) +#define LNET_MD_FLAG_AUTO_UNLINK (1 << 1) -static inline int lib_md_exhausted (lib_md_t *md) -{ - return (md->threshold == 0 || - ((md->options & PTL_MD_MAX_SIZE) != 0 && - md->offset + md->max_size > md->length)); -} - -#ifdef PTL_USE_LIB_FREELIST +#ifdef LNET_USE_LIB_FREELIST typedef struct { void *fl_objs; /* single contiguous array of objects */ int fl_nobjs; /* the number of them */ int fl_objsize; /* the size (including overhead) of each of them */ struct list_head fl_list; /* where they are enqueued */ -} lib_freelist_t; +} lnet_freelist_t; typedef struct { struct list_head fo_list; /* enqueue on fl_list */ void *fo_contents; /* aligned contents */ -} lib_freeobj_t; +} lnet_freeobj_t; #endif typedef struct { /* info about peers we are trying to fail */ - struct list_head tp_list; /* stash in ni.ni_test_peers */ - ptl_nid_t tp_nid; /* matching nid */ - unsigned int tp_threshold; /* # failures to simulate */ -} lib_test_peer_t; - -#define PTL_COOKIE_TYPE_MD 1 -#define PTL_COOKIE_TYPE_ME 2 -#define PTL_COOKIE_TYPE_EQ 3 -#define PTL_COOKIE_TYPES 4 -/* PTL_COOKIE_TYPES must be a power of 2, so the cookie type can be - * extracted by masking with (PTL_COOKIE_TYPES - 1) */ - -typedef struct lib_ni + struct list_head tp_list; /* ln_test_peers */ + lnet_nid_t tp_nid; /* matching nid */ + unsigned int tp_threshold; /* # failures to simulate */ +} lnet_test_peer_t; + +#define LNET_COOKIE_TYPE_MD 1 +#define LNET_COOKIE_TYPE_ME 2 +#define LNET_COOKIE_TYPE_EQ 3 +#define LNET_COOKIE_TYPES 4 +/* LNET_COOKIE_TYPES must be a power of 2, so the cookie type can be + * extracted by masking with (LNET_COOKIE_TYPES - 1) */ + +struct lnet_ni; /* forward ref */ + +typedef struct 
lnet_lnd { - nal_t *ni_api; - ptl_process_id_t ni_pid; - lib_ptl_t ni_portals; - lib_counters_t ni_counters; - ptl_ni_limits_t ni_actual_limits; - - int ni_lh_hash_size; /* size of lib handle hash table */ - struct list_head *ni_lh_hash_table; /* all extant lib handles, this interface */ - __u64 ni_next_object_cookie; /* cookie generator */ - __u64 ni_interface_cookie; /* uniquely identifies this ni in this epoch */ + /* fields managed by portals */ + struct list_head lnd_list; /* stash in the LND table */ + int lnd_refcount; /* # active instances */ + + /* fields initialised by the LND */ + unsigned int lnd_type; - struct list_head ni_test_peers; - int ni_loopback; /* loopback shortcircuits NAL */ + int (*lnd_startup) (struct lnet_ni *ni); + void (*lnd_shutdown) (struct lnet_ni *ni); + int (*lnd_ctl)(struct lnet_ni *ni, unsigned int cmd, void *arg); + + /* In data movement APIs below, payload buffers are described as a set + * of 'niov' fragments which are... + * EITHER + * in virtual memory (struct iovec *iov != NULL) + * OR + * in pages (kernel only: lnet_kiov_t *kiov != NULL). + * The LND may NOT overwrite these fragment descriptors. + * An 'offset' may specify a byte offset within the set of + * fragments to start from. + */ + + /* Start sending a preformatted message. 'private' is NULL for PUT and + * GET messages; otherwise this is a response to an incoming message + * and 'private' is the 'private' passed to lnet_parse(). Return + * non-zero for immediate failure, otherwise complete later with + * lnet_finalize() */ + int (*lnd_send)(struct lnet_ni *ni, void *private, lnet_msg_t *msg); + + /* Start receiving 'mlen' bytes of payload data, skipping the following + * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to + * lnet_parse(). Return non-zero for immediate failure, otherwise + * complete later with lnet_finalize(). This also gives back a receive + * credit if the LND does flow control. */ + int (*lnd_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); + + /* lnet_parse() has had to delay processing of this message + * (e.g. waiting for a forwarding buffer or send credits). Give the + * LND a chance to free urgently needed resources. If called, return 0 + * for success and do NOT give back a receive credit; that has to wait + * until lnd_recv() gets called. On failure return < 0 and + * release resources; lnd_recv() will not be called.
*/ + int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg, + void **new_privatep); + + /* notification of peer health */ + void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive); + +#ifdef __KERNEL__ + /* accept a new connection */ + int (*lnd_accept)(struct lnet_ni *ni, cfs_socket_t *sock); +#else + /* wait for something to happen */ + void (*lnd_wait)(struct lnet_ni *ni, int milliseconds); +#endif +} lnd_t; + +#define LNET_MAX_INTERFACES 16 + +typedef struct lnet_ni { + struct list_head ni_list; /* chain on ln_nis */ + struct list_head ni_txq; /* messages waiting for tx credits */ + int ni_maxtxcredits; /* # tx credits */ + int ni_txcredits; /* # tx credits free */ + int ni_mintxcredits; /* lowest it's been */ + int ni_peertxcredits; /* # per-peer send credits */ + lnet_nid_t ni_nid; /* interface's NID */ + void *ni_data; /* instance-specific data */ + lnd_t *ni_lnd; /* procedural interface */ + int ni_refcount; /* reference count */ + char *ni_interfaces[LNET_MAX_INTERFACES]; /* equivalent interfaces to use */ +} lnet_ni_t; + +typedef struct lnet_peer { + struct list_head lp_hashlist; /* chain on peer hash */ + struct list_head lp_txq; /* messages blocking for tx credits */ + struct list_head lp_rtrq; /* messages blocking for router credits */ + struct list_head lp_rtr_list; /* chain on router list */ + int lp_txcredits; /* # tx credits available */ + int lp_mintxcredits; /* low water mark */ + int lp_rtrcredits; /* # router credits */ + int lp_minrtrcredits; /* low water mark */ + unsigned int lp_alive:1; /* alive/dead? */ + unsigned int lp_notify:1; /* notification outstanding? */ + unsigned int lp_notifylnd:1; /* outstanding notification for LND? */ + unsigned int lp_notifying:1; /* some thread is handling notification */ + unsigned int lp_ping_notsent; /* SEND event outstanding from ping */ + int lp_alive_count; /* # times router went dead<->alive */ + long lp_txqnob; /* bytes queued for sending */ + time_t lp_timestamp; /* time of last aliveness news */ + time_t lp_ping_timestamp; /* time of last ping attempt */ + time_t lp_ping_deadline; /* != 0 if ping reply expected */ + lnet_ni_t *lp_ni; /* interface peer is on */ + lnet_nid_t lp_nid; /* peer's NID */ + int lp_refcount; /* # refs */ + int lp_rtr_refcount; /* # refs from lnet_route_t::lr_gateway */ +} lnet_peer_t; + +typedef struct { + struct list_head lr_list; /* chain on net */ + lnet_peer_t *lr_gateway; /* router node */ +} lnet_route_t; + +typedef struct { + struct list_head lrn_list; /* chain on ln_remote_nets */ + struct list_head lrn_routes; /* routes to me */ + __u32 lrn_net; /* my net number */ + unsigned int lrn_hops; /* how far I am */ +} lnet_remotenet_t; + +typedef struct { + struct list_head rbp_bufs; /* my free buffer pool */ + struct list_head rbp_msgs; /* messages blocking for a buffer */ + int rbp_npages; /* # pages in each buffer */ + int rbp_nbuffers; /* # buffers */ + int rbp_credits; /* # free buffers / blocked messages */ + int rbp_mincredits; /* low water mark */ +} lnet_rtrbufpool_t; + +typedef struct { + struct list_head rb_list; /* chain on rbp_bufs */ + lnet_rtrbufpool_t *rb_pool; /* owning pool */ + lnet_kiov_t rb_kiov[0]; /* the buffer space */ +} lnet_rtrbuf_t; + +typedef struct { + __u32 msgs_alloc; + __u32 msgs_max; + __u32 errors; + __u32 send_count; + __u32 recv_count; + __u32 route_count; + __u32 drop_count; + __u64 send_length; + __u64 recv_length; + __u64 route_length; + __u64 drop_length; +} lnet_counters_t; + +#define LNET_PEER_HASHSIZE 503 /* prime! 
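To ground the lnd_t callback contract documented above, here is a hypothetical minimal LND filling in the table and registering itself. Every demo_* name is invented for illustration; a real LND would use its assigned lnd_type and perform real network I/O before completing with lnet_finalize():

static int
demo_startup(struct lnet_ni *ni)
{
        return 0;                       /* accept the interface */
}

static void
demo_shutdown(struct lnet_ni *ni)
{
}

static int
demo_send(struct lnet_ni *ni, void *private, lnet_msg_t *msg)
{
        /* a real LND queues the wire transmit here and calls
         * lnet_finalize(ni, msg, rc) when it completes */
        lnet_finalize(ni, msg, 0);
        return 0;
}

static int
demo_recv(struct lnet_ni *ni, void *private, lnet_msg_t *msg, int delayed,
          unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
          unsigned int offset, unsigned int mlen, unsigned int rlen)
{
        /* a real LND receives mlen bytes into the fragments (never
         * rewriting the descriptors) and skips rlen - mlen bytes */
        lnet_finalize(ni, msg, 0);
        return 0;
}

static lnd_t demo_lnd = {
        .lnd_startup  = demo_startup,
        .lnd_shutdown = demo_shutdown,
        .lnd_send     = demo_send,
        .lnd_recv     = demo_recv,
};

/* at module init: lnet_register_lnd(&demo_lnd); */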
*/ + +#define LNET_NRBPOOLS 3 /* # different router buffer pools */ + +#define LNET_PROTO_PING_MATCHBITS 0x8000000000000000LL +#define LNET_PROTO_PING_VERSION 1 +typedef struct { + __u32 pi_magic; + __u32 pi_version; + lnet_pid_t pi_pid; + __u32 pi_nnids; + lnet_nid_t pi_nid[0]; +} WIRE_ATTR lnet_ping_info_t; + +/* Options for lnet_portal_t::ptl_options */ +#define LNET_PTL_LAZY (1 << 0) +typedef struct { + struct list_head ptl_ml; /* match list */ + struct list_head ptl_msgq; /* messages blocking for MD */ + __u64 ptl_msgq_version; /* validity stamp */ + unsigned int ptl_options; +} lnet_portal_t; + +/* Router Checker */ +/* < 0 == startup error */ +#define LNET_RC_STATE_SHUTDOWN 0 /* not started */ +#define LNET_RC_STATE_RUNNING 1 /* started up OK */ +#define LNET_RC_STATE_STOPTHREAD 2 /* telling thread to stop */ +#define LNET_RC_STATE_UNLINKING 3 /* unlinking RC MD */ +#define LNET_RC_STATE_UNLINKED 4 /* RC's MD has been unlinked */ + +typedef struct +{ + /* Stuff initialised at LNetInit() */ + int ln_init; /* LNetInit() called? */ + int ln_refcount; /* LNetNIInit/LNetNIFini counter */ + int ln_niinit_self; /* Have I called LNetNIInit myself? */ + + int ln_ptlcompat; /* do I support talking to portals? */ -#ifdef PTL_USE_LIB_FREELIST - lib_freelist_t ni_free_mes; - lib_freelist_t ni_free_msgs; - lib_freelist_t ni_free_mds; - lib_freelist_t ni_free_eqs; + struct list_head ln_lnds; /* registered LNDs */ + +#ifdef __KERNEL__ + spinlock_t ln_lock; + cfs_waitq_t ln_waitq; + struct semaphore ln_api_mutex; + struct semaphore ln_lnd_mutex; +#else +# if !HAVE_LIBPTHREAD + int ln_lock; + int ln_api_mutex; + int ln_lnd_mutex; +# else + pthread_cond_t ln_cond; + pthread_mutex_t ln_lock; + pthread_mutex_t ln_api_mutex; + pthread_mutex_t ln_lnd_mutex; +# endif #endif - struct list_head ni_active_msgs; - struct list_head ni_active_mds; - struct list_head ni_active_eqs; + /* Stuff initialised at LNetNIInit() */ + + int ln_shutdown; /* shutdown in progress */ + int ln_nportals; /* # portals */ + lnet_portal_t *ln_portals; /* the vector of portals */ + + lnet_pid_t ln_pid; /* requested pid */ + + struct list_head ln_nis; /* LND instances */ + lnet_ni_t *ln_loni; /* the loopback NI */ + lnet_ni_t *ln_eqwaitni; /* NI to wait for events in */ + struct list_head ln_zombie_nis; /* dying LND instances */ + int ln_nzombie_nis; /* # of NIs to wait for */ + struct list_head ln_remote_nets; /* remote networks with routes to them */ + __u64 ln_remote_nets_version; /* validity stamp */ + + struct list_head ln_routers; /* list of all known routers */ + __u64 ln_routers_version; /* validity stamp */ + + struct list_head *ln_peer_hash; /* NID->peer hash */ + int ln_npeers; /* # peers extant */ + int ln_peertable_version; /* /proc validity stamp */ + + int ln_routing; /* am I a router? 
*/ + lnet_rtrbufpool_t ln_rtrpools[LNET_NRBPOOLS]; /* router buffer pools */ + + int ln_lh_hash_size; /* size of lib handle hash table */ + struct list_head *ln_lh_hash_table; /* all extant lib handles, this interface */ + __u64 ln_next_object_cookie; /* cookie generator */ + __u64 ln_interface_cookie; /* uniquely identifies this ni in this epoch */ + + char *ln_network_tokens; /* space for network names */ + int ln_network_tokens_nob; + + int ln_testprotocompat; /* test protocol compatibility flags */ + + struct list_head ln_finalizeq; /* msgs waiting to complete finalizing */ #ifdef __KERNEL__ - spinlock_t ni_lock; - cfs_waitq_t ni_waitq; + void **ln_finalizers; /* threads doing finalization */ + int ln_nfinalizers; /* max # threads finalizing */ #else - pthread_mutex_t ni_mutex; - pthread_cond_t ni_cond; + int ln_finalizing; #endif -} lib_ni_t; + struct list_head ln_test_peers; /* failure simulation */ + lnet_handle_md_t ln_ping_target_md; + lnet_handle_eq_t ln_ping_target_eq; + lnet_ping_info_t *ln_ping_info; -typedef struct lib_nal -{ - /* lib-level interface state */ - lib_ni_t libnal_ni; - - /* NAL-private data */ - void *libnal_data; - - /* - * send: Sends a preformatted header and payload data to a - * specified remote process. The payload is scattered over 'niov' - * fragments described by iov, starting at 'offset' for 'mlen' - * bytes. - * NB the NAL may NOT overwrite iov. - * PTL_OK on success => NAL has committed to send and will call - * lib_finalize on completion - */ - ptl_err_t (*libnal_send) - (struct lib_nal *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen); +#ifdef __KERNEL__ + int ln_rc_state; /* router checker startup/shutdown state */ + struct semaphore ln_rc_signal; /* serialise startup/shutdown */ + lnet_handle_eq_t ln_rc_eqh; /* router checker's event queue */ +#endif - /* as send, but with a set of page fragments (NULL if not supported) */ - ptl_err_t (*libnal_send_pages) - (struct lib_nal *nal, void *private, lib_msg_t * cookie, - ptl_hdr_t * hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, ptl_kiov_t *iov, - size_t offset, size_t mlen); - /* - * recv: Receives an incoming message from a remote process. The - * payload is to be received into the scattered buffer of 'niov' - * fragments described by iov, starting at 'offset' for 'mlen' - * bytes. Payload bytes after 'mlen' up to 'rlen' are to be - * discarded. - * NB the NAL may NOT overwrite iov. - * PTL_OK on success => NAL has committed to receive and will call - * lib_finalize on completion - */ - ptl_err_t (*libnal_recv) - (struct lib_nal *nal, void *private, lib_msg_t * cookie, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen, size_t rlen); - - /* as recv, but with a set of page fragments (NULL if not supported) */ - ptl_err_t (*libnal_recv_pages) - (struct lib_nal *nal, void *private, lib_msg_t * cookie, - unsigned int niov, ptl_kiov_t *iov, - size_t offset, size_t mlen, size_t rlen); - - /* - * (un)map: Tell the NAL about some memory it will access. - * *addrkey passed to libnal_unmap() is what libnal_map() set it to. - * type of *iov depends on options. - * Set to NULL if not required. 
- */ - ptl_err_t (*libnal_map) - (struct lib_nal *nal, unsigned int niov, struct iovec *iov, - void **addrkey); - void (*libnal_unmap) - (struct lib_nal *nal, unsigned int niov, struct iovec *iov, - void **addrkey); - - /* as (un)map, but with a set of page fragments */ - ptl_err_t (*libnal_map_pages) - (struct lib_nal *nal, unsigned int niov, ptl_kiov_t *iov, - void **addrkey); - void (*libnal_unmap_pages) - (struct lib_nal *nal, unsigned int niov, ptl_kiov_t *iov, - void **addrkey); - - /* Calculate a network "distance" to given node */ - int (*libnal_dist) (struct lib_nal *nal, ptl_nid_t nid, unsigned long *dist); -} lib_nal_t; - -typedef struct /* loopback descriptor */ -{ - unsigned int lod_type; - unsigned int lod_niov; - size_t lod_offset; - size_t lod_nob; - union { - struct iovec *iov; - ptl_kiov_t *kiov; - } lod_iov; -} lo_desc_t; +#ifdef LNET_USE_LIB_FREELIST + lnet_freelist_t ln_free_mes; + lnet_freelist_t ln_free_msgs; + lnet_freelist_t ln_free_mds; + lnet_freelist_t ln_free_eqs; +#endif + struct list_head ln_active_msgs; + struct list_head ln_active_mds; + struct list_head ln_active_eqs; -#define LOD_IOV 0xeb105 -#define LOD_KIOV 0xeb106 + lnet_counters_t ln_counters; +} lnet_t; #endif diff --git a/lnet/include/lnet/linux/Makefile.am b/lnet/include/lnet/linux/Makefile.am index b6e7daf..409e159 100644 --- a/lnet/include/lnet/linux/Makefile.am +++ b/lnet/include/lnet/linux/Makefile.am @@ -1 +1 @@ -EXTRA_DIST := lib-p30.h lib-types.h p30.h +EXTRA_DIST := lib-lnet.h lib-types.h lnet.h api-support.h diff --git a/lnet/include/lnet/linux/api-support.h b/lnet/include/lnet/linux/api-support.h new file mode 100644 index 0000000..bec6e34 --- /dev/null +++ b/lnet/include/lnet/linux/api-support.h @@ -0,0 +1,39 @@ +#ifndef __LINUX_API_SUPPORT_H__ +#define __LINUX_API_SUPPORT_H__ + +#ifndef __LNET_API_SUPPORT_H__ +#error Do not #include this file directly. #include instead +#endif + +#ifndef __KERNEL__ +# include +# include +# include +# include + +/* Lots of POSIX dependencies to support PtlEQWait_timeout */ +# include +# include +# include + +#ifdef HAVE_LIBREADLINE +#define READLINE_LIBRARY +#include + +/* readline.h pulls in a #define that conflicts with one in libcfs.h */ +#undef RETURN + +/* completion_matches() is #if 0-ed out in modern glibc */ +#ifndef completion_matches +# define completion_matches rl_completion_matches +#endif + +#endif /* HAVE_LIBREADLINE */ + +extern void using_history(void); +extern void stifle_history(int); +extern void add_history(char *); + +#endif /* !__KERNEL__ */ + +#endif diff --git a/lnet/include/lnet/linux/lib-lnet.h b/lnet/include/lnet/linux/lib-lnet.h index 1c88080..9c38fd3 100644 --- a/lnet/include/lnet/linux/lib-lnet.h +++ b/lnet/include/lnet/linux/lib-lnet.h @@ -1,20 +1,49 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: */ -#ifndef __PORTALS_LINUX_LIB_P30_H__ -#define __PORTALS_LINUX_LIB_P30_H__ +#ifndef __LNET_LINUX_LIB_LNET_H__ +#define __LNET_LINUX_LIB_LNET_H__ -#ifndef __PORTALS_LIB_P30_H__ -#error Do not #include this file directly. #include instead +#ifndef __LNET_LIB_LNET_H__ +#error Do not #include this file directly. #include instead #endif #ifdef __KERNEL__ # include # include -#else +# include +# include + +static inline __u64 +lnet_page2phys (struct page *p) +{ + /* compiler optimizer will elide unused branches */ + + switch (sizeof(typeof(page_to_phys(p)))) { + case 4: + /* page_to_phys returns a 32 bit physical address. 
This must + * be a 32 bit machine with <= 4G memory and we must ensure we + * don't sign extend when converting to 64 bits. */ + return (unsigned long)page_to_phys(p); + + case 8: + /* page_to_phys returns a 64 bit physical address :) */ + return page_to_phys(p); + + default: + LBUG(); + return 0; + } +} + +#else /* __KERNEL__ */ # include # include -# include +# ifdef HAVE_LIBPTHREAD +# include +# endif #endif -#endif +#define LNET_ROUTER + +#endif /* __LNET_LINUX_LIB_LNET_H__ */ diff --git a/lnet/include/lnet/linux/lib-p30.h b/lnet/include/lnet/linux/lib-p30.h deleted file mode 100644 index 1c88080..0000000 --- a/lnet/include/lnet/linux/lib-p30.h +++ /dev/null @@ -1,20 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - */ -#ifndef __PORTALS_LINUX_LIB_P30_H__ -#define __PORTALS_LINUX_LIB_P30_H__ - -#ifndef __PORTALS_LIB_P30_H__ -#error Do not #include this file directly. #include instead -#endif - -#ifdef __KERNEL__ -# include -# include -#else -# include -# include -# include -#endif - -#endif diff --git a/lnet/include/lnet/linux/lib-types.h b/lnet/include/lnet/linux/lib-types.h index f896b4b..7d28839 100644 --- a/lnet/include/lnet/linux/lib-types.h +++ b/lnet/include/lnet/linux/lib-types.h @@ -1,11 +1,11 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: */ -#ifndef __PORTALS_LINUX_LIB_TYPES_H__ -#define __PORTALS_LINUX_LIB_TYPES_H__ +#ifndef __LNET_LINUX_LIB_TYPES_H__ +#define __LNET_LINUX_LIB_TYPES_H__ -#ifndef __PORTALS_LIB_TYPES_H__ -#error Do not #include this file directly. #include instead +#ifndef __LNET_LIB_TYPES_H__ +#error Do not #include this file directly. #include instead #endif #ifdef __KERNEL__ @@ -13,7 +13,7 @@ # include # include #else -# define PTL_USE_LIB_FREELIST +# define LNET_USE_LIB_FREELIST # include #endif diff --git a/lnet/include/lnet/linux/lnet.h b/lnet/include/lnet/linux/lnet.h index b074837..b1aab84 100644 --- a/lnet/include/lnet/linux/lnet.h +++ b/lnet/include/lnet/linux/lnet.h @@ -1,15 +1,15 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: */ -#ifndef __PORTALS_LINUX_P30_H__ -#define __PORTALS_LINUX_P30_H__ +#ifndef __LNET_LINUX_LNET_H__ +#define __LNET_LINUX_LNET_H__ -#ifndef __PORTALS_P30_H__ -#error Do not #include this file directly. #include instead +#ifndef __LNET_H__ +#error Do not #include this file directly. #include instead #endif /* - * p30.h + * lnet.h * * User application interface file */ diff --git a/lnet/include/lnet/linux/p30.h b/lnet/include/lnet/linux/p30.h deleted file mode 100644 index b074837..0000000 --- a/lnet/include/lnet/linux/p30.h +++ /dev/null @@ -1,25 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - */ -#ifndef __PORTALS_LINUX_P30_H__ -#define __PORTALS_LINUX_P30_H__ - -#ifndef __PORTALS_P30_H__ -#error Do not #include this file directly. 
#include instead -#endif - -/* - * p30.h - * - * User application interface file - */ - -#if defined (__KERNEL__) -#include -#include -#else -#include -#include -#endif - -#endif diff --git a/lnet/include/lnet/lnet.h b/lnet/include/lnet/lnet.h index 9be79b8..819c524 100644 --- a/lnet/include/lnet/lnet.h +++ b/lnet/include/lnet/lnet.h @@ -1,25 +1,25 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: */ -#ifndef __PORTALS_P30_H__ -#define __PORTALS_P30_H__ - -#include "build_check.h" +#ifndef __LNET_H__ +#define __LNET_H__ /* - * p30.h + * lnet.h * * User application interface file */ #if defined(__linux__) -#include +#include #elif defined(__APPLE__) -#include +#include +#elif defined(__WINNT__) +#include #else #error Unsupported Operating System #endif -#include -#include +#include +#include #endif diff --git a/lnet/include/lnet/lnetctl.h b/lnet/include/lnet/lnetctl.h index cce160e..cb66b9d 100644 --- a/lnet/include/lnet/lnetctl.h +++ b/lnet/include/lnet/lnetctl.h @@ -21,25 +21,27 @@ #ifndef _PTLCTL_H_ #define _PTLCTL_H_ -#include +#include #include #include -#define PORTALS_DEV_ID 0 -#define PORTALS_DEV_PATH "/dev/portals" +#define LNET_DEV_ID 0 +#define LNET_DEV_PATH "/dev/lnet" +#define LNET_DEV_MAJOR 10 +#define LNET_DEV_MINOR 240 #define OBD_DEV_ID 1 #define OBD_DEV_PATH "/dev/obd" +#define OBD_DEV_MAJOR 10 +#define OBD_DEV_MINOR 241 #define SMFS_DEV_ID 2 #define SMFS_DEV_PATH "/dev/snapdev" - -int ptl_name2nal(char *str); -int ptl_parse_ipaddr (__u32 *ipaddrp, char *str); -int ptl_parse_anynid (ptl_nid_t *nidp, char *str); -int ptl_parse_nid (ptl_nid_t *nidp, char *str); -char * ptl_nid2str (char *buffer, ptl_nid_t nid); +#define SMFS_DEV_MAJOR 10 +#define SMFS_DEV_MINOR 242 int ptl_initialize(int argc, char **argv); int jt_ptl_network(int argc, char **argv); +int jt_ptl_list_nids(int argc, char **argv); +int jt_ptl_which_nid(int argc, char **argv); int jt_ptl_print_interfaces(int argc, char **argv); int jt_ptl_add_interface(int argc, char **argv); int jt_ptl_del_interface(int argc, char **argv); @@ -47,12 +49,11 @@ int jt_ptl_print_peers (int argc, char **argv); int jt_ptl_add_peer (int argc, char **argv); int jt_ptl_del_peer (int argc, char **argv); int jt_ptl_print_connections (int argc, char **argv); -int jt_ptl_connect(int argc, char **argv); int jt_ptl_disconnect(int argc, char **argv); int jt_ptl_push_connection(int argc, char **argv); int jt_ptl_print_active_txs(int argc, char **argv); int jt_ptl_ping(int argc, char **argv); -int jt_ptl_shownid(int argc, char **argv); +int jt_ptl_ping_test(int argc, char **argv); int jt_ptl_mynid(int argc, char **argv); int jt_ptl_add_uuid(int argc, char **argv); int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */ @@ -63,8 +64,8 @@ int jt_ptl_del_route (int argc, char **argv); int jt_ptl_notify_router (int argc, char **argv); int jt_ptl_print_routes (int argc, char **argv); int jt_ptl_fail_nid (int argc, char **argv); -int jt_ptl_loopback (int argc, char **argv); int jt_ptl_lwt(int argc, char **argv); +int jt_ptl_testprotocompat(int argc, char **argv); int jt_ptl_memhog(int argc, char **argv); int dbg_initialize(int argc, char **argv); @@ -79,12 +80,10 @@ int jt_dbg_mark_debug_buf(int argc, char **argv); int jt_dbg_modules(int argc, char **argv); int jt_dbg_panic(int argc, char **argv); -int ptl_set_cfg_record_cb(cfg_record_cb_t cb); - /* l_ioctl.c */ typedef int (ioc_handler_t)(int dev_id, unsigned int opc, void *buf); void set_ioc_handler(ioc_handler_t 
*handler); -int register_ioc_dev(int dev_id, const char * dev_name); +int register_ioc_dev(int dev_id, const char * dev_name, int major, int minor); void unregister_ioc_dev(int dev_id); int set_ioctl_dump(char * file); int l_ioctl(int dev_id, unsigned int opc, void *buf); diff --git a/lnet/include/lnet/myrnal.h b/lnet/include/lnet/myrnal.h deleted file mode 100644 index 13790f7..0000000 --- a/lnet/include/lnet/myrnal.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef MYRNAL_H -#define MYRNAL_H - -#define MAX_ARGS_LEN (256) -#define MAX_RET_LEN (128) -#define MYRNAL_MAX_ACL_SIZE (64) -#define MYRNAL_MAX_PTL_SIZE (64) - -#define P3CMD (100) -#define P3SYSCALL (200) -#define P3REGISTER (300) - -enum { PTL_MLOCKALL }; - -typedef struct { - void *args; - size_t args_len; - void *ret; - size_t ret_len; - int p3cmd; -} myrnal_forward_t; - -#endif /* MYRNAL_H */ diff --git a/lnet/include/lnet/nal.h b/lnet/include/lnet/nal.h deleted file mode 100644 index aad611d..0000000 --- a/lnet/include/lnet/nal.h +++ /dev/null @@ -1,88 +0,0 @@ -#ifndef _NAL_H_ -#define _NAL_H_ - -#include "build_check.h" - -/* - * p30/nal.h - * - * The API side NAL declarations - */ - -#include - -typedef struct nal_t nal_t; - -struct nal_t { - /* common interface state */ - int nal_refct; - ptl_handle_ni_t nal_handle; - - /* NAL-private data */ - void *nal_data; - - /* NAL API implementation - * NB only nal_ni_init needs to be set when the NAL registers itself */ - int (*nal_ni_init) (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *req, ptl_ni_limits_t *actual); - - void (*nal_ni_fini) (nal_t *nal); - - int (*nal_get_id) (nal_t *nal, ptl_process_id_t *id); - int (*nal_ni_status) (nal_t *nal, ptl_sr_index_t register, ptl_sr_value_t *status); - int (*nal_ni_dist) (nal_t *nal, ptl_process_id_t *id, unsigned long *distance); - int (*nal_fail_nid) (nal_t *nal, ptl_nid_t nid, unsigned int threshold); - int (*nal_loopback) (nal_t *nal, int set, int *enabled); - - int (*nal_me_attach) (nal_t *nal, ptl_pt_index_t portal, - ptl_process_id_t match_id, - ptl_match_bits_t match_bits, ptl_match_bits_t ignore_bits, - ptl_unlink_t unlink, ptl_ins_pos_t pos, - ptl_handle_me_t *handle); - int (*nal_me_insert) (nal_t *nal, ptl_handle_me_t *me, - ptl_process_id_t match_id, - ptl_match_bits_t match_bits, ptl_match_bits_t ignore_bits, - ptl_unlink_t unlink, ptl_ins_pos_t pos, - ptl_handle_me_t *handle); - int (*nal_me_unlink) (nal_t *nal, ptl_handle_me_t *me); - - int (*nal_md_attach) (nal_t *nal, ptl_handle_me_t *me, - ptl_md_t *md, ptl_unlink_t unlink, - ptl_handle_md_t *handle); - int (*nal_md_bind) (nal_t *nal, - ptl_md_t *md, ptl_unlink_t unlink, - ptl_handle_md_t *handle); - int (*nal_md_unlink) (nal_t *nal, ptl_handle_md_t *md); - int (*nal_md_update) (nal_t *nal, ptl_handle_md_t *md, - ptl_md_t *old_md, ptl_md_t *new_md, - ptl_handle_eq_t *testq); - - int (*nal_eq_alloc) (nal_t *nal, ptl_size_t count, - ptl_eq_handler_t handler, - ptl_handle_eq_t *handle); - int (*nal_eq_free) (nal_t *nal, ptl_handle_eq_t *eq); - int (*nal_eq_poll) (nal_t *nal, - ptl_handle_eq_t *eqs, int neqs, int timeout, - ptl_event_t *event, int *which); - - int (*nal_ace_entry) (nal_t *nal, ptl_ac_index_t index, - ptl_process_id_t match_id, ptl_pt_index_t portal); - - int (*nal_put) (nal_t *nal, ptl_handle_md_t *md, ptl_ack_req_t ack, - ptl_process_id_t *target, ptl_pt_index_t portal, - ptl_ac_index_t ac, ptl_match_bits_t match, - ptl_size_t offset, ptl_hdr_data_t hdr_data); - int (*nal_get) (nal_t *nal, ptl_handle_md_t *md, - ptl_process_id_t *target, ptl_pt_index_t 
portal, - ptl_ac_index_t ac, ptl_match_bits_t match, - ptl_size_t offset); -}; - -extern nal_t *ptl_hndl2nal(ptl_handle_any_t *any); - -#ifdef __KERNEL__ -extern int ptl_register_nal(ptl_interface_t interface, nal_t *nal); -extern void ptl_unregister_nal(ptl_interface_t interface); -#endif - -#endif diff --git a/lnet/include/lnet/nalids.h b/lnet/include/lnet/nalids.h deleted file mode 100644 index 55a991b..0000000 --- a/lnet/include/lnet/nalids.h +++ /dev/null @@ -1,2 +0,0 @@ -#include "build_check.h" - diff --git a/lnet/include/lnet/p30.h b/lnet/include/lnet/p30.h deleted file mode 100644 index 9be79b8..0000000 --- a/lnet/include/lnet/p30.h +++ /dev/null @@ -1,25 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - */ -#ifndef __PORTALS_P30_H__ -#define __PORTALS_P30_H__ - -#include "build_check.h" - -/* - * p30.h - * - * User application interface file - */ -#if defined(__linux__) -#include -#elif defined(__APPLE__) -#include -#else -#error Unsupported Operating System -#endif - -#include -#include - -#endif diff --git a/lnet/include/lnet/ptlctl.h b/lnet/include/lnet/ptlctl.h deleted file mode 100644 index cce160e..0000000 --- a/lnet/include/lnet/ptlctl.h +++ /dev/null @@ -1,96 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * This file is part of Portals, http://www.sf.net/projects/lustre/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * header for libptlctl.a - */ -#ifndef _PTLCTL_H_ -#define _PTLCTL_H_ - -#include -#include -#include - -#define PORTALS_DEV_ID 0 -#define PORTALS_DEV_PATH "/dev/portals" -#define OBD_DEV_ID 1 -#define OBD_DEV_PATH "/dev/obd" -#define SMFS_DEV_ID 2 -#define SMFS_DEV_PATH "/dev/snapdev" - -int ptl_name2nal(char *str); -int ptl_parse_ipaddr (__u32 *ipaddrp, char *str); -int ptl_parse_anynid (ptl_nid_t *nidp, char *str); -int ptl_parse_nid (ptl_nid_t *nidp, char *str); -char * ptl_nid2str (char *buffer, ptl_nid_t nid); - -int ptl_initialize(int argc, char **argv); -int jt_ptl_network(int argc, char **argv); -int jt_ptl_print_interfaces(int argc, char **argv); -int jt_ptl_add_interface(int argc, char **argv); -int jt_ptl_del_interface(int argc, char **argv); -int jt_ptl_print_peers (int argc, char **argv); -int jt_ptl_add_peer (int argc, char **argv); -int jt_ptl_del_peer (int argc, char **argv); -int jt_ptl_print_connections (int argc, char **argv); -int jt_ptl_connect(int argc, char **argv); -int jt_ptl_disconnect(int argc, char **argv); -int jt_ptl_push_connection(int argc, char **argv); -int jt_ptl_print_active_txs(int argc, char **argv); -int jt_ptl_ping(int argc, char **argv); -int jt_ptl_shownid(int argc, char **argv); -int jt_ptl_mynid(int argc, char **argv); -int jt_ptl_add_uuid(int argc, char **argv); -int jt_ptl_add_uuid_old(int argc, char **argv); /* backwards compatibility */ -int jt_ptl_close_uuid(int argc, char **argv); -int jt_ptl_del_uuid(int argc, char **argv); -int jt_ptl_add_route (int argc, char **argv); -int jt_ptl_del_route (int argc, char **argv); -int jt_ptl_notify_router (int argc, char **argv); -int jt_ptl_print_routes (int argc, char **argv); -int jt_ptl_fail_nid (int argc, char **argv); -int jt_ptl_loopback (int argc, char **argv); -int jt_ptl_lwt(int argc, char **argv); -int jt_ptl_memhog(int argc, char **argv); - -int dbg_initialize(int argc, char **argv); -int jt_dbg_filter(int argc, char **argv); -int jt_dbg_show(int argc, char **argv); -int jt_dbg_list(int argc, char **argv); -int jt_dbg_debug_kernel(int argc, char **argv); -int jt_dbg_debug_daemon(int argc, char **argv); -int jt_dbg_debug_file(int argc, char **argv); -int jt_dbg_clear_debug_buf(int argc, char **argv); -int jt_dbg_mark_debug_buf(int argc, char **argv); -int jt_dbg_modules(int argc, char **argv); -int jt_dbg_panic(int argc, char **argv); - -int ptl_set_cfg_record_cb(cfg_record_cb_t cb); - -/* l_ioctl.c */ -typedef int (ioc_handler_t)(int dev_id, unsigned int opc, void *buf); -void set_ioc_handler(ioc_handler_t *handler); -int register_ioc_dev(int dev_id, const char * dev_name); -void unregister_ioc_dev(int dev_id); -int set_ioctl_dump(char * file); -int l_ioctl(int dev_id, unsigned int opc, void *buf); -int parse_dump(char * dump_file, ioc_handler_t ioc_func); -int jt_ioc_dump(int argc, char **argv); -extern char *dump_filename; -int dump(int dev_id, unsigned int opc, void *buf); - -#endif diff --git a/lnet/include/lnet/ptllnd.h b/lnet/include/lnet/ptllnd.h new file mode 100755 index 0000000..c52480c --- /dev/null +++ b/lnet/include/lnet/ptllnd.h @@ -0,0 +1,77 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved. + * Author: PJ Kirner + * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. + * + * This file is confidential source code owned by Cluster File Systems. 
+ * No viewing, modification, compilation, redistribution, or any other
+ * form of use is permitted except through a signed license agreement.
+ *
+ * If you have not signed such an agreement, then you have no rights to
+ * this file. Please destroy it immediately and contact CFS.
+ *
+ */
+
+/*
+ * The PTLLND was designed to support Portals with both Lustre and
+ * non-Lustre UNLINK semantics. However, for now, the two targets
+ * (Cray Portals on the XT3, and Lustre Portals used for testing) both
+ * have Lustre UNLINK semantics, so this is defined by default.
+ */
+#define LUSTRE_PORTALS_UNLINK_SEMANTICS
+
+
+#ifdef _USING_LUSTRE_PORTALS_
+
+/* NIDs are 64-bits on Lustre Portals */
+#define FMT_NID LPU64
+#define FMT_PID "%d"
+
+/* When using Lustre Portals, Lustre completion semantics are implicit */
+#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS 0
+
+#else /* _USING_CRAY_PORTALS_ */
+
+/* Explicit NULL function pointer for EQ handler */
+#define PTL_EQ_HANDLER_NONE 0
+
+/* NIDs are integers on Cray Portals */
+#define FMT_NID "%u"
+#define FMT_PID "%d"
+
+/* When using Cray Portals this is defined in the Cray Portals header */
+/*#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS */
+
+/* Can compare handles directly on Cray Portals */
+#define PtlHandleIsEqual(a,b) ((a) == (b))
+
+/* Different error types on Cray Portals */
+#define ptl_err_t ptl_ni_fail_t
+
+/*
+ * Cray Portals has no maximum number of IOVs. The maximum is limited
+ * only by memory and the size of the int parameters (2^31-1).
+ * Lustre only really requires that the underlying implementation
+ * support at least LNET_MAX_IOV, so for Cray Portals we can safely
+ * just use that value here.
+ *
+ */
+#define PTL_MD_MAX_IOV LNET_MAX_IOV
+
+#endif
+
+#define FMT_PTLID "ptlid:"FMT_PID"-"FMT_NID
+
+/* Align incoming small request messages to an 8 byte boundary if this is
+ * supported to avoid alignment issues on some architectures */
+#ifndef PTL_MD_LOCAL_ALIGN8
+# define PTL_MD_LOCAL_ALIGN8 0
+#endif
diff --git a/lnet/include/lnet/ptllnd_wire.h b/lnet/include/lnet/ptllnd_wire.h
new file mode 100644
index 0000000..e5b5410
--- /dev/null
+++ b/lnet/include/lnet/ptllnd_wire.h
@@ -0,0 +1,93 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
+ * Author: PJ Kirner
+ *
+ * This file is part of the Lustre file system, http://www.lustre.org
+ * Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ * This file is confidential source code owned by Cluster File Systems.
+ * No viewing, modification, compilation, redistribution, or any other
+ * form of use is permitted except through a signed license agreement.
+ *
+ * If you have not signed such an agreement, then you have no rights to
+ * this file. Please destroy it immediately and contact CFS.
+ *
+ */
+
+/************************************************************************
+ * Tunable defaults that {u,k}lnds/ptllnd should have in common.
+ */
+
+#define PTLLND_PORTAL        9   /* The same portal PTLRPC uses when talking to cray portals */
+#define PTLLND_PID           9   /* The Portals PID */
+#define PTLLND_PEERCREDITS   8   /* concurrent sends to 1 peer */
+#define PTLLND_MAX_MSG_SIZE  512 /* Maximum message size */
+
+
+/************************************************************************
+ * Portals LND wire message format.
+ * These are sent in sender's byte order (i.e. receiver flips).
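+ *
+ * Receive-side sketch (illustrative only, assuming the libcfs __swab32()
+ * helper; the actual unpacking code lives in the ptllnd sources, not in
+ * this header): if msg->ptlm_magic == __swab32(PTLLND_MSG_MAGIC), the
+ * sender is opposite-endian and every multi-byte field must be
+ * byte-swapped before use.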
+ */ + +#define PTL_RESERVED_MATCHBITS 0x100 /* below this value is reserved + * above is for bulk data transfer */ +#define LNET_MSG_MATCHBITS 0 /* the value for the message channel */ + +typedef struct +{ + lnet_hdr_t kptlim_hdr; /* portals header */ + char kptlim_payload[0]; /* piggy-backed payload */ +} WIRE_ATTR kptl_immediate_msg_t; + +typedef struct +{ + lnet_hdr_t kptlrm_hdr; /* portals header */ + __u64 kptlrm_matchbits; /* matchbits */ +} WIRE_ATTR kptl_rdma_msg_t; + +typedef struct +{ + __u64 kptlhm_matchbits; /* matchbits */ + __u32 kptlhm_max_msg_size; /* max message size */ +} WIRE_ATTR kptl_hello_msg_t; + +typedef struct +{ + /* First 2 fields fixed FOR ALL TIME */ + __u32 ptlm_magic; /* I'm a Portals LND message */ + __u16 ptlm_version; /* this is my version number */ + __u8 ptlm_type; /* the message type */ + __u8 ptlm_credits; /* returned credits */ + __u32 ptlm_nob; /* # bytes in whole message */ + __u32 ptlm_cksum; /* checksum (0 == no checksum) */ + __u64 ptlm_srcnid; /* sender's NID */ + __u64 ptlm_srcstamp; /* sender's incarnation */ + __u64 ptlm_dstnid; /* destination's NID */ + __u64 ptlm_dststamp; /* destination's incarnation */ + __u32 ptlm_srcpid; /* sender's PID */ + __u32 ptlm_dstpid; /* destination's PID */ + + union { + kptl_immediate_msg_t immediate; + kptl_rdma_msg_t rdma; + kptl_hello_msg_t hello; + } WIRE_ATTR ptlm_u; + +} kptl_msg_t; + +#define PTLLND_MSG_MAGIC LNET_PROTO_PTL_MAGIC +#define PTLLND_MSG_VERSION 0x04 + +#define PTLLND_RDMA_OK 0x00 +#define PTLLND_RDMA_FAIL 0x01 + +#define PTLLND_MSG_TYPE_INVALID 0x00 +#define PTLLND_MSG_TYPE_PUT 0x01 +#define PTLLND_MSG_TYPE_GET 0x02 +#define PTLLND_MSG_TYPE_IMMEDIATE 0x03 /* No bulk data xfer*/ +#define PTLLND_MSG_TYPE_NOOP 0x04 +#define PTLLND_MSG_TYPE_HELLO 0x05 +#define PTLLND_MSG_TYPE_NAK 0x06 + diff --git a/lnet/include/lnet/socklnd.h b/lnet/include/lnet/socklnd.h index 27e6f8e..301f8a8 100644 --- a/lnet/include/lnet/socklnd.h +++ b/lnet/include/lnet/socklnd.h @@ -1,14 +1,53 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * + * * * #defines shared between socknal implementation and utilities */ +#ifndef __LNET_LNET_SOCKLND_H__ +#define __LNET_LNET_SOCKLND_H__ -#define SOCKNAL_CONN_NONE (-1) -#define SOCKNAL_CONN_ANY 0 -#define SOCKNAL_CONN_CONTROL 1 -#define SOCKNAL_CONN_BULK_IN 2 -#define SOCKNAL_CONN_BULK_OUT 3 -#define SOCKNAL_CONN_NTYPES 4 +#include +#include + +#define SOCKLND_CONN_NONE (-1) +#define SOCKLND_CONN_ANY 0 +#define SOCKLND_CONN_CONTROL 1 +#define SOCKLND_CONN_BULK_IN 2 +#define SOCKLND_CONN_BULK_OUT 3 +#define SOCKLND_CONN_NTYPES 4 + +typedef struct { + __u32 kshm_magic; /* magic number of socklnd message */ + __u32 kshm_version; /* version of socklnd message */ + lnet_nid_t kshm_src_nid; /* sender's nid */ + lnet_nid_t kshm_dst_nid; /* destination nid */ + lnet_pid_t kshm_src_pid; /* sender's pid */ + lnet_pid_t kshm_dst_pid; /* destination pid */ + __u64 kshm_src_incarnation; /* sender's incarnation */ + __u64 kshm_dst_incarnation; /* destination's incarnation */ + __u32 kshm_ctype; /* connection type */ + __u32 kshm_nips; /* # IP addrs */ + __u32 kshm_ips[0]; /* IP addrs */ +} WIRE_ATTR ksock_hello_msg_t; + +typedef struct { + lnet_hdr_t ksnm_hdr; /* lnet hdr */ + char ksnm_payload[0];/* lnet payload */ +} WIRE_ATTR ksock_lnet_msg_t; + +typedef struct { + __u32 ksm_type; /* type of socklnd message */ + __u32 ksm_csum; /* checksum if != 0 */ + __u64 ksm_zc_req_cookie; /* ack required if != 0 */ + __u64 ksm_zc_ack_cookie; 
/* ack if != 0 */ + union { + ksock_lnet_msg_t lnetmsg; /* lnet message, it's empty if it's NOOP */ + } WIRE_ATTR ksm_u; +} WIRE_ATTR ksock_msg_t; + +#define KSOCK_MSG_NOOP 0xc0 /* ksm_u empty */ +#define KSOCK_MSG_LNET 0xc1 /* lnet msg */ + +#endif diff --git a/lnet/include/lnet/stringtab.h b/lnet/include/lnet/stringtab.h deleted file mode 100644 index 33e4375..0000000 --- a/lnet/include/lnet/stringtab.h +++ /dev/null @@ -1,3 +0,0 @@ -/* - * stringtab.h - */ diff --git a/lnet/include/lnet/types.h b/lnet/include/lnet/types.h index f07534b..11ea5de 100644 --- a/lnet/include/lnet/types.h +++ b/lnet/include/lnet/types.h @@ -1,127 +1,126 @@ -#ifndef _P30_TYPES_H_ -#define _P30_TYPES_H_ - -#include "build_check.h" +#ifndef __LNET_TYPES_H__ +#define __LNET_TYPES_H__ #include -#include - -/* This implementation uses the same type for API function return codes and - * the completion status in an event */ -#define PTL_NI_OK PTL_OK -typedef ptl_err_t ptl_ni_fail_t; - -typedef __u32 ptl_uid_t; -typedef __u32 ptl_jid_t; -typedef __u64 ptl_nid_t; -typedef __u32 ptl_netid_t; -typedef __u32 ptl_pid_t; -typedef __u32 ptl_pt_index_t; -typedef __u32 ptl_ac_index_t; -typedef __u64 ptl_match_bits_t; -typedef __u64 ptl_hdr_data_t; -typedef __u32 ptl_size_t; - -#define PTL_TIME_FOREVER (-1) + +#define LNET_RESERVED_PORTAL 0 /* portals reserved for lnet's own use */ + +typedef __u64 lnet_nid_t; +typedef __u32 lnet_pid_t; + +#define LNET_NID_ANY ((lnet_nid_t) -1) +#define LNET_PID_ANY ((lnet_pid_t) -1) + +#ifdef CRAY_XT3 +typedef __u32 lnet_uid_t; +#define LNET_UID_ANY ((lnet_uid_t) -1) +#endif + +#define LNET_PID_RESERVED 0xf0000000 /* reserved bits in PID */ +#define LNET_PID_USERFLAG 0x80000000 /* set in userspace peers */ + +#define LNET_TIME_FOREVER (-1) typedef struct { - unsigned long nal_idx; /* which network interface */ - __u64 cookie; /* which thing on that interface */ -} ptl_handle_any_t; + __u64 cookie; +} lnet_handle_any_t; -typedef ptl_handle_any_t ptl_handle_ni_t; -typedef ptl_handle_any_t ptl_handle_eq_t; -typedef ptl_handle_any_t ptl_handle_md_t; -typedef ptl_handle_any_t ptl_handle_me_t; +typedef lnet_handle_any_t lnet_handle_eq_t; +typedef lnet_handle_any_t lnet_handle_md_t; +typedef lnet_handle_any_t lnet_handle_me_t; -#define PTL_INVALID_HANDLE \ - ((const ptl_handle_any_t){.nal_idx = -1, .cookie = -1}) -#define PTL_EQ_NONE PTL_INVALID_HANDLE +#define LNET_INVALID_HANDLE \ + ((const lnet_handle_any_t){.cookie = -1}) +#define LNET_EQ_NONE LNET_INVALID_HANDLE -static inline int PtlHandleIsEqual (ptl_handle_any_t h1, ptl_handle_any_t h2) +static inline int LNetHandleIsEqual (lnet_handle_any_t h1, lnet_handle_any_t h2) { - return (h1.nal_idx == h2.nal_idx && h1.cookie == h2.cookie); + return (h1.cookie == h2.cookie); } -#define PTL_UID_ANY ((ptl_uid_t) -1) -#define PTL_JID_ANY ((ptl_jid_t) -1) -#define PTL_NID_ANY ((ptl_nid_t) -1) -#define PTL_PID_ANY ((ptl_pid_t) -1) - typedef struct { - ptl_nid_t nid; - ptl_pid_t pid; /* node id / process id */ -} ptl_process_id_t; + lnet_nid_t nid; + lnet_pid_t pid; /* node id / process id */ +} lnet_process_id_t; typedef enum { - PTL_RETAIN = 0, - PTL_UNLINK -} ptl_unlink_t; + LNET_RETAIN = 0, + LNET_UNLINK +} lnet_unlink_t; typedef enum { - PTL_INS_BEFORE, - PTL_INS_AFTER -} ptl_ins_pos_t; + LNET_INS_BEFORE, + LNET_INS_AFTER +} lnet_ins_pos_t; typedef struct { void *start; - ptl_size_t length; + unsigned int length; int threshold; int max_size; unsigned int options; void *user_ptr; - ptl_handle_eq_t eq_handle; -} ptl_md_t; + lnet_handle_eq_t eq_handle; +} 
lnet_md_t;
+
+/* Max Transfer Unit (minimum supported everywhere) */
+#define LNET_MTU_BITS 20
+#define LNET_MTU (1<<LNET_MTU_BITS)
+
+#define LNET_MAX_IOV 256
+
+/* Max payload size */
+#ifndef LNET_MAX_PAYLOAD
+# define LNET_MAX_PAYLOAD LNET_MTU
+#else
+# if (LNET_MAX_PAYLOAD < LNET_MTU)
+# error "LNET_MAX_PAYLOAD too small - error in configure --with-max-payload-mb"
+# elif defined(__KERNEL__)
+# if (LNET_MAX_PAYLOAD > (PAGE_SIZE * LNET_MAX_IOV))
+/* PAGE_SIZE is a constant: check with cpp! */
+# error "LNET_MAX_PAYLOAD too large - error in configure --with-max-payload-mb"
+# endif
+# endif
+#endif
 
 /* Options for the MD structure */
-#define PTL_MD_OP_PUT               (1 << 0)
-#define PTL_MD_OP_GET               (1 << 1)
-#define PTL_MD_MANAGE_REMOTE        (1 << 2)
-/* unused                           (1 << 3) */
-#define PTL_MD_TRUNCATE             (1 << 4)
-#define PTL_MD_ACK_DISABLE          (1 << 5)
-#define PTL_MD_IOVEC                (1 << 6)
-#define PTL_MD_MAX_SIZE             (1 << 7)
-#define PTL_MD_KIOV                 (1 << 8)
-#define PTL_MD_EVENT_START_DISABLE  (1 << 9)
-#define PTL_MD_EVENT_END_DISABLE    (1 << 10)
+#define LNET_MD_OP_PUT              (1 << 0)
+#define LNET_MD_OP_GET              (1 << 1)
+#define LNET_MD_MANAGE_REMOTE       (1 << 2)
+/* unused                           (1 << 3) */
+#define LNET_MD_TRUNCATE            (1 << 4)
+#define LNET_MD_ACK_DISABLE         (1 << 5)
+#define LNET_MD_IOVEC               (1 << 6)
+#define LNET_MD_MAX_SIZE            (1 << 7)
+#define LNET_MD_KIOV                (1 << 8)
 
 /* For compatibility with Cray Portals */
-#define PTL_MD_LUSTRE_COMPLETION_SEMANTICS 0
-#define PTL_MD_PHYS                        0
+#define LNET_MD_PHYS                       0
 
-#define PTL_MD_THRESH_INF (-1)
+#define LNET_MD_THRESH_INF (-1)
 
 /* NB lustre portals uses struct iovec internally! */
-typedef struct iovec ptl_md_iovec_t;
+typedef struct iovec lnet_md_iovec_t;
 
 typedef struct {
         cfs_page_t      *kiov_page;
         unsigned int     kiov_len;
         unsigned int     kiov_offset;
-} ptl_kiov_t;
+} lnet_kiov_t;
 
 typedef enum {
-        PTL_EVENT_GET_START,
-        PTL_EVENT_GET_END,
-
-        PTL_EVENT_PUT_START,
-        PTL_EVENT_PUT_END,
-
-        PTL_EVENT_REPLY_START,
-        PTL_EVENT_REPLY_END,
-
-        PTL_EVENT_ACK,
-
-        PTL_EVENT_SEND_START,
-        PTL_EVENT_SEND_END,
-
-        PTL_EVENT_UNLINK,
-} ptl_event_kind_t;
-
-#define PTL_SEQ_BASETYPE long
-typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t;
-#define PTL_SEQ_GT(a,b) (((signed PTL_SEQ_BASETYPE)((a) - (b))) > 0)
+        LNET_EVENT_GET,
+        LNET_EVENT_PUT,
+        LNET_EVENT_REPLY,
+        LNET_EVENT_ACK,
+        LNET_EVENT_SEND,
+        LNET_EVENT_UNLINK,
+} lnet_event_kind_t;
+
+#define LNET_SEQ_BASETYPE long
+typedef unsigned LNET_SEQ_BASETYPE lnet_seq_t;
+#define LNET_SEQ_GT(a,b) (((signed LNET_SEQ_BASETYPE)((a) - (b))) > 0)
 
 /* XXX
  * cygwin need the pragma line, not clear if it's needed in other places.
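
The LNET_SEQ_GT() macro above orders event sequence numbers by signed
difference, so the comparison stays correct when the unsigned counter wraps.
A minimal standalone illustration (hypothetical values, not part of the
patch):

        #include <assert.h>
        #include <limits.h>

        #define LNET_SEQ_BASETYPE long
        typedef unsigned LNET_SEQ_BASETYPE lnet_seq_t;
        #define LNET_SEQ_GT(a,b) (((signed LNET_SEQ_BASETYPE)((a) - (b))) > 0)

        int main(void)
        {
                lnet_seq_t old = ULONG_MAX - 1;  /* counter about to wrap */
                lnet_seq_t new = old + 3;        /* wraps around to 1 */

                assert(!(new > old));            /* naive compare gets it wrong */
                assert(LNET_SEQ_GT(new, old));   /* signed difference is right */
                return 0;
        }
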
@@ -131,64 +130,35 @@ typedef unsigned PTL_SEQ_BASETYPE ptl_seq_t; #pragma pack(push, 4) #endif typedef struct { - ptl_event_kind_t type; - ptl_process_id_t initiator; - ptl_uid_t uid; - ptl_jid_t jid; - ptl_pt_index_t pt_index; - ptl_match_bits_t match_bits; - ptl_size_t rlength; - ptl_size_t mlength; - ptl_size_t offset; - ptl_handle_md_t md_handle; - ptl_md_t md; - ptl_hdr_data_t hdr_data; - ptl_seq_t link; - ptl_ni_fail_t ni_fail_type; - - int unlinked; - - volatile ptl_seq_t sequence; -} ptl_event_t; + lnet_event_kind_t type; + lnet_process_id_t target; + lnet_process_id_t initiator; +#ifdef CRAY_XT3 + lnet_uid_t uid; +#endif + unsigned int pt_index; + __u64 match_bits; + unsigned int rlength; + unsigned int mlength; + unsigned int offset; + lnet_handle_md_t md_handle; + lnet_md_t md; + __u64 hdr_data; + int status; + int unlinked; + + volatile lnet_seq_t sequence; +} lnet_event_t; #ifdef __CYGWIN__ #pragma pop #endif typedef enum { - PTL_ACK_REQ, - PTL_NOACK_REQ -} ptl_ack_req_t; - -typedef void (*ptl_eq_handler_t)(ptl_event_t *event); -#define PTL_EQ_HANDLER_NONE NULL + LNET_ACK_REQ, + LNET_NOACK_REQ +} lnet_ack_req_t; -typedef struct { - int max_mes; - int max_mds; - int max_eqs; - int max_ac_index; - int max_pt_index; - int max_md_iovecs; - int max_me_list; - int max_getput_md; -} ptl_ni_limits_t; - -/* - * Status registers - */ -typedef enum { - PTL_SR_DROP_COUNT, - PTL_SR_DROP_LENGTH, - PTL_SR_RECV_COUNT, - PTL_SR_RECV_LENGTH, - PTL_SR_SEND_COUNT, - PTL_SR_SEND_LENGTH, - PTL_SR_MSGS_MAX, -} ptl_sr_index_t; - -typedef int ptl_sr_value_t; - -typedef int ptl_interface_t; -#define PTL_IFACE_DEFAULT (-1) +typedef void (*lnet_eq_handler_t)(lnet_event_t *event); +#define LNET_EQ_HANDLER_NONE NULL #endif diff --git a/lnet/include/lnet/winnt/api-support.h b/lnet/include/lnet/winnt/api-support.h new file mode 100644 index 0000000..8806981 --- /dev/null +++ b/lnet/include/lnet/winnt/api-support.h @@ -0,0 +1,9 @@ +#ifndef __WINNT_API_SUPPORT_H__ +#define __WINNT_API_SUPPORT_H__ + +#ifndef __LNET_API_SUPPORT_H__ +#error Do not #include this file directly. #include instead +#endif + + +#endif diff --git a/lnet/include/lnet/winnt/lib-lnet.h b/lnet/include/lnet/winnt/lib-lnet.h new file mode 100644 index 0000000..bb3e5af --- /dev/null +++ b/lnet/include/lnet/winnt/lib-lnet.h @@ -0,0 +1,25 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#ifndef __LNET_WINNT_LIB_LNET_H__ +#define __LNET_WINNT_LIB_LNET_H__ + +#ifndef __LNET_LIB_LNET_H__ +#error Do not #include this file directly. #include instead +#endif + +#ifdef __KERNEL__ +# include +# include + +static inline __u64 +lnet_page2phys (struct page *p) +{ + return 0; +} + +#else /* __KERNEL__ */ + +#endif + +#endif /* __LNET_WINNT_LIB_LNET_H__ */ diff --git a/lnet/include/lnet/winnt/lib-types.h b/lnet/include/lnet/winnt/lib-types.h new file mode 100644 index 0000000..33a3134 --- /dev/null +++ b/lnet/include/lnet/winnt/lib-types.h @@ -0,0 +1,55 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef __LNET_WINNT_LIB_TYPES_H__ +#define __LNET_WINNT_LIB_TYPES_H__ + +#ifndef __LNET_LIB_TYPES_H__ +#error Do not #include this file directly. #include instead +#endif + +#include + +typedef struct { + spinlock_t lock; +} lib_ni_lock_t; + +static inline void lib_ni_lock_init(lib_ni_lock_t *l) +{ + spin_lock_init(&l->lock); +} + +static inline void lib_ni_lock_fini(lib_ni_lock_t *l) +{} + +static inline void lib_ni_lock(lib_ni_lock_t *l) +{ + int flags; + spin_lock_irqsave(&l->lock, flags); +} + +static inline void lib_ni_unlock(lib_ni_lock_t *l) +{ + spin_unlock_irqrestore(&l->lock, 0); +} + +#endif diff --git a/lnet/include/lnet/winnt/lnet.h b/lnet/include/lnet/winnt/lnet.h new file mode 100644 index 0000000..7a3d24d --- /dev/null +++ b/lnet/include/lnet/winnt/lnet.h @@ -0,0 +1,511 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#ifndef __LNET_LINUX_LNET_H__ +#define __LNET_LINUX_LNET_H__ + +#ifndef __LNET_H__ +#error Do not #include this file directly. #include instead +#endif + +#ifdef __KERNEL__ + +#include +#include + +/* + * tdilnd routines + */ + + +PUCHAR +KsNtStatusToString (IN NTSTATUS Status); + + +VOID +KsPrintf( + IN LONG DebugPrintLevel, + IN PCHAR DebugMessage, + IN ... + ); + + +ksock_mdl_t * +ks_lock_iovs( + IN struct iovec *iov, + IN int niov, + IN int recv, + IN int * len + ); + +ksock_mdl_t * +ks_lock_kiovs( + IN lnet_kiov_t * kiov, + IN int nkiov, + IN int recv, + IN int * len + ); + +int +ks_send_mdl( + ksock_tconn_t * tconn, + void * tx, + ksock_mdl_t * mdl, + int len, + int flags + ); + +int +ks_query_data( + ksock_tconn_t * tconn, + size_t * size, + int bIsExpedited); + +int +ks_recv_mdl( + ksock_tconn_t * tconn, + ksock_mdl_t * mdl, + int size, + int flags + ); + +int +ks_get_tcp_option ( + ksock_tconn_t * tconn, + ULONG ID, + PVOID OptionValue, + PULONG Length + ); + +NTSTATUS +ks_set_tcp_option ( + ksock_tconn_t * tconn, + ULONG ID, + PVOID OptionValue, + ULONG Length + ); + +int +ks_bind_tconn ( + ksock_tconn_t * tconn, + ksock_tconn_t * parent, + ulong_ptr addr, + unsigned short port + ); + +int +ks_build_tconn( + ksock_tconn_t * tconn, + ulong_ptr addr, + unsigned short port + ); + +int +ks_disconnect_tconn( + ksock_tconn_t * tconn, + ulong_ptr flags + ); + +void +ks_abort_tconn( + ksock_tconn_t * tconn + ); + +int +ks_query_local_ipaddr( + ksock_tconn_t * tconn + ); + +int +ks_tconn_write (ksock_tconn_t *tconn, void *buffer, int nob); + +int +ks_tconn_read (ksock_tconn_t * tconn, void *buffer, int nob); + +NTSTATUS +KsTcpCompletionRoutine( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp, + IN PVOID Context + ); + +NTSTATUS +KsDisconectCompletionRoutine ( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp, + IN PVOID Context + ); + +NTSTATUS +KsTcpReceiveCompletionRoutine( + IN PIRP Irp, + IN PKS_TCP_COMPLETION_CONTEXT Context + ); + +NTSTATUS +KsTcpSendCompletionRoutine( + IN PIRP Irp, + IN PKS_TCP_COMPLETION_CONTEXT Context + ); + +NTSTATUS +KsAcceptCompletionRoutine( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp, + IN PVOID Context + ); + + 
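+/*
+ * How the completion routines above are typically armed (a sketch only,
+ * assuming the standard WDK IoSetCompletionRoutine() macro; the pairing
+ * below is illustrative, not mandated by these declarations):
+ *
+ *      Irp = KsBuildTdiIrp(DeviceObject);
+ *      IoSetCompletionRoutine(Irp, KsTcpCompletionRoutine, Context,
+ *                             TRUE, TRUE, TRUE);
+ *      Status = KsSubmitTdiIrp(DeviceObject, Irp, FALSE, &Information);
+ */
+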
+NTSTATUS +KsConnectEventHandler( + IN PVOID TdiEventContext, + IN LONG RemoteAddressLength, + IN PVOID RemoteAddress, + IN LONG UserDataLength, + IN PVOID UserData, + IN LONG OptionsLength, + IN PVOID Options, + OUT CONNECTION_CONTEXT * ConnectionContext, + OUT PIRP * AcceptIrp + ); + +NTSTATUS +KsDisconnectEventHandler( + IN PVOID TdiEventContext, + IN CONNECTION_CONTEXT ConnectionContext, + IN LONG DisconnectDataLength, + IN PVOID DisconnectData, + IN LONG DisconnectInformationLength, + IN PVOID DisconnectInformation, + IN ULONG DisconnectFlags + ); + +NTSTATUS +KsTcpReceiveEventHandler( + IN PVOID TdiEventContext, + IN CONNECTION_CONTEXT ConnectionContext, + IN ULONG ReceiveFlags, + IN ULONG BytesIndicated, + IN ULONG BytesAvailable, + OUT ULONG * BytesTaken, + IN PVOID Tsdu, + OUT PIRP * IoRequestPacket + ); + +NTSTATUS +KsTcpReceiveExpeditedEventHandler( + IN PVOID TdiEventContext, + IN CONNECTION_CONTEXT ConnectionContext, + IN ULONG ReceiveFlags, + IN ULONG BytesIndicated, + IN ULONG BytesAvailable, + OUT ULONG * BytesTaken, + IN PVOID Tsdu, + OUT PIRP * IoRequestPacket + ); + +NTSTATUS +KsTcpChainedReceiveEventHandler ( + IN PVOID TdiEventContext, // the event context + IN CONNECTION_CONTEXT ConnectionContext, + IN ULONG ReceiveFlags, + IN ULONG ReceiveLength, + IN ULONG StartingOffset, // offset of start of client data in TSDU + IN PMDL Tsdu, // TSDU data chain + IN PVOID TsduDescriptor // for call to TdiReturnChainedReceives + ); + +NTSTATUS +KsTcpChainedReceiveExpeditedEventHandler ( + IN PVOID TdiEventContext, // the event context + IN CONNECTION_CONTEXT ConnectionContext, + IN ULONG ReceiveFlags, + IN ULONG ReceiveLength, + IN ULONG StartingOffset, // offset of start of client data in TSDU + IN PMDL Tsdu, // TSDU data chain + IN PVOID TsduDescriptor // for call to TdiReturnChainedReceives + ); + + + +VOID +KsDisconnectHelper(PKS_DISCONNECT_WORKITEM WorkItem); + + +ULONG +ks_tdi_send_flags(ULONG SockFlags); + +PIRP +KsBuildTdiIrp( + IN PDEVICE_OBJECT DeviceObject + ); + +NTSTATUS +KsSubmitTdiIrp( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp, + IN BOOLEAN bSynchronous, + OUT PULONG Information + ); + +NTSTATUS +KsOpenControl( + IN PUNICODE_STRING DeviceName, + OUT HANDLE * Handle, + OUT PFILE_OBJECT * FileObject + ); + +NTSTATUS +KsCloseControl( + IN HANDLE Handle, + IN PFILE_OBJECT FileObject + ); + +NTSTATUS +KsOpenAddress( + IN PUNICODE_STRING DeviceName, + IN PTRANSPORT_ADDRESS pAddress, + IN ULONG AddressLength, + OUT HANDLE * Handle, + OUT PFILE_OBJECT * FileObject + ); + +NTSTATUS +KsCloseAddress( + IN HANDLE Handle, + IN PFILE_OBJECT FileObject + ); + +NTSTATUS +KsOpenConnection( + IN PUNICODE_STRING DeviceName, + IN CONNECTION_CONTEXT ConnectionContext, + OUT HANDLE * Handle, + OUT PFILE_OBJECT * FileObject + ); + +NTSTATUS +KsCloseConnection( + IN HANDLE Handle, + IN PFILE_OBJECT FileObject + ); + +NTSTATUS +KsAssociateAddress( + IN HANDLE AddressHandle, + IN PFILE_OBJECT ConnectionObject + ); + + +NTSTATUS +KsDisassociateAddress( + IN PFILE_OBJECT ConnectionObject + ); + + +NTSTATUS +KsSetEventHandlers( + IN PFILE_OBJECT AddressObject, + IN PVOID EventContext, + IN PKS_EVENT_HANDLERS Handlers + ); + + +NTSTATUS +KsQueryProviderInfo( + PWSTR TdiDeviceName, + PTDI_PROVIDER_INFO ProviderInfo + ); + +NTSTATUS +KsQueryAddressInfo( + IN PFILE_OBJECT FileObject, + OUT PTDI_ADDRESS_INFO AddressInfo, + OUT PULONG AddressSize + ); + +NTSTATUS +KsQueryConnectionInfo( + IN PFILE_OBJECT ConnectionObject, + OUT PTDI_CONNECTION_INFO ConnectionInfo, + OUT PULONG ConnectionSize + 
); + +ULONG +KsInitializeTdiAddress( + IN OUT PTA_IP_ADDRESS pTransportAddress, + IN ULONG IpAddress, + IN USHORT IpPort + ); + +ULONG +KsQueryMdlsSize (IN PMDL Mdl); + + +ULONG +KsQueryTdiAddressLength( + OUT PTRANSPORT_ADDRESS pTransportAddress + ); + +NTSTATUS +KsQueryIpAddress( + IN PFILE_OBJECT FileObject, + OUT PVOID TdiAddress, + OUT ULONG* AddressLength + ); + + +NTSTATUS +KsErrorEventHandler( + IN PVOID TdiEventContext, + IN NTSTATUS Status + ); + +int +ks_set_handlers( + ksock_tconn_t * tconn + ); + + +VOID +KsPrintProviderInfo( + PWSTR DeviceName, + PTDI_PROVIDER_INFO ProviderInfo + ); + +ksock_tconn_t * +ks_create_tconn(); + +void +ks_free_tconn( + ksock_tconn_t * tconn + ); + +void +ks_init_listener( + ksock_tconn_t * tconn + ); + +void +ks_init_sender( + ksock_tconn_t * tconn + ); + +void +ks_init_child( + ksock_tconn_t * tconn + ); + +void +ks_get_tconn( + ksock_tconn_t * tconn + ); + +void +ks_put_tconn( + ksock_tconn_t * tconn + ); + +int +ks_reset_handlers( + ksock_tconn_t * tconn + ); + +void +ks_destroy_tconn( + ksock_tconn_t * tconn + ); + + +PKS_TSDU +KsAllocateKsTsdu(); + +VOID +KsPutKsTsdu( + PKS_TSDU KsTsdu + ); + +VOID +KsFreeKsTsdu( + PKS_TSDU KsTsdu + ); + +VOID +KsInitializeKsTsdu( + PKS_TSDU KsTsdu, + ULONG Length + ); + + +VOID +KsInitializeKsTsduMgr( + PKS_TSDUMGR TsduMgr + ); + +VOID +KsInitializeKsChain( + PKS_CHAIN KsChain + ); + +NTSTATUS +KsCleanupTsduMgr( + PKS_TSDUMGR KsTsduMgr + ); + +NTSTATUS +KsCleanupKsChain( + PKS_CHAIN KsChain + ); + +NTSTATUS +KsCleanupTsdu( + ksock_tconn_t * tconn + ); + +NTSTATUS +KsCopyMdlChainToMdlChain( + IN PMDL SourceMdlChain, + IN ULONG SourceOffset, + IN PMDL DestinationMdlChain, + IN ULONG DestinationOffset, + IN ULONG BytesTobecopied, + OUT PULONG BytesCopied + ); + +ULONG +KsQueryMdlsSize (PMDL Mdl); + +NTSTATUS +KsLockUserBuffer ( + IN PVOID UserBuffer, + IN BOOLEAN bPaged, + IN ULONG Length, + IN LOCK_OPERATION Operation, + OUT PMDL * pMdl + ); + +PVOID +KsMapMdlBuffer (PMDL Mdl); + +VOID +KsReleaseMdl ( IN PMDL Mdl, + IN int Paged ); + +int +ks_lock_buffer ( + void * buffer, + int paged, + int length, + LOCK_OPERATION access, + ksock_mdl_t ** kmdl + ); + +void * +ks_map_mdl (ksock_mdl_t * mdl); + +void +ks_release_mdl (ksock_mdl_t *mdl, int paged); + +#endif /* __KERNEL__ */ + +#endif diff --git a/lnet/klnds/Makefile.in b/lnet/klnds/Makefile.in index f494a30..d4e034c 100644 --- a/lnet/klnds/Makefile.in +++ b/lnet/klnds/Makefile.in @@ -1,10 +1,13 @@ -@BUILD_GMNAL_TRUE@subdir-m += gmnal -@BUILD_RANAL_TRUE@subdir-m += ranal -@BUILD_OPENIBNAL_TRUE@subdir-m += openibnal -@BUILD_IIBNAL_TRUE@subdir-m += iibnal -@BUILD_VIBNAL_TRUE@subdir-m += vibnal -@BUILD_QSWNAL_TRUE@subdir-m += qswnal -subdir-m += socknal -subdir-m += lonal +@BUILD_GMLND_TRUE@subdir-m += gmlnd +@BUILD_MXLND_TRUE@subdir-m += mxlnd +@BUILD_RALND_TRUE@subdir-m += ralnd +@BUILD_O2IBLND_TRUE@subdir-m += o2iblnd +@BUILD_OPENIBLND_TRUE@subdir-m += openiblnd +@BUILD_CIBLND_TRUE@subdir-m += ciblnd +@BUILD_IIBLND_TRUE@subdir-m += iiblnd +@BUILD_VIBLND_TRUE@subdir-m += viblnd +@BUILD_QSWLND_TRUE@subdir-m += qswlnd +@BUILD_PTLLND_TRUE@subdir-m += ptllnd +subdir-m += socklnd @INCLUDE_RULES@ diff --git a/lnet/klnds/autoMakefile.am b/lnet/klnds/autoMakefile.am index d28e365..e6d0146 100644 --- a/lnet/klnds/autoMakefile.am +++ b/lnet/klnds/autoMakefile.am @@ -3,4 +3,4 @@ # This code is issued under the GNU General Public License. 
 # See the file COPYING in this distribution
 
-SUBDIRS = lonal socknal qswnal gmnal openibnal iibnal vibnal ranal
+SUBDIRS = socklnd qswlnd gmlnd mxlnd openiblnd iiblnd viblnd ralnd ptllnd ciblnd o2iblnd
diff --git a/lnet/klnds/lolnd/.cvsignore b/lnet/klnds/ciblnd/.cvsignore
similarity index 100%
rename from lnet/klnds/lolnd/.cvsignore
rename to lnet/klnds/ciblnd/.cvsignore
diff --git a/lnet/klnds/ciblnd/Makefile.in b/lnet/klnds/ciblnd/Makefile.in
new file mode 100644
index 0000000..55311ad
--- /dev/null
+++ b/lnet/klnds/ciblnd/Makefile.in
@@ -0,0 +1,8 @@
+MODULES := kciblnd
+kciblnd-objs := ciblnd.o ciblnd_cb.o ciblnd_modparams.o
+
+default: all
+
+EXTRA_POST_CFLAGS := @CIBCPPFLAGS@ -I@LUSTRE@/../lnet/klnds/openiblnd
+
+@INCLUDE_RULES@
diff --git a/lnet/klnds/lolnd/autoMakefile.am b/lnet/klnds/ciblnd/autoMakefile.am
similarity index 54%
rename from lnet/klnds/lolnd/autoMakefile.am
rename to lnet/klnds/ciblnd/autoMakefile.am
index f7d04f7..cae5cfc 100644
--- a/lnet/klnds/lolnd/autoMakefile.am
+++ b/lnet/klnds/ciblnd/autoMakefile.am
@@ -4,12 +4,11 @@
 # See the file COPYING in this distribution
 
 if MODULES
-if !CRAY_PORTALS
-if LINUX
-modulenet_DATA = klonal$(KMODEXT)
-endif
+if BUILD_CIBLND
+modulenet_DATA = kciblnd$(KMODEXT)
 endif
 endif
 
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@
-DIST_SOURCES = $(klonal-objs:%.o=%.c) lonal.h
+MOSTLYCLEANFILES = @MOSTLYCLEANFILES@
+DIST_SOURCES = $(kciblnd-objs:%.o=%.c)
+
diff --git a/lnet/klnds/ciblnd/ciblnd.c b/lnet/klnds/ciblnd/ciblnd.c
new file mode 100644
index 0000000..e139484
--- /dev/null
+++ b/lnet/klnds/ciblnd/ciblnd.c
@@ -0,0 +1 @@
+#include "openiblnd.c"
diff --git a/lnet/klnds/ciblnd/ciblnd_cb.c b/lnet/klnds/ciblnd/ciblnd_cb.c
new file mode 100644
index 0000000..893e16d
--- /dev/null
+++ b/lnet/klnds/ciblnd/ciblnd_cb.c
@@ -0,0 +1 @@
+#include "openiblnd_cb.c"
diff --git a/lnet/klnds/ciblnd/ciblnd_modparams.c b/lnet/klnds/ciblnd/ciblnd_modparams.c
new file mode 100644
index 0000000..a0c6b1f
--- /dev/null
+++ b/lnet/klnds/ciblnd/ciblnd_modparams.c
@@ -0,0 +1 @@
+#include "openiblnd_modparams.c"
diff --git a/lnet/klnds/gmlnd/Makefile.in b/lnet/klnds/gmlnd/Makefile.in
index 2efbea71..1aec50d 100644
--- a/lnet/klnds/gmlnd/Makefile.in
+++ b/lnet/klnds/gmlnd/Makefile.in
@@ -1,5 +1,5 @@
-MODULES := kgmnal
-kgmnal-objs := gmnal_api.o gmnal_cb.o gmnal_comm.o gmnal_utils.o gmnal_module.o
+MODULES := kgmlnd
+kgmlnd-objs := gmlnd_api.o gmlnd_cb.o gmlnd_comm.o gmlnd_utils.o gmlnd_module.o
 
 EXTRA_PRE_CFLAGS := @GMCPPFLAGS@ -DGM_KERNEL
diff --git a/lnet/klnds/gmlnd/README b/lnet/klnds/gmlnd/README
new file mode 100644
index 0000000..ac2e23d
--- /dev/null
+++ b/lnet/klnds/gmlnd/README
@@ -0,0 +1,73 @@
+1. This version of the GM nal requires an unreleased extension to the GM API to
+   map physical memory: gm_register_memory_ex_phys(). This allows it to avoid
+   ENOMEM problems associated with large contiguous buffer allocation.
+
+2. ./configure --with-gm=<path-to-gm-source-tree> \
+       [--with-gm-install=<path-to-gm-install-dir>]
+
+   If the sources do not support gm_register_memory_ex_phys(), configure flags
+   an error. In this case you should apply the patch and rebuild and re-install
+   GM as directed in the error message.
+
+   By default GM is installed in /opt/gm. If an alternate path was specified to
+   <path-to-gm-source-tree>/binary/GM_INSTALL, you should also specify
+   --with-gm-install with the same path.
+
+3. The GM timeout is 300 seconds; i.e. the network may not release resources
+   claimed by communications stalled with a crashing node for this time.
+   Default gmnal buffer tuning parameters (see (4) below) have been chosen to
+   minimize this problem and prevent lustre from having to block for resources.
+   However, in some situations where all network buffers are busy, the default
+   lustre timeout (various, scaled from the base timeout of 100 seconds) may be
+   too small, and the only solution may be to increase the lustre timeout
+   dramatically.
+
+4. The gmnal has the following module parameters...
+
+   gmnal_port           The GM port that the NAL will use (default 4).
+                        Change this if it conflicts with site usage.
+
+   gmnal_ntx            The number of "normal" transmit descriptors (default
+                        32). When this pool is exhausted, threads sending
+                        and receiving on the network block until in-progress
+                        transmits have completed. Each descriptor consumes 1
+                        GM_MTU sized buffer.
+
+   gmnal_ntx_nblk       The number of "reserved" transmit descriptors
+                        (default 256). This pool is reserved for responses to
+                        incoming communications that may not block. Increase
+                        it only if console error messages indicate the pool
+                        has been exhausted (LustreError: Can't get tx for
+                        msg type...). Each descriptor consumes 1 GM_MTU sized
+                        buffer.
+
+   gmnal_nlarge_tx_bufs The number of 1MByte transmit buffers to reserve at
+                        startup (default 32). This controls the number of
+                        concurrent sends larger than GM_MTU. It can be
+                        reduced to conserve memory, or increased to raise
+                        large-message sending concurrency.
+
+   gmnal_nrx_small      The number of GM_MTU sized receive buffers posted to
+                        receive from the network (default 128). Increase it if
+                        congestion is suspected; however, note that the total
+                        number of receives that can be posted at any time is
+                        limited by the number of GM receive tokens
+                        available. If there are too few, this and
+                        gmnal_nrx_large are scaled back accordingly.
+
+   gmnal_nrx_large      The number of 1MByte receive buffers posted to
+                        receive from the network (default 64). Increase it if
+                        the number of OST threads is increased. But note
+                        that the total number of receives that can be posted
+                        at any time is limited by the number of GM receive
+                        tokens available. If there are too few, this and
+                        gmnal_nrx_small are scaled back accordingly.
+
+5. Network configuration for GM is done in an lmc script as follows...
+ + GM2NID=${path-to-lustre-tree}/portals/utils/gmnalnid + + ${LMC} --node some_server --add net --nettype gm --nid `$GM2NID -n some_server` + + ${LMC} --node client --add net --nettype gm --nid '*' + diff --git a/lnet/klnds/gmlnd/autoMakefile.am b/lnet/klnds/gmlnd/autoMakefile.am index 8c3b7c0..6ff7933 100644 --- a/lnet/klnds/gmlnd/autoMakefile.am +++ b/lnet/klnds/gmlnd/autoMakefile.am @@ -4,12 +4,10 @@ # See the file COPYING in this distribution if MODULES -if BUILD_GMNAL -if !CRAY_PORTALS -modulenet_DATA = kgmnal$(KMODEXT) -endif +if BUILD_GMLND +modulenet_DATA = kgmlnd$(KMODEXT) endif endif MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES = $(kgmnal-objs:%.o=%.c) gmnal.h +DIST_SOURCES = $(kgmlnd-objs:%.o=%.c) gmlnd.h diff --git a/lnet/klnds/gmlnd/gm-reg-phys.patch b/lnet/klnds/gmlnd/gm-reg-phys.patch new file mode 100644 index 0000000..df32a21 --- /dev/null +++ b/lnet/klnds/gmlnd/gm-reg-phys.patch @@ -0,0 +1,107 @@ +Index: libgm/gm_register.c +=================================================================== +RCS file: /repository/gm/libgm/gm_register.c,v +retrieving revision 1.9.16.3 +diff -u -r1.9.16.3 gm_register.c +--- libgm/gm_register.c 9 Aug 2005 14:37:02 -0000 1.9.16.3 ++++ libgm/gm_register.c 25 Aug 2005 21:35:58 -0000 +@@ -77,20 +77,14 @@ + + */ + +-GM_ENTRY_POINT +-gm_status_t +-gm_register_memory_ex (gm_port_t *p, void *_ptr, gm_size_t length, void *_pvma) ++static gm_status_t ++_gm_register_memory (gm_port_t *p, int is_physical, gm_u64_t ptr, gm_size_t length, gm_up_t pvma) + { + gm_status_t status; +- gm_up_t ptr; +- gm_up_t pvma; + + GM_CALLED_WITH_ARGS (("%p,%p,"GM_U64_TMPL",%p", + p, _ptr, GM_U64_ARG (length), _pvma)); + +- ptr = GM_PTR_TO_UP (_ptr); +- pvma = GM_PTR_TO_UP (_pvma); +- + #if !GM_KERNEL && !GM_CAN_REGISTER_MEMORY + GM_PARAMETER_MAY_BE_UNUSED (p); + GM_PARAMETER_MAY_BE_UNUSED (ptr); +@@ -160,7 +154,7 @@ + status = gm_add_mapping_to_page_table (ps, + ptr + offset, + pvma + offset, +- GM_INVALID_DMA_PAGE); ++ is_physical ? 
ptr + offset : GM_INVALID_DMA_PAGE); + if (status != GM_SUCCESS) + { + status = GM_INVALID_PARAMETER; +@@ -317,13 +311,31 @@ + + */ + ++#if GM_KERNEL && (GM_CPU_x86 || GM_CPU_x86_64 || GM_CPU_ia64) ++/* only architecture where pci bus addr == physical address can use ++ such a simple scheme */ ++GM_ENTRY_POINT gm_status_t ++gm_register_memory_ex_phys (struct gm_port *p, ++ gm_u64_t phys, gm_size_t length, ++ gm_up_t pvma) ++{ ++ return _gm_register_memory(p, 1, phys, length, (gm_size_t)pvma); ++} ++#endif ++ ++GM_ENTRY_POINT gm_status_t ++gm_register_memory_ex (gm_port_t *p, void *ptr, gm_size_t length, void *pvma) ++{ ++ return _gm_register_memory(p, 0, (gm_size_t)ptr, length, (gm_size_t)pvma); ++} ++ + GM_ENTRY_POINT gm_status_t + gm_register_memory (gm_port_t *p, void *ptr, gm_size_t length) + { + gm_status_t status; + + GM_CALLED_WITH_ARGS (("%p,%p,"GM_U64_TMPL, p, ptr, GM_U64_ARG (length))); +- status = gm_register_memory_ex (p, ptr, length, ptr); ++ status = _gm_register_memory(p, 0, (gm_size_t)ptr, length, (gm_size_t)ptr); + GM_RETURN_STATUS (status); + } + +Index: include/gm.h +=================================================================== +RCS file: /repository/gm/include/gm.h,v +retrieving revision 1.25.10.11 +diff -u -r1.25.10.11 gm.h +--- include/gm.h 14 Mar 2005 21:42:41 -0000 1.25.10.11 ++++ include/gm.h 25 Aug 2005 21:35:58 -0000 +@@ -2676,6 +2676,10 @@ + GM_ENTRY_POINT gm_status_t gm_register_memory_ex (struct gm_port *p, + void *ptr, gm_size_t length, + void *pvma); ++ ++GM_ENTRY_POINT gm_status_t gm_register_memory_ex_phys (struct gm_port *p, ++ gm_u64_t phys, gm_size_t length, ++ gm_up_t pvma); + #endif /* GM_API_VERSION >= GM_API_VERSION_2_0_6 */ + + #if GM_API_VERSION >= GM_API_VERSION_2_1_0 +Index: libgm/gm_reference_api.c +=================================================================== +RCS file: /repository/gm/libgm/gm_reference_api.c,v +retrieving revision 1.3.14.1 +diff -u -r1.3.14.1 gm_reference_api.c +--- libgm/gm_reference_api.c 23 Apr 2004 20:27:29 -0000 1.3.14.1 ++++ libgm/gm_reference_api.c 25 Aug 2005 22:39:20 -0000 +@@ -154,6 +154,9 @@ + GM_REF (gm_register_buffer); + GM_REF (gm_register_memory); + GM_REF (gm_register_memory_ex); ++#if GM_KERNEL && (GM_CPU_x86 || GM_CPU_x86_64 || GM_CPU_ia64) ++GM_REF (gm_register_memory_ex_phys); ++#endif + GM_REF (gm_resume_sending); + GM_REF (gm_send); + GM_REF (gm_send_to_peer); diff --git a/lnet/klnds/gmlnd/gmlnd.h b/lnet/klnds/gmlnd/gmlnd.h index 47d71eb..6936737 100644 --- a/lnet/klnds/gmlnd/gmlnd.h +++ b/lnet/klnds/gmlnd/gmlnd.h @@ -60,16 +60,11 @@ #include "linux/vmalloc.h" #include "linux/sysctl.h" -#define DEBUG_SUBSYSTEM S_NAL +#define DEBUG_SUBSYSTEM S_LND -#include "portals/nal.h" -#include "portals/api.h" -#include "portals/errno.h" #include "libcfs/kp30.h" -#include "portals/p30.h" - -#include "portals/nal.h" -#include "portals/lib-p30.h" +#include "lnet/lnet.h" +#include "lnet/lib-lnet.h" /* undefine these before including the GM headers which clash */ #undef PACKAGE_BUGREPORT @@ -85,14 +80,15 @@ #include "gm.h" #include "gm_internal.h" -/* - * Defines for the API NAL - */ +/* Fixed tunables */ +#define GMNAL_RESCHED 100 /* # busy loops to force scheduler to yield */ +#define GMNAL_NETADDR_BASE 0x10000000 /* where we start in network VM */ +#define GMNAL_LARGE_PRIORITY GM_LOW_PRIORITY /* large message GM priority */ +#define GMNAL_SMALL_PRIORITY GM_LOW_PRIORITY /* small message GM priority */ /* Wire protocol */ - typedef struct { - ptl_hdr_t gmim_hdr; /* portals header */ + lnet_hdr_t gmim_hdr; 
/* portals header */ char gmim_payload[0]; /* payload */ } gmnal_immediate_msg_t; @@ -109,129 +105,141 @@ typedef struct { } gmm_u; } WIRE_ATTR gmnal_msg_t; -#define GMNAL_MSG_MAGIC 0x6d797269 /* 'myri'! */ +#define GMNAL_MSG_MAGIC LNET_PROTO_GM_MAGIC #define GMNAL_MSG_VERSION 1 #define GMNAL_MSG_IMMEDIATE 1 +typedef struct netbuf { + __u64 nb_netaddr; /* network VM address */ + lnet_kiov_t nb_kiov[1]; /* the pages (at least 1) */ +} gmnal_netbuf_t; + +#define GMNAL_NETBUF_MSG(nb) ((gmnal_msg_t *)page_address((nb)->nb_kiov[0].kiov_page)) +#define GMNAL_NETBUF_LOCAL_NETADDR(nb) ((void *)((unsigned long)(nb)->nb_netaddr)) + +typedef struct gmnal_txbuf { + struct list_head txb_list; /* queue on gmni_idle_ltxbs */ + struct gmnal_txbuf *txb_next; /* stash on gmni_ltxs */ + gmnal_netbuf_t txb_buf; /* space */ +} gmnal_txbuf_t; + typedef struct gmnal_tx { - struct gmnal_tx *tx_next; - gmnal_msg_t *tx_msg; - int tx_buffer_size; - gm_size_t tx_gm_size; - int tx_msg_size; - int tx_gmlid; - int tx_gm_priority; - ptl_nid_t tx_nid; - struct gmnal_ni *tx_gmni; - lib_msg_t *tx_libmsg; - int tx_rxt; + struct list_head tx_list; /* queue */ + int tx_credit:1; /* consumed a credit? */ + int tx_large_iskiov:1; /* large is in kiovs? */ + struct gmnal_ni *tx_gmni; /* owning NI */ + lnet_nid_t tx_nid; /* destination NID */ + int tx_gmlid; /* destination GM local ID */ + lnet_msg_t *tx_lntmsg; /* lntmsg to finalize on completion */ + + gmnal_netbuf_t tx_buf; /* small tx buffer */ + gmnal_txbuf_t *tx_ltxb; /* large buffer (to free on completion) */ + int tx_msgnob; /* message size (so far) */ + + int tx_large_nob; /* # bytes large buffer payload */ + int tx_large_offset; /* offset within frags */ + int tx_large_niov; /* # VM frags */ + union { + struct iovec *iov; /* mapped frags */ + lnet_kiov_t *kiov; /* page frags */ + } tx_large_frags; + unsigned long tx_launchtime; /* when (in jiffies) the transmit was launched */ + struct gmnal_tx *tx_next; /* stash on gmni_txs */ } gmnal_tx_t; -/* - * as for gmnal_tx_t - * a hash table in nal_data find rxs from - * the rx buffer address. hash table populated at init time - */ typedef struct gmnal_rx { - struct list_head rx_list; - gmnal_msg_t *rx_msg; - int rx_size; - gm_size_t rx_gmsize; - unsigned int rx_recv_nob; - __u16 rx_recv_gmid; - __u8 rx_recv_port; - __u8 rx_recv_type; - struct gmnal_rx *rx_next; + struct list_head rx_list; /* enqueue on gmni_rxq for handling */ + int rx_islarge:1; /* large receive buffer? */ + unsigned int rx_recv_nob; /* bytes received */ + __u16 rx_recv_gmid; /* sender */ + __u8 rx_recv_port; /* sender's port */ + __u8 rx_recv_type; /* ?? 
*/ + struct gmnal_rx *rx_next; /* stash on gmni_rxs */ + gmnal_netbuf_t rx_buf; /* the buffer */ } gmnal_rx_t; - -/* - * 1 receive thread started on each CPU - */ -#define NRXTHREADS 10 /* max number of receiver threads */ - typedef struct gmnal_ni { - spinlock_t gmni_tx_lock; - struct semaphore gmni_tx_token; - gmnal_tx_t *gmni_tx; - spinlock_t gmni_rxt_tx_lock; - struct semaphore gmni_rxt_tx_token; - gmnal_tx_t *gmni_rxt_tx; - gmnal_rx_t *gmni_rx; - struct gm_hash *gmni_rx_hash; - lib_nal_t *gmni_libnal; - struct gm_port *gmni_port; - spinlock_t gmni_gm_lock; /* serialise GM calls */ - atomic_t gmni_nthreads; - int gmni_nrxthreads; - long gmni_rxthread_pid[NRXTHREADS]; - gm_alarm_t gmni_ctthread_alarm; - int gmni_thread_shutdown; - int gmni_msg_size; - struct list_head gmni_rxq; - spinlock_t gmni_rxq_lock; - struct semaphore gmni_rxq_wait; + lnet_ni_t *gmni_ni; /* generic NI */ + struct gm_port *gmni_port; /* GM port */ + spinlock_t gmni_gm_lock; /* serialise GM calls */ + int gmni_large_pages; /* # pages in a large message buffer */ + int gmni_large_msgsize; /* nob in large message buffers */ + int gmni_large_gmsize; /* large message GM bucket */ + int gmni_small_msgsize; /* nob in small message buffers */ + int gmni_small_gmsize; /* small message GM bucket */ + __u64 gmni_netaddr_base; /* base of mapped network VM */ + int gmni_netaddr_size; /* # bytes of mapped network VM */ + + gmnal_tx_t *gmni_txs; /* all txs */ + gmnal_rx_t *gmni_rxs; /* all rx descs */ + gmnal_txbuf_t *gmni_ltxbs; /* all large tx bufs */ + + atomic_t gmni_nthreads; /* total # threads */ + gm_alarm_t gmni_alarm; /* alarm to wake caretaker */ + int gmni_shutdown; /* tell all threads to exit */ + + struct list_head gmni_idle_txs; /* idle tx's */ + int gmni_tx_credits; /* # transmits still possible */ + struct list_head gmni_idle_ltxbs; /* idle large tx buffers */ + struct list_head gmni_buf_txq; /* tx's waiting for buffers */ + struct list_head gmni_cred_txq; /* tx's waiting for credits */ + spinlock_t gmni_tx_lock; /* serialise */ + + struct gm_hash *gmni_rx_hash; /* buffer->rx lookup */ + struct semaphore gmni_rx_mutex; /* serialise blocking on GM */ } gmnal_ni_t; - -/* - * for ioctl get pid - */ -#define GMNAL_IOC_GET_GNID 1 +typedef struct { + int *gm_port; + int *gm_ntx; + int *gm_credits; + int *gm_peer_credits; + int *gm_nlarge_tx_bufs; + int *gm_nrx_small; + int *gm_nrx_large; + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + struct ctl_table_header *gm_sysctl; /* sysctl interface */ +#endif +} gmnal_tunables_t; /* gmnal_api.c */ int gmnal_init(void); -void gmnal_fini(void); +void gmnal_fini(void); +int gmnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); +int gmnal_startup(lnet_ni_t *ni); +void gmnal_shutdown(lnet_ni_t *ni); /* gmnal_cb.c */ -ptl_err_t gmnal_cb_recv(lib_nal_t *libnal, void *private, - lib_msg_t *libmsg, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen, size_t rlen); -ptl_err_t gmnal_cb_recv_pages(lib_nal_t *libnal, void *private, - lib_msg_t *libmsg, - unsigned int nkiov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen); -ptl_err_t gmnal_cb_send(lib_nal_t *libnal, void *private, - lib_msg_t *libmsg, ptl_hdr_t *hdr, int type, - ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, struct iovec *iov, - size_t offset, size_t len); -ptl_err_t gmnal_cb_send_pages(lib_nal_t *libnal, void *private, - lib_msg_t *libmsg, ptl_hdr_t *hdr, int type, - ptl_nid_t nid, ptl_pid_t pid, - unsigned int nkiov, ptl_kiov_t *kiov, - size_t offset, size_t len); -int 
gmnal_cb_dist(lib_nal_t *libnal, ptl_nid_t nid, unsigned long *dist); +int gmnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +int gmnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); /* gmnal_util.c */ -int gmnal_is_rxthread(gmnal_ni_t *gmnalni); -int gmnal_alloc_txs(gmnal_ni_t *gmnalni); -void gmnal_free_txs(gmnal_ni_t *gmnalni); -gmnal_tx_t *gmnal_get_tx(gmnal_ni_t *gmnalni, int block); -void gmnal_return_tx(gmnal_ni_t *gmnalni, gmnal_tx_t *tx); -int gmnal_alloc_rxs(gmnal_ni_t *gmnalni); -void gmnal_free_rxs(gmnal_ni_t *gmnalni); +void gmnal_free_ltxbufs(gmnal_ni_t *gmni); +int gmnal_alloc_ltxbufs(gmnal_ni_t *gmni); +void gmnal_free_txs(gmnal_ni_t *gmni); +int gmnal_alloc_txs(gmnal_ni_t *gmni); +void gmnal_free_rxs(gmnal_ni_t *gmni); +int gmnal_alloc_rxs(gmnal_ni_t *gmni); char *gmnal_gmstatus2str(gm_status_t status); char *gmnal_rxevent2str(gm_recv_event_t *ev); void gmnal_yield(int delay); -int gmnal_enqueue_rx(gmnal_ni_t *gmnalni, gm_recv_t *recv); -gmnal_rx_t *gmnal_dequeue_rx(gmnal_ni_t *gmnalni); -void gmnal_stop_threads(gmnal_ni_t *gmnalni); -int gmnal_start_threads(gmnal_ni_t *gmnalni); /* gmnal_comm.c */ -void gmnal_pack_msg(gmnal_ni_t *gmnalni, gmnal_tx_t *tx, - ptl_nid_t dstnid, int type); -int gmnal_ct_thread(void *arg); -int gmnal_rx_thread(void *arg); -void gmnal_post_rx(gmnal_ni_t *gmnalni, gmnal_rx_t *rx); -ptl_err_t gmnal_post_tx(gmnal_ni_t *gmnalni, gmnal_tx_t *tx, - lib_msg_t *libmsg, ptl_nid_t nid, int nob); +void gmnal_post_rx(gmnal_ni_t *gmni, gmnal_rx_t *rx); +gmnal_tx_t *gmnal_get_tx(gmnal_ni_t *gmni); +void gmnal_tx_done(gmnal_tx_t *tx, int rc); +void gmnal_pack_msg(gmnal_ni_t *gmni, gmnal_msg_t *msg, + lnet_nid_t dstnid, int type); +void gmnal_stop_threads(gmnal_ni_t *gmni); +int gmnal_start_threads(gmnal_ni_t *gmni); +void gmnal_check_txqueues_locked (gmnal_ni_t *gmni); /* Module Parameters */ -extern int num_txds; -extern int gm_port_id; +extern gmnal_tunables_t gmnal_tunables; #endif /*__INCLUDE_GMNAL_H__*/ diff --git a/lnet/klnds/gmlnd/gmlnd_api.c b/lnet/klnds/gmlnd/gmlnd_api.c index 6597cb5..a5c426f 100644 --- a/lnet/klnds/gmlnd/gmlnd_api.c +++ b/lnet/klnds/gmlnd/gmlnd_api.c @@ -23,313 +23,240 @@ * Implements the API NAL functions */ -#include "gmnal.h" +#include "gmlnd.h" -int -gmnal_cmd(struct portals_cfg *pcfg, void *private) +lnd_t the_gmlnd = { - gmnal_ni_t *gmnalni = private; - char *name; - int nid; - int gmid; - gm_status_t gm_status; - - CDEBUG(D_TRACE, "gmnal_cmd [%d] private [%p]\n", - pcfg->pcfg_command, private); - gmnalni = (gmnal_ni_t*)private; - - switch(pcfg->pcfg_command) { - case GMNAL_IOC_GET_GNID: - - PORTAL_ALLOC(name, pcfg->pcfg_plen1); - copy_from_user(name, PCFG_PBUF(pcfg, 1), pcfg->pcfg_plen1); - - gm_status = gm_host_name_to_node_id_ex(gmnalni->gmni_port, 0, - name, &nid); - if (gm_status != GM_SUCCESS) { - CDEBUG(D_NET, "gm_host_name_to_node_id_ex(...host %s) " - "failed[%d]\n", name, gm_status); - return -ENOENT; - } - - CDEBUG(D_NET, "Local node %s id is [%d]\n", name, nid); - gm_status = gm_node_id_to_global_id(gmnalni->gmni_port, - nid, &gmid); - if (gm_status != GM_SUCCESS) { - CDEBUG(D_NET, "gm_node_id_to_global_id failed[%d]\n", - gm_status); - return -ENOENT; - } - - CDEBUG(D_NET, "Global node is is [%u][%x]\n", gmid, gmid); - copy_to_user(PCFG_PBUF(pcfg, 2), &gmid, pcfg->pcfg_plen2); - return 0; - - case NAL_CMD_REGISTER_MYNID: - /* Same NID OK */ - if (pcfg->pcfg_nid 
== gmnalni->gmni_libnal->libnal_ni.ni_pid.nid) - return 0; - - CERROR("Can't change NID from "LPD64" to "LPD64"\n", - gmnalni->gmni_libnal->libnal_ni.ni_pid.nid, - pcfg->pcfg_nid); - return -EINVAL; + .lnd_type = GMLND, + .lnd_startup = gmnal_startup, + .lnd_shutdown = gmnal_shutdown, + .lnd_ctl = gmnal_ctl, + .lnd_send = gmnal_send, + .lnd_recv = gmnal_recv, +}; +gmnal_ni_t *the_gmni = NULL; + +int +gmnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + + switch (cmd) { + case IOC_LIBCFS_REGISTER_MYNID: + if (data->ioc_nid == ni->ni_nid) + return 0; + + LASSERT (LNET_NIDNET(data->ioc_nid) == LNET_NIDNET(ni->ni_nid)); + + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID for %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); + return 0; + default: - CERROR ("gmnal_cmd UNKNOWN[%d]\n", pcfg->pcfg_command); - return -EINVAL; + return (-EINVAL); } - /* not reached */ } -ptl_nid_t -gmnal_get_local_nid (gmnal_ni_t *gmnalni) +int +gmnal_set_local_nid (gmnal_ni_t *gmni) { - unsigned int local_gmid; - unsigned int global_gmid; - ptl_nid_t nid; + lnet_ni_t *ni = gmni->gmni_ni; + __u32 local_gmid; + __u32 global_gmid; gm_status_t gm_status; /* Called before anything initialised: no need to lock */ - gm_status = gm_get_node_id(gmnalni->gmni_port, &local_gmid); + gm_status = gm_get_node_id(gmni->gmni_port, &local_gmid); if (gm_status != GM_SUCCESS) - return PTL_NID_ANY; + return 0; CDEBUG(D_NET, "Local node id is [%u]\n", local_gmid); - gm_status = gm_node_id_to_global_id(gmnalni->gmni_port, + gm_status = gm_node_id_to_global_id(gmni->gmni_port, local_gmid, &global_gmid); if (gm_status != GM_SUCCESS) - return PTL_NID_ANY; + return 0; CDEBUG(D_NET, "Global node id is [%u]\n", global_gmid); - nid = (__u64)global_gmid; - LASSERT (nid != PTL_NID_ANY); - - return global_gmid; + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), global_gmid); + return 1; } - void -gmnal_api_shutdown(nal_t *nal) +gmnal_shutdown(lnet_ni_t *ni) { - lib_nal_t *libnal = nal->nal_data; - gmnal_ni_t *gmnalni = libnal->libnal_data; - - if (nal->nal_refct != 0) { - /* This module got the first ref */ - PORTAL_MODULE_UNUSE; - return; - } + gmnal_ni_t *gmni = ni->ni_data; - CDEBUG(D_TRACE, "gmnal_api_shutdown: gmnalni [%p]\n", gmnalni); + CDEBUG(D_TRACE, "gmnal_api_shutdown: gmni [%p]\n", gmni); - /* Stop portals calling our ioctl handler */ - libcfs_nal_cmd_unregister(GMNAL); + LASSERT (gmni == the_gmni); /* stop processing messages */ - gmnal_stop_threads(gmnalni); + gmnal_stop_threads(gmni); + + /* stop all network callbacks */ + gm_close(gmni->gmni_port); + gmni->gmni_port = NULL; - gm_close(gmnalni->gmni_port); gm_finalize(); - lib_fini(libnal); + gmnal_free_ltxbufs(gmni); + gmnal_free_txs(gmni); + gmnal_free_rxs(gmni); - gmnal_free_txs(gmnalni); - gmnal_free_rxs(gmnalni); + LIBCFS_FREE(gmni, sizeof(*gmni)); - PORTAL_FREE(gmnalni, sizeof(*gmnalni)); - PORTAL_FREE(libnal, sizeof(*libnal)); + the_gmni = NULL; } int -gmnal_api_startup(nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) +gmnal_startup(lnet_ni_t *ni) { - - lib_nal_t *libnal = NULL; - gmnal_ni_t *gmnalni = NULL; + gmnal_ni_t *gmni = NULL; gmnal_rx_t *rx = NULL; gm_status_t gm_status; - ptl_process_id_t process_id; int rc; - if (nal->nal_refct != 0) { - if (actual_limits != NULL) { - libnal = (lib_nal_t *)nal->nal_data; - *actual_limits = libnal->libnal_ni.ni_actual_limits; - } - PORTAL_MODULE_USE; - return PTL_OK; - } - - /* Called on first PtlNIInit() */ - 
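gmnal_set_local_nid() above builds the interface NID by keeping the configured network part and substituting the GM global id as the address part. A hedged userspace model of the 32/32 packing that LNET_MKNID()/LNET_NIDNET()/LNET_NIDADDR() perform (simplified stand-ins, not the LNet headers):

```c
#include <stdio.h>
#include <stdint.h>

/* Simplified model: high 32 bits carry the network number, low 32 bits
 * the address -- for gmlnd, the GM global id. */
#define MKNID(net, addr) ((((uint64_t)(net)) << 32) | (uint32_t)(addr))
#define NIDNET(nid)      ((uint32_t)((nid) >> 32))
#define NIDADDR(nid)     ((uint32_t)(nid))

int main(void)
{
        uint64_t nid = MKNID(7, 0);     /* net 7, address not yet known */

        nid = MKNID(NIDNET(nid), 42);   /* keep the net, set the GM id */
        printf("net %u, addr %u\n", NIDNET(nid), NIDADDR(nid));
        return 0;
}
```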
CDEBUG(D_TRACE, "startup\n"); + LASSERT (ni->ni_lnd == &the_gmlnd); - PORTAL_ALLOC(gmnalni, sizeof(*gmnalni)); - if (gmnalni == NULL) { - CERROR("can't allocate gmnalni\n"); - return PTL_FAIL; - } + ni->ni_maxtxcredits = *gmnal_tunables.gm_credits; + ni->ni_peertxcredits = *gmnal_tunables.gm_peer_credits; - PORTAL_ALLOC(libnal, sizeof(*libnal)); - if (libnal == NULL) { - CERROR("can't allocate lib_nal\n"); - goto failed_0; - } - - memset(gmnalni, 0, sizeof(*gmnalni)); - gmnalni->gmni_libnal = libnal; - spin_lock_init(&gmnalni->gmni_gm_lock); + if (the_gmni != NULL) { + CERROR("Only 1 instance supported\n"); + return -EINVAL; + } - *libnal = (lib_nal_t) { - .libnal_send = gmnal_cb_send, - .libnal_send_pages = gmnal_cb_send_pages, - .libnal_recv = gmnal_cb_recv, - .libnal_recv_pages = gmnal_cb_recv_pages, - .libnal_dist = gmnal_cb_dist, - .libnal_data = gmnalni, - }; + LIBCFS_ALLOC(gmni, sizeof(*gmni)); + if (gmni == NULL) { + CERROR("can't allocate gmni\n"); + return -ENOMEM; + } + ni->ni_data = gmni; + + memset(gmni, 0, sizeof(*gmni)); + gmni->gmni_ni = ni; + spin_lock_init(&gmni->gmni_tx_lock); + spin_lock_init(&gmni->gmni_gm_lock); + INIT_LIST_HEAD(&gmni->gmni_idle_txs); + INIT_LIST_HEAD(&gmni->gmni_idle_ltxbs); + INIT_LIST_HEAD(&gmni->gmni_buf_txq); + INIT_LIST_HEAD(&gmni->gmni_cred_txq); + sema_init(&gmni->gmni_rx_mutex, 1); + /* * initialise the interface, */ CDEBUG(D_NET, "Calling gm_init\n"); if (gm_init() != GM_SUCCESS) { CERROR("call to gm_init failed\n"); - goto failed_1; + goto failed_0; } - CDEBUG(D_NET, "Calling gm_open with port [%d], " - "name [%s], version [%d]\n", gm_port_id, - "gmnal", GM_API_VERSION); + CDEBUG(D_NET, "Calling gm_open with port [%d], version [%d]\n", + *gmnal_tunables.gm_port, GM_API_VERSION); - gm_status = gm_open(&gmnalni->gmni_port, 0, gm_port_id, "gmnal", - GM_API_VERSION); + gm_status = gm_open(&gmni->gmni_port, 0, *gmnal_tunables.gm_port, + "gmnal", GM_API_VERSION); if (gm_status != GM_SUCCESS) { CERROR("Can't open GM port %d: %d (%s)\n", - gm_port_id, gm_status, gmnal_gmstatus2str(gm_status)); - goto failed_2; + *gmnal_tunables.gm_port, gm_status, + gmnal_gmstatus2str(gm_status)); + goto failed_1; } - CDEBUG(D_NET,"gm_open succeeded port[%p]\n",gmnalni->gmni_port); + CDEBUG(D_NET,"gm_open succeeded port[%p]\n",gmni->gmni_port); - gmnalni->gmni_msg_size = offsetof(gmnal_msg_t, - gmm_u.immediate.gmim_payload[PTL_MTU]); - CWARN("Msg size %08x\n", gmnalni->gmni_msg_size); + if (!gmnal_set_local_nid(gmni)) + goto failed_2; - if (gmnal_alloc_rxs(gmnalni) != 0) { - CERROR("Failed to allocate rx descriptors\n"); - goto failed_3; - } + CDEBUG(D_NET, "portals_nid is %s\n", libcfs_nid2str(ni->ni_nid)); - if (gmnal_alloc_txs(gmnalni) != 0) { - CERROR("Failed to allocate tx descriptors\n"); - goto failed_3; - } + gmni->gmni_large_msgsize = + offsetof(gmnal_msg_t, gmm_u.immediate.gmim_payload[LNET_MAX_PAYLOAD]); + gmni->gmni_large_gmsize = + gm_min_size_for_length(gmni->gmni_large_msgsize); + gmni->gmni_large_pages = + (gmni->gmni_large_msgsize + PAGE_SIZE - 1)/PAGE_SIZE; + + gmni->gmni_small_msgsize = MIN(GM_MTU, PAGE_SIZE); + gmni->gmni_small_gmsize = + gm_min_size_for_length(gmni->gmni_small_msgsize); - process_id.pid = requested_pid; - process_id.nid = gmnal_get_local_nid(gmnalni); - if (process_id.nid == PTL_NID_ANY) - goto failed_3; + gmni->gmni_netaddr_base = GMNAL_NETADDR_BASE; + gmni->gmni_netaddr_size = 0; - CDEBUG(D_NET, "portals_pid is [%u]\n", process_id.pid); - CDEBUG(D_NET, "portals_nid is ["LPU64"]\n", process_id.nid); + CDEBUG(D_NET, "Msg size 
%08x/%08x [%d/%d]\n", + gmni->gmni_large_msgsize, gmni->gmni_small_msgsize, + gmni->gmni_large_gmsize, gmni->gmni_small_gmsize); - /* Hang out a bunch of small receive buffers - * In fact hang them all out */ - for (rx = gmnalni->gmni_rx; rx != NULL; rx = rx->rx_next) - gmnal_post_rx(gmnalni, rx); + if (gmnal_alloc_rxs(gmni) != 0) { + CERROR("Failed to allocate rx descriptors\n"); + goto failed_2; + } - if (lib_init(libnal, nal, process_id, - requested_limits, actual_limits) != PTL_OK) { - CERROR("lib_init failed\n"); - goto failed_3; + if (gmnal_alloc_txs(gmni) != 0) { + CERROR("Failed to allocate tx descriptors\n"); + goto failed_2; } - /* Now that we have initialised the portals library, start receive - * threads, we do this to avoid processing messages before we can parse - * them */ - rc = gmnal_start_threads(gmnalni); - if (rc != 0) { - CERROR("Can't start threads: %d\n", rc); - goto failed_3; + if (gmnal_alloc_ltxbufs(gmni) != 0) { + CERROR("Failed to allocate large tx buffers\n"); + goto failed_2; } - rc = libcfs_nal_cmd_register(GMNAL, &gmnal_cmd, libnal->libnal_data); - if (rc != 0) { - CDEBUG(D_NET, "libcfs_nal_cmd_register failed: %d\n", rc); - goto failed_4; + rc = gmnal_start_threads(gmni); + if (rc != 0) { + CERROR("Can't start threads: %d\n", rc); + goto failed_2; } - CDEBUG(D_NET, "gmnal_init finished\n"); - return PTL_OK; + /* Start listening */ + for (rx = gmni->gmni_rxs; rx != NULL; rx = rx->rx_next) + gmnal_post_rx(gmni, rx); - failed_4: - gmnal_stop_threads(gmnalni); + the_gmni = gmni; - failed_3: - gm_close(gmnalni->gmni_port); + CDEBUG(D_NET, "gmnal_init finished\n"); + return 0; failed_2: - gm_finalize(); - - /* safe to free buffers after network has been shut down */ - gmnal_free_txs(gmnalni); - gmnal_free_rxs(gmnalni); + gm_close(gmni->gmni_port); + gmni->gmni_port = NULL; failed_1: - PORTAL_FREE(libnal, sizeof(*libnal)); + gm_finalize(); failed_0: - PORTAL_FREE(gmnalni, sizeof(*gmnalni)); + /* safe to free descriptors after network has been shut down */ + gmnal_free_ltxbufs(gmni); + gmnal_free_txs(gmni); + gmnal_free_rxs(gmni); - return PTL_FAIL; -} + LIBCFS_FREE(gmni, sizeof(*gmni)); -ptl_handle_ni_t kgmnal_ni; -nal_t the_gm_nal; + return -EIO; +} /* * Called when module loaded */ int gmnal_init(void) { - int rc; - - CDEBUG(D_NET, "reset nal[%p]\n", &the_gm_nal); - - the_gm_nal = (nal_t) { - .nal_ni_init = gmnal_api_startup, - .nal_ni_fini = gmnal_api_shutdown, - .nal_data = NULL, - }; - - rc = ptl_register_nal(GMNAL, &the_gm_nal); - if (rc != PTL_OK) - CERROR("Can't register GMNAL: %d\n", rc); - rc = PtlNIInit(GMNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kgmnal_ni); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - ptl_unregister_nal(GMNAL); - return (-ENODEV); - } - - return (rc); + lnet_register_lnd(&the_gmlnd); + return 0; } - /* * Called when module removed */ void gmnal_fini() { - CDEBUG(D_TRACE, "gmnal_fini\n"); - - PtlNIFini(kgmnal_ni); - - ptl_unregister_nal(GMNAL); + lnet_unregister_lnd(&the_gmlnd); } diff --git a/lnet/klnds/gmlnd/gmlnd_cb.c b/lnet/klnds/gmlnd/gmlnd_cb.c index d7e7f5b..503bedf 100644 --- a/lnet/klnds/gmlnd/gmlnd_cb.c +++ b/lnet/klnds/gmlnd/gmlnd_cb.c @@ -25,228 +25,137 @@ */ -#include "gmnal.h" +#include "gmlnd.h" -ptl_err_t -gmnal_cb_recv(lib_nal_t *libnal, void *private, - lib_msg_t *libmsg, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen, size_t rlen) +int +gmnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, 
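The buffer sizing above leans on two GNU C idioms also used elsewhere in the patch: offsetof() into the zero-length gmim_payload[] array yields header-plus-payload bytes in one expression, and the page count is the usual round-up division. A small standalone illustration (stand-in constants; the real sizes come from LNET_MAX_PAYLOAD and GM):

```c
#include <stdio.h>
#include <stddef.h>

#define PAGE_SZ     4096
#define MAX_PAYLOAD (1 << 20)           /* stand-in for LNET_MAX_PAYLOAD */

typedef struct {
        unsigned magic, version, type;
        char     payload[0];            /* cf. gmim_payload[0]; GNU C */
} msg_t;

int main(void)
{
        /* offsetof() of payload[N] == header bytes + N payload bytes */
        size_t msgsize = offsetof(msg_t, payload[MAX_PAYLOAD]);
        size_t npages  = (msgsize + PAGE_SZ - 1) / PAGE_SZ;  /* round up */

        printf("msgsize %zu -> %zu pages\n", msgsize, npages);
        return 0;
}
```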
unsigned int mlen, unsigned int rlen) { + gmnal_ni_t *gmni = ni->ni_data; gmnal_rx_t *rx = (gmnal_rx_t*)private; - gmnal_msg_t *msg = rx->rx_msg; - size_t nobleft = mlen; - int rxnob; - char *buffer; - size_t nob; - - CDEBUG(D_TRACE, "gmnal_cb_recv libnal [%p], private[%p], libmsg[%p], " - "niov[%d], iov [%p], offset["LPSZ"], mlen["LPSZ"], rlen["LPSZ"]\n", - libnal, private, libmsg, niov, iov, offset, mlen, rlen); + gmnal_msg_t *msg = GMNAL_NETBUF_MSG(&rx->rx_buf); + int npages = rx->rx_islarge ? gmni->gmni_large_pages : 1; + int payload_offset = offsetof(gmnal_msg_t, + gmm_u.immediate.gmim_payload[0]); + int nob = payload_offset + mlen; LASSERT (msg->gmm_type == GMNAL_MSG_IMMEDIATE); + LASSERT (iov == NULL || kiov == NULL); - buffer = &msg->gmm_u.immediate.gmim_payload[0]; - rxnob = offsetof(gmnal_msg_t, gmm_u.immediate.gmim_payload[nobleft]); - - if (rx->rx_recv_nob < rxnob) { - CERROR("Short message from nid "LPD64": got %d, need %d\n", - msg->gmm_srcnid, rx->rx_recv_nob, rxnob); - return PTL_FAIL; + if (rx->rx_recv_nob < nob) { + CERROR("Short message from nid %s: got %d, need %d\n", + libcfs_nid2str(msg->gmm_srcnid), rx->rx_recv_nob, nob); + gmnal_post_rx(gmni, rx); + return -EIO; } - - while (nobleft > 0) { - LASSERT (niov > 0); - if (offset >= iov->iov_len) { - offset -= iov->iov_len; - } else { - nob = MIN (iov->iov_len - offset, nobleft); - - gm_bcopy(buffer, iov->iov_base + offset, nob); - - buffer += nob; - nobleft -= nob; - offset = 0; - } - niov--; - iov++; - } - - lib_finalize(libnal, private, libmsg, PTL_OK); - return PTL_OK; + if (kiov != NULL) + lnet_copy_kiov2kiov(niov, kiov, offset, + npages, rx->rx_buf.nb_kiov, payload_offset, + mlen); + else + lnet_copy_kiov2iov(niov, iov, offset, + npages, rx->rx_buf.nb_kiov, payload_offset, + mlen); + + lnet_finalize(ni, lntmsg, 0); + gmnal_post_rx(gmni, rx); + return 0; } -ptl_err_t -gmnal_cb_recv_pages(lib_nal_t *libnal, void *private, - lib_msg_t *libmsg, - unsigned int nkiov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) +int +gmnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) { - gmnal_rx_t *rx = (gmnal_rx_t*)private; - gmnal_msg_t *msg = rx->rx_msg; - size_t nobleft = mlen; - int rxnob; - size_t nob; - char *ptr; - void *buffer; - - CDEBUG(D_TRACE, "gmnal_cb_recv_pages libnal [%p],private[%p], " - "libmsg[%p], kniov[%d], kiov [%p], " - "offset["LPSZ"], mlen["LPSZ"], rlen["LPSZ"]\n", - libnal, private, libmsg, nkiov, kiov, offset, mlen, rlen); - - LASSERT (msg->gmm_type == GMNAL_MSG_IMMEDIATE); - - buffer = &msg->gmm_u.immediate.gmim_payload[0]; - rxnob = offsetof(gmnal_msg_t, gmm_u.immediate.gmim_payload[nobleft]); - - if (rx->rx_recv_nob < rxnob) { - CERROR("Short message from nid "LPD64": got %d, need %d\n", - msg->gmm_srcnid, rx->rx_recv_nob, rxnob); - return PTL_FAIL; + lnet_hdr_t *hdr= &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + unsigned int niov = lntmsg->msg_niov; + struct iovec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int len = lntmsg->msg_len; + gmnal_ni_t *gmni = ni->ni_data; + gm_status_t gmrc; + gmnal_tx_t *tx; + + LASSERT (iov == NULL || kiov == NULL); + + /* I may not block for a tx if I'm responding to an incoming message */ + tx = gmnal_get_tx(gmni); + if (tx == NULL) { + if (!gmni->gmni_shutdown) + CERROR ("Can't get tx for msg type %d for %s\n", + type, libcfs_nid2str(target.nid)); + return -EIO; } - - while (nobleft > 0) { - LASSERT (nkiov > 0); - - if 
(offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - } else { - nob = MIN (kiov->kiov_len - offset, nobleft); - - ptr = ((char *)kmap(kiov->kiov_page)) + - kiov->kiov_offset; - - gm_bcopy(buffer, ptr + offset, nob); - - kunmap(kiov->kiov_page); - - buffer += nob; - nobleft -= nob; - offset = 0; - } - kiov++; - nkiov--; - } - - lib_finalize(libnal, private, libmsg, PTL_OK); - return PTL_OK; -} - -ptl_err_t -gmnal_cb_send(lib_nal_t *libnal, void *private, - lib_msg_t *libmsg, ptl_hdr_t *hdr, int type, - ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, struct iovec *iov, - size_t offset, size_t len) -{ - - gmnal_ni_t *gmnalni = libnal->libnal_data; - size_t nobleft = len; - void *buffer; - gmnal_tx_t *tx; - size_t nob; - CDEBUG(D_TRACE, "gmnal_cb_send niov[%d] offset["LPSZ"] " - "len["LPSZ"] nid["LPU64"]\n", niov, offset, len, nid); + tx->tx_nid = target.nid; - if ((nid >> 32) != 0) { - CERROR("Illegal nid: "LPU64"\n", nid); - return PTL_FAIL; + gmrc = gm_global_id_to_node_id(gmni->gmni_port, LNET_NIDADDR(target.nid), + &tx->tx_gmlid); + if (gmrc != GM_SUCCESS) { + CERROR("Can't map Nid %s to a GM local ID: %d\n", + libcfs_nid2str(target.nid), gmrc); + /* NB tx_lntmsg not set => doesn't finalize */ + gmnal_tx_done(tx, -EIO); + return -EIO; } - tx = gmnal_get_tx(gmnalni, 1); - - gmnal_pack_msg(gmnalni, tx, nid, GMNAL_MSG_IMMEDIATE); - gm_bcopy(hdr, &tx->tx_msg->gmm_u.immediate.gmim_hdr, sizeof(*hdr)); - - buffer = &tx->tx_msg->gmm_u.immediate.gmim_payload[0]; - while (nobleft > 0) { - LASSERT (niov > 0); - - if (offset >= iov->iov_len) { - offset -= iov->iov_len; - } else { - nob = MIN (iov->iov_len - offset, nobleft); - - gm_bcopy(iov->iov_base + offset, buffer, nob); - - buffer += nob; - nobleft -= nob; - offset = 0; + gmnal_pack_msg(gmni, GMNAL_NETBUF_MSG(&tx->tx_buf), + target.nid, GMNAL_MSG_IMMEDIATE); + GMNAL_NETBUF_MSG(&tx->tx_buf)->gmm_u.immediate.gmim_hdr = *hdr; + tx->tx_msgnob = offsetof(gmnal_msg_t, gmm_u.immediate.gmim_payload[0]); + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto test */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + GMNAL_NETBUF_MSG(&tx->tx_buf)->gmm_version++; + the_lnet.ln_testprotocompat &= ~1; } - niov--; - iov++; - } - - nob = offsetof(gmnal_msg_t, gmm_u.immediate.gmim_payload[len]); - return gmnal_post_tx(gmnalni, tx, libmsg, nid, nob); -} - -ptl_err_t -gmnal_cb_send_pages(lib_nal_t *libnal, void *private, - lib_msg_t *libmsg, ptl_hdr_t *hdr, int type, - ptl_nid_t nid, ptl_pid_t pid, - unsigned int nkiov, ptl_kiov_t *kiov, - size_t offset, size_t len) -{ - - gmnal_ni_t *gmnalni = libnal->libnal_data; - size_t nobleft = len; - void *buffer; - gmnal_tx_t *tx; - char *ptr; - size_t nob; - - CDEBUG(D_TRACE, "gmnal_cb_send_pages nid ["LPU64"] niov[%d] offset[" - LPSZ"] len["LPSZ"]\n", nid, nkiov, offset, len); - - if ((nid >> 32) != 0) { - CERROR("Illegal nid: "LPU64"\n", nid); - return PTL_FAIL; + if ((the_lnet.ln_testprotocompat & 2) != 0) { + GMNAL_NETBUF_MSG(&tx->tx_buf)->gmm_magic = + LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); } - tx = gmnal_get_tx(gmnalni, 1); - - gmnal_pack_msg(gmnalni, tx, nid, GMNAL_MSG_IMMEDIATE); - gm_bcopy(hdr, &tx->tx_msg->gmm_u.immediate.gmim_hdr, sizeof(*hdr)); - - buffer = &tx->tx_msg->gmm_u.immediate.gmim_payload[0]; - while (nobleft > 0) { - LASSERT (nkiov > 0); - - if (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - } else { - nob = MIN (kiov->kiov_len - offset, nobleft); - - ptr = ((char *)kmap(kiov->kiov_page)) + - kiov->kiov_offset; - - 
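The ln_testprotocompat block above is a single-shot fault injector: each armed bit corrupts exactly one outgoing message (bad version for bit 0, bad magic for bit 1) and disarms itself under the lock. A toy model of the test-and-clear behaviour (illustrative only; locking omitted):

```c
#include <stdio.h>

static unsigned testproto = 1 | 2;      /* both fault bits armed */

static void send_msg(int i)
{
        const char *tweak = "none";

        /* each bit fires once, then disarms itself */
        if (testproto & 1) { testproto &= ~1; tweak = "bad version"; }
        if (testproto & 2) { testproto &= ~2; tweak = "bad magic"; }

        printf("send %d: tweak=%s\n", i, tweak);
}

int main(void)
{
        int i;

        for (i = 0; i < 3; i++)         /* later sends go out clean */
                send_msg(i);
        return 0;
}
```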
gm_bcopy(ptr + offset, buffer, nob); - - kunmap(kiov->kiov_page); + if (tx->tx_msgnob + len <= gmni->gmni_small_msgsize) { + /* whole message fits in tx_buf */ + char *buffer = &(GMNAL_NETBUF_MSG(&tx->tx_buf)->gmm_u.immediate.gmim_payload[0]); - buffer += nob; - nobleft -= nob; - offset = 0; - } - nkiov--; - kiov++; + if (iov != NULL) + lnet_copy_iov2flat(len, buffer, 0, + niov, iov, offset, len); + else + lnet_copy_kiov2flat(len, buffer, 0, + niov, kiov, offset, len); + + tx->tx_msgnob += len; + tx->tx_large_nob = 0; + } else { + /* stash payload pts to copy later */ + tx->tx_large_nob = len; + tx->tx_large_iskiov = (kiov != NULL); + tx->tx_large_niov = niov; + if (tx->tx_large_iskiov) + tx->tx_large_frags.kiov = kiov; + else + tx->tx_large_frags.iov = iov; } - nob = offsetof(gmnal_msg_t, gmm_u.immediate.gmim_payload[len]); - return gmnal_post_tx(gmnalni, tx, libmsg, nid, nob); -} + LASSERT(tx->tx_lntmsg == NULL); + tx->tx_lntmsg = lntmsg; + + spin_lock(&gmni->gmni_tx_lock); -int -gmnal_cb_dist(lib_nal_t *libnal, ptl_nid_t nid, unsigned long *dist) -{ - CDEBUG(D_TRACE, "gmnal_cb_dist\n"); + list_add_tail(&tx->tx_list, &gmni->gmni_buf_txq); + gmnal_check_txqueues_locked(gmni); - if (dist != NULL) - *dist = 1; + spin_unlock(&gmni->gmni_tx_lock); - return PTL_OK; + return 0; } diff --git a/lnet/klnds/gmlnd/gmlnd_comm.c b/lnet/klnds/gmlnd/gmlnd_comm.c index 3b4baa0..ea6a8d1 100644 --- a/lnet/klnds/gmlnd/gmlnd_comm.c +++ b/lnet/klnds/gmlnd/gmlnd_comm.c @@ -23,29 +23,47 @@ * This file contains all gmnal send and receive functions */ -#include "gmnal.h" +#include "gmlnd.h" void -gmnal_pack_msg(gmnal_ni_t *gmnalni, gmnal_tx_t *tx, - ptl_nid_t dstnid, int type) +gmnal_notify_peer_down(gmnal_tx_t *tx) { - gmnal_msg_t *msg = tx->tx_msg; + struct timeval now; + time_t then; + do_gettimeofday (&now); + then = now.tv_sec - (jiffies - tx->tx_launchtime)/HZ; + + lnet_notify(tx->tx_gmni->gmni_ni, tx->tx_nid, 0, then); +} + +void +gmnal_pack_msg(gmnal_ni_t *gmni, gmnal_msg_t *msg, + lnet_nid_t dstnid, int type) +{ /* CAVEAT EMPTOR! this only sets the common message fields. */ msg->gmm_magic = GMNAL_MSG_MAGIC; msg->gmm_version = GMNAL_MSG_VERSION; msg->gmm_type = type; - msg->gmm_srcnid = gmnalni->gmni_libnal->libnal_ni.ni_pid.nid; + msg->gmm_srcnid = lnet_ptlcompat_srcnid(gmni->gmni_ni->ni_nid, + dstnid); msg->gmm_dstnid = dstnid; } int -gmnal_unpack_msg(gmnal_ni_t *gmnalni, gmnal_rx_t *rx) +gmnal_unpack_msg(gmnal_ni_t *gmni, gmnal_rx_t *rx) { - gmnal_msg_t *msg = rx->rx_msg; + gmnal_msg_t *msg = GMNAL_NETBUF_MSG(&rx->rx_buf); const int hdr_size = offsetof(gmnal_msg_t, gmm_u); + int buffnob = rx->rx_islarge ? gmni->gmni_large_msgsize : + gmni->gmni_small_msgsize; int flip; + /* rc = 0:SUCCESS -ve:failure +ve:version mismatch */ + + /* GM may not overflow our buffer */ + LASSERT (rx->rx_recv_nob <= buffnob); + /* 6 bytes are enough to have received magic + version */ if (rx->rx_recv_nob < 6) { CERROR("Short message from gmid %u: %d\n", @@ -57,6 +75,9 @@ gmnal_unpack_msg(gmnal_ni_t *gmnalni, gmnal_rx_t *rx) flip = 0; } else if (msg->gmm_magic == __swab32(GMNAL_MSG_MAGIC)) { flip = 1; + } else if (msg->gmm_magic == LNET_PROTO_MAGIC || + msg->gmm_magic == __swab32(LNET_PROTO_MAGIC)) { + return EPROTO; } else { CERROR("Bad magic from gmid %u: %08x\n", rx->rx_recv_gmid, msg->gmm_magic); @@ -65,9 +86,7 @@ gmnal_unpack_msg(gmnal_ni_t *gmnalni, gmnal_rx_t *rx) if (msg->gmm_version != (flip ? 
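gmnal_notify_peer_down() above back-dates the peer's last-alive time: the peer was known good when the failing tx was launched, so the elapsed jiffies are converted to seconds and subtracted from the current wall clock. The arithmetic in isolation (HZ value illustrative):

```c
#include <stdio.h>
#include <time.h>

#define HZ 250                          /* illustrative tick rate */

int main(void)
{
        unsigned long jiffies    = 100000;      /* "now", in ticks */
        unsigned long launchtime =  99000;      /* cf. tx_launchtime */
        time_t now  = time(NULL);
        time_t then = now - (time_t)((jiffies - launchtime) / HZ);

        /* (100000 - 99000) / 250 = 4 seconds before "now" */
        printf("peer last seen alive %ld s ago\n", (long)(now - then));
        return 0;
}
```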
__swab16(GMNAL_MSG_VERSION) : GMNAL_MSG_VERSION)) { - CERROR("Bad version from gmid %u: %d\n", - rx->rx_recv_gmid, msg->gmm_version); - return -EPROTO; + return EPROTO; } if (rx->rx_recv_nob < hdr_size) { @@ -84,15 +103,16 @@ gmnal_unpack_msg(gmnal_ni_t *gmnalni, gmnal_rx_t *rx) __swab64s(&msg->gmm_dstnid); } - if (msg->gmm_srcnid == PTL_NID_ANY) { - CERROR("Bad src nid from %u: "LPX64"\n", - rx->rx_recv_gmid, msg->gmm_srcnid); + if (msg->gmm_srcnid == LNET_NID_ANY) { + CERROR("Bad src nid from %u: %s\n", + rx->rx_recv_gmid, libcfs_nid2str(msg->gmm_srcnid)); return -EPROTO; } - if (msg->gmm_dstnid != gmnalni->gmni_libnal->libnal_ni.ni_pid.nid) { - CERROR("Bad dst nid from %u: "LPX64"\n", - rx->rx_recv_gmid, msg->gmm_dstnid); + if (!lnet_ptlcompat_matchnid(gmni->gmni_ni->ni_nid, + msg->gmm_dstnid)) { + CERROR("Bad dst nid from %u: %s\n", + rx->rx_recv_gmid, libcfs_nid2str(msg->gmm_dstnid)); return -EPROTO; } @@ -114,211 +134,430 @@ gmnal_unpack_msg(gmnal_ni_t *gmnalni, gmnal_rx_t *rx) return 0; } - -/* - * The caretaker thread - * This is main thread of execution for the NAL side - * This guy waits in gm_blocking_recvive and gets - * woken up when the myrinet adaptor gets an interrupt. - * Hands off receive operations to the receive thread - * This thread Looks after gm_callbacks etc inline. - */ -int -gmnal_ct_thread(void *arg) +gmnal_tx_t * +gmnal_get_tx(gmnal_ni_t *gmni) { - gmnal_ni_t *gmnalni = arg; - gm_recv_event_t *rxevent = NULL; - gm_recv_t *recv = NULL; + gmnal_tx_t *tx = NULL; + + spin_lock(&gmni->gmni_tx_lock); + + if (gmni->gmni_shutdown || + list_empty(&gmni->gmni_idle_txs)) { + spin_unlock(&gmni->gmni_tx_lock); + return NULL; + } + + tx = list_entry(gmni->gmni_idle_txs.next, gmnal_tx_t, tx_list); + list_del(&tx->tx_list); - sprintf(current->comm, "gmnal_ct"); - kportal_daemonize("gmnalctd"); + spin_unlock(&gmni->gmni_tx_lock); - while(!gmnalni->gmni_thread_shutdown) { + LASSERT (tx->tx_lntmsg == NULL); + LASSERT (tx->tx_ltxb == NULL); + LASSERT (!tx->tx_credit); + + return tx; +} - spin_lock(&gmnalni->gmni_gm_lock); - rxevent = gm_blocking_receive_no_spin(gmnalni->gmni_port); - spin_unlock(&gmnalni->gmni_gm_lock); +void +gmnal_tx_done(gmnal_tx_t *tx, int rc) +{ + gmnal_ni_t *gmni = tx->tx_gmni; + int wake_sched = 0; + lnet_msg_t *lnetmsg = tx->tx_lntmsg; + + tx->tx_lntmsg = NULL; - CDEBUG(D_NET, "got [%s]\n", gmnal_rxevent2str(rxevent)); + spin_lock(&gmni->gmni_tx_lock); + + if (tx->tx_ltxb != NULL) { + wake_sched = 1; + list_add_tail(&tx->tx_ltxb->txb_list, &gmni->gmni_idle_ltxbs); + tx->tx_ltxb = NULL; + } + + if (tx->tx_credit) { + wake_sched = 1; + gmni->gmni_tx_credits++; + tx->tx_credit = 0; + } + + list_add_tail(&tx->tx_list, &gmni->gmni_idle_txs); - if (GM_RECV_EVENT_TYPE(rxevent) == GM_RECV_EVENT) { - recv = (gm_recv_t*)&rxevent->recv; - gmnal_enqueue_rx(gmnalni, recv); - continue; - } + if (wake_sched) + gmnal_check_txqueues_locked(gmni); - gm_unknown(gmnalni->gmni_port, rxevent); - } + spin_unlock(&gmni->gmni_tx_lock); - CDEBUG(D_NET, "exiting\n"); - atomic_dec(&gmnalni->gmni_nthreads); - return 0; + /* Delay finalize until tx is free */ + if (lnetmsg != NULL) + lnet_finalize(gmni->gmni_ni, lnetmsg, 0); } +void +gmnal_drop_sends_callback(struct gm_port *gm_port, void *context, + gm_status_t status) +{ + gmnal_tx_t *tx = (gmnal_tx_t*)context; -/* - * process a receive event - */ -int -gmnal_rx_thread(void *arg) + LASSERT(!in_interrupt()); + + CDEBUG(D_NET, "status for tx [%p] is [%d][%s], nid %s\n", + tx, status, gmnal_gmstatus2str(status), + 
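gmnal_unpack_msg() above infers the sender's byte order from the magic: an exact match needs no swabbing, a byte-swapped match means every multi-byte field must be flipped, LNET_PROTO_MAGIC signals a newer protocol (the positive EPROTO answered by gmnal_version_reply()), and anything else is rejected. A self-contained model of the dispatch:

```c
#include <stdio.h>
#include <stdint.h>

#define GM_MAGIC 0x6d797269u            /* 'myri' */

static uint32_t swab32(uint32_t v)      /* cf. __swab32() */
{
        return ((v & 0x000000ffu) << 24) | ((v & 0x0000ff00u) <<  8) |
               ((v & 0x00ff0000u) >>  8) | ((v & 0xff000000u) >> 24);
}

int main(void)
{
        uint32_t wire[] = { GM_MAGIC, swab32(GM_MAGIC), 0xdeadbeefu };
        int i;

        for (i = 0; i < 3; i++) {
                if (wire[i] == GM_MAGIC)
                        printf("%08x: ok, same byte order\n", wire[i]);
                else if (wire[i] == swab32(GM_MAGIC))
                        printf("%08x: ok, flip all fields\n", wire[i]);
                else
                        printf("%08x: bad magic, -EPROTO\n", wire[i]);
        }
        return 0;
}
```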
libcfs_nid2str(tx->tx_nid)); + + gmnal_tx_done(tx, -EIO); +} + +void +gmnal_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) { - gmnal_ni_t *gmnalni = arg; - char name[16]; - gmnal_rx_t *rx; - int rank; + gmnal_tx_t *tx = (gmnal_tx_t*)context; + gmnal_ni_t *gmni = tx->tx_gmni; - for (rank = 0; rank < gmnalni->gmni_nrxthreads; rank++) - if (gmnalni->gmni_rxthread_pid[rank] == current->pid) - break; + LASSERT(!in_interrupt()); - snprintf(name, sizeof(name), "gmnal_rx_%d", rank); - kportal_daemonize(name); + switch(status) { + case GM_SUCCESS: + gmnal_tx_done(tx, 0); + return; + + case GM_SEND_DROPPED: + CDEBUG(D_NETERROR, "Dropped tx %p to %s\n", + tx, libcfs_nid2str(tx->tx_nid)); + /* Another tx failed and called gm_drop_sends() which made this + * one complete immediately */ + gmnal_tx_done(tx, -EIO); + return; + + default: + /* Some error; NB don't complete tx yet; we need its credit for + * gm_drop_sends() */ + CDEBUG(D_NETERROR, "tx %p error %d(%s), nid %s\n", + tx, status, gmnal_gmstatus2str(status), + libcfs_nid2str(tx->tx_nid)); + + gmnal_notify_peer_down(tx); + + spin_lock(&gmni->gmni_gm_lock); + gm_drop_sends(gmni->gmni_port, + tx->tx_ltxb != NULL ? + GMNAL_LARGE_PRIORITY : GMNAL_SMALL_PRIORITY, + tx->tx_gmlid, *gmnal_tunables.gm_port, + gmnal_drop_sends_callback, tx); + spin_unlock(&gmni->gmni_gm_lock); + return; + } + + /* not reached */ + LBUG(); +} - while(!gmnalni->gmni_thread_shutdown) { +void +gmnal_check_txqueues_locked (gmnal_ni_t *gmni) +{ + gmnal_tx_t *tx; + gmnal_txbuf_t *ltxb; + int gmsize; + int pri; + void *netaddr; + + tx = list_empty(&gmni->gmni_buf_txq) ? NULL : + list_entry(gmni->gmni_buf_txq.next, gmnal_tx_t, tx_list); - rx = gmnal_dequeue_rx(gmnalni); - if (rx == NULL) - break; + if (tx != NULL && + (tx->tx_large_nob == 0 || + !list_empty(&gmni->gmni_idle_ltxbs))) { - /* We're connectionless: simply ignore packets on error */ + /* consume tx */ + list_del(&tx->tx_list); - if (gmnal_unpack_msg(gmnalni, rx) == 0) { - - LASSERT (rx->rx_msg->gmm_type == GMNAL_MSG_IMMEDIATE); - (void)lib_parse(gmnalni->gmni_libnal, - &rx->rx_msg->gmm_u.immediate.gmim_hdr, - rx); + LASSERT (tx->tx_ltxb == NULL); + + if (tx->tx_large_nob != 0) { + ltxb = list_entry(gmni->gmni_idle_ltxbs.next, + gmnal_txbuf_t, txb_list); + + /* consume large buffer */ + list_del(&ltxb->txb_list); + + spin_unlock(&gmni->gmni_tx_lock); + + /* Unlocking here allows sends to get re-ordered, + * but we want to allow other CPUs to progress... */ + + tx->tx_ltxb = ltxb; + + /* marshall message in tx_ltxb... + * 1. Copy what was marshalled so far (in tx_buf) */ + memcpy(GMNAL_NETBUF_MSG(&ltxb->txb_buf), + GMNAL_NETBUF_MSG(&tx->tx_buf), tx->tx_msgnob); + + /* 2.
Copy the payload */ + if (tx->tx_large_iskiov) + lnet_copy_kiov2kiov( + gmni->gmni_large_pages, + ltxb->txb_buf.nb_kiov, + tx->tx_msgnob, + tx->tx_large_niov, + tx->tx_large_frags.kiov, + tx->tx_large_offset, + tx->tx_large_nob); + else + lnet_copy_iov2kiov( + gmni->gmni_large_pages, + ltxb->txb_buf.nb_kiov, + tx->tx_msgnob, + tx->tx_large_niov, + tx->tx_large_frags.iov, + tx->tx_large_offset, + tx->tx_large_nob); + + tx->tx_msgnob += tx->tx_large_nob; + + spin_lock(&gmni->gmni_tx_lock); } - gmnal_post_rx(gmnalni, rx); - } + list_add_tail(&tx->tx_list, &gmni->gmni_cred_txq); + } - CDEBUG(D_NET, "exiting\n"); - atomic_dec(&gmnalni->gmni_nthreads); - return 0; + if (!list_empty(&gmni->gmni_cred_txq) && + gmni->gmni_tx_credits != 0) { + + tx = list_entry(gmni->gmni_cred_txq.next, gmnal_tx_t, tx_list); + + /* consume tx and 1 credit */ + list_del(&tx->tx_list); + gmni->gmni_tx_credits--; + + spin_unlock(&gmni->gmni_tx_lock); + + /* Unlocking here allows sends to get re-ordered, but we want + * to allow other CPUs to progress... */ + + LASSERT(!tx->tx_credit); + tx->tx_credit = 1; + + tx->tx_launchtime = jiffies; + + if (tx->tx_msgnob <= gmni->gmni_small_msgsize) { + LASSERT (tx->tx_ltxb == NULL); + netaddr = GMNAL_NETBUF_LOCAL_NETADDR(&tx->tx_buf); + gmsize = gmni->gmni_small_gmsize; + pri = GMNAL_SMALL_PRIORITY; + } else { + LASSERT (tx->tx_ltxb != NULL); + netaddr = GMNAL_NETBUF_LOCAL_NETADDR(&tx->tx_ltxb->txb_buf); + gmsize = gmni->gmni_large_gmsize; + pri = GMNAL_LARGE_PRIORITY; + } + + spin_lock(&gmni->gmni_gm_lock); + + gm_send_to_peer_with_callback(gmni->gmni_port, + netaddr, gmsize, + tx->tx_msgnob, + pri, + tx->tx_gmlid, + gmnal_tx_callback, + (void*)tx); + + spin_unlock(&gmni->gmni_gm_lock); + spin_lock(&gmni->gmni_tx_lock); + } } void -gmnal_post_rx(gmnal_ni_t *gmnalni, gmnal_rx_t *rx) +gmnal_post_rx(gmnal_ni_t *gmni, gmnal_rx_t *rx) { - CDEBUG(D_NET, "requeueing rx[%p] gmnalni[%p]\n", rx, gmnalni); - - spin_lock(&gmnalni->gmni_gm_lock); - gm_provide_receive_buffer_with_tag(gmnalni->gmni_port, rx->rx_msg, - rx->rx_gmsize, GM_LOW_PRIORITY, 0 ); - spin_unlock(&gmnalni->gmni_gm_lock); + int gmsize = rx->rx_islarge ? gmni->gmni_large_gmsize : + gmni->gmni_small_gmsize; + int pri = rx->rx_islarge ? GMNAL_LARGE_PRIORITY : + GMNAL_SMALL_PRIORITY; + void *buffer = GMNAL_NETBUF_LOCAL_NETADDR(&rx->rx_buf); + + CDEBUG(D_NET, "posting rx %p buf %p\n", rx, buffer); + + spin_lock(&gmni->gmni_gm_lock); + gm_provide_receive_buffer_with_tag(gmni->gmni_port, + buffer, gmsize, pri, 0); + spin_unlock(&gmni->gmni_gm_lock); } -void -gmnal_resume_sending_callback(struct gm_port *gm_port, void *context, - gm_status_t status) +void +gmnal_version_reply (gmnal_ni_t *gmni, gmnal_rx_t *rx) { - gmnal_tx_t *tx = (gmnal_tx_t*)context; - gmnal_ni_t *gmnalni = tx->tx_gmni; - lib_msg_t *libmsg = tx->tx_libmsg; + /* Future protocol version compatibility support! + * The next gmlnd-specific protocol rev will first send a message to + * check version; I reply with a stub message containing my current + * magic+version... 
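gmnal_check_txqueues_locked() above is the heart of the new flow control: a tx waits on gmni_buf_txq until any large buffer it needs is free, then on gmni_cred_txq until a GM send token is free; gmnal_tx_done() returns both resources and re-runs the check. A toy model with counters standing in for the lists (completion and refill omitted):

```c
#include <stdio.h>

int main(void)
{
        int credits = 2;                /* cf. gmni_tx_credits */
        int ltxbs   = 3;                /* idle large tx buffers */
        int i;

        for (i = 0; i < 4; i++) {       /* 4 queued large sends */
                if (ltxbs == 0) {
                        printf("tx %d: parked on buf_txq\n", i);
                        continue;
                }
                ltxbs--;                /* stage 1: claim a buffer */

                if (credits == 0) {
                        printf("tx %d: parked on cred_txq\n", i);
                        continue;
                }
                credits--;              /* stage 2: claim a credit */
                printf("tx %d: launched\n", i);
        }
        return 0;
}
```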
*/ + gmnal_msg_t *msg; + gmnal_tx_t *tx = gmnal_get_tx(gmni); + + if (tx == NULL) { + CERROR("Can't allocate tx to send version info to %u\n", + rx->rx_recv_gmid); + return; + } - CWARN("status for tx [%p] is [%d][%s]\n", - tx, status, gmnal_gmstatus2str(status)); + LASSERT (tx->tx_lntmsg == NULL); /* no finalize */ - gmnal_return_tx(gmnalni, tx); - lib_finalize(gmnalni->gmni_libnal, NULL, libmsg, PTL_FAIL); -} + tx->tx_nid = LNET_NID_ANY; + tx->tx_gmlid = rx->rx_recv_gmid; -void -gmnal_drop_sends_callback(struct gm_port *gm_port, void *context, - gm_status_t status) -{ - gmnal_tx_t *tx = (gmnal_tx_t*)context; - gmnal_ni_t *gmnalni = tx->tx_gmni; + msg = GMNAL_NETBUF_MSG(&tx->tx_buf); + msg->gmm_magic = GMNAL_MSG_MAGIC; + msg->gmm_version = GMNAL_MSG_VERSION; + + /* just send magic + version */ + tx->tx_msgnob = offsetof(gmnal_msg_t, gmm_type); + tx->tx_large_nob = 0; - CERROR("status for tx [%p] is [%d][%s]\n", - tx, status, gmnal_gmstatus2str(status)); + spin_lock(&gmni->gmni_tx_lock); - spin_lock(&gmnalni->gmni_gm_lock); - gm_resume_sending(gmnalni->gmni_port, tx->tx_gm_priority, - tx->tx_gmlid, gm_port_id, - gmnal_resume_sending_callback, tx); - spin_unlock(&gmnalni->gmni_gm_lock); + list_add_tail(&tx->tx_list, &gmni->gmni_buf_txq); + gmnal_check_txqueues_locked(gmni); + + spin_unlock(&gmni->gmni_tx_lock); } -void -gmnal_tx_callback(gm_port_t *gm_port, void *context, gm_status_t status) +int +gmnal_rx_thread(void *arg) { - gmnal_tx_t *tx = (gmnal_tx_t*)context; - gmnal_ni_t *gmnalni = tx->tx_gmni; - lib_nal_t *libnal = gmnalni->gmni_libnal; - lib_msg_t *libmsg = tx->tx_libmsg; - ptl_err_t rc; + gmnal_ni_t *gmni = arg; + gm_recv_event_t *rxevent = NULL; + gm_recv_t *recv = NULL; + gmnal_rx_t *rx; + int rc; - if (!tx) { - CERROR("send completion event for unknown tx\n"); - return; - } + cfs_daemonize("gmnal_rxd"); - switch(status) { - case(GM_SUCCESS): - rc = PTL_OK; - break; + down(&gmni->gmni_rx_mutex); - case(GM_SEND_DROPPED): - rc = PTL_FAIL; - break; + while (!gmni->gmni_shutdown) { + + spin_lock(&gmni->gmni_gm_lock); + rxevent = gm_blocking_receive_no_spin(gmni->gmni_port); + spin_unlock(&gmni->gmni_gm_lock); + + switch (GM_RECV_EVENT_TYPE(rxevent)) { + default: + gm_unknown(gmni->gmni_port, rxevent); + continue; + + case GM_FAST_RECV_EVENT: + case GM_FAST_PEER_RECV_EVENT: + case GM_PEER_RECV_EVENT: + case GM_FAST_HIGH_RECV_EVENT: + case GM_FAST_HIGH_PEER_RECV_EVENT: + case GM_HIGH_PEER_RECV_EVENT: + case GM_RECV_EVENT: + case GM_HIGH_RECV_EVENT: + break; + } + + recv = &rxevent->recv; + rx = gm_hash_find(gmni->gmni_rx_hash, + gm_ntohp(recv->buffer)); + LASSERT (rx != NULL); + + rx->rx_recv_nob = gm_ntoh_u32(recv->length); + rx->rx_recv_gmid = gm_ntoh_u16(recv->sender_node_id); + rx->rx_recv_port = gm_ntoh_u8(recv->sender_port_id); + rx->rx_recv_type = gm_ntoh_u8(recv->type); + + switch (GM_RECV_EVENT_TYPE(rxevent)) { + case GM_FAST_RECV_EVENT: + case GM_FAST_PEER_RECV_EVENT: + case GM_FAST_HIGH_RECV_EVENT: + case GM_FAST_HIGH_PEER_RECV_EVENT: + LASSERT (rx->rx_recv_nob <= PAGE_SIZE); + + memcpy(GMNAL_NETBUF_MSG(&rx->rx_buf), + gm_ntohp(recv->message), rx->rx_recv_nob); + break; + } + + up(&gmni->gmni_rx_mutex); + + CDEBUG (D_NET, "rx %p: buf %p(%p) nob %d\n", rx, + GMNAL_NETBUF_LOCAL_NETADDR(&rx->rx_buf), + gm_ntohp(recv->buffer), rx->rx_recv_nob); + + /* We're connectionless: simply drop packets with + * errors */ + rc = gmnal_unpack_msg(gmni, rx); + + if (rc == 0) { + gmnal_msg_t *msg = GMNAL_NETBUF_MSG(&rx->rx_buf); - default: - CERROR("Error %d(%s), nid "LPD64"\n", - status, 
gmnal_gmstatus2str(status), tx->tx_nid); + LASSERT (msg->gmm_type == GMNAL_MSG_IMMEDIATE); + rc = lnet_parse(gmni->gmni_ni, + &msg->gmm_u.immediate.gmim_hdr, + msg->gmm_srcnid, + rx, 0); + } else if (rc > 0) { + gmnal_version_reply(gmni, rx); + rc = -EPROTO; /* repost rx */ + } - spin_lock(&gmnalni->gmni_gm_lock); - gm_drop_sends(gmnalni->gmni_port, tx->tx_gm_priority, - tx->tx_gmlid, gm_port_id, - gmnal_drop_sends_callback, tx); - spin_unlock(&gmnalni->gmni_gm_lock); - return; + if (rc < 0) /* parse failure */ + gmnal_post_rx(gmni, rx); + + down(&gmni->gmni_rx_mutex); } - gmnal_return_tx(gmnalni, tx); - lib_finalize(libnal, NULL, libmsg, rc); - return; + up(&gmni->gmni_rx_mutex); + + CDEBUG(D_NET, "exiting\n"); + atomic_dec(&gmni->gmni_nthreads); + return 0; } -ptl_err_t -gmnal_post_tx (gmnal_ni_t *gmnalni, gmnal_tx_t *tx, - lib_msg_t *libmsg, ptl_nid_t nid, int nob) +void +gmnal_stop_threads(gmnal_ni_t *gmni) { - gm_status_t gm_status; + int count = 2; - CDEBUG(D_NET, "send %d bytes to "LPU64"\n", nob, nid); + gmni->gmni_shutdown = 1; + mb(); + + /* wake rxthread owning gmni_rx_mutex with an alarm. */ + spin_lock(&gmni->gmni_gm_lock); + gm_set_alarm(gmni->gmni_port, &gmni->gmni_alarm, 0, NULL, NULL); + spin_unlock(&gmni->gmni_gm_lock); + + while (atomic_read(&gmni->gmni_nthreads) != 0) { + count++; + if ((count & (count - 1)) == 0) + CWARN("Waiting for %d threads to stop\n", + atomic_read(&gmni->gmni_nthreads)); + gmnal_yield(1); + } +} - LASSERT ((nid >> 32) == 0); +int +gmnal_start_threads(gmnal_ni_t *gmni) +{ + int i; + int pid; - gm_status = gm_global_id_to_node_id(gmnalni->gmni_port, (__u32)nid, - &tx->tx_gmlid); - if (gm_status != GM_SUCCESS) { - CERROR("Failed to obtain local id\n"); - gmnal_return_tx(gmnalni, tx); - return PTL_FAIL; - } + LASSERT (!gmni->gmni_shutdown); + LASSERT (atomic_read(&gmni->gmni_nthreads) == 0); - CDEBUG(D_NET, "Local Node_id is [%u][%x]\n", - tx->tx_gmlid, tx->tx_gmlid); + gm_initialize_alarm(&gmni->gmni_alarm); - tx->tx_nid = nid; - tx->tx_libmsg = libmsg; - tx->tx_gm_priority = GM_LOW_PRIORITY; - tx->tx_msg_size = nob; + for (i = 0; i < num_online_cpus(); i++) { - CDEBUG(D_NET, "Calling gm_send_to_peer port [%p] buffer [%p] " - "gmsize [%lu] msize [%d] nid ["LPU64"] local_gmid[%d] " - "tx [%p]\n", gmnalni->gmni_port, tx->tx_msg, - tx->tx_gm_size, tx->tx_msg_size, - tx->tx_nid, tx->tx_gmlid, tx); + pid = kernel_thread(gmnal_rx_thread, (void*)gmni, 0); + if (pid < 0) { + CERROR("rx thread failed to start: %d\n", pid); + gmnal_stop_threads(gmni); + return pid; + } - spin_lock(&gmnalni->gmni_gm_lock); - gm_send_to_peer_with_callback(gmnalni->gmni_port, tx->tx_msg, - tx->tx_gm_size, tx->tx_msg_size, - tx->tx_gm_priority, tx->tx_gmlid, - gmnal_tx_callback, (void*)tx); - spin_unlock(&gmnalni->gmni_gm_lock); + atomic_inc(&gmni->gmni_nthreads); + } - return PTL_OK; + return 0; } diff --git a/lnet/klnds/gmlnd/gmlnd_module.c b/lnet/klnds/gmlnd/gmlnd_module.c index 449c331..114a286 100644 --- a/lnet/klnds/gmlnd/gmlnd_module.c +++ b/lnet/klnds/gmlnd/gmlnd_module.c @@ -19,11 +19,71 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
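The shutdown loop above rate-limits its "still waiting" warning with a classic trick: (count & (count - 1)) == 0 holds only when count is a power of two, so the warnings come exponentially less often. Demonstrated standalone:

```c
#include <stdio.h>

int main(void)
{
        int count;

        /* the driver tests the counter after incrementing from 2 */
        for (count = 3; count <= 64; count++)
                if ((count & (count - 1)) == 0)
                        printf("warn at poll %d\n", count);  /* 4 8 16... */
        return 0;
}
```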
*/ -#include "gmnal.h" - - -int num_txds = 5; -int gm_port_id = 4; +#include "gmlnd.h" + + +static int port = 4; +CFS_MODULE_PARM(port, "i", int, 0444, + "GM port to use for communications"); + +static int ntx = 256; +CFS_MODULE_PARM(ntx, "i", int, 0444, + "# tx descriptors"); + +static int credits = 128; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = 8; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, + "# concurrent sends per peer"); + +static int nlarge_tx_bufs = 32; +CFS_MODULE_PARM(nlarge_tx_bufs, "i", int, 0444, + "# large tx message buffers"); + +static int nrx_small = 128; +CFS_MODULE_PARM(nrx_small, "i", int, 0444, + "# small rx message buffers"); + +static int nrx_large = 64; +CFS_MODULE_PARM(nrx_large, "i", int, 0444, + "# large rx message buffers"); + +gmnal_tunables_t gmnal_tunables = { + .gm_port = &port, + .gm_ntx = &ntx, + .gm_credits = &credits, + .gm_peer_credits = &peer_credits, + .gm_nlarge_tx_bufs = &nlarge_tx_bufs, + .gm_nrx_small = &nrx_small, + .gm_nrx_large = &nrx_large, +}; + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM +static ctl_table gmnal_ctl_table[] = { + {1, "port", &port, + sizeof (int), 0444, NULL, &proc_dointvec}, + {2, "ntx", &ntx, + sizeof (int), 0444, NULL, &proc_dointvec}, + {3, "credits", &credits, + sizeof (int), 0444, NULL, &proc_dointvec}, + {4, "peer_credits", &peer_credits, + sizeof (int), 0444, NULL, &proc_dointvec}, + {5, "nlarge_tx_bufs", &nlarge_tx_bufs, + sizeof (int), 0444, NULL, &proc_dointvec}, + {6, "nrx_small", &nrx_small, + sizeof (int), 0444, NULL, &proc_dointvec}, + {7, "nrx_large", &nrx_large, + sizeof (int), 0444, NULL, &proc_dointvec}, + {0} +}; + +static ctl_table gmnal_top_ctl_table[] = { + {207, "gmnal", NULL, 0, 0555, gmnal_ctl_table}, + {0} +}; +#endif static int __init gmnal_load(void) @@ -31,10 +91,16 @@ gmnal_load(void) int status; CDEBUG(D_TRACE, "This is the gmnal module initialisation routine\n"); - +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + gmnal_tunables.gm_sysctl = + register_sysctl_table(gmnal_top_ctl_table, 0); + + if (gmnal_tunables.gm_sysctl == NULL) + CWARN("Can't setup /proc tunables\n"); +#endif CDEBUG(D_NET, "Calling gmnal_init\n"); status = gmnal_init(); - if (status == PTL_OK) { + if (status == 0) { CDEBUG(D_NET, "Portals GMNAL initialised ok\n"); } else { CDEBUG(D_NET, "Portals GMNAL Failed to initialise\n"); @@ -46,24 +112,19 @@ gmnal_load(void) return(0); } - static void __exit gmnal_unload(void) { gmnal_fini(); - return; +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + if (gmnal_tunables.gm_sysctl != NULL) + unregister_sysctl_table(gmnal_tunables.gm_sysctl); +#endif } - module_init(gmnal_load); module_exit(gmnal_unload); -MODULE_PARM(num_rx_threads, "i"); -MODULE_PARM(num_txds, "i"); -MODULE_PARM(gm_port_id, "i"); - -MODULE_AUTHOR("Morgan Doyle"); - -MODULE_DESCRIPTION("A Portals kernel NAL for Myrinet GM."); - +MODULE_AUTHOR("Cluster File Systems, Inc. "); +MODULE_DESCRIPTION("Kernel GM LND v1.01"); MODULE_LICENSE("GPL"); diff --git a/lnet/klnds/gmlnd/gmlnd_utils.c b/lnet/klnds/gmlnd/gmlnd_utils.c index 00bedf5..9810731 100644 --- a/lnet/klnds/gmlnd/gmlnd_utils.c +++ b/lnet/klnds/gmlnd/gmlnd_utils.c @@ -18,407 +18,310 @@ * along with Lustre; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ -/* - * All utilities required by lgmanl - */ -#include "gmnal.h" +#include "gmlnd.h" + +void +gmnal_free_netbuf_pages (gmnal_netbuf_t *nb, int npages) +{ + int i; + + for (i = 0; i < npages; i++) + __free_page(nb->nb_kiov[i].kiov_page); +} -/* - * Am I one of the gmnal rxthreads ? - */ int -gmnal_is_rxthread(gmnal_ni_t *gmnalni) +gmnal_alloc_netbuf_pages (gmnal_ni_t *gmni, gmnal_netbuf_t *nb, int npages) { - int i; + int i; + gm_status_t gmrc; - for (i = 0; i < gmnalni->gmni_nrxthreads; i++) - if (gmnalni->gmni_rxthread_pid[i] == current->pid) - return 1; - return 0; + LASSERT (npages > 0); + + for (i = 0; i < npages; i++) { + + nb->nb_kiov[i].kiov_page = alloc_page(GFP_KERNEL); + nb->nb_kiov[i].kiov_offset = 0; + nb->nb_kiov[i].kiov_len = PAGE_SIZE; + + if (nb->nb_kiov[i].kiov_page == NULL) { + CERROR("Can't allocate page\n"); + gmnal_free_netbuf_pages(nb, i); + return -ENOMEM; + } + + CDEBUG(D_NET,"[%3d] page %p, phys "LPX64", @ "LPX64"\n", + i, nb->nb_kiov[i].kiov_page, + lnet_page2phys(nb->nb_kiov[i].kiov_page), + gmni->gmni_netaddr_base); + + gmrc = gm_register_memory_ex_phys( + gmni->gmni_port, + lnet_page2phys(nb->nb_kiov[i].kiov_page), + PAGE_SIZE, + gmni->gmni_netaddr_base); + CDEBUG(D_NET,"[%3d] page %p: %d\n", + i, nb->nb_kiov[i].kiov_page, gmrc); + + if (gmrc != GM_SUCCESS) { + CERROR("Can't map page: %d(%s)\n", gmrc, + gmnal_gmstatus2str(gmrc)); + gmnal_free_netbuf_pages(nb, i+1); + return -ENOMEM; + } + + if (i == 0) + nb->nb_netaddr = gmni->gmni_netaddr_base; + + gmni->gmni_netaddr_base += PAGE_SIZE; + } + + return 0; } -gmnal_tx_t * -gmnal_alloc_tx (gmnal_ni_t *gmnalni) +void +gmnal_free_ltxbuf (gmnal_ni_t *gmni, gmnal_txbuf_t *txb) { - gmnal_tx_t *tx; - void *buffer; + int npages = gmni->gmni_large_pages; + + LASSERT (gmni->gmni_port == NULL); + /* No unmapping; the port has been closed */ + + gmnal_free_netbuf_pages(&txb->txb_buf, gmni->gmni_large_pages); + LIBCFS_FREE(txb, offsetof(gmnal_txbuf_t, txb_buf.nb_kiov[npages])); +} + +int +gmnal_alloc_ltxbuf (gmnal_ni_t *gmni) +{ + int npages = gmni->gmni_large_pages; + int sz = offsetof(gmnal_txbuf_t, txb_buf.nb_kiov[npages]); + gmnal_txbuf_t *txb; + int rc; - PORTAL_ALLOC(tx, sizeof(*tx)); - if (tx == NULL) { - CERROR ("Failed to allocate tx\n"); - return NULL; + LIBCFS_ALLOC(txb, sz); + if (txb == NULL) { + CERROR("Can't allocate large txbuffer\n"); + return -ENOMEM; } - - buffer = gm_dma_malloc(gmnalni->gmni_port, gmnalni->gmni_msg_size); - if (buffer == NULL) { - CERROR("Failed to gm_dma_malloc tx buffer size [%d]\n", - gmnalni->gmni_msg_size); - PORTAL_FREE(tx, sizeof(*tx)); - return NULL; + + rc = gmnal_alloc_netbuf_pages(gmni, &txb->txb_buf, npages); + if (rc != 0) { + LIBCFS_FREE(txb, sz); + return rc; } - memset(tx, 0, sizeof(*tx)); - tx->tx_msg = (gmnal_msg_t *)buffer; - tx->tx_buffer_size = gmnalni->gmni_msg_size; - tx->tx_gm_size = gm_min_size_for_length(tx->tx_buffer_size); - tx->tx_gmni = gmnalni; + list_add_tail(&txb->txb_list, &gmni->gmni_idle_ltxbs); - CDEBUG(D_NET, "Created tx [%p] with buffer [%p], size [%d]\n", - tx, tx->tx_msg, tx->tx_buffer_size); + txb->txb_next = gmni->gmni_ltxbs; + gmni->gmni_ltxbs = txb; - return tx; + return 0; } void gmnal_free_tx (gmnal_tx_t *tx) { - gmnal_ni_t *gmnalni = tx->tx_gmni; - - CDEBUG(D_NET, "Freeing tx [%p] with buffer [%p], size [%d]\n", - tx, tx->tx_msg, tx->tx_buffer_size); -#if 0 - /* We free buffers after we've closed the GM port */ - gm_dma_free(gmnalni->gmni_port, tx->tx_msg); -#endif - PORTAL_FREE(tx, sizeof(*tx)); + LASSERT (tx->tx_gmni->gmni_port == 
NULL); + + gmnal_free_netbuf_pages(&tx->tx_buf, 1); + LIBCFS_FREE(tx, sizeof(*tx)); } int -gmnal_alloc_txs(gmnal_ni_t *gmnalni) +gmnal_alloc_tx (gmnal_ni_t *gmni) { - int ntxcred = gm_num_send_tokens(gmnalni->gmni_port); - int ntx; - int nrxt_tx; - int i; - gmnal_tx_t *tx; - - CWARN("ntxcred: %d\n", ntxcred); - - ntx = num_txds; - nrxt_tx = num_txds + 1; - - if (ntx + nrxt_tx > ntxcred) { - CERROR ("Asked for %d + %d tx credits, but only %d available\n", - ntx, nrxt_tx, ntxcred); + gmnal_tx_t *tx; + int rc; + + LIBCFS_ALLOC(tx, sizeof(*tx)); + if (tx == NULL) { + CERROR("Failed to allocate tx\n"); return -ENOMEM; } - /* A semaphore is initialised with the number of transmit tokens - * available. To get a stxd, acquire the token semaphore. this - * decrements the available token count (if no tokens you block here, - * someone returning a stxd will release the semaphore and wake you) - * When token is obtained acquire the spinlock to manipulate the - * list */ - sema_init(&gmnalni->gmni_tx_token, ntx); - spin_lock_init(&gmnalni->gmni_tx_lock); - LASSERT (gmnalni->gmni_tx == NULL); - - for (i = 0; i <= ntx; i++) { - tx = gmnal_alloc_tx(gmnalni); - if (tx == NULL) { - CERROR("Failed to create tx %d\n", i); - return -ENOMEM; - } - - tx->tx_rxt = 0; - tx->tx_next = gmnalni->gmni_tx; - gmnalni->gmni_tx = tx; - } - - sema_init(&gmnalni->gmni_rxt_tx_token, nrxt_tx); - spin_lock_init(&gmnalni->gmni_rxt_tx_lock); - LASSERT (gmnalni->gmni_rxt_tx == NULL); + memset(tx, 0, sizeof(*tx)); - for (i = 0; i <= nrxt_tx; i++) { - tx = gmnal_alloc_tx(gmnalni); - if (tx == NULL) { - CERROR("Failed to create tx %d + %d\n", ntx, i); - return -ENOMEM; - } + rc = gmnal_alloc_netbuf_pages(gmni, &tx->tx_buf, 1); + if (rc != 0) { + LIBCFS_FREE(tx, sizeof(*tx)); + return -ENOMEM; + } - tx->tx_rxt = 1; - tx->tx_next = gmnalni->gmni_rxt_tx; - gmnalni->gmni_rxt_tx = tx; - } + tx->tx_gmni = gmni; + + list_add_tail(&tx->tx_list, &gmni->gmni_idle_txs); - return 0; + tx->tx_next = gmni->gmni_txs; + gmni->gmni_txs = tx; + + return 0; } void -gmnal_free_txs(gmnal_ni_t *gmnalni) +gmnal_free_rx(gmnal_ni_t *gmni, gmnal_rx_t *rx) { - gmnal_tx_t *tx; - - while ((tx = gmnalni->gmni_tx) != NULL) { - gmnalni->gmni_tx = tx->tx_next; - gmnal_free_tx (tx); - } + int npages = rx->rx_islarge ? gmni->gmni_large_pages : 1; + + LASSERT (gmni->gmni_port == NULL); - while ((tx = gmnalni->gmni_rxt_tx) != NULL) { - gmnalni->gmni_rxt_tx = tx->tx_next; - gmnal_free_tx (tx); - } + gmnal_free_netbuf_pages(&rx->rx_buf, npages); + LIBCFS_FREE(rx, offsetof(gmnal_rx_t, rx_buf.nb_kiov[npages])); } - -/* - * Get a tx from the list - * This get us a wired and gm_registered small tx buffer. - * This implicitly gets us a send token also. - */ -gmnal_tx_t * -gmnal_get_tx(gmnal_ni_t *gmnalni, int block) +int +gmnal_alloc_rx (gmnal_ni_t *gmni, int islarge) { + int npages = islarge ? 
gmni->gmni_large_pages : 1; + int sz = offsetof(gmnal_rx_t, rx_buf.nb_kiov[npages]); + int rc; + gmnal_rx_t *rx; + gm_status_t gmrc; + + LIBCFS_ALLOC(rx, sz); + if (rx == NULL) { + CERROR("Failed to allocate rx\n"); + return -ENOMEM; + } + + memset(rx, 0, sizeof(*rx)); - gmnal_tx_t *tx = NULL; - pid_t pid = current->pid; - - - CDEBUG(D_TRACE, "gmnal_get_tx gmnalni [%p] block[%d] pid [%d]\n", - gmnalni, block, pid); - - if (gmnal_is_rxthread(gmnalni)) { - CDEBUG(D_NET, "RXTHREAD Attempting to get token\n"); - down(&gmnalni->gmni_rxt_tx_token); - spin_lock(&gmnalni->gmni_rxt_tx_lock); - tx = gmnalni->gmni_rxt_tx; - gmnalni->gmni_rxt_tx = tx->tx_next; - spin_unlock(&gmnalni->gmni_rxt_tx_lock); - CDEBUG(D_NET, "RXTHREAD got [%p], head is [%p]\n", - tx, gmnalni->gmni_rxt_tx); - tx->tx_rxt = 1; - } else { - if (block) { - CDEBUG(D_NET, "Attempting to get token\n"); - down(&gmnalni->gmni_tx_token); - CDEBUG(D_PORTALS, "Got token\n"); - } else { - if (down_trylock(&gmnalni->gmni_tx_token)) { - CERROR("can't get token\n"); - return(NULL); - } - } - spin_lock(&gmnalni->gmni_tx_lock); - tx = gmnalni->gmni_tx; - gmnalni->gmni_tx = tx->tx_next; - spin_unlock(&gmnalni->gmni_tx_lock); - CDEBUG(D_NET, "got [%p], head is [%p]\n", tx, - gmnalni->gmni_tx); - } /* general tx get */ - - return tx; + rc = gmnal_alloc_netbuf_pages(gmni, &rx->rx_buf, npages); + if (rc != 0) { + LIBCFS_FREE(rx, sz); + return rc; + } + + rx->rx_islarge = islarge; + rx->rx_next = gmni->gmni_rxs; + gmni->gmni_rxs = rx; + + gmrc = gm_hash_insert(gmni->gmni_rx_hash, + GMNAL_NETBUF_LOCAL_NETADDR(&rx->rx_buf), rx); + if (gmrc != GM_SUCCESS) { + CERROR("Couldn't add rx to hash table: %d\n", gmrc); + return -ENOMEM; + } + + return 0; } -/* - * Return a tx to the list - */ void -gmnal_return_tx(gmnal_ni_t *gmnalni, gmnal_tx_t *tx) +gmnal_free_ltxbufs (gmnal_ni_t *gmni) { - CDEBUG(D_TRACE, "gmnalni [%p], tx[%p] rxt[%d]\n", gmnalni, - tx, tx->tx_rxt); - - /* - * this transmit descriptor is - * for the rxthread - */ - if (tx->tx_rxt) { - spin_lock(&gmnalni->gmni_rxt_tx_lock); - tx->tx_next = gmnalni->gmni_rxt_tx; - gmnalni->gmni_rxt_tx = tx; - spin_unlock(&gmnalni->gmni_rxt_tx_lock); - up(&gmnalni->gmni_rxt_tx_token); - CDEBUG(D_NET, "Returned tx to rxthread list\n"); - } else { - spin_lock(&gmnalni->gmni_tx_lock); - tx->tx_next = gmnalni->gmni_tx; - gmnalni->gmni_tx = tx; - spin_unlock(&gmnalni->gmni_tx_lock); - up(&gmnalni->gmni_tx_token); - CDEBUG(D_NET, "Returned tx to general list\n"); + gmnal_txbuf_t *txb; + + while ((txb = gmni->gmni_ltxbs) != NULL) { + gmni->gmni_ltxbs = txb->txb_next; + gmnal_free_ltxbuf(gmni, txb); } - return; } - -/* - * allocate a number of small rx buffers and register with GM - * so they are wired and set up for DMA. This is a costly operation. - * Also allocate a corrosponding descriptor to keep track of - * the buffer. - * Put all descriptors on singly linked list to be available to - * receive thread. 
- */ int -gmnal_alloc_rxs (gmnal_ni_t *gmnalni) +gmnal_alloc_ltxbufs (gmnal_ni_t *gmni) { - int nrxcred = gm_num_receive_tokens(gmnalni->gmni_port); - int nrx; - int i; - gmnal_rx_t *rxd; - void *rxbuffer; - - CWARN("nrxcred: %d\n", nrxcred); + int nlarge_tx_bufs = *gmnal_tunables.gm_nlarge_tx_bufs; + int i; + int rc; - nrx = num_txds*2 + 2; - if (nrx > nrxcred) { - CERROR("Can't allocate %d rx credits: (%d available)\n", - nrx, nrxcred); - return -ENOMEM; + for (i = 0; i < nlarge_tx_bufs; i++) { + rc = gmnal_alloc_ltxbuf(gmni); + + if (rc != 0) + return rc; } - CDEBUG(D_NET, "Allocated [%d] receive tokens to small messages\n", nrx); + return 0; +} - gmnalni->gmni_rx_hash = gm_create_hash(gm_hash_compare_ptrs, - gm_hash_hash_ptr, 0, 0, nrx, 0); - if (gmnalni->gmni_rx_hash == NULL) { - CERROR("Failed to create hash table\n"); - return -ENOMEM; - } +void +gmnal_free_txs(gmnal_ni_t *gmni) +{ + gmnal_tx_t *tx; - LASSERT (gmnalni->gmni_rx == NULL); - - for (i=0; i <= nrx; i++) { - - PORTAL_ALLOC(rxd, sizeof(*rxd)); - if (rxd == NULL) { - CERROR("Failed to malloc rxd [%d]\n", i); - return -ENOMEM; - } - - rxbuffer = gm_dma_malloc(gmnalni->gmni_port, - gmnalni->gmni_msg_size); - if (rxbuffer == NULL) { - CERROR("Failed to gm_dma_malloc rxbuffer [%d], " - "size [%d]\n",i ,gmnalni->gmni_msg_size); - PORTAL_FREE(rxd, sizeof(*rxd)); - return -ENOMEM; - } - - rxd->rx_msg = (gmnal_msg_t *)rxbuffer; - rxd->rx_size = gmnalni->gmni_msg_size; - rxd->rx_gmsize = gm_min_size_for_length(rxd->rx_size); - - rxd->rx_next = gmnalni->gmni_rx; - gmnalni->gmni_rx = rxd; - - if (gm_hash_insert(gmnalni->gmni_rx_hash, - (void*)rxbuffer, (void*)rxd)) { - CERROR("failed to create hash entry rxd[%p] " - "for rxbuffer[%p]\n", rxd, rxbuffer); - return -ENOMEM; - } - - CDEBUG(D_NET, "Registered rxd [%p] with buffer [%p], " - "size [%d]\n", rxd, rxd->rx_msg, rxd->rx_size); + while ((tx = gmni->gmni_txs) != NULL) { + gmni->gmni_txs = tx->tx_next; + gmnal_free_tx (tx); } - - return 0; } -void -gmnal_free_rxs(gmnal_ni_t *gmnalni) +int +gmnal_alloc_txs(gmnal_ni_t *gmni) { - gmnal_rx_t *rx; - - CDEBUG(D_TRACE, "gmnal_free_small rx\n"); + int ntxcred = gm_num_send_tokens(gmni->gmni_port); + int ntx = *gmnal_tunables.gm_ntx; + int i; + int rc; - while ((rx = gmnalni->gmni_rx) != NULL) { - gmnalni->gmni_rx = rx->rx_next; + CDEBUG(D_NET, "ntxcred: %d\n", ntxcred); + gmni->gmni_tx_credits = ntxcred; - CDEBUG(D_NET, "Freeing rxd [%p] buffer [%p], size [%d]\n", - rx, rx->rx_msg, rx->rx_size); -#if 0 - /* We free buffers after we've shutdown the GM port */ - gm_dma_free(gmnalni->gmni_port, _rxd->rx_msg); -#endif - PORTAL_FREE(rx, sizeof(*rx)); - } + for (i = 0; i < ntx; i++) { + rc = gmnal_alloc_tx(gmni); + if (rc != 0) + return rc; + } -#if 0 - /* see above */ - if (gmnalni->gmni_rx_hash != NULL) - gm_destroy_hash(gmnalni->gmni_rx_hash); -#endif + return 0; } void -gmnal_stop_threads(gmnal_ni_t *gmnalni) +gmnal_free_rxs(gmnal_ni_t *gmni) { - int count = 2; - int i; + gmnal_rx_t *rx; - gmnalni->gmni_thread_shutdown = 1; + while ((rx = gmni->gmni_rxs) != NULL) { + gmni->gmni_rxs = rx->rx_next; - /* wake ctthread with an alarm */ - spin_lock(&gmnalni->gmni_gm_lock); - gm_set_alarm(gmnalni->gmni_port, &gmnalni->gmni_ctthread_alarm, - 0, NULL, NULL); - spin_unlock(&gmnalni->gmni_gm_lock); + gmnal_free_rx(gmni, rx); + } - /* wake each rxthread */ - for (i = 0; i < num_online_cpus(); i++) - up(&gmnalni->gmni_rxq_wait); - - while (atomic_read(&gmnalni->gmni_nthreads) != 0) { - count++; - if ((count & (count - 1)) == 0) - CWARN("Waiting for %d 
threads to stop\n", - atomic_read(&gmnalni->gmni_nthreads)); - gmnal_yield(1); - } + LASSERT (gmni->gmni_port == NULL); +#if 0 + /* GM releases all resources allocated to a port when it closes */ + if (gmni->gmni_rx_hash != NULL) + gm_destroy_hash(gmni->gmni_rx_hash); +#endif } -/* - * Start the caretaker thread and a number of receiver threads - * The caretaker thread gets events from the gm library. - * It passes receive events to the receiver threads via a work list. - * It processes other events itself in gm_unknown. These will be - * callback events or sleeps. - */ int -gmnal_start_threads(gmnal_ni_t *gmnalni) +gmnal_alloc_rxs (gmnal_ni_t *gmni) { - int i; - int pid; - - gmnalni->gmni_thread_shutdown = 0; - gmnalni->gmni_nrxthreads = 0; - atomic_set(&gmnalni->gmni_nthreads, 0); - - INIT_LIST_HEAD(&gmnalni->gmni_rxq); - spin_lock_init(&gmnalni->gmni_rxq_lock); - sema_init(&gmnalni->gmni_rxq_wait, 0); - - /* - * the alarm is used to wake the caretaker thread from - * gm_unknown call (sleeping) to exit it. - */ - CDEBUG(D_NET, "Initializing caretaker thread alarm and flag\n"); - gm_initialize_alarm(&gmnalni->gmni_ctthread_alarm); - - pid = kernel_thread(gmnal_ct_thread, (void*)gmnalni, 0); - if (pid < 0) { - CERROR("Caretaker thread failed to start: %d\n", pid); - return pid; - } - atomic_inc(&gmnalni->gmni_nthreads); - - for (i = 0; i < num_online_cpus(); i++) { + int nrxcred = gm_num_receive_tokens(gmni->gmni_port); + int nrx_small = *gmnal_tunables.gm_nrx_small; + int nrx_large = *gmnal_tunables.gm_nrx_large; + int nrx = nrx_large + nrx_small; + int rc; + int i; - pid = kernel_thread(gmnal_rx_thread, (void*)gmnalni, 0); - if (pid < 0) { - CERROR("rx thread failed to start: %d\n", pid); - gmnal_stop_threads(gmnalni); - return pid; - } + CDEBUG(D_NET, "nrxcred: %d(%dL+%dS)\n", nrxcred, nrx_large, nrx_small); - atomic_inc(&gmnalni->gmni_nthreads); - gmnalni->gmni_rxthread_pid[i] = pid; - gmnalni->gmni_nrxthreads++; + if (nrx > nrxcred) { + int nlarge = (nrx_large * nrxcred)/nrx; + int nsmall = nrxcred - nlarge; + + CWARN("Only %d rx credits: " + "reducing large %d->%d, small %d->%d\n", nrxcred, + nrx_large, nlarge, nrx_small, nsmall); + + *gmnal_tunables.gm_nrx_large = nrx_large = nlarge; + *gmnal_tunables.gm_nrx_small = nrx_small = nsmall; + nrx = nlarge + nsmall; + } + + gmni->gmni_rx_hash = gm_create_hash(gm_hash_compare_ptrs, + gm_hash_hash_ptr, 0, 0, nrx, 0); + if (gmni->gmni_rx_hash == NULL) { + CERROR("Failed to create hash table\n"); + return -ENOMEM; } + for (i = 0; i < nrx; i++ ) { + rc = gmnal_alloc_rx(gmni, i < nrx_large); + if (rc != 0) + return rc; + } + return 0; } @@ -674,62 +577,3 @@ gmnal_yield(int delay) set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(delay); } - -int -gmnal_enqueue_rx(gmnal_ni_t *gmnalni, gm_recv_t *recv) -{ - void *ptr = gm_ntohp(recv->buffer); - gmnal_rx_t *rx = gm_hash_find(gmnalni->gmni_rx_hash, ptr); - - /* No locking; hash is read-only */ - - LASSERT (rx != NULL); - LASSERT (rx->rx_msg == (gmnal_msg_t *)ptr); - - rx->rx_recv_nob = gm_ntohl(recv->length); - rx->rx_recv_gmid = gm_ntoh_u16(recv->sender_node_id); - rx->rx_recv_port = gm_ntoh_u8(recv->sender_port_id); - rx->rx_recv_type = gm_ntoh_u8(recv->type); - - spin_lock(&gmnalni->gmni_rxq_lock); - list_add_tail (&rx->rx_list, &gmnalni->gmni_rxq); - spin_unlock(&gmnalni->gmni_rxq_lock); - - up(&gmnalni->gmni_rxq_wait); - return 0; -} - -gmnal_rx_t * -gmnal_dequeue_rx(gmnal_ni_t *gmnalni) -{ - gmnal_rx_t *rx; - - CDEBUG(D_NET, "Getting entry to list\n"); - - for (;;) { - 
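The rescaling in gmnal_alloc_rxs() above is worth spelling out: when nrx_large + nrx_small exceeds the available GM receive tokens, both pools are shrunk in proportion and the clamped values are written back through the tunables. A standalone sketch of the arithmetic (the pool sizes here are illustrative, not the module defaults):

    #include <stdio.h>

    /* Scale two buffer pools down to a credit limit, preserving their
     * ratio; the remainder goes to the small pool, as in gmnal_alloc_rxs(). */
    static void scale_pools(int nlarge_req, int nsmall_req, int ncred,
                            int *nlarge, int *nsmall)
    {
            int ntotal = nlarge_req + nsmall_req;

            if (ntotal <= ncred) {
                    *nlarge = nlarge_req;
                    *nsmall = nsmall_req;
                    return;
            }
            /* integer arithmetic rounds the large pool down... */
            *nlarge = (nlarge_req * ncred) / ntotal;
            /* ...and the small pool absorbs the slack */
            *nsmall = ncred - *nlarge;
    }

    int main(void)
    {
            int nl, ns;

            /* e.g. 64 large + 448 small requested, only 256 credits */
            scale_pools(64, 448, 256, &nl, &ns);
            printf("large %d, small %d (total %d)\n", nl, ns, nl + ns);
            return 0;   /* prints: large 32, small 224 (total 256) */
    }

Computing nsmall as the remainder (rather than scaling it independently) guarantees the two pools sum to exactly ncred despite integer truncation.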
while(down_interruptible(&gmnalni->gmni_rxq_wait) != 0) - /* do nothing */; - - if (gmnalni->gmni_thread_shutdown) - return NULL; - - spin_lock(&gmnalni->gmni_rxq_lock); - - if (list_empty(&gmnalni->gmni_rxq)) { - rx = NULL; - } else { - rx = list_entry(gmnalni->gmni_rxq.next, - gmnal_rx_t, rx_list); - list_del(&rx->rx_list); - } - - spin_unlock(&gmnalni->gmni_rxq_lock); - - if (rx != NULL) - return rx; - - CWARN("woken but no work\n"); - } -} - - diff --git a/lnet/klnds/iiblnd/Makefile.in b/lnet/klnds/iiblnd/Makefile.in index e7934e2..7ee9b64 100644 --- a/lnet/klnds/iiblnd/Makefile.in +++ b/lnet/klnds/iiblnd/Makefile.in @@ -1,5 +1,5 @@ -MODULES := kiibnal -kiibnal-objs := iibnal.o iibnal_cb.o +MODULES := kiiblnd +kiiblnd-objs := iiblnd.o iiblnd_cb.o iiblnd_modparams.o EXTRA_POST_CFLAGS := @IIBCPPFLAGS@ diff --git a/lnet/klnds/iiblnd/autoMakefile.am b/lnet/klnds/iiblnd/autoMakefile.am index d61ffe7..d08d079 100644 --- a/lnet/klnds/iiblnd/autoMakefile.am +++ b/lnet/klnds/iiblnd/autoMakefile.am @@ -4,12 +4,10 @@ # See the file COPYING in this distribution if MODULES -if !CRAY_PORTALS -if BUILD_IIBNAL -modulenet_DATA = kiibnal$(KMODEXT) -endif +if BUILD_IIBLND +modulenet_DATA = kiiblnd$(KMODEXT) endif endif -MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES = $(kiibnal-objs:%.o=%.c) iibnal.h +MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ +DIST_SOURCES = $(kiiblnd-objs:%.o=%.c) iiblnd.h diff --git a/lnet/klnds/iiblnd/iiblnd.c b/lnet/klnds/iiblnd/iiblnd.c index 1ecd32d..27b31a5 100644 --- a/lnet/klnds/iiblnd/iiblnd.c +++ b/lnet/klnds/iiblnd/iiblnd.c @@ -21,121 +21,471 @@ * */ -#include "iibnal.h" - -nal_t kibnal_api; -ptl_handle_ni_t kibnal_ni; -kib_tunables_t kibnal_tunables; - -kib_data_t kibnal_data = { - .kib_service_id = IBNAL_SERVICE_NUMBER, +#include "iiblnd.h" + +lnd_t the_kiblnd = { + .lnd_type = IIBLND, + .lnd_startup = kibnal_startup, + .lnd_shutdown = kibnal_shutdown, + .lnd_ctl = kibnal_ctl, + .lnd_send = kibnal_send, + .lnd_recv = kibnal_recv, + .lnd_eager_recv = kibnal_eager_recv, }; -#ifdef CONFIG_SYSCTL -#define IBNAL_SYSCTL 202 +kib_data_t kibnal_data; -#define IBNAL_SYSCTL_TIMEOUT 1 +__u32 +kibnal_cksum (void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; -static ctl_table kibnal_ctl_table[] = { - {IBNAL_SYSCTL_TIMEOUT, "timeout", - &kibnal_tunables.kib_io_timeout, sizeof (int), - 0644, NULL, &proc_dointvec}, - { 0 } -}; + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + /* ensure I don't return 0 (== no checksum) */ + return (sum == 0) ? 1 : sum; +} -static ctl_table kibnal_top_ctl_table[] = { - {IBNAL_SYSCTL, "iibnal", NULL, 0, 0555, kibnal_ctl_table}, - { 0 } -}; -#endif +void +kibnal_init_msg(kib_msg_t *msg, int type, int body_nob) +{ + msg->ibm_type = type; + msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob; +} -#ifdef unused void -print_service(IB_SERVICE_RECORD *service, char *tag, int rc) +kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits, + lnet_nid_t dstnid, __u64 dststamp, __u64 seq) { - char name[32]; + /* CAVEAT EMPTOR! all message fields not set here should have been + * initialised previously. 
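kibnal_cksum() above is a rotate-left-by-one-and-add over the raw message bytes, with zero reserved to mean "no checksum" (hence the final substitution of 1). A self-contained copy of the same function for experimentation:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    /* Rotate-and-add checksum as in kibnal_cksum(); 0 is reserved to
     * mean "no checksum", so a computed 0 is reported as 1. */
    static uint32_t msg_cksum(const void *ptr, int nob)
    {
            const char *c = ptr;
            uint32_t sum = 0;

            while (nob-- > 0)
                    sum = ((sum << 1) | (sum >> 31)) + *c++;

            return (sum == 0) ? 1 : sum;
    }

    int main(void)
    {
            char buf[32];

            memset(buf, 0, sizeof(buf));
            strcpy(buf, "hello");
            /* on the wire, this is computed with the cksum field zeroed */
            printf("cksum %08x\n", msg_cksum(buf, sizeof(buf)));
            return 0;
    }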
*/ + msg->ibm_magic = IBNAL_MSG_MAGIC; + msg->ibm_version = version; + /* ibm_type */ + msg->ibm_credits = credits; + /* ibm_nob */ + msg->ibm_cksum = 0; + msg->ibm_srcnid = lnet_ptlcompat_srcnid(kibnal_data.kib_ni->ni_nid, + dstnid); + msg->ibm_srcstamp = kibnal_data.kib_incarnation; + msg->ibm_dstnid = dstnid; + msg->ibm_dststamp = dststamp; + msg->ibm_seq = seq; + + if (*kibnal_tunables.kib_cksum) { + /* NB ibm_cksum zero while computing cksum */ + msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob); + } +} - if (service == NULL) - { - CWARN("tag : %s\n" - "status : %d (NULL)\n", tag, rc); - return; +void +kibnal_pack_connmsg(kib_msg_t *msg, __u32 version, int nob, + int type, lnet_nid_t dstnid, __u64 dststamp) +{ + LASSERT (nob >= offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)); + + memset(msg, 0, nob); + kibnal_init_msg(msg, type, sizeof(kib_connparams_t)); + + msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE; + msg->ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE; + msg->ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS; + + kibnal_pack_msg(msg, version, 0, dstnid, dststamp, 0); +} + +int +kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob) +{ + const int hdr_size = offsetof(kib_msg_t, ibm_u); + __u32 msg_cksum; + __u32 msg_version; + int flip; + int msg_nob; +#if !IBNAL_USE_FMR + int i; + int n; +#endif + /* 6 bytes are enough to have received magic + version */ + if (nob < 6) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + /* Future protocol version compatibility support! + * If the iiblnd-specific protocol changes, or when LNET unifies + * protocols over all LNDs, the initial connection will negotiate a + * protocol version. If I find this, I avoid any console errors. If + * my peer is doing connection establishment, the reject will tell the peer + * which version I'm running. */ + + if (msg->ibm_magic == IBNAL_MSG_MAGIC) { + flip = 0; + } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) { + flip = 1; + } else { + if (msg->ibm_magic == LNET_PROTO_MAGIC || + msg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) + return -EPROTO; + + /* Completely out to lunch */ + CERROR("Bad magic: %08x\n", msg->ibm_magic); + return -EPROTO; } - strncpy (name, service->ServiceName, sizeof(name)-1); - name[sizeof(name)-1] = 0; + + msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version; + if (expected_version == 0) { + if (msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD && + msg_version != IBNAL_MSG_VERSION) + return -EPROTO; + } else if (msg_version != expected_version) { + CERROR("Bad version: %x(%x expected)\n", + msg_version, expected_version); + return -EPROTO; + } + + if (nob < hdr_size) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; + if (msg_nob > nob) { + CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); + return -EPROTO; + } + + /* checksum must be computed with ibm_cksum zero and BEFORE anything + * gets flipped */ + msg_cksum = flip ? 
__swab32(msg->ibm_cksum) : msg->ibm_cksum; + msg->ibm_cksum = 0; + if (msg_cksum != 0 && + msg_cksum != kibnal_cksum(msg, msg_nob)) { + CERROR("Bad checksum\n"); + return -EPROTO; + } + msg->ibm_cksum = msg_cksum; - CWARN("tag : %s\n" - "status : %d\n" - "service id: "LPX64"\n" - "name : %s\n" - "NID : "LPX64"\n", tag, rc, - service->RID.ServiceID, name, - *kibnal_service_nid_field(service)); -} + if (flip) { + /* leave magic unflipped as a clue to peer endianness */ + msg->ibm_version = msg_version; + CLASSERT (sizeof(msg->ibm_type) == 1); + CLASSERT (sizeof(msg->ibm_credits) == 1); + msg->ibm_nob = msg_nob; + __swab64s(&msg->ibm_srcnid); + __swab64s(&msg->ibm_srcstamp); + __swab64s(&msg->ibm_dstnid); + __swab64s(&msg->ibm_dststamp); + __swab64s(&msg->ibm_seq); + } + + if (msg->ibm_srcnid == LNET_NID_ANY) { + CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); + return -EPROTO; + } + + switch (msg->ibm_type) { + default: + CERROR("Unknown message type %x\n", msg->ibm_type); + return -EPROTO; + + case IBNAL_MSG_NOOP: + break; + + case IBNAL_MSG_IMMEDIATE: + if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) { + CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob, + (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])); + return -EPROTO; + } + break; + + case IBNAL_MSG_PUT_REQ: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) { + CERROR("Short PUT_REQ: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.putreq))); + return -EPROTO; + } + break; + + case IBNAL_MSG_PUT_ACK: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) { + CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.putack))); + return -EPROTO; + } +#if IBNAL_USE_FMR + if (flip) { + __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr); + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob); + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key); + } +#else + if (flip) { + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key); + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag); + } + + n = msg->ibm_u.putack.ibpam_rd.rd_nfrag; + if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) { + CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", + n, IBNAL_MAX_RDMA_FRAGS); + return -EPROTO; + } + + if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) { + CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, + (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])); + return -EPROTO; + } + + if (flip) { + for (i = 0; i < n; i++) { + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob); + __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr); + } + } #endif + break; -static void -kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod, - FSTATUS frc, uint32 madrc) + case IBNAL_MSG_GET_REQ: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) { + CERROR("Short GET_REQ: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.get))); + return -EPROTO; + } +#if IBNAL_USE_FMR + if (flip) { + __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr); + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob); + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key); + } +#else + if (flip) { + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key); + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag); + } + + n = msg->ibm_u.get.ibgm_rd.rd_nfrag; + if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) { + CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", + n, IBNAL_MAX_RDMA_FRAGS); + return -EPROTO; + } + + if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) { + CERROR("Short GET_REQ: %d(%d)\n", msg_nob, + (int)offsetof(kib_msg_t, 
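kibnal_unpack_msg() above detects a byte-flipped peer by testing the magic against both IBNAL_MSG_MAGIC and its byte-swapped form, and verifies the checksum with the ibm_cksum field zeroed, mirroring the packing side exactly. A minimal sketch of the same detect-and-verify sequence over a toy header (wire_hdr and the magic value are hypothetical, not the kib_msg_t layout):

    #include <stdio.h>
    #include <stdint.h>

    #define WIRE_MAGIC 0x0be91b91u          /* toy value */

    static uint32_t swab32(uint32_t v)
    {
            return (v >> 24) | ((v >> 8) & 0xff00) |
                   ((v << 8) & 0xff0000) | (v << 24);
    }

    struct wire_hdr {
            uint32_t magic;
            uint32_t cksum;
            uint32_t payload;
    };

    /* rotate-and-add checksum, 0 meaning "none" (see kibnal_cksum) */
    static uint32_t cksum(const void *p, int nob)
    {
            const unsigned char *c = p;
            uint32_t sum = 0;

            while (nob-- > 0)
                    sum = ((sum << 1) | (sum >> 31)) + *c++;
            return (sum == 0) ? 1 : sum;
    }

    static int unpack(struct wire_hdr *h)
    {
            uint32_t saved;
            int flip;

            if (h->magic == WIRE_MAGIC)
                    flip = 0;
            else if (h->magic == swab32(WIRE_MAGIC))
                    flip = 1;               /* peer has opposite endianness */
            else
                    return -1;              /* completely out to lunch */

            /* verify with the cksum field zeroed, BEFORE any flipping */
            saved = flip ? swab32(h->cksum) : h->cksum;
            h->cksum = 0;
            if (saved != 0 && saved != cksum(h, sizeof(*h)))
                    return -1;
            h->cksum = saved;

            if (flip)                       /* magic left unflipped as a clue */
                    h->payload = swab32(h->payload);
            return 0;
    }

    int main(void)
    {
            struct wire_hdr h = { WIRE_MAGIC, 0, 42 };

            h.cksum = cksum(&h, sizeof(h)); /* pack side: field still zero */
            printf("unpack: %d\n", unpack(&h));
            return 0;
    }

Because the checksum is taken over raw bytes, the receiver can recompute it without knowing the sender's endianness; only the stored cksum value itself needs swapping.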
ibm_u.get.ibgm_rd.rd_frags[n])); + return -EPROTO; + } + + if (flip) + for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) { + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob); + __swab64s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr); + } +#endif + break; + + case IBNAL_MSG_PUT_NAK: + case IBNAL_MSG_PUT_DONE: + case IBNAL_MSG_GET_DONE: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) { + CERROR("Short RDMA completion: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.completion))); + return -EPROTO; + } + if (flip) + __swab32s(&msg->ibm_u.completion.ibcm_status); + break; + + case IBNAL_MSG_CONNREQ: + case IBNAL_MSG_CONNACK: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) { + CERROR("Short connreq/ack: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.connparams))); + return -EPROTO; + } + if (flip) { + __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth); + __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); + __swab32s(&msg->ibm_u.connparams.ibcp_max_frags); + } + break; + } + return 0; +} + +IB_HANDLE +kibnal_create_cep(lnet_nid_t nid) { - *(FSTATUS *)arg = frc; - up (&kibnal_data.kib_nid_signal); + FSTATUS frc; + __u32 u32val; + IB_HANDLE cep; + + cep = iba_cm_create_cep(CM_RC_TYPE); + if (cep == NULL) { + CERROR ("Can't create CEP for %s\n", + (nid == LNET_NID_ANY) ? "listener" : + libcfs_nid2str(nid)); + return NULL; + } + + if (nid == LNET_NID_ANY) { + u32val = 1; + frc = iba_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, + (char *)&u32val, sizeof(u32val), 0); + if (frc != FSUCCESS) { + CERROR("Can't set async_accept: %d\n", frc); + goto failed; + } + + u32val = 0; /* sets system max */ + frc = iba_cm_modify_cep(cep, CM_FLAG_LISTEN_BACKLOG, + (char *)&u32val, sizeof(u32val), 0); + if (frc != FSUCCESS) { + CERROR("Can't set listen backlog: %d\n", frc); + goto failed; + } + } + + u32val = 1; + frc = iba_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK, + (char *)&u32val, sizeof(u32val), 0); + if (frc != FSUCCESS) { + CERROR("Can't set timewait_callback for %s: %d\n", + (nid == LNET_NID_ANY) ? 
"listener" : + libcfs_nid2str(nid), frc); + goto failed; + } + + return cep; + + failed: + iba_cm_destroy_cep(cep); + return NULL; } +#define IBNAL_CHECK_ADVERT 1 #if IBNAL_CHECK_ADVERT -static void +void kibnal_service_query_done (void *arg, QUERY *qry, QUERY_RESULT_VALUES *qry_result) { - FSTATUS frc = qry_result->Status; + int *rcp = arg; + FSTATUS frc = qry_result->Status; + SERVICE_RECORD_RESULTS *svc_rslt; + IB_SERVICE_RECORD *svc; + lnet_nid_t nid; + + if (frc != FSUCCESS || qry_result->ResultDataSize == 0) { + CERROR("Error checking advert: status %d data size %d\n", + frc, qry_result->ResultDataSize); + *rcp = -EIO; + goto out; + } + + svc_rslt = (SERVICE_RECORD_RESULTS *)qry_result->QueryResult; + + if (svc_rslt->NumServiceRecords < 1) { + CERROR("Check advert: %d records\n", + svc_rslt->NumServiceRecords); + *rcp = -ENOENT; + goto out; + } - if (frc != FSUCCESS && - qry_result->ResultDataSize == 0) - frc = FERROR; + svc = &svc_rslt->ServiceRecords[0]; + nid = le64_to_cpu(*kibnal_service_nid_field(svc)); - *(FSTATUS *)arg = frc; - up (&kibnal_data.kib_nid_signal); + CDEBUG(D_NET, "Check advert: %s "LPX64" "LPX64":%04x\n", + libcfs_nid2str(nid), svc->RID.ServiceID, + svc->RID.ServiceGID.Type.Global.InterfaceID, + svc->RID.ServiceP_Key); + + if (nid != kibnal_data.kib_ni->ni_nid) { + CERROR("Check advert: Bad NID %s (%s expected)\n", + libcfs_nid2str(nid), + libcfs_nid2str(kibnal_data.kib_ni->ni_nid)); + *rcp = -EINVAL; + goto out; + } + + if (svc->RID.ServiceID != *kibnal_tunables.kib_service_number) { + CERROR("Check advert: Bad ServiceID "LPX64" (%x expected)\n", + svc->RID.ServiceID, + *kibnal_tunables.kib_service_number); + *rcp = -EINVAL; + goto out; + } + + if (svc->RID.ServiceGID.Type.Global.InterfaceID != + kibnal_data.kib_port_guid) { + CERROR("Check advert: Bad GUID "LPX64" ("LPX64" expected)\n", + svc->RID.ServiceGID.Type.Global.InterfaceID, + kibnal_data.kib_port_guid); + *rcp = -EINVAL; + goto out; + } + + if (svc->RID.ServiceP_Key != kibnal_data.kib_port_pkey) { + CERROR("Check advert: Bad PKEY %04x (%04x expected)\n", + svc->RID.ServiceP_Key, kibnal_data.kib_port_pkey); + *rcp = -EINVAL; + goto out; + } + + CDEBUG(D_NET, "Check advert OK\n"); + *rcp = 0; + + out: + up (&kibnal_data.kib_listener_signal); } -static void +int kibnal_check_advert (void) { - QUERY *qry; - IB_SERVICE_RECORD *svc; - FSTATUS frc; - FSTATUS frc2; + /* single-threaded */ + static QUERY qry; - PORTAL_ALLOC(qry, sizeof(*qry)); - if (qry == NULL) - return; + FSTATUS frc; + int rc; - memset (qry, 0, sizeof(*qry)); - qry->InputType = InputTypeServiceRecord; - qry->OutputType = OutputTypeServiceRecord; - qry->InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK; - svc = &qry->InputValue.ServiceRecordValue.ServiceRecord; - kibnal_set_service_keys(svc, kibnal_data.kib_nid); - - frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, - kibnal_data.kib_port_guid, - qry, - kibnal_service_query_done, - NULL, &frc2); - if (frc != FSUCCESS && frc != FPENDING) { - CERROR ("Immediate error %d checking SM service\n", frc); - } else { - down (&kibnal_data.kib_nid_signal); - frc = frc2; + memset (&qry, 0, sizeof(qry)); + qry.InputType = InputTypeServiceRecord; + qry.OutputType = OutputTypeServiceRecord; + kibnal_set_service_keys(&qry.InputValue.ServiceRecordValue.ServiceRecord, + kibnal_data.kib_ni->ni_nid); + qry.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK; - if (frc != 0) - CERROR ("Error %d checking SM service\n", rc); + frc = 
iba_sd_query_port_fabric_info(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + &qry, + kibnal_service_query_done, + &kibnal_data.kib_sdretry, + &rc); + if (frc != FPENDING) { + CERROR ("Immediate error %d checking SM service\n", frc); + return -EIO; } - - return (rc); + + down (&kibnal_data.kib_listener_signal); + + if (rc != 0) + CERROR ("Error %d checking SM service\n", rc); + return rc; +} +#else +int +kibnal_check_advert(void) +{ + return 0; } #endif -static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type) +void +kibnal_fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type) { IB_SERVICE_RECORD *svc; @@ -143,211 +493,208 @@ static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type) fod->Type = type; svc = &fod->Value.ServiceRecordValue.ServiceRecord; - svc->RID.ServiceID = kibnal_data.kib_service_id; + svc->RID.ServiceID = *kibnal_tunables.kib_service_number; svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid; svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX; svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey; svc->ServiceLease = 0xffffffff; - kibnal_set_service_keys(svc, kibnal_data.kib_nid); + kibnal_set_service_keys(svc, kibnal_data.kib_ni->ni_nid); } -static int -kibnal_advertise (void) +void +kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod, + FSTATUS frc, uint32 madrc) { - FABRIC_OPERATION_DATA *fod; - IB_SERVICE_RECORD *svc; - FSTATUS frc; - FSTATUS frc2; - - LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); + *(FSTATUS *)arg = frc; + up (&kibnal_data.kib_listener_signal); +} - PORTAL_ALLOC(fod, sizeof(*fod)); - if (fod == NULL) - return (-ENOMEM); +int +kibnal_advertise (void) +{ + /* Single threaded here */ + static FABRIC_OPERATION_DATA fod; + + IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord; + FSTATUS frc; + FSTATUS frc2; + + if (strlen(*kibnal_tunables.kib_service_name) >= + sizeof(svc->ServiceName)) { + CERROR("Service name '%s' too long (%d chars max)\n", + *kibnal_tunables.kib_service_name, + (int)sizeof(svc->ServiceName) - 1); + return -EINVAL; + } - fill_fod(fod, FabOpSetServiceRecord); - svc = &fod->Value.ServiceRecordValue.ServiceRecord; + kibnal_fill_fod(&fod, FabOpSetServiceRecord); - CDEBUG(D_NET, "Advertising service id "LPX64" %s:"LPX64"\n", - svc->RID.ServiceID, - svc->ServiceName, *kibnal_service_nid_field(svc)); + CDEBUG(D_NET, "Advertising service id "LPX64" %s:%s\n", + svc->RID.ServiceID, svc->ServiceName, + libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc)))); - frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd, - kibnal_data.kib_port_guid, - fod, kibnal_service_setunset_done, - NULL, &frc2); + frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + &fod, + kibnal_service_setunset_done, + &kibnal_data.kib_sdretry, + &frc2); if (frc != FSUCCESS && frc != FPENDING) { - CERROR ("Immediate error %d advertising NID "LPX64"\n", - frc, kibnal_data.kib_nid); - goto out; + CERROR ("Immediate error %d advertising NID %s\n", + frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid)); + return -EIO; } - down (&kibnal_data.kib_nid_signal); + down (&kibnal_data.kib_listener_signal); frc = frc2; - if (frc != FSUCCESS) - CERROR ("Error %d advertising BUD "LPX64"\n", - frc, kibnal_data.kib_nid); -out: - PORTAL_FREE(fod, sizeof(*fod)); - return (frc == FSUCCESS) ? 
0 : -EINVAL; + if (frc == FSUCCESS) + return 0; + + CERROR ("Error %d advertising %s\n", + frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid)); + return -EIO; } -static void +void kibnal_unadvertise (int expect_success) { - FABRIC_OPERATION_DATA *fod; - IB_SERVICE_RECORD *svc; - FSTATUS frc; - FSTATUS frc2; + /* single threaded */ + static FABRIC_OPERATION_DATA fod; - LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); + IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord; + FSTATUS frc; + FSTATUS frc2; - PORTAL_ALLOC(fod, sizeof(*fod)); - if (fod == NULL) - return; + LASSERT (kibnal_data.kib_ni->ni_nid != LNET_NID_ANY); - fill_fod(fod, FabOpDeleteServiceRecord); - svc = &fod->Value.ServiceRecordValue.ServiceRecord; + kibnal_fill_fod(&fod, FabOpDeleteServiceRecord); - CDEBUG(D_NET, "Unadvertising service %s:"LPX64"\n", - svc->ServiceName, *kibnal_service_nid_field(svc)); + CDEBUG(D_NET, "Unadvertising service %s:%s\n", + svc->ServiceName, + libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc)))); - frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd, - kibnal_data.kib_port_guid, - fod, kibnal_service_setunset_done, - NULL, &frc2); - + frc = iba_sd_port_fabric_operation(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + &fod, + kibnal_service_setunset_done, + &kibnal_data.kib_sdretry, + &frc2); if (frc != FSUCCESS && frc != FPENDING) { - CERROR ("Immediate error %d unadvertising NID "LPX64"\n", - frc, kibnal_data.kib_nid); - goto out; + CERROR ("Immediate error %d unadvertising NID %s\n", + frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid)); + return; } - down (&kibnal_data.kib_nid_signal); + down (&kibnal_data.kib_listener_signal); + + CDEBUG(D_NET, "Unadvertise rc: %d\n", frc2); if ((frc2 == FSUCCESS) == !!expect_success) - goto out; + return; if (expect_success) - CERROR("Error %d unadvertising NID "LPX64"\n", - frc2, kibnal_data.kib_nid); + CERROR("Error %d unadvertising NID %s\n", + frc2, libcfs_nid2str(kibnal_data.kib_ni->ni_nid)); else - CWARN("Removed conflicting NID "LPX64"\n", - kibnal_data.kib_nid); - out: - PORTAL_FREE(fod, sizeof(*fod)); + CWARN("Removed conflicting NID %s\n", + libcfs_nid2str(kibnal_data.kib_ni->ni_nid)); } -static int -kibnal_set_mynid(ptl_nid_t nid) +void +kibnal_stop_listener(int normal_shutdown) { - struct timeval tv; - lib_ni_t *ni = &kibnal_lib.libnal_ni; - int rc; + /* NB this also disables peer creation and destroys all existing + * peers */ + IB_HANDLE cep = kibnal_data.kib_listener_cep; + unsigned long flags; FSTATUS frc; - CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", - nid, ni->ni_pid.nid); + LASSERT (cep != NULL); - do_gettimeofday(&tv); + kibnal_unadvertise(normal_shutdown); - down (&kibnal_data.kib_nid_mutex); + frc = iba_cm_cancel(cep); + if (frc != FSUCCESS && frc != FPENDING) + CERROR ("Error %d stopping listener\n", frc); - if (nid == kibnal_data.kib_nid) { - /* no change of NID */ - up (&kibnal_data.kib_nid_mutex); - return (0); - } + down(&kibnal_data.kib_listener_signal); - CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", - kibnal_data.kib_nid, nid); - - if (kibnal_data.kib_nid != PTL_NID_ANY) { + frc = iba_cm_destroy_cep(cep); + if (frc != FSUCCESS) + CERROR ("Error %d destroying listener CEP\n", frc); - kibnal_unadvertise (1); + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + /* This assignment disables peer creation */ + kibnal_data.kib_listener_cep = NULL; + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - frc = iibt_cm_cancel(kibnal_data.kib_cep); - if (frc != FSUCCESS && frc != 
FPENDING) - CERROR ("Error %d stopping listener\n", frc); + /* Start to tear down any peers created while the listener was + * running */ + kibnal_del_peer(LNET_NID_ANY); +} - frc = iibt_cm_destroy_cep(kibnal_data.kib_cep); - if (frc != FSUCCESS) - CERROR ("Error %d destroying CEP\n", frc); +int +kibnal_start_listener(void) +{ + /* NB this also enables peer creation */ - kibnal_data.kib_cep = NULL; - } - - kibnal_data.kib_nid = ni->ni_pid.nid = nid; - kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - - /* Delete all existing peers and their connections after new - * NID/incarnation set to ensure no old connections in our brave - * new world. */ - kibnal_del_peer (PTL_NID_ANY, 0); - - if (kibnal_data.kib_nid == PTL_NID_ANY) { - /* No new NID to install */ - up (&kibnal_data.kib_nid_mutex); - return (0); - } + IB_HANDLE cep; + CM_LISTEN_INFO info; + unsigned long flags; + int rc; + FSTATUS frc; - /* remove any previous advert (crashed node etc) */ - kibnal_unadvertise(0); + LASSERT (kibnal_data.kib_listener_cep == NULL); + init_MUTEX_LOCKED (&kibnal_data.kib_listener_signal); - kibnal_data.kib_cep = iibt_cm_create_cep(CM_RC_TYPE); - if (kibnal_data.kib_cep == NULL) { - CERROR ("Can't create CEP\n"); - rc = -ENOMEM; - } else { - CM_LISTEN_INFO info; - memset (&info, 0, sizeof(info)); - info.ListenAddr.EndPt.SID = kibnal_data.kib_service_id; - - frc = iibt_cm_listen(kibnal_data.kib_cep, &info, - kibnal_listen_callback, NULL); - if (frc != FSUCCESS && frc != FPENDING) { - CERROR ("iibt_cm_listen error: %d\n", frc); - rc = -EINVAL; - } else { - rc = 0; - } - } - - if (rc == 0) { - rc = kibnal_advertise(); - if (rc == 0) { -#if IBNAL_CHECK_ADVERT - kibnal_check_advert(); -#endif - up (&kibnal_data.kib_nid_mutex); - return (0); - } - - iibt_cm_cancel (kibnal_data.kib_cep); - iibt_cm_destroy_cep (kibnal_data.kib_cep); - /* remove any peers that sprung up while I failed to - * advertise myself */ - kibnal_del_peer (PTL_NID_ANY, 0); + cep = kibnal_create_cep(LNET_NID_ANY); + if (cep == NULL) + return -ENOMEM; + + memset (&info, 0, sizeof(info)); + info.ListenAddr.EndPt.SID = *kibnal_tunables.kib_service_number; + + frc = iba_cm_listen(cep, &info, kibnal_listen_callback, NULL); + if (frc != FSUCCESS && frc != FPENDING) { + CERROR ("iba_cm_listen error: %d\n", frc); + + iba_cm_destroy_cep(cep); + return -EIO; } - kibnal_data.kib_nid = PTL_NID_ANY; - up (&kibnal_data.kib_nid_mutex); - return (rc); + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + /* This assignment enables peer creation */ + kibnal_data.kib_listener_cep = cep; + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + + rc = kibnal_advertise(); + if (rc == 0) + rc = kibnal_check_advert(); + + if (rc == 0) + return 0; + + kibnal_stop_listener(0); + return rc; } -kib_peer_t * -kibnal_create_peer (ptl_nid_t nid) +int +kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid) { - kib_peer_t *peer; + kib_peer_t *peer; + unsigned long flags; + int rc; - LASSERT (nid != PTL_NID_ANY); + LASSERT (nid != LNET_NID_ANY); - PORTAL_ALLOC (peer, sizeof (*peer)); - if (peer == NULL) - return (NULL); + LIBCFS_ALLOC (peer, sizeof (*peer)); + if (peer == NULL) { + CERROR("Cannot allocate peer\n"); + return -ENOMEM; + } memset(peer, 0, sizeof(*peer)); /* zero flags etc */ @@ -358,11 +705,35 @@ kibnal_create_peer (ptl_nid_t nid) INIT_LIST_HEAD (&peer->ibp_conns); INIT_LIST_HEAD (&peer->ibp_tx_queue); - peer->ibp_reconnect_time = jiffies; - peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + 
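kibnal_start_listener()/kibnal_stop_listener() above use the listener CEP pointer itself, guarded by the global rwlock, as the gate for peer creation: once the pointer is NULLed under the write lock, kibnal_create_peer() refuses with -ESHUTDOWN. A minimal pthreads sketch of that gate (hypothetical names, not the LND types):

    #include <pthread.h>
    #include <stdio.h>

    /* Shutdown gate as in kibnal_start/stop_listener(): a handle guarded
     * by a rwlock doubles as the "peer creation allowed" flag. */
    static pthread_rwlock_t glock = PTHREAD_RWLOCK_INITIALIZER;
    static void *listener;                  /* NULL => shutting down */
    static int npeers;

    static int peer_create(void)
    {
            int rc;

            pthread_rwlock_wrlock(&glock);
            if (listener == NULL) {
                    rc = -1;                /* -ESHUTDOWN in the LND */
            } else {
                    npeers++;               /* only grows with the lock held */
                    rc = 0;
            }
            pthread_rwlock_unlock(&glock);
            return rc;
    }

    static void listener_stop(void)
    {
            pthread_rwlock_wrlock(&glock);
            listener = NULL;                /* disables peer_create() */
            pthread_rwlock_unlock(&glock);
            /* now tear down peers created while the listener ran */
    }

    int main(void)
    {
            listener = &glock;              /* stand-in handle */
            printf("create: %d\n", peer_create());
            listener_stop();
            printf("create after stop: %d\n", peer_create());
            return 0;
    }

Taking the write lock (not the read lock) in peer_create() is what makes "npeers only grows with the global lock held" true, so the shutdown path can wait for the count to drain without races.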
peer->ibp_error = 0; + peer->ibp_last_alive = cfs_time_current(); + peer->ibp_reconnect_interval = 0; /* OK to connect at any time */ + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + + if (atomic_read(&kibnal_data.kib_npeers) >= + *kibnal_tunables.kib_concurrent_peers) { + rc = -EOVERFLOW; /* !! but at least it distinguishes */ + } else if (kibnal_data.kib_listener_cep == NULL) { + rc = -ESHUTDOWN; /* shutdown has started */ + } else { + rc = 0; + /* npeers only grows with the global lock held */ + atomic_inc(&kibnal_data.kib_npeers); + } + + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - atomic_inc (&kibnal_data.kib_npeers); - return (peer); + if (rc != 0) { + CERROR("Can't create peer: %s\n", + (rc == -ESHUTDOWN) ? "shutting down" : + "too many peers"); + LIBCFS_FREE(peer, sizeof(*peer)); + } else { + *peerp = peer; + } + + return rc; } void @@ -372,11 +743,11 @@ kibnal_destroy_peer (kib_peer_t *peer) LASSERT (atomic_read (&peer->ibp_refcount) == 0); LASSERT (peer->ibp_persistence == 0); LASSERT (!kibnal_peer_active(peer)); - LASSERT (peer->ibp_connecting == 0); + LASSERT (!kibnal_peer_connecting(peer)); LASSERT (list_empty (&peer->ibp_conns)); LASSERT (list_empty (&peer->ibp_tx_queue)); - PORTAL_FREE (peer, sizeof (*peer)); + LIBCFS_FREE (peer, sizeof (*peer)); /* NB a peer's connections keep a reference on their peer until * they are destroyed, so we can be assured that _all_ state to do @@ -388,7 +759,7 @@ kibnal_destroy_peer (kib_peer_t *peer) /* the caller is responsible for accounting for the additional reference * that this creates */ kib_peer_t * -kibnal_find_peer_locked (ptl_nid_t nid) +kibnal_find_peer_locked (lnet_nid_t nid) { struct list_head *peer_list = kibnal_nid2peerlist (nid); struct list_head *tmp; @@ -398,35 +769,20 @@ kibnal_find_peer_locked (ptl_nid_t nid) peer = list_entry (tmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence != 0 || /* persistent peer */ - peer->ibp_connecting != 0 || /* creating conns */ - !list_empty (&peer->ibp_conns)); /* active conn */ + LASSERT (peer->ibp_persistence != 0 || + kibnal_peer_connecting(peer) || + !list_empty (&peer->ibp_conns)); if (peer->ibp_nid != nid) continue; - CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n", - peer, nid, atomic_read (&peer->ibp_refcount)); + CDEBUG(D_NET, "got peer %s (%d)\n", + libcfs_nid2str(nid), atomic_read (&peer->ibp_refcount)); return (peer); } return (NULL); } -kib_peer_t * -kibnal_get_peer (ptl_nid_t nid) -{ - kib_peer_t *peer; - unsigned long flags; - - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); - peer = kibnal_find_peer_locked (nid); - if (peer != NULL) /* +1 ref for caller? 
*/ - kib_peer_addref(peer); - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - - return (peer); -} - void kibnal_unlink_peer_locked (kib_peer_t *peer) { @@ -436,11 +792,11 @@ kibnal_unlink_peer_locked (kib_peer_t *peer) LASSERT (kibnal_peer_active(peer)); list_del_init (&peer->ibp_list); /* lose peerlist's ref */ - kib_peer_decref(peer); + kibnal_peer_decref(peer); } -static int -kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) +int +kibnal_get_peer_info (int index, lnet_nid_t *nidp, int *persistencep) { kib_peer_t *peer; struct list_head *ptmp; @@ -455,7 +811,7 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || - peer->ibp_connecting != 0 || + kibnal_peer_connecting(peer) || !list_empty (&peer->ibp_conns)); if (index-- > 0) @@ -474,25 +830,26 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) return (-ENOENT); } -static int -kibnal_add_persistent_peer (ptl_nid_t nid) +int +kibnal_add_persistent_peer (lnet_nid_t nid) { unsigned long flags; kib_peer_t *peer; kib_peer_t *peer2; + int rc; - if (nid == PTL_NID_ANY) + if (nid == LNET_NID_ANY) return (-EINVAL); - peer = kibnal_create_peer (nid); - if (peer == NULL) - return (-ENOMEM); + rc = kibnal_create_peer(&peer, nid); + if (rc != 0) + return rc; write_lock_irqsave (&kibnal_data.kib_global_lock, flags); peer2 = kibnal_find_peer_locked (nid); if (peer2 != NULL) { - kib_peer_decref (peer); + kibnal_peer_decref (peer); peer = peer2; } else { /* peer table takes existing ref on peer */ @@ -506,20 +863,14 @@ kibnal_add_persistent_peer (ptl_nid_t nid) return (0); } -static void -kibnal_del_peer_locked (kib_peer_t *peer, int single_share) +void +kibnal_del_peer_locked (kib_peer_t *peer) { struct list_head *ctmp; struct list_head *cnxt; kib_conn_t *conn; - if (!single_share) - peer->ibp_persistence = 0; - else if (peer->ibp_persistence > 0) - peer->ibp_persistence--; - - if (peer->ibp_persistence != 0) - return; + peer->ibp_persistence = 0; if (list_empty(&peer->ibp_conns)) { kibnal_unlink_peer_locked(peer); @@ -537,9 +888,10 @@ kibnal_del_peer_locked (kib_peer_t *peer, int single_share) } int -kibnal_del_peer (ptl_nid_t nid, int single_share) +kibnal_del_peer (lnet_nid_t nid) { unsigned long flags; + CFS_LIST_HEAD (zombies); struct list_head *ptmp; struct list_head *pnxt; kib_peer_t *peer; @@ -550,7 +902,7 @@ kibnal_del_peer (ptl_nid_t nid, int single_share) write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - if (nid != PTL_NID_ANY) + if (nid != LNET_NID_ANY) lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; else { lo = 0; @@ -561,26 +913,31 @@ kibnal_del_peer (ptl_nid_t nid, int single_share) list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || - peer->ibp_connecting != 0 || + kibnal_peer_connecting(peer) || !list_empty (&peer->ibp_conns)); - if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid)) + if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid)) continue; - kibnal_del_peer_locked (peer, single_share); - rc = 0; /* matched something */ + if (!list_empty(&peer->ibp_tx_queue)) { + LASSERT (list_empty(&peer->ibp_conns)); - if (single_share) - goto out; + list_splice_init(&peer->ibp_tx_queue, &zombies); + } + + kibnal_del_peer_locked (peer); + rc = 0; /* matched something */ } } - out: + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + kibnal_txlist_done(&zombies, 
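kibnal_del_peer() above illustrates the collect-then-complete pattern: blocked transmits are spliced onto a private zombies list while the global lock is held, and kibnal_txlist_done() finalises them with -EIO only after the lock is dropped, so completion work never runs under the lock. A small pthreads sketch of the same pattern (tx is a hypothetical type):

    #include <stdio.h>
    #include <stdlib.h>
    #include <errno.h>
    #include <pthread.h>

    struct tx {
            struct tx *next;
            int        id;
    };

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static struct tx *queued;               /* protected by lock */

    static void txlist_done(struct tx *list, int error)
    {
            while (list != NULL) {
                    struct tx *tx = list;

                    list = tx->next;
                    printf("tx %d completed with %d\n", tx->id, error);
                    free(tx);
            }
    }

    static void del_all(void)
    {
            struct tx *zombies;

            pthread_mutex_lock(&lock);
            zombies = queued;               /* splice the whole queue away */
            queued = NULL;
            pthread_mutex_unlock(&lock);

            txlist_done(zombies, -EIO);     /* complete outside the lock */
    }

    int main(void)
    {
            int i;

            for (i = 0; i < 3; i++) {
                    struct tx *tx = malloc(sizeof(*tx));

                    if (tx == NULL)
                            break;
                    tx->id = i;
                    tx->next = queued;
                    queued = tx;
            }
            del_all();
            return 0;
    }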
-EIO); + return (rc); } -static kib_conn_t * +kib_conn_t * kibnal_get_conn_by_idx (int index) { kib_peer_t *peer; @@ -596,37 +953,111 @@ kibnal_get_conn_by_idx (int index) list_for_each (ptmp, &kibnal_data.kib_peers[i]) { peer = list_entry (ptmp, kib_peer_t, ibp_list); - LASSERT (peer->ibp_persistence > 0 || - peer->ibp_connecting != 0 || + LASSERT (peer->ibp_persistence != 0 || + kibnal_peer_connecting(peer) || !list_empty (&peer->ibp_conns)); list_for_each (ctmp, &peer->ibp_conns) { if (index-- > 0) continue; - conn = list_entry (ctmp, kib_conn_t, ibc_list); - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - read_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); - return (conn); - } - } + conn = list_entry (ctmp, kib_conn_t, ibc_list); + kibnal_conn_addref(conn); + read_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); + return (conn); + } + } + } + + read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + return (NULL); +} + +int +kibnal_conn_rts(kib_conn_t *conn, + __u32 qpn, __u8 resp_res, __u8 init_depth, __u32 psn) +{ + IB_PATH_RECORD *path = &conn->ibc_cvars->cv_path; + IB_HANDLE qp = conn->ibc_qp; + IB_QP_ATTRIBUTES_MODIFY modify_attr; + FSTATUS frc; + int rc; + + if (resp_res > kibnal_data.kib_hca_attrs.MaxQPResponderResources) + resp_res = kibnal_data.kib_hca_attrs.MaxQPResponderResources; + + if (init_depth > kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth) + init_depth = kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth; + + modify_attr = (IB_QP_ATTRIBUTES_MODIFY) { + .RequestState = QPStateReadyToRecv, + .RecvPSN = IBNAL_STARTING_PSN, + .DestQPNumber = qpn, + .ResponderResources = resp_res, + .MinRnrTimer = UsecToRnrNakTimer(2000), /* 20 ms */ + .Attrs = (IB_QP_ATTR_RECVPSN | + IB_QP_ATTR_DESTQPNUMBER | + IB_QP_ATTR_RESPONDERRESOURCES | + IB_QP_ATTR_DESTAV | + IB_QP_ATTR_PATHMTU | + IB_QP_ATTR_MINRNRTIMER), + }; + GetAVFromPath(0, path, &modify_attr.PathMTU, NULL, + &modify_attr.DestAV); + + frc = iba_modify_qp(qp, &modify_attr, NULL); + if (frc != FSUCCESS) { + CERROR("Can't set QP %s ready to receive: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); + return -EIO; + } + + rc = kibnal_post_receives(conn); + if (rc != 0) { + CERROR("Can't post receives for %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + return rc; + } + + modify_attr = (IB_QP_ATTRIBUTES_MODIFY) { + .RequestState = QPStateReadyToSend, + .FlowControl = TRUE, + .InitiatorDepth = init_depth, + .SendPSN = psn, + .LocalAckTimeout = path->PktLifeTime + 2, /* 2 or 1? 
*/ + .RetryCount = IBNAL_RETRY, + .RnrRetryCount = IBNAL_RNR_RETRY, + .Attrs = (IB_QP_ATTR_FLOWCONTROL | + IB_QP_ATTR_INITIATORDEPTH | + IB_QP_ATTR_SENDPSN | + IB_QP_ATTR_LOCALACKTIMEOUT | + IB_QP_ATTR_RETRYCOUNT | + IB_QP_ATTR_RNRRETRYCOUNT), + }; + + frc = iba_modify_qp(qp, &modify_attr, NULL); + if (frc != FSUCCESS) { + CERROR("Can't set QP %s ready to send: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); + return -EIO; } - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - return (NULL); + frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL); + if (frc != FSUCCESS) { + CERROR ("Can't query QP %s attributes: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); + return -EIO; + } + + return 0; } kib_conn_t * -kibnal_create_conn (void) +kibnal_create_conn (lnet_nid_t nid, int proto_version) { kib_conn_t *conn; int i; - __u64 vaddr = 0; - __u64 vaddr_base; int page_offset; int ipage; int rc; @@ -636,50 +1067,61 @@ kibnal_create_conn (void) IB_QP_ATTRIBUTES_MODIFY qp_attr; } params; - PORTAL_ALLOC (conn, sizeof (*conn)); + LIBCFS_ALLOC (conn, sizeof (*conn)); if (conn == NULL) { - CERROR ("Can't allocate connection\n"); + CERROR ("Can't allocate connection for %s\n", + libcfs_nid2str(nid)); return (NULL); } /* zero flags, NULL pointers etc... */ memset (conn, 0, sizeof (*conn)); + conn->ibc_state = IBNAL_CONN_INIT_NOTHING; + conn->ibc_version = proto_version; + INIT_LIST_HEAD (&conn->ibc_early_rxs); + INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred); INIT_LIST_HEAD (&conn->ibc_tx_queue); + INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd); INIT_LIST_HEAD (&conn->ibc_active_txs); spin_lock_init (&conn->ibc_lock); atomic_inc (&kibnal_data.kib_nconns); /* well not really, but I call destroy() on failure, which decrements */ - PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); - if (conn->ibc_rxs == NULL) + LIBCFS_ALLOC(conn->ibc_cvars, sizeof (*conn->ibc_cvars)); + if (conn->ibc_cvars == NULL) { + CERROR ("Can't allocate connvars for %s\n", + libcfs_nid2str(nid)); goto failed; - memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t)); + } + memset(conn->ibc_cvars, 0, sizeof (*conn->ibc_cvars)); - rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1); - if (rc != 0) + LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); + if (conn->ibc_rxs == NULL) { + CERROR("Cannot allocate RX descriptors for %s\n", + libcfs_nid2str(nid)); goto failed; + } + memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t)); - vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr; - + rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES); + if (rc != 0) { + CERROR("Can't allocate RX buffers for %s\n", + libcfs_nid2str(nid)); + goto failed; + } + for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) { struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; - kib_rx_t *rx = &conn->ibc_rxs[i]; + kib_rx_t *rx = &conn->ibc_rxs[i]; rx->rx_conn = conn; rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); - if (kibnal_whole_mem()) - rx->rx_vaddr = kibnal_page2phys(page) + - page_offset + - kibnal_data.kib_md.md_addr; - else - rx->rx_vaddr = vaddr; - - vaddr += IBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES); + rx->rx_hca_msg = kibnal_data.kib_whole_mem.md_addr + + lnet_page2phys(page) + page_offset; page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); @@ -693,9 +1135,9 @@ kibnal_create_conn (void) params.qp_create = (IB_QP_ATTRIBUTES_CREATE) { .Type = QPTypeReliableConnected, - 
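The RX layout loop in kibnal_create_conn() above carves IBNAL_MSG_SIZE slots out of an array of whole pages, advancing page_offset within the current page and stepping ipage when it fills; the scheme only works because IBNAL_MSG_SIZE divides PAGE_SIZE exactly, which the code asserts. A userspace sketch with illustrative sizes:

    #include <stdio.h>

    #define PAGE_SZ 4096
    #define MSG_SZ   256            /* must divide PAGE_SZ exactly */
    #define NMSGS     64

    /* Carve fixed-size message slots out of whole pages the way
     * kibnal_create_conn() lays out its RX messages. */
    int main(void)
    {
            static char pages[NMSGS * MSG_SZ / PAGE_SZ][PAGE_SZ];
            char *slot[NMSGS];
            int page_offset = 0;
            int ipage = 0;
            int i;

            for (i = 0; i < NMSGS; i++) {
                    slot[i] = &pages[ipage][page_offset];

                    page_offset += MSG_SZ;
                    if (page_offset == PAGE_SZ) {   /* page exhausted */
                            page_offset = 0;
                            ipage++;
                    }
            }

            printf("%d slots carved from %d pages; slot[17] at %p\n",
                   NMSGS, ipage, (void *)slot[17]);
            return 0;
    }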
.SendQDepth = IBNAL_TX_MAX_SG * - IBNAL_MSG_QUEUE_SIZE, - .RecvQDepth = IBNAL_MSG_QUEUE_SIZE, + .SendQDepth = (1 + IBNAL_MAX_RDMA_FRAGS) * + (*kibnal_tunables.kib_concurrent_sends), + .RecvQDepth = IBNAL_RX_MSGS, .SendDSListDepth = 1, .RecvDSListDepth = 1, .SendCQHandle = kibnal_data.kib_cq, @@ -703,15 +1145,15 @@ .PDHandle = kibnal_data.kib_pd, .SendSignaledCompletions = TRUE, }; - frc = iibt_qp_create(kibnal_data.kib_hca, &params.qp_create, NULL, - &conn->ibc_qp, &conn->ibc_qp_attrs); - if (rc != 0) { - CERROR ("Failed to create queue pair: %d\n", rc); + frc = iba_create_qp(kibnal_data.kib_hca, &params.qp_create, NULL, + &conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs); + if (frc != 0) { + CERROR ("Can't create QP %s: %d\n", libcfs_nid2str(nid), frc); goto failed; } /* Mark QP created */ - conn->ibc_state = IBNAL_CONN_INIT_QP; + kibnal_set_conn_state(conn, IBNAL_CONN_INIT_QP); params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) { .RequestState = QPStateInit, @@ -720,21 +1162,30 @@ IB_QP_ATTR_ACCESSCONTROL), .PortGUID = kibnal_data.kib_port_guid, .PkeyIndex = 0, - .AccessControl = { + .AccessControl = { .s = { .RdmaWrite = 1, .RdmaRead = 1, }, }, }; - rc = iibt_qp_modify(conn->ibc_qp, &params.qp_attr, NULL); - if (rc != 0) { - CERROR ("Failed to modify queue pair: %d\n", rc); + frc = iba_modify_qp(conn->ibc_qp, &params.qp_attr, NULL); + if (frc != 0) { + CERROR ("Can't set QP %s state to INIT: %d\n", + libcfs_nid2str(nid), frc); + goto failed; + } + + frc = iba_query_qp(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL); + if (frc != FSUCCESS) { + CERROR ("Can't query QP %s attributes: %d\n", + libcfs_nid2str(nid), frc); + goto failed; + } /* 1 ref for caller */ atomic_set (&conn->ibc_refcount, 1); + CDEBUG(D_NET, "New conn %p\n", conn); return (conn); failed: @@ -745,92 +1196,70 @@ void kibnal_destroy_conn (kib_conn_t *conn) { - int rc; FSTATUS frc; + + LASSERT (!in_interrupt()); - CDEBUG (D_NET, "connection %p\n", conn); + CDEBUG (D_NET, "connection %s\n", + (conn->ibc_peer) == NULL ? "" : + libcfs_nid2str(conn->ibc_peer->ibp_nid)); LASSERT (atomic_read (&conn->ibc_refcount) == 0); + LASSERT (list_empty(&conn->ibc_early_rxs)); LASSERT (list_empty(&conn->ibc_tx_queue)); + LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd)); + LASSERT (list_empty(&conn->ibc_tx_queue_nocred)); LASSERT (list_empty(&conn->ibc_active_txs)); LASSERT (conn->ibc_nsends_posted == 0); - LASSERT (conn->ibc_connreq == NULL); switch (conn->ibc_state) { - case IBNAL_CONN_DISCONNECTED: - /* called after connection sequence initiated */ - /* fall through */ - - case IBNAL_CONN_INIT_QP: - /* _destroy includes an implicit Reset of the QP which - * discards posted work */ - rc = iibt_qp_destroy(conn->ibc_qp); - if (rc != 0) - CERROR("Can't destroy QP: %d\n", rc); - /* fall through */ - case IBNAL_CONN_INIT_NOTHING: + case IBNAL_CONN_INIT_QP: + case IBNAL_CONN_DISCONNECTED: break; default: - LASSERT (0); + /* conn must either have never engaged with the CM, or have + * completely disengaged from it */ + CERROR("Bad conn %s state %d\n", + (conn->ibc_peer) == NULL ? 
"" : + libcfs_nid2str(conn->ibc_peer->ibp_nid), conn->ibc_state); + LBUG(); } if (conn->ibc_cep != NULL) { - frc = iibt_cm_destroy_cep(conn->ibc_cep); - if (frc != 0) - CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, - frc); + frc = iba_cm_destroy_cep(conn->ibc_cep); + if (frc != FSUCCESS) + CERROR("Error destroying CEP %p: %d\n", + conn->ibc_cep, frc); + } + + if (conn->ibc_qp != NULL) { + frc = iba_destroy_qp(conn->ibc_qp); + if (frc != FSUCCESS) + CERROR("Error destroying QP %p: %d\n", + conn->ibc_qp, frc); } if (conn->ibc_rx_pages != NULL) kibnal_free_pages(conn->ibc_rx_pages); if (conn->ibc_rxs != NULL) - PORTAL_FREE(conn->ibc_rxs, + LIBCFS_FREE(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof(kib_rx_t)); + if (conn->ibc_cvars != NULL) + LIBCFS_FREE(conn->ibc_cvars, sizeof(*conn->ibc_cvars)); + if (conn->ibc_peer != NULL) - kib_peer_decref(conn->ibc_peer); + kibnal_peer_decref(conn->ibc_peer); - PORTAL_FREE(conn, sizeof (*conn)); + LIBCFS_FREE(conn, sizeof (*conn)); atomic_dec(&kibnal_data.kib_nconns); - - if (atomic_read (&kibnal_data.kib_nconns) == 0 && - kibnal_data.kib_shutdown) { - /* I just nuked the last connection on shutdown; wake up - * everyone so they can exit. */ - wake_up_all(&kibnal_data.kib_sched_waitq); - wake_up_all(&kibnal_data.kib_connd_waitq); - } -} - -void -kibnal_put_conn (kib_conn_t *conn) -{ - unsigned long flags; - - CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - - LASSERT (atomic_read (&conn->ibc_refcount) > 0); - if (!atomic_dec_and_test (&conn->ibc_refcount)) - return; - - /* must disconnect before dropping the final ref */ - LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED); - - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - - list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); } -static int +int kibnal_close_peer_conns_locked (kib_peer_t *peer, int why) { kib_conn_t *conn; @@ -862,8 +1291,9 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) if (conn->ibc_incarnation == incarnation) continue; - CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n", - peer->ibp_nid, conn->ibc_incarnation, incarnation); + CDEBUG(D_NET, "Closing stale conn nid:%s incarnation:"LPX64"("LPX64")\n", + libcfs_nid2str(peer->ibp_nid), + conn->ibc_incarnation, incarnation); count++; kibnal_close_conn_locked (conn, -ESTALE); @@ -872,8 +1302,8 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) return (count); } -static int -kibnal_close_matching_conns (ptl_nid_t nid) +int +kibnal_close_matching_conns (lnet_nid_t nid) { unsigned long flags; kib_peer_t *peer; @@ -886,7 +1316,7 @@ kibnal_close_matching_conns (ptl_nid_t nid) write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - if (nid != PTL_NID_ANY) + if (nid != LNET_NID_ANY) lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; else { lo = 0; @@ -898,10 +1328,10 @@ kibnal_close_matching_conns (ptl_nid_t nid) peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || - peer->ibp_connecting != 0 || + kibnal_peer_connecting(peer) || !list_empty (&peer->ibp_conns)); - if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid)) + if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid)) continue; count += kibnal_close_peer_conns_locked (peer, 0); @@ -911,69 +1341,65 @@ kibnal_close_matching_conns (ptl_nid_t nid) 
write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); /* wildcards always succeed */ - if (nid == PTL_NID_ANY) + if (nid == LNET_NID_ANY) return (0); return (count == 0 ? -ENOENT : 0); } -static int -kibnal_cmd(struct portals_cfg *pcfg, void * private) +int +kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) { - int rc = -EINVAL; + struct libcfs_ioctl_data *data = arg; + int rc = -EINVAL; ENTRY; - LASSERT (pcfg != NULL); + LASSERT (ni == kibnal_data.kib_ni); - switch(pcfg->pcfg_command) { - case NAL_CMD_GET_PEER: { - ptl_nid_t nid = 0; - int share_count = 0; + switch(cmd) { + case IOC_LIBCFS_GET_PEER: { + lnet_nid_t nid = 0; + int share_count = 0; - rc = kibnal_get_peer_info(pcfg->pcfg_count, + rc = kibnal_get_peer_info(data->ioc_count, &nid, &share_count); - pcfg->pcfg_nid = nid; - pcfg->pcfg_size = 0; - pcfg->pcfg_id = 0; - pcfg->pcfg_misc = 0; - pcfg->pcfg_count = 0; - pcfg->pcfg_wait = share_count; + data->ioc_nid = nid; + data->ioc_count = share_count; break; } - case NAL_CMD_ADD_PEER: { - rc = kibnal_add_persistent_peer (pcfg->pcfg_nid); + case IOC_LIBCFS_ADD_PEER: { + rc = kibnal_add_persistent_peer (data->ioc_nid); break; } - case NAL_CMD_DEL_PEER: { - rc = kibnal_del_peer (pcfg->pcfg_nid, - /* flags == single_share */ - pcfg->pcfg_flags != 0); + case IOC_LIBCFS_DEL_PEER: { + rc = kibnal_del_peer (data->ioc_nid); break; } - case NAL_CMD_GET_CONN: { - kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count); + case IOC_LIBCFS_GET_CONN: { + kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count); if (conn == NULL) rc = -ENOENT; else { rc = 0; - pcfg->pcfg_nid = conn->ibc_peer->ibp_nid; - pcfg->pcfg_id = 0; - pcfg->pcfg_misc = 0; - pcfg->pcfg_flags = 0; - kibnal_put_conn (conn); + data->ioc_nid = conn->ibc_peer->ibp_nid; + kibnal_conn_decref(conn); } break; } - case NAL_CMD_CLOSE_CONNECTION: { - rc = kibnal_close_matching_conns (pcfg->pcfg_nid); + case IOC_LIBCFS_CLOSE_CONNECTION: { + rc = kibnal_close_matching_conns (data->ioc_nid); break; } - case NAL_CMD_REGISTER_MYNID: { - if (pcfg->pcfg_nid == PTL_NID_ANY) + case IOC_LIBCFS_REGISTER_MYNID: { + if (ni->ni_nid == data->ioc_nid) { + rc = 0; + } else { + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); rc = -EINVAL; - else - rc = kibnal_set_mynid (pcfg->pcfg_nid); + } break; } } @@ -985,38 +1411,22 @@ void kibnal_free_pages (kib_pages_t *p) { int npages = p->ibp_npages; - int rc; int i; - if (p->ibp_mapped) { - rc = iibt_deregister_memory(p->ibp_handle); - if (rc != 0) - CERROR ("Deregister error: %d\n", rc); - } - for (i = 0; i < npages; i++) if (p->ibp_pages[i] != NULL) __free_page(p->ibp_pages[i]); - PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); + LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); } int -kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) +kibnal_alloc_pages (kib_pages_t **pp, int npages) { - kib_pages_t *p; - __u64 *phys_pages; - int i; - FSTATUS frc; - IB_ACCESS_CONTROL access; - - memset(&access, 0, sizeof(access)); - access.s.MWBindable = 1; - access.s.LocalWrite = 1; - access.s.RdmaRead = 1; - access.s.RdmaWrite = 1; + kib_pages_t *p; + int i; - PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); + LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); if (p == NULL) { CERROR ("Can't allocate buffer %d\n", npages); return (-ENOMEM); @@ -1034,107 +1444,131 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) } } - if (kibnal_whole_mem()) - goto out; - - 
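kibnal_ctl() above replaces the old portals_cfg dispatcher with a plain switch over IOC_LIBCFS_* commands that decodes a shared libcfs_ioctl_data payload and falls back to -EINVAL. A toy dispatcher in the same shape (the command numbers and fields here are illustrative only, not the libcfs definitions):

    #include <stdio.h>
    #include <errno.h>

    enum { CMD_GET_PEER = 1, CMD_DEL_PEER = 2 };

    struct ioctl_data {
            unsigned long long ioc_nid;
            int                ioc_count;
    };

    /* Decode a generic ioctl payload per command, as kibnal_ctl() does. */
    static int lnd_ctl(unsigned int cmd, struct ioctl_data *data)
    {
            switch (cmd) {
            case CMD_GET_PEER:
                    data->ioc_nid = 0x12345ULL; /* report peer at ioc_count */
                    data->ioc_count = 0;
                    return 0;
            case CMD_DEL_PEER:
                    return (data->ioc_nid == 0) ? -ENOENT : 0;
            default:
                    return -EINVAL;             /* unknown commands fail */
            }
    }

    int main(void)
    {
            struct ioctl_data d = { .ioc_nid = 7, .ioc_count = 0 };

            printf("get: %d\n", lnd_ctl(CMD_GET_PEER, &d));
            printf("bogus: %d\n", lnd_ctl(99, &d));
            return 0;
    }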
PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages)); - if (phys_pages == NULL) { - CERROR ("Can't allocate physarray for %d pages\n", npages); - /* XXX free ibp_pages? */ - kibnal_free_pages(p); - return (-ENOMEM); - } + *pp = p; + return (0); +} - /* if we were using the _contig_ registration variant we would have - * an array of PhysAddr/Length pairs, but the discontiguous variant - * just takes the PhysAddr */ - for (i = 0; i < npages; i++) - phys_pages[i] = kibnal_page2phys(p->ibp_pages[i]); - - frc = iibt_register_physical_memory(kibnal_data.kib_hca, - 0, /* requested vaddr */ - phys_pages, npages, - 0, /* offset */ - kibnal_data.kib_pd, - access, - &p->ibp_handle, &p->ibp_vaddr, - &p->ibp_lkey, &p->ibp_rkey); +int +kibnal_alloc_tx_descs (void) +{ + int i; - PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages)); + LIBCFS_ALLOC (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS() * sizeof(kib_tx_t)); + if (kibnal_data.kib_tx_descs == NULL) + return -ENOMEM; - if (frc != FSUCCESS) { - CERROR ("Error %d mapping %d pages\n", frc, npages); - kibnal_free_pages(p); - return (-ENOMEM); + memset(kibnal_data.kib_tx_descs, 0, + IBNAL_TX_MSGS() * sizeof(kib_tx_t)); + + for (i = 0; i < IBNAL_TX_MSGS(); i++) { + kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; + +#if IBNAL_USE_FMR + LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV * + sizeof(*tx->tx_pages)); + if (tx->tx_pages == NULL) + return -ENOMEM; +#else + LIBCFS_ALLOC(tx->tx_wrq, + (1 + IBNAL_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + if (tx->tx_wrq == NULL) + return -ENOMEM; + + LIBCFS_ALLOC(tx->tx_gl, + (1 + IBNAL_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_gl)); + if (tx->tx_gl == NULL) + return -ENOMEM; + + LIBCFS_ALLOC(tx->tx_rd, + offsetof(kib_rdma_desc_t, + rd_frags[IBNAL_MAX_RDMA_FRAGS])); + if (tx->tx_rd == NULL) + return -ENOMEM; +#endif } - CDEBUG(D_NET, "registered %d pages; handle: %p vaddr "LPX64" " - "lkey %x rkey %x\n", npages, p->ibp_handle, - p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey); - - p->ibp_mapped = 1; -out: - *pp = p; - return (0); + return 0; +} + +void +kibnal_free_tx_descs (void) +{ + int i; + + if (kibnal_data.kib_tx_descs == NULL) + return; + + for (i = 0; i < IBNAL_TX_MSGS(); i++) { + kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; + +#if IBNAL_USE_FMR + if (tx->tx_pages != NULL) + LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV * + sizeof(*tx->tx_pages)); +#else + if (tx->tx_wrq != NULL) + LIBCFS_FREE(tx->tx_wrq, + (1 + IBNAL_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + + if (tx->tx_gl != NULL) + LIBCFS_FREE(tx->tx_gl, + (1 + IBNAL_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_gl)); + + if (tx->tx_rd != NULL) + LIBCFS_FREE(tx->tx_rd, + offsetof(kib_rdma_desc_t, + rd_frags[IBNAL_MAX_RDMA_FRAGS])); +#endif + } + + LIBCFS_FREE(kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS() * sizeof(kib_tx_t)); } -static int +int kibnal_setup_tx_descs (void) { int ipage = 0; int page_offset = 0; - __u64 vaddr; - __u64 vaddr_base; struct page *page; kib_tx_t *tx; int i; int rc; /* pre-mapped messages are not bigger than 1 page */ - LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); + CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); /* No fancy arithmetic when we do the buffer calculations */ - LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); + CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); - rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, - 0); + rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, + IBNAL_TX_MSG_PAGES()); if (rc != 0) return (rc); - /* ignored for the whole_mem case */ - vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr; - - for (i = 0; i < IBNAL_TX_MSGS; i++) { + for (i 
= 0; i < IBNAL_TX_MSGS(); i++) { page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; tx = &kibnal_data.kib_tx_descs[i]; - memset (tx, 0, sizeof(*tx)); /* zero flags etc */ - +#if IBNAL_USE_FMR + /* Allocate an FMR for this TX so it can map src/sink buffers + * for large transfers */ +#endif tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); - if (kibnal_whole_mem()) - tx->tx_vaddr = kibnal_page2phys(page) + - page_offset + - kibnal_data.kib_md.md_addr; - else - tx->tx_vaddr = vaddr; - - tx->tx_isnblk = (i >= IBNAL_NTX); - tx->tx_mapped = KIB_TX_UNMAPPED; + tx->tx_hca_msg = kibnal_data.kib_whole_mem.md_addr + + lnet_page2phys(page) + page_offset; CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", - i, tx, tx->tx_msg, tx->tx_vaddr); + i, tx, tx->tx_msg, tx->tx_hca_msg); - if (tx->tx_isnblk) - list_add (&tx->tx_list, - &kibnal_data.kib_idle_nblk_txs); - else - list_add (&tx->tx_list, - &kibnal_data.kib_idle_txs); - - vaddr += IBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES); + list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); @@ -1142,29 +1576,89 @@ kibnal_setup_tx_descs (void) if (page_offset == PAGE_SIZE) { page_offset = 0; ipage++; - LASSERT (ipage <= IBNAL_TX_MSG_PAGES); + LASSERT (ipage <= IBNAL_TX_MSG_PAGES()); } } return (0); } -static void -kibnal_api_shutdown (nal_t *nal) +int +kibnal_register_all_memory(void) { - int i; - int rc; + /* CAVEAT EMPTOR: this assumes all physical memory is in 1 contiguous + * chunk starting at 0 */ + struct sysinfo si; + __u64 total; + __u64 total2; + __u64 roundup = (128<<20); /* round up in big chunks */ + IB_MR_PHYS_BUFFER phys; + IB_ACCESS_CONTROL access; + FSTATUS frc; - if (nal->nal_refct != 0) { - /* This module got the first ref */ - PORTAL_MODULE_UNUSE; - return; + memset(&access, 0, sizeof(access)); + access.s.MWBindable = 1; + access.s.LocalWrite = 1; + access.s.RdmaRead = 1; + access.s.RdmaWrite = 1; + + /* XXX we don't bother with first-gen cards */ + if (kibnal_data.kib_hca_attrs.VendorId == 0xd0b7 && + kibnal_data.kib_hca_attrs.DeviceId == 0x3101) { + CERROR("Can't register all memory on first generation HCAs\n"); + return -EINVAL; } - CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); + si_meminfo(&si); + + CDEBUG(D_NET, "si_meminfo: %lu/%u, num_physpages %lu/%lu\n", + si.totalram, si.mem_unit, num_physpages, PAGE_SIZE); + + total = ((__u64)si.totalram) * si.mem_unit; + total2 = num_physpages * PAGE_SIZE; + if (total < total2) + total = total2; + + if (total == 0) { + CERROR("Can't determine memory size\n"); + return -ENOMEM; + } + + roundup = (128<<20); + total = (total + (roundup - 1)) & ~(roundup - 1); + + phys.PhysAddr = 0; + phys.Length = total; - LASSERT(nal == &kibnal_api); + frc = iba_register_contig_pmr(kibnal_data.kib_hca, 0, &phys, 1, 0, + kibnal_data.kib_pd, access, + &kibnal_data.kib_whole_mem.md_handle, + &kibnal_data.kib_whole_mem.md_addr, + &kibnal_data.kib_whole_mem.md_lkey, + &kibnal_data.kib_whole_mem.md_rkey); + + if (frc != FSUCCESS) { + CERROR("registering physical memory failed: %d\n", frc); + return -EIO; + } + + CDEBUG(D_WARNING, "registered phys mem from 0("LPX64") for "LPU64"("LPU64") -> "LPX64"\n", + phys.PhysAddr, total, phys.Length, kibnal_data.kib_whole_mem.md_addr); + + return 0; +} + +void +kibnal_shutdown (lnet_ni_t *ni) +{ + int i; + int rc; + + LASSERT (ni == kibnal_data.kib_ni); + LASSERT (ni->ni_data == &kibnal_data); + + CDEBUG(D_MALLOC, "before NAL cleanup: kmem 
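kibnal_register_all_memory() above rounds the detected RAM size up to a whole number of 128MB chunks with the usual power-of-two mask trick: total = (total + (roundup - 1)) & ~(roundup - 1). A tiny sketch of that round-up:

    #include <stdio.h>
    #include <stdint.h>

    /* Round v up to the next multiple of a power-of-two chunk, as
     * kibnal_register_all_memory() does with 128MB chunks. */
    static uint64_t roundup_pow2(uint64_t v, uint64_t chunk)
    {
            return (v + (chunk - 1)) & ~(chunk - 1);
    }

    int main(void)
    {
            uint64_t chunk = 128ULL << 20;      /* 128MB */
            unsigned long long mb = 1000;
            unsigned long long rounded = roundup_pow2(mb << 20, chunk) >> 20;

            printf("%llu MB -> %llu MB\n", mb, rounded);  /* 1000 -> 1024 */
            return 0;
    }

The mask form only works when chunk is a power of two, which 128MB is; for arbitrary chunk sizes the equivalent is ((v + chunk - 1) / chunk) * chunk.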
%d\n", + atomic_read (&libcfs_kmemory)); switch (kibnal_data.kib_init) { default: @@ -1172,20 +1666,16 @@ kibnal_api_shutdown (nal_t *nal) LBUG(); case IBNAL_INIT_ALL: - /* stop calls to nal_cmd */ - libcfs_nal_cmd_unregister(IIBNAL); - /* No new peers */ + /* stop accepting connections, prevent new peers and start to + * tear down all existing ones... */ + kibnal_stop_listener(1); - /* resetting my NID to unadvertises me, removes my - * listener and nukes all current peers */ - kibnal_set_mynid (PTL_NID_ANY); - - /* Wait for all peer state to clean up (crazy) */ + /* Wait for all peer state to clean up */ i = 2; while (atomic_read (&kibnal_data.kib_npeers) != 0) { i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ - "waiting for %d peers to disconnect (can take a few seconds)\n", + "waiting for %d peers to disconnect\n", atomic_read (&kibnal_data.kib_npeers)); set_current_state (TASK_UNINTERRUPTIBLE); schedule_timeout (HZ); @@ -1193,7 +1683,7 @@ kibnal_api_shutdown (nal_t *nal) /* fall through */ case IBNAL_INIT_CQ: - rc = iibt_cq_destroy(kibnal_data.kib_cq); + rc = iba_destroy_cq(kibnal_data.kib_cq); if (rc != 0) CERROR ("Destroy CQ error: %d\n", rc); /* fall through */ @@ -1202,63 +1692,43 @@ kibnal_api_shutdown (nal_t *nal) kibnal_free_pages (kibnal_data.kib_tx_pages); /* fall through */ - case IBNAL_INIT_MR: - if (kibnal_data.kib_md.md_handle != NULL) { - rc = iibt_deregister_memory(kibnal_data.kib_md.md_handle); - if (rc != FSUCCESS) - CERROR ("Deregister memory: %d\n", rc); - } + case IBNAL_INIT_MD: + rc = iba_deregister_mr(kibnal_data.kib_whole_mem.md_handle); + if (rc != FSUCCESS) + CERROR ("Deregister memory: %d\n", rc); /* fall through */ -#if IBNAL_FMR - case IBNAL_INIT_FMR: - rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool); - if (rc != 0) - CERROR ("Destroy FMR pool error: %d\n", rc); - /* fall through */ -#endif case IBNAL_INIT_PD: - rc = iibt_pd_free(kibnal_data.kib_pd); + rc = iba_free_pd(kibnal_data.kib_pd); if (rc != 0) CERROR ("Destroy PD error: %d\n", rc); /* fall through */ case IBNAL_INIT_SD: - rc = iibt_sd_deregister(kibnal_data.kib_sd); + rc = iba_sd_deregister(kibnal_data.kib_sd); if (rc != 0) CERROR ("Deregister SD error: %d\n", rc); /* fall through */ - case IBNAL_INIT_PORT: - /* XXX ??? 
*/ - /* fall through */ - case IBNAL_INIT_PORTATTRS: - PORTAL_FREE(kibnal_data.kib_hca_attrs.PortAttributesList, + LIBCFS_FREE(kibnal_data.kib_hca_attrs.PortAttributesList, kibnal_data.kib_hca_attrs.PortAttributesListSize); /* fall through */ case IBNAL_INIT_HCA: - rc = iibt_close_hca(kibnal_data.kib_hca); + rc = iba_close_ca(kibnal_data.kib_hca); if (rc != 0) CERROR ("Close HCA error: %d\n", rc); /* fall through */ - case IBNAL_INIT_LIB: - lib_fini(&kibnal_lib); - /* fall through */ - case IBNAL_INIT_DATA: - /* Module refcount only gets to zero when all peers - * have been closed so all lists must be empty */ LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0); LASSERT (kibnal_data.kib_peers != NULL); for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { LASSERT (list_empty (&kibnal_data.kib_peers[i])); } LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0); - LASSERT (list_empty (&kibnal_data.kib_sched_rxq)); - LASSERT (list_empty (&kibnal_data.kib_sched_txq)); + LASSERT (list_empty (&kibnal_data.kib_connd_zombies)); LASSERT (list_empty (&kibnal_data.kib_connd_conns)); LASSERT (list_empty (&kibnal_data.kib_connd_peers)); @@ -1282,83 +1752,143 @@ kibnal_api_shutdown (nal_t *nal) break; } - if (kibnal_data.kib_tx_descs != NULL) - PORTAL_FREE (kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); + kibnal_free_tx_descs(); if (kibnal_data.kib_peers != NULL) - PORTAL_FREE (kibnal_data.kib_peers, + LIBCFS_FREE (kibnal_data.kib_peers, sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); - printk(KERN_INFO "Lustre: Infinicon IB NAL unloaded (final mem %d)\n", - atomic_read(&portal_kmemory)); + atomic_read (&libcfs_kmemory)); kibnal_data.kib_init = IBNAL_INIT_NOTHING; + PORTAL_MODULE_UNUSE; } -#define roundup_power(val, power) \ - ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) ) - -/* this isn't very portable or sturdy in the face of funny mem/bus configs */ -static __u64 max_phys_mem(IB_CA_ATTRIBUTES *ca_attr) +int +kibnal_get_ipif_name(char *ifname, int ifname_size, int idx) { - struct sysinfo si; - __u64 ret; + char *basename = *kibnal_tunables.kib_ipif_basename; + int n = strlen(basename); + int baseidx; + int m; - /* XXX we don't bother with first-gen cards */ - if (ca_attr->VendorId == 0xd0b7 && ca_attr->DeviceId == 0x3101) - return 0ULL; + if (n == 0) { /* empty string */ + CERROR("Empty IP interface basename specified\n"); + return -EINVAL; + } - si_meminfo(&si); - ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit; - return roundup_power(ret, 128 * 1024 * 1024); -} -#undef roundup_power - -static int -kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) + for (m = n; m > 0; m--) /* find max numeric postfix */ + if (sscanf(basename + m - 1, "%d", &baseidx) != 1) + break; + + if (m == 0) /* just a number */ + m = n; + + if (m == n) /* no postfix */ + baseidx = 1; /* default to 1 */ + + if (m >= ifname_size) + m = ifname_size - 1; + + memcpy(ifname, basename, m); /* copy prefix name */ + + snprintf(ifname + m, ifname_size - m, "%d", baseidx + idx); + + if (strlen(ifname) == ifname_size - 1) { + CERROR("IP interface basename %s too long\n", basename); + return -EINVAL; + } + + return 0; +} + +int +kibnal_startup (lnet_ni_t *ni) { - ptl_process_id_t process_id; - int pkmem = atomic_read(&portal_kmemory); + char ipif_name[32]; + __u32 ip; + __u32 netmask; + int up; + int nob; + struct timeval tv; 
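kibnal_get_ipif_name() above derives the IPoIB interface to probe from the configured basename: the basename's numeric suffix, if any, becomes the starting index and the HCA index is added to it, while a basename with no numeric suffix starts counting at 1. A user-space sketch of that parsing, under this reading of the code; ipif_name() and the sample values are illustrative only, not LNET symbols.

#include <stdio.h>
#include <string.h>

static void ipif_name(char *out, int sz, const char *base, int idx)
{
        int n = strlen(base);
        int m, baseidx = 0;

        for (m = n; m > 0; m--)          /* find longest numeric suffix */
                if (sscanf(base + m - 1, "%d", &baseidx) != 1)
                        break;
        if (m == 0)                      /* basename is all digits */
                m = n;
        if (m == n)                      /* no numeric suffix: count from 1 */
                baseidx = 1;
        snprintf(out, sz, "%.*s%d", m, base, baseidx + idx);
}

int main(void)
{
        char name[32];

        ipif_name(name, sizeof(name), "ib0", 1);
        printf("%s\n", name);            /* "ib1": prefix "ib", base 0 + idx 1 */
        return 0;
}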
IB_PORT_ATTRIBUTES *pattr; FSTATUS frc; int rc; - int n; + __u32 n; int i; - LASSERT (nal == &kibnal_api); + LASSERT (ni->ni_lnd == &the_kiblnd); - if (nal->nal_refct != 0) { - if (actual_limits != NULL) - *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits; - /* This module got the first ref */ - PORTAL_MODULE_USE; - return (PTL_OK); + /* Only 1 instance supported */ + if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) { + CERROR ("Only 1 instance supported\n"); + return -EPERM; } - LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING); + if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) { + CERROR ("Can't set credits(%d) > ntx(%d)\n", + *kibnal_tunables.kib_credits, + *kibnal_tunables.kib_ntx); + return -EINVAL; + } - frc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2, - &kibnal_data.kib_interfaces); - if (frc != FSUCCESS) { - CERROR("IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2) = %d\n", - frc); - return -ENOSYS; + ni->ni_maxtxcredits = *kibnal_tunables.kib_credits; + ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits; + + CLASSERT (LNET_MAX_INTERFACES > 1); + + if (ni->ni_interfaces[0] == NULL) { + kibnal_data.kib_hca_idx = 0; + } else { + /* Use the HCA specified in 'networks=' */ + if (ni->ni_interfaces[1] != NULL) { + CERROR("Multiple interfaces not supported\n"); + return -EPERM; + } + + /* Parse into kib_hca_idx */ + nob = strlen(ni->ni_interfaces[0]); + if (sscanf(ni->ni_interfaces[0], "%d%n", + &kibnal_data.kib_hca_idx, &nob) < 1 || + nob != strlen(ni->ni_interfaces[0])) { + CERROR("Can't parse interface '%s'\n", + ni->ni_interfaces[0]); + return -EINVAL; + } + } + + rc = kibnal_get_ipif_name(ipif_name, sizeof(ipif_name), + kibnal_data.kib_hca_idx); + if (rc != 0) + return rc; + + rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask); + if (rc != 0) { + CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc); + return -ENETDOWN; + } + + if (!up) { + CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name); + return -ENETDOWN; } + + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip); + + ni->ni_data = &kibnal_data; + kibnal_data.kib_ni = ni; + + do_gettimeofday(&tv); + kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - init_MUTEX (&kibnal_data.kib_nid_mutex); - init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal); - kibnal_data.kib_nid = PTL_NID_ANY; + PORTAL_MODULE_USE; rwlock_init(&kibnal_data.kib_global_lock); kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE; - PORTAL_ALLOC (kibnal_data.kib_peers, + LIBCFS_ALLOC (kibnal_data.kib_peers, sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); if (kibnal_data.kib_peers == NULL) { goto failed; @@ -1369,22 +1899,18 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, spin_lock_init (&kibnal_data.kib_connd_lock); INIT_LIST_HEAD (&kibnal_data.kib_connd_peers); INIT_LIST_HEAD (&kibnal_data.kib_connd_conns); + INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies); init_waitqueue_head (&kibnal_data.kib_connd_waitq); spin_lock_init (&kibnal_data.kib_sched_lock); - INIT_LIST_HEAD (&kibnal_data.kib_sched_txq); - INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq); init_waitqueue_head (&kibnal_data.kib_sched_waitq); spin_lock_init (&kibnal_data.kib_tx_lock); INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); - INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs); - init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq); - PORTAL_ALLOC (kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); - if (kibnal_data.kib_tx_descs == NULL) { - CERROR ("Can't allocate tx descs\n"); + rc = 
kibnal_alloc_tx_descs(); + if (rc != 0) { + CERROR("Can't allocate tx descs\n"); goto failed; } @@ -1392,24 +1918,15 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, kibnal_data.kib_init = IBNAL_INIT_DATA; /*****************************************************/ - process_id.pid = requested_pid; - process_id.nid = kibnal_data.kib_nid; - - rc = lib_init(&kibnal_lib, nal, process_id, - requested_limits, actual_limits); - if (rc != PTL_OK) { - CERROR("lib_init failed: error %d\n", rc); - goto failed; - } - - /* lib interface initialised */ - kibnal_data.kib_init = IBNAL_INIT_LIB; - /*****************************************************/ + kibnal_data.kib_sdretry.RetryCount = *kibnal_tunables.kib_sd_retries; + kibnal_data.kib_sdretry.Timeout = (*kibnal_tunables.kib_timeout * 1000)/ + *kibnal_tunables.kib_sd_retries; for (i = 0; i < IBNAL_N_SCHED; i++) { - rc = kibnal_thread_start (kibnal_scheduler, (void *)i); + rc = kibnal_thread_start (kibnal_scheduler, + (void *)(unsigned long)i); if (rc != 0) { - CERROR("Can't spawn iibnal scheduler[%d]: %d\n", + CERROR("Can't spawn iib scheduler[%d]: %d\n", i, rc); goto failed; } @@ -1417,30 +1934,38 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, rc = kibnal_thread_start (kibnal_connd, NULL); if (rc != 0) { - CERROR ("Can't spawn iibnal connd: %d\n", rc); + CERROR ("Can't spawn iib connd: %d\n", rc); goto failed; } n = sizeof(kibnal_data.kib_hca_guids) / sizeof(kibnal_data.kib_hca_guids[0]); - frc = iibt_get_hca_guids(&n, kibnal_data.kib_hca_guids); + frc = iba_get_caguids(&n, kibnal_data.kib_hca_guids); if (frc != FSUCCESS) { - CERROR ("Can't get channel adapter guids: %d\n", frc); + CERROR ("Can't get HCA guids: %d\n", frc); goto failed; } + if (n == 0) { - CERROR ("No channel adapters found\n"); + CERROR ("No HCAs found\n"); goto failed; } - /* Infinicon has per-HCA rather than per CQ completion handlers */ - frc = iibt_open_hca(kibnal_data.kib_hca_guids[0], - kibnal_ca_callback, - kibnal_ca_async_callback, - &kibnal_data.kib_hca, + if (n <= kibnal_data.kib_hca_idx) { + CERROR("Invalid HCA %d requested: (must be 0 - %d inclusive)\n", + kibnal_data.kib_hca_idx, n - 1); + goto failed; + } + + /* Infinicon has per-HCA notification callbacks */ + frc = iba_open_ca(kibnal_data.kib_hca_guids[kibnal_data.kib_hca_idx], + kibnal_hca_callback, + kibnal_hca_async_callback, + NULL, &kibnal_data.kib_hca); if (frc != FSUCCESS) { - CERROR ("Can't open CA[0]: %d\n", frc); + CERROR ("Can't open HCA[%d]: %d\n", + kibnal_data.kib_hca_idx, frc); goto failed; } @@ -1450,14 +1975,14 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, kibnal_data.kib_hca_attrs.PortAttributesList = NULL; kibnal_data.kib_hca_attrs.PortAttributesListSize = 0; - frc = iibt_query_hca(kibnal_data.kib_hca, - &kibnal_data.kib_hca_attrs, NULL); + frc = iba_query_ca(kibnal_data.kib_hca, + &kibnal_data.kib_hca_attrs, NULL); if (frc != FSUCCESS) { CERROR ("Can't size port attrs: %d\n", frc); goto failed; } - PORTAL_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList, + LIBCFS_ALLOC(kibnal_data.kib_hca_attrs.PortAttributesList, kibnal_data.kib_hca_attrs.PortAttributesListSize); if (kibnal_data.kib_hca_attrs.PortAttributesList == NULL) goto failed; @@ -1466,10 +1991,11 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, kibnal_data.kib_init = IBNAL_INIT_PORTATTRS; /*****************************************************/ - frc = iibt_query_hca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs, - NULL); + frc = iba_query_ca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs, 
+ NULL); if (frc != FSUCCESS) { - CERROR ("Can't get port attrs for CA 0: %d\n", frc); + CERROR ("Can't get port attrs for HCA %d: %d\n", + kibnal_data.kib_hca_idx, frc); goto failed; } @@ -1508,11 +2034,7 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid); - /* Active port found */ - kibnal_data.kib_init = IBNAL_INIT_PORT; - /*****************************************************/ - - frc = iibt_sd_register(&kibnal_data.kib_sd, NULL); + frc = iba_sd_register(&kibnal_data.kib_sd, NULL); if (frc != FSUCCESS) { CERROR ("Can't register with SD: %d\n", frc); goto failed; @@ -1522,7 +2044,7 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, kibnal_data.kib_init = IBNAL_INIT_SD; /*****************************************************/ - frc = iibt_pd_allocate(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd); + frc = iba_alloc_pd(kibnal_data.kib_hca, 0, &kibnal_data.kib_pd); if (frc != FSUCCESS) { CERROR ("Can't create PD: %d\n", rc); goto failed; @@ -1532,73 +2054,14 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, kibnal_data.kib_init = IBNAL_INIT_PD; /*****************************************************/ -#if IBNAL_FMR - { - const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK; - struct ib_fmr_pool_param params = { - .max_pages_per_fmr = PTL_MTU/PAGE_SIZE, - .access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ), - .pool_size = pool_size, - .dirty_watermark = (pool_size * 3)/4, - .flush_function = NULL, - .flush_arg = NULL, - .cache = 1, - }; - rc = ib_fmr_pool_create(kibnal_data.kib_pd, ¶ms, - &kibnal_data.kib_fmr_pool); - if (rc != 0) { - CERROR ("Can't create FMR pool size %d: %d\n", - pool_size, rc); - goto failed; - } - } - - /* flag FMR pool initialised */ - kibnal_data.kib_init = IBNAL_INIT_FMR; -#endif - /*****************************************************/ - if (IBNAL_WHOLE_MEM) { - IB_MR_PHYS_BUFFER phys; - IB_ACCESS_CONTROL access; - kib_md_t *md = &kibnal_data.kib_md; - - memset(&access, 0, sizeof(access)); - access.s.MWBindable = 1; - access.s.LocalWrite = 1; - access.s.RdmaRead = 1; - access.s.RdmaWrite = 1; - - phys.PhysAddr = 0; - phys.Length = max_phys_mem(&kibnal_data.kib_hca_attrs); - if (phys.Length == 0) { - CERROR ("couldn't determine the end of phys mem\n"); - goto failed; - } - - rc = iibt_register_contig_physical_memory(kibnal_data.kib_hca, - 0, - &phys, 1, - 0, - kibnal_data.kib_pd, - access, - &md->md_handle, - &md->md_addr, - &md->md_lkey, - &md->md_rkey); - if (rc != FSUCCESS) { - CERROR("registering physical memory failed: %d\n", - rc); - CERROR("falling back to registration per-rdma\n"); - md->md_handle = NULL; - } else { - CDEBUG(D_NET, "registered "LPU64" bytes of mem\n", - phys.Length); - kibnal_data.kib_init = IBNAL_INIT_MR; - } + rc = kibnal_register_all_memory(); + if (rc != 0) { + CERROR ("Can't register all memory\n"); + goto failed; } - + + /* flag whole memory MD initialised */ + kibnal_data.kib_init = IBNAL_INIT_MD; /*****************************************************/ rc = kibnal_setup_tx_descs(); @@ -1611,38 +2074,33 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, kibnal_data.kib_init = IBNAL_INIT_TXD; /*****************************************************/ - { - uint32 nentries; - - frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES, - &kibnal_data.kib_cq, &kibnal_data.kib_cq, - &nentries); - if (frc != FSUCCESS) { - CERROR ("Can't create RX CQ: %d\n", frc); - goto failed; - } - - /* flag CQ initialised */ - 
kibnal_data.kib_init = IBNAL_INIT_CQ; + frc = iba_create_cq(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(), + &kibnal_data.kib_cq, &kibnal_data.kib_cq, + &n); + if (frc != FSUCCESS) { + CERROR ("Can't create RX CQ: %d\n", frc); + goto failed; + } - if (nentries < IBNAL_CQ_ENTRIES) { - CERROR ("CQ only has %d entries, need %d\n", - nentries, IBNAL_CQ_ENTRIES); - goto failed; - } + /* flag CQ initialised */ + kibnal_data.kib_init = IBNAL_INIT_CQ; + /*****************************************************/ + + if (n < IBNAL_CQ_ENTRIES()) { + CERROR ("CQ only has %d entries: %d needed\n", + n, IBNAL_CQ_ENTRIES()); + goto failed; + } - rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC); - if (rc != 0) { - CERROR ("Failed to re-arm completion queue: %d\n", rc); - goto failed; - } + rc = iba_rearm_cq(kibnal_data.kib_cq, CQEventSelNextWC); + if (rc != 0) { + CERROR ("Failed to re-arm completion queue: %d\n", rc); + goto failed; } - /*****************************************************/ - - rc = libcfs_nal_cmd_register(IIBNAL, &kibnal_cmd, NULL); + rc = kibnal_start_listener(); if (rc != 0) { - CERROR ("Can't initialise command interface (rc = %d)\n", rc); + CERROR("Can't start listener: %d\n", rc); goto failed; } @@ -1650,26 +2108,18 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, kibnal_data.kib_init = IBNAL_INIT_ALL; /*****************************************************/ - printk(KERN_INFO "Lustre: Infinicon IB NAL loaded " - "(initial mem %d)\n", pkmem); - - return (PTL_OK); + return (0); failed: - kibnal_api_shutdown (&kibnal_api); - return (PTL_FAIL); + kibnal_shutdown (ni); + return (-ENETDOWN); } void __exit kibnal_module_fini (void) { -#ifdef CONFIG_SYSCTL - if (kibnal_tunables.kib_sysctl != NULL) - unregister_sysctl_table (kibnal_tunables.kib_sysctl); -#endif - PtlNIFini(kibnal_ni); - - ptl_unregister_nal(IIBNAL); + lnet_unregister_lnd(&the_kiblnd); + kibnal_tunables_fini(); } int __init @@ -1677,46 +2127,22 @@ kibnal_module_init (void) { int rc; - if (sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN) { - CERROR("sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN\n"); - return -EINVAL; - } - - /* the following must be sizeof(int) for proc_dointvec() */ - if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) { - CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n"); - return -EINVAL; + if (the_lnet.ln_ptlcompat != 0) { + LCONSOLE_ERROR("IIB does not support portals compatibility mode\n"); + return -ENODEV; } + + rc = kibnal_tunables_init(); + if (rc != 0) + return rc; - kibnal_api.nal_ni_init = kibnal_api_startup; - kibnal_api.nal_ni_fini = kibnal_api_shutdown; - - /* Initialise dynamic tunables to defaults once only */ - kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT; - - rc = ptl_register_nal(IIBNAL, &kibnal_api); - if (rc != PTL_OK) { - CERROR("Can't register IBNAL: %d\n", rc); - return (-ENOMEM); /* or something... */ - } + lnet_register_lnd(&the_kiblnd); - /* Pure gateways want the NAL started up at module load time... */ - rc = PtlNIInit(IIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - ptl_unregister_nal(IIBNAL); - return (-ENODEV); - } - -#ifdef CONFIG_SYSCTL - /* Press on regardless even if registering sysctl doesn't work */ - kibnal_tunables.kib_sysctl = - register_sysctl_table (kibnal_top_ctl_table, 0); -#endif - return (0); + return 0; } MODULE_AUTHOR("Cluster File Systems, Inc. 
"); -MODULE_DESCRIPTION("Kernel Infinicon IB NAL v0.01"); +MODULE_DESCRIPTION("Kernel Infinicon IB LND v1.00"); MODULE_LICENSE("GPL"); module_init(kibnal_module_init); diff --git a/lnet/klnds/iiblnd/iiblnd.h b/lnet/klnds/iiblnd/iiblnd.h index e16bd4c..0a2fa94 100644 --- a/lnet/klnds/iiblnd/iiblnd.h +++ b/lnet/klnds/iiblnd/iiblnd.h @@ -48,12 +48,11 @@ #include #include -#define DEBUG_SUBSYSTEM S_NAL +#define DEBUG_SUBSYSTEM S_LND #include -#include -#include -#include +#include +#include #include @@ -69,90 +68,85 @@ #error Invalid GCC version. Must use GCC >= 3.2.3 #endif -#define IBNAL_SERVICE_NAME "iibnal" -#define IBNAL_SERVICE_NUMBER 0x11b9a1 - #if CONFIG_SMP # define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ #else # define IBNAL_N_SCHED 1 /* # schedulers */ #endif -#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */ -#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ +#define IBNAL_FMR 0 /* map on demand v. use whole mem mapping */ -#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ +/* tunables fixed at compile time */ +#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ +#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ +#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ +#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */ +#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ +#define IBNAL_RDMA_BASE 0x0eeb0000 +#define IBNAL_STARTING_PSN 1 -#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ -#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */ +/* QP tunables */ /* 7 indicates infinite retry attempts, Infinicon recommended 5 */ -#define IBNAL_RETRY 5 /* # times to retry */ -#define IBNAL_RNR_RETRY 5 /* */ -#define IBNAL_CM_RETRY 5 /* # times to retry connection */ -#define IBNAL_FLOW_CONTROL 1 -#define IBNAL_ACK_TIMEOUT 20 /* supposedly 4 secs */ - -#define IBNAL_NTX 64 /* # tx descs */ -/* this had to be dropped down so that we only register < 255 pages per - * region. this will change if we register all memory. */ -#define IBNAL_NTX_NBLK 128 /* # reserved tx descs */ - -#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ - -#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ - -#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ - -/* default vals for runtime tunables */ -#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ +#define IBNAL_RETRY 5 /* # times to retry */ +#define IBNAL_RNR_RETRY 5 /* */ +#define IBNAL_CM_RETRY 5 /* # times to retry connection */ +#define IBNAL_FLOW_CONTROL 1 +#define IBNAL_ACK_TIMEOUT 20 /* supposedly 4 secs */ +#define IBNAL_EE_FLOW 1 +#define IBNAL_LOCAL_SUB 1 +#define IBNAL_FAILOVER_ACCEPTED 0 /************************/ /* derived constants... 
 */

/* TX messages (shared by all connections) */

-#define IBNAL_TX_MSGS      (IBNAL_NTX + IBNAL_NTX_NBLK)
-#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE)
-#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
+#define IBNAL_TX_MSGS()      (*kibnal_tunables.kib_ntx)
+#define IBNAL_TX_MSG_BYTES() (IBNAL_TX_MSGS() * IBNAL_MSG_SIZE)
+#define IBNAL_TX_MSG_PAGES() ((IBNAL_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE)

-#define IBNAL_TX_MAX_SG (PTL_MD_MAX_IOV + 1)
+#if IBNAL_USE_FMR
+# define IBNAL_MAX_RDMA_FRAGS   1
+# define IBNAL_CONCURRENT_SENDS IBNAL_RX_MSGS
+#else
+# define IBNAL_MAX_RDMA_FRAGS   LNET_MAX_IOV
+# define IBNAL_CONCURRENT_SENDS IBNAL_MSG_QUEUE_SIZE
+#endif

 /* RX messages (per connection) */
-#define IBNAL_RX_MSGS      IBNAL_MSG_QUEUE_SIZE
-#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
-#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)
-
+#define IBNAL_RX_MSGS      (IBNAL_MSG_QUEUE_SIZE * 2)
+#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE)
+#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE)

-/* we may have up to 2 completions per transmit +
-   1 completion per receive, per connection */
-#define IBNAL_CQ_ENTRIES ((2*IBNAL_TX_MSGS) + \
-                          (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS))
-
-#define IBNAL_RDMA_BASE  0x0eeb0000
-#define IBNAL_FMR        0
-#define IBNAL_WHOLE_MEM  1
-#define IBNAL_CKSUM      0
-//#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS
-#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT
-
-/* XXX I have no idea. */
-#define IBNAL_STARTING_PSN 1
+#define IBNAL_CQ_ENTRIES() (IBNAL_TX_MSGS() * (1 + IBNAL_MAX_RDMA_FRAGS) + \
+                            (IBNAL_RX_MSGS * *kibnal_tunables.kib_concurrent_peers))

 typedef struct
 {
-        int      kib_io_timeout;                /* comms timeout (seconds) */
+        char         **kib_hca_basename;        /* HCA base name */
+        char         **kib_ipif_basename;       /* IPoIB interface base name */
+        char         **kib_service_name;        /* global service name */
+        unsigned int  *kib_service_number;      /* global service number */
+        int           *kib_min_reconnect_interval; /* min connect retry seconds... */
+        int           *kib_max_reconnect_interval; /* max connect retry seconds */
+        int           *kib_concurrent_peers;    /* max # peers */
+        int           *kib_cksum;               /* checksum kib_msg_t? */
+        int           *kib_timeout;             /* comms timeout (seconds) */
+        int           *kib_keepalive;           /* keepalive timeout (seconds) */
+        int           *kib_ntx;                 /* # tx descs */
+        int           *kib_credits;             /* # concurrent sends */
+        int           *kib_peercredits;         /* # concurrent sends to 1 peer */
+        int           *kib_sd_retries;          /* # times to retry SD queries */
+        int           *kib_concurrent_sends;    /* send work queue sizing */
+#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
         struct ctl_table_header *kib_sysctl;    /* sysctl interface */
+#endif
 } kib_tunables_t;

-/* some of these have specific types in the stack that just map back
- * to the uFOO types, like IB_{L,R}_KEY. */
+/* NB The Infinicon stack has specific typedefs for some things
+ * (e.g. IB_{L,R}_KEY), that just map back to __u32 etc */
 typedef struct
 {
         int          ibp_npages;         /* # pages */
-        int          ibp_mapped;         /* mapped? */
-        __u64        ibp_vaddr;          /* mapped region vaddr */
-        __u32        ibp_lkey;           /* mapped region lkey */
-        __u32        ibp_rkey;           /* mapped region rkey */
-        IB_HANDLE    ibp_handle;         /* mapped region handle */
         struct page *ibp_pages[0];
 } kib_pages_t;

@@ -170,39 +164,35 @@ typedef struct
         __u64             kib_incarnation; /* which one am I */
         int               kib_shutdown;    /* shut down? 
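To see the scale IBNAL_CQ_ENTRIES() above implies: every tx can complete its send plus up to IBNAL_MAX_RDMA_FRAGS RDMA work requests, and each peer's connection posts IBNAL_RX_MSGS receives. The worked example below uses hypothetical tunable values (ntx = 256, concurrent_peers = 1024) and assumes LNET_MAX_IOV is 256; the real numbers depend on the module parameters actually set.

#include <stdio.h>

#define MSG_QUEUE_SIZE  8
#define RX_MSGS         (MSG_QUEUE_SIZE * 2)     /* as defined above */
#define MAX_RDMA_FRAGS  256                      /* assumed LNET_MAX_IOV */

int main(void)
{
        long ntx = 256, peers = 1024;            /* hypothetical tunables */
        long cq = ntx * (1 + MAX_RDMA_FRAGS) + RX_MSGS * peers;

        printf("%ld\n", cq);                     /* 65792 + 16384 = 82176 */
        return 0;
}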
*/ atomic_t kib_nthreads; /* # live threads */ + lnet_ni_t *kib_ni; /* _the_ iib instance */ - __u64 kib_service_id; /* service number I listen on */ __u64 kib_port_guid; /* my GUID (lo 64 of GID)*/ __u16 kib_port_pkey; /* my pkey, whatever that is */ - ptl_nid_t kib_nid; /* my NID */ - struct semaphore kib_nid_mutex; /* serialise NID ops */ - struct semaphore kib_nid_signal; /* signal completion */ - IB_HANDLE kib_cep; /* connection end point */ + struct semaphore kib_listener_signal; /* signal completion */ + IB_HANDLE kib_listener_cep; /* connection end point */ rwlock_t kib_global_lock; /* stabilize peer/conn ops */ + int kib_ready; /* CQ callback fired */ + int kib_checking_cq; /* a scheduler is checking the CQ */ struct list_head *kib_peers; /* hash table of all my known peers */ int kib_peer_hash_size; /* size of kib_peers */ atomic_t kib_npeers; /* # peers extant */ atomic_t kib_nconns; /* # connections extant */ + struct list_head kib_connd_zombies; /* connections to free */ struct list_head kib_connd_conns; /* connections to progress */ struct list_head kib_connd_peers; /* peers waiting for a connection */ - wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */ - unsigned long kib_connd_waketime; /* when connd will wake */ + wait_queue_head_t kib_connd_waitq; /* connection daemon sleep here */ spinlock_t kib_connd_lock; /* serialise */ wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */ - struct list_head kib_sched_txq; /* tx requiring attention */ - struct list_head kib_sched_rxq; /* rx requiring attention */ spinlock_t kib_sched_lock; /* serialise */ struct kib_tx *kib_tx_descs; /* all the tx descriptors */ kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ struct list_head kib_idle_txs; /* idle tx descriptors */ - struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */ - wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */ __u64 kib_next_tx_cookie; /* RDMA completion cookie */ spinlock_t kib_tx_lock; /* serialise */ @@ -211,15 +201,13 @@ typedef struct IB_HANDLE kib_pd; /* protection domain */ IB_HANDLE kib_sd; /* SD handle */ IB_HANDLE kib_cq; /* completion queue */ - kib_md_t kib_md; /* full-mem registration */ + kib_md_t kib_whole_mem; /* whole-mem registration */ - void *kib_listen_handle; /* where I listen for connections */ + int kib_hca_idx; /* my HCA number */ + uint64 kib_hca_guids[8]; /* all the HCA guids */ + IB_CA_ATTRIBUTES kib_hca_attrs; /* where to get HCA attrs */ - IBT_INTERFACE_UNION kib_interfaces; /* The Infinicon IBT interface */ - - uint64 kib_hca_guids[8]; /* all the HCA guids */ - IB_CA_ATTRIBUTES kib_hca_attrs; /* where to get HCA attrs */ - FABRIC_OPERATION_DATA kib_fabopdata; /* (un)advertise service record */ + COMMAND_CONTROL_PARAMETERS kib_sdretry; /* control SD query retries */ } kib_data_t; #define IBNAL_INIT_NOTHING 0 @@ -227,14 +215,12 @@ typedef struct #define IBNAL_INIT_LIB 2 #define IBNAL_INIT_HCA 3 #define IBNAL_INIT_PORTATTRS 4 -#define IBNAL_INIT_PORT 5 -#define IBNAL_INIT_SD 6 -#define IBNAL_INIT_PD 7 -#define IBNAL_INIT_FMR 8 -#define IBNAL_INIT_MR 9 -#define IBNAL_INIT_TXD 10 -#define IBNAL_INIT_CQ 11 -#define IBNAL_INIT_ALL 12 +#define IBNAL_INIT_SD 5 +#define IBNAL_INIT_PD 6 +#define IBNAL_INIT_MD 7 +#define IBNAL_INIT_TXD 8 +#define IBNAL_INIT_CQ 9 +#define IBNAL_INIT_ALL 10 /************************************************************************ * Wire message structs. @@ -243,35 +229,60 @@ typedef struct * private data and SM service info), is LE on the wire. 
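The sender transmits kib_msg_t in its native byte order and the receiver flips if necessary, detecting the need by whether ibm_magic arrives intact or byte-reversed. A self-contained sketch of that convention; the magic used here is the pre-v2 value shown further down, reused purely as an example.

#include <stdint.h>

#define MSG_MAGIC 0x0be91b91u            /* example: the old IBNAL_MSG_MAGIC */

static uint32_t swab32(uint32_t v)
{
        return ((v & 0x000000ffu) << 24) | ((v & 0x0000ff00u) << 8) |
               ((v & 0x00ff0000u) >> 8)  | ((v & 0xff000000u) >> 24);
}

/* Returns 1 if every multi-byte field must be swabbed on unpack,
 * 0 if the sender shares our byte order, -1 if it is not our protocol. */
int msg_needs_swab(uint32_t magic)
{
        if (magic == MSG_MAGIC)
                return 0;
        if (magic == swab32(MSG_MAGIC))
                return 1;
        return -1;
}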
*/ -/* also kib_md_t above */ +typedef struct kib_connparams +{ + __u32 ibcp_queue_depth; + __u32 ibcp_max_msg_size; + __u32 ibcp_max_frags; +} WIRE_ATTR kib_connparams_t; + +typedef struct +{ + lnet_hdr_t ibim_hdr; /* portals header */ + char ibim_payload[0]; /* piggy-backed payload */ +} WIRE_ATTR kib_immediate_msg_t; +#if IBNAL_USE_FMR typedef struct { - __u32 rd_nob; /* # of bytes */ - __u64 rd_addr; /* remote io vaddr */ + __u64 rd_addr; /* IO VMA address */ + __u32 rd_nob; /* # of bytes */ + __u32 rd_key; /* remote key */ } WIRE_ATTR kib_rdma_desc_t; +#else +typedef struct +{ + __u32 rf_nob; /* # of bytes */ + __u64 rf_addr; /* remote io vaddr */ +} WIRE_ATTR kib_rdma_frag_t; typedef struct { - ptl_hdr_t ibim_hdr; /* portals header */ - char ibim_payload[0]; /* piggy-backed payload */ -} WIRE_ATTR kib_immediate_msg_t; + __u32 rd_key; /* local/remote key */ + __u32 rd_nfrag; /* # fragments */ + kib_rdma_frag_t rd_frags[0]; /* buffer frags */ +} WIRE_ATTR kib_rdma_desc_t; +#endif + +typedef struct +{ + lnet_hdr_t ibprm_hdr; /* LNET header */ + __u64 ibprm_cookie; /* opaque completion cookie */ +} WIRE_ATTR kib_putreq_msg_t; -/* these arrays serve two purposes during rdma. they are built on the passive - * side and sent to the active side as remote arguments. On the active side - * the descs are used as a data structure on the way to local gather items. - * the different roles result in split local/remote meaning of desc->rd_key */ typedef struct { - ptl_hdr_t ibrm_hdr; /* portals header */ - __u64 ibrm_cookie; /* opaque completion cookie */ - __u32 ibrm_num_descs; /* how many descs */ - __u32 rd_key; /* remote key */ - kib_rdma_desc_t ibrm_desc[0]; /* where to suck/blow */ -} WIRE_ATTR kib_rdma_msg_t; + __u64 ibpam_src_cookie; /* reflected completion cookie */ + __u64 ibpam_dst_cookie; /* opaque completion cookie */ + kib_rdma_desc_t ibpam_rd; /* sender's sink buffer */ +} WIRE_ATTR kib_putack_msg_t; -#define kib_rdma_msg_len(num_descs) \ - offsetof(kib_msg_t, ibm_u.rdma.ibrm_desc[num_descs]) +typedef struct +{ + lnet_hdr_t ibgm_hdr; /* LNET header */ + __u64 ibgm_cookie; /* opaque completion cookie */ + kib_rdma_desc_t ibgm_rd; /* sender's sink buffer */ +} WIRE_ATTR kib_get_msg_t; typedef struct { @@ -281,30 +292,49 @@ typedef struct typedef struct { - __u32 ibm_magic; /* I'm an openibnal message */ - __u16 ibm_version; /* this is my version number */ - __u8 ibm_type; /* msg type */ - __u8 ibm_credits; /* returned credits */ -#if IBNAL_CKSUM - __u32 ibm_nob; - __u32 ibm_cksum; -#endif + /* First 2 fields fixed FOR ALL TIME */ + __u32 ibm_magic; /* I'm an openibnal message */ + __u16 ibm_version; /* this is my version number */ + + __u8 ibm_type; /* msg type */ + __u8 ibm_credits; /* returned credits */ + __u32 ibm_nob; /* # bytes in whole message */ + __u32 ibm_cksum; /* checksum (0 == no checksum) */ + __u64 ibm_srcnid; /* sender's NID */ + __u64 ibm_srcstamp; /* sender's incarnation */ + __u64 ibm_dstnid; /* destination's NID */ + __u64 ibm_dststamp; /* destination's incarnation */ + __u64 ibm_seq; /* sequence number */ + union { + kib_connparams_t connparams; kib_immediate_msg_t immediate; - kib_rdma_msg_t rdma; + kib_putreq_msg_t putreq; + kib_putack_msg_t putack; + kib_get_msg_t get; kib_completion_msg_t completion; } WIRE_ATTR ibm_u; } WIRE_ATTR kib_msg_t; -#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ -#define IBNAL_MSG_VERSION 1 /* current protocol version */ +#define IBNAL_MSG_MAGIC LNET_PROTO_IIB_MAGIC /* unique magic */ +#define IBNAL_MSG_VERSION 2 /* current 
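kib_rdma_desc_t ends in a zero-length array (rd_frags[0], the pre-C99 spelling of a flexible array member), so a descriptor with room for n fragments is allocated with offsetof(..., rd_frags[n]); that is exactly the sizing kibnal_alloc_tx_descs() uses with IBNAL_MAX_RDMA_FRAGS. A standalone sketch of that allocation, with illustrative type names:

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

typedef struct {
        uint32_t rf_nob;                 /* # of bytes */
        uint64_t rf_addr;                /* remote io vaddr */
} rdma_frag_t;

typedef struct {
        uint32_t    rd_key;              /* local/remote key */
        uint32_t    rd_nfrag;            /* # fragments */
        rdma_frag_t rd_frags[];          /* C99 flexible array member */
} rdma_desc_t;

rdma_desc_t *rd_alloc(int nfrag)
{
        /* header plus exactly nfrag fragment slots */
        return malloc(offsetof(rdma_desc_t, rd_frags[nfrag]));
}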
protocol version */ +#define IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD 1 /* previous version */ +#define IBNAL_MSG_CONNREQ 0xc0 /* connection request */ +#define IBNAL_MSG_CONNACK 0xc1 /* connection acknowledge */ #define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */ -#define IBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */ -#define IBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */ -#define IBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */ -#define IBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */ -#define IBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */ +#define IBNAL_MSG_IMMEDIATE 0xd1 /* immediate */ +#define IBNAL_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */ +#define IBNAL_MSG_PUT_NAK 0xd3 /* completion (sink->src) */ +#define IBNAL_MSG_PUT_ACK 0xd4 /* putack (sink->src) */ +#define IBNAL_MSG_PUT_DONE 0xd5 /* completion (src->sink) */ +#define IBNAL_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ +#define IBNAL_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ + +/* connection rejection reasons */ +#define IBNAL_REJECT_CONN_RACE 0 /* You lost connection race */ +#define IBNAL_REJECT_NO_RESOURCES 1 /* Out of memory/conns etc */ +#define IBNAL_REJECT_FATAL 2 /* Anything else */ /***********************************************************************/ @@ -312,431 +342,167 @@ typedef struct kib_rx /* receive message */ { struct list_head rx_list; /* queue for attention */ struct kib_conn *rx_conn; /* owning conn */ - int rx_rdma; /* RDMA completion posted? */ - int rx_posted; /* posted? */ - __u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */ + int rx_nob; /* # bytes received (-1 while posted) */ + __u64 rx_hca_msg; /* pre-mapped buffer (hca vaddr) */ kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ - IB_WORK_REQ rx_wrq; + IB_WORK_REQ2 rx_wrq; IB_LOCAL_DATASEGMENT rx_gl; /* and its memory */ } kib_rx_t; typedef struct kib_tx /* transmit message */ { struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ - int tx_isnblk; /* I'm reserved for non-blocking sends */ struct kib_conn *tx_conn; /* owning conn */ int tx_mapped; /* mapped for RDMA? */ int tx_sending; /* # tx callbacks outstanding */ + int tx_queued; /* queued for sending */ + int tx_waiting; /* waiting for peer */ int tx_status; /* completion status */ unsigned long tx_deadline; /* completion deadline */ - int tx_passive_rdma; /* peer sucks/blows */ - int tx_passive_rdma_wait; /* waiting for peer to complete */ - __u64 tx_passive_rdma_cookie; /* completion cookie */ - lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ - kib_md_t tx_md; /* RDMA mapping (active/passive) */ - __u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */ + __u64 tx_cookie; /* completion cookie */ + lnet_msg_t *tx_lntmsg[2]; /* lnet msgs to finalize on completion */ kib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */ - int tx_nsp; /* # send work items */ - IB_WORK_REQ tx_wrq[IBNAL_TX_MAX_SG]; /* send work items... */ - IB_LOCAL_DATASEGMENT tx_gl[IBNAL_TX_MAX_SG]; /* ...and their memory */ + __u64 tx_hca_msg; /* pre-mapped buffer (HCA vaddr) */ + int tx_nwrq; /* # send work items */ +#if IBNAL_USE_FMR + IB_WORK_REQ2 tx_wrq[2]; /* send work items... */ + IB_LOCAL_DATASEGMENT tx_gl[2]; /* ...and their memory */ + kib_rdma_desc_t tx_rd[1]; /* rdma descriptor */ + kib_md_t tx_md; /* mapping */ + __u64 *tx_pages; /* page phys addrs */ +#else + IB_WORK_REQ2 *tx_wrq; /* send work items... 
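The message types above fall into three credit classes, which is what kibnal_queue_tx_locked() keys its queue selection on later in this header. A compact restatement, with the type values copied from the defines above:

enum credit_class {
        CRED_RESERVE,   /* reserve a reply buffer before sending */
        CRED_NONE,      /* peer already reserved a buffer; no credit */
        CRED_NORMAL     /* consume one send credit */
};

static enum credit_class msg_credit_class(int type)
{
        switch (type) {
        case 0xd2:      /* IBNAL_MSG_PUT_REQ */
        case 0xd6:      /* IBNAL_MSG_GET_REQ */
                return CRED_RESERVE;
        case 0xd3:      /* IBNAL_MSG_PUT_NAK */
        case 0xd4:      /* IBNAL_MSG_PUT_ACK */
        case 0xd5:      /* IBNAL_MSG_PUT_DONE */
        case 0xd7:      /* IBNAL_MSG_GET_DONE */
                return CRED_NONE;
        default:        /* NOOP, IMMEDIATE */
                return CRED_NORMAL;
        }
}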
*/ + IB_LOCAL_DATASEGMENT *tx_gl; /* ...and their memory */ + kib_rdma_desc_t *tx_rd; /* rdma descriptor (src buffers) */ +#endif } kib_tx_t; -#define KIB_TX_UNMAPPED 0 -#define KIB_TX_MAPPED 1 -#define KIB_TX_MAPPED_FMR 2 - -typedef struct kib_wire_connreq -{ - __u32 wcr_magic; /* I'm an openibnal connreq */ - __u16 wcr_version; /* this is my version number */ - __u16 wcr_queue_depth; /* this is my receive queue size */ - __u64 wcr_nid; /* peer's NID */ - __u64 wcr_incarnation; /* peer's incarnation */ -} kib_wire_connreq_t; - -typedef struct kib_gid -{ - __u64 hi, lo; -} kib_gid_t; - -typedef struct kib_connreq +typedef struct { - /* connection-in-progress */ - struct kib_conn *cr_conn; - kib_wire_connreq_t cr_wcr; - __u64 cr_tid; - IB_SERVICE_RECORD cr_service; - kib_gid_t cr_gid; - IB_PATH_RECORD cr_path; - CM_REQUEST_INFO cr_cmreq; - CM_CONN_INFO cr_discarded; -} kib_connreq_t; + /* scratchpad during connection establishment */ + IB_QP_ATTRIBUTES_QUERY cv_qpattrs; + QUERY cv_query; + IB_SERVICE_RECORD cv_svcrec; + IB_PATH_RECORD cv_path; + CM_CONN_INFO cv_cmci; +} kib_connvars_t; typedef struct kib_conn { struct kib_peer *ibc_peer; /* owning peer */ struct list_head ibc_list; /* stash on peer's conn list */ __u64 ibc_incarnation; /* which instance of the peer */ + __u64 ibc_txseq; /* tx sequence number */ + __u64 ibc_rxseq; /* rx sequence number */ + __u32 ibc_version; /* peer protocol version */ atomic_t ibc_refcount; /* # users */ int ibc_state; /* what's happening */ - atomic_t ibc_nob; /* # bytes buffered */ int ibc_nsends_posted; /* # uncompleted sends */ int ibc_credits; /* # credits I have */ int ibc_outstanding_credits; /* # credits to return */ - int ibc_rcvd_disconnect;/* received discon request */ - int ibc_sent_disconnect;/* sent discon request */ + int ibc_reserved_credits; /* # credits for ACK/DONE msgs */ + unsigned long ibc_last_send; /* time of last send */ + struct list_head ibc_early_rxs; /* rxs completed before ESTABLISHED */ + struct list_head ibc_tx_queue_nocred; /* sends that don't need a cred */ + struct list_head ibc_tx_queue_rsrvd; /* sends that need a reserved cred */ struct list_head ibc_tx_queue; /* send queue */ struct list_head ibc_active_txs; /* active tx awaiting completion */ spinlock_t ibc_lock; /* serialise */ kib_rx_t *ibc_rxs; /* the rx descs */ kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ IB_HANDLE ibc_qp; /* queue pair */ - IB_HANDLE ibc_cep; /* connection ID? 
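The ibc_credits/ibc_outstanding_credits pair above implements a simple flow-control loop: a send spends one of my credits, and every inbound message both returns credits the peer has freed and obliges me to return one for the rx buffer it consumed. A toy model of that exchange; this is a simplification, not the LND's actual bookkeeping, which also tracks the reserved class.

typedef struct {
        int credits;            /* sends I may post right now */
        int outstanding;        /* credits I owe the peer */
} credit_state_t;

static int try_send(credit_state_t *c)
{
        if (c->credits == 0)
                return 0;       /* stall until the peer returns credits */
        c->credits--;
        return 1;
}

static void on_receive(credit_state_t *c, int returned)
{
        c->credits += returned; /* peer freed that many rx buffers */
        c->outstanding++;       /* and I now owe one for this rx */
}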
*/ - IB_QP_ATTRIBUTES_QUERY ibc_qp_attrs; /* QP attrs */ - kib_connreq_t *ibc_connreq; /* connection request state */ + IB_HANDLE ibc_cep; /* CM endpoint */ + kib_connvars_t *ibc_cvars; /* connection scratchpad */ } kib_conn_t; #define IBNAL_CONN_INIT_NOTHING 0 /* initial state */ #define IBNAL_CONN_INIT_QP 1 /* ibc_qp set up */ #define IBNAL_CONN_CONNECTING 2 /* started to connect */ #define IBNAL_CONN_ESTABLISHED 3 /* connection established */ -#define IBNAL_CONN_SEND_DREQ 4 /* to send disconnect req */ -#define IBNAL_CONN_DREQ 5 /* sent disconnect req */ -#define IBNAL_CONN_DREP 6 /* sent disconnect rep */ -#define IBNAL_CONN_DISCONNECTED 7 /* no more QP or CM traffic */ +#define IBNAL_CONN_DISCONNECTING 4 /* to send disconnect req */ +#define IBNAL_CONN_DISCONNECTED 5 /* no more QP or CM traffic */ -#define KIB_ASSERT_CONN_STATE(conn, state) do { \ - LASSERTF((conn)->ibc_state == state, "%d\n", conn->ibc_state); \ -} while (0) - -#define KIB_ASSERT_CONN_STATE_RANGE(conn, low, high) do { \ - LASSERTF(low <= high, "%d %d\n", low, high); \ - LASSERTF((conn)->ibc_state >= low && (conn)->ibc_state <= high, \ - "%d\n", conn->ibc_state); \ -} while (0) +/* types of connection */ +#define IBNAL_CONN_ACTIVE 0 /* active connect */ +#define IBNAL_CONN_PASSIVE 1 /* passive connect */ +#define IBNAL_CONN_WAITING 2 /* waiting for connect */ typedef struct kib_peer { struct list_head ibp_list; /* stash on global peer list */ struct list_head ibp_connd_list; /* schedule on kib_connd_peers */ - ptl_nid_t ibp_nid; /* who's on the other end(s) */ + lnet_nid_t ibp_nid; /* who's on the other end(s) */ atomic_t ibp_refcount; /* # users */ int ibp_persistence; /* "known" peer refs */ + int ibp_version; /* protocol version */ struct list_head ibp_conns; /* all active connections */ struct list_head ibp_tx_queue; /* msgs waiting for a conn */ - int ibp_connecting; /* connecting+accepting */ + int ibp_connecting; /* active connects in progress */ + int ibp_accepting; /* passive connects in progress */ + int ibp_passivewait; /* waiting for peer to connect */ + unsigned long ibp_passivewait_deadline; /* when passive wait must complete */ unsigned long ibp_reconnect_time; /* when reconnect may be attempted */ unsigned long ibp_reconnect_interval; /* exponential backoff */ + int ibp_error; /* errno on closing this peer */ + cfs_time_t ibp_last_alive; /* when (in jiffies) I was last alive */ } kib_peer_t; -extern lib_nal_t kibnal_lib; extern kib_data_t kibnal_data; extern kib_tunables_t kibnal_tunables; /******************************************************************************/ -/* Infinicon IBT interface wrappers */ -#define IIBT_IF (kibnal_data.kib_interfaces.ver2) - -static inline FSTATUS -iibt_get_hca_guids(uint32 *hca_count, EUI64 *hca_guid_list) -{ - return IIBT_IF.GetCaGuids(hca_count, hca_guid_list); -} - -static inline FSTATUS -iibt_open_hca(EUI64 hca_guid, - IB_COMPLETION_CALLBACK completion_callback, - IB_ASYNC_EVENT_CALLBACK async_event_callback, - void *arg, - IB_HANDLE *handle) -{ - return IIBT_IF.Vpi.OpenCA(hca_guid, completion_callback, - async_event_callback, arg, handle); -} - -static inline FSTATUS -iibt_query_hca(IB_HANDLE hca_handle, IB_CA_ATTRIBUTES *attrs, void **argp) -{ - return IIBT_IF.Vpi.QueryCA(hca_handle, attrs, argp); -} - -static inline FSTATUS -iibt_close_hca(IB_HANDLE hca_handle) -{ - return IIBT_IF.Vpi.CloseCA(hca_handle); -} - -static inline FSTATUS -iibt_pd_allocate(IB_HANDLE hca_handle, __u32 max_avs, IB_HANDLE *pd_handle) -{ - return IIBT_IF.Vpi.AllocatePD(hca_handle, 
max_avs, pd_handle); -} - -static inline FSTATUS -iibt_pd_free(IB_HANDLE pd_handle) -{ - return IIBT_IF.Vpi.FreePD(pd_handle); -} - -static inline FSTATUS -iibt_register_physical_memory(IB_HANDLE hca_handle, - IB_VIRT_ADDR requested_io_va, - void *phys_buffers, uint64 nphys_buffers, - uint32 io_va_offset, IB_HANDLE pd_handle, - IB_ACCESS_CONTROL access, - IB_HANDLE *mem_handle, - IB_VIRT_ADDR *actual_io_va, - IB_L_KEY *lkey, IB_R_KEY *rkey) -{ - return IIBT_IF.Vpi.RegisterPhysMemRegion(hca_handle, requested_io_va, - phys_buffers, nphys_buffers, - io_va_offset, pd_handle, - access, - mem_handle, actual_io_va, - lkey, rkey); -} - -static inline FSTATUS -iibt_register_contig_physical_memory(IB_HANDLE hca_handle, - IB_VIRT_ADDR requested_io_va, - IB_MR_PHYS_BUFFER *phys_buffers, - uint64 nphys_buffers, - uint32 io_va_offset, IB_HANDLE pd_handle, - IB_ACCESS_CONTROL access, - IB_HANDLE *mem_handle, - IB_VIRT_ADDR *actual_io_va, - IB_L_KEY *lkey, IB_R_KEY *rkey) -{ - return IIBT_IF.Vpi.RegisterContigPhysMemRegion(hca_handle, - requested_io_va, - phys_buffers, - nphys_buffers, - io_va_offset, pd_handle, - access, - mem_handle, actual_io_va, - lkey, rkey); -} - -static inline FSTATUS -iibt_register_memory(IB_HANDLE hca_handle, - void *virt_addr, unsigned int length, - IB_HANDLE pd_handle, - IB_ACCESS_CONTROL access, - IB_HANDLE *mem_handle, - IB_L_KEY *lkey, IB_R_KEY *rkey) -{ - return IIBT_IF.Vpi.RegisterMemRegion(hca_handle, - virt_addr, length, - pd_handle, - access, - mem_handle, - lkey, rkey); -} - -static inline FSTATUS -iibt_deregister_memory(IB_HANDLE mem_handle) -{ - return IIBT_IF.Vpi.DeregisterMemRegion(mem_handle); -} - -static inline FSTATUS -iibt_cq_create(IB_HANDLE hca_handle, uint32 requested_size, - void *arg, IB_HANDLE *cq_handle, uint32 *actual_size) -{ - return IIBT_IF.Vpi.CreateCQ(hca_handle, requested_size, - arg, cq_handle, actual_size); -} - -static inline FSTATUS -iibt_cq_poll(IB_HANDLE cq_handle, IB_WORK_COMPLETION *wc) -{ - return IIBT_IF.Vpi.PollCQ(cq_handle, wc); -} - -static inline FSTATUS -iibt_cq_rearm(IB_HANDLE cq_handle, IB_CQ_EVENT_SELECT select) -{ - return IIBT_IF.Vpi.RearmCQ(cq_handle, select); -} - -static inline FSTATUS -iibt_cq_destroy(IB_HANDLE cq_handle) -{ - return IIBT_IF.Vpi.DestroyCQ(cq_handle); -} - -static inline FSTATUS -iibt_qp_create(IB_HANDLE hca_handle, IB_QP_ATTRIBUTES_CREATE *create_attr, - void *arg, IB_HANDLE *cq_handle, - IB_QP_ATTRIBUTES_QUERY *query_attr) -{ - return IIBT_IF.Vpi.CreateQP(hca_handle, create_attr, arg, cq_handle, - query_attr); -} - -static inline FSTATUS -iibt_qp_query(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_QUERY *query_attr, - void **arg_ptr) -{ - return IIBT_IF.Vpi.QueryQP(qp_handle, query_attr, arg_ptr); -} - -static inline FSTATUS -iibt_qp_modify(IB_HANDLE qp_handle, IB_QP_ATTRIBUTES_MODIFY *modify_attr, - IB_QP_ATTRIBUTES_QUERY *query_attr) -{ - return IIBT_IF.Vpi.ModifyQP(qp_handle, modify_attr, query_attr); -} - -static inline FSTATUS -iibt_qp_destroy(IB_HANDLE qp_handle) -{ - return IIBT_IF.Vpi.DestroyQP(qp_handle); -} - -static inline FSTATUS -iibt_postrecv(IB_HANDLE qp_handle, IB_WORK_REQ *work_req) -{ - return IIBT_IF.Vpi.PostRecv(qp_handle, work_req); -} - -static inline FSTATUS -iibt_postsend(IB_HANDLE qp_handle, IB_WORK_REQ *work_req) -{ - return IIBT_IF.Vpi.PostSend(qp_handle, work_req); -} - -static inline FSTATUS -iibt_sd_register(IB_HANDLE *sd_handle, CLIENT_CONTROL_PARAMETERS *p) -{ - return IIBT_IF.Sdi.Register(sd_handle, p); -} - -static inline FSTATUS -iibt_sd_deregister(IB_HANDLE sd_handle) -{ - 
return IIBT_IF.Sdi.Deregister(sd_handle); -} - -static inline FSTATUS -iibt_sd_port_fabric_operation(IB_HANDLE sd_handle, EUI64 port_guid, - FABRIC_OPERATION_DATA *fod, - PFABRIC_OPERATION_CALLBACK callback, - COMMAND_CONTROL_PARAMETERS *p, void *arg) -{ - return IIBT_IF.Sdi.PortFabricOperation(sd_handle, port_guid, - fod, callback, p, arg); -} - -static inline FSTATUS -iibt_sd_query_port_fabric_information(IB_HANDLE sd_handle, EUI64 port_guid, - QUERY *qry, - PQUERY_CALLBACK callback, - COMMAND_CONTROL_PARAMETERS *p, void *arg) -{ - return IIBT_IF.Sdi.QueryPortFabricInformation(sd_handle, port_guid, - qry, callback, p, arg); -} - -static inline IB_HANDLE -iibt_cm_create_cep(CM_CEP_TYPE type) -{ - return IIBT_IF.Cmi.CmCreateCEP(type); -} - -static inline FSTATUS -iibt_cm_modify_cep(IB_HANDLE cep, uint32 attr, char* value, uint32 len, - uint32 offset) -{ - return IIBT_IF.Cmi.CmModifyCEP(cep, attr, value, len, offset); -} - -static inline FSTATUS -iibt_cm_destroy_cep(IB_HANDLE cep_handle) -{ - return IIBT_IF.Cmi.CmDestroyCEP(cep_handle); -} - -static inline FSTATUS -iibt_cm_listen(IB_HANDLE cep, CM_LISTEN_INFO *info, - PFN_CM_CALLBACK callback, void *arg) -{ - return IIBT_IF.Cmi.CmListen(cep, info, callback, arg); -} - -static inline FSTATUS -iibt_cm_cancel(IB_HANDLE cep) -{ - return IIBT_IF.Cmi.CmCancel(cep); -} - -static inline FSTATUS -iibt_cm_accept(IB_HANDLE cep, - CM_CONN_INFO *send_info, CM_CONN_INFO *recv_info, - PFN_CM_CALLBACK callback, void *arg, - IB_HANDLE *new_cep) -{ - return IIBT_IF.Cmi.CmAccept(cep, - send_info, recv_info, - callback, arg, new_cep); -} - -static inline FSTATUS -iibt_cm_reject(IB_HANDLE cep, CM_REJECT_INFO *rej) -{ - return IIBT_IF.Cmi.CmReject(cep, rej); -} - -static inline FSTATUS -iibt_cm_disconnect(IB_HANDLE cep, CM_DREQUEST_INFO *req, - CM_DREPLY_INFO *reply) -{ - return IIBT_IF.Cmi.CmDisconnect(cep, req, reply); -} - -static inline FSTATUS -iibt_cm_connect (IB_HANDLE cep, CM_REQUEST_INFO *req, - PFN_CM_CALLBACK callback, void *arg) -{ - return IIBT_IF.Cmi.CmConnect (cep, req, callback, arg); -} - -static inline int wrq_signals_completion(IB_WORK_REQ *wrq) -{ - return wrq->Req.SendRC.Options.s.SignaledCompletion == 1; -} - - -/******************************************************************************/ /* these are purposely avoiding using local vars so they don't increase * stack consumption. 
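Because macros expand in place, they add nothing to a caller's stack frame beyond what they themselves declare, which is the point of the comment here. The do { ... } while (0) wrapper is the standard trick that makes a multi-statement macro behave as a single statement. A generic sketch, where obj and refs are placeholders rather than LNET names:

#define OBJ_ADDREF(obj)                                 \
do {                                                    \
        LASSERT(atomic_read(&(obj)->refs) > 0);         \
        atomic_inc(&(obj)->refs);                       \
} while (0)

/* Safe where a bare { } block would not be:
 *         if (keep)
 *                 OBJ_ADDREF(o);   <- expands to one statement
 *         else
 *                 drop(o);
 */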
*/ -#define kib_peer_addref(peer) do { \ - LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \ - atomic_read(&peer->ibp_refcount)); \ - CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n", \ - peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \ - atomic_inc(&peer->ibp_refcount); \ +#define kibnal_conn_addref(conn) \ +do { \ + CDEBUG(D_NET, "conn[%p] (%d)++\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + LASSERT(atomic_read(&(conn)->ibc_refcount) > 0); \ + atomic_inc(&(conn)->ibc_refcount); \ +} while (0) + +#define kibnal_conn_decref(conn) \ +do { \ + unsigned long flags; \ + \ + CDEBUG(D_NET, "conn[%p] (%d)--\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + LASSERT(atomic_read(&(conn)->ibc_refcount) > 0); \ + if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \ + spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); \ + list_add_tail(&(conn)->ibc_list, \ + &kibnal_data.kib_connd_zombies); \ + wake_up(&kibnal_data.kib_connd_waitq); \ + spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); \ + } \ } while (0) -#define kib_peer_decref(peer) do { \ - LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \ - atomic_read(&peer->ibp_refcount)); \ - CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n", \ - peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \ - if (atomic_dec_and_test (&peer->ibp_refcount)) { \ - CDEBUG (D_NET, "destroying peer "LPX64" %p\n", \ - peer->ibp_nid, peer); \ - kibnal_destroy_peer (peer); \ - } \ +#define kibnal_peer_addref(peer) \ +do { \ + CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n", \ + (peer), libcfs_nid2str((peer)->ibp_nid), \ + atomic_read (&(peer)->ibp_refcount)); \ + LASSERT(atomic_read(&(peer)->ibp_refcount) > 0); \ + atomic_inc(&(peer)->ibp_refcount); \ +} while (0) + +#define kibnal_peer_decref(peer) \ +do { \ + CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n", \ + (peer), libcfs_nid2str((peer)->ibp_nid), \ + atomic_read (&(peer)->ibp_refcount)); \ + LASSERT(atomic_read(&(peer)->ibp_refcount) > 0); \ + if (atomic_dec_and_test(&(peer)->ibp_refcount)) \ + kibnal_destroy_peer(peer); \ } while (0) /******************************************************************************/ static inline struct list_head * -kibnal_nid2peerlist (ptl_nid_t nid) +kibnal_nid2peerlist (lnet_nid_t nid) { unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size; @@ -750,17 +516,79 @@ kibnal_peer_active(kib_peer_t *peer) return (!list_empty(&peer->ibp_list)); } +static inline int +kibnal_peer_connecting(kib_peer_t *peer) +{ + /* Am I expecting a connection to materialise? 
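Note what kibnal_conn_decref() above does with the final reference: rather than destroying the connection wherever the count happens to hit zero (possibly in a completion callback), it parks the conn on kib_connd_zombies and wakes the connd, which performs the teardown in process context. A minimal kernel-style sketch of that deferred-free idiom, with placeholder types and with initialization and the reaper thread omitted:

static spinlock_t        zombie_lock;
static struct list_head  zombie_list;    /* reaped by a daemon thread */
static wait_queue_head_t zombie_waitq;

struct zobj {
        atomic_t         refs;
        struct list_head list;
};

static void zobj_decref(struct zobj *z)
{
        unsigned long flags;

        if (!atomic_dec_and_test(&z->refs))
                return;
        /* last ref: defer destruction to the daemon */
        spin_lock_irqsave(&zombie_lock, flags);
        list_add_tail(&z->list, &zombie_list);
        spin_unlock_irqrestore(&zombie_lock, flags);
        wake_up(&zombie_waitq);
}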
*/ + return (peer->ibp_connecting != 0 || + peer->ibp_accepting != 0 || + peer->ibp_passivewait); +} + static inline void kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) { - /* CAVEAT EMPTOR: tx takes caller's ref on conn */ + struct list_head *q; + + LASSERT (tx->tx_nwrq > 0); /* work items set up */ + LASSERT (!tx->tx_queued); /* not queued for sending already */ + + tx->tx_queued = 1; + tx->tx_deadline = jiffies + (*kibnal_tunables.kib_timeout * HZ); + + if (tx->tx_conn == NULL) { + kibnal_conn_addref(conn); + tx->tx_conn = conn; + LASSERT (tx->tx_msg->ibm_type != IBNAL_MSG_PUT_DONE); + } else { + LASSERT (tx->tx_conn == conn); + LASSERT (tx->tx_msg->ibm_type == IBNAL_MSG_PUT_DONE); + } - LASSERT (tx->tx_nsp > 0); /* work items set up */ - LASSERT (tx->tx_conn == NULL); /* only set here */ + if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) { + /* All messages have simple credit control */ + q = &conn->ibc_tx_queue; + } else { + LASSERT (conn->ibc_version == IBNAL_MSG_VERSION); + + switch (tx->tx_msg->ibm_type) { + case IBNAL_MSG_PUT_REQ: + case IBNAL_MSG_GET_REQ: + /* RDMA request: reserve a buffer for the RDMA reply + * before sending */ + q = &conn->ibc_tx_queue_rsrvd; + break; + + case IBNAL_MSG_PUT_NAK: + case IBNAL_MSG_PUT_ACK: + case IBNAL_MSG_PUT_DONE: + case IBNAL_MSG_GET_DONE: + /* RDMA reply/completion: no credits; peer has reserved + * a reply buffer */ + q = &conn->ibc_tx_queue_nocred; + break; + + case IBNAL_MSG_NOOP: + case IBNAL_MSG_IMMEDIATE: + /* Otherwise: consume a credit before sending */ + q = &conn->ibc_tx_queue; + break; + + default: + LBUG(); + q = NULL; + } + } + + list_add_tail(&tx->tx_list, q); +} - tx->tx_conn = conn; - tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ; - list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); +static inline int +kibnal_send_keepalive(kib_conn_t *conn) +{ + return (*kibnal_tunables.kib_keepalive > 0) && + time_after(jiffies, conn->ibc_last_send + + *kibnal_tunables.kib_keepalive*HZ); } #define KIBNAL_SERVICE_KEY_MASK (IB_SERVICE_RECORD_COMP_SERVICENAME | \ @@ -780,112 +608,130 @@ kibnal_service_nid_field(IB_SERVICE_RECORD *srv) return (__u64 *)srv->ServiceData8; } - static inline void -kibnal_set_service_keys(IB_SERVICE_RECORD *srv, ptl_nid_t nid) +kibnal_set_service_keys(IB_SERVICE_RECORD *srv, lnet_nid_t nid) { - LASSERT (strlen(IBNAL_SERVICE_NAME) < sizeof(srv->ServiceName)); + char *svc_name = *kibnal_tunables.kib_service_name; + + LASSERT (strlen(svc_name) < sizeof(srv->ServiceName)); memset (srv->ServiceName, 0, sizeof(srv->ServiceName)); - strcpy (srv->ServiceName, IBNAL_SERVICE_NAME); + strcpy (srv->ServiceName, svc_name); *kibnal_service_nid_field(srv) = cpu_to_le64(nid); } -#if 0 -static inline void -kibnal_show_rdma_attr (kib_conn_t *conn) -{ - struct ib_qp_attribute qp_attr; - int rc; - - memset (&qp_attr, 0, sizeof(qp_attr)); - rc = ib_qp_query(conn->ibc_qp, &qp_attr); - if (rc != 0) { - CERROR ("Can't get qp attrs: %d\n", rc); - return; - } +/* CAVEAT EMPTOR: We rely on tx/rx descriptor alignment to allow us to use the + * lowest 2 bits of the work request id to stash the work item type (the op + * field is not valid when the wc completes in error). */ - CWARN ("RDMA CAPABILITY: write %s read %s\n", - (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ? - (qp_attr.enable_rdma_write ? "enabled" : "disabled") : "invalid", - (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ? - (qp_attr.enable_rdma_read ? 
"enabled" : "disabled") : "invalid"); -} -#endif +#define IBNAL_WID_TX 0 +#define IBNAL_WID_RX 1 +#define IBNAL_WID_RDMA 2 +#define IBNAL_WID_MASK 3UL -#if CONFIG_X86 static inline __u64 -kibnal_page2phys (struct page *p) +kibnal_ptr2wreqid (void *ptr, int type) { - __u64 page_number = p - mem_map; + unsigned long lptr = (unsigned long)ptr; - return (page_number << PAGE_SHIFT); + LASSERT ((lptr & IBNAL_WID_MASK) == 0); + LASSERT ((type & ~IBNAL_WID_MASK) == 0); + return (__u64)(lptr | type); } -#else -# error "no page->phys" -#endif - -/* CAVEAT EMPTOR: - * We rely on tx/rx descriptor alignment to allow us to use the lowest bit - * of the work request id as a flag to determine if the completion is for a - * transmit or a receive. It seems that that the CQ entry's 'op' field - * isn't always set correctly on completions that occur after QP teardown. */ -static inline __u64 -kibnal_ptr2wreqid (void *ptr, int isrx) +static inline void * +kibnal_wreqid2ptr (__u64 wreqid) { - unsigned long lptr = (unsigned long)ptr; + return (void *)(((unsigned long)wreqid) & ~IBNAL_WID_MASK); +} - LASSERT ((lptr & 1) == 0); - return (__u64)(lptr | (isrx ? 1 : 0)); +static inline int +kibnal_wreqid2type (__u64 wreqid) +{ + return (wreqid & IBNAL_WID_MASK); } -static inline void * -kibnal_wreqid2ptr (__u64 wreqid) +static inline void +kibnal_set_conn_state (kib_conn_t *conn, int state) { - return (void *)(((unsigned long)wreqid) & ~1UL); + CDEBUG(D_NET,"%p state %d\n", conn, state); + conn->ibc_state = state; + mb(); } +#if IBNAL_USE_FMR + static inline int -kibnal_wreqid_is_rx (__u64 wreqid) +kibnal_rd_size (kib_rdma_desc_t *rd) { - return (wreqid & 1) != 0; + return rd->rd_nob; } +#else static inline int -kibnal_whole_mem(void) +kibnal_rd_size (kib_rdma_desc_t *rd) { - return kibnal_data.kib_md.md_handle != NULL; + int i; + int size; + + for (i = size = 0; i < rd->rd_nfrag; i++) + size += rd->rd_frags[i].rf_nob; + + return size; } +#endif -extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid); -extern void kibnal_destroy_peer (kib_peer_t *peer); -extern int kibnal_del_peer (ptl_nid_t nid, int single_share); -extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid); -extern void kibnal_unlink_peer_locked (kib_peer_t *peer); -extern int kibnal_close_stale_conns_locked (kib_peer_t *peer, - __u64 incarnation); -extern kib_conn_t *kibnal_create_conn (void); -extern void kibnal_put_conn (kib_conn_t *conn); -extern void kibnal_destroy_conn (kib_conn_t *conn); +int kibnal_startup (lnet_ni_t *ni); +void kibnal_shutdown (lnet_ni_t *ni); +int kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); +int kibnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int kibnal_eager_recv (lnet_ni_t *ni, void *private, + lnet_msg_t *lntmsg, void **new_private); +int kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +void kibnal_init_msg(kib_msg_t *msg, int type, int body_nob); +void kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits, + lnet_nid_t dstnid, __u64 dststamp, __u64 seq); +void kibnal_pack_connmsg(kib_msg_t *msg, __u32 version, int nob, int type, + lnet_nid_t dstnid, __u64 dststamp); +int kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob); +IB_HANDLE kibnal_create_cep(lnet_nid_t nid); +int kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid); +void kibnal_destroy_peer (kib_peer_t *peer); +kib_peer_t *kibnal_find_peer_locked (lnet_nid_t 
nid); +int kibnal_del_peer (lnet_nid_t nid); +void kibnal_peer_alive (kib_peer_t *peer); +void kibnal_unlink_peer_locked (kib_peer_t *peer); +int kibnal_add_persistent_peer (lnet_nid_t nid); +int kibnal_close_stale_conns_locked (kib_peer_t *peer, + __u64 incarnation); +int kibnal_conn_rts(kib_conn_t *conn, + __u32 qpn, __u8 resp_res, __u8 init_depth, __u32 psn); +kib_conn_t *kibnal_create_conn (lnet_nid_t nid, int proto_version); +void kibnal_destroy_conn (kib_conn_t *conn); void kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg); - -extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access); -extern void kibnal_free_pages (kib_pages_t *p); - -extern void kibnal_check_sends (kib_conn_t *conn); -extern void kibnal_close_conn_locked (kib_conn_t *conn, int error); -extern void kibnal_destroy_conn (kib_conn_t *conn); -extern int kibnal_thread_start (int (*fn)(void *arg), void *arg); -extern int kibnal_scheduler(void *arg); -extern int kibnal_connd (void *arg); -extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob); -extern void kibnal_close_conn (kib_conn_t *conn, int why); -extern void kibnal_start_active_rdma (int type, int status, - kib_rx_t *rx, lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t nob); - -void kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev); -void kibnal_ca_callback (void *ca_arg, void *cq_arg); +int kibnal_alloc_pages (kib_pages_t **pp, int npages); +void kibnal_free_pages (kib_pages_t *p); +void kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn); +void kibnal_txlist_done (struct list_head *txlist, int status); +int kibnal_post_receives (kib_conn_t *conn); +int kibnal_init_rdma (kib_tx_t *tx, int type, int nob, + kib_rdma_desc_t *dstrd, __u64 dstcookie); +void kibnal_check_sends (kib_conn_t *conn); +void kibnal_close_conn_locked (kib_conn_t *conn, int error); +int kibnal_thread_start (int (*fn)(void *arg), void *arg); +int kibnal_scheduler(void *arg); +int kibnal_connd (void *arg); +void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob); +void kibnal_close_conn (kib_conn_t *conn, int why); +void kibnal_start_active_rdma (int type, int status, + kib_rx_t *rx, lnet_msg_t *lntmsg, + unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int nob); +void kibnal_hca_async_callback (void *hca_arg, IB_EVENT_RECORD *ev); +void kibnal_hca_callback (void *hca_arg, void *cq_arg); +int kibnal_tunables_init (void); +void kibnal_tunables_fini (void); diff --git a/lnet/klnds/iiblnd/iiblnd_cb.c b/lnet/klnds/iiblnd/iiblnd_cb.c index eb9e6fa..fb4bba0 100644 --- a/lnet/klnds/iiblnd/iiblnd_cb.c +++ b/lnet/klnds/iiblnd/iiblnd_cb.c @@ -21,658 +21,698 @@ * */ -#include "iibnal.h" +#include "iiblnd.h" -/* - * LIB functions follow - * - */ -static void -kibnal_schedule_tx_done (kib_tx_t *tx) +void +hexdump(char *string, void *ptr, int len) { - unsigned long flags; + unsigned char *c = ptr; + int i; + + return; - spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags); + if (len < 0 || len > 2048) { + printk("XXX what the hell? 
%d\n",len); + return; + } - list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq); - wake_up (&kibnal_data.kib_sched_waitq); + printk("%d bytes of '%s' from 0x%p\n", len, string, ptr); - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); + for (i = 0; i < len;) { + printk("%02x",*(c++)); + i++; + if (!(i & 15)) { + printk("\n"); + } else if (!(i&1)) { + printk(" "); + } + } + + if(len & 15) { + printk("\n"); + } } -static void +void kibnal_tx_done (kib_tx_t *tx) { - ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL; - unsigned long flags; - int i; - FSTATUS frc; - - LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */ - LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */ + lnet_msg_t *lntmsg[2]; + int rc = tx->tx_status; + int i; - switch (tx->tx_mapped) { - default: - LBUG(); + LASSERT (!in_interrupt()); + LASSERT (!tx->tx_queued); /* mustn't be queued for sending */ + LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */ + LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer response */ - case KIB_TX_UNMAPPED: - break; +#if IBNAL_USE_FMR + /* Handle unmapping if required */ +#endif + /* tx may have up to 2 lnet msgs to finalise */ + lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; + lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; + + if (tx->tx_conn != NULL) { + kibnal_conn_decref(tx->tx_conn); + tx->tx_conn = NULL; + } - case KIB_TX_MAPPED: - if (in_interrupt()) { - /* can't deregister memory in IRQ context... */ - kibnal_schedule_tx_done(tx); - return; - } - frc = iibt_deregister_memory(tx->tx_md.md_handle); - LASSERT (frc == FSUCCESS); - tx->tx_mapped = KIB_TX_UNMAPPED; - break; + tx->tx_nwrq = 0; + tx->tx_status = 0; -#if IBNAL_FMR - case KIB_TX_MAPPED_FMR: - if (in_interrupt() && tx->tx_status != 0) { - /* can't flush FMRs in IRQ context... */ - kibnal_schedule_tx_done(tx); - return; - } + spin_lock(&kibnal_data.kib_tx_lock); - rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr); - LASSERT (rc == 0); + list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); - if (tx->tx_status != 0) - ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool); - tx->tx_mapped = KIB_TX_UNMAPPED; - break; -#endif - } + spin_unlock(&kibnal_data.kib_tx_lock); + /* delay finalize until my descs have been freed */ for (i = 0; i < 2; i++) { - /* tx may have up to 2 libmsgs to finalise */ - if (tx->tx_libmsg[i] == NULL) + if (lntmsg[i] == NULL) continue; - lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc); - tx->tx_libmsg[i] = NULL; + lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc); } +} + +kib_tx_t * +kibnal_get_idle_tx (void) +{ + kib_tx_t *tx; - if (tx->tx_conn != NULL) { - kibnal_put_conn (tx->tx_conn); - tx->tx_conn = NULL; + spin_lock(&kibnal_data.kib_tx_lock); + + if (list_empty (&kibnal_data.kib_idle_txs)) { + spin_unlock(&kibnal_data.kib_tx_lock); + return NULL; } - tx->tx_nsp = 0; - tx->tx_passive_rdma = 0; - tx->tx_status = 0; + tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list); + list_del (&tx->tx_list); - spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); + /* Allocate a new completion cookie. It might not be needed, + * but we've got a lock right now and we're unlikely to + * wrap... 
*/ + tx->tx_cookie = kibnal_data.kib_next_tx_cookie++; - if (tx->tx_isnblk) { - list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); - } else { - list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); - wake_up (&kibnal_data.kib_idle_tx_waitq); - } + spin_unlock(&kibnal_data.kib_tx_lock); - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + LASSERT (tx->tx_nwrq == 0); + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_sending == 0); + LASSERT (!tx->tx_waiting); + LASSERT (tx->tx_status == 0); + LASSERT (tx->tx_conn == NULL); + LASSERT (tx->tx_lntmsg[0] == NULL); + LASSERT (tx->tx_lntmsg[1] == NULL); + + return tx; } -static kib_tx_t * -kibnal_get_idle_tx (int may_block) +int +kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit) { - unsigned long flags; - kib_tx_t *tx = NULL; - ENTRY; - - for (;;) { - spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); + kib_conn_t *conn = rx->rx_conn; + int rc = 0; + FSTATUS frc; - /* "normal" descriptor is free */ - if (!list_empty (&kibnal_data.kib_idle_txs)) { - tx = list_entry (kibnal_data.kib_idle_txs.next, - kib_tx_t, tx_list); - break; - } + LASSERT (!in_interrupt()); + /* old peers don't reserve rxs for RDMA replies */ + LASSERT (!rsrvd_credit || + conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); + + rx->rx_gl = (IB_LOCAL_DATASEGMENT) { + .Address = rx->rx_hca_msg, + .Lkey = kibnal_data.kib_whole_mem.md_lkey, + .Length = IBNAL_MSG_SIZE, + }; - if (!may_block) { - /* may dip into reserve pool */ - if (list_empty (&kibnal_data.kib_idle_nblk_txs)) { - CERROR ("reserved tx desc pool exhausted\n"); - break; - } + rx->rx_wrq = (IB_WORK_REQ2) { + .Next = NULL, + .WorkReqId = kibnal_ptr2wreqid(rx, IBNAL_WID_RX), + .MessageLen = IBNAL_MSG_SIZE, + .DSList = &rx->rx_gl, + .DSListDepth = 1, + .Operation = WROpRecv, + }; - tx = list_entry (kibnal_data.kib_idle_nblk_txs.next, - kib_tx_t, tx_list); - break; - } + LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING); + LASSERT (rx->rx_nob >= 0); /* not posted */ - /* block for idle tx */ - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n", + rx->rx_wrq.DSList->Length, + rx->rx_wrq.DSList->Lkey, + rx->rx_wrq.DSList->Address); - wait_event (kibnal_data.kib_idle_tx_waitq, - !list_empty (&kibnal_data.kib_idle_txs) || - kibnal_data.kib_shutdown); + if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) { + /* No more posts for this rx; so lose its ref */ + kibnal_conn_decref(conn); + return 0; } + + rx->rx_nob = -1; /* flag posted */ + mb(); - if (tx != NULL) { - list_del (&tx->tx_list); + frc = iba_post_recv2(conn->ibc_qp, &rx->rx_wrq, NULL); + if (frc == FSUCCESS) { + if (credit || rsrvd_credit) { + spin_lock(&conn->ibc_lock); - /* Allocate a new passive RDMA completion cookie. It might - * not be needed, but we've got a lock right now and we're - * unlikely to wrap... 
*/ - tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; + if (credit) + conn->ibc_outstanding_credits++; + if (rsrvd_credit) + conn->ibc_reserved_credits++; - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); - LASSERT (tx->tx_nsp == 0); - LASSERT (tx->tx_sending == 0); - LASSERT (tx->tx_status == 0); - LASSERT (tx->tx_conn == NULL); - LASSERT (!tx->tx_passive_rdma); - LASSERT (!tx->tx_passive_rdma_wait); - LASSERT (tx->tx_libmsg[0] == NULL); - LASSERT (tx->tx_libmsg[1] == NULL); - } + spin_unlock(&conn->ibc_lock); - spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + kibnal_check_sends(conn); + } + return 0; + } - RETURN(tx); + CERROR ("post rx -> %s failed %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); + rc = -EIO; + kibnal_close_conn(rx->rx_conn, rc); + /* No more posts for this rx; so lose its ref */ + kibnal_conn_decref(conn); + return rc; } -static int -kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) +int +kibnal_post_receives (kib_conn_t *conn) { - /* I would guess that if kibnal_get_peer (nid) == NULL, - and we're not routing, then 'nid' is very distant :) */ - if ( nal->libnal_ni.ni_pid.nid == nid ) { - *dist = 0; - } else { - *dist = 1; + int i; + int rc; + + LASSERT (conn->ibc_state == IBNAL_CONN_CONNECTING); + + for (i = 0; i < IBNAL_RX_MSGS; i++) { + /* +1 ref for rx desc. This ref remains until kibnal_post_rx + * fails (i.e. actual failure or we're disconnecting) */ + kibnal_conn_addref(conn); + rc = kibnal_post_rx (&conn->ibc_rxs[i], 0, 0); + if (rc != 0) + return rc; } return 0; } -static void -kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status) +kib_tx_t * +kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie) { - struct list_head *ttmp; - unsigned long flags; - int idle; - - spin_lock_irqsave (&conn->ibc_lock, flags); - - list_for_each (ttmp, &conn->ibc_active_txs) { - kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); + struct list_head *tmp; + + list_for_each(tmp, &conn->ibc_active_txs) { + kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); + + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_sending != 0 || tx->tx_waiting); - if (!tx->tx_passive_rdma_wait || - tx->tx_passive_rdma_cookie != cookie) + if (tx->tx_cookie != cookie) continue; - CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status); + if (tx->tx_waiting && + tx->tx_msg->ibm_type == txtype) + return tx; - tx->tx_status = status; - tx->tx_passive_rdma_wait = 0; - idle = (tx->tx_sending == 0); + CWARN("Bad completion: %swaiting, type %x (wanted %x)\n", + tx->tx_waiting ? "" : "NOT ", + tx->tx_msg->ibm_type, txtype); + } + return NULL; +} - if (idle) - list_del (&tx->tx_list); +void +kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) +{ + kib_tx_t *tx; + int idle; - spin_unlock_irqrestore (&conn->ibc_lock, flags); + spin_lock(&conn->ibc_lock); - /* I could be racing with tx callbacks. It's whoever - * _makes_ tx idle that frees it */ - if (idle) - kibnal_tx_done (tx); + tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie); + if (tx == NULL) { + spin_unlock(&conn->ibc_lock); + + CWARN("Unmatched completion type %x cookie "LPX64" from %s\n", + txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kibnal_close_conn (conn, -EPROTO); return; } - - spin_unlock_irqrestore (&conn->ibc_lock, flags); - CERROR ("Unmatched (late?) 
RDMA completion "LPX64" from "LPX64"\n", - cookie, conn->ibc_peer->ibp_nid); + if (tx->tx_status == 0) { /* success so far */ + if (status < 0) { /* failed? */ + tx->tx_status = status; + } else if (txtype == IBNAL_MSG_GET_REQ) { + lnet_set_reply_msg_len(kibnal_data.kib_ni, + tx->tx_lntmsg[1], status); + } + } + + tx->tx_waiting = 0; + + idle = !tx->tx_queued && (tx->tx_sending == 0); + if (idle) + list_del(&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (idle) + kibnal_tx_done(tx); } -static __u32 -kibnal_lkey(kib_pages_t *ibp) +void +kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie) { - if (kibnal_whole_mem()) - return kibnal_data.kib_md.md_lkey; - - return ibp->ibp_lkey; + kib_tx_t *tx = kibnal_get_idle_tx(); + + if (tx == NULL) { + CERROR("Can't get tx for completion %x for %s\n", + type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + tx->tx_msg->ibm_u.completion.ibcm_status = status; + tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; + kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t)); + + kibnal_queue_tx(tx, conn); } -static void -kibnal_post_rx (kib_rx_t *rx, int do_credits) +void +kibnal_handle_rx (kib_rx_t *rx) { + kib_msg_t *msg = rx->rx_msg; kib_conn_t *conn = rx->rx_conn; + int credits = msg->ibm_credits; + kib_tx_t *tx; int rc = 0; - unsigned long flags; - FSTATUS frc; - ENTRY; - - rx->rx_gl = (IB_LOCAL_DATASEGMENT) { - .Address = rx->rx_vaddr, - .Length = IBNAL_MSG_SIZE, - .Lkey = kibnal_lkey(conn->ibc_rx_pages), - }; + int repost = 1; + int rsrvd_credit = 0; + int rc2; - rx->rx_wrq = (IB_WORK_REQ) { - .Operation = WROpRecv, - .DSListDepth = 1, - .MessageLen = IBNAL_MSG_SIZE, - .WorkReqId = kibnal_ptr2wreqid(rx, 1), - .DSList = &rx->rx_gl, - }; + LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, - IBNAL_CONN_DREP); - LASSERT (!rx->rx_posted); - rx->rx_posted = 1; - mb(); + CDEBUG (D_NET, "Received %x[%d] from %s\n", + msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + if (credits != 0) { + /* Have I received credits that will let me send? 
*/ + spin_lock(&conn->ibc_lock); + conn->ibc_credits += credits; + spin_unlock(&conn->ibc_lock); - if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) - rc = -ECONNABORTED; - else { - frc = iibt_postrecv(conn->ibc_qp, &rx->rx_wrq); - if (frc != FSUCCESS) { - CDEBUG(D_NET, "post failed %d\n", frc); - rc = -EINVAL; - } - CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq); + kibnal_check_sends(conn); } - if (rc == 0) { - if (do_credits) { - spin_lock_irqsave(&conn->ibc_lock, flags); - conn->ibc_outstanding_credits++; - spin_unlock_irqrestore(&conn->ibc_lock, flags); + switch (msg->ibm_type) { + default: + CERROR("Bad IBNAL message type %x from %s\n", + msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + rc = -EPROTO; + break; - kibnal_check_sends(conn); - } - EXIT; - return; - } + case IBNAL_MSG_NOOP: + break; - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - CERROR ("Error posting receive -> "LPX64": %d\n", - conn->ibc_peer->ibp_nid, rc); - kibnal_close_conn (rx->rx_conn, rc); - } else { - CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n", - conn->ibc_peer->ibp_nid, rc); - } + case IBNAL_MSG_IMMEDIATE: + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr, + msg->ibm_srcnid, rx, 0); + repost = rc < 0; /* repost on error */ + break; + + case IBNAL_MSG_PUT_REQ: + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr, + msg->ibm_srcnid, rx, 1); + repost = rc < 0; /* repost on error */ + break; - /* Drop rx's ref */ - kibnal_put_conn (conn); - EXIT; -} + case IBNAL_MSG_PUT_NAK: + rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ -#if IBNAL_CKSUM -static inline __u32 kibnal_cksum (void *ptr, int nob) -{ - char *c = ptr; - __u32 sum = 0; + CWARN ("PUT_NACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; - while (nob-- > 0) - sum = ((sum << 1) | (sum >> 31)) + *c++; - - return (sum); -} -#endif + case IBNAL_MSG_PUT_ACK: + rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ -static void hexdump(char *string, void *ptr, int len) -{ - unsigned char *c = ptr; - int i; + spin_lock(&conn->ibc_lock); + tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ, + msg->ibm_u.putack.ibpam_src_cookie); + if (tx != NULL) + list_del(&tx->tx_list); + spin_unlock(&conn->ibc_lock); - return; + if (tx == NULL) { + CERROR("Unmatched PUT_ACK from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + rc = -EPROTO; + break; + } - if (len < 0 || len > 2048) { - printk("XXX what the hell? %d\n",len); - return; - } + LASSERT (tx->tx_waiting); + /* CAVEAT EMPTOR: I could be racing with tx_complete, but... + * (a) I can overwrite tx_msg since my peer has received it! + * (b) tx_waiting set tells tx_complete() it's not done. 
*/ + + tx->tx_nwrq = 0; /* overwrite PUT_REQ */ + + rc2 = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE, + kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd), + &msg->ibm_u.putack.ibpam_rd, + msg->ibm_u.putack.ibpam_dst_cookie); + if (rc2 < 0) + CERROR("Can't setup rdma for PUT to %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2); + + spin_lock(&conn->ibc_lock); + if (tx->tx_status == 0 && rc2 < 0) + tx->tx_status = rc2; + tx->tx_waiting = 0; /* clear waiting and queue atomically */ + kibnal_queue_tx_locked(tx, conn); + spin_unlock(&conn->ibc_lock); + break; + + case IBNAL_MSG_PUT_DONE: + /* This buffer was pre-reserved by not returning the credit + * when the PUT_REQ's buffer was reposted, so I just return it + * now */ + kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; - printk("%d bytes of '%s' from 0x%p\n", len, string, ptr); + case IBNAL_MSG_GET_REQ: + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr, + msg->ibm_srcnid, rx, 1); + repost = rc < 0; /* repost on error */ + break; - for (i = 0; i < len;) { - printk("%02x",*(c++)); - i++; - if (!(i & 15)) { - printk("\n"); - } else if (!(i&1)) { - printk(" "); - } + case IBNAL_MSG_GET_DONE: + rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ + + kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; } - if(len & 15) { - printk("\n"); + if (rc < 0) /* protocol error */ + kibnal_close_conn(conn, rc); + + if (repost) { + if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) + rsrvd_credit = 0; /* peer isn't pre-reserving */ + + kibnal_post_rx(rx, !rsrvd_credit, rsrvd_credit); } } -static void -kibnal_rx_callback (IB_WORK_COMPLETION *wc) +void +kibnal_rx_complete (IB_WORK_COMPLETION *wc, __u64 rxseq) { kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->WorkReqId); + int nob = wc->Length; kib_msg_t *msg = rx->rx_msg; kib_conn_t *conn = rx->rx_conn; - int nob = wc->Length; - const int base_nob = offsetof(kib_msg_t, ibm_u); - int credits; - int flipped; unsigned long flags; - __u32 i; -#if IBNAL_CKSUM - __u32 msg_cksum; - __u32 computed_cksum; -#endif - - /* we set the QP to erroring after we've finished disconnecting, - * maybe we should do so sooner. */ - KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, - IBNAL_CONN_DISCONNECTED); + int rc; + int err = -EIO; - CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); - LASSERT (rx->rx_posted); - rx->rx_posted = 0; + LASSERT (rx->rx_nob < 0); /* was posted */ + rx->rx_nob = 0; /* isn't now */ mb(); /* receives complete with error in any case after we've started * disconnecting */ if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) - goto failed; + goto ignore; if (wc->Status != WRStatusSuccess) { - CERROR("Rx from "LPX64" failed: %d\n", - conn->ibc_peer->ibp_nid, wc->Status); + CERROR("Rx from %s failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), wc->Status); goto failed; } - if (nob < base_nob) { - CERROR ("Short rx from "LPX64": %d < expected %d\n", - conn->ibc_peer->ibp_nid, nob, base_nob); + rc = kibnal_unpack_msg(msg, conn->ibc_version, nob); + if (rc != 0) { + CERROR ("Error %d unpacking rx from %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); goto failed; } - hexdump("rx", rx->rx_msg, sizeof(kib_msg_t)); - - /* Receiver does any byte flipping if necessary... 
*/ - - if (msg->ibm_magic == IBNAL_MSG_MAGIC) { - flipped = 0; - } else { - if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { - CERROR ("Unrecognised magic: %08x from "LPX64"\n", - msg->ibm_magic, conn->ibc_peer->ibp_nid); - goto failed; - } - flipped = 1; - __swab16s (&msg->ibm_version); - LASSERT (sizeof(msg->ibm_type) == 1); - LASSERT (sizeof(msg->ibm_credits) == 1); - } + rx->rx_nob = nob; /* Now I know nob > 0 */ + mb(); - if (msg->ibm_version != IBNAL_MSG_VERSION) { - CERROR ("Incompatible msg version %d (%d expected)\n", - msg->ibm_version, IBNAL_MSG_VERSION); + if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || + msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid || + msg->ibm_srcstamp != conn->ibc_incarnation || + msg->ibm_dststamp != kibnal_data.kib_incarnation) { + CERROR ("Stale rx from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + err = -ESTALE; goto failed; } -#if IBNAL_CKSUM - if (nob != msg->ibm_nob) { - CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob); + if (msg->ibm_seq != rxseq) { + CERROR ("Out-of-sequence rx from %s" + ": got "LPD64" but expected "LPD64"\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + msg->ibm_seq, rxseq); goto failed; } - msg_cksum = le32_to_cpu(msg->ibm_cksum); - msg->ibm_cksum = 0; - computed_cksum = kibnal_cksum (msg, nob); - - if (msg_cksum != computed_cksum) { - CERROR ("Checksum failure %d: (%d expected)\n", - computed_cksum, msg_cksum); -// goto failed; - } - CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob); -#endif - - /* Have I received credits that will let me send? */ - credits = msg->ibm_credits; - if (credits != 0) { - spin_lock_irqsave(&conn->ibc_lock, flags); - conn->ibc_credits += credits; - spin_unlock_irqrestore(&conn->ibc_lock, flags); - - kibnal_check_sends(conn); - } - - switch (msg->ibm_type) { - case IBNAL_MSG_NOOP: - kibnal_post_rx (rx, 1); - return; - - case IBNAL_MSG_IMMEDIATE: - if (nob < base_nob + sizeof (kib_immediate_msg_t)) { - CERROR ("Short IMMEDIATE from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, nob); - goto failed; - } - break; - - case IBNAL_MSG_PUT_RDMA: - case IBNAL_MSG_GET_RDMA: - if (nob < base_nob + sizeof (kib_rdma_msg_t)) { - CERROR ("Short RDMA msg from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, nob); - goto failed; - } - if (flipped) - __swab32(msg->ibm_u.rdma.ibrm_num_descs); - - CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n", - msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie); - - if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) || - (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) > - min(nob, IBNAL_MSG_SIZE))) { - CERROR ("num_descs %d too large\n", - msg->ibm_u.rdma.ibrm_num_descs); - goto failed; - } - - if (flipped) { - __swab32(msg->ibm_u.rdma.rd_key); - } - - for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) { - kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i]; + /* set time last known alive */ + kibnal_peer_alive(conn->ibc_peer); - if (flipped) { - __swab32(desc->rd_nob); - __swab64(desc->rd_addr); - } + /* racing with connection establishment/teardown! 
*/ - CDEBUG(D_NET, " key %x, " "addr "LPX64", nob %u\n", - msg->ibm_u.rdma.rd_key, desc->rd_addr, desc->rd_nob); - } - break; - - case IBNAL_MSG_PUT_DONE: - case IBNAL_MSG_GET_DONE: - if (nob < base_nob + sizeof (kib_completion_msg_t)) { - CERROR ("Short COMPLETION msg from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, nob); - goto failed; + if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) { + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + /* must check holding global lock to eliminate race */ + if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) { + list_add_tail(&rx->rx_list, &conn->ibc_early_rxs); + write_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); + return; } - if (flipped) - __swab32s(&msg->ibm_u.completion.ibcm_status); - - CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n", - msg->ibm_type, msg->ibm_u.completion.ibcm_cookie, - msg->ibm_u.completion.ibcm_status); - - kibnal_complete_passive_rdma (conn, - msg->ibm_u.completion.ibcm_cookie, - msg->ibm_u.completion.ibcm_status); - kibnal_post_rx (rx, 1); - return; - - default: - CERROR ("Can't parse type from "LPX64": %d\n", - conn->ibc_peer->ibp_nid, msg->ibm_type); - goto failed; + write_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); } - - /* schedule for kibnal_rx() in thread context */ - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - - list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq); - wake_up (&kibnal_data.kib_sched_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); + kibnal_handle_rx(rx); return; failed: - CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); - kibnal_close_conn(conn, -ECONNABORTED); - + kibnal_close_conn(conn, err); + ignore: /* Don't re-post rx & drop its ref on conn */ - kibnal_put_conn(conn); + kibnal_conn_decref(conn); } -void -kibnal_rx (kib_rx_t *rx) +struct page * +kibnal_kvaddr_to_page (unsigned long vaddr) { - kib_msg_t *msg = rx->rx_msg; - - /* Clear flag so I can detect if I've sent an RDMA completion */ - rx->rx_rdma = 0; + struct page *page; - switch (msg->ibm_type) { - case IBNAL_MSG_GET_RDMA: - lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); - /* If the incoming get was matched, I'll have initiated the - * RDMA and the completion message... */ - if (rx->rx_rdma) - break; - - /* Otherwise, I'll send a failed completion now to prevent - * the peer's GET blocking for the full timeout. */ - CERROR ("Completing unmatched RDMA GET from "LPX64"\n", - rx->rx_conn->ibc_peer->ibp_nid); - kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO, - rx, NULL, 0, NULL, NULL, 0, 0); - break; - - case IBNAL_MSG_PUT_RDMA: - lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); - if (rx->rx_rdma) - break; - /* This is most unusual, since even if lib_parse() didn't - * match anything, it should have asked us to read (and - * discard) the payload. The portals header must be - * inconsistent with this message type, so it's the - * sender's fault for sending garbage and she can time - * herself out... 
*/ - CERROR ("Uncompleted RMDA PUT from "LPX64"\n", - rx->rx_conn->ibc_peer->ibp_nid); - break; - - case IBNAL_MSG_IMMEDIATE: - lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx); - LASSERT (!rx->rx_rdma); - break; - - default: + if (vaddr >= VMALLOC_START && + vaddr < VMALLOC_END) { + page = vmalloc_to_page ((void *)vaddr); + LASSERT (page != NULL); + return page; + } +#if CONFIG_HIGHMEM + if (vaddr >= PKMAP_BASE && + vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) { + /* No highmem pages only used for bulk (kiov) I/O */ + CERROR("find page for address in highmem\n"); LBUG(); - break; } - - kibnal_post_rx (rx, 1); +#endif + page = virt_to_page (vaddr); + LASSERT (page != NULL); + return page; } -static struct page * -kibnal_kvaddr_to_page (unsigned long vaddr) +#if !IBNAL_USE_FMR +int +kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, + unsigned long page_offset, unsigned long len) { - struct page *page; + kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag]; - if (vaddr >= VMALLOC_START && - vaddr < VMALLOC_END) - page = vmalloc_to_page ((void *)vaddr); -#if CONFIG_HIGHMEM - else if (vaddr >= PKMAP_BASE && - vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) - page = vmalloc_to_page ((void *)vaddr); - /* in 2.4 ^ just walks the page tables */ -#endif - else - page = virt_to_page (vaddr); + if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) { + CERROR ("Too many RDMA fragments\n"); + return -EMSGSIZE; + } + + if (active) { + if (rd->rd_nfrag == 0) + rd->rd_key = kibnal_data.kib_whole_mem.md_lkey; + } else { + if (rd->rd_nfrag == 0) + rd->rd_key = kibnal_data.kib_whole_mem.md_rkey; + } - if (!VALID_PAGE (page)) - page = NULL; + frag->rf_nob = len; + frag->rf_addr = kibnal_data.kib_whole_mem.md_addr + + lnet_page2phys(page) + page_offset; - return page; + CDEBUG(D_NET,"map key %x frag [%d]["LPX64" for %d]\n", + rd->rd_key, rd->rd_nfrag, frag->rf_addr, frag->rf_nob); + + rd->rd_nfrag++; + return 0; } -static void -kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset, - unsigned long len, int active) +int +kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, int active, + unsigned int niov, struct iovec *iov, int offset, int nob) + { - kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma; - kib_rdma_desc_t *desc; + int fragnob; + int rc; + unsigned long vaddr; + struct page *page; + int page_offset; - LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n", - ibrm->ibrm_num_descs); + LASSERT (nob > 0); + LASSERT (niov > 0); + LASSERT ((rd != tx->tx_rd) == !active); - desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs]; - if (active) - ibrm->rd_key = kibnal_data.kib_md.md_lkey; - else - ibrm->rd_key = kibnal_data.kib_md.md_rkey; - desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */ - desc->rd_addr = kibnal_page2phys(page) + page_offset + - kibnal_data.kib_md.md_addr; + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT (niov > 0); + } + + rd->rd_nfrag = 0; + do { + LASSERT (niov > 0); + + vaddr = ((unsigned long)iov->iov_base) + offset; + page_offset = vaddr & (PAGE_SIZE - 1); + page = kibnal_kvaddr_to_page(vaddr); + if (page == NULL) { + CERROR ("Can't find page\n"); + return -EFAULT; + } - ibrm->ibrm_num_descs++; + fragnob = min((int)(iov->iov_len - offset), nob); + fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); + + rc = kibnal_append_rdfrag(rd, active, page, + page_offset, fragnob); + if (rc != 0) + return rc; + + if (offset + fragnob < iov->iov_len) { + offset += fragnob; + } else { + offset = 0; + iov++; + niov--; 
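+                        /* NB this iov is exhausted: the next fragment,
+                         * if any bytes remain, starts at offset 0 of
+                         * the following iov */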
+ } + nob -= fragnob; + } while (nob > 0); + + return 0; } -static int -kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active) +int +kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active, + int nkiov, lnet_kiov_t *kiov, int offset, int nob) { - struct page *page; - int page_offset, len; + int fragnob; + int rc; - while (nob > 0) { - page = kibnal_kvaddr_to_page(vaddr); - if (page == NULL) - return -EFAULT; + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); - page_offset = vaddr & (PAGE_SIZE - 1); - len = min(nob, (int)PAGE_SIZE - page_offset); - - kibnal_fill_ibrm(tx, page, page_offset, len, active); - nob -= len; - vaddr += len; + LASSERT (nob > 0); + LASSERT (nkiov > 0); + LASSERT ((rd != tx->tx_rd) == !active); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT (nkiov > 0); } + + rd->rd_nfrag = 0; + do { + LASSERT (nkiov > 0); + fragnob = min((int)(kiov->kiov_len - offset), nob); + + rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page, + kiov->kiov_offset + offset, + fragnob); + if (rc != 0) + return rc; + + offset = 0; + kiov++; + nkiov--; + nob -= fragnob; + } while (nob > 0); + return 0; } +#else +int +kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active, + int npages, unsigned long page_offset, int nob) +{ + IB_ACCESS_CONTROL access = {0,}; + FSTATUS frc; + + LASSERT ((rd != tx->tx_rd) == !active); + LASSERT (!tx->tx_md.md_active); + LASSERT (tx->tx_md.md_fmrcount > 0); + LASSERT (page_offset < PAGE_SIZE); + LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT))); + LASSERT (npages <= LNET_MAX_IOV); + + if (!active) { + // access.s.MWBindable = 1; + access.s.LocalWrite = 1; + access.s.RdmaWrite = 1; + } + + /* Map the memory described by tx->tx_pages + frc = iibt_register_physical_memory(kibnal_data.kib_hca, + IBNAL_RDMA_BASE, + tx->tx_pages, npages, + page_offset, + kibnal_data.kib_pd, + access, + &tx->tx_md.md_handle, + &tx->tx_md.md_addr, + &tx->tx_md.md_lkey, + &tx->tx_md.md_rkey); + */ + return -EINVAL; +} -static int -kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access, - int niov, struct iovec *iov, int offset, int nob, int active) +int +kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active, + unsigned int niov, struct iovec *iov, int offset, int nob) { - void *vaddr; - FSTATUS frc; + int resid; + int fragnob; + struct page *page; + int npages; + unsigned long page_offset; + unsigned long vaddr; LASSERT (nob > 0); LASSERT (niov > 0); - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); while (offset >= iov->iov_len) { offset -= iov->iov_len; @@ -686,54 +726,47 @@ kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access, return (-EMSGSIZE); } - /* our large contiguous iov could be backed by multiple physical - * pages. 
*/ - if (kibnal_whole_mem()) { - int rc; - tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0; - rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base + - offset, nob, active); - if (rc != 0) { - CERROR ("Can't map iov: %d\n", rc); - return rc; + vaddr = ((unsigned long)iov->iov_base) + offset; + + page_offset = vaddr & (PAGE_SIZE - 1); + resid = nob; + npages = 0; + + do { + LASSERT (npages < LNET_MAX_IOV); + + page = kibnal_kvaddr_to_page(vaddr); + if (page == NULL) { + CERROR("Can't find page for %lu\n", vaddr); + return -EFAULT; } - return 0; - } - vaddr = (void *)(((unsigned long)iov->iov_base) + offset); - tx->tx_md.md_addr = (__u64)((unsigned long)vaddr); + tx->tx_pages[npages++] = lnet_page2phys(page); - frc = iibt_register_memory(kibnal_data.kib_hca, vaddr, nob, - kibnal_data.kib_pd, access, - &tx->tx_md.md_handle, &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); - if (frc != 0) { - CERROR ("Can't map vaddr %p: %d\n", vaddr, frc); - return -EINVAL; - } + fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1)); + vaddr += fragnob; + resid -= fragnob; - tx->tx_mapped = KIB_TX_MAPPED; - return (0); + } while (resid > 0); + + return kibnal_map_tx(tx, rd, active, npages, page_offset, nob); } -static int -kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access, - int nkiov, ptl_kiov_t *kiov, - int offset, int nob, int active) +int +kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active, + int nkiov, lnet_kiov_t *kiov, int offset, int nob) { - __u64 *phys = NULL; - int page_offset; - int nphys; - int resid; - int phys_size = 0; - FSTATUS frc; - int i, rc = 0; - + int resid; + int npages; + unsigned long page_offset; + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); LASSERT (nob > 0); LASSERT (nkiov > 0); - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + LASSERT (nkiov <= LNET_MAX_IOV); + LASSERT (!tx->tx_md.md_active); + LASSERT ((rd != tx->tx_rd) == !active); while (offset >= kiov->kiov_len) { offset -= kiov->kiov_len; @@ -743,122 +776,36 @@ kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access, } page_offset = kiov->kiov_offset + offset; - nphys = 1; - - if (!kibnal_whole_mem()) { - phys_size = nkiov * sizeof (*phys); - PORTAL_ALLOC(phys, phys_size); - if (phys == NULL) { - CERROR ("Can't allocate tmp phys\n"); - return (-ENOMEM); - } - - phys[0] = kibnal_page2phys(kiov->kiov_page); - } else { - tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0; - kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset, - kiov->kiov_len, active); - } - - resid = nob - (kiov->kiov_len - offset); + + resid = offset + nob; + npages = 0; - while (resid > 0) { - kiov++; - nkiov--; + do { + LASSERT (npages < LNET_MAX_IOV); LASSERT (nkiov > 0); - if (kiov->kiov_offset != 0 || - ((resid > PAGE_SIZE) && - kiov->kiov_len < PAGE_SIZE)) { + if ((npages > 0 && kiov->kiov_offset != 0) || + (resid > kiov->kiov_len && + (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) { /* Can't have gaps */ CERROR ("Can't make payload contiguous in I/O VM:" - "page %d, offset %d, len %d \n", nphys, - kiov->kiov_offset, kiov->kiov_len); - - for (i = -nphys; i < nkiov; i++) - { - CERROR("kiov[%d] %p +%d for %d\n", - i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len); - } + "page %d, offset %d, len %d \n", + npages, kiov->kiov_offset, kiov->kiov_len); - rc = -EINVAL; - goto out; - } - - if (nphys == PTL_MD_MAX_IOV) { - CERROR ("payload too big (%d)\n", nphys); - rc = -EMSGSIZE; - goto out; - } - - if (!kibnal_whole_mem()) { - LASSERT (nphys * sizeof (*phys) < phys_size); - phys[nphys] = kibnal_page2phys(kiov->kiov_page); 
- } else { - if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) { - CERROR ("payload too big (%d)\n", nphys); - rc = -EMSGSIZE; - goto out; - } - kibnal_fill_ibrm(tx, kiov->kiov_page, - kiov->kiov_offset, kiov->kiov_len, - active); + return -EINVAL; } - nphys ++; - resid -= PAGE_SIZE; - } - - if (kibnal_whole_mem()) - goto out; - -#if 0 - CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset); - for (i = 0; i < nphys; i++) - CWARN (" [%d] "LPX64"\n", i, phys[i]); -#endif - -#if IBNAL_FMR -#error "iibnal hasn't learned about FMR yet" - rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool, - phys, nphys, - &tx->tx_md.md_addr, - page_offset, - &tx->tx_md.md_handle.fmr, - &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); -#else - frc = iibt_register_physical_memory(kibnal_data.kib_hca, - IBNAL_RDMA_BASE, - phys, nphys, - 0, /* offset */ - kibnal_data.kib_pd, - access, - &tx->tx_md.md_handle, - &tx->tx_md.md_addr, - &tx->tx_md.md_lkey, - &tx->tx_md.md_rkey); -#endif - if (frc == FSUCCESS) { - CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n", - nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey); -#if IBNAL_FMR - tx->tx_mapped = KIB_TX_MAPPED_FMR; -#else - tx->tx_mapped = KIB_TX_MAPPED; -#endif - } else { - CERROR ("Can't map phys: %d\n", frc); - rc = -EFAULT; - } + tx->tx_pages[npages++] = lnet_page2phys(kiov->kiov_page); + resid -= kiov->kiov_len; + kiov++; + nkiov--; + } while (resid > 0); - out: - if (phys != NULL) - PORTAL_FREE(phys, phys_size); - return (rc); + return kibnal_map_tx(tx, rd, active, npages, page_offset, nob); } +#endif -static kib_conn_t * +kib_conn_t * kibnal_find_conn_locked (kib_peer_t *peer) { struct list_head *tmp; @@ -874,134 +821,173 @@ kibnal_find_conn_locked (kib_peer_t *peer) void kibnal_check_sends (kib_conn_t *conn) { - unsigned long flags; kib_tx_t *tx; + FSTATUS frc; int rc; - int i; + int consume_cred; int done; - int nwork; - ENTRY; - spin_lock_irqsave (&conn->ibc_lock, flags); + LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); + + spin_lock(&conn->ibc_lock); - LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE); + LASSERT (conn->ibc_nsends_posted <= + *kibnal_tunables.kib_concurrent_sends); + LASSERT (conn->ibc_reserved_credits >= 0); + + while (conn->ibc_reserved_credits > 0 && + !list_empty(&conn->ibc_tx_queue_rsrvd)) { + LASSERT (conn->ibc_version != + IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); + tx = list_entry(conn->ibc_tx_queue_rsrvd.next, + kib_tx_t, tx_list); + list_del(&tx->tx_list); + list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); + conn->ibc_reserved_credits--; + } if (list_empty(&conn->ibc_tx_queue) && - conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) { - spin_unlock_irqrestore(&conn->ibc_lock, flags); + list_empty(&conn->ibc_tx_queue_nocred) && + (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER || + kibnal_send_keepalive(conn))) { + spin_unlock(&conn->ibc_lock); - tx = kibnal_get_idle_tx(0); /* don't block */ + tx = kibnal_get_idle_tx(); if (tx != NULL) kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); - spin_lock_irqsave(&conn->ibc_lock, flags); + spin_lock(&conn->ibc_lock); - if (tx != NULL) { - atomic_inc(&conn->ibc_refcount); + if (tx != NULL) kibnal_queue_tx_locked(tx, conn); - } } - while (!list_empty (&conn->ibc_tx_queue)) { - tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list); + for (;;) { + if (!list_empty(&conn->ibc_tx_queue_nocred)) { + LASSERT (conn->ibc_version != + IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); + tx = list_entry (conn->ibc_tx_queue_nocred.next, + kib_tx_t, 
tx_list); + consume_cred = 0; + } else if (!list_empty (&conn->ibc_tx_queue)) { + tx = list_entry (conn->ibc_tx_queue.next, + kib_tx_t, tx_list); + consume_cred = 1; + } else { + /* nothing waiting */ + break; + } + LASSERT (tx->tx_queued); /* We rely on this for QP sizing */ - LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG); + LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS); LASSERT (conn->ibc_outstanding_credits >= 0); LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE); LASSERT (conn->ibc_credits >= 0); LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE); - /* Not on ibc_rdma_queue */ - LASSERT (!tx->tx_passive_rdma_wait); - - if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) - GOTO(out, 0); + if (conn->ibc_nsends_posted == + *kibnal_tunables.kib_concurrent_sends) { + /* We've got some tx completions outstanding... */ + CDEBUG(D_NET, "%s: posted enough\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + break; + } - if (conn->ibc_credits == 0) /* no credits */ - GOTO(out, 1); + if (consume_cred) { + if (conn->ibc_credits == 0) { /* no credits */ + CDEBUG(D_NET, "%s: no credits\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + break; + } + + if (conn->ibc_credits == 1 && /* last credit reserved for */ + conn->ibc_outstanding_credits == 0) { /* giving back credits */ + CDEBUG(D_NET, "%s: not using last credit\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + break; + } + } - if (conn->ibc_credits == 1 && /* last credit reserved for */ - conn->ibc_outstanding_credits == 0) /* giving back credits */ - GOTO(out, 2); - list_del (&tx->tx_list); + tx->tx_queued = 0; + + /* NB don't drop ibc_lock before bumping tx_sending */ if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && (!list_empty(&conn->ibc_tx_queue) || - conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) { + !list_empty(&conn->ibc_tx_queue_nocred) || + (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER && + !kibnal_send_keepalive(conn)))) { /* redundant NOOP */ - spin_unlock_irqrestore(&conn->ibc_lock, flags); + spin_unlock(&conn->ibc_lock); kibnal_tx_done(tx); - spin_lock_irqsave(&conn->ibc_lock, flags); + spin_lock(&conn->ibc_lock); + CDEBUG(D_NET, "%s: redundant noop\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); continue; } - tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits; - conn->ibc_outstanding_credits = 0; + kibnal_pack_msg(tx->tx_msg, conn->ibc_version, + conn->ibc_outstanding_credits, + conn->ibc_peer->ibp_nid, conn->ibc_incarnation, + conn->ibc_txseq); + conn->ibc_txseq++; + conn->ibc_outstanding_credits = 0; conn->ibc_nsends_posted++; - conn->ibc_credits--; + if (consume_cred) + conn->ibc_credits--; + + /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA + * PUT. If so, it was first queued here as a PUT_REQ, sent and + * stashed on ibc_active_txs, matched by an incoming PUT_ACK, + * and then re-queued here. It's (just) possible that + * tx_sending is non-zero if we've not done the tx_complete() from + * the first send; hence the ++ rather than = below. 
*/ + tx->tx_sending++; - /* we only get a tx completion for the final rdma op */ - tx->tx_sending = min(tx->tx_nsp, 2); - tx->tx_passive_rdma_wait = tx->tx_passive_rdma; list_add (&tx->tx_list, &conn->ibc_active_txs); -#if IBNAL_CKSUM - tx->tx_msg->ibm_cksum = 0; - tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob); - CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob); -#endif - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - /* NB the gap between removing tx from the queue and sending it - * allows message re-ordering to occur */ - - LASSERT (tx->tx_nsp > 0); - - rc = -ECONNABORTED; - nwork = 0; - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - tx->tx_status = 0; - /* Driver only accepts 1 item at a time */ - for (i = 0; i < tx->tx_nsp; i++) { - hexdump("tx", tx->tx_msg, sizeof(kib_msg_t)); - rc = iibt_postsend(conn->ibc_qp, - &tx->tx_wrq[i]); - if (rc != 0) - break; - if (wrq_signals_completion(&tx->tx_wrq[i])) - nwork++; - CDEBUG(D_NET, "posted tx wrq %p\n", - &tx->tx_wrq[i]); - } + + LASSERT (tx->tx_nwrq > 0); + + rc = 0; + frc = FSUCCESS; + if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) { + rc = -ECONNABORTED; + } else { + frc = iba_post_send2(conn->ibc_qp, tx->tx_wrq, NULL); + if (frc != FSUCCESS) + rc = -EIO; } - spin_lock_irqsave (&conn->ibc_lock, flags); + conn->ibc_last_send = jiffies; + if (rc != 0) { /* NB credits are transferred in the actual * message, which can only be the last work item */ conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits; - conn->ibc_credits++; + if (consume_cred) + conn->ibc_credits++; conn->ibc_nsends_posted--; tx->tx_status = rc; - tx->tx_passive_rdma_wait = 0; - tx->tx_sending -= tx->tx_nsp - nwork; - + tx->tx_waiting = 0; + tx->tx_sending--; + done = (tx->tx_sending == 0); if (done) list_del (&tx->tx_list); - spin_unlock_irqrestore (&conn->ibc_lock, flags); + spin_unlock(&conn->ibc_lock); if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) - CERROR ("Error %d posting transmit to "LPX64"\n", - rc, conn->ibc_peer->ibp_nid); + CERROR ("Error %d posting transmit to %s\n", + frc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); else - CDEBUG (D_NET, "Error %d posting transmit to " - LPX64"\n", rc, conn->ibc_peer->ibp_nid); + CDEBUG (D_NET, "Error %d posting transmit to %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); kibnal_close_conn (conn, rc); @@ -1009,138 +995,172 @@ kibnal_check_sends (kib_conn_t *conn) kibnal_tx_done (tx); return; } - } - EXIT; -out: - spin_unlock_irqrestore (&conn->ibc_lock, flags); + spin_unlock(&conn->ibc_lock); } -static void -kibnal_tx_callback (IB_WORK_COMPLETION *wc) +void +kibnal_tx_complete (IB_WORK_COMPLETION *wc) { kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->WorkReqId); - kib_conn_t *conn; - unsigned long flags; + kib_conn_t *conn = tx->tx_conn; + int failed = wc->Status != WRStatusSuccess; int idle; - conn = tx->tx_conn; - LASSERT (conn != NULL); - LASSERT (tx->tx_sending != 0); + CDEBUG(D_NET, "%s: sending %d nwrq %d status %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + tx->tx_sending, tx->tx_nwrq, wc->Status); + + LASSERT (tx->tx_sending > 0); - spin_lock_irqsave(&conn->ibc_lock, flags); + if (failed && + tx->tx_status == 0 && + conn->ibc_state == IBNAL_CONN_ESTABLISHED) { +#if KIBLND_DETAILED_DEBUG + int i; + IB_WORK_REQ2 *wrq = &tx->tx_wrq[0]; + IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[0]; + lnet_msg_t *lntmsg = tx->tx_lntmsg[0]; +#endif + CDEBUG(D_NETERROR, "tx -> %s type %x cookie "LPX64 + " sending %d waiting %d failed %d nwrk %d\n", + 
libcfs_nid2str(conn->ibc_peer->ibp_nid), + tx->tx_msg->ibm_type, tx->tx_cookie, + tx->tx_sending, tx->tx_waiting, wc->Status, + tx->tx_nwrq); +#if KIBLND_DETAILED_DEBUG + for (i = 0; i < tx->tx_nwrq; i++, wrq++, gl++) { + switch (wrq->Operation) { + default: + CDEBUG(D_NETERROR, " [%3d] Addr %p Next %p OP %d " + "DSList %p(%p)/%d: "LPX64"/%d K %x\n", + i, wrq, wrq->Next, wrq->Operation, + wrq->DSList, gl, wrq->DSListDepth, + gl->Address, gl->Length, gl->Lkey); + break; + case WROpSend: + CDEBUG(D_NETERROR, " [%3d] Addr %p Next %p SEND " + "DSList %p(%p)/%d: "LPX64"/%d K %x\n", + i, wrq, wrq->Next, + wrq->DSList, gl, wrq->DSListDepth, + gl->Address, gl->Length, gl->Lkey); + break; + case WROpRdmaWrite: + CDEBUG(D_NETERROR, " [%3d] Addr %p Next %p DMA " + "DSList: %p(%p)/%d "LPX64"/%d K %x -> " + LPX64" K %x\n", + i, wrq, wrq->Next, + wrq->DSList, gl, wrq->DSListDepth, + gl->Address, gl->Length, gl->Lkey, + wrq->Req.SendRC.RemoteDS.Address, + wrq->Req.SendRC.RemoteDS.Rkey); + break; + } + } + + switch (tx->tx_msg->ibm_type) { + default: + CDEBUG(D_NETERROR, " msg type %x %p/%d, No RDMA\n", + tx->tx_msg->ibm_type, + tx->tx_msg, tx->tx_msg->ibm_nob); + break; - CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx, - tx->tx_sending, tx->tx_nsp, wc->Status); + case IBNAL_MSG_PUT_DONE: + case IBNAL_MSG_GET_DONE: + CDEBUG(D_NETERROR, " msg type %x %p/%d, RDMA key %x frags %d...\n", + tx->tx_msg->ibm_type, + tx->tx_msg, tx->tx_msg->ibm_nob, + tx->tx_rd->rd_key, tx->tx_rd->rd_nfrag); + for (i = 0; i < tx->tx_rd->rd_nfrag; i++) + CDEBUG(D_NETERROR, " [%d] "LPX64"/%d\n", i, + tx->tx_rd->rd_frags[i].rf_addr, + tx->tx_rd->rd_frags[i].rf_nob); + if (lntmsg == NULL) { + CDEBUG(D_NETERROR, " No lntmsg\n"); + } else if (lntmsg->msg_iov != NULL) { + CDEBUG(D_NETERROR, " lntmsg in %d VIRT frags...\n", + lntmsg->msg_niov); + for (i = 0; i < lntmsg->msg_niov; i++) + CDEBUG(D_NETERROR, " [%d] %p/%d\n", i, + lntmsg->msg_iov[i].iov_base, + lntmsg->msg_iov[i].iov_len); + } else if (lntmsg->msg_kiov != NULL) { + CDEBUG(D_NETERROR, " lntmsg in %d PAGE frags...\n", + lntmsg->msg_niov); + for (i = 0; i < lntmsg->msg_niov; i++) + CDEBUG(D_NETERROR, " [%d] %p+%d/%d\n", i, + lntmsg->msg_kiov[i].kiov_page, + lntmsg->msg_kiov[i].kiov_offset, + lntmsg->msg_kiov[i].kiov_len); + } else { + CDEBUG(D_NETERROR, " lntmsg in %d frags\n", + lntmsg->msg_niov); + } + + break; + } +#endif + } + + spin_lock(&conn->ibc_lock); /* I could be racing with rdma completion. Whoever makes 'tx' idle - * gets to free it, which also drops its ref on 'conn'. If it's - * not me, then I take an extra ref on conn so it can't disappear - * under me. */ + * gets to free it, which also drops its ref on 'conn'. */ tx->tx_sending--; + conn->ibc_nsends_posted--; + + if (failed) { + tx->tx_waiting = 0; + tx->tx_status = -EIO; + } + idle = (tx->tx_sending == 0) && /* This is the final callback */ - (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */ + !tx->tx_waiting && /* Not waiting for peer */ + !tx->tx_queued; /* Not re-queued (PUT_DONE) */ if (idle) list_del(&tx->tx_list); - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); + kibnal_conn_addref(conn); /* 1 ref for me.... 
*/ - if (tx->tx_sending == 0) - conn->ibc_nsends_posted--; - - if (wc->Status != WRStatusSuccess && - tx->tx_status == 0) - tx->tx_status = -ECONNABORTED; - - spin_unlock_irqrestore(&conn->ibc_lock, flags); + spin_unlock(&conn->ibc_lock); if (idle) kibnal_tx_done (tx); - if (wc->Status != WRStatusSuccess) { - CERROR ("Tx completion to "LPX64" failed: %d\n", - conn->ibc_peer->ibp_nid, wc->Status); - kibnal_close_conn (conn, -ENETDOWN); + if (failed) { + kibnal_close_conn (conn, -EIO); } else { - /* can I shovel some more sends out the door? */ + kibnal_peer_alive(conn->ibc_peer); kibnal_check_sends(conn); } - kibnal_put_conn (conn); -} - -void -kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev) -{ - /* XXX flesh out. this seems largely for async errors */ - CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode); -} - -void -kibnal_ca_callback (void *ca_arg, void *cq_arg) -{ - IB_HANDLE cq = *(IB_HANDLE *)cq_arg; - IB_HANDLE ca = *(IB_HANDLE *)ca_arg; - IB_WORK_COMPLETION wc; - int armed = 0; - - CDEBUG(D_NET, "ca %p cq %p\n", ca, cq); - - for(;;) { - while (iibt_cq_poll(cq, &wc) == FSUCCESS) { - - /* We will need to rearm the CQ to avoid a potential race. */ - armed = 0; - - if (kibnal_wreqid_is_rx(wc.WorkReqId)) - kibnal_rx_callback(&wc); - else - kibnal_tx_callback(&wc); - } - if (armed) - return; - if (iibt_cq_rearm(cq, CQEventSelNextWC) != FSUCCESS) { - CERROR("rearm failed?\n"); - return; - } - armed = 1; - } + kibnal_conn_decref(conn); /* ...until here */ } void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) { - IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nsp]; - IB_WORK_REQ *wrq = &tx->tx_wrq[tx->tx_nsp]; - int fence; - int nob = offsetof (kib_msg_t, ibm_u) + body_nob; + IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nwrq]; + IB_WORK_REQ2 *wrq = &tx->tx_wrq[tx->tx_nwrq]; + int nob = offsetof (kib_msg_t, ibm_u) + body_nob; - LASSERT (tx->tx_nsp >= 0 && - tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0])); + LASSERT (tx->tx_nwrq >= 0 && + tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS)); LASSERT (nob <= IBNAL_MSG_SIZE); - - tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC; - tx->tx_msg->ibm_version = IBNAL_MSG_VERSION; - tx->tx_msg->ibm_type = type; -#if IBNAL_CKSUM - tx->tx_msg->ibm_nob = nob; -#endif - /* Fence the message if it's bundled with an RDMA read */ - fence = (tx->tx_nsp > 0) && - (type == IBNAL_MSG_PUT_DONE); + + kibnal_init_msg(tx->tx_msg, type, body_nob); *gl = (IB_LOCAL_DATASEGMENT) { - .Address = tx->tx_vaddr, + .Address = tx->tx_hca_msg, .Length = IBNAL_MSG_SIZE, - .Lkey = kibnal_lkey(kibnal_data.kib_tx_pages), + .Lkey = kibnal_data.kib_whole_mem.md_lkey, }; - wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0); + wrq->Next = NULL; /* This is the last one */ + + wrq->WorkReqId = kibnal_ptr2wreqid(tx, IBNAL_WID_TX); wrq->Operation = WROpSend; wrq->DSList = gl; wrq->DSListDepth = 1; @@ -1149,869 +1169,1339 @@ kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) wrq->Req.SendRC.Options.s.SolicitedEvent = 1; wrq->Req.SendRC.Options.s.SignaledCompletion = 1; wrq->Req.SendRC.Options.s.ImmediateData = 0; - wrq->Req.SendRC.Options.s.Fence = fence; - - tx->tx_nsp++; + wrq->Req.SendRC.Options.s.Fence = 0; + /* fence only needed on RDMA reads */ + + tx->tx_nwrq++; } -static void -kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) +int +kibnal_init_rdma (kib_tx_t *tx, int type, int nob, + kib_rdma_desc_t *dstrd, __u64 dstcookie) { - unsigned long flags; + kib_msg_t *ibmsg = tx->tx_msg; + kib_rdma_desc_t *srcrd = tx->tx_rd; + IB_LOCAL_DATASEGMENT *gl; + IB_WORK_REQ2 *wrq; + 
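+        /* NB copies the source buffer described by tx->tx_rd into the
+         * peer's buffer described by 'dstrd' using one or more RDMA_WRITE
+         * work requests, then appends the GET_DONE/PUT_DONE completion
+         * message.  Returns nob on success or -ve errno; on failure the
+         * RDMA work requests are abandoned and only the completion
+         * message (carrying the error status) goes out */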
int rc; - spin_lock_irqsave(&conn->ibc_lock, flags); +#if IBNAL_USE_FMR + LASSERT (tx->tx_nwrq == 0); - kibnal_queue_tx_locked (tx, conn); - - spin_unlock_irqrestore(&conn->ibc_lock, flags); - + gl = &tx->tx_gl[0]; + gl->Length = nob; + gl->Address = srcrd->rd_addr; + gl->Lkey = srcrd->rd_key; + + wrq = &tx->tx_wrq[0]; + + wrq->Next = wrq + 1; + wrq->WorkReqId = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA); + wrq->Operation = WROpRdmaWrite; + wrq->DSList = gl; + wrq->DSListDepth = 1; + wrq->MessageLen = nob; + + wrq->Req.SendRC.ImmediateData = 0; + wrq->Req.SendRC.Options.s.SolicitedEvent = 0; + wrq->Req.SendRC.Options.s.SignaledCompletion = 0; + wrq->Req.SendRC.Options.s.ImmediateData = 0; + wrq->Req.SendRC.Options.s.Fence = 0; + + wrq->Req.SendRC.RemoteDS.Address = dstrd->rd_addr; + wrq->Req.SendRC.RemoteDS.Rkey = dstrd->rd_key; + + tx->tx_nwrq = 1; + rc = nob; +#else + /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */ + int resid = nob; + kib_rdma_frag_t *srcfrag; + int srcidx; + kib_rdma_frag_t *dstfrag; + int dstidx; + int wrknob; + + /* Called by scheduler */ + LASSERT (!in_interrupt()); + + LASSERT (type == IBNAL_MSG_GET_DONE || + type == IBNAL_MSG_PUT_DONE); + + srcidx = dstidx = 0; + srcfrag = &srcrd->rd_frags[0]; + dstfrag = &dstrd->rd_frags[0]; + rc = resid; + + while (resid > 0) { + if (srcidx >= srcrd->rd_nfrag) { + CERROR("Src buffer exhausted: %d frags\n", srcidx); + rc = -EPROTO; + break; + } + + if (dstidx == dstrd->rd_nfrag) { + CERROR("Dst buffer exhausted: %d frags\n", dstidx); + rc = -EPROTO; + break; + } + + if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) { + CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n", + srcidx, srcrd->rd_nfrag, + dstidx, dstrd->rd_nfrag); + rc = -EMSGSIZE; + break; + } + + wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid); + + gl = &tx->tx_gl[tx->tx_nwrq]; + gl->Length = wrknob; + gl->Address = srcfrag->rf_addr; + gl->Lkey = srcrd->rd_key; + + wrq = &tx->tx_wrq[tx->tx_nwrq]; + + wrq->Next = wrq + 1; + wrq->WorkReqId = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA); + wrq->Operation = WROpRdmaWrite; + wrq->DSList = gl; + wrq->DSListDepth = 1; + wrq->MessageLen = nob; + + wrq->Req.SendRC.ImmediateData = 0; + wrq->Req.SendRC.Options.s.SolicitedEvent = 0; + wrq->Req.SendRC.Options.s.SignaledCompletion = 0; + wrq->Req.SendRC.Options.s.ImmediateData = 0; + wrq->Req.SendRC.Options.s.Fence = 0; + + wrq->Req.SendRC.RemoteDS.Address = dstfrag->rf_addr; + wrq->Req.SendRC.RemoteDS.Rkey = dstrd->rd_key; + + resid -= wrknob; + if (wrknob < srcfrag->rf_nob) { + srcfrag->rf_addr += wrknob; + srcfrag->rf_nob -= wrknob; + } else { + srcfrag++; + srcidx++; + } + + if (wrknob < dstfrag->rf_nob) { + dstfrag->rf_addr += wrknob; + dstfrag->rf_nob -= wrknob; + } else { + dstfrag++; + dstidx++; + } + + tx->tx_nwrq++; + } + + if (rc < 0) /* no RDMA if completing with failure */ + tx->tx_nwrq = 0; +#endif + + ibmsg->ibm_u.completion.ibcm_status = rc; + ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; + kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); + + return rc; +} + +void +kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) +{ + spin_lock(&conn->ibc_lock); + kibnal_queue_tx_locked (tx, conn); + spin_unlock(&conn->ibc_lock); + kibnal_check_sends(conn); } -static void -kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) +void +kibnal_schedule_active_connect_locked (kib_peer_t *peer, int proto_version) +{ + /* Called holding kib_global_lock exclusive with IRQs disabled */ + + peer->ibp_version = proto_version; /* proto version for new conn */ + 
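+        /* NB 'proto_version' stashed above tells connd which wire
+         * protocol to offer for this connection attempt (the current
+         * IBNAL_MSG_VERSION or the older RDMAREPLYNOTRSRVD variant);
+         * it is updated atomically with ibp_connecting below since
+         * callers hold kib_global_lock exclusively */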
peer->ibp_connecting++; /* I'm connecting */ + kibnal_peer_addref(peer); /* extra ref for connd */ + + spin_lock(&kibnal_data.kib_connd_lock); + + list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock(&kibnal_data.kib_connd_lock); +} + +void +kibnal_schedule_active_connect (kib_peer_t *peer, int proto_version) +{ + unsigned long flags; + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + + kibnal_schedule_active_connect_locked(peer, proto_version); + + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); +} + +void +kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid) { - unsigned long flags; kib_peer_t *peer; kib_conn_t *conn; + unsigned long flags; rwlock_t *g_lock = &kibnal_data.kib_global_lock; + int retry; + int rc; /* If I get here, I've committed to send, so I complete the tx with * failure on any problems */ LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ - LASSERT (tx->tx_nsp > 0); /* work items have been set up */ + LASSERT (tx->tx_nwrq > 0); /* work items have been set up */ - read_lock_irqsave(g_lock, flags); + for (retry = 0; ; retry = 1) { + read_lock_irqsave(g_lock, flags); - peer = kibnal_find_peer_locked (nid); - if (peer == NULL) { - read_unlock_irqrestore(g_lock, flags); - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); - return; - } - - conn = kibnal_find_conn_locked (peer); - if (conn != NULL) { - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ - read_unlock_irqrestore(g_lock, flags); + peer = kibnal_find_peer_locked (nid); + if (peer != NULL) { + conn = kibnal_find_conn_locked (peer); + if (conn != NULL) { + kibnal_conn_addref(conn); /* 1 ref for me... */ + read_unlock_irqrestore(g_lock, flags); + + kibnal_queue_tx (tx, conn); + kibnal_conn_decref(conn); /* ...to here */ + return; + } + } - kibnal_queue_tx (tx, conn); - return; - } - - /* Making one or more connections; I'll need a write lock... */ - read_unlock(g_lock); - write_lock(g_lock); + /* Making one or more connections; I'll need a write lock... */ + read_unlock(g_lock); + write_lock(g_lock); - peer = kibnal_find_peer_locked (nid); - if (peer == NULL) { - write_unlock_irqrestore (g_lock, flags); - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); - return; + peer = kibnal_find_peer_locked (nid); + if (peer != NULL) + break; + + write_unlock_irqrestore(g_lock, flags); + + if (retry) { + CERROR("Can't find peer %s\n", libcfs_nid2str(nid)); + + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + kibnal_tx_done (tx); + return; + } + + rc = kibnal_add_persistent_peer(nid); + if (rc != 0) { + CERROR("Can't add peer %s: %d\n", + libcfs_nid2str(nid), rc); + + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + kibnal_tx_done (tx); + return; + } } conn = kibnal_find_conn_locked (peer); if (conn != NULL) { /* Connection exists; queue message on it */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ - write_unlock_irqrestore (g_lock, flags); + kibnal_conn_addref(conn); /* 1 ref for me... 
*/ + write_unlock_irqrestore(g_lock, flags); kibnal_queue_tx (tx, conn); + kibnal_conn_decref(conn); /* ...until here */ return; } - if (peer->ibp_connecting == 0) { - if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) { - write_unlock_irqrestore (g_lock, flags); + if (!kibnal_peer_connecting(peer)) { + if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */ + time_after_eq(jiffies, peer->ibp_reconnect_time))) { + write_unlock_irqrestore(g_lock, flags); tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; kibnal_tx_done (tx); return; } - - peer->ibp_connecting = 1; - kib_peer_addref(peer); /* extra ref for connd */ - - spin_lock (&kibnal_data.kib_connd_lock); - - list_add_tail (&peer->ibp_connd_list, - &kibnal_data.kib_connd_peers); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock (&kibnal_data.kib_connd_lock); + + kibnal_schedule_active_connect_locked(peer, IBNAL_MSG_VERSION); } /* A connection is being established; queue the message... */ list_add_tail (&tx->tx_list, &peer->ibp_tx_queue); - write_unlock_irqrestore (g_lock, flags); + write_unlock_irqrestore(g_lock, flags); } -static ptl_err_t -kibnal_start_passive_rdma (int type, ptl_nid_t nid, - lib_msg_t *libmsg, ptl_hdr_t *hdr) +void +kibnal_txlist_done (struct list_head *txlist, int status) { - int nob = libmsg->md->length; - kib_tx_t *tx; - kib_msg_t *ibmsg; - int rc; - IB_ACCESS_CONTROL access = {0,}; - - LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA); - LASSERT (nob > 0); - LASSERT (!in_interrupt()); /* Mapping could block */ - - access.s.MWBindable = 1; - access.s.LocalWrite = 1; - access.s.RdmaRead = 1; - access.s.RdmaWrite = 1; + kib_tx_t *tx; - tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */ - LASSERT (tx != NULL); + while (!list_empty (txlist)) { + tx = list_entry (txlist->next, kib_tx_t, tx_list); - if ((libmsg->md->options & PTL_MD_KIOV) == 0) - rc = kibnal_map_iov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.iov, - 0, nob, 0); - else - rc = kibnal_map_kiov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.kiov, - 0, nob, 0); - - if (rc != 0) { - CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc); - goto failed; - } - - if (type == IBNAL_MSG_GET_RDMA) { - /* reply gets finalized when tx completes */ - tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, - nid, libmsg); - if (tx->tx_libmsg[1] == NULL) { - CERROR ("Can't create reply for GET -> "LPX64"\n", - nid); - rc = -ENOMEM; - goto failed; - } + list_del (&tx->tx_list); + /* complete now */ + tx->tx_waiting = 0; + tx->tx_status = status; + kibnal_tx_done (tx); } - - tx->tx_passive_rdma = 1; +} - ibmsg = tx->tx_msg; +int +kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + lnet_hdr_t *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int payload_niov = lntmsg->msg_niov; + struct iovec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + kib_msg_t *ibmsg; + kib_tx_t *tx; + int nob; + int rc; - ibmsg->ibm_u.rdma.ibrm_hdr = *hdr; - ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie; - /* map_kiov alrady filled the rdma descs for the whole_mem case */ - if (!kibnal_whole_mem()) { - ibmsg->ibm_u.rdma.rd_key = tx->tx_md.md_rkey; - ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; - 
ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
- ibmsg->ibm_u.rdma.ibrm_num_descs = 1;
- }
+ /* NB 'private' is different depending on what we're sending.... */
- kibnal_init_tx_msg (tx, type,
- kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs));
+ CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+ payload_nob, payload_niov, libcfs_id2str(target));
- CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
- LPX64", nob %d\n",
- tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
- tx->tx_md.md_addr, nob);
-
- /* libmsg gets finalized when tx completes. */
- tx->tx_libmsg[0] = libmsg;
+ LASSERT (payload_nob == 0 || payload_niov > 0);
+ LASSERT (payload_niov <= LNET_MAX_IOV);
- kibnal_launch_tx(tx, nid);
- return (PTL_OK);
+ /* Thread context */
+ LASSERT (!in_interrupt());
+ /* payload is either all vaddrs or all pages */
+ LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
- failed:
- tx->tx_status = rc;
- kibnal_tx_done (tx);
- return (PTL_FAIL);
-}
+ switch (type) {
+ default:
+ LBUG();
+ return (-EIO);
+
+ case LNET_MSG_ACK:
+ LASSERT (payload_nob == 0);
+ break;
-void
-kibnal_start_active_rdma (int type, int status,
- kib_rx_t *rx, lib_msg_t *libmsg,
- unsigned int niov,
- struct iovec *iov, ptl_kiov_t *kiov,
- size_t offset, size_t nob)
-{
- kib_msg_t *rxmsg = rx->rx_msg;
- kib_msg_t *txmsg;
- kib_tx_t *tx;
- IB_ACCESS_CONTROL access = {0,};
- IB_WR_OP rdma_op;
- int rc;
- __u32 i;
+ case LNET_MSG_GET:
+ if (routing || target_is_router)
+ break; /* send IMMEDIATE */
+
+ /* is the REPLY message too small for RDMA? */
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
+ if (nob <= IBNAL_MSG_SIZE)
+ break; /* send IMMEDIATE */
+
+ tx = kibnal_get_idle_tx();
+ if (tx == NULL) {
+ CERROR("Can't allocate txd for GET to %s\n",
+ libcfs_nid2str(target.nid));
+ return -ENOMEM;
+ }
+
+ ibmsg = tx->tx_msg;
+ ibmsg->ibm_u.get.ibgm_hdr = *hdr;
+ ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
+
+ if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+ rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
+ 0,
+ lntmsg->msg_md->md_niov,
+ lntmsg->msg_md->md_iov.iov,
+ 0, lntmsg->msg_md->md_length);
+ else
+ rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
+ 0,
+ lntmsg->msg_md->md_niov,
+ lntmsg->msg_md->md_iov.kiov,
+ 0, lntmsg->msg_md->md_length);
+ if (rc != 0) {
+ CERROR("Can't setup GET sink for %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ kibnal_tx_done(tx);
+ return -EIO;
+ }
- CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
- type, status, niov, offset, nob);
+#if IBNAL_USE_FMR
+ nob = sizeof(kib_get_msg_t);
+#else
+ {
+ int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
+
+ nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
+ }
+#endif
+ kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
- /* Called by scheduler */
- LASSERT (!in_interrupt ());
+ tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni,
+ lntmsg);
+ if (tx->tx_lntmsg[1] == NULL) {
+ CERROR("Can't create reply for GET -> %s\n",
+ libcfs_nid2str(target.nid));
+ kibnal_tx_done(tx);
+ return -EIO;
+ }
- /* Either all pages or all vaddrs */
- LASSERT (!(kiov != NULL && iov != NULL));
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */
+ tx->tx_waiting = 1; /* waiting for GET_DONE */
+ kibnal_launch_tx(tx, target.nid);
+ return 0;
- /* No data if we're completing with failure */
- LASSERT (status == 0 || nob == 0);
+ case LNET_MSG_REPLY:
+ case LNET_MSG_PUT:
+ /* Is the payload small enough not to need RDMA? 
*/ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob <= IBNAL_MSG_SIZE) + break; /* send IMMEDIATE */ + + tx = kibnal_get_idle_tx(); + if (tx == NULL) { + CERROR("Can't allocate %s txd for %s\n", + type == LNET_MSG_PUT ? "PUT" : "REPLY", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } - LASSERT (type == IBNAL_MSG_GET_DONE || - type == IBNAL_MSG_PUT_DONE); + if (payload_kiov == NULL) + rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 1, + payload_niov, payload_iov, + payload_offset, payload_nob); + else + rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 1, + payload_niov, payload_kiov, + payload_offset, payload_nob); + if (rc != 0) { + CERROR("Can't setup PUT src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + kibnal_tx_done(tx); + return -EIO; + } - /* Flag I'm completing the RDMA. Even if I fail to send the - * completion message, I will have tried my best so further - * attempts shouldn't be tried. */ - LASSERT (!rx->rx_rdma); - rx->rx_rdma = 1; + ibmsg = tx->tx_msg; + ibmsg->ibm_u.putreq.ibprm_hdr = *hdr; + ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; + kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t)); - if (type == IBNAL_MSG_GET_DONE) { - rdma_op = WROpRdmaWrite; - LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA); - } else { - access.s.LocalWrite = 1; - rdma_op = WROpRdmaRead; - LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA); + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ + kibnal_launch_tx(tx, target.nid); + return 0; } - tx = kibnal_get_idle_tx (0); /* Mustn't block */ + /* send IMMEDIATE */ + + LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) + <= IBNAL_MSG_SIZE); + + tx = kibnal_get_idle_tx(); if (tx == NULL) { - CERROR ("tx descs exhausted on RDMA from "LPX64 - " completing locally with failure\n", - rx->rx_conn->ibc_peer->ibp_nid); - lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE); - return; + CERROR ("Can't send %d to %s: tx descs exhausted\n", + type, libcfs_nid2str(target.nid)); + return -ENOMEM; } - LASSERT (tx->tx_nsp == 0); - - if (nob == 0) - GOTO(init_tx, 0); - - /* We actually need to transfer some data (the transfer - * size could get truncated to zero when the incoming - * message is matched) */ - if (kiov != NULL) - rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1); + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.immediate.ibim_hdr = *hdr; + + if (payload_kiov != NULL) + lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_kiov, + payload_offset, payload_nob); else - rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1); - - if (rc != 0) { - CERROR ("Can't map RDMA -> "LPX64": %d\n", - rx->rx_conn->ibc_peer->ibp_nid, rc); - /* We'll skip the RDMA and complete with failure. */ - status = rc; - nob = 0; - GOTO(init_tx, rc); - } - - if (!kibnal_whole_mem()) { - tx->tx_msg->ibm_u.rdma.rd_key = tx->tx_md.md_lkey; - tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; - tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob; - tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1; - } - - /* XXX ugh. different page-sized hosts. */ - if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs != - rxmsg->ibm_u.rdma.ibrm_num_descs) { - CERROR("tx descs (%u) != rx descs (%u)\n", - tx->tx_msg->ibm_u.rdma.ibrm_num_descs, - rxmsg->ibm_u.rdma.ibrm_num_descs); - /* We'll skip the RDMA and complete with failure. 
*/ - status = rc; - nob = 0; - GOTO(init_tx, rc); - } - - /* map_kiov filled in the rdma descs which describe our side of the - * rdma transfer. */ - /* ibrm_num_descs was verified in rx_callback */ - for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) { - kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */ - IB_LOCAL_DATASEGMENT *ds = &tx->tx_gl[i]; - IB_WORK_REQ *wrq = &tx->tx_wrq[i]; - - ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i]; - rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i]; - - ds->Address = ldesc->rd_addr; - ds->Length = ldesc->rd_nob; - ds->Lkey = tx->tx_msg->ibm_u.rdma.rd_key; - - memset(wrq, 0, sizeof(*wrq)); - wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0); - wrq->Operation = rdma_op; - wrq->DSList = ds; - wrq->DSListDepth = 1; - wrq->MessageLen = ds->Length; - wrq->Req.SendRC.ImmediateData = 0; - wrq->Req.SendRC.Options.s.SolicitedEvent = 0; - wrq->Req.SendRC.Options.s.SignaledCompletion = 0; - wrq->Req.SendRC.Options.s.ImmediateData = 0; - wrq->Req.SendRC.Options.s.Fence = 0; - wrq->Req.SendRC.RemoteDS.Address = rdesc->rd_addr; - wrq->Req.SendRC.RemoteDS.Rkey = rxmsg->ibm_u.rdma.rd_key; + lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_iov, + payload_offset, payload_nob); - /* only the last rdma post triggers tx completion */ - if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1) - wrq->Req.SendRC.Options.s.SignaledCompletion = 1; + nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]); + kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob); - tx->tx_nsp++; + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + kibnal_launch_tx(tx, target.nid); + return 0; +} + +void +kibnal_reply(lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg) +{ + lnet_process_id_t target = lntmsg->msg_target; + unsigned int niov = lntmsg->msg_niov; + struct iovec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + kib_tx_t *tx; + int rc; + + tx = kibnal_get_idle_tx(); + if (tx == NULL) { + CERROR("Can't get tx for REPLY to %s\n", + libcfs_nid2str(target.nid)); + goto failed_0; } -init_tx: - txmsg = tx->tx_msg; + if (nob == 0) + rc = 0; + else if (kiov == NULL) + rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 1, + niov, iov, offset, nob); + else + rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 1, + niov, kiov, offset, nob); - txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie; - txmsg->ibm_u.completion.ibcm_status = status; + if (rc != 0) { + CERROR("Can't setup GET src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } - kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); - - if (status == 0 && nob != 0) { - LASSERT (tx->tx_nsp > 1); - /* RDMA: libmsg gets finalized when the tx completes. This - * is after the completion message has been sent, which in - * turn is after the RDMA has finished. */ - tx->tx_libmsg[0] = libmsg; + rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, nob, + &rx->rx_msg->ibm_u.get.ibgm_rd, + rx->rx_msg->ibm_u.get.ibgm_cookie); + if (rc < 0) { + CERROR("Can't setup rdma for GET from %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } + + if (rc == 0) { + /* No RDMA: local completion may happen now! */ + lnet_finalize(ni, lntmsg, 0); } else { - LASSERT (tx->tx_nsp == 1); - /* No RDMA: local completion happens now! */ - CWARN("No data: immediate completion\n"); - lib_finalize (&kibnal_lib, NULL, libmsg, - status == 0 ? PTL_OK : PTL_FAIL); - } - - /* +1 ref for this tx... 
*/
- CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n",
- rx->rx_conn, rx->rx_conn->ibc_state,
- rx->rx_conn->ibc_peer->ibp_nid,
- atomic_read (&rx->rx_conn->ibc_refcount));
- atomic_inc (&rx->rx_conn->ibc_refcount);
- /* ...and queue it up */
+ /* RDMA: lnet_finalize(lntmsg) when it
+ * completes */
+ tx->tx_lntmsg[0] = lntmsg;
+ }
+ kibnal_queue_tx(tx, rx->rx_conn);
+ return;
+
+ failed_1:
+ kibnal_tx_done(tx);
+ failed_0:
+ lnet_finalize(ni, lntmsg, -EIO);
 }
-static ptl_err_t
-kibnal_sendmsg(lib_nal_t *nal,
- void *private,
- lib_msg_t *libmsg,
- ptl_hdr_t *hdr,
- int type,
- ptl_nid_t nid,
- ptl_pid_t pid,
- unsigned int payload_niov,
- struct iovec *payload_iov,
- ptl_kiov_t *payload_kiov,
- size_t payload_offset,
- size_t payload_nob)
-{
- kib_msg_t *ibmsg;
- kib_tx_t *tx;
- int nob;
-
- /* NB 'private' is different depending on what we're sending.... */
-
- CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64
- " pid %d\n", payload_nob, payload_niov, nid , pid);
+int
+kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+ void **new_private)
+{
+ kib_rx_t *rx = private;
+ kib_conn_t *conn = rx->rx_conn;
- LASSERT (payload_nob == 0 || payload_niov > 0);
- LASSERT (payload_niov <= PTL_MD_MAX_IOV);
+ if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) {
+ /* Can't block if RDMA completions need normal credits */
+ LCONSOLE_ERROR("Dropping message from %s: no buffers free. "
+ "%s is running an old version of LNET that may "
+ "deadlock if messages wait for buffers\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ return -EDEADLK;
+ }
+
+ *new_private = private;
+ return 0;
+}
- /* Thread context if we're sending payload */
- LASSERT (!in_interrupt() || payload_niov == 0);
- /* payload is either all vaddrs or all pages */
- LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
+int
+kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+ unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
+ unsigned int offset, unsigned int mlen, unsigned int rlen)
+{
+ kib_rx_t *rx = private;
+ kib_msg_t *rxmsg = rx->rx_msg;
+ kib_conn_t *conn = rx->rx_conn;
+ kib_tx_t *tx;
+ kib_msg_t *txmsg;
+ int nob;
+ int post_cred = 1;
+ int rc = 0;
+
+ LASSERT (mlen <= rlen);
+ LASSERT (!in_interrupt());
+ /* Either all pages or all vaddrs */
+ LASSERT (!(kiov != NULL && iov != NULL));
- switch (type) {
+ switch (rxmsg->ibm_type) {
 default:
 LBUG();
- return (PTL_FAIL);
- case PTL_MSG_REPLY: {
- /* reply's 'private' is the incoming receive */
- kib_rx_t *rx = private;
-
- /* RDMA reply expected? 
*/ - if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) { - kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0, - rx, libmsg, payload_niov, - payload_iov, payload_kiov, - payload_offset, payload_nob); - return (PTL_OK); + case IBNAL_MSG_IMMEDIATE: + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); + if (nob > rx->rx_nob) { + CERROR ("Immediate message from %s too big: %d(%d)\n", + libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), + nob, rx->rx_nob); + rc = -EPROTO; + break; + } + + if (kiov != NULL) + lnet_copy_flat2kiov(niov, kiov, offset, + IBNAL_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); + else + lnet_copy_flat2iov(niov, iov, offset, + IBNAL_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); + lnet_finalize (ni, lntmsg, 0); + break; + + case IBNAL_MSG_PUT_REQ: + if (mlen == 0) { + lnet_finalize(ni, lntmsg, 0); + kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0, + rxmsg->ibm_u.putreq.ibprm_cookie); + break; } - /* Incoming message consistent with immediate reply? */ - if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { - CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n", - nid, rx->rx_msg->ibm_type); - return (PTL_FAIL); + tx = kibnal_get_idle_tx(); + if (tx == NULL) { + CERROR("Can't allocate tx for %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* Not replying will break the connection */ + rc = -ENOMEM; + break; } - /* Will it fit in a message? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob >= IBNAL_MSG_SIZE) { - CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", - nid, payload_nob); - return (PTL_FAIL); + txmsg = tx->tx_msg; + if (kiov == NULL) + rc = kibnal_setup_rd_iov(tx, + &txmsg->ibm_u.putack.ibpam_rd, + 0, + niov, iov, offset, mlen); + else + rc = kibnal_setup_rd_kiov(tx, + &txmsg->ibm_u.putack.ibpam_rd, + 0, + niov, kiov, offset, mlen); + if (rc != 0) { + CERROR("Can't setup PUT sink for %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + kibnal_tx_done(tx); + /* tell peer it's over */ + kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, rc, + rxmsg->ibm_u.putreq.ibprm_cookie); + break; } - break; - } - case PTL_MSG_GET: - /* might the REPLY message be big enough to need RDMA? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]); - if (nob > IBNAL_MSG_SIZE) - return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, - nid, libmsg, hdr)); - break; + txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; + txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; +#if IBNAL_USE_FMR + nob = sizeof(kib_putack_msg_t); +#else + { + int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag; - case PTL_MSG_ACK: - LASSERT (payload_nob == 0); + nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]); + } +#endif + kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_DONE */ + kibnal_queue_tx(tx, conn); + + if (conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) + post_cred = 0; /* peer still owns 'rx' for sending PUT_DONE */ break; - case PTL_MSG_PUT: - /* Is the payload big enough to need RDMA? 
*/ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob > IBNAL_MSG_SIZE) - return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, - nid, libmsg, hdr)); - + case IBNAL_MSG_GET_REQ: + if (lntmsg != NULL) { + /* Optimized GET; RDMA lntmsg's payload */ + kibnal_reply(ni, rx, lntmsg); + } else { + /* GET didn't match anything */ + kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, + -ENODATA, + rxmsg->ibm_u.get.ibgm_cookie); + } break; } - tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK || - type == PTL_MSG_REPLY || - in_interrupt())); - if (tx == NULL) { - CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", - type, nid, in_interrupt() ? " (intr)" : ""); - return (PTL_NO_SPACE); + kibnal_post_rx(rx, post_cred, 0); + return rc; +} + +int +kibnal_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid = kernel_thread (fn, arg, 0); + + if (pid < 0) + return ((int)pid); + + atomic_inc (&kibnal_data.kib_nthreads); + return (0); +} + +void +kibnal_thread_fini (void) +{ + atomic_dec (&kibnal_data.kib_nthreads); +} + +void +kibnal_peer_alive (kib_peer_t *peer) +{ + /* This is racy, but everyone's only writing cfs_time_current() */ + peer->ibp_last_alive = cfs_time_current(); + mb(); +} + +void +kibnal_peer_notify (kib_peer_t *peer) +{ + time_t last_alive = 0; + int error = 0; + unsigned long flags; + + read_lock_irqsave(&kibnal_data.kib_global_lock, flags); + + if (list_empty(&peer->ibp_conns) && + peer->ibp_accepting == 0 && + peer->ibp_connecting == 0 && + peer->ibp_error != 0) { + error = peer->ibp_error; + peer->ibp_error = 0; + last_alive = cfs_time_current_sec() - + cfs_duration_sec(cfs_time_current() - + peer->ibp_last_alive); + } + + read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + + if (error != 0) + lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive); +} + +void +kibnal_schedule_conn (kib_conn_t *conn) +{ + unsigned long flags; + + kibnal_conn_addref(conn); /* ++ref for connd */ + + spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); + + list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); +} + +void +kibnal_close_conn_locked (kib_conn_t *conn, int error) +{ + /* This just does the immediate housekeeping to start shutdown of an + * established connection. 'error' is zero for a normal shutdown. + * Caller holds kib_global_lock exclusively in irq context */ + kib_peer_t *peer = conn->ibc_peer; + + LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); + + if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) + return; /* already being handled */ + + /* NB Can't take ibc_lock here (could be in IRQ context), without + * risking deadlock, so access to ibc_{tx_queue,active_txs} is racey */ + + if (error == 0 && + list_empty(&conn->ibc_tx_queue) && + list_empty(&conn->ibc_tx_queue_rsrvd) && + list_empty(&conn->ibc_tx_queue_nocred) && + list_empty(&conn->ibc_active_txs)) { + CDEBUG(D_NET, "closing conn to %s" + " rx# "LPD64" tx# "LPD64"\n", + libcfs_nid2str(peer->ibp_nid), + conn->ibc_txseq, conn->ibc_rxseq); + } else { + CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s" + " rx# "LPD64" tx# "LPD64"\n", + libcfs_nid2str(peer->ibp_nid), error, + list_empty(&conn->ibc_tx_queue) ? "" : "(sending)", + list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)", + list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)", + list_empty(&conn->ibc_active_txs) ? 
"" : "(waiting)", + conn->ibc_txseq, conn->ibc_rxseq); +#if 0 + /* can't skip down the queue without holding ibc_lock (see above) */ + list_for_each(tmp, &conn->ibc_tx_queue) { + kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); + + CERROR(" queued tx type %x cookie "LPX64 + " sending %d waiting %d ticks %ld/%d\n", + tx->tx_msg->ibm_type, tx->tx_cookie, + tx->tx_sending, tx->tx_waiting, + (long)(tx->tx_deadline - jiffies), HZ); + } + + list_for_each(tmp, &conn->ibc_active_txs) { + kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); + + CERROR(" active tx type %x cookie "LPX64 + " sending %d waiting %d ticks %ld/%d\n", + tx->tx_msg->ibm_type, tx->tx_cookie, + tx->tx_sending, tx->tx_waiting, + (long)(tx->tx_deadline - jiffies), HZ); + } +#endif } - ibmsg = tx->tx_msg; - ibmsg->ibm_u.immediate.ibim_hdr = *hdr; + list_del (&conn->ibc_list); - if (payload_nob > 0) { - if (payload_kiov != NULL) - lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload, - payload_niov, payload_kiov, - payload_offset, payload_nob); - else - lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload, - payload_niov, payload_iov, - payload_offset, payload_nob); - } + if (list_empty (&peer->ibp_conns)) { /* no more conns */ + if (peer->ibp_persistence == 0 && /* non-persistent peer */ + kibnal_peer_active(peer)) /* still in peer table */ + kibnal_unlink_peer_locked (peer); - kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, - offsetof(kib_immediate_msg_t, - ibim_payload[payload_nob])); + peer->ibp_error = error; /* set/clear error on last conn */ + } - /* libmsg gets finalized when tx completes */ - tx->tx_libmsg[0] = libmsg; + kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTING); - kibnal_launch_tx(tx, nid); - return (PTL_OK); + kibnal_schedule_conn(conn); + kibnal_conn_decref(conn); /* lose ibc_list's ref */ } -static ptl_err_t -kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int payload_niov, struct iovec *payload_iov, - size_t payload_offset, size_t payload_len) +void +kibnal_close_conn (kib_conn_t *conn, int error) { - return (kibnal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, payload_iov, NULL, - payload_offset, payload_len)); -} + unsigned long flags; + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); -static ptl_err_t -kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int payload_niov, ptl_kiov_t *payload_kiov, - size_t payload_offset, size_t payload_len) -{ - return (kibnal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, NULL, payload_kiov, - payload_offset, payload_len)); + kibnal_close_conn_locked (conn, error); + + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); } -static ptl_err_t -kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, - unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) +void +kibnal_handle_early_rxs(kib_conn_t *conn) { - kib_rx_t *rx = private; - kib_msg_t *rxmsg = rx->rx_msg; - int msg_nob; - - LASSERT (mlen <= rlen); - LASSERT (!in_interrupt ()); - /* Either all pages or all vaddrs */ - LASSERT (!(kiov != NULL && iov != NULL)); + unsigned long flags; + kib_rx_t *rx; - switch (rxmsg->ibm_type) { - default: - LBUG(); - return (PTL_FAIL); + LASSERT (!in_interrupt()); + LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + while 
(!list_empty(&conn->ibc_early_rxs)) { + rx = list_entry(conn->ibc_early_rxs.next, + kib_rx_t, rx_list); + list_del(&rx->rx_list); + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - case IBNAL_MSG_IMMEDIATE: - msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); - if (msg_nob > IBNAL_MSG_SIZE) { - CERROR ("Immediate message from "LPX64" too big: %d\n", - rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen); - return (PTL_FAIL); - } + kibnal_handle_rx(rx); + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + } + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); +} - if (kiov != NULL) - lib_copy_buf2kiov(niov, kiov, offset, - rxmsg->ibm_u.immediate.ibim_payload, - mlen); - else - lib_copy_buf2iov(niov, iov, offset, - rxmsg->ibm_u.immediate.ibim_payload, - mlen); +void +kibnal_abort_txs(kib_conn_t *conn, struct list_head *txs) +{ + LIST_HEAD (zombies); + struct list_head *tmp; + struct list_head *nxt; + kib_tx_t *tx; - lib_finalize (nal, NULL, libmsg, PTL_OK); - return (PTL_OK); + spin_lock(&conn->ibc_lock); - case IBNAL_MSG_GET_RDMA: - /* We get called here just to discard any junk after the - * GET hdr. */ - LASSERT (libmsg == NULL); - lib_finalize (nal, NULL, libmsg, PTL_OK); - return (PTL_OK); + list_for_each_safe (tmp, nxt, txs) { + tx = list_entry (tmp, kib_tx_t, tx_list); - case IBNAL_MSG_PUT_RDMA: - kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, - rx, libmsg, - niov, iov, kiov, offset, mlen); - return (PTL_OK); + if (txs == &conn->ibc_active_txs) { + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_waiting || tx->tx_sending != 0); + } else { + LASSERT (tx->tx_queued); + } + + tx->tx_status = -ECONNABORTED; + tx->tx_queued = 0; + tx->tx_waiting = 0; + + if (tx->tx_sending == 0) { + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } } -} -static ptl_err_t -kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen, size_t rlen) -{ - return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL, - offset, mlen, rlen)); + spin_unlock(&conn->ibc_lock); + + kibnal_txlist_done(&zombies, -ECONNABORTED); } -static ptl_err_t -kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) +void +kibnal_conn_disconnected(kib_conn_t *conn) { - return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov, - offset, mlen, rlen)); -} + static IB_QP_ATTRIBUTES_MODIFY qpam = {.RequestState = QPStateError}; -/***************************************************************************** - * the rest of this file concerns connection management. active connetions - * start with connect_peer, passive connections start with passive_callback. - * active disconnects start with conn_close, cm_callback starts passive - * disconnects and contains the guts of how the disconnect state machine - * progresses. 
- *****************************************************************************/
+ FSTATUS frc;
-int
-kibnal_thread_start (int (*fn)(void *arg), void *arg)
-{
- long pid = kernel_thread (fn, arg, 0);
+ LASSERT (conn->ibc_state >= IBNAL_CONN_INIT_QP);
- if (pid < 0)
- return ((int)pid);
+ kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
- atomic_inc (&kibnal_data.kib_nthreads);
- return (0);
-}
+ /* move QP to error state to make posted work items complete */
+ frc = iba_modify_qp(conn->ibc_qp, &qpam, NULL);
+ if (frc != FSUCCESS)
+ CERROR("can't move qp state to error: %d\n", frc);
-static void
-kibnal_thread_fini (void)
-{
- atomic_dec (&kibnal_data.kib_nthreads);
+ /* Complete all tx descs not waiting for sends to complete.
+ * NB we should be safe from RDMA now that the QP has changed state */
+
+ kibnal_abort_txs(conn, &conn->ibc_tx_queue);
+ kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
+ kibnal_abort_txs(conn, &conn->ibc_tx_queue_nocred);
+ kibnal_abort_txs(conn, &conn->ibc_active_txs);
+
+ kibnal_handle_early_rxs(conn);
 }
-/* this can be called by anyone at any time to close a connection. if
- * the connection is still established it heads to the connd to start
- * the disconnection in a safe context. It has no effect if called
- * on a connection that is already disconnecting */
 void
-kibnal_close_conn_locked (kib_conn_t *conn, int error)
+kibnal_peer_connect_failed (kib_peer_t *peer, int type, int error)
 {
- /* This just does the immmediate housekeeping, and schedules the
- * connection for the connd to finish off.
- * Caller holds kib_global_lock exclusively in irq context */
- kib_peer_t *peer = conn->ibc_peer;
+ LIST_HEAD (zombies);
+ unsigned long flags;
- KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING,
- IBNAL_CONN_DISCONNECTED);
+ LASSERT (error != 0);
+ LASSERT (!in_interrupt());
- if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
- return; /* already disconnecting */
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
- CDEBUG (error == 0 ? 
D_NET : D_ERROR, - "closing conn to "LPX64": error %d\n", peer->ibp_nid, error); + LASSERT (kibnal_peer_connecting(peer)); - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - /* kib_connd_conns takes ibc_list's ref */ - list_del (&conn->ibc_list); - } else { - /* new ref for kib_connd_conns */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); + switch (type) { + case IBNAL_CONN_ACTIVE: + LASSERT (peer->ibp_connecting > 0); + peer->ibp_connecting--; + break; + + case IBNAL_CONN_PASSIVE: + LASSERT (peer->ibp_accepting > 0); + peer->ibp_accepting--; + break; + + case IBNAL_CONN_WAITING: + /* Can't assert; I might be racing with a successful connection + * which clears passivewait */ + peer->ibp_passivewait = 0; + break; + default: + LBUG(); + } + + if (kibnal_peer_connecting(peer) || /* another attempt underway */ + !list_empty(&peer->ibp_conns)) { /* got connected */ + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + return; } + + /* Say when active connection can be re-attempted */ + peer->ibp_reconnect_interval *= 2; + peer->ibp_reconnect_interval = + MAX(peer->ibp_reconnect_interval, + *kibnal_tunables.kib_min_reconnect_interval); + peer->ibp_reconnect_interval = + MIN(peer->ibp_reconnect_interval, + *kibnal_tunables.kib_max_reconnect_interval); - if (list_empty (&peer->ibp_conns) && /* no more conns */ - peer->ibp_persistence == 0 && /* non-persistent peer */ - kibnal_peer_active(peer)) { /* still in peer table */ + peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval * HZ; + + /* Take peer's blocked transmits to complete with error */ + list_add(&zombies, &peer->ibp_tx_queue); + list_del_init(&peer->ibp_tx_queue); + + if (kibnal_peer_active(peer) && + peer->ibp_persistence == 0) { + /* failed connection attempt on non-persistent peer */ kibnal_unlink_peer_locked (peer); } - conn->ibc_state = IBNAL_CONN_SEND_DREQ; + peer->ibp_error = error; + + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - spin_lock (&kibnal_data.kib_connd_lock); + kibnal_peer_notify(peer); - list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock (&kibnal_data.kib_connd_lock); + if (list_empty (&zombies)) + return; + + CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n", + libcfs_nid2str(peer->ibp_nid)); + + kibnal_txlist_done (&zombies, -EHOSTUNREACH); } void -kibnal_close_conn (kib_conn_t *conn, int error) +kibnal_connreq_done (kib_conn_t *conn, int type, int status) { + kib_peer_t *peer = conn->ibc_peer; + struct list_head txs; + kib_tx_t *tx; unsigned long flags; - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + LASSERT (!in_interrupt()); + LASSERT (type == IBNAL_CONN_ACTIVE || type == IBNAL_CONN_PASSIVE); + LASSERT (conn->ibc_state >= IBNAL_CONN_INIT_QP); + LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED); + LASSERT (kibnal_peer_connecting(peer)); - kibnal_close_conn_locked (conn, error); - - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); -} + LIBCFS_FREE(conn->ibc_cvars, sizeof(*conn->ibc_cvars)); + conn->ibc_cvars = NULL; -static void -kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc) -{ - LIST_HEAD (zombies); - kib_tx_t *tx; - unsigned long flags; + if (status != 0) { + /* failed to establish connection */ + kibnal_peer_connect_failed(conn->ibc_peer, type, status); + kibnal_conn_disconnected(conn); + 
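/* NB: the connection was never established, so there is no
+ * disconnect handshake to wait for; drop the CM's ref now */
+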
kibnal_conn_decref(conn); /* Lose CM's ref */ + return; + } - LASSERT (rc != 0); - LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL); + /* connection established */ + LASSERT(conn->ibc_state == IBNAL_CONN_CONNECTING); - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + conn->ibc_last_send = jiffies; + kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED); + kibnal_peer_alive(peer); - LASSERT (peer->ibp_connecting != 0); - peer->ibp_connecting--; + CDEBUG(D_NET, "Connection %s ESTABLISHED\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - if (peer->ibp_connecting != 0) { - /* another connection attempt under way (loopback?)... */ + peer->ibp_passivewait = 0; /* not waiting (got conn now) */ + kibnal_conn_addref(conn); /* +1 ref for ibc_list */ + list_add_tail(&conn->ibc_list, &peer->ibp_conns); + + if (!kibnal_peer_active(peer)) { + /* peer has been deleted */ + kibnal_close_conn_locked(conn, -ECONNABORTED); write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + kibnal_peer_connect_failed(conn->ibc_peer, type, -ECONNABORTED); + kibnal_conn_decref(conn); /* lose CM's ref */ return; } - - if (list_empty(&peer->ibp_conns)) { - /* Say when active connection can be re-attempted */ - peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval; - /* Increase reconnection interval */ - peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2, - IBNAL_MAX_RECONNECT_INTERVAL); - /* Take peer's blocked blocked transmits; I'll complete - * them with error */ - while (!list_empty (&peer->ibp_tx_queue)) { - tx = list_entry (peer->ibp_tx_queue.next, - kib_tx_t, tx_list); - - list_del (&tx->tx_list); - list_add_tail (&tx->tx_list, &zombies); - } - - if (kibnal_peer_active(peer) && - (peer->ibp_persistence == 0)) { - /* failed connection attempt on non-persistent peer */ - kibnal_unlink_peer_locked (peer); - } - } else { - /* Can't have blocked transmits if there are connections */ - LASSERT (list_empty(&peer->ibp_tx_queue)); + switch (type) { + case IBNAL_CONN_ACTIVE: + LASSERT (peer->ibp_connecting > 0); + peer->ibp_connecting--; + break; + + case IBNAL_CONN_PASSIVE: + LASSERT (peer->ibp_accepting > 0); + peer->ibp_accepting--; + break; + default: + LBUG(); } - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + peer->ibp_reconnect_interval = 0; /* OK to reconnect at any time */ - if (!list_empty (&zombies)) - CERROR ("Deleting messages for "LPX64": connection failed\n", - peer->ibp_nid); + /* Nuke any dangling conns from a different peer instance... 
*/ + kibnal_close_stale_conns_locked(peer, conn->ibc_incarnation); - while (!list_empty (&zombies)) { - tx = list_entry (zombies.next, kib_tx_t, tx_list); + /* grab txs blocking for a conn */ + list_add(&txs, &peer->ibp_tx_queue); + list_del_init(&peer->ibp_tx_queue); + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + + /* Schedule blocked txs */ + spin_lock (&conn->ibc_lock); + while (!list_empty (&txs)) { + tx = list_entry (txs.next, kib_tx_t, tx_list); list_del (&tx->tx_list); - /* complete now */ - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); + + kibnal_queue_tx_locked (tx, conn); } + spin_unlock (&conn->ibc_lock); + kibnal_check_sends (conn); } -static void -kibnal_connreq_done (kib_conn_t *conn, int active, int status) +void +kibnal_reject (lnet_nid_t nid, IB_HANDLE cep, int why) { - int state = conn->ibc_state; - kib_peer_t *peer = conn->ibc_peer; - kib_tx_t *tx; - unsigned long flags; - int i; - - /* passive connection has no connreq & vice versa */ - LASSERTF(!active == !(conn->ibc_connreq != NULL), - "%d %p\n", active, conn->ibc_connreq); - if (active) { - PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); - conn->ibc_connreq = NULL; - } - - write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + static CM_REJECT_INFO msgs[3]; + CM_REJECT_INFO *msg = &msgs[why]; + FSTATUS frc; + + LASSERT (why >= 0 && why < sizeof(msgs)/sizeof(msgs[0])); + + /* If I wasn't so lazy, I'd initialise this only once; it's effectively + * read-only... */ + msg->Reason = RC_USER_REJ; + msg->PrivateData[0] = (IBNAL_MSG_MAGIC) & 0xff; + msg->PrivateData[1] = (IBNAL_MSG_MAGIC >> 8) & 0xff; + msg->PrivateData[2] = (IBNAL_MSG_MAGIC >> 16) & 0xff; + msg->PrivateData[3] = (IBNAL_MSG_MAGIC >> 24) & 0xff; + msg->PrivateData[4] = (IBNAL_MSG_VERSION) & 0xff; + msg->PrivateData[5] = (IBNAL_MSG_VERSION >> 8) & 0xff; + msg->PrivateData[6] = why; + + frc = iba_cm_reject(cep, msg); + if (frc != FSUCCESS) + CERROR("Error %d rejecting %s\n", frc, libcfs_nid2str(nid)); +} - LASSERT (peer->ibp_connecting != 0); - - if (status == 0) { - /* connection established... */ - KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING); - conn->ibc_state = IBNAL_CONN_ESTABLISHED; +void +kibnal_check_connreject(kib_conn_t *conn, int type, CM_REJECT_INFO *rej) +{ + kib_peer_t *peer = conn->ibc_peer; + unsigned long flags; + int magic; + int version; + int why; + + LASSERT (type == IBNAL_CONN_ACTIVE || + type == IBNAL_CONN_PASSIVE); + + CDEBUG(D_NET, "%s connection with %s rejected: %d\n", + (type == IBNAL_CONN_ACTIVE) ? "Active" : "Passive", + libcfs_nid2str(peer->ibp_nid), rej->Reason); + + switch (rej->Reason) { + case RC_STALE_CONN: + if (type == IBNAL_CONN_PASSIVE) { + CERROR("Connection to %s rejected (stale QP)\n", + libcfs_nid2str(peer->ibp_nid)); + } else { + CWARN("Connection from %s rejected (stale QP): " + "retrying...\n", libcfs_nid2str(peer->ibp_nid)); - if (!kibnal_peer_active(peer)) { - /* ...but peer deleted meantime */ - status = -ECONNABORTED; + /* retry from scratch to allocate a new conn + * which will use a different QP */ + kibnal_schedule_active_connect(peer, peer->ibp_version); } - } else { - KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP, - IBNAL_CONN_CONNECTING); - } - if (status == 0) { - /* Everything worked! 
*/ - - peer->ibp_connecting--; + /* An FCM_DISCONNECTED callback is still outstanding: give it a + * ref since kibnal_connreq_done() drops the CM's ref on conn + * on failure */ + kibnal_conn_addref(conn); + break; - /* +1 ref for ibc_list; caller(== CM)'s ref remains until - * the IB_CM_IDLE callback */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - list_add (&conn->ibc_list, &peer->ibp_conns); - - /* reset reconnect interval for next attempt */ - peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + case RC_USER_REJ: + magic = (rej->PrivateData[0]) | + (rej->PrivateData[1] << 8) | + (rej->PrivateData[2] << 16) | + (rej->PrivateData[3] << 24); + version = (rej->PrivateData[4]) | + (rej->PrivateData[5] << 8); + why = (rej->PrivateData[6]); + + /* retry with old proto version */ + if (magic == IBNAL_MSG_MAGIC && + version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD && + conn->ibc_version == IBNAL_MSG_VERSION && + type != IBNAL_CONN_PASSIVE) { + /* retry with a new conn */ + CWARN ("Connection to %s refused: " + "retrying with old protocol version 0x%x\n", + libcfs_nid2str(peer->ibp_nid), version); + kibnal_schedule_active_connect(peer, version); + break; + } - /* post blocked sends to the new connection */ - spin_lock (&conn->ibc_lock); - - while (!list_empty (&peer->ibp_tx_queue)) { - tx = list_entry (peer->ibp_tx_queue.next, - kib_tx_t, tx_list); - - list_del (&tx->tx_list); + if (magic != IBNAL_MSG_MAGIC || + version != IBNAL_MSG_VERSION) { + CERROR("%s connection with %s rejected " + "(magic/ver %08x/%d why %d): " + "incompatible protocol\n", + (type == IBNAL_CONN_ACTIVE) ? + "Active" : "Passive", + libcfs_nid2str(peer->ibp_nid), + magic, version, why); + break; + } - /* +1 ref for each tx */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); - kibnal_queue_tx_locked (tx, conn); + if (type == IBNAL_CONN_ACTIVE && + why == IBNAL_REJECT_CONN_RACE) { + /* lost connection race */ + CWARN("Connection to %s rejected: " + "lost connection race\n", + libcfs_nid2str(peer->ibp_nid)); + + write_lock_irqsave(&kibnal_data.kib_global_lock, + flags); + + if (list_empty(&peer->ibp_conns)) { + peer->ibp_passivewait = 1; + peer->ibp_passivewait_deadline = + jiffies + + (*kibnal_tunables.kib_timeout * HZ); + } + write_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); + break; } - - spin_unlock (&conn->ibc_lock); - /* Nuke any dangling conns from a different peer instance... */ - kibnal_close_stale_conns_locked (conn->ibc_peer, - conn->ibc_incarnation); + CERROR("%s connection with %s rejected: %d\n", + (type == IBNAL_CONN_ACTIVE) ? "Active" : "Passive", + libcfs_nid2str(peer->ibp_nid), why); + break; - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + default: + CERROR("%s connection with %s rejected: %d\n", + (type == IBNAL_CONN_ACTIVE) ? 
"Active" : "Passive", + libcfs_nid2str(peer->ibp_nid), rej->Reason); + } + + kibnal_connreq_done(conn, type, -ECONNREFUSED); +} - /* queue up all the receives */ - for (i = 0; i < IBNAL_RX_MSGS; i++) { - /* +1 ref for rx desc */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); +void +kibnal_cm_disconnect_callback(kib_conn_t *conn, CM_CONN_INFO *info) +{ + CDEBUG(D_NET, "%s: state %d, status 0x%x\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + conn->ibc_state, info->Status); + + LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n", - i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg, - conn->ibc_rxs[i].rx_vaddr); + switch (info->Status) { + default: + LBUG(); + break; - kibnal_post_rx (&conn->ibc_rxs[i], 0); - } + case FCM_DISCONNECT_REQUEST: + /* Schedule conn to iba_cm_disconnect() if it wasn't already */ + kibnal_close_conn (conn, 0); + break; - kibnal_check_sends (conn); - return; + case FCM_DISCONNECT_REPLY: /* peer acks my disconnect req */ + case FCM_DISCONNECTED: /* end of TIME_WAIT */ + CDEBUG(D_NET, "Connection %s disconnected.\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kibnal_conn_decref(conn); /* Lose CM's ref */ + break; } +} - /* connection failed */ - if (state == IBNAL_CONN_CONNECTING) { - /* schedule for connd to close */ - kibnal_close_conn_locked (conn, status); - } else { - /* Don't have a CM comm_id; just wait for refs to drain */ - conn->ibc_state = IBNAL_CONN_DISCONNECTED; - } +void +kibnal_cm_passive_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) +{ + kib_conn_t *conn = arg; - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + CDEBUG(D_NET, "status 0x%x\n", info->Status); + + /* Established Connection Notifier */ + switch (info->Status) { + default: + CERROR("Unexpected status %d on Connection %s\n", + info->Status, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + LBUG(); + break; + + case FCM_CONNECT_TIMEOUT: + kibnal_connreq_done(conn, IBNAL_CONN_PASSIVE, -ETIMEDOUT); + break; + + case FCM_CONNECT_REJECT: + kibnal_check_connreject(conn, IBNAL_CONN_PASSIVE, + &info->Info.Reject); + break; - kibnal_peer_connect_failed (conn->ibc_peer, active, status); + case FCM_CONNECT_ESTABLISHED: + kibnal_connreq_done(conn, IBNAL_CONN_PASSIVE, 0); + break; - /* If we didn't establish the connection we don't have to pass - * through the disconnect protocol before dropping the CM ref */ - if (state < IBNAL_CONN_CONNECTING) - kibnal_put_conn (conn); + case FCM_DISCONNECT_REQUEST: + case FCM_DISCONNECT_REPLY: + case FCM_DISCONNECTED: + kibnal_cm_disconnect_callback(conn, info); + break; + } } -static int -kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep, - ptl_nid_t nid, __u64 incarnation, int queue_depth) +int +kibnal_accept (kib_conn_t **connp, IB_HANDLE cep, kib_msg_t *msg, int nob) { - kib_conn_t *conn = kibnal_create_conn(); + lnet_nid_t nid; + kib_conn_t *conn; kib_peer_t *peer; kib_peer_t *peer2; unsigned long flags; + int rc; + + rc = kibnal_unpack_msg(msg, 0, nob); + if (rc != 0) { + /* SILENT! 
kibnal_unpack_msg() complains if required */ + kibnal_reject(LNET_NID_ANY, cep, IBNAL_REJECT_FATAL); + return -EPROTO; + } + + nid = msg->ibm_srcnid; - if (conn == NULL) - return (-ENOMEM); + if (msg->ibm_version != IBNAL_MSG_VERSION) + CWARN("Connection from %s: old protocol version 0x%x\n", + libcfs_nid2str(nid), msg->ibm_version); - if (queue_depth != IBNAL_MSG_QUEUE_SIZE) { - CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n", - nid, queue_depth, IBNAL_MSG_QUEUE_SIZE); - atomic_dec (&conn->ibc_refcount); - kibnal_destroy_conn(conn); - return (-EPROTO); + if (msg->ibm_type != IBNAL_MSG_CONNREQ) { + CERROR("Can't accept %s: bad request type %d (%d expected)\n", + libcfs_nid2str(nid), msg->ibm_type, IBNAL_MSG_CONNREQ); + kibnal_reject(nid, cep, IBNAL_REJECT_FATAL); + return -EPROTO; + } + + if (msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid) { + CERROR("Can't accept %s: bad dst NID %s (%s expected)\n", + libcfs_nid2str(nid), + libcfs_nid2str(msg->ibm_dstnid), + libcfs_nid2str(kibnal_data.kib_ni->ni_nid)); + kibnal_reject(nid, cep, IBNAL_REJECT_FATAL); + return -EPROTO; + } + + if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE || + msg->ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE || + msg->ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) { + CERROR("Reject %s: q %d sz %d frag %d, (%d %d %d expected)\n", + libcfs_nid2str(nid), + msg->ibm_u.connparams.ibcp_queue_depth, + msg->ibm_u.connparams.ibcp_max_msg_size, + msg->ibm_u.connparams.ibcp_max_frags, + IBNAL_MSG_QUEUE_SIZE, + IBNAL_MSG_SIZE, + IBNAL_MAX_RDMA_FRAGS); + kibnal_reject(nid, cep, IBNAL_REJECT_FATAL); + return -EPROTO; + } + + conn = kibnal_create_conn(nid, msg->ibm_version); + if (conn == NULL) { + kibnal_reject(nid, cep, IBNAL_REJECT_NO_RESOURCES); + return -ENOMEM; } /* assume 'nid' is a new peer */ - peer = kibnal_create_peer (nid); - if (peer == NULL) { - CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_dec (&conn->ibc_refcount); - kibnal_destroy_conn(conn); - return (-ENOMEM); + rc = kibnal_create_peer(&peer, nid); + if (rc != 0) { + kibnal_conn_decref(conn); + kibnal_reject(nid, cep, IBNAL_REJECT_NO_RESOURCES); + return -ENOMEM; } write_lock_irqsave (&kibnal_data.kib_global_lock, flags); @@ -2020,456 +2510,253 @@ kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep, if (peer2 == NULL) { /* peer table takes my ref on peer */ list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid)); + LASSERT (peer->ibp_connecting == 0); } else { - kib_peer_decref (peer); + kibnal_peer_decref(peer); peer = peer2; - } - kib_peer_addref(peer); /* +1 ref for conn */ - peer->ibp_connecting++; + if (peer->ibp_connecting != 0 && + peer->ibp_nid < kibnal_data.kib_ni->ni_nid) { + /* Resolve concurrent connection attempts in favour of + * the higher NID */ + write_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); + kibnal_conn_decref(conn); + kibnal_reject(nid, cep, IBNAL_REJECT_CONN_RACE); + return -EALREADY; + } + } - write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + kibnal_peer_addref(peer); /* +1 ref for conn */ + peer->ibp_accepting++; + kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING); conn->ibc_peer = peer; - conn->ibc_state = IBNAL_CONN_CONNECTING; - /* conn->ibc_cep is set when cm_accept is called */ - conn->ibc_incarnation = incarnation; + conn->ibc_incarnation = msg->ibm_srcstamp; conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; + conn->ibc_reserved_credits = 
IBNAL_MSG_QUEUE_SIZE; + LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + <= IBNAL_RX_MSGS); - *connp = conn; - return (0); -} - -static void kibnal_set_qp_state(IB_HANDLE *qp, IB_QP_STATE state) -{ - IB_QP_ATTRIBUTES_MODIFY modify_attr = {0,}; - FSTATUS frc; - - modify_attr.RequestState = state; + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - frc = iibt_qp_modify(qp, &modify_attr, NULL); - if (frc != FSUCCESS) - CERROR("couldn't set qp state to %d, error %d\n", state, frc); + *connp = conn; + return 0; } -static void kibnal_flush_pending(kib_conn_t *conn) +void +kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) { - LIST_HEAD (zombies); - struct list_head *tmp; - struct list_head *nxt; - kib_tx_t *tx; - unsigned long flags; - int done; - - /* NB we wait until the connection has closed before completing - * outstanding passive RDMAs so we can be sure the network can't - * touch the mapped memory any more. */ - KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED); - - /* set the QP to the error state so that we get flush callbacks - * on our posted receives which can then drop their conn refs */ - kibnal_set_qp_state(conn->ibc_qp, QPStateError); - - spin_lock_irqsave (&conn->ibc_lock, flags); - - /* grab passive RDMAs not waiting for the tx callback */ - list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) { - tx = list_entry (tmp, kib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); - /* still waiting for tx callback? */ - if (!tx->tx_passive_rdma_wait) - continue; - - tx->tx_status = -ECONNABORTED; - tx->tx_passive_rdma_wait = 0; - done = (tx->tx_sending == 0); - - if (!done) - continue; - - list_del (&tx->tx_list); - list_add (&tx->tx_list, &zombies); - } + CM_REQUEST_INFO *req = &info->Info.Request; + CM_REPLY_INFO *rep; + kib_conn_t *conn; + FSTATUS frc; + int rc; + + LASSERT(arg == NULL); /* no conn yet for passive */ - /* grab all blocked transmits */ - list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) { - tx = list_entry (tmp, kib_tx_t, tx_list); - - list_del (&tx->tx_list); - list_add (&tx->tx_list, &zombies); + CDEBUG(D_NET, "%x\n", info->Status); + + if (info->Status == FCM_CONNECT_CANCEL) { + up(&kibnal_data.kib_listener_signal); + return; } - spin_unlock_irqrestore (&conn->ibc_lock, flags); + LASSERT (info->Status == FCM_CONNECT_REQUEST); - while (!list_empty(&zombies)) { - tx = list_entry (zombies.next, kib_tx_t, tx_list); + rc = kibnal_accept(&conn, cep, (kib_msg_t *)req->PrivateData, + CM_REQUEST_INFO_USER_LEN); + if (rc != 0) /* kibnal_accept has rejected */ + return; - list_del(&tx->tx_list); - kibnal_tx_done (tx); + conn->ibc_cvars->cv_path = req->PathInfo.Path; + + rc = kibnal_conn_rts(conn, + req->CEPInfo.QPN, + req->CEPInfo.OfferedInitiatorDepth, + req->CEPInfo.OfferedResponderResources, + req->CEPInfo.StartingPSN); + if (rc != 0) { + kibnal_reject(conn->ibc_peer->ibp_nid, cep, + IBNAL_REJECT_NO_RESOURCES); + kibnal_connreq_done(conn, IBNAL_CONN_PASSIVE, -ECONNABORTED); + return; } -} -static void -kibnal_reject (IB_HANDLE cep, uint16_t reason) -{ - CM_REJECT_INFO *rej; + memset(&conn->ibc_cvars->cv_cmci, 0, sizeof(conn->ibc_cvars->cv_cmci)); + rep = &conn->ibc_cvars->cv_cmci.Info.Reply; - PORTAL_ALLOC(rej, sizeof(*rej)); - if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */ - return; + rep->QPN = conn->ibc_cvars->cv_qpattrs.QPNumber; + rep->QKey = conn->ibc_cvars->cv_qpattrs.Qkey; + rep->StartingPSN = 
conn->ibc_cvars->cv_qpattrs.RecvPSN; + rep->EndToEndFlowControl = conn->ibc_cvars->cv_qpattrs.FlowControl; + rep->ArbInitiatorDepth = conn->ibc_cvars->cv_qpattrs.InitiatorDepth; + rep->ArbResponderResources = conn->ibc_cvars->cv_qpattrs.ResponderResources; + rep->TargetAckDelay = kibnal_data.kib_hca_attrs.LocalCaAckDelay; + rep->FailoverAccepted = IBNAL_FAILOVER_ACCEPTED; + rep->RnRRetryCount = req->CEPInfo.RnrRetryCount; + + CLASSERT (CM_REPLY_INFO_USER_LEN >= + offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)); - rej->Reason = reason; - iibt_cm_reject(cep, rej); - PORTAL_FREE(rej, sizeof(*rej)); -} + kibnal_pack_connmsg((kib_msg_t *)rep->PrivateData, + conn->ibc_version, + CM_REPLY_INFO_USER_LEN, + IBNAL_MSG_CONNACK, + conn->ibc_peer->ibp_nid, conn->ibc_incarnation); -static FSTATUS -kibnal_qp_rts(IB_HANDLE qp_handle, __u32 qpn, __u8 resp_res, - IB_PATH_RECORD *path, __u8 init_depth, __u32 send_psn) -{ - IB_QP_ATTRIBUTES_MODIFY modify_attr; - FSTATUS frc; - ENTRY; + LASSERT (conn->ibc_cep == NULL); + kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING); - modify_attr = (IB_QP_ATTRIBUTES_MODIFY) { - .RequestState = QPStateReadyToRecv, - .RecvPSN = IBNAL_STARTING_PSN, - .DestQPNumber = qpn, - .ResponderResources = resp_res, - .MinRnrTimer = UsecToRnrNakTimer(2000), /* 20 ms */ - .Attrs = (IB_QP_ATTR_RECVPSN | - IB_QP_ATTR_DESTQPNUMBER | - IB_QP_ATTR_RESPONDERRESOURCES | - IB_QP_ATTR_DESTAV | - IB_QP_ATTR_PATHMTU | - IB_QP_ATTR_MINRNRTIMER), - }; - GetAVFromPath(0, path, &modify_attr.PathMTU, NULL, - &modify_attr.DestAV); - - frc = iibt_qp_modify(qp_handle, &modify_attr, NULL); - if (frc != FSUCCESS) - RETURN(frc); - - modify_attr = (IB_QP_ATTRIBUTES_MODIFY) { - .RequestState = QPStateReadyToSend, - .FlowControl = TRUE, - .InitiatorDepth = init_depth, - .SendPSN = send_psn, - .LocalAckTimeout = path->PktLifeTime + 2, /* 2 or 1? 
*/ - .RetryCount = IBNAL_RETRY, - .RnrRetryCount = IBNAL_RNR_RETRY, - .Attrs = (IB_QP_ATTR_FLOWCONTROL | - IB_QP_ATTR_INITIATORDEPTH | - IB_QP_ATTR_SENDPSN | - IB_QP_ATTR_LOCALACKTIMEOUT | - IB_QP_ATTR_RETRYCOUNT | - IB_QP_ATTR_RNRRETRYCOUNT), - }; + frc = iba_cm_accept(cep, + &conn->ibc_cvars->cv_cmci, + NULL, + kibnal_cm_passive_callback, conn, + &conn->ibc_cep); - frc = iibt_qp_modify(qp_handle, &modify_attr, NULL); - RETURN(frc); + if (frc == FSUCCESS || frc == FPENDING) + return; + + CERROR("iba_cm_accept(%s) failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); + kibnal_connreq_done(conn, IBNAL_CONN_PASSIVE, -ECONNABORTED); } -static void -kibnal_connect_reply (IB_HANDLE cep, CM_CONN_INFO *info, void *arg) +void +kibnal_check_connreply(kib_conn_t *conn, CM_REPLY_INFO *rep) { - IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; - kib_conn_t *conn = arg; - kib_wire_connreq_t *wcr; - CM_REPLY_INFO *rep = &info->Info.Reply; - uint16_t reason; - FSTATUS frc; + kib_msg_t *msg = (kib_msg_t *)rep->PrivateData; + lnet_nid_t nid = conn->ibc_peer->ibp_nid; + FSTATUS frc; + int rc; - wcr = (kib_wire_connreq_t *)info->Info.Reply.PrivateData; + rc = kibnal_unpack_msg(msg, conn->ibc_version, CM_REPLY_INFO_USER_LEN); + if (rc != 0) { + CERROR ("Error %d unpacking connack from %s\n", + rc, libcfs_nid2str(nid)); + kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_FATAL); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EPROTO); + return; + } + + if (msg->ibm_type != IBNAL_MSG_CONNACK) { + CERROR("Bad connack type %d (%d expected) from %s\n", + msg->ibm_type, IBNAL_MSG_CONNACK, + libcfs_nid2str(msg->ibm_srcnid)); + kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_FATAL); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EPROTO); + return; + } - if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { - CERROR ("Can't connect "LPX64": bad magic %08x\n", - conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic)); - GOTO(reject, reason = RC_USER_REJ); + if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || + msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid || + msg->ibm_dststamp != kibnal_data.kib_incarnation) { + CERROR("Stale connack from %s(%s): %s(%s), "LPX64"("LPX64")\n", + libcfs_nid2str(msg->ibm_srcnid), + libcfs_nid2str(conn->ibc_peer->ibp_nid), + libcfs_nid2str(msg->ibm_dstnid), + libcfs_nid2str(kibnal_data.kib_ni->ni_nid), + msg->ibm_dststamp, kibnal_data.kib_incarnation); + kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_FATAL); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -ESTALE); + return; } - if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { - CERROR ("Can't connect "LPX64": bad version %d\n", - conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_magic)); - GOTO(reject, reason = RC_USER_REJ); - } - - if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) { - CERROR ("Can't connect "LPX64": bad queue depth %d\n", - conn->ibc_peer->ibp_nid, - le16_to_cpu(wcr->wcr_queue_depth)); - GOTO(reject, reason = RC_USER_REJ); + if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE || + msg->ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE || + msg->ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) { + CERROR("Reject %s: q %d sz %d frag %d, (%d %d %d expected)\n", + libcfs_nid2str(msg->ibm_srcnid), + msg->ibm_u.connparams.ibcp_queue_depth, + msg->ibm_u.connparams.ibcp_max_msg_size, + msg->ibm_u.connparams.ibcp_max_frags, + IBNAL_MSG_QUEUE_SIZE, + IBNAL_MSG_SIZE, + IBNAL_MAX_RDMA_FRAGS); + kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_FATAL); + kibnal_connreq_done(conn, 
IBNAL_CONN_ACTIVE, -EPROTO); + return; } - if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) { - CERROR ("Unexpected NID "LPX64" from "LPX64"\n", - le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid); - GOTO(reject, reason = RC_USER_REJ); - } - - CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n", - conn, conn->ibc_peer->ibp_nid); + CDEBUG(D_NET, "Connection %s REP_RECEIVED.\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); - conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation); + conn->ibc_incarnation = msg->ibm_srcstamp; conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; - - frc = kibnal_qp_rts(conn->ibc_qp, rep->QPN, - min_t(__u8, rep->ArbInitiatorDepth, - ca_attr->MaxQPResponderResources), - &conn->ibc_connreq->cr_path, - min_t(__u8, rep->ArbResponderResources, - ca_attr->MaxQPInitiatorDepth), - rep->StartingPSN); - if (frc != FSUCCESS) { - CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n", - conn, conn->ibc_peer->ibp_nid, frc); - GOTO(reject, reason = RC_NO_QP); - } - - /* the callback arguments are ignored for an active accept */ - conn->ibc_connreq->cr_discarded.Status = FSUCCESS; - frc = iibt_cm_accept(cep, &conn->ibc_connreq->cr_discarded, - NULL, NULL, NULL, NULL); - if (frc != FCM_CONNECT_ESTABLISHED) { - CERROR("Connection %p -> "LPX64" CMAccept failed: %d\n", - conn, conn->ibc_peer->ibp_nid, frc); - kibnal_connreq_done (conn, 1, -ECONNABORTED); - /* XXX don't call reject after accept fails? */ + conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE; + LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + <= IBNAL_RX_MSGS); + + rc = kibnal_conn_rts(conn, + rep->QPN, + rep->ArbInitiatorDepth, + rep->ArbResponderResources, + rep->StartingPSN); + if (rc != 0) { + kibnal_reject(nid, conn->ibc_cep, IBNAL_REJECT_NO_RESOURCES); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EIO); return; } - CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n", - conn, conn->ibc_peer->ibp_nid); - - kibnal_connreq_done (conn, 1, 0); - return; + memset(&conn->ibc_cvars->cv_cmci, 0, sizeof(conn->ibc_cvars->cv_cmci)); + + frc = iba_cm_accept(conn->ibc_cep, + &conn->ibc_cvars->cv_cmci, + NULL, NULL, NULL, NULL); -reject: - kibnal_reject(cep, reason); - kibnal_connreq_done (conn, 1, -EPROTO); + if (frc == FCM_CONNECT_ESTABLISHED) { + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, 0); + return; + } + + CERROR("Connection %s CMAccept failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -ECONNABORTED); } -/* ib_cm.h has a wealth of information on the CM procedures */ -static void -kibnal_cm_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) +void +kibnal_cm_active_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) { kib_conn_t *conn = arg; CDEBUG(D_NET, "status 0x%x\n", info->Status); - /* Established Connection Notifier */ switch (info->Status) { default: - CERROR("unknown status %d on Connection %p -> "LPX64"\n", - info->Status, conn, conn->ibc_peer->ibp_nid); + CERROR("unknown status %d on Connection %s\n", + info->Status, libcfs_nid2str(conn->ibc_peer->ibp_nid)); LBUG(); break; - case FCM_CONNECT_REPLY: - kibnal_connect_reply(cep, info, arg); + case FCM_CONNECT_TIMEOUT: + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -ETIMEDOUT); + break; + + case FCM_CONNECT_REJECT: + kibnal_check_connreject(conn, IBNAL_CONN_ACTIVE, + &info->Info.Reject); break; - case FCM_DISCONNECT_REQUEST: - /* XXX lock around these state management bits? 
*/ - if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) - kibnal_close_conn (conn, 0); - conn->ibc_state = IBNAL_CONN_DREP; - iibt_cm_disconnect(conn->ibc_cep, NULL, NULL); + case FCM_CONNECT_REPLY: + kibnal_check_connreply(conn, &info->Info.Reply); break; - /* these both guarantee that no more cm callbacks will occur */ - case FCM_DISCONNECTED: /* aka FCM_DISCONNECT_TIMEOUT */ + case FCM_DISCONNECT_REQUEST: case FCM_DISCONNECT_REPLY: - CDEBUG(D_NET, "Connection %p -> "LPX64" disconnect done.\n", - conn, conn->ibc_peer->ibp_nid); - - conn->ibc_state = IBNAL_CONN_DISCONNECTED; - kibnal_flush_pending(conn); - kibnal_put_conn(conn); /* Lose CM's ref */ + case FCM_DISCONNECTED: + kibnal_cm_disconnect_callback(conn, info); break; } - - return; -} - -static int -kibnal_set_cm_flags(IB_HANDLE cep) -{ - FSTATUS frc; - uint32 value = 1; - - frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK, - (char *)&value, sizeof(value), 0); - if (frc != FSUCCESS) { - CERROR("error setting timeout callback: %d\n", frc); - return -1; - } - -#if 0 - frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value, - sizeof(value), 0); - if (frc != FSUCCESS) { - CERROR("error setting async accept: %d\n", frc); - return -1; - } -#endif - - return 0; } void -kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) -{ - IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; - IB_QP_ATTRIBUTES_QUERY *query; - CM_REQUEST_INFO *req; - CM_CONN_INFO *rep = NULL, *rcv = NULL; - kib_wire_connreq_t *wcr; - kib_conn_t *conn = NULL; - uint16_t reason = 0; - FSTATUS frc; - int rc = 0; - - LASSERT(cep); - LASSERT(info); - LASSERT(arg == NULL); /* no conn yet for passive */ - - CDEBUG(D_NET, "status 0x%x\n", info->Status); - - req = &info->Info.Request; - wcr = (kib_wire_connreq_t *)req->PrivateData; - - CDEBUG(D_NET, "%d from "LPX64"\n", info->Status, - le64_to_cpu(wcr->wcr_nid)); - - if (info->Status == FCM_CONNECT_CANCEL) - return; - - LASSERT (info->Status == FCM_CONNECT_REQUEST); - - if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { - CERROR ("Can't accept: bad magic %08x\n", - le32_to_cpu(wcr->wcr_magic)); - GOTO(out, reason = RC_USER_REJ); - } - - if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { - CERROR ("Can't accept: bad version %d\n", - le16_to_cpu(wcr->wcr_magic)); - GOTO(out, reason = RC_USER_REJ); - } - - rc = kibnal_accept(&conn, cep, - le64_to_cpu(wcr->wcr_nid), - le64_to_cpu(wcr->wcr_incarnation), - le16_to_cpu(wcr->wcr_queue_depth)); - if (rc != 0) { - CERROR ("Can't accept "LPX64": %d\n", - le64_to_cpu(wcr->wcr_nid), rc); - GOTO(out, reason = RC_NO_RESOURCES); - } - - frc = kibnal_qp_rts(conn->ibc_qp, req->CEPInfo.QPN, - min_t(__u8, req->CEPInfo.OfferedInitiatorDepth, - ca_attr->MaxQPResponderResources), - &req->PathInfo.Path, - min_t(__u8, req->CEPInfo.OfferedResponderResources, - ca_attr->MaxQPInitiatorDepth), - req->CEPInfo.StartingPSN); - - if (frc != FSUCCESS) { - CERROR ("Can't mark QP RTS/RTR "LPX64": %d\n", - le64_to_cpu(wcr->wcr_nid), frc); - GOTO(out, reason = RC_NO_QP); - } - - frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_qp_attrs, NULL); - if (frc != FSUCCESS) { - CERROR ("Couldn't query qp attributes "LPX64": %d\n", - le64_to_cpu(wcr->wcr_nid), frc); - GOTO(out, reason = RC_NO_QP); - } - query = &conn->ibc_qp_attrs; - - PORTAL_ALLOC(rep, sizeof(*rep)); - PORTAL_ALLOC(rcv, sizeof(*rcv)); - if (rep == NULL || rcv == NULL) { - if (rep) PORTAL_FREE(rep, sizeof(*rep)); - if (rcv) PORTAL_FREE(rcv, sizeof(*rcv)); - CERROR ("can't allocate reply and receive buffers\n"); - 
GOTO(out, reason = RC_INSUFFICIENT_RESP_RES); - } - - /* don't try to deref this into the incoming wcr :) */ - wcr = (kib_wire_connreq_t *)rep->Info.Reply.PrivateData; - - rep->Info.Reply = (CM_REPLY_INFO) { - .QPN = query->QPNumber, - .QKey = query->Qkey, - .StartingPSN = query->RecvPSN, - .EndToEndFlowControl = query->FlowControl, - /* XXX Hmm. */ - .ArbInitiatorDepth = query->InitiatorDepth, - .ArbResponderResources = query->ResponderResources, - .TargetAckDelay = 0, - .FailoverAccepted = 0, - .RnRRetryCount = req->CEPInfo.RnrRetryCount, - }; - - *wcr = (kib_wire_connreq_t) { - .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), - .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), - .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE), - .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), - .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), - }; - - frc = iibt_cm_accept(cep, rep, rcv, kibnal_cm_callback, conn, - &conn->ibc_cep); - - PORTAL_FREE(rep, sizeof(*rep)); - PORTAL_FREE(rcv, sizeof(*rcv)); - - if (frc != FCM_CONNECT_ESTABLISHED) { - /* XXX it seems we don't call reject after this point? */ - CERROR("iibt_cm_accept() failed: %d, aborting\n", frc); - rc = -ECONNABORTED; - goto out; - } - - if (kibnal_set_cm_flags(conn->ibc_cep)) { - rc = -ECONNABORTED; - goto out; - } - - CWARN("Connection %p -> "LPX64" ESTABLISHED.\n", - conn, conn->ibc_peer->ibp_nid); - -out: - if (reason) { - kibnal_reject(cep, reason); - rc = -ECONNABORTED; - } - if (conn != NULL) - kibnal_connreq_done(conn, 0, rc); - - return; -} - -static void dump_path_records(PATH_RESULTS *results) { IB_PATH_RECORD *path; int i; - for(i = 0; i < results->NumPathRecords; i++) { + for (i = 0; i < results->NumPathRecords; i++) { path = &results->PathRecords[i]; CDEBUG(D_NET, "%d: sgid "LPX64":"LPX64" dgid " LPX64":"LPX64" pkey %x\n", @@ -2482,110 +2769,104 @@ dump_path_records(PATH_RESULTS *results) } } -static void -kibnal_pathreq_callback (void *arg, QUERY *query, - QUERY_RESULT_VALUES *query_res) +void +kibnal_pathreq_callback (void *arg, QUERY *qry, + QUERY_RESULT_VALUES *qrslt) { - IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; - kib_conn_t *conn = arg; - PATH_RESULTS *path; - FSTATUS frc; + IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs; + kib_conn_t *conn = arg; + CM_REQUEST_INFO *req = &conn->ibc_cvars->cv_cmci.Info.Request; + PATH_RESULTS *path = (PATH_RESULTS *)qrslt->QueryResult; + FSTATUS frc; - if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) { - CERROR ("status %d data size %d\n", query_res->Status, - query_res->ResultDataSize); - kibnal_connreq_done (conn, 1, -EINVAL); + if (qrslt->Status != FSUCCESS || + qrslt->ResultDataSize < sizeof(*path)) { + CDEBUG (D_NETERROR, "pathreq %s failed: status %d data size %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + qrslt->Status, qrslt->ResultDataSize); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); return; } - path = (PATH_RESULTS *)query_res->QueryResult; - if (path->NumPathRecords < 1) { - CERROR ("expected path records: %d\n", path->NumPathRecords); - kibnal_connreq_done (conn, 1, -EINVAL); + CDEBUG (D_NETERROR, "pathreq %s failed: no path records\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); return; } - dump_path_records(path); + //dump_path_records(path); + conn->ibc_cvars->cv_path = path->PathRecords[0]; - /* just using the first. this is probably a horrible idea. 
*/ - conn->ibc_connreq->cr_path = path->PathRecords[0]; + LASSERT (conn->ibc_cep == NULL); - conn->ibc_cep = iibt_cm_create_cep(CM_RC_TYPE); + conn->ibc_cep = kibnal_create_cep(conn->ibc_peer->ibp_nid); if (conn->ibc_cep == NULL) { - CERROR ("Can't create CEP\n"); - kibnal_connreq_done (conn, 1, -EINVAL); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -ENOMEM); return; } - if (kibnal_set_cm_flags(conn->ibc_cep)) { - kibnal_connreq_done (conn, 1, -EINVAL); - return; + memset(req, 0, sizeof(*req)); + req->SID = conn->ibc_cvars->cv_svcrec.RID.ServiceID; + req->CEPInfo.CaGUID = kibnal_data.kib_hca_guids[kibnal_data.kib_hca_idx]; + req->CEPInfo.EndToEndFlowControl = IBNAL_EE_FLOW; + req->CEPInfo.PortGUID = conn->ibc_cvars->cv_path.SGID.Type.Global.InterfaceID; + req->CEPInfo.RetryCount = IBNAL_RETRY; + req->CEPInfo.RnrRetryCount = IBNAL_RNR_RETRY; + req->CEPInfo.AckTimeout = IBNAL_ACK_TIMEOUT; + req->CEPInfo.StartingPSN = IBNAL_STARTING_PSN; + req->CEPInfo.QPN = conn->ibc_cvars->cv_qpattrs.QPNumber; + req->CEPInfo.QKey = conn->ibc_cvars->cv_qpattrs.Qkey; + req->CEPInfo.OfferedResponderResources = ca_attr->MaxQPResponderResources; + req->CEPInfo.OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth; + req->PathInfo.bSubnetLocal = IBNAL_LOCAL_SUB; + req->PathInfo.Path = conn->ibc_cvars->cv_path; + + CLASSERT (CM_REQUEST_INFO_USER_LEN >= + offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t)); + + kibnal_pack_connmsg((kib_msg_t *)req->PrivateData, + conn->ibc_version, + CM_REQUEST_INFO_USER_LEN, + IBNAL_MSG_CONNREQ, + conn->ibc_peer->ibp_nid, 0); + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto test */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + ((kib_msg_t *)req->PrivateData)->ibm_version++; + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + ((kib_msg_t *)req->PrivateData)->ibm_magic = + LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); } - conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) { - .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), - .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), - .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE), - .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), - .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), - }; - - conn->ibc_connreq->cr_cmreq = (CM_REQUEST_INFO) { - .SID = conn->ibc_connreq->cr_service.RID.ServiceID, - .CEPInfo = (CM_CEP_INFO) { - .CaGUID = kibnal_data.kib_hca_guids[0], - .EndToEndFlowControl = FALSE, - .PortGUID = conn->ibc_connreq->cr_path.SGID.Type.Global.InterfaceID, - .RetryCount = IBNAL_RETRY, - .RnrRetryCount = IBNAL_RNR_RETRY, - .AckTimeout = IBNAL_ACK_TIMEOUT, - .StartingPSN = IBNAL_STARTING_PSN, - .QPN = conn->ibc_qp_attrs.QPNumber, - .QKey = conn->ibc_qp_attrs.Qkey, - .OfferedResponderResources = ca_attr->MaxQPResponderResources, - .OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth, - }, - .PathInfo = (CM_CEP_PATHINFO) { - .bSubnetLocal = TRUE, - .Path = conn->ibc_connreq->cr_path, - }, - }; - -#if 0 - /* XXX set timeout just like SDP!!!*/ - conn->ibc_connreq->cr_path.packet_life = 13; -#endif /* Flag I'm getting involved with the CM... 
*/ - conn->ibc_state = IBNAL_CONN_CONNECTING; - - CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n", - conn->ibc_connreq->cr_service.RID.ServiceID, - *kibnal_service_nid_field(&conn->ibc_connreq->cr_service)); - - memset(conn->ibc_connreq->cr_cmreq.PrivateData, 0, - CM_REQUEST_INFO_USER_LEN); - memcpy(conn->ibc_connreq->cr_cmreq.PrivateData, - &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr)); + kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING); - /* kibnal_cm_callback gets my conn ref */ - frc = iibt_cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cmreq, - kibnal_cm_callback, conn); - if (frc != FPENDING && frc != FSUCCESS) { - CERROR ("Connect: %d\n", frc); - /* Back out state change as connect failed */ - conn->ibc_state = IBNAL_CONN_INIT_QP; - kibnal_connreq_done (conn, 1, -EINVAL); - } + /* cm callback gets my conn ref */ + frc = iba_cm_connect(conn->ibc_cep, req, + kibnal_cm_active_callback, conn); + if (frc == FPENDING || frc == FSUCCESS) + return; + + CERROR ("Connect %s failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); } -static void -dump_service_records(SERVICE_RECORD_RESULTS *results) +void +kibnal_dump_service_records(SERVICE_RECORD_RESULTS *results) { IB_SERVICE_RECORD *svc; int i; - for(i = 0; i < results->NumServiceRecords; i++) { + for (i = 0; i < results->NumServiceRecords; i++) { svc = &results->ServiceRecords[i]; CDEBUG(D_NET, "%d: sid "LPX64" gid "LPX64":"LPX64" pkey %x\n", i, @@ -2596,161 +2877,147 @@ dump_service_records(SERVICE_RECORD_RESULTS *results) } } - -static void -kibnal_service_get_callback (void *arg, QUERY *query, - QUERY_RESULT_VALUES *query_res) +void +kibnal_service_get_callback (void *arg, QUERY *qry, + QUERY_RESULT_VALUES *qrslt) { - kib_conn_t *conn = arg; - SERVICE_RECORD_RESULTS *svc; - COMMAND_CONTROL_PARAMETERS sd_params; - QUERY path_query; - FSTATUS frc; - - if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) { - CERROR ("status %d data size %d\n", query_res->Status, - query_res->ResultDataSize); - kibnal_connreq_done (conn, 1, -EINVAL); + kib_conn_t *conn = arg; + SERVICE_RECORD_RESULTS *svc; + FSTATUS frc; + + if (qrslt->Status != FSUCCESS || + qrslt->ResultDataSize < sizeof(*svc)) { + CDEBUG (D_NETERROR, "Lookup %s failed: status %d data size %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + qrslt->Status, qrslt->ResultDataSize); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); return; } - svc = (SERVICE_RECORD_RESULTS *)query_res->QueryResult; - + svc = (SERVICE_RECORD_RESULTS *)qrslt->QueryResult; if (svc->NumServiceRecords < 1) { - CERROR ("%d service records\n", svc->NumServiceRecords); - kibnal_connreq_done (conn, 1, -EINVAL); + CDEBUG (D_NETERROR, "lookup %s failed: no service records\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); return; } - dump_service_records(svc); + //kibnal_dump_service_records(svc); + conn->ibc_cvars->cv_svcrec = svc->ServiceRecords[0]; - conn->ibc_connreq->cr_service = svc->ServiceRecords[0]; + qry = &conn->ibc_cvars->cv_query; + memset(qry, 0, sizeof(*qry)); - CDEBUG(D_NET, "Got status %d, service id "LPX64", on "LPX64"\n", - query_res->Status , conn->ibc_connreq->cr_service.RID.ServiceID, - *kibnal_service_nid_field(&conn->ibc_connreq->cr_service)); + qry->OutputType = OutputTypePathRecord; + qry->InputType = InputTypePortGuidPair; - memset(&path_query, 0, sizeof(path_query)); - path_query.InputType = 
InputTypePortGuidPair; - path_query.OutputType = OutputTypePathRecord; - path_query.InputValue.PortGuidPair.SourcePortGuid = kibnal_data.kib_port_guid; - path_query.InputValue.PortGuidPair.DestPortGuid = conn->ibc_connreq->cr_service.RID.ServiceGID.Type.Global.InterfaceID; + qry->InputValue.PortGuidPair.SourcePortGuid = + kibnal_data.kib_port_guid; + qry->InputValue.PortGuidPair.DestPortGuid = + conn->ibc_cvars->cv_svcrec.RID.ServiceGID.Type.Global.InterfaceID; - memset(&sd_params, 0, sizeof(sd_params)); - sd_params.RetryCount = IBNAL_RETRY; - sd_params.Timeout = 10 * 1000; /* wait 10 seconds */ - - /* kibnal_service_get_callback gets my conn ref */ - - frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, - kibnal_data.kib_port_guid, - &path_query, - kibnal_pathreq_callback, - &sd_params, conn); + /* kibnal_pathreq_callback gets my conn ref */ + frc = iba_sd_query_port_fabric_info(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + qry, + kibnal_pathreq_callback, + &kibnal_data.kib_sdretry, + conn); if (frc == FPENDING) return; - CERROR ("Path record request failed: %d\n", frc); - kibnal_connreq_done (conn, 1, -EINVAL); + CERROR ("pathreq %s failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); + kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); } -static void +void kibnal_connect_peer (kib_peer_t *peer) { - COMMAND_CONTROL_PARAMETERS sd_params; - QUERY query; - FSTATUS frc; - kib_conn_t *conn = kibnal_create_conn(); + QUERY *qry; + FSTATUS frc; + kib_conn_t *conn; LASSERT (peer->ibp_connecting != 0); + conn = kibnal_create_conn(peer->ibp_nid, peer->ibp_version); if (conn == NULL) { CERROR ("Can't allocate conn\n"); - kibnal_peer_connect_failed (peer, 1, -ENOMEM); + kibnal_peer_connect_failed(peer, IBNAL_CONN_ACTIVE, -ENOMEM); return; } conn->ibc_peer = peer; - kib_peer_addref(peer); - - PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); - if (conn->ibc_connreq == NULL) { - CERROR ("Can't allocate connreq\n"); - kibnal_connreq_done (conn, 1, -ENOMEM); - return; - } - - memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq)); + kibnal_peer_addref(peer); - kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid); + qry = &conn->ibc_cvars->cv_query; + memset(qry, 0, sizeof(*qry)); - memset(&query, 0, sizeof(query)); - query.InputType = InputTypeServiceRecord; - query.OutputType = OutputTypeServiceRecord; - query.InputValue.ServiceRecordValue.ServiceRecord = conn->ibc_connreq->cr_service; - query.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK; + qry->OutputType = OutputTypeServiceRecord; + qry->InputType = InputTypeServiceRecord; - memset(&sd_params, 0, sizeof(sd_params)); - sd_params.RetryCount = IBNAL_RETRY; - sd_params.Timeout = 10 * 1000; /* wait 10 seconds */ + qry->InputValue.ServiceRecordValue.ComponentMask = + KIBNAL_SERVICE_KEY_MASK; + kibnal_set_service_keys( + &qry->InputValue.ServiceRecordValue.ServiceRecord, + peer->ibp_nid); /* kibnal_service_get_callback gets my conn ref */ - frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd, - kibnal_data.kib_port_guid, - &query, - kibnal_service_get_callback, - &sd_params, conn); + frc = iba_sd_query_port_fabric_info(kibnal_data.kib_sd, + kibnal_data.kib_port_guid, + qry, + kibnal_service_get_callback, + &kibnal_data.kib_sdretry, + conn); if (frc == FPENDING) return; - CERROR ("iibt_sd_query_port_fabric_information(): %d\n", frc); - kibnal_connreq_done (conn, 1, frc); + CERROR("Lookup %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), frc); + 
kibnal_connreq_done(conn, IBNAL_CONN_ACTIVE, -EHOSTUNREACH); } -static int -kibnal_conn_timed_out (kib_conn_t *conn) +int +kibnal_check_txs (kib_conn_t *conn, struct list_head *txs) { kib_tx_t *tx; struct list_head *ttmp; - unsigned long flags; + int timed_out = 0; - spin_lock_irqsave (&conn->ibc_lock, flags); + spin_lock(&conn->ibc_lock); - list_for_each (ttmp, &conn->ibc_tx_queue) { + list_for_each (ttmp, txs) { tx = list_entry (ttmp, kib_tx_t, tx_list); - LASSERT (!tx->tx_passive_rdma_wait); - LASSERT (tx->tx_sending == 0); - - if (time_after_eq (jiffies, tx->tx_deadline)) { - spin_unlock_irqrestore (&conn->ibc_lock, flags); - return 1; + if (txs == &conn->ibc_active_txs) { + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_waiting || tx->tx_sending != 0); + } else { + LASSERT (tx->tx_queued); } - } - - list_for_each (ttmp, &conn->ibc_active_txs) { - tx = list_entry (ttmp, kib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); if (time_after_eq (jiffies, tx->tx_deadline)) { - spin_unlock_irqrestore (&conn->ibc_lock, flags); - return 1; + timed_out = 1; + break; } } - spin_unlock_irqrestore (&conn->ibc_lock, flags); + spin_unlock(&conn->ibc_lock); + return timed_out; +} - return 0; +int +kibnal_conn_timed_out (kib_conn_t *conn) +{ + return kibnal_check_txs(conn, &conn->ibc_tx_queue) || + kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) || + kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) || + kibnal_check_txs(conn, &conn->ibc_active_txs); } -static void -kibnal_check_conns (int idx) +void +kibnal_check_peers (int idx) { + rwlock_t *rwlock = &kibnal_data.kib_global_lock; struct list_head *peers = &kibnal_data.kib_peers[idx]; struct list_head *ptmp; kib_peer_t *peer; @@ -2762,15 +3029,33 @@ kibnal_check_conns (int idx) /* NB. We expect to have a look at all the peers and not find any * rdmas to time out, so we just use a shared lock while we * take a look... */ - read_lock_irqsave(&kibnal_data.kib_global_lock, flags); + read_lock_irqsave(rwlock, flags); list_for_each (ptmp, peers) { peer = list_entry (ptmp, kib_peer_t, ibp_list); + if (peer->ibp_passivewait) { + LASSERT (list_empty(&peer->ibp_conns)); + + if (!time_after_eq(jiffies, + peer->ibp_passivewait_deadline)) + continue; + + kibnal_peer_addref(peer); /* ++ ref for me... */ + read_unlock_irqrestore(rwlock, flags); + + kibnal_peer_connect_failed(peer, IBNAL_CONN_WAITING, + -ETIMEDOUT); + kibnal_peer_decref(peer); /* ...until here */ + + /* start again now I've dropped the lock */ + goto again; + } + list_for_each (ctmp, &peer->ibp_conns) { conn = list_entry (ctmp, kib_conn_t, ibc_list); - KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED); + LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED); /* In case we have enough credits to return via a * NOOP, but there were no non-blocking tx descs @@ -2779,60 +3064,57 @@ kibnal_check_conns (int idx) if (!kibnal_conn_timed_out(conn)) continue; + + /* Handle timeout by closing the whole connection. We + * can only be sure RDMA activity has ceased once the + * QP has been modified. */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); + kibnal_conn_addref(conn); /* 1 ref for me... 
*/ - atomic_inc (&conn->ibc_refcount); - read_unlock_irqrestore(&kibnal_data.kib_global_lock, - flags); + read_unlock_irqrestore(rwlock, flags); - CERROR("Timed out RDMA with "LPX64"\n", - peer->ibp_nid); + CERROR("Timed out RDMA with %s\n", + libcfs_nid2str(peer->ibp_nid)); kibnal_close_conn (conn, -ETIMEDOUT); - kibnal_put_conn (conn); + kibnal_conn_decref(conn); /* ...until here */ /* start again now I've dropped the lock */ goto again; } } - read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + read_unlock_irqrestore(rwlock, flags); } -static void -kib_connd_handle_state(kib_conn_t *conn) +void +kibnal_disconnect_conn (kib_conn_t *conn) { - FSTATUS frc; - - switch (conn->ibc_state) { - /* all refs have gone, free and be done with it */ - case IBNAL_CONN_DISCONNECTED: - kibnal_destroy_conn (conn); - return; /* avoid put_conn */ + FSTATUS frc; - case IBNAL_CONN_SEND_DREQ: - frc = iibt_cm_disconnect(conn->ibc_cep, NULL, NULL); - if (frc != FSUCCESS) /* XXX do real things */ - CERROR("disconnect failed: %d\n", frc); - conn->ibc_state = IBNAL_CONN_DREQ; - break; + LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTING); - /* a callback got to the conn before we did */ - case IBNAL_CONN_DREP: - break; - - default: - CERROR ("Bad conn %p state: %d\n", conn, - conn->ibc_state); - LBUG(); - break; + kibnal_conn_disconnected(conn); + + frc = iba_cm_disconnect(conn->ibc_cep, NULL, NULL); + switch (frc) { + case FSUCCESS: + break; + + case FINSUFFICIENT_RESOURCES: + CERROR("ENOMEM disconnecting %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* This might cause the module to become unloadable since the + * FCM_DISCONNECTED callback is still outstanding */ + break; + + default: + CERROR("Unexpected error disconnecting %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), frc); + LBUG(); } - /* drop ref from close_conn */ - kibnal_put_conn(conn); + kibnal_peer_notify(conn->ibc_peer); } int @@ -2844,27 +3126,43 @@ kibnal_connd (void *arg) kib_peer_t *peer; int timeout; int i; + int did_something; int peer_index = 0; unsigned long deadline = jiffies; - kportal_daemonize ("kibnal_connd"); - kportal_blockallsigs (); + cfs_daemonize ("kibnal_connd"); + cfs_block_allsigs (); init_waitqueue_entry (&wait, current); - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); + + while (!kibnal_data.kib_shutdown) { + did_something = 0; + + if (!list_empty (&kibnal_data.kib_connd_zombies)) { + conn = list_entry (kibnal_data.kib_connd_zombies.next, + kib_conn_t, ibc_list); + list_del (&conn->ibc_list); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + did_something = 1; + + kibnal_destroy_conn(conn); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + } - for (;;) { if (!list_empty (&kibnal_data.kib_connd_conns)) { conn = list_entry (kibnal_data.kib_connd_conns.next, kib_conn_t, ibc_list); list_del (&conn->ibc_list); - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - kib_connd_handle_state(conn); + did_something = 1; + kibnal_disconnect_conn(conn); + kibnal_conn_decref(conn); + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); - continue; } if (!list_empty (&kibnal_data.kib_connd_peers)) { @@ -2873,26 +3171,22 @@ kibnal_connd (void *arg) list_del_init (&peer->ibp_connd_list); spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + did_something = 1; kibnal_connect_peer (peer); - kib_peer_decref (peer); + kibnal_peer_decref (peer); spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); } - /* 
shut down and nobody left to reap... */ - if (kibnal_data.kib_shutdown && - atomic_read(&kibnal_data.kib_nconns) == 0) - break; - - spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); - /* careful with the jiffy wrap... */ while ((timeout = (int)(deadline - jiffies)) <= 0) { const int n = 4; const int p = 1; int chunk = kibnal_data.kib_peer_hash_size; + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + /* Time to check for RDMA timeouts on a few more * peers: I do checks every 'p' seconds on a * proportion of the peer table and I need to check @@ -2901,22 +3195,27 @@ kibnal_connd (void *arg) * connection within (n+1)/n times the timeout * interval. */ - if (kibnal_tunables.kib_io_timeout > n * p) + if (*kibnal_tunables.kib_timeout > n * p) chunk = (chunk * n * p) / - kibnal_tunables.kib_io_timeout; + *kibnal_tunables.kib_timeout; if (chunk == 0) chunk = 1; for (i = 0; i < chunk; i++) { - kibnal_check_conns (peer_index); + kibnal_check_peers (peer_index); peer_index = (peer_index + 1) % kibnal_data.kib_peer_hash_size; } deadline += p * HZ; + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + did_something = 1; } - kibnal_data.kib_connd_waketime = jiffies + timeout; + if (did_something) + continue; + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); set_current_state (TASK_INTERRUPTIBLE); add_wait_queue (&kibnal_data.kib_connd_waitq, &wait); @@ -2938,78 +3237,149 @@ kibnal_connd (void *arg) return (0); } + +void +kibnal_hca_async_callback (void *hca_arg, IB_EVENT_RECORD *ev) +{ + /* XXX flesh out. this seems largely for async errors */ + CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode); +} + +void +kibnal_hca_callback (void *hca_arg, void *cq_arg) +{ + unsigned long flags; + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + kibnal_data.kib_ready = 1; + wake_up(&kibnal_data.kib_sched_waitq); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); +} + int kibnal_scheduler(void *arg) { - long id = (long)arg; - char name[16]; - kib_rx_t *rx; - kib_tx_t *tx; - unsigned long flags; - int rc; - int counter = 0; - int did_something; + long id = (long)arg; + wait_queue_t wait; + char name[16]; + FSTATUS frc; + FSTATUS frc2; + IB_WORK_COMPLETION wc; + kib_rx_t *rx; + unsigned long flags; + __u64 rxseq = 0; + int busy_loops = 0; snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); - kportal_daemonize(name); - kportal_blockallsigs(); + cfs_daemonize(name); + cfs_block_allsigs(); - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + init_waitqueue_entry(&wait, current); - for (;;) { - did_something = 0; + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); - while (!list_empty(&kibnal_data.kib_sched_txq)) { - tx = list_entry(kibnal_data.kib_sched_txq.next, - kib_tx_t, tx_list); - list_del(&tx->tx_list); + while (!kibnal_data.kib_shutdown) { + if (busy_loops++ >= IBNAL_RESCHED) { spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - kibnal_tx_done(tx); - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); + our_cond_resched(); + busy_loops = 0; + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); } - if (!list_empty(&kibnal_data.kib_sched_rxq)) { - rx = list_entry(kibnal_data.kib_sched_rxq.next, - kib_rx_t, rx_list); - list_del(&rx->rx_list); + if (kibnal_data.kib_ready && + !kibnal_data.kib_checking_cq) { + /* take ownership of completion polling */ + kibnal_data.kib_checking_cq = 1; + /* Assume I'll exhaust the CQ */ + kibnal_data.kib_ready = 0; spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); + + 
frc = iba_poll_cq(kibnal_data.kib_cq, &wc); + if (frc == FNOT_DONE) { + /* CQ empty */ + frc2 = iba_rearm_cq(kibnal_data.kib_cq, + CQEventSelNextWC); + LASSERT (frc2 == FSUCCESS); + } + + if (frc == FSUCCESS && + kibnal_wreqid2type(wc.WorkReqId) == IBNAL_WID_RX) { + rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.WorkReqId); + + /* Grab the RX sequence number NOW before + * anyone else can get an RX completion */ + rxseq = rx->rx_conn->ibc_rxseq++; + } + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + /* give up ownership of completion polling */ + kibnal_data.kib_checking_cq = 0; - kibnal_rx(rx); + if (frc == FNOT_DONE) + continue; - did_something = 1; - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); - } + LASSERT (frc == FSUCCESS); + /* Assume there's more: get another scheduler to check + * while I handle this completion... */ - /* shut down and no receives to complete... */ - if (kibnal_data.kib_shutdown && - atomic_read(&kibnal_data.kib_nconns) == 0) - break; + kibnal_data.kib_ready = 1; + wake_up(&kibnal_data.kib_sched_waitq); - /* nothing to do or hogging CPU */ - if (!did_something || counter++ == IBNAL_RESCHED) { spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); - counter = 0; - - if (!did_something) { - rc = wait_event_interruptible( - kibnal_data.kib_sched_waitq, - !list_empty(&kibnal_data.kib_sched_txq) || - !list_empty(&kibnal_data.kib_sched_rxq) || - (kibnal_data.kib_shutdown && - atomic_read (&kibnal_data.kib_nconns) == 0)); - } else { - our_cond_resched(); - } - spin_lock_irqsave(&kibnal_data.kib_sched_lock, - flags); + switch (kibnal_wreqid2type(wc.WorkReqId)) { + case IBNAL_WID_RX: + kibnal_rx_complete(&wc, rxseq); + break; + + case IBNAL_WID_TX: + kibnal_tx_complete(&wc); + break; + + case IBNAL_WID_RDMA: + /* We only get RDMA completion notification if + * it fails. So we just ignore them completely + * because... + * + * 1) If an RDMA fails, all subsequent work + * items, including the final SEND will fail + * too, so I'm still guaranteed to notice that + * this connection is hosed. + * + * 2) It's positively dangerous to look inside + * the tx descriptor obtained from an RDMA work + * item. As soon as I drop the kib_sched_lock, + * I give a scheduler on another CPU a chance + * to get the final SEND completion, so the tx + * descriptor can get freed as I inspect it. */ + CERROR ("RDMA failed: %d\n", wc.Status); + break; + + default: + LBUG(); + } + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + continue; } + + /* Nothing to do; sleep... 
*/ + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&kibnal_data.kib_sched_waitq, &wait); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, + flags); + + schedule(); + + remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait); + set_current_state(TASK_RUNNING); + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); } spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); @@ -3017,13 +3387,3 @@ kibnal_scheduler(void *arg) kibnal_thread_fini(); return (0); } - - -lib_nal_t kibnal_lib = { - libnal_data: &kibnal_data, /* NAL private data */ - libnal_send: kibnal_send, - libnal_send_pages: kibnal_send_pages, - libnal_recv: kibnal_recv, - libnal_recv_pages: kibnal_recv_pages, - libnal_dist: kibnal_dist -}; diff --git a/lnet/klnds/iiblnd/iiblnd_modparams.c b/lnet/klnds/iiblnd/iiblnd_modparams.c new file mode 100644 index 0000000..ceb6e5d --- /dev/null +++ b/lnet/klnds/iiblnd/iiblnd_modparams.c @@ -0,0 +1,179 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "iiblnd.h" + +static char *ipif_basename = "ib"; +CFS_MODULE_PARM(ipif_basename, "s", charp, 0444, + "IPoIB interface base name"); + +static char *service_name = "iiblnd"; +CFS_MODULE_PARM(service_name, "s", charp, 0444, + "IB service name"); + +static int service_number = 0x11b9a2; +CFS_MODULE_PARM(service_number, "i", int, 0444, + "IB service number"); + +static int min_reconnect_interval = 1; +CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644, + "minimum connection retry interval (seconds)"); + +static int max_reconnect_interval = 60; +CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644, + "maximum connection retry interval (seconds)"); + +static int concurrent_peers = 1152; +CFS_MODULE_PARM(concurrent_peers, "i", int, 0444, + "maximum number of peers that may connect"); + +static int cksum = 0; +CFS_MODULE_PARM(cksum, "i", int, 0644, + "set non-zero to enable message (not RDMA) checksums"); + +static int timeout = 50; +CFS_MODULE_PARM(timeout, "i", int, 0644, + "timeout (seconds)"); + +static int ntx = 256; +CFS_MODULE_PARM(ntx, "i", int, 0444, + "# of message descriptors"); + +static int credits = 128; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = 8; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, + "# concurrent sends to 1 peer"); + +static int sd_retries = 8; +CFS_MODULE_PARM(sd_retries, "i", int, 0444, + "# times to retry SD queries"); + +static int keepalive = 100; +CFS_MODULE_PARM(keepalive, "i", int, 0644, + "Idle time in seconds before sending a keepalive"); + +static int concurrent_sends = IBNAL_RX_MSGS; +CFS_MODULE_PARM(concurrent_sends, "i", int, 0644, + "Send work queue sizing"); + +kib_tunables_t kibnal_tunables = { + .kib_ipif_basename = &ipif_basename, + .kib_service_name = &service_name, + .kib_service_number = &service_number, + .kib_min_reconnect_interval = &min_reconnect_interval, + .kib_max_reconnect_interval = &max_reconnect_interval, + .kib_concurrent_peers = &concurrent_peers, + .kib_cksum = &cksum, + .kib_timeout = &timeout, + .kib_keepalive = &keepalive, + .kib_ntx = &ntx, + .kib_credits = &credits, + .kib_peercredits = &peer_credits, + .kib_sd_retries = &sd_retries, + .kib_concurrent_sends = &concurrent_sends, +}; + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + +/* NB max_size specified for proc_dostring entries only needs to be big enough + * not to truncate the printout; it only needs to be the actual size of the + * string buffer if we allow writes (and we don't) */ + +static ctl_table kibnal_ctl_table[] = { + {1, "ipif_basename", &ipif_basename, + 1024, 0444, NULL, &proc_dostring}, + {2, "service_name", &service_name, + 1024, 0444, NULL, &proc_dostring}, + {3, "service_number", &service_number, + sizeof(int), 0444, NULL, &proc_dointvec}, + {4, "min_reconnect_interval", &min_reconnect_interval, + sizeof(int), 0644, NULL, &proc_dointvec}, + {5, "max_reconnect_interval", &max_reconnect_interval, + sizeof(int), 0644, NULL, &proc_dointvec}, + {6, "concurrent_peers", &concurrent_peers, + sizeof(int), 0444, NULL, &proc_dointvec}, + {7, "cksum", &cksum, + sizeof(int), 0644, NULL, &proc_dointvec}, + {8, "timeout", &timeout, + sizeof(int), 0644, NULL, &proc_dointvec}, + {9, "ntx", &ntx, + sizeof(int), 0444, NULL, &proc_dointvec}, + {10, "credits", &credits, + sizeof(int), 0444, NULL, &proc_dointvec}, + {11, "peer_credits", &peer_credits, + sizeof(int), 0444, NULL, &proc_dointvec}, + {12, "sd_retries", &sd_retries, + sizeof(int), 0444, NULL, &proc_dointvec}, + {13, "keepalive", &keepalive, 
+ sizeof(int), 0644, NULL, &proc_dointvec}, + {14, "concurrent_sends", &concurrent_sends, + sizeof(int), 0644, NULL, &proc_dointvec}, + {0} +}; + +static ctl_table kibnal_top_ctl_table[] = { + {203, "openibnal", NULL, 0, 0555, kibnal_ctl_table}, + {0} +}; + +int +kibnal_tunables_init () +{ + kibnal_tunables.kib_sysctl = + register_sysctl_table(kibnal_top_ctl_table, 0); + + if (kibnal_tunables.kib_sysctl == NULL) + CWARN("Can't setup /proc tunables\n"); + + if (*kibnal_tunables.kib_concurrent_sends > IBNAL_RX_MSGS) + *kibnal_tunables.kib_concurrent_sends = IBNAL_RX_MSGS; + if (*kibnal_tunables.kib_concurrent_sends < IBNAL_MSG_QUEUE_SIZE) + *kibnal_tunables.kib_concurrent_sends = IBNAL_MSG_QUEUE_SIZE; + + return 0; +} + +void +kibnal_tunables_fini () +{ + if (kibnal_tunables.kib_sysctl != NULL) + unregister_sysctl_table(kibnal_tunables.kib_sysctl); +} + +#else + +int +kibnal_tunables_init () +{ + return 0; +} + +void +kibnal_tunables_fini () +{ +} + +#endif diff --git a/lnet/klnds/lolnd/Makefile.in b/lnet/klnds/lolnd/Makefile.in deleted file mode 100644 index 222e861..0000000 --- a/lnet/klnds/lolnd/Makefile.in +++ /dev/null @@ -1,4 +0,0 @@ -MODULES := klonal -klonal-objs := lonal.o lonal_cb.o - -@INCLUDE_RULES@ diff --git a/lnet/klnds/lolnd/lolnd.c b/lnet/klnds/lolnd/lolnd.c deleted file mode 100644 index 03c2742..0000000 --- a/lnet/klnds/lolnd/lolnd.c +++ /dev/null @@ -1,164 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2004 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include "lonal.h" - -nal_t klonal_api; -klonal_data_t klonal_data; -ptl_handle_ni_t klonal_ni; - - -int -klonal_cmd (struct portals_cfg *pcfg, void *private) -{ - LASSERT (pcfg != NULL); - - switch (pcfg->pcfg_command) { - case NAL_CMD_REGISTER_MYNID: - CDEBUG (D_IOCTL, "setting NID to "LPX64" (was "LPX64")\n", - pcfg->pcfg_nid, klonal_lib.libnal_ni.ni_pid.nid); - klonal_lib.libnal_ni.ni_pid.nid = pcfg->pcfg_nid; - return (0); - - default: - return (-EINVAL); - } -} - -static void -klonal_shutdown(nal_t *nal) -{ - /* NB The first ref was this module! 
*/ - if (nal->nal_refct != 0) - return; - - CDEBUG (D_NET, "shutdown\n"); - LASSERT (nal == &klonal_api); - - switch (klonal_data.klo_init) - { - default: - LASSERT (0); - - case KLO_INIT_ALL: - libcfs_nal_cmd_unregister(LONAL); - /* fall through */ - - case KLO_INIT_LIB: - lib_fini (&klonal_lib); - break; - - case KLO_INIT_NOTHING: - return; - } - - memset(&klonal_data, 0, sizeof (klonal_data)); - - CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory)); - - printk (KERN_INFO "Lustre: LO NAL unloaded (final mem %d)\n", - atomic_read(&portal_kmemory)); - PORTAL_MODULE_UNUSE; -} - -static int -klonal_startup (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) -{ - int rc; - ptl_process_id_t my_process_id; - int pkmem = atomic_read(&portal_kmemory); - - LASSERT (nal == &klonal_api); - - if (nal->nal_refct != 0) { - if (actual_limits != NULL) - *actual_limits = klonal_lib.libnal_ni.ni_actual_limits; - return (PTL_OK); - } - - LASSERT (klonal_data.klo_init == KLO_INIT_NOTHING); - - CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory)); - - /* ensure all pointers NULL etc */ - memset (&klonal_data, 0, sizeof (klonal_data)); - - my_process_id.nid = 0; - my_process_id.pid = requested_pid; - - rc = lib_init(&klonal_lib, nal, my_process_id, - requested_limits, actual_limits); - if (rc != PTL_OK) { - CERROR ("lib_init failed %d\n", rc); - klonal_shutdown (nal); - return (rc); - } - - klonal_data.klo_init = KLO_INIT_LIB; - - rc = libcfs_nal_cmd_register (LONAL, &klonal_cmd, NULL); - if (rc != 0) { - CERROR ("Can't initialise command interface (rc = %d)\n", rc); - klonal_shutdown (nal); - return (PTL_FAIL); - } - - klonal_data.klo_init = KLO_INIT_ALL; - - printk(KERN_INFO "Lustre: LO NAL (initial mem %d)\n", pkmem); - PORTAL_MODULE_USE; - - return (PTL_OK); -} - -void __exit -klonal_finalise (void) -{ - PtlNIFini(klonal_ni); - - ptl_unregister_nal(LONAL); -} - -static int __init -klonal_initialise (void) -{ - int rc; - - klonal_api.nal_ni_init = klonal_startup; - klonal_api.nal_ni_fini = klonal_shutdown; - - rc = ptl_register_nal(LONAL, &klonal_api); - if (rc != PTL_OK) { - CERROR("Can't register LONAL: %d\n", rc); - return (-ENOMEM); /* or something... */ - } - - return (0); -} - -MODULE_AUTHOR("Cluster File Systems, Inc. "); -MODULE_DESCRIPTION("Loopback NAL v0.01"); -MODULE_LICENSE("GPL"); - -module_init (klonal_initialise); -module_exit (klonal_finalise); diff --git a/lnet/klnds/lolnd/lolnd.h b/lnet/klnds/lolnd/lolnd.h deleted file mode 100644 index 6d8d77d..0000000 --- a/lnet/klnds/lolnd/lolnd.h +++ /dev/null @@ -1,72 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#ifndef _LONAL_H -#define _LONAL_H -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_NAL - -#include -#include -#include -#include - -#define KLOD_IOV 153401 -#define KLOD_KIOV 153402 - -typedef struct -{ - unsigned int klod_type; - unsigned int klod_niov; - size_t klod_offset; - size_t klod_nob; - union { - struct iovec *iov; - ptl_kiov_t *kiov; - } klod_iov; -} klo_desc_t; - -typedef struct -{ - char klo_init; /* what's been initialised */ -} klonal_data_t; - -/* kqn_init state */ -#define KLO_INIT_NOTHING 0 /* MUST BE ZERO so zeroed state is initialised OK */ -#define KLO_INIT_LIB 1 -#define KLO_INIT_ALL 2 - -extern lib_nal_t klonal_lib; -extern nal_t klonal_api; -extern klonal_data_t klonal_data; - -#endif /* _LONAL_H */ diff --git a/lnet/klnds/lolnd/lolnd_cb.c b/lnet/klnds/lolnd/lolnd_cb.c deleted file mode 100644 index cf5df0d..0000000 --- a/lnet/klnds/lolnd/lolnd_cb.c +++ /dev/null @@ -1,267 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2004 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- */ - -#include "lonal.h" - -/* - * LIB functions follow - * - */ -static int -klonal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) -{ - *dist = 0; /* it's me */ - return (0); -} - -static ptl_err_t -klonal_send (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - struct iovec *payload_iov, - size_t payload_offset, - size_t payload_nob) -{ - klo_desc_t klod = { - .klod_type = KLOD_IOV, - .klod_niov = payload_niov, - .klod_offset = payload_offset, - .klod_nob = payload_nob, - .klod_iov = { .iov = payload_iov } }; - ptl_err_t rc; - - LASSERT(nid == klonal_lib.libnal_ni.ni_pid.nid); - - rc = lib_parse(&klonal_lib, hdr, &klod); - if (rc == PTL_OK) - lib_finalize(&klonal_lib, private, libmsg, PTL_OK); - - return rc; -} - -static ptl_err_t -klonal_send_pages (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - ptl_kiov_t *payload_kiov, - size_t payload_offset, - size_t payload_nob) -{ - klo_desc_t klod = { - .klod_type = KLOD_KIOV, - .klod_niov = payload_niov, - .klod_offset = payload_offset, - .klod_nob = payload_nob, - .klod_iov = { .kiov = payload_kiov } }; - ptl_err_t rc; - - LASSERT(nid == klonal_lib.libnal_ni.ni_pid.nid); - - rc = lib_parse(&klonal_lib, hdr, &klod); - if (rc == PTL_OK) - lib_finalize(&klonal_lib, private, libmsg, PTL_OK); - - return rc; -} - -static ptl_err_t -klonal_recv(lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, - size_t offset, - size_t mlen, - size_t rlen) -{ - klo_desc_t *klod = (klo_desc_t *)private; - - /* I only handle mapped->mapped matches */ - LASSERT(klod->klod_type == KLOD_IOV); - - if (mlen == 0) - return PTL_OK; - - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - niov--; - LASSERT(niov > 0); - } - - while (klod->klod_offset >= klod->klod_iov.iov->iov_len) { - klod->klod_offset -= klod->klod_iov.iov->iov_len; - klod->klod_iov.iov++; - klod->klod_niov--; - LASSERT(klod->klod_niov > 0); - } - - do { - int fraglen = MIN(iov->iov_len - offset, - klod->klod_iov.iov->iov_len - klod->klod_offset); - - LASSERT(niov > 0); - LASSERT(klod->klod_niov > 0); - - if (fraglen > mlen) - fraglen = mlen; - - memcpy((void *)((unsigned long)iov->iov_base + offset), - (void *)((unsigned long)klod->klod_iov.iov->iov_base + - klod->klod_offset), - fraglen); - - if (offset + fraglen < iov->iov_len) { - offset += fraglen; - } else { - offset = 0; - iov++; - niov--; - } - - if (klod->klod_offset + fraglen < klod->klod_iov.iov->iov_len ) { - klod->klod_offset += fraglen; - } else { - klod->klod_offset = 0; - klod->klod_iov.iov++; - klod->klod_niov--; - } - - mlen -= fraglen; - } while (mlen > 0); - - lib_finalize(&klonal_lib, private, libmsg, PTL_OK); - return PTL_OK; -} - -static ptl_err_t -klonal_recv_pages(lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - unsigned int niov, - ptl_kiov_t *kiov, - size_t offset, - size_t mlen, - size_t rlen) -{ - void *srcaddr = NULL; - void *dstaddr = NULL; - unsigned long srcfrag = 0; - unsigned long dstfrag = 0; - unsigned long fraglen; - klo_desc_t *klod = (klo_desc_t *)private; - - /* I only handle unmapped->unmapped matches */ - LASSERT(klod->klod_type == KLOD_KIOV); - - if (mlen == 0) - return PTL_OK; - - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - kiov++; - niov--; - LASSERT(niov > 0); - } - - while (klod->klod_offset >= 
klod->klod_iov.kiov->kiov_len) { - klod->klod_offset -= klod->klod_iov.kiov->kiov_len; - klod->klod_iov.kiov++; - klod->klod_niov--; - LASSERT(klod->klod_niov > 0); - } - - do { - /* CAVEAT EMPTOR: I kmap 2 pages at once == slight risk of deadlock */ - LASSERT(niov > 0); - if (dstaddr == NULL) { - dstaddr = (void *)((unsigned long)kmap(kiov->kiov_page) + - kiov->kiov_offset + offset); - dstfrag = kiov->kiov_len - offset; - } - - LASSERT(klod->klod_niov > 0); - if (srcaddr == NULL) { - srcaddr = (void *)((unsigned long)kmap(klod->klod_iov.kiov->kiov_page) + - klod->klod_iov.kiov->kiov_offset + klod->klod_offset); - srcfrag = klod->klod_iov.kiov->kiov_len - klod->klod_offset; - } - - fraglen = MIN(srcfrag, dstfrag); - if (fraglen > mlen) - fraglen = mlen; - - memcpy(dstaddr, srcaddr, fraglen); - - if (fraglen < dstfrag) { - dstfrag -= fraglen; - dstaddr = (void *)((unsigned long)dstaddr + fraglen); - } else { - kunmap(kiov->kiov_page); - dstaddr = NULL; - offset = 0; - kiov++; - niov--; - } - - if (fraglen < srcfrag) { - srcfrag -= fraglen; - srcaddr = (void *)((unsigned long)srcaddr + fraglen); - } else { - kunmap(klod->klod_iov.kiov->kiov_page); - srcaddr = NULL; - klod->klod_offset = 0; - klod->klod_iov.kiov++; - klod->klod_niov--; - } - - mlen -= fraglen; - } while (mlen > 0); - - if (dstaddr != NULL) - kunmap(kiov->kiov_page); - - if (srcaddr != NULL) - kunmap(klod->klod_iov.kiov->kiov_page); - - lib_finalize(&klonal_lib, private, libmsg, PTL_OK); - return PTL_OK; -} - -lib_nal_t klonal_lib = -{ - libnal_data: &klonal_data, /* NAL private data */ - libnal_send: klonal_send, - libnal_send_pages: klonal_send_pages, - libnal_recv: klonal_recv, - libnal_recv_pages: klonal_recv_pages, - libnal_dist: klonal_dist -}; diff --git a/lnet/klnds/mxlnd/.cvsignore b/lnet/klnds/mxlnd/.cvsignore new file mode 100644 index 0000000..26bf56c --- /dev/null +++ b/lnet/klnds/mxlnd/.cvsignore @@ -0,0 +1,11 @@ +.deps +Makefile +.*.cmd +autoMakefile.in +autoMakefile +*.ko +*.mod.c +.*.flags +.tmp_versions +.depend + diff --git a/lnet/klnds/mxlnd/Makefile.in b/lnet/klnds/mxlnd/Makefile.in new file mode 100644 index 0000000..378dbdd --- /dev/null +++ b/lnet/klnds/mxlnd/Makefile.in @@ -0,0 +1,6 @@ +MODULES := kmxlnd +kmxlnd-objs := mxlnd.o mxlnd_cb.o mxlnd_modparams.o + +EXTRA_POST_CFLAGS := @MXCPPFLAGS@ + +@INCLUDE_RULES@ diff --git a/lnet/klnds/mxlnd/README b/lnet/klnds/mxlnd/README new file mode 100644 index 0000000..cc87e7a --- /dev/null +++ b/lnet/klnds/mxlnd/README @@ -0,0 +1,190 @@ +************************************************************************* +* * +* Myrinet Express Lustre Networking Driver (MXLND) documentation * +* * +************************************************************************* + +README of MXLND + +MXLND provides support for Myricom's Myrinet Express (MX) communication +layer in Lustre. + +MXLND may be used with either MX-10G or MX-2G. See MX's README for +supported NICs. + +Table of Contents: + I. Installation + 1. Configuring and compiling + 2. Module Parameters + II. MXLND Performance + III. Caveats + 1. Systems with different page sizes + 2. Multi-homing + 3. MX endpoint collision + IV. License + V. Support + +================ +I. Installation +================ + +MXLND is supported on Linux 2.6. It may be possible to run it on 2.4, +but it has not been tested. MXLND requires Myricom's MX version 1.2.1 +or higher. See MX's README for the supported list of processors. + +1. Configuring and compiling + +MXLND should be already integrated into the Lustre build process. 
To +build MXLND, you will need to set the path to your MX installation +in Lustre's ./configure: + + --with-mx=/opt/mx + +replacing /opt with the actual path. Configure will check to ensure that +the MX version has the required functions. If not, it will fail to build. +To check if MXLND built, look for: + + checking whether to enable Myrinet MX support... yes + +in configure's output or the presence of Makefile in +$LUSTRE/lnet/klnds/mxlnd. + +2. Module Parameters + +MXLND supports a number of load-time parameters using Linux's module +parameter system. On our test systems, we created the following file: + + /etc/modprobe.d/kmxlnd + +On some (older?) systems, you may need to modify /etc/modprobe.conf. + +The available options are: + + n_waitd # of completion daemons + max_peers maximum number of peers that may connect + cksum set non-zero to enable small message (< 4KB) checksums + ntx # of total tx message descriptors + credits # concurrent sends to a single peer + board index value of the Myrinet board (NIC) + ep_id MX endpoint ID + polling Use 0 to block (wait). A value > 0 will poll that many times before blocking + hosts IP-to-hostname resolution file + +Of these, only hosts is required. It must be the absolute path to the +MXLND hosts file. For example: + + options kmxlnd hosts=/etc/hosts.mxlnd + +The file format for the hosts file is as follows: + +IP HOST BOARD EP_ID + +The values must be space and/or tab separated where: + + IP is a valid IPv4 address + HOST is the name returned by `hostname` on that machine + BOARD is the index of the Myricom NIC (0 for the first card, etc.) + EP_ID is the MX endpoint ID + +You may want to vary the remaining options to obtain the optimal performance +for your platform. + + n_waitd sets the number of threads that process completed MX requests +(sends and receives). In our testing, the default of 1 performed best. + + max_peers tells MXLND the upper limit of machines that it will need to +communicate with. This affects how many receives it will pre-post and each +receive will use one page of memory. Ideally, on clients, this value will +be equal to the total number of Lustre servers (MDS and OSS). On servers, +it needs to equal the total number of machines in the storage system. + + cksum turns on small message checksums. It can be used to aid in trouble- +shooting. MX also provides an optional checksumming feature which can check +all messages (large and small). See the MX README for details. + + ntx is the number of total sends in flight from this machine. In actuality, +MXLND reserves half of them for connect messages so make this value twice as large +as you want for the total number of sends in flight. + + credits is the number of in-flight messages for a specific peer. This is part +of the flow-control system in Lustre. Increasing this value may improve performance +but it requires more memory since each message requires at least one page. + + board is the index of the Myricom NIC. Hosts can have multiple Myricom NICs +and this identifies which one MXLND should use. This value must match the board +value in your MXLND hosts file for this host. + + ep_id is the MX endpoint ID. Each process that uses MX is required to have at +least one MX endpoint to access the MX library and NIC. The ID is a simple index +starting at 0. This value must match the endpoint ID value in your MXLND hosts +file for this host. + + polling determines whether this host will poll or block for MX request com- +pletions. 
A value of 0 blocks and any positive value will poll that many times +before blocking. Since polling increases CPU usage, we suggest you set this to +0 on the client and experiment with different values for servers. + +===================== +II. MXLND Performance +===================== + +On MX-2G systems, MXLND should easily saturate the link and use minimal CPU +(5-10% for read and write operations). On MX-10G systems, MXLND can saturate +the link and use moderate CPU resources (20-30% for read and write operations). +MX-10G relies on PCI-Express which is relatively new and performance varies +considerably by processor, motherboard and PCI-E chipset. Refer to Myricom's +website for the latest DMA read/write performance results by motherboard. The +DMA results will place an upper-bound on MXLND performance. + +============ +III. Caveats +============ + +1. Systems with different page sizes + +MXLND will set the maximum small message size equal to the kernel's page size. +This means that machines running MXLND that have different page sizes are not +able to communicate with each other. If you wish to run MXLND in this case, +send email to help@myri.com. + +2. Multi-homing + +At this time, the MXLND cannot drive more than one interface at a time. Thus, +a single Lustre router cannot route between two MX-10G, between two MX-2G, or +between MX-10G and MX-2G fabrics. + +3. MX endpoint collision + +Each process that uses MX is required to have at least one MX endpoint to +access the MX library and NIC. Other processes may need to use MX and no two +processes can use the same endpoint ID. MPICH-MX dynamically chooses one at +MPI startup and should not interfere with MXLND. Sockets-MX, on the other hand, +is hard coded to use 0 for its ID. If it is possible that anyone will want to +run Sockets-MX on this system, use a non-0 value for MXLND's endpoint ID. + + +=========== +IV. License +=========== + +MXLND is copyright (C) 2006 of Myricom, Inc. + +MXLND is part of Lustre, http://www.lustre.org. + +MXLND is free software; you can redistribute it and/or modify it under the +terms of version 2 of the GNU General Public License as published by the Free +Software Foundation. + +MXLND is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +Lustre; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, +Cambridge, MA 02139, USA. + +========== +V. Support +========== + +If you have questions about MXLND, please contact help@myri.com. 
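 + +As an illustration of the hosts file format described in section I.2 +(Module Parameters), a hypothetical three-node setup with one MDS and +two OSSes, each using board 0 and endpoint ID 3, could use: + + 192.168.1.10 mds01 0 3 + 192.168.1.11 oss01 0 3 + 192.168.1.12 oss02 0 3 + +The addresses and hostnames above are examples only; each HOST entry +must match the output of `hostname` on that machine.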
diff --git a/lnet/router/autoMakefile.am b/lnet/klnds/mxlnd/autoMakefile.am similarity index 52% rename from lnet/router/autoMakefile.am rename to lnet/klnds/mxlnd/autoMakefile.am index 070b008..1d94f86 100644 --- a/lnet/router/autoMakefile.am +++ b/lnet/klnds/mxlnd/autoMakefile.am @@ -4,14 +4,10 @@ # See the file COPYING in this distribution if MODULES -if !CRAY_PORTALS - -if LINUX -modulenet_DATA = kptlrouter$(KMODEXT) -endif - +if BUILD_MXLND +modulenet_DATA = kmxlnd$(KMODEXT) endif endif -MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES = $(kptlrouter-objs:%.o=%.c) router.h +MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ +DIST_SOURCES = $(kmxlnd-objs:%.o=%.c) mxlnd.h diff --git a/lnet/klnds/mxlnd/mxlnd.c b/lnet/klnds/mxlnd/mxlnd.c new file mode 100644 index 0000000..bb6991d --- /dev/null +++ b/lnet/klnds/mxlnd/mxlnd.c @@ -0,0 +1,920 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * Copyright (C) 2006 Myricom, Inc. + * Author: Scott Atchley + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "mxlnd.h" + +lnd_t the_kmxlnd = { + .lnd_type = MXLND, + .lnd_startup = mxlnd_startup, + .lnd_shutdown = mxlnd_shutdown, + .lnd_ctl = mxlnd_ctl, + .lnd_send = mxlnd_send, + .lnd_recv = mxlnd_recv, +}; + +kmx_data_t kmxlnd_data; + +/** + * mxlnd_ctx_free - free ctx struct + * @ctx - a kmx_ctx pointer + * + * The calling function should remove the ctx from the ctx list first, + * then free it.
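 + * + * For example, mxlnd_free_txs() below does exactly this for each tx + * descriptor on the global list: + * + * list_del_init(&tx->mxc_global_list); + * mxlnd_ctx_free(tx);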
+ */ +void +mxlnd_ctx_free(struct kmx_ctx *ctx) +{ + if (ctx == NULL) return; + + if (ctx->mxc_page != NULL) { + __free_page(ctx->mxc_page); + spin_lock(&kmxlnd_data.kmx_global_lock); + kmxlnd_data.kmx_mem_used -= MXLND_EAGER_SIZE; + spin_unlock(&kmxlnd_data.kmx_global_lock); + } + + if (ctx->mxc_seg_list != NULL) { + LASSERT(ctx->mxc_nseg > 0); + MXLND_FREE(ctx->mxc_seg_list, ctx->mxc_nseg * sizeof(mx_ksegment_t)); + } + + MXLND_FREE (ctx, sizeof (*ctx)); + return; +} + +/** + * mxlnd_ctx_alloc - allocate and initialize a new ctx struct + * @ctxp - address of a kmx_ctx pointer + * + * Returns 0 on success and -EINVAL, -ENOMEM on failure + */ +int +mxlnd_ctx_alloc(struct kmx_ctx **ctxp, enum kmx_req_type type) +{ + int ret = 0; + struct kmx_ctx *ctx = NULL; + + if (ctxp == NULL) return -EINVAL; + + MXLND_ALLOC(ctx, sizeof (*ctx)); + if (ctx == NULL) { + CDEBUG(D_NETERROR, "Cannot allocate ctx\n"); + return -ENOMEM; + } + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->mxc_lock); + + ctx->mxc_type = type; + ctx->mxc_page = alloc_page (GFP_KERNEL); + if (ctx->mxc_page == NULL) { + CDEBUG(D_NETERROR, "Can't allocate page\n"); + ret = -ENOMEM; + goto failed; + } + spin_lock(&kmxlnd_data.kmx_global_lock); + kmxlnd_data.kmx_mem_used += MXLND_EAGER_SIZE; + spin_unlock(&kmxlnd_data.kmx_global_lock); + ctx->mxc_msg = (struct kmx_msg *)((char *)page_address(ctx->mxc_page)); + ctx->mxc_seg.segment_ptr = MX_PA_TO_U64(lnet_page2phys(ctx->mxc_page)); + ctx->mxc_state = MXLND_CTX_IDLE; + + *ctxp = ctx; + return 0; + +failed: + mxlnd_ctx_free(ctx); + return ret; +} + +/** + * mxlnd_ctx_init - reset ctx struct to the default values + * @ctx - a kmx_ctx pointer + */ +void +mxlnd_ctx_init(struct kmx_ctx *ctx) +{ + if (ctx == NULL) return; + + /* do not change mxc_type */ + ctx->mxc_incarnation = 0; + ctx->mxc_deadline = 0; + ctx->mxc_state = MXLND_CTX_IDLE; + /* ignore mxc_global_list */ + if (ctx->mxc_list.next != NULL && !list_empty(&ctx->mxc_list)) { + if (ctx->mxc_peer != NULL) + spin_lock(&ctx->mxc_lock); + list_del_init(&ctx->mxc_list); + if (ctx->mxc_peer != NULL) + spin_unlock(&ctx->mxc_lock); + } + /* ignore mxc_rx_list */ + /* ignore mxc_lock */ + ctx->mxc_nid = 0; + ctx->mxc_peer = NULL; + ctx->mxc_conn = NULL; + /* ignore mxc_msg */ + /* ignore mxc_page */ + ctx->mxc_lntmsg[0] = NULL; + ctx->mxc_lntmsg[1] = NULL; + ctx->mxc_msg_type = 0; + ctx->mxc_cookie = 0LL; + ctx->mxc_match = 0LL; + /* ctx->mxc_seg.segment_ptr points to mxc_page */ + ctx->mxc_seg.segment_length = 0; + if (ctx->mxc_seg_list != NULL) { + LASSERT(ctx->mxc_nseg > 0); + MXLND_FREE(ctx->mxc_seg_list, ctx->mxc_nseg * sizeof(mx_ksegment_t)); + } + ctx->mxc_seg_list = NULL; + ctx->mxc_nseg = 0; + ctx->mxc_nob = 0; + ctx->mxc_mxreq = NULL; + memset(&ctx->mxc_status, 0, sizeof(mx_status_t)); + /* ctx->mxc_get */ + /* ctx->mxc_put */ + + ctx->mxc_msg->mxm_type = 0; + ctx->mxc_msg->mxm_credits = 0; + ctx->mxc_msg->mxm_nob = 0; + ctx->mxc_msg->mxm_seq = 0; + + return; +} + +/** + * mxlnd_free_txs - free kmx_txs and associated pages + * + * Called from mxlnd_shutdown() + */ +void +mxlnd_free_txs(void) +{ + struct kmx_ctx *tx = NULL; + struct kmx_ctx *next = NULL; + + list_for_each_entry_safe(tx, next, &kmxlnd_data.kmx_txs, mxc_global_list) { + list_del_init(&tx->mxc_global_list); + mxlnd_ctx_free(tx); + } + return; +} + +/** + * mxlnd_init_txs - allocate tx descriptors then stash on txs and idle tx lists + * + * Called from mxlnd_startup() + * returns 0 on success, else -ENOMEM + */ +int +mxlnd_init_txs(void) +{ + int ret = 0; + int i 
= 0; + struct kmx_ctx *tx = NULL; + + for (i = 0; i < *kmxlnd_tunables.kmx_ntx; i++) { + ret = mxlnd_ctx_alloc(&tx, MXLND_REQ_TX); + if (ret != 0) { + mxlnd_free_txs(); + return ret; + } + mxlnd_ctx_init(tx); + /* in startup(), no locks required */ + list_add_tail(&tx->mxc_global_list, &kmxlnd_data.kmx_txs); + list_add_tail(&tx->mxc_list, &kmxlnd_data.kmx_tx_idle); + } + return 0; +} + +/** + * mxlnd_free_rxs - free initial kmx_rx descriptors and associated pages + * + * Called from mxlnd_shutdown() + */ +void +mxlnd_free_rxs(void) +{ + struct kmx_ctx *rx = NULL; + struct kmx_ctx *next = NULL; + + list_for_each_entry_safe(rx, next, &kmxlnd_data.kmx_rxs, mxc_global_list) { + list_del_init(&rx->mxc_global_list); + mxlnd_ctx_free(rx); + } + return; +} + +/** + * mxlnd_init_rxs - allocate initial rx descriptors + * + * Called from startup(). We create MXLND_MAX_PEERS plus MXLND_NTX + * rx descriptors. We create one for each potential peer to handle + * the initial connect request. We create one for each tx in case the + * send requires a non-eager receive. + * + * Returns 0 on success, else -ENOMEM + */ +int +mxlnd_init_rxs(void) +{ + int ret = 0; + int i = 0; + struct kmx_ctx *rx = NULL; + + for (i = 0; i < (*kmxlnd_tunables.kmx_ntx + *kmxlnd_tunables.kmx_max_peers); i++) { + ret = mxlnd_ctx_alloc(&rx, MXLND_REQ_RX); + if (ret != 0) { + mxlnd_free_rxs(); + return ret; + } + mxlnd_ctx_init(rx); + /* in startup(), no locks required */ + list_add_tail(&rx->mxc_global_list, &kmxlnd_data.kmx_rxs); + list_add_tail(&rx->mxc_list, &kmxlnd_data.kmx_rx_idle); + } + return 0; +} + +/** + * mxlnd_free_peers - free peers + * + * Called from mxlnd_shutdown() + */ +void +mxlnd_free_peers(void) +{ + int i = 0; + struct kmx_peer *peer = NULL; + struct kmx_peer *next = NULL; + + for (i = 0; i < MXLND_HASH_SIZE; i++) { + list_for_each_entry_safe(peer, next, &kmxlnd_data.kmx_peers[i], mxp_peers) { + list_del_init(&peer->mxp_peers); + if (peer->mxp_conn) mxlnd_conn_decref(peer->mxp_conn); + mxlnd_peer_decref(peer); + } + } +} + +int +mxlnd_host_alloc(struct kmx_host **hostp) +{ + struct kmx_host *host = NULL; + + MXLND_ALLOC(host, sizeof (*host)); + if (host == NULL) { + CDEBUG(D_NETERROR, "Cannot allocate host\n"); + return -1; + } + memset(host, 0, sizeof(*host)); + spin_lock_init(&host->mxh_lock); + + *hostp = host; + + return 0; +} + +void +mxlnd_host_free(struct kmx_host *host) +{ + if (host == NULL) return; + + if (host->mxh_hostname != NULL) + MXLND_FREE(host->mxh_hostname, strlen(host->mxh_hostname) + 1); + + MXLND_FREE(host, sizeof(*host)); + return; +} + +/** + * mxlnd_free_hosts - free kmx_hosts + * + * Called from mxlnd_shutdown() + */ +void +mxlnd_free_hosts(void) +{ + struct kmx_host *host = NULL; + struct kmx_host *next = NULL; + + list_for_each_entry_safe(host, next, &kmxlnd_data.kmx_hosts, mxh_list) { + list_del_init(&host->mxh_list); + mxlnd_host_free(host); + } + return; +} + +#define xstr(s) #s +#define str(s) xstr(s) +#define MXLND_MAX_BOARD 4 /* we expect hosts to have fewer NICs than this */ +#define MXLND_MAX_EP_ID 16 /* we expect hosts to have fewer endpoints than this */ + +/* this parses a line that consists of: + * + * IP HOSTNAME BOARD ENDPOINT ID + * 169.192.0.113 mds01 0 3 + * + * By default MX uses the alias (short hostname). If you override + * it using mx_hostname to use the FQDN or some other name, the hostname + * here must match exactly. + */ + +/* MX_MAX_HOSTNAME_LEN = 80.
See myriexpress.h */ +int +mxlnd_parse_line(char *line) +{ + int i = 0; + int ret = 0; + int len = 0; + u32 ip[4] = { 0, 0, 0, 0 }; + char hostname[MX_MAX_HOSTNAME_LEN]; + u32 board = -1; + u32 ep_id = -1; + struct kmx_host *host = NULL; + + if (line == NULL) return -1; + + len = strlen(line); + + if (len == 0) return -1; + + /* convert tabs to spaces */ + for (i = 0; i < len; i++) { + if (line[i] == '\t') line[i] = ' '; + } + + memset(&hostname, 0 , sizeof(hostname)); + ret = sscanf(line, "%d.%d.%d.%d %" str(MX_MAX_HOSTNAME_LEN) "s %d %d", + &ip[0], &ip[1], &ip[2], &ip[3], hostname, &board, &ep_id); + + if (ret != 7) { + return -1; + } + + /* check for valid values */ + /* we assume a valid IP address (all <= 255), number of NICs, + * and number of endpoint IDs */ + if (ip[0] > 255 || ip [1] > 255 || ip[2] > 255 || ip[3] > 255 || + board > MXLND_MAX_BOARD || ep_id > MXLND_MAX_EP_ID) { + CDEBUG(D_NETERROR, "Illegal value in \"%s\". Ignoring " + "this host.\n", line); + return -1; + } + + ret = mxlnd_host_alloc(&host); + if (ret != 0) return -1; + + host->mxh_addr = ((ip[0]<<24)|(ip[1]<<16)|(ip[2]<<8)|ip[3]); + len = strlen(hostname); + MXLND_ALLOC(host->mxh_hostname, len + 1); + memset(host->mxh_hostname, 0, len + 1); + strncpy(host->mxh_hostname, hostname, len); + host->mxh_board = board; + host->mxh_ep_id = ep_id; + + spin_lock(&kmxlnd_data.kmx_hosts_lock); + list_add_tail(&host->mxh_list, &kmxlnd_data.kmx_hosts); + spin_unlock(&kmxlnd_data.kmx_hosts_lock); + + return 0; +} + +void +mxlnd_print_hosts(void) +{ +#if MXLND_DEBUG + struct kmx_host *host = NULL; + + list_for_each_entry(host, &kmxlnd_data.kmx_hosts, mxh_list) { + int ip[4]; + u32 addr = host->mxh_addr; + + ip[0] = (addr >> 24) & 0xff; + ip[1] = (addr >> 16) & 0xff; + ip[2] = (addr >> 8) & 0xff; + ip[3] = addr & 0xff; + CDEBUG(D_NET, "\tip= %d.%d.%d.%d\n\thost= %s\n\tboard= %d\n\tep_id= %d\n\n", + ip[0], ip[1], ip[2], ip[3], + host->mxh_hostname, host->mxh_board, host->mxh_ep_id); + } +#endif + return; +} + +#define MXLND_BUFSIZE (PAGE_SIZE - 1) + +int +mxlnd_parse_hosts(char *filename) +{ + int ret = 0; + s32 size = 0; + s32 bufsize = MXLND_BUFSIZE; + s32 allocd = 0; + loff_t offset = 0; + struct file *filp = NULL; + char *buf = NULL; + s32 buf_off = 0; + char *sep = NULL; + char *line = NULL; + + if (filename == NULL) return -1; + + filp = filp_open(filename, O_RDONLY, 0); + if (IS_ERR(filp)) { + CERROR("filp_open() failed for %s\n", filename); + return -1; + } + + size = (s32) filp->f_dentry->d_inode->i_size; + if (size < MXLND_BUFSIZE) bufsize = size; + allocd = bufsize; + MXLND_ALLOC(buf, allocd + 1); + if (buf == NULL) { + CERROR("Cannot allocate buf\n"); + filp_close(filp, current->files); + return -1; + } + + while (offset < size) { + memset(buf, 0, bufsize + 1); + ret = kernel_read(filp, (unsigned long) offset, buf, (unsigned long) bufsize); + if (ret < 0) { + CDEBUG(D_NETERROR, "kernel_read() returned %d - closing %s\n", ret, filename); + filp_close(filp, current->files); + MXLND_FREE(buf, allocd + 1); + return -1; + } + + if (ret < bufsize) bufsize = ret; + buf_off = 0; + while (buf_off < bufsize) { + sep = strchr(buf + buf_off, '\n'); + if (sep != NULL) { + /* we have a line */ + line = buf + buf_off; + *sep = '\0'; + ret = mxlnd_parse_line(line); + if (ret != 0 && strlen(line) != 0) { + CDEBUG(D_NETERROR, "Failed to parse \"%s\". 
Ignoring this host.\n", line); + } + buf_off += strlen(line) + 1; + } else { + /* last line or we need to read more */ + line = buf + buf_off; + ret = mxlnd_parse_line(line); + if (ret != 0) { + bufsize -= strlen(line) + 1; + } + buf_off += strlen(line) + 1; + } + } + offset += bufsize; + bufsize = MXLND_BUFSIZE; + } + + MXLND_FREE(buf, allocd + 1); + filp_close(filp, current->files); + mxlnd_print_hosts(); + + return 0; +} + +/** + * mxlnd_init_mx - open the endpoint, set our ID, register the EAGER callback + * @ni - the network interface + * + * Returns 0 on success, -1 on failure + */ +int +mxlnd_init_mx(lnet_ni_t *ni) +{ + int ret = 0; + int found = 0; + mx_return_t mxret; + mx_endpoint_addr_t addr; + u32 board = *kmxlnd_tunables.kmx_board; + u32 ep_id = *kmxlnd_tunables.kmx_ep_id; + u64 nic_id = 0LL; + struct kmx_host *host = NULL; + + mxret = mx_init(); + if (mxret != MX_SUCCESS) { + CERROR("mx_init() failed with %s (%d)\n", mx_strerror(mxret), mxret); + return -1; + } + + ret = mxlnd_parse_hosts(*kmxlnd_tunables.kmx_hosts); + if (ret != 0) { + if (*kmxlnd_tunables.kmx_hosts != NULL) { + CERROR("mxlnd_parse_hosts(%s) failed\n", *kmxlnd_tunables.kmx_hosts); + } + mx_finalize(); + return -1; + } + + list_for_each_entry(host, &kmxlnd_data.kmx_hosts, mxh_list) { + if (strcmp(host->mxh_hostname, system_utsname.nodename) == 0) { + /* override the defaults and module parameters with + * the info from the hosts file */ + board = host->mxh_board; + ep_id = host->mxh_ep_id; + kmxlnd_data.kmx_localhost = host; + CDEBUG(D_NET, "my hostname is %s board %d ep_id %d\n", kmxlnd_data.kmx_localhost->mxh_hostname, kmxlnd_data.kmx_localhost->mxh_board, kmxlnd_data.kmx_localhost->mxh_ep_id); + found = 1; + break; + } + } + + if (found == 0) { + CERROR("no host entry found for localhost\n"); + mx_finalize(); + return -1; + } + + mxret = mx_open_endpoint(board, ep_id, MXLND_MSG_MAGIC, + NULL, 0, &kmxlnd_data.kmx_endpt); + if (mxret != MX_SUCCESS) { + CERROR("mx_open_endpoint() failed with %d\n", mxret); + mx_finalize(); + return -1; + } + + mx_get_endpoint_addr(kmxlnd_data.kmx_endpt, &addr); + mx_decompose_endpoint_addr(addr, &nic_id, &ep_id); + + LASSERT(host != NULL); + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), host->mxh_addr); + + CDEBUG(D_NET, "My NID is 0x%llx\n", ni->ni_nid); + + /* this will catch all unexpected receives.
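 (an "unexpected" receive is one that arrives before a matching mx_irecv() has been posted; MX passes such messages to the handler registered here, mxlnd_unexpected_recv()).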
*/ + mxret = mx_register_unexp_handler(kmxlnd_data.kmx_endpt, + (mx_unexp_handler_t) mxlnd_unexpected_recv, + NULL); + if (mxret != MX_SUCCESS) { + CERROR("mx_register_unexp_handler() failed with %s\n", + mx_strerror(mxret)); + mx_close_endpoint(kmxlnd_data.kmx_endpt); + mx_finalize(); + return -1; + } + mxret = mx_set_request_timeout(kmxlnd_data.kmx_endpt, NULL, MXLND_COMM_TIMEOUT/HZ); + if (mxret != MX_SUCCESS) { + CERROR("mx_set_request_timeout() failed with %s\n", + mx_strerror(mxret)); + mx_close_endpoint(kmxlnd_data.kmx_endpt); + mx_finalize(); + return -1; + } + return 0; +} + + +/** + * mxlnd_thread_start - spawn a kernel thread with this function + * @fn - function pointer + * @arg - pointer to the parameter data + * + * Returns the new pid on success and a negative value on failure + */ +int +mxlnd_thread_start(int (*fn)(void *arg), void *arg) +{ + int pid = 0; + int i = (int) ((long) arg); + + atomic_inc(&kmxlnd_data.kmx_nthreads); + init_completion(&kmxlnd_data.kmx_completions[i]); + + pid = kernel_thread (fn, arg, 0); + if (pid <= 0) { + CERROR("mxlnd_thread_start() failed with %d\n", pid); + atomic_dec(&kmxlnd_data.kmx_nthreads); + } + return pid; +} + +/** + * mxlnd_thread_stop - decrement thread counter + * + * The thread returns 0 when it detects shutdown. + * We are simply decrementing the thread counter. + */ +void +mxlnd_thread_stop(long id) +{ + int i = (int) id; + atomic_dec (&kmxlnd_data.kmx_nthreads); + complete(&kmxlnd_data.kmx_completions[i]); +} + +/** + * mxlnd_shutdown - stop IO, clean up state + * @ni - LNET interface handle + * + * No calls to the LND should be made after calling this function. + */ +void +mxlnd_shutdown (lnet_ni_t *ni) +{ + int i = 0; + + LASSERT (ni == kmxlnd_data.kmx_ni); + LASSERT (ni->ni_data == &kmxlnd_data); + CDEBUG(D_NET, "in shutdown()\n"); + + CDEBUG(D_MALLOC, "before MXLND cleanup: libcfs_kmemory %d " + "kmx_mem_used %ld\n", atomic_read (&libcfs_kmemory), + kmxlnd_data.kmx_mem_used); + + switch (kmxlnd_data.kmx_init) { + + case MXLND_INIT_ALL: + + CDEBUG(D_NET, "setting shutdown = 1\n"); + /* set shutdown and wakeup request_waitds */ + kmxlnd_data.kmx_shutdown = 1; + mb(); + mx_wakeup(kmxlnd_data.kmx_endpt); + up(&kmxlnd_data.kmx_tx_queue_sem); + mxlnd_sleep(2 * HZ); + + /* fall through */ + + case MXLND_INIT_THREADS: + + CDEBUG(D_NET, "waiting on threads\n"); + /* wait for threads to complete */ + for (i = 0; i < MXLND_NCOMPLETIONS; i++) { + wait_for_completion(&kmxlnd_data.kmx_completions[i]); + } + LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0); + + CDEBUG(D_NET, "freeing completions\n"); + MXLND_FREE(kmxlnd_data.kmx_completions, + MXLND_NCOMPLETIONS * sizeof(struct completion)); + + /* fall through */ + + case MXLND_INIT_MX: + + CDEBUG(D_NET, "stopping mx\n"); + + /* wakeup waiters if they missed the above. + * close endpoint to stop all traffic. + * this will cancel and cleanup all requests, etc.
*/ + + mx_wakeup(kmxlnd_data.kmx_endpt); + mx_close_endpoint(kmxlnd_data.kmx_endpt); + mx_finalize(); + + CDEBUG(D_NET, "mxlnd_free_hosts();\n"); + mxlnd_free_hosts(); + + /* fall through */ + + case MXLND_INIT_RXS: + + CDEBUG(D_NET, "freeing rxs\n"); + + /* free all rxs and associated pages */ + mxlnd_free_rxs(); + + /* fall through */ + + case MXLND_INIT_TXS: + + CDEBUG(D_NET, "freeing txs\n"); + + /* free all txs and associated pages */ + mxlnd_free_txs(); + + /* fall through */ + + case MXLND_INIT_DATA: + + CDEBUG(D_NET, "freeing peers\n"); + + /* free peer list */ + mxlnd_free_peers(); + + /* fall through */ + + case MXLND_INIT_NOTHING: + break; + } + CDEBUG(D_NET, "shutdown complete\n"); + + CDEBUG(D_MALLOC, "after MXLND cleanup: libcfs_kmemory %d " + "kmx_mem_used %ld\n", atomic_read (&libcfs_kmemory), + kmxlnd_data.kmx_mem_used); + + kmxlnd_data.kmx_init = MXLND_INIT_NOTHING; + PORTAL_MODULE_UNUSE; + return; +} + +/** + * mxlnd_startup - initialize state, open an endpoint, start IO + * @ni - LNET interface handle + * + * Initialize state, open an endpoint, start monitoring threads. + * Should only be called once. + */ +int +mxlnd_startup (lnet_ni_t *ni) +{ + int i = 0; + int ret = 0; + struct timeval tv; + + LASSERT (ni->ni_lnd == &the_kmxlnd); + + if (kmxlnd_data.kmx_init != MXLND_INIT_NOTHING) { + CERROR("Only 1 instance supported\n"); + return -EPERM; + } + CDEBUG(D_MALLOC, "before MXLND startup: libcfs_kmemory %d " + "kmx_mem_used %ld\n", atomic_read (&libcfs_kmemory), + kmxlnd_data.kmx_mem_used); + + /* reserve 1/2 of tx for connect request messages */ + ni->ni_maxtxcredits = *kmxlnd_tunables.kmx_ntx / 2; + ni->ni_peertxcredits = *kmxlnd_tunables.kmx_credits; + + PORTAL_MODULE_USE; + memset (&kmxlnd_data, 0, sizeof (kmxlnd_data)); + + kmxlnd_data.kmx_ni = ni; + ni->ni_data = &kmxlnd_data; + + do_gettimeofday(&tv); + kmxlnd_data.kmx_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + CDEBUG(D_NET, "my incarnation is %lld\n", kmxlnd_data.kmx_incarnation); + + spin_lock_init (&kmxlnd_data.kmx_global_lock); + + INIT_LIST_HEAD (&kmxlnd_data.kmx_conn_req); + spin_lock_init (&kmxlnd_data.kmx_conn_lock); + sema_init(&kmxlnd_data.kmx_conn_sem, 0); + + INIT_LIST_HEAD (&kmxlnd_data.kmx_hosts); + spin_lock_init (&kmxlnd_data.kmx_hosts_lock); + + for (i = 0; i < MXLND_HASH_SIZE; i++) { + INIT_LIST_HEAD (&kmxlnd_data.kmx_peers[i]); + } + rwlock_init (&kmxlnd_data.kmx_peers_lock); + + INIT_LIST_HEAD (&kmxlnd_data.kmx_txs); + INIT_LIST_HEAD (&kmxlnd_data.kmx_tx_idle); + spin_lock_init (&kmxlnd_data.kmx_tx_idle_lock); + kmxlnd_data.kmx_tx_next_cookie = 1; + INIT_LIST_HEAD (&kmxlnd_data.kmx_tx_queue); + spin_lock_init (&kmxlnd_data.kmx_tx_queue_lock); + sema_init(&kmxlnd_data.kmx_tx_queue_sem, 0); + + INIT_LIST_HEAD (&kmxlnd_data.kmx_rxs); + spin_lock_init (&kmxlnd_data.kmx_rxs_lock); + INIT_LIST_HEAD (&kmxlnd_data.kmx_rx_idle); + spin_lock_init (&kmxlnd_data.kmx_rx_idle_lock); + + kmxlnd_data.kmx_init = MXLND_INIT_DATA; + /*****************************************************/ + + ret = mxlnd_init_txs(); + if (ret != 0) { + CERROR("Can't alloc tx descs: %d\n", ret); + goto failed; + } + kmxlnd_data.kmx_init = MXLND_INIT_TXS; + /*****************************************************/ + + ret = mxlnd_init_rxs(); + if (ret != 0) { + CERROR("Can't alloc rx descs: %d\n", ret); + goto failed; + } + kmxlnd_data.kmx_init = MXLND_INIT_RXS; + /*****************************************************/ + + ret = mxlnd_init_mx(ni); + if (ret != 0) { + CERROR("Can't init mx\n"); + goto failed; + } + + 
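/* record how far startup has progressed: mxlnd_shutdown() switches + * on kmx_init and falls through the cases to undo only the stages + * that completed */ +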
kmxlnd_data.kmx_init = MXLND_INIT_MX; + /*****************************************************/ + + /* start threads */ + + MXLND_ALLOC (kmxlnd_data.kmx_completions, + MXLND_NCOMPLETIONS * sizeof(struct completion)); + if (kmxlnd_data.kmx_completions == NULL) { + CERROR("failed to alloc kmxlnd_data.kmx_completions"); + goto failed; + } + memset(kmxlnd_data.kmx_completions, 0, + MXLND_NCOMPLETIONS * sizeof(struct completion)); + + { + int i = 0; + if (MXLND_N_SCHED > *kmxlnd_tunables.kmx_n_waitd) { + *kmxlnd_tunables.kmx_n_waitd = MXLND_N_SCHED; + } + CDEBUG(D_NET, "using %d %s in mx_wait_any()\n", + *kmxlnd_tunables.kmx_n_waitd, + *kmxlnd_tunables.kmx_n_waitd == 1 ? "thread" : "threads"); + + for (i = 0; i < *kmxlnd_tunables.kmx_n_waitd; i++) { + ret = mxlnd_thread_start(mxlnd_request_waitd, (void*)((long)i)); + if (ret < 0) { + CERROR("Starting mxlnd_request_waitd[%d] failed with %d\n", i, ret); + for (--i; i >= 0; i--) { + wait_for_completion(&kmxlnd_data.kmx_completions[i]); + } + LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0); + MXLND_FREE(kmxlnd_data.kmx_completions, + MXLND_NCOMPLETIONS * sizeof(struct completion)); + + goto failed; + } + } + ret = mxlnd_thread_start(mxlnd_tx_queued, (void*)((long)i++)); + if (ret < 0) { + CERROR("Starting mxlnd_tx_queued failed with %d\n", ret); + for (--i; i >= 0; i--) { + wait_for_completion(&kmxlnd_data.kmx_completions[i]); + } + LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0); + MXLND_FREE(kmxlnd_data.kmx_completions, + MXLND_NCOMPLETIONS * sizeof(struct completion)); + goto failed; + } + ret = mxlnd_thread_start(mxlnd_timeoutd, (void*)((long)i++)); + if (ret < 0) { + CERROR("Starting mxlnd_timeoutd failed with %d\n", ret); + for (--i; i >= 0; i--) { + wait_for_completion(&kmxlnd_data.kmx_completions[i]); + } + LASSERT(atomic_read(&kmxlnd_data.kmx_nthreads) == 0); + MXLND_FREE(kmxlnd_data.kmx_completions, + MXLND_NCOMPLETIONS * sizeof(struct completion)); + goto failed; + } + } + + kmxlnd_data.kmx_init = MXLND_INIT_THREADS; + /*****************************************************/ + + kmxlnd_data.kmx_init = MXLND_INIT_ALL; + CDEBUG(D_MALLOC, "startup complete (kmx_mem_used %ld)\n", kmxlnd_data.kmx_mem_used); + + return 0; +failed: + CERROR("mxlnd_startup failed\n"); + mxlnd_shutdown (ni); + return (-ENETDOWN); +} + +static int mxlnd_init(void) +{ + lnet_register_lnd(&the_kmxlnd); + return 0; +} + +static void mxlnd_exit(void) +{ + lnet_unregister_lnd(&the_kmxlnd); + return; +} + +module_init(mxlnd_init); +module_exit(mxlnd_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Myricom, Inc. - help@myri.com"); +MODULE_DESCRIPTION("Kernel MyrinetExpress LND"); +MODULE_VERSION("0.5.0"); diff --git a/lnet/klnds/mxlnd/mxlnd.h b/lnet/klnds/mxlnd/mxlnd.h new file mode 100644 index 0000000..28e58ca --- /dev/null +++ b/lnet/klnds/mxlnd/mxlnd.h @@ -0,0 +1,415 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * Copyright (C) 2006 Myricom, Inc. + * Author: Scott Atchley + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef EXPORT_SYMTAB +#define EXPORT_SYMTAB +#endif + +#include +#include /* module */ +#include /* module */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include /* module */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include /* these are needed for ARP */ +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LND + +#include "libcfs/kp30.h" +#include "lnet/lnet.h" +#include "lnet/lib-lnet.h" + +#define MX_KERNEL 1 +#include "mx_extensions.h" +#include "myriexpress.h" + +#if LNET_MAX_IOV > MX_MAX_SEGMENTS + #error LNET_MAX_IOV is greater than MX_MAX_SEGMENTS +#endif + +/* Using MX's 64 match bits + * We are using the match bits to specify message type and the cookie. The + * highest four bits (60-63) are reserved for message type. Below we specify + * the types. MXLND_MASK_ICON_REQ and MXLND_MASK_ICON_ACK are used for + * mx_iconnect(). We reserve the remaining combinations for future use. The + * next 8 bits (52-59) are reserved for returning a status code for failed + * GET_DATA (payload) messages. The last 52 bits are used for cookies. That + * should allow unique cookies for 4 KB messages at 10 Gbps line rate without + * rollover for about 8 years. That should be enough. */ + +/* constants */ +#define MXLND_MASK_ICON_REQ (0xBLL << 60) /* it is a mx_iconnect() completion */ +#define MXLND_MASK_CONN_REQ (0xCLL << 60) /* CONN_REQ msg */ +#define MXLND_MASK_ICON_ACK (0x9LL << 60) /* it is a mx_iconnect() completion */ +#define MXLND_MASK_CONN_ACK (0xALL << 60) /* CONN_ACK msg */ +#define MXLND_MASK_EAGER (0xELL << 60) /* EAGER msg */ +#define MXLND_MASK_NOOP (0x1LL << 60) /* NOOP msg */ +#define MXLND_MASK_PUT_REQ (0x2LL << 60) /* PUT_REQ msg */ +#define MXLND_MASK_PUT_ACK (0x3LL << 60) /* PUT_ACK msg */ +#define MXLND_MASK_PUT_DATA (0x4LL << 60) /* PUT_DATA msg */ +#define MXLND_MASK_GET_REQ (0x5LL << 60) /* GET_REQ msg */ +#define MXLND_MASK_GET_DATA (0x6LL << 60) /* GET_DATA msg */ +//#define MXLND_MASK_NAK (0x7LL << 60) /* NAK msg */ + +#define MXLND_MAX_COOKIE ((1LL << 52) - 1) /* when to roll-over the cookie value */ +#define MXLND_NCOMPLETIONS (MXLND_N_SCHED + 2) /* max threads for completion array */ + +/* defaults for configurable parameters */ +#define MXLND_N_SCHED 1 /* # schedulers (mx_wait_any() threads) */ +#define MXLND_MX_BOARD 0 /* Use the first MX NIC if more than 1 avail */ +#define MXLND_MX_EP_ID 3 /* MX endpoint ID */ +#define MXLND_COMM_TIMEOUT (20 * HZ) /* timeout for send/recv (jiffies) */ +#define MXLND_WAIT_TIMEOUT HZ /* timeout for wait (jiffies) */ +#define MXLND_POLLING 0 /* poll iterations before blocking */ +#define MXLND_MAX_PEERS 1024 /* number of nodes talking to me */ +#define MXLND_EAGER_NUM MXLND_MAX_PEERS /* number of pre-posted receives */ +#define MXLND_EAGER_SIZE PAGE_SIZE /* pre-posted eager message size */ +#define MXLND_MSG_QUEUE_DEPTH 8 /* msg queue depth */ +#define MXLND_CREDIT_HIGHWATER (MXLND_MSG_QUEUE_DEPTH - 2) + /* when to send a noop to return credits */ +#define MXLND_NTX 256 /* # of kmx_tx - total sends in flight + 1/2 are reserved for connect messages */ + +#define MXLND_HASH_BITS 6 /* the number of bits to hash over */ +#define MXLND_HASH_SIZE (1<<MXLND_HASH_BITS) + +/* module parameters */ +typedef struct kmx_tunables { + int *kmx_n_waitd; /* # completion daemons */ + int *kmx_max_peers; /* max # of peers that may connect */ + int *kmx_cksum; /* checksum small messages? */ + int *kmx_ntx; /* total # of tx message descriptors */ + int *kmx_credits; /* concurrent sends to a single peer */ + int *kmx_board; /* index value of the Myrinet board (NIC) */ + int *kmx_ep_id; /* MX endpoint ID */ + int *kmx_polling; /* 0 to block, > 0,
poll this many + iterations before blocking */ + char **kmx_hosts; /* Location of hosts file, if used */ +} kmx_tunables_t; + +/* structure to hold IP-to-hostname resolution data */ +struct kmx_host { + struct kmx_peer *mxh_peer; /* pointer to matching peer */ + u32 mxh_addr; /* IP address as int */ + char *mxh_hostname; /* peer's hostname */ + u32 mxh_board; /* peer's board rank */ + u32 mxh_ep_id; /* peer's MX endpoint ID */ + struct list_head mxh_list; /* position on kmx_hosts */ + spinlock_t mxh_lock; /* lock */ +}; + +/* global interface state */ +typedef struct kmx_data +{ + int kmx_init; /* initialization state */ + int kmx_shutdown; /* shutting down? */ + atomic_t kmx_nthreads; /* number of threads */ + struct completion *kmx_completions; /* array of completion structs */ + lnet_ni_t *kmx_ni; /* the LND instance */ + u64 kmx_incarnation; /* my incarnation value - unused */ + long kmx_mem_used; /* memory used */ + struct kmx_host *kmx_localhost; /* pointer to my kmx_host info */ + mx_endpoint_t kmx_endpt; /* the MX endpoint */ + + spinlock_t kmx_global_lock; /* global lock */ + + struct list_head kmx_conn_req; /* list of connection requests */ + spinlock_t kmx_conn_lock; /* connection list lock */ + struct semaphore kmx_conn_sem; /* semaphore for connection request list */ + + struct list_head kmx_hosts; /* host lookup info */ + spinlock_t kmx_hosts_lock; /* hosts list lock */ + + struct list_head kmx_peers[MXLND_HASH_SIZE]; + /* list of all known peers */ + rwlock_t kmx_peers_lock; /* peer list rw lock */ + atomic_t kmx_npeers; /* number of peers */ + + struct list_head kmx_txs; /* all tx descriptors */ + struct list_head kmx_tx_idle; /* list of idle tx */ + spinlock_t kmx_tx_idle_lock; /* lock for idle tx list */ + s32 kmx_tx_used; /* txs in use */ + u64 kmx_tx_next_cookie; /* unique id for tx */ + struct list_head kmx_tx_queue; /* generic send queue */ + spinlock_t kmx_tx_queue_lock; /* lock for generic sends */ + struct semaphore kmx_tx_queue_sem; /* semaphore for tx queue */ + + struct list_head kmx_rxs; /* all rx descriptors */ + spinlock_t kmx_rxs_lock; /* lock for rxs list */ + struct list_head kmx_rx_idle; /* list of idle rx */ + spinlock_t kmx_rx_idle_lock; /* lock for idle rx list */ +} kmx_data_t; + +#define MXLND_INIT_NOTHING 0 /* in the beginning, there was nothing... */ +#define MXLND_INIT_DATA 1 /* main data structures created */ +#define MXLND_INIT_TXS 2 /* tx descriptors created */ +#define MXLND_INIT_RXS 3 /* initial rx descriptors created */ +#define MXLND_INIT_MX 4 /* initialize MX library, open endpoint, get NIC id */ +#define MXLND_INIT_THREADS 5 /* waitd, timeoutd, tx_queued threads */ +#define MXLND_INIT_ALL 6 /* startup completed */ + +#include "mxlnd_wire.h" + +enum kmx_req_type { + MXLND_REQ_TX = 0, + MXLND_REQ_RX = 1, +}; + +/* The life cycle of a request */ +enum kmx_req_state { + MXLND_CTX_INIT = 0, /* just created */ + MXLND_CTX_IDLE = 1, /* available for use */ + MXLND_CTX_PREP = 2, /* getting ready for send/recv */ + MXLND_CTX_PENDING = 3, /* mx_isend() or mx_irecv() called */ + MXLND_CTX_COMPLETED = 4, /* cleaning up after completion or timeout */ + MXLND_CTX_CANCELED = 5, /* timed out but still in ctx list */ +}; + +/* Context Structure - generic tx/rx descriptor + * It represents the context (or state) of each send or receive request. + * Other LNDs have separate TX and RX descriptors; this replaces both. + * + * We will keep these on the global kmx_rxs and kmx_txs lists for cleanup + * during shutdown().
We will move them between the rx/tx idle lists and the + * pending list which is monitored by mxlnd_timeoutd(). + */ +struct kmx_ctx { + enum kmx_req_type mxc_type; /* TX or RX */ + u64 mxc_incarnation; /* store the peer's incarnation here + to verify before changing flow + control credits after completion */ + unsigned long mxc_deadline; /* request time out in absolute jiffies */ + enum kmx_req_state mxc_state; /* what is the state of the request? */ + struct list_head mxc_global_list; /* place on kmx_rxs or kmx_txs */ + struct list_head mxc_list; /* place on rx/tx idle list, tx q, peer tx */ + struct list_head mxc_rx_list; /* place on mxp_rx_posted list */ + spinlock_t mxc_lock; /* lock */ + + lnet_nid_t mxc_nid; /* dst's NID if peer is not known */ + struct kmx_peer *mxc_peer; /* owning peer */ + struct kmx_conn *mxc_conn; /* owning conn */ + struct kmx_msg *mxc_msg; /* msg hdr mapped to mxc_page */ + struct page *mxc_page; /* buffer for eager msgs */ + lnet_msg_t *mxc_lntmsg[2]; /* lnet msgs to finalize */ + + u8 mxc_msg_type; /* what type of message is this? */ + u64 mxc_cookie; /* completion cookie */ + u64 mxc_match; /* MX match info */ + mx_ksegment_t mxc_seg; /* local MX ksegment for non-DATA */ + mx_ksegment_t *mxc_seg_list; /* MX ksegment array for DATA */ + int mxc_nseg; /* number of segments */ + unsigned long mxc_pin_type; /* MX_PIN_KERNEL or MX_PIN_PHYSICAL */ + u32 mxc_nob; /* number of bytes sent/received */ + mx_request_t mxc_mxreq; /* MX request */ + mx_status_t mxc_status; /* MX status */ + s64 mxc_get; /* # of times returned from idle list */ + s64 mxc_put; /* # of times returned from idle list */ +}; + +#define MXLND_CONN_DISCONNECT -2 /* conn is being destroyed - do not add txs */ +#define MXLND_CONN_FAIL -1 /* connect failed (bad handshake, unavail, etc.) */ +#define MXLND_CONN_INIT 0 /* in the beginning, there was nothing... */ +#define MXLND_CONN_REQ 1 /* a connection request message is needed */ +#define MXLND_CONN_ACK 2 /* a connection ack is needed */ +#define MXLND_CONN_WAIT 3 /* waiting for req or ack to complete */ +#define MXLND_CONN_READY 4 /* ready to send */ + +/* connection state - queues for queued and pending msgs */ +struct kmx_conn +{ + u64 mxk_incarnation; /* connections's incarnation value */ + atomic_t mxk_refcount; /* reference counting */ + + struct kmx_peer *mxk_peer; /* owning peer */ + mx_endpoint_addr_t mxk_epa; /* peer's endpoint address */ + + struct list_head mxk_list; /* for placing on mxp_conns */ + spinlock_t mxk_lock; /* lock */ + unsigned long mxk_timeout; /* expiration of oldest pending tx/rx */ + unsigned long mxk_last_tx; /* when last tx completed with success */ + unsigned long mxk_last_rx; /* when last rx completed */ + + int mxk_credits; /* # of my credits for sending to peer */ + int mxk_outstanding; /* # of credits to return */ + + int mxk_status; /* can we send messages? 
MXLND_CONN_* */ + struct list_head mxk_tx_credit_queue; /* send queue for peer */ + struct list_head mxk_tx_free_queue; /* send queue for peer */ + int mxk_ntx_msgs; /* # of msgs on tx queues */ + int mxk_ntx_data ; /* # of DATA on tx queues */ + int mxk_ntx_posted; /* # of tx msgs in flight */ + int mxk_data_posted; /* # of tx data payloads in flight */ + + struct list_head mxk_pending; /* in flight rxs and txs */ +}; + +/* peer state */ +struct kmx_peer +{ + lnet_nid_t mxp_nid; /* peer's LNET NID */ + u64 mxp_incarnation; /* peer's incarnation value */ + atomic_t mxp_refcount; /* reference counts */ + + struct kmx_host *mxp_host; /* peer lookup info */ + u64 mxp_nic_id; /* remote's MX nic_id for mx_connect() */ + + struct list_head mxp_peers; /* for placing on kmx_peers */ + spinlock_t mxp_lock; /* lock */ + + struct list_head mxp_conns; /* list of connections */ + struct kmx_conn *mxp_conn; /* current connection */ + + unsigned long mxp_reconnect_time; /* when to retry connect */ + int mxp_incompatible; /* incorrect conn_req values */ +}; + +extern kmx_data_t kmxlnd_data; +extern kmx_tunables_t kmxlnd_tunables; + +/* required for the LNET API */ +int mxlnd_startup(lnet_ni_t *ni); +void mxlnd_shutdown(lnet_ni_t *ni); +int mxlnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); +int mxlnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int mxlnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, + unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); + +/* in mxlnd.c */ +extern void mxlnd_thread_stop(long id); +extern int mxlnd_ctx_alloc(struct kmx_ctx **ctxp, enum kmx_req_type type); +extern void mxlnd_ctx_free(struct kmx_ctx *ctx); +extern void mxlnd_ctx_init(struct kmx_ctx *ctx); +extern lnet_nid_t mxlnd_nic_id2nid(lnet_ni_t *ni, u64 nic_id); +extern u64 mxlnd_nid2nic_id(lnet_nid_t nid); + +/* in mxlnd_cb.c */ +void mxlnd_eager_recv(void *context, uint64_t match_value, uint32_t length); +extern mx_unexp_handler_action_t mxlnd_unexpected_recv(void *context, + mx_endpoint_addr_t source, uint64_t match_value, uint32_t length, + void *data_if_available); +extern void mxlnd_peer_free(struct kmx_peer *peer); +extern void mxlnd_conn_free(struct kmx_conn *conn); +extern void mxlnd_sleep(unsigned long timeout); +extern int mxlnd_tx_queued(void *arg); +extern void mxlnd_handle_rx_completion(struct kmx_ctx *rx); +extern int mxlnd_check_sends(struct kmx_peer *peer); +extern int mxlnd_tx_peer_queued(void *arg); +extern int mxlnd_request_waitd(void *arg); +extern int mxlnd_unex_recvd(void *arg); +extern int mxlnd_timeoutd(void *arg); +extern int mxlnd_connd(void *arg); + +#define mxlnd_peer_addref(peer) \ +do { \ + LASSERT(atomic_read(&(peer)->mxp_refcount) > 0); \ + atomic_inc(&(peer)->mxp_refcount); \ +} while (0) + + +#define mxlnd_peer_decref(peer) \ +do { \ + LASSERT(atomic_read(&(peer)->mxp_refcount) > 0); \ + if (atomic_dec_and_test(&(peer)->mxp_refcount)) \ + mxlnd_peer_free(peer); \ +} while (0) + +#define mxlnd_conn_addref(conn) \ +do { \ + LASSERT(atomic_read(&(conn)->mxk_refcount) > 0); \ + atomic_inc(&(conn)->mxk_refcount); \ +} while (0) + + +#define mxlnd_conn_decref(conn) \ +do { \ + LASSERT(atomic_read(&(conn)->mxk_refcount) > 0); \ + if (atomic_dec_and_test(&(conn)->mxk_refcount)) \ + mxlnd_conn_free(conn); \ +} while (0) diff --git a/lnet/klnds/mxlnd/mxlnd_cb.c b/lnet/klnds/mxlnd/mxlnd_cb.c new file mode 100644 index 0000000..09d0c0b --- /dev/null +++ b/lnet/klnds/mxlnd/mxlnd_cb.c @@ -0,0 
+1,3437 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * Copyright (C) 2006 Myricom, Inc. + * Author: Myricom, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "mxlnd.h" + +inline void mxlnd_noop(char *s, ...) +{ + return; +} + +char * +mxlnd_ctxstate_to_str(int mxc_state) +{ + switch (mxc_state) { + case MXLND_CTX_INIT: + return "MXLND_CTX_INIT"; + case MXLND_CTX_IDLE: + return "MXLND_CTX_IDLE"; + case MXLND_CTX_PREP: + return "MXLND_CTX_PREP"; + case MXLND_CTX_PENDING: + return "MXLND_CTX_PENDING"; + case MXLND_CTX_COMPLETED: + return "MXLND_CTX_COMPLETED"; + case MXLND_CTX_CANCELED: + return "MXLND_CTX_CANCELED"; + default: + return "*unknown*"; + } +} + +char * +mxlnd_connstatus_to_str(int mxk_status) +{ + switch (mxk_status) { + case MXLND_CONN_READY: + return "MXLND_CONN_READY"; + case MXLND_CONN_INIT: + return "MXLND_CONN_INIT"; + case MXLND_CONN_REQ: + return "MXLND_CONN_REQ"; + case MXLND_CONN_ACK: + return "MXLND_CONN_ACK"; + case MXLND_CONN_WAIT: + return "MXLND_CONN_WAIT"; + case MXLND_CONN_DISCONNECT: + return "MXLND_CONN_DISCONNECT"; + case MXLND_CONN_FAIL: + return "MXLND_CONN_FAIL"; + default: + return "unknown"; + } +} + +char * +mxlnd_msgtype_to_str(int type) { + switch (type) { + case MXLND_MSG_EAGER: + return "MXLND_MSG_EAGER"; + case MXLND_MSG_CONN_REQ: + return "MXLND_MSG_CONN_REQ"; + case MXLND_MSG_CONN_ACK: + return "MXLND_MSG_CONN_ACK"; + case MXLND_MSG_NOOP: + return "MXLND_MSG_NOOP"; + case MXLND_MSG_PUT_REQ: + return "MXLND_MSG_PUT_REQ"; + case MXLND_MSG_PUT_ACK: + return "MXLND_MSG_PUT_ACK"; + case MXLND_MSG_PUT_DATA: + return "MXLND_MSG_PUT_DATA"; + case MXLND_MSG_GET_REQ: + return "MXLND_MSG_GET_REQ"; + case MXLND_MSG_GET_DATA: + return "MXLND_MSG_GET_DATA"; + default: + return "unknown"; + } +} + +char * +mxlnd_lnetmsg_to_str(int type) +{ + switch (type) { + case LNET_MSG_ACK: + return "LNET_MSG_ACK"; + case LNET_MSG_PUT: + return "LNET_MSG_PUT"; + case LNET_MSG_GET: + return "LNET_MSG_GET"; + case LNET_MSG_REPLY: + return "LNET_MSG_REPLY"; + case LNET_MSG_HELLO: + return "LNET_MSG_HELLO"; + default: + return "*unknown*"; + } +} + +static inline u64 +//mxlnd_create_match(u8 msg_type, u8 error, u64 cookie) +mxlnd_create_match(struct kmx_ctx *ctx, u8 error) +{ + u64 type = (u64) ctx->mxc_msg_type; + u64 err = (u64) error; + u64 match = 0LL; + + LASSERT(ctx->mxc_msg_type != 0); + LASSERT(ctx->mxc_cookie >> 52 == 0); + match = (type << 60) | (err << 52) | ctx->mxc_cookie; + return match; +} + +static inline void +mxlnd_parse_match(u64 match, u8 *msg_type, u8 *error, u64 *cookie) +{ + *msg_type = (u8) (match >> 60); + *error = (u8) ((match >> 52) & 0xFF); + *cookie = match & 0xFFFFFFFFFFFFFLL; + LASSERT(match == (MXLND_MASK_ICON_REQ & 0xF000000000000000LL) || + match == 
(MXLND_MASK_ICON_ACK & 0xF000000000000000LL) || + *msg_type == MXLND_MSG_EAGER || + *msg_type == MXLND_MSG_CONN_REQ || + *msg_type == MXLND_MSG_CONN_ACK || + *msg_type == MXLND_MSG_NOOP || + *msg_type == MXLND_MSG_PUT_REQ || + *msg_type == MXLND_MSG_PUT_ACK || + *msg_type == MXLND_MSG_PUT_DATA || + *msg_type == MXLND_MSG_GET_REQ || + *msg_type == MXLND_MSG_GET_DATA); + return; +} + +struct kmx_ctx * +mxlnd_get_idle_rx(void) +{ + struct list_head *tmp = NULL; + struct kmx_ctx *rx = NULL; + + spin_lock(&kmxlnd_data.kmx_rx_idle_lock); + + if (list_empty (&kmxlnd_data.kmx_rx_idle)) { + spin_unlock(&kmxlnd_data.kmx_rx_idle_lock); + return NULL; + } + + tmp = &kmxlnd_data.kmx_rx_idle; + rx = list_entry (tmp->next, struct kmx_ctx, mxc_list); + list_del_init(&rx->mxc_list); + spin_unlock(&kmxlnd_data.kmx_rx_idle_lock); + +#if MXLND_DEBUG + if (rx->mxc_get != rx->mxc_put) { + CDEBUG(D_NETERROR, "*** RX get (%lld) != put (%lld) ***\n", rx->mxc_get, rx->mxc_put); + CDEBUG(D_NETERROR, "*** incarnation= %lld ***\n", rx->mxc_incarnation); + CDEBUG(D_NETERROR, "*** deadline= %ld ***\n", rx->mxc_deadline); + CDEBUG(D_NETERROR, "*** state= %s ***\n", mxlnd_ctxstate_to_str(rx->mxc_state)); + CDEBUG(D_NETERROR, "*** listed?= %d ***\n", !list_empty(&rx->mxc_list)); + CDEBUG(D_NETERROR, "*** nid= 0x%llx ***\n", rx->mxc_nid); + CDEBUG(D_NETERROR, "*** peer= 0x%p ***\n", rx->mxc_peer); + CDEBUG(D_NETERROR, "*** msg_type= %s ***\n", mxlnd_msgtype_to_str(rx->mxc_msg_type)); + CDEBUG(D_NETERROR, "*** cookie= 0x%llx ***\n", rx->mxc_cookie); + CDEBUG(D_NETERROR, "*** nob= %d ***\n", rx->mxc_nob); + } +#endif + LASSERT (rx->mxc_get == rx->mxc_put); + + rx->mxc_get++; + + LASSERT (rx->mxc_state == MXLND_CTX_IDLE); + rx->mxc_state = MXLND_CTX_PREP; + + return rx; +} + +int +mxlnd_put_idle_rx(struct kmx_ctx *rx) +{ + if (rx == NULL) { + CDEBUG(D_NETERROR, "called with NULL pointer\n"); + return -EINVAL; + } else if (rx->mxc_type != MXLND_REQ_RX) { + CDEBUG(D_NETERROR, "called with tx\n"); + return -EINVAL; + } + LASSERT(rx->mxc_get == rx->mxc_put + 1); + mxlnd_ctx_init(rx); + rx->mxc_put++; + spin_lock(&kmxlnd_data.kmx_rx_idle_lock); + list_add_tail(&rx->mxc_list, &kmxlnd_data.kmx_rx_idle); + spin_unlock(&kmxlnd_data.kmx_rx_idle_lock); + return 0; +} + +int +mxlnd_reduce_idle_rxs(__u32 count) +{ + __u32 i = 0; + struct kmx_ctx *rx = NULL; + + spin_lock(&kmxlnd_data.kmx_rxs_lock); + for (i = 0; i < count; i++) { + rx = mxlnd_get_idle_rx(); + if (rx != NULL) { + struct list_head *tmp = &rx->mxc_global_list; + list_del_init(tmp); + mxlnd_ctx_free(rx); + } else { + CDEBUG(D_NETERROR, "only reduced %d out of %d rxs\n", i, count); + break; + } + } + spin_unlock(&kmxlnd_data.kmx_rxs_lock); + return 0; +} + +struct kmx_ctx * +mxlnd_get_idle_tx(void) +{ + struct list_head *tmp = NULL; + struct kmx_ctx *tx = NULL; + + spin_lock(&kmxlnd_data.kmx_tx_idle_lock); + + if (list_empty (&kmxlnd_data.kmx_tx_idle)) { + CDEBUG(D_NETERROR, "%d txs in use\n", kmxlnd_data.kmx_tx_used); + spin_unlock(&kmxlnd_data.kmx_tx_idle_lock); + return NULL; + } + + tmp = &kmxlnd_data.kmx_tx_idle; + tx = list_entry (tmp->next, struct kmx_ctx, mxc_list); + list_del_init(&tx->mxc_list); + + /* Allocate a new completion cookie. It might not be needed, + * but we've got a lock right now and we're unlikely to + * wrap... 
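 (cookies occupy the low 52 bits of the MX match bits, so the counter wraps only after MXLND_MAX_COOKIE = 2^52 - 1 values; the wrap is handled below by restarting at 1)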
*/ + tx->mxc_cookie = kmxlnd_data.kmx_tx_next_cookie++; + if (kmxlnd_data.kmx_tx_next_cookie > MXLND_MAX_COOKIE) { + tx->mxc_cookie = 1; + } + kmxlnd_data.kmx_tx_used++; + spin_unlock(&kmxlnd_data.kmx_tx_idle_lock); + + LASSERT (tx->mxc_get == tx->mxc_put); + + tx->mxc_get++; + + LASSERT (tx->mxc_state == MXLND_CTX_IDLE); + LASSERT (tx->mxc_lntmsg[0] == NULL); + LASSERT (tx->mxc_lntmsg[1] == NULL); + + tx->mxc_state = MXLND_CTX_PREP; + + return tx; +} + +int +mxlnd_put_idle_tx(struct kmx_ctx *tx) +{ + int failed = (tx->mxc_status.code != MX_STATUS_SUCCESS && tx->mxc_status.code != MX_STATUS_TRUNCATED); + int result = failed ? -EIO : 0; + lnet_msg_t *lntmsg[2]; + + if (tx == NULL) { + CDEBUG(D_NETERROR, "called with NULL pointer\n"); + return -EINVAL; + } else if (tx->mxc_type != MXLND_REQ_TX) { + CDEBUG(D_NETERROR, "called with rx\n"); + return -EINVAL; + } + + lntmsg[0] = tx->mxc_lntmsg[0]; + lntmsg[1] = tx->mxc_lntmsg[1]; + + LASSERT(tx->mxc_get == tx->mxc_put + 1); + mxlnd_ctx_init(tx); + tx->mxc_put++; + spin_lock(&kmxlnd_data.kmx_tx_idle_lock); + list_add_tail(&tx->mxc_list, &kmxlnd_data.kmx_tx_idle); + kmxlnd_data.kmx_tx_used--; + spin_unlock(&kmxlnd_data.kmx_tx_idle_lock); + if (lntmsg[0] != NULL) lnet_finalize(kmxlnd_data.kmx_ni, lntmsg[0], result); + if (lntmsg[1] != NULL) lnet_finalize(kmxlnd_data.kmx_ni, lntmsg[1], result); + return 0; +} + +/** + * mxlnd_conn_free - free the conn + * @conn - a kmx_conn pointer + * + * The calling function should remove the conn from the conns list first + * then destroy it. + */ +void +mxlnd_conn_free(struct kmx_conn *conn) +{ + struct kmx_peer *peer = conn->mxk_peer; + + CDEBUG(D_NET, "freeing conn 0x%p *****\n", conn); + LASSERT (list_empty (&conn->mxk_tx_credit_queue) && + list_empty (&conn->mxk_tx_free_queue) && + list_empty (&conn->mxk_pending)); + if (!list_empty(&conn->mxk_list)) { + spin_lock(&peer->mxp_lock); + list_del_init(&conn->mxk_list); + if (peer->mxp_conn == conn) { + peer->mxp_conn = NULL; + if (!(conn->mxk_epa.stuff[0] == 0 && conn->mxk_epa.stuff[1] == 0)) { + mx_set_endpoint_addr_context(conn->mxk_epa, + (void *) NULL); + } + } + spin_unlock(&peer->mxp_lock); + } + mxlnd_peer_decref(conn->mxk_peer); /* drop conn's ref to peer */ + MXLND_FREE (conn, sizeof (*conn)); + return; +} + + +void +mxlnd_conn_cancel_pending_rxs(struct kmx_conn *conn) +{ + int found = 0; + struct kmx_ctx *ctx = NULL; + struct kmx_ctx *next = NULL; + mx_return_t mxret = MX_SUCCESS; + u32 result = 0; + + do { + found = 0; + spin_lock(&conn->mxk_lock); + list_for_each_entry_safe(ctx, next, &conn->mxk_pending, mxc_list) { + /* we will delete all including txs */ + list_del_init(&ctx->mxc_list); + if (ctx->mxc_type == MXLND_REQ_RX) { + found = 1; + mxret = mx_cancel(kmxlnd_data.kmx_endpt, + &ctx->mxc_mxreq, + &result); + if (mxret != MX_SUCCESS) { + CDEBUG(D_NETERROR, "mx_cancel() returned %s (%d)\n", mx_strerror(mxret), mxret); + } + if (result == 1) { + ctx->mxc_status.code = -ECONNABORTED; + ctx->mxc_state = MXLND_CTX_CANCELED; + /* NOTE this calls lnet_finalize() and + * we cannot hold any locks when calling it. 
+ * It also calls mxlnd_conn_decref(conn) */ + spin_unlock(&conn->mxk_lock); + mxlnd_handle_rx_completion(ctx); + spin_lock(&conn->mxk_lock); + } + break; + } + } + spin_unlock(&conn->mxk_lock); + } + while (found); + + return; +} + +/** + * mxlnd_conn_disconnect - shutdown a connection + * @conn - a kmx_conn pointer + * + * This function sets the status to DISCONNECT, completes queued + * txs with failure, calls mx_disconnect, which will complete + * pending txs and matched rxs with failure. + */ +void +mxlnd_conn_disconnect(struct kmx_conn *conn, int mx_dis, int notify) +{ + struct list_head *tmp = NULL; + + spin_lock(&conn->mxk_lock); + if (conn->mxk_status == MXLND_CONN_DISCONNECT) { + spin_unlock(&conn->mxk_lock); + return; + } + conn->mxk_status = MXLND_CONN_DISCONNECT; + conn->mxk_timeout = 0; + + while (!list_empty(&conn->mxk_tx_free_queue) || + !list_empty(&conn->mxk_tx_credit_queue)) { + + struct kmx_ctx *tx = NULL; + + if (!list_empty(&conn->mxk_tx_free_queue)) { + tmp = &conn->mxk_tx_free_queue; + } else { + tmp = &conn->mxk_tx_credit_queue; + } + + tx = list_entry(tmp->next, struct kmx_ctx, mxc_list); + list_del_init(&tx->mxc_list); + tx->mxc_status.code = -ECONNABORTED; + spin_unlock(&conn->mxk_lock); + mxlnd_put_idle_tx(tx); + mxlnd_conn_decref(conn); /* for this tx */ + spin_lock(&conn->mxk_lock); + } + + spin_unlock(&conn->mxk_lock); + + /* cancel pending rxs */ + mxlnd_conn_cancel_pending_rxs(conn); + + if (kmxlnd_data.kmx_shutdown != 1) { + + if (mx_dis) mx_disconnect(kmxlnd_data.kmx_endpt, conn->mxk_epa); + + if (notify) { + time_t last_alive = 0; + unsigned long last_msg = 0; + + /* notify LNET that we are giving up on this peer */ + if (time_after(conn->mxk_last_rx, conn->mxk_last_tx)) { + last_msg = conn->mxk_last_rx; + } else { + last_msg = conn->mxk_last_tx; + } + last_alive = cfs_time_current_sec() - + cfs_duration_sec(cfs_time_current() - last_msg); + lnet_notify(kmxlnd_data.kmx_ni, conn->mxk_peer->mxp_nid, 0, last_alive); + } + } + mxlnd_conn_decref(conn); /* drop the owning peer's reference */ + + return; +} + +/** + * mxlnd_conn_alloc - allocate and initialize a new conn struct + * @connp - address of a kmx_conn pointer + * @peer - owning kmx_peer + * + * Returns 0 on success and -ENOMEM on failure + */ +int +mxlnd_conn_alloc(struct kmx_conn **connp, struct kmx_peer *peer) +{ + struct kmx_conn *conn = NULL; + + LASSERT(peer != NULL); + + MXLND_ALLOC(conn, sizeof (*conn)); + if (conn == NULL) { + CDEBUG(D_NETERROR, "Cannot allocate conn\n"); + return -ENOMEM; + } + CDEBUG(D_NET, "allocated conn 0x%p for peer 0x%p\n", conn, peer); + + memset(conn, 0, sizeof(*conn)); + + /* conn->mxk_incarnation = 0 - will be set by peer */ + atomic_set(&conn->mxk_refcount, 1); /* ref for owning peer */ + conn->mxk_peer = peer; + /* mxk_epa - to be set after mx_iconnect() */ + INIT_LIST_HEAD(&conn->mxk_list); + spin_lock_init(&conn->mxk_lock); + /* conn->mxk_timeout = 0 */ + conn->mxk_last_tx = jiffies; + conn->mxk_last_rx = conn->mxk_last_tx; + conn->mxk_credits = *kmxlnd_tunables.kmx_credits; + /* mxk_outstanding = 0 */ + conn->mxk_status = MXLND_CONN_INIT; + INIT_LIST_HEAD(&conn->mxk_tx_credit_queue); + INIT_LIST_HEAD(&conn->mxk_tx_free_queue); + /* conn->mxk_ntx_msgs = 0 */ + /* conn->mxk_ntx_data = 0 */ + /* conn->mxk_ntx_posted = 0 */ + /* conn->mxk_data_posted = 0 */ + INIT_LIST_HEAD(&conn->mxk_pending); + + *connp = conn; + + mxlnd_peer_addref(peer); /* add a ref for this conn */ + + /* add to front of peer's conns list */ + spin_lock(&peer->mxp_lock); + 
list_add(&conn->mxk_list, &peer->mxp_conns);
+        peer->mxp_conn = conn;
+        spin_unlock(&peer->mxp_lock);
+        return 0;
+}
+
+
+int
+mxlnd_q_pending_ctx(struct kmx_ctx *ctx)
+{
+        int ret = 0;
+        struct kmx_conn *conn = ctx->mxc_conn;
+
+        ctx->mxc_state = MXLND_CTX_PENDING;
+        if (conn != NULL) {
+                spin_lock(&conn->mxk_lock);
+                if (conn->mxk_status >= MXLND_CONN_INIT) {
+                        list_add_tail(&ctx->mxc_list, &conn->mxk_pending);
+                        if (conn->mxk_timeout == 0 || ctx->mxc_deadline < conn->mxk_timeout) {
+                                conn->mxk_timeout = ctx->mxc_deadline;
+                        }
+                } else {
+                        ctx->mxc_state = MXLND_CTX_COMPLETED;
+                        ret = -1;
+                }
+                spin_unlock(&conn->mxk_lock);
+        }
+        return ret;
+}
+
+int
+mxlnd_deq_pending_ctx(struct kmx_ctx *ctx)
+{
+        LASSERT(ctx->mxc_state == MXLND_CTX_PENDING ||
+                ctx->mxc_state == MXLND_CTX_COMPLETED);
+        if (ctx->mxc_state != MXLND_CTX_PENDING &&
+            ctx->mxc_state != MXLND_CTX_COMPLETED) {
+                CDEBUG(D_NETERROR, "deq ctx->mxc_state = %s\n",
+                       mxlnd_ctxstate_to_str(ctx->mxc_state));
+        }
+        ctx->mxc_state = MXLND_CTX_COMPLETED;
+        if (!list_empty(&ctx->mxc_list)) {
+                struct kmx_conn *conn = ctx->mxc_conn;
+                struct kmx_ctx *next = NULL;
+                LASSERT(conn != NULL);
+                spin_lock(&conn->mxk_lock);
+                list_del_init(&ctx->mxc_list);
+                conn->mxk_timeout = 0;
+                if (!list_empty(&conn->mxk_pending)) {
+                        next = list_entry(conn->mxk_pending.next, struct kmx_ctx, mxc_list);
+                        conn->mxk_timeout = next->mxc_deadline;
+                }
+                spin_unlock(&conn->mxk_lock);
+        }
+        return 0;
+}
+
+/**
+ * mxlnd_peer_free - free the peer
+ * @peer - a kmx_peer pointer
+ *
+ * The calling function should return the rxs, drain the tx queues and
+ * remove the peer from the peers list first, then destroy it.
+ */
+void
+mxlnd_peer_free(struct kmx_peer *peer)
+{
+        CDEBUG(D_NET, "freeing peer 0x%p\n", peer);
+
+        LASSERT (atomic_read(&peer->mxp_refcount) == 0);
+
+        if (peer->mxp_host != NULL) {
+                spin_lock(&peer->mxp_host->mxh_lock);
+                peer->mxp_host->mxh_peer = NULL;
+                spin_unlock(&peer->mxp_host->mxh_lock);
+        }
+        if (!list_empty(&peer->mxp_peers)) {
+                /* assume we are locked */
+                list_del_init(&peer->mxp_peers);
+        }
+
+        MXLND_FREE (peer, sizeof (*peer));
+        atomic_dec(&kmxlnd_data.kmx_npeers);
+        return;
+}
+
+void
+mxlnd_peer_hostname_to_nic_id(struct kmx_peer *peer)
+{
+        u64 nic_id = 0LL;
+        char name[MX_MAX_HOSTNAME_LEN + 1];
+        mx_return_t mxret = MX_SUCCESS;
+
+        memset(name, 0, sizeof(name));
+        snprintf(name, sizeof(name), "%s:%d", peer->mxp_host->mxh_hostname, peer->mxp_host->mxh_board);
+        mxret = mx_hostname_to_nic_id(name, &nic_id);
+        if (mxret == MX_SUCCESS) {
+                peer->mxp_nic_id = nic_id;
+        } else {
+                CDEBUG(D_NETERROR, "mx_hostname_to_nic_id() failed for %s "
+                       "with %s\n", name, mx_strerror(mxret));
+                mxret = mx_hostname_to_nic_id(peer->mxp_host->mxh_hostname, &nic_id);
+                if (mxret == MX_SUCCESS) {
+                        peer->mxp_nic_id = nic_id;
+                } else {
+                        CDEBUG(D_NETERROR, "mx_hostname_to_nic_id() failed for %s "
+                               "with %s\n", peer->mxp_host->mxh_hostname,
+                               mx_strerror(mxret));
+                }
+        }
+        return;
+}
+
+/**
+ * mxlnd_peer_alloc - allocate and initialize a new peer struct
+ * @peerp - address of a kmx_peer pointer
+ * @nid - LNET node id
+ *
+ * Returns 0 on success and -ENOMEM on failure
+ */
+int
+mxlnd_peer_alloc(struct kmx_peer **peerp, lnet_nid_t nid)
+{
+        int i = 0;
+        int ret = 0;
+        u32 addr = LNET_NIDADDR(nid);
+        struct kmx_peer *peer = NULL;
+        struct kmx_host *host = NULL;
+
+        LASSERT (nid != LNET_NID_ANY && nid != 0LL);
+
+        MXLND_ALLOC(peer, sizeof (*peer));
+        if (peer == NULL) {
+                CDEBUG(D_NETERROR, "Cannot allocate peer for NID 0x%llx\n", nid);
+ return -ENOMEM; + } + CDEBUG(D_NET, "allocated peer 0x%p for NID 0x%llx\n", peer, nid); + + memset(peer, 0, sizeof(*peer)); + + list_for_each_entry(host, &kmxlnd_data.kmx_hosts, mxh_list) { + if (addr == host->mxh_addr) { + peer->mxp_host = host; + spin_lock(&host->mxh_lock); + host->mxh_peer = peer; + spin_unlock(&host->mxh_lock); + break; + } + } + LASSERT(peer->mxp_host != NULL); + + peer->mxp_nid = nid; + /* peer->mxp_incarnation */ + atomic_set(&peer->mxp_refcount, 1); /* ref for kmx_peers list */ + mxlnd_peer_hostname_to_nic_id(peer); + + INIT_LIST_HEAD(&peer->mxp_peers); + spin_lock_init(&peer->mxp_lock); + INIT_LIST_HEAD(&peer->mxp_conns); + ret = mxlnd_conn_alloc(&peer->mxp_conn, peer); + if (ret != 0) { + mxlnd_peer_decref(peer); + return ret; + } + + for (i = 0; i < *kmxlnd_tunables.kmx_credits - 1; i++) { + struct kmx_ctx *rx = NULL; + ret = mxlnd_ctx_alloc(&rx, MXLND_REQ_RX); + if (ret != 0) { + mxlnd_reduce_idle_rxs(i); + mxlnd_peer_decref(peer); + return ret; + } + spin_lock(&kmxlnd_data.kmx_rxs_lock); + list_add_tail(&rx->mxc_global_list, &kmxlnd_data.kmx_rxs); + spin_unlock(&kmxlnd_data.kmx_rxs_lock); + rx->mxc_put = -1; + mxlnd_put_idle_rx(rx); + } + /* peer->mxp_reconnect_time = 0 */ + /* peer->mxp_incompatible = 0 */ + + *peerp = peer; + return 0; +} + +/** + * mxlnd_nid_to_hash - hash the nid + * @nid - msg pointer + * + * Takes the u64 nid and XORs the lowest N bits by the next lowest N bits. + */ +static inline int +mxlnd_nid_to_hash(lnet_nid_t nid) +{ + return (nid & MXLND_HASH_MASK) ^ + ((nid & (MXLND_HASH_MASK << MXLND_HASH_BITS)) >> MXLND_HASH_BITS); +} + +static inline struct kmx_peer * +mxlnd_find_peer_by_nid(lnet_nid_t nid) +{ + int found = 0; + int hash = 0; + struct kmx_peer *peer = NULL; + + hash = mxlnd_nid_to_hash(nid); + + read_lock(&kmxlnd_data.kmx_peers_lock); + list_for_each_entry(peer, &kmxlnd_data.kmx_peers[hash], mxp_peers) { + if (peer->mxp_nid == nid) { + found = 1; + break; + } + } + read_unlock(&kmxlnd_data.kmx_peers_lock); + return (found ? peer : NULL); +} + +static inline int +mxlnd_tx_requires_credit(struct kmx_ctx *tx) +{ + return (tx->mxc_msg_type == MXLND_MSG_EAGER || + tx->mxc_msg_type == MXLND_MSG_GET_REQ || + tx->mxc_msg_type == MXLND_MSG_PUT_REQ || + tx->mxc_msg_type == MXLND_MSG_NOOP); +} + +/** + * mxlnd_init_msg - set type and number of bytes + * @msg - msg pointer + * @type - of message + * @body_nob - bytes in msg body + */ +static inline void +mxlnd_init_msg(kmx_msg_t *msg, u8 type, int body_nob) +{ + msg->mxm_type = type; + msg->mxm_nob = offsetof(kmx_msg_t, mxm_u) + body_nob; +} + +static inline void +mxlnd_init_tx_msg (struct kmx_ctx *tx, u8 type, int body_nob, lnet_nid_t nid) +{ + int nob = offsetof (kmx_msg_t, mxm_u) + body_nob; + struct kmx_msg *msg = NULL; + + LASSERT (tx != NULL); + LASSERT (nob <= MXLND_EAGER_SIZE); + + tx->mxc_nid = nid; + /* tx->mxc_peer should have already been set if we know it */ + tx->mxc_msg_type = type; + tx->mxc_nseg = 1; + /* tx->mxc_seg.segment_ptr is already pointing to mxc_page */ + tx->mxc_seg.segment_length = nob; + tx->mxc_pin_type = MX_PIN_PHYSICAL; + //tx->mxc_state = MXLND_CTX_PENDING; + + msg = tx->mxc_msg; + msg->mxm_type = type; + msg->mxm_nob = nob; + + return; +} + +static inline __u32 +mxlnd_cksum (void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; + + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + /* ensure I don't return 0 (== no checksum) */ + return (sum == 0) ? 
1 : sum; +} + +/** + * mxlnd_pack_msg - complete msg info + * @tx - msg to send + */ +static inline void +mxlnd_pack_msg(struct kmx_ctx *tx) +{ + struct kmx_msg *msg = tx->mxc_msg; + + /* type and nob should already be set in init_msg() */ + msg->mxm_magic = MXLND_MSG_MAGIC; + msg->mxm_version = MXLND_MSG_VERSION; + /* mxm_type */ + /* don't use mxlnd_tx_requires_credit() since we want PUT_ACK to + * return credits as well */ + if (tx->mxc_msg_type != MXLND_MSG_CONN_REQ && + tx->mxc_msg_type != MXLND_MSG_CONN_ACK) { + spin_lock(&tx->mxc_conn->mxk_lock); + msg->mxm_credits = tx->mxc_conn->mxk_outstanding; + tx->mxc_conn->mxk_outstanding = 0; + spin_unlock(&tx->mxc_conn->mxk_lock); + } else { + msg->mxm_credits = 0; + } + /* mxm_nob */ + msg->mxm_cksum = 0; + msg->mxm_srcnid = lnet_ptlcompat_srcnid(kmxlnd_data.kmx_ni->ni_nid, tx->mxc_nid); + msg->mxm_srcstamp = kmxlnd_data.kmx_incarnation; + msg->mxm_dstnid = tx->mxc_nid; + /* if it is a new peer, the dststamp will be 0 */ + msg->mxm_dststamp = tx->mxc_conn->mxk_incarnation; + msg->mxm_seq = tx->mxc_cookie; + + if (*kmxlnd_tunables.kmx_cksum) { + msg->mxm_cksum = mxlnd_cksum(msg, msg->mxm_nob); + } +} + +int +mxlnd_unpack_msg(kmx_msg_t *msg, int nob) +{ + const int hdr_size = offsetof(kmx_msg_t, mxm_u); + __u32 msg_cksum = 0; + int flip = 0; + int msg_nob = 0; + + /* 6 bytes are enough to have received magic + version */ + if (nob < 6) { + CDEBUG(D_NETERROR, "not enough bytes for magic + hdr: %d\n", nob); + return -EPROTO; + } + + if (msg->mxm_magic == MXLND_MSG_MAGIC) { + flip = 0; + } else if (msg->mxm_magic == __swab32(MXLND_MSG_MAGIC)) { + flip = 1; + } else { + CDEBUG(D_NETERROR, "Bad magic: %08x\n", msg->mxm_magic); + return -EPROTO; + } + + if (msg->mxm_version != + (flip ? __swab16(MXLND_MSG_VERSION) : MXLND_MSG_VERSION)) { + CDEBUG(D_NETERROR, "Bad version: %d\n", msg->mxm_version); + return -EPROTO; + } + + if (nob < hdr_size) { + CDEBUG(D_NETERROR, "not enough for a header: %d\n", nob); + return -EPROTO; + } + + msg_nob = flip ? __swab32(msg->mxm_nob) : msg->mxm_nob; + if (msg_nob > nob) { + CDEBUG(D_NETERROR, "Short message: got %d, wanted %d\n", nob, msg_nob); + return -EPROTO; + } + + /* checksum must be computed with mxm_cksum zero and BEFORE anything + * gets flipped */ + msg_cksum = flip ? 
__swab32(msg->mxm_cksum) : msg->mxm_cksum; + msg->mxm_cksum = 0; + if (msg_cksum != 0 && msg_cksum != mxlnd_cksum(msg, msg_nob)) { + CDEBUG(D_NETERROR, "Bad checksum\n"); + return -EPROTO; + } + msg->mxm_cksum = msg_cksum; + + if (flip) { + /* leave magic unflipped as a clue to peer endianness */ + __swab16s(&msg->mxm_version); + CLASSERT (sizeof(msg->mxm_type) == 1); + CLASSERT (sizeof(msg->mxm_credits) == 1); + msg->mxm_nob = msg_nob; + __swab64s(&msg->mxm_srcnid); + __swab64s(&msg->mxm_srcstamp); + __swab64s(&msg->mxm_dstnid); + __swab64s(&msg->mxm_dststamp); + __swab64s(&msg->mxm_seq); + } + + if (msg->mxm_srcnid == LNET_NID_ANY) { + CDEBUG(D_NETERROR, "Bad src nid: %s\n", libcfs_nid2str(msg->mxm_srcnid)); + return -EPROTO; + } + + switch (msg->mxm_type) { + default: + CDEBUG(D_NETERROR, "Unknown message type %x\n", msg->mxm_type); + return -EPROTO; + + case MXLND_MSG_NOOP: + break; + + case MXLND_MSG_EAGER: + if (msg_nob < offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[0])) { + CDEBUG(D_NETERROR, "Short EAGER: %d(%d)\n", msg_nob, + (int)offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[0])); + return -EPROTO; + } + break; + + case MXLND_MSG_PUT_REQ: + if (msg_nob < hdr_size + sizeof(msg->mxm_u.put_req)) { + CDEBUG(D_NETERROR, "Short PUT_REQ: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->mxm_u.put_req))); + return -EPROTO; + } + if (flip) + __swab64s(&msg->mxm_u.put_req.mxprm_cookie); + break; + + case MXLND_MSG_PUT_ACK: + if (msg_nob < hdr_size + sizeof(msg->mxm_u.put_ack)) { + CDEBUG(D_NETERROR, "Short PUT_ACK: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->mxm_u.put_ack))); + return -EPROTO; + } + if (flip) { + __swab64s(&msg->mxm_u.put_ack.mxpam_src_cookie); + __swab64s(&msg->mxm_u.put_ack.mxpam_dst_cookie); + } + break; + + case MXLND_MSG_GET_REQ: + if (msg_nob < hdr_size + sizeof(msg->mxm_u.get_req)) { + CDEBUG(D_NETERROR, "Short GET_REQ: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->mxm_u.get_req))); + return -EPROTO; + } + if (flip) { + __swab64s(&msg->mxm_u.get_req.mxgrm_cookie); + } + break; + + case MXLND_MSG_CONN_REQ: + case MXLND_MSG_CONN_ACK: + if (msg_nob < hdr_size + sizeof(msg->mxm_u.conn_req)) { + CDEBUG(D_NETERROR, "Short connreq/ack: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->mxm_u.conn_req))); + return -EPROTO; + } + if (flip) { + __swab32s(&msg->mxm_u.conn_req.mxcrm_queue_depth); + __swab32s(&msg->mxm_u.conn_req.mxcrm_eager_size); + } + break; + } + return 0; +} + +/** + * mxlnd_recv_msg + * @lntmsg - the LNET msg that this is continuing. If EAGER, then NULL. + * @rx + * @msg_type + * @cookie + * @length - length of incoming message + * @pending - add to kmx_pending (0 is NO and 1 is YES) + * + * The caller gets the rx and sets nid, peer and conn if known. + * + * Returns 0 on success and -1 on failure + */ +int +mxlnd_recv_msg(lnet_msg_t *lntmsg, struct kmx_ctx *rx, u8 msg_type, u64 cookie, u32 length) +{ + int ret = 0; + mx_return_t mxret = MX_SUCCESS; + uint64_t mask = 0xF00FFFFFFFFFFFFFLL; + + rx->mxc_msg_type = msg_type; + rx->mxc_lntmsg[0] = lntmsg; /* may be NULL if EAGER */ + rx->mxc_cookie = cookie; + /* rx->mxc_match may already be set */ + /* rx->mxc_seg.segment_ptr is already set */ + rx->mxc_seg.segment_length = length; + rx->mxc_deadline = jiffies + MXLND_COMM_TIMEOUT; + ret = mxlnd_q_pending_ctx(rx); + if (ret == -1) { + /* FIXME the conn is disconnected, now what? 
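+         * [Editor's note: judging from the cookie asserts in this file
+         *  ((cookie >> 52) == 0) and the GET NAK path, the 64-bit match
+         *  info appears to be laid out as
+         *      bits 60-63  message type
+         *      bits 52-59  status/error byte
+         *      bits  0-51  cookie
+         *  so the mask 0xF00FFFFFFFFFFFFF used below matches on type and
+         *  cookie while ignoring the status byte.  An illustrative encoder
+         *  under that assumption (hypothetical helper, not part of this
+         *  patch):
+         *      static inline u64 match_bits(u8 type, u8 status, u64 cookie)
+         *      {
+         *              return ((u64) type << 60) | ((u64) status << 52) |
+         *                     (cookie & ((1ULL << 52) - 1));
+         *      }]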
*/
+                return -1;
+        }
+        mxret = mx_kirecv(kmxlnd_data.kmx_endpt, &rx->mxc_seg, 1, MX_PIN_PHYSICAL,
+                          cookie, mask, (void *) rx, &rx->mxc_mxreq);
+        if (mxret != MX_SUCCESS) {
+                mxlnd_deq_pending_ctx(rx);
+                CDEBUG(D_NETERROR, "mx_kirecv() failed with %s (%d)\n",
+                       mx_strerror(mxret), (int) mxret);
+                return -1;
+        }
+        return 0;
+}
+
+
+/**
+ * mxlnd_unexpected_recv - this is the callback function that will handle
+ * unexpected receives
+ * @context - NULL, ignore
+ * @source - the peer's mx_endpoint_addr_t
+ * @match_value - the msg's match bits
+ * @length - length of incoming message
+ * @data_if_available - ignore
+ *
+ * If it is an eager-sized msg, we will call recv_msg() with the actual
+ * length. If it is a large message, we will call recv_msg() with a
+ * length of 0 bytes to drop it because we should never have a large,
+ * unexpected message.
+ *
+ * NOTE - the MX library blocks until this function completes. Make it as
+ * fast as possible. DO NOT allocate memory which can block!
+ *
+ * If we cannot get a rx or the conn is closed, drop the message on the floor
+ * (i.e. recv 0 bytes and ignore).
+ */
+mx_unexp_handler_action_t
+mxlnd_unexpected_recv(void *context, mx_endpoint_addr_t source,
+                      uint64_t match_value, uint32_t length, void *data_if_available)
+{
+        int ret = 0;
+        struct kmx_ctx *rx = NULL;
+        mx_ksegment_t seg;
+        u8 msg_type = 0;
+        u8 error = 0;
+        u64 cookie = 0LL;
+
+        if (context != NULL) {
+                CDEBUG(D_NETERROR, "unexpected receive with non-NULL context\n");
+        }
+
+#if MXLND_DEBUG
+        CDEBUG(D_NET, "unexpected_recv() bits=0x%llx length=%d\n", match_value, length);
+#endif
+
+        rx = mxlnd_get_idle_rx();
+        if (rx != NULL) {
+                mxlnd_parse_match(match_value, &msg_type, &error, &cookie);
+                if (length <= MXLND_EAGER_SIZE) {
+                        ret = mxlnd_recv_msg(NULL, rx, msg_type, match_value, length);
+                } else {
+                        CDEBUG(D_NETERROR, "unexpected large receive with "
+                               "match_value=0x%llx length=%d\n",
+                               match_value, length);
+                        ret = mxlnd_recv_msg(NULL, rx, msg_type, match_value, 0);
+                }
+                if (ret == 0) {
+                        struct kmx_conn *conn = NULL;
+                        mx_get_endpoint_addr_context(source, (void **) &conn);
+                        if (conn != NULL) {
+                                mxlnd_conn_addref(conn);
+                                rx->mxc_conn = conn;
+                                rx->mxc_peer = conn->mxk_peer;
+                                if (conn->mxk_peer != NULL) {
+                                        rx->mxc_nid = conn->mxk_peer->mxp_nid;
+                                } else {
+                                        CDEBUG(D_NETERROR, "conn is 0x%p and peer "
+                                               "is NULL\n", conn);
+                                }
+                        }
+                } else {
+                        CDEBUG(D_NETERROR, "could not post receive\n");
+                        mxlnd_put_idle_rx(rx);
+                }
+        }
+
+        if (rx == NULL || ret != 0) {
+                if (rx == NULL) {
+                        CDEBUG(D_NETERROR, "no idle rxs available - dropping rx\n");
+                } else {
+                        /* ret != 0 */
+                        CDEBUG(D_NETERROR, "disconnected peer - dropping rx\n");
+                }
+                seg.segment_ptr = 0LL;
+                seg.segment_length = 0;
+                mx_kirecv(kmxlnd_data.kmx_endpt, &seg, 1, MX_PIN_PHYSICAL,
+                          match_value, 0xFFFFFFFFFFFFFFFFLL, NULL, NULL);
+        }
+
+        return MX_RECV_CONTINUE;
+}
+
+
+int
+mxlnd_get_peer_info(int index, lnet_nid_t *nidp, int *count)
+{
+        int i = 0;
+        struct kmx_peer *peer = NULL;
+
+        read_lock(&kmxlnd_data.kmx_peers_lock);
+        for (i = 0; i < MXLND_HASH_SIZE; i++) {
+                list_for_each_entry(peer, &kmxlnd_data.kmx_peers[i], mxp_peers) {
+                        if (index-- > 0)
+                                continue;
+
+                        /* found the indexed peer, return its info */
+                        *nidp = peer->mxp_nid;
+                        *count = atomic_read(&peer->mxp_refcount);
+                        read_unlock(&kmxlnd_data.kmx_peers_lock);
+                        return 0;
+                }
+        }
+        read_unlock(&kmxlnd_data.kmx_peers_lock);
+
+        return -ENOENT;
+}
+
+void
+mxlnd_del_peer_locked(struct kmx_peer *peer)
+{
+        list_del_init(&peer->mxp_peers); /* remove from the global list */
+        if (peer->mxp_conn) mxlnd_conn_disconnect(peer->mxp_conn, 0, 0);
+        mxlnd_peer_decref(peer); /* drop global list ref */
+        return;
+}
+
+int
+mxlnd_del_peer(lnet_nid_t nid)
+{
+        int i = 0;
+        int ret = 0;
+        struct kmx_peer *peer = NULL;
+        struct kmx_peer *next = NULL;
+
+        if (nid != LNET_NID_ANY) {
+                peer = mxlnd_find_peer_by_nid(nid);
+        }
+        write_lock(&kmxlnd_data.kmx_peers_lock);
+        if (nid != LNET_NID_ANY) {
+                if (peer == NULL) {
+                        ret = -ENOENT;
+                } else {
+                        mxlnd_del_peer_locked(peer);
+                }
+        } else { /* LNET_NID_ANY */
+                for (i = 0; i < MXLND_HASH_SIZE; i++) {
+                        list_for_each_entry_safe(peer, next,
+                                                 &kmxlnd_data.kmx_peers[i], mxp_peers) {
+                                mxlnd_del_peer_locked(peer);
+                        }
+                }
+        }
+        write_unlock(&kmxlnd_data.kmx_peers_lock);
+
+        return ret;
+}
+
+struct kmx_conn *
+mxlnd_get_conn_by_idx(int index)
+{
+        int i = 0;
+        struct kmx_peer *peer = NULL;
+        struct kmx_conn *conn = NULL;
+
+        read_lock(&kmxlnd_data.kmx_peers_lock);
+        for (i = 0; i < MXLND_HASH_SIZE; i++) {
+                list_for_each_entry(peer, &kmxlnd_data.kmx_peers[i], mxp_peers) {
+                        list_for_each_entry(conn, &peer->mxp_conns, mxk_list) {
+                                if (index-- > 0)
+                                        continue;
+
+                                mxlnd_conn_addref(conn); /* add ref here, dec in ctl() */
+                                read_unlock(&kmxlnd_data.kmx_peers_lock);
+                                return conn;
+                        }
+                }
+        }
+        read_unlock(&kmxlnd_data.kmx_peers_lock);
+
+        return NULL;
+}
+
+void
+mxlnd_close_matching_conns_locked(struct kmx_peer *peer)
+{
+        struct kmx_conn *conn = NULL;
+        struct kmx_conn *next = NULL;
+
+        list_for_each_entry_safe(conn, next, &peer->mxp_conns, mxk_list) {
+                mxlnd_conn_disconnect(conn, 0, 0);
+        }
+        return;
+}
+
+int
+mxlnd_close_matching_conns(lnet_nid_t nid)
+{
+        int i = 0;
+        int ret = 0;
+        struct kmx_peer *peer = NULL;
+
+        read_lock(&kmxlnd_data.kmx_peers_lock);
+        if (nid != LNET_NID_ANY) {
+                peer = mxlnd_find_peer_by_nid(nid);
+                if (peer == NULL) {
+                        ret = -ENOENT;
+                } else {
+                        mxlnd_close_matching_conns_locked(peer);
+                }
+        } else { /* LNET_NID_ANY */
+                for (i = 0; i < MXLND_HASH_SIZE; i++) {
+                        list_for_each_entry(peer, &kmxlnd_data.kmx_peers[i], mxp_peers)
+                                mxlnd_close_matching_conns_locked(peer);
+                }
+        }
+        read_unlock(&kmxlnd_data.kmx_peers_lock);
+
+        return ret;
+}
+
+/**
+ * mxlnd_ctl - modify MXLND parameters
+ * @ni - LNET interface handle
+ * @cmd - command to change
+ * @arg - the ioctl data
+ *
+ * Handles the libcfs ioctls to get peer info, delete a peer, get a conn
+ * by index and close matching conns.
+ */
+int
+mxlnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+        struct libcfs_ioctl_data *data = arg;
+        int ret = -EINVAL;
+
+        LASSERT (ni == kmxlnd_data.kmx_ni);
+
+        switch (cmd) {
+        case IOC_LIBCFS_GET_PEER: {
+                lnet_nid_t nid = 0;
+                int count = 0;
+
+                ret = mxlnd_get_peer_info(data->ioc_count, &nid, &count);
+                data->ioc_nid = nid;
+                data->ioc_count = count;
+                break;
+        }
+        case IOC_LIBCFS_DEL_PEER: {
+                ret = mxlnd_del_peer(data->ioc_nid);
+                break;
+        }
+        case IOC_LIBCFS_GET_CONN: {
+                struct kmx_conn *conn = NULL;
+
+                conn = mxlnd_get_conn_by_idx(data->ioc_count);
+                if (conn == NULL) {
+                        ret = -ENOENT;
+                } else {
+                        ret = 0;
+                        data->ioc_nid = conn->mxk_peer->mxp_nid;
+                        mxlnd_conn_decref(conn); /* dec ref taken in get_conn_by_idx() */
+                }
+                break;
+        }
+        case IOC_LIBCFS_CLOSE_CONNECTION: {
+                ret = mxlnd_close_matching_conns(data->ioc_nid);
+                break;
+        }
+        default:
+                CDEBUG(D_NETERROR, "unknown ctl(%d)\n", cmd);
+                break;
+        }
+
+        return ret;
+}
+
+/**
+ * mxlnd_peer_queue_tx_locked - add the tx to one of the conn's tx queues
+ * @tx
+ *
+ * Add the tx to the conn's msg or data queue. The caller holds the conn's lock.
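+ * [Editor's note: queue selection as implemented below --
+ *      CONN_REQ / CONN_ACK              -> head of mxk_tx_free_queue (no credit)
+ *      EAGER / PUT_REQ / GET_REQ / NOOP -> tail of mxk_tx_credit_queue (credit)
+ *      PUT_ACK and other non-data msgs  -> tail of mxk_tx_free_queue
+ *      PUT_DATA / GET_DATA              -> tail of mxk_tx_free_queue (data)]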
+ */ +void +mxlnd_peer_queue_tx_locked(struct kmx_ctx *tx) +{ + u8 msg_type = tx->mxc_msg_type; + //struct kmx_peer *peer = tx->mxc_peer; + struct kmx_conn *conn = tx->mxc_conn; + + LASSERT (msg_type != 0); + LASSERT (tx->mxc_nid != 0); + LASSERT (tx->mxc_peer != NULL); + LASSERT (tx->mxc_conn != NULL); + + tx->mxc_incarnation = conn->mxk_incarnation; + + if (msg_type != MXLND_MSG_PUT_DATA && + msg_type != MXLND_MSG_GET_DATA) { + /* msg style tx */ + if (mxlnd_tx_requires_credit(tx)) { + list_add_tail(&tx->mxc_list, &conn->mxk_tx_credit_queue); + conn->mxk_ntx_msgs++; + } else if (msg_type == MXLND_MSG_CONN_REQ || + msg_type == MXLND_MSG_CONN_ACK) { + /* put conn msgs at the front of the queue */ + list_add(&tx->mxc_list, &conn->mxk_tx_free_queue); + } else { + /* PUT_ACK, PUT_NAK */ + list_add_tail(&tx->mxc_list, &conn->mxk_tx_free_queue); + conn->mxk_ntx_msgs++; + } + } else { + /* data style tx */ + list_add_tail(&tx->mxc_list, &conn->mxk_tx_free_queue); + conn->mxk_ntx_data++; + } + + return; +} + +/** + * mxlnd_peer_queue_tx - add the tx to the global tx queue + * @tx + * + * Add the tx to the peer's msg or data queue + */ +static inline void +mxlnd_peer_queue_tx(struct kmx_ctx *tx) +{ + LASSERT(tx->mxc_peer != NULL); + LASSERT(tx->mxc_conn != NULL); + spin_lock(&tx->mxc_conn->mxk_lock); + mxlnd_peer_queue_tx_locked(tx); + spin_unlock(&tx->mxc_conn->mxk_lock); + + return; +} + +/** + * mxlnd_queue_tx - add the tx to the global tx queue + * @tx + * + * Add the tx to the global queue and up the tx_queue_sem + */ +void +mxlnd_queue_tx(struct kmx_ctx *tx) +{ + int ret = 0; + struct kmx_peer *peer = tx->mxc_peer; + LASSERT (tx->mxc_nid != 0); + + if (peer != NULL) { + if (peer->mxp_incompatible && + tx->mxc_msg_type != MXLND_MSG_CONN_ACK) { + /* let this fail now */ + tx->mxc_status.code = -ECONNABORTED; + mxlnd_put_idle_tx(tx); + return; + } + if (tx->mxc_conn == NULL) { + mxlnd_conn_alloc(&tx->mxc_conn, peer); + } + LASSERT(tx->mxc_conn != NULL); + mxlnd_peer_queue_tx(tx); + ret = mxlnd_check_sends(peer); + } else { + spin_lock(&kmxlnd_data.kmx_tx_queue_lock); + list_add_tail(&tx->mxc_list, &kmxlnd_data.kmx_tx_queue); + spin_unlock(&kmxlnd_data.kmx_tx_queue_lock); + up(&kmxlnd_data.kmx_tx_queue_sem); + } + return; +} + +int +mxlnd_setup_iov(struct kmx_ctx *ctx, u32 niov, struct iovec *iov, u32 offset, u32 nob) +{ + int i = 0; + int sum = 0; + int old_sum = 0; + int nseg = 0; + int first_iov = -1; + int first_iov_offset = 0; + int first_found = 0; + int last_iov = -1; + int last_iov_length = 0; + mx_ksegment_t *seg = NULL; + + if (niov == 0) return 0; + LASSERT(iov != NULL); + + for (i = 0; i < niov; i++) { + sum = old_sum + (u32) iov[i].iov_len; + if (!first_found && (sum > offset)) { + first_iov = i; + first_iov_offset = offset - old_sum; + first_found = 1; + sum = (u32) iov[i].iov_len - first_iov_offset; + old_sum = 0; + } + if (sum >= nob) { + last_iov = i; + last_iov_length = (u32) iov[i].iov_len - (sum - nob); + if (first_iov == last_iov) last_iov_length -= first_iov_offset; + break; + } + old_sum = sum; + } + LASSERT(first_iov >= 0 && last_iov >= first_iov); + nseg = last_iov - first_iov + 1; + LASSERT(nseg > 0); + + MXLND_ALLOC (seg, nseg * sizeof(*seg)); + if (seg == NULL) { + CDEBUG(D_NETERROR, "MXLND_ALLOC() failed\n"); + return -1; + } + memset(seg, 0, nseg * sizeof(*seg)); + ctx->mxc_nseg = nseg; + sum = 0; + for (i = 0; i < nseg; i++) { + seg[i].segment_ptr = MX_KVA_TO_U64(iov[first_iov + i].iov_base); + seg[i].segment_length = (u32) iov[first_iov + i].iov_len; + if (i == 0) 
{
+                        seg[i].segment_ptr += (u64) first_iov_offset;
+                        seg[i].segment_length -= (u32) first_iov_offset;
+                }
+                if (i == (nseg - 1)) {
+                        seg[i].segment_length = (u32) last_iov_length;
+                }
+                sum += seg[i].segment_length;
+        }
+        ctx->mxc_seg_list = seg;
+        ctx->mxc_pin_type = MX_PIN_KERNEL;
+#ifdef MX_PIN_FULLPAGES
+        ctx->mxc_pin_type |= MX_PIN_FULLPAGES;
+#endif
+        LASSERT(nob == sum);
+        return 0;
+}
+
+int
+mxlnd_setup_kiov(struct kmx_ctx *ctx, u32 niov, lnet_kiov_t *kiov, u32 offset, u32 nob)
+{
+        int i = 0;
+        int sum = 0;
+        int old_sum = 0;
+        int nseg = 0;
+        int first_kiov = -1;
+        int first_kiov_offset = 0;
+        int first_found = 0;
+        int last_kiov = -1;
+        int last_kiov_length = 0;
+        mx_ksegment_t *seg = NULL;
+
+        if (niov == 0) return 0;
+        LASSERT(kiov != NULL);
+
+        for (i = 0; i < niov; i++) {
+                sum = old_sum + kiov[i].kiov_len;
+                if (i == 0) sum -= kiov[i].kiov_offset;
+                if (!first_found && (sum > offset)) {
+                        first_kiov = i;
+                        first_kiov_offset = offset - old_sum;
+                        if (i == 0) first_kiov_offset = kiov[i].kiov_offset;
+                        first_found = 1;
+                        sum = kiov[i].kiov_len - first_kiov_offset;
+                        old_sum = 0;
+                }
+                if (sum >= nob) {
+                        last_kiov = i;
+                        last_kiov_length = kiov[i].kiov_len - (sum - nob);
+                        if (first_kiov == last_kiov) last_kiov_length -= first_kiov_offset;
+                        break;
+                }
+                old_sum = sum;
+        }
+        LASSERT(first_kiov >= 0 && last_kiov >= first_kiov);
+        nseg = last_kiov - first_kiov + 1;
+        LASSERT(nseg > 0);
+
+        MXLND_ALLOC (seg, nseg * sizeof(*seg));
+        if (seg == NULL) {
+                CDEBUG(D_NETERROR, "MXLND_ALLOC() failed\n");
+                return -1;
+        }
+        /* only nseg segments were allocated, so only touch seg[0..nseg-1] */
+        memset(seg, 0, nseg * sizeof(*seg));
+        ctx->mxc_nseg = nseg;
+        sum = 0;
+        for (i = 0; i < nseg; i++) {
+                seg[i].segment_ptr = lnet_page2phys(kiov[first_kiov + i].kiov_page);
+                seg[i].segment_length = kiov[first_kiov + i].kiov_len;
+                if (i == 0) {
+                        seg[i].segment_ptr += (u64) first_kiov_offset;
+                        /* we have to add back the original kiov_offset */
+                        seg[i].segment_length -= first_kiov_offset +
+                                                 kiov[first_kiov].kiov_offset;
+                }
+                if (i == (nseg - 1)) {
+                        seg[i].segment_length = last_kiov_length;
+                }
+                sum += seg[i].segment_length;
+        }
+        ctx->mxc_seg_list = seg;
+        ctx->mxc_pin_type = MX_PIN_PHYSICAL;
+#ifdef MX_PIN_FULLPAGES
+        ctx->mxc_pin_type |= MX_PIN_FULLPAGES;
+#endif
+        LASSERT(nob == sum);
+        return 0;
+}
+
+void
+mxlnd_send_nak(struct kmx_ctx *tx, lnet_nid_t nid, int type, int status, __u64 cookie)
+{
+        LASSERT(type == MXLND_MSG_PUT_ACK);
+        /* use the caller's nid; tx->mxc_nid may not be set yet */
+        mxlnd_init_tx_msg(tx, type, sizeof(kmx_putack_msg_t), nid);
+        tx->mxc_cookie = cookie;
+        tx->mxc_msg->mxm_u.put_ack.mxpam_src_cookie = cookie;
+        tx->mxc_msg->mxm_u.put_ack.mxpam_dst_cookie = ((u64) status << 52); /* error code */
+        tx->mxc_match = mxlnd_create_match(tx, status);
+
+        mxlnd_queue_tx(tx);
+}
+
+
+/**
+ * mxlnd_send_data - get tx, map [k]iov, queue tx
+ * @ni
+ * @lntmsg
+ * @peer
+ * @msg_type
+ * @cookie
+ *
+ * This sets up the DATA send for PUT or GET.
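+ * [Editor's note: a worked example of the mapping done by mxlnd_setup_iov()
+ *  above -- with iov lengths {4096, 4096, 4096}, offset = 6000 and
+ *  nob = 3000, the scan picks first_iov = 1, first_iov_offset = 1904 and
+ *  last_iov = 2, last_iov_length = 808, producing two MX segments of
+ *  2192 and 808 bytes (2192 + 808 = 3000).]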
+ * + * On success, it queues the tx, on failure it calls lnet_finalize() + */ +void +mxlnd_send_data(lnet_ni_t *ni, lnet_msg_t *lntmsg, struct kmx_peer *peer, u8 msg_type, u64 cookie) +{ + int ret = 0; + lnet_process_id_t target = lntmsg->msg_target; + unsigned int niov = lntmsg->msg_niov; + struct iovec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + struct kmx_ctx *tx = NULL; + + LASSERT(lntmsg != NULL); + LASSERT(peer != NULL); + LASSERT(msg_type == MXLND_MSG_PUT_DATA || msg_type == MXLND_MSG_GET_DATA); + LASSERT((cookie>>52) == 0); + + tx = mxlnd_get_idle_tx(); + if (tx == NULL) { + CDEBUG(D_NETERROR, "Can't allocate %s tx for %s\n", + msg_type == MXLND_MSG_PUT_DATA ? "PUT_DATA" : "GET_DATA", + libcfs_nid2str(target.nid)); + goto failed_0; + } + tx->mxc_nid = target.nid; + mxlnd_conn_addref(peer->mxp_conn); + tx->mxc_peer = peer; + tx->mxc_conn = peer->mxp_conn; + tx->mxc_msg_type = msg_type; + tx->mxc_deadline = jiffies + MXLND_COMM_TIMEOUT; + tx->mxc_state = MXLND_CTX_PENDING; + tx->mxc_lntmsg[0] = lntmsg; + tx->mxc_cookie = cookie; + tx->mxc_match = mxlnd_create_match(tx, 0); + + /* This setups up the mx_ksegment_t to send the DATA payload */ + if (nob == 0) { + /* do not setup the segments */ + CDEBUG(D_NETERROR, "nob = 0; why didn't we use an EAGER reply " + "to %s?\n", libcfs_nid2str(target.nid)); + ret = 0; + } else if (kiov == NULL) { + ret = mxlnd_setup_iov(tx, niov, iov, offset, nob); + } else { + ret = mxlnd_setup_kiov(tx, niov, kiov, offset, nob); + } + if (ret != 0) { + CDEBUG(D_NETERROR, "Can't setup send DATA for %s\n", + libcfs_nid2str(target.nid)); + tx->mxc_status.code = -EIO; + goto failed_1; + } + mxlnd_queue_tx(tx); + return; + +failed_1: + mxlnd_conn_decref(peer->mxp_conn); + mxlnd_put_idle_tx(tx); + return; + +failed_0: + CDEBUG(D_NETERROR, "no tx avail\n"); + lnet_finalize(ni, lntmsg, -EIO); + return; +} + +/** + * mxlnd_recv_data - map [k]iov, post rx + * @ni + * @lntmsg + * @rx + * @msg_type + * @cookie + * + * This setups the DATA receive for PUT or GET. 
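+ * [Editor's note: the assumed GET sequence, pieced together from this file:
+ *      initiator                            target
+ *      post DATA rx matching cookie C
+ *      send GET_REQ carrying C      ---->   lnet_parse() -> mxlnd_recv()
+ *      DATA rx completes            <----   mxlnd_send_data(GET_DATA, C)
+ *  the REPLY lnet_msg_t created below via lnet_create_reply_msg() is
+ *  finalized when that DATA rx completes.]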
+ * + * On success, it returns 0, on failure it returns -1 + */ +int +mxlnd_recv_data(lnet_ni_t *ni, lnet_msg_t *lntmsg, struct kmx_ctx *rx, u8 msg_type, u64 cookie) +{ + int ret = 0; + lnet_process_id_t target = lntmsg->msg_target; + unsigned int niov = lntmsg->msg_niov; + struct iovec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + mx_return_t mxret = MX_SUCCESS; + + /* above assumes MXLND_MSG_PUT_DATA */ + if (msg_type == MXLND_MSG_GET_DATA) { + niov = lntmsg->msg_md->md_niov; + iov = lntmsg->msg_md->md_iov.iov; + kiov = lntmsg->msg_md->md_iov.kiov; + offset = 0; + nob = lntmsg->msg_md->md_length; + } + + LASSERT(lntmsg != NULL); + LASSERT(rx != NULL); + LASSERT(msg_type == MXLND_MSG_PUT_DATA || msg_type == MXLND_MSG_GET_DATA); + LASSERT((cookie>>52) == 0); /* ensure top 12 bits are 0 */ + + rx->mxc_msg_type = msg_type; + rx->mxc_deadline = jiffies + MXLND_COMM_TIMEOUT; + rx->mxc_state = MXLND_CTX_PENDING; + rx->mxc_nid = target.nid; + /* if posting a GET_DATA, we may not yet know the peer */ + if (rx->mxc_peer != NULL) { + rx->mxc_conn = rx->mxc_peer->mxp_conn; + } + rx->mxc_lntmsg[0] = lntmsg; + rx->mxc_cookie = cookie; + rx->mxc_match = mxlnd_create_match(rx, 0); + /* This setups up the mx_ksegment_t to receive the DATA payload */ + if (kiov == NULL) { + ret = mxlnd_setup_iov(rx, niov, iov, offset, nob); + } else { + ret = mxlnd_setup_kiov(rx, niov, kiov, offset, nob); + } + if (msg_type == MXLND_MSG_GET_DATA) { + rx->mxc_lntmsg[1] = lnet_create_reply_msg(kmxlnd_data.kmx_ni, lntmsg); + if (rx->mxc_lntmsg[1] == NULL) { + CDEBUG(D_NETERROR, "Can't create reply for GET -> %s\n", + libcfs_nid2str(target.nid)); + ret = -1; + } + } + if (ret != 0) { + CDEBUG(D_NETERROR, "Can't setup %s rx for %s\n", + msg_type == MXLND_MSG_PUT_DATA ? "PUT_DATA" : "GET_DATA", + libcfs_nid2str(target.nid)); + return -1; + } + ret = mxlnd_q_pending_ctx(rx); + if (ret == -1) { + return -1; + } + CDEBUG(D_NET, "receiving %s 0x%llx\n", mxlnd_msgtype_to_str(msg_type), rx->mxc_cookie); + mxret = mx_kirecv(kmxlnd_data.kmx_endpt, + rx->mxc_seg_list, rx->mxc_nseg, + rx->mxc_pin_type, rx->mxc_match, + 0xF00FFFFFFFFFFFFFLL, (void *) rx, + &rx->mxc_mxreq); + if (mxret != MX_SUCCESS) { + if (rx->mxc_conn != NULL) { + mxlnd_deq_pending_ctx(rx); + } + CDEBUG(D_NETERROR, "mx_kirecv() failed with %d for %s\n", + (int) mxret, libcfs_nid2str(target.nid)); + return -1; + } + + return 0; +} + +/** + * mxlnd_send - the LND required send function + * @ni + * @private + * @lntmsg + * + * This must not block. Since we may not have a peer struct for the receiver, + * it will append send messages on a global tx list. We will then up the + * tx_queued's semaphore to notify it of the new send. 
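+ * [Editor's note: this handoff is a semaphore-counted producer/consumer;
+ *  a minimal sketch of the pattern used by mxlnd_queue_tx() and the
+ *  mxlnd_tx_queued() thread:
+ *      producer:                            consumer thread:
+ *          spin_lock(&q_lock);                  down_interruptible(&q_sem);
+ *          list_add_tail(&tx->mxc_list, &q);    spin_lock(&q_lock);
+ *          spin_unlock(&q_lock);                tx = head of q, if any;
+ *          up(&q_sem);                          spin_unlock(&q_lock);]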
+ */ +int +mxlnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + int ret = 0; + int type = lntmsg->msg_type; + lnet_hdr_t *hdr = &lntmsg->msg_hdr; + lnet_process_id_t target = lntmsg->msg_target; + lnet_nid_t nid = target.nid; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int payload_niov = lntmsg->msg_niov; + struct iovec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + struct kmx_ctx *tx = NULL; + struct kmx_msg *txmsg = NULL; + struct kmx_ctx *rx = (struct kmx_ctx *) private; /* for REPLY */ + struct kmx_ctx *rx_data = NULL; + struct kmx_conn *conn = NULL; + int nob = 0; + uint32_t length = 0; + struct kmx_peer *peer = NULL; + + CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= LNET_MAX_IOV); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + + /* private is used on LNET_GET_REPLY only, NULL for all other cases */ + + /* NOTE we may not know the peer if it is the very first PUT_REQ or GET_REQ + * to a new peer, use the nid */ + peer = mxlnd_find_peer_by_nid(nid); + if (peer != NULL) { + conn = peer->mxp_conn; + if (conn) mxlnd_conn_addref(conn); + } + if (conn == NULL && peer != NULL) { + CDEBUG(D_NETERROR, "conn==NULL peer=0x%p nid=0x%llx payload_nob=%d type=%s\n", + peer, nid, payload_nob, ((type==LNET_MSG_PUT) ? "PUT" : + ((type==LNET_MSG_GET) ? "GET" : "Other"))); + } + + switch (type) { + case LNET_MSG_ACK: + LASSERT (payload_nob == 0); + break; + + case LNET_MSG_REPLY: + case LNET_MSG_PUT: + /* Is the payload small enough not to need DATA? */ + nob = offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[payload_nob]); + if (nob <= MXLND_EAGER_SIZE) + break; /* send EAGER */ + + tx = mxlnd_get_idle_tx(); + if (unlikely(tx == NULL)) { + CDEBUG(D_NETERROR, "Can't allocate %s tx for %s\n", + type == LNET_MSG_PUT ? "PUT" : "REPLY", + libcfs_nid2str(nid)); + if (conn) mxlnd_conn_decref(conn); + return -ENOMEM; + } + + /* the peer may be NULL */ + tx->mxc_peer = peer; + tx->mxc_conn = conn; /* may be NULL */ + /* we added a conn ref above */ + mxlnd_init_tx_msg (tx, MXLND_MSG_PUT_REQ, sizeof(kmx_putreq_msg_t), nid); + txmsg = tx->mxc_msg; + txmsg->mxm_u.put_req.mxprm_hdr = *hdr; + txmsg->mxm_u.put_req.mxprm_cookie = tx->mxc_cookie; + tx->mxc_match = mxlnd_create_match(tx, 0); + + /* we must post a receive _before_ sending the request. + * we need to determine how much to receive, it will be either + * a put_ack or a put_nak. The put_ack is larger, so use it. 
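+         * [Editor's note: the PUT handshake, as sketched from this file --
+         *      initiator                        target
+         *      post rx for PUT_ACK (cookie C)
+         *      send PUT_REQ (hdr, C)   ---->    mxlnd_recv(): post DATA rx,
+         *                                       send PUT_ACK (src C, dst D)
+         *      PUT_ACK rx completes    <----
+         *      send PUT_DATA on D      ---->    DATA rx completes,
+         *                                       lnet_finalize()
+         *  posting the ACK rx before sending the request avoids losing a
+         *  fast reply as an unexpected/truncated receive.]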
*/ + + rx = mxlnd_get_idle_rx(); + if (unlikely(rx == NULL)) { + CDEBUG(D_NETERROR, "Can't allocate rx for PUT_ACK for %s\n", + libcfs_nid2str(nid)); + mxlnd_put_idle_tx(tx); + if (conn) mxlnd_conn_decref(conn); /* for the ref taken above */ + return -ENOMEM; + } + rx->mxc_nid = nid; + rx->mxc_peer = peer; + /* conn may be NULL but unlikely since the first msg is always small */ + if (conn) mxlnd_conn_addref(conn); /* for this rx */ + rx->mxc_conn = conn; + rx->mxc_msg_type = MXLND_MSG_PUT_ACK; + rx->mxc_cookie = tx->mxc_cookie; + rx->mxc_match = mxlnd_create_match(rx, 0); + + length = offsetof(kmx_msg_t, mxm_u) + sizeof(kmx_putack_msg_t); + ret = mxlnd_recv_msg(lntmsg, rx, MXLND_MSG_PUT_ACK, rx->mxc_match, length); + if (unlikely(ret != 0)) { + CDEBUG(D_NETERROR, "recv_msg() failed for PUT_ACK for %s\n", + libcfs_nid2str(nid)); + rx->mxc_lntmsg[0] = NULL; + mxlnd_put_idle_rx(rx); + mxlnd_put_idle_tx(tx); + if (conn) { + mxlnd_conn_decref(conn); /* for the rx... */ + mxlnd_conn_decref(conn); /* and for the tx */ + } + return -ENOMEM; + } + + mxlnd_queue_tx(tx); + return 0; + + case LNET_MSG_GET: + if (routing || target_is_router) + break; /* send EAGER */ + + /* is the REPLY message too small for DATA? */ + nob = offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[lntmsg->msg_md->md_length]); + if (nob <= MXLND_EAGER_SIZE) + break; /* send EAGER */ + + /* get tx (we need the cookie) , post rx for incoming DATA, + * then post GET_REQ tx */ + tx = mxlnd_get_idle_tx(); + if (unlikely(tx == NULL)) { + CDEBUG(D_NETERROR, "Can't allocate GET tx for %s\n", + libcfs_nid2str(nid)); + if (conn) mxlnd_conn_decref(conn); /* for the ref taken above */ + return -ENOMEM; + } + rx_data = mxlnd_get_idle_rx(); + if (unlikely(rx_data == NULL)) { + CDEBUG(D_NETERROR, "Can't allocate DATA rx for %s\n", + libcfs_nid2str(nid)); + mxlnd_put_idle_tx(tx); + if (conn) mxlnd_conn_decref(conn); /* for the ref taken above */ + return -ENOMEM; + } + rx_data->mxc_peer = peer; + if (conn) mxlnd_conn_addref(conn); /* for the rx_data */ + rx_data->mxc_conn = conn; /* may be NULL */ + + ret = mxlnd_recv_data(ni, lntmsg, rx_data, MXLND_MSG_GET_DATA, tx->mxc_cookie); + if (unlikely(ret != 0)) { + CDEBUG(D_NETERROR, "Can't setup GET sink for %s\n", + libcfs_nid2str(nid)); + mxlnd_put_idle_rx(rx_data); + mxlnd_put_idle_tx(tx); + if (conn) { + mxlnd_conn_decref(conn); /* for the rx_data... 
*/ + mxlnd_conn_decref(conn); /* and for the tx */ + } + return -EIO; + } + + tx->mxc_peer = peer; + tx->mxc_conn = conn; /* may be NULL */ + /* conn ref taken above */ + mxlnd_init_tx_msg(tx, MXLND_MSG_GET_REQ, sizeof(kmx_getreq_msg_t), nid); + txmsg = tx->mxc_msg; + txmsg->mxm_u.get_req.mxgrm_hdr = *hdr; + txmsg->mxm_u.get_req.mxgrm_cookie = tx->mxc_cookie; + tx->mxc_match = mxlnd_create_match(tx, 0); + + mxlnd_queue_tx(tx); + return 0; + + default: + LBUG(); + if (conn) mxlnd_conn_decref(conn); /* drop ref taken above */ + return -EIO; + } + + /* send EAGER */ + + LASSERT (offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[payload_nob]) + <= MXLND_EAGER_SIZE); + + tx = mxlnd_get_idle_tx(); + if (unlikely(tx == NULL)) { + CDEBUG(D_NETERROR, "Can't send %s to %s: tx descs exhausted\n", + mxlnd_lnetmsg_to_str(type), libcfs_nid2str(nid)); + if (conn) mxlnd_conn_decref(conn); /* drop ref taken above */ + return -ENOMEM; + } + + tx->mxc_peer = peer; + tx->mxc_conn = conn; /* may be NULL */ + /* conn ref taken above */ + nob = offsetof(kmx_eager_msg_t, mxem_payload[payload_nob]); + mxlnd_init_tx_msg (tx, MXLND_MSG_EAGER, nob, nid); + tx->mxc_match = mxlnd_create_match(tx, 0); + + txmsg = tx->mxc_msg; + txmsg->mxm_u.eager.mxem_hdr = *hdr; + + if (payload_kiov != NULL) + lnet_copy_kiov2flat(MXLND_EAGER_SIZE, txmsg, + offsetof(kmx_msg_t, mxm_u.eager.mxem_payload), + payload_niov, payload_kiov, payload_offset, payload_nob); + else + lnet_copy_iov2flat(MXLND_EAGER_SIZE, txmsg, + offsetof(kmx_msg_t, mxm_u.eager.mxem_payload), + payload_niov, payload_iov, payload_offset, payload_nob); + + tx->mxc_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + mxlnd_queue_tx(tx); + return 0; +} + +/** + * mxlnd_recv - the LND required recv function + * @ni + * @private + * @lntmsg + * @delayed + * @niov + * @kiov + * @offset + * @mlen + * @rlen + * + * This must not block. 
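+ * [Editor's note: the body below tracks three flags --
+ *      repost   : return the rx to the idle pool when done (cleared when
+ *                 the rx is reused to sink PUT_DATA)
+ *      credit   : the message consumed a peer credit, so bump
+ *                 mxk_outstanding and let a later tx carry it back
+ *      finalize : call lnet_finalize() on the incoming lntmsg]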
+ */ +int +mxlnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, + unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + int ret = 0; + int nob = 0; + int len = 0; + struct kmx_ctx *rx = private; + struct kmx_msg *rxmsg = rx->mxc_msg; + lnet_nid_t nid = rx->mxc_nid; + struct kmx_ctx *tx = NULL; + struct kmx_msg *txmsg = NULL; + struct kmx_peer *peer = rx->mxc_peer; + struct kmx_conn *conn = peer->mxp_conn; + u64 cookie = 0LL; + int msg_type = rxmsg->mxm_type; + int repost = 1; + int credit = 0; + int finalize = 0; + + LASSERT (mlen <= rlen); + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + LASSERT (peer != NULL); + + /* conn_addref(conn) already taken for the primary rx */ + + switch (msg_type) { + case MXLND_MSG_EAGER: + nob = offsetof(kmx_msg_t, mxm_u.eager.mxem_payload[rlen]); + len = rx->mxc_status.xfer_length; + if (unlikely(nob > len)) { + CDEBUG(D_NETERROR, "Eager message from %s too big: %d(%d)\n", + libcfs_nid2str(nid), nob, len); + ret = -EPROTO; + break; + } + + if (kiov != NULL) + lnet_copy_flat2kiov(niov, kiov, offset, + MXLND_EAGER_SIZE, rxmsg, + offsetof(kmx_msg_t, mxm_u.eager.mxem_payload), + mlen); + else + lnet_copy_flat2iov(niov, iov, offset, + MXLND_EAGER_SIZE, rxmsg, + offsetof(kmx_msg_t, mxm_u.eager.mxem_payload), + mlen); + finalize = 1; + credit = 1; + break; + + case MXLND_MSG_PUT_REQ: + /* we are going to reuse the rx, store the needed info */ + cookie = rxmsg->mxm_u.put_req.mxprm_cookie; + + /* get tx, post rx, send PUT_ACK */ + + tx = mxlnd_get_idle_tx(); + if (unlikely(tx == NULL)) { + CDEBUG(D_NETERROR, "Can't allocate tx for %s\n", libcfs_nid2str(nid)); + /* Not replying will break the connection */ + ret = -ENOMEM; + break; + } + if (unlikely(mlen == 0)) { + finalize = 1; + tx->mxc_peer = peer; + tx->mxc_conn = conn; + mxlnd_send_nak(tx, nid, MXLND_MSG_PUT_ACK, 0, cookie); + /* repost = 1 */ + break; + } + + mxlnd_init_tx_msg(tx, MXLND_MSG_PUT_ACK, sizeof(kmx_putack_msg_t), nid); + tx->mxc_peer = peer; + tx->mxc_conn = conn; + mxlnd_conn_addref(conn); /* for the tx */ + txmsg = tx->mxc_msg; + txmsg->mxm_u.put_ack.mxpam_src_cookie = cookie; + txmsg->mxm_u.put_ack.mxpam_dst_cookie = tx->mxc_cookie; + tx->mxc_cookie = cookie; + tx->mxc_match = mxlnd_create_match(tx, 0); + + /* we must post a receive _before_ sending the PUT_ACK */ + mxlnd_ctx_init(rx); + rx->mxc_state = MXLND_CTX_PREP; + rx->mxc_peer = peer; + rx->mxc_conn = conn; + /* do not take another ref for this rx, it is already taken */ + rx->mxc_nid = peer->mxp_nid; + ret = mxlnd_recv_data(ni, lntmsg, rx, MXLND_MSG_PUT_DATA, + txmsg->mxm_u.put_ack.mxpam_dst_cookie); + + if (unlikely(ret != 0)) { + /* Notify peer that it's over */ + CDEBUG(D_NETERROR, "Can't setup PUT_DATA rx for %s: %d\n", + libcfs_nid2str(nid), ret); + mxlnd_ctx_init(tx); + tx->mxc_state = MXLND_CTX_PREP; + tx->mxc_peer = peer; + tx->mxc_conn = conn; + /* finalize = 0, let the PUT_ACK tx finalize this */ + tx->mxc_lntmsg[0] = rx->mxc_lntmsg[0]; + tx->mxc_lntmsg[1] = rx->mxc_lntmsg[1]; + /* conn ref already taken above */ + mxlnd_send_nak(tx, nid, MXLND_MSG_PUT_ACK, ret, cookie); + /* repost = 1 */ + break; + } + + mxlnd_queue_tx(tx); + /* do not return a credit until after PUT_DATA returns */ + repost = 0; + break; + + case MXLND_MSG_GET_REQ: + if (likely(lntmsg != NULL)) { + mxlnd_send_data(ni, lntmsg, rx->mxc_peer, MXLND_MSG_GET_DATA, + rx->mxc_msg->mxm_u.get_req.mxgrm_cookie); + } else { + /* GET didn't 
match anything */ + /* The initiator has a rx mapped to [k]iov. We cannot send a nak. + * We have to embed the error code in the match bits. + * Send the error in bits 52-59 and the cookie in bits 0-51 */ + u64 cookie = rxmsg->mxm_u.get_req.mxgrm_cookie; + + tx = mxlnd_get_idle_tx(); + if (unlikely(tx == NULL)) { + CDEBUG(D_NETERROR, "Can't get tx for GET NAK for %s\n", + libcfs_nid2str(nid)); + ret = -ENOMEM; + break; + } + tx->mxc_msg_type = MXLND_MSG_GET_DATA; + tx->mxc_state = MXLND_CTX_PENDING; + tx->mxc_nid = nid; + tx->mxc_peer = peer; + tx->mxc_conn = conn; + mxlnd_conn_addref(conn); /* for this tx */ + tx->mxc_cookie = cookie; + tx->mxc_match = mxlnd_create_match(tx, ENODATA); + tx->mxc_pin_type = MX_PIN_PHYSICAL; + mxlnd_queue_tx(tx); + } + /* finalize lntmsg after tx completes */ + break; + + default: + LBUG(); + } + + if (repost) { + /* we received a message, increment peer's outstanding credits */ + if (credit == 1) { + spin_lock(&conn->mxk_lock); + conn->mxk_outstanding++; + spin_unlock(&conn->mxk_lock); + } + /* we are done with the rx */ + mxlnd_put_idle_rx(rx); + mxlnd_conn_decref(conn); + } + + if (finalize == 1) lnet_finalize(kmxlnd_data.kmx_ni, lntmsg, 0); + + /* we received a credit, see if we can use it to send a msg */ + if (credit) mxlnd_check_sends(peer); + + return ret; +} + +void +mxlnd_sleep(unsigned long timeout) +{ + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(timeout); + return; +} + +/** + * mxlnd_tx_queued - the generic send queue thread + * @arg - thread id (as a void *) + * + * This thread moves send messages from the global tx_queue to the owning + * peer's tx_[msg|data]_queue. If the peer does not exist, it creates one and adds + * it to the global peer list. + */ +int +mxlnd_tx_queued(void *arg) +{ + long id = (long) arg; + int ret = 0; + int found = 0; + struct kmx_ctx *tx = NULL; + struct kmx_peer *peer = NULL; + struct list_head *tmp_tx = NULL; + + cfs_daemonize("mxlnd_tx_queued"); + //cfs_block_allsigs(); + + while (!kmxlnd_data.kmx_shutdown) { + ret = down_interruptible(&kmxlnd_data.kmx_tx_queue_sem); + if (kmxlnd_data.kmx_shutdown) + break; + if (ret != 0) // Should we check for -EINTR? 
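+                        /* [Editor's note: down_interruptible() returns 0 once
+                         * the semaphore is acquired and -EINTR if interrupted
+                         * by a signal, so any non-zero return just means
+                         * nothing was dequeued: re-check shutdown and loop,
+                         * which is what the continue below does.] */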
+ continue; + spin_lock(&kmxlnd_data.kmx_tx_queue_lock); + if (list_empty (&kmxlnd_data.kmx_tx_queue)) { + spin_unlock(&kmxlnd_data.kmx_tx_queue_lock); + continue; + } + tmp_tx = &kmxlnd_data.kmx_tx_queue; + tx = list_entry (tmp_tx->next, struct kmx_ctx, mxc_list); + list_del_init(&tx->mxc_list); + spin_unlock(&kmxlnd_data.kmx_tx_queue_lock); + + found = 0; + peer = mxlnd_find_peer_by_nid(tx->mxc_nid); + if (peer != NULL) { + tx->mxc_peer = peer; + tx->mxc_conn = peer->mxp_conn; + mxlnd_conn_addref(tx->mxc_conn); /* for this tx */ + mxlnd_queue_tx(tx); + found = 1; + } + if (found == 0) { + int hash = 0; + struct kmx_peer *peer = NULL; + struct kmx_peer *old = NULL; + + hash = mxlnd_nid_to_hash(tx->mxc_nid); + + LASSERT(tx->mxc_msg_type != MXLND_MSG_PUT_DATA && + tx->mxc_msg_type != MXLND_MSG_GET_DATA); + /* create peer */ + ret = mxlnd_peer_alloc(&peer, tx->mxc_nid); + if (ret != 0) { + /* finalize message */ + tx->mxc_status.code = -ECONNABORTED; + mxlnd_put_idle_tx(tx); + continue; + } + tx->mxc_peer = peer; + tx->mxc_conn = peer->mxp_conn; + + /* add peer to global peer list, but look to see + * if someone already created it after we released + * the read lock */ + write_lock(&kmxlnd_data.kmx_peers_lock); + list_for_each_entry(old, &kmxlnd_data.kmx_peers[hash], mxp_peers) { + if (old->mxp_nid == peer->mxp_nid) { + /* somebody beat us here, we created a duplicate */ + found = 1; + break; + } + } + + if (found == 0) { + list_add_tail(&peer->mxp_peers, &kmxlnd_data.kmx_peers[hash]); + atomic_inc(&kmxlnd_data.kmx_npeers); + } else { + tx->mxc_peer = old; + tx->mxc_conn = old->mxp_conn; + mxlnd_reduce_idle_rxs(*kmxlnd_tunables.kmx_credits - 1); + mxlnd_peer_decref(peer); + } + mxlnd_conn_addref(tx->mxc_conn); /* for this tx */ + write_unlock(&kmxlnd_data.kmx_peers_lock); + + mxlnd_queue_tx(tx); + } + } + mxlnd_thread_stop(id); + return 0; +} + +/* When calling this, we must not have the peer lock. 
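+ * [Editor's note: mx_iconnect() is asynchronous -- the connect completes
+ *  through the endpoint's event queue using the mask passed here
+ *  (MXLND_MASK_ICON_REQ or MXLND_MASK_ICON_ACK), which is why the conn
+ *  takes an extra reference that is only dropped when that completion
+ *  is handled.]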
*/ +void +mxlnd_iconnect(struct kmx_peer *peer, u64 mask) +{ + mx_return_t mxret = MX_SUCCESS; + mx_request_t request; + struct kmx_conn *conn = peer->mxp_conn; + + mxlnd_conn_addref(conn); /* hold until CONN_REQ or CONN_ACK completes */ + + LASSERT(mask == MXLND_MASK_ICON_REQ || + mask == MXLND_MASK_ICON_ACK); + + if (peer->mxp_reconnect_time == 0) { + peer->mxp_reconnect_time = jiffies; + } + + if (peer->mxp_nic_id == 0LL) { + mxlnd_peer_hostname_to_nic_id(peer); + if (peer->mxp_nic_id == 0LL) { + /* not mapped yet, return */ + spin_lock(&conn->mxk_lock); + conn->mxk_status = MXLND_CONN_INIT; + spin_unlock(&conn->mxk_lock); + if (time_after(jiffies, peer->mxp_reconnect_time + MXLND_WAIT_TIMEOUT)) { + /* give up and notify LNET */ + mxlnd_conn_disconnect(conn, 0, 1); + mxlnd_conn_alloc(&peer->mxp_conn, peer); + } + mxlnd_conn_decref(conn); + return; + } + } + + mxret = mx_iconnect(kmxlnd_data.kmx_endpt, peer->mxp_nic_id, + peer->mxp_host->mxh_ep_id, MXLND_MSG_MAGIC, mask, + (void *) peer, &request); + if (unlikely(mxret != MX_SUCCESS)) { + spin_lock(&conn->mxk_lock); + conn->mxk_status = MXLND_CONN_FAIL; + spin_unlock(&conn->mxk_lock); + CDEBUG(D_NETERROR, "mx_iconnect() failed with %s (%d) to %s\n", + mx_strerror(mxret), mxret, libcfs_nid2str(peer->mxp_nid)); + mxlnd_conn_decref(conn); + } + return; +} + +#define MXLND_STATS 0 + +int +mxlnd_check_sends(struct kmx_peer *peer) +{ + int ret = 0; + int found = 0; + mx_return_t mxret = MX_SUCCESS; + struct kmx_ctx *tx = NULL; + struct kmx_conn *conn = NULL; + u8 msg_type = 0; + int credit = 0; + int status = 0; + int ntx_posted = 0; + int credits = 0; +#if MXLND_STATS + static unsigned long last = 0; +#endif + + if (unlikely(peer == NULL)) { + LASSERT(peer != NULL); + return -1; + } + conn = peer->mxp_conn; + /* do not add another ref for this tx */ + + if (conn == NULL) { + /* we do not have any conns */ + return -1; + } + +#if MXLND_STATS + if (time_after(jiffies, last)) { + last = jiffies + HZ; + CDEBUG(D_NET, "status= %s credits= %d outstanding= %d ntx_msgs= %d " + "ntx_posted= %d ntx_data= %d data_posted= %d\n", + mxlnd_connstatus_to_str(conn->mxk_status), conn->mxk_credits, + conn->mxk_outstanding, conn->mxk_ntx_msgs, conn->mxk_ntx_posted, + conn->mxk_ntx_data, conn->mxk_data_posted); + } +#endif + + /* cache peer state for asserts */ + spin_lock(&conn->mxk_lock); + ntx_posted = conn->mxk_ntx_posted; + credits = conn->mxk_credits; + spin_unlock(&conn->mxk_lock); + + LASSERT(ntx_posted <= *kmxlnd_tunables.kmx_credits); + LASSERT(ntx_posted >= 0); + + LASSERT(credits <= *kmxlnd_tunables.kmx_credits); + LASSERT(credits >= 0); + + /* check number of queued msgs, ignore data */ + spin_lock(&conn->mxk_lock); + if (conn->mxk_outstanding >= MXLND_CREDIT_HIGHWATER) { + /* check if any txs queued that could return credits... 
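+         * [Editor's note: mxk_outstanding counts messages received from the
+         *  peer whose credits have not yet been returned; at or above
+         *  MXLND_CREDIT_HIGHWATER the code below sends a NOOP purely to
+         *  carry mxm_credits back (see mxlnd_pack_msg()), so the peer does
+         *  not stall when no payload traffic flows in this direction.]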
*/ + if (list_empty(&conn->mxk_tx_credit_queue) || conn->mxk_ntx_msgs == 0) { + /* if not, send a NOOP */ + tx = mxlnd_get_idle_tx(); + if (likely(tx != NULL)) { + tx->mxc_peer = peer; + tx->mxc_conn = peer->mxp_conn; + mxlnd_conn_addref(conn); /* for this tx */ + mxlnd_init_tx_msg (tx, MXLND_MSG_NOOP, 0, peer->mxp_nid); + tx->mxc_match = mxlnd_create_match(tx, 0); + mxlnd_peer_queue_tx_locked(tx); + found = 1; + goto done_locked; + } + } + } + spin_unlock(&conn->mxk_lock); + + /* if the peer is not ready, try to connect */ + spin_lock(&conn->mxk_lock); + if (unlikely(conn->mxk_status == MXLND_CONN_INIT || + conn->mxk_status == MXLND_CONN_FAIL || + conn->mxk_status == MXLND_CONN_REQ)) { + CDEBUG(D_NET, "status=%s\n", mxlnd_connstatus_to_str(conn->mxk_status)); + conn->mxk_status = MXLND_CONN_WAIT; + spin_unlock(&conn->mxk_lock); + mxlnd_iconnect(peer, MXLND_MASK_ICON_REQ); + goto done; + } + spin_unlock(&conn->mxk_lock); + + spin_lock(&conn->mxk_lock); + while (!list_empty(&conn->mxk_tx_free_queue) || + !list_empty(&conn->mxk_tx_credit_queue)) { + /* We have something to send. If we have a queued tx that does not + * require a credit (free), choose it since its completion will + * return a credit (here or at the peer), complete a DATA or + * CONN_REQ or CONN_ACK. */ + struct list_head *tmp_tx = NULL; + if (!list_empty(&conn->mxk_tx_free_queue)) { + tmp_tx = &conn->mxk_tx_free_queue; + } else { + tmp_tx = &conn->mxk_tx_credit_queue; + } + tx = list_entry(tmp_tx->next, struct kmx_ctx, mxc_list); + + msg_type = tx->mxc_msg_type; + + /* don't try to send a rx */ + LASSERT(tx->mxc_type == MXLND_REQ_TX); + + /* ensure that it is a valid msg type */ + LASSERT(msg_type == MXLND_MSG_CONN_REQ || + msg_type == MXLND_MSG_CONN_ACK || + msg_type == MXLND_MSG_NOOP || + msg_type == MXLND_MSG_EAGER || + msg_type == MXLND_MSG_PUT_REQ || + msg_type == MXLND_MSG_PUT_ACK || + msg_type == MXLND_MSG_PUT_DATA || + msg_type == MXLND_MSG_GET_REQ || + msg_type == MXLND_MSG_GET_DATA); + LASSERT(tx->mxc_peer == peer); + LASSERT(tx->mxc_nid == peer->mxp_nid); + + credit = mxlnd_tx_requires_credit(tx); + if (credit) { + + if (conn->mxk_ntx_posted == *kmxlnd_tunables.kmx_credits) { + CDEBUG(D_NET, "%s: posted enough\n", + libcfs_nid2str(peer->mxp_nid)); + goto done_locked; + } + + if (conn->mxk_credits == 0) { + CDEBUG(D_NET, "%s: no credits\n", + libcfs_nid2str(peer->mxp_nid)); + goto done_locked; + } + + if (conn->mxk_credits == 1 && /* last credit reserved for */ + conn->mxk_outstanding == 0) { /* giving back credits */ + CDEBUG(D_NET, "%s: not using last credit\n", + libcfs_nid2str(peer->mxp_nid)); + goto done_locked; + } + } + + if (unlikely(conn->mxk_status != MXLND_CONN_READY)) { + if ( ! 
(msg_type == MXLND_MSG_CONN_REQ || + msg_type == MXLND_MSG_CONN_ACK)) { + CDEBUG(D_NET, "peer status is %s for tx 0x%llx (%s)\n", + mxlnd_connstatus_to_str(conn->mxk_status), + tx->mxc_cookie, + mxlnd_msgtype_to_str(tx->mxc_msg_type)); + if (conn->mxk_status == MXLND_CONN_DISCONNECT) { + list_del_init(&tx->mxc_list); + tx->mxc_status.code = -ECONNABORTED; + mxlnd_put_idle_tx(tx); + mxlnd_conn_decref(conn); + } + goto done_locked; + } + } + + list_del_init(&tx->mxc_list); + + /* handle credits, etc now while we have the lock to avoid races */ + if (credit) { + conn->mxk_credits--; + conn->mxk_ntx_posted++; + } + if (msg_type != MXLND_MSG_PUT_DATA && + msg_type != MXLND_MSG_GET_DATA) { + if (msg_type != MXLND_MSG_CONN_REQ && + msg_type != MXLND_MSG_CONN_ACK) { + conn->mxk_ntx_msgs--; + } + } + if (tx->mxc_incarnation == 0 && + conn->mxk_incarnation != 0) { + tx->mxc_incarnation = conn->mxk_incarnation; + } + spin_unlock(&conn->mxk_lock); + + /* if this is a NOOP and (1) mxp_conn->mxk_outstanding < CREDIT_HIGHWATER + * or (2) there is a non-DATA msg that can return credits in the + * queue, then drop this duplicate NOOP */ + if (unlikely(msg_type == MXLND_MSG_NOOP)) { + spin_lock(&conn->mxk_lock); + if ((conn->mxk_outstanding < MXLND_CREDIT_HIGHWATER) || + (conn->mxk_ntx_msgs >= 1)) { + conn->mxk_credits++; + conn->mxk_ntx_posted--; + spin_unlock(&conn->mxk_lock); + /* redundant NOOP */ + mxlnd_put_idle_tx(tx); + mxlnd_conn_decref(conn); + CDEBUG(D_NET, "%s: redundant noop\n", + libcfs_nid2str(peer->mxp_nid)); + found = 1; + goto done; + } + spin_unlock(&conn->mxk_lock); + } + + found = 1; + if (likely((msg_type != MXLND_MSG_PUT_DATA) && + (msg_type != MXLND_MSG_GET_DATA))) { + mxlnd_pack_msg(tx); + } + + //ret = -ECONNABORTED; + mxret = MX_SUCCESS; + + spin_lock(&conn->mxk_lock); + status = conn->mxk_status; + spin_unlock(&conn->mxk_lock); + + if (likely((status == MXLND_CONN_READY) || + (msg_type == MXLND_MSG_CONN_REQ) || + (msg_type == MXLND_MSG_CONN_ACK))) { + ret = 0; + if (msg_type != MXLND_MSG_CONN_REQ && + msg_type != MXLND_MSG_CONN_ACK) { + /* add to the pending list */ + ret = mxlnd_q_pending_ctx(tx); + if (ret == -1) { + /* FIXME the conn is disconnected, now what? */ + } + } else { + /* CONN_REQ/ACK */ + tx->mxc_state = MXLND_CTX_PENDING; + } + + if (ret == 0) { + if (likely(msg_type != MXLND_MSG_PUT_DATA && + msg_type != MXLND_MSG_GET_DATA)) { + /* send a msg style tx */ + LASSERT(tx->mxc_nseg == 1); + LASSERT(tx->mxc_pin_type == MX_PIN_PHYSICAL); + CDEBUG(D_NET, "sending %s 0x%llx\n", + mxlnd_msgtype_to_str(msg_type), + tx->mxc_cookie); + mxret = mx_kisend(kmxlnd_data.kmx_endpt, + &tx->mxc_seg, + tx->mxc_nseg, + tx->mxc_pin_type, + conn->mxk_epa, + tx->mxc_match, + (void *) tx, + &tx->mxc_mxreq); + } else { + /* send a DATA tx */ + spin_lock(&conn->mxk_lock); + conn->mxk_ntx_data--; + conn->mxk_data_posted++; + spin_unlock(&conn->mxk_lock); + CDEBUG(D_NET, "sending %s 0x%llx\n", + mxlnd_msgtype_to_str(msg_type), + tx->mxc_cookie); + mxret = mx_kisend(kmxlnd_data.kmx_endpt, + tx->mxc_seg_list, + tx->mxc_nseg, + tx->mxc_pin_type, + conn->mxk_epa, + tx->mxc_match, + (void *) tx, + &tx->mxc_mxreq); + } + } else { + mxret = MX_CONNECTION_FAILED; + } + if (likely(mxret == MX_SUCCESS)) { + ret = 0; + } else { + CDEBUG(D_NETERROR, "mx_kisend() failed with %s (%d) " + "sending to %s\n", mx_strerror(mxret), (int) mxret, + libcfs_nid2str(peer->mxp_nid)); + /* NOTE mx_kisend() only fails if there are not enough + * resources. Do not change the connection status. 
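+                         * [Editor's note: the unwind below therefore mirrors
+                         * the bookkeeping done before the send -- credit txs
+                         * give back mxk_credits/mxk_ntx_posted, DATA txs
+                         * decrement mxk_data_posted, and other non-CONN msgs
+                         * restore the piggybacked mxm_credits into
+                         * mxk_outstanding before the tx is failed.]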
*/ + if (mxret == MX_NO_RESOURCES) { + tx->mxc_status.code = -ENOMEM; + } else { + tx->mxc_status.code = -ECONNABORTED; + } + if (credit) { + spin_lock(&conn->mxk_lock); + conn->mxk_ntx_posted--; + conn->mxk_credits++; + spin_unlock(&conn->mxk_lock); + } else if (msg_type == MXLND_MSG_PUT_DATA || + msg_type == MXLND_MSG_GET_DATA) { + spin_lock(&conn->mxk_lock); + conn->mxk_data_posted--; + spin_unlock(&conn->mxk_lock); + } + if (msg_type != MXLND_MSG_PUT_DATA && + msg_type != MXLND_MSG_GET_DATA && + msg_type != MXLND_MSG_CONN_REQ && + msg_type != MXLND_MSG_CONN_ACK) { + spin_lock(&conn->mxk_lock); + conn->mxk_outstanding += tx->mxc_msg->mxm_credits; + spin_unlock(&conn->mxk_lock); + } + if (msg_type != MXLND_MSG_CONN_REQ && + msg_type != MXLND_MSG_CONN_ACK) { + /* remove from the pending list */ + mxlnd_deq_pending_ctx(tx); + } + mxlnd_put_idle_tx(tx); + mxlnd_conn_decref(conn); + } + } + spin_lock(&conn->mxk_lock); + } +done_locked: + spin_unlock(&conn->mxk_lock); +done: + return found; +} + + +/** + * mxlnd_handle_tx_completion - a tx completed, progress or complete the msg + * @ctx - the tx descriptor + * + * Determine which type of send request it was and start the next step, if needed, + * or, if done, signal completion to LNET. After we are done, put back on the + * idle tx list. + */ +void +mxlnd_handle_tx_completion(struct kmx_ctx *tx) +{ + int failed = (tx->mxc_status.code != MX_STATUS_SUCCESS); + struct kmx_msg *msg = tx->mxc_msg; + struct kmx_peer *peer = tx->mxc_peer; + struct kmx_conn *conn = tx->mxc_conn; + u8 type = tx->mxc_msg_type; + int credit = mxlnd_tx_requires_credit(tx); + u64 cookie = tx->mxc_cookie; + + CDEBUG(D_NET, "entering %s (0x%llx):\n", + mxlnd_msgtype_to_str(tx->mxc_msg_type), cookie); + + if (unlikely(conn == NULL)) { + mx_get_endpoint_addr_context(tx->mxc_status.source, (void **) &conn); + if (conn != NULL) { + /* do not add a ref for the tx, it was set before sending */ + tx->mxc_conn = conn; + tx->mxc_peer = conn->mxk_peer; + } + } + LASSERT (peer != NULL); + LASSERT (conn != NULL); + + if (type != MXLND_MSG_PUT_DATA && type != MXLND_MSG_GET_DATA) { + LASSERT (type == msg->mxm_type); + } + + if (failed) { + tx->mxc_status.code = -EIO; + } else { + spin_lock(&conn->mxk_lock); + conn->mxk_last_tx = jiffies; + spin_unlock(&conn->mxk_lock); + } + + switch (type) { + + case MXLND_MSG_GET_DATA: + spin_lock(&conn->mxk_lock); + if (conn->mxk_incarnation == tx->mxc_incarnation) { + conn->mxk_outstanding++; + conn->mxk_data_posted--; + } + spin_unlock(&conn->mxk_lock); + break; + + case MXLND_MSG_PUT_DATA: + spin_lock(&conn->mxk_lock); + if (conn->mxk_incarnation == tx->mxc_incarnation) { + conn->mxk_data_posted--; + } + spin_unlock(&conn->mxk_lock); + break; + + case MXLND_MSG_NOOP: + case MXLND_MSG_PUT_REQ: + case MXLND_MSG_PUT_ACK: + case MXLND_MSG_GET_REQ: + case MXLND_MSG_EAGER: + //case MXLND_MSG_NAK: + break; + + case MXLND_MSG_CONN_ACK: + if (peer->mxp_incompatible) { + /* we sent our params, now close this conn */ + mxlnd_conn_disconnect(conn, 0, 1); + } + case MXLND_MSG_CONN_REQ: + if (failed) { + CDEBUG(D_NETERROR, "handle_tx_completion(): %s " + "failed with %s (%d) to %s\n", + type == MXLND_MSG_CONN_REQ ? 
"CONN_REQ" : "CONN_ACK", + mx_strstatus(tx->mxc_status.code), + tx->mxc_status.code, + libcfs_nid2str(tx->mxc_nid)); + if (!peer->mxp_incompatible) { + spin_lock(&conn->mxk_lock); + conn->mxk_status = MXLND_CONN_FAIL; + spin_unlock(&conn->mxk_lock); + } + } + break; + + default: + CDEBUG(D_NETERROR, "Unknown msg type of %d\n", type); + LBUG(); + } + + if (credit) { + spin_lock(&conn->mxk_lock); + if (conn->mxk_incarnation == tx->mxc_incarnation) { + conn->mxk_ntx_posted--; + } + spin_unlock(&conn->mxk_lock); + } + + CDEBUG(D_NET, "leaving mxlnd_handle_tx_completion()\n"); + mxlnd_put_idle_tx(tx); + mxlnd_conn_decref(conn); + + mxlnd_check_sends(peer); + + return; +} + +void +mxlnd_handle_rx_completion(struct kmx_ctx *rx) +{ + int ret = 0; + int repost = 1; + int credit = 1; + u32 nob = rx->mxc_status.xfer_length; + u64 bits = rx->mxc_status.match_info; + struct kmx_msg *msg = rx->mxc_msg; + struct kmx_peer *peer = rx->mxc_peer; + struct kmx_conn *conn = rx->mxc_conn; + u8 type = rx->mxc_msg_type; + u64 seq = 0LL; + lnet_msg_t *lntmsg[2]; + int result = 0; + u64 nic_id = 0LL; + u32 ep_id = 0; + int decref = 1; + int incompatible = 0; + + /* NOTE We may only know the peer's nid if it is a PUT_REQ, GET_REQ, + * failed GET reply, CONN_REQ, or a CONN_ACK */ + + /* NOTE peer may still be NULL if it is a new peer */ + if (peer == NULL || conn == NULL) { + /* if the peer was disconnected, the peer may exist but + * not have any valid conns */ + decref = 0; /* no peer means no ref was taken for this rx */ + } + + if (conn == NULL && peer != NULL) { + conn = peer->mxp_conn; + rx->mxc_conn = conn; + } + +#if MXLND_DEBUG + CDEBUG(D_NET, "receiving msg bits=0x%llx nob=%d peer=0x%p\n", bits, nob, peer); +#endif + + lntmsg[0] = NULL; + lntmsg[1] = NULL; + + if (rx->mxc_status.code != MX_STATUS_SUCCESS) { + CDEBUG(D_NETERROR, "rx from %s failed with %s (%d)\n", + libcfs_nid2str(rx->mxc_nid), + mx_strstatus(rx->mxc_status.code), + (int) rx->mxc_status.code); + credit = 0; + goto cleanup; + } + + if (nob == 0) { + /* this may be a failed GET reply */ + if (type == MXLND_MSG_GET_DATA) { + bits = rx->mxc_status.match_info & 0x0FF0000000000000LL; + ret = (u32) (bits>>52); + lntmsg[0] = rx->mxc_lntmsg[0]; + result = -ret; + goto cleanup; + } else { + /* we had a rx complete with 0 bytes (no hdr, nothing) */ + CDEBUG(D_NETERROR, "rx from %s returned with 0 bytes\n", + libcfs_nid2str(rx->mxc_nid)); + goto cleanup; + } + } + + /* NOTE PUT_DATA and GET_DATA do not have mxc_msg, do not call unpack() */ + if (type == MXLND_MSG_PUT_DATA) { + result = rx->mxc_status.code; + lntmsg[0] = rx->mxc_lntmsg[0]; + goto cleanup; + } else if (type == MXLND_MSG_GET_DATA) { + result = rx->mxc_status.code; + lntmsg[0] = rx->mxc_lntmsg[0]; + lntmsg[1] = rx->mxc_lntmsg[1]; + goto cleanup; + } + + ret = mxlnd_unpack_msg(msg, nob); + if (ret != 0) { + CDEBUG(D_NETERROR, "Error %d unpacking rx from %s\n", + ret, libcfs_nid2str(rx->mxc_nid)); + goto cleanup; + } + rx->mxc_nob = nob; + type = msg->mxm_type; + seq = msg->mxm_seq; + + if (type != MXLND_MSG_CONN_REQ && + (!lnet_ptlcompat_matchnid(rx->mxc_nid, msg->mxm_srcnid) || + !lnet_ptlcompat_matchnid(kmxlnd_data.kmx_ni->ni_nid, msg->mxm_dstnid))) { + CDEBUG(D_NETERROR, "rx with mismatched NID (type %s) (my nid is " + "0x%llx and rx msg dst is 0x%llx)\n", + mxlnd_msgtype_to_str(type), kmxlnd_data.kmx_ni->ni_nid, + msg->mxm_dstnid); + goto cleanup; + } + + if (type != MXLND_MSG_CONN_REQ && type != MXLND_MSG_CONN_ACK) { + if ((conn != NULL && msg->mxm_srcstamp != conn->mxk_incarnation) || 
+ msg->mxm_dststamp != kmxlnd_data.kmx_incarnation) { + if (conn != NULL) { + CDEBUG(D_NETERROR, "Stale rx from %s with type %s " + "(mxm_srcstamp (%lld) != mxk_incarnation (%lld) " + "|| mxm_dststamp (%lld) != kmx_incarnation (%lld))\n", + libcfs_nid2str(rx->mxc_nid), mxlnd_msgtype_to_str(type), + msg->mxm_srcstamp, conn->mxk_incarnation, + msg->mxm_dststamp, kmxlnd_data.kmx_incarnation); + } else { + CDEBUG(D_NETERROR, "Stale rx from %s with type %s " + "mxm_dststamp (%lld) != kmx_incarnation (%lld))\n", + libcfs_nid2str(rx->mxc_nid), mxlnd_msgtype_to_str(type), + msg->mxm_dststamp, kmxlnd_data.kmx_incarnation); + } + credit = 0; + goto cleanup; + } + } + + CDEBUG(D_NET, "Received %s with %d credits\n", + mxlnd_msgtype_to_str(type), msg->mxm_credits); + + if (msg->mxm_type != MXLND_MSG_CONN_REQ && + msg->mxm_type != MXLND_MSG_CONN_ACK) { + LASSERT(peer != NULL); + LASSERT(conn != NULL); + if (msg->mxm_credits != 0) { + spin_lock(&conn->mxk_lock); + if (msg->mxm_srcstamp == conn->mxk_incarnation) { + if ((conn->mxk_credits + msg->mxm_credits) > + *kmxlnd_tunables.kmx_credits) { + CDEBUG(D_NETERROR, "mxk_credits %d mxm_credits %d\n", + conn->mxk_credits, msg->mxm_credits); + } + conn->mxk_credits += msg->mxm_credits; + LASSERT(conn->mxk_credits >= 0); + LASSERT(conn->mxk_credits <= *kmxlnd_tunables.kmx_credits); + } + spin_unlock(&conn->mxk_lock); + } + } + + CDEBUG(D_NET, "switch %s for rx (0x%llx)\n", mxlnd_msgtype_to_str(type), seq); + switch (type) { + case MXLND_MSG_NOOP: + break; + + case MXLND_MSG_EAGER: + ret = lnet_parse(kmxlnd_data.kmx_ni, &msg->mxm_u.eager.mxem_hdr, + msg->mxm_srcnid, rx, 0); + repost = ret < 0; + break; + + case MXLND_MSG_PUT_REQ: + ret = lnet_parse(kmxlnd_data.kmx_ni, &msg->mxm_u.put_req.mxprm_hdr, + msg->mxm_srcnid, rx, 1); + repost = ret < 0; + break; + + case MXLND_MSG_PUT_ACK: { + u64 cookie = (u64) msg->mxm_u.put_ack.mxpam_dst_cookie; + if (cookie > MXLND_MAX_COOKIE) { + CDEBUG(D_NETERROR, "NAK for msg_type %d from %s\n", rx->mxc_msg_type, + libcfs_nid2str(rx->mxc_nid)); + result = -((cookie >> 52) & 0xff); + lntmsg[0] = rx->mxc_lntmsg[0]; + } else { + mxlnd_send_data(kmxlnd_data.kmx_ni, rx->mxc_lntmsg[0], + rx->mxc_peer, MXLND_MSG_PUT_DATA, + rx->mxc_msg->mxm_u.put_ack.mxpam_dst_cookie); + } + /* repost == 1 */ + break; + } + case MXLND_MSG_GET_REQ: + ret = lnet_parse(kmxlnd_data.kmx_ni, &msg->mxm_u.get_req.mxgrm_hdr, + msg->mxm_srcnid, rx, 1); + repost = ret < 0; + break; + + case MXLND_MSG_CONN_REQ: + if (!lnet_ptlcompat_matchnid(kmxlnd_data.kmx_ni->ni_nid, msg->mxm_dstnid)) { + CDEBUG(D_NETERROR, "Can't accept %s: bad dst nid %s\n", + libcfs_nid2str(msg->mxm_srcnid), + libcfs_nid2str(msg->mxm_dstnid)); + goto cleanup; + } + if (msg->mxm_u.conn_req.mxcrm_queue_depth != *kmxlnd_tunables.kmx_credits) { + CDEBUG(D_NETERROR, "Can't accept %s: incompatible queue depth " + "%d (%d wanted)\n", + libcfs_nid2str(msg->mxm_srcnid), + msg->mxm_u.conn_req.mxcrm_queue_depth, + *kmxlnd_tunables.kmx_credits); + incompatible = 1; + } + if (msg->mxm_u.conn_req.mxcrm_eager_size != MXLND_EAGER_SIZE) { + CDEBUG(D_NETERROR, "Can't accept %s: incompatible EAGER size " + "%d (%d wanted)\n", + libcfs_nid2str(msg->mxm_srcnid), + msg->mxm_u.conn_req.mxcrm_eager_size, + (int) MXLND_EAGER_SIZE); + incompatible = 1; + } + if (peer == NULL) { + peer = mxlnd_find_peer_by_nid(msg->mxm_srcnid); + if (peer == NULL) { + int hash = 0; + hash = mxlnd_nid_to_hash(msg->mxm_srcnid); + + mx_decompose_endpoint_addr(rx->mxc_status.source, + &nic_id, &ep_id); + rx->mxc_nid = msg->mxm_srcnid; + + 
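/* The srcstamp/dststamp tests earlier in this function reject traffic from
 * a previous boot of either node: each side fixes its incarnation stamp at
 * startup and learns the peer's from CONN_REQ/CONN_ACK. A condensed sketch
 * of the rule applied above (the helper name is hypothetical):
 *
 *	static int sketch_rx_is_stale(struct kmx_msg *msg, struct kmx_conn *conn)
 *	{
 *		if (conn != NULL && msg->mxm_srcstamp != conn->mxk_incarnation)
 *			return 1;	// peer restarted since this conn formed
 *		if (msg->mxm_dststamp != kmxlnd_data.kmx_incarnation)
 *			return 1;	// message aimed at a previous boot of us
 *		return 0;
 *	}
 *
 * A stale rx is dropped with credit = 0, so it returns no flow-control
 * credit to the sender.
 */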
ret = mxlnd_peer_alloc(&peer, msg->mxm_srcnid); + if (ret != 0) { + goto cleanup; + } + LASSERT(peer->mxp_host->mxh_ep_id == ep_id); + write_lock(&kmxlnd_data.kmx_peers_lock); + list_add_tail(&peer->mxp_peers, + &kmxlnd_data.kmx_peers[hash]); + write_unlock(&kmxlnd_data.kmx_peers_lock); + atomic_inc(&kmxlnd_data.kmx_npeers); + } else { + ret = mxlnd_conn_alloc(&conn, peer); + if (ret != 0) { + CDEBUG(D_NETERROR, "Cannot allocate mxp_conn\n"); + goto cleanup; + } + } + conn = peer->mxp_conn; + } else { + struct kmx_conn *old_conn = conn; + + /* do not call mx_disconnect() */ + mxlnd_conn_disconnect(old_conn, 0, 0); + + /* the ref for this rx was taken on the old_conn */ + mxlnd_conn_decref(old_conn); + + /* do not decref this conn below */ + decref = 0; + + /* This allocs a conn, points peer->mxp_conn to this one. + * The old conn is still on the peer->mxp_conns list. + * As the pending requests complete, they will call + * conn_decref() which will eventually free it. */ + ret = mxlnd_conn_alloc(&conn, peer); + if (ret != 0) { + CDEBUG(D_NETERROR, "Cannot allocate peer->mxp_conn\n"); + goto cleanup; + } + } + spin_lock(&peer->mxp_lock); + peer->mxp_incarnation = msg->mxm_srcstamp; + peer->mxp_incompatible = incompatible; + spin_unlock(&peer->mxp_lock); + spin_lock(&conn->mxk_lock); + conn->mxk_incarnation = msg->mxm_srcstamp; + conn->mxk_status = MXLND_CONN_WAIT; + spin_unlock(&conn->mxk_lock); + + /* handle_conn_ack() will create the CONN_ACK msg */ + mxlnd_iconnect(peer, MXLND_MASK_ICON_ACK); + + break; + + case MXLND_MSG_CONN_ACK: + if (!lnet_ptlcompat_matchnid(kmxlnd_data.kmx_ni->ni_nid, msg->mxm_dstnid)) { + CDEBUG(D_NETERROR, "Can't accept CONN_ACK from %s: " + "bad dst nid %s\n", libcfs_nid2str(msg->mxm_srcnid), + libcfs_nid2str(msg->mxm_dstnid)); + ret = -1; + goto failed; + } + if (msg->mxm_u.conn_req.mxcrm_queue_depth != *kmxlnd_tunables.kmx_credits) { + CDEBUG(D_NETERROR, "Can't accept CONN_ACK from %s: " + "incompatible queue depth %d (%d wanted)\n", + libcfs_nid2str(msg->mxm_srcnid), + msg->mxm_u.conn_req.mxcrm_queue_depth, + *kmxlnd_tunables.kmx_credits); + spin_lock(&conn->mxk_lock); + conn->mxk_status = MXLND_CONN_FAIL; + spin_unlock(&conn->mxk_lock); + incompatible = 1; + ret = -1; + } + if (msg->mxm_u.conn_req.mxcrm_eager_size != MXLND_EAGER_SIZE) { + CDEBUG(D_NETERROR, "Can't accept CONN_ACK from %s: " + "incompatible EAGER size %d (%d wanted)\n", + libcfs_nid2str(msg->mxm_srcnid), + msg->mxm_u.conn_req.mxcrm_eager_size, + (int) MXLND_EAGER_SIZE); + spin_lock(&conn->mxk_lock); + conn->mxk_status = MXLND_CONN_FAIL; + spin_unlock(&conn->mxk_lock); + incompatible = 1; + ret = -1; + } + spin_lock(&peer->mxp_lock); + peer->mxp_incarnation = msg->mxm_srcstamp; + peer->mxp_incompatible = incompatible; + spin_unlock(&peer->mxp_lock); + spin_lock(&conn->mxk_lock); + conn->mxk_credits = *kmxlnd_tunables.kmx_credits; + conn->mxk_outstanding = 0; + conn->mxk_incarnation = msg->mxm_srcstamp; + conn->mxk_timeout = 0; + if (!incompatible) { + conn->mxk_status = MXLND_CONN_READY; + } + spin_unlock(&conn->mxk_lock); + if (incompatible) mxlnd_conn_disconnect(conn, 0, 1); + break; + + default: + CDEBUG(D_NETERROR, "Bad MXLND message type %x from %s\n", msg->mxm_type, + libcfs_nid2str(rx->mxc_nid)); + ret = -EPROTO; + break; + } + +failed: + if (ret < 0) { + MXLND_PRINT("setting PEER_CONN_FAILED\n"); + spin_lock(&conn->mxk_lock); + conn->mxk_status = MXLND_CONN_FAIL; + spin_unlock(&conn->mxk_lock); + } + +cleanup: + if (conn != NULL) { + spin_lock(&conn->mxk_lock); + conn->mxk_last_rx = 
cfs_time_current(); /* jiffies */ + spin_unlock(&conn->mxk_lock); + } + + if (repost) { + /* lnet_parse() failed, etc., repost now */ + mxlnd_put_idle_rx(rx); + if (conn != NULL && credit == 1) { + if (type == MXLND_MSG_PUT_DATA) { + spin_lock(&conn->mxk_lock); + conn->mxk_outstanding++; + spin_unlock(&conn->mxk_lock); + } else if (type != MXLND_MSG_GET_DATA && + (type == MXLND_MSG_EAGER || + type == MXLND_MSG_PUT_REQ || + type == MXLND_MSG_NOOP)) { + spin_lock(&conn->mxk_lock); + conn->mxk_outstanding++; + spin_unlock(&conn->mxk_lock); + } + } + if (decref) mxlnd_conn_decref(conn); + } + + if (type == MXLND_MSG_PUT_DATA || type == MXLND_MSG_GET_DATA) { + CDEBUG(D_NET, "leaving for rx (0x%llx)\n", bits); + } else { + CDEBUG(D_NET, "leaving for rx (0x%llx)\n", seq); + } + + if (lntmsg[0] != NULL) lnet_finalize(kmxlnd_data.kmx_ni, lntmsg[0], result); + if (lntmsg[1] != NULL) lnet_finalize(kmxlnd_data.kmx_ni, lntmsg[1], result); + + if (conn != NULL && credit == 1) mxlnd_check_sends(peer); + + return; +} + + + +void +mxlnd_handle_conn_req(struct kmx_peer *peer, mx_status_t status) +{ + struct kmx_ctx *tx = NULL; + struct kmx_msg *txmsg = NULL; + struct kmx_conn *conn = peer->mxp_conn; + + /* a conn ref was taken when calling mx_iconnect(), + * hold it until CONN_REQ or CONN_ACK completes */ + + CDEBUG(D_NET, "entering\n"); + if (status.code != MX_STATUS_SUCCESS) { + CDEBUG(D_NETERROR, "mx_iconnect() failed with %s (%d) to %s\n", + mx_strstatus(status.code), status.code, + libcfs_nid2str(peer->mxp_nid)); + spin_lock(&conn->mxk_lock); + conn->mxk_status = MXLND_CONN_FAIL; + spin_unlock(&conn->mxk_lock); + + if (time_after(jiffies, peer->mxp_reconnect_time + MXLND_WAIT_TIMEOUT)) { + struct kmx_conn *new_conn = NULL; + CDEBUG(D_NETERROR, "timeout, calling conn_disconnect()\n"); + mxlnd_conn_disconnect(conn, 0, 1); + mxlnd_conn_alloc(&new_conn, peer); + spin_lock(&peer->mxp_lock); + peer->mxp_reconnect_time = 0; + spin_unlock(&peer->mxp_lock); + } + + mxlnd_conn_decref(conn); + return; + } + + spin_lock(&conn->mxk_lock); + conn->mxk_epa = status.source; + spin_unlock(&conn->mxk_lock); + mx_set_endpoint_addr_context(conn->mxk_epa, (void *) conn); + + /* mx_iconnect() succeeded, reset delay to 0 */ + spin_lock(&peer->mxp_lock); + peer->mxp_reconnect_time = 0; + spin_unlock(&peer->mxp_lock); + + /* marshal CONN_REQ msg */ + /* we are still using the conn ref from iconnect() - do not take another */ + tx = mxlnd_get_idle_tx(); + if (tx == NULL) { + CDEBUG(D_NETERROR, "Can't allocate CONN_REQ tx for %s\n", + libcfs_nid2str(peer->mxp_nid)); + spin_lock(&conn->mxk_lock); + conn->mxk_status = MXLND_CONN_FAIL; + spin_unlock(&conn->mxk_lock); + mxlnd_conn_decref(conn); + return; + } + + tx->mxc_peer = peer; + tx->mxc_conn = conn; + mxlnd_init_tx_msg (tx, MXLND_MSG_CONN_REQ, sizeof(kmx_connreq_msg_t), peer->mxp_nid); + txmsg = tx->mxc_msg; + txmsg->mxm_u.conn_req.mxcrm_queue_depth = *kmxlnd_tunables.kmx_credits; + txmsg->mxm_u.conn_req.mxcrm_eager_size = MXLND_EAGER_SIZE; + tx->mxc_match = mxlnd_create_match(tx, 0); + + CDEBUG(D_NET, "sending MXLND_MSG_CONN_REQ\n"); + mxlnd_queue_tx(tx); + return; +} + +void +mxlnd_handle_conn_ack(struct kmx_peer *peer, mx_status_t status) +{ + struct kmx_ctx *tx = NULL; + struct kmx_msg *txmsg = NULL; + struct kmx_conn *conn = peer->mxp_conn; + + /* a conn ref was taken when calling mx_iconnect(), + * hold it until CONN_REQ or CONN_ACK completes */ + + CDEBUG(D_NET, "entering\n"); + if (status.code != MX_STATUS_SUCCESS) { + struct kmx_conn *conn = peer->mxp_conn; + 
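/* Both connection messages carry only the two parameters that must match
 * end to end: the message queue depth (credits) and the eager buffer size.
 * A condensed sketch of the acceptance test each peer applies on receipt
 * (field names are from mxlnd_wire.h; the helper itself is hypothetical):
 *
 *	static int sketch_connparams_ok(kmx_connreq_msg_t *cr)
 *	{
 *		return cr->mxcrm_queue_depth == *kmxlnd_tunables.kmx_credits &&
 *		       cr->mxcrm_eager_size == MXLND_EAGER_SIZE;
 *	}
 *
 * On a mismatch the peer is marked incompatible; the CONN_ACK is still sent
 * so the other side learns our parameters before the conn is torn down.
 */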
CDEBUG(D_NETERROR, "mx_iconnect() failed for CONN_ACK with %s (%d) " + "to %s mxp_nid = 0x%llx mxp_nic_id = 0x%0llx mxh_ep_id = %d\n", + mx_strstatus(status.code), status.code, + libcfs_nid2str(peer->mxp_nid), + peer->mxp_nid, + peer->mxp_nic_id, + peer->mxp_host->mxh_ep_id); + spin_lock(&conn->mxk_lock); + conn->mxk_status = MXLND_CONN_FAIL; + spin_unlock(&conn->mxk_lock); + + if (time_after(jiffies, peer->mxp_reconnect_time + MXLND_WAIT_TIMEOUT)) { + struct kmx_conn *new_conn = NULL; + CDEBUG(D_NETERROR, "timeout, calling conn_disconnect()\n"); + mxlnd_conn_disconnect(conn, 0, 1); + mxlnd_conn_alloc(&new_conn, peer); + spin_lock(&peer->mxp_lock); + peer->mxp_reconnect_time = 0; + spin_unlock(&peer->mxp_lock); + } + + mxlnd_conn_decref(conn); + return; + } + spin_lock(&conn->mxk_lock); + conn->mxk_epa = status.source; + if (likely(!peer->mxp_incompatible)) { + conn->mxk_status = MXLND_CONN_READY; + } + spin_unlock(&conn->mxk_lock); + mx_set_endpoint_addr_context(conn->mxk_epa, (void *) conn); + + /* mx_iconnect() succeeded, reset delay to 0 */ + spin_lock(&peer->mxp_lock); + peer->mxp_reconnect_time = 0; + spin_unlock(&peer->mxp_lock); + + /* marshal CONN_ACK msg */ + tx = mxlnd_get_idle_tx(); + if (tx == NULL) { + CDEBUG(D_NETERROR, "Can't allocate CONN_ACK tx for %s\n", + libcfs_nid2str(peer->mxp_nid)); + spin_lock(&conn->mxk_lock); + conn->mxk_status = MXLND_CONN_FAIL; + spin_unlock(&conn->mxk_lock); + mxlnd_conn_decref(conn); + return; + } + + tx->mxc_peer = peer; + tx->mxc_conn = conn; + CDEBUG(D_NET, "sending MXLND_MSG_CONN_ACK\n"); + mxlnd_init_tx_msg (tx, MXLND_MSG_CONN_ACK, sizeof(kmx_connreq_msg_t), peer->mxp_nid); + txmsg = tx->mxc_msg; + txmsg->mxm_u.conn_req.mxcrm_queue_depth = *kmxlnd_tunables.kmx_credits; + txmsg->mxm_u.conn_req.mxcrm_eager_size = MXLND_EAGER_SIZE; + tx->mxc_match = mxlnd_create_match(tx, 0); + + mxlnd_queue_tx(tx); + return; +} + +/** + * mxlnd_request_waitd - the MX request completion thread(s) + * @arg - thread id (as a void *) + * + * This thread waits for a MX completion and then completes the request. + * We will create one thread per CPU. + */ +int +mxlnd_request_waitd(void *arg) +{ + long id = (long) arg; + char name[24]; + __u32 result = 0; + mx_return_t mxret = MX_SUCCESS; + mx_status_t status; + struct kmx_ctx *ctx = NULL; + enum kmx_req_state req_type = MXLND_REQ_TX; + struct kmx_peer *peer = NULL; + struct kmx_conn *conn = NULL; +#if MXLND_POLLING + int count = 0; +#endif + + memset(name, 0, sizeof(name)); + snprintf(name, sizeof(name), "mxlnd_request_waitd_%02ld", id); + cfs_daemonize(name); + //cfs_block_allsigs(); + + memset(&status, 0, sizeof(status)); + + CDEBUG(D_NET, "%s starting\n", name); + + while (!kmxlnd_data.kmx_shutdown) { + mxret = MX_SUCCESS; + result = 0; +#if MXLND_POLLING + if (id == 0 && count++ < *kmxlnd_tunables.kmx_polling) { + mxret = mx_test_any(kmxlnd_data.kmx_endpt, 0LL, 0LL, + &status, &result); + } else { + count = 0; + mxret = mx_wait_any(kmxlnd_data.kmx_endpt, MXLND_WAIT_TIMEOUT, + 0LL, 0LL, &status, &result); + } +#else + mxret = mx_wait_any(kmxlnd_data.kmx_endpt, MXLND_WAIT_TIMEOUT, + 0LL, 0LL, &status, &result); +#endif + if (unlikely(kmxlnd_data.kmx_shutdown)) + break; + + if (result != 1) { + /* nothing completed... 
*/ + continue; + } + + if (status.code != MX_STATUS_SUCCESS) { + CDEBUG(D_NETERROR, "wait_any() failed with %s (%d) with " + "match_info 0x%llx and length %d\n", + mx_strstatus(status.code), status.code, + (u64) status.match_info, status.msg_length); + } + + /* This may be a mx_iconnect() request completing, + * check the bit mask for CONN_REQ and CONN_ACK */ + if (status.match_info == MXLND_MASK_ICON_REQ || + status.match_info == MXLND_MASK_ICON_ACK) { + peer = (struct kmx_peer*) status.context; + if (status.match_info == MXLND_MASK_ICON_REQ) { + mxlnd_handle_conn_req(peer, status); + } else { + mxlnd_handle_conn_ack(peer, status); + } + continue; + } + + /* This must be a tx or rx */ + + /* NOTE: if this is a RX from the unexpected callback, it may + * have very little info. If we dropped it in unexpected_recv(), + * it will not have a context. If so, ignore it. */ + ctx = (struct kmx_ctx *) status.context; + if (ctx != NULL) { + + req_type = ctx->mxc_type; + conn = ctx->mxc_conn; /* this may be NULL */ + mxlnd_deq_pending_ctx(ctx); + + /* copy status to ctx->mxc_status */ + memcpy(&ctx->mxc_status, &status, sizeof(status)); + + switch (req_type) { + case MXLND_REQ_TX: + mxlnd_handle_tx_completion(ctx); + break; + case MXLND_REQ_RX: + mxlnd_handle_rx_completion(ctx); + break; + default: + CDEBUG(D_NETERROR, "Unknown ctx type %d\n", req_type); + LBUG(); + break; + } + + /* conn is always set except for the first CONN_REQ rx + * from a new peer */ + if (!(status.code == MX_STATUS_SUCCESS || + status.code == MX_STATUS_TRUNCATED) && + conn != NULL) { + mxlnd_conn_disconnect(conn, 1, 1); + } + } + CDEBUG(D_NET, "waitd() completed task\n"); + } + CDEBUG(D_NET, "%s stopping\n", name); + mxlnd_thread_stop(id); + return 0; +} + + +unsigned long +mxlnd_check_timeouts(unsigned long now) +{ + int i = 0; + int disconnect = 0; + unsigned long next = 0; + struct kmx_peer *peer = NULL; + struct kmx_conn *conn = NULL; + + read_lock(&kmxlnd_data.kmx_peers_lock); + for (i = 0; i < MXLND_HASH_SIZE; i++) { + list_for_each_entry(peer, &kmxlnd_data.kmx_peers[i], mxp_peers) { + + if (unlikely(kmxlnd_data.kmx_shutdown)) + return next; + + conn = peer->mxp_conn; + if (conn == NULL) + continue; + + mxlnd_conn_addref(conn); + spin_lock(&conn->mxk_lock); + + /* if nothing pending (timeout == 0) or + * if conn is already disconnected, + * skip this conn */ + if (conn->mxk_timeout == 0 || + conn->mxk_status == MXLND_CONN_DISCONNECT) { + spin_unlock(&conn->mxk_lock); + mxlnd_conn_decref(conn); + continue; + } + + /* we want to find the timeout that will occur first. + * if it is in the future, we will sleep until then. + * if it is in the past, then we will sleep one + * second and repeat the process. */ + if ((next == 0) || (conn->mxk_timeout < next)) { + next = conn->mxk_timeout; + } + + disconnect = 0; + + if (time_after_eq(now, conn->mxk_timeout)) { + disconnect = 1; + } + spin_unlock(&conn->mxk_lock); + + if (disconnect) { + mxlnd_conn_disconnect(conn, 1, 1); + } + mxlnd_conn_decref(conn); + } + } + read_unlock(&kmxlnd_data.kmx_peers_lock); + if (next == 0) next = now + MXLND_COMM_TIMEOUT; + + return next; +} + +/** + * mxlnd_timeoutd - enforces timeouts on messages + * @arg - thread id (as a void *) + * + * This thread queries each peer for its earliest timeout. If a peer has timed out, + * it calls mxlnd_conn_disconnect(). + * + * After checking for timeouts, try progressing sends (call check_sends()). 
+ */ +int +mxlnd_timeoutd(void *arg) +{ + int i = 0; + long id = (long) arg; + unsigned long now = 0; + unsigned long next = 0; + unsigned long delay = HZ; + struct kmx_peer *peer = NULL; + struct kmx_conn *conn = NULL; + + cfs_daemonize("mxlnd_timeoutd"); + //cfs_block_allsigs(); + + CDEBUG(D_NET, "timeoutd starting\n"); + + while (!kmxlnd_data.kmx_shutdown) { + + now = jiffies; + /* if the next timeout has not arrived, go back to sleep */ + if (time_after(now, next)) { + next = mxlnd_check_timeouts(now); + } + + read_lock(&kmxlnd_data.kmx_peers_lock); + for (i = 0; i < MXLND_HASH_SIZE; i++) { + list_for_each_entry(peer, &kmxlnd_data.kmx_peers[i], mxp_peers) { + conn = peer->mxp_conn; + if (conn == NULL) + continue; + + if (conn->mxk_status != MXLND_CONN_DISCONNECT && + time_after(now, conn->mxk_last_tx + HZ)) { + mxlnd_check_sends(peer); + } + } + } + read_unlock(&kmxlnd_data.kmx_peers_lock); + + mxlnd_sleep(delay); + } + CDEBUG(D_NET, "timeoutd stopping\n"); + mxlnd_thread_stop(id); + return 0; +} diff --git a/lnet/klnds/mxlnd/mxlnd_modparams.c b/lnet/klnds/mxlnd/mxlnd_modparams.c new file mode 100644 index 0000000..37d77f1 --- /dev/null +++ b/lnet/klnds/mxlnd/mxlnd_modparams.c @@ -0,0 +1,73 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * Copyright (C) 2006 Myricom, Inc. + * Author: Scott Atchley + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "mxlnd.h" + +static int n_waitd = MXLND_N_SCHED; +CFS_MODULE_PARM(n_waitd, "i", int, 0444, + "# of completion daemons"); + +static int max_peers = MXLND_MAX_PEERS; +CFS_MODULE_PARM(max_peers, "i", int, 0444, + "maximum number of peers that may connect"); + +static int cksum = MXLND_CKSUM; +CFS_MODULE_PARM(cksum, "i", int, 0644, + "set non-zero to enable message (not data payload) checksums"); + +static int ntx = MXLND_NTX; +CFS_MODULE_PARM(ntx, "i", int, 0444, + "# of total tx message descriptors"); + +static int credits = MXLND_MSG_QUEUE_DEPTH; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int board = MXLND_MX_BOARD; +CFS_MODULE_PARM(board, "i", int, 0444, + "index value of the Myrinet board (NIC)"); + +static int ep_id = MXLND_MX_EP_ID; +CFS_MODULE_PARM(ep_id, "i", int, 0444, + "MX endpoint ID"); + +static int polling = MXLND_POLLING; +CFS_MODULE_PARM(polling, "i", int, 0444, + "Use 0 to block (wait). 
A value > 0 will poll that many times before blocking"); + +static char *hosts = NULL; +CFS_MODULE_PARM(hosts, "s", charp, 0444, + "IP-to-hostname resolution file"); + +kmx_tunables_t kmxlnd_tunables = { + .kmx_n_waitd = &n_waitd, + .kmx_max_peers = &max_peers, + .kmx_cksum = &cksum, + .kmx_ntx = &ntx, + .kmx_credits = &credits, + .kmx_board = &board, + .kmx_ep_id = &ep_id, + .kmx_polling = &polling, + .kmx_hosts = &hosts +}; diff --git a/lnet/klnds/mxlnd/mxlnd_wire.h b/lnet/klnds/mxlnd/mxlnd_wire.h new file mode 100644 index 0000000..a929608 --- /dev/null +++ b/lnet/klnds/mxlnd/mxlnd_wire.h @@ -0,0 +1,95 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * Copyright (C) 2006 Myricom, Inc. + * Author: Scott Atchley + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * MXLND wire format - sent in sender's byte order + */ + +typedef struct kmx_connreq_msg +{ + u32 mxcrm_queue_depth; /* per peer max messages in flight */ + u32 mxcrm_eager_size; /* size of preposted eager messages */ +} WIRE_ATTR kmx_connreq_msg_t; + +typedef struct kmx_eager_msg +{ + lnet_hdr_t mxem_hdr; /* lnet header */ + char mxem_payload[0]; /* piggy-backed payload */ +} WIRE_ATTR kmx_eager_msg_t; + +typedef struct kmx_putreq_msg +{ + lnet_hdr_t mxprm_hdr; /* lnet header */ + u64 mxprm_cookie; /* opaque completion cookie */ +} WIRE_ATTR kmx_putreq_msg_t; + +typedef struct kmx_putack_msg +{ + u64 mxpam_src_cookie; /* reflected completion cookie */ + u64 mxpam_dst_cookie; /* opaque completion cookie */ +} WIRE_ATTR kmx_putack_msg_t; + +typedef struct kmx_getreq_msg +{ + lnet_hdr_t mxgrm_hdr; /* lnet header */ + u64 mxgrm_cookie; /* opaque completion cookie */ +} WIRE_ATTR kmx_getreq_msg_t; + +typedef struct kmx_msg +{ + /* First two fields fixed for all time */ + u32 mxm_magic; /* MXLND message */ + u16 mxm_version; /* version number */ + + u8 mxm_type; /* message type */ + u8 mxm_credits; /* returned credits */ + u32 mxm_nob; /* # of bytes in whole message */ + u32 mxm_cksum; /* checksum (0 == no checksum) */ + u64 mxm_srcnid; /* sender's NID */ + u64 mxm_srcstamp; /* sender's incarnation */ + u64 mxm_dstnid; /* destination's NID */ + u64 mxm_dststamp; /* destination's incarnation */ + u64 mxm_seq; /* sequence number */ + + union { + kmx_connreq_msg_t conn_req; + kmx_eager_msg_t eager; + kmx_putreq_msg_t put_req; + kmx_putack_msg_t put_ack; + kmx_getreq_msg_t get_req; + } WIRE_ATTR mxm_u; +} WIRE_ATTR kmx_msg_t; + +#define MXLND_MSG_MAGIC 0x4d583130 /* unique magic 'MX10' */ +#define MXLND_MSG_VERSION 0x01 + +#define MXLND_MSG_CONN_REQ 0xc /* connection request */ +#define MXLND_MSG_CONN_ACK 0xa /* connection request response */ +#define MXLND_MSG_EAGER 0xe /* eager message */ +#define MXLND_MSG_NOOP 0x1 /* no msg, return credits */ +#define 
MXLND_MSG_PUT_REQ 0x2 /* put request src->sink */ +#define MXLND_MSG_PUT_ACK 0x3 /* put ack src<-sink */ +#define MXLND_MSG_PUT_DATA 0x4 /* put payload src->sink */ +#define MXLND_MSG_GET_REQ 0x5 /* get request sink->src */ +#define MXLND_MSG_GET_DATA 0x6 /* get payload sink<-src */ diff --git a/lnet/klnds/o2iblnd/.cvsignore b/lnet/klnds/o2iblnd/.cvsignore new file mode 100644 index 0000000..2e9b6f4 --- /dev/null +++ b/lnet/klnds/o2iblnd/.cvsignore @@ -0,0 +1,11 @@ +.deps +Makefile +.*.cmd +autoMakefile.in +autoMakefile +*.ko +*.mod.c +.*.flags +.tmp_versions +.depend +wirecheck diff --git a/lnet/klnds/o2iblnd/Makefile.in b/lnet/klnds/o2iblnd/Makefile.in new file mode 100644 index 0000000..52a194d --- /dev/null +++ b/lnet/klnds/o2iblnd/Makefile.in @@ -0,0 +1,6 @@ +MODULES := ko2iblnd +ko2iblnd-objs := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o + +EXTRA_POST_CFLAGS := @O2IBCPPFLAGS@ + +@INCLUDE_RULES@ diff --git a/lnet/klnds/o2iblnd/autoMakefile.am b/lnet/klnds/o2iblnd/autoMakefile.am new file mode 100644 index 0000000..83788fd --- /dev/null +++ b/lnet/klnds/o2iblnd/autoMakefile.am @@ -0,0 +1,13 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +if MODULES +if BUILD_O2IBLND +modulenet_DATA = ko2iblnd$(KMODEXT) +endif +endif + +MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ +DIST_SOURCES = $(ko2iblnd-objs:%.o=%.c) o2iblnd.h diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c new file mode 100644 index 0000000..ded32d6 --- /dev/null +++ b/lnet/klnds/o2iblnd/o2iblnd.c @@ -0,0 +1,1710 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2006 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "o2iblnd.h" + +lnd_t the_kiblnd = { + .lnd_type = O2IBLND, + .lnd_startup = kiblnd_startup, + .lnd_shutdown = kiblnd_shutdown, + .lnd_ctl = kiblnd_ctl, + .lnd_send = kiblnd_send, + .lnd_recv = kiblnd_recv, +}; + +kib_data_t kiblnd_data; + +__u32 +kiblnd_cksum (void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; + + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + /* ensure I don't return 0 (== no checksum) */ + return (sum == 0) ? 1 : sum; +} + +void +kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob) +{ + msg->ibm_type = type; + msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob; +} + +void +kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, + int credits, lnet_nid_t dstnid, __u64 dststamp) +{ + kib_net_t *net = ni->ni_data; + + /* CAVEAT EMPTOR! all message fields not set here should have been + * initialised previously. 
*/ + msg->ibm_magic = IBLND_MSG_MAGIC; + msg->ibm_version = IBLND_MSG_VERSION; + /* ibm_type */ + msg->ibm_credits = credits; + /* ibm_nob */ + msg->ibm_cksum = 0; + msg->ibm_srcnid = lnet_ptlcompat_srcnid(ni->ni_nid, dstnid); + msg->ibm_srcstamp = net->ibn_incarnation; + msg->ibm_dstnid = dstnid; + msg->ibm_dststamp = dststamp; + + if (*kiblnd_tunables.kib_cksum) { + /* NB ibm_cksum zero while computing cksum */ + msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob); + } +} + +int +kiblnd_unpack_msg(kib_msg_t *msg, int nob) +{ + const int hdr_size = offsetof(kib_msg_t, ibm_u); + __u32 msg_cksum; + int flip; + int msg_nob; +#if !IBLND_MAP_ON_DEMAND + int i; + int n; +#endif + /* 6 bytes are enough to have received magic + version */ + if (nob < 6) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + if (msg->ibm_magic == IBLND_MSG_MAGIC) { + flip = 0; + } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) { + flip = 1; + } else { + CERROR("Bad magic: %08x\n", msg->ibm_magic); + return -EPROTO; + } + + if (msg->ibm_version != + (flip ? __swab16(IBLND_MSG_VERSION) : IBLND_MSG_VERSION)) { + CERROR("Bad version: %d\n", msg->ibm_version); + return -EPROTO; + } + + if (nob < hdr_size) { + CERROR("Short message: %d\n", nob); + return -EPROTO; + } + + msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; + if (msg_nob > nob) { + CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); + return -EPROTO; + } + + /* checksum must be computed with ibm_cksum zero and BEFORE anything + * gets flipped */ + msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum; + msg->ibm_cksum = 0; + if (msg_cksum != 0 && + msg_cksum != kiblnd_cksum(msg, msg_nob)) { + CERROR("Bad checksum\n"); + return -EPROTO; + } + msg->ibm_cksum = msg_cksum; + + if (flip) { + /* leave magic unflipped as a clue to peer endianness */ + __swab16s(&msg->ibm_version); + CLASSERT (sizeof(msg->ibm_type) == 1); + CLASSERT (sizeof(msg->ibm_credits) == 1); + msg->ibm_nob = msg_nob; + __swab64s(&msg->ibm_srcnid); + __swab64s(&msg->ibm_srcstamp); + __swab64s(&msg->ibm_dstnid); + __swab64s(&msg->ibm_dststamp); + } + + if (msg->ibm_srcnid == LNET_NID_ANY) { + CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); + return -EPROTO; + } + + switch (msg->ibm_type) { + default: + CERROR("Unknown message type %x\n", msg->ibm_type); + return -EPROTO; + + case IBLND_MSG_NOOP: + break; + + case IBLND_MSG_IMMEDIATE: + if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) { + CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob, + (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])); + return -EPROTO; + } + break; + + case IBLND_MSG_PUT_REQ: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) { + CERROR("Short PUT_REQ: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.putreq))); + return -EPROTO; + } + break; + + case IBLND_MSG_PUT_ACK: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) { + CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.putack))); + return -EPROTO; + } +#if IBLND_MAP_ON_DEMAND + if (flip) { + __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr); + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob); + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key); + } +#else + if (flip) { + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key); + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrags); + } + + n = msg->ibm_u.putack.ibpam_rd.rd_nfrags; + if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) { + CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", + n, IBLND_MAX_RDMA_FRAGS); + return -EPROTO; 
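/* Two details of the unpacking above are worth calling out. Byte order is
 * detected purely from the magic: a sender of opposite endianness shows up
 * as the byte-swapped constant. And the checksum is verified over the wire
 * image with ibm_cksum zeroed, before any field is flipped, so both ends
 * always sum identical bytes. A condensed sketch (hypothetical helper):
 *
 *	static int sketch_msg_flip(__u32 magic)
 *	{
 *		if (magic == IBLND_MSG_MAGIC)
 *			return 0;		// same byte order as ours
 *		if (magic == __swab32(IBLND_MSG_MAGIC))
 *			return 1;		// opposite byte order: flip fields
 *		return -EPROTO;			// not an IBLND message at all
 *	}
 *
 * The magic itself is left unflipped as a clue to the peer's endianness, as
 * the comment in the flip branch above notes.
 */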
+ } + + if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) { + CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, + (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])); + return -EPROTO; + } + + if (flip) { + for (i = 0; i < n; i++) { + __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob); + __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr); + } + } +#endif + break; + + case IBLND_MSG_GET_REQ: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) { + CERROR("Short GET_REQ: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.get))); + return -EPROTO; + } +#if IBLND_MAP_ON_DEMAND + if (flip) { + __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr); + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob); + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key); + } +#else + if (flip) { + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key); + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrags); + } + + n = msg->ibm_u.get.ibgm_rd.rd_nfrags; + if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) { + CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", + n, IBLND_MAX_RDMA_FRAGS); + return -EPROTO; + } + + if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) { + CERROR("Short GET_REQ: %d(%d)\n", msg_nob, + (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])); + return -EPROTO; + } + + if (flip) + for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrags; i++) { + __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob); + __swab64s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr); + } +#endif + break; + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) { + CERROR("Short RDMA completion: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.completion))); + return -EPROTO; + } + if (flip) + __swab32s(&msg->ibm_u.completion.ibcm_status); + break; + + case IBLND_MSG_CONNREQ: + case IBLND_MSG_CONNACK: + if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) { + CERROR("Short connreq/ack: %d(%d)\n", msg_nob, + (int)(hdr_size + sizeof(msg->ibm_u.connparams))); + return -EPROTO; + } + if (flip) { + __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth); + __swab16s(&msg->ibm_u.connparams.ibcp_max_frags); + __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); + } + break; + } + return 0; +} + +int +kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid) +{ + kib_peer_t *peer; + kib_net_t *net = ni->ni_data; + unsigned long flags; + + LASSERT (net != NULL); + LASSERT (nid != LNET_NID_ANY); + + LIBCFS_ALLOC(peer, sizeof(*peer)); + if (peer == NULL) { + CERROR("Cannot allocate peer\n"); + return -ENOMEM; + } + + memset(peer, 0, sizeof(*peer)); /* zero flags etc */ + + peer->ibp_ni = ni; + peer->ibp_nid = nid; + peer->ibp_error = 0; + peer->ibp_last_alive = cfs_time_current(); + atomic_set(&peer->ibp_refcount, 1); /* 1 ref for caller */ + + INIT_LIST_HEAD(&peer->ibp_list); /* not in the peer table yet */ + INIT_LIST_HEAD(&peer->ibp_conns); + INIT_LIST_HEAD(&peer->ibp_tx_queue); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + /* always called with a ref on ni, which prevents ni being shutdown */ + LASSERT (net->ibn_shutdown == 0); + + /* npeers only grows with the global lock held */ + atomic_inc(&net->ibn_npeers); + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + *peerp = peer; + return 0; +} + +void +kiblnd_destroy_peer (kib_peer_t *peer) +{ + kib_net_t *net = peer->ibp_ni->ni_data; + + LASSERT (net != NULL); + LASSERT (atomic_read(&peer->ibp_refcount) == 0); + LASSERT 
(!kiblnd_peer_active(peer)); + LASSERT (peer->ibp_connecting == 0); + LASSERT (peer->ibp_accepting == 0); + LASSERT (list_empty(&peer->ibp_conns)); + LASSERT (list_empty(&peer->ibp_tx_queue)); + + LIBCFS_FREE(peer, sizeof(*peer)); + + /* NB a peer's connections keep a reference on their peer until + * they are destroyed, so we can be assured that _all_ state to do + * with this peer has been cleaned up when its refcount drops to + * zero. */ + atomic_dec(&net->ibn_npeers); +} + +void +kiblnd_destroy_dev (kib_dev_t *dev) +{ + LASSERT (dev->ibd_nnets == 0); + + if (!list_empty(&dev->ibd_list)) /* on kib_devs? */ + list_del_init(&dev->ibd_list); + + if (dev->ibd_mr != NULL) + ib_dereg_mr(dev->ibd_mr); + + if (dev->ibd_pd != NULL) + ib_dealloc_pd(dev->ibd_pd); + + if (dev->ibd_cmid != NULL) + rdma_destroy_id(dev->ibd_cmid); + + LIBCFS_FREE(dev, sizeof(*dev)); +} + +kib_peer_t * +kiblnd_find_peer_locked (lnet_nid_t nid) +{ + /* the caller is responsible for accounting the additional reference + * that this creates */ + struct list_head *peer_list = kiblnd_nid2peerlist(nid); + struct list_head *tmp; + kib_peer_t *peer; + + list_for_each (tmp, peer_list) { + + peer = list_entry(tmp, kib_peer_t, ibp_list); + + LASSERT (peer->ibp_connecting > 0 || /* creating conns */ + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); /* active conn */ + + if (peer->ibp_nid != nid) + continue; + + CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n", + peer, libcfs_nid2str(nid), + atomic_read(&peer->ibp_refcount)); + return peer; + } + return NULL; +} + +void +kiblnd_unlink_peer_locked (kib_peer_t *peer) +{ + LASSERT (list_empty(&peer->ibp_conns)); + + LASSERT (kiblnd_peer_active(peer)); + list_del_init(&peer->ibp_list); + /* lose peerlist's ref */ + kiblnd_peer_decref(peer); +} + +int +kiblnd_get_peer_info (lnet_ni_t *ni, int index, + lnet_nid_t *nidp, int *count) +{ + kib_peer_t *peer; + struct list_head *ptmp; + int i; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { + + list_for_each (ptmp, &kiblnd_data.kib_peers[i]) { + + peer = list_entry(ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_connecting > 0 || + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); + + if (peer->ibp_ni != ni) + continue; + + if (index-- > 0) + continue; + + *nidp = peer->ibp_nid; + *count = atomic_read(&peer->ibp_refcount); + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return 0; + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return -ENOENT; +} + +void +kiblnd_del_peer_locked (kib_peer_t *peer) +{ + struct list_head *ctmp; + struct list_head *cnxt; + kib_conn_t *conn; + + if (list_empty(&peer->ibp_conns)) { + kiblnd_unlink_peer_locked(peer); + } else { + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + kiblnd_close_conn_locked(conn, 0); + } + /* NB closing peer's last conn unlinked it. */ + } + /* NB peer now unlinked; might even be freed if the peer table had the + * last ref on it. 
*/ +} + +int +kiblnd_del_peer (lnet_ni_t *ni, lnet_nid_t nid) +{ + CFS_LIST_HEAD (zombies); + struct list_head *ptmp; + struct list_head *pnxt; + kib_peer_t *peer; + int lo; + int hi; + int i; + unsigned long flags; + int rc = -ENOENT; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (nid != LNET_NID_ANY) { + lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; + } else { + lo = 0; + hi = kiblnd_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) { + peer = list_entry(ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_connecting > 0 || + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); + + if (peer->ibp_ni != ni) + continue; + + if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid)) + continue; + + if (!list_empty(&peer->ibp_tx_queue)) { + LASSERT (list_empty(&peer->ibp_conns)); + + list_splice_init(&peer->ibp_tx_queue, &zombies); + } + + kiblnd_del_peer_locked(peer); + rc = 0; /* matched something */ + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_txlist_done(ni, &zombies, -EIO); + + return rc; +} + +kib_conn_t * +kiblnd_get_conn_by_idx (lnet_ni_t *ni, int index) +{ + kib_peer_t *peer; + struct list_head *ptmp; + kib_conn_t *conn; + struct list_head *ctmp; + int i; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { + list_for_each (ptmp, &kiblnd_data.kib_peers[i]) { + + peer = list_entry(ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_connecting > 0 || + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); + + if (peer->ibp_ni != ni) + continue; + + list_for_each (ctmp, &peer->ibp_conns) { + if (index-- > 0) + continue; + + conn = list_entry(ctmp, kib_conn_t, ibc_list); + kiblnd_conn_addref(conn); + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return conn; + } + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return NULL; +} + +void +kiblnd_debug_rx (kib_rx_t *rx) +{ + CDEBUG(D_CONSOLE, " %p status %d msg_type %x cred %d\n", + rx, rx->rx_status, rx->rx_msg->ibm_type, + rx->rx_msg->ibm_credits); +} + +void +kiblnd_debug_tx (kib_tx_t *tx) +{ + CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lx " + "cookie "LPX64" msg %s%s type %x cred %d\n", + tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting, + tx->tx_status, tx->tx_deadline, tx->tx_cookie, + tx->tx_lntmsg[0] == NULL ? "-" : "!", + tx->tx_lntmsg[1] == NULL ? 
"-" : "!", + tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits); +} + +void +kiblnd_debug_conn (kib_conn_t *conn) +{ + struct list_head *tmp; + int i; + + spin_lock(&conn->ibc_lock); + + CDEBUG(D_CONSOLE, "conn[%d] %p -> %s: \n", + atomic_read(&conn->ibc_refcount), conn, + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + CDEBUG(D_CONSOLE, " state %d nposted %d cred %d o_cred %d r_cred %d\n", + conn->ibc_state, conn->ibc_nsends_posted, conn->ibc_credits, + conn->ibc_outstanding_credits, conn->ibc_reserved_credits); + CDEBUG(D_CONSOLE, " comms_err %d\n", conn->ibc_comms_error); + + CDEBUG(D_CONSOLE, " early_rxs:\n"); + list_for_each(tmp, &conn->ibc_early_rxs) + kiblnd_debug_rx(list_entry(tmp, kib_rx_t, rx_list)); + + CDEBUG(D_CONSOLE, " tx_queue_nocred:\n"); + list_for_each(tmp, &conn->ibc_tx_queue_nocred) + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue_rsrvd:\n"); + list_for_each(tmp, &conn->ibc_tx_queue_rsrvd) + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue:\n"); + list_for_each(tmp, &conn->ibc_tx_queue) + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " active_txs:\n"); + list_for_each(tmp, &conn->ibc_active_txs) + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " rxs:\n"); + for (i = 0; i < IBLND_RX_MSGS; i++) + kiblnd_debug_rx(&conn->ibc_rxs[i]); + + spin_unlock(&conn->ibc_lock); +} + +kib_conn_t * +kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid, int state) +{ + /* CAVEAT EMPTOR: + * If the new conn is created successfully it takes over the caller's + * ref on 'peer'. It also "owns" 'cmid' and destroys it when it itself + * is destroyed. On failure, the caller's ref on 'peer' remains and + * she must dispose of 'cmid'. (Actually I'd block forever if I tried + * to destroy 'cmid' here since I'm called from the CM which still has + * its ref on 'cmid'). */ + kib_conn_t *conn; + kib_net_t *net = peer->ibp_ni->ni_data; + int i; + int page_offset; + int ipage; + int rc; + struct ib_cq *cq; + struct ib_qp_init_attr *init_qp_attr; + unsigned long flags; + + LASSERT (net != NULL); + LASSERT (!in_interrupt()); + + LIBCFS_ALLOC(init_qp_attr, sizeof(*init_qp_attr)); + if (init_qp_attr == NULL) { + CERROR("Can't allocate qp_attr for %s\n", + libcfs_nid2str(peer->ibp_nid)); + goto failed_0; + } + + LIBCFS_ALLOC(conn, sizeof(*conn)); + if (conn == NULL) { + CERROR("Can't allocate connection for %s\n", + libcfs_nid2str(peer->ibp_nid)); + goto failed_1; + } + + memset(conn, 0, sizeof(*conn)); /* zero flags, NULL pointers etc... 
*/ + + conn->ibc_state = IBLND_CONN_INIT; + conn->ibc_peer = peer; /* I take the caller's ref */ + cmid->context = conn; /* for future CM callbacks */ + conn->ibc_cmid = cmid; + + INIT_LIST_HEAD(&conn->ibc_early_rxs); + INIT_LIST_HEAD(&conn->ibc_tx_queue); + INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd); + INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred); + INIT_LIST_HEAD(&conn->ibc_active_txs); + spin_lock_init(&conn->ibc_lock); + + LIBCFS_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + if (conn->ibc_connvars == NULL) { + CERROR("Can't allocate in-progress connection state\n"); + goto failed_2; + } + memset(conn->ibc_connvars, 0, sizeof(*conn->ibc_connvars)); + + LIBCFS_ALLOC(conn->ibc_rxs, IBLND_RX_MSGS * sizeof(kib_rx_t)); + if (conn->ibc_rxs == NULL) { + CERROR("Cannot allocate RX buffers\n"); + goto failed_2; + } + memset(conn->ibc_rxs, 0, IBLND_RX_MSGS * sizeof(kib_rx_t)); + + rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, IBLND_RX_MSG_PAGES); + if (rc != 0) + goto failed_2; + + for (i = ipage = page_offset = 0; i < IBLND_RX_MSGS; i++) { + struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; + kib_rx_t *rx = &conn->ibc_rxs[i]; + + rx->rx_conn = conn; + rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + + page_offset); + rx->rx_msgaddr = dma_map_single(cmid->device->dma_device, + rx->rx_msg, + IBLND_MSG_SIZE, + DMA_FROM_DEVICE); + pci_unmap_addr_set(rx, rx_msgunmap, rx->rx_msgaddr); + + CDEBUG(D_NET,"rx %d: %p "LPX64"("LPX64")\n", + i, rx->rx_msg, rx->rx_msgaddr, + lnet_page2phys(page) + page_offset); + + page_offset += IBLND_MSG_SIZE; + LASSERT (page_offset <= PAGE_SIZE); + + if (page_offset == PAGE_SIZE) { + page_offset = 0; + ipage++; + LASSERT (ipage <= IBLND_RX_MSG_PAGES); + } + } + + cq = ib_create_cq(cmid->device, + kiblnd_cq_completion, kiblnd_cq_event, conn, + IBLND_CQ_ENTRIES()); + if (!IS_ERR(cq)) { + conn->ibc_cq = cq; + } else { + CERROR("Can't create CQ: %ld\n", PTR_ERR(cq)); + goto failed_2; + } + + rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + if (rc != 0) { + CERROR("Can't request completion notification: %d\n", rc); + goto failed_2; + } + + memset(init_qp_attr, 0, sizeof(*init_qp_attr)); + init_qp_attr->event_handler = kiblnd_qp_event; + init_qp_attr->qp_context = conn; + init_qp_attr->cap.max_send_wr = (*kiblnd_tunables.kib_concurrent_sends) * + (1 + IBLND_MAX_RDMA_FRAGS); + init_qp_attr->cap.max_recv_wr = IBLND_RX_MSGS; + init_qp_attr->cap.max_send_sge = 1; + init_qp_attr->cap.max_recv_sge = 1; + init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR; + init_qp_attr->qp_type = IB_QPT_RC; + init_qp_attr->send_cq = cq; + init_qp_attr->recv_cq = cq; + + rc = 0; + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + switch (*kiblnd_tunables.kib_ib_mtu) { + default: + rc = *kiblnd_tunables.kib_ib_mtu; + /* fall through to... */ + case 0: /* set tunable to the default + * CAVEAT EMPTOR!
this assumes the default is one of the MTUs + * below, otherwise we'll WARN on the next QP create */ + *kiblnd_tunables.kib_ib_mtu = + ib_mtu_enum_to_int(cmid->route.path_rec->mtu); + break; + case 256: + cmid->route.path_rec->mtu = IB_MTU_256; + break; + case 512: + cmid->route.path_rec->mtu = IB_MTU_512; + break; + case 1024: + cmid->route.path_rec->mtu = IB_MTU_1024; + break; + case 2048: + cmid->route.path_rec->mtu = IB_MTU_2048; + break; + case 4096: + cmid->route.path_rec->mtu = IB_MTU_4096; + break; + } + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (rc != 0) + CWARN("Invalid IB MTU value %d, using default value %d\n", + rc, *kiblnd_tunables.kib_ib_mtu); + + rc = rdma_create_qp(cmid, net->ibn_dev->ibd_pd, init_qp_attr); + if (rc != 0) { + CERROR("Can't create QP: %d\n", rc); + goto failed_2; + } + + LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); + + /* 1 ref for caller and each rxmsg */ + atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS); + conn->ibc_nrx = IBLND_RX_MSGS; + + /* post receives */ + for (i = 0; i < IBLND_RX_MSGS; i++) { + rc = kiblnd_post_rx(&conn->ibc_rxs[i], + IBLND_POSTRX_NO_CREDIT); + if (rc != 0) { + CERROR("Can't post rxmsg: %d\n", rc); + + /* Make posted receives complete */ + kiblnd_abort_receives(conn); + + /* correct # of posted buffers + * NB locking needed now I'm racing with completion */ + spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags); + conn->ibc_nrx -= IBLND_RX_MSGS - i; + spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, + flags); + + /* Drop my own and unused rxbuffer refcounts */ + while (i++ <= IBLND_RX_MSGS) + kiblnd_conn_decref(conn); + + return NULL; + } + } + + /* Init successful! */ + LASSERT (state == IBLND_CONN_ACTIVE_CONNECT || + state == IBLND_CONN_PASSIVE_WAIT); + conn->ibc_state = state; + + /* 1 more conn */ + atomic_inc(&net->ibn_nconns); + return conn; + + failed_2: + kiblnd_destroy_conn(conn); + failed_1: + LIBCFS_FREE(init_qp_attr, sizeof(*init_qp_attr)); + failed_0: + return NULL; +} + +void +kiblnd_destroy_conn (kib_conn_t *conn) +{ + struct rdma_cm_id *cmid = conn->ibc_cmid; + kib_peer_t *peer = conn->ibc_peer; + int rc; + int i; + + LASSERT (!in_interrupt()); + LASSERT (atomic_read(&conn->ibc_refcount) == 0); + LASSERT (list_empty(&conn->ibc_early_rxs)); + LASSERT (list_empty(&conn->ibc_tx_queue)); + LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd)); + LASSERT (list_empty(&conn->ibc_tx_queue_nocred)); + LASSERT (list_empty(&conn->ibc_active_txs)); + LASSERT (conn->ibc_nsends_posted == 0); + + switch (conn->ibc_state) { + default: + /* conn must be completely disengaged from the network */ + LBUG(); + + case IBLND_CONN_DISCONNECTED: + /* connvars should have been freed already */ + LASSERT (conn->ibc_connvars == NULL); + break; + + case IBLND_CONN_INIT: + break; + } + + if (conn->ibc_cmid->qp != NULL) + rdma_destroy_qp(conn->ibc_cmid); + + if (conn->ibc_cq != NULL) { + rc = ib_destroy_cq(conn->ibc_cq); + if (rc != 0) + CWARN("Error destroying CQ: %d\n", rc); + } + + if (conn->ibc_rx_pages != NULL) { + LASSERT (conn->ibc_rxs != NULL); + + for (i = 0; i < IBLND_RX_MSGS; i++) { + kib_rx_t *rx = &conn->ibc_rxs[i]; + + LASSERT (rx->rx_nob >= 0); /* not posted */ + + dma_unmap_single(conn->ibc_cmid->device->dma_device, + pci_unmap_addr(rx, rx_msgunmap), + IBLND_MSG_SIZE, DMA_FROM_DEVICE); + } + + kiblnd_free_pages(conn->ibc_rx_pages); + } + + if (conn->ibc_rxs != NULL) { + LIBCFS_FREE(conn->ibc_rxs, + IBLND_RX_MSGS * sizeof(kib_rx_t)); + } + + if (conn->ibc_connvars != NULL) + 
LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + + /* See CAVEAT EMPTOR above in kiblnd_create_conn */ + if (conn->ibc_state != IBLND_CONN_INIT) { + kib_net_t *net = peer->ibp_ni->ni_data; + + kiblnd_peer_decref(peer); + rdma_destroy_id(cmid); + atomic_dec(&net->ibn_nconns); + } + + LIBCFS_FREE(conn, sizeof(*conn)); +} + +int +kiblnd_close_peer_conns_locked (kib_peer_t *peer, int why) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + count++; + kiblnd_close_conn_locked(conn, why); + } + + return count; +} + +int +kiblnd_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + if (conn->ibc_incarnation == incarnation) + continue; + + CDEBUG(D_NET, "Closing stale conn -> %s incarnation:"LPX64"("LPX64")\n", + libcfs_nid2str(peer->ibp_nid), + conn->ibc_incarnation, incarnation); + + count++; + kiblnd_close_conn_locked(conn, -ESTALE); + } + + return count; +} + +int +kiblnd_close_matching_conns (lnet_ni_t *ni, lnet_nid_t nid) +{ + kib_peer_t *peer; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + unsigned long flags; + int count = 0; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (nid != LNET_NID_ANY) + lo = hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; + else { + lo = 0; + hi = kiblnd_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &kiblnd_data.kib_peers[i]) { + + peer = list_entry(ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_connecting > 0 || + peer->ibp_accepting > 0 || + !list_empty(&peer->ibp_conns)); + + if (peer->ibp_ni != ni) + continue; + + if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid)) + continue; + + count += kiblnd_close_peer_conns_locked(peer, 0); + } + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* wildcards always succeed */ + if (nid == LNET_NID_ANY) + return 0; + + return (count == 0) ? 
-ENOENT : 0; +} + +int +kiblnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + int rc = -EINVAL; + + switch(cmd) { + case IOC_LIBCFS_GET_PEER: { + lnet_nid_t nid = 0; + int count = 0; + + rc = kiblnd_get_peer_info(ni, data->ioc_count, + &nid, &count); + data->ioc_nid = nid; + data->ioc_count = count; + break; + } + + case IOC_LIBCFS_DEL_PEER: { + rc = kiblnd_del_peer(ni, data->ioc_nid); + break; + } + case IOC_LIBCFS_GET_CONN: { + kib_conn_t *conn = kiblnd_get_conn_by_idx(ni, data->ioc_count); + + if (conn == NULL) { + rc = -ENOENT; + } else { + // kiblnd_debug_conn(conn); + rc = 0; + data->ioc_nid = conn->ibc_peer->ibp_nid; + kiblnd_conn_decref(conn); + } + break; + } + case IOC_LIBCFS_CLOSE_CONNECTION: { + rc = kiblnd_close_matching_conns(ni, data->ioc_nid); + break; + } + + default: + break; + } + + return rc; +} + +void +kiblnd_free_pages (kib_pages_t *p) +{ + int npages = p->ibp_npages; + int i; + + for (i = 0; i < npages; i++) + if (p->ibp_pages[i] != NULL) + __free_page(p->ibp_pages[i]); + + LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); +} + +int +kiblnd_alloc_pages (kib_pages_t **pp, int npages) +{ + kib_pages_t *p; + int i; + + LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); + if (p == NULL) { + CERROR("Can't allocate descriptor for %d pages\n", npages); + return -ENOMEM; + } + + memset(p, 0, offsetof(kib_pages_t, ibp_pages[npages])); + p->ibp_npages = npages; + + for (i = 0; i < npages; i++) { + p->ibp_pages[i] = alloc_page(GFP_KERNEL); + if (p->ibp_pages[i] == NULL) { + CERROR("Can't allocate page %d of %d\n", i, npages); + kiblnd_free_pages(p); + return -ENOMEM; + } + } + + *pp = p; + return 0; +} + +void +kiblnd_free_tx_descs (lnet_ni_t *ni) +{ + int i; + kib_net_t *net = ni->ni_data; + + LASSERT (net != NULL); + + if (net->ibn_tx_descs != NULL) { + for (i = 0; i < IBLND_TX_MSGS(); i++) { + kib_tx_t *tx = &net->ibn_tx_descs[i]; + +#if IBLND_MAP_ON_DEMAND + if (tx->tx_pages != NULL) + LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV * + sizeof(*tx->tx_pages)); +#else + if (tx->tx_wrq != NULL) + LIBCFS_FREE(tx->tx_wrq, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + + if (tx->tx_sge != NULL) + LIBCFS_FREE(tx->tx_sge, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_sge)); + + if (tx->tx_rd != NULL) + LIBCFS_FREE(tx->tx_rd, + offsetof(kib_rdma_desc_t, + rd_frags[IBLND_MAX_RDMA_FRAGS])); + + if (tx->tx_frags != NULL) + LIBCFS_FREE(tx->tx_frags, + IBLND_MAX_RDMA_FRAGS * + sizeof(*tx->tx_frags)); +#endif + } + + LIBCFS_FREE(net->ibn_tx_descs, + IBLND_TX_MSGS() * sizeof(kib_tx_t)); + } + + if (net->ibn_tx_pages != NULL) + kiblnd_free_pages(net->ibn_tx_pages); +} + +int +kiblnd_alloc_tx_descs (lnet_ni_t *ni) +{ + int i; + int rc; + kib_net_t *net = ni->ni_data; + + LASSERT (net != NULL); + + rc = kiblnd_alloc_pages(&net->ibn_tx_pages, IBLND_TX_MSG_PAGES()); + + if (rc != 0) { + CERROR("Can't allocate tx pages\n"); + return rc; + } + + LIBCFS_ALLOC (net->ibn_tx_descs, + IBLND_TX_MSGS() * sizeof(kib_tx_t)); + if (net->ibn_tx_descs == NULL) { + CERROR("Can't allocate %d tx descriptors\n", IBLND_TX_MSGS()); + return -ENOMEM; + } + + memset(net->ibn_tx_descs, 0, + IBLND_TX_MSGS() * sizeof(kib_tx_t)); + + for (i = 0; i < IBLND_TX_MSGS(); i++) { + kib_tx_t *tx = &net->ibn_tx_descs[i]; + +#if IBLND_MAP_ON_DEMAND + LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV * + sizeof(*tx->tx_pages)); + if (tx->tx_pages == NULL) { + CERROR("Can't allocate phys page vector[%d]\n", + LNET_MAX_IOV); + return -ENOMEM; + } +#else + 
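/* Everything in this branch is sized for the worst-case send: one work
 * request for the message itself plus one per RDMA fragment, hence the
 * (1 + IBLND_MAX_RDMA_FRAGS) factor on both the WR and SGE arrays, matching
 * max_send_wr in kiblnd_create_conn(). A condensed sketch of the per-tx
 * footprint these allocations add up to (illustrative arithmetic only):
 *
 *	nwr   = 1 + IBLND_MAX_RDMA_FRAGS;
 *	bytes = nwr * (sizeof(*tx->tx_wrq) + sizeof(*tx->tx_sge))
 *	      + offsetof(kib_rdma_desc_t, rd_frags[IBLND_MAX_RDMA_FRAGS])
 *	      + IBLND_MAX_RDMA_FRAGS * sizeof(*tx->tx_frags);
 */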
LIBCFS_ALLOC(tx->tx_wrq, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_wrq)); + if (tx->tx_wrq == NULL) + return -ENOMEM; + + LIBCFS_ALLOC(tx->tx_sge, + (1 + IBLND_MAX_RDMA_FRAGS) * + sizeof(*tx->tx_sge)); + if (tx->tx_sge == NULL) + return -ENOMEM; + + LIBCFS_ALLOC(tx->tx_rd, + offsetof(kib_rdma_desc_t, + rd_frags[IBLND_MAX_RDMA_FRAGS])); + if (tx->tx_rd == NULL) + return -ENOMEM; + + LIBCFS_ALLOC(tx->tx_frags, + IBLND_MAX_RDMA_FRAGS * + sizeof(*tx->tx_frags)); + if (tx->tx_frags == NULL) + return -ENOMEM; +#endif + } + + return 0; +} + +void +kiblnd_unmap_tx_descs (lnet_ni_t *ni) +{ + int i; + kib_tx_t *tx; + kib_net_t *net = ni->ni_data; + + LASSERT (net != NULL); + + for (i = 0; i < IBLND_TX_MSGS(); i++) { + tx = &net->ibn_tx_descs[i]; + + dma_unmap_single(net->ibn_dev->ibd_cmid->device->dma_device, + pci_unmap_addr(tx, tx_msgunmap), + IBLND_MSG_SIZE, DMA_TO_DEVICE); + } +} + +void +kiblnd_map_tx_descs (lnet_ni_t *ni) +{ + int ipage = 0; + int page_offset = 0; + int i; + struct page *page; + kib_tx_t *tx; + kib_net_t *net = ni->ni_data; + + LASSERT (net != NULL); + + /* pre-mapped messages are not bigger than 1 page */ + CLASSERT (IBLND_MSG_SIZE <= PAGE_SIZE); + + /* No fancy arithmetic when we do the buffer calculations */ + CLASSERT (PAGE_SIZE % IBLND_MSG_SIZE == 0); + + for (i = 0; i < IBLND_TX_MSGS(); i++) { + page = net->ibn_tx_pages->ibp_pages[ipage]; + tx = &net->ibn_tx_descs[i]; + + tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + + page_offset); + + tx->tx_msgaddr = dma_map_single( + net->ibn_dev->ibd_cmid->device->dma_device, + tx->tx_msg, IBLND_MSG_SIZE, DMA_TO_DEVICE); + pci_unmap_addr_set(tx, tx_msgunmap, tx->tx_msgaddr); + + list_add(&tx->tx_list, &net->ibn_idle_txs); + + page_offset += IBLND_MSG_SIZE; + LASSERT (page_offset <= PAGE_SIZE); + + if (page_offset == PAGE_SIZE) { + page_offset = 0; + ipage++; + LASSERT (ipage <= IBLND_TX_MSG_PAGES()); + } + } +} + +void +kiblnd_base_shutdown (void) +{ + int i; + + LASSERT (list_empty(&kiblnd_data.kib_devs)); + + CDEBUG(D_MALLOC, "before LND base cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + switch (kiblnd_data.kib_init) { + default: + LBUG(); + + case IBLND_INIT_ALL: + case IBLND_INIT_DATA: + LASSERT (kiblnd_data.kib_peers != NULL); + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { + LASSERT (list_empty(&kiblnd_data.kib_peers[i])); + } + LASSERT (list_empty(&kiblnd_data.kib_connd_zombies)); + LASSERT (list_empty(&kiblnd_data.kib_connd_conns)); + + /* flag threads to terminate; wake and wait for them to die */ + kiblnd_data.kib_shutdown = 1; + wake_up_all(&kiblnd_data.kib_sched_waitq); + wake_up_all(&kiblnd_data.kib_connd_waitq); + + i = 2; + while (atomic_read(&kiblnd_data.kib_nthreads) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
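(i & -i) == i only when i is a power of 2, so this message is promoted from D_NET to D_WARNING on iterations 4, 8, 16, ... i.e. with exponential backoff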
*/ + "Waiting for %d threads to terminate\n", + atomic_read(&kiblnd_data.kib_nthreads)); + cfs_pause(cfs_time_seconds(1)); + } + + /* fall through */ + + case IBLND_INIT_NOTHING: + break; + } + + if (kiblnd_data.kib_peers != NULL) + LIBCFS_FREE(kiblnd_data.kib_peers, + sizeof(struct list_head) * + kiblnd_data.kib_peer_hash_size); + + CDEBUG(D_MALLOC, "after LND base cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + kiblnd_data.kib_init = IBLND_INIT_NOTHING; + PORTAL_MODULE_UNUSE; +} + +void +kiblnd_shutdown (lnet_ni_t *ni) +{ + kib_net_t *net = ni->ni_data; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + int i; + unsigned long flags; + + LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL); + + if (net == NULL) + goto out; + + CDEBUG(D_MALLOC, "before LND net cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + write_lock_irqsave(g_lock, flags); + net->ibn_shutdown = 1; + write_unlock_irqrestore(g_lock, flags); + + switch (net->ibn_init) { + default: + LBUG(); + + case IBLND_INIT_ALL: + /* nuke all existing peers within this net */ + kiblnd_del_peer(ni, LNET_NID_ANY); + + /* Wait for all peer state to clean up */ + i = 2; + while (atomic_read(&net->ibn_npeers) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? */ + "%s: waiting for %d peers to disconnect\n", + libcfs_nid2str(ni->ni_nid), + atomic_read(&net->ibn_npeers)); + cfs_pause(cfs_time_seconds(1)); + } + + kiblnd_unmap_tx_descs(ni); + + LASSERT (net->ibn_dev->ibd_nnets > 0); + net->ibn_dev->ibd_nnets--; + + /* fall through */ + + case IBLND_INIT_NOTHING: + LASSERT (atomic_read(&net->ibn_nconns) == 0); + +#if IBLND_MAP_ON_DEMAND + if (net->ibn_fmrpool != NULL) + ib_destroy_fmr_pool(net->ibn_fmrpool); +#endif + if (net->ibn_dev != NULL && + net->ibn_dev->ibd_nnets == 0) + kiblnd_destroy_dev(net->ibn_dev); + + break; + } + + kiblnd_free_tx_descs(ni); + + CDEBUG(D_MALLOC, "after LND net cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + net->ibn_init = IBLND_INIT_NOTHING; + ni->ni_data = NULL; + + LIBCFS_FREE(net, sizeof(*net)); + +out: + if (list_empty(&kiblnd_data.kib_devs)) + kiblnd_base_shutdown(); + return; +} + +int +kiblnd_base_startup (void) +{ + int rc; + int i; + + LASSERT (kiblnd_data.kib_init == IBLND_INIT_NOTHING); + + if (*kiblnd_tunables.kib_credits > *kiblnd_tunables.kib_ntx) { + CERROR("Can't set credits(%d) > ntx(%d)\n", + *kiblnd_tunables.kib_credits, + *kiblnd_tunables.kib_ntx); + return -EINVAL; + } + + PORTAL_MODULE_USE; + memset(&kiblnd_data, 0, sizeof(kiblnd_data)); /* zero pointers, flags etc */ + + rwlock_init(&kiblnd_data.kib_global_lock); + + INIT_LIST_HEAD(&kiblnd_data.kib_devs); + + kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE; + LIBCFS_ALLOC(kiblnd_data.kib_peers, + sizeof(struct list_head) * kiblnd_data.kib_peer_hash_size); + if (kiblnd_data.kib_peers == NULL) { + goto failed; + } + for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) + INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]); + + spin_lock_init(&kiblnd_data.kib_connd_lock); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns); + INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies); + init_waitqueue_head(&kiblnd_data.kib_connd_waitq); + + spin_lock_init(&kiblnd_data.kib_sched_lock); + INIT_LIST_HEAD(&kiblnd_data.kib_sched_conns); + init_waitqueue_head(&kiblnd_data.kib_sched_waitq); + + kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR; + + /* lists/ptrs/locks initialised */ + kiblnd_data.kib_init = IBLND_INIT_DATA; + /*****************************************************/ + + for (i = 0; i < IBLND_N_SCHED; i++) { + rc 
= kiblnd_thread_start(kiblnd_scheduler, (void *)((long)i)); + if (rc != 0) { + CERROR("Can't spawn o2iblnd scheduler[%d]: %d\n", + i, rc); + goto failed; + } + } + + rc = kiblnd_thread_start(kiblnd_connd, NULL); + if (rc != 0) { + CERROR("Can't spawn o2iblnd connd: %d\n", rc); + goto failed; + } + + /* flag everything initialised */ + kiblnd_data.kib_init = IBLND_INIT_ALL; + /*****************************************************/ + + return 0; + + failed: + kiblnd_base_shutdown(); + return -ENETDOWN; +} + +int +kiblnd_startup (lnet_ni_t *ni) +{ + char *ifname; + kib_net_t *net; + kib_dev_t *ibdev; + struct list_head *tmp; + struct timeval tv; + int rc; + + LASSERT (ni->ni_lnd == &the_kiblnd); + + if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { + rc = kiblnd_base_startup(); + if (rc != 0) + return rc; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + ni->ni_data = net; + if (net == NULL) + goto failed; + + memset(net, 0, sizeof(*net)); + + do_gettimeofday(&tv); + net->ibn_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + + ni->ni_maxtxcredits = *kiblnd_tunables.kib_credits; + ni->ni_peertxcredits = *kiblnd_tunables.kib_peercredits; + + spin_lock_init(&net->ibn_tx_lock); + INIT_LIST_HEAD(&net->ibn_idle_txs); + + rc = kiblnd_alloc_tx_descs(ni); + if (rc != 0) { + CERROR("Can't allocate tx descs\n"); + goto failed; + } + + if (ni->ni_interfaces[0] != NULL) { + /* Use the IPoIB interface specified in 'networks=' */ + + CLASSERT (LNET_MAX_INTERFACES > 1); + if (ni->ni_interfaces[1] != NULL) { + CERROR("Multiple interfaces not supported\n"); + goto failed; + } + + ifname = ni->ni_interfaces[0]; + } else { + ifname = *kiblnd_tunables.kib_default_ipif; + } + + if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { + CERROR("IPoIB interface name too long: %s\n", ifname); + goto failed; + } + + ibdev = NULL; + list_for_each (tmp, &kiblnd_data.kib_devs) { + ibdev = list_entry(tmp, kib_dev_t, ibd_list); + + if (!strcmp(&ibdev->ibd_ifname[0], ifname)) + break; + + ibdev = NULL; + } + + if (ibdev == NULL) { + __u32 ip; + __u32 netmask; + int up; + struct rdma_cm_id *id; + struct ib_pd *pd; + struct ib_mr *mr; + struct sockaddr_in addr; + + rc = libcfs_ipif_query(ifname, &up, &ip, &netmask); + if (rc != 0) { + CERROR("Can't query IPoIB interface %s: %d\n", + ifname, rc); + goto failed; + } + + if (!up) { + CERROR("Can't query IPoIB interface %s: it's down\n", + ifname); + goto failed; + } + + LIBCFS_ALLOC(ibdev, sizeof(*ibdev)); + if (ibdev == NULL) + goto failed; + + memset(ibdev, 0, sizeof(*ibdev)); + + INIT_LIST_HEAD(&ibdev->ibd_list); /* not yet in kib_devs */ + ibdev->ibd_ifip = ip; + strcpy(&ibdev->ibd_ifname[0], ifname); + + id = rdma_create_id(kiblnd_cm_callback, ibdev, RDMA_PS_TCP); + if (!IS_ERR(id)) { + ibdev->ibd_cmid = id; + } else { + CERROR("Can't create listen ID: %ld\n", PTR_ERR(id)); + goto failed; + } + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = AF_INET; + addr.sin_port = htons(*kiblnd_tunables.kib_service); + addr.sin_addr.s_addr = htonl(ip); + + rc = rdma_bind_addr(id, (struct sockaddr *)&addr); + if (rc != 0) { + CERROR("Can't bind to %s: %d\n", ifname, rc); + goto failed; + } + + /* Binding should have assigned me an IB device */ + LASSERT (id->device != NULL); + + pd = ib_alloc_pd(id->device); + if (!IS_ERR(pd)) { + ibdev->ibd_pd = pd; + } else { + CERROR("Can't allocate PD: %ld\n", PTR_ERR(pd)); + goto failed; + } + +#if IBLND_MAP_ON_DEMAND + /* MR for sends and receives */ + mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE); +#else + /* MR for sends, receives _and_
RDMA */ + mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); +#endif + if (!IS_ERR(mr)) { + ibdev->ibd_mr = mr; + } else { + CERROR("Can't get MR: %ld\n", PTR_ERR(mr)); + goto failed; + } + + rc = rdma_listen(id, 0); + if (rc != 0) { + CERROR("Can't start listener: %d\n", rc); + goto failed; + } + + list_add_tail(&ibdev->ibd_list, + &kiblnd_data.kib_devs); + } + + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip); + net->ibn_dev = ibdev; + +#if IBLND_MAP_ON_DEMAND + /* FMR pool for RDMA */ + { + struct ib_fmr_pool *fmrpool; + struct ib_fmr_pool_param param = { + .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE, + .page_shift = PAGE_SHIFT, + .access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE), + .pool_size = *kiblnd_tunables.kib_fmr_pool_size, + .dirty_watermark = *kiblnd_tunables.kib_fmr_flush_trigger, + .flush_function = NULL, + .flush_arg = NULL, + .cache = *kiblnd_tunables.kib_fmr_cache}; + + if (*kiblnd_tunables.kib_fmr_pool_size < + *kiblnd_tunables.kib_ntx) { + CERROR("Can't set fmr pool size (%d) < ntx(%d)\n", + *kiblnd_tunables.kib_fmr_pool_size, + *kiblnd_tunables.kib_ntx); + goto failed; + } + + fmrpool = ib_create_fmr_pool(ibdev->ibd_pd, &param); + if (!IS_ERR(fmrpool)) { + net->ibn_fmrpool = fmrpool; + } else { + CERROR("Can't create FMR pool: %ld\n", + PTR_ERR(fmrpool)); + goto failed; + } + } +#endif + + kiblnd_map_tx_descs(ni); + + ibdev->ibd_nnets++; + net->ibn_init = IBLND_INIT_ALL; + + return 0; + +failed: + kiblnd_shutdown(ni); + + CDEBUG(D_NET, "kiblnd_startup failed\n"); + return -ENETDOWN; +} + +void __exit +kiblnd_module_fini (void) +{ + lnet_unregister_lnd(&the_kiblnd); + kiblnd_tunables_fini(); +} + +int __init +kiblnd_module_init (void) +{ + int rc; + + CLASSERT (sizeof(kib_msg_t) <= IBLND_MSG_SIZE); +#if !IBLND_MAP_ON_DEMAND + CLASSERT (offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) + <= IBLND_MSG_SIZE); + CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) + <= IBLND_MSG_SIZE); +#endif + rc = kiblnd_tunables_init(); + if (rc != 0) + return rc; + + lnet_register_lnd(&the_kiblnd); + + return 0; +} + +MODULE_AUTHOR("Cluster File Systems, Inc. "); +MODULE_DESCRIPTION("Kernel OpenIB gen2 LND v1.00"); +MODULE_LICENSE("GPL"); + +module_init(kiblnd_module_init); +module_exit(kiblnd_module_fini); diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h new file mode 100644 index 0000000..24e4be2 --- /dev/null +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -0,0 +1,630 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2006 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define DEBUG_SUBSYSTEM S_LND + +#include +#include +#include + +#if !HAVE_GFP_T +typedef int gfp_t; +#endif + +#include +#include +#include +#include + +/* tunables fixed at compile time */ +#if CONFIG_SMP +# define IBLND_N_SCHED num_online_cpus() /* # schedulers */ +#else +# define IBLND_N_SCHED 1 /* # schedulers */ +#endif + +#define IBLND_PEER_HASH_SIZE 101 /* # peer lists */ +#define IBLND_RESCHED 100 /* # scheduler loops before reschedule */ +#define IBLND_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ +#define IBLND_CREDIT_HIGHWATER 7 /* when eagerly to return credits */ +#define IBLND_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ + +#define IBLND_MAP_ON_DEMAND 0 +#if IBLND_MAP_ON_DEMAND +# define IBLND_MAX_RDMA_FRAGS 1 +#else +# define IBLND_MAX_RDMA_FRAGS LNET_MAX_IOV +#endif + +/************************/ +/* derived constants... */ + +/* TX messages (shared by all connections) */ +#define IBLND_TX_MSGS() (*kiblnd_tunables.kib_ntx) +#define IBLND_TX_MSG_BYTES() (IBLND_TX_MSGS() * IBLND_MSG_SIZE) +#define IBLND_TX_MSG_PAGES() ((IBLND_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE) + +/* RX messages (per connection) */ +#define IBLND_RX_MSGS (IBLND_MSG_QUEUE_SIZE*2) +#define IBLND_RX_MSG_BYTES (IBLND_RX_MSGS * IBLND_MSG_SIZE) +#define IBLND_RX_MSG_PAGES ((IBLND_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) + +#define IBLND_CQ_ENTRIES() (IBLND_RX_MSGS + \ + (*kiblnd_tunables.kib_concurrent_sends) * \ + (1 + IBLND_MAX_RDMA_FRAGS)) + +typedef struct +{ + unsigned int *kib_service; /* IB service number */ + int *kib_min_reconnect_interval; /* first failed connection retry... */ + int *kib_max_reconnect_interval; /* ...exponentially increasing to this */ + int *kib_cksum; /* checksum kib_msg_t? */ + int *kib_timeout; /* comms timeout (seconds) */ + int *kib_keepalive; /* keepalive timeout (seconds) */ + int *kib_ntx; /* # tx descs */ + int *kib_credits; /* # concurrent sends */ + int *kib_peercredits; /* # concurrent sends to 1 peer */ + char **kib_default_ipif; /* default IPoIB interface */ + int *kib_retry_count; + int *kib_rnr_retry_count; + int *kib_concurrent_sends; /* send work queue sizing */ + int *kib_ib_mtu; /* IB MTU */ +#if IBLND_MAP_ON_DEMAND + int *kib_fmr_pool_size; /* # FMRs in pool */ + int *kib_fmr_flush_trigger; /* When to trigger FMR flush */ + int *kib_fmr_cache; /* enable FMR pool cache? */ +#endif +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + struct ctl_table_header *kib_sysctl; /* sysctl interface */ +#endif +} kib_tunables_t; + +typedef struct +{ + int ibp_npages; /* # pages */ + struct page *ibp_pages[0]; +} kib_pages_t; + +typedef struct +{ + struct list_head ibd_list; /* chain on kib_devs */ + __u32 ibd_ifip; /* IPoIB interface IP */ + char ibd_ifname[32]; /* IPoIB interface name */ + int ibd_nnets; /* # nets extant */ + + struct rdma_cm_id *ibd_cmid; /* IB listener (bound to 1 device) */ + struct ib_pd *ibd_pd; /* PD for the device */ + struct ib_mr *ibd_mr; /* MR for non RDMA I/O */ +} kib_dev_t; + +typedef struct +{ + __u64 ibn_incarnation; /* my epoch */ + int ibn_init; /* initialisation state */ + int ibn_shutdown; /* shutting down? 
*/ + + atomic_t ibn_npeers; /* # peers extant */ + atomic_t ibn_nconns; /* # connections extant */ + + struct kib_tx *ibn_tx_descs; /* all the tx descriptors */ + kib_pages_t *ibn_tx_pages; /* premapped tx msg pages */ + struct list_head ibn_idle_txs; /* idle tx descriptors */ + spinlock_t ibn_tx_lock; /* serialise */ + +#if IBLND_MAP_ON_DEMAND + struct ib_fmr_pool *ibn_fmrpool; /* FMR pool for RDMA I/O */ +#endif + + kib_dev_t *ibn_dev; /* underlying IB device */ +} kib_net_t; + +typedef struct +{ + int kib_init; /* initialisation state */ + int kib_shutdown; /* shut down? */ + struct list_head kib_devs; /* IB devices extant */ + atomic_t kib_nthreads; /* # live threads */ + rwlock_t kib_global_lock; /* stabilize net/dev/peer/conn ops */ + + struct list_head *kib_peers; /* hash table of all my known peers */ + int kib_peer_hash_size; /* size of kib_peers */ + + void *kib_connd; /* the connd task (serialisation assertions) */ + struct list_head kib_connd_conns; /* connections to setup/teardown */ + struct list_head kib_connd_zombies; /* connections with zero refcount */ + wait_queue_head_t kib_connd_waitq; /* connection daemon sleeps here */ + spinlock_t kib_connd_lock; /* serialise */ + + wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */ + struct list_head kib_sched_conns; /* conns to check for rx completions */ + spinlock_t kib_sched_lock; /* serialise */ + + __u64 kib_next_tx_cookie; /* RDMA completion cookie */ + struct ib_qp_attr kib_error_qpa; /* QP->ERROR */ +} kib_data_t; + +#define IBLND_INIT_NOTHING 0 +#define IBLND_INIT_DATA 1 +#define IBLND_INIT_ALL 2 + +/************************************************************************ + * IB Wire message format. + * These are sent in sender's byte order (i.e. receiver flips). + */ + +typedef struct kib_connparams +{ + __u16 ibcp_queue_depth; + __u16 ibcp_max_frags; + __u32 ibcp_max_msg_size; +} WIRE_ATTR kib_connparams_t; + +typedef struct +{ + lnet_hdr_t ibim_hdr; /* portals header */ + char ibim_payload[0]; /* piggy-backed payload */ +} WIRE_ATTR kib_immediate_msg_t; + +#if IBLND_MAP_ON_DEMAND +typedef struct +{ + __u64 rd_addr; /* IO VMA address */ + __u32 rd_nob; /* # of bytes */ + __u32 rd_key; /* remote key */ +} WIRE_ATTR kib_rdma_desc_t; +#else +typedef struct +{ + __u32 rf_nob; /* # bytes this frag */ + __u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! 
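rf_addr is a __u64 at offset 4 of this packed (WIRE_ATTR) struct, so on the wire it is only 4-byte aligned and must be accessed with unaligned-safe loads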
*/ +} WIRE_ATTR kib_rdma_frag_t; + +typedef struct +{ + __u32 rd_key; /* local/remote key */ + __u32 rd_nfrags; /* # fragments */ + kib_rdma_frag_t rd_frags[0]; /* buffer frags */ +} WIRE_ATTR kib_rdma_desc_t; +#endif + +typedef struct +{ + lnet_hdr_t ibprm_hdr; /* portals header */ + __u64 ibprm_cookie; /* opaque completion cookie */ +} WIRE_ATTR kib_putreq_msg_t; + +typedef struct +{ + __u64 ibpam_src_cookie; /* reflected completion cookie */ + __u64 ibpam_dst_cookie; /* opaque completion cookie */ + kib_rdma_desc_t ibpam_rd; /* sender's sink buffer */ +} WIRE_ATTR kib_putack_msg_t; + +typedef struct +{ + lnet_hdr_t ibgm_hdr; /* portals header */ + __u64 ibgm_cookie; /* opaque completion cookie */ + kib_rdma_desc_t ibgm_rd; /* rdma descriptor */ +} WIRE_ATTR kib_get_msg_t; + +typedef struct +{ + __u64 ibcm_cookie; /* opaque completion cookie */ + __s32 ibcm_status; /* < 0 failure: >= 0 length */ +} WIRE_ATTR kib_completion_msg_t; + +typedef struct +{ + /* First 2 fields fixed FOR ALL TIME */ + __u32 ibm_magic; /* I'm an openibnal message */ + __u16 ibm_version; /* this is my version number */ + + __u8 ibm_type; /* msg type */ + __u8 ibm_credits; /* returned credits */ + __u32 ibm_nob; /* # bytes in whole message */ + __u32 ibm_cksum; /* checksum (0 == no checksum) */ + __u64 ibm_srcnid; /* sender's NID */ + __u64 ibm_srcstamp; /* sender's incarnation */ + __u64 ibm_dstnid; /* destination's NID */ + __u64 ibm_dststamp; /* destination's incarnation */ + + union { + kib_connparams_t connparams; + kib_immediate_msg_t immediate; + kib_putreq_msg_t putreq; + kib_putack_msg_t putack; + kib_get_msg_t get; + kib_completion_msg_t completion; + } WIRE_ATTR ibm_u; +} WIRE_ATTR kib_msg_t; + +#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */ + +#define IBLND_MSG_VERSION 0x11 + +#define IBLND_MSG_CONNREQ 0xc0 /* connection request */ +#define IBLND_MSG_CONNACK 0xc1 /* connection acknowledge */ +#define IBLND_MSG_NOOP 0xd0 /* nothing (just credits) */ +#define IBLND_MSG_IMMEDIATE 0xd1 /* immediate */ +#define IBLND_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */ +#define IBLND_MSG_PUT_NAK 0xd3 /* completion (sink->src) */ +#define IBLND_MSG_PUT_ACK 0xd4 /* putack (sink->src) */ +#define IBLND_MSG_PUT_DONE 0xd5 /* completion (src->sink) */ +#define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ +#define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ + +typedef struct { + __u32 ibr_magic; /* sender's magic */ + __u16 ibr_version; /* sender's version */ + __u8 ibr_why; /* reject reason */ +} WIRE_ATTR kib_rej_t; + + +/* connection rejection reasons */ +#define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */ +#define IBLND_REJECT_NO_RESOURCES 2 /* Out of memory/conns etc */ +#define IBLND_REJECT_FATAL 3 /* Anything else */ + +/***********************************************************************/ + +typedef struct kib_rx /* receive message */ +{ + struct list_head rx_list; /* queue for attention */ + struct kib_conn *rx_conn; /* owning conn */ + int rx_nob; /* # bytes received (-1 while posted) */ + enum ib_wc_status rx_status; /* completion status */ + kib_msg_t *rx_msg; /* message buffer (host vaddr) */ + __u64 rx_msgaddr; /* message buffer (I/O addr) */ + DECLARE_PCI_UNMAP_ADDR (rx_msgunmap); /* for dma_unmap_single() */ + struct ib_recv_wr rx_wrq; /* receive work item... 
*/ + struct ib_sge rx_sge; /* ...and its memory */ +} kib_rx_t; + +#define IBLND_POSTRX_DONT_POST 0 /* don't post */ +#define IBLND_POSTRX_NO_CREDIT 1 /* post: no credits */ +#define IBLND_POSTRX_PEER_CREDIT 2 /* post: give peer back 1 credit */ +#define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give myself back 1 reserved credit */ + +typedef struct kib_tx /* transmit message */ +{ + struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ + struct kib_conn *tx_conn; /* owning conn */ + int tx_sending; /* # tx callbacks outstanding */ + int tx_queued; /* queued for sending */ + int tx_waiting; /* waiting for peer */ + int tx_status; /* LNET completion status */ + unsigned long tx_deadline; /* completion deadline */ + __u64 tx_cookie; /* completion cookie */ + lnet_msg_t *tx_lntmsg[2]; /* lnet msgs to finalize on completion */ + kib_msg_t *tx_msg; /* message buffer (host vaddr) */ + __u64 tx_msgaddr; /* message buffer (I/O addr) */ + DECLARE_PCI_UNMAP_ADDR (tx_msgunmap); /* for dma_unmap_single() */ + int tx_nwrq; /* # send work items */ +#if IBLND_MAP_ON_DEMAND + struct ib_send_wr tx_wrq[2]; /* send work items... */ + struct ib_sge tx_sge[2]; /* ...and their memory */ + kib_rdma_desc_t tx_rd[1]; /* rdma descriptor */ + __u64 *tx_pages; /* rdma phys page addrs */ + struct ib_pool_fmr *tx_fmr; /* rdma mapping (mapped if != NULL) */ +#else + struct ib_send_wr *tx_wrq; /* send work items... */ + struct ib_sge *tx_sge; /* ...and their memory */ + kib_rdma_desc_t *tx_rd; /* rdma descriptor */ + int tx_nfrags; /* # entries in... */ + struct scatterlist *tx_frags; /* dma_map_sg descriptor */ + int tx_dmadir; /* dma direction */ +#endif +} kib_tx_t; + +typedef struct kib_connvars +{ + /* connection-in-progress variables */ + kib_msg_t cv_msg; +} kib_connvars_t; + +typedef struct kib_conn +{ + struct kib_peer *ibc_peer; /* owning peer */ + struct list_head ibc_list; /* stash on peer's conn list */ + struct list_head ibc_sched_list; /* schedule for attention */ + __u64 ibc_incarnation; /* which instance of the peer */ + atomic_t ibc_refcount; /* # users */ + int ibc_state; /* what's happening */ + int ibc_nsends_posted; /* # uncompleted sends */ + int ibc_credits; /* # credits I have */ + int ibc_outstanding_credits; /* # credits to return */ + int ibc_reserved_credits; /* # ACK/DONE msg credits */ + int ibc_comms_error; /* set on comms error */ + int ibc_nrx:8; /* receive buffers owned */ + int ibc_scheduled:1; /* scheduled for attention */ + int ibc_ready:1; /* CQ callback fired */ + unsigned long ibc_last_send; /* time of last send */ + struct list_head ibc_early_rxs; /* rxs completed before ESTABLISHED */ + struct list_head ibc_tx_queue; /* sends that need a credit */ + struct list_head ibc_tx_queue_nocred; /* sends that don't need a credit */ + struct list_head ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */ + struct list_head ibc_active_txs; /* active tx awaiting completion */ + spinlock_t ibc_lock; /* serialise */ + kib_rx_t *ibc_rxs; /* the rx descs */ + kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ + + struct rdma_cm_id *ibc_cmid; /* CM id */ + struct ib_cq *ibc_cq; /* completion queue */ + + kib_connvars_t *ibc_connvars; /* in-progress connection state */ +} kib_conn_t; + +/* Conn state progression: INIT while the conn is set up, then ACTIVE_CONNECT (active side) or PASSIVE_WAIT (passive side), ESTABLISHED once the handshake completes, then CLOSING followed by DISCONNECTED on teardown */ +#define IBLND_CONN_INIT 0 /* being initialised */ +#define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */ +#define IBLND_CONN_PASSIVE_WAIT 2 /* passive waiting for rtu */ +#define IBLND_CONN_ESTABLISHED 3 /* connection established */ +#define IBLND_CONN_CLOSING 4 /* being closed */
+#define IBLND_CONN_DISCONNECTED 5 /* disconnected */ + +typedef struct kib_peer +{ + struct list_head ibp_list; /* stash on global peer list */ + lnet_nid_t ibp_nid; /* who's on the other end(s) */ + lnet_ni_t *ibp_ni; /* LNet interface */ + atomic_t ibp_refcount; /* # users */ + struct list_head ibp_conns; /* all active connections */ + struct list_head ibp_tx_queue; /* msgs waiting for a conn */ + int ibp_connecting; /* current active connection attempts */ + int ibp_accepting; /* current passive connection attempts */ + int ibp_error; /* errno on closing this peer */ + cfs_time_t ibp_last_alive; /* when (in jiffies) I was last alive */ +} kib_peer_t; + + +extern kib_data_t kiblnd_data; +extern kib_tunables_t kiblnd_tunables; + +#define kiblnd_conn_addref(conn) \ +do { \ + CDEBUG(D_NET, "conn[%p] (%d)++\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + LASSERT(atomic_read(&(conn)->ibc_refcount) > 0); \ + atomic_inc(&(conn)->ibc_refcount); \ +} while (0) + +#define kiblnd_conn_decref(conn) \ +do { \ + unsigned long flags; \ + \ + CDEBUG(D_NET, "conn[%p] (%d)--\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + LASSERT(atomic_read(&(conn)->ibc_refcount) > 0); \ + if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \ + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); \ + list_add_tail(&(conn)->ibc_list, \ + &kiblnd_data.kib_connd_zombies); \ + wake_up(&kiblnd_data.kib_connd_waitq); \ + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); \ + } \ +} while (0) + +#define kiblnd_peer_addref(peer) \ +do { \ + CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n", \ + (peer), libcfs_nid2str((peer)->ibp_nid), \ + atomic_read (&(peer)->ibp_refcount)); \ + LASSERT(atomic_read(&(peer)->ibp_refcount) > 0); \ + atomic_inc(&(peer)->ibp_refcount); \ +} while (0) + +#define kiblnd_peer_decref(peer) \ +do { \ + CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n", \ + (peer), libcfs_nid2str((peer)->ibp_nid), \ + atomic_read (&(peer)->ibp_refcount)); \ + LASSERT(atomic_read(&(peer)->ibp_refcount) > 0); \ + if (atomic_dec_and_test(&(peer)->ibp_refcount)) \ + kiblnd_destroy_peer(peer); \ +} while (0) + +static inline struct list_head * +kiblnd_nid2peerlist (lnet_nid_t nid) +{ + unsigned int hash = ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size; + + return (&kiblnd_data.kib_peers [hash]); +} + +static inline int +kiblnd_peer_active (kib_peer_t *peer) +{ + /* Am I in the peer hash table? */ + return (!list_empty(&peer->ibp_list)); +} + +static inline kib_conn_t * +kiblnd_get_conn_locked (kib_peer_t *peer) +{ + LASSERT (!list_empty(&peer->ibp_conns)); + + /* just return the first connection */ + return list_entry(peer->ibp_conns.next, kib_conn_t, ibc_list); +} + +static inline int +kiblnd_send_keepalive(kib_conn_t *conn) +{ + return (*kiblnd_tunables.kib_keepalive > 0) && + time_after(jiffies, conn->ibc_last_send + + *kiblnd_tunables.kib_keepalive*HZ); +} + +static inline void +kiblnd_abort_receives(kib_conn_t *conn) +{ + ib_modify_qp(conn->ibc_cmid->qp, + &kiblnd_data.kib_error_qpa, IB_QP_STATE); +} + +/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the + * lowest bits of the work request id to stash the work item type. 
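Tx and rx descriptors are allocated with LIBCFS_ALLOC(), which returns memory aligned well beyond 4 bytes, so the bottom 2 bits of the pointer are always clear; e.g. kiblnd_init_tx_msg() posts wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_TX), and the completion path recovers the descriptor with kiblnd_wreqid2ptr() after dispatching on kiblnd_wreqid2type()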
*/ + +#define IBLND_WID_TX 0 +#define IBLND_WID_RDMA 1 +#define IBLND_WID_RX 2 +#define IBLND_WID_MASK 3UL + +static inline __u64 +kiblnd_ptr2wreqid (void *ptr, int type) +{ + unsigned long lptr = (unsigned long)ptr; + + LASSERT ((lptr & IBLND_WID_MASK) == 0); + LASSERT ((type & ~IBLND_WID_MASK) == 0); + return (__u64)(lptr | type); +} + +static inline void * +kiblnd_wreqid2ptr (__u64 wreqid) +{ + return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK); +} + +static inline int +kiblnd_wreqid2type (__u64 wreqid) +{ + return (wreqid & IBLND_WID_MASK); +} + +static inline void +kiblnd_set_conn_state (kib_conn_t *conn, int state) +{ + conn->ibc_state = state; + mb(); +} + +#if IBLND_MAP_ON_DEMAND +static inline int +kiblnd_rd_size (kib_rdma_desc_t *rd) +{ + return rd->rd_nob; +} +#else +static inline int +kiblnd_rd_size (kib_rdma_desc_t *rd) +{ + int i; + int size; + + for (i = size = 0; i < rd->rd_nfrags; i++) + size += rd->rd_frags[i].rf_nob; + + return size; +} +#endif + +int kiblnd_startup (lnet_ni_t *ni); +void kiblnd_shutdown (lnet_ni_t *ni); +int kiblnd_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg); + +int kiblnd_tunables_init(void); +void kiblnd_tunables_fini(void); + +int kiblnd_connd (void *arg); +int kiblnd_scheduler(void *arg); +int kiblnd_thread_start (int (*fn)(void *arg), void *arg); + +int kiblnd_alloc_pages (kib_pages_t **pp, int npages); +void kiblnd_free_pages (kib_pages_t *p); + +int kiblnd_cm_callback(struct rdma_cm_id *cmid, + struct rdma_cm_event *event); + +int kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid); +void kiblnd_destroy_peer (kib_peer_t *peer); +void kiblnd_destroy_dev (kib_dev_t *dev); +void kiblnd_unlink_peer_locked (kib_peer_t *peer); +void kiblnd_peer_alive (kib_peer_t *peer); +kib_peer_t *kiblnd_find_peer_locked (lnet_nid_t nid); +void kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error); +int kiblnd_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation); + +void kiblnd_connreq_done(kib_conn_t *conn, int status); +kib_conn_t *kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid, + int state); +void kiblnd_destroy_conn (kib_conn_t *conn); +void kiblnd_close_conn (kib_conn_t *conn, int error); +void kiblnd_close_conn_locked (kib_conn_t *conn, int error); + +int kiblnd_init_rdma (lnet_ni_t *ni, kib_tx_t *tx, int type, + int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie); + +void kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn); +void kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn); +void kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob); +void kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int status); +void kiblnd_check_sends (kib_conn_t *conn); + +void kiblnd_qp_event(struct ib_event *event, void *arg); +void kiblnd_cq_event(struct ib_event *event, void *arg); +void kiblnd_cq_completion(struct ib_cq *cq, void *arg); + +void kiblnd_init_msg (kib_msg_t *msg, int type, int body_nob); +void kiblnd_pack_msg (lnet_ni_t *ni, kib_msg_t *msg, + int credits, lnet_nid_t dstnid, __u64 dststamp); +int kiblnd_unpack_msg(kib_msg_t *msg, int nob); +int kiblnd_post_rx (kib_rx_t *rx, int credit); + +int kiblnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int kiblnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, + unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); + + + diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c new file mode 100644 
index 0000000..3e5756d --- /dev/null +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -0,0 +1,3159 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2006 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "o2iblnd.h" + +char * +kiblnd_msgtype2str(int type) +{ + switch (type) { + case IBLND_MSG_CONNREQ: + return "CONNREQ"; + + case IBLND_MSG_CONNACK: + return "CONNACK"; + + case IBLND_MSG_NOOP: + return "NOOP"; + + case IBLND_MSG_IMMEDIATE: + return "IMMEDIATE"; + + case IBLND_MSG_PUT_REQ: + return "PUT_REQ"; + + case IBLND_MSG_PUT_NAK: + return "PUT_NAK"; + + case IBLND_MSG_PUT_ACK: + return "PUT_ACK"; + + case IBLND_MSG_PUT_DONE: + return "PUT_DONE"; + + case IBLND_MSG_GET_REQ: + return "GET_REQ"; + + case IBLND_MSG_GET_DONE: + return "GET_DONE"; + + default: + return "???"; + } +} + +void +kiblnd_tx_done (lnet_ni_t *ni, kib_tx_t *tx) +{ + lnet_msg_t *lntmsg[2]; + kib_net_t *net = ni->ni_data; + int rc; + int i; + + LASSERT (net != NULL); + LASSERT (!in_interrupt()); + LASSERT (!tx->tx_queued); /* mustn't be queued for sending */ + LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */ + LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer response */ + +#if IBLND_MAP_ON_DEMAND + if (tx->tx_fmr != NULL) { + rc = ib_fmr_pool_unmap(tx->tx_fmr); + LASSERT (rc == 0); + + if (tx->tx_status != 0) { + rc = ib_flush_fmr_pool(net->ibn_fmrpool); + LASSERT (rc == 0); + } + + tx->tx_fmr = NULL; + } +#else + if (tx->tx_nfrags != 0) { + dma_unmap_sg(net->ibn_dev->ibd_cmid->device->dma_device, + tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir); + tx->tx_nfrags = 0; + } +#endif + /* tx may have up to 2 lnet msgs to finalise */ + lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; + lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; + rc = tx->tx_status; + + if (tx->tx_conn != NULL) { + LASSERT (ni == tx->tx_conn->ibc_peer->ibp_ni); + + kiblnd_conn_decref(tx->tx_conn); + tx->tx_conn = NULL; + } + + tx->tx_nwrq = 0; + tx->tx_status = 0; + + spin_lock(&net->ibn_tx_lock); + + list_add(&tx->tx_list, &net->ibn_idle_txs); + + spin_unlock(&net->ibn_tx_lock); + + /* delay finalize until my descs have been freed */ + for (i = 0; i < 2; i++) { + if (lntmsg[i] == NULL) + continue; + + lnet_finalize(ni, lntmsg[i], rc); + } +} + +void +kiblnd_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int status) +{ + kib_tx_t *tx; + + while (!list_empty (txlist)) { + tx = list_entry (txlist->next, kib_tx_t, tx_list); + + list_del (&tx->tx_list); + /* complete now */ + tx->tx_waiting = 0; + tx->tx_status = status; + kiblnd_tx_done(ni, tx); + } +} + +kib_tx_t * +kiblnd_get_idle_tx (lnet_ni_t *ni) +{ + kib_net_t *net = ni->ni_data; + kib_tx_t *tx; + + LASSERT (net != NULL); + + spin_lock(&net->ibn_tx_lock); + + if 
(list_empty(&net->ibn_idle_txs)) { + spin_unlock(&net->ibn_tx_lock); + return NULL; + } + + tx = list_entry(net->ibn_idle_txs.next, kib_tx_t, tx_list); + list_del(&tx->tx_list); + + /* Allocate a new completion cookie. It might not be needed, + * but we've got a lock right now and we're unlikely to + * wrap... */ + tx->tx_cookie = kiblnd_data.kib_next_tx_cookie++; + + spin_unlock(&net->ibn_tx_lock); + + LASSERT (tx->tx_nwrq == 0); + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_sending == 0); + LASSERT (!tx->tx_waiting); + LASSERT (tx->tx_status == 0); + LASSERT (tx->tx_conn == NULL); + LASSERT (tx->tx_lntmsg[0] == NULL); + LASSERT (tx->tx_lntmsg[1] == NULL); +#if IBLND_MAP_ON_DEMAND + LASSERT (tx->tx_fmr == NULL); +#else + LASSERT (tx->tx_nfrags == 0); +#endif + + return tx; +} + +void +kiblnd_drop_rx (kib_rx_t *rx) +{ + kib_conn_t *conn = rx->rx_conn; + unsigned long flags; + + spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags); + LASSERT (conn->ibc_nrx > 0); + conn->ibc_nrx--; + spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, flags); + + kiblnd_conn_decref(conn); +} + +int +kiblnd_post_rx (kib_rx_t *rx, int credit) +{ + kib_conn_t *conn = rx->rx_conn; + kib_net_t *net = conn->ibc_peer->ibp_ni->ni_data; + struct ib_recv_wr *bad_wrq; + int rc; + + LASSERT (net != NULL); + LASSERT (!in_interrupt()); + LASSERT (credit == IBLND_POSTRX_NO_CREDIT || + credit == IBLND_POSTRX_PEER_CREDIT || + credit == IBLND_POSTRX_RSRVD_CREDIT); + + rx->rx_sge.length = IBLND_MSG_SIZE; + rx->rx_sge.lkey = net->ibn_dev->ibd_mr->lkey; + rx->rx_sge.addr = rx->rx_msgaddr; + + rx->rx_wrq.next = NULL; + rx->rx_wrq.sg_list = &rx->rx_sge; + rx->rx_wrq.num_sge = 1; + rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX); + + LASSERT (conn->ibc_state >= IBLND_CONN_INIT); + LASSERT (rx->rx_nob >= 0); /* not posted */ + + if (conn->ibc_state > IBLND_CONN_ESTABLISHED) { + kiblnd_drop_rx(rx); /* No more posts for this rx */ + return 0; + } + + rx->rx_nob = -1; /* flag posted */ + + rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq); + + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */ + return rc; + + if (rc != 0) { + CERROR("Can't post rx for %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + kiblnd_close_conn(conn, rc); + kiblnd_drop_rx(rx); /* No more posts for this rx */ + return rc; + } + + if (credit == IBLND_POSTRX_NO_CREDIT) + return 0; + + spin_lock(&conn->ibc_lock); + if (credit == IBLND_POSTRX_PEER_CREDIT) + conn->ibc_outstanding_credits++; + else + conn->ibc_reserved_credits++; + spin_unlock(&conn->ibc_lock); + + kiblnd_check_sends(conn); + return 0; +} + +kib_tx_t * +kiblnd_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie) +{ + struct list_head *tmp; + + list_for_each(tmp, &conn->ibc_active_txs) { + kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); + + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_sending != 0 || tx->tx_waiting); + + if (tx->tx_cookie != cookie) + continue; + + if (tx->tx_waiting && + tx->tx_msg->ibm_type == txtype) + return tx; + + CWARN("Bad completion: %swaiting, type %x (wanted %x)\n", + tx->tx_waiting ? 
"" : "NOT ", + tx->tx_msg->ibm_type, txtype); + } + return NULL; +} + +void +kiblnd_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) +{ + kib_tx_t *tx; + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + int idle; + + spin_lock(&conn->ibc_lock); + + tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie); + if (tx == NULL) { + spin_unlock(&conn->ibc_lock); + + CWARN("Unmatched completion type %x cookie "LPX64" from %s\n", + txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_close_conn(conn, -EPROTO); + return; + } + + if (tx->tx_status == 0) { /* success so far */ + if (status < 0) { /* failed? */ + tx->tx_status = status; + } else if (txtype == IBLND_MSG_GET_REQ) { + lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status); + } + } + + tx->tx_waiting = 0; + + idle = !tx->tx_queued && (tx->tx_sending == 0); + if (idle) + list_del(&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (idle) + kiblnd_tx_done(ni, tx); +} + +void +kiblnd_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie) +{ + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + kib_tx_t *tx = kiblnd_get_idle_tx(ni); + + if (tx == NULL) { + CERROR("Can't get tx for completion %x for %s\n", + type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + tx->tx_msg->ibm_u.completion.ibcm_status = status; + tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; + kiblnd_init_tx_msg(ni, tx, type, sizeof(kib_completion_msg_t)); + + kiblnd_queue_tx(tx, conn); +} + +void +kiblnd_handle_rx (kib_rx_t *rx) +{ + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + int credits = msg->ibm_credits; + kib_tx_t *tx; + int rc = 0; + int rc2; + int post_credit; + + LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + CDEBUG (D_NET, "Received %x[%d] from %s\n", + msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + if (credits != 0) { + /* Have I received credits that will let me send? 
*/ + spin_lock(&conn->ibc_lock); + + if (conn->ibc_credits + credits > IBLND_MSG_QUEUE_SIZE) { + rc2 = conn->ibc_credits; + spin_unlock(&conn->ibc_lock); + + CERROR("Bad credits from %s: %d + %d > %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + rc2, credits, IBLND_MSG_QUEUE_SIZE); + + kiblnd_close_conn(conn, -EPROTO); + kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT); + return; + } + + conn->ibc_credits += credits; + + spin_unlock(&conn->ibc_lock); + kiblnd_check_sends(conn); + } + + switch (msg->ibm_type) { + default: + CERROR("Bad IBLND message type %x from %s\n", + msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + post_credit = IBLND_POSTRX_NO_CREDIT; + rc = -EPROTO; + break; + + case IBLND_MSG_NOOP: + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_IMMEDIATE: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr, + msg->ibm_srcnid, rx, 0); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_PUT_REQ: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr, + msg->ibm_srcnid, rx, 1); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_PUT_NAK: + CWARN ("PUT_NACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + + case IBLND_MSG_PUT_ACK: + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + + spin_lock(&conn->ibc_lock); + tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ, + msg->ibm_u.putack.ibpam_src_cookie); + if (tx != NULL) + list_del(&tx->tx_list); + spin_unlock(&conn->ibc_lock); + + if (tx == NULL) { + CERROR("Unmatched PUT_ACK from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + rc = -EPROTO; + break; + } + + LASSERT (tx->tx_waiting); + /* CAVEAT EMPTOR: I could be racing with tx_complete, but... + * (a) I can overwrite tx_msg since my peer has received it! + * (b) tx_waiting set tells tx_complete() it's not done. 
*/ + + tx->tx_nwrq = 0; /* overwrite PUT_REQ */ + + rc2 = kiblnd_init_rdma(ni, tx, IBLND_MSG_PUT_DONE, + kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd), + &msg->ibm_u.putack.ibpam_rd, + msg->ibm_u.putack.ibpam_dst_cookie); + if (rc2 < 0) + CERROR("Can't setup rdma for PUT to %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2); + + spin_lock(&conn->ibc_lock); + tx->tx_waiting = 0; /* clear waiting and queue atomically */ + kiblnd_queue_tx_locked(tx, conn); + spin_unlock(&conn->ibc_lock); + break; + + case IBLND_MSG_PUT_DONE: + post_credit = IBLND_POSTRX_PEER_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + + case IBLND_MSG_GET_REQ: + post_credit = IBLND_POSTRX_DONT_POST; + rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr, + msg->ibm_srcnid, rx, 1); + if (rc < 0) /* repost on error */ + post_credit = IBLND_POSTRX_PEER_CREDIT; + break; + + case IBLND_MSG_GET_DONE: + post_credit = IBLND_POSTRX_RSRVD_CREDIT; + kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ, + msg->ibm_u.completion.ibcm_status, + msg->ibm_u.completion.ibcm_cookie); + break; + } + + if (rc < 0) /* protocol error */ + kiblnd_close_conn(conn, rc); + + if (post_credit != IBLND_POSTRX_DONT_POST) + kiblnd_post_rx(rx, post_credit); +} + +void +kiblnd_rx_complete (kib_rx_t *rx, int status, int nob) +{ + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + kib_net_t *net = ni->ni_data; + unsigned long flags; + int rc; + int err = -EIO; + + LASSERT (net != NULL); + LASSERT (rx->rx_nob < 0); /* was posted */ + rx->rx_nob = 0; /* isn't now */ + + if (conn->ibc_state > IBLND_CONN_ESTABLISHED) + goto ignore; + + if (status != IB_WC_SUCCESS) { + CDEBUG(D_NETERROR, "Rx from %s failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), status); + goto failed; + } + + LASSERT (nob >= 0); + rx->rx_nob = nob; + + rc = kiblnd_unpack_msg(msg, rx->rx_nob); + if (rc != 0) { + CERROR ("Error %d unpacking rx from %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + goto failed; + } + + if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || + msg->ibm_dstnid != ni->ni_nid || + msg->ibm_srcstamp != conn->ibc_incarnation || + msg->ibm_dststamp != net->ibn_incarnation) { + CERROR ("Stale rx from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + err = -ESTALE; + goto failed; + } + + /* set time last known alive */ + kiblnd_peer_alive(conn->ibc_peer); + + /* racing with connection establishment/teardown! */ + + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + /* must check holding global lock to eliminate race */ + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + list_add_tail(&rx->rx_list, &conn->ibc_early_rxs); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + return; + } + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + } + kiblnd_handle_rx(rx); + return; + + failed: + CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); + kiblnd_close_conn(conn, err); + ignore: + kiblnd_drop_rx(rx); /* Don't re-post rx. 
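Dropping it releases this rx's ref on the conn and decrements ibc_nrx, so the conn can't be torn down until all its receive buffers have completed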
*/ +} + +struct page * +kiblnd_kvaddr_to_page (unsigned long vaddr) +{ + struct page *page; + + if (vaddr >= VMALLOC_START && + vaddr < VMALLOC_END) { + page = vmalloc_to_page ((void *)vaddr); + LASSERT (page != NULL); + return page; + } +#if CONFIG_HIGHMEM + if (vaddr >= PKMAP_BASE && + vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) { + /* No highmem kernel virtual addresses: highmem pages are only used for bulk (kiov) I/O */ + CERROR("find page for address in highmem\n"); + LBUG(); + } +#endif + page = virt_to_page (vaddr); + LASSERT (page != NULL); + return page; +} + +#if !IBLND_MAP_ON_DEMAND +int +kiblnd_setup_rd_iov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, + unsigned int niov, struct iovec *iov, int offset, int nob) + +{ + struct scatterlist *sg; + int i; + int fragnob; + unsigned long vaddr; + struct page *page; + int page_offset; + kib_net_t *net = ni->ni_data; + + LASSERT (nob > 0); + LASSERT (niov > 0); + LASSERT (net != NULL); + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT (niov > 0); + } + + sg = tx->tx_frags; + do { + LASSERT (niov > 0); + + vaddr = ((unsigned long)iov->iov_base) + offset; + page_offset = vaddr & (PAGE_SIZE - 1); + page = kiblnd_kvaddr_to_page(vaddr); + if (page == NULL) { + CERROR ("Can't find page\n"); + return -EFAULT; + } + + fragnob = min((int)(iov->iov_len - offset), nob); + fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); + + sg->page = page; + sg->offset = page_offset; + sg->length = fragnob; + sg++; + + if (offset + fragnob < iov->iov_len) { + offset += fragnob; + } else { + offset = 0; + iov++; + niov--; + } + nob -= fragnob; + } while (nob > 0); + + /* If rd is not tx_rd, it's going to get sent to a peer and I'm the + * RDMA sink */ + tx->tx_nfrags = sg - tx->tx_frags; + tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + + rd->rd_nfrags = dma_map_sg(net->ibn_dev->ibd_cmid->device->dma_device, + tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir); + rd->rd_key = (rd != tx->tx_rd) ? + net->ibn_dev->ibd_mr->rkey : net->ibn_dev->ibd_mr->lkey; + + for (i = 0; i < rd->rd_nfrags; i++) { + rd->rd_frags[i].rf_nob = sg_dma_len(&tx->tx_frags[i]); + rd->rd_frags[i].rf_addr = sg_dma_address(&tx->tx_frags[i]); + } + + return 0; +} + +int +kiblnd_setup_rd_kiov (lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, + int nkiov, lnet_kiov_t *kiov, int offset, int nob) +{ + struct scatterlist *sg; + int i; + int fragnob; + kib_net_t *net = ni->ni_data; + + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + + LASSERT (nob > 0); + LASSERT (nkiov > 0); + LASSERT (net != NULL); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT (nkiov > 0); + } + + sg = tx->tx_frags; + do { + LASSERT (nkiov > 0); + + fragnob = min((int)(kiov->kiov_len - offset), nob); + + memset(sg, 0, sizeof(*sg)); + sg->page = kiov->kiov_page; + sg->offset = kiov->kiov_offset + offset; + sg->length = fragnob; + sg++; + + offset = 0; + kiov++; + nkiov--; + nob -= fragnob; + } while (nob > 0); + + /* If rd is not tx_rd, it's going to get sent to a peer and I'm the + * RDMA sink */ + tx->tx_nfrags = sg - tx->tx_frags; + tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + + rd->rd_nfrags = dma_map_sg(net->ibn_dev->ibd_cmid->device->dma_device, + tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir); + rd->rd_key = (rd != tx->tx_rd) ?
+ net->ibn_dev->ibd_mr->rkey : net->ibn_dev->ibd_mr->lkey; + + for (i = 0; i < tx->tx_nfrags; i++) { + rd->rd_frags[i].rf_nob = sg_dma_len(&tx->tx_frags[i]); + rd->rd_frags[i].rf_addr = sg_dma_address(&tx->tx_frags[i]); +#if 0 + CDEBUG(D_WARNING,"frag[%d]: "LPX64" for %d\n", + i, rd->rd_frags[i].rf_addr, rd->rd_frags[i].rf_nob); +#endif + } + + return 0; +} +#else +int +kiblnd_map_tx (lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, + int npages, unsigned long page_offset, int nob) +{ + struct ib_pool_fmr *fmr; + kib_net_t *net = ni->ni_data; + + LASSERT (net != NULL); + LASSERT (tx->tx_fmr == NULL); + LASSERT (page_offset < PAGE_SIZE); + LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT))); + LASSERT (npages <= LNET_MAX_IOV); + + rd->rd_addr = 0; + + fmr = ib_fmr_pool_map_phys(net->ibn_fmrpool, tx->tx_pages, + npages, rd->rd_addr); + if (IS_ERR(fmr)) { + CERROR ("Can't map %d pages: %ld\n", npages, PTR_ERR(fmr)); + return PTR_ERR(fmr); + } + + /* If rd is not tx_rd, it's going to get sent to a peer, who will need + * the rkey */ + + rd->rd_key = (rd != tx->tx_rd) ? fmr->fmr->rkey : fmr->fmr->lkey; + rd->rd_nob = nob; + + tx->tx_fmr = fmr; + return 0; +} + +int +kiblnd_setup_rd_iov (lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, + unsigned int niov, struct iovec *iov, int offset, int nob) + +{ + int resid; + int fragnob; + struct page *page; + int npages; + unsigned long page_offset; + unsigned long vaddr; + + LASSERT (nob > 0); + LASSERT (niov > 0); + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT (niov > 0); + } + + if (nob > iov->iov_len - offset) { + CERROR ("Can't map multiple vaddr fragments\n"); + return (-EMSGSIZE); + } + + vaddr = ((unsigned long)iov->iov_base) + offset; + + page_offset = vaddr & (PAGE_SIZE - 1); + resid = nob; + npages = 0; + + do { + LASSERT (npages < LNET_MAX_IOV); + + page = kiblnd_kvaddr_to_page(vaddr); + if (page == NULL) { + CERROR("Can't find page for %lu\n", vaddr); + return -EFAULT; + } + + tx->tx_pages[npages++] = lnet_page2phys(page); + + fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1)); + vaddr += fragnob; + resid -= fragnob; + + } while (resid > 0); + + return kiblnd_map_tx(ni, tx, rd, npages, page_offset, nob); +} + +int +kiblnd_setup_rd_kiov (lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd, + int nkiov, lnet_kiov_t *kiov, int offset, int nob) +{ + int resid; + int npages; + unsigned long page_offset; + + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + + LASSERT (nob > 0); + LASSERT (nkiov > 0); + LASSERT (nkiov <= LNET_MAX_IOV); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT (nkiov > 0); + } + + page_offset = kiov->kiov_offset + offset; + + resid = offset + nob; + npages = 0; + + do { + LASSERT (npages < LNET_MAX_IOV); + LASSERT (nkiov > 0); + + if ((npages > 0 && kiov->kiov_offset != 0) || + (resid > kiov->kiov_len && + (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) { + /* Can't have gaps */ + CERROR ("Can't make payload contiguous in I/O VM:" + "page %d, offset %d, len %d \n", + npages, kiov->kiov_offset, kiov->kiov_len); + + return -EINVAL; + } + + tx->tx_pages[npages++] = lnet_page2phys(kiov->kiov_page); + resid -= kiov->kiov_len; + kiov++; + nkiov--; + } while (resid > 0); + + return kiblnd_map_tx(ni, tx, rd, npages, page_offset, nob); +} +#endif + +void +kiblnd_check_sends (kib_conn_t *conn) +{ + kib_tx_t *tx; + lnet_ni_t *ni = conn->ibc_peer->ibp_ni; + int rc; + int consume_cred = 0; + struct ib_send_wr 
*bad_wrq; + int done; + + /* Don't send anything until after the connection is established */ + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + CDEBUG(D_NET, "%s too soon\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + } + + spin_lock(&conn->ibc_lock); + + LASSERT (conn->ibc_nsends_posted <= + *kiblnd_tunables.kib_concurrent_sends); + LASSERT (conn->ibc_reserved_credits >= 0); + + while (conn->ibc_reserved_credits > 0 && + !list_empty(&conn->ibc_tx_queue_rsrvd)) { + tx = list_entry(conn->ibc_tx_queue_rsrvd.next, + kib_tx_t, tx_list); + list_del(&tx->tx_list); + list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); + conn->ibc_reserved_credits--; + } + + if (list_empty(&conn->ibc_tx_queue) && + list_empty(&conn->ibc_tx_queue_nocred) && + (conn->ibc_outstanding_credits >= IBLND_CREDIT_HIGHWATER || + kiblnd_send_keepalive(conn))) { + spin_unlock(&conn->ibc_lock); + + tx = kiblnd_get_idle_tx(ni); + if (tx != NULL) + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0); + + spin_lock(&conn->ibc_lock); + + if (tx != NULL) + kiblnd_queue_tx_locked(tx, conn); + } + + for (;;) { + if (!list_empty (&conn->ibc_tx_queue_nocred)) { + tx = list_entry (conn->ibc_tx_queue_nocred.next, + kib_tx_t, tx_list); + consume_cred = 0; + } else if (!list_empty (&conn->ibc_tx_queue)) { + tx = list_entry (conn->ibc_tx_queue.next, + kib_tx_t, tx_list); + consume_cred = 1; + } else { + /* nothing to send right now */ + break; + } + + LASSERT (tx->tx_queued); + /* We rely on this for QP sizing */ + LASSERT (tx->tx_nwrq > 0 && + tx->tx_nwrq <= 1 + IBLND_MAX_RDMA_FRAGS); + + LASSERT (conn->ibc_outstanding_credits >= 0); + LASSERT (conn->ibc_outstanding_credits <= IBLND_MSG_QUEUE_SIZE); + LASSERT (conn->ibc_credits >= 0); + LASSERT (conn->ibc_credits <= IBLND_MSG_QUEUE_SIZE); + + if (conn->ibc_nsends_posted == + *kiblnd_tunables.kib_concurrent_sends) { + /* tx completions outstanding... */ + CDEBUG(D_NET, "%s: posted enough\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + break; + } + + if (consume_cred) { + if (conn->ibc_credits == 0) { /* no credits */ + CDEBUG(D_NET, "%s: no credits\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + break; + } + + if (conn->ibc_credits == 1 && /* last credit reserved for */ + conn->ibc_outstanding_credits == 0) { /* giving back credits */ + CDEBUG(D_NET, "%s: not using last credit\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + break; + } + } + + list_del (&tx->tx_list); + tx->tx_queued = 0; + + /* NB don't drop ibc_lock before bumping tx_sending */ + + if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP && + (!list_empty(&conn->ibc_tx_queue) || + !list_empty(&conn->ibc_tx_queue_nocred) || + (conn->ibc_outstanding_credits < IBLND_CREDIT_HIGHWATER && + !kiblnd_send_keepalive(conn)))) { + /* redundant NOOP */ + spin_unlock(&conn->ibc_lock); + kiblnd_tx_done(ni, tx); + spin_lock(&conn->ibc_lock); + CDEBUG(D_NET, "%s: redundant noop\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + continue; + } + + kiblnd_pack_msg(ni, tx->tx_msg, conn->ibc_outstanding_credits, + conn->ibc_peer->ibp_nid, conn->ibc_incarnation); + + conn->ibc_outstanding_credits = 0; + conn->ibc_nsends_posted++; + if (consume_cred) + conn->ibc_credits--; + + /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA + * PUT. If so, it was first queued here as a PUT_REQ, sent and + * stashed on ibc_active_txs, matched by an incoming PUT_ACK, + * and then re-queued here. It's (just) possible that + * tx_sending is non-zero if we've not done the tx_complete() from + * the first send; hence the ++ rather than = below. 
*/ + tx->tx_sending++; + + list_add (&tx->tx_list, &conn->ibc_active_txs); +#if 0 + { + int i; + + for (i = 0; i < tx->tx_nwrq - 1; i++) { + LASSERT (tx->tx_wrq[i].opcode == IB_WR_RDMA_WRITE); + LASSERT (tx->tx_wrq[i].next == &tx->tx_wrq[i+1]); + LASSERT (tx->tx_wrq[i].sg_list == &tx->tx_sge[i]); + + CDEBUG(D_WARNING, "WORK[%d]: RDMA "LPX64 + " for %d k %x -> "LPX64" k %x\n", i, + tx->tx_wrq[i].sg_list->addr, + tx->tx_wrq[i].sg_list->length, + tx->tx_wrq[i].sg_list->lkey, + tx->tx_wrq[i].wr.rdma.remote_addr, + tx->tx_wrq[i].wr.rdma.rkey); + } + + LASSERT (tx->tx_wrq[i].opcode == IB_WR_SEND); + LASSERT (tx->tx_wrq[i].next == NULL); + LASSERT (tx->tx_wrq[i].sg_list == &tx->tx_sge[i]); + + CDEBUG(D_WARNING, "WORK[%d]: SEND "LPX64" for %d k %x\n", i, + tx->tx_wrq[i].sg_list->addr, + tx->tx_wrq[i].sg_list->length, + tx->tx_wrq[i].sg_list->lkey); + } +#endif + /* I'm still holding ibc_lock! */ + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) + rc = -ECONNABORTED; + else + rc = ib_post_send(conn->ibc_cmid->qp, tx->tx_wrq, &bad_wrq); + + conn->ibc_last_send = jiffies; + + if (rc != 0) { + /* NB credits are transferred in the actual + * message, which can only be the last work item */ + conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits; + if (consume_cred) + conn->ibc_credits++; + conn->ibc_nsends_posted--; + + tx->tx_status = rc; + tx->tx_waiting = 0; + tx->tx_sending--; + + done = (tx->tx_sending == 0); + if (done) + list_del (&tx->tx_list); + + spin_unlock(&conn->ibc_lock); + + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + CERROR("Error %d posting transmit to %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + else + CDEBUG(D_NET, "Error %d posting transmit to %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + kiblnd_close_conn(conn, rc); + + if (done) + kiblnd_tx_done(ni, tx); + return; + } + } + + spin_unlock(&conn->ibc_lock); +} + +void +kiblnd_tx_complete (kib_tx_t *tx, int status) +{ + int failed = (status != IB_WC_SUCCESS); + kib_conn_t *conn = tx->tx_conn; + int idle; + + LASSERT (tx->tx_sending > 0); + + if (failed) { + if (conn->ibc_state == IBLND_CONN_ESTABLISHED) + CDEBUG(D_NETERROR, "Tx -> %s cookie "LPX64 + "sending %d waiting %d: failed %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + tx->tx_cookie, tx->tx_sending, tx->tx_waiting, + status); + + kiblnd_close_conn(conn, -EIO); + } else { + kiblnd_peer_alive(conn->ibc_peer); + } + + spin_lock(&conn->ibc_lock); + + /* I could be racing with rdma completion. Whoever makes 'tx' idle + * gets to free it, which also drops its ref on 'conn'. */ + + tx->tx_sending--; + conn->ibc_nsends_posted--; + + if (failed) { + tx->tx_waiting = 0; /* don't wait for peer */ + tx->tx_status = -EIO; + } + + idle = (tx->tx_sending == 0) && /* This is the final callback */ + !tx->tx_waiting && /* Not waiting for peer */ + !tx->tx_queued; /* Not re-queued (PUT_DONE) */ + if (idle) + list_del(&tx->tx_list); + + kiblnd_conn_addref(conn); /* 1 ref for me.... 
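
NB: the idle test just above is the whole lifetime rule for a tx; whichever completion path (send completion, reply arrival, or requeue) last clears its flag gets to free it. A sketch with illustrative names:

        #include <stdio.h>

        struct fake_tx {
                int sending;    /* send work items still in flight */
                int waiting;    /* still expecting a reply from the peer */
                int queued;     /* re-queued for another send (PUT_DONE) */
        };

        static int tx_idle(const struct fake_tx *tx)
        {
                return tx->sending == 0 && !tx->waiting && !tx->queued;
        }

        int main(void)
        {
                struct fake_tx tx = { .sending = 1, .waiting = 1, .queued = 0 };
                printf("%d\n", tx_idle(&tx));   /* 0: send still in flight */
                tx.sending = 0;
                tx.waiting = 0;
                printf("%d\n", tx_idle(&tx));   /* 1: safe to free */
                return 0;
        }
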
*/ + + spin_unlock(&conn->ibc_lock); + + if (idle) + kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx); + + kiblnd_check_sends(conn); + + kiblnd_conn_decref(conn); /* ...until here */ +} + +void +kiblnd_init_tx_msg (lnet_ni_t *ni, kib_tx_t *tx, int type, int body_nob) +{ + kib_net_t *net = ni->ni_data; + struct ib_sge *sge = &tx->tx_sge[tx->tx_nwrq]; + struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq]; + int nob = offsetof (kib_msg_t, ibm_u) + body_nob; + + LASSERT (net != NULL); + LASSERT (tx->tx_nwrq >= 0); + LASSERT (tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1); + LASSERT (nob <= IBLND_MSG_SIZE); + + kiblnd_init_msg(tx->tx_msg, type, body_nob); + + sge->addr = tx->tx_msgaddr; + sge->lkey = net->ibn_dev->ibd_mr->lkey; + sge->length = nob; + + memset(wrq, 0, sizeof(*wrq)); + + wrq->next = NULL; + wrq->wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_TX); + wrq->sg_list = sge; + wrq->num_sge = 1; + wrq->opcode = IB_WR_SEND; + wrq->send_flags = IB_SEND_SIGNALED; + + tx->tx_nwrq++; +} + +int +kiblnd_init_rdma (lnet_ni_t *ni, kib_tx_t *tx, int type, + int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie) +{ + kib_msg_t *ibmsg = tx->tx_msg; + kib_rdma_desc_t *srcrd = tx->tx_rd; + struct ib_sge *sge = &tx->tx_sge[0]; + struct ib_send_wr *wrq = &tx->tx_wrq[0]; + int rc = nob; + +#if IBLND_MAP_ON_DEMAND + LASSERT (!in_interrupt()); + LASSERT (tx->tx_nwrq == 0); + LASSERT (type == IBLND_MSG_GET_DONE || + type == IBLND_MSG_PUT_DONE); + + sge->addr = srcrd->rd_addr; + sge->lkey = srcrd->rd_key; + sge->length = nob; + + wrq = &tx->tx_wrq[0]; + + wrq->next = &tx->tx_wrq[1]; + wrq->wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA); + wrq->sg_list = sge; + wrq->num_sge = 1; + wrq->opcode = IB_WR_RDMA_WRITE; + wrq->send_flags = 0; + + wrq->wr.rdma.remote_addr = dstrd->rd_addr; + wrq->wr.rdma.rkey = dstrd->rd_key; + + tx->tx_nwrq = 1; +#else + /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */ + int resid = nob; + kib_rdma_frag_t *srcfrag; + int srcidx; + kib_rdma_frag_t *dstfrag; + int dstidx; + int wrknob; + + LASSERT (!in_interrupt()); + LASSERT (tx->tx_nwrq == 0); + LASSERT (type == IBLND_MSG_GET_DONE || + type == IBLND_MSG_PUT_DONE); + + srcidx = dstidx = 0; + srcfrag = &srcrd->rd_frags[0]; + dstfrag = &dstrd->rd_frags[0]; + + while (resid > 0) { + if (srcidx >= srcrd->rd_nfrags) { + CERROR("Src buffer exhausted: %d frags\n", srcidx); + rc = -EPROTO; + break; + } + + if (dstidx == dstrd->rd_nfrags) { + CERROR("Dst buffer exhausted: %d frags\n", dstidx); + rc = -EPROTO; + break; + } + + if (tx->tx_nwrq == IBLND_MAX_RDMA_FRAGS) { + CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n", + srcidx, srcrd->rd_nfrags, + dstidx, dstrd->rd_nfrags); + rc = -EMSGSIZE; + break; + } + + wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid); + + sge = &tx->tx_sge[tx->tx_nwrq]; + sge->addr = srcfrag->rf_addr; + sge->length = wrknob; + sge->lkey = srcrd->rd_key; + + wrq = &tx->tx_wrq[tx->tx_nwrq]; + + wrq->next = wrq + 1; + wrq->wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA); + wrq->sg_list = sge; + wrq->num_sge = 1; + wrq->opcode = IB_WR_RDMA_WRITE; + wrq->send_flags = 0; + + wrq->wr.rdma.remote_addr = dstfrag->rf_addr; + wrq->wr.rdma.rkey = dstrd->rd_key; + + wrq++; + sge++; + + resid -= wrknob; + if (wrknob < srcfrag->rf_nob) { + srcfrag->rf_nob -= wrknob; + srcfrag->rf_addr += wrknob; + } else { + srcfrag++; + srcidx++; + } + + if (wrknob < dstfrag->rf_nob) { + dstfrag->rf_nob -= wrknob; + dstfrag->rf_addr += wrknob; + } else { + dstfrag++; + dstidx++; + } + + tx->tx_nwrq++; + } + + if (rc < 0) /* no RDMA if completing with 
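
NB: the loop above zips two fragment lists: each work request covers min(src frag, dst frag, resid) bytes, and whichever side is exhausted advances. A userspace model with illustrative types and numbers:

        #include <stdio.h>

        struct frag { unsigned long addr; int nob; };

        static int zip(struct frag *src, int nsrc,
                       struct frag *dst, int ndst, int resid)
        {
                int s = 0, d = 0, nwrq = 0;

                while (resid > 0) {
                        int nob;

                        if (s >= nsrc || d >= ndst)
                                return -1;      /* buffer exhausted */

                        nob = src[s].nob < dst[d].nob ? src[s].nob : dst[d].nob;
                        if (resid < nob)
                                nob = resid;

                        nwrq++;                 /* one RDMA_WRITE work request */
                        resid -= nob;

                        if ((src[s].nob -= nob) == 0) s++; else src[s].addr += nob;
                        if ((dst[d].nob -= nob) == 0) d++; else dst[d].addr += nob;
                }
                return nwrq;
        }

        int main(void)
        {
                struct frag src[] = { {0, 4096}, {8192, 4096} };
                struct frag dst[] = { {0, 2048}, {4096, 6144} };
                printf("%d\n", zip(src, 2, dst, 2, 8192));  /* 3 work requests */
                return 0;
        }
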
failure */ + tx->tx_nwrq = 0; +#endif + ibmsg->ibm_u.completion.ibcm_status = rc; + ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; + kiblnd_init_tx_msg(ni, tx, type, sizeof (kib_completion_msg_t)); + + return rc; +} + +void +kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) +{ + struct list_head *q; + + LASSERT (tx->tx_nwrq > 0); /* work items set up */ + LASSERT (!tx->tx_queued); /* not queued for sending already */ + + tx->tx_queued = 1; + tx->tx_deadline = jiffies + (*kiblnd_tunables.kib_timeout * HZ); + + if (tx->tx_conn == NULL) { + kiblnd_conn_addref(conn); + tx->tx_conn = conn; + LASSERT (tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE); + } else { + /* PUT_DONE first attached to conn as a PUT_REQ */ + LASSERT (tx->tx_conn == conn); + LASSERT (tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE); + } + + switch (tx->tx_msg->ibm_type) { + default: + LBUG(); + + case IBLND_MSG_PUT_REQ: + case IBLND_MSG_GET_REQ: + q = &conn->ibc_tx_queue_rsrvd; + break; + + case IBLND_MSG_PUT_NAK: + case IBLND_MSG_PUT_ACK: + case IBLND_MSG_PUT_DONE: + case IBLND_MSG_GET_DONE: + q = &conn->ibc_tx_queue_nocred; + break; + + case IBLND_MSG_NOOP: + case IBLND_MSG_IMMEDIATE: + q = &conn->ibc_tx_queue; + break; + } + + list_add_tail(&tx->tx_list, q); +} + +void +kiblnd_queue_tx (kib_tx_t *tx, kib_conn_t *conn) +{ + spin_lock(&conn->ibc_lock); + kiblnd_queue_tx_locked(tx, conn); + spin_unlock(&conn->ibc_lock); + + kiblnd_check_sends(conn); +} + +void +kiblnd_connect_peer (kib_peer_t *peer) +{ + struct rdma_cm_id *cmid; + struct sockaddr_in sockaddr; + int rc; + + LASSERT (peer->ibp_connecting > 0); + + cmid = rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP); + if (IS_ERR(cmid)) { + CERROR("Can't create CMID for %s: %ld\n", + libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid)); + rc = PTR_ERR(cmid); + goto failed; + } + + memset(&sockaddr, 0, sizeof(sockaddr)); + sockaddr.sin_family = AF_INET; + sockaddr.sin_port = htons(*kiblnd_tunables.kib_service); + sockaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid)); + + kiblnd_peer_addref(peer); /* cmid's ref */ + + rc = rdma_resolve_addr(cmid, NULL, (struct sockaddr *)&sockaddr, + *kiblnd_tunables.kib_timeout * 1000); + if (rc == 0) + return; + + /* Can't initiate address resolution: */ + CERROR("Can't resolve addr for %s: %d\n", + libcfs_nid2str(peer->ibp_nid), rc); + + kiblnd_peer_decref(peer); /* cmid's ref */ + rdma_destroy_id(cmid); + failed: + kiblnd_peer_connect_failed(peer, 1, rc); +} + +void +kiblnd_launch_tx (lnet_ni_t *ni, kib_tx_t *tx, lnet_nid_t nid) +{ + kib_peer_t *peer; + kib_peer_t *peer2; + kib_conn_t *conn; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + unsigned long flags; + int rc; + + /* If I get here, I've committed to send, so I complete the tx with + * failure on any problems */ + + LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ + LASSERT (tx->tx_nwrq > 0); /* work items have been set up */ + + /* First time, just use a read lock since I expect to find my peer + * connected */ + read_lock_irqsave(g_lock, flags); + + peer = kiblnd_find_peer_locked(nid); + if (peer != NULL && !list_empty(&peer->ibp_conns)) { + /* Found a peer with an established connection */ + conn = kiblnd_get_conn_locked(peer); + kiblnd_conn_addref(conn); /* 1 ref for me... 
*/ + + read_unlock_irqrestore(g_lock, flags); + + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + return; + } + + read_unlock(g_lock); + /* Re-try with a write lock */ + write_lock(g_lock); + + peer = kiblnd_find_peer_locked(nid); + if (peer != NULL) { + if (list_empty(&peer->ibp_conns)) { + /* found a peer, but it's still connecting... */ + LASSERT (peer->ibp_connecting != 0 || + peer->ibp_accepting != 0); + list_add_tail (&tx->tx_list, &peer->ibp_tx_queue); + write_unlock_irqrestore(g_lock, flags); + } else { + conn = kiblnd_get_conn_locked(peer); + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + write_unlock_irqrestore(g_lock, flags); + + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } + return; + } + + write_unlock_irqrestore(g_lock, flags); + + /* Allocate a peer ready to add to the peer table and retry */ + rc = kiblnd_create_peer(ni, &peer, nid); + if (rc != 0) { + CERROR("Can't create peer %s\n", libcfs_nid2str(nid)); + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + kiblnd_tx_done(ni, tx); + return; + } + + write_lock_irqsave(g_lock, flags); + + peer2 = kiblnd_find_peer_locked(nid); + if (peer2 != NULL) { + if (list_empty(&peer2->ibp_conns)) { + /* found a peer, but it's still connecting... */ + LASSERT (peer2->ibp_connecting != 0 || + peer2->ibp_accepting != 0); + list_add_tail (&tx->tx_list, &peer2->ibp_tx_queue); + write_unlock_irqrestore(g_lock, flags); + } else { + conn = kiblnd_get_conn_locked(peer2); + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + write_unlock_irqrestore(g_lock, flags); + + kiblnd_queue_tx(tx, conn); + kiblnd_conn_decref(conn); /* ...to here */ + } + + kiblnd_peer_decref(peer); + return; + } + + /* Brand new peer */ + LASSERT (peer->ibp_connecting == 0); + peer->ibp_connecting = 1; + + list_add_tail(&tx->tx_list, &peer->ibp_tx_queue); + + kiblnd_peer_addref(peer); + list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid)); + + write_unlock_irqrestore(g_lock, flags); + + kiblnd_connect_peer(peer); + kiblnd_peer_decref(peer); +} + +int +kiblnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + lnet_hdr_t *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int payload_niov = lntmsg->msg_niov; + struct iovec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + kib_msg_t *ibmsg; + kib_tx_t *tx; + int nob; + int rc; + + /* NB 'private' is different depending on what we're sending.... */ + + CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= LNET_MAX_IOV); + + /* Thread context */ + LASSERT (!in_interrupt()); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + + switch (type) { + default: + LBUG(); + return (-EIO); + + case LNET_MSG_ACK: + LASSERT (payload_nob == 0); + break; + + case LNET_MSG_GET: + if (routing || target_is_router) + break; /* send IMMEDIATE */ + + /* is the REPLY message too small for RDMA? 
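
NB: kiblnd_launch_tx() above uses the classic optimistic-lookup pattern: probe under the cheap, shared read lock first, and only on a miss allocate a peer, take the write lock and re-check, because another thread may have added the same peer while no lock was held. The same shape in userspace, with a one-slot table standing in for the peer hash:

        #include <pthread.h>
        #include <stdio.h>

        static pthread_rwlock_t g_lock = PTHREAD_RWLOCK_INITIALIZER;
        static long the_peer = -1;              /* one-slot "peer table" */

        static int find_peer(long nid) { return the_peer == nid; }

        static void find_or_create(long nid)
        {
                int found;

                pthread_rwlock_rdlock(&g_lock);
                found = find_peer(nid);
                pthread_rwlock_unlock(&g_lock);
                if (found) {
                        printf("fast path\n");
                        return;
                }

                /* allocate the new peer here, outside any lock ... */

                pthread_rwlock_wrlock(&g_lock);
                if (!find_peer(nid))            /* re-check: we may have raced */
                        the_peer = nid;         /* insert ours */
                /* else drop ours and use the winner's, as with peer2 above */
                pthread_rwlock_unlock(&g_lock);
        }

        int main(void)
        {
                find_or_create(42);             /* slow path: inserts */
                find_or_create(42);             /* fast path */
                return 0;
        }
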
 */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
+                if (nob <= IBLND_MSG_SIZE)
+                        break;                  /* send IMMEDIATE */
+
+                tx = kiblnd_get_idle_tx(ni);
+                if (tx == NULL) {
+                        CERROR("Can't allocate txd for GET to %s\n",
+                               libcfs_nid2str(target.nid));
+                        return -ENOMEM;
+                }
+
+                ibmsg = tx->tx_msg;
+                ibmsg->ibm_u.get.ibgm_hdr = *hdr;
+                ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
+
+                if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+                        rc = kiblnd_setup_rd_iov(ni, tx,
+                                                 &ibmsg->ibm_u.get.ibgm_rd,
+                                                 lntmsg->msg_md->md_niov,
+                                                 lntmsg->msg_md->md_iov.iov,
+                                                 0, lntmsg->msg_md->md_length);
+                else
+                        rc = kiblnd_setup_rd_kiov(ni, tx,
+                                                  &ibmsg->ibm_u.get.ibgm_rd,
+                                                  lntmsg->msg_md->md_niov,
+                                                  lntmsg->msg_md->md_iov.kiov,
+                                                  0, lntmsg->msg_md->md_length);
+                if (rc != 0) {
+                        CERROR("Can't setup GET sink for %s: %d\n",
+                               libcfs_nid2str(target.nid), rc);
+                        kiblnd_tx_done(ni, tx);
+                        return -EIO;
+                }
+#if IBLND_MAP_ON_DEMAND
+                nob = sizeof(kib_get_msg_t);
+#else
+                nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[tx->tx_nfrags]);
+#endif
+                kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob);
+
+                tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg);
+                if (tx->tx_lntmsg[1] == NULL) {
+                        CERROR("Can't create reply for GET -> %s\n",
+                               libcfs_nid2str(target.nid));
+                        kiblnd_tx_done(ni, tx);
+                        return -EIO;
+                }
+
+                tx->tx_lntmsg[0] = lntmsg;      /* finalise lntmsg[0,1] on completion */
+                tx->tx_waiting = 1;             /* waiting for GET_DONE */
+                kiblnd_launch_tx(ni, tx, target.nid);
+                return 0;
+
+        case LNET_MSG_REPLY:
+        case LNET_MSG_PUT:
+                /* Is the payload small enough not to need RDMA? */
+                nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
+                if (nob <= IBLND_MSG_SIZE)
+                        break;                  /* send IMMEDIATE */
+
+                tx = kiblnd_get_idle_tx(ni);
+                if (tx == NULL) {
+                        CERROR("Can't allocate %s txd for %s\n",
+                               type == LNET_MSG_PUT ?
"PUT" : "REPLY", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + if (payload_kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, + payload_niov, payload_iov, + payload_offset, payload_nob); + else + rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, + payload_niov, payload_kiov, + payload_offset, payload_nob); + if (rc != 0) { + CERROR("Can't setup PUT src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + kiblnd_tx_done(ni, tx); + return -EIO; + } + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.putreq.ibprm_hdr = *hdr; + ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(kib_putreq_msg_t)); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ + kiblnd_launch_tx(ni, tx, target.nid); + return 0; + } + + /* send IMMEDIATE */ + + LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) + <= IBLND_MSG_SIZE); + + tx = kiblnd_get_idle_tx(ni); + if (tx == NULL) { + CERROR ("Can't send %d to %s: tx descs exhausted\n", + type, libcfs_nid2str(target.nid)); + return -ENOMEM; + } + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.immediate.ibim_hdr = *hdr; + + if (payload_kiov != NULL) + lnet_copy_kiov2flat(IBLND_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_kiov, + payload_offset, payload_nob); + else + lnet_copy_iov2flat(IBLND_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_iov, + payload_offset, payload_nob); + + nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]); + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + kiblnd_launch_tx(ni, tx, target.nid); + return 0; +} + +void +kiblnd_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg) +{ + lnet_process_id_t target = lntmsg->msg_target; + unsigned int niov = lntmsg->msg_niov; + struct iovec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + kib_tx_t *tx; + int rc; + + tx = kiblnd_get_idle_tx(ni); + if (tx == NULL) { + CERROR("Can't get tx for REPLY to %s\n", + libcfs_nid2str(target.nid)); + goto failed_0; + } + + if (nob == 0) + rc = 0; + else if (kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, + niov, iov, offset, nob); + else + rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, + niov, kiov, offset, nob); + + if (rc != 0) { + CERROR("Can't setup GET src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } + + rc = kiblnd_init_rdma(ni, tx, IBLND_MSG_GET_DONE, nob, + &rx->rx_msg->ibm_u.get.ibgm_rd, + rx->rx_msg->ibm_u.get.ibgm_cookie); + if (rc < 0) { + CERROR("Can't setup rdma for GET from %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } + + if (nob == 0) { + /* No RDMA: local completion may happen now! 
*/ + lnet_finalize(ni, lntmsg, 0); + } else { + /* RDMA: lnet_finalize(lntmsg) when it + * completes */ + tx->tx_lntmsg[0] = lntmsg; + } + + kiblnd_queue_tx(tx, rx->rx_conn); + return; + + failed_1: + kiblnd_tx_done(ni, tx); + failed_0: + lnet_finalize(ni, lntmsg, -EIO); +} + +int +kiblnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, + unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + kib_rx_t *rx = private; + kib_msg_t *rxmsg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + kib_tx_t *tx; + kib_msg_t *txmsg; + int nob; + int post_credit = IBLND_POSTRX_PEER_CREDIT; + int rc = 0; + + LASSERT (mlen <= rlen); + LASSERT (!in_interrupt()); + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + + switch (rxmsg->ibm_type) { + default: + LBUG(); + + case IBLND_MSG_IMMEDIATE: + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); + if (nob > rx->rx_nob) { + CERROR ("Immediate message from %s too big: %d(%d)\n", + libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), + nob, rx->rx_nob); + rc = -EPROTO; + break; + } + + if (kiov != NULL) + lnet_copy_flat2kiov(niov, kiov, offset, + IBLND_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); + else + lnet_copy_flat2iov(niov, iov, offset, + IBLND_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); + lnet_finalize (ni, lntmsg, 0); + break; + + case IBLND_MSG_PUT_REQ: + if (mlen == 0) { + lnet_finalize(ni, lntmsg, 0); + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0, + rxmsg->ibm_u.putreq.ibprm_cookie); + break; + } + + tx = kiblnd_get_idle_tx(ni); + if (tx == NULL) { + CERROR("Can't allocate tx for %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* Not replying will break the connection */ + rc = -ENOMEM; + break; + } + + txmsg = tx->tx_msg; + if (kiov == NULL) + rc = kiblnd_setup_rd_iov(ni, tx, + &txmsg->ibm_u.putack.ibpam_rd, + niov, iov, offset, mlen); + else + rc = kiblnd_setup_rd_kiov(ni, tx, + &txmsg->ibm_u.putack.ibpam_rd, + niov, kiov, offset, mlen); + if (rc != 0) { + CERROR("Can't setup PUT sink for %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + kiblnd_tx_done(ni, tx); + /* tell peer it's over */ + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc, + rxmsg->ibm_u.putreq.ibprm_cookie); + break; + } + + txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; + txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; +#if IBLND_MAP_ON_DEMAND + nob = sizeof(kib_putack_msg_t); +#else + nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[tx->tx_nfrags]); +#endif + kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob); + + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + tx->tx_waiting = 1; /* waiting for PUT_DONE */ + kiblnd_queue_tx(tx, conn); + + /* reposted buffer reserved for PUT_DONE */ + post_credit = IBLND_POSTRX_NO_CREDIT; + break; + + case IBLND_MSG_GET_REQ: + if (lntmsg != NULL) { + /* Optimized GET; RDMA lntmsg's payload */ + kiblnd_reply(ni, rx, lntmsg); + } else { + /* GET didn't match anything */ + kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE, + -ENODATA, + rxmsg->ibm_u.get.ibgm_cookie); + } + break; + } + + kiblnd_post_rx(rx, post_credit); + return rc; +} + +int +kiblnd_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid = kernel_thread (fn, arg, 0); + + if (pid < 0) + return ((int)pid); + + atomic_inc (&kiblnd_data.kib_nthreads); + return (0); +} + +void 
+kiblnd_thread_fini (void) +{ + atomic_dec (&kiblnd_data.kib_nthreads); +} + +void +kiblnd_peer_alive (kib_peer_t *peer) +{ + /* This is racy, but everyone's only writing cfs_time_current() */ + peer->ibp_last_alive = cfs_time_current(); + mb(); +} + +void +kiblnd_peer_notify (kib_peer_t *peer) +{ + time_t last_alive = 0; + int error = 0; + unsigned long flags; + + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (list_empty(&peer->ibp_conns) && + peer->ibp_accepting == 0 && + peer->ibp_connecting == 0 && + peer->ibp_error != 0) { + error = peer->ibp_error; + peer->ibp_error = 0; + + last_alive = cfs_time_current_sec() - + cfs_duration_sec(cfs_time_current() - + peer->ibp_last_alive); + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (error != 0) + lnet_notify(peer->ibp_ni, + peer->ibp_nid, 0, last_alive); +} + +void +kiblnd_close_conn_locked (kib_conn_t *conn, int error) +{ + /* This just does the immediate housekeeping. 'error' is zero for a + * normal shutdown which can happen only after the connection has been + * established. If the connection is established, schedule the + * connection to be finished off by the connd. Otherwise the connd is + * already dealing with it (either to set it up or tear it down). + * Caller holds kib_global_lock exclusively in irq context */ + unsigned long flags; + kib_peer_t *peer = conn->ibc_peer; + + LASSERT (error != 0 || conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + if (error != 0 && conn->ibc_comms_error == 0) + conn->ibc_comms_error = error; + + if (conn->ibc_state != IBLND_CONN_ESTABLISHED) + return; /* already being handled */ + + if (error == 0 && + list_empty(&conn->ibc_tx_queue) && + list_empty(&conn->ibc_tx_queue_rsrvd) && + list_empty(&conn->ibc_tx_queue_nocred) && + list_empty(&conn->ibc_active_txs)) { + CDEBUG(D_NET, "closing conn to %s\n", + libcfs_nid2str(peer->ibp_nid)); + } else { + CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s\n", + libcfs_nid2str(peer->ibp_nid), error, + list_empty(&conn->ibc_tx_queue) ? "" : "(sending)", + list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)", + list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)", + list_empty(&conn->ibc_active_txs) ? 
"" : "(waiting)"); + } + + list_del (&conn->ibc_list); + /* connd (see below) takes over ibc_list's ref */ + + if (list_empty (&peer->ibp_conns) && /* no more conns */ + kiblnd_peer_active(peer)) { /* still in peer table */ + kiblnd_unlink_peer_locked(peer); + + /* set/clear error on last conn */ + peer->ibp_error = conn->ibc_comms_error; + } + + kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING); + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + + list_add_tail (&conn->ibc_list, &kiblnd_data.kib_connd_conns); + wake_up (&kiblnd_data.kib_connd_waitq); + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); +} + +void +kiblnd_close_conn (kib_conn_t *conn, int error) +{ + unsigned long flags; + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + kiblnd_close_conn_locked(conn, error); + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +} + +void +kiblnd_handle_early_rxs(kib_conn_t *conn) +{ + unsigned long flags; + kib_rx_t *rx; + + LASSERT (!in_interrupt()); + LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + while (!list_empty(&conn->ibc_early_rxs)) { + rx = list_entry(conn->ibc_early_rxs.next, + kib_rx_t, rx_list); + list_del(&rx->rx_list); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_handle_rx(rx); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + } + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +} + +void +kiblnd_abort_txs(kib_conn_t *conn, struct list_head *txs) +{ + LIST_HEAD (zombies); + struct list_head *tmp; + struct list_head *nxt; + kib_tx_t *tx; + + spin_lock(&conn->ibc_lock); + + list_for_each_safe (tmp, nxt, txs) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + if (txs == &conn->ibc_active_txs) { + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_waiting || + tx->tx_sending != 0); + } else { + LASSERT (tx->tx_queued); + } + + tx->tx_status = -ECONNABORTED; + tx->tx_queued = 0; + tx->tx_waiting = 0; + + if (tx->tx_sending == 0) { + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + } + + spin_unlock(&conn->ibc_lock); + + kiblnd_txlist_done(conn->ibc_peer->ibp_ni, + &zombies, -ECONNABORTED); +} + +void +kiblnd_finalise_conn (kib_conn_t *conn) +{ + LASSERT (!in_interrupt()); + LASSERT (conn->ibc_state > IBLND_CONN_INIT); + + kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED); + + /* abort_receives moves QP state to IB_QPS_ERR. This is only required + * for connections that didn't get as far as being connected, because + * rdma_disconnect() does this for free. */ + kiblnd_abort_receives(conn); + + /* Complete all tx descs not waiting for sends to complete. + * NB we should be safe from RDMA now that the QP has changed state */ + + kiblnd_abort_txs(conn, &conn->ibc_tx_queue); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd); + kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred); + kiblnd_abort_txs(conn, &conn->ibc_active_txs); + + kiblnd_handle_early_rxs(conn); +} + +void +kiblnd_peer_connect_failed (kib_peer_t *peer, int active, int error) +{ + LIST_HEAD (zombies); + unsigned long flags; + + LASSERT (error != 0); + LASSERT (!in_interrupt()); + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + if (active) { + LASSERT (peer->ibp_connecting > 0); + peer->ibp_connecting--; + } else { + LASSERT (peer->ibp_accepting > 0); + peer->ibp_accepting--; + } + + if (peer->ibp_connecting != 0 || + peer->ibp_accepting != 0) { + /* another connection attempt under way... 
*/ + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return; + } + + if (list_empty(&peer->ibp_conns)) { + /* Take peer's blocked transmits to complete with error */ + list_add(&zombies, &peer->ibp_tx_queue); + list_del_init(&peer->ibp_tx_queue); + + if (kiblnd_peer_active(peer)) + kiblnd_unlink_peer_locked(peer); + + peer->ibp_error = error; + } else { + /* Can't have blocked transmits if there are connections */ + LASSERT (list_empty(&peer->ibp_tx_queue)); + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + kiblnd_peer_notify(peer); + + if (list_empty (&zombies)) + return; + + CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n", + libcfs_nid2str(peer->ibp_nid)); + + kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH); +} + +void +kiblnd_connreq_done(kib_conn_t *conn, int status) +{ + struct list_head txs; + + kib_peer_t *peer = conn->ibc_peer; + int active; + unsigned long flags; + kib_tx_t *tx; + + active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + + CDEBUG(D_NET,"%s: %d, %d\n", libcfs_nid2str(peer->ibp_nid), + active, status); + + LASSERT (!in_interrupt()); + LASSERT ((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT && + peer->ibp_connecting > 0) || + (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT && + peer->ibp_accepting > 0)); + + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + conn->ibc_connvars = NULL; + + if (status != 0) { + /* failed to establish connection */ + kiblnd_peer_connect_failed(conn->ibc_peer, active, status); + kiblnd_finalise_conn(conn); + return; + } + + /* connection established */ + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + conn->ibc_last_send = jiffies; + kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); + kiblnd_peer_alive(peer); + + /* Add conn to peer's list and nuke any dangling conns from a different + * peer instance... 
*/ + kiblnd_conn_addref(conn); /* +1 ref for ibc_list */ + list_add(&conn->ibc_list, &peer->ibp_conns); + if (active) + peer->ibp_connecting--; + else + peer->ibp_accepting--; + + kiblnd_close_stale_conns_locked(conn->ibc_peer, + conn->ibc_incarnation); + + if (!kiblnd_peer_active(peer) || /* peer has been deleted */ + conn->ibc_comms_error != 0) { /* error has happened already */ + + /* start to shut down connection */ + kiblnd_close_conn_locked(conn, -ECONNABORTED); + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + return; + } + + /* grab pending txs while I have the lock */ + list_add(&txs, &peer->ibp_tx_queue); + list_del_init(&peer->ibp_tx_queue); + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + /* Schedule blocked txs */ + spin_lock (&conn->ibc_lock); + while (!list_empty (&txs)) { + tx = list_entry (txs.next, kib_tx_t, tx_list); + list_del (&tx->tx_list); + + kiblnd_queue_tx_locked(tx, conn); + } + spin_unlock (&conn->ibc_lock); + + kiblnd_check_sends(conn); + + /* schedule blocked rxs */ + kiblnd_handle_early_rxs(conn); +} + +void +kiblnd_reject(struct rdma_cm_id *cmid, int why) +{ + int rc; + kib_rej_t rej = {.ibr_magic = IBLND_MSG_MAGIC, + .ibr_version = IBLND_MSG_VERSION, + .ibr_why = why}; + + rc = rdma_reject(cmid, &rej, sizeof(rej)); + + if (rc != 0) + CWARN("Error %d sending reject\n", rc); +} + +int +kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob) +{ + kib_msg_t *ackmsg; + kib_msg_t *reqmsg = priv; + rwlock_t *g_lock = &kiblnd_data.kib_global_lock; + struct rdma_conn_param cp; + unsigned long flags; + lnet_ni_t *ni = NULL; + kib_dev_t *ibdev; + kib_peer_t *peer; + kib_peer_t *peer2; + kib_conn_t *conn; + lnet_nid_t nid; + int rc; + int rej = IBLND_REJECT_FATAL; + + LASSERT (!in_interrupt()); + + /* cmid inherits 'context' from the corresponding listener id */ + ibdev = (kib_dev_t *)cmid->context; + LASSERT (ibdev != NULL); + + if (priv_nob < offsetof(kib_msg_t, ibm_type)) { + CERROR("Short connection request\n"); + goto failed; + } + + if (reqmsg->ibm_magic == LNET_PROTO_MAGIC || + reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC) || + (reqmsg->ibm_magic == IBLND_MSG_MAGIC && + reqmsg->ibm_version != IBLND_MSG_VERSION) || + (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) && + reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION))) { + /* Future protocol version compatibility support! If the + * o2iblnd-specific protocol changes, or when LNET unifies + * protocols over all LNDs, the initial connection will + * negotiate a protocol version. I trap this here to avoid + * console errors; the reject tells the peer which protocol I + * speak. 
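
NB: the magic/version trap above works because a connection request written by an opposite-endian peer arrives with a byte-swapped magic, so both forms must be recognised before anything else is decoded. In miniature, where 0x0be91b4a is a stand-in for the real magic:

        #include <stdint.h>
        #include <stdio.h>

        #define MAGIC 0x0be91b4aU

        static uint32_t swab32(uint32_t v)
        {
                return (v >> 24) | ((v >> 8) & 0xff00U) |
                       ((v << 8) & 0xff0000U) | (v << 24);
        }

        static int recognised(uint32_t magic)
        {
                return magic == MAGIC || magic == swab32(MAGIC);
        }

        int main(void)
        {
                printf("%d %d %d\n", recognised(MAGIC),
                       recognised(swab32(MAGIC)), recognised(0)); /* 1 1 0 */
                return 0;
        }
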
 */
+                goto failed;
+        }
+
+        rc = kiblnd_unpack_msg(reqmsg, priv_nob);
+        if (rc != 0) {
+                CERROR("Can't parse connection request: %d\n", rc);
+                goto failed;
+        }
+
+        nid = reqmsg->ibm_srcnid;
+
+        if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) {
+                CERROR("Unexpected connreq msg type: %x from %s\n",
+                       reqmsg->ibm_type, libcfs_nid2str(nid));
+                goto failed;
+        }
+
+        if (reqmsg->ibm_u.connparams.ibcp_queue_depth != IBLND_MSG_QUEUE_SIZE) {
+                CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n",
+                       libcfs_nid2str(nid),
+                       reqmsg->ibm_u.connparams.ibcp_queue_depth,
+                       IBLND_MSG_QUEUE_SIZE);
+                goto failed;
+        }
+
+        if (reqmsg->ibm_u.connparams.ibcp_max_frags != IBLND_MAX_RDMA_FRAGS) {
+                CERROR("Can't accept %s: incompatible max_frags %d (%d wanted)\n",
+                       libcfs_nid2str(nid),
+                       reqmsg->ibm_u.connparams.ibcp_max_frags,
+                       IBLND_MAX_RDMA_FRAGS);
+                goto failed;
+        }
+
+        if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
+                CERROR("Can't accept %s: message size %d too big (%d max)\n",
+                       libcfs_nid2str(nid),
+                       reqmsg->ibm_u.connparams.ibcp_max_msg_size,
+                       IBLND_MSG_SIZE);
+                goto failed;
+        }
+
+        ni = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid));
+        if (ni == NULL ||                             /* no matching net */
+            ni->ni_nid != reqmsg->ibm_dstnid ||       /* right NET, wrong NID! */
+            ((kib_net_t*)ni->ni_data)->ibn_dev != ibdev) { /* wrong device */
+                CERROR("Can't accept %s: bad dst nid %s\n",
+                       libcfs_nid2str(nid),
+                       libcfs_nid2str(reqmsg->ibm_dstnid));
+
+                goto failed;
+        }
+
+        /* assume 'nid' is a new peer; create  */
+        rc = kiblnd_create_peer(ni, &peer, nid);
+        if (rc != 0) {
+                CERROR("Can't create peer for %s\n", libcfs_nid2str(nid));
+                rej = IBLND_REJECT_NO_RESOURCES;
+                goto failed;
+        }
+
+        write_lock_irqsave(g_lock, flags);
+
+        peer2 = kiblnd_find_peer_locked(nid);
+        if (peer2 != NULL) {
+                /* tie-break connection race in favour of the higher NID */
+                if (peer2->ibp_connecting != 0 &&
+                    nid < ni->ni_nid) {
+                        write_unlock_irqrestore(g_lock, flags);
+
+                        CWARN("Conn race %s\n",
+                              libcfs_nid2str(peer2->ibp_nid));
+
+                        kiblnd_peer_decref(peer);
+                        rej = IBLND_REJECT_CONN_RACE;
+                        goto failed;
+                }
+
+                peer2->ibp_accepting++;
+                kiblnd_peer_addref(peer2);
+
+                write_unlock_irqrestore(g_lock, flags);
+                kiblnd_peer_decref(peer);
+                peer = peer2;
+        } else {
+                /* Brand new peer */
+                LASSERT (peer->ibp_accepting == 0);
+                peer->ibp_accepting = 1;
+
+                kiblnd_peer_addref(peer);
+                list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
+
+                write_unlock_irqrestore(g_lock, flags);
+        }
+
+        conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT);
+        if (conn == NULL) {
+                kiblnd_peer_connect_failed(peer, 0, -ENOMEM);
+                kiblnd_peer_decref(peer);
+                rej = IBLND_REJECT_NO_RESOURCES;
+                goto failed;
+        }
+
+        /* conn now "owns" cmid, so I return success from here on to ensure the
+         * CM callback doesn't destroy cmid.
*/ + + conn->ibc_incarnation = reqmsg->ibm_srcstamp; + conn->ibc_credits = IBLND_MSG_QUEUE_SIZE; + conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE; + LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + <= IBLND_RX_MSGS); + + ackmsg = &conn->ibc_connvars->cv_msg; + memset(ackmsg, 0, sizeof(*ackmsg)); + + kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK, + sizeof(ackmsg->ibm_u.connparams)); + ackmsg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE; + ackmsg->ibm_u.connparams.ibcp_max_frags = IBLND_MAX_RDMA_FRAGS; + ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; + kiblnd_pack_msg(ni, ackmsg, 0, nid, reqmsg->ibm_srcstamp); + + memset(&cp, 0, sizeof(cp)); + cp.private_data = ackmsg; + cp.private_data_len = ackmsg->ibm_nob; + cp.responder_resources = 0; /* No atomic ops or RDMA reads */ + cp.initiator_depth = 0; + cp.flow_control = 1; + cp.retry_count = *kiblnd_tunables.kib_retry_count; + cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; + + CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid)); + + rc = rdma_accept(cmid, &cp); + if (rc != 0) { + CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc); + kiblnd_reject(cmid, IBLND_REJECT_FATAL); + kiblnd_connreq_done(conn, rc); + kiblnd_conn_decref(conn); + } + + lnet_ni_decref(ni); + return 0; + + failed: + if (ni != NULL) + lnet_ni_decref(ni); + + kiblnd_reject(cmid, rej); + return -ECONNREFUSED; +} + +void +kiblnd_reconnect (kib_conn_t *conn, char *why) +{ + kib_peer_t *peer = conn->ibc_peer; + int retry = 0; + unsigned long flags; + + LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + LASSERT (peer->ibp_connecting > 0); /* 'conn' at least */ + + write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + /* retry connection if it's still needed and no other connection + * attempts (active or passive) are in progress */ + if (!list_empty(&peer->ibp_tx_queue) && + peer->ibp_connecting == 1 && + peer->ibp_accepting == 0) { + retry = 1; + peer->ibp_connecting++; + } + + write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); + + if (retry) { + CDEBUG(D_NETERROR, "%s: retrying (%s)\n", + libcfs_nid2str(peer->ibp_nid), why); + kiblnd_connect_peer(peer); + } +} + +void +kiblnd_rejected (kib_conn_t *conn, int reason, void *priv, int priv_nob) +{ + kib_peer_t *peer = conn->ibc_peer; + + LASSERT (!in_interrupt()); + LASSERT (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); + + switch (reason) { + case IB_CM_REJ_STALE_CONN: + kiblnd_reconnect(conn, "stale"); + break; + + case IB_CM_REJ_CONSUMER_DEFINED: + if (priv_nob >= sizeof(kib_rej_t)) { + kib_rej_t *rej = priv; + + if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) || + rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) { + __swab32s(&rej->ibr_magic); + __swab16s(&rej->ibr_version); + } + + if (rej->ibr_magic != IBLND_MSG_MAGIC && + rej->ibr_magic != LNET_PROTO_MAGIC) { + CERROR("%s rejected: consumer defined fatal error\n", + libcfs_nid2str(peer->ibp_nid)); + break; + } + + if (rej->ibr_version != IBLND_MSG_VERSION) { + CERROR("%s rejected: o2iblnd version %d error\n", + libcfs_nid2str(peer->ibp_nid), + rej->ibr_version); + break; + } + + switch (rej->ibr_why) { + case IBLND_REJECT_CONN_RACE: + kiblnd_reconnect(conn, "conn race"); + break; + + case IBLND_REJECT_NO_RESOURCES: + CERROR("%s rejected: o2iblnd no resources\n", + libcfs_nid2str(peer->ibp_nid)); + break; + case IBLND_REJECT_FATAL: + CERROR("%s rejected: o2iblnd fatal error\n", + libcfs_nid2str(peer->ibp_nid)); + break; + default: + CERROR("%s rejected: o2iblnd reason %d\n", + 
                                       libcfs_nid2str(peer->ibp_nid),
+                                       rej->ibr_why);
+                                break;
+                        }
+                        break;
+                }
+                /* fall through */
+        default:
+                CDEBUG(D_NETERROR, "%s rejected: reason %d, size %d\n",
+                       libcfs_nid2str(peer->ibp_nid), reason, priv_nob);
+                break;
+        }
+
+        kiblnd_connreq_done(conn, -ECONNREFUSED);
+}
+
+void
+kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob)
+{
+        kib_peer_t *peer = conn->ibc_peer;
+        lnet_ni_t *ni = peer->ibp_ni;
+        kib_net_t *net = ni->ni_data;
+        kib_msg_t *msg = priv;
+        int rc = kiblnd_unpack_msg(msg, priv_nob);
+        unsigned long flags;
+
+        LASSERT (net != NULL);
+
+        if (rc != 0) {
+                CERROR("Can't unpack connack from %s: %d\n",
+                       libcfs_nid2str(peer->ibp_nid), rc);
+                goto failed;
+        }
+
+        if (msg->ibm_type != IBLND_MSG_CONNACK) {
+                CERROR("Unexpected message %d from %s\n",
+                       msg->ibm_type, libcfs_nid2str(peer->ibp_nid));
+                rc = -EPROTO;
+                goto failed;
+        }
+
+        if (msg->ibm_u.connparams.ibcp_queue_depth != IBLND_MSG_QUEUE_SIZE) {
+                CERROR("%s has incompatible queue depth %d (%d wanted)\n",
+                       libcfs_nid2str(peer->ibp_nid),
+                       msg->ibm_u.connparams.ibcp_queue_depth,
+                       IBLND_MSG_QUEUE_SIZE);
+                rc = -EPROTO;
+                goto failed;
+        }
+
+        if (msg->ibm_u.connparams.ibcp_max_frags != IBLND_MAX_RDMA_FRAGS) {
+                CERROR("%s has incompatible max_frags %d (%d wanted)\n",
+                       libcfs_nid2str(peer->ibp_nid),
+                       msg->ibm_u.connparams.ibcp_max_frags,
+                       IBLND_MAX_RDMA_FRAGS);
+                rc = -EPROTO;
+                goto failed;
+        }
+
+        if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
+                CERROR("%s max message size %d too big (%d max)\n",
+                       libcfs_nid2str(peer->ibp_nid),
+                       msg->ibm_u.connparams.ibcp_max_msg_size,
+                       IBLND_MSG_SIZE);
+                rc = -EPROTO;
+                goto failed;
+        }
+
+        read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+        if (msg->ibm_dstnid == ni->ni_nid &&
+            msg->ibm_dststamp == net->ibn_incarnation)
+                rc = 0;
+        else
+                rc = -ESTALE;
+        read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+        if (rc != 0) {
+                CERROR("Stale connection reply from %s\n",
+                       libcfs_nid2str(peer->ibp_nid));
+                goto failed;
+        }
+
+        conn->ibc_incarnation = msg->ibm_srcstamp;
+        conn->ibc_credits = IBLND_MSG_QUEUE_SIZE;
+        conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE;
+        LASSERT (conn->ibc_credits + conn->ibc_reserved_credits
+                 <= IBLND_RX_MSGS);
+
+        kiblnd_connreq_done(conn, 0);
+        return;
+
+ failed:
+        /* NB My QP has already established itself, so I handle anything going
+         * wrong here by setting ibc_comms_error.
+         * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then
+         * immediately tears it down. */
+
+        LASSERT (rc != 0);
+        conn->ibc_comms_error = rc;
+        kiblnd_connreq_done(conn, 0);
+}
+
+int
+kiblnd_active_connect (struct rdma_cm_id *cmid)
+{
+        kib_peer_t *peer = (kib_peer_t *)cmid->context;
+        kib_conn_t *conn;
+        kib_msg_t *msg;
+        struct rdma_conn_param cp;
+        int rc;
+
+        conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT);
+        if (conn == NULL) {
+                kiblnd_peer_connect_failed(peer, 1, -ENOMEM);
+                kiblnd_peer_decref(peer); /* lose cmid's ref */
+                return -ENOMEM;
+        }
+
+        /* conn "owns" cmid now, so I return success from here on to ensure the
conn also takes over cmid's ref + * on peer */ + + msg = &conn->ibc_connvars->cv_msg; + + memset(msg, 0, sizeof(*msg)); + kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams)); + msg->ibm_u.connparams.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE; + msg->ibm_u.connparams.ibcp_max_frags = IBLND_MAX_RDMA_FRAGS; + msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; + kiblnd_pack_msg(peer->ibp_ni, msg, 0, peer->ibp_nid, 0); + + memset(&cp, 0, sizeof(cp)); + cp.private_data = msg; + cp.private_data_len = msg->ibm_nob; + cp.responder_resources = 0; /* No atomic ops or RDMA reads */ + cp.initiator_depth = 0; + cp.flow_control = 1; + cp.retry_count = *kiblnd_tunables.kib_retry_count; + cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; + + LASSERT(cmid->context == (void *)conn); + LASSERT(conn->ibc_cmid == cmid); + + rc = rdma_connect(cmid, &cp); + if (rc != 0) { + CERROR("Can't connect to %s: %d\n", + libcfs_nid2str(peer->ibp_nid), rc); + kiblnd_connreq_done(conn, rc); + kiblnd_conn_decref(conn); + } + + return 0; +} + +int +kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) +{ + kib_peer_t *peer; + kib_conn_t *conn; + int rc; + + switch (event->event) { + default: + LBUG(); + + case RDMA_CM_EVENT_CONNECT_REQUEST: + /* destroy cmid on failure */ + rc = kiblnd_passive_connect(cmid, + event->private_data, + event->private_data_len); + CDEBUG(D_NET, "connreq: %d\n", rc); + return rc; + + case RDMA_CM_EVENT_ADDR_ERROR: + peer = (kib_peer_t *)cmid->context; + CDEBUG(D_NETERROR, "%s: ADDR ERROR %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH); + kiblnd_peer_decref(peer); + return -EHOSTUNREACH; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ADDR_RESOLVED: + peer = (kib_peer_t *)cmid->context; + + CDEBUG(D_NET,"%s Addr resolved: %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + + if (event->status != 0) { + CDEBUG(D_NETERROR, "Can't resolve address for %s: %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + rc = event->status; + } else { + rc = rdma_resolve_route( + cmid, *kiblnd_tunables.kib_timeout * 1000); + if (rc == 0) + return 0; + /* Can't initiate route resolution */ + CERROR("Can't resolve route for %s: %d\n", + libcfs_nid2str(peer->ibp_nid), rc); + } + kiblnd_peer_connect_failed(peer, 1, rc); + kiblnd_peer_decref(peer); + return rc; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ROUTE_ERROR: + peer = (kib_peer_t *)cmid->context; + CDEBUG(D_NETERROR, "%s: ROUTE ERROR %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH); + kiblnd_peer_decref(peer); + return -EHOSTUNREACH; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + peer = (kib_peer_t *)cmid->context; + CDEBUG(D_NET,"%s Route resolved: %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + + if (event->status == 0) + return kiblnd_active_connect(cmid); + + CDEBUG(D_NETERROR, "Can't resolve route for %s: %d\n", + libcfs_nid2str(peer->ibp_nid), event->status); + kiblnd_peer_connect_failed(peer, 1, event->status); + kiblnd_peer_decref(peer); + return event->status; /* rc != 0 destroys cmid */ + + case RDMA_CM_EVENT_UNREACHABLE: + conn = (kib_conn_t *)cmid->context; + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || + conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); + CDEBUG(D_NETERROR, "%s: UNREACHABLE %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); + kiblnd_connreq_done(conn, -ENETDOWN); + kiblnd_conn_decref(conn); + 
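
NB: every arm of kiblnd_cm_callback() above honours one contract: returning non-zero tells the CM to destroy the id, so each error path must first drop whatever references hang off it. The shape, as a stub with illustrative event names:

        #include <stdio.h>

        enum ev { EV_FATAL, EV_OK };

        static int cm_callback(enum ev event, void *context)
        {
                (void)context;
                switch (event) {
                case EV_FATAL:
                        /* drop refs held via the id before returning */
                        return -1;      /* non-zero: CM destroys the id */
                case EV_OK:
                default:
                        return 0;       /* id stays alive */
                }
        }

        int main(void)
        {
                printf("%d %d\n", cm_callback(EV_FATAL, NULL),
                       cm_callback(EV_OK, NULL));       /* -1 0 */
                return 0;
        }
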
return 0; + + case RDMA_CM_EVENT_CONNECT_ERROR: + conn = (kib_conn_t *)cmid->context; + LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || + conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); + CDEBUG(D_NETERROR, "%s: CONNECT ERROR %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); + kiblnd_connreq_done(conn, -ENOTCONN); + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_REJECTED: + conn = (kib_conn_t *)cmid->context; + switch (conn->ibc_state) { + default: + LBUG(); + + case IBLND_CONN_PASSIVE_WAIT: + CERROR ("%s: REJECTED %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + event->status); + kiblnd_connreq_done(conn, -ECONNRESET); + break; + + case IBLND_CONN_ACTIVE_CONNECT: + kiblnd_rejected(conn, event->status, + event->private_data, + event->private_data_len); + break; + } + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_ESTABLISHED: + conn = (kib_conn_t *)cmid->context; + switch (conn->ibc_state) { + default: + LBUG(); + + case IBLND_CONN_PASSIVE_WAIT: + CDEBUG(D_NET, "ESTABLISHED (passive): %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_connreq_done(conn, 0); + break; + + case IBLND_CONN_ACTIVE_CONNECT: + CDEBUG(D_NET, "ESTABLISHED(active): %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_check_connreply(conn, + event->private_data, + event->private_data_len); + break; + } + /* net keeps its ref on conn! */ + return 0; + + case RDMA_CM_EVENT_DISCONNECTED: + conn = (kib_conn_t *)cmid->context; + if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { + CERROR("%s DISCONNECTED\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kiblnd_connreq_done(conn, -ECONNRESET); + } else { + kiblnd_close_conn(conn, 0); + } + kiblnd_conn_decref(conn); + return 0; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + LCONSOLE_ERROR("Received notification of device removal\n"); + LCONSOLE_ERROR("Please shutdown LNET to allow this to proceed\n"); + /* Can't remove network from underneath LNET for now, so I have + * to ignore this */ + return 0; + } +} + +int +kiblnd_check_txs (kib_conn_t *conn, struct list_head *txs) +{ + kib_tx_t *tx; + struct list_head *ttmp; + int timed_out = 0; + + spin_lock(&conn->ibc_lock); + + list_for_each (ttmp, txs) { + tx = list_entry (ttmp, kib_tx_t, tx_list); + + if (txs != &conn->ibc_active_txs) { + LASSERT (tx->tx_queued); + } else { + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_waiting || tx->tx_sending != 0); + } + + if (time_after_eq (jiffies, tx->tx_deadline)) { + timed_out = 1; + break; + } + } + + spin_unlock(&conn->ibc_lock); + return timed_out; +} + +int +kiblnd_conn_timed_out (kib_conn_t *conn) +{ + return kiblnd_check_txs(conn, &conn->ibc_tx_queue) || + kiblnd_check_txs(conn, &conn->ibc_tx_queue_rsrvd) || + kiblnd_check_txs(conn, &conn->ibc_tx_queue_nocred) || + kiblnd_check_txs(conn, &conn->ibc_active_txs); +} + +void +kiblnd_check_conns (int idx) +{ + struct list_head *peers = &kiblnd_data.kib_peers[idx]; + struct list_head *ptmp; + kib_peer_t *peer; + kib_conn_t *conn; + struct list_head *ctmp; + unsigned long flags; + + again: + /* NB. We expect to have a look at all the peers and not find any + * rdmas to time out, so we just use a shared lock while we + * take a look... 
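
NB: time_after_eq() above is the kernel's wrap-safe jiffies comparison; the trick is comparing via signed subtraction, so a deadline set just before the counter wraps still orders correctly. For 32-bit counters:

        #include <stdint.h>
        #include <stdio.h>

        static int after_eq(uint32_t a, uint32_t b)
        {
                return (int32_t)(a - b) >= 0;   /* wrap-safe: signed diff */
        }

        int main(void)
        {
                uint32_t deadline = 0xfffffff0u;        /* just before wrap */

                printf("%d\n", after_eq(0xffffffe0u, deadline)); /* 0: early */
                printf("%d\n", after_eq(0x00000010u, deadline)); /* 1: past it,
                                                                  * across wrap */
                return 0;
        }
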
*/ + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); + + list_for_each (ptmp, peers) { + peer = list_entry (ptmp, kib_peer_t, ibp_list); + + list_for_each (ctmp, &peer->ibp_conns) { + conn = list_entry (ctmp, kib_conn_t, ibc_list); + + LASSERT (conn->ibc_state == IBLND_CONN_ESTABLISHED); + + /* In case we have enough credits to return via a + * NOOP, but there were no non-blocking tx descs + * free to do it last time... */ + kiblnd_check_sends(conn); + + if (!kiblnd_conn_timed_out(conn)) + continue; + + /* Handle timeout by closing the whole connection. We + * can only be sure RDMA activity has ceased once the + * QP has been modified. */ + + kiblnd_conn_addref(conn); /* 1 ref for me... */ + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, + flags); + + CERROR("Timed out RDMA with %s\n", + libcfs_nid2str(peer->ibp_nid)); + + kiblnd_close_conn(conn, -ETIMEDOUT); + kiblnd_conn_decref(conn); /* ...until here */ + + /* start again now I've dropped the lock */ + goto again; + } + } + + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); +} + +void +kiblnd_disconnect_conn (kib_conn_t *conn) +{ + LASSERT (!in_interrupt()); + LASSERT (current == kiblnd_data.kib_connd); + LASSERT (conn->ibc_state == IBLND_CONN_CLOSING); + + rdma_disconnect(conn->ibc_cmid); + kiblnd_finalise_conn(conn); + + kiblnd_peer_notify(conn->ibc_peer); +} + +int +kiblnd_connd (void *arg) +{ + wait_queue_t wait; + unsigned long flags; + kib_conn_t *conn; + int timeout; + int i; + int dropped_lock; + int peer_index = 0; + unsigned long deadline = jiffies; + + cfs_daemonize ("kiblnd_connd"); + cfs_block_allsigs (); + + init_waitqueue_entry (&wait, current); + kiblnd_data.kib_connd = current; + + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + + while (!kiblnd_data.kib_shutdown) { + + dropped_lock = 0; + + if (!list_empty (&kiblnd_data.kib_connd_zombies)) { + conn = list_entry (kiblnd_data.kib_connd_zombies.next, + kib_conn_t, ibc_list); + list_del (&conn->ibc_list); + + spin_unlock_irqrestore (&kiblnd_data.kib_connd_lock, flags); + dropped_lock = 1; + + kiblnd_destroy_conn(conn); + + spin_lock_irqsave (&kiblnd_data.kib_connd_lock, flags); + } + + if (!list_empty (&kiblnd_data.kib_connd_conns)) { + conn = list_entry (kiblnd_data.kib_connd_conns.next, + kib_conn_t, ibc_list); + list_del (&conn->ibc_list); + + spin_unlock_irqrestore (&kiblnd_data.kib_connd_lock, flags); + dropped_lock = 1; + + kiblnd_disconnect_conn(conn); + kiblnd_conn_decref(conn); + + spin_lock_irqsave (&kiblnd_data.kib_connd_lock, flags); + } + + /* careful with the jiffy wrap... */ + timeout = (int)(deadline - jiffies); + if (timeout <= 0) { + const int n = 4; + const int p = 1; + int chunk = kiblnd_data.kib_peer_hash_size; + + spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); + dropped_lock = 1; + + /* Time to check for RDMA timeouts on a few more + * peers: I do checks every 'p' seconds on a + * proportion of the peer table and I need to check + * every connection 'n' times within a timeout + * interval, to ensure I detect a timeout on any + * connection within (n+1)/n times the timeout + * interval. 
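
NB, a worked example of the chunk formula with illustrative numbers: with n = 4, p = 1, a 1024-entry peer hash and timeout = 50s, chunk = 1024 * 4 * 1 / 50 = 81 lists per one-second tick, so the whole table is swept about every 1024 / 81 ~= 13 ticks, i.e. roughly 4 sweeps per 50s timeout, which is exactly the "check every connection n times per timeout interval" target described above.
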
*/ + + if (*kiblnd_tunables.kib_timeout > n * p) + chunk = (chunk * n * p) / + *kiblnd_tunables.kib_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + kiblnd_check_conns(peer_index); + peer_index = (peer_index + 1) % + kiblnd_data.kib_peer_hash_size; + } + + deadline += p * HZ; + spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); + } + + if (dropped_lock) + continue; + + /* Nothing to do for 'timeout' */ + set_current_state (TASK_INTERRUPTIBLE); + add_wait_queue (&kiblnd_data.kib_connd_waitq, &wait); + spin_unlock_irqrestore (&kiblnd_data.kib_connd_lock, flags); + + schedule_timeout (timeout); + + set_current_state (TASK_RUNNING); + remove_wait_queue (&kiblnd_data.kib_connd_waitq, &wait); + spin_lock_irqsave (&kiblnd_data.kib_connd_lock, flags); + } + + spin_unlock_irqrestore (&kiblnd_data.kib_connd_lock, flags); + + kiblnd_thread_fini(); + return (0); +} + +void +kiblnd_qp_event(struct ib_event *event, void *arg) +{ + kib_conn_t *conn = arg; + + switch (event->event) { + case IB_EVENT_COMM_EST: + CDEBUG(D_NET, "%s established\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return; + + default: + CERROR("%s: Async QP event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); + return; + } +} + +void +kiblnd_complete (struct ib_wc *wc) +{ + switch (kiblnd_wreqid2type(wc->wr_id)) { + default: + LBUG(); + + case IBLND_WID_RDMA: + /* We only get RDMA completion notification if it fails. All + * subsequent work items, including the final SEND will fail + * too. However we can't print out any more info about the + * failing RDMA because 'tx' might be back on the idle list or + * even reused already if we didn't manage to post all our work + * items */ + CDEBUG(D_NETERROR, "RDMA (tx: %p) failed: %d\n", + kiblnd_wreqid2ptr(wc->wr_id), wc->status); + return; + + case IBLND_WID_TX: + kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status); + return; + + case IBLND_WID_RX: + kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status, + wc->byte_len); + return; + } +} + +void +kiblnd_cq_completion (struct ib_cq *cq, void *arg) +{ + /* NB I'm not allowed to schedule this conn once its refcount has + * reached 0. Since fundamentally I'm racing with scheduler threads + * consuming my CQ I could be called after all completions have + * occurred. But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0 + * and this CQ is about to be destroyed so I NOOP. 
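
NB: the scheduler below polls the CQ, re-arms the completion notification, then polls once more; a completion can slip in between the final poll and the re-arm, and the extra poll is what closes that race. The dance in miniature, with stubs standing in for ib_poll_cq()/ib_req_notify_cq():

        #include <stdio.h>

        static int pending = 1;                 /* one completion queued */

        static int poll_one(void) { int got = pending; pending = 0; return got; }
        static void arm_notify(void) { /* request the next interrupt here */ }

        static int drain(void)
        {
                int got = poll_one();

                if (!got) {
                        arm_notify();
                        got = poll_one();       /* re-check after arming */
                }
                return got;
        }

        int main(void)
        {
                printf("%d %d\n", drain(), drain());    /* 1 0 */
                return 0;
        }
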
*/ + kib_conn_t *conn = (kib_conn_t *)arg; + unsigned long flags; + + LASSERT (cq == conn->ibc_cq); + + spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags); + + conn->ibc_ready = 1; + + if (!conn->ibc_scheduled && + (conn->ibc_nrx > 0 || + conn->ibc_nsends_posted > 0)) { + kiblnd_conn_addref(conn); /* +1 ref for sched_conns */ + conn->ibc_scheduled = 1; + list_add_tail(&conn->ibc_sched_list, + &kiblnd_data.kib_sched_conns); + wake_up(&kiblnd_data.kib_sched_waitq); + } + + spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, flags); +} + +void +kiblnd_cq_event(struct ib_event *event, void *arg) +{ + kib_conn_t *conn = arg; + + CERROR("%s: async CQ event type %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event); +} + +int +kiblnd_scheduler(void *arg) +{ + long id = (long)arg; + wait_queue_t wait; + char name[16]; + unsigned long flags; + kib_conn_t *conn; + struct ib_wc wc; + int rc; + int did_something; + int busy_loops = 0; + + snprintf(name, sizeof(name), "kiblnd_sd_%02ld", id); + cfs_daemonize(name); + cfs_block_allsigs(); + + init_waitqueue_entry(&wait, current); + + spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags); + + while (!kiblnd_data.kib_shutdown) { + if (busy_loops++ >= IBLND_RESCHED) { + spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, + flags); + + our_cond_resched(); + busy_loops = 0; + + spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags); + } + + did_something = 0; + + if (!list_empty(&kiblnd_data.kib_sched_conns)) { + conn = list_entry(kiblnd_data.kib_sched_conns.next, + kib_conn_t, ibc_sched_list); + /* take over kib_sched_conns' ref on conn... */ + LASSERT(conn->ibc_scheduled); + list_del(&conn->ibc_sched_list); + conn->ibc_ready = 0; + + spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, + flags); + + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + if (rc == 0) { + rc = ib_req_notify_cq(conn->ibc_cq, + IB_CQ_NEXT_COMP); + LASSERT (rc >= 0); + + rc = ib_poll_cq(conn->ibc_cq, 1, &wc); + } + + LASSERT (rc >= 0); + + spin_lock_irqsave(&kiblnd_data.kib_sched_lock, + flags); + + if (rc != 0 || conn->ibc_ready) { + /* There may be another completion waiting; get + * another scheduler to check while I handle + * this one... */ + kiblnd_conn_addref(conn); /* +1 ref for sched_conns */ + list_add_tail(&conn->ibc_sched_list, + &kiblnd_data.kib_sched_conns); + wake_up(&kiblnd_data.kib_sched_waitq); + } else { + conn->ibc_scheduled = 0; + } + + if (rc != 0) { + spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, + flags); + + kiblnd_complete(&wc); + + spin_lock_irqsave(&kiblnd_data.kib_sched_lock, + flags); + } + + kiblnd_conn_decref(conn); /* ...drop my ref from above */ + did_something = 1; + } + + if (did_something) + continue; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&kiblnd_data.kib_sched_waitq, &wait); + spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, flags); + + schedule(); + busy_loops = 0; + + remove_wait_queue(&kiblnd_data.kib_sched_waitq, &wait); + set_current_state(TASK_RUNNING); + spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags); + } + + spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, flags); + + kiblnd_thread_fini(); + return (0); +} diff --git a/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/lnet/klnds/o2iblnd/o2iblnd_modparams.c new file mode 100644 index 0000000..ef42ffe --- /dev/null +++ b/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -0,0 +1,218 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2006 Cluster File Systems, Inc. 
+ * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "o2iblnd.h" + +static int service = 987; +CFS_MODULE_PARM(service, "i", int, 0444, + "service number (within RDMA_PS_TCP)"); + +static int cksum = 0; +CFS_MODULE_PARM(cksum, "i", int, 0644, + "set non-zero to enable message (not RDMA) checksums"); + +static int timeout = 50; +CFS_MODULE_PARM(timeout, "i", int, 0644, + "timeout (seconds)"); + +static int ntx = 256; +CFS_MODULE_PARM(ntx, "i", int, 0444, + "# of message descriptors"); + +static int credits = 64; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = 8; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, + "# concurrent sends to 1 peer"); + +static char *ipif_name = "ib0"; +CFS_MODULE_PARM(ipif_name, "s", charp, 0444, + "IPoIB interface name"); + +static int retry_count = 5; +CFS_MODULE_PARM(retry_count, "i", int, 0644, + "Retransmissions when no ACK received"); + +static int rnr_retry_count = 6; +CFS_MODULE_PARM(rnr_retry_count, "i", int, 0644, + "RNR retransmissions"); + +static int keepalive = 100; +CFS_MODULE_PARM(keepalive, "i", int, 0644, + "Idle time in seconds before sending a keepalive"); + +static int ib_mtu = 0; +CFS_MODULE_PARM(ib_mtu, "i", int, 0444, + "IB MTU 256/512/1024/2048/4096"); + +#if IBLND_MAP_ON_DEMAND +static int concurrent_sends = IBLND_RX_MSGS; +#else +static int concurrent_sends = IBLND_MSG_QUEUE_SIZE; +#endif +CFS_MODULE_PARM(concurrent_sends, "i", int, 0444, + "send work-queue sizing"); + +#if IBLND_MAP_ON_DEMAND +static int fmr_pool_size = 512; +CFS_MODULE_PARM(fmr_pool_size, "i", int, 0444, + "size of the fmr pool (>= ntx)"); + +static int fmr_flush_trigger = 384; +CFS_MODULE_PARM(fmr_flush_trigger, "i", int, 0444, + "# dirty FMRs that triggers pool flush"); + +static int fmr_cache = 1; +CFS_MODULE_PARM(fmr_cache, "i", int, 0444, + "non-zero to enable FMR caching"); +#endif + +kib_tunables_t kiblnd_tunables = { + .kib_service = &service, + .kib_cksum = &cksum, + .kib_timeout = &timeout, + .kib_keepalive = &keepalive, + .kib_ntx = &ntx, + .kib_credits = &credits, + .kib_peercredits = &peer_credits, + .kib_default_ipif = &ipif_name, + .kib_retry_count = &retry_count, + .kib_rnr_retry_count = &rnr_retry_count, + .kib_concurrent_sends = &concurrent_sends, + .kib_ib_mtu = &ib_mtu, +#if IBLND_MAP_ON_DEMAND + .kib_fmr_pool_size = &fmr_pool_size, + .kib_fmr_flush_trigger = &fmr_flush_trigger, + .kib_fmr_cache = &fmr_cache, +#endif +}; + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + +static char ipif_basename_space[32]; + +static ctl_table kiblnd_ctl_table[] = { + {1, "service", &service, + sizeof(int), 0444, NULL, &proc_dointvec}, + {2, "cksum", &cksum, + sizeof(int), 0644, NULL, &proc_dointvec}, + {3, "timeout", &timeout, + sizeof(int), 0644, NULL, &proc_dointvec}, + {4, "ntx", &ntx, + sizeof(int), 0444, NULL, &proc_dointvec}, + {5, "credits", 
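The CFS_MODULE_PARM declarations above become ordinary module parameters, so every tunable can be set at load time. A hypothetical modprobe configuration line, assuming the module ends up installed under the name ko2iblnd (the module name is not visible in this hunk):

options ko2iblnd timeout=50 credits=64 peer_credits=8 ipif_name=ib0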
&credits, + sizeof(int), 0444, NULL, &proc_dointvec}, + {6, "peer_credits", &peer_credits, + sizeof(int), 0444, NULL, &proc_dointvec}, + {7, "ipif_name", ipif_basename_space, + sizeof(ipif_basename_space), 0444, NULL, &proc_dostring}, + {8, "retry_count", &retry_count, + sizeof(int), 0644, NULL, &proc_dointvec}, + {9, "rnr_retry_count", &rnr_retry_count, + sizeof(int), 0644, NULL, &proc_dointvec}, + {10, "keepalive", &keepalive, + sizeof(int), 0644, NULL, &proc_dointvec}, + {11, "concurrent_sends", &concurrent_sends, + sizeof(int), 0644, NULL, &proc_dointvec}, + {12, "ib_mtu", &ib_mtu, + sizeof(int), 0444, NULL, &proc_dointvec}, +#if IBLND_MAP_ON_DEMAND + {13, "fmr_pool_size", &fmr_pool_size, + sizeof(int), 0444, NULL, &proc_dointvec}, + {14, "fmr_flush_trigger", &fmr_flush_trigger, + sizeof(int), 0444, NULL, &proc_dointvec}, + {15, "fmr_cache", &fmr_cache, + sizeof(int), 0444, NULL, &proc_dointvec}, +#endif + {0} +}; + +static ctl_table kiblnd_top_ctl_table[] = { + {203, "o2iblnd", NULL, 0, 0555, kiblnd_ctl_table}, + {0} +}; + +void +kiblnd_initstrtunable(char *space, char *str, int size) +{ + strncpy(space, str, size); + space[size-1] = 0; +} + +void +kiblnd_sysctl_init (void) +{ + kiblnd_initstrtunable(ipif_basename_space, ipif_name, + sizeof(ipif_basename_space)); + + kiblnd_tunables.kib_sysctl = + register_sysctl_table(kiblnd_top_ctl_table, 0); + + if (kiblnd_tunables.kib_sysctl == NULL) + CWARN("Can't setup /proc tunables\n"); +} + +void +kiblnd_sysctl_fini (void) +{ + if (kiblnd_tunables.kib_sysctl != NULL) + unregister_sysctl_table(kiblnd_tunables.kib_sysctl); +} + +#else + +void +kiblnd_sysctl_init (void) +{ +} + +void +kiblnd_sysctl_fini (void) +{ +} + +#endif + +int +kiblnd_tunables_init (void) +{ + kiblnd_sysctl_init(); + + if (*kiblnd_tunables.kib_concurrent_sends > IBLND_RX_MSGS) + *kiblnd_tunables.kib_concurrent_sends = IBLND_RX_MSGS; + if (*kiblnd_tunables.kib_concurrent_sends < IBLND_MSG_QUEUE_SIZE) + *kiblnd_tunables.kib_concurrent_sends = IBLND_MSG_QUEUE_SIZE; + + return 0; +} + +void +kiblnd_tunables_fini (void) +{ + kiblnd_sysctl_fini(); +} + + + diff --git a/lnet/klnds/openiblnd/Makefile.in b/lnet/klnds/openiblnd/Makefile.in index 9b8ed5d..86fa9cd 100644 --- a/lnet/klnds/openiblnd/Makefile.in +++ b/lnet/klnds/openiblnd/Makefile.in @@ -1,5 +1,5 @@ -MODULES := kopenibnal -kopenibnal-objs := openibnal.o openibnal_cb.o +MODULES := kopeniblnd +kopeniblnd-objs := openiblnd.o openiblnd_cb.o openiblnd_modparams.o EXTRA_POST_CFLAGS := @OPENIBCPPFLAGS@ diff --git a/lnet/klnds/openiblnd/autoMakefile.am b/lnet/klnds/openiblnd/autoMakefile.am index 6f56421..b4e0fb7 100644 --- a/lnet/klnds/openiblnd/autoMakefile.am +++ b/lnet/klnds/openiblnd/autoMakefile.am @@ -4,12 +4,10 @@ # See the file COPYING in this distribution if MODULES -if !CRAY_PORTALS -if BUILD_OPENIBNAL -modulenet_DATA = kopenibnal$(KMODEXT) -endif +if BUILD_OPENIBLND +modulenet_DATA = kopeniblnd$(KMODEXT) endif endif -MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES = $(kopenibnal-objs:%.o=%.c) openibnal.h +MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ +DIST_SOURCES = $(kopeniblnd-objs:%.o=%.c) openiblnd.h diff --git a/lnet/klnds/openiblnd/openiblnd.c b/lnet/klnds/openiblnd/openiblnd.c index d0385a6..4219005 100644 --- a/lnet/klnds/openiblnd/openiblnd.c +++ b/lnet/klnds/openiblnd/openiblnd.c @@ -21,42 +21,24 @@ * */ -#include "openibnal.h" +#include "openiblnd.h" -nal_t kibnal_api; -ptl_handle_ni_t kibnal_ni; -kib_data_t kibnal_data; -kib_tunables_t kibnal_tunables; - -#define IBNAL_SYSCTL 202 - -enum { - 
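kiblnd_tunables_init() above silently clamps a user-supplied concurrent_sends into the only range the queue sizing can support. The same logic as a standalone sketch, with placeholder constants standing in for the real IBLND_MSG_QUEUE_SIZE/IBLND_RX_MSGS macros:

#include <stdio.h>

#define MSG_QUEUE_SIZE  8       /* stand-in for IBLND_MSG_QUEUE_SIZE */
#define RX_MSGS         16      /* stand-in for IBLND_RX_MSGS */

static int clamp_concurrent_sends(int requested)
{
        if (requested > RX_MSGS)
                return RX_MSGS;
        if (requested < MSG_QUEUE_SIZE)
                return MSG_QUEUE_SIZE;
        return requested;
}

int main(void)
{
        printf("%d %d %d\n", clamp_concurrent_sends(4),
               clamp_concurrent_sends(12), clamp_concurrent_sends(99));
        return 0;
}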
IBNAL_SYSCTL_TIMEOUT=1, - IBNAL_SYSCTL_LISTENER_TIMEOUT, - IBNAL_SYSCTL_BACKLOG, - IBNAL_SYSCTL_PORT -}; - -static ctl_table kibnal_ctl_table[] = { - {IBNAL_SYSCTL_TIMEOUT, "timeout", - &kibnal_tunables.kib_io_timeout, sizeof (int), - 0644, NULL, &proc_dointvec}, - {IBNAL_SYSCTL_LISTENER_TIMEOUT, "listener_timeout", - &kibnal_tunables.kib_listener_timeout, sizeof(int), - 0644, NULL, &proc_dointvec}, - {IBNAL_SYSCTL_BACKLOG, "backlog", - &kibnal_tunables.kib_backlog, sizeof(int), - 0644, NULL, kibnal_listener_procint}, - {IBNAL_SYSCTL_PORT, "port", - &kibnal_tunables.kib_port, sizeof(int), - 0644, NULL, kibnal_listener_procint}, - { 0 } +lnd_t the_kiblnd = { +#ifdef USING_TSAPI + .lnd_type = CIBLND, +#else + .lnd_type = OPENIBLND, +#endif + .lnd_startup = kibnal_startup, + .lnd_shutdown = kibnal_shutdown, + .lnd_ctl = kibnal_ctl, + .lnd_send = kibnal_send, + .lnd_recv = kibnal_recv, + .lnd_eager_recv = kibnal_eager_recv, + .lnd_accept = kibnal_accept, }; -static ctl_table kibnal_top_ctl_table[] = { - {IBNAL_SYSCTL, "openibnal", NULL, 0, 0555, kibnal_ctl_table}, - { 0 } -}; +kib_data_t kibnal_data; __u32 kibnal_cksum (void *ptr, int nob) @@ -79,31 +61,35 @@ kibnal_init_msg(kib_msg_t *msg, int type, int body_nob) } void -kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid, __u64 dststamp) +kibnal_pack_msg(kib_msg_t *msg, int version, int credits, + lnet_nid_t dstnid, __u64 dststamp) { /* CAVEAT EMPTOR! all message fields not set here should have been * initialised previously. */ msg->ibm_magic = IBNAL_MSG_MAGIC; - msg->ibm_version = IBNAL_MSG_VERSION; + msg->ibm_version = version; /* ibm_type */ msg->ibm_credits = credits; /* ibm_nob */ msg->ibm_cksum = 0; - msg->ibm_srcnid = kibnal_lib.libnal_ni.ni_pid.nid; + msg->ibm_srcnid = lnet_ptlcompat_srcnid(kibnal_data.kib_ni->ni_nid, + dstnid); msg->ibm_srcstamp = kibnal_data.kib_incarnation; msg->ibm_dstnid = dstnid; msg->ibm_dststamp = dststamp; -#if IBNAL_CKSUM - /* NB ibm_cksum zero while computing cksum */ - msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob); -#endif + + if (*kibnal_tunables.kib_cksum) { + /* NB ibm_cksum zero while computing cksum */ + msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob); + } } int -kibnal_unpack_msg(kib_msg_t *msg, int nob) +kibnal_unpack_msg(kib_msg_t *msg, int expected_version, int nob) { const int hdr_size = offsetof(kib_msg_t, ibm_u); __u32 msg_cksum; + int msg_version; int flip; int msg_nob; @@ -121,9 +107,12 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob) return -EPROTO; } - if (msg->ibm_version != - (flip ? __swab16(IBNAL_MSG_VERSION) : IBNAL_MSG_VERSION)) { - CERROR("Bad version: %d\n", msg->ibm_version); + msg_version = flip ? __swab16(msg->ibm_version) : msg->ibm_version; + if ((expected_version == 0) ? 
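kibnal_pack_msg()/kibnal_unpack_msg() above use the 32-bit magic as an endianness probe: an opposite-endian peer's magic arrives byte-swapped, telling the receiver to swab every multi-byte field while leaving the magic itself unflipped as a clue. A self-contained sketch of the idea; the magic value is an arbitrary example, not the wire constant:

#include <stdint.h>
#include <stdio.h>

#define MSG_MAGIC 0x12345678u   /* example value only */

static uint32_t swab32(uint32_t v)
{
        return (v >> 24) | ((v >> 8) & 0xff00) |
               ((v << 8) & 0xff0000) | (v << 24);
}

static uint16_t swab16(uint16_t v)
{
        return (uint16_t)((v >> 8) | (v << 8));
}

int main(void)
{
        /* what a flipped peer's header looks like on arrival */
        uint32_t magic = swab32(MSG_MAGIC);
        uint16_t version = swab16(2);

        if (magic == MSG_MAGIC)
                printf("same endianness, version %u\n", version);
        else if (magic == swab32(MSG_MAGIC))
                printf("flipped peer, version %u\n", swab16(version));
        else
                printf("bad magic: not my protocol\n");
        return 0;
}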
+ (msg_version != IBNAL_MSG_VERSION && + msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) : + (msg_version != expected_version)) { + CERROR("Bad version: %x\n", msg_version); return -EPROTO; } @@ -151,7 +140,7 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob) if (flip) { /* leave magic unflipped as a clue to peer endianness */ - __swab16s(&msg->ibm_version); + msg->ibm_version = msg_version; LASSERT (sizeof(msg->ibm_type) == 1); LASSERT (sizeof(msg->ibm_credits) == 1); msg->ibm_nob = msg_nob; @@ -161,8 +150,8 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob) __swab64s(&msg->ibm_dststamp); } - if (msg->ibm_srcnid == PTL_NID_ANY) { - CERROR("Bad src nid: "LPX64"\n", msg->ibm_srcnid); + if (msg->ibm_srcnid == LNET_NID_ANY) { + CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); return -EPROTO; } @@ -235,223 +224,11 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob) } int -kibnal_sock_write (struct socket *sock, void *buffer, int nob) -{ - int rc; - mm_segment_t oldmm = get_fs(); - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = MSG_DONTWAIT - }; - - /* We've set up the socket's send buffer to be large enough for - * everything we send, so a single non-blocking send should - * complete without error. */ - - set_fs(KERNEL_DS); - rc = sock_sendmsg(sock, &msg, iov.iov_len); - set_fs(oldmm); - - if (rc == nob) - return 0; - - if (rc >= 0) - return -EAGAIN; - - return rc; -} - -int -kibnal_sock_read (struct socket *sock, void *buffer, int nob, int timeout) -{ - int rc; - mm_segment_t oldmm = get_fs(); - long ticks = timeout * HZ; - unsigned long then; - struct timeval tv; - - LASSERT (nob > 0); - LASSERT (ticks > 0); - - for (;;) { - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = 0 - }; - - /* Set receive timeout to remaining time */ - tv = (struct timeval) { - .tv_sec = ticks / HZ, - .tv_usec = ((ticks % HZ) * 1000000) / HZ - }; - set_fs(KERNEL_DS); - rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, - (char *)&tv, sizeof(tv)); - set_fs(oldmm); - if (rc != 0) { - CERROR("Can't set socket recv timeout %d: %d\n", - timeout, rc); - return rc; - } - - set_fs(KERNEL_DS); - then = jiffies; - rc = sock_recvmsg(sock, &msg, iov.iov_len, 0); - ticks -= jiffies - then; - set_fs(oldmm); - - if (rc < 0) - return rc; - - if (rc == 0) - return -ECONNABORTED; - - buffer = ((char *)buffer) + rc; - nob -= rc; - - if (nob == 0) - return 0; - - if (ticks <= 0) - return -ETIMEDOUT; - } -} - -int -kibnal_create_sock(struct socket **sockp) -{ - struct socket *sock; - int rc; - int option; - mm_segment_t oldmm = get_fs(); - - rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock); - if (rc != 0) { - CERROR("Can't create socket: %d\n", rc); - return rc; - } - - /* Ensure sends will not block */ - option = 2 * sizeof(kib_msg_t); - set_fs(KERNEL_DS); - rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, - (char *)&option, sizeof(option)); - set_fs(oldmm); - if (rc != 0) { - CERROR("Can't set send buffer %d: %d\n", option, rc); - goto failed; - } - - option = 1; - set_fs(KERNEL_DS); - rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&option, sizeof(option)); - set_fs(oldmm); - if (rc != 0) { - CERROR("Can't set SO_REUSEADDR: %d\n", rc); - goto failed; - } - - 
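The removed kibnal_sock_read() above shows the contract its libcfs_sock_read() replacement still honours: keep receiving until exactly nob bytes have arrived, charging the time spent in each receive against one overall deadline so a trickling sender cannot stall the thread forever. A POSIX-socket analogue, assuming a userspace fd rather than a kernel struct socket:

#include <errno.h>
#include <sys/socket.h>
#include <sys/time.h>

/* read exactly 'nob' bytes or fail; 0 on success, -errno otherwise */
int read_full(int fd, void *buf, size_t nob, int timeout_sec)
{
        long left_us = timeout_sec * 1000000L;
        struct timeval tv, t0, t1;
        ssize_t rc;

        while (nob > 0) {
                if (left_us <= 0)
                        return -ETIMEDOUT;

                /* bound this receive by the time remaining */
                tv.tv_sec = left_us / 1000000L;
                tv.tv_usec = left_us % 1000000L;
                if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO,
                               &tv, sizeof(tv)) != 0)
                        return -errno;

                gettimeofday(&t0, NULL);
                rc = recv(fd, buf, nob, 0);
                gettimeofday(&t1, NULL);
                left_us -= (t1.tv_sec - t0.tv_sec) * 1000000L +
                           (t1.tv_usec - t0.tv_usec);

                if (rc < 0)
                        return -errno;
                if (rc == 0)
                        return -ECONNABORTED;   /* peer closed early */
                buf = (char *)buf + rc;
                nob -= (size_t)rc;
        }
        return 0;
}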
*sockp = sock; - return 0; - - failed: - sock_release(sock); - return rc; -} - -void -kibnal_pause(int ticks) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(ticks); -} - -int -kibnal_connect_sock(kib_peer_t *peer, struct socket **sockp) -{ - struct sockaddr_in locaddr; - struct sockaddr_in srvaddr; - struct socket *sock; - unsigned int port; - int rc; - - for (port = 1023; port >= 512; port--) { - - memset(&locaddr, 0, sizeof(locaddr)); - locaddr.sin_family = AF_INET; - locaddr.sin_port = htons(port); - locaddr.sin_addr.s_addr = htonl(INADDR_ANY); - - memset (&srvaddr, 0, sizeof (srvaddr)); - srvaddr.sin_family = AF_INET; - srvaddr.sin_port = htons (peer->ibp_port); - srvaddr.sin_addr.s_addr = htonl (peer->ibp_ip); - - rc = kibnal_create_sock(&sock); - if (rc != 0) - return rc; - - rc = sock->ops->bind(sock, - (struct sockaddr *)&locaddr, sizeof(locaddr)); - if (rc != 0) { - sock_release(sock); - - if (rc == -EADDRINUSE) { - CDEBUG(D_NET, "Port %d already in use\n", port); - continue; - } - - CERROR("Can't bind to reserved port %d: %d\n", port, rc); - return rc; - } - - rc = sock->ops->connect(sock, - (struct sockaddr *)&srvaddr, sizeof(srvaddr), - 0); - if (rc == 0) { - *sockp = sock; - return 0; - } - - sock_release(sock); - - if (rc != -EADDRNOTAVAIL) { - CERROR("Can't connect port %d to %u.%u.%u.%u/%d: %d\n", - port, HIPQUAD(peer->ibp_ip), peer->ibp_port, rc); - return rc; - } - - CDEBUG(D_NET, "Port %d not available for %u.%u.%u.%u/%d\n", - port, HIPQUAD(peer->ibp_ip), peer->ibp_port); - } - - /* all ports busy */ - return -EHOSTUNREACH; -} - -int kibnal_make_svcqry (kib_conn_t *conn) { kib_peer_t *peer = conn->ibc_peer; + int version = IBNAL_MSG_VERSION; + int msg_version; kib_msg_t *msg; struct socket *sock; int rc; @@ -460,115 +237,235 @@ kibnal_make_svcqry (kib_conn_t *conn) LASSERT (conn->ibc_connreq != NULL); msg = &conn->ibc_connreq->cr_msg; + again: kibnal_init_msg(msg, IBNAL_MSG_SVCQRY, 0); - kibnal_pack_msg(msg, 0, peer->ibp_nid, 0); + kibnal_pack_msg(msg, version, 0, peer->ibp_nid, 0); - rc = kibnal_connect_sock(peer, &sock); + rc = lnet_connect(&sock, peer->ibp_nid, + 0, peer->ibp_ip, peer->ibp_port); if (rc != 0) - return rc; + return -ECONNABORTED; - rc = kibnal_sock_write(sock, msg, msg->ibm_nob); + rc = libcfs_sock_write(sock, msg, msg->ibm_nob, + lnet_acceptor_timeout()); + if (rc != 0) { + CERROR("Error %d sending svcqry to %s at %u.%u.%u.%u/%d\n", + rc, libcfs_nid2str(peer->ibp_nid), + HIPQUAD(peer->ibp_ip), peer->ibp_port); + goto out; + } + + /* The first 6 bytes are invariably MAGIC + proto version */ + rc = libcfs_sock_read(sock, msg, 6, *kibnal_tunables.kib_timeout); if (rc != 0) { - CERROR("Error %d sending svcqry to " - LPX64"@%u.%u.%u.%u/%d\n", rc, - peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port); + CERROR("Error %d receiving svcrsp from %s at %u.%u.%u.%u/%d\n", + rc, libcfs_nid2str(peer->ibp_nid), + HIPQUAD(peer->ibp_ip), peer->ibp_port); + goto out; + } + + if (msg->ibm_magic != IBNAL_MSG_MAGIC && + msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { + CERROR("Bad magic: %08x from %s at %u.%u.%u.%u/%d\n", + msg->ibm_magic, libcfs_nid2str(peer->ibp_nid), + HIPQUAD(peer->ibp_ip), peer->ibp_port); + rc = -EPROTO; goto out; } - nob = offsetof(kib_msg_t, ibm_u) + sizeof(msg->ibm_u.svcrsp); - rc = kibnal_sock_read(sock, msg, nob, kibnal_tunables.kib_io_timeout); + msg_version = (msg->ibm_magic == IBNAL_MSG_MAGIC) ? 
+ msg->ibm_version : __swab16(msg->ibm_version); + if (msg_version != version) { + if (version == IBNAL_MSG_VERSION) { + /* retry with previous version */ + libcfs_sock_release(sock); + version = IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD; + goto again; + } + + CERROR("Bad version %x from %s at %u.%u.%u.%u/%d\n", + msg_version, libcfs_nid2str(peer->ibp_nid), + HIPQUAD(peer->ibp_ip), peer->ibp_port); + rc = -EPROTO; + goto out; + } + + /* Read in the rest of the message now we know the expected format */ + nob = offsetof(kib_msg_t, ibm_u) + sizeof(kib_svcrsp_t); + rc = libcfs_sock_read(sock, ((char *)msg) + 6, nob - 6, + *kibnal_tunables.kib_timeout); if (rc != 0) { - CERROR("Error %d receiving svcrsp from " - LPX64"@%u.%u.%u.%u/%d\n", rc, - peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port); + CERROR("Error %d receiving svcrsp from %s at %u.%u.%u.%u/%d\n", + rc, libcfs_nid2str(peer->ibp_nid), + HIPQUAD(peer->ibp_ip), peer->ibp_port); goto out; } - rc = kibnal_unpack_msg(msg, nob); + rc = kibnal_unpack_msg(msg, version, nob); if (rc != 0) { - CERROR("Error %d unpacking svcrsp from " - LPX64"@%u.%u.%u.%u/%d\n", rc, - peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port); + CERROR("Error %d unpacking svcrsp from %s at %u.%u.%u.%u/%d\n", + rc, libcfs_nid2str(peer->ibp_nid), + HIPQUAD(peer->ibp_ip), peer->ibp_port); goto out; } if (msg->ibm_type != IBNAL_MSG_SVCRSP) { - CERROR("Unexpected response type %d from " - LPX64"@%u.%u.%u.%u/%d\n", msg->ibm_type, - peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port); + CERROR("Unexpected response type %d from %s at %u.%u.%u.%u/%d\n", + msg->ibm_type, libcfs_nid2str(peer->ibp_nid), + HIPQUAD(peer->ibp_ip), peer->ibp_port); rc = -EPROTO; goto out; } - if (msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid || + if (!lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid, + msg->ibm_dstnid) || msg->ibm_dststamp != kibnal_data.kib_incarnation) { - CERROR("Unexpected dst NID/stamp "LPX64"/"LPX64" from " - LPX64"@%u.%u.%u.%u/%d\n", - msg->ibm_dstnid, msg->ibm_dststamp, - peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port); + CERROR("Unexpected dst NID/stamp %s/"LPX64" from " + "%s at %u.%u.%u.%u/%d\n", + libcfs_nid2str(msg->ibm_dstnid), msg->ibm_dststamp, + libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), + peer->ibp_port); rc = -EPROTO; goto out; } - if (msg->ibm_srcnid != peer->ibp_nid) { - CERROR("Unexpected src NID "LPX64" from " - LPX64"@%u.%u.%u.%u/%d\n", msg->ibm_srcnid, - peer->ibp_nid, HIPQUAD(peer->ibp_ip), peer->ibp_port); + if (!lnet_ptlcompat_matchnid(peer->ibp_nid, msg->ibm_srcnid)) { + CERROR("Unexpected src NID %s from %s at %u.%u.%u.%u/%d\n", + libcfs_nid2str(msg->ibm_srcnid), + libcfs_nid2str(peer->ibp_nid), + HIPQUAD(peer->ibp_ip), peer->ibp_port); rc = -EPROTO; goto out; } conn->ibc_incarnation = msg->ibm_srcstamp; conn->ibc_connreq->cr_svcrsp = msg->ibm_u.svcrsp; + conn->ibc_version = version; + out: - sock_release(sock); + libcfs_sock_release(sock); return rc; } void kibnal_handle_svcqry (struct socket *sock) { - struct sockaddr_in addr; __u32 peer_ip; unsigned int peer_port; kib_msg_t *msg; __u64 srcnid; __u64 srcstamp; - int len; + int version; + int reject = 0; int rc; - len = sizeof(addr); - rc = sock->ops->getname(sock, (struct sockaddr *)&addr, &len, 2); + rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); if (rc != 0) { CERROR("Can't get peer's IP: %d\n", rc); return; } - peer_ip = ntohl(addr.sin_addr.s_addr); - peer_port = ntohs(addr.sin_port); - - if (peer_port >= 1024) { - CERROR("Refusing unprivileged connection from 
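kibnal_make_svcqry() above negotiates the wire version by retrying: it queries with the current protocol version and, if the peer replies in a different one, drops the socket and repeats the query once, speaking the older version. A skeleton of that control flow with stand-in constants and a mocked peer:

#include <stdio.h>

#define VERSION_CURRENT 0x12    /* stand-in version numbers */
#define VERSION_OLD     0x11

static int query_peer(int version)
{
        return VERSION_OLD;     /* mock: pretend the peer is old */
}

static int negotiate(void)
{
        int version = VERSION_CURRENT;
        int got;

again:
        got = query_peer(version);
        if (got != version) {
                if (version == VERSION_CURRENT) {
                        version = VERSION_OLD;  /* one downgrade retry */
                        goto again;
                }
                return -1;      /* -EPROTO in the real code */
        }
        return version;
}

int main(void)
{
        printf("agreed on version %#x\n", negotiate());
        return 0;
}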
%u.%u.%u.%u/%d\n", + LIBCFS_ALLOC(msg, sizeof(*msg)); + if (msg == NULL) { + CERROR("Can't allocate msgs for %u.%u.%u.%u/%d\n", HIPQUAD(peer_ip), peer_port); return; } + + rc = libcfs_sock_read(sock, &msg->ibm_magic, sizeof(msg->ibm_magic), + lnet_acceptor_timeout()); + if (rc != 0) { + CERROR("Error %d receiving svcqry(1) from %u.%u.%u.%u/%d\n", + rc, HIPQUAD(peer_ip), peer_port); + goto out; + } - PORTAL_ALLOC(msg, sizeof(*msg)); - if (msg == NULL) { - CERROR("Can't allocate msgs for %u.%u.%u.%u/%d\n", - HIPQUAD(peer_ip), peer_port); + if (msg->ibm_magic != IBNAL_MSG_MAGIC && + msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { + /* Unexpected magic! */ + if (the_lnet.ln_ptlcompat == 0) { + if (msg->ibm_magic == LNET_PROTO_MAGIC || + msg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) { + /* future protocol version compatibility! + * When LNET unifies protocols over all LNDs, + * the first thing sent will be a version + * query. I send back a reply in my current + * protocol to tell her I'm "old" */ + kibnal_init_msg(msg, 0, 0); + kibnal_pack_msg(msg, IBNAL_MSG_VERSION, 0, + LNET_NID_ANY, 0); + reject = 1; + goto reply; + } + + CERROR ("Bad magic(1) %#08x (%#08x expected) from " + "%u.%u.%u.%u/%d\n", msg->ibm_magic, + IBNAL_MSG_MAGIC, HIPQUAD(peer_ip), peer_port); + goto out; + } + + /* When portals compatibility is set, I may be passed a new + * connection "blindly" by the acceptor, and I have to + * determine if my peer has sent an acceptor connection request + * or not. */ + rc = lnet_accept(kibnal_data.kib_ni, sock, msg->ibm_magic); + if (rc != 0) + goto out; + + /* It was an acceptor connection request! + * Now I should see my magic... */ + rc = libcfs_sock_read(sock, &msg->ibm_magic, + sizeof(msg->ibm_magic), + lnet_acceptor_timeout()); + if (rc != 0) { + CERROR("Error %d receiving svcqry(2) from %u.%u.%u.%u/%d\n", + rc, HIPQUAD(peer_ip), peer_port); + goto out; + } + + if (msg->ibm_magic != IBNAL_MSG_MAGIC && + msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { + CERROR ("Bad magic(2) %#08x (%#08x expected) from " + "%u.%u.%u.%u/%d\n", msg->ibm_magic, + IBNAL_MSG_MAGIC, HIPQUAD(peer_ip), peer_port); + goto out; + } + } + + /* Now check version */ + + rc = libcfs_sock_read(sock, &msg->ibm_version, sizeof(msg->ibm_version), + lnet_acceptor_timeout()); + if (rc != 0) { + CERROR("Error %d receiving svcqry(3) from %u.%u.%u.%u/%d\n", + rc, HIPQUAD(peer_ip), peer_port); goto out; } + + version = (msg->ibm_magic == IBNAL_MSG_MAGIC) ? 
+ msg->ibm_version : __swab16(msg->ibm_version); + /* Peer is a different protocol version: reply in my current protocol + * to tell her I'm "old" */ + if (version != IBNAL_MSG_VERSION && + version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) { + kibnal_init_msg(msg, 0, 0); + kibnal_pack_msg(msg, IBNAL_MSG_VERSION, 0, LNET_NID_ANY, 0); + reject = 1; + goto reply; + } - rc = kibnal_sock_read(sock, msg, offsetof(kib_msg_t, ibm_u), - kibnal_tunables.kib_listener_timeout); + /* Now read in all the rest */ + rc = libcfs_sock_read(sock, &msg->ibm_type, + offsetof(kib_msg_t, ibm_u) - + offsetof(kib_msg_t, ibm_type), + lnet_acceptor_timeout()); if (rc != 0) { - CERROR("Error %d receiving svcqry from %u.%u.%u.%u/%d\n", + CERROR("Error %d receiving svcqry(4) from %u.%u.%u.%u/%d\n", rc, HIPQUAD(peer_ip), peer_port); goto out; } - rc = kibnal_unpack_msg(msg, offsetof(kib_msg_t, ibm_u)); + rc = kibnal_unpack_msg(msg, version, offsetof(kib_msg_t, ibm_u)); if (rc != 0) { CERROR("Error %d unpacking svcqry from %u.%u.%u.%u/%d\n", rc, HIPQUAD(peer_ip), peer_port); @@ -581,10 +478,11 @@ kibnal_handle_svcqry (struct socket *sock) goto out; } - if (msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid) { - CERROR("Unexpected dstnid "LPX64"(expected "LPX64" " - "from %u.%u.%u.%u/%d\n", msg->ibm_dstnid, - kibnal_lib.libnal_ni.ni_pid.nid, + if (!lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid, + msg->ibm_dstnid)) { + CERROR("Unexpected dstnid %s: expected %s from %u.%u.%u.%u/%d\n", + libcfs_nid2str(msg->ibm_dstnid), + libcfs_nid2str(kibnal_data.kib_ni->ni_nid), HIPQUAD(peer_ip), peer_port); goto out; } @@ -599,277 +497,50 @@ kibnal_handle_svcqry (struct socket *sock) sizeof(kibnal_data.kib_svc_gid)); msg->ibm_u.svcrsp.ibsr_svc_pkey = kibnal_data.kib_svc_pkey; - kibnal_pack_msg(msg, 0, srcnid, srcstamp); - - rc = kibnal_sock_write (sock, msg, msg->ibm_nob); - if (rc != 0) { + kibnal_pack_msg(msg, version, 0, srcnid, srcstamp); + + reply: + rc = libcfs_sock_write (sock, msg, msg->ibm_nob, + lnet_acceptor_timeout()); + if (!reject && rc != 0) { + /* Only complain if we're not rejecting */ CERROR("Error %d replying to svcqry from %u.%u.%u.%u/%d\n", rc, HIPQUAD(peer_ip), peer_port); goto out; } out: - PORTAL_FREE(msg, sizeof(*msg)); + LIBCFS_FREE(msg, sizeof(*msg)); } void kibnal_free_acceptsock (kib_acceptsock_t *as) { - sock_release(as->ibas_sock); - PORTAL_FREE(as, sizeof(*as)); + libcfs_sock_release(as->ibas_sock); + LIBCFS_FREE(as, sizeof(*as)); } int -kibnal_ip_listener(void *arg) +kibnal_accept(lnet_ni_t *ni, struct socket *sock) { - struct sockaddr_in addr; - wait_queue_t wait; - struct socket *sock; kib_acceptsock_t *as; - int port; - char name[16]; - int rc; unsigned long flags; - /* Parent thread holds kib_nid_mutex, and is, or is about to - * block on kib_listener_signal */ - - port = kibnal_tunables.kib_port; - snprintf(name, sizeof(name), "kibnal_lstn%03d", port); - kportal_daemonize(name); - kportal_blockallsigs(); - - init_waitqueue_entry(&wait, current); - - rc = kibnal_create_sock(&sock); - if (rc != 0) - goto out_0; - - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_port = htons(port); - addr.sin_addr.s_addr = INADDR_ANY; - - rc = sock->ops->bind(sock, (struct sockaddr *)&addr, sizeof(addr)); - if (rc != 0) { - CERROR("Can't bind to port %d\n", port); - goto out_1; + LIBCFS_ALLOC(as, sizeof(*as)); + if (as == NULL) { + CERROR("Out of Memory\n"); + return -ENOMEM; } - rc = sock->ops->listen(sock, kibnal_tunables.kib_backlog); - if (rc != 0) { - CERROR("Can't set listen backlog %d: 
%d\n", - kibnal_tunables.kib_backlog, rc); - goto out_1; - } - - LASSERT (kibnal_data.kib_listener_sock == NULL); - kibnal_data.kib_listener_sock = sock; - - /* unblock waiting parent */ - LASSERT (kibnal_data.kib_listener_shutdown == 0); - up(&kibnal_data.kib_listener_signal); - - /* Wake me any time something happens on my socket */ - add_wait_queue(sock->sk->sk_sleep, &wait); - as = NULL; - - while (kibnal_data.kib_listener_shutdown == 0) { - - if (as == NULL) { - PORTAL_ALLOC(as, sizeof(*as)); - if (as == NULL) { - CERROR("Out of Memory: pausing...\n"); - kibnal_pause(HZ); - continue; - } - as->ibas_sock = NULL; - } - - if (as->ibas_sock == NULL) { - as->ibas_sock = sock_alloc(); - if (as->ibas_sock == NULL) { - CERROR("Can't allocate socket: pausing...\n"); - kibnal_pause(HZ); - continue; - } - /* XXX this should add a ref to sock->ops->owner, if - * TCP could be a module */ - as->ibas_sock->type = sock->type; - as->ibas_sock->ops = sock->ops; - } + as->ibas_sock = sock; - set_current_state(TASK_INTERRUPTIBLE); - - rc = sock->ops->accept(sock, as->ibas_sock, O_NONBLOCK); - - /* Sleep for socket activity? */ - if (rc == -EAGAIN && - kibnal_data.kib_listener_shutdown == 0) - schedule(); - - set_current_state(TASK_RUNNING); - - if (rc == 0) { - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - - list_add_tail(&as->ibas_list, - &kibnal_data.kib_connd_acceptq); - - spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); - wake_up(&kibnal_data.kib_connd_waitq); - - as = NULL; - continue; - } - - if (rc != -EAGAIN) { - CERROR("Accept failed: %d, pausing...\n", rc); - kibnal_pause(HZ); - } - } - - if (as != NULL) { - if (as->ibas_sock != NULL) - sock_release(as->ibas_sock); - PORTAL_FREE(as, sizeof(*as)); - } - - rc = 0; - remove_wait_queue(sock->sk->sk_sleep, &wait); - out_1: - sock_release(sock); - kibnal_data.kib_listener_sock = NULL; - out_0: - /* set completion status and unblock thread waiting for me - * (parent on startup failure, executioner on normal shutdown) */ - kibnal_data.kib_listener_shutdown = rc; - up(&kibnal_data.kib_listener_signal); - - return 0; -} - -int -kibnal_start_ip_listener (void) -{ - long pid; - int rc; - - CDEBUG(D_NET, "Starting listener\n"); - - /* Called holding kib_nid_mutex: listener stopped */ - LASSERT (kibnal_data.kib_listener_sock == NULL); - - kibnal_data.kib_listener_shutdown = 0; - pid = kernel_thread(kibnal_ip_listener, NULL, 0); - if (pid < 0) { - CERROR("Can't spawn listener: %ld\n", pid); - return (int)pid; - } - - /* Block until listener has started up. */ - down(&kibnal_data.kib_listener_signal); - - rc = kibnal_data.kib_listener_shutdown; - LASSERT ((rc != 0) == (kibnal_data.kib_listener_sock == NULL)); - - CDEBUG((rc == 0) ? D_WARNING : D_ERROR, - "Listener %s: pid:%ld port:%d backlog:%d\n", - (rc == 0) ? "started OK" : "startup failed", - pid, kibnal_tunables.kib_port, kibnal_tunables.kib_backlog); - - return rc; -} - -void -kibnal_stop_ip_listener(int clear_acceptq) -{ - struct list_head zombie_accepts; - kib_acceptsock_t *as; - unsigned long flags; - - CDEBUG(D_NET, "Stopping listener\n"); - - /* Called holding kib_nid_mutex: listener running */ - LASSERT (kibnal_data.kib_listener_sock != NULL); - - kibnal_data.kib_listener_shutdown = 1; - wake_up_all(kibnal_data.kib_listener_sock->sk->sk_sleep); - - /* Block until listener has torn down. 
*/ - down(&kibnal_data.kib_listener_signal); - - LASSERT (kibnal_data.kib_listener_sock == NULL); - CWARN("Listener stopped\n"); - - if (!clear_acceptq) - return; - - /* Close any unhandled accepts */ spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - - list_add(&zombie_accepts, &kibnal_data.kib_connd_acceptq); - list_del_init(&kibnal_data.kib_connd_acceptq); + + list_add_tail(&as->ibas_list, &kibnal_data.kib_connd_acceptq); + wake_up(&kibnal_data.kib_connd_waitq); spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); - - while (!list_empty(&zombie_accepts)) { - as = list_entry(zombie_accepts.next, - kib_acceptsock_t, ibas_list); - list_del(&as->ibas_list); - kibnal_free_acceptsock(as); - } -} - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8) -int -kibnal_listener_procint(ctl_table *table, int write, struct file *filp, - void *buffer, size_t *lenp) -#else -int -kibnal_listener_procint(ctl_table *table, int write, struct file *filp, - void *buffer, size_t *lenp, loff_t *ppos) -#endif -{ - int *tunable = (int *)table->data; - int old_val; - int rc; - - /* No race with nal initialisation since the nal is setup all the time - * it's loaded. When that changes, change this! */ - LASSERT (kibnal_data.kib_init == IBNAL_INIT_ALL); - - down(&kibnal_data.kib_nid_mutex); - - LASSERT (tunable == &kibnal_tunables.kib_port || - tunable == &kibnal_tunables.kib_backlog); - old_val = *tunable; - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8) - rc = proc_dointvec(table, write, filp, buffer, lenp); -#else - rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); -#endif - if (write && - (*tunable != old_val || - kibnal_data.kib_listener_sock == NULL)) { - - if (kibnal_data.kib_listener_sock != NULL) - kibnal_stop_ip_listener(0); - - rc = kibnal_start_ip_listener(); - if (rc != 0) { - CERROR("Unable to restart listener with new tunable:" - " reverting to old value\n"); - *tunable = old_val; - kibnal_start_ip_listener(); - } - } - - up(&kibnal_data.kib_nid_mutex); - - LASSERT (kibnal_data.kib_init == IBNAL_INIT_ALL); - return rc; + return 0; } int @@ -929,79 +600,20 @@ kibnal_stop_ib_listener (void) } int -kibnal_set_mynid (ptl_nid_t nid) +kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid) { - lib_ni_t *ni = &kibnal_lib.libnal_ni; - int rc; - - CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", - nid, ni->ni_pid.nid); + kib_peer_t *peer; + unsigned long flags; + int rc; - down (&kibnal_data.kib_nid_mutex); + LASSERT (nid != LNET_NID_ANY); - if (nid == kibnal_data.kib_nid) { - /* no change of NID */ - up (&kibnal_data.kib_nid_mutex); - return (0); + LIBCFS_ALLOC(peer, sizeof (*peer)); + if (peer == NULL) { + CERROR("Cannot allocate peer\n"); + return -ENOMEM; } - CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", - kibnal_data.kib_nid, nid); - - if (kibnal_data.kib_listener_sock != NULL) - kibnal_stop_ip_listener(1); - - if (kibnal_data.kib_listen_handle != NULL) - kibnal_stop_ib_listener(); - - ni->ni_pid.nid = nid; - kibnal_data.kib_incarnation++; - mb(); - /* Delete all existing peers and their connections after new - * NID/incarnation set to ensure no old connections in our brave new - * world. 
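The new kibnal_accept() above is deliberately tiny: it only wraps the socket, queues it on kib_connd_acceptq under the connd lock, and wakes a connd, which then does the blocking svcqry work. A userspace sketch of that handoff, using pthreads in place of the kernel spinlock and wait queue:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct acceptsock {
        struct acceptsock *next;
        int fd;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;
static struct acceptsock *acceptq;

static int accept_one(int fd)           /* acceptor: cheap and quick */
{
        struct acceptsock *as = malloc(sizeof(*as));

        if (as == NULL)
                return -1;              /* -ENOMEM in the real code */
        as->fd = fd;
        pthread_mutex_lock(&lock);
        as->next = acceptq;
        acceptq = as;
        pthread_cond_signal(&waitq);    /* wake the connd */
        pthread_mutex_unlock(&lock);
        return 0;
}

static void *connd(void *arg)           /* consumer: does the slow work */
{
        struct acceptsock *as;

        pthread_mutex_lock(&lock);
        while (acceptq == NULL)
                pthread_cond_wait(&waitq, &lock);
        as = acceptq;
        acceptq = as->next;
        pthread_mutex_unlock(&lock);
        printf("connd handling fd %d\n", as->fd);
        free(as);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, connd, NULL);
        accept_one(42);
        pthread_join(t, NULL);
        return 0;
}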
*/ - kibnal_del_peer (PTL_NID_ANY, 0); - - if (ni->ni_pid.nid != PTL_NID_ANY) { - /* got a new NID to install */ - rc = kibnal_start_ib_listener(); - if (rc != 0) { - CERROR("Can't start IB listener: %d\n", rc); - goto failed_0; - } - - rc = kibnal_start_ip_listener(); - if (rc != 0) { - CERROR("Can't start IP listener: %d\n", rc); - goto failed_1; - } - } - - up(&kibnal_data.kib_nid_mutex); - return 0; - - failed_1: - kibnal_stop_ib_listener(); - failed_0: - ni->ni_pid.nid = PTL_NID_ANY; - kibnal_data.kib_incarnation++; - mb(); - kibnal_del_peer (PTL_NID_ANY, 0); - up(&kibnal_data.kib_nid_mutex); - return rc; -} - -kib_peer_t * -kibnal_create_peer (ptl_nid_t nid) -{ - kib_peer_t *peer; - - LASSERT (nid != PTL_NID_ANY); - - PORTAL_ALLOC (peer, sizeof (*peer)); - if (peer == NULL) - return (NULL); - memset(peer, 0, sizeof(*peer)); /* zero flags etc */ peer->ibp_nid = nid; @@ -1012,53 +624,63 @@ kibnal_create_peer (ptl_nid_t nid) INIT_LIST_HEAD (&peer->ibp_tx_queue); INIT_LIST_HEAD (&peer->ibp_connd_list); /* not queued for connecting */ - peer->ibp_reconnect_time = jiffies; - peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + peer->ibp_error = 0; + peer->ibp_last_alive = cfs_time_current(); + peer->ibp_reconnect_interval = 0; /* OK to connect at any time */ - atomic_inc (&kibnal_data.kib_npeers); - CDEBUG(D_NET, "peer %p "LPX64"\n", peer, nid); + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - return (peer); + if (atomic_read(&kibnal_data.kib_npeers) >= + *kibnal_tunables.kib_concurrent_peers) { + rc = -EOVERFLOW; /* !! but at least it distinguishes */ + } else if (kibnal_data.kib_nonewpeers) { + rc = -ESHUTDOWN; /* shutdown has started */ + } else { + rc = 0; + /* npeers only grows with kib_global_lock held */ + atomic_inc(&kibnal_data.kib_npeers); + } + + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + + if (rc != 0) { + CERROR("Can't create peer: %s\n", + (rc == -ESHUTDOWN) ? "shutting down" : + "too many peers"); + LIBCFS_FREE(peer, sizeof(*peer)); + } else { + *peerp = peer; + } + + return rc; } void kibnal_destroy_peer (kib_peer_t *peer) { - CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ibp_nid, peer); + CDEBUG (D_NET, "peer %s %p deleted\n", + libcfs_nid2str(peer->ibp_nid), peer); LASSERT (atomic_read (&peer->ibp_refcount) == 0); LASSERT (peer->ibp_persistence == 0); LASSERT (!kibnal_peer_active(peer)); LASSERT (peer->ibp_connecting == 0); + LASSERT (peer->ibp_accepting == 0); LASSERT (list_empty (&peer->ibp_connd_list)); LASSERT (list_empty (&peer->ibp_conns)); LASSERT (list_empty (&peer->ibp_tx_queue)); - PORTAL_FREE (peer, sizeof (*peer)); + LIBCFS_FREE (peer, sizeof (*peer)); /* NB a peer's connections keep a reference on their peer until * they are destroyed, so we can be assured that _all_ state to do * with this peer has been cleaned up when its refcount drops to * zero. 
*/ - atomic_dec (&kibnal_data.kib_npeers); -} - -void -kibnal_put_peer (kib_peer_t *peer) -{ - CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n", - peer, peer->ibp_nid, - atomic_read (&peer->ibp_refcount)); - - LASSERT (atomic_read (&peer->ibp_refcount) > 0); - if (!atomic_dec_and_test (&peer->ibp_refcount)) - return; - - kibnal_destroy_peer (peer); + atomic_dec(&kibnal_data.kib_npeers); } kib_peer_t * -kibnal_find_peer_locked (ptl_nid_t nid) +kibnal_find_peer_locked (lnet_nid_t nid) { struct list_head *peer_list = kibnal_nid2peerlist (nid); struct list_head *tmp; @@ -1070,20 +692,19 @@ kibnal_find_peer_locked (ptl_nid_t nid) LASSERT (peer->ibp_persistence != 0 || /* persistent peer */ peer->ibp_connecting != 0 || /* creating conns */ + peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); /* active conn */ if (peer->ibp_nid != nid) continue; - CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n", - peer, nid, atomic_read (&peer->ibp_refcount)); return (peer); } return (NULL); } kib_peer_t * -kibnal_get_peer (ptl_nid_t nid) +kibnal_get_peer (lnet_nid_t nid) { kib_peer_t *peer; unsigned long flags; @@ -1091,7 +712,7 @@ kibnal_get_peer (ptl_nid_t nid) read_lock_irqsave(&kibnal_data.kib_global_lock, flags); peer = kibnal_find_peer_locked (nid); if (peer != NULL) /* +1 ref for caller? */ - atomic_inc (&peer->ibp_refcount); + kibnal_peer_addref(peer); read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return (peer); @@ -1106,11 +727,11 @@ kibnal_unlink_peer_locked (kib_peer_t *peer) LASSERT (kibnal_peer_active(peer)); list_del_init (&peer->ibp_list); /* lose peerlist's ref */ - kibnal_put_peer (peer); + kibnal_peer_decref(peer); } int -kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, int *portp, +kibnal_get_peer_info (int index, lnet_nid_t *nidp, __u32 *ipp, int *portp, int *persistencep) { kib_peer_t *peer; @@ -1127,6 +748,7 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, int *portp, peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || + peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); if (index-- > 0) @@ -1148,24 +770,25 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, int *portp, } int -kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip, int port) +kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port) { unsigned long flags; kib_peer_t *peer; kib_peer_t *peer2; + int rc; - if (nid == PTL_NID_ANY) + if (nid == LNET_NID_ANY) return (-EINVAL); - peer = kibnal_create_peer (nid); - if (peer == NULL) - return (-ENOMEM); + rc = kibnal_create_peer (&peer, nid); + if (rc != 0) + return rc; write_lock_irqsave (&kibnal_data.kib_global_lock, flags); peer2 = kibnal_find_peer_locked (nid); if (peer2 != NULL) { - kibnal_put_peer (peer); + kibnal_peer_decref(peer); peer = peer2; } else { /* peer table takes existing ref on peer */ @@ -1182,19 +805,13 @@ kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip, int port) } void -kibnal_del_peer_locked (kib_peer_t *peer, int single_share) +kibnal_del_peer_locked (kib_peer_t *peer) { struct list_head *ctmp; struct list_head *cnxt; kib_conn_t *conn; - if (!single_share) - peer->ibp_persistence = 0; - else if (peer->ibp_persistence > 0) - peer->ibp_persistence--; - - if (peer->ibp_persistence != 0) - return; + peer->ibp_persistence = 0; if (list_empty(&peer->ibp_conns)) { kibnal_unlink_peer_locked(peer); @@ -1212,9 +829,10 @@ kibnal_del_peer_locked (kib_peer_t *peer, int single_share) } int -kibnal_del_peer (ptl_nid_t nid, 
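kibnal_add_persistent_peer() above, together with kibnal_create_peer(), uses the allocate-outside, recheck-inside pattern: the peer is built without the global lock held, the table is re-searched under the write lock, and the fresh copy is dropped if another thread inserted the same NID first. A simplified single-list sketch:

#include <pthread.h>
#include <stdlib.h>

struct peer {
        struct peer *next;
        unsigned long nid;
        int refs;
};

static pthread_rwlock_t glock = PTHREAD_RWLOCK_INITIALIZER;
static struct peer *peers;

static struct peer *find_locked(unsigned long nid)
{
        struct peer *p;

        for (p = peers; p != NULL; p = p->next)
                if (p->nid == nid)
                        return p;
        return NULL;
}

static struct peer *add_peer(unsigned long nid)
{
        struct peer *p, *p2;

        p = calloc(1, sizeof(*p));      /* allocate before locking */
        if (p == NULL)
                return NULL;
        p->nid = nid;

        pthread_rwlock_wrlock(&glock);
        p2 = find_locked(nid);
        if (p2 != NULL) {
                free(p);                /* lost the race: drop ours */
                p = p2;
        } else {
                p->refs = 1;            /* table takes its own ref */
                p->next = peers;
                peers = p;
        }
        p->refs++;                      /* +1 ref for the caller */
        pthread_rwlock_unlock(&glock);
        return p;
}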
int single_share) +kibnal_del_peer (lnet_nid_t nid) { unsigned long flags; + CFS_LIST_HEAD (zombies); struct list_head *ptmp; struct list_head *pnxt; kib_peer_t *peer; @@ -1225,7 +843,7 @@ kibnal_del_peer (ptl_nid_t nid, int single_share) write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - if (nid != PTL_NID_ANY) + if (nid != LNET_NID_ANY) lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; else { lo = 0; @@ -1237,21 +855,27 @@ kibnal_del_peer (ptl_nid_t nid, int single_share) peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || + peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); - if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid)) + if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid)) continue; - kibnal_del_peer_locked (peer, single_share); - rc = 0; /* matched something */ + if (!list_empty(&peer->ibp_tx_queue)) { + LASSERT (list_empty(&peer->ibp_conns)); - if (single_share) - goto out; + list_splice_init(&peer->ibp_tx_queue, &zombies); + } + + kibnal_del_peer_locked (peer); + rc = 0; /* matched something */ } } - out: + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + kibnal_txlist_done(&zombies, -EIO); + return (rc); } @@ -1273,6 +897,7 @@ kibnal_get_conn_by_idx (int index) peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence > 0 || peer->ibp_connecting != 0 || + peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); list_for_each (ctmp, &peer->ibp_conns) { @@ -1280,10 +905,7 @@ kibnal_get_conn_by_idx (int index) continue; conn = list_entry (ctmp, kib_conn_t, ibc_list); - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); + kibnal_conn_addref(conn); read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return (conn); @@ -1310,7 +932,7 @@ kibnal_create_conn (void) struct ib_qp_attribute qp_attr; } params; - PORTAL_ALLOC (conn, sizeof (*conn)); + LIBCFS_ALLOC (conn, sizeof (*conn)); if (conn == NULL) { CERROR ("Can't allocate connection\n"); return (NULL); @@ -1319,14 +941,16 @@ kibnal_create_conn (void) /* zero flags, NULL pointers etc... 
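kibnal_del_peer() above splices each victim's queued txs onto a private zombie list while the write lock is held and only calls kibnal_txlist_done(..., -EIO) after dropping it, so completion handling never runs under the global lock. The idiom in miniature:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct tx {
        struct tx *next;
        int id;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct tx *queued;

static void fail_tx(struct tx *tx, int error)
{
        printf("tx %d completed with %d\n", tx->id, error);
}

static void del_all(void)
{
        struct tx *zombies, *tx;

        pthread_mutex_lock(&lock);
        zombies = queued;       /* list_splice_init() in the kernel */
        queued = NULL;
        pthread_mutex_unlock(&lock);

        while ((tx = zombies) != NULL) {        /* finish them unlocked */
                zombies = tx->next;
                fail_tx(tx, -EIO);
        }
}

int main(void)
{
        struct tx a = { NULL, 1 };
        struct tx b = { &a, 2 };

        queued = &b;
        del_all();
        return 0;
}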
*/ memset (conn, 0, sizeof (*conn)); + INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred); INIT_LIST_HEAD (&conn->ibc_tx_queue); + INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd); INIT_LIST_HEAD (&conn->ibc_active_txs); spin_lock_init (&conn->ibc_lock); atomic_inc (&kibnal_data.kib_nconns); /* well not really, but I call destroy() on failure, which decrements */ - PORTAL_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); + LIBCFS_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); if (conn->ibc_rxs == NULL) goto failed; memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t)); @@ -1360,11 +984,13 @@ kibnal_create_conn (void) } } + /* We can post up to IBLND_MSG_QUEUE_SIZE immediate/req messages and + * the same # of ack/nak/rdma+done messages */ + params.qp_create = (struct ib_qp_create_param) { .limit = { - /* Sends have an optional RDMA */ - .max_outstanding_send_request = 2 * IBNAL_MSG_QUEUE_SIZE, - .max_outstanding_receive_request = IBNAL_MSG_QUEUE_SIZE, + .max_outstanding_send_request = 3 * IBNAL_MSG_QUEUE_SIZE, + .max_outstanding_receive_request = IBNAL_RX_MSGS, .max_send_gather_element = 1, .max_receive_scatter_element = 1, }, @@ -1421,6 +1047,8 @@ kibnal_destroy_conn (kib_conn_t *conn) LASSERT (atomic_read (&conn->ibc_refcount) == 0); LASSERT (list_empty(&conn->ibc_tx_queue)); + LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd)); + LASSERT (list_empty(&conn->ibc_tx_queue_nocred)); LASSERT (list_empty(&conn->ibc_active_txs)); LASSERT (conn->ibc_nsends_posted == 0); LASSERT (conn->ibc_connreq == NULL); @@ -1446,13 +1074,13 @@ kibnal_destroy_conn (kib_conn_t *conn) kibnal_free_pages(conn->ibc_rx_pages); if (conn->ibc_rxs != NULL) - PORTAL_FREE(conn->ibc_rxs, + LIBCFS_FREE(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof(kib_rx_t)); if (conn->ibc_peer != NULL) - kibnal_put_peer(conn->ibc_peer); + kibnal_peer_decref(conn->ibc_peer); - PORTAL_FREE(conn, sizeof (*conn)); + LIBCFS_FREE(conn, sizeof (*conn)); atomic_dec(&kibnal_data.kib_nconns); @@ -1465,30 +1093,6 @@ kibnal_destroy_conn (kib_conn_t *conn) } } -void -kibnal_put_conn (kib_conn_t *conn) -{ - unsigned long flags; - - CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - - LASSERT (atomic_read (&conn->ibc_refcount) > 0); - if (!atomic_dec_and_test (&conn->ibc_refcount)) - return; - - /* last ref only goes on zombies */ - LASSERT (conn->ibc_state == IBNAL_CONN_ZOMBIE); - - spin_lock_irqsave (&kibnal_data.kib_reaper_lock, flags); - - list_add (&conn->ibc_list, &kibnal_data.kib_reaper_conns); - wake_up (&kibnal_data.kib_reaper_waitq); - - spin_unlock_irqrestore (&kibnal_data.kib_reaper_lock, flags); -} - int kibnal_close_peer_conns_locked (kib_peer_t *peer, int why) { @@ -1521,9 +1125,10 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) if (conn->ibc_incarnation == incarnation) continue; - CDEBUG(D_NET, "Closing stale conn %p nid:"LPX64 + CDEBUG(D_NET, "Closing stale conn %p nid: %s" " incarnation:"LPX64"("LPX64")\n", conn, - peer->ibp_nid, conn->ibc_incarnation, incarnation); + libcfs_nid2str(peer->ibp_nid), + conn->ibc_incarnation, incarnation); count++; kibnal_close_conn_locked (conn, -ESTALE); @@ -1533,7 +1138,7 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) } int -kibnal_close_matching_conns (ptl_nid_t nid) +kibnal_close_matching_conns (lnet_nid_t nid) { unsigned long flags; kib_peer_t *peer; @@ -1546,7 +1151,7 @@ kibnal_close_matching_conns (ptl_nid_t nid) write_lock_irqsave 
(&kibnal_data.kib_global_lock, flags); - if (nid != PTL_NID_ANY) + if (nid != LNET_NID_ANY) lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; else { lo = 0; @@ -1559,9 +1164,10 @@ kibnal_close_matching_conns (ptl_nid_t nid) peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || + peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); - if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid)) + if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid)) continue; count += kibnal_close_peer_conns_locked (peer, 0); @@ -1571,72 +1177,71 @@ kibnal_close_matching_conns (ptl_nid_t nid) write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); /* wildcards always succeed */ - if (nid == PTL_NID_ANY) + if (nid == LNET_NID_ANY) return (0); return (count == 0 ? -ENOENT : 0); } int -kibnal_cmd(struct portals_cfg *pcfg, void * private) +kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) { - int rc = -EINVAL; + struct libcfs_ioctl_data *data = arg; + int rc = -EINVAL; - LASSERT (pcfg != NULL); + LASSERT (ni == kibnal_data.kib_ni); - switch(pcfg->pcfg_command) { - case NAL_CMD_GET_PEER: { - ptl_nid_t nid = 0; + switch(cmd) { + case IOC_LIBCFS_GET_PEER: { + lnet_nid_t nid = 0; __u32 ip = 0; int port = 0; int share_count = 0; - rc = kibnal_get_peer_info(pcfg->pcfg_count, + rc = kibnal_get_peer_info(data->ioc_count, &nid, &ip, &port, &share_count); - pcfg->pcfg_nid = nid; - pcfg->pcfg_size = 0; - pcfg->pcfg_id = ip; - pcfg->pcfg_misc = port; - pcfg->pcfg_count = 0; - pcfg->pcfg_wait = share_count; + data->ioc_nid = nid; + data->ioc_count = share_count; + data->ioc_u32[0] = ip; + data->ioc_u32[1] = port; break; } - case NAL_CMD_ADD_PEER: { - rc = kibnal_add_persistent_peer (pcfg->pcfg_nid, - pcfg->pcfg_id, /* IP */ - pcfg->pcfg_misc); /* port */ + case IOC_LIBCFS_ADD_PEER: { + rc = kibnal_add_persistent_peer (data->ioc_nid, + data->ioc_u32[0], /* IP */ + data->ioc_u32[1]); /* port */ break; } - case NAL_CMD_DEL_PEER: { - rc = kibnal_del_peer (pcfg->pcfg_nid, - /* flags == single_share */ - pcfg->pcfg_flags != 0); + case IOC_LIBCFS_DEL_PEER: { + rc = kibnal_del_peer (data->ioc_nid); break; } - case NAL_CMD_GET_CONN: { - kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count); + case IOC_LIBCFS_GET_CONN: { + kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count); if (conn == NULL) rc = -ENOENT; else { rc = 0; - pcfg->pcfg_nid = conn->ibc_peer->ibp_nid; - pcfg->pcfg_id = 0; - pcfg->pcfg_misc = 0; - pcfg->pcfg_flags = 0; - kibnal_put_conn (conn); + data->ioc_nid = conn->ibc_peer->ibp_nid; + kibnal_conn_decref(conn); } break; } - case NAL_CMD_CLOSE_CONNECTION: { - rc = kibnal_close_matching_conns (pcfg->pcfg_nid); + case IOC_LIBCFS_CLOSE_CONNECTION: { + rc = kibnal_close_matching_conns (data->ioc_nid); break; } - case NAL_CMD_REGISTER_MYNID: { - if (pcfg->pcfg_nid == PTL_NID_ANY) + case IOC_LIBCFS_REGISTER_MYNID: { + /* Ignore if this is a noop */ + if (data->ioc_nid == ni->ni_nid) { + rc = 0; + } else { + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); rc = -EINVAL; - else - rc = kibnal_set_mynid (pcfg->pcfg_nid); + } break; } } @@ -1661,7 +1266,7 @@ kibnal_free_pages (kib_pages_t *p) if (p->ibp_pages[i] != NULL) __free_page(p->ibp_pages[i]); - PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); + LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); } int @@ -1672,7 +1277,7 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int access) int i; int 
rc; - PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); + LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); if (p == NULL) { CERROR ("Can't allocate buffer %d\n", npages); return (-ENOMEM); @@ -1690,7 +1295,7 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int access) } } - PORTAL_ALLOC(phys_pages, npages * sizeof(*phys_pages)); + LIBCFS_ALLOC(phys_pages, npages * sizeof(*phys_pages)); if (phys_pages == NULL) { CERROR ("Can't allocate physarray for %d pages\n", npages); kibnal_free_pages(p); @@ -1700,7 +1305,7 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int access) for (i = 0; i < npages; i++) { phys_pages[i].size = PAGE_SIZE; phys_pages[i].address = - kibnal_page2phys(p->ibp_pages[i]); + lnet_page2phys(p->ibp_pages[i]); } p->ibp_vaddr = 0; @@ -1713,7 +1318,7 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int access) &p->ibp_lkey, &p->ibp_rkey); - PORTAL_FREE(phys_pages, npages * sizeof(*phys_pages)); + LIBCFS_FREE(phys_pages, npages * sizeof(*phys_pages)); if (rc != 0) { CERROR ("Error %d mapping %d pages\n", rc, npages); @@ -1745,14 +1350,14 @@ kibnal_setup_tx_descs (void) LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, - IBNAL_TX_MSG_PAGES, + IBNAL_TX_MSG_PAGES(), 0); /* local read access only */ if (rc != 0) return (rc); vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr; - for (i = 0; i < IBNAL_TX_MSGS; i++) { + for (i = 0; i < IBNAL_TX_MSGS(); i++) { page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; tx = &kibnal_data.kib_tx_descs[i]; @@ -1760,21 +1365,15 @@ kibnal_setup_tx_descs (void) tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); tx->tx_vaddr = vaddr; - tx->tx_isnblk = (i >= IBNAL_NTX); tx->tx_mapped = KIB_TX_UNMAPPED; CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n", i, tx, tx->tx_msg, tx->tx_vaddr); - if (tx->tx_isnblk) - list_add (&tx->tx_list, - &kibnal_data.kib_idle_nblk_txs); - else - list_add (&tx->tx_list, - &kibnal_data.kib_idle_txs); + list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); vaddr += IBNAL_MSG_SIZE; - LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES); + LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES()); page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); @@ -1782,7 +1381,7 @@ kibnal_setup_tx_descs (void) if (page_offset == PAGE_SIZE) { page_offset = 0; ipage++; - LASSERT (ipage <= IBNAL_TX_MSG_PAGES); + LASSERT (ipage <= IBNAL_TX_MSG_PAGES()); } } @@ -1790,21 +1389,17 @@ kibnal_setup_tx_descs (void) } void -kibnal_api_shutdown (nal_t *nal) +kibnal_shutdown (lnet_ni_t *ni) { - int i; - int rc; - - if (nal->nal_refct != 0) { - /* This module got the first ref */ - PORTAL_MODULE_UNUSE; - return; - } + int i; + int rc; + unsigned long flags; CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); + atomic_read (&libcfs_kmemory)); - LASSERT(nal == &kibnal_api); + LASSERT(ni == kibnal_data.kib_ni); + LASSERT(ni->ni_data == &kibnal_data); switch (kibnal_data.kib_init) { default: @@ -1812,23 +1407,39 @@ kibnal_api_shutdown (nal_t *nal) LBUG(); case IBNAL_INIT_ALL: - /* stop calls to nal_cmd */ - libcfs_nal_cmd_unregister(OPENIBNAL); - /* No new peers */ + /* Prevent new peers from being created */ + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + kibnal_data.kib_nonewpeers = 1; + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - /* resetting my NID unadvertises me, removes my - * listener and nukes all current peers */ - kibnal_set_mynid (PTL_NID_ANY); + kibnal_stop_ib_listener(); + + /* 
Remove all existing peers from the peer table */ + kibnal_del_peer(LNET_NID_ANY); + + /* Wait for pending conn reqs to be handled */ + i = 2; + spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); + while (!list_empty(&kibnal_data.kib_connd_acceptq)) { + spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, + flags); + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */ + "waiting for conn reqs to clean up\n"); + cfs_pause(cfs_time_seconds(1)); + + spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); + } + spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); /* Wait for all peer state to clean up */ i = 2; - while (atomic_read (&kibnal_data.kib_npeers) != 0) { + while (atomic_read(&kibnal_data.kib_npeers) != 0) { i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ "waiting for %d peers to close down\n", - atomic_read (&kibnal_data.kib_npeers)); - set_current_state (TASK_INTERRUPTIBLE); - schedule_timeout (HZ); + atomic_read(&kibnal_data.kib_npeers)); + cfs_pause(cfs_time_seconds(1)); } /* fall through */ @@ -1854,14 +1465,10 @@ kibnal_api_shutdown (nal_t *nal) CERROR ("Destroy PD error: %d\n", rc); /* fall through */ - case IBNAL_INIT_LIB: - lib_fini(&kibnal_lib); - /* fall through */ - case IBNAL_INIT_DATA: /* Module refcount only gets to zero when all peers * have been closed so all lists must be empty */ - LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0); + LASSERT (atomic_read(&kibnal_data.kib_npeers) == 0); LASSERT (kibnal_data.kib_peers != NULL); for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { LASSERT (list_empty (&kibnal_data.kib_peers[i])); @@ -1885,8 +1492,7 @@ kibnal_api_shutdown (nal_t *nal) CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ "Waiting for %d threads to terminate\n", atomic_read (&kibnal_data.kib_nthreads)); - set_current_state (TASK_INTERRUPTIBLE); - schedule_timeout (HZ); + cfs_pause(cfs_time_seconds(1)); } /* fall through */ @@ -1895,57 +1501,135 @@ kibnal_api_shutdown (nal_t *nal) } if (kibnal_data.kib_tx_descs != NULL) - PORTAL_FREE (kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); + LIBCFS_FREE (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS() * sizeof(kib_tx_t)); if (kibnal_data.kib_peers != NULL) - PORTAL_FREE (kibnal_data.kib_peers, + LIBCFS_FREE (kibnal_data.kib_peers, sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); - printk(KERN_INFO "Lustre: OpenIB NAL unloaded (final mem %d)\n", - atomic_read(&portal_kmemory)); + atomic_read (&libcfs_kmemory)); kibnal_data.kib_init = IBNAL_INIT_NOTHING; + PORTAL_MODULE_UNUSE; +} + +int +kibnal_get_ipoibidx(void) +{ + /* NB single threaded! 
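Both shutdown waits above throttle their log messages with the test ((i & (-i)) == i), which is true exactly when i is a power of two, so a stuck shutdown warns at iterations 2, 4, 8, 16, ... instead of every second. Demonstration:

#include <stdio.h>

int main(void)
{
        int i;

        for (i = 2; i <= 64; i++)
                if ((i & (-i)) == i)    /* lowest set bit == i */
                        printf("iteration %d: would warn\n", i);
        return 0;
}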
*/ + static struct ib_port_properties port_props; + + int ipoibidx = 0; + int devidx; + int port; + int rc; + struct ib_device *device; + + for (devidx = 0; devidx <= kibnal_data.kib_hca_idx; devidx++) { + device = ib_device_get_by_index(devidx); + + if (device == NULL) { + CERROR("Can't get IB device %d\n", devidx); + return -1; + } + + for (port = 1; port <= 2; port++) { + if (devidx == kibnal_data.kib_hca_idx && + port == kibnal_data.kib_port) + return ipoibidx; + + rc = ib_port_properties_get(device, port, + &port_props); + if (rc == 0) + ipoibidx++; + } + } + + LBUG(); + return -1; } int -kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) +kibnal_startup (lnet_ni_t *ni) { + char ipif_name[32]; + __u32 ip; + __u32 netmask; + int up; struct timeval tv; - ptl_process_id_t process_id; - int pkmem = atomic_read(&portal_kmemory); int rc; + int hca; + int port; int i; + int nob; - LASSERT (nal == &kibnal_api); + LASSERT (ni->ni_lnd == &the_kiblnd); - if (nal->nal_refct != 0) { - if (actual_limits != NULL) - *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits; - /* This module got the first ref */ - PORTAL_MODULE_USE; - return (PTL_OK); + /* Only 1 instance supported */ + if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) { + CERROR ("Only 1 instance supported\n"); + return -EPERM; } - LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING); + if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) { + CERROR ("Can't set credits(%d) > ntx(%d)\n", + *kibnal_tunables.kib_credits, + *kibnal_tunables.kib_ntx); + return -EINVAL; + } memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */ + ni->ni_maxtxcredits = *kibnal_tunables.kib_credits; + ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits; + + CLASSERT (LNET_MAX_INTERFACES > 1); + + + kibnal_data.kib_hca_idx = 0; /* default: first HCA */ + kibnal_data.kib_port = 0; /* any port */ + + if (ni->ni_interfaces[0] != NULL) { + /* hca.port specified in 'networks=openib(h.p)' */ + if (ni->ni_interfaces[1] != NULL) { + CERROR("Multiple interfaces not supported\n"); + return -EPERM; + } + + nob = strlen(ni->ni_interfaces[0]); + i = sscanf(ni->ni_interfaces[0], "%d.%d%n", &hca, &port, &nob); + if (i >= 2 && nob == strlen(ni->ni_interfaces[0])) { + kibnal_data.kib_hca_idx = hca; + kibnal_data.kib_port = port; + } else { + nob = strlen(ni->ni_interfaces[0]); + i = sscanf(ni->ni_interfaces[0], "%d%n", &hca, &nob); + + if (i >= 1 && nob == strlen(ni->ni_interfaces[0])) { + kibnal_data.kib_hca_idx = hca; + } else { + CERROR("Can't parse interface '%s'\n", + ni->ni_interfaces[0]); + return -EINVAL; + } + } + } + + kibnal_data.kib_ni = ni; + ni->ni_data = &kibnal_data; + do_gettimeofday(&tv); kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - init_MUTEX (&kibnal_data.kib_nid_mutex); - init_MUTEX_LOCKED (&kibnal_data.kib_listener_signal); + PORTAL_MODULE_USE; rwlock_init(&kibnal_data.kib_global_lock); kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE; - PORTAL_ALLOC (kibnal_data.kib_peers, + LIBCFS_ALLOC (kibnal_data.kib_peers, sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); if (kibnal_data.kib_peers == NULL) { goto failed; @@ -1969,11 +1653,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, spin_lock_init (&kibnal_data.kib_tx_lock); INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); - INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs); - init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq); - PORTAL_ALLOC 
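kibnal_startup() above parses the optional "hca.port" interface spec with sscanf()'s %n conversion, accepting a string only if all of it was consumed. A standalone version of that parse; the helper name is illustrative:

#include <stdio.h>
#include <string.h>

static int parse_iface(const char *s, int *hca, int *port)
{
        int nob = 0;

        *port = 0;                      /* 0 means "any port" */
        if (sscanf(s, "%d.%d%n", hca, port, &nob) >= 2 &&
            nob == (int)strlen(s))
                return 0;
        if (sscanf(s, "%d%n", hca, &nob) >= 1 &&
            nob == (int)strlen(s))
                return 0;
        return -1;                      /* -EINVAL in the real code */
}

int main(void)
{
        int hca, port;

        if (parse_iface("1.2", &hca, &port) == 0)
                printf("hca %d port %d\n", hca, port);
        if (parse_iface("3", &hca, &port) == 0)
                printf("hca %d port %d\n", hca, port);
        if (parse_iface("3x", &hca, &port) != 0)
                printf("rejected '3x'\n");
        return 0;
}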
(kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); + LIBCFS_ALLOC (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS() * sizeof(kib_tx_t)); if (kibnal_data.kib_tx_descs == NULL) { CERROR ("Can't allocate tx descs\n"); goto failed; @@ -1983,21 +1665,6 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, kibnal_data.kib_init = IBNAL_INIT_DATA; /*****************************************************/ - - process_id.pid = requested_pid; - process_id.nid = PTL_NID_ANY; /* don't know my NID yet */ - - rc = lib_init(&kibnal_lib, nal, process_id, - requested_limits, actual_limits); - if (rc != PTL_OK) { - CERROR("lib_init failed: error %d\n", rc); - goto failed; - } - - /* lib interface initialised */ - kibnal_data.kib_init = IBNAL_INIT_LIB; - /*****************************************************/ - for (i = 0; i < IBNAL_N_SCHED; i++) { rc = kibnal_thread_start (kibnal_scheduler, (void *)((unsigned long)i)); @@ -2008,7 +1675,13 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, } } - for (i = 0; i < IBNAL_N_CONND; i++) { + /* must have at least 2 connds to remain responsive to svcqry while + * connecting */ + if (*kibnal_tunables.kib_n_connd < 2) + *kibnal_tunables.kib_n_connd = 2; + + + for (i = 0; i < *kibnal_tunables.kib_n_connd; i++) { rc = kibnal_thread_start (kibnal_connd, (void *)((unsigned long)i)); if (rc != 0) { @@ -2024,9 +1697,10 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, goto failed; } - kibnal_data.kib_device = ib_device_get_by_index(0); + kibnal_data.kib_device = ib_device_get_by_index(kibnal_data.kib_hca_idx); if (kibnal_data.kib_device == NULL) { - CERROR ("Can't open ib device 0\n"); + CERROR ("Can't open ib device %d\n", + kibnal_data.kib_hca_idx); goto failed; } @@ -2041,19 +1715,54 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, kibnal_data.kib_device_props.max_initiator_per_qp, kibnal_data.kib_device_props.max_responder_per_qp); - kibnal_data.kib_port = 0; - for (i = 1; i <= 2; i++) { - rc = ib_port_properties_get(kibnal_data.kib_device, i, + if (kibnal_data.kib_port != 0) { + rc = ib_port_properties_get(kibnal_data.kib_device, + kibnal_data.kib_port, &kibnal_data.kib_port_props); - if (rc == 0) { - kibnal_data.kib_port = i; - break; + if (rc != 0) { + CERROR("Error %d open port %d on HCA %d\n", rc, + kibnal_data.kib_port, + kibnal_data.kib_hca_idx); + goto failed; + } + } else { + for (i = 1; i <= 2; i++) { + rc = ib_port_properties_get(kibnal_data.kib_device, i, + &kibnal_data.kib_port_props); + if (rc == 0) { + kibnal_data.kib_port = i; + break; + } } + if (kibnal_data.kib_port == 0) { + CERROR ("Can't find a port\n"); + goto failed; + } + } + + i = kibnal_get_ipoibidx(); + if (i < 0) + goto failed; + + snprintf(ipif_name, sizeof(ipif_name), "%s%d", + *kibnal_tunables.kib_ipif_basename, i); + if (strlen(ipif_name) == sizeof(ipif_name - 1)) { + CERROR("IPoIB interface name %s truncated\n", ipif_name); + return -EINVAL; } - if (kibnal_data.kib_port == 0) { - CERROR ("Can't find a port\n"); + + rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask); + if (rc != 0) { + CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc); + goto failed; + } + + if (!up) { + CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name); goto failed; } + + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip); rc = ib_pd_create(kibnal_data.kib_device, NULL, &kibnal_data.kib_pd); @@ -2067,9 +1776,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, /*****************************************************/ #if IBNAL_FMR { - const 
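One genuine bug is visible in the hunk above: the truncation check reads sizeof(ipif_name - 1), and since the array decays to a pointer inside that expression it yields sizeof(char *) (4 or 8), so real truncation of the 32-byte buffer at 31 characters goes undetected. The intended expression is presumably sizeof(ipif_name) - 1. A standalone demonstration:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char ipif_name[32] = "ipoib0";

            /* Buggy form: the array decays to a pointer, so this
             * prints the pointer size (4 or 8), never 31. */
            printf("%zu\n", sizeof(ipif_name - 1));

            /* Intended form: buffer size minus the terminating NUL. */
            printf("%zu\n", sizeof(ipif_name) - 1);
            return 0;
    }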
int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK; + const int pool_size = *kibnal_tunables.kib_ntx; struct ib_fmr_pool_param params = { - .max_pages_per_fmr = PTL_MTU/PAGE_SIZE, + .max_pages_per_fmr = LNET_MAX_PAYLOAD/PAGE_SIZE, .access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ), @@ -2112,7 +1821,7 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, }, .arg = NULL, }; - int nentries = IBNAL_CQ_ENTRIES; + int nentries = IBNAL_CQ_ENTRIES(); rc = ib_cq_create (kibnal_data.kib_device, &nentries, &callback, NULL, @@ -2126,39 +1835,31 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, rc = ib_cq_request_notification(kibnal_data.kib_cq, 1); LASSERT (rc == 0); } - + /* flag CQ initialised */ kibnal_data.kib_init = IBNAL_INIT_CQ; /*****************************************************/ - - rc = libcfs_nal_cmd_register(OPENIBNAL, &kibnal_cmd, NULL); - if (rc != 0) { - CERROR ("Can't initialise command interface (rc = %d)\n", rc); - goto failed; - } + rc = kibnal_start_ib_listener(); + if (rc != 0) + goto failed; + /* flag everything initialised */ kibnal_data.kib_init = IBNAL_INIT_ALL; /*****************************************************/ - printk(KERN_INFO "Lustre: OpenIB NAL loaded " - "(initial mem %d)\n", pkmem); - - return (PTL_OK); + return 0; failed: - kibnal_api_shutdown (&kibnal_api); - return (PTL_FAIL); + kibnal_shutdown(ni); + return -ENETDOWN; } void __exit kibnal_module_fini (void) { - if (kibnal_tunables.kib_sysctl != NULL) - unregister_sysctl_table (kibnal_tunables.kib_sysctl); - PtlNIFini(kibnal_ni); - - ptl_unregister_nal(OPENIBNAL); + lnet_unregister_lnd(&the_kiblnd); + kibnal_tunables_fini(); } int __init @@ -2166,48 +1867,21 @@ kibnal_module_init (void) { int rc; - /* the following must be sizeof(int) for proc_dointvec() */ - LASSERT (sizeof(kibnal_tunables.kib_io_timeout) == sizeof(int)); - LASSERT (sizeof(kibnal_tunables.kib_listener_timeout) == sizeof(int)); - LASSERT (sizeof(kibnal_tunables.kib_backlog) == sizeof(int)); - LASSERT (sizeof(kibnal_tunables.kib_port) == sizeof(int)); - - kibnal_api.nal_ni_init = kibnal_api_startup; - kibnal_api.nal_ni_fini = kibnal_api_shutdown; - - /* Initialise dynamic tunables to defaults once only */ - kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT; - kibnal_tunables.kib_listener_timeout = IBNAL_LISTENER_TIMEOUT; - kibnal_tunables.kib_backlog = IBNAL_BACKLOG; - kibnal_tunables.kib_port = IBNAL_PORT; - - rc = ptl_register_nal(OPENIBNAL, &kibnal_api); - if (rc != PTL_OK) { - CERROR("Can't register IBNAL: %d\n", rc); - return (-ENOMEM); /* or something... */ - } - - /* Pure gateways want the NAL started up at module load time... */ - rc = PtlNIInit(OPENIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - ptl_unregister_nal(OPENIBNAL); - return (-ENODEV); - } + rc = kibnal_tunables_init(); + if (rc != 0) + return rc; - kibnal_tunables.kib_sysctl = - register_sysctl_table (kibnal_top_ctl_table, 0); - if (kibnal_tunables.kib_sysctl == NULL) { - CERROR("Can't register sysctl table\n"); - PtlNIFini(kibnal_ni); - ptl_unregister_nal(OPENIBNAL); - return (-ENOMEM); - } + lnet_register_lnd(&the_kiblnd); return (0); } MODULE_AUTHOR("Cluster File Systems, Inc. 
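The header changes that follow let one LND source build against two IB stacks: the OpenIB gen1 build sees the real struct types, while the Cisco/Topspin (tsapi) build treats every handle as opaque and maps the IB_* constant names onto their TS_IB_* equivalents. The pattern, reduced to a single handle and constant (illustrative extract, not the full list):

    #ifndef USING_TSAPI
    typedef struct ib_qp ib_qp_t;           /* gen1: concrete types */
    #else
    typedef void ib_qp_t;                   /* tsapi: opaque handles */
    #define IB_TRANSPORT_RC TS_IB_TRANSPORT_RC
    #endif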
"); -MODULE_DESCRIPTION("Kernel OpenIB NAL v0.01"); +#ifdef USING_TSAPI +MODULE_DESCRIPTION("Kernel Cisco IB LND v1.00"); +#else +MODULE_DESCRIPTION("Kernel OpenIB(gen1) LND v1.00"); +#endif MODULE_LICENSE("GPL"); module_init(kibnal_module_init); diff --git a/lnet/klnds/openiblnd/openiblnd.h b/lnet/klnds/openiblnd/openiblnd.h index 5ba102e..6ed306c 100644 --- a/lnet/klnds/openiblnd/openiblnd.h +++ b/lnet/klnds/openiblnd/openiblnd.h @@ -51,82 +51,115 @@ #include #include -#define DEBUG_SUBSYSTEM S_NAL +#define DEBUG_SUBSYSTEM S_LND #include -#include -#include -#include +#include +#include #include #include #include +#ifndef USING_TSAPI + +/* OpenIB Gen1 */ +typedef struct ib_qp ib_qp_t; +typedef struct ib_mr ib_mr_t; +typedef struct ib_fmr ib_fmr_t; +typedef struct ib_pd ib_pd_t; +typedef struct ib_cq ib_cq_t; +typedef struct ib_fmr_pool ib_fmr_pool_t; + +#else + +/* Cisco (topspin) */ +typedef void ib_qp_t; +typedef void ib_mr_t; +typedef void ib_fmr_t; +typedef void ib_pd_t; +typedef void ib_cq_t; +typedef void ib_fmr_pool_t; + +#define IB_ACCESS_LOCAL_WRITE TS_IB_ACCESS_LOCAL_WRITE +#define IB_WQ_SIGNAL_SELECTABLE TS_IB_ACCESS_LOCAL_WRITE +#define IB_TRANSPORT_RC TS_IB_TRANSPORT_RC +#define IB_QP_STATE_INIT TS_IB_QP_STATE_INIT +#define IB_QP_ATTRIBUTE_STATE TS_IB_QP_ATTRIBUTE_STATE +#define IB_QP_ATTRIBUTE_PORT TS_IB_QP_ATTRIBUTE_PORT +#define IB_QP_ATTRIBUTE_PKEY_INDEX TS_IB_QP_ATTRIBUTE_PKEY_INDEX +#define IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE +#define IB_ACCESS_LOCAL_WRITE TS_IB_ACCESS_LOCAL_WRITE +#define IB_ACCESS_REMOTE_WRITE TS_IB_ACCESS_REMOTE_WRITE +#define IB_ACCESS_REMOTE_READ TS_IB_ACCESS_REMOTE_READ +#define IB_CQ_CALLBACK_INTERRU TS_IB_CQ_CALLBACK_INTERRUPTPT +#define IB_CQ_PROVIDER_REARM TS_IB_CQ_PROVIDER_REARM +#define IB_CQ_CALLBACK_INTERRUPT TS_IB_CQ_CALLBACK_INTERRUPT +#define IB_COMPLETION_STATUS_SUCCESS TS_IB_COMPLETION_STATUS_SUCCESS +#define IB_OP_SEND TS_IB_OP_SEND +#define IB_OP_RDMA_WRITE TS_IB_OP_RDMA_WRITE +#define IB_OP_RDMA_READ TS_IB_OP_RDMA_READ + +#endif + #if CONFIG_SMP # define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ #else # define IBNAL_N_SCHED 1 /* # schedulers */ #endif -#define IBNAL_N_CONND 4 /* # connection daemons */ - -#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... 
*/ -#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ - -#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ -#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ -#define IBNAL_CREDIT_HIGHWATER 6 /* when to eagerly return credits */ -#define IBNAL_RETRY 7 /* # times to retry */ -#define IBNAL_RNR_RETRY 7 /* */ -#define IBNAL_CM_RETRY 7 /* # times to retry connection */ -#define IBNAL_FLOW_CONTROL 1 -#define IBNAL_RESPONDER_RESOURCES 8 - -#define IBNAL_NTX 64 /* # tx descs */ -#define IBNAL_NTX_NBLK 256 /* # reserved tx descs */ - -#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ +#define IBNAL_FMR 1 +//#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS +#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT -#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ -#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ +/* tunables fixed at compile time */ +#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ +#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ +#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ +#define IBNAL_CREDIT_HIGHWATER 6 /* when to eagerly return credits */ +#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ +#define IBNAL_RDMA_BASE 0x0eeb0000 -/* default vals for runtime tunables */ -#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ -#define IBNAL_LISTENER_TIMEOUT 5 /* default listener timeout (seconds) */ -#define IBNAL_BACKLOG 127 /* default listener backlog */ -#define IBNAL_PORT 988 /* default listener port */ +/* QP tunables */ +#define IBNAL_RETRY 7 /* # times to retry */ +#define IBNAL_RNR_RETRY 7 /* */ +#define IBNAL_CM_RETRY 7 /* # times to retry connection */ +#define IBNAL_FLOW_CONTROL 1 +#define IBNAL_RESPONDER_RESOURCES 8 /************************/ /* derived constants... 
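In the struct below, the tunables that used to be hard-wired #defines become pointers, filled in by openiblnd_modparams.c so that module parameters and the sysctl table share one backing store. A minimal sketch of that wiring using the stock kernel modparam API (the default value and the registration style are assumptions; the patch's own file may go through libcfs wrappers):

    #include <linux/module.h>

    static int ntx = 64;                    /* default is illustrative */
    module_param(ntx, int, 0444);
    MODULE_PARM_DESC(ntx, "# of tx message descriptors");

    kib_tunables_t kibnal_tunables = {
            .kib_ntx = &ntx,
            /* ... one pointer per parameter ... */
    };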
*/ /* TX messages (shared by all connections) */ -#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK) -#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE) -#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) +#define IBNAL_TX_MSGS() (*kibnal_tunables.kib_ntx) +#define IBNAL_TX_MSG_BYTES() (IBNAL_TX_MSGS() * IBNAL_MSG_SIZE) +#define IBNAL_TX_MSG_PAGES() ((IBNAL_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE) /* RX messages (per connection) */ -#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE -#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) -#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) +#define IBNAL_RX_MSGS (IBNAL_MSG_QUEUE_SIZE * 2) +#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) +#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) /* we may have up to 2 completions per transmit + 1 completion per receive, per connection */ -#define IBNAL_CQ_ENTRIES ((2*IBNAL_TX_MSGS) + \ - (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS)) - -#define IBNAL_RDMA_BASE 0x0eeb0000 -#define IBNAL_FMR 1 -#define IBNAL_CKSUM 1 -//#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_PROCESS -#define IBNAL_CALLBACK_CTXT IB_CQ_CALLBACK_INTERRUPT +#define IBNAL_CQ_ENTRIES() ((2*IBNAL_TX_MSGS()) + \ + (IBNAL_RX_MSGS * *kibnal_tunables.kib_concurrent_peers)) typedef struct { - int kib_io_timeout; /* comms timeout (seconds) */ - int kib_listener_timeout; /* listener's timeout */ - int kib_backlog; /* listenter's accept backlog */ - int kib_port; /* where the listener listens */ + char **kib_ipif_basename; /* IPoIB interface base name */ + int *kib_n_connd; /* # connection daemons */ + int *kib_min_reconnect_interval; /* min connect retry seconds... */ + int *kib_max_reconnect_interval; /* max connect retry seconds */ + int *kib_concurrent_peers; /* max # peers */ + int *kib_cksum; /* checksum kib_msg_t? */ + int *kib_timeout; /* comms timeout (seconds) */ + int *kib_keepalive; /* keepalive (seconds) */ + int *kib_ntx; /* # tx descs */ + int *kib_credits; /* # concurrent sends */ + int *kib_peercredits; /* # concurrent sends to 1 peer */ + struct ctl_table_header *kib_sysctl; /* sysctl interface */ } kib_tunables_t; @@ -137,7 +170,7 @@ typedef struct __u64 ibp_vaddr; /* mapped region vaddr */ __u32 ibp_lkey; /* mapped region lkey */ __u32 ibp_rkey; /* mapped region rkey */ - struct ib_mr *ibp_handle; /* mapped region handle */ + ib_mr_t *ibp_handle; /* mapped region handle */ struct page *ibp_pages[0]; } kib_pages_t; @@ -147,22 +180,19 @@ typedef struct __u64 kib_incarnation; /* which one am I */ int kib_shutdown; /* shut down? */ atomic_t kib_nthreads; /* # live threads */ + lnet_ni_t *kib_ni; /* _the_ openib interface */ __u64 kib_svc_id; /* service number I listen on */ tTS_IB_GID kib_svc_gid; /* device/port GID */ __u16 kib_svc_pkey; /* device/port pkey */ - ptl_nid_t kib_nid; /* my NID */ - struct semaphore kib_nid_mutex; /* serialise NID ops */ - struct semaphore kib_listener_signal; /* signal IP listener completion */ - struct socket *kib_listener_sock; /* IP listener's socket */ - int kib_listener_shutdown; /* ask IP listener to close */ void *kib_listen_handle; /* IB listen handle */ rwlock_t kib_global_lock; /* stabilize peer/conn ops */ struct list_head *kib_peers; /* hash table of all my known peers */ int kib_peer_hash_size; /* size of kib_peers */ + int kib_nonewpeers; /* prevent new peers? 
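A worked sizing under the derived macros above. IBNAL_MSG_QUEUE_SIZE is fixed at 8; ntx and concurrent_peers are now runtime tunables, so the values below are assumptions for illustration:

    /* Per connection:
     *   IBNAL_RX_MSGS      = 8 * 2 = 16    (doubled so pre-reserved
     *                                       RDMA-reply buffers fit)
     * Assuming ntx = 256 and concurrent_peers = 1024:
     *   IBNAL_TX_MSGS()    = 256
     *   IBNAL_CQ_ENTRIES() = 2*256 + 16*1024 = 16896
     * matching the comment: up to 2 completions per transmit plus 1
     * per receive, per connection. */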
*/ atomic_t kib_npeers; /* # peers extant */ atomic_t kib_nconns; /* # connections extant */ @@ -174,6 +204,7 @@ typedef struct struct list_head kib_connd_peers; /* peers waiting for a connection */ struct list_head kib_connd_acceptq; /* accepted sockets to handle */ wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */ + int kib_connd_connecting; /* # connds connecting */ spinlock_t kib_connd_lock; /* serialise */ wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */ @@ -185,20 +216,19 @@ typedef struct kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ struct list_head kib_idle_txs; /* idle tx descriptors */ - struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */ - wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */ __u64 kib_next_tx_cookie; /* RDMA completion cookie */ spinlock_t kib_tx_lock; /* serialise */ + int kib_hca_idx; /* my HCA number */ struct ib_device *kib_device; /* "the" device */ struct ib_device_properties kib_device_props; /* its properties */ int kib_port; /* port on the device */ struct ib_port_properties kib_port_props; /* its properties */ - struct ib_pd *kib_pd; /* protection domain */ + ib_pd_t *kib_pd; /* protection domain */ #if IBNAL_FMR - struct ib_fmr_pool *kib_fmr_pool; /* fast memory region pool */ + ib_fmr_pool_t *kib_fmr_pool; /* fast memory region pool */ #endif - struct ib_cq *kib_cq; /* completion queue */ + ib_cq_t *kib_cq; /* completion queue */ } kib_data_t; @@ -239,8 +269,8 @@ typedef struct kib_connparams typedef struct { union { - struct ib_mr *mr; - struct ib_fmr *fmr; + ib_mr_t *mr; + ib_fmr_t *fmr; } md_handle; __u32 md_lkey; __u32 md_rkey; @@ -256,13 +286,13 @@ typedef struct typedef struct { - ptl_hdr_t ibim_hdr; /* portals header */ + lnet_hdr_t ibim_hdr; /* portals header */ char ibim_payload[0]; /* piggy-backed payload */ } WIRE_ATTR kib_immediate_msg_t; typedef struct { - ptl_hdr_t ibrm_hdr; /* portals header */ + lnet_hdr_t ibrm_hdr; /* portals header */ __u64 ibrm_cookie; /* opaque completion cookie */ kib_rdma_desc_t ibrm_desc; /* where to suck/blow */ } WIRE_ATTR kib_rdma_msg_t; @@ -296,8 +326,9 @@ typedef struct } WIRE_ATTR ibm_u; } WIRE_ATTR kib_msg_t; -#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ -#define IBNAL_MSG_VERSION 2 /* current protocol version */ +#define IBNAL_MSG_MAGIC LNET_PROTO_OPENIB_MAGIC /* unique magic */ +#define IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD 2 /* previous protocol version */ +#define IBNAL_MSG_VERSION 3 /* current protocol version */ #define IBNAL_MSG_SVCQRY 0xb0 /* service query */ #define IBNAL_MSG_SVCRSP 0xb1 /* service response */ @@ -316,8 +347,7 @@ typedef struct kib_rx /* receive message */ { struct list_head rx_list; /* queue for attention */ struct kib_conn *rx_conn; /* owning conn */ - int rx_rdma; /* RDMA completion posted? */ - int rx_posted; /* posted? */ + int rx_nob; /* # bytes received (-1 while posted) */ __u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */ kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ struct ib_receive_param rx_sp; /* receive work item */ @@ -327,7 +357,6 @@ typedef struct kib_rx /* receive message */ typedef struct kib_tx /* transmit message */ { struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ - int tx_isnblk; /* I'm reserved for non-blocking sends */ struct kib_conn *tx_conn; /* owning conn */ int tx_mapped; /* mapped for RDMA? 
*/ int tx_sending; /* # tx callbacks outstanding */ @@ -336,7 +365,7 @@ typedef struct kib_tx /* transmit message */ int tx_passive_rdma; /* peer sucks/blows */ int tx_passive_rdma_wait; /* waiting for peer to complete */ __u64 tx_passive_rdma_cookie; /* completion cookie */ - lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ + lnet_msg_t *tx_lntmsg[2]; /* ptl msgs to finalize on completion */ kib_md_t tx_md; /* RDMA mapping (active/passive) */ __u64 tx_vaddr; /* pre-mapped buffer (hca vaddr) */ kib_msg_t *tx_msg; /* pre-mapped buffer (host vaddr) */ @@ -366,18 +395,22 @@ typedef struct kib_conn struct kib_peer *ibc_peer; /* owning peer */ struct list_head ibc_list; /* stash on peer's conn list */ __u64 ibc_incarnation; /* which instance of the peer */ + int ibc_version; /* peer protocol version */ atomic_t ibc_refcount; /* # users */ int ibc_state; /* what's happening */ - atomic_t ibc_nob; /* # bytes buffered */ int ibc_nsends_posted; /* # uncompleted sends */ int ibc_credits; /* # credits I have */ int ibc_outstanding_credits; /* # credits to return */ + int ibc_reserved_credits; /* # credits for ACK/DONE msgs */ + unsigned long ibc_last_send; /* time of last send */ + struct list_head ibc_tx_queue_nocred; /* sends that don't need a credit */ + struct list_head ibc_tx_queue_rsrvd; /* sends that need a reserved cred */ struct list_head ibc_tx_queue; /* send queue */ struct list_head ibc_active_txs; /* active tx awaiting completion */ spinlock_t ibc_lock; /* serialise */ kib_rx_t *ibc_rxs; /* the rx descs */ kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ - struct ib_qp *ibc_qp; /* queue pair */ + ib_qp_t *ibc_qp; /* queue pair */ __u32 ibc_qpn; /* queue pair number */ tTS_IB_CM_COMM_ID ibc_comm_id; /* connection ID? */ kib_connreq_t *ibc_connreq; /* connection request state */ @@ -394,7 +427,7 @@ typedef struct kib_peer { struct list_head ibp_list; /* stash on global peer list */ struct list_head ibp_connd_list; /* schedule on kib_connd_peers */ - ptl_nid_t ibp_nid; /* who's on the other end(s) */ + lnet_nid_t ibp_nid; /* who's on the other end(s) */ __u32 ibp_ip; /* IP to query for peer conn params */ int ibp_port; /* port to qery for peer conn params */ __u64 ibp_incarnation; /* peer's incarnation */ @@ -402,17 +435,69 @@ typedef struct kib_peer int ibp_persistence; /* "known" peer refs */ struct list_head ibp_conns; /* all active connections */ struct list_head ibp_tx_queue; /* msgs waiting for a conn */ - int ibp_connecting; /* connecting+accepting */ + int ibp_connecting; /* current active connection attempts */ + int ibp_accepting; /* current passive connection attempts */ unsigned long ibp_reconnect_time; /* when reconnect may be attempted */ unsigned long ibp_reconnect_interval; /* exponential backoff */ + int ibp_error; /* errno on closing this peer */ + cfs_time_t ibp_last_alive; /* when (in jiffies) I was last alive */ } kib_peer_t; -extern lib_nal_t kibnal_lib; extern kib_data_t kibnal_data; extern kib_tunables_t kibnal_tunables; +/******************************************************************************/ + +/* these are purposely avoiding using local vars so they don't increase + * stack consumption. 
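The refcounting macros below are deliberately macros (no locals, per the comment), and the decref never destroys a connection inline: the last reference can drop in IRQ context, so the conn is queued for the reaper thread to tear down. The general shape of that pattern, with illustrative names rather than the patch's fields:

    static void obj_decref(struct obj *obj)
    {
            unsigned long flags;

            if (!atomic_dec_and_test(&obj->refcount))
                    return;

            /* Last ref: hand teardown to a thread that may sleep. */
            spin_lock_irqsave(&reaper_lock, flags);
            list_add_tail(&obj->obj_list, &reaper_queue);
            wake_up(&reaper_waitq);
            spin_unlock_irqrestore(&reaper_lock, flags);
    }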
*/ + +#define kibnal_conn_addref(conn) \ +do { \ + CDEBUG(D_NET, "conn[%p] (%d)++\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + LASSERT(atomic_read(&(conn)->ibc_refcount) > 0); \ + atomic_inc(&(conn)->ibc_refcount); \ +} while (0) + +#define kibnal_conn_decref(conn) \ +do { \ + unsigned long flags; \ + \ + CDEBUG(D_NET, "conn[%p] (%d)--\n", \ + (conn), atomic_read(&(conn)->ibc_refcount)); \ + LASSERT(atomic_read(&(conn)->ibc_refcount) > 0); \ + if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \ + spin_lock_irqsave(&kibnal_data.kib_reaper_lock, flags); \ + list_add_tail(&(conn)->ibc_list, \ + &kibnal_data.kib_reaper_conns); \ + wake_up(&kibnal_data.kib_reaper_waitq); \ + spin_unlock_irqrestore(&kibnal_data.kib_reaper_lock, flags); \ + } \ +} while (0) + +#define kibnal_peer_addref(peer) \ +do { \ + CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n", \ + (peer), libcfs_nid2str((peer)->ibp_nid), \ + atomic_read (&(peer)->ibp_refcount)); \ + LASSERT(atomic_read(&(peer)->ibp_refcount) > 0); \ + atomic_inc(&(peer)->ibp_refcount); \ +} while (0) + +#define kibnal_peer_decref(peer) \ +do { \ + CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n", \ + (peer), libcfs_nid2str((peer)->ibp_nid), \ + atomic_read (&(peer)->ibp_refcount)); \ + LASSERT(atomic_read(&(peer)->ibp_refcount) > 0); \ + if (atomic_dec_and_test(&(peer)->ibp_refcount)) \ + kibnal_destroy_peer(peer); \ +} while (0) + +/******************************************************************************/ + static inline struct list_head * -kibnal_nid2peerlist (ptl_nid_t nid) +kibnal_nid2peerlist (lnet_nid_t nid) { unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size; @@ -429,42 +514,57 @@ kibnal_peer_active(kib_peer_t *peer) static inline void kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) { - /* CAVEAT EMPTOR: tx takes caller's ref on conn */ + struct list_head *q; LASSERT (tx->tx_nsp > 0); /* work items set up */ LASSERT (tx->tx_conn == NULL); /* only set here */ + kibnal_conn_addref(conn); tx->tx_conn = conn; - tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ; - list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); -} - -#if 0 -static inline void -kibnal_show_rdma_attr (kib_conn_t *conn) -{ - struct ib_qp_attribute qp_attr; - int rc; - - memset (&qp_attr, 0, sizeof(qp_attr)); - rc = ib_qp_query(conn->ibc_qp, &qp_attr); - if (rc != 0) { - CERROR ("Can't get qp attrs: %d\n", rc); - return; + tx->tx_deadline = jiffies + *kibnal_tunables.kib_timeout * HZ; + + if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) { + /* All messages have simple credit control */ + q = &conn->ibc_tx_queue; + } else { + LASSERT (conn->ibc_version == IBNAL_MSG_VERSION); + + switch (tx->tx_msg->ibm_type) { + case IBNAL_MSG_PUT_RDMA: + case IBNAL_MSG_GET_RDMA: + /* RDMA request: reserve a buffer for the RDMA reply + * before sending */ + q = &conn->ibc_tx_queue_rsrvd; + break; + + case IBNAL_MSG_PUT_DONE: + case IBNAL_MSG_GET_DONE: + /* RDMA completion: no credits; peer has reserved a + * reply buffer */ + q = &conn->ibc_tx_queue_nocred; + break; + + case IBNAL_MSG_NOOP: + case IBNAL_MSG_IMMEDIATE: + /* Otherwise: consume a credit before sending */ + q = &conn->ibc_tx_queue; + break; + + default: + LBUG(); + q = NULL; + } } - CWARN ("RDMA CAPABILITY: write %s read %s\n", - (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ? - (qp_attr.enable_rdma_write ? "enabled" : "disabled") : "invalid", - (qp_attr.valid_fields & TS_IB_QP_ATTRIBUTE_RDMA_ATOMIC_ENABLE) ? - (qp_attr.enable_rdma_read ? 
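kibnal_queue_tx_locked() above routes each message onto one of three queues; the version-3 credit scheme it implements, in summary:

    /*   PUT_RDMA / GET_RDMA -> ibc_tx_queue_rsrvd  (held until the peer
     *                                               has a reply buffer
     *                                               reserved)
     *   PUT_DONE / GET_DONE -> ibc_tx_queue_nocred (consumes no credit:
     *                                               the peer reserved a
     *                                               buffer in advance)
     *   NOOP / IMMEDIATE    -> ibc_tx_queue        (normal credit flow)
     *
     * Version-2 peers predate reply-buffer reservation, so all their
     * traffic takes the plain credit queue. */

kibnal_send_keepalive(), just below, stays correct across jiffies wraparound because time_after() compares by signed subtraction, essentially ((long)((b) - (a)) < 0), instead of comparing the counters directly.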
"enabled" : "disabled") : "invalid"); + list_add_tail(&tx->tx_list, q); } -#endif -static inline __u64 -kibnal_page2phys (struct page *p) +static inline int +kibnal_send_keepalive(kib_conn_t *conn) { - return page_to_phys(p); + return (*kibnal_tunables.kib_keepalive > 0) && + time_after(jiffies, conn->ibc_last_send + + *kibnal_tunables.kib_keepalive*HZ); } /* CAVEAT EMPTOR: @@ -494,38 +594,63 @@ kibnal_wreqid_is_rx (__u64 wreqid) return (wreqid & 1) != 0; } -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) -# define sk_allocation allocation -# define sk_data_ready data_ready -# define sk_write_space write_space -# define sk_user_data user_data -# define sk_prot prot -# define sk_sndbuf sndbuf -# define sk_socket socket -# define sk_wmem_queued wmem_queued -# define sk_err err -# define sk_sleep sleep +#if (IB_NTXRXPARAMS == 3) +static inline int +kibnal_ib_send(ib_qp_t *qp, struct ib_send_param *p) +{ + return ib_send(qp, p, 1); +} + +static inline int +kibnal_ib_receive(ib_qp_t *qp, struct ib_receive_param *p) +{ + return ib_receive(qp, p, 1); +} +#elif (IB_NTXRXPARAMS == 4) +static inline int +kibnal_ib_send(ib_qp_t *qp, struct ib_send_param *p) +{ + return ib_send(qp, p, 1, NULL); +} + +static inline int +kibnal_ib_receive(ib_qp_t *qp, struct ib_receive_param *p) +{ + return ib_receive(qp, p, 1, NULL); +} +#else + #error "IB_NTXRXPARAMS not set correctly" #endif +int kibnal_startup (lnet_ni_t *ni); +void kibnal_shutdown (lnet_ni_t *ni); +int kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); +int kibnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int kibnal_eager_recv (lnet_ni_t *ni, void *private, + lnet_msg_t *lntmsg, void **new_private); +int kibnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +int kibnal_accept(lnet_ni_t *ni, struct socket *sock); + extern void kibnal_init_msg(kib_msg_t *msg, int type, int body_nob); -extern void kibnal_pack_msg(kib_msg_t *msg, int credits, - ptl_nid_t dstnid, __u64 dststamp); -extern int kibnal_unpack_msg(kib_msg_t *msg, int nob); +extern void kibnal_pack_msg(kib_msg_t *msg, int version, int credits, + lnet_nid_t dstnid, __u64 dststamp); +extern int kibnal_unpack_msg(kib_msg_t *msg, int expected_version, int nob); extern void kibnal_handle_svcqry (struct socket *sock); extern int kibnal_make_svcqry (kib_conn_t *conn); extern void kibnal_free_acceptsock (kib_acceptsock_t *as); -extern int kibnal_listener_procint(ctl_table *table, int write, - struct file *filp, void *buffer, - size_t *lenp); -extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid); -extern void kibnal_put_peer (kib_peer_t *peer); -extern int kibnal_del_peer (ptl_nid_t nid, int single_share); -extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid); +extern int kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid); +extern void kibnal_destroy_peer (kib_peer_t *peer); +extern int kibnal_add_persistent_peer(lnet_nid_t nid, __u32 ip, int port); +extern int kibnal_del_peer (lnet_nid_t nid); +extern kib_peer_t *kibnal_find_peer_locked (lnet_nid_t nid); extern void kibnal_unlink_peer_locked (kib_peer_t *peer); +extern void kibnal_peer_alive(kib_peer_t *peer); extern int kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation); extern kib_conn_t *kibnal_create_conn (void); -extern void kibnal_put_conn (kib_conn_t *conn); extern void kibnal_destroy_conn (kib_conn_t *conn); extern int kibnal_alloc_pages 
(kib_pages_t **pp, int npages, int access); extern void kibnal_free_pages (kib_pages_t *p); @@ -548,16 +673,15 @@ extern int kibnal_thread_start (int (*fn)(void *arg), void *arg); extern int kibnal_scheduler(void *arg); extern int kibnal_connd (void *arg); extern int kibnal_reaper (void *arg); -extern void kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg); +extern void kibnal_callback (ib_cq_t *cq, struct ib_cq_entry *e, void *arg); +extern void kibnal_txlist_done (struct list_head *txlist, int status); extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob); extern int kibnal_close_conn (kib_conn_t *conn, int why); extern void kibnal_start_active_rdma (int type, int status, - kib_rx_t *rx, lib_msg_t *libmsg, + kib_rx_t *rx, lnet_msg_t *lntmsg, unsigned int niov, - struct iovec *iov, ptl_kiov_t *kiov, + struct iovec *iov, lnet_kiov_t *kiov, int offset, int nob); - - - - +extern int kibnal_tunables_init(void); +extern void kibnal_tunables_fini(void); diff --git a/lnet/klnds/openiblnd/openiblnd_cb.c b/lnet/klnds/openiblnd/openiblnd_cb.c index a356eaf..75f3e23 100644 --- a/lnet/klnds/openiblnd/openiblnd_cb.c +++ b/lnet/klnds/openiblnd/openiblnd_cb.c @@ -21,7 +21,7 @@ * */ -#include "openibnal.h" +#include "openiblnd.h" /* * LIB functions follow @@ -43,7 +43,7 @@ kibnal_schedule_tx_done (kib_tx_t *tx) void kibnal_tx_done (kib_tx_t *tx) { - ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL; + lnet_msg_t *lntmsg[2]; unsigned long flags; int i; int rc; @@ -51,6 +51,12 @@ kibnal_tx_done (kib_tx_t *tx) LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */ LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */ + if (in_interrupt()) { + /* can't deregister memory/flush FMAs/finalize in IRQ context... */ + kibnal_schedule_tx_done(tx); + return; + } + switch (tx->tx_mapped) { default: LBUG(); @@ -59,11 +65,6 @@ kibnal_tx_done (kib_tx_t *tx) break; case KIB_TX_MAPPED: - if (in_interrupt()) { - /* can't deregister memory in IRQ context... */ - kibnal_schedule_tx_done(tx); - return; - } rc = ib_memory_deregister(tx->tx_md.md_handle.mr); LASSERT (rc == 0); tx->tx_mapped = KIB_TX_UNMAPPED; @@ -71,33 +72,27 @@ kibnal_tx_done (kib_tx_t *tx) #if IBNAL_FMR case KIB_TX_MAPPED_FMR: - if (in_interrupt() && tx->tx_status != 0) { - /* can't flush FMRs in IRQ context... */ - kibnal_schedule_tx_done(tx); - return; - } - rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr); LASSERT (rc == 0); +#ifndef USING_TSAPI + /* Somewhat belt-and-braces since the tx's conn has closed if + * this was a passive RDMA waiting to complete... 
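Two ordering points in kibnal_tx_done() above: the in_interrupt() test moves to the top because memory deregistration, FMR pool flushes and lnet_finalize() may all sleep, so any IRQ-context completion is bounced to thread context via kibnal_schedule_tx_done(); and the lnet_msg_t pointers are detached and finalized only after the descriptor has gone back on kib_idle_txs, so a finalize that triggers another send can find a free tx. In outline:

    /* kibnal_tx_done(), schematically:
     *   1. in IRQ context?      -> requeue for thread context, return
     *   2. unmap / FMR-deregister the RDMA mapping
     *   3. detach lntmsg[0..1], drop the conn ref
     *   4. put tx back on kib_idle_txs under kib_tx_lock
     *   5. lnet_finalize() the detached messages last */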
*/ if (tx->tx_status != 0) ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool); +#endif tx->tx_mapped = KIB_TX_UNMAPPED; break; #endif } - for (i = 0; i < 2; i++) { - /* tx may have up to 2 libmsgs to finalise */ - if (tx->tx_libmsg[i] == NULL) - continue; + /* tx may have up to 2 ptlmsgs to finalise */ + lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; + lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; + rc = tx->tx_status; - lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc); - tx->tx_libmsg[i] = NULL; - } - if (tx->tx_conn != NULL) { - kibnal_put_conn (tx->tx_conn); + kibnal_conn_decref(tx->tx_conn); tx->tx_conn = NULL; } @@ -107,88 +102,53 @@ kibnal_tx_done (kib_tx_t *tx) spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); - if (tx->tx_isnblk) { - list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); - } else { - list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); - wake_up (&kibnal_data.kib_idle_tx_waitq); - } + list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + + /* delay finalize until my descs have been freed */ + for (i = 0; i < 2; i++) { + if (lntmsg[i] == NULL) + continue; + + lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc); + } } kib_tx_t * -kibnal_get_idle_tx (int may_block) +kibnal_get_idle_tx (void) { unsigned long flags; - kib_tx_t *tx = NULL; + kib_tx_t *tx; - for (;;) { - spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); - - /* "normal" descriptor is free */ - if (!list_empty (&kibnal_data.kib_idle_txs)) { - tx = list_entry (kibnal_data.kib_idle_txs.next, - kib_tx_t, tx_list); - break; - } - - if (!may_block) { - /* may dip into reserve pool */ - if (list_empty (&kibnal_data.kib_idle_nblk_txs)) { - CERROR ("reserved tx desc pool exhausted\n"); - break; - } - - tx = list_entry (kibnal_data.kib_idle_nblk_txs.next, - kib_tx_t, tx_list); - break; - } + spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); - /* block for idle tx */ + if (list_empty (&kibnal_data.kib_idle_txs)) { spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); - - wait_event (kibnal_data.kib_idle_tx_waitq, - !list_empty (&kibnal_data.kib_idle_txs) || - kibnal_data.kib_shutdown); + return NULL; } - if (tx != NULL) { - list_del (&tx->tx_list); + tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list); + list_del (&tx->tx_list); - /* Allocate a new passive RDMA completion cookie. It might - * not be needed, but we've got a lock right now and we're - * unlikely to wrap... */ - tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; - - LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); - LASSERT (tx->tx_nsp == 0); - LASSERT (tx->tx_sending == 0); - LASSERT (tx->tx_status == 0); - LASSERT (tx->tx_conn == NULL); - LASSERT (!tx->tx_passive_rdma); - LASSERT (!tx->tx_passive_rdma_wait); - LASSERT (tx->tx_libmsg[0] == NULL); - LASSERT (tx->tx_libmsg[1] == NULL); - } + /* Allocate a new passive RDMA completion cookie. It might not be + * needed, but we've got a lock right now and we're unlikely to + * wrap... 
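The "unlikely to wrap" above is a safe bet for a 64-bit counter bumped under a spinlock:

    /* 2^64 ~= 1.8e19 cookies.  Even minting 10^6 per second,
     * 1.8e19 / 1e6 ~= 1.8e13 seconds, i.e. roughly 580,000 years
     * before kib_next_tx_cookie wraps. */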
*/ + tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); - - return (tx); -} - -int -kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) -{ - /* I would guess that if kibnal_get_peer (nid) == NULL, - and we're not routing, then 'nid' is very distant :) */ - if ( nal->libnal_ni.ni_pid.nid == nid ) { - *dist = 0; - } else { - *dist = 1; - } - return 0; + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + LASSERT (tx->tx_nsp == 0); + LASSERT (tx->tx_sending == 0); + LASSERT (tx->tx_status == 0); + LASSERT (tx->tx_conn == NULL); + LASSERT (!tx->tx_passive_rdma); + LASSERT (!tx->tx_passive_rdma_wait); + LASSERT (tx->tx_lntmsg[0] == NULL); + LASSERT (tx->tx_lntmsg[1] == NULL); + + return tx; } void @@ -215,6 +175,8 @@ kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status) CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status); + /* XXX Set mlength of reply here */ + tx->tx_status = status; tx->tx_passive_rdma_wait = 0; idle = (tx->tx_sending == 0); @@ -233,17 +195,20 @@ kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status) spin_unlock_irqrestore (&conn->ibc_lock, flags); - CERROR ("Unmatched (late?) RDMA completion "LPX64" from "LPX64"\n", - cookie, conn->ibc_peer->ibp_nid); + CERROR ("Unmatched (late?) RDMA completion "LPX64" from %s\n", + cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); } void -kibnal_post_rx (kib_rx_t *rx, int do_credits) +kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit) { kib_conn_t *conn = rx->rx_conn; int rc; unsigned long flags; + LASSERT(!rsrvd_credit || + conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); + rx->rx_gl = (struct ib_gather_scatter) { .address = rx->rx_vaddr, .length = IBNAL_MSG_SIZE, @@ -259,19 +224,24 @@ kibnal_post_rx (kib_rx_t *rx, int do_credits) }; LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - LASSERT (!rx->rx_posted); - rx->rx_posted = 1; + LASSERT (rx->rx_nob >= 0); /* not posted */ + rx->rx_nob = -1; /* is now */ mb(); if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) rc = -ECONNABORTED; else - rc = ib_receive (conn->ibc_qp, &rx->rx_sp, 1); + rc = kibnal_ib_receive(conn->ibc_qp, &rx->rx_sp); if (rc == 0) { - if (do_credits) { + if (credit || rsrvd_credit) { spin_lock_irqsave(&conn->ibc_lock, flags); - conn->ibc_outstanding_credits++; + + if (credit) + conn->ibc_outstanding_credits++; + if (rsrvd_credit) + conn->ibc_reserved_credits++; + spin_unlock_irqrestore(&conn->ibc_lock, flags); kibnal_check_sends(conn); @@ -280,16 +250,16 @@ kibnal_post_rx (kib_rx_t *rx, int do_credits) } if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { - CERROR ("Error posting receive -> "LPX64": %d\n", - conn->ibc_peer->ibp_nid, rc); + CERROR ("Error posting receive -> %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); kibnal_close_conn (rx->rx_conn, rc); } else { - CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n", - conn->ibc_peer->ibp_nid, rc); + CDEBUG (D_NET, "Error posting receive -> %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); } /* Drop rx's ref */ - kibnal_put_conn (conn); + kibnal_conn_decref(conn); } void @@ -301,10 +271,11 @@ kibnal_rx_callback (struct ib_cq_entry *e) int credits; unsigned long flags; int rc; + int err = -ECONNABORTED; CDEBUG (D_NET, "rx %p conn %p\n", rx, conn); - LASSERT (rx->rx_posted); - rx->rx_posted = 0; + LASSERT (rx->rx_nob < 0); /* was posted */ + rx->rx_nob = 0; /* isn't now */ mb(); /* receives complete with error in any case after we've started @@ -316,24 +287,31 
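rx_nob replaces the old rx_posted/rx_rdma flags by folding the state into one field whose sign says which side of the QP the buffer is on:

    /* rx_nob as a two-state machine:
     *   rx_nob >= 0 : not posted (holds # bytes of the last receive)
     *   rx_nob == -1: posted to the QP, completion pending
     * kibnal_post_rx() asserts >= 0 then stores -1; the completion
     * callback asserts < 0, then stores e->bytes_transferred.  The
     * mb() after each store keeps the state change ordered for the
     * assertions that read it from other contexts. */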
@@ kibnal_rx_callback (struct ib_cq_entry *e) LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED); if (e->status != IB_COMPLETION_STATUS_SUCCESS) { - CERROR("Rx from "LPX64" failed: %d\n", - conn->ibc_peer->ibp_nid, e->status); + CERROR("Rx from %s failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), e->status); goto failed; } - rc = kibnal_unpack_msg(msg, e->bytes_transferred); + LASSERT (e->bytes_transferred >= 0); + rx->rx_nob = e->bytes_transferred; + mb(); + + rc = kibnal_unpack_msg(msg, conn->ibc_version, rx->rx_nob); if (rc != 0) { - CERROR ("Error %d unpacking rx from "LPX64"\n", - rc, conn->ibc_peer->ibp_nid); + CERROR ("Error %d unpacking rx from %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); goto failed; } - if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || + if (!lnet_ptlcompat_matchnid(conn->ibc_peer->ibp_nid, + msg->ibm_srcnid) || + !lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid, + msg->ibm_dstnid) || msg->ibm_srcstamp != conn->ibc_incarnation || - msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid || msg->ibm_dststamp != kibnal_data.kib_incarnation) { - CERROR ("Stale rx from "LPX64"\n", - conn->ibc_peer->ibp_nid); + CERROR ("Stale rx from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + err = -ESTALE; goto failed; } @@ -349,7 +327,7 @@ kibnal_rx_callback (struct ib_cq_entry *e) switch (msg->ibm_type) { case IBNAL_MSG_NOOP: - kibnal_post_rx (rx, 1); + kibnal_post_rx (rx, 1, 0); return; case IBNAL_MSG_IMMEDIATE: @@ -373,15 +351,23 @@ kibnal_rx_callback (struct ib_cq_entry *e) kibnal_complete_passive_rdma (conn, msg->ibm_u.completion.ibcm_cookie, msg->ibm_u.completion.ibcm_status); - kibnal_post_rx (rx, 1); + + if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) { + kibnal_post_rx (rx, 1, 0); + } else { + /* this reply buffer was pre-reserved */ + kibnal_post_rx (rx, 0, 1); + } return; default: - CERROR ("Bad msg type %x from "LPX64"\n", - msg->ibm_type, conn->ibc_peer->ibp_nid); + CERROR ("Bad msg type %x from %s\n", + msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); goto failed; } + kibnal_peer_alive(conn->ibc_peer); + /* schedule for kibnal_rx() in thread context */ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); @@ -393,61 +379,43 @@ kibnal_rx_callback (struct ib_cq_entry *e) failed: CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); - kibnal_close_conn(conn, -ECONNABORTED); + kibnal_close_conn(conn, err); /* Don't re-post rx & drop its ref on conn */ - kibnal_put_conn(conn); + kibnal_conn_decref(conn); } void kibnal_rx (kib_rx_t *rx) { + int rc = 0; kib_msg_t *msg = rx->rx_msg; - /* Clear flag so I can detect if I've sent an RDMA completion */ - rx->rx_rdma = 0; - switch (msg->ibm_type) { case IBNAL_MSG_GET_RDMA: - lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); - /* If the incoming get was matched, I'll have initiated the - * RDMA and the completion message... */ - if (rx->rx_rdma) - break; - - /* Otherwise, I'll send a failed completion now to prevent - * the peer's GET blocking for the full timeout. 
*/ - CERROR ("Completing unmatched RDMA GET from "LPX64"\n", - rx->rx_conn->ibc_peer->ibp_nid); - kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO, - rx, NULL, 0, NULL, NULL, 0, 0); + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr, + msg->ibm_srcnid, rx, 1); break; case IBNAL_MSG_PUT_RDMA: - lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); - if (rx->rx_rdma) - break; - /* This is most unusual, since even if lib_parse() didn't - * match anything, it should have asked us to read (and - * discard) the payload. The portals header must be - * inconsistent with this message type, so it's the - * sender's fault for sending garbage and she can time - * herself out... */ - CERROR ("Uncompleted RMDA PUT from "LPX64"\n", - rx->rx_conn->ibc_peer->ibp_nid); + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr, + msg->ibm_srcnid, rx, 1); break; case IBNAL_MSG_IMMEDIATE: - lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx); - LASSERT (!rx->rx_rdma); + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr, + msg->ibm_srcnid, rx, 0); break; - + default: LBUG(); break; } - kibnal_post_rx (rx, 1); + if (rc < 0) { + kibnal_close_conn(rx->rx_conn, rc); + kibnal_post_rx (rx, 1, 0); + } } #if 0 @@ -472,14 +440,14 @@ kibnal_kvaddr_to_phys (unsigned long vaddr, __u64 *physp) !VALID_PAGE (page)) return (-EFAULT); - *physp = kibnal_page2phys(page) + (vaddr & (PAGE_SIZE - 1)); + *physp = lnet_page2phys(page) + (vaddr & (PAGE_SIZE - 1)); return (0); } #endif int -kibnal_map_iov (kib_tx_t *tx, enum ib_memory_access access, - int niov, struct iovec *iov, int offset, int nob) +kibnal_map_iov (kib_tx_t *tx, int access, + unsigned int niov, struct iovec *iov, int offset, int nob) { void *vaddr; @@ -521,8 +489,8 @@ kibnal_map_iov (kib_tx_t *tx, enum ib_memory_access access, } int -kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access, - int nkiov, ptl_kiov_t *kiov, +kibnal_map_kiov (kib_tx_t *tx, int access, + int nkiov, lnet_kiov_t *kiov, int offset, int nob) { #if IBNAL_FMR @@ -552,7 +520,7 @@ kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access, } phys_size = nkiov * sizeof (*phys); - PORTAL_ALLOC(phys, phys_size); + LIBCFS_ALLOC(phys, phys_size); if (phys == NULL) { CERROR ("Can't allocate tmp phys\n"); return (-ENOMEM); @@ -560,9 +528,9 @@ kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access, page_offset = kiov->kiov_offset + offset; #if IBNAL_FMR - phys[0] = kibnal_page2phys(kiov->kiov_page); + phys[0] = lnet_page2phys(kiov->kiov_page); #else - phys[0].address = kibnal_page2phys(kiov->kiov_page); + phys[0].address = lnet_page2phys(kiov->kiov_page); phys[0].size = PAGE_SIZE; #endif nphys = 1; @@ -592,7 +560,7 @@ kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access, goto out; } - if (nphys == PTL_MD_MAX_IOV) { + if (nphys == LNET_MAX_IOV) { CERROR ("payload too big (%d)\n", nphys); rc = -EMSGSIZE; goto out; @@ -600,9 +568,9 @@ kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access, LASSERT (nphys * sizeof (*phys) < phys_size); #if IBNAL_FMR - phys[nphys] = kibnal_page2phys(kiov->kiov_page); + phys[nphys] = lnet_page2phys(kiov->kiov_page); #else - phys[nphys].address = kibnal_page2phys(kiov->kiov_page); + phys[nphys].address = lnet_page2phys(kiov->kiov_page); phys[nphys].size = PAGE_SIZE; #endif nphys++; @@ -640,7 +608,7 @@ kibnal_map_kiov (kib_tx_t *tx, enum ib_memory_access access, } out: - PORTAL_FREE(phys, phys_size); + LIBCFS_FREE(phys, phys_size); return (rc); } @@ -664,31 +632,57 @@ kibnal_check_sends (kib_conn_t *conn) kib_tx_t 
*tx; int rc; int i; + int consume_credit; int done; int nwork; spin_lock_irqsave (&conn->ibc_lock, flags); - LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE); + LASSERT (conn->ibc_nsends_posted <= IBNAL_RX_MSGS); + LASSERT (conn->ibc_reserved_credits >= 0); + + while (conn->ibc_reserved_credits > 0 && + !list_empty(&conn->ibc_tx_queue_rsrvd)) { + LASSERT (conn->ibc_version != + IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); + tx = list_entry(conn->ibc_tx_queue_rsrvd.next, + kib_tx_t, tx_list); + list_del(&tx->tx_list); + list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); + conn->ibc_reserved_credits--; + } if (list_empty(&conn->ibc_tx_queue) && - conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) { + list_empty(&conn->ibc_tx_queue_nocred) && + (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER || + kibnal_send_keepalive(conn))) { spin_unlock_irqrestore(&conn->ibc_lock, flags); - tx = kibnal_get_idle_tx(0); /* don't block */ + tx = kibnal_get_idle_tx(); if (tx != NULL) kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); spin_lock_irqsave(&conn->ibc_lock, flags); - if (tx != NULL) { - atomic_inc(&conn->ibc_refcount); + if (tx != NULL) kibnal_queue_tx_locked(tx, conn); - } } - while (!list_empty (&conn->ibc_tx_queue)) { - tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list); + for (;;) { + if (!list_empty(&conn->ibc_tx_queue_nocred)) { + LASSERT (conn->ibc_version != + IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); + tx = list_entry(conn->ibc_tx_queue_nocred.next, + kib_tx_t, tx_list); + consume_credit = 0; + } else if (!list_empty (&conn->ibc_tx_queue)) { + tx = list_entry (conn->ibc_tx_queue.next, + kib_tx_t, tx_list); + consume_credit = 1; + } else { + /* nothing waiting */ + break; + } /* We rely on this for QP sizing */ LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= 2); @@ -701,21 +695,25 @@ kibnal_check_sends (kib_conn_t *conn) /* Not on ibc_rdma_queue */ LASSERT (!tx->tx_passive_rdma_wait); - if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) + if (conn->ibc_nsends_posted == IBNAL_RX_MSGS) break; - if (conn->ibc_credits == 0) /* no credits */ - break; + if (consume_credit) { + if (conn->ibc_credits == 0) /* no credits */ + break; + + if (conn->ibc_credits == 1 && /* last credit reserved for */ + conn->ibc_outstanding_credits == 0) /* giving back credits */ + break; + } - if (conn->ibc_credits == 1 && /* last credit reserved for */ - conn->ibc_outstanding_credits == 0) /* giving back credits */ - break; - list_del (&tx->tx_list); if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && (!list_empty(&conn->ibc_tx_queue) || - conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) { + !list_empty(&conn->ibc_tx_queue_nocred) || + (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER && + !kibnal_send_keepalive(conn)))) { /* redundant NOOP */ spin_unlock_irqrestore(&conn->ibc_lock, flags); kibnal_tx_done(tx); @@ -723,12 +721,14 @@ kibnal_check_sends (kib_conn_t *conn) continue; } - kibnal_pack_msg(tx->tx_msg, conn->ibc_outstanding_credits, + kibnal_pack_msg(tx->tx_msg, conn->ibc_version, + conn->ibc_outstanding_credits, conn->ibc_peer->ibp_nid, conn->ibc_incarnation); conn->ibc_outstanding_credits = 0; conn->ibc_nsends_posted++; - conn->ibc_credits--; + if (consume_credit) + conn->ibc_credits--; tx->tx_sending = tx->tx_nsp; tx->tx_passive_rdma_wait = tx->tx_passive_rdma; @@ -747,19 +747,22 @@ kibnal_check_sends (kib_conn_t *conn) tx->tx_status = 0; /* Driver only accepts 1 item at a time */ for (i = 0; i < tx->tx_nsp; i++) { - rc = ib_send (conn->ibc_qp, &tx->tx_sp[i], 1); + rc = 
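The credit tests above encode the classic deadlock guard for credit-based flow control: a sender must never spend its last credit on a message that returns none, or both peers can end up at zero credits with nothing in flight to replenish them. In summary:

    /* With consume_credit set:
     *   ibc_credits == 0                  -> stall (peer has no buffer)
     *   ibc_credits == 1 &&
     *   ibc_outstanding_credits == 0      -> stall: the final credit is
     *      reserved for a message that gives credits back (e.g. the
     *      NOOP), so the flow can always restart.
     * Messages on ibc_tx_queue_nocred bypass both tests: the peer
     * pre-reserved their buffers. */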
kibnal_ib_send(conn->ibc_qp, &tx->tx_sp[i]); if (rc != 0) break; nwork++; } } + conn->ibc_last_send = jiffies; + spin_lock_irqsave (&conn->ibc_lock, flags); if (rc != 0) { /* NB credits are transferred in the actual * message, which can only be the last work item */ conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits; - conn->ibc_credits++; + if (consume_credit) + conn->ibc_credits++; conn->ibc_nsends_posted--; tx->tx_status = rc; @@ -773,11 +776,11 @@ kibnal_check_sends (kib_conn_t *conn) spin_unlock_irqrestore (&conn->ibc_lock, flags); if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) - CERROR ("Error %d posting transmit to "LPX64"\n", - rc, conn->ibc_peer->ibp_nid); + CERROR ("Error %d posting transmit to %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); else - CDEBUG (D_NET, "Error %d posting transmit to " - LPX64"\n", rc, conn->ibc_peer->ibp_nid); + CDEBUG (D_NET, "Error %d posting transmit to %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); kibnal_close_conn (conn, rc); @@ -820,10 +823,7 @@ kibnal_tx_callback (struct ib_cq_entry *e) if (idle) list_del(&tx->tx_list); - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); + kibnal_conn_addref(conn); if (tx->tx_sending == 0) conn->ibc_nsends_posted--; @@ -838,19 +838,20 @@ kibnal_tx_callback (struct ib_cq_entry *e) kibnal_tx_done (tx); if (e->status != IB_COMPLETION_STATUS_SUCCESS) { - CERROR ("Tx completion to "LPX64" failed: %d\n", - conn->ibc_peer->ibp_nid, e->status); + CDEBUG (D_NETERROR, "Tx completion to %s failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), e->status); kibnal_close_conn (conn, -ENETDOWN); } else { + kibnal_peer_alive(conn->ibc_peer); /* can I shovel some more sends out the door? 
*/ kibnal_check_sends(conn); } - kibnal_put_conn (conn); + kibnal_conn_decref(conn); } void -kibnal_callback (struct ib_cq *cq, struct ib_cq_entry *e, void *arg) +kibnal_callback (ib_cq_t *cq, struct ib_cq_entry *e, void *arg) { if (kibnal_wreqid_is_rx(e->work_request_id)) kibnal_rx_callback (e); @@ -921,7 +922,7 @@ kibnal_schedule_active_connect_locked (kib_peer_t *peer) /* Called with exclusive kib_global_lock */ peer->ibp_connecting++; - atomic_inc (&peer->ibp_refcount); /* extra ref for connd */ + kibnal_peer_addref(peer); /* extra ref for connd */ spin_lock (&kibnal_data.kib_connd_lock); @@ -934,11 +935,13 @@ kibnal_schedule_active_connect_locked (kib_peer_t *peer) } void -kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) +kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid) { unsigned long flags; kib_peer_t *peer; kib_conn_t *conn; + int retry; + int rc; rwlock_t *g_lock = &kibnal_data.kib_global_lock; /* If I get here, I've committed to send, so I complete the tx with @@ -947,55 +950,65 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ LASSERT (tx->tx_nsp > 0); /* work items have been set up */ - read_lock_irqsave(g_lock, flags); + for (retry = 0; ; retry = 1) { + read_lock_irqsave(g_lock, flags); - peer = kibnal_find_peer_locked (nid); - if (peer == NULL) { - read_unlock_irqrestore(g_lock, flags); - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); - return; - } - - conn = kibnal_find_conn_locked (peer); - if (conn != NULL) { - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ - read_unlock_irqrestore(g_lock, flags); + peer = kibnal_find_peer_locked (nid); + if (peer != NULL) { + conn = kibnal_find_conn_locked (peer); + if (conn != NULL) { + kibnal_conn_addref(conn); /* 1 ref for me...*/ + read_unlock_irqrestore(g_lock, flags); - kibnal_queue_tx (tx, conn); - return; - } - - /* Making one or more connections; I'll need a write lock... */ - read_unlock(g_lock); - write_lock(g_lock); + kibnal_queue_tx (tx, conn); + kibnal_conn_decref(conn); /* ...until here */ + return; + } + } + + /* Making one or more connections; I'll need a write lock... */ + read_unlock(g_lock); + write_lock(g_lock); - peer = kibnal_find_peer_locked (nid); - if (peer == NULL) { + peer = kibnal_find_peer_locked (nid); + if (peer != NULL) + break; + write_unlock_irqrestore (g_lock, flags); - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); - return; + + if (retry) { + CERROR("Can't find peer %s\n", libcfs_nid2str(nid)); + tx->tx_status = -EHOSTUNREACH; + kibnal_tx_done (tx); + return; + } + + rc = kibnal_add_persistent_peer(nid, LNET_NIDADDR(nid), + lnet_acceptor_port()); + if (rc != 0) { + CERROR("Can't add peer %s: %d\n", + libcfs_nid2str(nid), rc); + tx->tx_status = rc; + kibnal_tx_done(tx); + return; + } } conn = kibnal_find_conn_locked (peer); if (conn != NULL) { /* Connection exists; queue message on it */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ + kibnal_conn_addref(conn); /* +1 ref from me... 
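kibnal_launch_tx() can now create the peer on first use because an openib NID embeds the interface's IPv4 address: kibnal_startup() builds the local NID with LNET_MKNID(net, ip), and LNET_NIDADDR(nid) recovers the address of any peer. Schematically:

    /* NID <-> address round trip:
     *   startup: ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
     *   connect: ip = LNET_NIDADDR(nid);
     * so kibnal_add_persistent_peer() needs only the NID plus the
     * well-known acceptor port from lnet_acceptor_port(). */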
*/ write_unlock_irqrestore (g_lock, flags); kibnal_queue_tx (tx, conn); + kibnal_conn_decref(conn); /* ...until here */ return; } - if (peer->ibp_connecting == 0) { - if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) { + if (peer->ibp_connecting == 0 && + peer->ibp_accepting == 0) { + if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */ + time_after_eq(jiffies, peer->ibp_reconnect_time))) { write_unlock_irqrestore (g_lock, flags); tx->tx_status = -EHOSTUNREACH; kibnal_tx_done (tx); @@ -1011,11 +1024,27 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) write_unlock_irqrestore (g_lock, flags); } -ptl_err_t -kibnal_start_passive_rdma (int type, ptl_nid_t nid, - lib_msg_t *libmsg, ptl_hdr_t *hdr) +void +kibnal_txlist_done (struct list_head *txlist, int status) +{ + kib_tx_t *tx; + + while (!list_empty(txlist)) { + tx = list_entry (txlist->next, kib_tx_t, tx_list); + + list_del (&tx->tx_list); + /* complete now */ + tx->tx_status = status; + kibnal_tx_done (tx); + } +} + +int +kibnal_start_passive_rdma (int type, lnet_msg_t *lntmsg, + int niov, struct iovec *iov, lnet_kiov_t *kiov, + int nob) { - int nob = libmsg->md->length; + lnet_nid_t nid = lntmsg->msg_target.nid; kib_tx_t *tx; kib_msg_t *ibmsg; int rc; @@ -1033,32 +1062,33 @@ kibnal_start_passive_rdma (int type, ptl_nid_t nid, IB_ACCESS_LOCAL_WRITE; } - tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */ - LASSERT (tx != NULL); + tx = kibnal_get_idle_tx (); + if (tx == NULL) { + CERROR("Can't allocate %s txd for %s\n", + (type == IBNAL_MSG_PUT_RDMA) ? "PUT/REPLY" : "GET", + libcfs_nid2str(nid)); + return -ENOMEM; + } - if ((libmsg->md->options & PTL_MD_KIOV) == 0) - rc = kibnal_map_iov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.iov, - 0, nob); + + if (iov != NULL) + rc = kibnal_map_iov (tx, access, niov, iov, 0, nob); else - rc = kibnal_map_kiov (tx, access, - libmsg->md->md_niov, - libmsg->md->md_iov.kiov, - 0, nob); + rc = kibnal_map_kiov (tx, access, niov, kiov, 0, nob); if (rc != 0) { - CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc); + CERROR ("Can't map RDMA for %s: %d\n", + libcfs_nid2str(nid), rc); goto failed; } if (type == IBNAL_MSG_GET_RDMA) { /* reply gets finalized when tx completes */ - tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, - nid, libmsg); - if (tx->tx_libmsg[1] == NULL) { - CERROR ("Can't create reply for GET -> "LPX64"\n", - nid); + tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni, + lntmsg); + if (tx->tx_lntmsg[1] == NULL) { + CERROR ("Can't create reply for GET -> %s\n", + libcfs_nid2str(nid)); rc = -ENOMEM; goto failed; } @@ -1068,7 +1098,7 @@ kibnal_start_passive_rdma (int type, ptl_nid_t nid, ibmsg = tx->tx_msg; - ibmsg->ibm_u.rdma.ibrm_hdr = *hdr; + ibmsg->ibm_u.rdma.ibrm_hdr = lntmsg->msg_hdr; ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie; ibmsg->ibm_u.rdma.ibrm_desc.rd_key = tx->tx_md.md_rkey; ibmsg->ibm_u.rdma.ibrm_desc.rd_addr = tx->tx_md.md_addr; @@ -1081,24 +1111,24 @@ kibnal_start_passive_rdma (int type, ptl_nid_t nid, tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey, tx->tx_md.md_addr, nob); - /* libmsg gets finalized when tx completes. */ - tx->tx_libmsg[0] = libmsg; + /* lntmsg gets finalized when tx completes. 
*/ + tx->tx_lntmsg[0] = lntmsg; kibnal_launch_tx(tx, nid); - return (PTL_OK); + return (0); failed: tx->tx_status = rc; kibnal_tx_done (tx); - return (PTL_FAIL); + return (-EIO); } void kibnal_start_active_rdma (int type, int status, - kib_rx_t *rx, lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, ptl_kiov_t *kiov, - int offset, int nob) + kib_rx_t *rx, lnet_msg_t *lntmsg, + unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + int offset, int nob) { kib_msg_t *rxmsg = rx->rx_msg; kib_msg_t *txmsg; @@ -1122,12 +1152,6 @@ kibnal_start_active_rdma (int type, int status, LASSERT (type == IBNAL_MSG_GET_DONE || type == IBNAL_MSG_PUT_DONE); - /* Flag I'm completing the RDMA. Even if I fail to send the - * completion message, I will have tried my best so further - * attempts shouldn't be tried. */ - LASSERT (!rx->rx_rdma); - rx->rx_rdma = 1; - if (type == IBNAL_MSG_GET_DONE) { access = 0; rdma_op = IB_OP_RDMA_WRITE; @@ -1138,12 +1162,12 @@ kibnal_start_active_rdma (int type, int status, LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA); } - tx = kibnal_get_idle_tx (0); /* Mustn't block */ + tx = kibnal_get_idle_tx (); if (tx == NULL) { - CERROR ("tx descs exhausted on RDMA from "LPX64 + CERROR ("tx descs exhausted on RDMA from %s" " completing locally with failure\n", - rx->rx_conn->ibc_peer->ibp_nid); - lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE); + libcfs_nid2str(rx->rx_conn->ibc_peer->ibp_nid)); + lnet_finalize (kibnal_data.kib_ni, lntmsg, -ENOMEM); return; } LASSERT (tx->tx_nsp == 0); @@ -1161,8 +1185,9 @@ kibnal_start_active_rdma (int type, int status, niov, iov, offset, nob); if (rc != 0) { - CERROR ("Can't map RDMA -> "LPX64": %d\n", - rx->rx_conn->ibc_peer->ibp_nid, rc); + CERROR ("Can't map RDMA -> %s: %d\n", + libcfs_nid2str(rx->rx_conn->ibc_peer->ibp_nid), + rc); /* We'll skip the RDMA and complete with failure. */ status = rc; nob = 0; @@ -1201,53 +1226,45 @@ kibnal_start_active_rdma (int type, int status, if (status == 0 && nob != 0) { LASSERT (tx->tx_nsp > 1); - /* RDMA: libmsg gets finalized when the tx completes. This + /* RDMA: lntmsg gets finalized when the tx completes. This * is after the completion message has been sent, which in * turn is after the RDMA has finished. */ - tx->tx_libmsg[0] = libmsg; + tx->tx_lntmsg[0] = lntmsg; } else { LASSERT (tx->tx_nsp == 1); /* No RDMA: local completion happens now! */ CDEBUG(D_NET, "No data: immediate completion\n"); - lib_finalize (&kibnal_lib, NULL, libmsg, - status == 0 ? PTL_OK : PTL_FAIL); + lnet_finalize (kibnal_data.kib_ni, lntmsg, + status == 0 ? 0 : -EIO); } - /* +1 ref for this tx... 
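Nomenclature for the two helpers above: the "passive" side maps its buffer and advertises the RDMA descriptor (rkey, addr, nob) plus a cookie in a PUT_RDMA or GET_RDMA message; the "active" side performs the transfer and reports back with the matching *_DONE completion. For a PUT (a GET mirrors it with an RDMA WRITE of the reply):

    /*   initiator: map payload -> send PUT_RDMA {desc, cookie}
     *   target   : lnet_parse() accepts -> RDMA READ from initiator
     *              -> send PUT_DONE {cookie, status}
     *   initiator: match cookie, lnet_finalize() the original lntmsg */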
*/ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - rx->rx_conn, rx->rx_conn->ibc_state, - rx->rx_conn->ibc_peer->ibp_nid, - atomic_read (&rx->rx_conn->ibc_refcount)); - atomic_inc (&rx->rx_conn->ibc_refcount); - /* ...and queue it up */ kibnal_queue_tx(tx, rx->rx_conn); } -ptl_err_t -kibnal_sendmsg(lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - struct iovec *payload_iov, - ptl_kiov_t *payload_kiov, - int payload_offset, - int payload_nob) +int +kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) { - kib_msg_t *ibmsg; - kib_tx_t *tx; - int nob; + lnet_hdr_t *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int payload_niov = lntmsg->msg_niov; + struct iovec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + kib_msg_t *ibmsg; + kib_tx_t *tx; + int nob; /* NB 'private' is different depending on what we're sending.... */ - CDEBUG(D_NET, "sending %d bytes in %d frags to nid:"LPX64" pid %d\n", - payload_nob, payload_niov, nid , pid); + CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); LASSERT (payload_nob == 0 || payload_niov > 0); - LASSERT (payload_niov <= PTL_MD_MAX_IOV); + LASSERT (payload_niov <= LNET_MAX_IOV); /* Thread context if we're sending payload */ LASSERT (!in_interrupt() || payload_niov == 0); @@ -1257,126 +1274,111 @@ kibnal_sendmsg(lib_nal_t *nal, switch (type) { default: LBUG(); - return (PTL_FAIL); - - case PTL_MSG_REPLY: { - /* reply's 'private' is the incoming receive */ - kib_rx_t *rx = private; - - /* RDMA reply expected? */ - if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) { - kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0, - rx, libmsg, payload_niov, - payload_iov, payload_kiov, - payload_offset, payload_nob); - return (PTL_OK); - } + return (-EIO); - /* Incoming message consistent with immediate reply? */ - if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { - CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n", - nid, rx->rx_msg->ibm_type); - return (PTL_FAIL); - } - - /* Will it fit in a message? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob > IBNAL_MSG_SIZE) { - CERROR("REPLY for "LPX64" too big (RDMA not requested): %d\n", - nid, payload_nob); - return (PTL_FAIL); - } - break; - } - - case PTL_MSG_GET: - /* might the REPLY message be big enough to need RDMA? */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]); - if (nob > IBNAL_MSG_SIZE) - return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, - nid, libmsg, hdr)); - break; - - case PTL_MSG_ACK: + case LNET_MSG_ACK: LASSERT (payload_nob == 0); break; - case PTL_MSG_PUT: - /* Is the payload big enough to need RDMA? */ + case LNET_MSG_GET: + if (routing || target_is_router) + break; /* send IMMEDIATE */ + + /* is the REPLY message too small for RDMA? 
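+ * For instance, assuming IBNAL_MSG_SIZE is 4KB: a GET whose REPLY
+ * payload is a few hundred bytes keeps nob under the limit and is
+ * answered inline with an IMMEDIATE message, while a 64KB REPLY
+ * overflows it and takes the passive RDMA path below.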
*/ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); + if (nob <= IBNAL_MSG_SIZE) + break; /* send IMMEDIATE */ + + if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) + return kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, lntmsg, + lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.iov, NULL, + lntmsg->msg_md->md_length); + + return kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, lntmsg, + lntmsg->msg_md->md_niov, + NULL, lntmsg->msg_md->md_iov.kiov, + lntmsg->msg_md->md_length); + + case LNET_MSG_REPLY: + case LNET_MSG_PUT: + /* Is the payload small enough not to need RDMA? */ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob > IBNAL_MSG_SIZE) - return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, - nid, libmsg, hdr)); + if (nob <= IBNAL_MSG_SIZE) + break; /* send IMMEDIATE */ - break; + return kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, lntmsg, + payload_niov, + payload_iov, payload_kiov, + payload_nob); } - tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK || - type == PTL_MSG_REPLY || - in_interrupt())); + /* Send IMMEDIATE */ + + tx = kibnal_get_idle_tx(); if (tx == NULL) { - CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", - type, nid, in_interrupt() ? " (intr)" : ""); - return (PTL_NO_SPACE); + CERROR ("Can't send %d to %s: tx descs exhausted%s\n", + type, libcfs_nid2str(target.nid), + in_interrupt() ? " (intr)" : ""); + return (-ENOMEM); } ibmsg = tx->tx_msg; ibmsg->ibm_u.immediate.ibim_hdr = *hdr; - if (payload_nob > 0) { - if (payload_kiov != NULL) - lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload, - payload_niov, payload_kiov, - payload_offset, payload_nob); - else - lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload, - payload_niov, payload_iov, - payload_offset, payload_nob); - } + if (payload_kiov != NULL) + lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_kiov, + payload_offset, payload_nob); + else + lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_iov, + payload_offset, payload_nob); kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, offsetof(kib_immediate_msg_t, ibim_payload[payload_nob])); - /* libmsg gets finalized when tx completes */ - tx->tx_libmsg[0] = libmsg; + /* lntmsg gets finalized when tx completes */ + tx->tx_lntmsg[0] = lntmsg; - kibnal_launch_tx(tx, nid); - return (PTL_OK); + kibnal_launch_tx(tx, target.nid); + return (0); } -ptl_err_t -kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int payload_niov, struct iovec *payload_iov, - size_t payload_offset, size_t payload_len) +int +kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + void **new_private) { - return (kibnal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, payload_iov, NULL, - payload_offset, payload_len)); -} + kib_rx_t *rx = private; + kib_conn_t *conn = rx->rx_conn; -ptl_err_t -kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int payload_niov, ptl_kiov_t *payload_kiov, - size_t payload_offset, size_t payload_len) -{ - return (kibnal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, NULL, payload_kiov, - payload_offset, payload_len)); + if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) { + /* Can't block if RDMA completions need normal credits 
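+ * The hazard, sketched: with the old protocol an RDMA completion
+ * message consumes an ordinary send credit, so a receive blocked here
+ * waiting for a buffer can hold up the very completions that would
+ * return credits, stalling both peers. Failing the eager receive with
+ * -EDEADLK below drops the message rather than risking that cycle.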
*/ + LCONSOLE_ERROR("Dropping message from %s: no buffers free. " + "%s is running an old version of LNET that may " + "deadlock if messages wait for buffers\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return -EDEADLK; + } + + *new_private = private; + return 0; } -ptl_err_t -kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, - unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, - int offset, int mlen, int rlen) +int +kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) { kib_rx_t *rx = private; kib_msg_t *rxmsg = rx->rx_msg; int msg_nob; + int rc = 0; LASSERT (mlen <= rlen); LASSERT (!in_interrupt ()); @@ -1386,59 +1388,58 @@ kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, switch (rxmsg->ibm_type) { default: LBUG(); - return (PTL_FAIL); - + case IBNAL_MSG_IMMEDIATE: msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); - if (msg_nob > IBNAL_MSG_SIZE) { - CERROR ("Immediate message from "LPX64" too big: %d\n", - rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen); - return (PTL_FAIL); + if (msg_nob > rx->rx_nob) { + CERROR ("Immediate message from %s too big: %d(%d)\n", + libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), + msg_nob, rx->rx_nob); + rc = -EPROTO; + break; } if (kiov != NULL) - lib_copy_buf2kiov(niov, kiov, offset, - rxmsg->ibm_u.immediate.ibim_payload, - mlen); + lnet_copy_flat2kiov( + niov, kiov, offset, + IBNAL_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); else - lib_copy_buf2iov(niov, iov, offset, - rxmsg->ibm_u.immediate.ibim_payload, - mlen); + lnet_copy_flat2iov( + niov, iov, offset, + IBNAL_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); - lib_finalize (nal, NULL, libmsg, PTL_OK); - return (PTL_OK); + lnet_finalize (ni, lntmsg, 0); + break; case IBNAL_MSG_GET_RDMA: - /* We get called here just to discard any junk after the - * GET hdr.
*/ - LASSERT (libmsg == NULL); - lib_finalize (nal, NULL, libmsg, PTL_OK); - return (PTL_OK); + if (lntmsg != NULL) { + /* GET matched: RDMA lntmsg's payload */ + kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0, + rx, lntmsg, + lntmsg->msg_niov, + lntmsg->msg_iov, + lntmsg->msg_kiov, + lntmsg->msg_offset, + lntmsg->msg_len); + } else { + /* GET didn't match anything */ + kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -ENODATA, + rx, NULL, 0, NULL, NULL, 0, 0); + } + break; case IBNAL_MSG_PUT_RDMA: - kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, - rx, libmsg, + kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, rx, lntmsg, niov, iov, kiov, offset, mlen); - return (PTL_OK); + break; } -} -ptl_err_t -kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen, size_t rlen) -{ - return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL, - offset, mlen, rlen)); -} - -ptl_err_t -kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) -{ - return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov, - offset, mlen, rlen)); + kibnal_post_rx(rx, 1, 0); + return rc; } int @@ -1460,6 +1461,40 @@ kibnal_thread_fini (void) } void +kibnal_peer_alive (kib_peer_t *peer) +{ + /* This is racy, but everyone's only writing cfs_time_current() */ + peer->ibp_last_alive = cfs_time_current(); + mb(); +} + +void +kibnal_peer_notify (kib_peer_t *peer) +{ + time_t last_alive = 0; + int error = 0; + unsigned long flags; + + read_lock_irqsave(&kibnal_data.kib_global_lock, flags); + + if (list_empty(&peer->ibp_conns) && + peer->ibp_accepting == 0 && + peer->ibp_connecting == 0 && + peer->ibp_error != 0) { + error = peer->ibp_error; + peer->ibp_error = 0; + last_alive = cfs_time_current_sec() - + cfs_duration_sec(cfs_time_current() - + peer->ibp_last_alive); + } + + read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + + if (error != 0) + lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive); +} + +void kibnal_close_conn_locked (kib_conn_t *conn, int error) { /* This just does the immmediate housekeeping, and schedules the @@ -1467,8 +1502,9 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error) * Caller holds kib_global_lock exclusively in irq context */ kib_peer_t *peer = conn->ibc_peer; - CDEBUG (error == 0 ? D_NET : D_ERROR, - "closing conn to "LPX64": error %d\n", peer->ibp_nid, error); + CDEBUG (error == 0 ? 
D_NET : D_NETERROR, + "closing conn to %s: error %d\n", + libcfs_nid2str(peer->ibp_nid), error); LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED || conn->ibc_state == IBNAL_CONN_CONNECTING); @@ -1478,16 +1514,15 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error) list_del (&conn->ibc_list); } else { /* new ref for kib_reaper_conns */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); + kibnal_conn_addref(conn); } - if (list_empty (&peer->ibp_conns) && /* no more conns */ - peer->ibp_persistence == 0 && /* non-persistent peer */ - kibnal_peer_active(peer)) { /* still in peer table */ - kibnal_unlink_peer_locked (peer); + if (list_empty (&peer->ibp_conns)) { /* no more conns */ + if (peer->ibp_persistence == 0 && /* non-persistent peer */ + kibnal_peer_active(peer)) /* still in peer table */ + kibnal_unlink_peer_locked (peer); + + peer->ibp_error = error; /* set/clear error on last conn */ } conn->ibc_state = IBNAL_CONN_DEATHROW; @@ -1521,21 +1556,25 @@ kibnal_close_conn (kib_conn_t *conn, int why) } void -kibnal_peer_connect_failed (kib_peer_t *peer, int rc) +kibnal_peer_connect_failed (kib_peer_t *peer, int active, int error) { LIST_HEAD (zombies); - kib_tx_t *tx; unsigned long flags; - LASSERT (rc != 0); - LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL); + LASSERT(error != 0); write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - LASSERT (peer->ibp_connecting != 0); - peer->ibp_connecting--; + if (active) { + LASSERT (peer->ibp_connecting != 0); + peer->ibp_connecting--; + } else { + LASSERT (peer->ibp_accepting != 0); + peer->ibp_accepting--; + } - if (peer->ibp_connecting != 0) { + if (peer->ibp_connecting != 0 || + peer->ibp_accepting != 0) { /* another connection attempt under way... 
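 * Both an active connect (ibp_connecting) and a passive accept
 * (ibp_accepting) may be in flight at once; the cleanup below must
 * only run when the last attempt fails, so a failure while the other
 * attempt is still pending simply returns here.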
*/ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); return; @@ -1543,26 +1582,29 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int rc) if (list_empty(&peer->ibp_conns)) { /* Say when active connection can be re-attempted */ - peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval; - /* Increase reconnection interval */ - peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2, - IBNAL_MAX_RECONNECT_INTERVAL); + peer->ibp_reconnect_interval *= 2; + peer->ibp_reconnect_interval = + MAX(peer->ibp_reconnect_interval, + *kibnal_tunables.kib_min_reconnect_interval); + peer->ibp_reconnect_interval = + MIN(peer->ibp_reconnect_interval, + *kibnal_tunables.kib_max_reconnect_interval); + + peer->ibp_reconnect_time = jiffies + + peer->ibp_reconnect_interval * HZ; - /* Take peer's blocked blocked transmits; I'll complete + /* Take peer's blocked transmits; I'll complete * them with error */ - while (!list_empty (&peer->ibp_tx_queue)) { - tx = list_entry (peer->ibp_tx_queue.next, - kib_tx_t, tx_list); - - list_del (&tx->tx_list); - list_add_tail (&tx->tx_list, &zombies); - } + list_add(&zombies, &peer->ibp_tx_queue); + list_del_init(&peer->ibp_tx_queue); if (kibnal_peer_active(peer) && (peer->ibp_persistence == 0)) { /* failed connection attempt on non-persistent peer */ kibnal_unlink_peer_locked (peer); } + + peer->ibp_error = error; } else { /* Can't have blocked transmits if there are connections */ LASSERT (list_empty(&peer->ibp_tx_queue)); @@ -1570,22 +1612,17 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int rc) write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + kibnal_peer_notify(peer); + if (!list_empty (&zombies)) - CERROR ("Deleting messages for "LPX64": connection failed\n", - peer->ibp_nid); - - while (!list_empty (&zombies)) { - tx = list_entry (zombies.next, kib_tx_t, tx_list); + CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n", + libcfs_nid2str(peer->ibp_nid)); - list_del (&tx->tx_list); - /* complete now */ - tx->tx_status = -EHOSTUNREACH; - kibnal_tx_done (tx); - } + kibnal_txlist_done(&zombies, -EHOSTUNREACH); } void -kibnal_connreq_done (kib_conn_t *conn, int status) +kibnal_connreq_done (kib_conn_t *conn, int active, int status) { int state = conn->ibc_state; kib_peer_t *peer = conn->ibc_peer; @@ -1595,7 +1632,7 @@ kibnal_connreq_done (kib_conn_t *conn, int status) int i; if (conn->ibc_connreq != NULL) { - PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); + LIBCFS_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); conn->ibc_connreq = NULL; } @@ -1628,24 +1665,29 @@ kibnal_connreq_done (kib_conn_t *conn, int status) write_lock_irqsave (&kibnal_data.kib_global_lock, flags); - LASSERT (peer->ibp_connecting != 0); + if (active) + LASSERT (peer->ibp_connecting != 0); + else + LASSERT (peer->ibp_accepting != 0); if (status == 0 && /* connection established */ kibnal_peer_active(peer)) { /* peer not deleted */ - peer->ibp_connecting--; + if (active) + peer->ibp_connecting--; + else + peer->ibp_accepting--; + + conn->ibc_last_send = jiffies; conn->ibc_state = IBNAL_CONN_ESTABLISHED; + kibnal_peer_alive(peer); /* +1 ref for ibc_list; caller(== CM)'s ref remains until * the IB_CM_IDLE callback */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); + kibnal_conn_addref(conn); list_add (&conn->ibc_list, &peer->ibp_conns); - - /* reset reconnect interval for next attempt */ - 
peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + + peer->ibp_reconnect_interval = 0; /* OK to reconnect at any time */ /* post blocked sends to the new connection */ spin_lock (&conn->ibc_lock); @@ -1656,11 +1698,6 @@ kibnal_connreq_done (kib_conn_t *conn, int status) list_del (&tx->tx_list); - /* +1 ref for each tx */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); kibnal_queue_tx_locked (tx, conn); } @@ -1675,16 +1712,13 @@ kibnal_connreq_done (kib_conn_t *conn, int status) /* queue up all the receives */ for (i = 0; i < IBNAL_RX_MSGS; i++) { /* +1 ref for rx desc */ - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_inc (&conn->ibc_refcount); + kibnal_conn_addref(conn); CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n", i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg, conn->ibc_rxs[i].rx_vaddr); - kibnal_post_rx (&conn->ibc_rxs[i], 0); + kibnal_post_rx (&conn->ibc_rxs[i], 0, 0); } kibnal_check_sends (conn); @@ -1703,12 +1737,12 @@ kibnal_connreq_done (kib_conn_t *conn, int status) write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - kibnal_peer_connect_failed (conn->ibc_peer, status); + kibnal_peer_connect_failed (conn->ibc_peer, active, status); } int -kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid, - kib_msg_t *msg, int nob) +kibnal_accept_connreq (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid, + kib_msg_t *msg, int nob) { kib_conn_t *conn; kib_peer_t *peer; @@ -1716,23 +1750,24 @@ kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid, unsigned long flags; int rc; - rc = kibnal_unpack_msg(msg, nob); + rc = kibnal_unpack_msg(msg, 0, nob); if (rc != 0) { CERROR("Can't unpack connreq msg: %d\n", rc); return -EPROTO; } - CDEBUG(D_NET, "connreq from "LPX64"\n", msg->ibm_srcnid); + CDEBUG(D_NET, "connreq from %s\n", libcfs_nid2str(msg->ibm_srcnid)); if (msg->ibm_type != IBNAL_MSG_CONNREQ) { - CERROR("Unexpected connreq msg type: %x from "LPX64"\n", - msg->ibm_type, msg->ibm_srcnid); + CERROR("Unexpected connreq msg type: %x from %s\n", + msg->ibm_type, libcfs_nid2str(msg->ibm_srcnid)); return -EPROTO; } if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) { - CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n", - msg->ibm_srcnid, msg->ibm_u.connparams.ibcp_queue_depth, + CERROR("Can't accept %s: bad queue depth %d (%d expected)\n", + libcfs_nid2str(msg->ibm_srcnid), + msg->ibm_u.connparams.ibcp_queue_depth, IBNAL_MSG_QUEUE_SIZE); return (-EPROTO); } @@ -1742,13 +1777,9 @@ kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid, return (-ENOMEM); /* assume 'nid' is a new peer */ - peer = kibnal_create_peer (msg->ibm_srcnid); - if (peer == NULL) { - CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, conn->ibc_peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); - atomic_dec (&conn->ibc_refcount); - kibnal_destroy_conn(conn); + rc = kibnal_create_peer(&peer, msg->ibm_srcnid); + if (rc != 0) { + kibnal_conn_decref(conn); return (-ENOMEM); } @@ -1758,31 +1789,47 @@ kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid, * NB If my incarnation changes after this, the peer will get nuked and * we'll spot that when the connection is finally added into the peer's * connlist */ - if (msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid || + if 
(!lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid, + msg->ibm_dstnid) || msg->ibm_dststamp != kibnal_data.kib_incarnation) { write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); - CERROR("Stale connection params from "LPX64"\n", - msg->ibm_srcnid); - atomic_dec(&conn->ibc_refcount); - kibnal_destroy_conn(conn); - kibnal_put_peer(peer); + CERROR("Stale connection params from %s\n", + libcfs_nid2str(msg->ibm_srcnid)); + kibnal_conn_decref(conn); + kibnal_peer_decref(peer); return -ESTALE; } peer2 = kibnal_find_peer_locked(msg->ibm_srcnid); if (peer2 == NULL) { + /* Brand new peer */ + LASSERT (peer->ibp_accepting == 0); + /* peer table takes my ref on peer */ list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(msg->ibm_srcnid)); } else { - kibnal_put_peer (peer); + /* tie-break connection race in favour of the higher NID */ + if (peer2->ibp_connecting != 0 && + msg->ibm_srcnid < kibnal_data.kib_ni->ni_nid) { + write_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); + CWARN("Conn race %s\n", + libcfs_nid2str(peer2->ibp_nid)); + + kibnal_conn_decref(conn); + kibnal_peer_decref(peer); + return -EALREADY; + } + + kibnal_peer_decref(peer); peer = peer2; } /* +1 ref for conn */ - atomic_inc (&peer->ibp_refcount); - peer->ibp_connecting++; + kibnal_peer_addref(peer); + peer->ibp_accepting++; write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); @@ -1791,6 +1838,8 @@ kibnal_accept (kib_conn_t **connp, tTS_IB_CM_COMM_ID cid, conn->ibc_comm_id = cid; conn->ibc_incarnation = msg->ibm_srcstamp; conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; + conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE; + conn->ibc_version = msg->ibm_version; *connp = conn; return (0); @@ -1807,39 +1856,72 @@ kibnal_bad_conn_callback (tTS_IB_CM_EVENT event, return TS_IB_CM_CALLBACK_PROCEED; } -tTS_IB_CM_CALLBACK_RETURN -kibnal_conn_callback (tTS_IB_CM_EVENT event, - tTS_IB_CM_COMM_ID cid, - void *param, - void *arg) +void +kibnal_abort_txs (kib_conn_t *conn, struct list_head *txs) { - kib_conn_t *conn = arg; LIST_HEAD (zombies); struct list_head *tmp; struct list_head *nxt; kib_tx_t *tx; unsigned long flags; - int done; + + spin_lock_irqsave (&conn->ibc_lock, flags); + + list_for_each_safe (tmp, nxt, txs) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + if (txs == &conn->ibc_active_txs) { + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + } else { + LASSERT (!tx->tx_passive_rdma_wait); + LASSERT (tx->tx_sending == 0); + } + + tx->tx_status = -ECONNABORTED; + tx->tx_passive_rdma_wait = 0; + + if (tx->tx_sending == 0) { + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + } + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + kibnal_txlist_done (&zombies, -ECONNABORTED); +} + +tTS_IB_CM_CALLBACK_RETURN +kibnal_conn_callback (tTS_IB_CM_EVENT event, + tTS_IB_CM_COMM_ID cid, + void *param, + void *arg) +{ + kib_conn_t *conn = arg; int rc; /* Established Connection Notifier */ switch (event) { default: - CERROR("Connection %p -> "LPX64" ERROR %d\n", - conn, conn->ibc_peer->ibp_nid, event); + CDEBUG(D_NETERROR, "Connection %p -> %s ERROR %d\n", + conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), event); kibnal_close_conn (conn, -ECONNABORTED); break; case TS_IB_CM_DISCONNECTED: - CWARN("Connection %p -> "LPX64" DISCONNECTED.\n", - conn, conn->ibc_peer->ibp_nid); + CDEBUG(D_NETERROR, "Connection %p -> %s DISCONNECTED.\n", + conn, libcfs_nid2str(conn->ibc_peer->ibp_nid)); kibnal_close_conn (conn, 0); break; case 
TS_IB_CM_IDLE: - CDEBUG(D_NET, "Connection %p -> "LPX64" IDLE.\n", - conn, conn->ibc_peer->ibp_nid); + CDEBUG(D_NET, "Connection %p -> %s IDLE.\n", + conn, libcfs_nid2str(conn->ibc_peer->ibp_nid)); /* LASSERT (no further callbacks) */ rc = tsIbCmCallbackModify(cid, kibnal_bad_conn_callback, conn); @@ -1849,51 +1931,12 @@ kibnal_conn_callback (tTS_IB_CM_EVENT event, * completing outstanding passive RDMAs so we can be sure * the network can't touch the mapped memory any more. */ - spin_lock_irqsave (&conn->ibc_lock, flags); - - /* grab passive RDMAs not waiting for the tx callback */ - list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) { - tx = list_entry (tmp, kib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); - - /* still waiting for tx callback? */ - if (!tx->tx_passive_rdma_wait) - continue; - - tx->tx_status = -ECONNABORTED; - tx->tx_passive_rdma_wait = 0; - done = (tx->tx_sending == 0); - - if (!done) - continue; - - list_del (&tx->tx_list); - list_add (&tx->tx_list, &zombies); - } - - /* grab all blocked transmits */ - list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) { - tx = list_entry (tmp, kib_tx_t, tx_list); - - list_del (&tx->tx_list); - list_add (&tx->tx_list, &zombies); - } + kibnal_abort_txs(conn, &conn->ibc_tx_queue); + kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd); + kibnal_abort_txs(conn, &conn->ibc_tx_queue_nocred); + kibnal_abort_txs(conn, &conn->ibc_active_txs); - spin_unlock_irqrestore (&conn->ibc_lock, flags); - - while (!list_empty(&zombies)) { - tx = list_entry (zombies.next, kib_tx_t, tx_list); - - list_del(&tx->tx_list); - kibnal_tx_done (tx); - } - - kibnal_put_conn (conn); /* Lose CM's ref */ + kibnal_conn_decref(conn); /* Lose CM's ref */ break; } @@ -1902,9 +1945,9 @@ kibnal_conn_callback (tTS_IB_CM_EVENT event, tTS_IB_CM_CALLBACK_RETURN kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, - tTS_IB_CM_COMM_ID cid, - void *param, - void *arg) + tTS_IB_CM_COMM_ID cid, + void *param, + void *arg) { kib_conn_t *conn = arg; int rc; @@ -1917,11 +1960,11 @@ kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, return TS_IB_CM_CALLBACK_ABORT; } - CERROR ("%s event %p -> "LPX64": %d\n", + CERROR ("%s event %p -> %s: %d\n", (event == TS_IB_CM_IDLE) ? 
"IDLE" : "Unexpected", - conn, conn->ibc_peer->ibp_nid, event); - kibnal_connreq_done(conn, -ECONNABORTED); - kibnal_put_conn(conn); /* drop CM's ref */ + conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), event); + kibnal_connreq_done(conn, 0, -ECONNABORTED); + kibnal_conn_decref(conn); /* drop CM's ref */ return TS_IB_CM_CALLBACK_ABORT; case TS_IB_CM_REQ_RECEIVED: { @@ -1931,13 +1974,13 @@ kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, LASSERT (conn == NULL); /* Don't really know srcnid until successful unpack */ - CDEBUG(D_NET, "REQ from ?"LPX64"?\n", msg->ibm_srcnid); + CDEBUG(D_NET, "REQ from ?%s?\n", libcfs_nid2str(msg->ibm_srcnid)); - rc = kibnal_accept(&conn, cid, msg, - req->remote_private_data_len); + rc = kibnal_accept_connreq(&conn, cid, msg, + req->remote_private_data_len); if (rc != 0) { - CERROR ("Can't accept ?"LPX64"?: %d\n", - msg->ibm_srcnid, rc); + CERROR ("Can't accept ?%s?: %d\n", + libcfs_nid2str(msg->ibm_srcnid), rc); return TS_IB_CM_CALLBACK_ABORT; } @@ -1951,7 +1994,7 @@ kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE; - kibnal_pack_msg(msg, 0, + kibnal_pack_msg(msg, conn->ibc_version, 0, conn->ibc_peer->ibp_nid, conn->ibc_incarnation); @@ -1968,19 +2011,19 @@ kibnal_passive_conn_callback (tTS_IB_CM_EVENT event, case TS_IB_CM_ESTABLISHED: LASSERT (conn != NULL); - CWARN("Connection %p -> "LPX64" ESTABLISHED.\n", - conn, conn->ibc_peer->ibp_nid); + CWARN("Connection %p -> %s ESTABLISHED.\n", + conn, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kibnal_connreq_done(conn, 0); + kibnal_connreq_done(conn, 0, 0); return TS_IB_CM_CALLBACK_PROCEED; } } tTS_IB_CM_CALLBACK_RETURN kibnal_active_conn_callback (tTS_IB_CM_EVENT event, - tTS_IB_CM_COMM_ID cid, - void *param, - void *arg) + tTS_IB_CM_COMM_ID cid, + void *param, + void *arg) { kib_conn_t *conn = arg; unsigned long flags; @@ -1992,75 +2035,79 @@ kibnal_active_conn_callback (tTS_IB_CM_EVENT event, int nob = rep->remote_private_data_len; int rc; - rc = kibnal_unpack_msg(msg, nob); + rc = kibnal_unpack_msg(msg, conn->ibc_version, nob); if (rc != 0) { - CERROR ("Error %d unpacking conn ack from "LPX64"\n", - rc, conn->ibc_peer->ibp_nid); - kibnal_connreq_done(conn, rc); - kibnal_put_conn(conn); /* drop CM's ref */ + CERROR ("Error %d unpacking conn ack from %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kibnal_connreq_done(conn, 1, rc); + kibnal_conn_decref(conn); /* drop CM's ref */ return TS_IB_CM_CALLBACK_ABORT; } if (msg->ibm_type != IBNAL_MSG_CONNACK) { - CERROR ("Unexpected conn ack type %d from "LPX64"\n", - msg->ibm_type, conn->ibc_peer->ibp_nid); - kibnal_connreq_done(conn, -EPROTO); - kibnal_put_conn(conn); /* drop CM's ref */ + CERROR ("Unexpected conn ack type %d from %s\n", + msg->ibm_type, + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kibnal_connreq_done(conn, 1, -EPROTO); + kibnal_conn_decref(conn); /* drop CM's ref */ return TS_IB_CM_CALLBACK_ABORT; } - if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || + if (!lnet_ptlcompat_matchnid(conn->ibc_peer->ibp_nid, + msg->ibm_srcnid) || + !lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid, + msg->ibm_dstnid) || msg->ibm_srcstamp != conn->ibc_incarnation || - msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid || msg->ibm_dststamp != kibnal_data.kib_incarnation) { - CERROR("Stale conn ack from "LPX64"\n", - conn->ibc_peer->ibp_nid); - kibnal_connreq_done(conn, -ESTALE); - kibnal_put_conn(conn); /* drop CM's ref */ + CERROR("Stale conn ack from %s\n", + 
libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kibnal_connreq_done(conn, 1, -ESTALE); + kibnal_conn_decref(conn); /* drop CM's ref */ return TS_IB_CM_CALLBACK_ABORT; } if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) { - CERROR ("Bad queue depth %d from "LPX64"\n", + CERROR ("Bad queue depth %d from %s\n", msg->ibm_u.connparams.ibcp_queue_depth, - conn->ibc_peer->ibp_nid); - kibnal_connreq_done(conn, -EPROTO); - kibnal_put_conn(conn); /* drop CM's ref */ + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + kibnal_connreq_done(conn, 1, -EPROTO); + kibnal_conn_decref(conn); /* drop CM's ref */ return TS_IB_CM_CALLBACK_ABORT; } - CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n", - conn, conn->ibc_peer->ibp_nid); + CDEBUG(D_NET, "Connection %p -> %s REP_RECEIVED.\n", + conn, libcfs_nid2str(conn->ibc_peer->ibp_nid)); conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; + conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE; return TS_IB_CM_CALLBACK_PROCEED; } case TS_IB_CM_ESTABLISHED: - CWARN("Connection %p -> "LPX64" ESTABLISHED\n", - conn, conn->ibc_peer->ibp_nid); + CWARN("Connection %p -> %s ESTABLISHED\n", + conn, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kibnal_connreq_done(conn, 0); + kibnal_connreq_done(conn, 1, 0); return TS_IB_CM_CALLBACK_PROCEED; case TS_IB_CM_IDLE: - CERROR("Connection %p -> "LPX64" IDLE\n", - conn, conn->ibc_peer->ibp_nid); + CDEBUG(D_NETERROR, "Connection %p -> %s IDLE\n", + conn, libcfs_nid2str(conn->ibc_peer->ibp_nid)); /* I assume this connection attempt was rejected because the * peer found a stale QP; I'll just try again */ write_lock_irqsave(&kibnal_data.kib_global_lock, flags); kibnal_schedule_active_connect_locked(conn->ibc_peer); write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - kibnal_connreq_done(conn, -ECONNABORTED); - kibnal_put_conn(conn); /* drop CM's ref */ + kibnal_connreq_done(conn, 1, -ECONNABORTED); + kibnal_conn_decref(conn); /* drop CM's ref */ return TS_IB_CM_CALLBACK_ABORT; default: - CERROR("Connection %p -> "LPX64" ERROR %d\n", - conn, conn->ibc_peer->ibp_nid, event); - kibnal_connreq_done(conn, -ECONNABORTED); - kibnal_put_conn(conn); /* drop CM's ref */ + CDEBUG(D_NETERROR, "Connection %p -> %s ERROR %d\n", + conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), event); + kibnal_connreq_done(conn, 1, -ECONNABORTED); + kibnal_conn_decref(conn); /* drop CM's ref */ return TS_IB_CM_CALLBACK_ABORT; } } @@ -2075,10 +2122,10 @@ kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, kib_msg_t *msg = &conn->ibc_connreq->cr_msg; if (status != 0) { - CERROR ("Pathreq %p -> "LPX64" failed: %d\n", - conn, conn->ibc_peer->ibp_nid, status); - kibnal_connreq_done(conn, status); - kibnal_put_conn(conn); /* drop callback's ref */ + CDEBUG (D_NETERROR, "Pathreq %p -> %s failed: %d\n", + conn, libcfs_nid2str(peer->ibp_nid), status); + kibnal_connreq_done(conn, 1, status); + kibnal_conn_decref(conn); /* drop callback's ref */ return 1; /* non-zero prevents further callbacks */ } @@ -2086,7 +2133,8 @@ kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, kibnal_init_msg(msg, IBNAL_MSG_CONNREQ, sizeof(msg->ibm_u.connparams)); msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE; - kibnal_pack_msg(msg, 0, peer->ibp_nid, conn->ibc_incarnation); + kibnal_pack_msg(msg, conn->ibc_version, 0, + peer->ibp_nid, conn->ibc_incarnation); conn->ibc_connreq->cr_connparam = (struct ib_cm_active_param) { .qp = conn->ibc_qp, @@ -2096,7 +2144,7 @@ kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, .initiator_depth = 
IBNAL_RESPONDER_RESOURCES, .retry_count = IBNAL_RETRY, .rnr_retry_count = IBNAL_RNR_RETRY, - .cm_response_timeout = kibnal_tunables.kib_io_timeout, + .cm_response_timeout = *kibnal_tunables.kib_timeout, .max_cm_retries = IBNAL_CM_RETRY, .flow_control = IBNAL_FLOW_CONTROL, }; @@ -2107,8 +2155,9 @@ kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, /* Flag I'm getting involved with the CM... */ conn->ibc_state = IBNAL_CONN_CONNECTING; - CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n", - conn->ibc_connreq->cr_svcrsp.ibsr_svc_id, peer->ibp_nid); + CDEBUG(D_NET, "Connecting to, service id "LPX64", on %s\n", + conn->ibc_connreq->cr_svcrsp.ibsr_svc_id, + libcfs_nid2str(peer->ibp_nid)); /* kibnal_connect_callback gets my conn ref */ status = ib_cm_connect (&conn->ibc_connreq->cr_connparam, @@ -2117,12 +2166,12 @@ kibnal_pathreq_callback (tTS_IB_CLIENT_QUERY_TID tid, int status, kibnal_active_conn_callback, conn, &conn->ibc_comm_id); if (status != 0) { - CERROR ("Connect %p -> "LPX64" failed: %d\n", - conn, conn->ibc_peer->ibp_nid, status); + CERROR ("Connect %p -> %s failed: %d\n", + conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), status); /* Back out state change: I've not got a CM comm_id yet... */ conn->ibc_state = IBNAL_CONN_INIT_QP; - kibnal_connreq_done(conn, status); - kibnal_put_conn(conn); /* Drop callback's ref */ + kibnal_connreq_done(conn, 1, status); + kibnal_conn_decref(conn); /* Drop callback's ref */ } return 1; /* non-zero to prevent further callbacks */ @@ -2137,18 +2186,18 @@ kibnal_connect_peer (kib_peer_t *peer) conn = kibnal_create_conn(); if (conn == NULL) { CERROR ("Can't allocate conn\n"); - kibnal_peer_connect_failed (peer, -ENOMEM); + kibnal_peer_connect_failed (peer, 1, -ENOMEM); return; } conn->ibc_peer = peer; - atomic_inc (&peer->ibp_refcount); + kibnal_peer_addref(peer); - PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); + LIBCFS_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); if (conn->ibc_connreq == NULL) { CERROR ("Can't allocate connreq\n"); - kibnal_connreq_done(conn, -ENOMEM); - kibnal_put_conn(conn); /* drop my ref */ + kibnal_connreq_done(conn, 1, -ENOMEM); + kibnal_conn_decref(conn); /* drop my ref */ return; } @@ -2156,8 +2205,8 @@ kibnal_connect_peer (kib_peer_t *peer) rc = kibnal_make_svcqry(conn); if (rc != 0) { - kibnal_connreq_done (conn, rc); - kibnal_put_conn(conn); /* drop my ref */ + kibnal_connreq_done (conn, 1, rc); + kibnal_conn_decref(conn); /* drop my ref */ return; } @@ -2173,58 +2222,60 @@ kibnal_connect_peer (kib_peer_t *peer) conn->ibc_connreq->cr_svcrsp.ibsr_svc_gid, conn->ibc_connreq->cr_svcrsp.ibsr_svc_pkey, 0, - kibnal_tunables.kib_io_timeout * HZ, + *kibnal_tunables.kib_timeout * HZ, 0, kibnal_pathreq_callback, conn, &conn->ibc_connreq->cr_tid); if (rc == 0) return; /* callback now has my ref on conn */ - CERROR ("Path record request %p -> "LPX64" failed: %d\n", - conn, conn->ibc_peer->ibp_nid, rc); - kibnal_connreq_done(conn, rc); - kibnal_put_conn(conn); /* drop my ref */ + CERROR ("Path record request %p -> %s failed: %d\n", + conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); + kibnal_connreq_done(conn, 1, rc); + kibnal_conn_decref(conn); /* drop my ref */ } int -kibnal_conn_timed_out (kib_conn_t *conn) +kibnal_check_txs (kib_conn_t *conn, struct list_head *txs) { kib_tx_t *tx; struct list_head *ttmp; unsigned long flags; + int timed_out = 0; spin_lock_irqsave (&conn->ibc_lock, flags); - list_for_each (ttmp, &conn->ibc_tx_queue) { + list_for_each (ttmp, txs) { tx = list_entry 
(ttmp, kib_tx_t, tx_list); - LASSERT (!tx->tx_passive_rdma_wait); - LASSERT (tx->tx_sending == 0); + if (txs == &conn->ibc_active_txs) { + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); - if (time_after_eq (jiffies, tx->tx_deadline)) { - spin_unlock_irqrestore (&conn->ibc_lock, flags); - return 1; + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + } else { + LASSERT (!tx->tx_passive_rdma_wait); + LASSERT (tx->tx_sending == 0); } - } - - list_for_each (ttmp, &conn->ibc_active_txs) { - tx = list_entry (ttmp, kib_tx_t, tx_list); - - LASSERT (tx->tx_passive_rdma || - !tx->tx_passive_rdma_wait); - - LASSERT (tx->tx_passive_rdma_wait || - tx->tx_sending != 0); - + if (time_after_eq (jiffies, tx->tx_deadline)) { - spin_unlock_irqrestore (&conn->ibc_lock, flags); - return 1; + timed_out = 1; + break; } } spin_unlock_irqrestore (&conn->ibc_lock, flags); + return timed_out; +} - return 0; +int +kibnal_conn_timed_out (kib_conn_t *conn) +{ + return kibnal_check_txs(conn, &conn->ibc_tx_queue) || + kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) || + kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) || + kibnal_check_txs(conn, &conn->ibc_active_txs); } void @@ -2260,19 +2311,16 @@ kibnal_check_conns (int idx) if (!kibnal_conn_timed_out(conn)) continue; - CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", - conn, conn->ibc_state, peer->ibp_nid, - atomic_read (&conn->ibc_refcount)); + kibnal_conn_addref(conn); - atomic_inc (&conn->ibc_refcount); read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - CERROR("Timed out RDMA with "LPX64"\n", - peer->ibp_nid); + CERROR("Timed out RDMA with %s\n", + libcfs_nid2str(peer->ibp_nid)); kibnal_close_conn (conn, -ETIMEDOUT); - kibnal_put_conn (conn); + kibnal_conn_decref(conn); /* start again now I've dropped the lock */ goto again; @@ -2293,8 +2341,10 @@ kibnal_terminate_conn (kib_conn_t *conn) rc = ib_cm_disconnect (conn->ibc_comm_id); if (rc != 0) - CERROR ("Error %d disconnecting conn %p -> "LPX64"\n", - rc, conn, conn->ibc_peer->ibp_nid); + CERROR ("Error %d disconnecting conn %p -> %s\n", + rc, conn, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + + kibnal_peer_notify(conn->ibc_peer); } int @@ -2308,8 +2358,8 @@ kibnal_reaper (void *arg) int peer_index = 0; unsigned long deadline = jiffies; - kportal_daemonize ("kibnal_reaper"); - kportal_blockallsigs (); + cfs_daemonize ("kibnal_reaper"); + cfs_block_allsigs (); init_waitqueue_entry (&wait, current); @@ -2330,9 +2380,10 @@ kibnal_reaper (void *arg) * callback and last ref reschedules it * here... */ kibnal_terminate_conn(conn); - kibnal_put_conn (conn); + kibnal_conn_decref(conn); break; - + + case IBNAL_CONN_INIT_QP: case IBNAL_CONN_ZOMBIE: kibnal_destroy_conn (conn); break; @@ -2363,9 +2414,9 @@ kibnal_reaper (void *arg) * connection within (n+1)/n times the timeout * interval. 
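 * For instance, assuming a peer table of 256 hash buckets with p == 1
 * and n * p == 4: at the default 50 second timeout, chunk becomes
 * (256 * 4) / 50 == 20 buckets per pass, so the whole table is swept
 * within 13 passes.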
*/ - if (kibnal_tunables.kib_io_timeout > n * p) + if (*kibnal_tunables.kib_timeout > n * p) chunk = (chunk * n * p) / - kibnal_tunables.kib_io_timeout; + *kibnal_tunables.kib_timeout; if (chunk == 0) chunk = 1; @@ -2409,8 +2460,8 @@ kibnal_connd (void *arg) int did_something; snprintf(name, sizeof(name), "kibnal_connd_%02ld", id); - kportal_daemonize(name); - kportal_blockallsigs(); + cfs_daemonize(name); + cfs_block_allsigs(); init_waitqueue_entry (&wait, current); @@ -2427,32 +2478,37 @@ kibnal_connd (void *arg) spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); kibnal_handle_svcqry(as->ibas_sock); - sock_release(as->ibas_sock); - PORTAL_FREE(as, sizeof(*as)); + kibnal_free_acceptsock(as); spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); did_something = 1; } - if (!list_empty (&kibnal_data.kib_connd_peers)) { + /* Only handle an outgoing connection request if there is someone left + * to handle an incoming svcqry */ + if (!list_empty (&kibnal_data.kib_connd_peers) && + ((kibnal_data.kib_connd_connecting + 1) < + *kibnal_tunables.kib_n_connd)) { peer = list_entry (kibnal_data.kib_connd_peers.next, kib_peer_t, ibp_connd_list); list_del_init (&peer->ibp_connd_list); + kibnal_data.kib_connd_connecting++; spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); kibnal_connect_peer (peer); - kibnal_put_peer (peer); + kibnal_peer_decref(peer); spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); did_something = 1; + kibnal_data.kib_connd_connecting--; } if (did_something) continue; set_current_state (TASK_INTERRUPTIBLE); - add_wait_queue (&kibnal_data.kib_connd_waitq, &wait); + add_wait_queue_exclusive(&kibnal_data.kib_connd_waitq, &wait); spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); @@ -2483,8 +2539,8 @@ kibnal_scheduler(void *arg) int did_something; snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); - kportal_daemonize(name); - kportal_blockallsigs(); + cfs_daemonize(name); + cfs_block_allsigs(); spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); @@ -2524,7 +2580,7 @@ kibnal_scheduler(void *arg) counter = 0; if (!did_something) { - rc = wait_event_interruptible( + rc = wait_event_interruptible_exclusive( kibnal_data.kib_sched_waitq, !list_empty(&kibnal_data.kib_sched_txq) || !list_empty(&kibnal_data.kib_sched_rxq) || @@ -2543,13 +2599,3 @@ kibnal_scheduler(void *arg) kibnal_thread_fini(); return (0); } - - -lib_nal_t kibnal_lib = { - libnal_data: &kibnal_data, /* NAL private data */ - libnal_send: kibnal_send, - libnal_send_pages: kibnal_send_pages, - libnal_recv: kibnal_recv, - libnal_recv_pages: kibnal_recv_pages, - libnal_dist: kibnal_dist -}; diff --git a/lnet/klnds/openiblnd/openiblnd_modparams.c b/lnet/klnds/openiblnd/openiblnd_modparams.c new file mode 100644 index 0000000..f40004b --- /dev/null +++ b/lnet/klnds/openiblnd/openiblnd_modparams.c @@ -0,0 +1,149 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "openiblnd.h" + +static char *ipif_basename = "ib"; +CFS_MODULE_PARM(ipif_basename, "s", charp, 0444, + "IPoIB interface base name"); + +static int n_connd = 4; +CFS_MODULE_PARM(n_connd, "i", int, 0444, + "# of connection daemons"); + +static int min_reconnect_interval = 1; +CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644, + "minimum connection retry interval (seconds)"); + +static int max_reconnect_interval = 60; +CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644, + "maximum connection retry interval (seconds)"); + +static int concurrent_peers = 1152; +CFS_MODULE_PARM(concurrent_peers, "i", int, 0444, + "maximum number of peers that may connect"); + +static int cksum = 0; +CFS_MODULE_PARM(cksum, "i", int, 0644, + "set non-zero to enable message (not RDMA) checksums"); + +static int timeout = 50; +CFS_MODULE_PARM(timeout, "i", int, 0644, + "timeout (seconds)"); + +static int ntx = 384; +CFS_MODULE_PARM(ntx, "i", int, 0444, + "# of message descriptors"); + +static int credits = 256; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = 16; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, + "# concurrent sends to 1 peer"); + +static int keepalive = 100; +CFS_MODULE_PARM(keepalive, "i", int, 0644, + "Idle time in seconds before sending a keepalive"); + +kib_tunables_t kibnal_tunables = { + .kib_ipif_basename = &ipif_basename, + .kib_n_connd = &n_connd, + .kib_min_reconnect_interval = &min_reconnect_interval, + .kib_max_reconnect_interval = &max_reconnect_interval, + .kib_concurrent_peers = &concurrent_peers, + .kib_cksum = &cksum, + .kib_timeout = &timeout, + .kib_ntx = &ntx, + .kib_credits = &credits, + .kib_peercredits = &peer_credits, + .kib_keepalive = &keepalive, +}; + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + +static ctl_table kibnal_ctl_table[] = { + {1, "ipif_basename", &ipif_basename, + 1024, 0444, NULL, &proc_dostring}, + {2, "n_connd", &n_connd, + sizeof(int), 0444, NULL, &proc_dointvec}, + {3, "min_reconnect_interval", &min_reconnect_interval, + sizeof(int), 0644, NULL, &proc_dointvec}, + {4, "max_reconnect_interval", &max_reconnect_interval, + sizeof(int), 0644, NULL, &proc_dointvec}, + {5, "concurrent_peers", &concurrent_peers, + sizeof(int), 0444, NULL, &proc_dointvec}, + {6, "cksum", &cksum, + sizeof(int), 0644, NULL, &proc_dointvec}, + {7, "timeout", &timeout, + sizeof(int), 0644, NULL, &proc_dointvec}, + {8, "ntx", &ntx, + sizeof(int), 0444, NULL, &proc_dointvec}, + {9, "credits", &credits, + sizeof(int), 0444, NULL, &proc_dointvec}, + {10, "peer_credits", &peer_credits, + sizeof(int), 0444, NULL, &proc_dointvec}, + {11, "keepalive", &keepalive, + sizeof(int), 0644, NULL, &proc_dointvec}, + {0} +}; + +static ctl_table kibnal_top_ctl_table[] = { + {203, "openibnal", NULL, 0, 0555, kibnal_ctl_table}, + {0} +}; + +int +kibnal_tunables_init () +{ + kibnal_tunables.kib_sysctl = + register_sysctl_table(kibnal_top_ctl_table, 0); + + if (kibnal_tunables.kib_sysctl == NULL) + CWARN("Can't setup /proc tunables\n"); + + return 0; +} + +void +kibnal_tunables_fini () +{ + if (kibnal_tunables.kib_sysctl != NULL) + unregister_sysctl_table(kibnal_tunables.kib_sysctl); +} + +#else + +int +kibnal_tunables_init () +{ + return 0; +} + +void +kibnal_tunables_fini () +{ +} + +#endif diff --git a/lnet/klnds/ptllnd/.cvsignore 
b/lnet/klnds/ptllnd/.cvsignore new file mode 100644 index 0000000..0586565 --- /dev/null +++ b/lnet/klnds/ptllnd/.cvsignore @@ -0,0 +1,11 @@ +.deps +Makefile +autoMakefile.in +autoMakefile +*.ko +*.mod.c +.*.flags +.*.cmd +.tmp_versions +.depend +wirecheck diff --git a/lnet/klnds/ptllnd/Makefile.in b/lnet/klnds/ptllnd/Makefile.in new file mode 100755 index 0000000..ec2f9bb --- /dev/null +++ b/lnet/klnds/ptllnd/Makefile.in @@ -0,0 +1,13 @@ +MODULES := kptllnd + +EXTRA_POST_CFLAGS := @PTLLNDCPPFLAGS@ + +kptllnd-objs := ptllnd.o \ + ptllnd_cb.o \ + ptllnd_modparams.o \ + ptllnd_peer.o \ + ptllnd_rx_buf.o \ + ptllnd_tx.o \ + ptllnd_ptltrace.o + +@INCLUDE_RULES@ diff --git a/lnet/klnds/ptllnd/README b/lnet/klnds/ptllnd/README new file mode 100644 index 0000000..5cb6cfc --- /dev/null +++ b/lnet/klnds/ptllnd/README @@ -0,0 +1,47 @@ +1. This version of the Portals LND is intended to work on the Cray XT3 using + Cray Portals as a network transport. + +2. To enable the building of the Portals LND (ptllnd.ko), configure with the + following option: + ./configure --with-portals= + +3. The following configuration options are supported: + + ntx: + The total number of message descriptors + + concurrent_peers: + The maximum number of concurrent peers. Peers attempting + to connect beyond the maximum will not be allowed. + + peer_hash_table_size: + The number of hash table slots for the peers. This number + should scale with concurrent_peers. + + cksum: + Set to non-zero to enable message (not RDMA) checksums for + outgoing packets. Incoming packets will always be checksummed + if necessary, independent of this value. + + timeout: + The amount of time a request can linger in a peer's active + queue before the peer is considered dead. Units: seconds. + + portal: + The portal ID to use for the ptllnd traffic. + + rxb_npages: + The number of pages in an RX buffer. + + credits: + The maximum total number of concurrent sends that are + outstanding at any given instant. + + peercredits: + The maximum number of concurrent sends that are + outstanding to a single peer at any given instant. + + max_msg_size: + The maximum immediate message size. This MUST be + the same on all nodes in a cluster. A peer connecting + with a different max_msg_size will be rejected. diff --git a/lnet/klnds/ptllnd/autoMakefile.am b/lnet/klnds/ptllnd/autoMakefile.am new file mode 100755 index 0000000..bd8cc9c --- /dev/null +++ b/lnet/klnds/ptllnd/autoMakefile.am @@ -0,0 +1,8 @@ +if MODULES +if BUILD_PTLLND +modulenet_DATA = kptllnd$(KMODEXT) +endif +endif + +MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ +DIST_SOURCES = $(kptllnd-objs:%.o=%.c) ptllnd.h diff --git a/lnet/klnds/ptllnd/ptllnd.c b/lnet/klnds/ptllnd/ptllnd.c new file mode 100755 index 0000000..a82babe --- /dev/null +++ b/lnet/klnds/ptllnd/ptllnd.c @@ -0,0 +1,836 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved. + * Author: PJ Kirner + * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. + * + * This file is confidential source code owned by Cluster File Systems. + * No viewing, modification, compilation, redistribution, or any other + * form of use is permitted except through a signed license agreement. + * + * If you have not signed such an agreement, then you have no rights to + * this file. Please destroy it immediately and contact CFS.
+ * + */ + +#include "ptllnd.h" + +lnd_t kptllnd_lnd = { + .lnd_type = PTLLND, + .lnd_startup = kptllnd_startup, + .lnd_shutdown = kptllnd_shutdown, + .lnd_ctl = kptllnd_ctl, + .lnd_send = kptllnd_send, + .lnd_recv = kptllnd_recv, + .lnd_eager_recv = kptllnd_eager_recv, +}; + +kptl_data_t kptllnd_data; + +char * +kptllnd_ptlid2str(ptl_process_id_t id) +{ + static char strs[64][32]; + static int idx = 0; + + unsigned long flags; + char *str; + + spin_lock_irqsave(&kptllnd_data.kptl_ptlid2str_lock, flags); + str = strs[idx++]; + if (idx >= sizeof(strs)/sizeof(strs[0])) + idx = 0; + spin_unlock_irqrestore(&kptllnd_data.kptl_ptlid2str_lock, flags); + + snprintf(str, sizeof(strs[0]), FMT_PTLID, id.pid, id.nid); + return str; +} + +void +kptllnd_assert_wire_constants (void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * running on Linux fedora 2.6.11-co-0.6.4 #1 Mon Jun 19 05:36:13 UTC 2006 i686 i686 i386 GNU + * with gcc version 4.1.1 20060525 (Red Hat 4.1.1-1) */ + + + /* Constants... */ + CLASSERT (PTL_RESERVED_MATCHBITS == 0x100); + CLASSERT (LNET_MSG_MATCHBITS == 0); + CLASSERT (PTLLND_MSG_MAGIC == 0x50746C4E); + CLASSERT (PTLLND_MSG_VERSION == 0x04); + CLASSERT (PTLLND_RDMA_OK == 0x00); + CLASSERT (PTLLND_RDMA_FAIL == 0x01); + CLASSERT (PTLLND_MSG_TYPE_INVALID == 0x00); + CLASSERT (PTLLND_MSG_TYPE_PUT == 0x01); + CLASSERT (PTLLND_MSG_TYPE_GET == 0x02); + CLASSERT (PTLLND_MSG_TYPE_IMMEDIATE == 0x03); + CLASSERT (PTLLND_MSG_TYPE_NOOP == 0x04); + CLASSERT (PTLLND_MSG_TYPE_HELLO == 0x05); + CLASSERT (PTLLND_MSG_TYPE_NAK == 0x06); + + /* Checks for struct kptl_msg_t */ + CLASSERT ((int)sizeof(kptl_msg_t) == 136); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_magic) == 0); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_magic) == 4); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_version) == 4); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_version) == 2); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_type) == 6); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_type) == 1); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_credits) == 7); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_credits) == 1); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_nob) == 8); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_nob) == 4); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_cksum) == 12); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_cksum) == 4); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_srcnid) == 16); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_srcnid) == 8); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_srcstamp) == 24); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_srcstamp) == 8); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_dstnid) == 32); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_dstnid) == 8); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_dststamp) == 40); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_dststamp) == 8); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_srcpid) == 48); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_srcpid) == 4); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_dstpid) == 52); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_dstpid) == 4); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_u.immediate) == 56); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_u.immediate) == 72); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_u.rdma) == 56); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_u.rdma) == 80); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_u.hello) == 56); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_u.hello) == 12); + + /* Checks for struct kptl_immediate_msg_t */ + CLASSERT 
((int)sizeof(kptl_immediate_msg_t) == 72); + CLASSERT ((int)offsetof(kptl_immediate_msg_t, kptlim_hdr) == 0); + CLASSERT ((int)sizeof(((kptl_immediate_msg_t *)0)->kptlim_hdr) == 72); + CLASSERT ((int)offsetof(kptl_immediate_msg_t, kptlim_payload[13]) == 85); + CLASSERT ((int)sizeof(((kptl_immediate_msg_t *)0)->kptlim_payload[13]) == 1); + + /* Checks for struct kptl_rdma_msg_t */ + CLASSERT ((int)sizeof(kptl_rdma_msg_t) == 80); + CLASSERT ((int)offsetof(kptl_rdma_msg_t, kptlrm_hdr) == 0); + CLASSERT ((int)sizeof(((kptl_rdma_msg_t *)0)->kptlrm_hdr) == 72); + CLASSERT ((int)offsetof(kptl_rdma_msg_t, kptlrm_matchbits) == 72); + CLASSERT ((int)sizeof(((kptl_rdma_msg_t *)0)->kptlrm_matchbits) == 8); + + /* Checks for struct kptl_hello_msg_t */ + CLASSERT ((int)sizeof(kptl_hello_msg_t) == 12); + CLASSERT ((int)offsetof(kptl_hello_msg_t, kptlhm_matchbits) == 0); + CLASSERT ((int)sizeof(((kptl_hello_msg_t *)0)->kptlhm_matchbits) == 8); + CLASSERT ((int)offsetof(kptl_hello_msg_t, kptlhm_max_msg_size) == 8); + CLASSERT ((int)sizeof(((kptl_hello_msg_t *)0)->kptlhm_max_msg_size) == 4); +} + +const char *kptllnd_evtype2str(int type) +{ +#define DO_TYPE(x) case x: return #x; + switch(type) + { + DO_TYPE(PTL_EVENT_GET_START); + DO_TYPE(PTL_EVENT_GET_END); + DO_TYPE(PTL_EVENT_PUT_START); + DO_TYPE(PTL_EVENT_PUT_END); + DO_TYPE(PTL_EVENT_REPLY_START); + DO_TYPE(PTL_EVENT_REPLY_END); + DO_TYPE(PTL_EVENT_ACK); + DO_TYPE(PTL_EVENT_SEND_START); + DO_TYPE(PTL_EVENT_SEND_END); + DO_TYPE(PTL_EVENT_UNLINK); + default: + return ""; + } +#undef DO_TYPE +} + +const char *kptllnd_msgtype2str(int type) +{ +#define DO_TYPE(x) case x: return #x; + switch(type) + { + DO_TYPE(PTLLND_MSG_TYPE_INVALID); + DO_TYPE(PTLLND_MSG_TYPE_PUT); + DO_TYPE(PTLLND_MSG_TYPE_GET); + DO_TYPE(PTLLND_MSG_TYPE_IMMEDIATE); + DO_TYPE(PTLLND_MSG_TYPE_HELLO); + DO_TYPE(PTLLND_MSG_TYPE_NOOP); + DO_TYPE(PTLLND_MSG_TYPE_NAK); + default: + return ""; + } +#undef DO_TYPE +} + +__u32 +kptllnd_cksum (void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; + + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + /* ensure I don't return 0 (== no checksum) */ + return (sum == 0) ? 
1 : sum; +} + +void +kptllnd_init_msg(kptl_msg_t *msg, int type, int body_nob) +{ + msg->ptlm_type = type; + msg->ptlm_nob = (offsetof(kptl_msg_t, ptlm_u) + body_nob + 7) & ~7; + + LASSERT(msg->ptlm_nob <= *kptllnd_tunables.kptl_max_msg_size); +} + +void +kptllnd_msg_pack(kptl_msg_t *msg, kptl_peer_t *peer) +{ + msg->ptlm_magic = PTLLND_MSG_MAGIC; + msg->ptlm_version = PTLLND_MSG_VERSION; + /* msg->ptlm_type Filled in kptllnd_init_msg() */ + msg->ptlm_credits = peer->peer_outstanding_credits; + /* msg->ptlm_nob Filled in kptllnd_init_msg() */ + msg->ptlm_cksum = 0; + msg->ptlm_srcnid = kptllnd_data.kptl_ni->ni_nid; + msg->ptlm_srcstamp = kptllnd_data.kptl_incarnation; + msg->ptlm_dstnid = peer->peer_id.nid; + msg->ptlm_dststamp = peer->peer_incarnation; + msg->ptlm_srcpid = the_lnet.ln_pid; + msg->ptlm_dstpid = peer->peer_id.pid; + + if (*kptllnd_tunables.kptl_checksum) { + /* NB ptlm_cksum zero while computing cksum */ + msg->ptlm_cksum = kptllnd_cksum(msg, + offsetof(kptl_msg_t, ptlm_u)); + } +} + +int +kptllnd_msg_unpack(kptl_msg_t *msg, int nob) +{ + const int hdr_size = offsetof(kptl_msg_t, ptlm_u); + __u32 msg_cksum; + __u16 msg_version; + int flip; + + /* 6 bytes are enough to have received magic + version */ + if (nob < 6) { + CERROR("Very Short message: %d\n", nob); + return -EPROTO; + } + + /* + * Determine if we need to flip + */ + if (msg->ptlm_magic == PTLLND_MSG_MAGIC) { + flip = 0; + } else if (msg->ptlm_magic == __swab32(PTLLND_MSG_MAGIC)) { + flip = 1; + } else { + CERROR("Bad magic: %08x\n", msg->ptlm_magic); + return -EPROTO; + } + + msg_version = flip ? __swab16(msg->ptlm_version) : msg->ptlm_version; + + if (msg_version != PTLLND_MSG_VERSION) { + CERROR("Bad version: got %04x expected %04x\n", + (__u32)msg_version, PTLLND_MSG_VERSION); + return -EPROTO; + } + + if (nob < hdr_size) { + CERROR("Short message: got %d, wanted at least %d\n", + nob, hdr_size); + return -EPROTO; + } + + /* checksum must be computed with + * 1) ptlm_cksum zero and + * 2) BEFORE anything gets modified/flipped + */ + msg_cksum = flip ? 
__swab32(msg->ptlm_cksum) : msg->ptlm_cksum;
+        msg->ptlm_cksum = 0;
+        if (msg_cksum != 0 &&
+            msg_cksum != kptllnd_cksum(msg, hdr_size)) {
+                CERROR("Bad checksum\n");
+                return -EPROTO;
+        }
+
+        msg->ptlm_version = msg_version;
+        msg->ptlm_cksum = msg_cksum;
+
+        if (flip) {
+                /* These two are 1 byte long, so we don't swap them;
+                 * but check this assumption */
+                CLASSERT (sizeof(msg->ptlm_type) == 1);
+                CLASSERT (sizeof(msg->ptlm_credits) == 1);
+                /* src & dst stamps are opaque cookies */
+                __swab32s(&msg->ptlm_nob);
+                __swab64s(&msg->ptlm_srcnid);
+                __swab64s(&msg->ptlm_dstnid);
+                __swab32s(&msg->ptlm_srcpid);
+                __swab32s(&msg->ptlm_dstpid);
+        }
+
+        if (msg->ptlm_nob != nob) {
+                CERROR("msg_nob corrupt: got 0x%08x, wanted 0x%08x\n",
+                       msg->ptlm_nob, nob);
+                return -EPROTO;
+        }
+
+        switch(msg->ptlm_type)
+        {
+        case PTLLND_MSG_TYPE_PUT:
+        case PTLLND_MSG_TYPE_GET:
+                if (nob < hdr_size + sizeof(kptl_rdma_msg_t)) {
+                        CERROR("Short rdma request: got %d, want %d\n",
+                               nob, hdr_size + (int)sizeof(kptl_rdma_msg_t));
+                        return -EPROTO;
+                }
+
+                if (flip)
+                        __swab64s(&msg->ptlm_u.rdma.kptlrm_matchbits);
+
+                if (msg->ptlm_u.rdma.kptlrm_matchbits < PTL_RESERVED_MATCHBITS) {
+                        CERROR("Bad matchbits "LPX64"\n",
+                               msg->ptlm_u.rdma.kptlrm_matchbits);
+                        return -EPROTO;
+                }
+                break;
+
+        case PTLLND_MSG_TYPE_IMMEDIATE:
+                if (nob < offsetof(kptl_msg_t,
+                                   ptlm_u.immediate.kptlim_payload)) {
+                        CERROR("Short immediate: got %d, want %d\n", nob,
+                               (int)offsetof(kptl_msg_t,
+                                             ptlm_u.immediate.kptlim_payload));
+                        return -EPROTO;
+                }
+                /* Do nothing */
+                break;
+
+        case PTLLND_MSG_TYPE_NOOP:
+        case PTLLND_MSG_TYPE_NAK:
+                /* Do nothing */
+                break;
+
+        case PTLLND_MSG_TYPE_HELLO:
+                if (nob < hdr_size + sizeof(kptl_hello_msg_t)) {
+                        CERROR("Short hello: got %d want %d\n",
+                               nob, hdr_size + (int)sizeof(kptl_hello_msg_t));
+                        return -EPROTO;
+                }
+                if (flip) {
+                        __swab64s(&msg->ptlm_u.hello.kptlhm_matchbits);
+                        __swab32s(&msg->ptlm_u.hello.kptlhm_max_msg_size);
+                }
+                break;
+
+        default:
+                CERROR("Bad message type: 0x%02x\n", (__u32)msg->ptlm_type);
+                return -EPROTO;
+        }
+
+        return 0;
+}
+
+int
+kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
+{
+        struct libcfs_ioctl_data *data = arg;
+        int rc = -EINVAL;
+
+        CDEBUG(D_NET, ">>> kptllnd_ctl cmd=%u arg=%p\n", cmd, arg);
+
+        /*
+         * Validate that the context block is actually
+         * pointing to this interface
+         */
+        LASSERT (ni == kptllnd_data.kptl_ni);
+
+        switch(cmd) {
+        case IOC_LIBCFS_DEL_PEER: {
+                lnet_process_id_t id;
+
+                id.nid = data->ioc_nid;
+                id.pid = data->ioc_u32[1];
+
+                rc = kptllnd_peer_del(id);
+                break;
+        }
+
+        case IOC_LIBCFS_GET_PEER: {
+                lnet_process_id_t   id = {.nid = LNET_NID_ANY,
+                                          .pid = LNET_PID_ANY};
+                __u64               incarnation = 0;
+                __u64               next_matchbits = 0;
+                __u64               last_matchbits_seen = 0;
+                int                 state = 0;
+                int                 sent_hello = 0;
+                int                 refcount = 0;
+                int                 nsendq = 0;
+                int                 nactiveq = 0;
+                int                 credits = 0;
+                int                 outstanding_credits = 0;
+
+                rc = kptllnd_get_peer_info(data->ioc_count, &id,
+                                           &state, &sent_hello,
+                                           &refcount, &incarnation,
+                                           &next_matchbits, &last_matchbits_seen,
+                                           &nsendq, &nactiveq,
+                                           &credits, &outstanding_credits);
+                /* wince...
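+                 * the libcfs ioctl block has no spare 64-bit fields, so
+                 * each __u64 below is split across a pair of ioc_u32
+                 * slots (low word first, then high word)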
*/
+                data->ioc_nid     = id.nid;
+                data->ioc_net     = state;
+                data->ioc_flags   = sent_hello;
+                data->ioc_count   = refcount;
+                data->ioc_u64[0]  = incarnation;
+                data->ioc_u32[0]  = (__u32)next_matchbits;
+                data->ioc_u32[1]  = (__u32)(next_matchbits >> 32);
+                data->ioc_u32[2]  = (__u32)last_matchbits_seen;
+                data->ioc_u32[3]  = (__u32)(last_matchbits_seen >> 32);
+                data->ioc_u32[4]  = id.pid;
+                data->ioc_u32[5]  = (nsendq << 16) | nactiveq;
+                data->ioc_u32[6]  = (credits << 16) | outstanding_credits;
+                break;
+        }
+
+        default:
+                rc = -EINVAL;
+                break;
+        }
+        CDEBUG(D_NET, "<<< kptllnd_ctl rc=%d\n", rc);
+        return rc;
+}
+
+int
+kptllnd_startup (lnet_ni_t *ni)
+{
+        int             rc;
+        int             i;
+        int             spares;
+        struct timeval  tv;
+        ptl_err_t       ptl_rc;
+
+        LASSERT (ni->ni_lnd == &kptllnd_lnd);
+
+        if (kptllnd_data.kptl_init != PTLLND_INIT_NOTHING) {
+                CERROR("Only 1 instance supported\n");
+                return -EPERM;
+        }
+
+        if (*kptllnd_tunables.kptl_max_procs_per_node < 1) {
+                CERROR("max_procs_per_node must be >= 1\n");
+                return -EINVAL;
+        }
+
+        *kptllnd_tunables.kptl_max_msg_size &= ~7;
+        if (*kptllnd_tunables.kptl_max_msg_size < sizeof(kptl_msg_t))
+                *kptllnd_tunables.kptl_max_msg_size =
+                        (sizeof(kptl_msg_t) + 7) & ~7;
+        /*
+         * zero pointers, flags etc
+         * put everything into a known state.
+         */
+        memset (&kptllnd_data, 0, sizeof (kptllnd_data));
+        kptllnd_data.kptl_eqh = PTL_INVALID_HANDLE;
+        kptllnd_data.kptl_nih = PTL_INVALID_HANDLE;
+
+        /*
+         * Uptick the module reference count
+         */
+        PORTAL_MODULE_USE;
+
+        /*
+         * Setup pointers between the ni and context data block
+         */
+        kptllnd_data.kptl_ni = ni;
+        ni->ni_data = &kptllnd_data;
+
+        /*
+         * Setup Credits
+         */
+        ni->ni_maxtxcredits  = *kptllnd_tunables.kptl_credits;
+        ni->ni_peertxcredits = *kptllnd_tunables.kptl_peercredits;
+
+        kptllnd_data.kptl_expected_peers =
+                *kptllnd_tunables.kptl_max_nodes *
+                *kptllnd_tunables.kptl_max_procs_per_node;
+
+        /*
+         * Initialize the Network interface instance
+         * We use the default because we don't have any
+         * way to choose a better interface.
+         * Requested and actual limits are ignored.
+         */
+        ptl_rc = PtlNIInit(
+#ifdef _USING_LUSTRE_PORTALS_
+                PTL_IFACE_DEFAULT,
+#else
+                CRAY_KERN_NAL,
+#endif
+                *kptllnd_tunables.kptl_pid, NULL, NULL,
+                &kptllnd_data.kptl_nih);
+
+        /*
+         * Note: PTL_IFACE_DUP simply means that the requested
+         * interface was already inited and that we're sharing it.
+         * Which is ok.
+ */ + if (ptl_rc != PTL_OK && ptl_rc != PTL_IFACE_DUP) { + CERROR ("PtlNIInit: error %d\n", ptl_rc); + rc = -EINVAL; + goto failed; + } + + /* NB eq size irrelevant if using a callback */ + ptl_rc = PtlEQAlloc(kptllnd_data.kptl_nih, + 8, /* size */ + kptllnd_eq_callback, /* handler callback */ + &kptllnd_data.kptl_eqh); /* output handle */ + if (ptl_rc != PTL_OK) { + CERROR("PtlEQAlloc failed %d\n", ptl_rc); + rc = -ENOMEM; + goto failed; + } + + /* + * Fetch the lower NID + */ + ptl_rc = PtlGetId(kptllnd_data.kptl_nih, + &kptllnd_data.kptl_portals_id); + if (ptl_rc != PTL_OK) { + CERROR ("PtlGetID: error %d\n", ptl_rc); + rc = -EINVAL; + goto failed; + } + + if (kptllnd_data.kptl_portals_id.pid != *kptllnd_tunables.kptl_pid) { + /* The kernel ptllnd must have the expected PID */ + CERROR("Unexpected PID: %u (%u expected)\n", + kptllnd_data.kptl_portals_id.pid, + *kptllnd_tunables.kptl_pid); + rc = -EINVAL; + goto failed; + } + + ni->ni_nid = kptllnd_ptl2lnetnid(kptllnd_data.kptl_portals_id.nid); + + CDEBUG(D_NET, "ptl id=%s, lnet id=%s\n", + kptllnd_ptlid2str(kptllnd_data.kptl_portals_id), + libcfs_nid2str(ni->ni_nid)); + + /* + * Initialized the incarnation + */ + do_gettimeofday(&tv); + kptllnd_data.kptl_incarnation = (((__u64)tv.tv_sec) * 1000000) + + tv.tv_usec; + CDEBUG(D_NET, "Incarnation="LPX64"\n", kptllnd_data.kptl_incarnation); + + /* + * Setup the sched locks/lists/waitq + */ + spin_lock_init(&kptllnd_data.kptl_sched_lock); + init_waitqueue_head(&kptllnd_data.kptl_sched_waitq); + INIT_LIST_HEAD(&kptllnd_data.kptl_sched_txq); + INIT_LIST_HEAD(&kptllnd_data.kptl_sched_rxq); + INIT_LIST_HEAD(&kptllnd_data.kptl_sched_rxbq); + + /* + * Setup the tx locks/lists + */ + spin_lock_init(&kptllnd_data.kptl_tx_lock); + INIT_LIST_HEAD(&kptllnd_data.kptl_idle_txs); + atomic_set(&kptllnd_data.kptl_ntx, 0); + + /* + * Allocate and setup the peer hash table + */ + rwlock_init(&kptllnd_data.kptl_peer_rw_lock); + init_waitqueue_head(&kptllnd_data.kptl_watchdog_waitq); + INIT_LIST_HEAD(&kptllnd_data.kptl_closing_peers); + INIT_LIST_HEAD(&kptllnd_data.kptl_zombie_peers); + + spin_lock_init(&kptllnd_data.kptl_ptlid2str_lock); + + kptllnd_data.kptl_peer_hash_size = + *kptllnd_tunables.kptl_peer_hash_table_size; + LIBCFS_ALLOC(kptllnd_data.kptl_peers, + (kptllnd_data.kptl_peer_hash_size * + sizeof(struct list_head))); + if (kptllnd_data.kptl_peers == NULL) { + CERROR("Failed to allocate space for peer hash table size=%d\n", + kptllnd_data.kptl_peer_hash_size); + rc = -ENOMEM; + goto failed; + } + for (i = 0; i < kptllnd_data.kptl_peer_hash_size; i++) + INIT_LIST_HEAD(&kptllnd_data.kptl_peers[i]); + + LIBCFS_ALLOC(kptllnd_data.kptl_nak_msg, offsetof(kptl_msg_t, ptlm_u)); + if (kptllnd_data.kptl_nak_msg == NULL) { + CERROR("Can't allocate NAK msg\n"); + rc = -ENOMEM; + goto failed; + } + memset(kptllnd_data.kptl_nak_msg, 0, offsetof(kptl_msg_t, ptlm_u)); + kptllnd_init_msg(kptllnd_data.kptl_nak_msg, PTLLND_MSG_TYPE_NAK, 0); + kptllnd_data.kptl_nak_msg->ptlm_magic = PTLLND_MSG_MAGIC; + kptllnd_data.kptl_nak_msg->ptlm_version = PTLLND_MSG_VERSION; + kptllnd_data.kptl_nak_msg->ptlm_srcpid = the_lnet.ln_pid; + kptllnd_data.kptl_nak_msg->ptlm_srcnid = ni->ni_nid; + kptllnd_data.kptl_nak_msg->ptlm_srcstamp = kptllnd_data.kptl_incarnation; + kptllnd_data.kptl_nak_msg->ptlm_dstpid = LNET_PID_ANY; + kptllnd_data.kptl_nak_msg->ptlm_dstnid = LNET_NID_ANY; + + kptllnd_rx_buffer_pool_init(&kptllnd_data.kptl_rx_buffer_pool); + + kptllnd_data.kptl_rx_cache = + cfs_mem_cache_create("ptllnd_rx", + sizeof(kptl_rx_t) + + 
*kptllnd_tunables.kptl_max_msg_size,
+                                     0,  /* offset */
+                                     0); /* flags */
+        if (kptllnd_data.kptl_rx_cache == NULL) {
+                CERROR("Can't create slab for RX descriptors\n");
+                rc = -ENOMEM;
+                goto failed;
+        }
+
+        /* lists/ptrs/locks initialised */
+        kptllnd_data.kptl_init = PTLLND_INIT_DATA;
+
+        /*****************************************************/
+
+        rc = kptllnd_setup_tx_descs();
+        if (rc != 0) {
+                CERROR("Can't pre-allocate %d TX descriptors: %d\n",
+                       *kptllnd_tunables.kptl_ntx, rc);
+                goto failed;
+        }
+
+        /* Start the scheduler threads for handling incoming requests.  No
+         * need to advance the init state; everything from here on is cleaned
+         * up automatically now that the PTLLND_INIT_DATA state has been
+         * entered */
+        CDEBUG(D_NET, "starting %d scheduler threads\n", PTLLND_N_SCHED);
+        for (i = 0; i < PTLLND_N_SCHED; i++) {
+                rc = kptllnd_thread_start(kptllnd_scheduler, (void *)((long)i));
+                if (rc != 0) {
+                        CERROR("Can't spawn scheduler[%d]: %d\n", i, rc);
+                        goto failed;
+                }
+        }
+
+        rc = kptllnd_thread_start(kptllnd_watchdog, NULL);
+        if (rc != 0) {
+                CERROR("Can't spawn watchdog: %d\n", rc);
+                goto failed;
+        }
+
+        /* Ensure that 'rxb_nspare' buffers can be off the net (being emptied)
+         * and we will still have enough buffers posted for all our peers */
+        spares = *kptllnd_tunables.kptl_rxb_nspare *
+                 ((*kptllnd_tunables.kptl_rxb_npages * PAGE_SIZE)/
+                  *kptllnd_tunables.kptl_max_msg_size);
+
+        /* reserve and post the buffers */
+        rc = kptllnd_rx_buffer_pool_reserve(&kptllnd_data.kptl_rx_buffer_pool,
+                                            kptllnd_data.kptl_expected_peers +
+                                            spares);
+        if (rc != 0) {
+                CERROR("Can't reserve RX Buffer pool: %d\n", rc);
+                goto failed;
+        }
+
+        /* flag everything initialised */
+        kptllnd_data.kptl_init = PTLLND_INIT_ALL;
+
+        /*****************************************************/
+
+        if (*kptllnd_tunables.kptl_checksum)
+                CWARN("Checksumming enabled\n");
+
+        CDEBUG(D_NET, "<<< kptllnd_startup SUCCESS\n");
+        return 0;
+
+ failed:
+        CDEBUG(D_NET, "kptllnd_startup failed rc=%d\n", rc);
+        kptllnd_shutdown(ni);
+        return rc;
+}
+
+void
+kptllnd_shutdown (lnet_ni_t *ni)
+{
+        int               i;
+        ptl_err_t         prc;
+        lnet_process_id_t process_id;
+        unsigned long     flags;
+
+        CDEBUG(D_MALLOC, "before LND cleanup: kmem %d\n",
+               atomic_read (&libcfs_kmemory));
+
+        LASSERT (ni == kptllnd_data.kptl_ni);
+
+        switch (kptllnd_data.kptl_init) {
+        default:
+                LBUG();
+
+        case PTLLND_INIT_ALL:
+        case PTLLND_INIT_DATA:
+                /* Stop receiving */
+                kptllnd_rx_buffer_pool_fini(&kptllnd_data.kptl_rx_buffer_pool);
+                LASSERT (list_empty(&kptllnd_data.kptl_sched_rxq));
+                LASSERT (list_empty(&kptllnd_data.kptl_sched_rxbq));
+
+                /* Hold peertable lock to interleave cleanly with peer birth/death */
+                write_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+                LASSERT (kptllnd_data.kptl_shutdown == 0);
+                kptllnd_data.kptl_shutdown = 1; /* phase 1 == destroy peers */
+
+                /* no new peers possible now */
+                write_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock,
+                                        flags);
+
+                /* nuke all existing peers */
+                process_id.nid = LNET_NID_ANY;
+                process_id.pid = LNET_PID_ANY;
+                kptllnd_peer_del(process_id);
+
+                read_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+                LASSERT (kptllnd_data.kptl_n_active_peers == 0);
+
+                i = 2;
+                while (kptllnd_data.kptl_npeers != 0) {
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ?
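+                               /* (i & -i) == i only when i is a power of 2,
+                                * so the console warning rate backs off
+                                * exponentially while we keep waiting */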
D_WARNING : D_NET,
+                               "Waiting for %d peers to terminate\n",
+                               kptllnd_data.kptl_npeers);
+
+                        read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock,
+                                               flags);
+
+                        cfs_pause(cfs_time_seconds(1));
+
+                        read_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock,
+                                          flags);
+                }
+
+                LASSERT(list_empty(&kptllnd_data.kptl_closing_peers));
+                LASSERT(list_empty(&kptllnd_data.kptl_zombie_peers));
+                LASSERT (kptllnd_data.kptl_peers != NULL);
+                for (i = 0; i < kptllnd_data.kptl_peer_hash_size; i++)
+                        LASSERT (list_empty (&kptllnd_data.kptl_peers[i]));
+
+                read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags);
+                CDEBUG(D_NET, "All peers deleted\n");
+
+                /* Shutdown phase 2: kill the daemons... */
+                kptllnd_data.kptl_shutdown = 2;
+                mb();
+
+                i = 2;
+                while (atomic_read (&kptllnd_data.kptl_nthreads) != 0) {
+                        /* Wake up all threads */
+                        wake_up_all(&kptllnd_data.kptl_sched_waitq);
+                        wake_up_all(&kptllnd_data.kptl_watchdog_waitq);
+
+                        i++;
+                        CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
+                               "Waiting for %d threads to terminate\n",
+                               atomic_read(&kptllnd_data.kptl_nthreads));
+                        cfs_pause(cfs_time_seconds(1));
+                }
+
+                CDEBUG(D_NET, "All threads stopped\n");
+                LASSERT(list_empty(&kptllnd_data.kptl_sched_txq));
+
+                kptllnd_cleanup_tx_descs();
+
+                /* Nothing here now, but libcfs might soon require us to
+                 * explicitly destroy wait queues and semaphores; that would
+                 * be done here */
+
+                /* fall through */
+
+        case PTLLND_INIT_NOTHING:
+                CDEBUG(D_NET, "PTLLND_INIT_NOTHING\n");
+                break;
+        }
+
+        if (!PtlHandleIsEqual(kptllnd_data.kptl_eqh, PTL_INVALID_HANDLE)) {
+                prc = PtlEQFree(kptllnd_data.kptl_eqh);
+                if (prc != PTL_OK)
+                        CERROR("Error %d freeing portals EQ\n", prc);
+        }
+
+        if (!PtlHandleIsEqual(kptllnd_data.kptl_nih, PTL_INVALID_HANDLE)) {
+                prc = PtlNIFini(kptllnd_data.kptl_nih);
+                if (prc != PTL_OK)
+                        CERROR("Error %d finalizing portals NI\n", prc);
+        }
+
+        LASSERT (atomic_read(&kptllnd_data.kptl_ntx) == 0);
+        LASSERT (list_empty(&kptllnd_data.kptl_idle_txs));
+
+        if (kptllnd_data.kptl_rx_cache != NULL)
+                cfs_mem_cache_destroy(kptllnd_data.kptl_rx_cache);
+
+        if (kptllnd_data.kptl_peers != NULL)
+                LIBCFS_FREE (kptllnd_data.kptl_peers,
+                             sizeof (struct list_head) *
+                             kptllnd_data.kptl_peer_hash_size);
+
+        if (kptllnd_data.kptl_nak_msg != NULL)
+                LIBCFS_FREE (kptllnd_data.kptl_nak_msg,
+                             offsetof(kptl_msg_t, ptlm_u));
+
+        memset(&kptllnd_data, 0, sizeof(kptllnd_data));
+
+        CDEBUG(D_MALLOC, "after LND cleanup: kmem %d\n",
+               atomic_read (&libcfs_kmemory));
+
+        PORTAL_MODULE_UNUSE;
+}
+
+int __init
+kptllnd_module_init (void)
+{
+        int rc;
+
+        kptllnd_assert_wire_constants();
+
+        rc = kptllnd_tunables_init();
+        if (rc != 0)
+                return rc;
+
+        kptllnd_init_ptltrace();
+
+        lnet_register_lnd(&kptllnd_lnd);
+
+        return 0;
+}
+
+void __exit
+kptllnd_module_fini (void)
+{
+        lnet_unregister_lnd(&kptllnd_lnd);
+        kptllnd_tunables_fini();
+}
+
+MODULE_AUTHOR("Cluster File Systems, Inc. ");
+MODULE_DESCRIPTION("Kernel Portals LND v1.00");
+MODULE_LICENSE("GPL");
+
+module_init(kptllnd_module_init);
+module_exit(kptllnd_module_fini);
diff --git a/lnet/klnds/ptllnd/ptllnd.h b/lnet/klnds/ptllnd/ptllnd.h
new file mode 100755
index 0000000..7243a6b
--- /dev/null
+++ b/lnet/klnds/ptllnd/ptllnd.h
@@ -0,0 +1,538 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
+ * Author: PJ Kirner
+ *
+ * This file is part of the Lustre file system, http://www.lustre.org
+ * Lustre is a trademark of Cluster File Systems, Inc.
+ *
+ * This file is confidential source code owned by Cluster File Systems.
+ * No viewing, modification, compilation, redistribution, or any other
+ * form of use is permitted except through a signed license agreement.
+ *
+ * If you have not signed such an agreement, then you have no rights to
+ * this file. Please destroy it immediately and contact CFS.
+ *
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+
+#define DEBUG_SUBSYSTEM S_LND
+
+#include
+#include
+#include
+#include
+#ifdef CRAY_XT3
+#include
+#endif
+#include                /* Depends on portals/p30.h */
+
+/*
+ * Define this to enable console debug logging
+ * and simulation
+ */
+//#define PJK_DEBUGGING
+
+#if CONFIG_SMP
+# define PTLLND_N_SCHED num_online_cpus()   /* # schedulers */
+#else
+# define PTLLND_N_SCHED 1                   /* # schedulers */
+#endif
+
+#define PTLLND_CREDIT_HIGHWATER ((*kptllnd_tunables.kptl_peercredits)-1)
+        /* when to eagerly return credits */
+
+typedef struct
+{
+        int             *kptl_ntx;              /* # tx descs to pre-allocate */
+        int             *kptl_max_nodes;        /* max # nodes all talking to me */
+        int             *kptl_max_procs_per_node; /* max # processes per node */
+        int             *kptl_checksum;         /* checksum kptl_msg_t? */
+        int             *kptl_timeout;          /* comms timeout (seconds) */
+        int             *kptl_portal;           /* portal number */
+        int             *kptl_pid;              /* portals PID (self + kernel peers) */
+        int             *kptl_rxb_npages;       /* number of pages for rx buffer */
+        int             *kptl_rxb_nspare;       /* number of spare rx buffers */
+        int             *kptl_credits;          /* number of credits */
+        int             *kptl_peercredits;      /* number of per-peer credits */
+        int             *kptl_max_msg_size;     /* max immd message size*/
+        int             *kptl_peer_hash_table_size; /* # slots in peer hash table */
+        int             *kptl_reschedule_loops; /* scheduler yield loops */
+#ifdef CRAY_XT3
+        int             *kptl_ptltrace_on_timeout; /* dump ptltrace on timeout?
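+                                                    * (presumably via
+                                                    * kptllnd_dump_ptltrace())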
*/ + char **kptl_ptltrace_basename; /* ptltrace dump file basename */ +#endif +#ifdef PJK_DEBUGGING + int *kptl_simulation_bitmap;/* simulation bitmap */ +#endif + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + struct ctl_table_header *kptl_sysctl; /* sysctl interface */ +#endif +} kptl_tunables_t; + +#include "lnet/ptllnd_wire.h" + +/***********************************************************************/ + +typedef struct kptl_data kptl_data_t; +typedef struct kptl_rx_buffer kptl_rx_buffer_t; +typedef struct kptl_peer kptl_peer_t; + +typedef struct { + char eva_type; +} kptl_eventarg_t; + +#define PTLLND_EVENTARG_TYPE_MSG 0x1 +#define PTLLND_EVENTARG_TYPE_RDMA 0x2 +#define PTLLND_EVENTARG_TYPE_BUF 0x3 + +typedef struct kptl_rx /* receive message */ +{ + struct list_head rx_list; /* queue for attention */ + kptl_rx_buffer_t *rx_rxb; /* the rx buffer pointer */ + kptl_msg_t *rx_msg; /* received message */ + int rx_nob; /* received message size */ + ptl_process_id_t rx_initiator; /* sender's address */ +#ifdef CRAY_XT3 + ptl_uid_t rx_uid; /* sender's uid */ +#endif + kptl_peer_t *rx_peer; /* pointer to peer */ + char rx_space[0]; /* copy of incoming request */ +} kptl_rx_t; + +typedef struct kptl_rx_buffer_pool +{ + spinlock_t rxbp_lock; + struct list_head rxbp_list; /* all allocated buffers */ + int rxbp_count; /* # allocated buffers */ + int rxbp_reserved; /* # requests to buffer */ + int rxbp_shutdown; /* shutdown flag */ +} kptl_rx_buffer_pool_t; + +struct kptl_rx_buffer +{ + kptl_rx_buffer_pool_t *rxb_pool; + struct list_head rxb_list; /* for the rxb_pool list */ + struct list_head rxb_repost_list;/* for the kptl_sched_rxbq list */ + int rxb_posted:1; /* on the net */ + int rxb_idle:1; /* all done */ + kptl_eventarg_t rxb_eventarg; /* event->md.user_ptr */ + int rxb_refcount; /* reference count */ + ptl_handle_md_t rxb_mdh; /* the portals memory descriptor (MD) handle */ + char *rxb_buffer; /* the buffer */ + +}; + +enum kptl_tx_type +{ + TX_TYPE_RESERVED = 0, + TX_TYPE_SMALL_MESSAGE = 1, + TX_TYPE_PUT_REQUEST = 2, + TX_TYPE_GET_REQUEST = 3, + TX_TYPE_PUT_RESPONSE = 4, + TX_TYPE_GET_RESPONSE = 5, +}; + +typedef union { +#ifdef _USING_LUSTRE_PORTALS_ + struct iovec iov[PTL_MD_MAX_IOV]; + lnet_kiov_t kiov[PTL_MD_MAX_IOV]; +#else + ptl_md_iovec_t iov[PTL_MD_MAX_IOV]; +#endif +} kptl_fragvec_t; + +typedef struct kptl_tx /* transmit message */ +{ + struct list_head tx_list; /* queue on idle_txs etc */ + atomic_t tx_refcount; /* reference count*/ + enum kptl_tx_type tx_type; /* small msg/{put,get}{req,resp} */ + int tx_active:1; /* queued on the peer */ + int tx_idle:1; /* on the free list */ + kptl_eventarg_t tx_msg_eventarg; /* event->md.user_ptr */ + kptl_eventarg_t tx_rdma_eventarg; /* event->md.user_ptr */ + int tx_status; /* the status of this tx descriptor */ + ptl_handle_md_t tx_rdma_mdh; /* RDMA buffer */ + ptl_handle_md_t tx_msg_mdh; /* the portals MD handle for the initial message */ + lnet_msg_t *tx_lnet_msg; /* LNET message to finalize */ + lnet_msg_t *tx_lnet_replymsg; /* LNET reply message to finalize */ + kptl_msg_t *tx_msg; /* the message data */ + kptl_peer_t *tx_peer; /* the peer this is waiting on */ + unsigned long tx_deadline; /* deadline */ + ptl_md_t tx_rdma_md; /* rdma buffer */ + kptl_fragvec_t *tx_rdma_frags; /* buffer fragments */ +} kptl_tx_t; + +enum kptllnd_peer_state +{ + PEER_STATE_UNINITIALIZED = 0, + PEER_STATE_ALLOCATED = 1, + PEER_STATE_WAITING_HELLO = 2, + PEER_STATE_ACTIVE = 3, + PEER_STATE_CLOSING = 4, + PEER_STATE_ZOMBIE = 5, +}; + +struct kptl_peer 
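+/* (peer lifecycle: normally ALLOCATED -> WAITING_HELLO -> ACTIVE, then torn
+ * down via CLOSING -> ZOMBIE once the last reference is dropped; see
+ * kptllnd_peer_close_locked() and kptllnd_peer_destroy()) */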
+{
+        struct list_head        peer_list;
+        atomic_t                peer_refcount;          /* The current references */
+        enum kptllnd_peer_state peer_state;
+        spinlock_t              peer_lock;              /* serialize */
+        struct list_head        peer_sendq;             /* txs waiting for mh handles */
+        struct list_head        peer_activeq;           /* txs awaiting completion */
+        lnet_process_id_t       peer_id;                /* Peer's LNET id */
+        ptl_process_id_t        peer_ptlid;             /* Peer's portals id */
+        __u64                   peer_incarnation;       /* peer's incarnation */
+        int                     peer_sent_hello;        /* have I sent HELLO? */
+        int                     peer_credits;           /* number of send credits */
+        int                     peer_outstanding_credits;/* number of peer credits */
+        int                     peer_error;             /* errno on closing this peer */
+        cfs_time_t              peer_last_alive;        /* when (in jiffies) this peer was last alive */
+        __u64                   peer_next_matchbits;    /* Next value to register RDMA from peer */
+        __u64                   peer_last_matchbits_seen; /* last matchbits used to RDMA to peer */
+};
+
+struct kptl_data
+{
+        int                     kptl_init;             /* initialisation state */
+        volatile int            kptl_shutdown;         /* shut down? */
+        atomic_t                kptl_nthreads;         /* # live threads */
+        lnet_ni_t              *kptl_ni;               /* _the_ LND instance */
+        ptl_handle_ni_t         kptl_nih;              /* network interface handle */
+        ptl_process_id_t        kptl_portals_id;       /* Portals ID of interface */
+        __u64                   kptl_incarnation;      /* which one am I */
+        ptl_handle_eq_t         kptl_eqh;              /* Event Queue (EQ) */
+
+        spinlock_t              kptl_sched_lock;       /* serialise... */
+        wait_queue_head_t       kptl_sched_waitq;      /* schedulers sleep here */
+        struct list_head        kptl_sched_txq;        /* tx requiring attention */
+        struct list_head        kptl_sched_rxq;        /* rx requiring attention */
+        struct list_head        kptl_sched_rxbq;       /* rxb requiring reposting */
+
+        wait_queue_head_t       kptl_watchdog_waitq;   /* watchdog sleeps here */
+
+        kptl_rx_buffer_pool_t   kptl_rx_buffer_pool;   /* rx buffer pool */
+        cfs_mem_cache_t        *kptl_rx_cache;         /* rx descriptor cache */
+
+        atomic_t                kptl_ntx;              /* # tx descs allocated */
+        spinlock_t              kptl_tx_lock;          /* serialise idle tx list*/
+        struct list_head        kptl_idle_txs;         /* idle tx descriptors */
+
+        rwlock_t                kptl_peer_rw_lock;     /* lock for peer table */
+        struct list_head       *kptl_peers;            /* hash table of all my known peers */
+        struct list_head        kptl_closing_peers;    /* peers being closed */
+        struct list_head        kptl_zombie_peers;     /* peers waiting for refs to drain */
+        int                     kptl_peer_hash_size;   /* size of kptl_peers */
+        int                     kptl_npeers;           /* # peers extant */
+        int                     kptl_n_active_peers;   /* # active peers */
+        int                     kptl_expected_peers;   /* # peers I can buffer HELLOs from */
+
+        kptl_msg_t             *kptl_nak_msg;          /* common NAK message */
+        spinlock_t              kptl_ptlid2str_lock;   /* serialise str ops */
+};
+
+enum
+{
+        PTLLND_INIT_NOTHING = 0,
+        PTLLND_INIT_DATA,
+        PTLLND_INIT_ALL,
+};
+
+extern kptl_tunables_t  kptllnd_tunables;
+extern kptl_data_t      kptllnd_data;
+
+static inline lnet_nid_t
+kptllnd_ptl2lnetnid(ptl_nid_t ptl_nid)
+{
+#ifdef _USING_LUSTRE_PORTALS_
+        return LNET_MKNID(LNET_NIDNET(kptllnd_data.kptl_ni->ni_nid),
+                          LNET_NIDADDR(ptl_nid));
+#else
+        return LNET_MKNID(LNET_NIDNET(kptllnd_data.kptl_ni->ni_nid),
+                          ptl_nid);
+#endif
+}
+
+static inline ptl_nid_t
+kptllnd_lnet2ptlnid(lnet_nid_t lnet_nid)
+{
+#ifdef _USING_LUSTRE_PORTALS_
+        return LNET_MKNID(LNET_NIDNET(kptllnd_data.kptl_portals_id.nid),
+                          LNET_NIDADDR(lnet_nid));
+#else
+        return LNET_NIDADDR(lnet_nid);
+#endif
+}
+
+int kptllnd_startup(lnet_ni_t *ni);
+void kptllnd_shutdown(lnet_ni_t *ni);
+int kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
+int kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
+int kptllnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
+                 int
delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +int kptllnd_eager_recv(struct lnet_ni *ni, void *private, + lnet_msg_t *msg, void **new_privatep); +void kptllnd_eq_callback(ptl_event_t *evp); +int kptllnd_scheduler(void *arg); +int kptllnd_watchdog(void *arg); +int kptllnd_thread_start(int (*fn)(void *arg), void *arg); +int kptllnd_tunables_init(void); +void kptllnd_tunables_fini(void); + +const char *kptllnd_evtype2str(int evtype); +const char *kptllnd_msgtype2str(int msgtype); + +static inline void * +kptllnd_eventarg2obj (kptl_eventarg_t *eva) +{ + switch (eva->eva_type) { + default: + LBUG(); + case PTLLND_EVENTARG_TYPE_BUF: + return list_entry(eva, kptl_rx_buffer_t, rxb_eventarg); + case PTLLND_EVENTARG_TYPE_RDMA: + return list_entry(eva, kptl_tx_t, tx_rdma_eventarg); + case PTLLND_EVENTARG_TYPE_MSG: + return list_entry(eva, kptl_tx_t, tx_msg_eventarg); + } +} + +/* + * RX BUFFER SUPPORT FUNCTIONS + */ +void kptllnd_rx_buffer_pool_init(kptl_rx_buffer_pool_t *rxbp); +void kptllnd_rx_buffer_pool_fini(kptl_rx_buffer_pool_t *rxbp); +int kptllnd_rx_buffer_pool_reserve(kptl_rx_buffer_pool_t *rxbp, int count); +void kptllnd_rx_buffer_pool_unreserve(kptl_rx_buffer_pool_t *rxbp, int count); +void kptllnd_rx_buffer_callback(ptl_event_t *ev); +void kptllnd_rx_buffer_post(kptl_rx_buffer_t *rxb); + +static inline int +kptllnd_rx_buffer_size(void) +{ + return PAGE_SIZE * (*kptllnd_tunables.kptl_rxb_npages); +} + +static inline void +kptllnd_rx_buffer_addref(kptl_rx_buffer_t *rxb) +{ + unsigned long flags; + + spin_lock_irqsave(&rxb->rxb_pool->rxbp_lock, flags); + rxb->rxb_refcount++; + spin_unlock_irqrestore(&rxb->rxb_pool->rxbp_lock, flags); +} + +static inline void +kptllnd_rx_buffer_decref_locked(kptl_rx_buffer_t *rxb) +{ + if (--(rxb->rxb_refcount) == 0) { + spin_lock(&kptllnd_data.kptl_sched_lock); + + list_add_tail(&rxb->rxb_repost_list, + &kptllnd_data.kptl_sched_rxbq); + wake_up(&kptllnd_data.kptl_sched_waitq); + + spin_unlock(&kptllnd_data.kptl_sched_lock); + } +} + +static inline void +kptllnd_rx_buffer_decref(kptl_rx_buffer_t *rxb) +{ + unsigned long flags; + int count; + + spin_lock_irqsave(&rxb->rxb_pool->rxbp_lock, flags); + count = --(rxb->rxb_refcount); + spin_unlock_irqrestore(&rxb->rxb_pool->rxbp_lock, flags); + + if (count == 0) + kptllnd_rx_buffer_post(rxb); +} + +/* + * RX SUPPORT FUNCTIONS + */ +void kptllnd_rx_done(kptl_rx_t *rx); +void kptllnd_rx_parse(kptl_rx_t *rx); + +/* + * PEER SUPPORT FUNCTIONS + */ +int kptllnd_get_peer_info(int index, + lnet_process_id_t *id, + int *state, int *sent_hello, + int *refcount, __u64 *incarnation, + __u64 *next_matchbits, __u64 *last_matchbits_seen, + int *nsendq, int *nactiveq, + int *credits, int *outstanding_credits); +void kptllnd_peer_destroy(kptl_peer_t *peer); +int kptllnd_peer_del(lnet_process_id_t id); +void kptllnd_peer_close_locked(kptl_peer_t *peer, int why); +void kptllnd_peer_close(kptl_peer_t *peer, int why); +void kptllnd_handle_closing_peers(void); +int kptllnd_peer_connect(kptl_tx_t *tx, lnet_nid_t nid); +void kptllnd_peer_check_sends(kptl_peer_t *peer); +void kptllnd_peer_check_bucket(int idx); +void kptllnd_tx_launch(kptl_tx_t *tx, lnet_process_id_t target); +kptl_peer_t *kptllnd_peer_handle_hello(ptl_process_id_t initiator, + kptl_msg_t *msg); +kptl_peer_t *kptllnd_id2peer_locked(lnet_process_id_t id); +void kptllnd_peer_alive(kptl_peer_t *peer); + +static inline void +kptllnd_peer_addref (kptl_peer_t *peer) +{ + 
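+        /* peer_refcount is atomic, so no lock is needed here; the caller is
+         * assumed to hold a reference already (pairs with
+         * kptllnd_peer_decref() below, which frees the peer on zero) */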
atomic_inc(&peer->peer_refcount); +} + +static inline void +kptllnd_peer_decref (kptl_peer_t *peer) +{ + if (atomic_dec_and_test(&peer->peer_refcount)) + kptllnd_peer_destroy(peer); +} + +static inline void +kptllnd_set_tx_peer(kptl_tx_t *tx, kptl_peer_t *peer) +{ + LASSERT (tx->tx_peer == NULL); + + kptllnd_peer_addref(peer); + tx->tx_peer = peer; +} + +static inline struct list_head * +kptllnd_nid2peerlist(lnet_nid_t nid) +{ + unsigned int hash = ((unsigned int)nid) % + kptllnd_data.kptl_peer_hash_size; + + return &kptllnd_data.kptl_peers[hash]; +} + +static inline kptl_peer_t * +kptllnd_id2peer(lnet_process_id_t id) +{ + kptl_peer_t *peer; + unsigned long flags; + + read_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags); + peer = kptllnd_id2peer_locked(id); + read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags); + + return peer; +} + +static inline int +kptllnd_reserve_buffers(int n) +{ + return kptllnd_rx_buffer_pool_reserve(&kptllnd_data.kptl_rx_buffer_pool, + n); +} + +static inline int +kptllnd_peer_reserve_buffers(void) +{ + return kptllnd_reserve_buffers(*kptllnd_tunables.kptl_peercredits); +} + +static inline void +kptllnd_peer_unreserve_buffers(void) +{ + kptllnd_rx_buffer_pool_unreserve(&kptllnd_data.kptl_rx_buffer_pool, + *kptllnd_tunables.kptl_peercredits); +} + +/* + * TX SUPPORT FUNCTIONS + */ +int kptllnd_setup_tx_descs(void); +void kptllnd_cleanup_tx_descs(void); +void kptllnd_tx_fini(kptl_tx_t *tx); +kptl_tx_t *kptllnd_get_idle_tx(enum kptl_tx_type purpose); +void kptllnd_tx_callback(ptl_event_t *ev); +const char *kptllnd_tx_typestr(int type); + +static inline void +kptllnd_tx_addref(kptl_tx_t *tx) +{ + atomic_inc(&tx->tx_refcount); +} + +static inline void +kptllnd_tx_decref(kptl_tx_t *tx) +{ + LASSERT (!in_interrupt()); /* Thread context only */ + + if (atomic_dec_and_test(&tx->tx_refcount)) + kptllnd_tx_fini(tx); +} + +/* + * MESSAGE SUPPORT FUNCTIONS + */ +void kptllnd_init_msg(kptl_msg_t *msg, int type, int body_nob); +void kptllnd_msg_pack(kptl_msg_t *msg, kptl_peer_t *peer); +int kptllnd_msg_unpack(kptl_msg_t *msg, int nob); + +/* + * MISC SUPPORT FUNCTIONS + */ +void kptllnd_init_rdma_md(kptl_tx_t *tx, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int nob); +char *kptllnd_ptlid2str(ptl_process_id_t id); + +void kptllnd_init_ptltrace(void); +void kptllnd_dump_ptltrace(void); + +#ifdef PJK_DEBUGGING +#define SIMULATION_FAIL_TX_PUT_ALLOC 0 /* 0x00000001 */ +#define SIMULATION_FAIL_TX_GET_ALLOC 1 /* 0x00000002 */ +#define SIMULATION_FAIL_TX 2 /* 0x00000004 */ +#define SIMULATION_FAIL_RX_ALLOC 3 /* 0x00000008 */ + +#define IS_SIMULATION_ENABLED(x) \ + (((*kptllnd_tunables.kptl_simulation_bitmap) & 1<< SIMULATION_##x) != 0) +#else +#define IS_SIMULATION_ENABLED(x) 0 +#endif + diff --git a/lnet/klnds/ptllnd/ptllnd_cb.c b/lnet/klnds/ptllnd/ptllnd_cb.c new file mode 100644 index 0000000..89456ac --- /dev/null +++ b/lnet/klnds/ptllnd/ptllnd_cb.c @@ -0,0 +1,760 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved. + * Author: PJ Kirner + * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. + * + * This file is confidential source code owned by Cluster File Systems. + * No viewing, modification, compilation, redistribution, or any other + * form of use is permitted except through a signed license agreement. 
+ * + * If you have not signed such an agreement, then you have no rights to + * this file. Please destroy it immediately and contact CFS. + * + */ + +#include "ptllnd.h" + +#ifndef _USING_LUSTRE_PORTALS_ +int +kptllnd_extract_iov (int dst_niov, ptl_md_iovec_t *dst, + int src_niov, struct iovec *src, + unsigned int offset, unsigned int len) +{ + /* Initialise 'dst' to the subset of 'src' starting at 'offset', + * for exactly 'len' bytes, and return the number of entries. + * NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT (src_niov > 0); + while (offset >= src->iov_len) { /* skip initial frags */ + offset -= src->iov_len; + src_niov--; + src++; + LASSERT (src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT (src_niov > 0); + LASSERT (niov <= dst_niov); + + frag_len = src->iov_len - offset; + dst->iov_base = ((char *)src->iov_base) + offset; + + if (len <= frag_len) { + dst->iov_len = len; + return (niov); + } + + dst->iov_len = frag_len; + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } +} + +int +kptllnd_extract_phys (int dst_niov, ptl_md_iovec_t *dst, + int src_niov, lnet_kiov_t *src, + unsigned int offset, unsigned int len) +{ + /* Initialise 'dst' to the physical addresses of the subset of 'src' + * starting at 'offset', for exactly 'len' bytes, and return the number + * of entries. NB not destructive to 'src' */ + unsigned int frag_len; + unsigned int niov; + __u64 phys_page; + __u64 phys; + + if (len == 0) /* no data => */ + return (0); /* no frags */ + + LASSERT (src_niov > 0); + while (offset >= src->kiov_len) { /* skip initial frags */ + offset -= src->kiov_len; + src_niov--; + src++; + LASSERT (src_niov > 0); + } + + niov = 1; + for (;;) { + LASSERT (src_niov > 0); + LASSERT (niov <= dst_niov); + + frag_len = min(src->kiov_len - offset, len); + phys_page = lnet_page2phys(src->kiov_page); + phys = phys_page + src->kiov_offset + offset; + + LASSERT (sizeof(void *) > 4 || + (phys <= 0xffffffffULL && + phys + (frag_len - 1) <= 0xffffffffULL)); + + dst->iov_base = (void *)((unsigned long)phys); + dst->iov_len = frag_len; + + if (frag_len == len) + return niov; + + len -= frag_len; + dst++; + src++; + niov++; + src_niov--; + offset = 0; + } +} +#endif + +void +kptllnd_init_rdma_md(kptl_tx_t *tx, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int nob) +{ + LASSERT (iov == NULL || kiov == NULL); + + memset(&tx->tx_rdma_md, 0, sizeof(tx->tx_rdma_md)); + + tx->tx_rdma_md.start = tx->tx_rdma_frags; + tx->tx_rdma_md.user_ptr = &tx->tx_rdma_eventarg; + tx->tx_rdma_md.eq_handle = kptllnd_data.kptl_eqh; + tx->tx_rdma_md.options = PTL_MD_LUSTRE_COMPLETION_SEMANTICS | + PTL_MD_EVENT_START_DISABLE; + switch (tx->tx_type) { + default: + LBUG(); + + case TX_TYPE_PUT_REQUEST: /* passive: peer gets */ + tx->tx_rdma_md.threshold = 1; /* GET event */ + tx->tx_rdma_md.options |= PTL_MD_OP_GET; + break; + + case TX_TYPE_GET_REQUEST: /* passive: peer puts */ + tx->tx_rdma_md.threshold = 1; /* PUT event */ + tx->tx_rdma_md.options |= PTL_MD_OP_PUT; + break; + + case TX_TYPE_PUT_RESPONSE: /* active: I get */ + tx->tx_rdma_md.threshold = 2; /* SEND + REPLY */ + break; + + case TX_TYPE_GET_RESPONSE: /* active: I put */ + tx->tx_rdma_md.threshold = 1; /* SEND */ + break; + } + + if (nob == 0) { + tx->tx_rdma_md.length = 0; + return; + } + +#ifdef _USING_LUSTRE_PORTALS_ + if (iov != NULL) { + tx->tx_rdma_md.options |= PTL_MD_IOVEC; 
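+                /* lnet_extract_iov() is assumed to behave like
+                 * kptllnd_extract_iov() above: clip the vector to
+                 * 'offset'/'nob' and return the number of entries used */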
+ tx->tx_rdma_md.length = + lnet_extract_iov(PTL_MD_MAX_IOV, tx->tx_rdma_frags->iov, + niov, iov, offset, nob); + return; + } + + /* Cheating OK since ptl_kiov_t == lnet_kiov_t */ + CLASSERT(sizeof(ptl_kiov_t) == sizeof(lnet_kiov_t)); + CLASSERT(offsetof(ptl_kiov_t, kiov_offset) == + offsetof(lnet_kiov_t, kiov_offset)); + CLASSERT(offsetof(ptl_kiov_t, kiov_page) == + offsetof(lnet_kiov_t, kiov_page)); + CLASSERT(offsetof(ptl_kiov_t, kiov_len) == + offsetof(lnet_kiov_t, kiov_len)); + + tx->tx_rdma_md.options |= PTL_MD_KIOV; + tx->tx_rdma_md.length = + lnet_extract_kiov(PTL_MD_MAX_IOV, tx->tx_rdma_frags->kiov, + niov, kiov, offset, nob); +#else + if (iov != NULL) { + tx->tx_rdma_md.options |= PTL_MD_IOVEC; + tx->tx_rdma_md.length = + kptllnd_extract_iov(PTL_MD_MAX_IOV, tx->tx_rdma_frags->iov, + niov, iov, offset, nob); + return; + } + + tx->tx_rdma_md.options |= PTL_MD_IOVEC | PTL_MD_PHYS; + tx->tx_rdma_md.length = + kptllnd_extract_phys(PTL_MD_MAX_IOV, tx->tx_rdma_frags->iov, + niov, kiov, offset, nob); +#endif +} + +int +kptllnd_active_rdma(kptl_rx_t *rx, lnet_msg_t *lntmsg, int type, + unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, int nob) +{ + kptl_tx_t *tx; + ptl_err_t ptlrc; + kptl_msg_t *rxmsg = rx->rx_msg; + kptl_peer_t *peer = rx->rx_peer; + unsigned long flags; + ptl_handle_md_t mdh; + + LASSERT (type == TX_TYPE_PUT_RESPONSE || + type == TX_TYPE_GET_RESPONSE); + + tx = kptllnd_get_idle_tx(type); + if (tx == NULL) { + CERROR ("Can't do %s rdma to %s: can't allocate descriptor\n", + type == TX_TYPE_PUT_RESPONSE ? "GET" : "PUT", + libcfs_id2str(peer->peer_id)); + return -ENOMEM; + } + + kptllnd_set_tx_peer(tx, peer); + kptllnd_init_rdma_md(tx, niov, iov, kiov, offset, nob); + + ptlrc = PtlMDBind(kptllnd_data.kptl_nih, tx->tx_rdma_md, + PTL_UNLINK, &mdh); + if (ptlrc != PTL_OK) { + CERROR("PtlMDBind(%s) failed: %d\n", + libcfs_id2str(peer->peer_id), ptlrc); + tx->tx_status = -EIO; + kptllnd_tx_decref(tx); + return -EIO; + } + + spin_lock_irqsave(&peer->peer_lock, flags); + + tx->tx_lnet_msg = lntmsg; + /* lnet_finalize() will be called when tx is torn down, so I must + * return success from here on... */ + + tx->tx_deadline = jiffies + (*kptllnd_tunables.kptl_timeout * HZ); + tx->tx_rdma_mdh = mdh; + tx->tx_active = 1; + list_add_tail(&tx->tx_list, &peer->peer_activeq); + + /* peer has now got my ref on 'tx' */ + + spin_unlock_irqrestore(&peer->peer_lock, flags); + + if (type == TX_TYPE_GET_RESPONSE) + ptlrc = PtlPut(mdh, + PTL_NOACK_REQ, + rx->rx_initiator, + *kptllnd_tunables.kptl_portal, + 0, /* acl cookie */ + rxmsg->ptlm_u.rdma.kptlrm_matchbits, + 0, /* offset */ + (lntmsg != NULL) ? /* header data */ + PTLLND_RDMA_OK : + PTLLND_RDMA_FAIL); + else + ptlrc = PtlGet(mdh, + rx->rx_initiator, + *kptllnd_tunables.kptl_portal, + 0, /* acl cookie */ + rxmsg->ptlm_u.rdma.kptlrm_matchbits, + 0); /* offset */ + + if (ptlrc != PTL_OK) { + CERROR("Ptl%s failed: %d\n", + (type == TX_TYPE_GET_RESPONSE) ? 
"Put" : "Get", ptlrc); + + kptllnd_peer_close(peer, -EIO); + /* Everything (including this RDMA) queued on the peer will + * be completed with failure */ + } + + return 0; +} + +int +kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + lnet_hdr_t *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int payload_niov = lntmsg->msg_niov; + struct iovec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + kptl_tx_t *tx; + int nob; + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= LNET_MAX_IOV); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); /* !!! */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + LASSERT (!in_interrupt()); + + switch (type) { + default: + LBUG(); + return -EINVAL; + + case LNET_MSG_REPLY: + case LNET_MSG_PUT: + /* Is the payload small enough not to need RDMA? */ + nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[payload_nob]); + if (nob <= *kptllnd_tunables.kptl_max_msg_size) + break; + + tx = kptllnd_get_idle_tx(TX_TYPE_PUT_REQUEST); + if (tx == NULL) { + CERROR("Can't send %s to %s: can't allocate descriptor\n", + lnet_msgtyp2str(type), + libcfs_id2str(target)); + return -ENOMEM; + } + + kptllnd_init_rdma_md(tx, payload_niov, + payload_iov, payload_kiov, + payload_offset, payload_nob); + + tx->tx_lnet_msg = lntmsg; + tx->tx_msg->ptlm_u.rdma.kptlrm_hdr = *hdr; + kptllnd_init_msg (tx->tx_msg, PTLLND_MSG_TYPE_PUT, + sizeof(kptl_rdma_msg_t)); + kptllnd_tx_launch(tx, target); + return 0; + + case LNET_MSG_GET: + /* routed gets don't RDMA */ + if (target_is_router || routing) + break; + + /* Is the payload small enough not to need RDMA? 
*/ + nob = lntmsg->msg_md->md_length; + nob = offsetof(kptl_msg_t, + ptlm_u.immediate.kptlim_payload[nob]); + if (nob <= *kptllnd_tunables.kptl_max_msg_size) + break; + + tx = kptllnd_get_idle_tx(TX_TYPE_GET_REQUEST); + if (tx == NULL) { + CERROR("Can't send GET to %s: can't allocate descriptor\n", + libcfs_id2str(target)); + return -ENOMEM; + } + + tx->tx_lnet_replymsg = + lnet_create_reply_msg(kptllnd_data.kptl_ni, lntmsg); + if (tx->tx_lnet_replymsg == NULL) { + CERROR("Failed to allocate LNET reply for %s\n", + libcfs_id2str(target)); + kptllnd_tx_decref(tx); + return -ENOMEM; + } + + if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) + kptllnd_init_rdma_md(tx, lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.iov, NULL, + 0, lntmsg->msg_md->md_length); + else + kptllnd_init_rdma_md(tx, lntmsg->msg_md->md_niov, + NULL, lntmsg->msg_md->md_iov.kiov, + 0, lntmsg->msg_md->md_length); + + tx->tx_lnet_msg = lntmsg; + tx->tx_msg->ptlm_u.rdma.kptlrm_hdr = *hdr; + kptllnd_init_msg (tx->tx_msg, PTLLND_MSG_TYPE_GET, + sizeof(kptl_rdma_msg_t)); + kptllnd_tx_launch(tx, target); + return 0; + + case LNET_MSG_ACK: + CDEBUG(D_NET, "LNET_MSG_ACK\n"); + LASSERT (payload_nob == 0); + break; + } + + tx = kptllnd_get_idle_tx(TX_TYPE_SMALL_MESSAGE); + if (tx == NULL) { + CERROR("Can't send %s to %s: can't allocate descriptor\n", + lnet_msgtyp2str(type), libcfs_id2str(target)); + return -ENOMEM; + } + + tx->tx_lnet_msg = lntmsg; + tx->tx_msg->ptlm_u.immediate.kptlim_hdr = *hdr; + + if (payload_kiov != NULL) + lnet_copy_kiov2flat(*kptllnd_tunables.kptl_max_msg_size, + tx->tx_msg->ptlm_u.immediate.kptlim_payload, + 0, + payload_niov, payload_kiov, + payload_offset, payload_nob); + else + lnet_copy_iov2flat(*kptllnd_tunables.kptl_max_msg_size, + tx->tx_msg->ptlm_u.immediate.kptlim_payload, + 0, + payload_niov, payload_iov, + payload_offset, payload_nob); + + nob = offsetof(kptl_immediate_msg_t, kptlim_payload[payload_nob]); + kptllnd_init_msg(tx->tx_msg, PTLLND_MSG_TYPE_IMMEDIATE, nob); + kptllnd_tx_launch(tx, target); + return 0; +} + +int +kptllnd_eager_recv(struct lnet_ni *ni, void *private, + lnet_msg_t *msg, void **new_privatep) +{ + kptl_rx_t *rx = private; + + CDEBUG(D_NET, "Eager RX=%p RXB=%p\n", rx, rx->rx_rxb); + + /* I have to release my ref on rxb (if I have one) to ensure I'm an + * eager receiver, so I copy the incoming request from the buffer it + * landed in, into space reserved in the descriptor... */ + +#if (PTL_MD_LOCAL_ALIGN8 == 0) + if (rx->rx_rxb == NULL) /* already copied */ + return 0; /* to fix alignment */ +#else + LASSERT(rx->rx_rxb != NULL); +#endif + LASSERT(rx->rx_nob <= *kptllnd_tunables.kptl_max_msg_size); + + memcpy(rx->rx_space, rx->rx_msg, rx->rx_nob); + rx->rx_msg = (kptl_msg_t *)rx->rx_space; + + kptllnd_rx_buffer_decref(rx->rx_rxb); + rx->rx_rxb = NULL; + + return 0; +} + + +int +kptllnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, + unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + kptl_rx_t *rx = private; + kptl_msg_t *rxmsg = rx->rx_msg; + int nob; + int rc; + + CDEBUG(D_NET, "%s niov=%d offset=%d mlen=%d rlen=%d\n", + kptllnd_msgtype2str(rxmsg->ptlm_type), + niov, offset, mlen, rlen); + + LASSERT (mlen <= rlen); + LASSERT (mlen >= 0); + LASSERT (!in_interrupt()); + LASSERT (!(kiov != NULL && iov != NULL)); /* never both */ + LASSERT (niov <= PTL_MD_MAX_IOV); /* !!! 
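+         * LNET_MAX_IOV may exceed PTL_MD_MAX_IOV; larger vectors are
+         * not handled here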
*/ + +#ifdef CRAY_XT3 + if (lntmsg != NULL && + rx->rx_uid != 0) { + /* Set the UID if the sender's uid isn't 0; i.e. non-root + * running in userspace (e.g. a catamount node; linux kernel + * senders, including routers have uid 0). If this is a lustre + * RPC request, this tells lustre not to trust the creds in the + * RPC message body. */ + lnet_set_msg_uid(ni, lntmsg, rx->rx_uid); + } +#endif + switch(rxmsg->ptlm_type) + { + default: + LBUG(); + rc = -EINVAL; + break; + + case PTLLND_MSG_TYPE_IMMEDIATE: + CDEBUG(D_NET, "PTLLND_MSG_TYPE_IMMEDIATE %d,%d\n", mlen, rlen); + + nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[rlen]); + if (nob > rx->rx_nob) { + CERROR ("Immediate message from %s too big: %d(%d)\n", + libcfs_id2str(rx->rx_peer->peer_id), nob, + rx->rx_nob); + rc = -EINVAL; + break; + } + + if (kiov != NULL) + lnet_copy_flat2kiov( + niov, kiov, offset, + *kptllnd_tunables.kptl_max_msg_size, + rxmsg->ptlm_u.immediate.kptlim_payload, + 0, + mlen); + else + lnet_copy_flat2iov( + niov, iov, offset, + *kptllnd_tunables.kptl_max_msg_size, + rxmsg->ptlm_u.immediate.kptlim_payload, + 0, + mlen); + + lnet_finalize (ni, lntmsg, 0); + rc = 0; + break; + + case PTLLND_MSG_TYPE_GET: + CDEBUG(D_NET, "PTLLND_MSG_TYPE_GET %d,%d\n", mlen, rlen); + + /* NB always send RDMA so the peer can complete. I send + * success/failure in the portals 'hdr_data' */ + + if (lntmsg == NULL) + rc = kptllnd_active_rdma(rx, NULL, + TX_TYPE_GET_RESPONSE, + 0, NULL, NULL, 0, 0); + else + rc = kptllnd_active_rdma(rx, lntmsg, + TX_TYPE_GET_RESPONSE, + lntmsg->msg_niov, + lntmsg->msg_iov, + lntmsg->msg_kiov, + lntmsg->msg_offset, + lntmsg->msg_len); + break; + + case PTLLND_MSG_TYPE_PUT: + CDEBUG(D_NET, "PTLLND_MSG_TYPE_PUT %d,%d\n", mlen, rlen); + + /* NB always send RDMA so the peer can complete; it'll be 0 + * bytes if there was no match (lntmsg == NULL). I have no way + * to let my peer know this, but she's only interested in when + * the net has stopped accessing her buffer in any case. 
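+         * (even a zero-byte RDMA completes the peer's MD, so she can
+         * release the buffer)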
*/ + + rc = kptllnd_active_rdma(rx, lntmsg, TX_TYPE_PUT_RESPONSE, + niov, iov, kiov, offset, mlen); + break; + } + + /* + * We're done with the RX + */ + kptllnd_rx_done(rx); + return rc; +} + +void +kptllnd_eq_callback(ptl_event_t *ev) +{ + kptl_eventarg_t *eva = ev->md.user_ptr; + + switch (eva->eva_type) { + default: + LBUG(); + + case PTLLND_EVENTARG_TYPE_MSG: + case PTLLND_EVENTARG_TYPE_RDMA: + kptllnd_tx_callback(ev); + break; + + case PTLLND_EVENTARG_TYPE_BUF: + kptllnd_rx_buffer_callback(ev); + break; + } +} + +void +kptllnd_thread_fini (void) +{ + atomic_dec(&kptllnd_data.kptl_nthreads); +} + +int +kptllnd_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid; + + atomic_inc(&kptllnd_data.kptl_nthreads); + + pid = kernel_thread (fn, arg, 0); + if (pid >= 0) + return 0; + + CERROR("Failed to start kernel_thread: error %d\n", (int)pid); + kptllnd_thread_fini(); + return (int)pid; +} + +int +kptllnd_watchdog(void *arg) +{ + int id = (long)arg; + char name[16]; + wait_queue_t waitlink; + int peer_index = 0; + unsigned long deadline = jiffies; + int timeout; + int i; + + snprintf(name, sizeof(name), "kptllnd_wd_%02d", id); + cfs_daemonize(name); + cfs_block_allsigs(); + + init_waitqueue_entry(&waitlink, current); + + /* threads shut down in phase 2 after all peers have been destroyed */ + while (kptllnd_data.kptl_shutdown < 2) { + + timeout = (int)(deadline - jiffies); + + if (timeout <= 0) { + const int n = 4; + const int p = 1; + int chunk = kptllnd_data.kptl_peer_hash_size; + + + /* Time to check for RDMA timeouts on a few more + * peers: I do checks every 'p' seconds on a + * proportion of the peer table and I need to check + * every connection 'n' times within a timeout + * interval, to ensure I detect a timeout on any + * connection within (n+1)/n times the timeout + * interval. 
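+         * Worked example with the module defaults defined below
+         * (timeout=50, peer_hash_table_size=101) and n=4, p=1:
+         * chunk = (101 * 4 * 1) / 50 = 8 buckets scanned per wakeup,
+         * i.e. a full sweep roughly every 13 seconds, about 4 sweeps
+         * per 50s timeout interval.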
*/ + + if ((*kptllnd_tunables.kptl_timeout) > n * p) + chunk = (chunk * n * p) / + (*kptllnd_tunables.kptl_timeout); + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + kptllnd_peer_check_bucket(peer_index); + peer_index = (peer_index + 1) % + kptllnd_data.kptl_peer_hash_size; + } + + deadline += p * HZ; + continue; + } + + kptllnd_handle_closing_peers(); + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&kptllnd_data.kptl_watchdog_waitq, + &waitlink); + + schedule_timeout(timeout); + + set_current_state (TASK_RUNNING); + remove_wait_queue(&kptllnd_data.kptl_watchdog_waitq, &waitlink); + } + + kptllnd_thread_fini(); + CDEBUG(D_NET, "<<<\n"); + return (0); +}; + +int +kptllnd_scheduler (void *arg) +{ + int id = (long)arg; + char name[16]; + wait_queue_t waitlink; + unsigned long flags; + int did_something; + int counter = 0; + kptl_rx_t *rx; + kptl_rx_buffer_t *rxb; + kptl_tx_t *tx; + + snprintf(name, sizeof(name), "kptllnd_sd_%02d", id); + cfs_daemonize(name); + cfs_block_allsigs(); + + init_waitqueue_entry(&waitlink, current); + + spin_lock_irqsave(&kptllnd_data.kptl_sched_lock, flags); + + /* threads shut down in phase 2 after all peers have been destroyed */ + while (kptllnd_data.kptl_shutdown < 2) { + + did_something = 0; + + if (!list_empty(&kptllnd_data.kptl_sched_rxq)) { + rx = list_entry (kptllnd_data.kptl_sched_rxq.next, + kptl_rx_t, rx_list); + list_del(&rx->rx_list); + + spin_unlock_irqrestore(&kptllnd_data.kptl_sched_lock, + flags); + + kptllnd_rx_parse(rx); + did_something = 1; + + spin_lock_irqsave(&kptllnd_data.kptl_sched_lock, flags); + } + + if (!list_empty(&kptllnd_data.kptl_sched_rxbq)) { + rxb = list_entry (kptllnd_data.kptl_sched_rxbq.next, + kptl_rx_buffer_t, rxb_repost_list); + list_del(&rxb->rxb_repost_list); + + spin_unlock_irqrestore(&kptllnd_data.kptl_sched_lock, + flags); + + kptllnd_rx_buffer_post(rxb); + did_something = 1; + + spin_lock_irqsave(&kptllnd_data.kptl_sched_lock, flags); + } + + if (!list_empty(&kptllnd_data.kptl_sched_txq)) { + tx = list_entry (kptllnd_data.kptl_sched_txq.next, + kptl_tx_t, tx_list); + list_del_init(&tx->tx_list); + + spin_unlock_irqrestore(&kptllnd_data.kptl_sched_lock, flags); + + kptllnd_tx_fini(tx); + did_something = 1; + + spin_lock_irqsave(&kptllnd_data.kptl_sched_lock, flags); + } + + if (did_something) { + if (++counter != *kptllnd_tunables.kptl_reschedule_loops) + continue; + } + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue_exclusive(&kptllnd_data.kptl_sched_waitq, + &waitlink); + spin_unlock_irqrestore(&kptllnd_data.kptl_sched_lock, flags); + + if (!did_something) + schedule(); + else + cond_resched(); + + set_current_state(TASK_RUNNING); + remove_wait_queue(&kptllnd_data.kptl_sched_waitq, &waitlink); + + spin_lock_irqsave(&kptllnd_data.kptl_sched_lock, flags); + + counter = 0; + } + + spin_unlock_irqrestore(&kptllnd_data.kptl_sched_lock, flags); + + kptllnd_thread_fini(); + return 0; +} + diff --git a/lnet/klnds/ptllnd/ptllnd_modparams.c b/lnet/klnds/ptllnd/ptllnd_modparams.c new file mode 100644 index 0000000..84b62d6 --- /dev/null +++ b/lnet/klnds/ptllnd/ptllnd_modparams.c @@ -0,0 +1,217 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved. + * Author: PJ Kirner + * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. 
+ * + * This file is confidential source code owned by Cluster File Systems. + * No viewing, modification, compilation, redistribution, or any other + * form of use is permitted except through a signed license agreement. + * + * If you have not signed such an agreement, then you have no rights to + * this file. Please destroy it immediately and contact CFS. + * + */ + + +#include "ptllnd.h" + +static int ntx = 256; +CFS_MODULE_PARM(ntx, "i", int, 0444, + "# of TX descriptors"); + +static int max_nodes = 1152; +CFS_MODULE_PARM(max_nodes, "i", int, 0444, + "maximum number of peer nodes"); + +static int max_procs_per_node = 2; +CFS_MODULE_PARM(max_procs_per_node, "i", int, 0444, + "maximum number of processes per peer node to cache"); + +static int checksum = 0; +CFS_MODULE_PARM(checksum, "i", int, 0644, + "set non-zero to enable message (not RDMA) checksums"); + +static int timeout = 50; +CFS_MODULE_PARM(timeout, "i", int, 0644, + "timeout (seconds)"); + +static int portal = PTLLND_PORTAL; /* */ +CFS_MODULE_PARM(portal, "i", int, 0444, + "portal id"); + +static int pid = PTLLND_PID; /* */ +CFS_MODULE_PARM(pid, "i", int, 0444, + "portals pid"); + +static int rxb_npages = 1; +CFS_MODULE_PARM(rxb_npages, "i", int, 0444, + "# of pages per rx buffer"); + +static int rxb_nspare = 8; +CFS_MODULE_PARM(rxb_nspare, "i", int, 0444, + "# of spare rx buffers"); + +static int credits = 128; +CFS_MODULE_PARM(credits, "i", int, 0444, + "concurrent sends"); + +static int peercredits = PTLLND_PEERCREDITS; /* */ +CFS_MODULE_PARM(peercredits, "i", int, 0444, + "concurrent sends to 1 peer"); + +static int max_msg_size = PTLLND_MAX_MSG_SIZE; /* */ +CFS_MODULE_PARM(max_msg_size, "i", int, 0444, + "max size of immediate message"); + +static int peer_hash_table_size = 101; +CFS_MODULE_PARM(peer_hash_table_size, "i", int, 0444, + "# of slots in the peer hash table"); + +static int reschedule_loops = 100; +CFS_MODULE_PARM(reschedule_loops, "i", int, 0644, + "# of loops before scheduler does cond_resched()"); + +#ifdef CRAY_XT3 +static int ptltrace_on_timeout = 0; +CFS_MODULE_PARM(ptltrace_on_timeout, "i", int, 0644, + "dump ptltrace on timeout"); + +static char *ptltrace_basename = "/tmp/lnet-ptltrace"; +CFS_MODULE_PARM(ptltrace_basename, "s", charp, 0644, + "ptltrace dump file basename"); +#endif +#ifdef PJK_DEBUGGING +static int simulation_bitmap = 0; +CFS_MODULE_PARM(simulation_bitmap, "i", int, 0444, + "simulation bitmap"); +#endif + + +kptl_tunables_t kptllnd_tunables = { + .kptl_ntx = &ntx, + .kptl_max_nodes = &max_nodes, + .kptl_max_procs_per_node = &max_procs_per_node, + .kptl_checksum = &checksum, + .kptl_portal = &portal, + .kptl_pid = &pid, + .kptl_timeout = &timeout, + .kptl_rxb_npages = &rxb_npages, + .kptl_rxb_nspare = &rxb_nspare, + .kptl_credits = &credits, + .kptl_peercredits = &peercredits, + .kptl_max_msg_size = &max_msg_size, + .kptl_peer_hash_table_size = &peer_hash_table_size, + .kptl_reschedule_loops = &reschedule_loops, +#ifdef CRAY_XT3 + .kptl_ptltrace_on_timeout = &ptltrace_on_timeout, + .kptl_ptltrace_basename = &ptltrace_basename, +#endif +#ifdef PJK_DEBUGGING + .kptl_simulation_bitmap = &simulation_bitmap, +#endif +}; + + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM +#ifdef CRAY_XT3 +static char ptltrace_basename_space[1024]; + +static void +kptllnd_init_strtunable(char **str_param, char *space, int size) +{ + strncpy(space, *str_param, size); + space[size - 1] = 0; + *str_param = space; +} +#endif + +static ctl_table kptllnd_ctl_table[] = { + {1, "ntx", &ntx, + sizeof(int), 0444, NULL, 
&proc_dointvec}, + {2, "max_nodes", &max_nodes, + sizeof(int), 0444, NULL, &proc_dointvec}, + {3, "max_procs_per_node", &max_procs_per_node, + sizeof(int), 0444, NULL, &proc_dointvec}, + {4, "checksum", &checksum, + sizeof(int), 0644, NULL, &proc_dointvec}, + {5, "timeout", &timeout, + sizeof(int), 0644, NULL, &proc_dointvec}, + {6, "portal", &portal, + sizeof(int), 0444, NULL, &proc_dointvec}, + {7, "pid", &pid, + sizeof(int), 0444, NULL, &proc_dointvec}, + {8, "rxb_npages", &rxb_npages, + sizeof(int), 0444, NULL, &proc_dointvec}, + {9, "credits", &credits, + sizeof(int), 0444, NULL, &proc_dointvec}, + {10, "peercredits", &peercredits, + sizeof(int), 0444, NULL, &proc_dointvec}, + {11, "max_msg_size", &max_msg_size, + sizeof(int), 0444, NULL, &proc_dointvec}, + {12, "peer_hash_table_size", &peer_hash_table_size, + sizeof(int), 0444, NULL, &proc_dointvec}, + {13, "reschedule_loops", &reschedule_loops, + sizeof(int), 0444, NULL, &proc_dointvec}, +#ifdef CRAY_XT3 + {14, "ptltrace_on_timeout", &ptltrace_on_timeout, + sizeof(int), 0644, NULL, &proc_dointvec}, + {15, "ptltrace_basename", ptltrace_basename_space, + sizeof(ptltrace_basename_space), 0644, NULL, &proc_dostring, + &sysctl_string}, +#endif +#ifdef PJK_DEBUGGING + {16, "simulation_bitmap", &simulation_bitmap, + sizeof(int), 0444, NULL, &proc_dointvec}, +#endif + + {0} +}; + +static ctl_table kptllnd_top_ctl_table[] = { + {203, "ptllnd", NULL, 0, 0555, kptllnd_ctl_table}, + {0} +}; + +int +kptllnd_tunables_init () +{ +#ifdef CRAY_XT3 + kptllnd_init_strtunable(&ptltrace_basename, + ptltrace_basename_space, + sizeof(ptltrace_basename_space)); +#endif + kptllnd_tunables.kptl_sysctl = + register_sysctl_table(kptllnd_top_ctl_table, 0); + + if (kptllnd_tunables.kptl_sysctl == NULL) + CWARN("Can't setup /proc tunables\n"); + + return 0; +} + +void +kptllnd_tunables_fini () +{ + if (kptllnd_tunables.kptl_sysctl != NULL) + unregister_sysctl_table(kptllnd_tunables.kptl_sysctl); +} + +#else + +int +kptllnd_tunables_init () +{ + return 0; +} + +void +kptllnd_tunables_fini () +{ +} + +#endif + diff --git a/lnet/klnds/ptllnd/ptllnd_peer.c b/lnet/klnds/ptllnd/ptllnd_peer.c new file mode 100644 index 0000000..cefcb7d --- /dev/null +++ b/lnet/klnds/ptllnd/ptllnd_peer.c @@ -0,0 +1,1209 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved. + * Author: PJ Kirner + * E Barton + * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. + * + * This file is confidential source code owned by Cluster File Systems. + * No viewing, modification, compilation, redistribution, or any other + * form of use is permitted except through a signed license agreement. + * + * If you have not signed such an agreement, then you have no rights to + * this file. Please destroy it immediately and contact CFS. 
+ * + */ + +#include "ptllnd.h" +#include + +static int +kptllnd_count_queue(struct list_head *q) +{ + struct list_head *e; + int n = 0; + + list_for_each(e, q) { + n++; + } + + return n; +} + +int +kptllnd_get_peer_info(int index, + lnet_process_id_t *id, + int *state, int *sent_hello, + int *refcount, __u64 *incarnation, + __u64 *next_matchbits, __u64 *last_matchbits_seen, + int *nsendq, int *nactiveq, + int *credits, int *outstanding_credits) +{ + rwlock_t *g_lock = &kptllnd_data.kptl_peer_rw_lock; + unsigned long flags; + struct list_head *ptmp; + kptl_peer_t *peer; + int i; + int rc = -ENOENT; + + read_lock_irqsave(g_lock, flags); + + for (i = 0; i < kptllnd_data.kptl_peer_hash_size; i++) { + + list_for_each (ptmp, &kptllnd_data.kptl_peers[i]) { + peer = list_entry(ptmp, kptl_peer_t, peer_list); + + if (index-- > 0) + continue; + + *id = peer->peer_id; + *state = peer->peer_state; + *sent_hello = peer->peer_sent_hello; + *refcount = atomic_read(&peer->peer_refcount); + *incarnation = peer->peer_incarnation; + + spin_lock(&peer->peer_lock); + + *next_matchbits = peer->peer_next_matchbits; + *last_matchbits_seen = peer->peer_last_matchbits_seen; + *credits = peer->peer_credits; + *outstanding_credits = peer->peer_outstanding_credits; + + *nsendq = kptllnd_count_queue(&peer->peer_sendq); + *nactiveq = kptllnd_count_queue(&peer->peer_activeq); + + spin_unlock(&peer->peer_lock); + + rc = 0; + goto out; + } + } + + out: + read_unlock_irqrestore(g_lock, flags); + return rc; +} + +void +kptllnd_peer_add_peertable_locked (kptl_peer_t *peer) +{ + LASSERT (kptllnd_data.kptl_n_active_peers < + kptllnd_data.kptl_expected_peers); + + LASSERT (peer->peer_state == PEER_STATE_WAITING_HELLO || + peer->peer_state == PEER_STATE_ACTIVE); + + kptllnd_data.kptl_n_active_peers++; + atomic_inc(&peer->peer_refcount); /* +1 ref for the list */ + + /* NB add to HEAD of peer list for MRU order! + * (see kptllnd_cull_peertable) */ + list_add(&peer->peer_list, kptllnd_nid2peerlist(peer->peer_id.nid)); +} + +void +kptllnd_cull_peertable_locked (lnet_process_id_t pid) +{ + /* I'm about to add a new peer with this portals ID to the peer table, + * so (a) this peer should not exist already and (b) I want to leave at + * most (max_procs_per_nid - 1) peers with this NID in the table. 
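The peer table walked above is a chained hash keyed by NID. For orientation, a sketch of the bucket helper this file keeps calling (kptllnd_nid2peerlist() is defined in ptllnd.h, outside this hunk; the simple modulo hash shown here is an assumption):

static inline struct list_head *
kptllnd_nid2peerlist(lnet_nid_t nid)
{
        unsigned int hash = (unsigned int)nid %
                            kptllnd_data.kptl_peer_hash_size;

        return &kptllnd_data.kptl_peers[hash];
}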
*/
+        struct list_head  *peers = kptllnd_nid2peerlist(pid.nid);
+        int                cull_count = *kptllnd_tunables.kptl_max_procs_per_node;
+        int                count;
+        struct list_head  *tmp;
+        struct list_head  *nxt;
+        kptl_peer_t       *peer;
+
+        count = 0;
+        list_for_each_safe (tmp, nxt, peers) {
+                /* NB I rely on kptllnd_peer_add_peertable_locked to add peers
+                 * in MRU order */
+                peer = list_entry(tmp, kptl_peer_t, peer_list);
+
+                if (peer->peer_id.nid != pid.nid)
+                        continue;
+
+                LASSERT (peer->peer_id.pid != pid.pid);
+
+                count++;
+
+                if (count < cull_count) /* recent (don't cull) */
+                        continue;
+
+                CDEBUG(D_NET, "Cull %s(%s)\n",
+                       libcfs_id2str(peer->peer_id),
+                       kptllnd_ptlid2str(peer->peer_ptlid));
+
+                kptllnd_peer_close_locked(peer, 0);
+        }
+}
+
+kptl_peer_t *
+kptllnd_peer_allocate (lnet_process_id_t lpid, ptl_process_id_t ppid)
+{
+        unsigned long    flags;
+        kptl_peer_t     *peer;
+
+        LIBCFS_ALLOC(peer, sizeof (*peer));
+        if (peer == NULL) {
+                CERROR("Can't create peer %s (%s)\n",
+                       libcfs_id2str(lpid),
+                       kptllnd_ptlid2str(ppid));
+                return NULL;
+        }
+
+        memset(peer, 0, sizeof(*peer)); /* zero flags etc */
+
+        INIT_LIST_HEAD (&peer->peer_sendq);
+        INIT_LIST_HEAD (&peer->peer_activeq);
+        spin_lock_init (&peer->peer_lock);
+
+        peer->peer_state = PEER_STATE_ALLOCATED;
+        peer->peer_error = 0;
+        peer->peer_last_alive = cfs_time_current();
+        peer->peer_id = lpid;
+        peer->peer_ptlid = ppid;
+        peer->peer_credits = 1;                 /* enough for HELLO */
+        peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS;
+        peer->peer_outstanding_credits = *kptllnd_tunables.kptl_peercredits - 1;
+
+        atomic_set(&peer->peer_refcount, 1);    /* 1 ref for caller */
+
+        write_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+        /* Only increase # peers under lock, to guarantee we don't grow it
+         * during shutdown */
+        if (kptllnd_data.kptl_shutdown) {
+                write_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock,
+                                        flags);
+                LIBCFS_FREE(peer, sizeof(*peer));
+                return NULL;
+        }
+
+        kptllnd_data.kptl_npeers++;
+        write_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+        return peer;
+}
+
+void
+kptllnd_peer_destroy (kptl_peer_t *peer)
+{
+        unsigned long flags;
+
+        CDEBUG(D_NET, "Peer=%p\n", peer);
+
+        LASSERT (!in_interrupt());
+        LASSERT (atomic_read(&peer->peer_refcount) == 0);
+        LASSERT (peer->peer_state == PEER_STATE_ALLOCATED ||
+                 peer->peer_state == PEER_STATE_ZOMBIE);
+        LASSERT (list_empty(&peer->peer_sendq));
+        LASSERT (list_empty(&peer->peer_activeq));
+
+        write_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+        if (peer->peer_state == PEER_STATE_ZOMBIE)
+                list_del(&peer->peer_list);
+
+        kptllnd_data.kptl_npeers--;
+
+        write_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+        LIBCFS_FREE (peer, sizeof (*peer));
+}
+
+void
+kptllnd_peer_cancel_txs(kptl_peer_t *peer)
+{
+        struct list_head   sendq;
+        struct list_head   activeq;
+        struct list_head  *tmp;
+        struct list_head  *nxt;
+        kptl_tx_t         *tx;
+        unsigned long      flags;
+
+        /* atomically grab all the peer's tx-es... */
+
+        spin_lock_irqsave(&peer->peer_lock, flags);
+
+        list_add(&sendq, &peer->peer_sendq);
+        list_del_init(&peer->peer_sendq);
+        list_for_each (tmp, &sendq) {
+                tx = list_entry(tmp, kptl_tx_t, tx_list);
+                tx->tx_active = 0;
+        }
+
+        list_add(&activeq, &peer->peer_activeq);
+        list_del_init(&peer->peer_activeq);
+        list_for_each (tmp, &activeq) {
+                tx = list_entry(tmp, kptl_tx_t, tx_list);
+                tx->tx_active = 0;
+        }
+
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+        /* ...then drop the peer's ref on them at leisure.
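The allocate/destroy pair above is driven purely by peer_refcount. A sketch of the assumed wrappers used throughout this file (their real definitions live outside this hunk):

static inline void
kptllnd_peer_addref(kptl_peer_t *peer)
{
        atomic_inc(&peer->peer_refcount);
}

static inline void
kptllnd_peer_decref(kptl_peer_t *peer)
{
        if (atomic_dec_and_test(&peer->peer_refcount))
                kptllnd_peer_destroy(peer);     /* last ref frees */
}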
This will get + * kptllnd_tx_fini() to abort outstanding comms if necessary. */ + + list_for_each_safe (tmp, nxt, &sendq) { + tx = list_entry(tmp, kptl_tx_t, tx_list); + list_del(&tx->tx_list); + tx->tx_status = -EIO; + kptllnd_tx_decref(tx); + } + + list_for_each_safe (tmp, nxt, &activeq) { + tx = list_entry(tmp, kptl_tx_t, tx_list); + list_del(&tx->tx_list); + tx->tx_status = -EIO; + kptllnd_tx_decref(tx); + } +} + +void +kptllnd_peer_alive (kptl_peer_t *peer) +{ + /* This is racy, but everyone's only writing cfs_time_current() */ + peer->peer_last_alive = cfs_time_current(); + mb(); +} + +void +kptllnd_peer_notify (kptl_peer_t *peer) +{ + unsigned long flags; + time_t last_alive = 0; + int error = 0; + + spin_lock_irqsave(&peer->peer_lock, flags); + + if (peer->peer_error != 0) { + error = peer->peer_error; + peer->peer_error = 0; + + last_alive = cfs_time_current_sec() - + cfs_duration_sec(cfs_time_current() - + peer->peer_last_alive); + } + + spin_unlock_irqrestore(&peer->peer_lock, flags); + + if (error != 0) + lnet_notify (kptllnd_data.kptl_ni, peer->peer_id.nid, 0, + last_alive); +} + +void +kptllnd_handle_closing_peers () +{ + unsigned long flags; + kptl_peer_t *peer; + struct list_head *tmp; + struct list_head *nxt; + int idle; + + /* Check with a read lock first to avoid blocking anyone */ + + read_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags); + idle = list_empty(&kptllnd_data.kptl_closing_peers); + read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags); + + if (idle) + return; + + /* Scan the closing peers and cancel their txs. + * NB only safe while there is only a single watchdog */ + + write_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags); + + list_for_each_safe (tmp, nxt, &kptllnd_data.kptl_closing_peers) { + peer = list_entry (tmp, kptl_peer_t, peer_list); + + LASSERT (peer->peer_state == PEER_STATE_CLOSING); + + list_del(&peer->peer_list); + list_add_tail(&peer->peer_list, + &kptllnd_data.kptl_zombie_peers); + peer->peer_state = PEER_STATE_ZOMBIE; + + write_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags); + + kptllnd_peer_notify(peer); + kptllnd_peer_cancel_txs(peer); + kptllnd_peer_decref(peer); + + write_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags); + } + + write_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags); +} + +void +kptllnd_peer_close_locked(kptl_peer_t *peer, int why) +{ + switch (peer->peer_state) { + default: + LBUG(); + + case PEER_STATE_WAITING_HELLO: + case PEER_STATE_ACTIVE: + /* Removing from peer table */ + kptllnd_data.kptl_n_active_peers--; + LASSERT (kptllnd_data.kptl_n_active_peers >= 0); + + list_del(&peer->peer_list); + kptllnd_peer_unreserve_buffers(); + + peer->peer_error = why; /* stash 'why' only on first close */ + + /* Schedule for immediate attention, taking peer table's ref */ + list_add_tail(&peer->peer_list, + &kptllnd_data.kptl_closing_peers); + wake_up(&kptllnd_data.kptl_watchdog_waitq); + break; + + case PEER_STATE_ZOMBIE: + /* Schedule for attention at next timeout */ + kptllnd_peer_addref(peer); + list_del(&peer->peer_list); + list_add_tail(&peer->peer_list, + &kptllnd_data.kptl_closing_peers); + break; + + case PEER_STATE_CLOSING: + break; + } + + peer->peer_state = PEER_STATE_CLOSING; +} + +void +kptllnd_peer_close(kptl_peer_t *peer, int why) +{ + unsigned long flags; + + write_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags); + kptllnd_peer_close_locked(peer, why); + write_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags); +} + +int +kptllnd_peer_del(lnet_process_id_t 
id) +{ + struct list_head *ptmp; + struct list_head *pnxt; + kptl_peer_t *peer; + int lo; + int hi; + int i; + unsigned long flags; + int rc = -ENOENT; + + /* + * Find the single bucket we are supposed to look at or if nid is a + * wildcard (LNET_NID_ANY) then look at all of the buckets + */ + if (id.nid != LNET_NID_ANY) { + struct list_head *l = kptllnd_nid2peerlist(id.nid); + + lo = hi = l - kptllnd_data.kptl_peers; + } else { + if (id.pid != LNET_PID_ANY) + return -EINVAL; + + lo = 0; + hi = kptllnd_data.kptl_peer_hash_size - 1; + } + +again: + read_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags); + + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &kptllnd_data.kptl_peers[i]) { + peer = list_entry (ptmp, kptl_peer_t, peer_list); + + if (!(id.nid == LNET_NID_ANY || + (peer->peer_id.nid == id.nid && + (id.pid == LNET_PID_ANY || + peer->peer_id.pid == id.pid)))) + continue; + + kptllnd_peer_addref(peer); /* 1 ref for me... */ + + read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, + flags); + + kptllnd_peer_close(peer, 0); + kptllnd_peer_decref(peer); /* ...until here */ + + rc = 0; /* matched something */ + + /* start again now I've dropped the lock */ + goto again; + } + } + + read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags); + + return (rc); +} + +void +kptllnd_post_tx(kptl_peer_t *peer, kptl_tx_t *tx) +{ + /* CAVEAT EMPTOR: I take over caller's ref on 'tx' */ + ptl_handle_md_t rdma_mdh = PTL_INVALID_HANDLE; + ptl_handle_md_t msg_mdh = PTL_INVALID_HANDLE; + ptl_handle_me_t meh; + ptl_md_t md; + ptl_err_t prc; + unsigned long flags; + + LASSERT (!tx->tx_idle); + LASSERT (!tx->tx_active); + LASSERT (PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE)); + LASSERT (PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE)); + LASSERT (tx->tx_type == TX_TYPE_SMALL_MESSAGE || + tx->tx_type == TX_TYPE_PUT_REQUEST || + tx->tx_type == TX_TYPE_GET_REQUEST); + + kptllnd_set_tx_peer(tx, peer); + + if (tx->tx_type == TX_TYPE_PUT_REQUEST || + tx->tx_type == TX_TYPE_GET_REQUEST) { + + spin_lock_irqsave(&peer->peer_lock, flags); + + /* Assume 64-bit matchbits can't wrap */ + LASSERT (peer->peer_next_matchbits >= PTL_RESERVED_MATCHBITS); + tx->tx_msg->ptlm_u.rdma.kptlrm_matchbits = + peer->peer_next_matchbits++; + + spin_unlock_irqrestore(&peer->peer_lock, flags); + + prc = PtlMEAttach(kptllnd_data.kptl_nih, + *kptllnd_tunables.kptl_portal, + peer->peer_ptlid, + tx->tx_msg->ptlm_u.rdma.kptlrm_matchbits, + 0, /* ignore bits */ + PTL_UNLINK, + PTL_INS_BEFORE, + &meh); + if (prc != PTL_OK) { + CERROR("PtlMEAttach(%s) failed: %d\n", + libcfs_id2str(peer->peer_id), prc); + goto failed; + } + + prc = PtlMDAttach(meh, tx->tx_rdma_md, PTL_UNLINK, &rdma_mdh); + if (prc != PTL_OK) { + CERROR("PtlMDAttach(%s) failed: %d\n", + libcfs_id2str(tx->tx_peer->peer_id), prc); + prc = PtlMEUnlink(meh); + LASSERT(prc == PTL_OK); + rdma_mdh = PTL_INVALID_HANDLE; + goto failed; + } + + /* I'm not racing with the event callback here. It's a bug if + * there's an event on the MD I just attached before I actually + * send the RDMA request message which the event callback + * catches by asserting 'rdma_mdh' is valid. 
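The locked post-increment above is the entire matchbits allocator. Factored out for illustration (a hypothetical helper that simply mirrors the kptllnd_post_tx() lines above):

static __u64
kptllnd_next_matchbits(kptl_peer_t *peer)
{
        unsigned long flags;
        __u64         mbits;

        spin_lock_irqsave(&peer->peer_lock, flags);
        /* everything below PTL_RESERVED_MATCHBITS belongs to immediate
         * messages, so RDMA matchbits start there and only grow */
        LASSERT (peer->peer_next_matchbits >= PTL_RESERVED_MATCHBITS);
        mbits = peer->peer_next_matchbits++;    /* 64 bits: wrap assumed impossible */
        spin_unlock_irqrestore(&peer->peer_lock, flags);

        return mbits;
}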
*/ + } + + memset(&md, 0, sizeof(md)); + + md.start = tx->tx_msg; + md.length = tx->tx_msg->ptlm_nob; + md.threshold = 1; + md.options = PTL_MD_OP_PUT | + PTL_MD_LUSTRE_COMPLETION_SEMANTICS | + PTL_MD_EVENT_START_DISABLE; + md.user_ptr = &tx->tx_msg_eventarg; + md.eq_handle = kptllnd_data.kptl_eqh; + + prc = PtlMDBind(kptllnd_data.kptl_nih, md, PTL_UNLINK, &msg_mdh); + if (prc != PTL_OK) { + msg_mdh = PTL_INVALID_HANDLE; + goto failed; + } + + spin_lock_irqsave(&peer->peer_lock, flags); + + tx->tx_deadline = jiffies + (*kptllnd_tunables.kptl_timeout * HZ); + tx->tx_active = 1; + tx->tx_rdma_mdh = rdma_mdh; + tx->tx_msg_mdh = msg_mdh; + + /* Ensure HELLO is sent first */ + if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_HELLO) + list_add(&tx->tx_list, &peer->peer_sendq); + else + list_add_tail(&tx->tx_list, &peer->peer_sendq); + + spin_unlock_irqrestore(&peer->peer_lock, flags); + return; + + failed: + spin_lock_irqsave(&peer->peer_lock, flags); + + tx->tx_status = -EIO; + tx->tx_rdma_mdh = rdma_mdh; + tx->tx_msg_mdh = msg_mdh; + + spin_unlock_irqrestore(&peer->peer_lock, flags); + + kptllnd_tx_decref(tx); +} + +void +kptllnd_peer_check_sends (kptl_peer_t *peer) +{ + + kptl_tx_t *tx; + int rc; + unsigned long flags; + + LASSERT(!in_interrupt()); + + spin_lock_irqsave(&peer->peer_lock, flags); + + if (list_empty(&peer->peer_sendq) && + peer->peer_outstanding_credits >= PTLLND_CREDIT_HIGHWATER) { + + /* post a NOOP to return credits */ + spin_unlock_irqrestore(&peer->peer_lock, flags); + + tx = kptllnd_get_idle_tx(TX_TYPE_SMALL_MESSAGE); + if (tx == NULL) { + CERROR("Can't return credits to %s: can't allocate descriptor\n", + libcfs_id2str(peer->peer_id)); + } else { + kptllnd_init_msg(tx->tx_msg, PTLLND_MSG_TYPE_NOOP, 0); + kptllnd_post_tx(peer, tx); + } + + spin_lock_irqsave(&peer->peer_lock, flags); + } + + while (!list_empty(&peer->peer_sendq)) { + tx = list_entry (peer->peer_sendq.next, kptl_tx_t, tx_list); + + LASSERT (tx->tx_active); + LASSERT (!PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE)); + LASSERT (tx->tx_type == TX_TYPE_SMALL_MESSAGE || + !PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE)); + + LASSERT (peer->peer_outstanding_credits >= 0); + LASSERT (peer->peer_outstanding_credits <= + *kptllnd_tunables.kptl_peercredits); + LASSERT (peer->peer_credits >= 0); + LASSERT (peer->peer_credits <= + *kptllnd_tunables.kptl_peercredits); + + /* Ensure HELLO is sent first */ + if (!peer->peer_sent_hello) { + if (tx->tx_msg->ptlm_type != PTLLND_MSG_TYPE_HELLO) + break; + peer->peer_sent_hello = 1; + } + + if (peer->peer_credits == 0) { + CDEBUG(D_NET, "%s: no credits\n", + libcfs_id2str(peer->peer_id)); + break; + } + + /* Don't use the last credit unless I've got credits to + * return */ + if (peer->peer_credits == 1 && + peer->peer_outstanding_credits == 0) { + CDEBUG(D_NET, "%s: not using last credit\n", + libcfs_id2str(peer->peer_id)); + break; + } + + list_del(&tx->tx_list); + + /* Discard any NOOP I queued if I'm not at the high-water mark + * any more or more messages have been queued */ + if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_NOOP && + (!list_empty(&peer->peer_sendq) || + peer->peer_outstanding_credits < PTLLND_CREDIT_HIGHWATER)) { + + tx->tx_active = 0; + + spin_unlock_irqrestore(&peer->peer_lock, flags); + + CDEBUG(D_NET, "%s: redundant noop\n", + libcfs_id2str(peer->peer_id)); + kptllnd_tx_decref(tx); + + spin_lock_irqsave(&peer->peer_lock, flags); + continue; + } + + CDEBUG(D_NET, "tx=%p nob=%d to %s(%s)\n", + tx, tx->tx_msg->ptlm_nob, + 
libcfs_id2str(peer->peer_id),
+                       kptllnd_ptlid2str(peer->peer_ptlid));
+
+                /* fill last-minute msg header fields */
+                kptllnd_msg_pack(tx->tx_msg, peer);
+
+                peer->peer_outstanding_credits = 0;
+                peer->peer_credits--;
+
+                list_add_tail(&tx->tx_list, &peer->peer_activeq);
+
+                kptllnd_tx_addref(tx);          /* 1 ref for me... */
+
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+                rc = PtlPut (tx->tx_msg_mdh,
+                             PTL_NOACK_REQ,
+                             peer->peer_ptlid,
+                             *kptllnd_tunables.kptl_portal,
+                             0,                 /* acl cookie */
+                             LNET_MSG_MATCHBITS,
+                             0,                 /* offset */
+                             0);                /* header data */
+                if (rc != PTL_OK) {
+                        CERROR("PtlPut %s error %d\n",
+                               libcfs_id2str(peer->peer_id), rc);
+
+                        /* Nuke everything (including this tx) */
+                        kptllnd_peer_close(peer, -EIO);
+                        return;
+                }
+
+                kptllnd_tx_decref(tx);          /* drop my ref */
+
+                spin_lock_irqsave(&peer->peer_lock, flags);
+        }
+
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
+}
+
+kptl_tx_t *
+kptllnd_find_timed_out_tx(kptl_peer_t *peer)
+{
+        kptl_tx_t         *tx;
+        struct list_head  *tmp;
+        unsigned long      flags;
+
+        spin_lock_irqsave(&peer->peer_lock, flags);
+
+        list_for_each(tmp, &peer->peer_sendq) {
+                /* NB walk via the iterator, not the queue head */
+                tx = list_entry(tmp, kptl_tx_t, tx_list);
+
+                if (time_after_eq(jiffies, tx->tx_deadline)) {
+                        kptllnd_tx_addref(tx);
+                        spin_unlock_irqrestore(&peer->peer_lock, flags);
+                        return tx;
+                }
+        }
+
+        list_for_each(tmp, &peer->peer_activeq) {
+                tx = list_entry(tmp, kptl_tx_t, tx_list);
+
+                if (time_after_eq(jiffies, tx->tx_deadline)) {
+                        kptllnd_tx_addref(tx);
+                        spin_unlock_irqrestore(&peer->peer_lock, flags);
+                        return tx;
+                }
+        }
+
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
+        return NULL;
+}
+
+
+void
+kptllnd_peer_check_bucket (int idx)
+{
+        struct list_head  *peers = &kptllnd_data.kptl_peers[idx];
+        struct list_head  *ptmp;
+        kptl_peer_t       *peer;
+        kptl_tx_t         *tx;
+        unsigned long      flags;
+        int                nsend;
+        int                nactive;
+
+        CDEBUG(D_NET, "Bucket=%d\n", idx);
+
+ again:
+        /* NB. Shared lock while I just look */
+        read_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
+
+        list_for_each (ptmp, peers) {
+                peer = list_entry (ptmp, kptl_peer_t, peer_list);
+
+                CDEBUG(D_NET, "Peer=%s Credits=%d Outstanding=%d\n",
+                       libcfs_id2str(peer->peer_id),
+                       peer->peer_credits, peer->peer_outstanding_credits);
+
+                /* In case we have enough credits to return via a
+                 * NOOP, but there were no non-blocking tx descs
+                 * free to do it last time... */
+                kptllnd_peer_check_sends(peer);
+
+                tx = kptllnd_find_timed_out_tx(peer);
+                if (tx == NULL)
+                        continue;
+
+                kptllnd_peer_addref(peer);      /* 1 ref for me... */
+
+                read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock,
+                                       flags);
+
+                spin_lock_irqsave(&peer->peer_lock, flags);
+                nsend = kptllnd_count_queue(&peer->peer_sendq);
+                nactive = kptllnd_count_queue(&peer->peer_activeq);
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+                LCONSOLE_ERROR("Timing out %s: please check Portals\n",
+                               libcfs_id2str(peer->peer_id));
+
+                CERROR("%s timed out: cred %d outstanding %d sendq %d "
+                       "activeq %d Tx %s (%s%s%s) status %d T/O %ds\n",
+                       libcfs_id2str(peer->peer_id),
+                       peer->peer_credits, peer->peer_outstanding_credits,
+                       nsend, nactive, kptllnd_tx_typestr(tx->tx_type),
+                       tx->tx_active ? "A" : "",
+                       PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE) ?
+                       "" : "M",
+                       PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE) ?
+ "" : "D", + tx->tx_status, *kptllnd_tunables.kptl_timeout); + + kptllnd_dump_ptltrace(); + + kptllnd_tx_decref(tx); + + kptllnd_peer_close(peer, -ETIMEDOUT); + kptllnd_peer_decref(peer); /* ...until here */ + + /* start again now I've dropped the lock */ + goto again; + } + + read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags); +} + +kptl_peer_t * +kptllnd_id2peer_locked (lnet_process_id_t id) +{ + struct list_head *peers = kptllnd_nid2peerlist(id.nid); + struct list_head *tmp; + kptl_peer_t *peer; + + list_for_each (tmp, peers) { + + peer = list_entry (tmp, kptl_peer_t, peer_list); + + LASSERT(peer->peer_state == PEER_STATE_WAITING_HELLO || + peer->peer_state == PEER_STATE_ACTIVE); + + if (peer->peer_id.nid != id.nid || + peer->peer_id.pid != id.pid) + continue; + + kptllnd_peer_addref(peer); + + CDEBUG(D_NET, "%s -> %s (%d)\n", + libcfs_id2str(id), + kptllnd_ptlid2str(peer->peer_ptlid), + atomic_read (&peer->peer_refcount)); + return peer; + } + + return NULL; +} + +void +kptllnd_peertable_overflow_msg(char *str, lnet_process_id_t id) +{ + LCONSOLE_ERROR("%s %s overflows the peer table[%d]: " + "messages may be dropped\n", + str, libcfs_id2str(id), + kptllnd_data.kptl_n_active_peers); + LCONSOLE_ERROR("Please correct by increasing " + "'max_nodes' or 'max_procs_per_node'\n"); +} + +__u64 +kptllnd_get_last_seen_matchbits_locked(lnet_process_id_t lpid) +{ + kptl_peer_t *peer; + struct list_head *tmp; + + /* Find the last matchbits I saw this new peer using. Note.. + A. This peer cannot be in the peer table - she's new! + B. If I can't find the peer in the closing/zombie peers, all + matchbits are safe because all refs to the (old) peer have gone + so all txs have completed so there's no risk of matchbit + collision! + */ + + LASSERT(kptllnd_id2peer_locked(lpid) == NULL); + + /* peer's last matchbits can't change after it comes out of the peer + * table, so first match is fine */ + + list_for_each (tmp, &kptllnd_data.kptl_closing_peers) { + peer = list_entry (tmp, kptl_peer_t, peer_list); + + if (peer->peer_id.nid == lpid.nid && + peer->peer_id.pid == lpid.pid) + return peer->peer_last_matchbits_seen; + } + + list_for_each (tmp, &kptllnd_data.kptl_zombie_peers) { + peer = list_entry (tmp, kptl_peer_t, peer_list); + + if (peer->peer_id.nid == lpid.nid && + peer->peer_id.pid == lpid.pid) + return peer->peer_last_matchbits_seen; + } + + return PTL_RESERVED_MATCHBITS; +} + +kptl_peer_t * +kptllnd_peer_handle_hello (ptl_process_id_t initiator, + kptl_msg_t *msg) +{ + rwlock_t *g_lock = &kptllnd_data.kptl_peer_rw_lock; + kptl_peer_t *peer; + kptl_peer_t *new_peer; + lnet_process_id_t lpid; + unsigned long flags; + kptl_tx_t *hello_tx; + int rc; + __u64 safe_matchbits; + __u64 last_matchbits_seen; + + lpid.nid = msg->ptlm_srcnid; + lpid.pid = msg->ptlm_srcpid; + + CDEBUG(D_NET, "hello from %s(%s)\n", + libcfs_id2str(lpid), kptllnd_ptlid2str(initiator)); + + if (initiator.pid != kptllnd_data.kptl_portals_id.pid && + (msg->ptlm_srcpid & LNET_PID_USERFLAG) == 0) { + /* If the peer's PID isn't _the_ ptllnd kernel pid, she must be + * userspace. Refuse the connection if she hasn't set the + * correct flag in her PID... */ + CERROR("Userflag not set in hello from %s (%s)\n", + libcfs_id2str(lpid), kptllnd_ptlid2str(initiator)); + return NULL; + } + + /* kptlhm_matchbits are the highest matchbits my peer may have used to + * RDMA to me. 
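For scale, the remedy suggested by the overflow message corresponds (by assumption, from the module parameters in ptllnd_modparams.c) to a table capacity of max_nodes * max_procs_per_node; with the defaults earlier in this patch that is 1152 * 2 = 2304 concurrent peers:

static inline int
kptllnd_peer_capacity(void)     /* hypothetical helper, for illustration */
{
        return *kptllnd_tunables.kptl_max_nodes *
               *kptllnd_tunables.kptl_max_procs_per_node;
}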
I ensure I never register buffers for RDMA that could + * match any she used */ + safe_matchbits = msg->ptlm_u.hello.kptlhm_matchbits + 1; + + if (safe_matchbits < PTL_RESERVED_MATCHBITS) { + CERROR("Illegal matchbits "LPX64" in HELLO from %s\n", + safe_matchbits, libcfs_id2str(lpid)); + return NULL; + } + + if (msg->ptlm_u.hello.kptlhm_max_msg_size != + *kptllnd_tunables.kptl_max_msg_size) { + CERROR("max message size MUST be equal for all peers: " + "got %d expected %d from %s\n", + msg->ptlm_u.hello.kptlhm_max_msg_size, + *kptllnd_tunables.kptl_max_msg_size, + libcfs_id2str(lpid)); + return NULL; + } + + if (msg->ptlm_credits + 1 != *kptllnd_tunables.kptl_peercredits) { + CERROR("peercredits MUST be equal on all peers: " + "got %d expected %d from %s\n", + msg->ptlm_credits + 1, + *kptllnd_tunables.kptl_peercredits, + libcfs_id2str(lpid)); + return NULL; + } + + write_lock_irqsave(g_lock, flags); + + peer = kptllnd_id2peer_locked(lpid); + if (peer != NULL) { + if (peer->peer_state == PEER_STATE_WAITING_HELLO) { + /* Completing HELLO handshake */ + LASSERT(peer->peer_incarnation == 0); + + peer->peer_state = PEER_STATE_ACTIVE; + peer->peer_incarnation = msg->ptlm_srcstamp; + peer->peer_next_matchbits = safe_matchbits; + + write_unlock_irqrestore(g_lock, flags); + return peer; + } + + /* remove old incarnation of this peer */ + kptllnd_peer_close_locked(peer, 0); + } + + kptllnd_cull_peertable_locked(lpid); + + write_unlock_irqrestore(g_lock, flags); + + if (peer != NULL) { + CDEBUG(D_NET, "Peer %s (%s) reconnecting:" + " stamp "LPX64"("LPX64")\n", + libcfs_id2str(lpid), kptllnd_ptlid2str(initiator), + msg->ptlm_srcstamp, peer->peer_incarnation); + + kptllnd_peer_decref(peer); + } + + hello_tx = kptllnd_get_idle_tx(TX_TYPE_SMALL_MESSAGE); + if (hello_tx == NULL) { + CERROR("Unable to allocate HELLO message for %s\n", + libcfs_id2str(lpid)); + return NULL; + } + + kptllnd_init_msg(hello_tx->tx_msg, PTLLND_MSG_TYPE_HELLO, + sizeof(kptl_hello_msg_t)); + + new_peer = kptllnd_peer_allocate(lpid, initiator); + if (new_peer == NULL) { + kptllnd_tx_decref(hello_tx); + return NULL; + } + + rc = kptllnd_peer_reserve_buffers(); + if (rc != 0) { + kptllnd_peer_decref(new_peer); + kptllnd_tx_decref(hello_tx); + + CERROR("Failed to reserve buffers for %s\n", + libcfs_id2str(lpid)); + return NULL; + } + + write_lock_irqsave(g_lock, flags); + + peer = kptllnd_id2peer_locked(lpid); + if (peer != NULL) { + if (peer->peer_state == PEER_STATE_WAITING_HELLO) { + /* An outgoing message instantiated 'peer' for me and + * presumably provoked this reply */ + CWARN("Outgoing instantiated peer %s\n", libcfs_id2str(lpid)); + LASSERT(peer->peer_incarnation == 0); + + peer->peer_state = PEER_STATE_ACTIVE; + peer->peer_incarnation = msg->ptlm_srcstamp; + peer->peer_next_matchbits = safe_matchbits; + } else { + LASSERT (peer->peer_state == PEER_STATE_ACTIVE); + /* WOW! Somehow this peer completed the HELLO + * handshake while I slept. I guess I could have slept + * while it rebooted and sent a new HELLO, so I'll fail + * this one... */ + CWARN("Wow! 
peer %s\n", libcfs_id2str(lpid)); + kptllnd_peer_decref(peer); + peer = NULL; + } + + write_unlock_irqrestore(g_lock, flags); + + kptllnd_peer_unreserve_buffers(); + kptllnd_peer_decref(new_peer); + kptllnd_tx_decref(hello_tx); + return peer; + } + + if (kptllnd_data.kptl_n_active_peers == + kptllnd_data.kptl_expected_peers) { + /* peer table full */ + write_unlock_irqrestore(g_lock, flags); + + kptllnd_peertable_overflow_msg("Connection from ", lpid); + + rc = kptllnd_reserve_buffers(1); /* HELLO headroom */ + if (rc != 0) { + CERROR("Refusing connection from %s\n", + libcfs_id2str(lpid)); + kptllnd_peer_unreserve_buffers(); + kptllnd_peer_decref(new_peer); + kptllnd_tx_decref(hello_tx); + return NULL; + } + + write_lock_irqsave(g_lock, flags); + kptllnd_data.kptl_expected_peers++; + } + + last_matchbits_seen = kptllnd_get_last_seen_matchbits_locked(lpid); + + hello_tx->tx_msg->ptlm_u.hello.kptlhm_matchbits = last_matchbits_seen; + hello_tx->tx_msg->ptlm_u.hello.kptlhm_max_msg_size = + *kptllnd_tunables.kptl_max_msg_size; + + new_peer->peer_state = PEER_STATE_ACTIVE; + new_peer->peer_incarnation = msg->ptlm_srcstamp; + new_peer->peer_next_matchbits = safe_matchbits; + new_peer->peer_last_matchbits_seen = last_matchbits_seen; + + kptllnd_peer_add_peertable_locked(new_peer); + + write_unlock_irqrestore(g_lock, flags); + + /* NB someone else could get in now and post a message before I post + * the HELLO, but post_tx/check_sends take care of that! */ + + kptllnd_post_tx(new_peer, hello_tx); + kptllnd_peer_check_sends(new_peer); + + return new_peer; +} + +void +kptllnd_tx_launch(kptl_tx_t *tx, lnet_process_id_t target) +{ + rwlock_t *g_lock = &kptllnd_data.kptl_peer_rw_lock; + ptl_process_id_t ptl_id; + kptl_peer_t *peer; + kptl_peer_t *new_peer = NULL; + kptl_tx_t *hello_tx = NULL; + unsigned long flags; + int rc; + __u64 last_matchbits_seen; + + LASSERT (tx->tx_lnet_msg != NULL); + LASSERT (tx->tx_peer == NULL); + + /* I expect to find the peer, so I only take a read lock... 
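A worked example of the credit arithmetic checked during the handshake above: with peercredits = 8, kptllnd_peer_allocate() starts a peer with peer_credits = 1 (pre-spent on its own HELLO) and peer_outstanding_credits = 7, so the HELLO it emits advertises 7 credits; the receiver therefore insists that ptlm_credits + 1 equals its own peercredits. As a predicate (illustrative only):

static inline int
kptllnd_hello_credits_ok(kptl_msg_t *msg)
{
        /* the sender pre-spends one credit on the HELLO itself */
        return msg->ptlm_credits + 1 == *kptllnd_tunables.kptl_peercredits;
}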
*/ + read_lock_irqsave(g_lock, flags); + peer = kptllnd_id2peer_locked(target); + read_unlock_irqrestore(g_lock, flags); + + if (peer != NULL) { + goto post; + } + + if ((target.pid & LNET_PID_USERFLAG) != 0) { + CWARN("Refusing to create a new connection to %s " + "(non-kernel peer)\n", libcfs_id2str(target)); + tx->tx_status = -EHOSTUNREACH; + goto failed; + } + + /* The new peer is a kernel ptllnd, and kernel ptllnds all have + * the same portals PID */ + ptl_id.nid = kptllnd_lnet2ptlnid(target.nid); + ptl_id.pid = kptllnd_data.kptl_portals_id.pid; + + write_lock_irqsave(g_lock, flags); + + peer = kptllnd_id2peer_locked(target); + if (peer != NULL) { + write_unlock_irqrestore(g_lock, flags); + goto post; + } + + kptllnd_cull_peertable_locked(target); + + write_unlock_irqrestore(g_lock, flags); + + hello_tx = kptllnd_get_idle_tx(TX_TYPE_SMALL_MESSAGE); + if (hello_tx == NULL) { + CERROR("Unable to allocate connect message for %s\n", + libcfs_id2str(target)); + tx->tx_status = -ENOMEM; + goto failed; + } + + kptllnd_init_msg(hello_tx->tx_msg, PTLLND_MSG_TYPE_HELLO, + sizeof(kptl_hello_msg_t)); + + new_peer = kptllnd_peer_allocate(target, ptl_id); + if (new_peer == NULL) { + tx->tx_status = -ENOMEM; + goto failed; + } + + rc = kptllnd_peer_reserve_buffers(); + if (rc != 0) { + tx->tx_status = rc; + goto failed; + } + + write_lock_irqsave(g_lock, flags); + + peer = kptllnd_id2peer_locked(target); + if (peer != NULL) { /* someone else beat me to it */ + write_unlock_irqrestore(g_lock, flags); + + kptllnd_peer_unreserve_buffers(); + kptllnd_peer_decref(new_peer); + kptllnd_tx_decref(hello_tx); + goto post; + } + + if (kptllnd_data.kptl_n_active_peers == + kptllnd_data.kptl_expected_peers) { + /* peer table full */ + write_unlock_irqrestore(g_lock, flags); + + kptllnd_peertable_overflow_msg("Connection to ", target); + + rc = kptllnd_reserve_buffers(1); /* HELLO headroom */ + if (rc != 0) { + CERROR("Can't create connection to %s\n", + libcfs_id2str(target)); + kptllnd_peer_unreserve_buffers(); + tx->tx_status = -ENOMEM; + goto failed; + } + write_lock_irqsave(g_lock, flags); + kptllnd_data.kptl_expected_peers++; + } + + last_matchbits_seen = kptllnd_get_last_seen_matchbits_locked(target); + + hello_tx->tx_msg->ptlm_u.hello.kptlhm_matchbits = last_matchbits_seen; + hello_tx->tx_msg->ptlm_u.hello.kptlhm_max_msg_size = + *kptllnd_tunables.kptl_max_msg_size; + + new_peer->peer_state = PEER_STATE_WAITING_HELLO; + new_peer->peer_last_matchbits_seen = last_matchbits_seen; + + kptllnd_peer_add_peertable_locked(new_peer); + + write_unlock_irqrestore(g_lock, flags); + + /* NB someone else could get in now and post a message before I post + * the HELLO, but post_tx/check_sends take care of that! */ + + peer = new_peer; + kptllnd_post_tx(peer, hello_tx); + + post: + kptllnd_post_tx(peer, tx); + kptllnd_peer_check_sends(peer); + kptllnd_peer_decref(peer); + return; + + failed: + if (hello_tx != NULL) + kptllnd_tx_decref(hello_tx); + + if (new_peer != NULL) + kptllnd_peer_decref(new_peer); + + LASSERT (tx->tx_status != 0); + kptllnd_tx_decref(tx); + +} diff --git a/lnet/klnds/ptllnd/ptllnd_ptltrace.c b/lnet/klnds/ptllnd/ptllnd_ptltrace.c new file mode 100644 index 0000000..30064dc --- /dev/null +++ b/lnet/klnds/ptllnd/ptllnd_ptltrace.c @@ -0,0 +1,172 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2006 Cluster File Systems, Inc. All rights reserved. 
+ * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. + * + * This file is confidential source code owned by Cluster File Systems. + * No viewing, modification, compilation, redistribution, or any other + * form of use is permitted except through a signed license agreement. + * + * If you have not signed such an agreement, then you have no rights to + * this file. Please destroy it immediately and contact CFS. + * + */ + +#include "ptllnd.h" + +#ifdef CRAY_XT3 +static struct semaphore ptltrace_mutex; +static struct semaphore ptltrace_signal; + +void +kptllnd_ptltrace_to_file(char *filename) +{ + CFS_DECL_JOURNAL_DATA; + CFS_DECL_MMSPACE; + + cfs_file_t *filp; + char *start; + char *tmpbuf; + int len; + int rc; + loff_t offset = 0; + int eof = 0; + + CWARN("dumping ptltrace to %s\n", filename); + + LIBCFS_ALLOC(tmpbuf, PAGE_SIZE); + if (tmpbuf == NULL) { + CERROR("Can't allocate page buffer to dump %s\n", filename); + return; + } + + CFS_PUSH_JOURNAL; + + filp = cfs_filp_open(filename, + O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600, &rc); + if (filp == NULL) { + CERROR("Error %d creating %s\n", rc, filename); + goto out; + } + + CFS_MMSPACE_OPEN; + + while (!eof) { + start = NULL; + len = ptl_proc_read(tmpbuf, &start, offset, + PAGE_SIZE, &eof, NULL); + + /* we don't allow ptl_proc_read to mimic case 0 or 1 behavior + * for a proc_read method, only #2: from proc_file_read + * + * 2) Set *start = an address within the buffer. + * Put the data of the requested offset at *start. + * Return the number of bytes of data placed there. + * If this number is greater than zero and you + * didn't signal eof and the reader is prepared to + * take more data you will be called again with the + * requested offset advanced by the number of bytes + * absorbed. 
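For reference, a proc_read method with the "case 2" behaviour relied on above has this shape (an illustrative sketch, not ptl_proc_read itself):

static int
example_proc_read(char *page, char **start, off_t off, int count,
                  int *eof, void *data)
{
        /* put the data for offset 'off' into the buffer... */
        int len = snprintf(page, count, "data at offset %ld\n", (long)off);

        if (len >= count)
                len = count;            /* never claim more than we wrote */
        *start = page;                  /* ...and report where it starts */
        if (len <= 0) {
                *eof = 1;               /* nothing at this offset */
                len = 0;
        }
        return len;                     /* caller advances off by this much */
}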
+ */ + + if (len == 0) /* end of file */ + break; + + if (len < 0) { + CERROR("ptl_proc_read: error %d\n", len); + break; + } + + LASSERT (start >= tmpbuf && start + len <= tmpbuf + PAGE_SIZE); + + rc = cfs_filp_write(filp, start, len, cfs_filp_poff(filp)); + if (rc != len) { + if (rc < 0) + CERROR("Error %d writing %s\n", rc, filename); + else + CERROR("Partial write %d(%d) to %s\n", + rc, len, filename); + break; + } + + offset += len; + } + + CFS_MMSPACE_CLOSE; + + rc = cfs_filp_fsync(filp); + if (rc != 0) + CERROR("Error %d syncing %s\n", rc, filename); + + cfs_filp_close(filp); +out: + CFS_POP_JOURNAL; + LIBCFS_FREE(tmpbuf, PAGE_SIZE); +} + +int +kptllnd_dump_ptltrace_thread(void *arg) +{ + static char fname[1024]; + + libcfs_daemonize("ptltracedump"); + + /* serialise with other instances of me */ + mutex_down(&ptltrace_mutex); + + snprintf(fname, sizeof(fname), "%s.%ld.%ld", + *kptllnd_tunables.kptl_ptltrace_basename, + cfs_time_current_sec(), (long)arg); + + kptllnd_ptltrace_to_file(fname); + + mutex_up(&ptltrace_mutex); + + /* unblock my creator */ + mutex_up(&ptltrace_signal); + + return 0; +} + +void +kptllnd_dump_ptltrace(void) +{ + int rc; + + if (!*kptllnd_tunables.kptl_ptltrace_on_timeout) + return; + + rc = cfs_kernel_thread(kptllnd_dump_ptltrace_thread, + (void *)(long)cfs_curproc_pid(), + CLONE_VM | CLONE_FS | CLONE_FILES); + if (rc < 0) { + CERROR("Error %d starting ptltrace dump thread\n", rc); + } else { + /* block until thread completes */ + mutex_down(&ptltrace_signal); + } +} + +void +kptllnd_init_ptltrace(void) +{ + init_mutex(&ptltrace_mutex); + init_mutex_locked(&ptltrace_signal); +} + +#else + +void +kptllnd_dump_ptltrace(void) +{ +} + +void +kptllnd_init_ptltrace(void) +{ +} + +#endif diff --git a/lnet/klnds/ptllnd/ptllnd_rx_buf.c b/lnet/klnds/ptllnd/ptllnd_rx_buf.c new file mode 100644 index 0000000..74019ef --- /dev/null +++ b/lnet/klnds/ptllnd/ptllnd_rx_buf.c @@ -0,0 +1,720 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved. + * Author: PJ Kirner + * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. + * + * This file is confidential source code owned by Cluster File Systems. + * No viewing, modification, compilation, redistribution, or any other + * form of use is permitted except through a signed license agreement. + * + * If you have not signed such an agreement, then you have no rights to + * this file. Please destroy it immediately and contact CFS. 
+ * + */ + + #include "ptllnd.h" + +void +kptllnd_rx_buffer_pool_init(kptl_rx_buffer_pool_t *rxbp) +{ + memset(rxbp, 0, sizeof(*rxbp)); + spin_lock_init(&rxbp->rxbp_lock); + INIT_LIST_HEAD(&rxbp->rxbp_list); +} + +void +kptllnd_rx_buffer_destroy(kptl_rx_buffer_t *rxb) +{ + kptl_rx_buffer_pool_t *rxbp = rxb->rxb_pool; + + LASSERT(rxb->rxb_refcount == 0); + LASSERT(PtlHandleIsEqual(rxb->rxb_mdh, PTL_INVALID_HANDLE)); + LASSERT(!rxb->rxb_posted); + LASSERT(rxb->rxb_idle); + + list_del(&rxb->rxb_list); + rxbp->rxbp_count--; + + LIBCFS_FREE(rxb->rxb_buffer, kptllnd_rx_buffer_size()); + LIBCFS_FREE(rxb, sizeof(*rxb)); +} + +int +kptllnd_rx_buffer_pool_reserve(kptl_rx_buffer_pool_t *rxbp, int count) +{ + int bufsize; + int msgs_per_buffer; + int rc; + kptl_rx_buffer_t *rxb; + char *buffer; + unsigned long flags; + + bufsize = kptllnd_rx_buffer_size(); + msgs_per_buffer = bufsize / (*kptllnd_tunables.kptl_max_msg_size); + + CDEBUG(D_NET, "kptllnd_rx_buffer_pool_reserve(%d)\n", count); + + spin_lock_irqsave(&rxbp->rxbp_lock, flags); + + for (;;) { + if (rxbp->rxbp_shutdown) { + rc = -ESHUTDOWN; + break; + } + + if (rxbp->rxbp_reserved + count <= + rxbp->rxbp_count * msgs_per_buffer) { + rc = 0; + break; + } + + spin_unlock_irqrestore(&rxbp->rxbp_lock, flags); + + LIBCFS_ALLOC(rxb, sizeof(*rxb)); + LIBCFS_ALLOC(buffer, bufsize); + + if (rxb == NULL || buffer == NULL) { + CERROR("Failed to allocate rx buffer\n"); + + if (rxb != NULL) + LIBCFS_FREE(rxb, sizeof(*rxb)); + if (buffer != NULL) + LIBCFS_FREE(buffer, bufsize); + + spin_lock_irqsave(&rxbp->rxbp_lock, flags); + rc = -ENOMEM; + break; + } + + memset(rxb, 0, sizeof(*rxb)); + + rxb->rxb_eventarg.eva_type = PTLLND_EVENTARG_TYPE_BUF; + rxb->rxb_refcount = 0; + rxb->rxb_pool = rxbp; + rxb->rxb_idle = 0; + rxb->rxb_posted = 0; + rxb->rxb_buffer = buffer; + rxb->rxb_mdh = PTL_INVALID_HANDLE; + + spin_lock_irqsave(&rxbp->rxbp_lock, flags); + + if (rxbp->rxbp_shutdown) { + spin_unlock_irqrestore(&rxbp->rxbp_lock, flags); + + LIBCFS_FREE(rxb, sizeof(*rxb)); + LIBCFS_FREE(buffer, bufsize); + + spin_lock_irqsave(&rxbp->rxbp_lock, flags); + rc = -ESHUTDOWN; + break; + } + + list_add_tail(&rxb->rxb_list, &rxbp->rxbp_list); + rxbp->rxbp_count++; + + spin_unlock_irqrestore(&rxbp->rxbp_lock, flags); + + kptllnd_rx_buffer_post(rxb); + + spin_lock_irqsave(&rxbp->rxbp_lock, flags); + } + + if (rc == 0) + rxbp->rxbp_reserved += count; + + spin_unlock_irqrestore(&rxbp->rxbp_lock, flags); + + return rc; +} + +void +kptllnd_rx_buffer_pool_unreserve(kptl_rx_buffer_pool_t *rxbp, + int count) +{ + unsigned long flags; + + spin_lock_irqsave(&rxbp->rxbp_lock, flags); + + CDEBUG(D_NET, "kptllnd_rx_buffer_pool_unreserve(%d)\n", count); + rxbp->rxbp_reserved -= count; + + spin_unlock_irqrestore(&rxbp->rxbp_lock, flags); +} + +void +kptllnd_rx_buffer_pool_fini(kptl_rx_buffer_pool_t *rxbp) +{ + kptl_rx_buffer_t *rxb; + int rc; + int i; + unsigned long flags; + struct list_head *tmp; + struct list_head *nxt; + ptl_handle_md_t mdh; + + /* CAVEAT EMPTOR: I'm racing with everything here!!! + * + * Buffers can still be posted after I set rxbp_shutdown because I + * can't hold rxbp_lock while I'm posting them. + * + * Calling PtlMDUnlink() here races with auto-unlinks; i.e. a buffer's + * MD handle could become invalid under me. I am vulnerable to portals + * re-using handles (i.e. make the same handle valid again, but for a + * different MD) from when the MD is actually unlinked, to when the + * event callback tells me it has been unlinked. 
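The buffer geometry used above hangs off kptllnd_rx_buffer_size(), defined outside this hunk; a sketch consistent with its uses in this file (the MD length in kptllnd_rx_buffer_post() and the LASSERT in the event callback use the same product):

static inline int
kptllnd_rx_buffer_size(void)
{
        return PAGE_SIZE * (*kptllnd_tunables.kptl_rxb_npages);
}

For example, with rxb_npages = 1, a 4096-byte page and a hypothetical 512-byte max_msg_size, each posted buffer carries 4096 / 512 = 8 message slots, which is the msgs_per_buffer figure in the reservation arithmetic above.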
*/ + + spin_lock_irqsave(&rxbp->rxbp_lock, flags); + + rxbp->rxbp_shutdown = 1; + + for (i = 9;; i++) { + list_for_each_safe(tmp, nxt, &rxbp->rxbp_list) { + rxb = list_entry (tmp, kptl_rx_buffer_t, rxb_list); + + if (rxb->rxb_idle) { + spin_unlock_irqrestore(&rxbp->rxbp_lock, + flags); + kptllnd_rx_buffer_destroy(rxb); + spin_lock_irqsave(&rxbp->rxbp_lock, + flags); + continue; + } + + mdh = rxb->rxb_mdh; + if (PtlHandleIsEqual(mdh, PTL_INVALID_HANDLE)) + continue; + + spin_unlock_irqrestore(&rxbp->rxbp_lock, flags); + + rc = PtlMDUnlink(mdh); + + spin_lock_irqsave(&rxbp->rxbp_lock, flags); + +#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS + /* callback clears rxb_mdh and drops net's ref + * (which causes repost, but since I set + * shutdown, it will just set the buffer + * idle) */ +#else + if (rc == PTL_OK) { + rxb->rxb_posted = 0; + rxb->rxb_mdh = PTL_INVALID_HANDLE; + kptllnd_rx_buffer_decref_locked(rxb); + } +#endif + } + + if (list_empty(&rxbp->rxbp_list)) + break; + + spin_unlock_irqrestore(&rxbp->rxbp_lock, flags); + + /* Wait a bit for references to be dropped */ + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "Waiting for %d Busy RX Buffers\n", + rxbp->rxbp_count); + + cfs_pause(cfs_time_seconds(1)); + + spin_lock_irqsave(&rxbp->rxbp_lock, flags); + } + + spin_unlock_irqrestore(&rxbp->rxbp_lock, flags); +} + +void +kptllnd_rx_buffer_post(kptl_rx_buffer_t *rxb) +{ + int rc; + ptl_md_t md; + ptl_handle_me_t meh; + ptl_handle_md_t mdh; + ptl_process_id_t any; + kptl_rx_buffer_pool_t *rxbp = rxb->rxb_pool; + unsigned long flags; + + LASSERT (!in_interrupt()); + LASSERT (rxb->rxb_refcount == 0); + LASSERT (!rxb->rxb_idle); + LASSERT (!rxb->rxb_posted); + LASSERT (PtlHandleIsEqual(rxb->rxb_mdh, PTL_INVALID_HANDLE)); + + any.nid = PTL_NID_ANY; + any.pid = PTL_PID_ANY; + + spin_lock_irqsave(&rxbp->rxbp_lock, flags); + + if (rxbp->rxbp_shutdown) { + rxb->rxb_idle = 1; + spin_unlock_irqrestore(&rxbp->rxbp_lock, flags); + return; + } + + rxb->rxb_refcount = 1; /* net's ref */ + rxb->rxb_posted = 1; /* I'm posting */ + + spin_unlock_irqrestore(&rxbp->rxbp_lock, flags); + + rc = PtlMEAttach(kptllnd_data.kptl_nih, + *kptllnd_tunables.kptl_portal, + any, + LNET_MSG_MATCHBITS, + 0, /* all matchbits are valid - ignore none */ + PTL_UNLINK, + PTL_INS_AFTER, + &meh); + if (rc != PTL_OK) { + CERROR("PtlMeAttach rxb failed %d\n", rc); + goto failed; + } + + /* + * Setup MD + */ + md.start = rxb->rxb_buffer; + md.length = PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages; + md.threshold = PTL_MD_THRESH_INF; + md.options = PTL_MD_OP_PUT | + PTL_MD_LUSTRE_COMPLETION_SEMANTICS | + PTL_MD_EVENT_START_DISABLE | + PTL_MD_MAX_SIZE | + PTL_MD_LOCAL_ALIGN8; + md.user_ptr = &rxb->rxb_eventarg; + md.max_size = *kptllnd_tunables.kptl_max_msg_size; + md.eq_handle = kptllnd_data.kptl_eqh; + + rc = PtlMDAttach(meh, md, PTL_UNLINK, &mdh); + if (rc == PTL_OK) { + spin_lock_irqsave(&rxbp->rxbp_lock, flags); + if (rxb->rxb_posted) /* Not auto-unlinked yet!!! 
*/ + rxb->rxb_mdh = mdh; + spin_unlock_irqrestore(&rxbp->rxbp_lock, flags); + return; + } + + CERROR("PtlMDAttach rxb failed %d\n", rc); + rc = PtlMEUnlink(meh); + LASSERT(rc == PTL_OK); + + failed: + spin_lock_irqsave(&rxbp->rxbp_lock, flags); + rxb->rxb_posted = 0; + /* XXX this will just try again immediately */ + kptllnd_rx_buffer_decref_locked(rxb); + spin_unlock_irqrestore(&rxbp->rxbp_lock, flags); +} + +kptl_rx_t * +kptllnd_rx_alloc(void) +{ + kptl_rx_t* rx; + + if (IS_SIMULATION_ENABLED(FAIL_RX_ALLOC)) { + CERROR ("FAIL_RX_ALLOC SIMULATION triggered\n"); + return NULL; + } + + rx = cfs_mem_cache_alloc(kptllnd_data.kptl_rx_cache, CFS_ALLOC_ATOMIC); + if (rx == NULL) { + CERROR("Failed to allocate rx\n"); + return NULL; + } + + memset(rx, 0, sizeof(*rx)); + return rx; +} + +void +kptllnd_rx_done(kptl_rx_t *rx) +{ + kptl_rx_buffer_t *rxb = rx->rx_rxb; + kptl_peer_t *peer = rx->rx_peer; + unsigned long flags; + + CDEBUG(D_NET, "rx=%p rxb %p peer %p\n", rx, rxb, peer); + + if (rxb != NULL) + kptllnd_rx_buffer_decref(rxb); + + if (peer != NULL) { + /* Update credits (after I've decref-ed the buffer) */ + spin_lock_irqsave(&peer->peer_lock, flags); + + peer->peer_outstanding_credits++; + LASSERT (peer->peer_outstanding_credits <= + *kptllnd_tunables.kptl_peercredits); + + spin_unlock_irqrestore(&peer->peer_lock, flags); + + CDEBUG(D_NET, "Peer=%s Credits=%d Outstanding=%d\n", + libcfs_id2str(peer->peer_id), + peer->peer_credits, peer->peer_outstanding_credits); + + /* I might have to send back credits */ + kptllnd_peer_check_sends(peer); + kptllnd_peer_decref(peer); + } + + cfs_mem_cache_free(kptllnd_data.kptl_rx_cache, rx); +} + +void +kptllnd_rx_buffer_callback (ptl_event_t *ev) +{ + kptl_eventarg_t *eva = ev->md.user_ptr; + kptl_rx_buffer_t *rxb = kptllnd_eventarg2obj(eva); + kptl_rx_buffer_pool_t *rxbp = rxb->rxb_pool; + kptl_rx_t *rx; + int unlinked; + unsigned long flags; + +#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS + unlinked = ev->unlinked; +#else + unlinked = ev->type == PTL_EVENT_UNLINK; +#endif + + CDEBUG(D_NET, "RXB Callback %s(%d) rxb=%p id=%s unlink=%d rc %d\n", + kptllnd_evtype2str(ev->type), ev->type, rxb, + kptllnd_ptlid2str(ev->initiator), + unlinked, ev->ni_fail_type); + + LASSERT (!rxb->rxb_idle); + LASSERT (ev->md.start == rxb->rxb_buffer); + LASSERT (ev->offset + ev->mlength <= + PAGE_SIZE * *kptllnd_tunables.kptl_rxb_npages); + LASSERT (ev->type == PTL_EVENT_PUT_END || + ev->type == PTL_EVENT_UNLINK); + LASSERT (ev->type == PTL_EVENT_UNLINK || + ev->match_bits == LNET_MSG_MATCHBITS); + + if (ev->ni_fail_type != PTL_NI_OK) + CERROR("event type %d, status %d from %s\n", + ev->type, ev->ni_fail_type, + kptllnd_ptlid2str(ev->initiator)); + + if (ev->type == PTL_EVENT_PUT_END && + ev->ni_fail_type == PTL_NI_OK && + !rxbp->rxbp_shutdown) { + + /* rxbp_shutdown sampled without locking! I only treat it as a + * hint since shutdown can start while rx's are queued on + * kptl_sched_rxq. */ +#if (PTL_MD_LOCAL_ALIGN8 == 0) + /* Portals can't force message alignment - someone sending an + * odd-length message will misalign subsequent messages and + * force the fixup below... 
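A worked example of the misalignment the comment above describes: with PTL_MD_LOCAL_ALIGN8 unavailable, a 27-byte message deposited at offset 0 leaves the next message at offset 27, and (27 & 7) != 0, so that message must be copied into rx->rx_space before it can be parsed as a kptl_msg_t. The test, as a predicate (illustrative; it mirrors the offset check below):

static inline int
kptllnd_rx_in_place(ptl_event_t *ev)
{
        return (ev->offset & 7) == 0;   /* 8-byte aligned: parse in situ */
}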
*/ + if ((ev->mlength & 7) != 0) + CWARN("Message from %s has odd length %d: " + "probable version incompatibility\n", + kptllnd_ptlid2str(ev->initiator), + ev->mlength); +#endif + rx = kptllnd_rx_alloc(); + if (rx == NULL) { + CERROR("Message from %s dropped: ENOMEM", + kptllnd_ptlid2str(ev->initiator)); + } else { + if ((ev->offset & 7) == 0) { + kptllnd_rx_buffer_addref(rxb); + rx->rx_rxb = rxb; + rx->rx_nob = ev->mlength; + rx->rx_msg = (kptl_msg_t *) + (rxb->rxb_buffer + ev->offset); + } else { +#if (PTL_MD_LOCAL_ALIGN8 == 0) + /* Portals can't force alignment - copy into + * rx_space (avoiding overflow) to fix */ + int maxlen = *kptllnd_tunables.kptl_max_msg_size; + + rx->rx_rxb = NULL; + rx->rx_nob = MIN(maxlen, ev->mlength); + rx->rx_msg = (kptl_msg_t *)rx->rx_space; + memcpy(rx->rx_msg, rxb->rxb_buffer + ev->offset, + rx->rx_nob); +#else + /* Portals should have forced the alignment */ + LBUG(); +#endif + } + + rx->rx_initiator = ev->initiator; +#ifdef CRAY_XT3 + rx->rx_uid = ev->uid; +#endif + /* Queue for attention */ + spin_lock_irqsave(&kptllnd_data.kptl_sched_lock, + flags); + + list_add_tail(&rx->rx_list, + &kptllnd_data.kptl_sched_rxq); + wake_up(&kptllnd_data.kptl_sched_waitq); + + spin_unlock_irqrestore(&kptllnd_data.kptl_sched_lock, + flags); + } + } + + if (unlinked) { + spin_lock_irqsave(&rxbp->rxbp_lock, flags); + + rxb->rxb_posted = 0; + rxb->rxb_mdh = PTL_INVALID_HANDLE; + kptllnd_rx_buffer_decref_locked(rxb); + + spin_unlock_irqrestore(&rxbp->rxbp_lock, flags); + } +} + +void +kptllnd_nak (kptl_rx_t *rx) +{ + /* Fire-and-forget a stub message that will let the peer know my + * protocol magic/version and make her drop/refresh any peer state she + * might have with me. */ + ptl_md_t md = { + .start = kptllnd_data.kptl_nak_msg, + .length = kptllnd_data.kptl_nak_msg->ptlm_nob, + .threshold = 1, + .options = 0, + .user_ptr = NULL, + .eq_handle = PTL_EQ_NONE}; + ptl_handle_md_t mdh; + int rc; + + rc = PtlMDBind(kptllnd_data.kptl_nih, md, PTL_UNLINK, &mdh); + if (rc != PTL_OK) { + CWARN("Can't NAK %s: bind failed %d\n", + kptllnd_ptlid2str(rx->rx_initiator), rc); + return; + } + + rc = PtlPut(mdh, PTL_NOACK_REQ, rx->rx_initiator, + *kptllnd_tunables.kptl_portal, 0, + LNET_MSG_MATCHBITS, 0, 0); + + if (rc != PTL_OK) + CWARN("Can't NAK %s: put failed %d\n", + kptllnd_ptlid2str(rx->rx_initiator), rc); +} + +void +kptllnd_rx_parse(kptl_rx_t *rx) +{ + kptl_msg_t *msg = rx->rx_msg; + kptl_peer_t *peer; + int rc; + int credits; + unsigned long flags; + lnet_process_id_t srcid; + + LASSERT (rx->rx_peer == NULL); + + CDEBUG (D_NET, "%s: nob=%d %08x %04x %02x %d %d\n", + kptllnd_ptlid2str(rx->rx_initiator), + rx->rx_nob, msg->ptlm_magic, msg->ptlm_version, + msg->ptlm_type, msg->ptlm_credits, msg->ptlm_nob); + + if ((rx->rx_nob >= 4 && + (msg->ptlm_magic == LNET_PROTO_MAGIC || + msg->ptlm_magic == __swab32(LNET_PROTO_MAGIC))) || + (rx->rx_nob >= 6 && + ((msg->ptlm_magic == PTLLND_MSG_MAGIC && + msg->ptlm_version != PTLLND_MSG_VERSION) || + (msg->ptlm_magic == __swab32(PTLLND_MSG_MAGIC) && + msg->ptlm_version != __swab16(PTLLND_MSG_VERSION))))) { + /* NAK incompatible versions + * See other LNDs for how to handle this if/when ptllnd begins + * to allow different versions to co-exist */ + CERROR("Bad version: got %04x expected %04x from %s\n", + (__u32)(msg->ptlm_magic == PTLLND_MSG_MAGIC ? 
+ msg->ptlm_version : __swab16(msg->ptlm_version)), + PTLLND_MSG_VERSION, kptllnd_ptlid2str(rx->rx_initiator)); + kptllnd_nak(rx); + goto rx_done; + } + + rc = kptllnd_msg_unpack(msg, rx->rx_nob); + if (rc != 0) { + CERROR ("Error %d unpacking rx from %s\n", + rc, kptllnd_ptlid2str(rx->rx_initiator)); + goto rx_done; + } + + CDEBUG(D_NET, "rx=%p type=%s(%d) nob %d cred %d\n", + rx, kptllnd_msgtype2str(msg->ptlm_type), msg->ptlm_type, + msg->ptlm_nob, msg->ptlm_credits); + + srcid.nid = msg->ptlm_srcnid; + srcid.pid = msg->ptlm_srcpid; + + if (srcid.nid != kptllnd_ptl2lnetnid(rx->rx_initiator.nid)) { + CERROR("Bad source id %s from %s\n", + libcfs_id2str(srcid), + kptllnd_ptlid2str(rx->rx_initiator)); + goto rx_done; + } + + if (msg->ptlm_type == PTLLND_MSG_TYPE_NAK) { + peer = kptllnd_id2peer(srcid); + if (peer == NULL) + goto rx_done; + + CWARN("NAK from %s (%s)\n", + libcfs_id2str(srcid), + kptllnd_ptlid2str(rx->rx_initiator)); + + rc = -EPROTO; + goto failed; + } + + if (msg->ptlm_dstnid != kptllnd_data.kptl_ni->ni_nid || + msg->ptlm_dstpid != the_lnet.ln_pid) { + CERROR("Bad dstid %s (expected %s) from %s\n", + libcfs_id2str((lnet_process_id_t) { + .nid = msg->ptlm_dstnid, + .pid = msg->ptlm_dstpid}), + libcfs_id2str((lnet_process_id_t) { + .nid = kptllnd_data.kptl_ni->ni_nid, + .pid = the_lnet.ln_pid}), + kptllnd_ptlid2str(rx->rx_initiator)); + goto rx_done; + } + + if (msg->ptlm_type == PTLLND_MSG_TYPE_HELLO) { + peer = kptllnd_peer_handle_hello(rx->rx_initiator, msg); + if (peer == NULL) { + CWARN("No peer for %s\n", + kptllnd_ptlid2str(rx->rx_initiator)); + goto rx_done; + } + } else { + peer = kptllnd_id2peer(srcid); + if (peer == NULL) { + CWARN("NAK %s: no connection; peer must reconnect\n", + libcfs_id2str(srcid)); + /* NAK to make the peer reconnect */ + kptllnd_nak(rx); + goto rx_done; + } + + /* Ignore anything else while I'm waiting for HELLO */ + if (peer->peer_state == PEER_STATE_WAITING_HELLO) { + kptllnd_peer_decref(peer); + goto rx_done; + } + } + + LASSERT (msg->ptlm_srcnid == peer->peer_id.nid && + msg->ptlm_srcpid == peer->peer_id.pid); + + if (msg->ptlm_srcstamp != peer->peer_incarnation) { + CERROR("Stale rx from %s srcstamp "LPX64" expected "LPX64"\n", + libcfs_id2str(peer->peer_id), + msg->ptlm_srcstamp, + peer->peer_incarnation); + rc = -EPROTO; + goto failed; + } + + if (msg->ptlm_dststamp != kptllnd_data.kptl_incarnation && + (msg->ptlm_type != PTLLND_MSG_TYPE_HELLO || /* HELLO sends a */ + msg->ptlm_dststamp != 0)) { /* zero dststamp */ + CERROR("Stale rx from %s dststamp "LPX64" expected "LPX64"\n", + libcfs_id2str(peer->peer_id), msg->ptlm_dststamp, + kptllnd_data.kptl_incarnation); + rc = -EPROTO; + goto failed; + } + + if (msg->ptlm_credits != 0) { + spin_lock_irqsave(&peer->peer_lock, flags); + + if (peer->peer_credits + msg->ptlm_credits > + *kptllnd_tunables.kptl_peercredits) { + credits = peer->peer_credits; + spin_unlock_irqrestore(&peer->peer_lock, flags); + + CERROR("Credit overflow from %s: %d + %d > %d\n", + libcfs_id2str(peer->peer_id), + credits, msg->ptlm_credits, + *kptllnd_tunables.kptl_peercredits); + rc = -EPROTO; + goto failed; + } + + peer->peer_credits += msg->ptlm_credits; + + spin_unlock_irqrestore(&peer->peer_lock, flags); + + kptllnd_peer_check_sends(peer); + } + + /* ptllnd-level protocol correct - rx takes my ref on peer and increments + * peer_outstanding_credits when it completes */ + rx->rx_peer = peer; + kptllnd_peer_alive(peer); + + switch (msg->ptlm_type) { + default: + /* already checked by kptllnd_msg_unpack() */ + 
LBUG(); + + case PTLLND_MSG_TYPE_HELLO: + CDEBUG(D_NET, "PTLLND_MSG_TYPE_HELLO\n"); + goto rx_done; + + case PTLLND_MSG_TYPE_NOOP: + CDEBUG(D_NET, "PTLLND_MSG_TYPE_NOOP\n"); + goto rx_done; + + case PTLLND_MSG_TYPE_IMMEDIATE: + CDEBUG(D_NET, "PTLLND_MSG_TYPE_IMMEDIATE\n"); + rc = lnet_parse(kptllnd_data.kptl_ni, + &msg->ptlm_u.immediate.kptlim_hdr, + msg->ptlm_srcnid, + rx, 0); + if (rc >= 0) /* kptllnd_recv owns 'rx' now */ + return; + goto failed; + + case PTLLND_MSG_TYPE_PUT: + case PTLLND_MSG_TYPE_GET: + CDEBUG(D_NET, "PTLLND_MSG_TYPE_%s\n", + msg->ptlm_type == PTLLND_MSG_TYPE_PUT ? + "PUT" : "GET"); + + /* checked in kptllnd_msg_unpack() */ + LASSERT (msg->ptlm_u.rdma.kptlrm_matchbits >= + PTL_RESERVED_MATCHBITS); + + /* Update last match bits seen */ + spin_lock_irqsave(&peer->peer_lock, flags); + + if (msg->ptlm_u.rdma.kptlrm_matchbits > + rx->rx_peer->peer_last_matchbits_seen) + rx->rx_peer->peer_last_matchbits_seen = + msg->ptlm_u.rdma.kptlrm_matchbits; + + spin_unlock_irqrestore(&rx->rx_peer->peer_lock, flags); + + rc = lnet_parse(kptllnd_data.kptl_ni, + &msg->ptlm_u.rdma.kptlrm_hdr, + msg->ptlm_srcnid, + rx, 1); + if (rc >= 0) /* kptllnd_recv owns 'rx' now */ + return; + goto failed; + } + + failed: + kptllnd_peer_close(peer, rc); + if (rx->rx_peer == NULL) /* drop ref on peer */ + kptllnd_peer_decref(peer); /* unless rx_done will */ + rx_done: + kptllnd_rx_done(rx); +} diff --git a/lnet/klnds/ptllnd/ptllnd_tx.c b/lnet/klnds/ptllnd/ptllnd_tx.c new file mode 100644 index 0000000..1c086d8 --- /dev/null +++ b/lnet/klnds/ptllnd/ptllnd_tx.c @@ -0,0 +1,494 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved. + * Author: PJ Kirner + * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. + * + * This file is confidential source code owned by Cluster File Systems. + * No viewing, modification, compilation, redistribution, or any other + * form of use is permitted except through a signed license agreement. + * + * If you have not signed such an agreement, then you have no rights to + * this file. Please destroy it immediately and contact CFS. 
+ * + */ + + #include "ptllnd.h" + +void +kptllnd_free_tx(kptl_tx_t *tx) +{ + if (tx->tx_msg != NULL) + LIBCFS_FREE(tx->tx_msg, + *kptllnd_tunables.kptl_max_msg_size); + + if (tx->tx_rdma_frags != NULL) + LIBCFS_FREE(tx->tx_rdma_frags, + sizeof(*tx->tx_rdma_frags)); + + LIBCFS_FREE(tx, sizeof(*tx)); + + atomic_dec(&kptllnd_data.kptl_ntx); + + /* Keep the tunable in step for visibility */ + *kptllnd_tunables.kptl_ntx = atomic_read(&kptllnd_data.kptl_ntx); +} + +kptl_tx_t * +kptllnd_alloc_tx(void) +{ + kptl_tx_t *tx; + + LIBCFS_ALLOC(tx, sizeof(*tx)); + if (tx == NULL) { + CERROR("Failed to allocate TX\n"); + return NULL; + } + + atomic_inc(&kptllnd_data.kptl_ntx); + + /* Keep the tunable in step for visibility */ + *kptllnd_tunables.kptl_ntx = atomic_read(&kptllnd_data.kptl_ntx); + + tx->tx_idle = 1; + tx->tx_rdma_mdh = PTL_INVALID_HANDLE; + tx->tx_msg_mdh = PTL_INVALID_HANDLE; + tx->tx_rdma_eventarg.eva_type = PTLLND_EVENTARG_TYPE_RDMA; + tx->tx_msg_eventarg.eva_type = PTLLND_EVENTARG_TYPE_MSG; + tx->tx_msg = NULL; + tx->tx_rdma_frags = NULL; + + LIBCFS_ALLOC(tx->tx_msg, *kptllnd_tunables.kptl_max_msg_size); + if (tx->tx_msg == NULL) { + CERROR("Failed to allocate TX payload\n"); + goto failed; + } + + LIBCFS_ALLOC(tx->tx_rdma_frags, sizeof(*tx->tx_rdma_frags)); + if (tx->tx_rdma_frags == NULL) { + CERROR("Failed to allocate TX frags\n"); + goto failed; + } + + return tx; + + failed: + kptllnd_free_tx(tx); + return NULL; +} + +int +kptllnd_setup_tx_descs() +{ + int n = *kptllnd_tunables.kptl_ntx; + int i; + + for (i = 0; i < n; i++) { + kptl_tx_t *tx = kptllnd_alloc_tx(); + + if (tx == NULL) + return -ENOMEM; + + spin_lock(&kptllnd_data.kptl_tx_lock); + + list_add_tail(&tx->tx_list, &kptllnd_data.kptl_idle_txs); + + spin_unlock(&kptllnd_data.kptl_tx_lock); + } + + return 0; +} + +void +kptllnd_cleanup_tx_descs() +{ + kptl_tx_t *tx; + + /* No locking; single threaded now */ + LASSERT (kptllnd_data.kptl_shutdown == 2); + + while (!list_empty(&kptllnd_data.kptl_idle_txs)) { + tx = list_entry(kptllnd_data.kptl_idle_txs.next, + kptl_tx_t, tx_list); + + list_del(&tx->tx_list); + kptllnd_free_tx(tx); + } + + LASSERT (atomic_read(&kptllnd_data.kptl_ntx) == 0); +} + +kptl_tx_t * +kptllnd_get_idle_tx(enum kptl_tx_type type) +{ + kptl_tx_t *tx = NULL; + + if (IS_SIMULATION_ENABLED(FAIL_TX_PUT_ALLOC) && + type == TX_TYPE_PUT_REQUEST) { + CERROR("FAIL_TX_PUT_ALLOC SIMULATION triggered\n"); + return NULL; + } + + if (IS_SIMULATION_ENABLED(FAIL_TX_GET_ALLOC) && + type == TX_TYPE_GET_REQUEST) { + CERROR ("FAIL_TX_GET_ALLOC SIMULATION triggered\n"); + return NULL; + } + + if (IS_SIMULATION_ENABLED(FAIL_TX)) { + CERROR ("FAIL_TX SIMULATION triggered\n"); + return NULL; + } + + spin_lock(&kptllnd_data.kptl_tx_lock); + + if (list_empty (&kptllnd_data.kptl_idle_txs)) { + spin_unlock(&kptllnd_data.kptl_tx_lock); + + tx = kptllnd_alloc_tx(); + if (tx == NULL) + return NULL; + } else { + tx = list_entry(kptllnd_data.kptl_idle_txs.next, + kptl_tx_t, tx_list); + list_del(&tx->tx_list); + + spin_unlock(&kptllnd_data.kptl_tx_lock); + } + + LASSERT (atomic_read(&tx->tx_refcount)== 0); + LASSERT (tx->tx_idle); + LASSERT (!tx->tx_active); + LASSERT (tx->tx_lnet_msg == NULL); + LASSERT (tx->tx_lnet_replymsg == NULL); + LASSERT (tx->tx_peer == NULL); + LASSERT (PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE)); + LASSERT (PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE)); + + tx->tx_type = type; + atomic_set(&tx->tx_refcount, 1); + tx->tx_status = 0; + tx->tx_idle = 0; + + CDEBUG(D_NET, "tx=%p\n", tx); + return 
+
+#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
+int
+kptllnd_tx_abort_netio(kptl_tx_t *tx)
+{
+        kptl_peer_t     *peer = tx->tx_peer;
+        ptl_handle_md_t  msg_mdh;
+        ptl_handle_md_t  rdma_mdh;
+        unsigned long    flags;
+
+        LASSERT (atomic_read(&tx->tx_refcount) == 0);
+        LASSERT (!tx->tx_active);
+
+        spin_lock_irqsave(&peer->peer_lock, flags);
+
+        msg_mdh = tx->tx_msg_mdh;
+        rdma_mdh = tx->tx_rdma_mdh;
+
+        if (PtlHandleIsEqual(msg_mdh, PTL_INVALID_HANDLE) &&
+            PtlHandleIsEqual(rdma_mdh, PTL_INVALID_HANDLE)) {
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
+                return 0;
+        }
+
+        /* Uncompleted comms: there must have been some error and it must be
+         * propagated to LNET... */
+        LASSERT (tx->tx_status != 0 ||
+                 (tx->tx_lnet_msg == NULL &&
+                  tx->tx_lnet_replymsg == NULL));
+
+        /* stash the tx on its peer until it completes */
+        atomic_set(&tx->tx_refcount, 1);
+        tx->tx_active = 1;
+        list_add_tail(&tx->tx_list, &peer->peer_activeq);
+
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+        /* These unlinks will ensure completion events (normal or unlink) will
+         * happen ASAP */
+
+        if (!PtlHandleIsEqual(msg_mdh, PTL_INVALID_HANDLE))
+                PtlMDUnlink(msg_mdh);
+
+        if (!PtlHandleIsEqual(rdma_mdh, PTL_INVALID_HANDLE))
+                PtlMDUnlink(rdma_mdh);
+
+        return -EAGAIN;
+}
+#else
+int
+kptllnd_tx_abort_netio(kptl_tx_t *tx)
+{
+        kptl_peer_t     *peer = tx->tx_peer;
+        ptl_handle_md_t  msg_mdh;
+        ptl_handle_md_t  rdma_mdh;
+        unsigned long    flags;
+        ptl_err_t        prc;
+
+        LASSERT (atomic_read(&tx->tx_refcount) == 0);
+        LASSERT (!tx->tx_active);
+
+        spin_lock_irqsave(&peer->peer_lock, flags);
+
+        msg_mdh = tx->tx_msg_mdh;
+        rdma_mdh = tx->tx_rdma_mdh;
+
+        if (PtlHandleIsEqual(msg_mdh, PTL_INVALID_HANDLE) &&
+            PtlHandleIsEqual(rdma_mdh, PTL_INVALID_HANDLE)) {
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
+                return 0;
+        }
+
+        /* Uncompleted comms: there must have been some error and it must be
+         * propagated to LNET... */
+        LASSERT (tx->tx_status != 0 ||
+                 (tx->tx_lnet_msg == NULL &&
+                  tx->tx_lnet_replymsg == NULL));
+
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+        if (!PtlHandleIsEqual(msg_mdh, PTL_INVALID_HANDLE)) {
+                prc = PtlMDUnlink(msg_mdh);
+                if (prc == PTL_OK)
+                        msg_mdh = PTL_INVALID_HANDLE;
+        }
+
+        if (!PtlHandleIsEqual(rdma_mdh, PTL_INVALID_HANDLE)) {
+                prc = PtlMDUnlink(rdma_mdh);
+                if (prc == PTL_OK)
+                        rdma_mdh = PTL_INVALID_HANDLE;
+        }
+
+        spin_lock_irqsave(&peer->peer_lock, flags);
+
+        /* update tx_???_mdh if callback hasn't fired */
+        if (PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE))
+                msg_mdh = PTL_INVALID_HANDLE;
+        else
+                tx->tx_msg_mdh = msg_mdh;
+
+        if (PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE))
+                rdma_mdh = PTL_INVALID_HANDLE;
+        else
+                tx->tx_rdma_mdh = rdma_mdh;
+
+        if (PtlHandleIsEqual(msg_mdh, PTL_INVALID_HANDLE) &&
+            PtlHandleIsEqual(rdma_mdh, PTL_INVALID_HANDLE)) {
+                spin_unlock_irqrestore(&peer->peer_lock, flags);
+                return 0;
+        }
+
+        /* stash the tx on its peer until it completes */
+        atomic_set(&tx->tx_refcount, 1);
+        tx->tx_active = 1;
+        list_add_tail(&tx->tx_list, &peer->peer_activeq);
+
+        kptllnd_peer_addref(peer);              /* extra ref for me... */
+
+        spin_unlock_irqrestore(&peer->peer_lock, flags);
+
+        /* This will get the watchdog thread to try aborting all the peer's
+         * comms again.  NB, this deems it fair that 1 failing tx which can't
+         * be aborted immediately (i.e. its MDs are still busy) is valid cause
+         * to nuke everything to the same peer! */
+        kptllnd_peer_close(peer, tx->tx_status);
+
+        kptllnd_peer_decref(peer);
+
+        return -EAGAIN;
+}
+#endif
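/* [Editor's note -- illustrative sketch, not part of the patch.]  The two
 * kptllnd_tx_abort_netio() variants above exist because Portals
 * implementations report MD unlink completion differently.  A minimal
 * sketch of the distinction, using the same event fields that
 * kptllnd_tx_callback() below keys off: */
#if 0 /* illustration only */
static int example_md_is_gone(ptl_event_t *ev)
{
#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
        /* every event carries an 'unlinked' flag, so PtlMDUnlink() can
         * simply be called and the MD's death rides the next event */
        return ev->unlinked;
#else
        /* unlink is a distinct event type: PtlMDUnlink() only succeeds
         * immediately (PTL_OK) on an idle MD; otherwise the tx must be
         * parked until PTL_EVENT_UNLINK arrives */
        return (ev->type == PTL_EVENT_UNLINK);
#endif
}
#endif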
+
+void
+kptllnd_tx_fini (kptl_tx_t *tx)
+{
+        lnet_msg_t     *replymsg = tx->tx_lnet_replymsg;
+        lnet_msg_t     *msg      = tx->tx_lnet_msg;
+        kptl_peer_t    *peer     = tx->tx_peer;
+        int             status   = tx->tx_status;
+        int             rc;
+
+        LASSERT (!in_interrupt());
+        LASSERT (atomic_read(&tx->tx_refcount) == 0);
+        LASSERT (!tx->tx_idle);
+        LASSERT (!tx->tx_active);
+
+        /* TX has completed or failed */
+
+        if (peer != NULL) {
+                rc = kptllnd_tx_abort_netio(tx);
+                if (rc != 0)
+                        return;
+        }
+
+        LASSERT (PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE));
+        LASSERT (PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE));
+
+        tx->tx_lnet_msg = tx->tx_lnet_replymsg = NULL;
+        tx->tx_peer = NULL;
+        tx->tx_idle = 1;
+
+        spin_lock(&kptllnd_data.kptl_tx_lock);
+        list_add_tail(&tx->tx_list, &kptllnd_data.kptl_idle_txs);
+        spin_unlock(&kptllnd_data.kptl_tx_lock);
+
+        /* Must finalize AFTER freeing 'tx' */
+        if (msg != NULL)
+                lnet_finalize(kptllnd_data.kptl_ni, msg,
+                              (replymsg == NULL) ? status : 0);
+
+        if (replymsg != NULL)
+                lnet_finalize(kptllnd_data.kptl_ni, replymsg, status);
+
+        if (peer != NULL)
+                kptllnd_peer_decref(peer);
+}
+
+const char *
+kptllnd_tx_typestr(int type)
+{
+        switch (type) {
+        default:
+                return "";
+
+        case TX_TYPE_SMALL_MESSAGE:
+                return "msg";
+
+        case TX_TYPE_PUT_REQUEST:
+                return "put_req";
+
+        case TX_TYPE_GET_REQUEST:
+                return "get_req";
+
+        case TX_TYPE_PUT_RESPONSE:
+                return "put_rsp";
+
+        case TX_TYPE_GET_RESPONSE:
+                return "get_rsp";
+        }
+}
+
+void
+kptllnd_tx_callback(ptl_event_t *ev)
+{
+        kptl_eventarg_t *eva = ev->md.user_ptr;
+        int              ismsg = (eva->eva_type == PTLLND_EVENTARG_TYPE_MSG);
+        kptl_tx_t       *tx = kptllnd_eventarg2obj(eva);
+        kptl_peer_t     *peer = tx->tx_peer;
+        int              ok = (ev->ni_fail_type == PTL_OK);
+        int              unlinked;
+        unsigned long    flags;
+
+        LASSERT (peer != NULL);
+        LASSERT (eva->eva_type == PTLLND_EVENTARG_TYPE_MSG ||
+                 eva->eva_type == PTLLND_EVENTARG_TYPE_RDMA);
+        LASSERT (!ismsg || !PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE));
+        LASSERT (ismsg || !PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE));
+
+#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
+        unlinked = ev->unlinked;
+#else
+        unlinked = (ev->type == PTL_EVENT_UNLINK);
+#endif
+        CDEBUG(D_NET, "%s(%d) tx=%p(%s) fail=%d unlinked=%d\n",
+               kptllnd_evtype2str(ev->type), ev->type,
+               tx, libcfs_id2str(peer->peer_id),
+               ev->ni_fail_type, unlinked);
+
+        switch (tx->tx_type) {
+        default:
+                LBUG();
+
+        case TX_TYPE_SMALL_MESSAGE:
+                LASSERT (ismsg);
+                LASSERT (ev->type == PTL_EVENT_UNLINK ||
+                         ev->type == PTL_EVENT_SEND_END);
+                break;
+
+        case TX_TYPE_PUT_REQUEST:
+                LASSERT (ev->type == PTL_EVENT_UNLINK ||
+                         (ismsg && ev->type == PTL_EVENT_SEND_END) ||
+                         (!ismsg && ev->type == PTL_EVENT_GET_END));
+                break;
+
+        case TX_TYPE_GET_REQUEST:
+                LASSERT (ev->type == PTL_EVENT_UNLINK ||
+                         (ismsg && ev->type == PTL_EVENT_SEND_END) ||
+                         (!ismsg && ev->type == PTL_EVENT_PUT_END));
+
+                if (!ismsg && ok && ev->type == PTL_EVENT_PUT_END) {
+                        if (ev->hdr_data == PTLLND_RDMA_OK) {
+                                lnet_set_reply_msg_len(
+                                        kptllnd_data.kptl_ni,
+                                        tx->tx_lnet_replymsg,
+                                        ev->mlength);
+                        } else {
+                                /* no match at peer */
+                                tx->tx_status = -EIO;
+                        }
+                }
+                break;
+
+        case TX_TYPE_PUT_RESPONSE:
+                LASSERT (!ismsg);
+                LASSERT (ev->type == PTL_EVENT_UNLINK ||
+                         ev->type == PTL_EVENT_SEND_END ||
+                         ev->type == PTL_EVENT_REPLY_END);
+                break;
+
+        case TX_TYPE_GET_RESPONSE:
+                LASSERT (!ismsg);
+                LASSERT (ev->type == PTL_EVENT_UNLINK ||
+                         ev->type ==
PTL_EVENT_SEND_END); + break; + } + + if (!ok) + kptllnd_peer_close(peer, -EIO); + else + kptllnd_peer_alive(peer); + + if (!unlinked) + return; + + spin_lock_irqsave(&peer->peer_lock, flags); + + if (ismsg) + tx->tx_msg_mdh = PTL_INVALID_HANDLE; + else + tx->tx_rdma_mdh = PTL_INVALID_HANDLE; + + if (!PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE) || + !PtlHandleIsEqual(tx->tx_rdma_mdh, PTL_INVALID_HANDLE) || + !tx->tx_active) { + spin_unlock_irqrestore(&peer->peer_lock, flags); + return; + } + + list_del(&tx->tx_list); + tx->tx_active = 0; + + spin_unlock_irqrestore(&peer->peer_lock, flags); + + /* drop peer's ref, but if it was the last one... */ + if (atomic_dec_and_test(&tx->tx_refcount)) { + /* ...finalize it in thread context! */ + spin_lock_irqsave(&kptllnd_data.kptl_sched_lock, flags); + + list_add_tail(&tx->tx_list, &kptllnd_data.kptl_sched_txq); + wake_up(&kptllnd_data.kptl_sched_waitq); + + spin_unlock_irqrestore(&kptllnd_data.kptl_sched_lock, flags); + } +} diff --git a/lnet/klnds/ptllnd/wirecheck.c b/lnet/klnds/ptllnd/wirecheck.c new file mode 100644 index 0000000..8111cbb --- /dev/null +++ b/lnet/klnds/ptllnd/wirecheck.c @@ -0,0 +1,206 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved. + * Author: PJ Kirner + * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. + * + * This file is confidential source code owned by Cluster File Systems. + * No viewing, modification, compilation, redistribution, or any other + * form of use is permitted except through a signed license agreement. + * + * If you have not signed such an agreement, then you have no rights to + * this file. Please destroy it immediately and contact CFS. 
+ *
+ */
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+/* This ghastly hack allows me to include lib-types.h.  It doesn't affect any
+ * assertions generated here (but fails safe if it ever does) */
+typedef struct {
+        int     counter;
+} atomic_t;
+
+#include
+#include
+
+#ifndef HAVE_STRNLEN
+#define strnlen(s, i) strlen(s)
+#endif
+
+#define BLANK_LINE()                            \
+do {                                            \
+        printf ("\n");                          \
+} while (0)
+
+#define COMMENT(c)                              \
+do {                                            \
+        printf ("        /* "c" */\n");         \
+} while (0)
+
+#undef STRINGIFY
+#define STRINGIFY(a) #a
+
+#define CHECK_DEFINE(a)                                         \
+do {                                                            \
+        printf ("        CLASSERT ("#a" == "STRINGIFY(a)");\n"); \
+} while (0)
+
+#define CHECK_VALUE(a)                                  \
+do {                                                    \
+        printf ("        CLASSERT ("#a" == %d);\n", a); \
+} while (0)
+
+#define CHECK_MEMBER_OFFSET(s,m)                \
+do {                                            \
+        CHECK_VALUE((int)offsetof(s, m));       \
+} while (0)
+
+#define CHECK_MEMBER_SIZEOF(s,m)                \
+do {                                            \
+        CHECK_VALUE((int)sizeof(((s *)0)->m));  \
+} while (0)
+
+#define CHECK_MEMBER(s,m)                       \
+do {                                            \
+        CHECK_MEMBER_OFFSET(s, m);              \
+        CHECK_MEMBER_SIZEOF(s, m);              \
+} while (0)
+
+#define CHECK_STRUCT(s)                         \
+do {                                            \
+        BLANK_LINE ();                          \
+        COMMENT ("Checks for struct "#s);       \
+        CHECK_VALUE((int)sizeof(s));            \
+} while (0)
+
+void
+system_string (char *cmdline, char *str, int len)
+{
+        int   fds[2];
+        int   rc;
+        pid_t pid;
+
+        rc = pipe (fds);
+        if (rc != 0)
+                abort ();
+
+        pid = fork ();
+        if (pid == 0) {
+                /* child */
+                int fd = fileno(stdout);
+
+                rc = dup2(fds[1], fd);
+                if (rc != fd)
+                        abort();
+
+                exit(system(cmdline));
+                /* notreached */
+        } else if ((int)pid < 0) {
+                abort();
+        } else {
+                FILE *f = fdopen (fds[0], "r");
+
+                if (f == NULL)
+                        abort();
+
+                close(fds[1]);
+
+                if (fgets(str, len, f) == NULL)
+                        abort();
+
+                if (waitpid(pid, &rc, 0) != pid)
+                        abort();
+
+                if (!WIFEXITED(rc) ||
+                    WEXITSTATUS(rc) != 0)
+                        abort();
+
+                if (strnlen(str, len) == len)
+                        str[len - 1] = 0;
+
+                if (str[strlen(str) - 1] == '\n')
+                        str[strlen(str) - 1] = 0;
+
+                fclose(f);
+        }
+}
+
+int
+main (int argc, char **argv)
+{
+        char unameinfo[80];
+        char gccinfo[80];
+
+        system_string("uname -a", unameinfo, sizeof(unameinfo));
+        system_string("gcc -v 2>&1 | tail -1", gccinfo, sizeof(gccinfo));
+
+        printf ("void kptllnd_assert_wire_constants (void)\n"
+                "{\n"
+                "        /* Wire protocol assertions generated by 'wirecheck'\n"
+                "         * running on %s\n"
+                "         * with %s */\n"
+                "\n", unameinfo, gccinfo);
+
+        BLANK_LINE ();
+
+        COMMENT ("Constants...");
+        CHECK_DEFINE (PTL_RESERVED_MATCHBITS);
+        CHECK_DEFINE (LNET_MSG_MATCHBITS);
+
+        CHECK_DEFINE (PTLLND_MSG_MAGIC);
+        CHECK_DEFINE (PTLLND_MSG_VERSION);
+
+        CHECK_DEFINE (PTLLND_RDMA_OK);
+        CHECK_DEFINE (PTLLND_RDMA_FAIL);
+
+        CHECK_DEFINE (PTLLND_MSG_TYPE_INVALID);
+        CHECK_DEFINE (PTLLND_MSG_TYPE_PUT);
+        CHECK_DEFINE (PTLLND_MSG_TYPE_GET);
+        CHECK_DEFINE (PTLLND_MSG_TYPE_IMMEDIATE);
+        CHECK_DEFINE (PTLLND_MSG_TYPE_NOOP);
+        CHECK_DEFINE (PTLLND_MSG_TYPE_HELLO);
+        CHECK_DEFINE (PTLLND_MSG_TYPE_NAK);
+
+        CHECK_STRUCT (kptl_msg_t);
+        CHECK_MEMBER (kptl_msg_t, ptlm_magic);
+        CHECK_MEMBER (kptl_msg_t, ptlm_version);
+        CHECK_MEMBER (kptl_msg_t, ptlm_type);
+        CHECK_MEMBER (kptl_msg_t, ptlm_credits);
+        CHECK_MEMBER (kptl_msg_t, ptlm_nob);
+        CHECK_MEMBER (kptl_msg_t, ptlm_cksum);
+        CHECK_MEMBER (kptl_msg_t, ptlm_srcnid);
+        CHECK_MEMBER (kptl_msg_t, ptlm_srcstamp);
+        CHECK_MEMBER (kptl_msg_t, ptlm_dstnid);
+        CHECK_MEMBER (kptl_msg_t, ptlm_dststamp);
+        CHECK_MEMBER (kptl_msg_t, ptlm_srcpid);
+        CHECK_MEMBER (kptl_msg_t, ptlm_dstpid);
+        CHECK_MEMBER (kptl_msg_t, ptlm_u.immediate);
+        CHECK_MEMBER (kptl_msg_t, ptlm_u.rdma);
+        CHECK_MEMBER (kptl_msg_t, ptlm_u.hello);
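        /* [Editor's note -- illustrative, not part of the patch.]  Each
         * CHECK_* invocation above prints one compile-time assertion into
         * the generated kptllnd_assert_wire_constants().  For example, if
         * offsetof(kptl_msg_t, ptlm_type) happened to be 6 on the build
         * machine (a hypothetical value), CHECK_MEMBER_OFFSET() would emit
         *
         *         CLASSERT ((int)offsetof(kptl_msg_t, ptlm_type) == 6);
         *
         * so a rebuild against headers that move or resize a field fails
         * at compile time instead of silently breaking the wire protocol. */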
+
+        CHECK_STRUCT (kptl_immediate_msg_t);
+        CHECK_MEMBER (kptl_immediate_msg_t, kptlim_hdr);
+        CHECK_MEMBER (kptl_immediate_msg_t, kptlim_payload[13]);
+
+        CHECK_STRUCT (kptl_rdma_msg_t);
+        CHECK_MEMBER (kptl_rdma_msg_t, kptlrm_hdr);
+        CHECK_MEMBER (kptl_rdma_msg_t, kptlrm_matchbits);
+
+        CHECK_STRUCT (kptl_hello_msg_t);
+        CHECK_MEMBER (kptl_hello_msg_t, kptlhm_matchbits);
+        CHECK_MEMBER (kptl_hello_msg_t, kptlhm_max_msg_size);
+
+        printf ("}\n\n");
+
+        return (0);
+}
diff --git a/lnet/klnds/qswlnd/Makefile.in b/lnet/klnds/qswlnd/Makefile.in
index d27240c..b623e02 100644
--- a/lnet/klnds/qswlnd/Makefile.in
+++ b/lnet/klnds/qswlnd/Makefile.in
@@ -1,5 +1,5 @@
-MODULES := kqswnal
-kqswnal-objs := qswnal.o qswnal_cb.o
+MODULES := kqswlnd
+kqswlnd-objs := qswlnd.o qswlnd_cb.o qswlnd_modparams.o
 
 EXTRA_POST_CFLAGS := @QSWCPPFLAGS@ -I/usr/include
diff --git a/lnet/klnds/qswlnd/autoMakefile.am b/lnet/klnds/qswlnd/autoMakefile.am
index 228689d..721e86f 100644
--- a/lnet/klnds/qswlnd/autoMakefile.am
+++ b/lnet/klnds/qswlnd/autoMakefile.am
@@ -4,12 +4,10 @@
 # See the file COPYING in this distribution
 
 if MODULES
-if !CRAY_PORTALS
-if BUILD_QSWNAL
-modulenet_DATA = kqswnal$(KMODEXT)
-endif
+if BUILD_QSWLND
+modulenet_DATA = kqswlnd$(KMODEXT)
 endif
 endif
 
-MOSTLYCLEANFILES := @MOSTLYCLEANFILES@
-DIST_SOURCES = $(kqswnal-objs:%.o=%.c) qswnal.h
+MOSTLYCLEANFILES = @MOSTLYCLEANFILES@
+DIST_SOURCES = $(kqswlnd-objs:%.o=%.c) qswlnd.h
diff --git a/lnet/klnds/qswlnd/qswlnd.c b/lnet/klnds/qswlnd/qswlnd.c
index be01f5d..a8ecaca 100644
--- a/lnet/klnds/qswlnd/qswlnd.c
+++ b/lnet/klnds/qswlnd/qswlnd.c
@@ -19,50 +19,29 @@
  *
  */
 
-#include "qswnal.h"
+#include "qswlnd.h"
 
-nal_t                   kqswnal_api;
-kqswnal_data_t          kqswnal_data;
-ptl_handle_ni_t         kqswnal_ni;
-kqswnal_tunables_t      kqswnal_tunables;
-
-kpr_nal_interface_t kqswnal_router_interface = {
-        kprni_nalid:    QSWNAL,
-        kprni_arg:      NULL,
-        kprni_fwd:      kqswnal_fwd_packet,
-        kprni_notify:   NULL,           /* we're connectionless */
-};
-
-#if CONFIG_SYSCTL
-#define QSWNAL_SYSCTL 201
-
-#define QSWNAL_SYSCTL_OPTIMIZED_GETS 1
-#define QSWNAL_SYSCTL_OPTIMIZED_PUTS 2
-static ctl_table kqswnal_ctl_table[] = {
-        {QSWNAL_SYSCTL_OPTIMIZED_PUTS, "optimized_puts",
-         &kqswnal_tunables.kqn_optimized_puts, sizeof (int),
-         0644, NULL, &proc_dointvec},
-        {QSWNAL_SYSCTL_OPTIMIZED_GETS, "optimized_gets",
-         &kqswnal_tunables.kqn_optimized_gets, sizeof (int),
-         0644, NULL, &proc_dointvec},
-        {0}
+lnd_t the_kqswlnd =
+{
+        .lnd_type       = QSWLND,
+        .lnd_startup    = kqswnal_startup,
+        .lnd_shutdown   = kqswnal_shutdown,
+        .lnd_ctl        = kqswnal_ctl,
+        .lnd_send       = kqswnal_send,
+        .lnd_recv       = kqswnal_recv,
 };
 
-static ctl_table kqswnal_top_ctl_table[] = {
-        {QSWNAL_SYSCTL, "qswnal", NULL, 0, 0555, kqswnal_ctl_table},
-        {0}
-};
-#endif
+kqswnal_data_t          kqswnal_data;
 
 int
-kqswnal_get_tx_desc (struct portals_cfg *pcfg)
+kqswnal_get_tx_desc (struct libcfs_ioctl_data *data)
 {
         unsigned long      flags;
         struct list_head  *tmp;
         kqswnal_tx_t      *ktx;
-        ptl_hdr_t         *hdr;
-        int                index = pcfg->pcfg_count;
+        lnet_hdr_t        *hdr;
+        int                index = data->ioc_count;
         int                rc = -ENOENT;
 
         spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
@@ -72,18 +51,15 @@ kqswnal_get_tx_desc (struct portals_cfg *pcfg)
                         continue;
 
                 ktx = list_entry (tmp, kqswnal_tx_t, ktx_list);
-                hdr = (ptl_hdr_t *)ktx->ktx_buffer;
-
-                memcpy(pcfg->pcfg_pbuf, ktx,
-                       MIN(sizeof(*ktx), pcfg->pcfg_plen1));
-                pcfg->pcfg_count = le32_to_cpu(hdr->type);
-                pcfg->pcfg_size  = le32_to_cpu(hdr->payload_length);
-                pcfg->pcfg_nid   = le64_to_cpu(hdr->dest_nid);
-                pcfg->pcfg_nid2  =
ktx->ktx_nid; - pcfg->pcfg_misc = ktx->ktx_launcher; - pcfg->pcfg_flags = (list_empty (&ktx->ktx_delayed_list) ? 0 : 1) | - (!ktx->ktx_isnblk ? 0 : 2) | - (ktx->ktx_state << 2); + hdr = (lnet_hdr_t *)ktx->ktx_buffer; + + data->ioc_count = le32_to_cpu(hdr->payload_length); + data->ioc_nid = le64_to_cpu(hdr->dest_nid); + data->ioc_u64[0] = ktx->ktx_nid; + data->ioc_u32[0] = le32_to_cpu(hdr->type); + data->ioc_u32[1] = ktx->ktx_launcher; + data->ioc_flags = (list_empty (&ktx->ktx_schedlist) ? 0 : 1) | + (ktx->ktx_state << 2); rc = 0; break; } @@ -93,44 +69,42 @@ kqswnal_get_tx_desc (struct portals_cfg *pcfg) } int -kqswnal_cmd (struct portals_cfg *pcfg, void *private) +kqswnal_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg) { - LASSERT (pcfg != NULL); - - switch (pcfg->pcfg_command) { - case NAL_CMD_GET_TXDESC: - return (kqswnal_get_tx_desc (pcfg)); - - case NAL_CMD_REGISTER_MYNID: - CDEBUG (D_IOCTL, "setting NID offset to "LPX64" (was "LPX64")\n", - pcfg->pcfg_nid - kqswnal_data.kqn_elanid, - kqswnal_data.kqn_nid_offset); - kqswnal_data.kqn_nid_offset = - pcfg->pcfg_nid - kqswnal_data.kqn_elanid; - kqswnal_lib.libnal_ni.ni_pid.nid = pcfg->pcfg_nid; - return (0); + struct libcfs_ioctl_data *data = arg; + + LASSERT (ni == kqswnal_data.kqn_ni); + + switch (cmd) { + case IOC_LIBCFS_GET_TXDESC: + return (kqswnal_get_tx_desc (data)); + + case IOC_LIBCFS_REGISTER_MYNID: + if (data->ioc_nid == ni->ni_nid) + return 0; + + LASSERT (LNET_NIDNET(data->ioc_nid) == LNET_NIDNET(ni->ni_nid)); + + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID for %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); + return 0; default: return (-EINVAL); } } -static void -kqswnal_shutdown(nal_t *nal) +void +kqswnal_shutdown(lnet_ni_t *ni) { unsigned long flags; kqswnal_tx_t *ktx; kqswnal_rx_t *krx; - int do_lib_fini = 0; - - /* NB The first ref was this module! */ - if (nal->nal_refct != 0) { - PORTAL_MODULE_UNUSE; - return; - } - + CDEBUG (D_NET, "shutdown\n"); - LASSERT (nal == &kqswnal_api); + LASSERT (ni->ni_data == &kqswnal_data); + LASSERT (ni == kqswnal_data.kqn_ni); switch (kqswnal_data.kqn_init) { @@ -138,46 +112,26 @@ kqswnal_shutdown(nal_t *nal) LASSERT (0); case KQN_INIT_ALL: - libcfs_nal_cmd_unregister(QSWNAL); - /* fall through */ - - case KQN_INIT_LIB: - do_lib_fini = 1; - /* fall through */ - case KQN_INIT_DATA: break; - - case KQN_INIT_NOTHING: - return; } /**********************************************************************/ - /* Tell router we're shutting down. Any router calls my threads - * make will now fail immediately and the router will stop calling - * into me. */ - kpr_shutdown (&kqswnal_data.kqn_router); - - /**********************************************************************/ /* Signal the start of shutdown... 
*/ spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags); kqswnal_data.kqn_shuttingdown = 1; spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags); - wake_up_all(&kqswnal_data.kqn_idletxd_waitq); - /**********************************************************************/ /* wait for sends that have allocated a tx desc to launch or give up */ while (atomic_read (&kqswnal_data.kqn_pending_txs) != 0) { CDEBUG(D_NET, "waiting for %d pending sends\n", atomic_read (&kqswnal_data.kqn_pending_txs)); - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (HZ); + cfs_pause(cfs_time_seconds(1)); } /**********************************************************************/ /* close elan comms */ -#if MULTIRAIL_EKC /* Shut down receivers first; rx callbacks might try sending... */ if (kqswnal_data.kqn_eprx_small != NULL) ep_free_rcvr (kqswnal_data.kqn_eprx_small); @@ -188,7 +142,7 @@ kqswnal_shutdown(nal_t *nal) /* NB ep_free_rcvr() returns only after we've freed off all receive * buffers (see shutdown handling in kqswnal_requeue_rx()). This * means we must have completed any messages we passed to - * lib_parse() or kpr_fwd_start(). */ + * lnet_parse() */ if (kqswnal_data.kqn_eptx != NULL) ep_free_xmtr (kqswnal_data.kqn_eptx); @@ -196,25 +150,7 @@ kqswnal_shutdown(nal_t *nal) /* NB ep_free_xmtr() returns only after all outstanding transmits * have called their callback... */ LASSERT(list_empty(&kqswnal_data.kqn_activetxds)); -#else - /* "Old" EKC just pretends to shutdown cleanly but actually - * provides no guarantees */ - if (kqswnal_data.kqn_eprx_small != NULL) - ep_remove_large_rcvr (kqswnal_data.kqn_eprx_small); - - if (kqswnal_data.kqn_eprx_large != NULL) - ep_remove_large_rcvr (kqswnal_data.kqn_eprx_large); - /* wait for transmits to complete */ - while (!list_empty(&kqswnal_data.kqn_activetxds)) { - CWARN("waiting for active transmits to complete\n"); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - } - - if (kqswnal_data.kqn_eptx != NULL) - ep_free_large_xmtr (kqswnal_data.kqn_eptx); -#endif /**********************************************************************/ /* flag threads to terminate, wake them and wait for them to die */ kqswnal_data.kqn_shuttingdown = 2; @@ -223,8 +159,7 @@ kqswnal_shutdown(nal_t *nal) while (atomic_read (&kqswnal_data.kqn_nthreads) != 0) { CDEBUG(D_NET, "waiting for %d threads to terminate\n", atomic_read (&kqswnal_data.kqn_nthreads)); - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (HZ); + cfs_pause(cfs_time_seconds(1)); } /**********************************************************************/ @@ -232,37 +167,14 @@ kqswnal_shutdown(nal_t *nal) * I control the horizontals and the verticals... 
*/ -#if MULTIRAIL_EKC LASSERT (list_empty (&kqswnal_data.kqn_readyrxds)); + LASSERT (list_empty (&kqswnal_data.kqn_donetxds)); LASSERT (list_empty (&kqswnal_data.kqn_delayedtxds)); - LASSERT (list_empty (&kqswnal_data.kqn_delayedfwds)); -#endif - - /**********************************************************************/ - /* Complete any blocked forwarding packets, with error - */ - - while (!list_empty (&kqswnal_data.kqn_idletxd_fwdq)) - { - kpr_fwd_desc_t *fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next, - kpr_fwd_desc_t, kprfd_list); - list_del (&fwd->kprfd_list); - kpr_fwd_done (&kqswnal_data.kqn_router, fwd, -ESHUTDOWN); - } - - /**********************************************************************/ - /* finalise router and portals lib */ - - kpr_deregister (&kqswnal_data.kqn_router); - - if (do_lib_fini) - lib_fini (&kqswnal_lib); /**********************************************************************/ /* Unmap message buffers and free all descriptors and buffers */ -#if MULTIRAIL_EKC /* FTTB, we need to unmap any remaining mapped memory. When * ep_dvma_release() get fixed (and releases any mappings in the * region), we can delete all the code from here --------> */ @@ -294,38 +206,15 @@ kqswnal_shutdown(nal_t *nal) if (kqswnal_data.kqn_ep_tx_nmh != NULL) ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_tx_nmh); -#else - if (kqswnal_data.kqn_eprxdmahandle != NULL) - { - elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eprxdmahandle, 0, - KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL + - KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE); - - elan3_dma_release(kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eprxdmahandle); - } - - if (kqswnal_data.kqn_eptxdmahandle != NULL) - { - elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eptxdmahandle, 0, - KQSW_NTXMSGPAGES * (KQSW_NTXMSGS + - KQSW_NNBLK_TXMSGS)); - - elan3_dma_release(kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eptxdmahandle); - } -#endif while (kqswnal_data.kqn_txds != NULL) { ktx = kqswnal_data.kqn_txds; if (ktx->ktx_buffer != NULL) - PORTAL_FREE(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); + LIBCFS_FREE(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); kqswnal_data.kqn_txds = ktx->ktx_alloclist; - PORTAL_FREE(ktx, sizeof(*ktx)); + LIBCFS_FREE(ktx, sizeof(*ktx)); } while (kqswnal_data.kqn_rxds != NULL) { @@ -337,106 +226,96 @@ kqswnal_shutdown(nal_t *nal) __free_page (krx->krx_kiov[i].kiov_page); kqswnal_data.kqn_rxds = krx->krx_alloclist; - PORTAL_FREE(krx, sizeof (*krx)); + LIBCFS_FREE(krx, sizeof (*krx)); } /* resets flags, pointers to NULL etc */ memset(&kqswnal_data, 0, sizeof (kqswnal_data)); - CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory)); + CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&libcfs_kmemory)); - printk (KERN_INFO "Lustre: Routing QSW NAL unloaded (final mem %d)\n", - atomic_read(&portal_kmemory)); + PORTAL_MODULE_UNUSE; } -static int -kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) +int +kqswnal_startup (lnet_ni_t *ni) { -#if MULTIRAIL_EKC EP_RAILMASK all_rails = EP_RAILMASK_ALL; -#else - ELAN3_DMA_REQUEST dmareq; -#endif int rc; int i; kqswnal_rx_t *krx; kqswnal_tx_t *ktx; int elan_page_idx; - ptl_process_id_t my_process_id; - int pkmem = atomic_read(&portal_kmemory); - LASSERT (nal == &kqswnal_api); + LASSERT (ni->ni_lnd == &the_kqswlnd); - if (nal->nal_refct != 0) { - if (actual_limits != NULL) - *actual_limits = kqswnal_lib.libnal_ni.ni_actual_limits; - /* This module 
got the first ref */ - PORTAL_MODULE_USE; - return (PTL_OK); +#if KQSW_CKSUM + if (the_lnet.ln_ptlcompat != 0) { + CERROR("Checksumming version not portals compatible\n"); + return -ENODEV; } - - LASSERT (kqswnal_data.kqn_init == KQN_INIT_NOTHING); - - CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&portal_kmemory)); - +#endif + /* Only 1 instance supported */ + if (kqswnal_data.kqn_init != KQN_INIT_NOTHING) { + CERROR ("Only 1 instance supported\n"); + return -EPERM; + } + + if (ni->ni_interfaces[0] != NULL) { + CERROR("Explicit interface config not supported\n"); + return -EPERM; + } + + if (*kqswnal_tunables.kqn_credits >= + *kqswnal_tunables.kqn_ntxmsgs) { + LCONSOLE_ERROR("Configuration error: please set " + "ntxmsgs(%d) > credits(%d)\n", + *kqswnal_tunables.kqn_ntxmsgs, + *kqswnal_tunables.kqn_credits); + } + + CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&libcfs_kmemory)); + /* ensure all pointers NULL etc */ memset (&kqswnal_data, 0, sizeof (kqswnal_data)); + kqswnal_data.kqn_ni = ni; + ni->ni_data = &kqswnal_data; + ni->ni_peertxcredits = *kqswnal_tunables.kqn_peercredits; + ni->ni_maxtxcredits = *kqswnal_tunables.kqn_credits; + INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds); - INIT_LIST_HEAD (&kqswnal_data.kqn_nblk_idletxds); INIT_LIST_HEAD (&kqswnal_data.kqn_activetxds); spin_lock_init (&kqswnal_data.kqn_idletxd_lock); - init_waitqueue_head (&kqswnal_data.kqn_idletxd_waitq); - INIT_LIST_HEAD (&kqswnal_data.kqn_idletxd_fwdq); - INIT_LIST_HEAD (&kqswnal_data.kqn_delayedfwds); INIT_LIST_HEAD (&kqswnal_data.kqn_delayedtxds); + INIT_LIST_HEAD (&kqswnal_data.kqn_donetxds); INIT_LIST_HEAD (&kqswnal_data.kqn_readyrxds); spin_lock_init (&kqswnal_data.kqn_sched_lock); init_waitqueue_head (&kqswnal_data.kqn_sched_waitq); - /* Leave kqn_rpc_success zeroed */ -#if MULTIRAIL_EKC - kqswnal_data.kqn_rpc_failed.Data[0] = -ECONNREFUSED; -#else - kqswnal_data.kqn_rpc_failed.Status = -ECONNREFUSED; -#endif - /* pointers/lists/locks initialised */ kqswnal_data.kqn_init = KQN_INIT_DATA; + PORTAL_MODULE_USE; -#if MULTIRAIL_EKC kqswnal_data.kqn_ep = ep_system(); if (kqswnal_data.kqn_ep == NULL) { CERROR("Can't initialise EKC\n"); - kqswnal_shutdown(nal); - return (PTL_IFACE_INVALID); + kqswnal_shutdown(ni); + return (-ENODEV); } if (ep_waitfor_nodeid(kqswnal_data.kqn_ep) == ELAN_INVALID_NODE) { CERROR("Can't get elan ID\n"); - kqswnal_shutdown(nal); - return (PTL_IFACE_INVALID); + kqswnal_shutdown(ni); + return (-ENODEV); } -#else - /**********************************************************************/ - /* Find the first Elan device */ - kqswnal_data.kqn_ep = ep_device (0); - if (kqswnal_data.kqn_ep == NULL) - { - CERROR ("Can't get elan device 0\n"); - kqswnal_shutdown(nal); - return (PTL_IFACE_INVALID); - } -#endif + kqswnal_data.kqn_nnodes = ep_numnodes (kqswnal_data.kqn_ep); + kqswnal_data.kqn_elanid = ep_nodeid (kqswnal_data.kqn_ep); - kqswnal_data.kqn_nid_offset = 0; - kqswnal_data.kqn_nnodes = ep_numnodes (kqswnal_data.kqn_ep); - kqswnal_data.kqn_elanid = ep_nodeid (kqswnal_data.kqn_ep); + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), kqswnal_data.kqn_elanid); /**********************************************************************/ /* Get the transmitter */ @@ -445,172 +324,129 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (kqswnal_data.kqn_eptx == NULL) { CERROR ("Can't allocate transmitter\n"); - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); + kqswnal_shutdown (ni); + return (-ENOMEM); } /**********************************************************************/ /* Get 
the receivers */ - kqswnal_data.kqn_eprx_small = ep_alloc_rcvr (kqswnal_data.kqn_ep, - EP_MSG_SVC_PORTALS_SMALL, - KQSW_EP_ENVELOPES_SMALL); + kqswnal_data.kqn_eprx_small = + ep_alloc_rcvr (kqswnal_data.kqn_ep, + EP_MSG_SVC_PORTALS_SMALL, + *kqswnal_tunables.kqn_ep_envelopes_small); if (kqswnal_data.kqn_eprx_small == NULL) { CERROR ("Can't install small msg receiver\n"); - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); + kqswnal_shutdown (ni); + return (-ENOMEM); } - kqswnal_data.kqn_eprx_large = ep_alloc_rcvr (kqswnal_data.kqn_ep, - EP_MSG_SVC_PORTALS_LARGE, - KQSW_EP_ENVELOPES_LARGE); + kqswnal_data.kqn_eprx_large = + ep_alloc_rcvr (kqswnal_data.kqn_ep, + EP_MSG_SVC_PORTALS_LARGE, + *kqswnal_tunables.kqn_ep_envelopes_large); if (kqswnal_data.kqn_eprx_large == NULL) { CERROR ("Can't install large msg receiver\n"); - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); + kqswnal_shutdown (ni); + return (-ENOMEM); } /**********************************************************************/ /* Reserve Elan address space for transmit descriptors NB we may * either send the contents of associated buffers immediately, or * map them for the peer to suck/blow... */ -#if MULTIRAIL_EKC kqswnal_data.kqn_ep_tx_nmh = ep_dvma_reserve(kqswnal_data.kqn_ep, - KQSW_NTXMSGPAGES*(KQSW_NTXMSGS+KQSW_NNBLK_TXMSGS), + KQSW_NTXMSGPAGES*(*kqswnal_tunables.kqn_ntxmsgs), EP_PERM_WRITE); if (kqswnal_data.kqn_ep_tx_nmh == NULL) { CERROR("Can't reserve tx dma space\n"); - kqswnal_shutdown(nal); - return (PTL_NO_SPACE); - } -#else - dmareq.Waitfn = DDI_DMA_SLEEP; - dmareq.ElanAddr = (E3_Addr) 0; - dmareq.Attr = PTE_LOAD_LITTLE_ENDIAN; - dmareq.Perm = ELAN_PERM_REMOTEWRITE; - - rc = elan3_dma_reserve(kqswnal_data.kqn_ep->DmaState, - KQSW_NTXMSGPAGES*(KQSW_NTXMSGS+KQSW_NNBLK_TXMSGS), - &dmareq, &kqswnal_data.kqn_eptxdmahandle); - if (rc != DDI_SUCCESS) - { - CERROR ("Can't reserve rx dma space\n"); - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); + kqswnal_shutdown(ni); + return (-ENOMEM); } -#endif + /**********************************************************************/ /* Reserve Elan address space for receive buffers */ -#if MULTIRAIL_EKC kqswnal_data.kqn_ep_rx_nmh = ep_dvma_reserve(kqswnal_data.kqn_ep, - KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL + - KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE, + KQSW_NRXMSGPAGES_SMALL * + (*kqswnal_tunables.kqn_nrxmsgs_small) + + KQSW_NRXMSGPAGES_LARGE * + (*kqswnal_tunables.kqn_nrxmsgs_large), EP_PERM_WRITE); if (kqswnal_data.kqn_ep_tx_nmh == NULL) { CERROR("Can't reserve rx dma space\n"); - kqswnal_shutdown(nal); - return (PTL_NO_SPACE); - } -#else - dmareq.Waitfn = DDI_DMA_SLEEP; - dmareq.ElanAddr = (E3_Addr) 0; - dmareq.Attr = PTE_LOAD_LITTLE_ENDIAN; - dmareq.Perm = ELAN_PERM_REMOTEWRITE; - - rc = elan3_dma_reserve (kqswnal_data.kqn_ep->DmaState, - KQSW_NRXMSGPAGES_SMALL * KQSW_NRXMSGS_SMALL + - KQSW_NRXMSGPAGES_LARGE * KQSW_NRXMSGS_LARGE, - &dmareq, &kqswnal_data.kqn_eprxdmahandle); - if (rc != DDI_SUCCESS) - { - CERROR ("Can't reserve rx dma space\n"); - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); + kqswnal_shutdown(ni); + return (-ENOMEM); } -#endif + /**********************************************************************/ /* Allocate/Initialise transmit descriptors */ kqswnal_data.kqn_txds = NULL; - for (i = 0; i < (KQSW_NTXMSGS + KQSW_NNBLK_TXMSGS); i++) + for (i = 0; i < (*kqswnal_tunables.kqn_ntxmsgs); i++) { int premapped_pages; int basepage = i * KQSW_NTXMSGPAGES; - PORTAL_ALLOC (ktx, sizeof(*ktx)); + LIBCFS_ALLOC (ktx, sizeof(*ktx)); if (ktx == NULL) { - 
kqswnal_shutdown (nal); - return (PTL_NO_SPACE); + kqswnal_shutdown (ni); + return (-ENOMEM); } memset(ktx, 0, sizeof(*ktx)); /* NULL pointers; zero flags */ ktx->ktx_alloclist = kqswnal_data.kqn_txds; kqswnal_data.kqn_txds = ktx; - PORTAL_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); + LIBCFS_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); if (ktx->ktx_buffer == NULL) { - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); + kqswnal_shutdown (ni); + return (-ENOMEM); } /* Map pre-allocated buffer NOW, to save latency on transmit */ premapped_pages = kqswnal_pages_spanned(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE); -#if MULTIRAIL_EKC ep_dvma_load(kqswnal_data.kqn_ep, NULL, ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE, kqswnal_data.kqn_ep_tx_nmh, basepage, &all_rails, &ktx->ktx_ebuffer); -#else - elan3_dvma_kaddr_load (kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eptxdmahandle, - ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE, - basepage, &ktx->ktx_ebuffer); -#endif + ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */ ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */ - INIT_LIST_HEAD (&ktx->ktx_delayed_list); + INIT_LIST_HEAD (&ktx->ktx_schedlist); ktx->ktx_state = KTX_IDLE; -#if MULTIRAIL_EKC ktx->ktx_rail = -1; /* unset rail */ -#endif - ktx->ktx_isnblk = (i >= KQSW_NTXMSGS); - list_add_tail (&ktx->ktx_list, - ktx->ktx_isnblk ? &kqswnal_data.kqn_nblk_idletxds : - &kqswnal_data.kqn_idletxds); + + list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_idletxds); } /**********************************************************************/ /* Allocate/Initialise receive descriptors */ kqswnal_data.kqn_rxds = NULL; elan_page_idx = 0; - for (i = 0; i < KQSW_NRXMSGS_SMALL + KQSW_NRXMSGS_LARGE; i++) + for (i = 0; i < *kqswnal_tunables.kqn_nrxmsgs_small + *kqswnal_tunables.kqn_nrxmsgs_large; i++) { -#if MULTIRAIL_EKC EP_NMD elanbuffer; -#else - E3_Addr elanbuffer; -#endif int j; - PORTAL_ALLOC(krx, sizeof(*krx)); + LIBCFS_ALLOC(krx, sizeof(*krx)); if (krx == NULL) { - kqswnal_shutdown(nal); - return (PTL_NO_SPACE); + kqswnal_shutdown(ni); + return (-ENOMEM); } memset(krx, 0, sizeof(*krx)); /* clear flags, null pointers etc */ krx->krx_alloclist = kqswnal_data.kqn_rxds; kqswnal_data.kqn_rxds = krx; - if (i < KQSW_NRXMSGS_SMALL) + if (i < *kqswnal_tunables.kqn_nrxmsgs_small) { krx->krx_npages = KQSW_NRXMSGPAGES_SMALL; krx->krx_eprx = kqswnal_data.kqn_eprx_small; @@ -627,14 +463,15 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, struct page *page = alloc_page(GFP_KERNEL); if (page == NULL) { - kqswnal_shutdown (nal); - return (PTL_NO_SPACE); + kqswnal_shutdown (ni); + return (-ENOMEM); } - krx->krx_kiov[j].kiov_page = page; + krx->krx_kiov[j] = (lnet_kiov_t) {.kiov_page = page, + .kiov_offset = 0, + .kiov_len = PAGE_SIZE}; LASSERT(page_address(page) != NULL); -#if MULTIRAIL_EKC ep_dvma_load(kqswnal_data.kqn_ep, NULL, page_address(page), PAGE_SIZE, kqswnal_data.kqn_ep_rx_nmh, @@ -649,42 +486,13 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, /* NB contiguous mapping */ LASSERT(rc); } -#else - elan3_dvma_kaddr_load(kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eprxdmahandle, - page_address(page), - PAGE_SIZE, elan_page_idx, - &elanbuffer); - if (j == 0) - krx->krx_elanbuffer = elanbuffer; - - /* NB contiguous mapping */ - LASSERT (elanbuffer == krx->krx_elanbuffer + j * PAGE_SIZE); -#endif elan_page_idx++; } } LASSERT (elan_page_idx == - (KQSW_NRXMSGS_SMALL * KQSW_NRXMSGPAGES_SMALL) + - (KQSW_NRXMSGS_LARGE * KQSW_NRXMSGPAGES_LARGE)); - - 
/**********************************************************************/ - /* Network interface ready to initialise */ - - my_process_id.nid = kqswnal_elanid2nid(kqswnal_data.kqn_elanid); - my_process_id.pid = requested_pid; - - rc = lib_init(&kqswnal_lib, nal, my_process_id, - requested_limits, actual_limits); - if (rc != PTL_OK) - { - CERROR ("lib_init failed %d\n", rc); - kqswnal_shutdown (nal); - return (rc); - } - - kqswnal_data.kqn_init = KQN_INIT_LIB; + (*kqswnal_tunables.kqn_nrxmsgs_small * KQSW_NRXMSGPAGES_SMALL) + + (*kqswnal_tunables.kqn_nrxmsgs_large * KQSW_NRXMSGPAGES_LARGE)); /**********************************************************************/ /* Queue receives, now that it's OK to run their completion callbacks */ @@ -692,19 +500,12 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) { /* NB this enqueue can allocate/sleep (attr == 0) */ krx->krx_state = KRX_POSTED; -#if MULTIRAIL_EKC rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx, &krx->krx_elanbuffer, 0); -#else - rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx, - krx->krx_elanbuffer, - krx->krx_npages * PAGE_SIZE, 0); -#endif - if (rc != EP_SUCCESS) - { + if (rc != EP_SUCCESS) { CERROR ("failed ep_queue_receive %d\n", rc); - kqswnal_shutdown (nal); - return (PTL_FAIL); + kqswnal_shutdown (ni); + return (-EIO); } } @@ -715,83 +516,36 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, if (rc != 0) { CERROR ("failed to spawn scheduling thread: %d\n", rc); - kqswnal_shutdown (nal); - return (PTL_FAIL); + kqswnal_shutdown (ni); + return (-ESRCH); } } - /**********************************************************************/ - /* Connect to the router */ - rc = kpr_register (&kqswnal_data.kqn_router, &kqswnal_router_interface); - CDEBUG(D_NET, "Can't initialise routing interface (rc = %d): not routing\n",rc); - - rc = libcfs_nal_cmd_register (QSWNAL, &kqswnal_cmd, NULL); - if (rc != 0) { - CERROR ("Can't initialise command interface (rc = %d)\n", rc); - kqswnal_shutdown (nal); - return (PTL_FAIL); - } - kqswnal_data.kqn_init = KQN_INIT_ALL; - - printk(KERN_INFO "Lustre: Routing QSW NAL loaded on node %d of %d " - "(Routing %s, initial mem %d)\n", - kqswnal_data.kqn_elanid, kqswnal_data.kqn_nnodes, - kpr_routing (&kqswnal_data.kqn_router) ? "enabled" : "disabled", - pkmem); - - return (PTL_OK); + return (0); } void __exit kqswnal_finalise (void) { -#if CONFIG_SYSCTL - if (kqswnal_tunables.kqn_sysctl != NULL) - unregister_sysctl_table (kqswnal_tunables.kqn_sysctl); -#endif - PtlNIFini(kqswnal_ni); - - ptl_unregister_nal(QSWNAL); + lnet_unregister_lnd(&the_kqswlnd); + kqswnal_tunables_fini(); } static int __init kqswnal_initialise (void) { - int rc; - - kqswnal_api.nal_ni_init = kqswnal_startup; - kqswnal_api.nal_ni_fini = kqswnal_shutdown; - - /* Initialise dynamic tunables to defaults once only */ - kqswnal_tunables.kqn_optimized_puts = KQSW_OPTIMIZED_PUTS; - kqswnal_tunables.kqn_optimized_gets = KQSW_OPTIMIZED_GETS; + int rc = kqswnal_tunables_init(); - rc = ptl_register_nal(QSWNAL, &kqswnal_api); - if (rc != PTL_OK) { - CERROR("Can't register QSWNAL: %d\n", rc); - return (-ENOMEM); /* or something... */ - } - - /* Pure gateways, and the workaround for 'EKC blocks forever until - * the service is active' want the NAL started up at module load - * time... 
*/ - rc = PtlNIInit(QSWNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kqswnal_ni); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - ptl_unregister_nal(QSWNAL); - return (-ENODEV); - } + if (rc != 0) + return rc; -#if CONFIG_SYSCTL - /* Press on regardless even if registering sysctl doesn't work */ - kqswnal_tunables.kqn_sysctl = - register_sysctl_table (kqswnal_top_ctl_table, 0); -#endif + lnet_register_lnd(&the_kqswlnd); return (0); } MODULE_AUTHOR("Cluster File Systems, Inc. "); -MODULE_DESCRIPTION("Kernel Quadrics/Elan NAL v1.01"); +MODULE_DESCRIPTION("Kernel Quadrics/Elan LND v1.01"); MODULE_LICENSE("GPL"); module_init (kqswnal_initialise); diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h index c138be4..0fe2a5e 100644 --- a/lnet/klnds/qswlnd/qswlnd.h +++ b/lnet/klnds/qswlnd/qswlnd.h @@ -34,19 +34,7 @@ #include #include -#if MULTIRAIL_EKC -# include -#else -# include -# include -# include -# include -# include -# include -# include -# include -# include -#endif +#include #include #include @@ -72,77 +60,110 @@ #include #include -#define DEBUG_SUBSYSTEM S_NAL +#define DEBUG_SUBSYSTEM S_LND #include -#include -#include -#include -#include - -#define KQSW_CHECKSUM 0 -#if KQSW_CHECKSUM -typedef unsigned long kqsw_csum_t; -#define KQSW_CSUM_SIZE (2 * sizeof (kqsw_csum_t)) -#else -#define KQSW_CSUM_SIZE 0 -#endif -#define KQSW_HDR_SIZE (sizeof (ptl_hdr_t) + KQSW_CSUM_SIZE) - -/* - * Performance Tuning defines - * NB no mention of PAGE_SIZE for interoperability - */ -#define KQSW_MAXPAYLOAD PTL_MTU -#define KQSW_SMALLPAYLOAD ((4<<10) - KQSW_HDR_SIZE) /* small/large ep receiver breakpoint */ - -#define KQSW_TX_MAXCONTIG (1<<10) /* largest payload that gets made contiguous on transmit */ - -#define KQSW_NTXMSGS 8 /* # normal transmit messages */ -#define KQSW_NNBLK_TXMSGS (PAGE_SIZE == 4096 ? 512 : 256) /* # reserved transmit messages if can't block */ /* avoid qsnet crash b=5291 */ - -#define KQSW_NRXMSGS_LARGE 64 /* # large receive buffers */ -#define KQSW_EP_ENVELOPES_LARGE 256 /* # large ep envelopes */ - -#define KQSW_NRXMSGS_SMALL 256 /* # small receive buffers */ -#define KQSW_EP_ENVELOPES_SMALL 2048 /* # small ep envelopes */ +#include +#include +/* fixed constants */ +#define KQSW_SMALLMSG (4<<10) /* small/large ep receiver breakpoint */ #define KQSW_RESCHED 100 /* # busy loops that forces scheduler to yield */ -#define KQSW_OPTIMIZED_GETS 1 /* optimize gets >= this size */ -#define KQSW_OPTIMIZED_PUTS (32<<10) /* optimize puts >= this size */ -#define KQSW_COPY_SMALL_FWD 0 /* copy small fwd messages to pre-mapped buffer? */ +#define KQSW_CKSUM 0 /* enable checksumming (protocol incompatible) */ /* * derived constants */ -#define KQSW_TX_BUFFER_SIZE (KQSW_HDR_SIZE + KQSW_TX_MAXCONTIG) +#define KQSW_TX_BUFFER_SIZE (offsetof(kqswnal_msg_t, \ + kqm_u.immediate.kqim_payload[*kqswnal_tunables.kqn_tx_maxcontig])) /* The pre-allocated tx buffer (hdr + small payload) */ -#define KQSW_NTXMSGPAGES (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(KQSW_MAXPAYLOAD) + 1) +#define KQSW_NTXMSGPAGES (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(LNET_MAX_PAYLOAD) + 1) /* Reserve elan address space for pre-allocated and pre-mapped transmit * buffer and a full payload too. 
Extra pages allow for page alignment */ -#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_HDR_SIZE + KQSW_SMALLPAYLOAD)) +#define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_SMALLMSG)) /* receive hdr/payload always contiguous and page aligned */ #define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE) -#define KQSW_NRXMSGPAGES_LARGE (btopr(KQSW_HDR_SIZE + KQSW_MAXPAYLOAD)) +#define KQSW_NRXMSGPAGES_LARGE (btopr(sizeof(lnet_msg_t) + LNET_MAX_PAYLOAD)) /* receive hdr/payload always contiguous and page aligned */ #define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE) /* biggest complete packet we can receive (or transmit) */ +/* Wire messages */ /* Remote memory descriptor */ typedef struct { __u32 kqrmd_nfrag; /* # frags */ -#if MULTIRAIL_EKC EP_NMD kqrmd_frag[0]; /* actual frags */ +} kqswnal_remotemd_t; + +/* Immediate data */ +typedef struct +{ + lnet_hdr_t kqim_hdr; /* LNET header */ + char kqim_payload[0]; /* piggy-backed payload */ +} WIRE_ATTR kqswnal_immediate_msg_t; + +/* RDMA request */ +typedef struct +{ + lnet_hdr_t kqrm_hdr; /* LNET header */ + kqswnal_remotemd_t kqrm_rmd; /* peer's buffer */ +} WIRE_ATTR kqswnal_rdma_msg_t; + +typedef struct +{ + __u32 kqm_magic; /* I'm a qswlnd message */ + __u16 kqm_version; /* this is my version number */ + __u16 kqm_type; /* msg type */ +#if KQSW_CKSUM + __u32 kqm_cksum; /* crc32 checksum */ + __u32 kqm_nob; /* original msg length */ +#endif + union { + kqswnal_immediate_msg_t immediate; + kqswnal_rdma_msg_t rdma; + } WIRE_ATTR kqm_u; +} WIRE_ATTR kqswnal_msg_t; + +#if KQSW_CKSUM /* enable checksums ? */ +# include +static inline __u32 kqswnal_csum(__u32 crc, unsigned char const *p, size_t len) +{ +#if 1 + return crc32_le(crc, p, len); #else - EP_IOVEC kqrmd_frag[0]; /* actual frags */ + while (len-- > 0) + crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ; + return crc; #endif -} kqswnal_remotemd_t; +} +# define QSWLND_PROTO_VERSION 0xbeef +#else +# define QSWLND_PROTO_VERSION 1 +#endif + +#define QSWLND_MSG_IMMEDIATE 0 +#define QSWLND_MSG_RDMA 1 + +typedef union { + EP_STATUSBLK ep_statusblk; + struct { + __u32 status; + __u32 magic; + __u32 version; + union { + struct { + __u32 len; + __u32 cksum; + } WIRE_ATTR get; + } WIRE_ATTR u; + } WIRE_ATTR msg; +} kqswnal_rpc_reply_t; typedef struct kqswnal_rx { @@ -150,19 +171,18 @@ typedef struct kqswnal_rx struct kqswnal_rx *krx_alloclist; /* stack in kqn_rxds */ EP_RCVR *krx_eprx; /* port to post receives to */ EP_RXD *krx_rxd; /* receive descriptor (for repost) */ -#if MULTIRAIL_EKC EP_NMD krx_elanbuffer; /* contiguous Elan buffer */ -#else - E3_Addr krx_elanbuffer; /* contiguous Elan buffer */ -#endif int krx_npages; /* # pages in receive buffer */ int krx_nob; /* Number Of Bytes received into buffer */ - int krx_rpc_reply_needed; /* peer waiting for EKC RPC reply */ - int krx_rpc_reply_status; /* what status to send */ + int krx_rpc_reply_needed:1; /* peer waiting for EKC RPC reply */ + int krx_raw_lnet_hdr:1; /* msg is a raw lnet hdr (portals compatible) */ int krx_state; /* what this RX is doing */ atomic_t krx_refcount; /* how to tell when rpc is done */ - kpr_fwd_desc_t krx_fwd; /* embedded forwarding descriptor */ - ptl_kiov_t krx_kiov[KQSW_NRXMSGPAGES_LARGE]; /* buffer frags */ +#if KQSW_CKSUM + __u32 krx_cksum; /* checksum */ +#endif + kqswnal_rpc_reply_t krx_rpc_reply; /* rpc reply status block */ + lnet_kiov_t krx_kiov[KQSW_NRXMSGPAGES_LARGE]; /* buffer frags */ } kqswnal_rx_t; #define KRX_POSTED 1 /* receiving */ @@ -173,48 +193,56 @@ typedef struct kqswnal_rx 
typedef struct kqswnal_tx { struct list_head ktx_list; /* enqueue idle/active */ - struct list_head ktx_delayed_list; /* enqueue delayedtxds */ + struct list_head ktx_schedlist; /* enqueue on scheduler */ struct kqswnal_tx *ktx_alloclist; /* stack in kqn_txds */ - unsigned int ktx_isnblk:1; /* reserved descriptor? */ unsigned int ktx_state:7; /* What I'm doing */ unsigned int ktx_firsttmpfrag:1; /* ktx_frags[0] is in my ebuffer ? 0 : 1 */ uint32_t ktx_basepage; /* page offset in reserved elan tx vaddrs for mapping pages */ int ktx_npages; /* pages reserved for mapping messages */ int ktx_nmappedpages; /* # pages mapped for current message */ int ktx_port; /* destination ep port */ - ptl_nid_t ktx_nid; /* destination node */ + lnet_nid_t ktx_nid; /* destination node */ void *ktx_args[3]; /* completion passthru */ char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */ unsigned long ktx_launchtime; /* when (in jiffies) the transmit was launched */ - + int ktx_status; /* completion status */ +#if KQSW_CKSUM + __u32 ktx_cksum; /* optimized GET payload checksum */ +#endif /* debug/info fields */ pid_t ktx_launcher; /* pid of launching process */ int ktx_nfrag; /* # message frags */ -#if MULTIRAIL_EKC int ktx_rail; /* preferred rail */ EP_NMD ktx_ebuffer; /* elan mapping of ktx_buffer */ EP_NMD ktx_frags[EP_MAXFRAG];/* elan mapping of msg frags */ -#else - E3_Addr ktx_ebuffer; /* elan address of ktx_buffer */ - EP_IOVEC ktx_frags[EP_MAXFRAG];/* msg frags (elan vaddrs) */ -#endif } kqswnal_tx_t; -#define KTX_IDLE 0 /* on kqn_(nblk_)idletxds */ -#define KTX_FORWARDING 1 /* sending a forwarded packet */ -#define KTX_SENDING 2 /* normal send */ -#define KTX_GETTING 3 /* sending optimised get */ -#define KTX_PUTTING 4 /* sending optimised put */ -#define KTX_RDMAING 5 /* handling optimised put/get */ +#define KTX_IDLE 0 /* on kqn_idletxds */ +#define KTX_SENDING 1 /* normal send */ +#define KTX_GETTING 2 /* sending optimised get */ +#define KTX_PUTTING 3 /* sending optimised put */ +#define KTX_RDMA_FETCH 4 /* handling optimised put */ +#define KTX_RDMA_STORE 5 /* handling optimised get */ typedef struct { - /* dynamic tunables... */ - int kqn_optimized_puts; /* optimized PUTs? */ - int kqn_optimized_gets; /* optimized GETs? */ -#if CONFIG_SYSCTL - struct ctl_table_header *kqn_sysctl; /* sysctl interface */ + int *kqn_tx_maxcontig; /* maximum payload to defrag */ + int *kqn_ntxmsgs; /* # normal tx msgs */ + int *kqn_credits; /* # concurrent sends */ + int *kqn_peercredits; /* # concurrent sends to 1 peer */ + int *kqn_nrxmsgs_large; /* # 'large' rx msgs */ + int *kqn_ep_envelopes_large; /* # 'large' rx ep envelopes */ + int *kqn_nrxmsgs_small; /* # 'small' rx msgs */ + int *kqn_ep_envelopes_small; /* # 'small' rx ep envelopes */ + int *kqn_optimized_puts; /* optimized PUTs? */ + int *kqn_optimized_gets; /* optimized GETs? 
*/ +#if KQSW_CKSUM + int *kqn_inject_csum_error; /* # csum errors to inject */ +#endif + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + struct ctl_table_header *kqn_sysctl; /* sysctl interface */ #endif } kqswnal_tunables_t; @@ -223,82 +251,68 @@ typedef struct char kqn_init; /* what's been initialised */ char kqn_shuttingdown; /* I'm trying to shut down */ atomic_t kqn_nthreads; /* # threads running */ + lnet_ni_t *kqn_ni; /* _the_ instance of me */ kqswnal_rx_t *kqn_rxds; /* stack of all the receive descriptors */ kqswnal_tx_t *kqn_txds; /* stack of all the transmit descriptors */ struct list_head kqn_idletxds; /* transmit descriptors free to use */ - struct list_head kqn_nblk_idletxds; /* reserved free transmit descriptors */ struct list_head kqn_activetxds; /* transmit descriptors being used */ spinlock_t kqn_idletxd_lock; /* serialise idle txd access */ - wait_queue_head_t kqn_idletxd_waitq; /* sender blocks here waiting for idle txd */ - struct list_head kqn_idletxd_fwdq; /* forwarded packets block here waiting for idle txd */ atomic_t kqn_pending_txs; /* # transmits being prepped */ spinlock_t kqn_sched_lock; /* serialise packet schedulers */ wait_queue_head_t kqn_sched_waitq; /* scheduler blocks here */ struct list_head kqn_readyrxds; /* rxds full of data */ - struct list_head kqn_delayedfwds; /* delayed forwards */ + struct list_head kqn_donetxds; /* completed transmits */ struct list_head kqn_delayedtxds; /* delayed transmits */ -#if MULTIRAIL_EKC EP_SYS *kqn_ep; /* elan system */ EP_NMH *kqn_ep_tx_nmh; /* elan reserved tx vaddrs */ EP_NMH *kqn_ep_rx_nmh; /* elan reserved rx vaddrs */ -#else - EP_DEV *kqn_ep; /* elan device */ - ELAN3_DMA_HANDLE *kqn_eptxdmahandle; /* elan reserved tx vaddrs */ - ELAN3_DMA_HANDLE *kqn_eprxdmahandle; /* elan reserved rx vaddrs */ -#endif EP_XMTR *kqn_eptx; /* elan transmitter */ EP_RCVR *kqn_eprx_small; /* elan receiver (small messages) */ EP_RCVR *kqn_eprx_large; /* elan receiver (large messages) */ - kpr_router_t kqn_router; /* connection to Kernel Portals Router module */ - ptl_nid_t kqn_nid_offset; /* this cluster's NID offset */ int kqn_nnodes; /* this cluster's size */ int kqn_elanid; /* this nodes's elan ID */ EP_STATUSBLK kqn_rpc_success; /* preset RPC reply status blocks */ EP_STATUSBLK kqn_rpc_failed; + EP_STATUSBLK kqn_rpc_version; /* reply to future version query */ + EP_STATUSBLK kqn_rpc_magic; /* reply to future version query */ } kqswnal_data_t; /* kqn_init state */ #define KQN_INIT_NOTHING 0 /* MUST BE ZERO so zeroed state is initialised OK */ #define KQN_INIT_DATA 1 -#define KQN_INIT_LIB 2 -#define KQN_INIT_ALL 3 +#define KQN_INIT_ALL 2 -extern lib_nal_t kqswnal_lib; -extern nal_t kqswnal_api; extern kqswnal_tunables_t kqswnal_tunables; extern kqswnal_data_t kqswnal_data; extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg); extern void kqswnal_rxhandler(EP_RXD *rxd); extern int kqswnal_scheduler (void *); -extern void kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); extern void kqswnal_rx_done (kqswnal_rx_t *krx); -static inline ptl_nid_t +static inline lnet_nid_t kqswnal_elanid2nid (int elanid) { - return (kqswnal_data.kqn_nid_offset + elanid); + return LNET_MKNID(LNET_NIDNET(kqswnal_data.kqn_ni->ni_nid), elanid); } static inline int -kqswnal_nid2elanid (ptl_nid_t nid) +kqswnal_nid2elanid (lnet_nid_t nid) { - /* not in this cluster? 
*/ - if (nid < kqswnal_data.kqn_nid_offset || - nid >= kqswnal_data.kqn_nid_offset + kqswnal_data.kqn_nnodes) - return (-1); + __u32 elanid = LNET_NIDADDR(nid); - return (nid - kqswnal_data.kqn_nid_offset); + /* not in this cluster? */ + return (elanid >= kqswnal_data.kqn_nnodes) ? -1 : elanid; } -static inline ptl_nid_t +static inline lnet_nid_t kqswnal_rx_nid(kqswnal_rx_t *krx) { return (kqswnal_elanid2nid(ep_rxd_node(krx->krx_rxd))); @@ -314,18 +328,6 @@ kqswnal_pages_spanned (void *base, int nob) return (last_page - first_page + 1); } -#if KQSW_CHECKSUM -static inline kqsw_csum_t kqsw_csum (kqsw_csum_t sum, void *base, int nob) -{ - unsigned char *ptr = (unsigned char *)base; - - while (nob-- > 0) - sum += *ptr++; - - return (sum); -} -#endif - static inline void kqswnal_rx_decref (kqswnal_rx_t *krx) { LASSERT (atomic_read (&krx->krx_refcount) > 0); @@ -333,44 +335,16 @@ static inline void kqswnal_rx_decref (kqswnal_rx_t *krx) kqswnal_rx_done(krx); } -#if MULTIRAIL_EKC -# ifndef EP_RAILMASK_ALL -# error "old (unsupported) version of EKC headers" -# endif -#else -/* multirail defines these in */ -#define EP_MSG_SVC_PORTALS_SMALL (0x10) /* Portals over elan port number (large payloads) */ -#define EP_MSG_SVC_PORTALS_LARGE (0x11) /* Portals over elan port number (small payloads) */ -/* NB small/large message sizes are GLOBAL constants */ - -/* A minimal attempt to minimise inline #ifdeffing */ - -#define EP_SUCCESS ESUCCESS -#define EP_ENOMEM ENOMEM - -static inline EP_XMTR * -ep_alloc_xmtr(EP_DEV *e) -{ - return (ep_alloc_large_xmtr(e)); -} - -static inline EP_RCVR * -ep_alloc_rcvr(EP_DEV *e, int svc, int nenv) -{ - return (ep_install_large_rcvr(e, svc, nenv)); -} - -static inline void -ep_free_xmtr(EP_XMTR *x) -{ - ep_free_large_xmtr(x); -} - -static inline void -ep_free_rcvr(EP_RCVR *r) -{ - ep_remove_large_rcvr(r); -} -#endif +int kqswnal_startup (lnet_ni_t *ni); +void kqswnal_shutdown (lnet_ni_t *ni); +int kqswnal_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg); +int kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int kqswnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); + +int kqswnal_tunables_init(void); +void kqswnal_tunables_fini(void); #endif /* _QSWNAL_H */ diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c index 22e2cd9..86a1f8f 100644 --- a/lnet/klnds/qswlnd/qswlnd_cb.c +++ b/lnet/klnds/qswlnd/qswlnd_cb.c @@ -21,23 +21,7 @@ * */ -#include "qswnal.h" - -/* - * LIB functions follow - * - */ -static int -kqswnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) -{ - if (nid == nal->libnal_ni.ni_pid.nid) - *dist = 0; /* it's me */ - else if (kqswnal_nid2elanid (nid) >= 0) - *dist = 1; /* it's my peer */ - else - *dist = 2; /* via router */ - return (0); -} +#include "qswlnd.h" void kqswnal_notify_peer_down(kqswnal_tx_t *ktx) @@ -48,22 +32,19 @@ kqswnal_notify_peer_down(kqswnal_tx_t *ktx) do_gettimeofday (&now); then = now.tv_sec - (jiffies - ktx->ktx_launchtime)/HZ; - kpr_notify(&kqswnal_data.kqn_router, ktx->ktx_nid, 0, then); + lnet_notify(kqswnal_data.kqn_ni, ktx->ktx_nid, 0, then); } void kqswnal_unmap_tx (kqswnal_tx_t *ktx) { -#if MULTIRAIL_EKC int i; ktx->ktx_rail = -1; /* unset rail */ -#endif if (ktx->ktx_nmappedpages == 0) return; -#if MULTIRAIL_EKC CDEBUG(D_NET, "%p unloading %d frags starting at %d\n", ktx, ktx->ktx_nfrag, ktx->ktx_firsttmpfrag); @@ -71,30 +52,20 @@ 
kqswnal_unmap_tx (kqswnal_tx_t *ktx) ep_dvma_unload(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_tx_nmh, &ktx->ktx_frags[i]); -#else - CDEBUG (D_NET, "%p[%d] unloading pages %d for %d\n", - ktx, ktx->ktx_nfrag, ktx->ktx_basepage, ktx->ktx_nmappedpages); - LASSERT (ktx->ktx_nmappedpages <= ktx->ktx_npages); - LASSERT (ktx->ktx_basepage + ktx->ktx_nmappedpages <= - kqswnal_data.kqn_eptxdmahandle->NumDvmaPages); - - elan3_dvma_unload(kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eptxdmahandle, - ktx->ktx_basepage, ktx->ktx_nmappedpages); -#endif ktx->ktx_nmappedpages = 0; } int -kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_t *kiov) +kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, + unsigned int niov, lnet_kiov_t *kiov) { int nfrags = ktx->ktx_nfrag; int nmapped = ktx->ktx_nmappedpages; int maxmapped = ktx->ktx_npages; uint32_t basepage = ktx->ktx_basepage + nmapped; char *ptr; -#if MULTIRAIL_EKC + EP_RAILMASK railmask; int rail; @@ -104,11 +75,11 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_ kqswnal_nid2elanid(ktx->ktx_nid)); rail = ktx->ktx_rail; if (rail < 0) { - CERROR("No rails available for "LPX64"\n", ktx->ktx_nid); + CERROR("No rails available for %s\n", libcfs_nid2str(ktx->ktx_nid)); return (-ENETDOWN); } railmask = 1 << rail; -#endif + LASSERT (nmapped <= maxmapped); LASSERT (nfrags >= ktx->ktx_firsttmpfrag); LASSERT (nfrags <= EP_MAXFRAG); @@ -154,7 +125,6 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_ "%p[%d] loading %p for %d, page %d, %d total\n", ktx, nfrags, ptr, fraglen, basepage, nmapped); -#if MULTIRAIL_EKC ep_dvma_load(kqswnal_data.kqn_ep, NULL, ptr, fraglen, kqswnal_data.kqn_ep_tx_nmh, basepage, @@ -167,22 +137,6 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_ /* new frag if this is the first or can't merge */ nfrags++; } -#else - elan3_dvma_kaddr_load (kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eptxdmahandle, - ptr, fraglen, - basepage, &ktx->ktx_frags[nfrags].Base); - - if (nfrags > 0 && /* previous frag mapped */ - ktx->ktx_frags[nfrags].Base == /* contiguous with this one */ - (ktx->ktx_frags[nfrags-1].Base + ktx->ktx_frags[nfrags-1].Len)) - /* just extend previous */ - ktx->ktx_frags[nfrags - 1].Len += fraglen; - else { - ktx->ktx_frags[nfrags].Len = fraglen; - nfrags++; /* new frag */ - } -#endif kunmap (kiov->kiov_page); @@ -207,15 +161,65 @@ kqswnal_map_tx_kiov (kqswnal_tx_t *ktx, int offset, int nob, int niov, ptl_kiov_ return (0); } +#if KQSW_CKSUM +__u32 +kqswnal_csum_kiov (__u32 csum, int offset, int nob, + unsigned int niov, lnet_kiov_t *kiov) +{ + char *ptr; + + if (nob == 0) + return csum; + + LASSERT (niov > 0); + LASSERT (nob > 0); + + /* skip complete frags before 'offset' */ + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + kiov++; + niov--; + LASSERT (niov > 0); + } + + do { + int fraglen = kiov->kiov_len - offset; + + /* each page frag is contained in one page */ + LASSERT (kiov->kiov_offset + kiov->kiov_len <= PAGE_SIZE); + + if (fraglen > nob) + fraglen = nob; + + ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset + offset; + + csum = kqswnal_csum(csum, ptr, fraglen); + + kunmap (kiov->kiov_page); + + kiov++; + niov--; + nob -= fraglen; + offset = 0; + + /* iov must not run out before end of data */ + LASSERT (nob == 0 || niov > 0); + + } while (nob > 0); + + return csum; +} +#endif + int kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob, - int niov, 
struct iovec *iov) + unsigned int niov, struct iovec *iov) { int nfrags = ktx->ktx_nfrag; int nmapped = ktx->ktx_nmappedpages; int maxmapped = ktx->ktx_npages; uint32_t basepage = ktx->ktx_basepage + nmapped; -#if MULTIRAIL_EKC + EP_RAILMASK railmask; int rail; @@ -225,11 +229,11 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob, kqswnal_nid2elanid(ktx->ktx_nid)); rail = ktx->ktx_rail; if (rail < 0) { - CERROR("No rails available for "LPX64"\n", ktx->ktx_nid); + CERROR("No rails available for %s\n", libcfs_nid2str(ktx->ktx_nid)); return (-ENETDOWN); } railmask = 1 << rail; -#endif + LASSERT (nmapped <= maxmapped); LASSERT (nfrags >= ktx->ktx_firsttmpfrag); LASSERT (nfrags <= EP_MAXFRAG); @@ -270,7 +274,6 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob, ktx, nfrags, iov->iov_base + offset, fraglen, basepage, npages, nmapped); -#if MULTIRAIL_EKC ep_dvma_load(kqswnal_data.kqn_ep, NULL, iov->iov_base + offset, fraglen, kqswnal_data.kqn_ep_tx_nmh, basepage, @@ -283,22 +286,6 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob, /* new frag if this is the first or can't merge */ nfrags++; } -#else - elan3_dvma_kaddr_load (kqswnal_data.kqn_ep->DmaState, - kqswnal_data.kqn_eptxdmahandle, - iov->iov_base + offset, fraglen, - basepage, &ktx->ktx_frags[nfrags].Base); - - if (nfrags > 0 && /* previous frag mapped */ - ktx->ktx_frags[nfrags].Base == /* contiguous with this one */ - (ktx->ktx_frags[nfrags-1].Base + ktx->ktx_frags[nfrags-1].Len)) - /* just extend previous */ - ktx->ktx_frags[nfrags - 1].Len += fraglen; - else { - ktx->ktx_frags[nfrags].Len = fraglen; - nfrags++; /* new frag */ - } -#endif /* keep in loop for failure case */ ktx->ktx_nmappedpages = nmapped; @@ -321,11 +308,50 @@ kqswnal_map_tx_iov (kqswnal_tx_t *ktx, int offset, int nob, return (0); } +#if KQSW_CKSUM +__u32 +kqswnal_csum_iov (__u32 csum, int offset, int nob, + unsigned int niov, struct iovec *iov) +{ + if (nob == 0) + return csum; + + LASSERT (niov > 0); + LASSERT (nob > 0); + + /* skip complete frags before offset */ + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + niov--; + LASSERT (niov > 0); + } + + do { + int fraglen = iov->iov_len - offset; + + if (fraglen > nob) + fraglen = nob; + + csum = kqswnal_csum(csum, iov->iov_base + offset, fraglen); + + iov++; + niov--; + nob -= fraglen; + offset = 0; + + /* iov must not run out before end of data */ + LASSERT (nob == 0 || niov > 0); + + } while (nob > 0); + + return csum; +} +#endif void kqswnal_put_idle_tx (kqswnal_tx_t *ktx) { - kpr_fwd_desc_t *fwd = NULL; unsigned long flags; kqswnal_unmap_tx (ktx); /* release temporary mappings */ @@ -334,133 +360,133 @@ kqswnal_put_idle_tx (kqswnal_tx_t *ktx) spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); list_del (&ktx->ktx_list); /* take off active list */ - - if (ktx->ktx_isnblk) { - /* reserved for non-blocking tx */ - list_add (&ktx->ktx_list, &kqswnal_data.kqn_nblk_idletxds); - spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); - return; - } - list_add (&ktx->ktx_list, &kqswnal_data.kqn_idletxds); - /* anything blocking for a tx descriptor? */ - if (!kqswnal_data.kqn_shuttingdown && - !list_empty(&kqswnal_data.kqn_idletxd_fwdq)) /* forwarded packet? 
*/ - { - CDEBUG(D_NET,"wakeup fwd\n"); - - fwd = list_entry (kqswnal_data.kqn_idletxd_fwdq.next, - kpr_fwd_desc_t, kprfd_list); - list_del (&fwd->kprfd_list); - } - - wake_up (&kqswnal_data.kqn_idletxd_waitq); - spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); - - if (fwd == NULL) - return; - - /* schedule packet for forwarding again */ - spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); - - list_add_tail (&fwd->kprfd_list, &kqswnal_data.kqn_delayedfwds); - wake_up (&kqswnal_data.kqn_sched_waitq); - - spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); } kqswnal_tx_t * -kqswnal_get_idle_tx (kpr_fwd_desc_t *fwd, int may_block) +kqswnal_get_idle_tx (void) { unsigned long flags; - kqswnal_tx_t *ktx = NULL; + kqswnal_tx_t *ktx; - for (;;) { - spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); - - if (kqswnal_data.kqn_shuttingdown) - break; - - /* "normal" descriptor is free */ - if (!list_empty (&kqswnal_data.kqn_idletxds)) { - ktx = list_entry (kqswnal_data.kqn_idletxds.next, - kqswnal_tx_t, ktx_list); - break; - } - - if (fwd != NULL) /* forwarded packet? */ - break; - - /* doing a local transmit */ - if (!may_block) { - if (list_empty (&kqswnal_data.kqn_nblk_idletxds)) { - CERROR ("intr tx desc pool exhausted\n"); - break; - } - - ktx = list_entry (kqswnal_data.kqn_nblk_idletxds.next, - kqswnal_tx_t, ktx_list); - break; - } - - /* block for idle tx */ + spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags); + if (kqswnal_data.kqn_shuttingdown || + list_empty (&kqswnal_data.kqn_idletxds)) { spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); - CDEBUG (D_NET, "blocking for tx desc\n"); - wait_event (kqswnal_data.kqn_idletxd_waitq, - !list_empty (&kqswnal_data.kqn_idletxds) || - kqswnal_data.kqn_shuttingdown); + return NULL; } - if (ktx != NULL) { - list_del (&ktx->ktx_list); - list_add (&ktx->ktx_list, &kqswnal_data.kqn_activetxds); - ktx->ktx_launcher = current->pid; - atomic_inc(&kqswnal_data.kqn_pending_txs); - } else if (fwd != NULL) { - /* queue forwarded packet until idle txd available */ - CDEBUG (D_NET, "blocked fwd [%p]\n", fwd); - list_add_tail (&fwd->kprfd_list, - &kqswnal_data.kqn_idletxd_fwdq); - } + ktx = list_entry (kqswnal_data.kqn_idletxds.next, kqswnal_tx_t, ktx_list); + list_del (&ktx->ktx_list); + + list_add (&ktx->ktx_list, &kqswnal_data.kqn_activetxds); + ktx->ktx_launcher = current->pid; + atomic_inc(&kqswnal_data.kqn_pending_txs); spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags); /* Idle descs can't have any mapped (as opposed to pre-mapped) pages */ - LASSERT (ktx == NULL || ktx->ktx_nmappedpages == 0); - + LASSERT (ktx->ktx_nmappedpages == 0); return (ktx); } void -kqswnal_tx_done (kqswnal_tx_t *ktx, int error) +kqswnal_tx_done_in_thread_context (kqswnal_tx_t *ktx) { + lnet_msg_t *lnetmsg0 = NULL; + lnet_msg_t *lnetmsg1 = NULL; + int status0 = 0; + int status1 = 0; + kqswnal_rx_t *krx; + + LASSERT (!in_interrupt()); + + if (ktx->ktx_status == -EHOSTDOWN) + kqswnal_notify_peer_down(ktx); + switch (ktx->ktx_state) { - case KTX_FORWARDING: /* router asked me to forward this packet */ - kpr_fwd_done (&kqswnal_data.kqn_router, - (kpr_fwd_desc_t *)ktx->ktx_args[0], error); + case KTX_RDMA_FETCH: /* optimized PUT/REPLY handled */ + krx = (kqswnal_rx_t *)ktx->ktx_args[0]; + lnetmsg0 = (lnet_msg_t *)ktx->ktx_args[1]; + status0 = ktx->ktx_status; +#if KQSW_CKSUM + if (status0 == 0) { /* RDMA succeeded */ + kqswnal_msg_t *msg; + __u32 csum; + + msg = (kqswnal_msg_t *) + 
page_address(krx->krx_kiov[0].kiov_page); + + csum = (lnetmsg0->msg_kiov != NULL) ? + kqswnal_csum_kiov(krx->krx_cksum, + lnetmsg0->msg_offset, + lnetmsg0->msg_wanted, + lnetmsg0->msg_niov, + lnetmsg0->msg_kiov) : + kqswnal_csum_iov(krx->krx_cksum, + lnetmsg0->msg_offset, + lnetmsg0->msg_wanted, + lnetmsg0->msg_niov, + lnetmsg0->msg_iov); + + /* Can only check csum if I got it all */ + if (lnetmsg0->msg_wanted == lnetmsg0->msg_len && + csum != msg->kqm_cksum) { + ktx->ktx_status = -EIO; + krx->krx_rpc_reply.msg.status = -EIO; + CERROR("RDMA checksum failed %u(%u) from %s\n", + csum, msg->kqm_cksum, + libcfs_nid2str(kqswnal_rx_nid(krx))); + } + } +#endif + LASSERT (krx->krx_state == KRX_COMPLETING); + kqswnal_rx_decref (krx); break; - case KTX_RDMAING: /* optimized GET/PUT handled */ + case KTX_RDMA_STORE: /* optimized GET handled */ case KTX_PUTTING: /* optimized PUT sent */ case KTX_SENDING: /* normal send */ - lib_finalize (&kqswnal_lib, NULL, - (lib_msg_t *)ktx->ktx_args[1], - (error == 0) ? PTL_OK : PTL_FAIL); + lnetmsg0 = (lnet_msg_t *)ktx->ktx_args[1]; + status0 = ktx->ktx_status; break; - case KTX_GETTING: /* optimized GET sent & REPLY received */ + case KTX_GETTING: /* optimized GET sent & payload received */ /* Complete the GET with success since we can't avoid * delivering a REPLY event; we committed to it when we * launched the GET */ - lib_finalize (&kqswnal_lib, NULL, - (lib_msg_t *)ktx->ktx_args[1], PTL_OK); - lib_finalize (&kqswnal_lib, NULL, - (lib_msg_t *)ktx->ktx_args[2], - (error == 0) ? PTL_OK : PTL_FAIL); + lnetmsg0 = (lnet_msg_t *)ktx->ktx_args[1]; + status0 = 0; + lnetmsg1 = (lnet_msg_t *)ktx->ktx_args[2]; + status1 = ktx->ktx_status; +#if KQSW_CKSUM + if (status1 == 0) { /* RDMA succeeded */ + lnet_msg_t *lnetmsg0 = (lnet_msg_t *)ktx->ktx_args[1]; + lnet_libmd_t *md = lnetmsg0->msg_md; + __u32 csum; + + csum = ((md->md_options & LNET_MD_KIOV) != 0) ? 
+ kqswnal_csum_kiov(~0, 0, + md->md_length, + md->md_niov, + md->md_iov.kiov) : + kqswnal_csum_iov(~0, 0, + md->md_length, + md->md_niov, + md->md_iov.iov); + + if (csum != ktx->ktx_cksum) { + CERROR("RDMA checksum failed %u(%u) from %s\n", + csum, ktx->ktx_cksum, + libcfs_nid2str(ktx->ktx_nid)); + status1 = -EIO; + } + } +#endif break; default: @@ -468,12 +494,39 @@ kqswnal_tx_done (kqswnal_tx_t *ktx, int error) } kqswnal_put_idle_tx (ktx); + + lnet_finalize (kqswnal_data.kqn_ni, lnetmsg0, status0); + if (lnetmsg1 != NULL) + lnet_finalize (kqswnal_data.kqn_ni, lnetmsg1, status1); +} + +void +kqswnal_tx_done (kqswnal_tx_t *ktx, int status) +{ + unsigned long flags; + + ktx->ktx_status = status; + + if (!in_interrupt()) { + kqswnal_tx_done_in_thread_context(ktx); + return; + } + + /* Complete the send in thread context */ + spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags); + + list_add_tail(&ktx->ktx_schedlist, + &kqswnal_data.kqn_donetxds); + wake_up(&kqswnal_data.kqn_sched_waitq); + + spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, flags); } static void kqswnal_txhandler(EP_TXD *txd, void *arg, int status) { - kqswnal_tx_t *ktx = (kqswnal_tx_t *)arg; + kqswnal_tx_t *ktx = (kqswnal_tx_t *)arg; + kqswnal_rpc_reply_t *reply; LASSERT (txd != NULL); LASSERT (ktx != NULL); @@ -482,26 +535,57 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status) if (status != EP_SUCCESS) { - CERROR ("Tx completion to "LPX64" failed: %d\n", - ktx->ktx_nid, status); + CDEBUG (D_NETERROR, "Tx completion to %s failed: %d\n", + libcfs_nid2str(ktx->ktx_nid), status); - kqswnal_notify_peer_down(ktx); status = -EHOSTDOWN; } else switch (ktx->ktx_state) { case KTX_GETTING: case KTX_PUTTING: - /* RPC completed OK; but what did our peer put in the status - * block? */ -#if MULTIRAIL_EKC - status = ep_txd_statusblk(txd)->Data[0]; -#else - status = ep_txd_statusblk(txd)->Status; + /* RPC complete! 
*/ + reply = (kqswnal_rpc_reply_t *)ep_txd_statusblk(txd); + if (reply->msg.magic == 0) { /* "old" peer */ + status = reply->msg.status; + break; + } + + if (reply->msg.magic != LNET_PROTO_QSW_MAGIC) { + if (reply->msg.magic != swab32(LNET_PROTO_QSW_MAGIC)) { + CERROR("%s unexpected rpc reply magic %08x\n", + libcfs_nid2str(ktx->ktx_nid), + reply->msg.magic); + status = -EPROTO; + break; + } + + __swab32s(&reply->msg.status); + __swab32s(&reply->msg.version); + + if (ktx->ktx_state == KTX_GETTING) { + __swab32s(&reply->msg.u.get.len); + __swab32s(&reply->msg.u.get.cksum); + } + } + + status = reply->msg.status; + if (status != 0) { + CERROR("%s RPC status %08x\n", + libcfs_nid2str(ktx->ktx_nid), status); + break; + } + + if (ktx->ktx_state == KTX_GETTING) { + lnet_set_reply_msg_len(kqswnal_data.kqn_ni, + (lnet_msg_t *)ktx->ktx_args[2], + reply->msg.u.get.len); +#if KQSW_CKSUM + ktx->ktx_cksum = reply->msg.u.get.cksum; #endif + } break; - case KTX_FORWARDING: case KTX_SENDING: status = 0; break; @@ -511,7 +595,7 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status) break; } - kqswnal_tx_done (ktx, status); + kqswnal_tx_done(ktx, status); } int @@ -530,14 +614,31 @@ kqswnal_launch (kqswnal_tx_t *ktx) LASSERT (dest >= 0); /* must be a peer */ -#if MULTIRAIL_EKC if (ktx->ktx_nmappedpages != 0) attr = EP_SET_PREFRAIL(attr, ktx->ktx_rail); -#endif switch (ktx->ktx_state) { case KTX_GETTING: case KTX_PUTTING: + if (the_lnet.ln_testprotocompat != 0 && + the_lnet.ln_ptlcompat == 0) { + kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer; + + /* single-shot proto test: + * Future version queries will use an RPC, so I'll + * co-opt one of the existing ones */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + msg->kqm_version++; + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + msg->kqm_magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); + } + /* NB ktx_frag[0] is the GET/PUT hdr + kqswnal_remotemd_t. 
* The other frags are the payload, awaiting RDMA */ rc = ep_transmit_rpc(kqswnal_data.kqn_eptx, dest, @@ -546,19 +647,11 @@ kqswnal_launch (kqswnal_tx_t *ktx) NULL, ktx->ktx_frags, 1); break; - case KTX_FORWARDING: case KTX_SENDING: -#if MULTIRAIL_EKC rc = ep_transmit_message(kqswnal_data.kqn_eptx, dest, ktx->ktx_port, attr, kqswnal_txhandler, ktx, NULL, ktx->ktx_frags, ktx->ktx_nfrag); -#else - rc = ep_transmit_large(kqswnal_data.kqn_eptx, dest, - ktx->ktx_port, attr, - kqswnal_txhandler, ktx, - ktx->ktx_frags, ktx->ktx_nfrag); -#endif break; default: @@ -574,14 +667,14 @@ kqswnal_launch (kqswnal_tx_t *ktx) case EP_ENOMEM: /* can't allocate ep txd => queue for later */ spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); - list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds); + list_add_tail (&ktx->ktx_schedlist, &kqswnal_data.kqn_delayedtxds); wake_up (&kqswnal_data.kqn_sched_waitq); spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); return (0); default: /* fatal error */ - CERROR ("Tx to "LPX64" failed: %d\n", ktx->ktx_nid, rc); + CDEBUG (D_NETERROR, "Tx to %s failed: %d\n", libcfs_nid2str(ktx->ktx_nid), rc); kqswnal_notify_peer_down(ktx); return (-EHOSTUNREACH); } @@ -589,16 +682,16 @@ kqswnal_launch (kqswnal_tx_t *ktx) #if 0 static char * -hdr_type_string (ptl_hdr_t *hdr) +hdr_type_string (lnet_hdr_t *hdr) { switch (hdr->type) { - case PTL_MSG_ACK: + case LNET_MSG_ACK: return ("ACK"); - case PTL_MSG_PUT: + case LNET_MSG_PUT: return ("PUT"); - case PTL_MSG_GET: + case LNET_MSG_GET: return ("GET"); - case PTL_MSG_REPLY: + case LNET_MSG_REPLY: return ("REPLY"); default: return (""); @@ -606,7 +699,7 @@ hdr_type_string (ptl_hdr_t *hdr) } static void -kqswnal_cerror_hdr(ptl_hdr_t * hdr) +kqswnal_cerror_hdr(lnet_hdr_t * hdr) { char *type_str = hdr_type_string (hdr); @@ -618,7 +711,7 @@ kqswnal_cerror_hdr(ptl_hdr_t * hdr) le32_to_cpu(hdr->dest_pid)); switch (le32_to_cpu(hdr->type)) { - case PTL_MSG_PUT: + case LNET_MSG_PUT: CERROR(" Ptl index %d, ack md "LPX64"."LPX64", " "match bits "LPX64"\n", le32_to_cpu(hdr->msg.put.ptl_index), @@ -630,7 +723,7 @@ kqswnal_cerror_hdr(ptl_hdr_t * hdr) hdr->msg.put.hdr_data); break; - case PTL_MSG_GET: + case LNET_MSG_GET: CERROR(" Ptl index %d, return md "LPX64"."LPX64", " "match bits "LPX64"\n", le32_to_cpu(hdr->msg.get.ptl_index), @@ -642,14 +735,14 @@ kqswnal_cerror_hdr(ptl_hdr_t * hdr) le32_to_cpu(hdr->msg.get.src_offset)); break; - case PTL_MSG_ACK: + case LNET_MSG_ACK: CERROR(" dst md "LPX64"."LPX64", manipulated length %d\n", hdr->msg.ack.dst_wmd.wh_interface_cookie, hdr->msg.ack.dst_wmd.wh_object_cookie, le32_to_cpu(hdr->msg.ack.mlength)); break; - case PTL_MSG_REPLY: + case LNET_MSG_REPLY: CERROR(" dst md "LPX64"."LPX64"\n", hdr->msg.reply.dst_wmd.wh_interface_cookie, hdr->msg.reply.dst_wmd.wh_object_cookie); @@ -658,67 +751,6 @@ kqswnal_cerror_hdr(ptl_hdr_t * hdr) } /* end of print_hdr() */ #endif -#if !MULTIRAIL_EKC -void -kqswnal_print_eiov (int how, char *str, int n, EP_IOVEC *iov) -{ - int i; - - CDEBUG (how, "%s: %d\n", str, n); - for (i = 0; i < n; i++) { - CDEBUG (how, " %08x for %d\n", iov[i].Base, iov[i].Len); - } -} - -int -kqswnal_eiovs2datav (int ndv, EP_DATAVEC *dv, - int nsrc, EP_IOVEC *src, - int ndst, EP_IOVEC *dst) -{ - int count; - int nob; - - LASSERT (ndv > 0); - LASSERT (nsrc > 0); - LASSERT (ndst > 0); - - for (count = 0; count < ndv; count++, dv++) { - - if (nsrc == 0 || ndst == 0) { - if (nsrc != ndst) { - /* For now I'll barf on any left over entries */ - CERROR ("mismatched src and dst 
iovs\n"); - return (-EINVAL); - } - return (count); - } - - nob = (src->Len < dst->Len) ? src->Len : dst->Len; - dv->Len = nob; - dv->Source = src->Base; - dv->Dest = dst->Base; - - if (nob >= src->Len) { - src++; - nsrc--; - } else { - src->Len -= nob; - src->Base += nob; - } - - if (nob >= dst->Len) { - dst++; - ndst--; - } else { - src->Len -= nob; - src->Base += nob; - } - } - - CERROR ("DATAVEC too small\n"); - return (-E2BIG); -} -#else int kqswnal_check_rdma (int nlfrag, EP_NMD *lfrag, int nrfrag, EP_NMD *rfrag) @@ -741,36 +773,17 @@ kqswnal_check_rdma (int nlfrag, EP_NMD *lfrag, return (0); } -#endif kqswnal_remotemd_t * -kqswnal_parse_rmd (kqswnal_rx_t *krx, int type, ptl_nid_t expected_nid) +kqswnal_get_portalscompat_rmd (kqswnal_rx_t *krx) { + /* Check that the RMD sent after the "raw" LNET header in a + * portals-compatible QSWLND message is OK */ char *buffer = (char *)page_address(krx->krx_kiov[0].kiov_page); - ptl_hdr_t *hdr = (ptl_hdr_t *)buffer; - kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(buffer + KQSW_HDR_SIZE); - ptl_nid_t nid = kqswnal_rx_nid(krx); - - /* Note (1) lib_parse has already flipped hdr. - * (2) RDMA addresses are sent in native endian-ness. When - * EKC copes with different endian nodes, I'll fix this (and - * eat my hat :) */ + kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(buffer + sizeof(lnet_hdr_t)); - LASSERT (krx->krx_nob >= sizeof(*hdr)); - - if (hdr->type != type) { - CERROR ("Unexpected optimized get/put type %d (%d expected)" - "from "LPX64"\n", hdr->type, type, nid); - return (NULL); - } - - if (hdr->src_nid != nid) { - CERROR ("Unexpected optimized get/put source NID " - LPX64" from "LPX64"\n", hdr->src_nid, nid); - return (NULL); - } - - LASSERT (nid == expected_nid); + /* Note RDMA addresses are sent in native endian-ness in the "old" + * portals protocol so no swabbing... */ if (buffer + krx->krx_nob < (char *)(rmd + 1)) { /* msg too small to discover rmd size */ @@ -800,116 +813,96 @@ kqswnal_rdma_store_complete (EP_RXD *rxd) CDEBUG((status == EP_SUCCESS) ? D_NET : D_ERROR, "rxd %p, ktx %p, status %d\n", rxd, ktx, status); - LASSERT (ktx->ktx_state == KTX_RDMAING); + LASSERT (ktx->ktx_state == KTX_RDMA_STORE); LASSERT (krx->krx_rxd == rxd); LASSERT (krx->krx_rpc_reply_needed); krx->krx_rpc_reply_needed = 0; kqswnal_rx_decref (krx); - /* free ktx & finalize() its lib_msg_t */ + /* free ktx & finalize() its lnet_msg_t */ kqswnal_tx_done(ktx, (status == EP_SUCCESS) ? 0 : -ECONNABORTED); } void kqswnal_rdma_fetch_complete (EP_RXD *rxd) { - /* Completed fetching the PUT data */ + /* Completed fetching the PUT/REPLY data */ int status = ep_rxd_status(rxd); kqswnal_tx_t *ktx = (kqswnal_tx_t *)ep_rxd_arg(rxd); kqswnal_rx_t *krx = (kqswnal_rx_t *)ktx->ktx_args[0]; - unsigned long flags; CDEBUG((status == EP_SUCCESS) ? 
D_NET : D_ERROR, "rxd %p, ktx %p, status %d\n", rxd, ktx, status); - LASSERT (ktx->ktx_state == KTX_RDMAING); + LASSERT (ktx->ktx_state == KTX_RDMA_FETCH); LASSERT (krx->krx_rxd == rxd); /* RPC completes with failure by default */ LASSERT (krx->krx_rpc_reply_needed); - LASSERT (krx->krx_rpc_reply_status != 0); + LASSERT (krx->krx_rpc_reply.msg.status != 0); if (status == EP_SUCCESS) { - status = krx->krx_rpc_reply_status = 0; + krx->krx_rpc_reply.msg.status = 0; + status = 0; } else { /* Abandon RPC since get failed */ krx->krx_rpc_reply_needed = 0; status = -ECONNABORTED; } - /* free ktx & finalize() its lib_msg_t */ - kqswnal_tx_done(ktx, status); - - if (!in_interrupt()) { - /* OK to complete the RPC now (iff I had the last ref) */ - kqswnal_rx_decref (krx); - return; - } - + /* krx gets decref'd in kqswnal_tx_done_in_thread_context() */ LASSERT (krx->krx_state == KRX_PARSE); krx->krx_state = KRX_COMPLETING; - /* Complete the RPC in thread context */ - spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); - - list_add_tail (&krx->krx_list, &kqswnal_data.kqn_readyrxds); - wake_up (&kqswnal_data.kqn_sched_waitq); - - spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + /* free ktx & finalize() its lnet_msg_t */ + kqswnal_tx_done(ktx, status); } int -kqswnal_rdma (kqswnal_rx_t *krx, lib_msg_t *libmsg, int type, - int niov, struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, size_t len) +kqswnal_rdma (kqswnal_rx_t *krx, lnet_msg_t *lntmsg, + int type, kqswnal_remotemd_t *rmd, + unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int len) { - kqswnal_remotemd_t *rmd; kqswnal_tx_t *ktx; int eprc; int rc; -#if !MULTIRAIL_EKC - EP_DATAVEC datav[EP_MAXFRAG]; - int ndatav; -#endif - LASSERT (type == PTL_MSG_GET || type == PTL_MSG_PUT); /* Not both mapped and paged payload */ LASSERT (iov == NULL || kiov == NULL); /* RPC completes with failure by default */ LASSERT (krx->krx_rpc_reply_needed); - LASSERT (krx->krx_rpc_reply_status != 0); - - rmd = kqswnal_parse_rmd(krx, type, libmsg->ev.initiator.nid); - if (rmd == NULL) - return (-EPROTO); + LASSERT (krx->krx_rpc_reply.msg.status != 0); if (len == 0) { /* data got truncated to nothing. 
*/ - lib_finalize(&kqswnal_lib, krx, libmsg, PTL_OK); + lnet_finalize(kqswnal_data.kqn_ni, lntmsg, 0); /* Let kqswnal_rx_done() complete the RPC with success */ - krx->krx_rpc_reply_status = 0; + krx->krx_rpc_reply.msg.status = 0; return (0); } /* NB I'm using 'ktx' just to map the local RDMA buffers; I'm not actually sending a portals message with it */ - ktx = kqswnal_get_idle_tx(NULL, 0); + ktx = kqswnal_get_idle_tx(); if (ktx == NULL) { - CERROR ("Can't get txd for RDMA with "LPX64"\n", - libmsg->ev.initiator.nid); + CERROR ("Can't get txd for RDMA with %s\n", + libcfs_nid2str(kqswnal_rx_nid(krx))); return (-ENOMEM); } - ktx->ktx_state = KTX_RDMAING; - ktx->ktx_nid = libmsg->ev.initiator.nid; + ktx->ktx_state = type; + ktx->ktx_nid = kqswnal_rx_nid(krx); ktx->ktx_args[0] = krx; - ktx->ktx_args[1] = libmsg; + ktx->ktx_args[1] = lntmsg; + + LASSERT (atomic_read(&krx->krx_refcount) > 0); + /* Take an extra ref for the completion callback */ + atomic_inc(&krx->krx_refcount); -#if MULTIRAIL_EKC /* Map on the rail the RPC prefers */ ktx->ktx_rail = ep_rcvr_prefrail(krx->krx_eprx, ep_rxd_railmask(krx->krx_rxd)); -#endif /* Start mapping at offset 0 (we're not mapping any headers) */ ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 0; @@ -924,60 +917,36 @@ kqswnal_rdma (kqswnal_rx_t *krx, lib_msg_t *libmsg, int type, goto out; } -#if MULTIRAIL_EKC rc = kqswnal_check_rdma (ktx->ktx_nfrag, ktx->ktx_frags, rmd->kqrmd_nfrag, rmd->kqrmd_frag); if (rc != 0) { CERROR ("Incompatible RDMA descriptors\n"); goto out; } -#else + switch (type) { default: LBUG(); - - case PTL_MSG_GET: - ndatav = kqswnal_eiovs2datav(EP_MAXFRAG, datav, - ktx->ktx_nfrag, ktx->ktx_frags, - rmd->kqrmd_nfrag, rmd->kqrmd_frag); - break; - - case PTL_MSG_PUT: - ndatav = kqswnal_eiovs2datav(EP_MAXFRAG, datav, - rmd->kqrmd_nfrag, rmd->kqrmd_frag, - ktx->ktx_nfrag, ktx->ktx_frags); - break; - } - if (ndatav < 0) { - CERROR ("Can't create datavec: %d\n", ndatav); - rc = ndatav; - goto out; - } + case KTX_RDMA_STORE: + krx->krx_rpc_reply.msg.status = 0; + krx->krx_rpc_reply.msg.magic = LNET_PROTO_QSW_MAGIC; + krx->krx_rpc_reply.msg.version = QSWLND_PROTO_VERSION; + krx->krx_rpc_reply.msg.u.get.len = len; +#if KQSW_CKSUM + krx->krx_rpc_reply.msg.u.get.cksum = (kiov != NULL) ? 
+ kqswnal_csum_kiov(~0, offset, len, niov, kiov) : + kqswnal_csum_iov(~0, offset, len, niov, iov); + if (*kqswnal_tunables.kqn_inject_csum_error == 4) { + krx->krx_rpc_reply.msg.u.get.cksum++; + *kqswnal_tunables.kqn_inject_csum_error = 0; + } #endif - - LASSERT (atomic_read(&krx->krx_refcount) > 0); - /* Take an extra ref for the completion callback */ - atomic_inc(&krx->krx_refcount); - - switch (type) { - default: - LBUG(); - - case PTL_MSG_GET: -#if MULTIRAIL_EKC eprc = ep_complete_rpc(krx->krx_rxd, kqswnal_rdma_store_complete, ktx, - &kqswnal_data.kqn_rpc_success, - ktx->ktx_frags, rmd->kqrmd_frag, rmd->kqrmd_nfrag); -#else - eprc = ep_complete_rpc (krx->krx_rxd, - kqswnal_rdma_store_complete, ktx, - &kqswnal_data.kqn_rpc_success, - datav, ndatav); - if (eprc != EP_SUCCESS) /* "old" EKC destroys rxd on failed completion */ - krx->krx_rxd = NULL; -#endif + &krx->krx_rpc_reply.ep_statusblk, + ktx->ktx_frags, rmd->kqrmd_frag, + rmd->kqrmd_nfrag); if (eprc != EP_SUCCESS) { CERROR("can't complete RPC: %d\n", eprc); /* don't re-attempt RPC completion */ @@ -986,16 +955,10 @@ kqswnal_rdma (kqswnal_rx_t *krx, lib_msg_t *libmsg, int type, } break; - case PTL_MSG_PUT: -#if MULTIRAIL_EKC + case KTX_RDMA_FETCH: eprc = ep_rpc_get (krx->krx_rxd, kqswnal_rdma_fetch_complete, ktx, rmd->kqrmd_frag, ktx->ktx_frags, ktx->ktx_nfrag); -#else - eprc = ep_rpc_get (krx->krx_rxd, - kqswnal_rdma_fetch_complete, ktx, - datav, ndatav); -#endif if (eprc != EP_SUCCESS) { CERROR("ep_rpc_get failed: %d\n", eprc); /* Don't attempt RPC completion: @@ -1016,228 +979,243 @@ kqswnal_rdma (kqswnal_rx_t *krx, lib_msg_t *libmsg, int type, return (rc); } -static ptl_err_t -kqswnal_sendmsg (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - struct iovec *payload_iov, - ptl_kiov_t *payload_kiov, - size_t payload_offset, - size_t payload_nob) +int +kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) { - kqswnal_tx_t *ktx; - int rc; - ptl_nid_t targetnid; -#if KQSW_CHECKSUM - int i; - kqsw_csum_t csum; - int sumoff; - int sumnob; -#endif + lnet_hdr_t *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int payload_niov = lntmsg->msg_niov; + struct iovec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + int nob; + kqswnal_tx_t *ktx; + int rc; + /* NB 1. hdr is in network byte order */ /* 2. 
'private' depends on the message type */ - CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid: "LPX64 - " pid %u\n", payload_nob, payload_niov, nid, pid); + CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); LASSERT (payload_nob == 0 || payload_niov > 0); - LASSERT (payload_niov <= PTL_MD_MAX_IOV); + LASSERT (payload_niov <= LNET_MAX_IOV); /* It must be OK to kmap() if required */ LASSERT (payload_kiov == NULL || !in_interrupt ()); /* payload is either all vaddrs or all pages */ LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); - if (payload_nob > KQSW_MAXPAYLOAD) { - CERROR ("request exceeds MTU size "LPSZ" (max %u).\n", - payload_nob, KQSW_MAXPAYLOAD); - return (PTL_FAIL); - } - - if (type == PTL_MSG_REPLY && /* can I look in 'private' */ - ((kqswnal_rx_t *)private)->krx_rpc_reply_needed) { /* is it an RPC */ - /* Must be a REPLY for an optimized GET */ - rc = kqswnal_rdma ((kqswnal_rx_t *)private, libmsg, PTL_MSG_GET, - payload_niov, payload_iov, payload_kiov, - payload_offset, payload_nob); - return ((rc == 0) ? PTL_OK : PTL_FAIL); - } - - targetnid = nid; - if (kqswnal_nid2elanid (nid) < 0) { /* Can't send direct: find gateway? */ - rc = kpr_lookup (&kqswnal_data.kqn_router, nid, - sizeof (ptl_hdr_t) + payload_nob, &targetnid); - if (rc != 0) { - CERROR("Can't route to "LPX64": router error %d\n", - nid, rc); - return (PTL_FAIL); - } - if (kqswnal_nid2elanid (targetnid) < 0) { - CERROR("Bad gateway "LPX64" for "LPX64"\n", - targetnid, nid); - return (PTL_FAIL); - } + if (kqswnal_nid2elanid (target.nid) < 0) { + CERROR("%s not in my cluster\n", libcfs_nid2str(target.nid)); + return -EIO; } /* I may not block for a transmit descriptor if I might block the - * receiver, or an interrupt handler. */ - ktx = kqswnal_get_idle_tx(NULL, !(type == PTL_MSG_ACK || - type == PTL_MSG_REPLY || - in_interrupt())); + * router, receiver, or an interrupt handler. 
*/ + ktx = kqswnal_get_idle_tx(); if (ktx == NULL) { - CERROR ("Can't get txd for msg type %d for "LPX64"\n", - type, libmsg->ev.initiator.nid); - return (PTL_NO_SPACE); + CERROR ("Can't get txd for msg type %d for %s\n", + type, libcfs_nid2str(target.nid)); + return (-ENOMEM); } ktx->ktx_state = KTX_SENDING; - ktx->ktx_nid = targetnid; + ktx->ktx_nid = target.nid; ktx->ktx_args[0] = private; - ktx->ktx_args[1] = libmsg; + ktx->ktx_args[1] = lntmsg; ktx->ktx_args[2] = NULL; /* set when a GET commits to REPLY */ - memcpy (ktx->ktx_buffer, hdr, sizeof (*hdr)); /* copy hdr from caller's stack */ - -#if KQSW_CHECKSUM - csum = kqsw_csum (0, (char *)hdr, sizeof (*hdr)); - memcpy (ktx->ktx_buffer + sizeof (*hdr), &csum, sizeof (csum)); - for (csum = 0, i = 0, sumoff = payload_offset, sumnob = payload_nob; sumnob > 0; i++) { - LASSERT(i < niov); - if (payload_kiov != NULL) { - ptl_kiov_t *kiov = &payload_kiov[i]; - - if (sumoff >= kiov->kiov_len) { - sumoff -= kiov->kiov_len; - } else { - char *addr = ((char *)kmap (kiov->kiov_page)) + - kiov->kiov_offset + sumoff; - int fragnob = kiov->kiov_len - sumoff; - - csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob)); - sumnob -= fragnob; - sumoff = 0; - kunmap(kiov->kiov_page); - } - } else { - struct iovec *iov = &payload_iov[i]; - - if (sumoff > iov->iov_len) { - sumoff -= iov->iov_len; - } else { - char *addr = iov->iov_base + sumoff; - int fragnob = iov->iov_len - sumoff; - - csum = kqsw_csum(csum, addr, MIN(sumnob, fragnob)); - sumnob -= fragnob; - sumoff = 0; - } - } - } - memcpy(ktx->ktx_buffer + sizeof(*hdr) + sizeof(csum), &csum, sizeof(csum)); -#endif - - /* The first frag will be the pre-mapped buffer for (at least) the - * portals header. */ + /* The first frag will be the pre-mapped buffer. */ ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - if (nid == targetnid && /* not forwarding */ - ((type == PTL_MSG_GET && /* optimize GET? */ - kqswnal_tunables.kqn_optimized_gets != 0 && - le32_to_cpu(hdr->msg.get.sink_length) >= kqswnal_tunables.kqn_optimized_gets) || - (type == PTL_MSG_PUT && /* optimize PUT? */ - kqswnal_tunables.kqn_optimized_puts != 0 && - payload_nob >= kqswnal_tunables.kqn_optimized_puts))) { - lib_md_t *md = libmsg->md; - kqswnal_remotemd_t *rmd = (kqswnal_remotemd_t *)(ktx->ktx_buffer + KQSW_HDR_SIZE); - + if ((!target_is_router && /* target.nid is final dest */ + !routing && /* I'm the source */ + type == LNET_MSG_GET && /* optimize GET? */ + *kqswnal_tunables.kqn_optimized_gets != 0 && + lntmsg->msg_md->md_length >= + *kqswnal_tunables.kqn_optimized_gets) || + ((type == LNET_MSG_PUT || /* optimize PUT? */ + type == LNET_MSG_REPLY) && /* optimize REPLY? */ + *kqswnal_tunables.kqn_optimized_puts != 0 && + payload_nob >= *kqswnal_tunables.kqn_optimized_puts)) { + lnet_libmd_t *md = lntmsg->msg_md; + kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer; + lnet_hdr_t *mhdr; + kqswnal_remotemd_t *rmd; + /* Optimised path: I send over the Elan vaddrs of the local * buffers, and my peer DMAs directly to/from them. * * First I set up ktx as if it was going to send this * payload, (it needs to map it anyway). This fills * ktx_frags[1] and onward with the network addresses - * of the GET sink frags. I copy these into ktx_buffer, - * immediately after the header, and send that as my - * message. */ + * of the buffer frags. 
*/ + + if (the_lnet.ln_ptlcompat == 2) { + /* Strong portals compatibility: send "raw" LNET + * header + rdma descriptor */ + mhdr = (lnet_hdr_t *)ktx->ktx_buffer; + rmd = (kqswnal_remotemd_t *)(mhdr + 1); + } else { + /* Send an RDMA message */ + msg->kqm_magic = LNET_PROTO_QSW_MAGIC; + msg->kqm_version = QSWLND_PROTO_VERSION; + msg->kqm_type = QSWLND_MSG_RDMA; + + mhdr = &msg->kqm_u.rdma.kqrm_hdr; + rmd = &msg->kqm_u.rdma.kqrm_rmd; + } - ktx->ktx_state = (type == PTL_MSG_PUT) ? KTX_PUTTING : KTX_GETTING; + *mhdr = *hdr; + nob = (((char *)rmd) - ktx->ktx_buffer); + + if (type == LNET_MSG_GET) { + if ((md->md_options & LNET_MD_KIOV) != 0) + rc = kqswnal_map_tx_kiov (ktx, 0, md->md_length, + md->md_niov, md->md_iov.kiov); + else + rc = kqswnal_map_tx_iov (ktx, 0, md->md_length, + md->md_niov, md->md_iov.iov); + ktx->ktx_state = KTX_GETTING; + } else { + if (payload_kiov != NULL) + rc = kqswnal_map_tx_kiov(ktx, 0, payload_nob, + payload_niov, payload_kiov); + else + rc = kqswnal_map_tx_iov(ktx, 0, payload_nob, + payload_niov, payload_iov); + ktx->ktx_state = KTX_PUTTING; + } - if ((libmsg->md->options & PTL_MD_KIOV) != 0) - rc = kqswnal_map_tx_kiov (ktx, 0, md->length, - md->md_niov, md->md_iov.kiov); - else - rc = kqswnal_map_tx_iov (ktx, 0, md->length, - md->md_niov, md->md_iov.iov); if (rc != 0) goto out; rmd->kqrmd_nfrag = ktx->ktx_nfrag - 1; + nob += offsetof(kqswnal_remotemd_t, + kqrmd_frag[rmd->kqrmd_nfrag]); + LASSERT (nob <= KQSW_TX_BUFFER_SIZE); - payload_nob = offsetof(kqswnal_remotemd_t, - kqrmd_frag[rmd->kqrmd_nfrag]); - LASSERT (KQSW_HDR_SIZE + payload_nob <= KQSW_TX_BUFFER_SIZE); - -#if MULTIRAIL_EKC memcpy(&rmd->kqrmd_frag[0], &ktx->ktx_frags[1], rmd->kqrmd_nfrag * sizeof(EP_NMD)); - ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, - 0, KQSW_HDR_SIZE + payload_nob); -#else - memcpy(&rmd->kqrmd_frag[0], &ktx->ktx_frags[1], - rmd->kqrmd_nfrag * sizeof(EP_IOVEC)); - - ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; - ktx->ktx_frags[0].Len = KQSW_HDR_SIZE + payload_nob; + ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, nob); +#if KQSW_CKSUM + LASSERT (the_lnet.ln_ptlcompat != 2); + msg->kqm_nob = nob + payload_nob; + msg->kqm_cksum = 0; + msg->kqm_cksum = kqswnal_csum(~0, (char *)msg, nob); #endif - if (type == PTL_MSG_GET) { + if (type == LNET_MSG_GET) { /* Allocate reply message now while I'm in thread context */ - ktx->ktx_args[2] = lib_create_reply_msg (&kqswnal_lib, - nid, libmsg); + ktx->ktx_args[2] = lnet_create_reply_msg ( + kqswnal_data.kqn_ni, lntmsg); if (ktx->ktx_args[2] == NULL) goto out; /* NB finalizing the REPLY message is my * responsibility now, whatever happens. */ +#if KQSW_CKSUM + if (*kqswnal_tunables.kqn_inject_csum_error == 3) { + msg->kqm_cksum++; + *kqswnal_tunables.kqn_inject_csum_error = 0; + } + + } else if (payload_kiov != NULL) { + /* must checksum payload after header so receiver can + * compute partial header cksum before swab. 
Sadly + * this causes 2 rounds of kmap */ + msg->kqm_cksum = + kqswnal_csum_kiov(msg->kqm_cksum, 0, payload_nob, + payload_niov, payload_kiov); + if (*kqswnal_tunables.kqn_inject_csum_error == 2) { + msg->kqm_cksum++; + *kqswnal_tunables.kqn_inject_csum_error = 0; + } + } else { + msg->kqm_cksum = + kqswnal_csum_iov(msg->kqm_cksum, 0, payload_nob, + payload_niov, payload_iov); + if (*kqswnal_tunables.kqn_inject_csum_error == 2) { + msg->kqm_cksum++; + *kqswnal_tunables.kqn_inject_csum_error = 0; + } +#endif } - } else if (payload_nob <= KQSW_TX_MAXCONTIG) { + } else if (payload_nob <= *kqswnal_tunables.kqn_tx_maxcontig) { + lnet_hdr_t *mhdr; + char *payload; + kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer; /* small message: single frag copied into the pre-mapped buffer */ + if (the_lnet.ln_ptlcompat == 2) { + /* Strong portals compatibility: send "raw" LNET header + * + payload */ + mhdr = (lnet_hdr_t *)ktx->ktx_buffer; + payload = (char *)(mhdr + 1); + } else { + /* Send an IMMEDIATE message */ + msg->kqm_magic = LNET_PROTO_QSW_MAGIC; + msg->kqm_version = QSWLND_PROTO_VERSION; + msg->kqm_type = QSWLND_MSG_IMMEDIATE; -#if MULTIRAIL_EKC - ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, - 0, KQSW_HDR_SIZE + payload_nob); -#else - ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; - ktx->ktx_frags[0].Len = KQSW_HDR_SIZE + payload_nob; -#endif - if (payload_nob > 0) { - if (payload_kiov != NULL) - lib_copy_kiov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE, - payload_niov, payload_kiov, - payload_offset, payload_nob); - else - lib_copy_iov2buf (ktx->ktx_buffer + KQSW_HDR_SIZE, - payload_niov, payload_iov, - payload_offset, payload_nob); + mhdr = &msg->kqm_u.immediate.kqim_hdr; + payload = msg->kqm_u.immediate.kqim_payload; + } + + *mhdr = *hdr; + nob = (payload - ktx->ktx_buffer) + payload_nob; + + ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, nob); + + if (payload_kiov != NULL) + lnet_copy_kiov2flat(KQSW_TX_BUFFER_SIZE, payload, 0, + payload_niov, payload_kiov, + payload_offset, payload_nob); + else + lnet_copy_iov2flat(KQSW_TX_BUFFER_SIZE, payload, 0, + payload_niov, payload_iov, + payload_offset, payload_nob); +#if KQSW_CKSUM + LASSERT (the_lnet.ln_ptlcompat != 2); + msg->kqm_nob = nob; + msg->kqm_cksum = 0; + msg->kqm_cksum = kqswnal_csum(~0, (char *)msg, nob); + if (*kqswnal_tunables.kqn_inject_csum_error == 1) { + msg->kqm_cksum++; + *kqswnal_tunables.kqn_inject_csum_error = 0; } +#endif } else { + lnet_hdr_t *mhdr; + kqswnal_msg_t *msg = (kqswnal_msg_t *)ktx->ktx_buffer; /* large message: multiple frags: first is hdr in pre-mapped buffer */ + if (the_lnet.ln_ptlcompat == 2) { + /* Strong portals compatibility: send "raw" LNET header + * + payload */ + mhdr = (lnet_hdr_t *)ktx->ktx_buffer; + nob = sizeof(lnet_hdr_t); + } else { + /* Send an IMMEDIATE message */ + msg->kqm_magic = LNET_PROTO_QSW_MAGIC; + msg->kqm_version = QSWLND_PROTO_VERSION; + msg->kqm_type = QSWLND_MSG_IMMEDIATE; + + mhdr = &msg->kqm_u.immediate.kqim_hdr; + nob = offsetof(kqswnal_msg_t, + kqm_u.immediate.kqim_payload); + } + + *mhdr = *hdr; + + ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, 0, nob); -#if MULTIRAIL_EKC - ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, - 0, KQSW_HDR_SIZE); -#else - ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; - ktx->ktx_frags[0].Len = KQSW_HDR_SIZE; -#endif if (payload_kiov != NULL) rc = kqswnal_map_tx_kiov (ktx, payload_offset, payload_nob, payload_niov, payload_kiov); @@ -1246,175 +1224,61 @@ kqswnal_sendmsg (lib_nal_t *nal, payload_niov, payload_iov); if (rc != 0) goto 
out; + +#if KQSW_CKSUM + msg->kqm_nob = nob + payload_nob; + msg->kqm_cksum = 0; + msg->kqm_cksum = kqswnal_csum(~0, (char *)msg, nob); + + msg->kqm_cksum = (payload_kiov != NULL) ? + kqswnal_csum_kiov(msg->kqm_cksum, + payload_offset, payload_nob, + payload_niov, payload_kiov) : + kqswnal_csum_iov(msg->kqm_cksum, + payload_offset, payload_nob, + payload_niov, payload_iov); + + if (*kqswnal_tunables.kqn_inject_csum_error == 1) { + msg->kqm_cksum++; + *kqswnal_tunables.kqn_inject_csum_error = 0; + } +#endif + nob += payload_nob; } - ktx->ktx_port = (payload_nob <= KQSW_SMALLPAYLOAD) ? + ktx->ktx_port = (nob <= KQSW_SMALLMSG) ? EP_MSG_SVC_PORTALS_SMALL : EP_MSG_SVC_PORTALS_LARGE; rc = kqswnal_launch (ktx); out: - CDEBUG(rc == 0 ? D_NET : D_ERROR, - "%s "LPSZ" bytes to "LPX64" via "LPX64": rc %d\n", - rc == 0 ? "Sent" : "Failed to send", - payload_nob, nid, targetnid, rc); + CDEBUG(rc == 0 ? D_NET : D_NETERROR, "%s %d bytes to %s%s: rc %d\n", + routing ? (rc == 0 ? "Routed" : "Failed to route") : + (rc == 0 ? "Sent" : "Failed to send"), + nob, libcfs_nid2str(target.nid), + target_is_router ? "(router)" : "", rc); if (rc != 0) { - if (ktx->ktx_state == KTX_GETTING && - ktx->ktx_args[2] != NULL) { + lnet_msg_t *repmsg = (lnet_msg_t *)ktx->ktx_args[2]; + int state = ktx->ktx_state; + + kqswnal_put_idle_tx (ktx); + + if (state == KTX_GETTING && repmsg != NULL) { /* We committed to reply, but there was a problem * launching the GET. We can't avoid delivering a * REPLY event since we committed above, so we * pretend the GET succeeded but the REPLY * failed. */ rc = 0; - lib_finalize (&kqswnal_lib, private, libmsg, PTL_OK); - lib_finalize (&kqswnal_lib, private, - (lib_msg_t *)ktx->ktx_args[2], PTL_FAIL); + lnet_finalize (kqswnal_data.kqn_ni, lntmsg, 0); + lnet_finalize (kqswnal_data.kqn_ni, repmsg, -EIO); } - kqswnal_put_idle_tx (ktx); } atomic_dec(&kqswnal_data.kqn_pending_txs); - return (rc == 0 ? 
PTL_OK : PTL_FAIL); -} - -static ptl_err_t -kqswnal_send (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - struct iovec *payload_iov, - size_t payload_offset, - size_t payload_nob) -{ - return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid, - payload_niov, payload_iov, NULL, - payload_offset, payload_nob)); -} - -static ptl_err_t -kqswnal_send_pages (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - ptl_kiov_t *payload_kiov, - size_t payload_offset, - size_t payload_nob) -{ - return (kqswnal_sendmsg (nal, private, libmsg, hdr, type, nid, pid, - payload_niov, NULL, payload_kiov, - payload_offset, payload_nob)); -} - -void -kqswnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) -{ - int rc; - kqswnal_tx_t *ktx; - ptl_kiov_t *kiov = fwd->kprfd_kiov; - int niov = fwd->kprfd_niov; - int nob = fwd->kprfd_nob; - ptl_nid_t nid = fwd->kprfd_gateway_nid; - -#if KQSW_CHECKSUM - CERROR ("checksums for forwarded packets not implemented\n"); - LBUG (); -#endif - /* The router wants this NAL to forward a packet */ - CDEBUG (D_NET, "forwarding [%p] to "LPX64", payload: %d frags %d bytes\n", - fwd, nid, niov, nob); - - ktx = kqswnal_get_idle_tx (fwd, 0); - if (ktx == NULL) /* can't get txd right now */ - return; /* fwd will be scheduled when tx desc freed */ - - if (nid == kqswnal_lib.libnal_ni.ni_pid.nid) /* gateway is me */ - nid = fwd->kprfd_target_nid; /* target is final dest */ - - /* copy hdr into pre-mapped buffer */ - memcpy(ktx->ktx_buffer, fwd->kprfd_hdr, sizeof(ptl_hdr_t)); - - ktx->ktx_port = (nob <= KQSW_SMALLPAYLOAD) ? - EP_MSG_SVC_PORTALS_SMALL : EP_MSG_SVC_PORTALS_LARGE; - ktx->ktx_nid = nid; - ktx->ktx_state = KTX_FORWARDING; - ktx->ktx_args[0] = fwd; - ktx->ktx_nfrag = ktx->ktx_firsttmpfrag = 1; - - if (kqswnal_nid2elanid (nid) < 0) { - CERROR("Can't forward [%p] to "LPX64": not a peer\n", fwd, nid); - rc = -EHOSTUNREACH; - goto out; - } - - if (nob <= KQSW_TX_MAXCONTIG) - { - /* send payload from ktx's pre-mapped contiguous buffer */ -#if MULTIRAIL_EKC - ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, - 0, KQSW_HDR_SIZE + nob); -#else - ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; - ktx->ktx_frags[0].Len = KQSW_HDR_SIZE + nob; -#endif - if (nob > 0) - lib_copy_kiov2buf(ktx->ktx_buffer + KQSW_HDR_SIZE, - niov, kiov, 0, nob); - } - else - { - /* zero copy payload */ -#if MULTIRAIL_EKC - ep_nmd_subset(&ktx->ktx_frags[0], &ktx->ktx_ebuffer, - 0, KQSW_HDR_SIZE); -#else - ktx->ktx_frags[0].Base = ktx->ktx_ebuffer; - ktx->ktx_frags[0].Len = KQSW_HDR_SIZE; -#endif - rc = kqswnal_map_tx_kiov (ktx, 0, nob, niov, kiov); - if (rc != 0) - goto out; - } - - rc = kqswnal_launch (ktx); - out: - if (rc != 0) { - CERROR ("Failed to forward [%p] to "LPX64": %d\n", fwd, nid, rc); - - /* complete now (with failure) */ - kqswnal_tx_done (ktx, rc); - } - - atomic_dec(&kqswnal_data.kqn_pending_txs); -} - -void -kqswnal_fwd_callback (void *arg, int error) -{ - kqswnal_rx_t *krx = (kqswnal_rx_t *)arg; - - /* The router has finished forwarding this packet */ - - if (error != 0) - { - ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_kiov[0].kiov_page); - - CERROR("Failed to route packet from "LPX64" to "LPX64": %d\n", - le64_to_cpu(hdr->src_nid), le64_to_cpu(hdr->dest_nid),error); - } - - LASSERT (atomic_read(&krx->krx_refcount) == 1); - kqswnal_rx_decref (krx); + return (rc == 0 ? 
0 : -EIO); } void @@ -1425,7 +1289,6 @@ kqswnal_requeue_rx (kqswnal_rx_t *krx) krx->krx_state = KRX_POSTED; -#if MULTIRAIL_EKC if (kqswnal_data.kqn_shuttingdown) { /* free EKC rxd on shutdown */ ep_complete_receive(krx->krx_rxd); @@ -1435,26 +1298,6 @@ kqswnal_requeue_rx (kqswnal_rx_t *krx) kqswnal_rxhandler, krx, &krx->krx_elanbuffer, 0); } -#else - if (kqswnal_data.kqn_shuttingdown) - return; - - if (krx->krx_rxd == NULL) { - /* We had a failed ep_complete_rpc() which nukes the - * descriptor in "old" EKC */ - int eprc = ep_queue_receive(krx->krx_eprx, - kqswnal_rxhandler, krx, - krx->krx_elanbuffer, - krx->krx_npages * PAGE_SIZE, 0); - LASSERT (eprc == EP_SUCCESS); - /* We don't handle failure here; it's incredibly rare - * (never reported?) and only happens with "old" EKC */ - } else { - ep_requeue_receive(krx->krx_rxd, kqswnal_rxhandler, krx, - krx->krx_elanbuffer, - krx->krx_npages * PAGE_SIZE); - } -#endif } void @@ -1477,33 +1320,23 @@ void kqswnal_rx_done (kqswnal_rx_t *krx) { int rc; - EP_STATUSBLK *sblk; LASSERT (atomic_read(&krx->krx_refcount) == 0); if (krx->krx_rpc_reply_needed) { /* We've not completed the peer's RPC yet... */ - sblk = (krx->krx_rpc_reply_status == 0) ? - &kqswnal_data.kqn_rpc_success : - &kqswnal_data.kqn_rpc_failed; + krx->krx_rpc_reply.msg.magic = LNET_PROTO_QSW_MAGIC; + krx->krx_rpc_reply.msg.version = QSWLND_PROTO_VERSION; LASSERT (!in_interrupt()); -#if MULTIRAIL_EKC - rc = ep_complete_rpc(krx->krx_rxd, - kqswnal_rpc_complete, krx, - sblk, NULL, NULL, 0); - if (rc == EP_SUCCESS) - return; -#else + rc = ep_complete_rpc(krx->krx_rxd, kqswnal_rpc_complete, krx, - sblk, NULL, 0); + &krx->krx_rpc_reply.ep_statusblk, + NULL, NULL, 0); if (rc == EP_SUCCESS) return; - /* "old" EKC destroys rxd on failed completion */ - krx->krx_rxd = NULL; -#endif CERROR("can't complete RPC: %d\n", rc); krx->krx_rpc_reply_needed = 0; } @@ -1514,60 +1347,199 @@ kqswnal_rx_done (kqswnal_rx_t *krx) void kqswnal_parse (kqswnal_rx_t *krx) { - ptl_hdr_t *hdr = (ptl_hdr_t *) page_address(krx->krx_kiov[0].kiov_page); - ptl_nid_t dest_nid = le64_to_cpu(hdr->dest_nid); - int payload_nob; + lnet_ni_t *ni = kqswnal_data.kqn_ni; + kqswnal_msg_t *msg = (kqswnal_msg_t *)page_address(krx->krx_kiov[0].kiov_page); + lnet_nid_t fromnid = kqswnal_rx_nid(krx); + int swab; + int n; + int i; int nob; - int niov; + int rc; LASSERT (atomic_read(&krx->krx_refcount) == 1); - if (dest_nid == kqswnal_lib.libnal_ni.ni_pid.nid) { /* It's for me :) */ - /* I ignore parse errors since I'm not consuming a byte - * stream */ - (void)lib_parse (&kqswnal_lib, hdr, krx); - - /* Drop my ref; any RDMA activity takes an additional ref */ - kqswnal_rx_decref(krx); - return; + /* If ln_ptlcompat is set, peers may send me an "old" unencapsulated + * lnet hdr */ + LASSERT (offsetof(kqswnal_msg_t, kqm_u) <= sizeof(lnet_hdr_t)); + + if (krx->krx_nob < offsetof(kqswnal_msg_t, kqm_u)) { + CERROR("Short message %d received from %s\n", + krx->krx_nob, libcfs_nid2str(fromnid)); + goto done; } -#if KQSW_CHECKSUM - LASSERTF (0, "checksums for forwarded packets not implemented\n"); + swab = msg->kqm_magic == __swab32(LNET_PROTO_QSW_MAGIC); + + if (swab || msg->kqm_magic == LNET_PROTO_QSW_MAGIC) { +#if KQSW_CKSUM + __u32 csum0; + __u32 csum1; + + /* csum byte array before swab */ + csum1 = msg->kqm_cksum; + msg->kqm_cksum = 0; + csum0 = kqswnal_csum_kiov(~0, 0, krx->krx_nob, + krx->krx_npages, krx->krx_kiov); + msg->kqm_cksum = csum1; #endif - if (kqswnal_nid2elanid (dest_nid) >= 0) /* should have gone direct to peer */ - { - 
CERROR("dropping packet from "LPX64" for "LPX64 - ": target is peer\n", le64_to_cpu(hdr->src_nid), dest_nid); + if (swab) { + __swab16s(&msg->kqm_version); + __swab16s(&msg->kqm_type); +#if KQSW_CKSUM + __swab32s(&msg->kqm_cksum); + __swab32s(&msg->kqm_nob); +#endif + } - kqswnal_rx_decref (krx); - return; + if (msg->kqm_version != QSWLND_PROTO_VERSION) { + /* Future protocol version compatibility support! + * The next qswlnd-specific protocol rev will first + * send an RPC to check version. + * 1.4.6 and 1.4.7.early reply with a status + * block containing its current version. + * Later versions send a failure (-ve) status + + * magic/version */ + + if (!krx->krx_rpc_reply_needed) { + CERROR("Unexpected version %d from %s\n", + msg->kqm_version, libcfs_nid2str(fromnid)); + goto done; + } + + LASSERT (krx->krx_rpc_reply.msg.status == -EPROTO); + goto done; + } + + switch (msg->kqm_type) { + default: + CERROR("Bad request type %x from %s\n", + msg->kqm_type, libcfs_nid2str(fromnid)); + goto done; + + case QSWLND_MSG_IMMEDIATE: + if (krx->krx_rpc_reply_needed) { + /* Should have been a simple message */ + CERROR("IMMEDIATE sent as RPC from %s\n", + libcfs_nid2str(fromnid)); + goto done; + } + + nob = offsetof(kqswnal_msg_t, kqm_u.immediate.kqim_payload); + if (krx->krx_nob < nob) { + CERROR("Short IMMEDIATE %d(%d) from %s\n", + krx->krx_nob, nob, libcfs_nid2str(fromnid)); + goto done; + } + +#if KQSW_CKSUM + if (csum0 != msg->kqm_cksum) { + CERROR("Bad IMMEDIATE checksum %08x(%08x) from %s\n", + csum0, msg->kqm_cksum, libcfs_nid2str(fromnid)); + CERROR("nob %d (%d)\n", krx->krx_nob, msg->kqm_nob); + goto done; + } +#endif + rc = lnet_parse(ni, &msg->kqm_u.immediate.kqim_hdr, + fromnid, krx, 0); + if (rc < 0) + goto done; + return; + + case QSWLND_MSG_RDMA: + if (!krx->krx_rpc_reply_needed) { + /* Should have been a simple message */ + CERROR("RDMA sent as simple message from %s\n", + libcfs_nid2str(fromnid)); + goto done; + } + + nob = offsetof(kqswnal_msg_t, + kqm_u.rdma.kqrm_rmd.kqrmd_frag[0]); + if (krx->krx_nob < nob) { + CERROR("Short RDMA message %d(%d) from %s\n", + krx->krx_nob, nob, libcfs_nid2str(fromnid)); + goto done; + } + + if (swab) + __swab32s(&msg->kqm_u.rdma.kqrm_rmd.kqrmd_nfrag); + + n = msg->kqm_u.rdma.kqrm_rmd.kqrmd_nfrag; + nob = offsetof(kqswnal_msg_t, + kqm_u.rdma.kqrm_rmd.kqrmd_frag[n]); + + if (krx->krx_nob < nob) { + CERROR("short RDMA message %d(%d) from %s\n", + krx->krx_nob, nob, libcfs_nid2str(fromnid)); + goto done; + } + + if (swab) { + for (i = 0; i < n; i++) { + EP_NMD *nmd = &msg->kqm_u.rdma.kqrm_rmd.kqrmd_frag[i]; + + __swab32s(&nmd->nmd_addr); + __swab32s(&nmd->nmd_len); + __swab32s(&nmd->nmd_attr); + } + } + +#if KQSW_CKSUM + krx->krx_cksum = csum0; /* stash checksum so far */ +#endif + rc = lnet_parse(ni, &msg->kqm_u.rdma.kqrm_hdr, + fromnid, krx, 1); + if (rc < 0) + goto done; + return; + } + /* Not Reached */ } - nob = payload_nob = krx->krx_nob - KQSW_HDR_SIZE; - niov = 0; - if (nob > 0) { - krx->krx_kiov[0].kiov_offset = KQSW_HDR_SIZE; - krx->krx_kiov[0].kiov_len = MIN(PAGE_SIZE - KQSW_HDR_SIZE, nob); - niov = 1; - nob -= PAGE_SIZE - KQSW_HDR_SIZE; - - while (nob > 0) { - LASSERT (niov < krx->krx_npages); - - krx->krx_kiov[niov].kiov_offset = 0; - krx->krx_kiov[niov].kiov_len = MIN(PAGE_SIZE, nob); - niov++; - nob -= PAGE_SIZE; + if (msg->kqm_magic == LNET_PROTO_MAGIC || + msg->kqm_magic == __swab32(LNET_PROTO_MAGIC)) { + /* Future protocol version compatibility support! 
+ * When LNET unifies protocols over all LNDs, the first thing a + * peer will send will be a version query RPC. + * 1.4.6 and 1.4.7.early reply with a status block containing + * LNET_PROTO_QSW_MAGIC.. + * Later versions send a failure (-ve) status + + * magic/version */ + + if (!krx->krx_rpc_reply_needed) { + CERROR("Unexpected magic %08x from %s\n", + msg->kqm_magic, libcfs_nid2str(fromnid)); + goto done; } + + LASSERT (krx->krx_rpc_reply.msg.status == -EPROTO); + goto done; } - kpr_fwd_init (&krx->krx_fwd, dest_nid, - hdr, payload_nob, niov, krx->krx_kiov, - kqswnal_fwd_callback, krx); + if (the_lnet.ln_ptlcompat != 0) { + /* Portals compatibility (strong or weak) + * This could be an unencapsulated LNET header. If it's big + * enough, let LNET's parser sort it out */ + + if (krx->krx_nob < sizeof(lnet_hdr_t)) { + CERROR("Short portals-compatible message from %s\n", + libcfs_nid2str(fromnid)); + goto done; + } + + krx->krx_raw_lnet_hdr = 1; + rc = lnet_parse(ni, (lnet_hdr_t *)msg, + fromnid, krx, krx->krx_rpc_reply_needed); + if (rc < 0) + goto done; + return; + } - kpr_fwd_start (&kqswnal_data.kqn_router, &krx->krx_fwd); + CERROR("Unrecognised magic %08x from %s\n", + msg->kqm_magic, libcfs_nid2str(fromnid)); + done: + kqswnal_rx_decref(krx); } /* Receive Interrupt Handler: posts to schedulers */ @@ -1578,7 +1550,6 @@ kqswnal_rxhandler(EP_RXD *rxd) int nob = ep_rxd_len (rxd); int status = ep_rxd_status (rxd); kqswnal_rx_t *krx = (kqswnal_rx_t *)ep_rxd_arg (rxd); - CDEBUG(D_NET, "kqswnal_rxhandler: rxd %p, krx %p, nob %d, status %d\n", rxd, krx, nob, status); @@ -1588,6 +1559,7 @@ kqswnal_rxhandler(EP_RXD *rxd) krx->krx_state = KRX_PARSE; krx->krx_rxd = rxd; krx->krx_nob = nob; + krx->krx_raw_lnet_hdr = 0; /* RPC reply iff rpc request received without error */ krx->krx_rpc_reply_needed = ep_rxd_isrpc(rxd) && @@ -1595,24 +1567,16 @@ kqswnal_rxhandler(EP_RXD *rxd) status == EP_MSG_TOO_BIG); /* Default to failure if an RPC reply is requested but not handled */ - krx->krx_rpc_reply_status = -EPROTO; + krx->krx_rpc_reply.msg.status = -EPROTO; atomic_set (&krx->krx_refcount, 1); - /* must receive a whole header to be able to parse */ - if (status != EP_SUCCESS || nob < sizeof (ptl_hdr_t)) - { + if (status != EP_SUCCESS) { /* receives complete with failure when receiver is removed */ -#if MULTIRAIL_EKC if (status == EP_SHUTDOWN) LASSERT (kqswnal_data.kqn_shuttingdown); else CERROR("receive status failed with status %d nob %d\n", ep_rxd_status(rxd), nob); -#else - if (!kqswnal_data.kqn_shuttingdown) - CERROR("receive status failed with status %d nob %d\n", - ep_rxd_status(rxd), nob); -#endif kqswnal_rx_decref(krx); return; } @@ -1630,249 +1594,124 @@ kqswnal_rxhandler(EP_RXD *rxd) spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); } -#if KQSW_CHECKSUM -void -kqswnal_csum_error (kqswnal_rx_t *krx, int ishdr) -{ - ptl_hdr_t *hdr = (ptl_hdr_t *)page_address (krx->krx_kiov[0].kiov_page); - - CERROR ("%s checksum mismatch %p: dnid "LPX64", snid "LPX64 - ", dpid %d, spid %d, type %d\n", - ishdr ? 
"Header" : "Payload", krx, - le64_to_cpu(hdr->dest_nid), le64_to_cpu(hdr->src_nid) - le32_to_cpu(hdr->dest_pid), le32_to_cpu(hdr->src_pid), - le32_to_cpu(hdr->type)); - - switch (le32_to_cpu(hdr->type)) - { - case PTL_MSG_ACK: - CERROR("ACK: mlen %d dmd "LPX64"."LPX64" match "LPX64 - " len %u\n", - le32_to_cpu(hdr->msg.ack.mlength), - hdr->msg.ack.dst_wmd.handle_cookie, - hdr->msg.ack.dst_wmd.handle_idx, - le64_to_cpu(hdr->msg.ack.match_bits), - le32_to_cpu(hdr->msg.ack.length)); - break; - case PTL_MSG_PUT: - CERROR("PUT: ptl %d amd "LPX64"."LPX64" match "LPX64 - " len %u off %u data "LPX64"\n", - le32_to_cpu(hdr->msg.put.ptl_index), - hdr->msg.put.ack_wmd.handle_cookie, - hdr->msg.put.ack_wmd.handle_idx, - le64_to_cpu(hdr->msg.put.match_bits), - le32_to_cpu(hdr->msg.put.length), - le32_to_cpu(hdr->msg.put.offset), - hdr->msg.put.hdr_data); - break; - case PTL_MSG_GET: - CERROR ("GET: <>\n"); - break; - case PTL_MSG_REPLY: - CERROR ("REPLY: <>\n"); - break; - default: - CERROR ("TYPE?: <>\n"); - } -} -#endif - -static ptl_err_t -kqswnal_recvmsg (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, - ptl_kiov_t *kiov, - size_t offset, - size_t mlen, - size_t rlen) +int +kqswnal_recv (lnet_ni_t *ni, + void *private, + lnet_msg_t *lntmsg, + int delayed, + unsigned int niov, + struct iovec *iov, + lnet_kiov_t *kiov, + unsigned int offset, + unsigned int mlen, + unsigned int rlen) { - kqswnal_rx_t *krx = (kqswnal_rx_t *)private; - char *buffer = page_address(krx->krx_kiov[0].kiov_page); - ptl_hdr_t *hdr = (ptl_hdr_t *)buffer; - int page; - char *page_ptr; - int page_nob; - char *iov_ptr; - int iov_nob; - int frag; - int rc; -#if KQSW_CHECKSUM - kqsw_csum_t senders_csum; - kqsw_csum_t payload_csum = 0; - kqsw_csum_t hdr_csum = kqsw_csum(0, hdr, sizeof(*hdr)); - size_t csum_len = mlen; - int csum_frags = 0; - int csum_nob = 0; - static atomic_t csum_counter; - int csum_verbose = (atomic_read(&csum_counter)%1000001) == 0; - - atomic_inc (&csum_counter); - - memcpy (&senders_csum, buffer + sizeof (ptl_hdr_t), sizeof (kqsw_csum_t)); - if (senders_csum != hdr_csum) - kqswnal_csum_error (krx, 1); -#endif - /* NB lib_parse() has already flipped *hdr */ - - CDEBUG(D_NET,"kqswnal_recv, mlen="LPSZ", rlen="LPSZ"\n", mlen, rlen); - - if (libmsg == NULL) { /* portals is discarding. */ - LASSERT (mlen == 0); - return PTL_OK; /* ignored by caller! */ - } - - if (krx->krx_rpc_reply_needed && - hdr->type == PTL_MSG_PUT) { - /* This must be an optimized PUT */ - rc = kqswnal_rdma (krx, libmsg, PTL_MSG_PUT, - niov, iov, kiov, offset, mlen); - return (rc == 0 ? PTL_OK : PTL_FAIL); - } - - /* What was actually received must be >= payload. 
*/ - LASSERT (mlen <= rlen); - if (krx->krx_nob < KQSW_HDR_SIZE + mlen) { - CERROR("Bad message size: have %d, need %d + %d\n", - krx->krx_nob, (int)KQSW_HDR_SIZE, (int)mlen); - return (PTL_FAIL); - } + kqswnal_rx_t *krx = (kqswnal_rx_t *)private; + lnet_nid_t fromnid; + kqswnal_msg_t *msg; + lnet_hdr_t *hdr; + kqswnal_remotemd_t *rmd; + int msg_offset; + int rc; - /* It must be OK to kmap() if required */ - LASSERT (kiov == NULL || !in_interrupt ()); + LASSERT (!in_interrupt ()); /* OK to map */ /* Either all pages or all vaddrs */ LASSERT (!(kiov != NULL && iov != NULL)); - if (mlen != 0) { - page = 0; - page_ptr = buffer + KQSW_HDR_SIZE; - page_nob = PAGE_SIZE - KQSW_HDR_SIZE; - - LASSERT (niov > 0); + fromnid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ep_rxd_node(krx->krx_rxd)); + msg = (kqswnal_msg_t *)page_address(krx->krx_kiov[0].kiov_page); - if (kiov != NULL) { - /* skip complete frags */ - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; - kiov++; - niov--; - LASSERT (niov > 0); - } - iov_ptr = ((char *)kmap (kiov->kiov_page)) + - kiov->kiov_offset + offset; - iov_nob = kiov->kiov_len - offset; + if (krx->krx_rpc_reply_needed) { + /* optimized (rdma) request sent as RPC */ + + if (krx->krx_raw_lnet_hdr) { + LASSERT (the_lnet.ln_ptlcompat != 0); + hdr = (lnet_hdr_t *)msg; + rmd = kqswnal_get_portalscompat_rmd(krx); + if (rmd == NULL) + return (-EPROTO); } else { - /* skip complete frags */ - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - niov--; - LASSERT (niov > 0); - } - iov_ptr = iov->iov_base + offset; - iov_nob = iov->iov_len - offset; + LASSERT (msg->kqm_type == QSWLND_MSG_RDMA); + hdr = &msg->kqm_u.rdma.kqrm_hdr; + rmd = &msg->kqm_u.rdma.kqrm_rmd; } - - for (;;) - { - frag = mlen; - if (frag > page_nob) - frag = page_nob; - if (frag > iov_nob) - frag = iov_nob; - - memcpy (iov_ptr, page_ptr, frag); -#if KQSW_CHECKSUM - payload_csum = kqsw_csum (payload_csum, iov_ptr, frag); - csum_nob += frag; - csum_frags++; -#endif - mlen -= frag; - if (mlen == 0) + + /* NB header is still in wire byte order */ + + switch (le32_to_cpu(hdr->type)) { + case LNET_MSG_PUT: + case LNET_MSG_REPLY: + /* This is an optimized PUT/REPLY */ + rc = kqswnal_rdma(krx, lntmsg, + KTX_RDMA_FETCH, rmd, + niov, iov, kiov, offset, mlen); break; - page_nob -= frag; - if (page_nob != 0) - page_ptr += frag; - else - { - page++; - LASSERT (page < krx->krx_npages); - page_ptr = page_address(krx->krx_kiov[page].kiov_page); - page_nob = PAGE_SIZE; - } + case LNET_MSG_GET: +#if KQSW_CKSUM + if (krx->krx_cksum != msg->kqm_cksum) { + CERROR("Bad GET checksum %08x(%08x) from %s\n", + krx->krx_cksum, msg->kqm_cksum, + libcfs_nid2str(fromnid)); + rc = -EIO; + break; + } +#endif + if (lntmsg == NULL) { + /* No buffer match: my decref will + * complete the RPC with failure */ + rc = 0; + } else { + /* Matched something! 
*/ + rc = kqswnal_rdma(krx, lntmsg, + KTX_RDMA_STORE, rmd, + lntmsg->msg_niov, + lntmsg->msg_iov, + lntmsg->msg_kiov, + lntmsg->msg_offset, + lntmsg->msg_len); + } + break; - iov_nob -= frag; - if (iov_nob != 0) - iov_ptr += frag; - else if (kiov != NULL) { - kunmap (kiov->kiov_page); - kiov++; - niov--; - LASSERT (niov > 0); - iov_ptr = ((char *)kmap (kiov->kiov_page)) + kiov->kiov_offset; - iov_nob = kiov->kiov_len; - } else { - iov++; - niov--; - LASSERT (niov > 0); - iov_ptr = iov->iov_base; - iov_nob = iov->iov_len; - } + default: + CERROR("Bad RPC type %d\n", + le32_to_cpu(hdr->type)); + rc = -EPROTO; + break; } - if (kiov != NULL) - kunmap (kiov->kiov_page); + kqswnal_rx_decref(krx); + return rc; } -#if KQSW_CHECKSUM - memcpy (&senders_csum, buffer + sizeof(ptl_hdr_t) + sizeof(kqsw_csum_t), - sizeof(kqsw_csum_t)); - - if (csum_len != rlen) - CERROR("Unable to checksum data in user's buffer\n"); - else if (senders_csum != payload_csum) - kqswnal_csum_error (krx, 0); - - if (csum_verbose) - CERROR("hdr csum %lx, payload_csum %lx, csum_frags %d, " - "csum_nob %d\n", - hdr_csum, payload_csum, csum_frags, csum_nob); -#endif - lib_finalize(nal, private, libmsg, PTL_OK); - - return (PTL_OK); -} + if (krx->krx_raw_lnet_hdr) { + LASSERT (the_lnet.ln_ptlcompat != 0); + msg_offset = sizeof(lnet_hdr_t); + } else { + LASSERT (msg->kqm_type == QSWLND_MSG_IMMEDIATE); + msg_offset = offsetof(kqswnal_msg_t, kqm_u.immediate.kqim_payload); + } + + if (krx->krx_nob < msg_offset + rlen) { + CERROR("Bad message size from %s: have %d, need %d + %d\n", + libcfs_nid2str(fromnid), krx->krx_nob, + msg_offset, rlen); + kqswnal_rx_decref(krx); + return -EPROTO; + } -static ptl_err_t -kqswnal_recv(lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, - size_t offset, - size_t mlen, - size_t rlen) -{ - return (kqswnal_recvmsg(nal, private, libmsg, - niov, iov, NULL, - offset, mlen, rlen)); -} + if (kiov != NULL) + lnet_copy_kiov2kiov(niov, kiov, offset, + krx->krx_npages, krx->krx_kiov, + msg_offset, mlen); + else + lnet_copy_kiov2iov(niov, iov, offset, + krx->krx_npages, krx->krx_kiov, + msg_offset, mlen); -static ptl_err_t -kqswnal_recv_pages (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - unsigned int niov, - ptl_kiov_t *kiov, - size_t offset, - size_t mlen, - size_t rlen) -{ - return (kqswnal_recvmsg(nal, private, libmsg, - niov, NULL, kiov, - offset, mlen, rlen)); + lnet_finalize(ni, lntmsg, 0); + kqswnal_rx_decref(krx); + return 0; } int @@ -1898,14 +1737,13 @@ kqswnal_scheduler (void *arg) { kqswnal_rx_t *krx; kqswnal_tx_t *ktx; - kpr_fwd_desc_t *fwd; unsigned long flags; int rc; int counter = 0; int did_something; - kportal_daemonize ("kqswnal_sched"); - kportal_blockallsigs (); + cfs_daemonize ("kqswnal_sched"); + cfs_block_allsigs (); spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); @@ -1921,49 +1759,42 @@ kqswnal_scheduler (void *arg) spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, flags); - switch (krx->krx_state) { - case KRX_PARSE: - kqswnal_parse (krx); - break; - case KRX_COMPLETING: - kqswnal_rx_decref (krx); - break; - default: - LBUG(); - } + LASSERT (krx->krx_state == KRX_PARSE); + kqswnal_parse (krx); did_something = 1; spin_lock_irqsave(&kqswnal_data.kqn_sched_lock, flags); } - if (!list_empty (&kqswnal_data.kqn_delayedtxds)) + if (!list_empty (&kqswnal_data.kqn_donetxds)) { - ktx = list_entry(kqswnal_data.kqn_delayedtxds.next, - kqswnal_tx_t, ktx_delayed_list); - list_del_init (&ktx->ktx_delayed_list); + ktx = 
list_entry(kqswnal_data.kqn_donetxds.next, + kqswnal_tx_t, ktx_schedlist); + list_del_init (&ktx->ktx_schedlist); spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, flags); - rc = kqswnal_launch (ktx); - if (rc != 0) { - CERROR("Failed delayed transmit to "LPX64 - ": %d\n", ktx->ktx_nid, rc); - kqswnal_tx_done (ktx, rc); - } - atomic_dec (&kqswnal_data.kqn_pending_txs); + kqswnal_tx_done_in_thread_context(ktx); did_something = 1; spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); } - if (!list_empty (&kqswnal_data.kqn_delayedfwds)) + if (!list_empty (&kqswnal_data.kqn_delayedtxds)) { - fwd = list_entry (kqswnal_data.kqn_delayedfwds.next, kpr_fwd_desc_t, kprfd_list); - list_del (&fwd->kprfd_list); - spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + ktx = list_entry(kqswnal_data.kqn_delayedtxds.next, + kqswnal_tx_t, ktx_schedlist); + list_del_init (&ktx->ktx_schedlist); + spin_unlock_irqrestore(&kqswnal_data.kqn_sched_lock, + flags); - /* If we're shutting down, this will just requeue fwd on kqn_idletxd_fwdq */ - kqswnal_fwd_packet (NULL, fwd); + rc = kqswnal_launch (ktx); + if (rc != 0) { + CERROR("Failed delayed transmit to %s: %d\n", + libcfs_nid2str(ktx->ktx_nid), rc); + kqswnal_tx_done (ktx, rc); + } + atomic_dec (&kqswnal_data.kqn_pending_txs); did_something = 1; spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); @@ -1982,11 +1813,12 @@ kqswnal_scheduler (void *arg) * there's nothing left to do */ break; } - rc = wait_event_interruptible (kqswnal_data.kqn_sched_waitq, - kqswnal_data.kqn_shuttingdown == 2 || - !list_empty(&kqswnal_data.kqn_readyrxds) || - !list_empty(&kqswnal_data.kqn_delayedtxds) || - !list_empty(&kqswnal_data.kqn_delayedfwds)); + rc = wait_event_interruptible_exclusive ( + kqswnal_data.kqn_sched_waitq, + kqswnal_data.kqn_shuttingdown == 2 || + !list_empty(&kqswnal_data.kqn_readyrxds) || + !list_empty(&kqswnal_data.kqn_donetxds) || + !list_empty(&kqswnal_data.kqn_delayedtxds)); LASSERT (rc == 0); } else if (need_resched()) schedule (); @@ -1998,13 +1830,3 @@ kqswnal_scheduler (void *arg) kqswnal_thread_fini (); return (0); } - -lib_nal_t kqswnal_lib = -{ - libnal_data: &kqswnal_data, /* NAL private data */ - libnal_send: kqswnal_send, - libnal_send_pages: kqswnal_send_pages, - libnal_recv: kqswnal_recv, - libnal_recv_pages: kqswnal_recv_pages, - libnal_dist: kqswnal_dist -}; diff --git a/lnet/klnds/qswlnd/qswlnd_modparams.c b/lnet/klnds/qswlnd/qswlnd_modparams.c new file mode 100644 index 0000000..62f8924 --- /dev/null +++ b/lnet/klnds/qswlnd/qswlnd_modparams.c @@ -0,0 +1,149 @@ +/* + * Copyright (C) 2002-2004 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Portals, http://www.lustre.org + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "qswlnd.h" + +static int tx_maxcontig = (1<<10); +CFS_MODULE_PARM(tx_maxcontig, "i", int, 0444, + "maximum payload to de-fragment"); + +static int ntxmsgs = 256; +CFS_MODULE_PARM(ntxmsgs, "i", int, 0444, + "# 'normal' tx msg buffers"); + +static int credits = 128; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = 8; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, + "# per-peer concurrent sends"); + +static int nrxmsgs_large = 64; +CFS_MODULE_PARM(nrxmsgs_large, "i", int, 0444, + "# 'large' rx msg buffers"); + +static int ep_envelopes_large = 256; +CFS_MODULE_PARM(ep_envelopes_large, "i", int, 0444, + "# 'large' rx msg envelope buffers"); + +static int nrxmsgs_small = 256; +CFS_MODULE_PARM(nrxmsgs_small, "i", int, 0444, + "# 'small' rx msg buffers"); + +static int ep_envelopes_small = 2048; +CFS_MODULE_PARM(ep_envelopes_small, "i", int, 0444, + "# 'small' rx msg envelope buffers"); + +static int optimized_puts = (32<<10); +CFS_MODULE_PARM(optimized_puts, "i", int, 0644, + "zero-copy puts >= this size"); + +static int optimized_gets = 2048; +CFS_MODULE_PARM(optimized_gets, "i", int, 0644, + "zero-copy gets >= this size"); + +#if KQSW_CKSUM +static int inject_csum_error = 0; +CFS_MODULE_PARM(inject_csum_error, "i", int, 0644, + "test checksumming"); +#endif + +kqswnal_tunables_t kqswnal_tunables = { + .kqn_tx_maxcontig = &tx_maxcontig, + .kqn_ntxmsgs = &ntxmsgs, + .kqn_credits = &credits, + .kqn_peercredits = &peer_credits, + .kqn_nrxmsgs_large = &nrxmsgs_large, + .kqn_ep_envelopes_large = &ep_envelopes_large, + .kqn_nrxmsgs_small = &nrxmsgs_small, + .kqn_ep_envelopes_small = &ep_envelopes_small, + .kqn_optimized_puts = &optimized_puts, + .kqn_optimized_gets = &optimized_gets, +#if KQSW_CKSUM + .kqn_inject_csum_error = &inject_csum_error, +#endif +}; + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM +static ctl_table kqswnal_ctl_table[] = { + {1, "tx_maxcontig", &tx_maxcontig, + sizeof (int), 0444, NULL, &proc_dointvec}, + {2, "ntxmsgs", &ntxmsgs, + sizeof (int), 0444, NULL, &proc_dointvec}, + {3, "credits", &credits, + sizeof (int), 0444, NULL, &proc_dointvec}, + {4, "peer_credits", &peer_credits, + sizeof (int), 0444, NULL, &proc_dointvec}, + {5, "nrxmsgs_large", &nrxmsgs_large, + sizeof (int), 0444, NULL, &proc_dointvec}, + {6, "ep_envelopes_large", &ep_envelopes_large, + sizeof (int), 0444, NULL, &proc_dointvec}, + {7, "nrxmsgs_small", &nrxmsgs_small, + sizeof (int), 0444, NULL, &proc_dointvec}, + {8, "ep_envelopes_small", &ep_envelopes_small, + sizeof (int), 0444, NULL, &proc_dointvec}, + {9, "optimized_puts", &optimized_puts, + sizeof (int), 0644, NULL, &proc_dointvec}, + {10, "optimized_gets", &optimized_gets, + sizeof (int), 0644, NULL, &proc_dointvec}, +#if KQSW_CKSUM + {11, "inject_csum_error", &inject_csum_error, + sizeof (int), 0644, NULL, &proc_dointvec}, +#endif + {0} +}; + +static ctl_table kqswnal_top_ctl_table[] = { + {201, "qswnal", NULL, 0, 0555, kqswnal_ctl_table}, + {0} +}; + +int +kqswnal_tunables_init () +{ + kqswnal_tunables.kqn_sysctl = + register_sysctl_table(kqswnal_top_ctl_table, 0); + + if (kqswnal_tunables.kqn_sysctl == NULL) + CWARN("Can't setup /proc tunables\n"); + + return 0; +} + +void +kqswnal_tunables_fini () +{ + if (kqswnal_tunables.kqn_sysctl != NULL) + unregister_sysctl_table(kqswnal_tunables.kqn_sysctl); +} +#else +int +kqswnal_tunables_init () +{ + return 0; +} + +void +kqswnal_tunables_fini () +{ +} +#endif diff --git a/lnet/klnds/ralnd/Makefile.in b/lnet/klnds/ralnd/Makefile.in index 
1772cc2..e1f5e82 100644 --- a/lnet/klnds/ralnd/Makefile.in +++ b/lnet/klnds/ralnd/Makefile.in @@ -1,5 +1,5 @@ -MODULES := kranal -kranal-objs := ranal.o ranal_cb.o +MODULES := kralnd +kralnd-objs := ralnd.o ralnd_cb.o ralnd_modparams.o EXTRA_POST_CFLAGS := @RACPPFLAGS@ diff --git a/lnet/klnds/ralnd/autoMakefile.am b/lnet/klnds/ralnd/autoMakefile.am index 3bb7642..7f3df4c 100644 --- a/lnet/klnds/ralnd/autoMakefile.am +++ b/lnet/klnds/ralnd/autoMakefile.am @@ -4,12 +4,10 @@ # See the file COPYING in this distribution if MODULES -if !CRAY_PORTALS -if BUILD_RANAL -modulenet_DATA = kranal$(KMODEXT) -endif +if BUILD_RALND +modulenet_DATA = kralnd$(KMODEXT) endif endif -MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES = $(kranal-objs:%.o=%.c) ranal.h +MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ +DIST_SOURCES = $(kralnd-objs:%.o=%.c) ralnd.h diff --git a/lnet/klnds/ralnd/ralnd.c b/lnet/klnds/ralnd/ralnd.c index eb13d73..a0a4d93 100644 --- a/lnet/klnds/ralnd/ralnd.c +++ b/lnet/klnds/ralnd/ralnd.c @@ -20,201 +20,26 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * */ -#include "ranal.h" - -static int kranal_devids[] = {RAPK_MAIN_DEVICE_ID, - RAPK_EXPANSION_DEVICE_ID}; - -nal_t kranal_api; -ptl_handle_ni_t kranal_ni; -kra_data_t kranal_data; -kra_tunables_t kranal_tunables; - -#define RANAL_SYSCTL_TIMEOUT 1 -#define RANAL_SYSCTL_LISTENER_TIMEOUT 2 -#define RANAL_SYSCTL_BACKLOG 3 -#define RANAL_SYSCTL_PORT 4 -#define RANAL_SYSCTL_MAX_IMMEDIATE 5 - -#define RANAL_SYSCTL 202 - -static ctl_table kranal_ctl_table[] = { - {RANAL_SYSCTL_TIMEOUT, "timeout", - &kranal_tunables.kra_timeout, sizeof(int), - 0644, NULL, &proc_dointvec}, - {RANAL_SYSCTL_LISTENER_TIMEOUT, "listener_timeout", - &kranal_tunables.kra_listener_timeout, sizeof(int), - 0644, NULL, &proc_dointvec}, - {RANAL_SYSCTL_BACKLOG, "backlog", - &kranal_tunables.kra_backlog, sizeof(int), - 0644, NULL, kranal_listener_procint}, - {RANAL_SYSCTL_PORT, "port", - &kranal_tunables.kra_port, sizeof(int), - 0644, NULL, kranal_listener_procint}, - {RANAL_SYSCTL_MAX_IMMEDIATE, "max_immediate", - &kranal_tunables.kra_max_immediate, sizeof(int), - 0644, NULL, &proc_dointvec}, - { 0 } +#include "ralnd.h" + +static int kranal_devids[RANAL_MAXDEVS] = {RAPK_MAIN_DEVICE_ID, + RAPK_EXPANSION_DEVICE_ID}; + +lnd_t the_kralnd = { + .lnd_type = RALND, + .lnd_startup = kranal_startup, + .lnd_shutdown = kranal_shutdown, + .lnd_ctl = kranal_ctl, + .lnd_send = kranal_send, + .lnd_recv = kranal_recv, + .lnd_eager_recv = kranal_eager_recv, + .lnd_accept = kranal_accept, }; -static ctl_table kranal_top_ctl_table[] = { - {RANAL_SYSCTL, "ranal", NULL, 0, 0555, kranal_ctl_table}, - { 0 } -}; - -int -kranal_sock_write (struct socket *sock, void *buffer, int nob) -{ - int rc; - mm_segment_t oldmm = get_fs(); - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = MSG_DONTWAIT - }; - - /* We've set up the socket's send buffer to be large enough for - * everything we send, so a single non-blocking send should - * complete without error. 
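- * A short send is therefore unexpected and is mapped to -EAGAIN below.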
*/ - - set_fs(KERNEL_DS); - rc = sock_sendmsg(sock, &msg, iov.iov_len); - set_fs(oldmm); - - if (rc == nob) - return 0; - - if (rc >= 0) - return -EAGAIN; - - return rc; -} - -int -kranal_sock_read (struct socket *sock, void *buffer, int nob, int timeout) -{ - int rc; - mm_segment_t oldmm = get_fs(); - long ticks = timeout * HZ; - unsigned long then; - struct timeval tv; - - LASSERT (nob > 0); - LASSERT (ticks > 0); - - for (;;) { - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = 0 - }; - - /* Set receive timeout to remaining time */ - tv = (struct timeval) { - .tv_sec = ticks / HZ, - .tv_usec = ((ticks % HZ) * 1000000) / HZ - }; - set_fs(KERNEL_DS); - rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, - (char *)&tv, sizeof(tv)); - set_fs(oldmm); - if (rc != 0) { - CERROR("Can't set socket recv timeout %d: %d\n", - timeout, rc); - return rc; - } - - set_fs(KERNEL_DS); - then = jiffies; - rc = sock_recvmsg(sock, &msg, iov.iov_len, 0); - ticks -= jiffies - then; - set_fs(oldmm); - - if (rc < 0) - return rc; - - if (rc == 0) - return -ECONNABORTED; - - buffer = ((char *)buffer) + rc; - nob -= rc; - - if (nob == 0) - return 0; - - if (ticks <= 0) - return -ETIMEDOUT; - } -} - -int -kranal_create_sock(struct socket **sockp) -{ - struct socket *sock; - int rc; - int option; - mm_segment_t oldmm = get_fs(); - - rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock); - if (rc != 0) { - CERROR("Can't create socket: %d\n", rc); - return rc; - } - - /* Ensure sending connection info doesn't block */ - option = 2 * sizeof(kra_connreq_t); - set_fs(KERNEL_DS); - rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, - (char *)&option, sizeof(option)); - set_fs(oldmm); - if (rc != 0) { - CERROR("Can't set send buffer %d: %d\n", option, rc); - goto failed; - } - - option = 1; - set_fs(KERNEL_DS); - rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&option, sizeof(option)); - set_fs(oldmm); - if (rc != 0) { - CERROR("Can't set SO_REUSEADDR: %d\n", rc); - goto failed; - } - - *sockp = sock; - return 0; - - failed: - sock_release(sock); - return rc; -} - -void -kranal_pause(int ticks) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(ticks); -} +kra_data_t kranal_data; void -kranal_pack_connreq(kra_connreq_t *connreq, kra_conn_t *conn, ptl_nid_t dstnid) +kranal_pack_connreq(kra_connreq_t *connreq, kra_conn_t *conn, lnet_nid_t dstnid) { RAP_RETURN rrc; @@ -222,8 +47,13 @@ kranal_pack_connreq(kra_connreq_t *connreq, kra_conn_t *conn, ptl_nid_t dstnid) connreq->racr_magic = RANAL_MSG_MAGIC; connreq->racr_version = RANAL_MSG_VERSION; + + if (conn == NULL) /* prepping a "stub" reply */ + return; + connreq->racr_devid = conn->rac_device->rad_id; - connreq->racr_srcnid = kranal_lib.libnal_ni.ni_pid.nid; + connreq->racr_srcnid = lnet_ptlcompat_srcnid(kranal_data.kra_ni->ni_nid, + dstnid); connreq->racr_dstnid = dstnid; connreq->racr_peerstamp = kranal_data.kra_peerstamp; connreq->racr_connstamp = conn->rac_my_connstamp; @@ -234,22 +64,101 @@ kranal_pack_connreq(kra_connreq_t *connreq, kra_conn_t *conn, ptl_nid_t dstnid) } int -kranal_recv_connreq(struct socket *sock, kra_connreq_t *connreq, int timeout) +kranal_recv_connreq(struct socket *sock, kra_connreq_t *connreq, int active) { + int timeout = active ? 
*kranal_tunables.kra_timeout : + lnet_acceptor_timeout(); + int swab; int rc; - rc = kranal_sock_read(sock, connreq, sizeof(*connreq), timeout); + /* return 0 on success, -ve on error, +ve to tell the peer I'm "old" */ + + rc = libcfs_sock_read(sock, &connreq->racr_magic, + sizeof(connreq->racr_magic), timeout); if (rc != 0) { - CERROR("Read failed: %d\n", rc); - return rc; + CERROR("Read(magic) failed(1): %d\n", rc); + return -EIO; + } + + if (connreq->racr_magic != RANAL_MSG_MAGIC && + connreq->racr_magic != __swab32(RANAL_MSG_MAGIC)) { + /* Unexpected magic! */ + if (!active && + the_lnet.ln_ptlcompat == 0 && + (connreq->racr_magic == LNET_PROTO_MAGIC || + connreq->racr_magic == __swab32(LNET_PROTO_MAGIC))) { + /* future protocol version compatibility! + * When LNET unifies protocols over all LNDs, the first + * thing sent will be a version query. +ve rc means I + * reply with my current magic/version */ + return EPROTO; + } + + if (active || + the_lnet.ln_ptlcompat == 0) { + CERROR("Unexpected magic %08x (1)\n", + connreq->racr_magic); + return -EPROTO; + } + + /* When portals compatibility is set, I may be passed a new + * connection "blindly" by the acceptor, and I have to + * determine if my peer has sent an acceptor connection request + * or not. This isn't a connreq, so I'll get the acceptor to + * look at it... */ + rc = lnet_accept(kranal_data.kra_ni, sock, connreq->racr_magic); + if (rc != 0) + return -EPROTO; + + /* ...and if it's OK I'm back to looking for a connreq... */ + rc = libcfs_sock_read(sock, &connreq->racr_magic, + sizeof(connreq->racr_magic), timeout); + if (rc != 0) { + CERROR("Read(magic) failed(2): %d\n", rc); + return -EIO; + } + + if (connreq->racr_magic != RANAL_MSG_MAGIC && + connreq->racr_magic != __swab32(RANAL_MSG_MAGIC)) { + CERROR("Unexpected magic %08x (2)\n", + connreq->racr_magic); + return -EPROTO; + } + } + + swab = (connreq->racr_magic == __swab32(RANAL_MSG_MAGIC)); + + rc = libcfs_sock_read(sock, &connreq->racr_version, + sizeof(connreq->racr_version), timeout); + if (rc != 0) { + CERROR("Read(version) failed: %d\n", rc); + return -EIO; } - if (connreq->racr_magic != RANAL_MSG_MAGIC) { - if (__swab32(connreq->racr_magic) != RANAL_MSG_MAGIC) { - CERROR("Unexpected magic %08x\n", connreq->racr_magic); + if (swab) + __swab16s(&connreq->racr_version); + + if (connreq->racr_version != RANAL_MSG_VERSION) { + if (active) { + CERROR("Unexpected version %d\n", connreq->racr_version); return -EPROTO; } + /* If this is a future version of the ralnd protocol, and I'm + * passive (accepted the connection), tell my peer I'm "old" + * (+ve rc) */ + return EPROTO; + } + + rc = libcfs_sock_read(sock, &connreq->racr_devid, + sizeof(*connreq) - + offsetof(kra_connreq_t, racr_devid), + timeout); + if (rc != 0) { + CERROR("Read(body) failed: %d\n", rc); + return -EIO; + } + if (swab) { __swab32s(&connreq->racr_magic); __swab16s(&connreq->racr_version); __swab16s(&connreq->racr_devid); @@ -265,14 +174,9 @@ kranal_recv_connreq(struct socket *sock, kra_connreq_t *connreq, int timeout) __swab32s(&connreq->racr_riparams.CompletionCookie); } - if (connreq->racr_version != RANAL_MSG_VERSION) { - CERROR("Unexpected version %d\n", connreq->racr_version); - return -EPROTO; - } - - if (connreq->racr_srcnid == PTL_NID_ANY || - connreq->racr_dstnid == PTL_NID_ANY) { - CERROR("Received PTL_NID_ANY\n"); + if (connreq->racr_srcnid == LNET_NID_ANY || + connreq->racr_dstnid == LNET_NID_ANY) { + CERROR("Received LNET_NID_ANY\n"); return -EPROTO; } @@ -294,7 +198,7 @@
kranal_close_stale_conns_locked (kra_peer_t *peer, kra_conn_t *newconn) int loopback; int count = 0; - loopback = peer->rap_nid == kranal_lib.libnal_ni.ni_pid.nid; + loopback = peer->rap_nid == kranal_data.kra_ni->ni_nid; list_for_each_safe (ctmp, cnxt, &peer->rap_conns) { conn = list_entry(ctmp, kra_conn_t, rac_list); @@ -303,8 +207,9 @@ kranal_close_stale_conns_locked (kra_peer_t *peer, kra_conn_t *newconn) continue; if (conn->rac_peerstamp != newconn->rac_peerstamp) { - CDEBUG(D_NET, "Closing stale conn nid:"LPX64 - " peerstamp:"LPX64"("LPX64")\n", peer->rap_nid, + CDEBUG(D_NET, "Closing stale conn nid: %s " + " peerstamp:"LPX64"("LPX64")\n", + libcfs_nid2str(peer->rap_nid), conn->rac_peerstamp, newconn->rac_peerstamp); LASSERT (conn->rac_peerstamp < newconn->rac_peerstamp); count++; @@ -322,8 +227,9 @@ kranal_close_stale_conns_locked (kra_peer_t *peer, kra_conn_t *newconn) LASSERT (conn->rac_peer_connstamp < newconn->rac_peer_connstamp); - CDEBUG(D_NET, "Closing stale conn nid:"LPX64 - " connstamp:"LPX64"("LPX64")\n", peer->rap_nid, + CDEBUG(D_NET, "Closing stale conn nid: %s" + " connstamp:"LPX64"("LPX64")\n", + libcfs_nid2str(peer->rap_nid), conn->rac_peer_connstamp, newconn->rac_peer_connstamp); count++; @@ -340,7 +246,7 @@ kranal_conn_isdup_locked(kra_peer_t *peer, kra_conn_t *newconn) struct list_head *tmp; int loopback; - loopback = peer->rap_nid == kranal_lib.libnal_ni.ni_pid.nid; + loopback = peer->rap_nid == kranal_data.kra_ni->ni_nid; list_for_each(tmp, &peer->rap_conns) { conn = list_entry(tmp, kra_conn_t, rac_list); @@ -404,7 +310,7 @@ kranal_create_conn(kra_conn_t **connp, kra_device_t *dev) RAP_RETURN rrc; LASSERT (!in_interrupt()); - PORTAL_ALLOC(conn, sizeof(*conn)); + LIBCFS_ALLOC(conn, sizeof(*conn)); if (conn == NULL) return -ENOMEM; @@ -422,14 +328,14 @@ kranal_create_conn(kra_conn_t **connp, kra_device_t *dev) kranal_set_conn_uniqueness(conn); conn->rac_device = dev; - conn->rac_timeout = MAX(kranal_tunables.kra_timeout, RANAL_MIN_TIMEOUT); + conn->rac_timeout = MAX(*kranal_tunables.kra_timeout, RANAL_MIN_TIMEOUT); kranal_update_reaper_timeout(conn->rac_timeout); rrc = RapkCreateRi(dev->rad_handle, conn->rac_cqid, &conn->rac_rihandle); if (rrc != RAP_SUCCESS) { CERROR("RapkCreateRi failed: %d\n", rrc); - PORTAL_FREE(conn, sizeof(*conn)); + LIBCFS_FREE(conn, sizeof(*conn)); return -ENETDOWN; } @@ -460,7 +366,7 @@ kranal_destroy_conn(kra_conn_t *conn) if (conn->rac_peer != NULL) kranal_peer_decref(conn->rac_peer); - PORTAL_FREE(conn, sizeof(*conn)); + LIBCFS_FREE(conn, sizeof(*conn)); atomic_dec(&kranal_data.kra_nconns); } @@ -488,8 +394,9 @@ kranal_close_conn_locked (kra_conn_t *conn, int error) { kra_peer_t *peer = conn->rac_peer; - CDEBUG(error == 0 ? D_NET : D_ERROR, - "closing conn to "LPX64": error %d\n", peer->rap_nid, error); + CDEBUG(error == 0 ? 
D_NET : D_NETERROR, + "closing conn to %s: error %d\n", + libcfs_nid2str(peer->rap_nid), error); LASSERT (!in_interrupt()); LASSERT (conn->rac_state == RANAL_CONN_ESTABLISHED); @@ -576,10 +483,9 @@ kranal_set_conn_params(kra_conn_t *conn, kra_connreq_t *connreq, } int -kranal_passive_conn_handshake (struct socket *sock, ptl_nid_t *src_nidp, - ptl_nid_t *dst_nidp, kra_conn_t **connp) +kranal_passive_conn_handshake (struct socket *sock, lnet_nid_t *src_nidp, + lnet_nid_t *dst_nidp, kra_conn_t **connp) { - struct sockaddr_in addr; __u32 peer_ip; unsigned int peer_port; kra_connreq_t rx_connreq; @@ -587,33 +493,36 @@ kranal_passive_conn_handshake (struct socket *sock, ptl_nid_t *src_nidp, kra_conn_t *conn; kra_device_t *dev; int rc; - int len; int i; - len = sizeof(addr); - rc = sock->ops->getname(sock, (struct sockaddr *)&addr, &len, 2); + rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); if (rc != 0) { CERROR("Can't get peer's IP: %d\n", rc); return rc; } - peer_ip = ntohl(addr.sin_addr.s_addr); - peer_port = ntohs(addr.sin_port); + rc = kranal_recv_connreq(sock, &rx_connreq, 0); - if (peer_port >= 1024) { - CERROR("Refusing unprivileged connection from %u.%u.%u.%u/%d\n", - HIPQUAD(peer_ip), peer_port); - return -ECONNREFUSED; - } - - rc = kranal_recv_connreq(sock, &rx_connreq, - kranal_tunables.kra_listener_timeout); - if (rc != 0) { + if (rc < 0) { CERROR("Can't rx connreq from %u.%u.%u.%u/%d: %d\n", HIPQUAD(peer_ip), peer_port, rc); return rc; } + if (rc > 0) { + /* Request from "new" peer: send reply with my MAGIC/VERSION to + * tell her I'm old... */ + kranal_pack_connreq(&tx_connreq, NULL, LNET_NID_ANY); + + rc = libcfs_sock_write(sock, &tx_connreq, sizeof(tx_connreq), + lnet_acceptor_timeout()); + if (rc != 0) + CERROR("Can't tx stub connreq to %u.%u.%u.%u/%d: %d\n", + HIPQUAD(peer_ip), peer_port, rc); + + return -EPROTO; + } + for (i = 0;;i++) { if (i == kranal_data.kra_ndevs) { CERROR("Can't match dev %d from %u.%u.%u.%u/%d\n", @@ -631,7 +540,8 @@ kranal_passive_conn_handshake (struct socket *sock, ptl_nid_t *src_nidp, kranal_pack_connreq(&tx_connreq, conn, rx_connreq.racr_srcnid); - rc = kranal_sock_write(sock, &tx_connreq, sizeof(tx_connreq)); + rc = libcfs_sock_write(sock, &tx_connreq, sizeof(tx_connreq), + lnet_acceptor_timeout()); if (rc != 0) { CERROR("Can't tx connreq to %u.%u.%u.%u/%d: %d\n", HIPQUAD(peer_ip), peer_port, rc); @@ -652,72 +562,8 @@ kranal_passive_conn_handshake (struct socket *sock, ptl_nid_t *src_nidp, } int -ranal_connect_sock(kra_peer_t *peer, struct socket **sockp) -{ - struct sockaddr_in locaddr; - struct sockaddr_in srvaddr; - struct socket *sock; - unsigned int port; - int rc; - - for (port = 1023; port >= 512; port--) { - - memset(&locaddr, 0, sizeof(locaddr)); - locaddr.sin_family = AF_INET; - locaddr.sin_port = htons(port); - locaddr.sin_addr.s_addr = htonl(INADDR_ANY); - - memset (&srvaddr, 0, sizeof (srvaddr)); - srvaddr.sin_family = AF_INET; - srvaddr.sin_port = htons (peer->rap_port); - srvaddr.sin_addr.s_addr = htonl (peer->rap_ip); - - rc = kranal_create_sock(&sock); - if (rc != 0) - return rc; - - rc = sock->ops->bind(sock, - (struct sockaddr *)&locaddr, sizeof(locaddr)); - if (rc != 0) { - sock_release(sock); - - if (rc == -EADDRINUSE) { - CDEBUG(D_NET, "Port %d already in use\n", port); - continue; - } - - CERROR("Can't bind to reserved port %d: %d\n", port, rc); - return rc; - } - - rc = sock->ops->connect(sock, - (struct sockaddr *)&srvaddr, sizeof(srvaddr), - 0); - if (rc == 0) { - *sockp = sock; - return 0; - } - - 
sock_release(sock); - - if (rc != -EADDRNOTAVAIL) { - CERROR("Can't connect port %d to %u.%u.%u.%u/%d: %d\n", - port, HIPQUAD(peer->rap_ip), peer->rap_port, rc); - return rc; - } - - CDEBUG(D_NET, "Port %d not available for %u.%u.%u.%u/%d\n", - port, HIPQUAD(peer->rap_ip), peer->rap_port); - } - - /* all ports busy */ - return -EHOSTUNREACH; -} - - -int kranal_active_conn_handshake(kra_peer_t *peer, - ptl_nid_t *dst_nidp, kra_conn_t **connp) + lnet_nid_t *dst_nidp, kra_conn_t **connp) { kra_connreq_t connreq; kra_conn_t *conn; @@ -728,7 +574,7 @@ kranal_active_conn_handshake(kra_peer_t *peer, /* spread connections over all devices using both peer NIDs to ensure * all nids use all devices */ - idx = peer->rap_nid + kranal_lib.libnal_ni.ni_pid.nid; + idx = peer->rap_nid + kranal_data.kra_ni->ni_nid; dev = &kranal_data.kra_devices[idx % kranal_data.kra_ndevs]; rc = kranal_create_conn(&conn, dev); @@ -737,7 +583,22 @@ kranal_active_conn_handshake(kra_peer_t *peer, kranal_pack_connreq(&connreq, conn, peer->rap_nid); - rc = ranal_connect_sock(peer, &sock); + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto test */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + connreq.racr_version++; + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + connreq.racr_magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); + } + + rc = lnet_connect(&sock, peer->rap_nid, + 0, peer->rap_ip, peer->rap_port); if (rc != 0) goto failed_0; @@ -745,29 +606,31 @@ kranal_active_conn_handshake(kra_peer_t *peer, * immediately after accepting a connection, so we connect and then * send immediately. */ - rc = kranal_sock_write(sock, &connreq, sizeof(connreq)); + rc = libcfs_sock_write(sock, &connreq, sizeof(connreq), + lnet_acceptor_timeout()); if (rc != 0) { CERROR("Can't tx connreq to %u.%u.%u.%u/%d: %d\n", HIPQUAD(peer->rap_ip), peer->rap_port, rc); - goto failed_1; + goto failed_2; } - rc = kranal_recv_connreq(sock, &connreq, kranal_tunables.kra_timeout); + rc = kranal_recv_connreq(sock, &connreq, 1); if (rc != 0) { CERROR("Can't rx connreq from %u.%u.%u.%u/%d: %d\n", HIPQUAD(peer->rap_ip), peer->rap_port, rc); - goto failed_1; + goto failed_2; } - sock_release(sock); + libcfs_sock_release(sock); rc = -EPROTO; if (connreq.racr_srcnid != peer->rap_nid) { CERROR("Unexpected srcnid from %u.%u.%u.%u/%d: " - "received "LPX64" expected "LPX64"\n", + "received %s expected %s\n", HIPQUAD(peer->rap_ip), peer->rap_port, - connreq.racr_srcnid, peer->rap_nid); - goto failed_0; + libcfs_nid2str(connreq.racr_srcnid), + libcfs_nid2str(peer->rap_nid)); + goto failed_1; } if (connreq.racr_devid != dev->rad_id) { @@ -775,20 +638,23 @@ kranal_active_conn_handshake(kra_peer_t *peer, "received %d expected %d\n", HIPQUAD(peer->rap_ip), peer->rap_port, connreq.racr_devid, dev->rad_id); - goto failed_0; + goto failed_1; } rc = kranal_set_conn_params(conn, &connreq, peer->rap_ip, peer->rap_port); if (rc != 0) - goto failed_0; + goto failed_1; *connp = conn; *dst_nidp = connreq.racr_dstnid; return 0; + failed_2: + libcfs_sock_release(sock); failed_1: - sock_release(sock); + lnet_connect_console_error(rc, peer->rap_nid, + peer->rap_ip, peer->rap_port); failed_0: kranal_conn_decref(conn); return rc; @@ -799,8 +665,8 @@ kranal_conn_handshake (struct socket *sock, kra_peer_t *peer) { kra_peer_t *peer2; kra_tx_t *tx; - ptl_nid_t peer_nid; - ptl_nid_t dst_nid; + lnet_nid_t peer_nid; + lnet_nid_t dst_nid; unsigned long flags; kra_conn_t *conn; int rc; @@ -837,9 
+703,10 @@ kranal_conn_handshake (struct socket *sock, kra_peer_t *peer) return rc; /* assume this is a new peer */ - peer = kranal_create_peer(peer_nid); - if (peer == NULL) { - CERROR("Can't allocate peer for "LPX64"\n", peer_nid); + rc = kranal_create_peer(&peer, peer_nid); + if (rc != 0) { + CERROR("Can't create conn for %s\n", + libcfs_nid2str(peer_nid)); kranal_conn_decref(conn); return -ENOMEM; } @@ -861,12 +728,12 @@ kranal_conn_handshake (struct socket *sock, kra_peer_t *peer) /* Refuse connection if peer thinks we are a different NID. We check * this while holding the global lock, to synch with connection * destruction on NID change. */ - if (dst_nid != kranal_lib.libnal_ni.ni_pid.nid) { + if (!lnet_ptlcompat_matchnid(kranal_data.kra_ni->ni_nid, dst_nid)) { write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - CERROR("Stale/bad connection with "LPX64 - ": dst_nid "LPX64", expected "LPX64"\n", - peer_nid, dst_nid, kranal_lib.libnal_ni.ni_pid.nid); + CERROR("Stale/bad connection with %s: dst_nid %s, expected %s\n", + libcfs_nid2str(peer_nid), libcfs_nid2str(dst_nid), + libcfs_nid2str(kranal_data.kra_ni->ni_nid)); rc = -ESTALE; goto failed; } @@ -879,8 +746,8 @@ kranal_conn_handshake (struct socket *sock, kra_peer_t *peer) LASSERT (!list_empty(&peer->rap_conns)); LASSERT (list_empty(&peer->rap_tx_queue)); write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - CWARN("Not creating duplicate connection to "LPX64": %d\n", - peer_nid, rc); + CWARN("Not creating duplicate connection to %s: %d\n", + libcfs_nid2str(peer_nid), rc); rc = 0; goto failed; } @@ -918,10 +785,12 @@ kranal_conn_handshake (struct socket *sock, kra_peer_t *peer) /* CAVEAT EMPTOR: passive peer can disappear NOW */ if (nstale != 0) - CWARN("Closed %d stale conns to "LPX64"\n", nstale, peer_nid); + CWARN("Closed %d stale conns to %s\n", nstale, + libcfs_nid2str(peer_nid)); - CWARN("New connection to "LPX64" on devid[%d] = %d\n", - peer_nid, conn->rac_device->rad_idx, conn->rac_device->rad_id); + CWARN("New connection to %s on devid[%d] = %d\n", + libcfs_nid2str(peer_nid), + conn->rac_device->rad_idx, conn->rac_device->rad_id); /* Ensure conn gets checked. 
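* Schedule it now rather than waiting for a new event: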
Transmits may have been queued and an * FMA event may have happened before it got in the cq hash table */ @@ -945,11 +814,13 @@ kranal_connect (kra_peer_t *peer) LASSERT (peer->rap_connecting); - CDEBUG(D_NET, "About to handshake "LPX64"\n", peer->rap_nid); + CDEBUG(D_NET, "About to handshake %s\n", + libcfs_nid2str(peer->rap_nid)); rc = kranal_conn_handshake(NULL, peer); - CDEBUG(D_NET, "Done handshake "LPX64":%d \n", peer->rap_nid, rc); + CDEBUG(D_NET, "Done handshake %s:%d \n", + libcfs_nid2str(peer->rap_nid), rc); write_lock_irqsave(&kranal_data.kra_global_lock, flags); @@ -961,18 +832,21 @@ kranal_connect (kra_peer_t *peer) * success to avoid messages jumping the queue */ LASSERT (list_empty(&peer->rap_tx_queue)); - /* reset reconnection timeouts */ - peer->rap_reconnect_interval = RANAL_MIN_RECONNECT_INTERVAL; - peer->rap_reconnect_time = CURRENT_SECONDS; + peer->rap_reconnect_interval = 0; /* OK to reconnect at any time */ write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); return; } - LASSERT (peer->rap_reconnect_interval != 0); - peer->rap_reconnect_time = CURRENT_SECONDS + peer->rap_reconnect_interval; - peer->rap_reconnect_interval = MAX(RANAL_MAX_RECONNECT_INTERVAL, - 1 * peer->rap_reconnect_interval); + peer->rap_reconnect_interval *= 2; + peer->rap_reconnect_interval = + MAX(peer->rap_reconnect_interval, + *kranal_tunables.kra_min_reconnect_interval); + peer->rap_reconnect_interval = + MIN(peer->rap_reconnect_interval, + *kranal_tunables.kra_max_reconnect_interval); + + peer->rap_reconnect_time = jiffies + peer->rap_reconnect_interval * HZ; /* Grab all blocked packets while we have the global lock */ list_add(&zombies, &peer->rap_tx_queue); @@ -983,8 +857,8 @@ kranal_connect (kra_peer_t *peer) if (list_empty(&zombies)) return; - CWARN("Dropping packets for "LPX64": connection failed\n", - peer->rap_nid); + CDEBUG(D_NETERROR, "Dropping packets for %s: connection failed\n", + libcfs_nid2str(peer->rap_nid)); do { tx = list_entry(zombies.next, kra_tx_t, tx_list); @@ -998,309 +872,51 @@ kranal_connect (kra_peer_t *peer) void kranal_free_acceptsock (kra_acceptsock_t *ras) { - sock_release(ras->ras_sock); - PORTAL_FREE(ras, sizeof(*ras)); + libcfs_sock_release(ras->ras_sock); + LIBCFS_FREE(ras, sizeof(*ras)); } int -kranal_listener (void *arg) +kranal_accept (lnet_ni_t *ni, struct socket *sock) { - struct sockaddr_in addr; - wait_queue_t wait; - struct socket *sock; kra_acceptsock_t *ras; - int port; - char name[16]; int rc; + __u32 peer_ip; + int peer_port; unsigned long flags; - /* Parent thread holds kra_nid_mutex, and is, or is about to - * block on kra_listener_signal */ - - port = kranal_tunables.kra_port; - snprintf(name, sizeof(name), "kranal_lstn%03d", port); - kportal_daemonize(name); - kportal_blockallsigs(); - - init_waitqueue_entry(&wait, current); - - rc = kranal_create_sock(&sock); - if (rc != 0) - goto out_0; - - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_port = htons(port); - addr.sin_addr.s_addr = INADDR_ANY; - - rc = sock->ops->bind(sock, (struct sockaddr *)&addr, sizeof(addr)); - if (rc != 0) { - CERROR("Can't bind to port %d\n", port); - goto out_1; - } - - rc = sock->ops->listen(sock, kranal_tunables.kra_backlog); - if (rc != 0) { - CERROR("Can't set listen backlog %d: %d\n", - kranal_tunables.kra_backlog, rc); - goto out_1; - } - - LASSERT (kranal_data.kra_listener_sock == NULL); - kranal_data.kra_listener_sock = sock; - - /* unblock waiting parent */ - LASSERT (kranal_data.kra_listener_shutdown == 0); - 
up(&kranal_data.kra_listener_signal); - - /* Wake me any time something happens on my socket */ - add_wait_queue(sock->sk->sk_sleep, &wait); - ras = NULL; - - while (kranal_data.kra_listener_shutdown == 0) { - - if (ras == NULL) { - PORTAL_ALLOC(ras, sizeof(*ras)); - if (ras == NULL) { - CERROR("Out of Memory: pausing...\n"); - kranal_pause(HZ); - continue; - } - ras->ras_sock = NULL; - } - - if (ras->ras_sock == NULL) { - ras->ras_sock = sock_alloc(); - if (ras->ras_sock == NULL) { - CERROR("Can't allocate socket: pausing...\n"); - kranal_pause(HZ); - continue; - } - /* XXX this should add a ref to sock->ops->owner, if - * TCP could be a module */ - ras->ras_sock->type = sock->type; - ras->ras_sock->ops = sock->ops; - } - - set_current_state(TASK_INTERRUPTIBLE); - - rc = sock->ops->accept(sock, ras->ras_sock, O_NONBLOCK); - - /* Sleep for socket activity? */ - if (rc == -EAGAIN && - kranal_data.kra_listener_shutdown == 0) - schedule(); - - set_current_state(TASK_RUNNING); - - if (rc == 0) { - spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); - - list_add_tail(&ras->ras_list, - &kranal_data.kra_connd_acceptq); - - spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); - wake_up(&kranal_data.kra_connd_waitq); - - ras = NULL; - continue; - } - - if (rc != -EAGAIN) { - CERROR("Accept failed: %d, pausing...\n", rc); - kranal_pause(HZ); - } - } + rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); + LASSERT (rc == 0); /* we succeeded before */ - if (ras != NULL) { - if (ras->ras_sock != NULL) - sock_release(ras->ras_sock); - PORTAL_FREE(ras, sizeof(*ras)); - } - - rc = 0; - remove_wait_queue(sock->sk->sk_sleep, &wait); - out_1: - sock_release(sock); - kranal_data.kra_listener_sock = NULL; - out_0: - /* set completion status and unblock thread waiting for me - * (parent on startup failure, executioner on normal shutdown) */ - kranal_data.kra_listener_shutdown = rc; - up(&kranal_data.kra_listener_signal); - - return 0; -} - -int -kranal_start_listener (void) -{ - long pid; - int rc; - - CDEBUG(D_NET, "Starting listener\n"); - - /* Called holding kra_nid_mutex: listener stopped */ - LASSERT (kranal_data.kra_listener_sock == NULL); - - kranal_data.kra_listener_shutdown = 0; - pid = kernel_thread(kranal_listener, NULL, 0); - if (pid < 0) { - CERROR("Can't spawn listener: %ld\n", pid); - return (int)pid; + LIBCFS_ALLOC(ras, sizeof(*ras)); + if (ras == NULL) { + CERROR("ENOMEM allocating connection request from " + "%u.%u.%u.%u\n", HIPQUAD(peer_ip)); + return -ENOMEM; } - /* Block until listener has started up. */ - down(&kranal_data.kra_listener_signal); - - rc = kranal_data.kra_listener_shutdown; - LASSERT ((rc != 0) == (kranal_data.kra_listener_sock == NULL)); - - CDEBUG(D_NET, "Listener %ld started OK\n", pid); - return rc; -} - -void -kranal_stop_listener(int clear_acceptq) -{ - struct list_head zombie_accepts; - unsigned long flags; - kra_acceptsock_t *ras; - - CDEBUG(D_NET, "Stopping listener\n"); + ras->ras_sock = sock; - /* Called holding kra_nid_mutex: listener running */ - LASSERT (kranal_data.kra_listener_sock != NULL); - - kranal_data.kra_listener_shutdown = 1; - wake_up_all(kranal_data.kra_listener_sock->sk->sk_sleep); - - /* Block until listener has torn down. 
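- * The listener thread ups kra_listener_signal as the very last thing it does, after releasing its socket, so this down() cannot return before the socket is gone.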
*/ - down(&kranal_data.kra_listener_signal); - - LASSERT (kranal_data.kra_listener_sock == NULL); - CDEBUG(D_NET, "Listener stopped\n"); - - if (!clear_acceptq) - return; - - /* Close any unhandled accepts */ spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); - list_add(&zombie_accepts, &kranal_data.kra_connd_acceptq); - list_del_init(&kranal_data.kra_connd_acceptq); + list_add_tail(&ras->ras_list, &kranal_data.kra_connd_acceptq); + wake_up(&kranal_data.kra_connd_waitq); spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); - - while (!list_empty(&zombie_accepts)) { - ras = list_entry(zombie_accepts.next, - kra_acceptsock_t, ras_list); - list_del(&ras->ras_list); - kranal_free_acceptsock(ras); - } -} - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8) -int -kranal_listener_procint(ctl_table *table, int write, struct file *filp, - void *buffer, size_t *lenp) -#else -int -kranal_listener_procint(ctl_table *table, int write, struct file *filp, - void *buffer, size_t *lenp, loff_t *ppos) -#endif -{ - int *tunable = (int *)table->data; - int old_val; - int rc; - - /* No race with nal initialisation since the nal is setup all the time - * it's loaded. When that changes, change this! */ - LASSERT (kranal_data.kra_init == RANAL_INIT_ALL); - - down(&kranal_data.kra_nid_mutex); - - LASSERT (tunable == &kranal_tunables.kra_port || - tunable == &kranal_tunables.kra_backlog); - old_val = *tunable; - -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8) - rc = proc_dointvec(table, write, filp, buffer, lenp); -#else - rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); -#endif - - if (write && - (*tunable != old_val || - kranal_data.kra_listener_sock == NULL)) { - - if (kranal_data.kra_listener_sock != NULL) - kranal_stop_listener(0); - - rc = kranal_start_listener(); - - if (rc != 0) { - CWARN("Unable to start listener with new tunable:" - " reverting to old value\n"); - *tunable = old_val; - kranal_start_listener(); - } - } - - up(&kranal_data.kra_nid_mutex); - - LASSERT (kranal_data.kra_init == RANAL_INIT_ALL); - return rc; + return 0; } int -kranal_set_mynid(ptl_nid_t nid) -{ - unsigned long flags; - lib_ni_t *ni = &kranal_lib.libnal_ni; - int rc = 0; - - CDEBUG(D_NET, "setting mynid to "LPX64" (old nid="LPX64")\n", - nid, ni->ni_pid.nid); - - down(&kranal_data.kra_nid_mutex); - - if (nid == ni->ni_pid.nid) { - /* no change of NID */ - up(&kranal_data.kra_nid_mutex); - return 0; - } - - if (kranal_data.kra_listener_sock != NULL) - kranal_stop_listener(1); - - write_lock_irqsave(&kranal_data.kra_global_lock, flags); - kranal_data.kra_peerstamp++; - ni->ni_pid.nid = nid; - write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); - - /* Delete all existing peers and their connections after new - * NID/connstamp set to ensure no old connections in our brave - * new world. 
*/ - kranal_del_peer(PTL_NID_ANY, 0); - - if (nid != PTL_NID_ANY) - rc = kranal_start_listener(); - - up(&kranal_data.kra_nid_mutex); - return rc; -} - -kra_peer_t * -kranal_create_peer (ptl_nid_t nid) +kranal_create_peer (kra_peer_t **peerp, lnet_nid_t nid) { - kra_peer_t *peer; + kra_peer_t *peer; + unsigned long flags; - LASSERT (nid != PTL_NID_ANY); + LASSERT (nid != LNET_NID_ANY); - PORTAL_ALLOC(peer, sizeof(*peer)); + LIBCFS_ALLOC(peer, sizeof(*peer)); if (peer == NULL) - return NULL; + return -ENOMEM; memset(peer, 0, sizeof(*peer)); /* zero flags etc */ @@ -1312,17 +928,32 @@ kranal_create_peer (ptl_nid_t nid) INIT_LIST_HEAD(&peer->rap_conns); INIT_LIST_HEAD(&peer->rap_tx_queue); - peer->rap_reconnect_time = CURRENT_SECONDS; - peer->rap_reconnect_interval = RANAL_MIN_RECONNECT_INTERVAL; + peer->rap_reconnect_interval = 0; /* OK to connect at any time */ + + write_lock_irqsave(&kranal_data.kra_global_lock, flags); + + if (kranal_data.kra_nonewpeers) { + /* shutdown has started already */ + write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); + + LIBCFS_FREE(peer, sizeof(*peer)); + CERROR("Can't create peer: network shutdown\n"); + return -ESHUTDOWN; + } atomic_inc(&kranal_data.kra_npeers); - return peer; + + write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); + + *peerp = peer; + return 0; } void kranal_destroy_peer (kra_peer_t *peer) { - CDEBUG(D_NET, "peer "LPX64" %p deleted\n", peer->rap_nid, peer); + CDEBUG(D_NET, "peer %s %p deleted\n", + libcfs_nid2str(peer->rap_nid), peer); LASSERT (atomic_read(&peer->rap_refcount) == 0); LASSERT (peer->rap_persistence == 0); @@ -1332,7 +963,7 @@ kranal_destroy_peer (kra_peer_t *peer) LASSERT (list_empty(&peer->rap_tx_queue)); LASSERT (list_empty(&peer->rap_connd_list)); - PORTAL_FREE(peer, sizeof(*peer)); + LIBCFS_FREE(peer, sizeof(*peer)); /* NB a peer's connections keep a reference on their peer until * they are destroyed, so we can be assured that _all_ state to do @@ -1342,7 +973,7 @@ kranal_destroy_peer (kra_peer_t *peer) } kra_peer_t * -kranal_find_peer_locked (ptl_nid_t nid) +kranal_find_peer_locked (lnet_nid_t nid) { struct list_head *peer_list = kranal_nid2peerlist(nid); struct list_head *tmp; @@ -1358,15 +989,16 @@ kranal_find_peer_locked (ptl_nid_t nid) if (peer->rap_nid != nid) continue; - CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n", - peer, nid, atomic_read(&peer->rap_refcount)); + CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n", + peer, libcfs_nid2str(nid), + atomic_read(&peer->rap_refcount)); return peer; } return NULL; } kra_peer_t * -kranal_find_peer (ptl_nid_t nid) +kranal_find_peer (lnet_nid_t nid) { kra_peer_t *peer; @@ -1393,7 +1025,7 @@ kranal_unlink_peer_locked (kra_peer_t *peer) } int -kranal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, int *portp, +kranal_get_peer_info (int index, lnet_nid_t *nidp, __u32 *ipp, int *portp, int *persistencep) { kra_peer_t *peer; @@ -1428,18 +1060,19 @@ kranal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, int *portp, } int -kranal_add_persistent_peer (ptl_nid_t nid, __u32 ip, int port) +kranal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port) { unsigned long flags; kra_peer_t *peer; kra_peer_t *peer2; + int rc; - if (nid == PTL_NID_ANY) + if (nid == LNET_NID_ANY) return -EINVAL; - peer = kranal_create_peer(nid); - if (peer == NULL) - return -ENOMEM; + rc = kranal_create_peer(&peer, nid); + if (rc != 0) + return rc; write_lock_irqsave(&kranal_data.kra_global_lock, flags); @@ -1462,19 +1095,13 @@ kranal_add_persistent_peer (ptl_nid_t nid, __u32 
ip, int port) } void -kranal_del_peer_locked (kra_peer_t *peer, int single_share) +kranal_del_peer_locked (kra_peer_t *peer) { struct list_head *ctmp; struct list_head *cnxt; kra_conn_t *conn; - if (!single_share) - peer->rap_persistence = 0; - else if (peer->rap_persistence > 0) - peer->rap_persistence--; - - if (peer->rap_persistence != 0) - return; + peer->rap_persistence = 0; if (list_empty(&peer->rap_conns)) { kranal_unlink_peer_locked(peer); @@ -1489,7 +1116,7 @@ kranal_del_peer_locked (kra_peer_t *peer, int single_share) } int -kranal_del_peer (ptl_nid_t nid, int single_share) +kranal_del_peer (lnet_nid_t nid) { unsigned long flags; struct list_head *ptmp; @@ -1502,7 +1129,7 @@ kranal_del_peer (ptl_nid_t nid, int single_share) write_lock_irqsave(&kranal_data.kra_global_lock, flags); - if (nid != PTL_NID_ANY) + if (nid != LNET_NID_ANY) lo = hi = kranal_nid2peerlist(nid) - kranal_data.kra_peers; else { lo = 0; @@ -1515,17 +1142,14 @@ kranal_del_peer (ptl_nid_t nid, int single_share) LASSERT (peer->rap_persistence > 0 || !list_empty(&peer->rap_conns)); - if (!(nid == PTL_NID_ANY || peer->rap_nid == nid)) + if (!(nid == LNET_NID_ANY || peer->rap_nid == nid)) continue; - kranal_del_peer_locked(peer, single_share); + kranal_del_peer_locked(peer); rc = 0; /* matched something */ - - if (single_share) - goto out; } } - out: + write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); return rc; @@ -1554,8 +1178,8 @@ kranal_get_conn_by_idx (int index) continue; conn = list_entry(ctmp, kra_conn_t, rac_list); - CDEBUG(D_NET, "++conn[%p] -> "LPX64" (%d)\n", - conn, conn->rac_peer->rap_nid, + CDEBUG(D_NET, "++conn[%p] -> %s (%d)\n", conn, + libcfs_nid2str(conn->rac_peer->rap_nid), atomic_read(&conn->rac_refcount)); atomic_inc(&conn->rac_refcount); read_unlock(&kranal_data.kra_global_lock); @@ -1587,7 +1211,7 @@ kranal_close_peer_conns_locked (kra_peer_t *peer, int why) } int -kranal_close_matching_conns (ptl_nid_t nid) +kranal_close_matching_conns (lnet_nid_t nid) { unsigned long flags; kra_peer_t *peer; @@ -1600,7 +1224,7 @@ kranal_close_matching_conns (ptl_nid_t nid) write_lock_irqsave(&kranal_data.kra_global_lock, flags); - if (nid != PTL_NID_ANY) + if (nid != LNET_NID_ANY) lo = hi = kranal_nid2peerlist(nid) - kranal_data.kra_peers; else { lo = 0; @@ -1614,7 +1238,7 @@ kranal_close_matching_conns (ptl_nid_t nid) LASSERT (peer->rap_persistence > 0 || !list_empty(&peer->rap_conns)); - if (!(nid == PTL_NID_ANY || nid == peer->rap_nid)) + if (!(nid == LNET_NID_ANY || nid == peer->rap_nid)) continue; count += kranal_close_peer_conns_locked(peer, 0); @@ -1624,72 +1248,72 @@ kranal_close_matching_conns (ptl_nid_t nid) write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); /* wildcards always succeed */ - if (nid == PTL_NID_ANY) + if (nid == LNET_NID_ANY) return 0; return (count == 0) ? 
-ENOENT : 0; } int -kranal_cmd(struct portals_cfg *pcfg, void * private) +kranal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) { - int rc = -EINVAL; + struct libcfs_ioctl_data *data = arg; + int rc = -EINVAL; - LASSERT (pcfg != NULL); + LASSERT (ni == kranal_data.kra_ni); - switch(pcfg->pcfg_command) { - case NAL_CMD_GET_PEER: { - ptl_nid_t nid = 0; + switch(cmd) { + case IOC_LIBCFS_GET_PEER: { + lnet_nid_t nid = 0; __u32 ip = 0; int port = 0; int share_count = 0; - rc = kranal_get_peer_info(pcfg->pcfg_count, + rc = kranal_get_peer_info(data->ioc_count, &nid, &ip, &port, &share_count); - pcfg->pcfg_nid = nid; - pcfg->pcfg_size = 0; - pcfg->pcfg_id = ip; - pcfg->pcfg_misc = port; - pcfg->pcfg_count = 0; - pcfg->pcfg_wait = share_count; + data->ioc_nid = nid; + data->ioc_count = share_count; + data->ioc_u32[0] = ip; + data->ioc_u32[1] = port; break; } - case NAL_CMD_ADD_PEER: { - rc = kranal_add_persistent_peer(pcfg->pcfg_nid, - pcfg->pcfg_id, /* IP */ - pcfg->pcfg_misc); /* port */ + case IOC_LIBCFS_ADD_PEER: { + rc = kranal_add_persistent_peer(data->ioc_nid, + data->ioc_u32[0], /* IP */ + data->ioc_u32[1]); /* port */ break; } - case NAL_CMD_DEL_PEER: { - rc = kranal_del_peer(pcfg->pcfg_nid, - /* flags == single_share */ - pcfg->pcfg_flags != 0); + case IOC_LIBCFS_DEL_PEER: { + rc = kranal_del_peer(data->ioc_nid); break; } - case NAL_CMD_GET_CONN: { - kra_conn_t *conn = kranal_get_conn_by_idx(pcfg->pcfg_count); + case IOC_LIBCFS_GET_CONN: { + kra_conn_t *conn = kranal_get_conn_by_idx(data->ioc_count); if (conn == NULL) rc = -ENOENT; else { rc = 0; - pcfg->pcfg_nid = conn->rac_peer->rap_nid; - pcfg->pcfg_id = conn->rac_device->rad_id; - pcfg->pcfg_misc = 0; - pcfg->pcfg_flags = 0; + data->ioc_nid = conn->rac_peer->rap_nid; + data->ioc_u32[0] = conn->rac_device->rad_id; kranal_conn_decref(conn); } break; } - case NAL_CMD_CLOSE_CONNECTION: { - rc = kranal_close_matching_conns(pcfg->pcfg_nid); + case IOC_LIBCFS_CLOSE_CONNECTION: { + rc = kranal_close_matching_conns(data->ioc_nid); break; } - case NAL_CMD_REGISTER_MYNID: { - if (pcfg->pcfg_nid == PTL_NID_ANY) + case IOC_LIBCFS_REGISTER_MYNID: { + /* Ignore if this is a noop */ + if (data->ioc_nid == ni->ni_nid) { + rc = 0; + } else { + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); rc = -EINVAL; - else - rc = kranal_set_mynid(pcfg->pcfg_nid); + } break; } } @@ -1706,44 +1330,39 @@ kranal_free_txdescs(struct list_head *freelist) tx = list_entry(freelist->next, kra_tx_t, tx_list); list_del(&tx->tx_list); - PORTAL_FREE(tx->tx_phys, PTL_MD_MAX_IOV * sizeof(*tx->tx_phys)); - PORTAL_FREE(tx, sizeof(*tx)); + LIBCFS_FREE(tx->tx_phys, LNET_MAX_IOV * sizeof(*tx->tx_phys)); + LIBCFS_FREE(tx, sizeof(*tx)); } } int kranal_alloc_txdescs(struct list_head *freelist, int n) { - int isnblk = (freelist == &kranal_data.kra_idle_nblk_txs); int i; kra_tx_t *tx; - LASSERT (freelist == &kranal_data.kra_idle_txs || - freelist == &kranal_data.kra_idle_nblk_txs); + LASSERT (freelist == &kranal_data.kra_idle_txs); LASSERT (list_empty(freelist)); for (i = 0; i < n; i++) { - PORTAL_ALLOC(tx, sizeof(*tx)); + LIBCFS_ALLOC(tx, sizeof(*tx)); if (tx == NULL) { - CERROR("Can't allocate %stx[%d]\n", - isnblk ? 
"nblk " : "", i); + CERROR("Can't allocate tx[%d]\n", i); kranal_free_txdescs(freelist); return -ENOMEM; } - PORTAL_ALLOC(tx->tx_phys, - PTL_MD_MAX_IOV * sizeof(*tx->tx_phys)); + LIBCFS_ALLOC(tx->tx_phys, + LNET_MAX_IOV * sizeof(*tx->tx_phys)); if (tx->tx_phys == NULL) { - CERROR("Can't allocate %stx[%d]->tx_phys\n", - isnblk ? "nblk " : "", i); + CERROR("Can't allocate tx[%d]->tx_phys\n", i); - PORTAL_FREE(tx, sizeof(*tx)); + LIBCFS_FREE(tx, sizeof(*tx)); kranal_free_txdescs(freelist); return -ENOMEM; } - tx->tx_isnblk = isnblk; tx->tx_buftype = RANAL_BUF_NONE; tx->tx_msg.ram_type = RANAL_MSG_NONE; @@ -1756,7 +1375,7 @@ kranal_alloc_txdescs(struct list_head *freelist, int n) int kranal_device_init(int id, kra_device_t *dev) { - const int total_ntx = RANAL_NTX + RANAL_NTX_NBLK; + int total_ntx = *kranal_tunables.kra_ntx; RAP_RETURN rrc; dev->rad_id = id; @@ -1777,16 +1396,17 @@ kranal_device_init(int id, kra_device_t *dev) rrc = RapkCreateCQ(dev->rad_handle, total_ntx, RAP_CQTYPE_SEND, &dev->rad_rdma_cqh); if (rrc != RAP_SUCCESS) { - CERROR("Can't create rdma cq size %d" - " for device %d: %d\n", total_ntx, id, rrc); + CERROR("Can't create rdma cq size %d for device %d: %d\n", + total_ntx, id, rrc); goto failed_1; } - rrc = RapkCreateCQ(dev->rad_handle, RANAL_FMA_CQ_SIZE, RAP_CQTYPE_RECV, - &dev->rad_fma_cqh); + rrc = RapkCreateCQ(dev->rad_handle, + *kranal_tunables.kra_fma_cq_size, + RAP_CQTYPE_RECV, &dev->rad_fma_cqh); if (rrc != RAP_SUCCESS) { - CERROR("Can't create fma cq size %d" - " for device %d: %d\n", RANAL_FMA_CQ_SIZE, id, rrc); + CERROR("Can't create fma cq size %d for device %d: %d\n", + *kranal_tunables.kra_fma_cq_size, id, rrc); goto failed_2; } @@ -1803,6 +1423,13 @@ kranal_device_init(int id, kra_device_t *dev) void kranal_device_fini(kra_device_t *dev) { + LASSERT (list_empty(&dev->rad_ready_conns)); + LASSERT (list_empty(&dev->rad_new_conns)); + LASSERT (dev->rad_nphysmap == 0); + LASSERT (dev->rad_nppphysmap == 0); + LASSERT (dev->rad_nvirtmap == 0); + LASSERT (dev->rad_nobvirtmap == 0); + LASSERT(dev->rad_scheduler == NULL); RapkDestroyCQ(dev->rad_handle, dev->rad_fma_cqh); RapkDestroyCQ(dev->rad_handle, dev->rad_rdma_cqh); @@ -1810,21 +1437,16 @@ kranal_device_fini(kra_device_t *dev) } void -kranal_api_shutdown (nal_t *nal) +kranal_shutdown (lnet_ni_t *ni) { int i; unsigned long flags; - if (nal->nal_refct != 0) { - /* This module got the first ref */ - PORTAL_MODULE_UNUSE; - return; - } - CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", - atomic_read(&portal_kmemory)); + atomic_read(&libcfs_kmemory)); - LASSERT (nal == &kranal_api); + LASSERT (ni == kranal_data.kra_ni); + LASSERT (ni->ni_data == &kranal_data); switch (kranal_data.kra_init) { default: @@ -1832,54 +1454,57 @@ kranal_api_shutdown (nal_t *nal) LBUG(); case RANAL_INIT_ALL: - /* stop calls to nal_cmd */ - libcfs_nal_cmd_unregister(RANAL); - /* No new persistent peers */ - - /* resetting my NID to unadvertises me, removes my - * listener and nukes all current peers */ - kranal_set_mynid(PTL_NID_ANY); - /* no new peers or conns */ + /* Prevent new peers from being created */ + write_lock_irqsave(&kranal_data.kra_global_lock, flags); + kranal_data.kra_nonewpeers = 1; + write_unlock_irqrestore(&kranal_data.kra_global_lock, flags); + + /* Remove all existing peers from the peer table */ + kranal_del_peer(LNET_NID_ANY); - /* Wait for all peer/conn state to clean up */ + /* Wait for pending conn reqs to be handled */ i = 2; - while (atomic_read(&kranal_data.kra_nconns) != 0 || - 
atomic_read(&kranal_data.kra_npeers) != 0) { + spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); + while (!list_empty(&kranal_data.kra_connd_acceptq)) { + spin_unlock_irqrestore(&kranal_data.kra_connd_lock, + flags); i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ - "waiting for %d peers and %d conns to close down\n", - atomic_read(&kranal_data.kra_npeers), - atomic_read(&kranal_data.kra_nconns)); - kranal_pause(HZ); + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */ + "waiting for conn reqs to clean up\n"); + cfs_pause(cfs_time_seconds(1)); + + spin_lock_irqsave(&kranal_data.kra_connd_lock, flags); } - /* fall through */ + spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); - case RANAL_INIT_LIB: - lib_fini(&kranal_lib); + /* Wait for all peers to be freed */ + i = 2; + while (atomic_read(&kranal_data.kra_npeers) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n */ + "waiting for %d peers to close down\n", + atomic_read(&kranal_data.kra_npeers)); + cfs_pause(cfs_time_seconds(1)); + } /* fall through */ case RANAL_INIT_DATA: break; } - /* Conn/Peer state all cleaned up BEFORE setting shutdown, so threads - * don't have to worry about shutdown races */ - LASSERT (atomic_read(&kranal_data.kra_nconns) == 0); + /* Peer state all cleaned up BEFORE setting shutdown, so threads don't + * have to worry about shutdown races. NB connections may be created + * while there are still active connds, but these will be temporary + * since peer creation always fails after the listener has started to + * shut down. */ LASSERT (atomic_read(&kranal_data.kra_npeers) == 0); - /* flag threads to terminate; wake and wait for them to die */ + /* Flag threads to terminate */ kranal_data.kra_shutdown = 1; for (i = 0; i < kranal_data.kra_ndevs; i++) { kra_device_t *dev = &kranal_data.kra_devices[i]; - LASSERT (list_empty(&dev->rad_ready_conns)); - LASSERT (list_empty(&dev->rad_new_conns)); - LASSERT (dev->rad_nphysmap == 0); - LASSERT (dev->rad_nppphysmap == 0); - LASSERT (dev->rad_nvirtmap == 0); - LASSERT (dev->rad_nobvirtmap == 0); - spin_lock_irqsave(&dev->rad_lock, flags); wake_up(&dev->rad_waitq); spin_unlock_irqrestore(&dev->rad_lock, flags); @@ -1894,13 +1519,14 @@ kranal_api_shutdown (nal_t *nal) wake_up_all(&kranal_data.kra_connd_waitq); spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); + /* Wait for threads to exit */ i = 2; while (atomic_read(&kranal_data.kra_nthreads) != 0) { i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
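* (i & -i isolates the lowest set bit, so the test holds exactly when i is a power of two; the warnings get rarer as the wait drags on)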
*/ "Waiting for %d threads to terminate\n", atomic_read(&kranal_data.kra_nthreads)); - kranal_pause(HZ); + cfs_pause(cfs_time_seconds(1)); } LASSERT (atomic_read(&kranal_data.kra_npeers) == 0); @@ -1908,7 +1534,7 @@ kranal_api_shutdown (nal_t *nal) for (i = 0; i < kranal_data.kra_peer_hash_size; i++) LASSERT (list_empty(&kranal_data.kra_peers[i])); - PORTAL_FREE(kranal_data.kra_peers, + LIBCFS_FREE(kranal_data.kra_peers, sizeof (struct list_head) * kranal_data.kra_peer_hash_size); } @@ -1918,7 +1544,7 @@ kranal_api_shutdown (nal_t *nal) for (i = 0; i < kranal_data.kra_conn_hash_size; i++) LASSERT (list_empty(&kranal_data.kra_conns[i])); - PORTAL_FREE(kranal_data.kra_conns, + LIBCFS_FREE(kranal_data.kra_conns, sizeof (struct list_head) * kranal_data.kra_conn_hash_size); } @@ -1927,42 +1553,51 @@ kranal_api_shutdown (nal_t *nal) kranal_device_fini(&kranal_data.kra_devices[i]); kranal_free_txdescs(&kranal_data.kra_idle_txs); - kranal_free_txdescs(&kranal_data.kra_idle_nblk_txs); CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", - atomic_read(&portal_kmemory)); - printk(KERN_INFO "Lustre: RapidArray NAL unloaded (final mem %d)\n", - atomic_read(&portal_kmemory)); + atomic_read(&libcfs_kmemory)); kranal_data.kra_init = RANAL_INIT_NOTHING; + PORTAL_MODULE_UNUSE; } int -kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) +kranal_startup (lnet_ni_t *ni) { struct timeval tv; - ptl_process_id_t process_id; - int pkmem = atomic_read(&portal_kmemory); + int pkmem = atomic_read(&libcfs_kmemory); int rc; int i; kra_device_t *dev; - LASSERT (nal == &kranal_api); + LASSERT (ni->ni_lnd == &the_kralnd); - if (nal->nal_refct != 0) { - if (actual_limits != NULL) - *actual_limits = kranal_lib.libnal_ni.ni_actual_limits; - /* This module got the first ref */ - PORTAL_MODULE_USE; - return PTL_OK; + /* Only 1 instance supported */ + if (kranal_data.kra_init != RANAL_INIT_NOTHING) { + CERROR ("Only 1 instance supported\n"); + return -EPERM; } - LASSERT (kranal_data.kra_init == RANAL_INIT_NOTHING); + if (lnet_set_ip_niaddr(ni) != 0) { + CERROR ("Can't determine my NID\n"); + return -EPERM; + } + if (*kranal_tunables.kra_credits > *kranal_tunables.kra_ntx) { + CERROR ("Can't set credits(%d) > ntx(%d)\n", + *kranal_tunables.kra_credits, + *kranal_tunables.kra_ntx); + return -EINVAL; + } + memset(&kranal_data, 0, sizeof(kranal_data)); /* zero pointers, flags etc */ + ni->ni_maxtxcredits = *kranal_tunables.kra_credits; + ni->ni_peertxcredits = *kranal_tunables.kra_peercredits; + + ni->ni_data = &kranal_data; + kranal_data.kra_ni = ni; + /* CAVEAT EMPTOR: Every 'Fma' message includes the sender's NID and * a unique (for all time) connstamp so we can uniquely identify * the sender. 
The connstamp is an incrementing counter @@ -1973,9 +1608,6 @@ kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid, kranal_data.kra_connstamp = kranal_data.kra_peerstamp = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - init_MUTEX(&kranal_data.kra_nid_mutex); - init_MUTEX_LOCKED(&kranal_data.kra_listener_signal); - rwlock_init(&kranal_data.kra_global_lock); for (i = 0; i < RANAL_MAXDEVS; i++ ) { @@ -1998,15 +1630,14 @@ kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid, spin_lock_init(&kranal_data.kra_connd_lock); INIT_LIST_HEAD(&kranal_data.kra_idle_txs); - INIT_LIST_HEAD(&kranal_data.kra_idle_nblk_txs); - init_waitqueue_head(&kranal_data.kra_idle_tx_waitq); spin_lock_init(&kranal_data.kra_tx_lock); /* OK to call kranal_api_shutdown() to cleanup now */ kranal_data.kra_init = RANAL_INIT_DATA; + PORTAL_MODULE_USE; kranal_data.kra_peer_hash_size = RANAL_PEER_HASH_SIZE; - PORTAL_ALLOC(kranal_data.kra_peers, + LIBCFS_ALLOC(kranal_data.kra_peers, sizeof(struct list_head) * kranal_data.kra_peer_hash_size); if (kranal_data.kra_peers == NULL) goto failed; @@ -2015,7 +1646,7 @@ kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid, INIT_LIST_HEAD(&kranal_data.kra_peers[i]); kranal_data.kra_conn_hash_size = RANAL_PEER_HASH_SIZE; - PORTAL_ALLOC(kranal_data.kra_conns, + LIBCFS_ALLOC(kranal_data.kra_conns, sizeof(struct list_head) * kranal_data.kra_conn_hash_size); if (kranal_data.kra_conns == NULL) goto failed; @@ -2023,35 +1654,18 @@ kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid, for (i = 0; i < kranal_data.kra_conn_hash_size; i++) INIT_LIST_HEAD(&kranal_data.kra_conns[i]); - rc = kranal_alloc_txdescs(&kranal_data.kra_idle_txs, RANAL_NTX); + rc = kranal_alloc_txdescs(&kranal_data.kra_idle_txs, + *kranal_tunables.kra_ntx); if (rc != 0) goto failed; - rc = kranal_alloc_txdescs(&kranal_data.kra_idle_nblk_txs,RANAL_NTX_NBLK); - if (rc != 0) - goto failed; - - process_id.pid = requested_pid; - process_id.nid = PTL_NID_ANY; /* don't know my NID yet */ - - rc = lib_init(&kranal_lib, nal, process_id, - requested_limits, actual_limits); - if (rc != PTL_OK) { - CERROR("lib_init failed: error %d\n", rc); - goto failed; - } - - /* lib interface initialised */ - kranal_data.kra_init = RANAL_INIT_LIB; - /*****************************************************/ - rc = kranal_thread_start(kranal_reaper, NULL); if (rc != 0) { CERROR("Can't spawn ranal reaper: %d\n", rc); goto failed; } - for (i = 0; i < RANAL_N_CONND; i++) { + for (i = 0; i < *kranal_tunables.kra_n_connd; i++) { rc = kranal_thread_start(kranal_connd, (void *)(unsigned long)i); if (rc != 0) { CERROR("Can't spawn ranal connd[%d]: %d\n", @@ -2062,16 +1676,15 @@ kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid, LASSERT (kranal_data.kra_ndevs == 0); - for (i = 0; i < sizeof(kranal_devids)/sizeof(kranal_devids[0]); i++) { - LASSERT (i < RANAL_MAXDEVS); - + /* Use all available RapidArray devices */ + for (i = 0; i < RANAL_MAXDEVS; i++) { dev = &kranal_data.kra_devices[kranal_data.kra_ndevs]; rc = kranal_device_init(kranal_devids[i], dev); if (rc == 0) kranal_data.kra_ndevs++; } - + if (kranal_data.kra_ndevs == 0) { CERROR("Can't initialise any RapidArray devices\n"); goto failed; @@ -2087,36 +1700,23 @@ kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid, } } - rc = libcfs_nal_cmd_register(RANAL, &kranal_cmd, NULL); - if (rc != 0) { - CERROR("Can't initialise command interface (rc = %d)\n", rc); - goto failed; - } - /* flag everything initialised */ kranal_data.kra_init = RANAL_INIT_ALL; 
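/* [Editor's note: the sketch below is illustrative only and not part of
 * this patch.  It shows the staged-initialisation idiom used here:
 * kra_init records how far kranal_startup() got (NOTHING -> DATA -> ALL),
 * and kranal_shutdown() switches on that value with fall-through cases,
 * so each stage is torn down exactly once no matter where startup failed.
 * All names below are hypothetical.] */

#include <errno.h>

enum lnd_init_stage {
        INIT_NOTHING,                   /* nothing set up yet */
        INIT_DATA,                      /* core data structures allocated */
        INIT_ALL                        /* threads and devices running */
};

static enum lnd_init_stage init_stage = INIT_NOTHING;

static void lnd_shutdown(void)
{
        switch (init_stage) {
        case INIT_ALL:
                /* stop threads, close connections ... */
                /* fall through */
        case INIT_DATA:
                /* free hash tables, tx descriptors ... */
                /* fall through */
        case INIT_NOTHING:
                break;
        }
        init_stage = INIT_NOTHING;
}

static int lnd_startup(void)
{
        /* allocate core state first ... */
        init_stage = INIT_DATA;         /* lnd_shutdown() is now safe */

        /* ... spawn threads, init devices; on any failure: */
        if (0 /* some later step failed */)
                goto failed;

        init_stage = INIT_ALL;          /* flag everything initialised */
        return 0;

 failed:
        lnd_shutdown();                 /* undoes exactly what was done */
        return -ENETDOWN;
}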
/*****************************************************/ - CDEBUG(D_MALLOC, "initial kmem %d\n", atomic_read(&portal_kmemory)); - printk(KERN_INFO "Lustre: RapidArray NAL loaded " - "(initial mem %d)\n", pkmem); - - return PTL_OK; + CDEBUG(D_MALLOC, "initial kmem %d\n", pkmem); + return 0; failed: - kranal_api_shutdown(&kranal_api); - return PTL_FAIL; + kranal_shutdown(ni); + return -ENETDOWN; } void __exit kranal_module_fini (void) { - if (kranal_tunables.kra_sysctl != NULL) - unregister_sysctl_table(kranal_tunables.kra_sysctl); - - PtlNIFini(kranal_ni); - - ptl_unregister_nal(RANAL); + lnet_unregister_lnd(&the_kralnd); + kranal_tunables_fini(); } int __init @@ -2124,51 +1724,17 @@ kranal_module_init (void) { int rc; - /* the following must be sizeof(int) for - * proc_dointvec/kranal_listener_procint() */ - LASSERT (sizeof(kranal_tunables.kra_timeout) == sizeof(int)); - LASSERT (sizeof(kranal_tunables.kra_listener_timeout) == sizeof(int)); - LASSERT (sizeof(kranal_tunables.kra_backlog) == sizeof(int)); - LASSERT (sizeof(kranal_tunables.kra_port) == sizeof(int)); - LASSERT (sizeof(kranal_tunables.kra_max_immediate) == sizeof(int)); - - kranal_api.nal_ni_init = kranal_api_startup; - kranal_api.nal_ni_fini = kranal_api_shutdown; - - /* Initialise dynamic tunables to defaults once only */ - kranal_tunables.kra_timeout = RANAL_TIMEOUT; - kranal_tunables.kra_listener_timeout = RANAL_LISTENER_TIMEOUT; - kranal_tunables.kra_backlog = RANAL_BACKLOG; - kranal_tunables.kra_port = RANAL_PORT; - kranal_tunables.kra_max_immediate = RANAL_MAX_IMMEDIATE; - - rc = ptl_register_nal(RANAL, &kranal_api); - if (rc != PTL_OK) { - CERROR("Can't register RANAL: %d\n", rc); - return -ENOMEM; /* or something... */ - } - - /* Pure gateways want the NAL started up at module load time... */ - rc = PtlNIInit(RANAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kranal_ni); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - ptl_unregister_nal(RANAL); - return -ENODEV; - } - - kranal_tunables.kra_sysctl = - register_sysctl_table(kranal_top_ctl_table, 0); - if (kranal_tunables.kra_sysctl == NULL) { - CERROR("Can't register sysctl table\n"); - PtlNIFini(kranal_ni); - ptl_unregister_nal(RANAL); - return -ENOMEM; - } + rc = kranal_tunables_init(); + if (rc != 0) + return rc; + + lnet_register_lnd(&the_kralnd); return 0; } MODULE_AUTHOR("Cluster File Systems, Inc. "); -MODULE_DESCRIPTION("Kernel RapidArray NAL v0.01"); +MODULE_DESCRIPTION("Kernel RapidArray LND v0.01"); MODULE_LICENSE("GPL"); module_init(kranal_module_init); diff --git a/lnet/klnds/ralnd/ralnd.h b/lnet/klnds/ralnd/ralnd.h index aa269c3..300cf40 100644 --- a/lnet/klnds/ralnd/ralnd.h +++ b/lnet/klnds/ralnd/ralnd.h @@ -51,55 +51,44 @@ #include #include -#define DEBUG_SUBSYSTEM S_NAL +#define DEBUG_SUBSYSTEM S_LND #include -#include -#include -#include +#include +#include #include -#define RANAL_MAXDEVS 2 /* max # devices RapidArray supports */ +/* tunables determined at compile time */ +#define RANAL_RESCHED 100 /* # scheduler loops before reschedule */ -#define RANAL_N_CONND 4 /* # connection daemons */ +#define RANAL_PEER_HASH_SIZE 101 /* # peer lists */ +#define RANAL_CONN_HASH_SIZE 101 /* # conn lists */ -#define RANAL_MIN_RECONNECT_INTERVAL 1 /* first failed connection retry (seconds)... 
*/ -#define RANAL_MAX_RECONNECT_INTERVAL 60 /* ...exponentially increasing to this */ +#define RANAL_MIN_TIMEOUT 5 /* minimum timeout interval (seconds) */ +#define RANAL_TIMEOUT2KEEPALIVE(t) (((t)+1)/2) /* timeout -> keepalive interval */ -#define RANAL_FMA_MAX_PREFIX 232 /* max size of FMA "Prefix" */ +/* fixed constants */ +#define RANAL_MAXDEVS 2 /* max # devices RapidArray supports */ +#define RANAL_FMA_MAX_PREFIX 232 /* max bytes in FMA "Prefix" we can use */ #define RANAL_FMA_MAX_DATA ((7<<10)-256) /* Max FMA MSG is 7K including prefix */ -#define RANAL_PEER_HASH_SIZE 101 /* # peer lists */ -#define RANAL_CONN_HASH_SIZE 101 /* # conn lists */ - -#define RANAL_NTX 64 /* # tx descs */ -#define RANAL_NTX_NBLK 256 /* # reserved tx descs */ - -#define RANAL_FMA_CQ_SIZE 8192 /* # entries in receive CQ - * (overflow is a performance hit) */ - -#define RANAL_RESCHED 100 /* # scheduler loops before reschedule */ - -#define RANAL_MIN_TIMEOUT 5 /* minimum timeout interval (seconds) */ -#define RANAL_TIMEOUT2KEEPALIVE(t) (((t)+1)/2) /* timeout -> keepalive interval */ - -/* default vals for runtime tunables */ -#define RANAL_TIMEOUT 30 /* comms timeout (seconds) */ -#define RANAL_LISTENER_TIMEOUT 5 /* listener timeout (seconds) */ -#define RANAL_BACKLOG 127 /* listener's backlog */ -#define RANAL_PORT 988 /* listener's port */ -#define RANAL_MAX_IMMEDIATE (2<<10) /* immediate payload breakpoint */ typedef struct { - int kra_timeout; /* comms timeout (seconds) */ - int kra_listener_timeout; /* max time the listener can block */ - int kra_backlog; /* listener's backlog */ - int kra_port; /* listener's TCP/IP port */ - int kra_max_immediate; /* immediate payload breakpoint */ - + int *kra_n_connd; /* # connection daemons */ + int *kra_min_reconnect_interval; /* first failed connection retry... */ + int *kra_max_reconnect_interval; /* ...exponentially increasing to this */ + int *kra_ntx; /* # tx descs */ + int *kra_credits; /* # concurrent sends */ + int *kra_peercredits; /* # concurrent sends to 1 peer */ + int *kra_fma_cq_size; /* # entries in receive CQ */ + int *kra_timeout; /* comms timeout (seconds) */ + int *kra_max_immediate; /* immediate payload breakpoint */ + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM struct ctl_table_header *kra_sysctl; /* sysctl interface */ +#endif } kra_tunables_t; typedef struct @@ -126,12 +115,8 @@ typedef struct int kra_init; /* initialisation state */ int kra_shutdown; /* shut down? 
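 * set once by kranal_shutdown(); worker threads check it and exit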
*/ atomic_t kra_nthreads; /* # live threads */ - - struct semaphore kra_nid_mutex; /* serialise NID/listener ops */ - struct semaphore kra_listener_signal; /* block for listener startup/shutdown */ - struct socket *kra_listener_sock; /* listener's socket */ - int kra_listener_shutdown; /* ask listener to close */ - + lnet_ni_t *kra_ni; /* _the_ nal instance */ + kra_device_t kra_devices[RANAL_MAXDEVS]; /* device/ptag/cq etc */ int kra_ndevs; /* # devices */ @@ -140,6 +125,7 @@ typedef struct struct list_head *kra_peers; /* hash table of all my known peers */ int kra_peer_hash_size; /* size of kra_peers */ atomic_t kra_npeers; /* # peers extant */ + int kra_nonewpeers; /* prevent new peers */ struct list_head *kra_conns; /* conns hashed by cqid */ int kra_conn_hash_size; /* size of kra_conns */ @@ -158,16 +144,13 @@ typedef struct spinlock_t kra_connd_lock; /* serialise */ struct list_head kra_idle_txs; /* idle tx descriptors */ - struct list_head kra_idle_nblk_txs; /* idle reserved tx descriptors */ __u64 kra_next_tx_cookie; /* RDMA completion cookie */ - wait_queue_head_t kra_idle_tx_waitq; /* block here for tx descriptor */ spinlock_t kra_tx_lock; /* serialise */ } kra_data_t; #define RANAL_INIT_NOTHING 0 #define RANAL_INIT_DATA 1 -#define RANAL_INIT_LIB 2 -#define RANAL_INIT_ALL 3 +#define RANAL_INIT_ALL 2 typedef struct kra_acceptsock /* accepted socket queued for connd */ { @@ -202,13 +185,13 @@ typedef struct typedef struct { - ptl_hdr_t raim_hdr; /* portals header */ + lnet_hdr_t raim_hdr; /* portals header */ /* Portals payload is in FMA "Message Data" */ } kra_immediate_msg_t; typedef struct { - ptl_hdr_t raprm_hdr; /* portals header */ + lnet_hdr_t raprm_hdr; /* portals header */ __u64 raprm_cookie; /* opaque completion cookie */ } kra_putreq_msg_t; @@ -221,7 +204,7 @@ typedef struct typedef struct { - ptl_hdr_t ragm_hdr; /* portals header */ + lnet_hdr_t ragm_hdr; /* portals header */ __u64 ragm_cookie; /* opaque completion cookie */ kra_rdma_desc_t ragm_desc; /* sender's sink buffer */ } kra_get_msg_t; @@ -248,7 +231,7 @@ typedef struct /* NB must fit in FMA "Prefix" * __u32 ram_seq; /* incrementing sequence number */ } kra_msg_t; -#define RANAL_MSG_MAGIC 0x0be91b92 /* unique magic */ +#define RANAL_MSG_MAGIC LNET_PROTO_RA_MAGIC /* unique magic */ #define RANAL_MSG_VERSION 1 /* current protocol version */ #define RANAL_MSG_FENCE 0x80 /* fence RDMA */ @@ -271,9 +254,8 @@ typedef struct kra_tx /* message descriptor */ { struct list_head tx_list; /* queue on idle_txs/rac_sendq/rac_waitq */ struct kra_conn *tx_conn; /* owning conn */ - lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ + lnet_msg_t *tx_lntmsg[2]; /* ptl msgs to finalize on completion */ unsigned long tx_qtime; /* when tx started to wait for something (jiffies) */ - int tx_isnblk; /* I'm reserved for non-blocking sends */ int tx_nob; /* # bytes of payload */ int tx_buftype; /* payload buffer type */ void *tx_buffer; /* source/sink buffer */ @@ -334,7 +316,7 @@ typedef struct kra_peer struct list_head rap_connd_list; /* schedule on kra_connd_peers */ struct list_head rap_conns; /* all active connections */ struct list_head rap_tx_queue; /* msgs waiting for a conn */ - ptl_nid_t rap_nid; /* who's on the other end(s) */ + lnet_nid_t rap_nid; /* who's on the other end(s) */ __u32 rap_ip; /* IP address of peer */ int rap_port; /* port on which peer listens */ atomic_t rap_refcount; /* # users */ @@ -344,20 +326,6 @@ typedef struct kra_peer unsigned long rap_reconnect_interval; /* exponential backoff */ } 
kra_peer_t; -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) -# define sk_allocation allocation -# define sk_data_ready data_ready -# define sk_write_space write_space -# define sk_user_data user_data -# define sk_prot prot -# define sk_sndbuf sndbuf -# define sk_socket socket -# define sk_wmem_queued wmem_queued -# define sk_err err -# define sk_sleep sleep -#endif - -extern lib_nal_t kranal_lib; extern kra_data_t kranal_data; extern kra_tunables_t kranal_tunables; @@ -367,7 +335,7 @@ extern void kranal_destroy_conn(kra_conn_t *conn); static inline void kranal_peer_addref(kra_peer_t *peer) { - CDEBUG(D_NET, "%p->"LPX64"\n", peer, peer->rap_nid); + CDEBUG(D_NET, "%p->%s\n", peer, libcfs_nid2str(peer->rap_nid)); LASSERT(atomic_read(&peer->rap_refcount) > 0); atomic_inc(&peer->rap_refcount); } @@ -375,14 +343,14 @@ kranal_peer_addref(kra_peer_t *peer) static inline void kranal_peer_decref(kra_peer_t *peer) { - CDEBUG(D_NET, "%p->"LPX64"\n", peer, peer->rap_nid); + CDEBUG(D_NET, "%p->%s\n", peer, libcfs_nid2str(peer->rap_nid)); LASSERT(atomic_read(&peer->rap_refcount) > 0); if (atomic_dec_and_test(&peer->rap_refcount)) kranal_destroy_peer(peer); } static inline struct list_head * -kranal_nid2peerlist (ptl_nid_t nid) +kranal_nid2peerlist (lnet_nid_t nid) { unsigned int hash = ((unsigned int)nid) % kranal_data.kra_peer_hash_size; @@ -399,7 +367,8 @@ kranal_peer_active(kra_peer_t *peer) static inline void kranal_conn_addref(kra_conn_t *conn) { - CDEBUG(D_NET, "%p->"LPX64"\n", conn, conn->rac_peer->rap_nid); + CDEBUG(D_NET, "%p->%s\n", conn, + libcfs_nid2str(conn->rac_peer->rap_nid)); LASSERT(atomic_read(&conn->rac_refcount) > 0); atomic_inc(&conn->rac_refcount); } @@ -407,7 +376,8 @@ kranal_conn_addref(kra_conn_t *conn) static inline void kranal_conn_decref(kra_conn_t *conn) { - CDEBUG(D_NET, "%p->"LPX64"\n", conn, conn->rac_peer->rap_nid); + CDEBUG(D_NET, "%p->%s\n", conn, + libcfs_nid2str(conn->rac_peer->rap_nid)); LASSERT(atomic_read(&conn->rac_refcount) > 0); if (atomic_dec_and_test(&conn->rac_refcount)) kranal_destroy_conn(conn); @@ -445,11 +415,17 @@ kranal_tx_mapped (kra_tx_t *tx) tx->tx_buftype == RANAL_BUF_PHYS_MAPPED); } -static inline __u64 -kranal_page2phys (struct page *p) -{ - return page_to_phys(p); -} +int kranal_startup (lnet_ni_t *ni); +void kranal_shutdown (lnet_ni_t *ni); +int kranal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); +int kranal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int kranal_eager_recv(lnet_ni_t *ni, void *private, + lnet_msg_t *lntmsg, void **new_private); +int kranal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +int kranal_accept(lnet_ni_t *ni, struct socket *sock); extern void kranal_free_acceptsock (kra_acceptsock_t *ras); extern int kranal_listener_procint (ctl_table *table, @@ -459,17 +435,21 @@ extern void kranal_update_reaper_timeout (long timeout); extern void kranal_tx_done (kra_tx_t *tx, int completion); extern void kranal_unlink_peer_locked (kra_peer_t *peer); extern void kranal_schedule_conn (kra_conn_t *conn); -extern kra_peer_t *kranal_create_peer (ptl_nid_t nid); -extern kra_peer_t *kranal_find_peer_locked (ptl_nid_t nid); +extern int kranal_create_peer (kra_peer_t **peerp, lnet_nid_t nid); +extern int kranal_add_persistent_peer (lnet_nid_t nid, __u32 ip, int port); +extern kra_peer_t *kranal_find_peer_locked (lnet_nid_t nid); extern void kranal_post_fma (kra_conn_t *conn, 
kra_tx_t *tx); -extern int kranal_del_peer (ptl_nid_t nid, int single_share); +extern int kranal_del_peer (lnet_nid_t nid); extern void kranal_device_callback (RAP_INT32 devid, RAP_PVOID arg); extern int kranal_thread_start (int(*fn)(void *arg), void *arg); extern int kranal_connd (void *arg); extern int kranal_reaper (void *arg); extern int kranal_scheduler (void *arg); extern void kranal_close_conn_locked (kra_conn_t *conn, int error); +extern void kranal_close_conn (kra_conn_t *conn, int error); extern void kranal_terminate_conn_locked (kra_conn_t *conn); extern void kranal_connect (kra_peer_t *peer); extern int kranal_conn_handshake (struct socket *sock, kra_peer_t *peer); -extern void kranal_pause(int ticks); +extern int kranal_tunables_init(void); +extern void kranal_tunables_fini(void); +extern void kranal_init_msg(kra_msg_t *msg, int type); diff --git a/lnet/klnds/ralnd/ralnd_cb.c b/lnet/klnds/ralnd/ralnd_cb.c index dd910ce..969efd2 100644 --- a/lnet/klnds/ralnd/ralnd_cb.c +++ b/lnet/klnds/ralnd/ralnd_cb.c @@ -21,21 +21,7 @@ * */ -#include "ranal.h" - -int -kranal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) -{ - /* I would guess that if kranal_get_peer (nid) == NULL, - and we're not routing, then 'nid' is very distant :) */ - if ( nal->libnal_ni.ni_pid.nid == nid ) { - *dist = 0; - } else { - *dist = 1; - } - - return 0; -} +#include "ralnd.h" void kranal_device_callback(RAP_INT32 devid, RAP_PVOID arg) @@ -85,56 +71,33 @@ kranal_schedule_conn(kra_conn_t *conn) } kra_tx_t * -kranal_get_idle_tx (int may_block) +kranal_get_idle_tx (void) { unsigned long flags; - kra_tx_t *tx = NULL; - - for (;;) { - spin_lock_irqsave(&kranal_data.kra_tx_lock, flags); + kra_tx_t *tx; - /* "normal" descriptor is free */ - if (!list_empty(&kranal_data.kra_idle_txs)) { - tx = list_entry(kranal_data.kra_idle_txs.next, - kra_tx_t, tx_list); - break; - } - - if (!may_block) { - /* may dip into reserve pool */ - if (list_empty(&kranal_data.kra_idle_nblk_txs)) { - CERROR("reserved tx desc pool exhausted\n"); - break; - } - - tx = list_entry(kranal_data.kra_idle_nblk_txs.next, - kra_tx_t, tx_list); - break; - } + spin_lock_irqsave(&kranal_data.kra_tx_lock, flags); - /* block for idle tx */ + if (list_empty(&kranal_data.kra_idle_txs)) { spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags); - - wait_event(kranal_data.kra_idle_tx_waitq, - !list_empty(&kranal_data.kra_idle_txs)); + return NULL; } - if (tx != NULL) { - list_del(&tx->tx_list); + tx = list_entry(kranal_data.kra_idle_txs.next, kra_tx_t, tx_list); + list_del(&tx->tx_list); - /* Allocate a new completion cookie. It might not be - * needed, but we've got a lock right now... */ - tx->tx_cookie = kranal_data.kra_next_tx_cookie++; - - LASSERT (tx->tx_buftype == RANAL_BUF_NONE); - LASSERT (tx->tx_msg.ram_type == RANAL_MSG_NONE); - LASSERT (tx->tx_conn == NULL); - LASSERT (tx->tx_libmsg[0] == NULL); - LASSERT (tx->tx_libmsg[1] == NULL); - } + /* Allocate a new completion cookie. It might not be needed, but we've + * got a lock right now... 
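 * The cookie later lets kranal_match_reply() pair a peer's completion
 * message back to this tx.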
*/ + tx->tx_cookie = kranal_data.kra_next_tx_cookie++; spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags); + LASSERT (tx->tx_buftype == RANAL_BUF_NONE); + LASSERT (tx->tx_msg.ram_type == RANAL_MSG_NONE); + LASSERT (tx->tx_conn == NULL); + LASSERT (tx->tx_lntmsg[0] == NULL); + LASSERT (tx->tx_lntmsg[1] == NULL); + return tx; } @@ -144,24 +107,24 @@ kranal_init_msg(kra_msg_t *msg, int type) msg->ram_magic = RANAL_MSG_MAGIC; msg->ram_version = RANAL_MSG_VERSION; msg->ram_type = type; - msg->ram_srcnid = kranal_lib.libnal_ni.ni_pid.nid; + msg->ram_srcnid = kranal_data.kra_ni->ni_nid; /* ram_connstamp gets set when FMA is sent */ } kra_tx_t * -kranal_new_tx_msg (int may_block, int type) +kranal_new_tx_msg (int type) { - kra_tx_t *tx = kranal_get_idle_tx(may_block); + kra_tx_t *tx = kranal_get_idle_tx(); - if (tx == NULL) - return NULL; + if (tx != NULL) + kranal_init_msg(&tx->tx_msg, type); - kranal_init_msg(&tx->tx_msg, type); return tx; } int -kranal_setup_immediate_buffer (kra_tx_t *tx, int niov, struct iovec *iov, +kranal_setup_immediate_buffer (kra_tx_t *tx, + unsigned int niov, struct iovec *iov, int offset, int nob) { @@ -198,7 +161,8 @@ kranal_setup_immediate_buffer (kra_tx_t *tx, int niov, struct iovec *iov, } int -kranal_setup_virt_buffer (kra_tx_t *tx, int niov, struct iovec *iov, +kranal_setup_virt_buffer (kra_tx_t *tx, + unsigned int niov, struct iovec *iov, int offset, int nob) { @@ -225,7 +189,7 @@ kranal_setup_virt_buffer (kra_tx_t *tx, int niov, struct iovec *iov, } int -kranal_setup_phys_buffer (kra_tx_t *tx, int nkiov, ptl_kiov_t *kiov, +kranal_setup_phys_buffer (kra_tx_t *tx, int nkiov, lnet_kiov_t *kiov, int offset, int nob) { RAP_PHYS_REGION *phys = tx->tx_phys; @@ -248,7 +212,7 @@ kranal_setup_phys_buffer (kra_tx_t *tx, int nkiov, ptl_kiov_t *kiov, tx->tx_nob = nob; tx->tx_buffer = (void *)((unsigned long)(kiov->kiov_offset + offset)); - phys->Address = kranal_page2phys(kiov->kiov_page); + phys->Address = lnet_page2phys(kiov->kiov_page); phys++; resid = nob - (kiov->kiov_len - offset); @@ -268,12 +232,12 @@ kranal_setup_phys_buffer (kra_tx_t *tx, int nkiov, ptl_kiov_t *kiov, return -EINVAL; } - if ((phys - tx->tx_phys) == PTL_MD_MAX_IOV) { + if ((phys - tx->tx_phys) == LNET_MAX_IOV) { CERROR ("payload too big (%d)\n", (int)(phys - tx->tx_phys)); return -EMSGSIZE; } - phys->Address = kranal_page2phys(kiov->kiov_page); + phys->Address = lnet_page2phys(kiov->kiov_page); phys++; resid -= PAGE_SIZE; @@ -284,8 +248,8 @@ kranal_setup_phys_buffer (kra_tx_t *tx, int nkiov, ptl_kiov_t *kiov, } static inline int -kranal_setup_rdma_buffer (kra_tx_t *tx, int niov, - struct iovec *iov, ptl_kiov_t *kiov, +kranal_setup_rdma_buffer (kra_tx_t *tx, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, int offset, int nob) { LASSERT ((iov == NULL) != (kiov == NULL)); @@ -404,7 +368,7 @@ kranal_unmap_buffer (kra_tx_t *tx) void kranal_tx_done (kra_tx_t *tx, int completion) { - ptl_err_t ptlrc = (completion == 0) ? 
PTL_OK : PTL_FAIL; + lnet_msg_t *lnetmsg[2]; unsigned long flags; int i; @@ -412,14 +376,8 @@ kranal_tx_done (kra_tx_t *tx, int completion) kranal_unmap_buffer(tx); - for (i = 0; i < 2; i++) { - /* tx may have up to 2 libmsgs to finalise */ - if (tx->tx_libmsg[i] == NULL) - continue; - - lib_finalize(&kranal_lib, NULL, tx->tx_libmsg[i], ptlrc); - tx->tx_libmsg[i] = NULL; - } + lnetmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; + lnetmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; tx->tx_buftype = RANAL_BUF_NONE; tx->tx_msg.ram_type = RANAL_MSG_NONE; @@ -427,14 +385,17 @@ kranal_tx_done (kra_tx_t *tx, int completion) spin_lock_irqsave(&kranal_data.kra_tx_lock, flags); - if (tx->tx_isnblk) { - list_add_tail(&tx->tx_list, &kranal_data.kra_idle_nblk_txs); - } else { - list_add_tail(&tx->tx_list, &kranal_data.kra_idle_txs); - wake_up(&kranal_data.kra_idle_tx_waitq); - } + list_add_tail(&tx->tx_list, &kranal_data.kra_idle_txs); spin_unlock_irqrestore(&kranal_data.kra_tx_lock, flags); + + /* finalize AFTER freeing lnet msgs */ + for (i = 0; i < 2; i++) { + if (lnetmsg[i] == NULL) + continue; + + lnet_finalize(kranal_data.kra_ni, lnetmsg[i], completion); + } } kra_conn_t * @@ -466,12 +427,13 @@ kranal_post_fma (kra_conn_t *conn, kra_tx_t *tx) } void -kranal_launch_tx (kra_tx_t *tx, ptl_nid_t nid) +kranal_launch_tx (kra_tx_t *tx, lnet_nid_t nid) { unsigned long flags; kra_peer_t *peer; kra_conn_t *conn; - unsigned long now; + int rc; + int retry; rwlock_t *g_lock = &kranal_data.kra_global_lock; /* If I get here, I've committed to send, so I complete the tx with @@ -479,33 +441,46 @@ kranal_launch_tx (kra_tx_t *tx, ptl_nid_t nid) LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ - read_lock(g_lock); + for (retry = 0; ; retry = 1) { - peer = kranal_find_peer_locked(nid); - if (peer == NULL) { - read_unlock(g_lock); - kranal_tx_done(tx, -EHOSTUNREACH); - return; - } + read_lock(g_lock); - conn = kranal_find_conn_locked(peer); - if (conn != NULL) { - kranal_post_fma(conn, tx); + peer = kranal_find_peer_locked(nid); + if (peer != NULL) { + conn = kranal_find_conn_locked(peer); + if (conn != NULL) { + kranal_post_fma(conn, tx); + read_unlock(g_lock); + return; + } + } + + /* Making connections; I'll need a write lock... */ read_unlock(g_lock); - return; - } + write_lock_irqsave(g_lock, flags); - /* Making one or more connections; I'll need a write lock... 
*/ - read_unlock(g_lock); - write_lock_irqsave(g_lock, flags); - - peer = kranal_find_peer_locked(nid); - if (peer == NULL) { + peer = kranal_find_peer_locked(nid); + if (peer != NULL) + break; + write_unlock_irqrestore(g_lock, flags); - kranal_tx_done(tx, -EHOSTUNREACH); - return; - } + + if (retry) { + CERROR("Can't find peer %s\n", libcfs_nid2str(nid)); + kranal_tx_done(tx, -EHOSTUNREACH); + return; + } + rc = kranal_add_persistent_peer(nid, LNET_NIDADDR(nid), + lnet_acceptor_port()); + if (rc != 0) { + CERROR("Can't add peer %s: %d\n", + libcfs_nid2str(nid), rc); + kranal_tx_done(tx, rc); + return; + } + } + conn = kranal_find_conn_locked(peer); if (conn != NULL) { /* Connection exists; queue message on it */ @@ -513,14 +488,14 @@ kranal_launch_tx (kra_tx_t *tx, ptl_nid_t nid) write_unlock_irqrestore(g_lock, flags); return; } - + LASSERT (peer->rap_persistence > 0); if (!peer->rap_connecting) { LASSERT (list_empty(&peer->rap_tx_queue)); - now = CURRENT_SECONDS; - if (now < peer->rap_reconnect_time) { + if (!(peer->rap_reconnect_interval == 0 || /* first attempt */ + time_after_eq(jiffies, peer->rap_reconnect_time))) { write_unlock_irqrestore(g_lock, flags); kranal_tx_done(tx, -EHOSTUNREACH); return; @@ -603,219 +578,205 @@ kranal_consume_rxmsg (kra_conn_t *conn, void *buffer, int nob) conn->rac_rxmsg = NULL; if (nob_received < nob) { - CWARN("Incomplete immediate msg from "LPX64 - ": expected %d, got %d\n", - conn->rac_peer->rap_nid, nob, nob_received); + CWARN("Incomplete immediate msg from %s: expected %d, got %d\n", + libcfs_nid2str(conn->rac_peer->rap_nid), + nob, nob_received); return -EPROTO; } return 0; } -ptl_err_t -kranal_do_send (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int niov, - struct iovec *iov, - ptl_kiov_t *kiov, - int offset, - int nob) +int +kranal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) { - kra_conn_t *conn; - kra_tx_t *tx; - int rc; + lnet_hdr_t *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int niov = lntmsg->msg_niov; + struct iovec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + kra_tx_t *tx; + int rc; /* NB 'private' is different depending on what we're sending.... */ - CDEBUG(D_NET, "sending %d bytes in %d frags to nid:"LPX64" pid %d\n", - nob, niov, nid, pid); + CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", + nob, niov, libcfs_id2str(target)); LASSERT (nob == 0 || niov > 0); - LASSERT (niov <= PTL_MD_MAX_IOV); + LASSERT (niov <= LNET_MAX_IOV); LASSERT (!in_interrupt()); /* payload is either all vaddrs or all pages */ LASSERT (!(kiov != NULL && iov != NULL)); + if (routing) { + CERROR ("Can't route\n"); + return -EIO; + } + switch(type) { default: LBUG(); - case PTL_MSG_REPLY: { - /* reply's 'private' is the conn that received the GET_REQ */ - conn = private; - LASSERT (conn->rac_rxmsg != NULL); - - if (conn->rac_rxmsg->ram_type == RANAL_MSG_IMMEDIATE) { - if (nob > RANAL_FMA_MAX_DATA) { - CERROR("Can't REPLY IMMEDIATE %d to "LPX64"\n", - nob, nid); - return PTL_FAIL; - } - break; /* RDMA not expected */ - } - - /* Incoming message consistent with RDMA? 
*/ - if (conn->rac_rxmsg->ram_type != RANAL_MSG_GET_REQ) { - CERROR("REPLY to "LPX64" bad msg type %x!!!\n", - nid, conn->rac_rxmsg->ram_type); - return PTL_FAIL; - } - - tx = kranal_get_idle_tx(0); - if (tx == NULL) - return PTL_FAIL; - - rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob); - if (rc != 0) { - kranal_tx_done(tx, rc); - return PTL_FAIL; - } - - tx->tx_conn = conn; - tx->tx_libmsg[0] = libmsg; - - rc = kranal_map_buffer(tx); - if (rc != 0) { - kranal_tx_done(tx, rc); - return PTL_FAIL; - } - - kranal_rdma(tx, RANAL_MSG_GET_DONE, - &conn->rac_rxmsg->ram_u.get.ragm_desc, nob, - conn->rac_rxmsg->ram_u.get.ragm_cookie); - - /* flag matched by consuming rx message */ - kranal_consume_rxmsg(conn, NULL, 0); - return PTL_OK; - } + case LNET_MSG_ACK: + LASSERT (nob == 0); + break; - case PTL_MSG_GET: + case LNET_MSG_GET: LASSERT (niov == 0); LASSERT (nob == 0); /* We have to consider the eventual sink buffer rather than any * payload passed here (there isn't any, and strictly, looking - * inside libmsg is a layering violation). We send a simple + * inside lntmsg is a layering violation). We send a simple * IMMEDIATE GET if the sink buffer is mapped already and small * enough for FMA */ - if ((libmsg->md->options & PTL_MD_KIOV) == 0 && - libmsg->md->length <= RANAL_FMA_MAX_DATA && - libmsg->md->length <= kranal_tunables.kra_max_immediate) - break; + if (routing || target_is_router) + break; /* send IMMEDIATE */ + + if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0 && + lntmsg->msg_md->md_length <= RANAL_FMA_MAX_DATA && + lntmsg->msg_md->md_length <= *kranal_tunables.kra_max_immediate) + break; /* send IMMEDIATE */ - tx = kranal_new_tx_msg(!in_interrupt(), RANAL_MSG_GET_REQ); + tx = kranal_new_tx_msg(RANAL_MSG_GET_REQ); if (tx == NULL) - return PTL_NO_SPACE; + return -ENOMEM; - if ((libmsg->md->options & PTL_MD_KIOV) == 0) - rc = kranal_setup_virt_buffer(tx, libmsg->md->md_niov, - libmsg->md->md_iov.iov, - 0, libmsg->md->length); + if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) + rc = kranal_setup_virt_buffer(tx, lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.iov, + 0, lntmsg->msg_md->md_length); else - rc = kranal_setup_phys_buffer(tx, libmsg->md->md_niov, - libmsg->md->md_iov.kiov, - 0, libmsg->md->length); + rc = kranal_setup_phys_buffer(tx, lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.kiov, + 0, lntmsg->msg_md->md_length); if (rc != 0) { kranal_tx_done(tx, rc); - return PTL_FAIL; + return -EIO; } - tx->tx_libmsg[1] = lib_create_reply_msg(&kranal_lib, nid, libmsg); - if (tx->tx_libmsg[1] == NULL) { - CERROR("Can't create reply for GET to "LPX64"\n", nid); + tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg); + if (tx->tx_lntmsg[1] == NULL) { + CERROR("Can't create reply for GET to %s\n", + libcfs_nid2str(target.nid)); kranal_tx_done(tx, rc); - return PTL_FAIL; + return -EIO; } - tx->tx_libmsg[0] = libmsg; + tx->tx_lntmsg[0] = lntmsg; tx->tx_msg.ram_u.get.ragm_hdr = *hdr; /* rest of tx_msg is setup just before it is sent */ - kranal_launch_tx(tx, nid); - return PTL_OK; - - case PTL_MSG_ACK: - LASSERT (nob == 0); - break; + kranal_launch_tx(tx, target.nid); + return 0; - case PTL_MSG_PUT: + case LNET_MSG_REPLY: + case LNET_MSG_PUT: if (kiov == NULL && /* not paged */ nob <= RANAL_FMA_MAX_DATA && /* small enough */ - nob <= kranal_tunables.kra_max_immediate) + nob <= *kranal_tunables.kra_max_immediate) break; /* send IMMEDIATE */ - tx = kranal_new_tx_msg(!in_interrupt(), RANAL_MSG_PUT_REQ); + tx = kranal_new_tx_msg(RANAL_MSG_PUT_REQ); if (tx == NULL) - 
return PTL_NO_SPACE; + return -ENOMEM; rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob); if (rc != 0) { kranal_tx_done(tx, rc); - return PTL_FAIL; + return -EIO; } - tx->tx_libmsg[0] = libmsg; + tx->tx_lntmsg[0] = lntmsg; tx->tx_msg.ram_u.putreq.raprm_hdr = *hdr; /* rest of tx_msg is setup just before it is sent */ - kranal_launch_tx(tx, nid); - return PTL_OK; + kranal_launch_tx(tx, target.nid); + return 0; } + /* send IMMEDIATE */ + LASSERT (kiov == NULL); LASSERT (nob <= RANAL_FMA_MAX_DATA); - tx = kranal_new_tx_msg(!(type == PTL_MSG_ACK || - type == PTL_MSG_REPLY || - in_interrupt()), - RANAL_MSG_IMMEDIATE); + tx = kranal_new_tx_msg(RANAL_MSG_IMMEDIATE); if (tx == NULL) - return PTL_NO_SPACE; + return -ENOMEM; rc = kranal_setup_immediate_buffer(tx, niov, iov, offset, nob); if (rc != 0) { kranal_tx_done(tx, rc); - return PTL_FAIL; + return -EIO; } tx->tx_msg.ram_u.immediate.raim_hdr = *hdr; - tx->tx_libmsg[0] = libmsg; - kranal_launch_tx(tx, nid); - return PTL_OK; + tx->tx_lntmsg[0] = lntmsg; + kranal_launch_tx(tx, target.nid); + return 0; } -ptl_err_t -kranal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, struct iovec *iov, - size_t offset, size_t len) +void +kranal_reply(lnet_ni_t *ni, kra_conn_t *conn, lnet_msg_t *lntmsg) { - return kranal_do_send(nal, private, cookie, - hdr, type, nid, pid, - niov, iov, NULL, - offset, len); + kra_msg_t *rxmsg = conn->rac_rxmsg; + unsigned int niov = lntmsg->msg_niov; + struct iovec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + kra_tx_t *tx; + int rc; + + tx = kranal_get_idle_tx(); + if (tx == NULL) + goto failed_0; + + rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob); + if (rc != 0) + goto failed_1; + + tx->tx_conn = conn; + + rc = kranal_map_buffer(tx); + if (rc != 0) + goto failed_1; + + tx->tx_lntmsg[0] = lntmsg; + + kranal_rdma(tx, RANAL_MSG_GET_DONE, + &rxmsg->ram_u.get.ragm_desc, nob, + rxmsg->ram_u.get.ragm_cookie); + return; + + failed_1: + kranal_tx_done(tx, -EIO); + failed_0: + lnet_finalize(ni, lntmsg, -EIO); } -ptl_err_t -kranal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int niov, ptl_kiov_t *kiov, - size_t offset, size_t len) +int +kranal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + void **new_private) { - return kranal_do_send(nal, private, cookie, - hdr, type, nid, pid, - niov, NULL, kiov, - offset, len); + kra_conn_t *conn = (kra_conn_t *)private; + + LCONSOLE_ERROR("Dropping message from %s: no buffers free.\n", + libcfs_nid2str(conn->rac_peer->rap_nid)); + + return -EDEADLK; } -ptl_err_t -kranal_do_recv (lib_nal_t *nal, void *private, lib_msg_t *libmsg, - unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, - int offset, int mlen, int rlen) +int +kranal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) { kra_conn_t *conn = private; kra_msg_t *rxmsg = conn->rac_rxmsg; @@ -828,26 +789,18 @@ kranal_do_recv (lib_nal_t *nal, void *private, lib_msg_t *libmsg, /* Either all pages or all vaddrs */ LASSERT (!(kiov != NULL && iov != NULL)); - CDEBUG(D_NET, "conn %p, rxmsg %p, libmsg %p\n", conn, rxmsg, libmsg); - - if (libmsg == NULL) { - /* GET or ACK or portals is discarding */ 
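/* [Editor's illustrative sketch, not part of the patch.  The send path
 * above chooses between two strategies: a payload that is unpaged and
 * small enough travels inline as a single IMMEDIATE FMA message, while
 * anything bigger negotiates an RDMA rendezvous (PUT_REQ/PUT_ACK or
 * GET_REQ/GET_DONE).  A minimal form of that decision, with hypothetical
 * names:] */

#include <stddef.h>

#define FMA_MAX_DATA ((7 << 10) - 256)  /* max payload in one FMA msg */

enum send_path { SEND_IMMEDIATE, SEND_RDMA };

static enum send_path
choose_send_path(const void *kiov, size_t nob, size_t max_immediate)
{
        if (kiov == NULL &&             /* not paged */
            nob <= FMA_MAX_DATA &&      /* fits in one FMA message */
            nob <= max_immediate)       /* under the runtime breakpoint */
                return SEND_IMMEDIATE;

        return SEND_RDMA;               /* rendezvous via request/ack msgs */
}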
- LASSERT (mlen == 0); - lib_finalize(nal, NULL, libmsg, PTL_OK); - return PTL_OK; - } + CDEBUG(D_NET, "conn %p, rxmsg %p, lntmsg %p\n", conn, rxmsg, lntmsg); switch(rxmsg->ram_type) { default: LBUG(); - return PTL_FAIL; case RANAL_MSG_IMMEDIATE: if (mlen == 0) { buffer = NULL; } else if (kiov != NULL) { CERROR("Can't recv immediate into paged buffer\n"); - return PTL_FAIL; + return -EIO; } else { LASSERT (niov > 0); while (offset >= iov->iov_len) { @@ -858,30 +811,34 @@ kranal_do_recv (lib_nal_t *nal, void *private, lib_msg_t *libmsg, } if (mlen > iov->iov_len - offset) { CERROR("Can't handle immediate frags\n"); - return PTL_FAIL; + return -EIO; } buffer = ((char *)iov->iov_base) + offset; } rc = kranal_consume_rxmsg(conn, buffer, mlen); - lib_finalize(nal, NULL, libmsg, (rc == 0) ? PTL_OK : PTL_FAIL); - return PTL_OK; + lnet_finalize(ni, lntmsg, (rc == 0) ? 0 : -EIO); + return 0; case RANAL_MSG_PUT_REQ: - tx = kranal_new_tx_msg(0, RANAL_MSG_PUT_ACK); - if (tx == NULL) - return PTL_NO_SPACE; - + tx = kranal_new_tx_msg(RANAL_MSG_PUT_ACK); + if (tx == NULL) { + kranal_consume_rxmsg(conn, NULL, 0); + return -ENOMEM; + } + rc = kranal_setup_rdma_buffer(tx, niov, iov, kiov, offset, mlen); if (rc != 0) { kranal_tx_done(tx, rc); - return PTL_FAIL; + kranal_consume_rxmsg(conn, NULL, 0); + return -EIO; } tx->tx_conn = conn; rc = kranal_map_buffer(tx); if (rc != 0) { kranal_tx_done(tx, rc); - return PTL_FAIL; + kranal_consume_rxmsg(conn, NULL, 0); + return -EIO; } tx->tx_msg.ram_u.putack.rapam_src_cookie = @@ -892,34 +849,30 @@ kranal_do_recv (lib_nal_t *nal, void *private, lib_msg_t *libmsg, (__u64)((unsigned long)tx->tx_buffer); tx->tx_msg.ram_u.putack.rapam_desc.rard_nob = mlen; - tx->tx_libmsg[0] = libmsg; /* finalize this on RDMA_DONE */ + tx->tx_lntmsg[0] = lntmsg; /* finalize this on RDMA_DONE */ kranal_post_fma(conn, tx); + kranal_consume_rxmsg(conn, NULL, 0); + return 0; - /* flag matched by consuming rx message */ + case RANAL_MSG_GET_REQ: + if (lntmsg != NULL) { + /* Matched! 
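 * lnet_parse() found a matching MD, so kranal_reply() RDMAs the data
 * straight into the sink buffer the peer described.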
*/ + kranal_reply(ni, conn, lntmsg); + } else { + /* No match */ + tx = kranal_new_tx_msg(RANAL_MSG_GET_NAK); + if (tx != NULL) { + tx->tx_msg.ram_u.completion.racm_cookie = + rxmsg->ram_u.get.ragm_cookie; + kranal_post_fma(conn, tx); + } + } kranal_consume_rxmsg(conn, NULL, 0); - return PTL_OK; + return 0; } } -ptl_err_t -kranal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen, size_t rlen) -{ - return kranal_do_recv(nal, private, msg, niov, iov, NULL, - offset, mlen, rlen); -} - -ptl_err_t -kranal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) -{ - return kranal_do_recv(nal, private, msg, niov, NULL, kiov, - offset, mlen, rlen); -} - int kranal_thread_start (int(*fn)(void *arg), void *arg) { @@ -953,8 +906,8 @@ kranal_check_conn_timeouts (kra_conn_t *conn) if (!conn->rac_close_sent && time_after_eq(now, conn->rac_last_tx + conn->rac_keepalive * HZ)) { /* not sent in a while; schedule conn so scheduler sends a keepalive */ - CDEBUG(D_NET, "Scheduling keepalive %p->"LPX64"\n", - conn, conn->rac_peer->rap_nid); + CDEBUG(D_NET, "Scheduling keepalive %p->%s\n", + conn, libcfs_nid2str(conn->rac_peer->rap_nid)); kranal_schedule_conn(conn); } @@ -962,10 +915,11 @@ kranal_check_conn_timeouts (kra_conn_t *conn) if (!conn->rac_close_recvd && time_after_eq(now, conn->rac_last_rx + timeout)) { - CERROR("%s received from "LPX64" within %lu seconds\n", + CERROR("%s received from %s within %lu seconds\n", (conn->rac_state == RANAL_CONN_ESTABLISHED) ? "Nothing" : "CLOSE not", - conn->rac_peer->rap_nid, (now - conn->rac_last_rx)/HZ); + libcfs_nid2str(conn->rac_peer->rap_nid), + (now - conn->rac_last_rx)/HZ); return -ETIMEDOUT; } @@ -983,8 +937,9 @@ kranal_check_conn_timeouts (kra_conn_t *conn) if (time_after_eq(now, tx->tx_qtime + timeout)) { spin_unlock_irqrestore(&conn->rac_lock, flags); - CERROR("tx on fmaq for "LPX64" blocked %lu seconds\n", - conn->rac_peer->rap_nid, (now - tx->tx_qtime)/HZ); + CERROR("tx on fmaq for %s blocked %lu seconds\n", + libcfs_nid2str(conn->rac_peer->rap_nid), + (now - tx->tx_qtime)/HZ); return -ETIMEDOUT; } } @@ -994,8 +949,9 @@ kranal_check_conn_timeouts (kra_conn_t *conn) if (time_after_eq(now, tx->tx_qtime + timeout)) { spin_unlock_irqrestore(&conn->rac_lock, flags); - CERROR("tx on rdmaq for "LPX64" blocked %lu seconds\n", - conn->rac_peer->rap_nid, (now - tx->tx_qtime)/HZ); + CERROR("tx on rdmaq for %s blocked %lu seconds\n", + libcfs_nid2str(conn->rac_peer->rap_nid), + (now - tx->tx_qtime)/HZ); return -ETIMEDOUT; } } @@ -1005,8 +961,9 @@ kranal_check_conn_timeouts (kra_conn_t *conn) if (time_after_eq(now, tx->tx_qtime + timeout)) { spin_unlock_irqrestore(&conn->rac_lock, flags); - CERROR("tx on replyq for "LPX64" blocked %lu seconds\n", - conn->rac_peer->rap_nid, (now - tx->tx_qtime)/HZ); + CERROR("tx on replyq for %s blocked %lu seconds\n", + libcfs_nid2str(conn->rac_peer->rap_nid), + (now - tx->tx_qtime)/HZ); return -ETIMEDOUT; } } @@ -1044,8 +1001,9 @@ kranal_reaper_check (int idx, unsigned long *min_timeoutp) kranal_conn_addref(conn); read_unlock(&kranal_data.kra_global_lock); - CERROR("Conn to "LPX64", cqid %d timed out\n", - conn->rac_peer->rap_nid, conn->rac_cqid); + CERROR("Conn to %s, cqid %d timed out\n", + libcfs_nid2str(conn->rac_peer->rap_nid), + conn->rac_cqid); write_lock_irqsave(&kranal_data.kra_global_lock, flags); @@ -1085,8 +1043,8 @@ kranal_connd (void *arg) int did_something; 
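/* [Editor's sketch with hypothetical names; not part of the patch.
 * kranal_connd below, like the reaper and scheduler threads, follows the
 * 2.6-era kernel daemon skeleton: detach via the libcfs cfs_daemonize()/
 * cfs_block_allsigs() wrappers, then sleep on a wait queue.  Registering
 * with add_wait_queue_exclusive() means each wake_up() rouses a single
 * waiter, so a pool of identical daemons does not stampede for every
 * queued item.  Globals are assumed initialised at module load.] */

#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/spinlock.h>
#include <linux/list.h>

static spinlock_t        work_lock;     /* protects work_queue */
static struct list_head  work_queue;    /* items awaiting service */
static wait_queue_head_t work_waitq;    /* daemons sleep here */
static int               work_shutdown; /* set once at shutdown */

static int daemon_skeleton(void *arg)
{
        wait_queue_t  wait;
        unsigned long flags;

        cfs_daemonize("my_daemon");      /* libcfs: detach from userspace */
        cfs_block_allsigs();             /* daemons ignore signals */

        init_waitqueue_entry(&wait, current);

        spin_lock_irqsave(&work_lock, flags);
        while (!work_shutdown) {
                if (!list_empty(&work_queue)) {
                        /* dequeue under the lock, drop it to do the work */
                        spin_unlock_irqrestore(&work_lock, flags);
                        /* ... service one item here ... */
                        spin_lock_irqsave(&work_lock, flags);
                        continue;
                }

                /* idle: sleep until one wake_up() picks this waiter */
                set_current_state(TASK_INTERRUPTIBLE);
                add_wait_queue_exclusive(&work_waitq, &wait);
                spin_unlock_irqrestore(&work_lock, flags);

                schedule();

                remove_wait_queue(&work_waitq, &wait);
                set_current_state(TASK_RUNNING);
                spin_lock_irqsave(&work_lock, flags);
        }
        spin_unlock_irqrestore(&work_lock, flags);
        return 0;
}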
snprintf(name, sizeof(name), "kranal_connd_%02ld", id); - kportal_daemonize(name); - kportal_blockallsigs(); + cfs_daemonize(name); + cfs_block_allsigs(); init_waitqueue_entry(&wait, current); @@ -1131,7 +1089,7 @@ kranal_connd (void *arg) continue; set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kranal_data.kra_connd_waitq, &wait); + add_wait_queue_exclusive(&kranal_data.kra_connd_waitq, &wait); spin_unlock_irqrestore(&kranal_data.kra_connd_lock, flags); @@ -1178,8 +1136,8 @@ kranal_reaper (void *arg) long next_min_timeout = MAX_SCHEDULE_TIMEOUT; long current_min_timeout = 1; - kportal_daemonize("kranal_reaper"); - kportal_blockallsigs(); + cfs_daemonize("kranal_reaper"); + cfs_block_allsigs(); init_waitqueue_entry(&wait, current); @@ -1376,7 +1334,8 @@ kranal_check_fma_cq (kra_device_t *dev) } /* FMA CQ has overflowed: check ALL conns */ - CWARN("Scheduling ALL conns on device %d\n", dev->rad_id); + CWARN("FMA CQ overflow: scheduling ALL conns on device %d\n", + dev->rad_id); for (i = 0; i < kranal_data.kra_conn_hash_size; i++) { @@ -1482,7 +1441,8 @@ kranal_process_fmaq (kra_conn_t *conn) if (conn->rac_close_sent) return; - CWARN("sending CLOSE to "LPX64"\n", conn->rac_peer->rap_nid); + CWARN("sending CLOSE to %s\n", + libcfs_nid2str(conn->rac_peer->rap_nid)); kranal_init_msg(&conn->rac_msg, RANAL_MSG_CLOSE); rc = kranal_sendmsg(conn, &conn->rac_msg, NULL, 0); if (rc != 0) @@ -1509,8 +1469,8 @@ kranal_process_fmaq (kra_conn_t *conn) if (time_after_eq(jiffies, conn->rac_last_tx + conn->rac_keepalive * HZ)) { - CDEBUG(D_NET, "sending NOOP -> "LPX64" (%p idle %lu(%ld))\n", - conn->rac_peer->rap_nid, conn, + CDEBUG(D_NET, "sending NOOP -> %s (%p idle %lu(%ld))\n", + libcfs_nid2str(conn->rac_peer->rap_nid), conn, (jiffies - conn->rac_last_tx)/HZ, conn->rac_keepalive); kranal_init_msg(&conn->rac_msg, RANAL_MSG_NOOP); kranal_sendmsg(conn, &conn->rac_msg, NULL, 0); @@ -1634,9 +1594,9 @@ kranal_match_reply(kra_conn_t *conn, int type, __u64 cookie) if (tx->tx_msg.ram_type != type) { spin_unlock_irqrestore(&conn->rac_lock, flags); CWARN("Unexpected type %x (%x expected) " - "matched reply from "LPX64"\n", + "matched reply from %s\n", tx->tx_msg.ram_type, type, - conn->rac_peer->rap_nid); + libcfs_nid2str(conn->rac_peer->rap_nid)); return NULL; } @@ -1646,8 +1606,8 @@ kranal_match_reply(kra_conn_t *conn, int type, __u64 cookie) } spin_unlock_irqrestore(&conn->rac_lock, flags); - CWARN("Unmatched reply %02x/"LPX64" from "LPX64"\n", - type, cookie, conn->rac_peer->rap_nid); + CWARN("Unmatched reply %02x/"LPX64" from %s\n", + type, cookie, libcfs_nid2str(conn->rac_peer->rap_nid)); return NULL; } @@ -1661,6 +1621,8 @@ kranal_check_fma_rx (kra_conn_t *conn) void *prefix; RAP_RETURN rrc = RapkFmaGetPrefix(conn->rac_rihandle, &prefix); kra_peer_t *peer = conn->rac_peer; + int rc = 0; + int repost = 1; if (rrc == RAP_NOT_DONE) return; @@ -1679,8 +1641,9 @@ kranal_check_fma_rx (kra_conn_t *conn) if (msg->ram_magic != RANAL_MSG_MAGIC) { if (__swab32(msg->ram_magic) != RANAL_MSG_MAGIC) { - CERROR("Unexpected magic %08x from "LPX64"\n", - msg->ram_magic, peer->rap_nid); + CERROR("Unexpected magic %08x from %s\n", + msg->ram_magic, libcfs_nid2str(peer->rap_nid)); + rc = -EPROTO; goto out; } @@ -1707,45 +1670,55 @@ kranal_check_fma_rx (kra_conn_t *conn) } if (msg->ram_version != RANAL_MSG_VERSION) { - CERROR("Unexpected protocol version %d from "LPX64"\n", - msg->ram_version, peer->rap_nid); + CERROR("Unexpected protocol version %d from %s\n", + msg->ram_version, libcfs_nid2str(peer->rap_nid)); + rc = 
-EPROTO; goto out; } if (msg->ram_srcnid != peer->rap_nid) { - CERROR("Unexpected peer "LPX64" from "LPX64"\n", - msg->ram_srcnid, peer->rap_nid); + CERROR("Unexpected peer %s from %s\n", + libcfs_nid2str(msg->ram_srcnid), + libcfs_nid2str(peer->rap_nid)); + rc = -EPROTO; goto out; } if (msg->ram_connstamp != conn->rac_peer_connstamp) { CERROR("Unexpected connstamp "LPX64"("LPX64 - " expected) from "LPX64"\n", + " expected) from %s\n", msg->ram_connstamp, conn->rac_peer_connstamp, - peer->rap_nid); + libcfs_nid2str(peer->rap_nid)); + rc = -EPROTO; goto out; } if (msg->ram_seq != seq) { - CERROR("Unexpected sequence number %d(%d expected) from " - LPX64"\n", msg->ram_seq, seq, peer->rap_nid); + CERROR("Unexpected sequence number %d(%d expected) from %s\n", + msg->ram_seq, seq, libcfs_nid2str(peer->rap_nid)); + rc = -EPROTO; goto out; } if ((msg->ram_type & RANAL_MSG_FENCE) != 0) { /* This message signals RDMA completion... */ rrc = RapkFmaSyncWait(conn->rac_rihandle); - LASSERT (rrc == RAP_SUCCESS); + if (rrc != RAP_SUCCESS) { + CERROR("RapkFmaSyncWait failed: %d\n", rrc); + rc = -ENETDOWN; + goto out; + } } if (conn->rac_close_recvd) { - CERROR("Unexpected message %d after CLOSE from "LPX64"\n", - msg->ram_type, conn->rac_peer->rap_nid); + CERROR("Unexpected message %d after CLOSE from %s\n", + msg->ram_type, libcfs_nid2str(conn->rac_peer->rap_nid)); + rc = -EPROTO; goto out; } if (msg->ram_type == RANAL_MSG_CLOSE) { - CWARN("RX CLOSE from "LPX64"\n", conn->rac_peer->rap_nid); + CWARN("RX CLOSE from %s\n", libcfs_nid2str(conn->rac_peer->rap_nid)); conn->rac_close_recvd = 1; write_lock_irqsave(&kranal_data.kra_global_lock, flags); @@ -1770,23 +1743,16 @@ kranal_check_fma_rx (kra_conn_t *conn) case RANAL_MSG_IMMEDIATE: CDEBUG(D_NET, "RX IMMEDIATE on %p\n", conn); - lib_parse(&kranal_lib, &msg->ram_u.immediate.raim_hdr, conn); + rc = lnet_parse(kranal_data.kra_ni, &msg->ram_u.immediate.raim_hdr, + msg->ram_srcnid, conn, 0); + repost = rc < 0; break; case RANAL_MSG_PUT_REQ: CDEBUG(D_NET, "RX PUT_REQ on %p\n", conn); - lib_parse(&kranal_lib, &msg->ram_u.putreq.raprm_hdr, conn); - - if (conn->rac_rxmsg == NULL) /* lib_parse matched something */ - break; - - tx = kranal_new_tx_msg(0, RANAL_MSG_PUT_NAK); - if (tx == NULL) - break; - - tx->tx_msg.ram_u.completion.racm_cookie = - msg->ram_u.putreq.raprm_cookie; - kranal_post_fma(conn, tx); + rc = lnet_parse(kranal_data.kra_ni, &msg->ram_u.putreq.raprm_hdr, + msg->ram_srcnid, conn, 1); + repost = rc < 0; break; case RANAL_MSG_PUT_NAK: @@ -1828,17 +1794,9 @@ kranal_check_fma_rx (kra_conn_t *conn) case RANAL_MSG_GET_REQ: CDEBUG(D_NET, "RX GET_REQ on %p\n", conn); - lib_parse(&kranal_lib, &msg->ram_u.get.ragm_hdr, conn); - - if (conn->rac_rxmsg == NULL) /* lib_parse matched something */ - break; - - tx = kranal_new_tx_msg(0, RANAL_MSG_GET_NAK); - if (tx == NULL) - break; - - tx->tx_msg.ram_u.completion.racm_cookie = msg->ram_u.get.ragm_cookie; - kranal_post_fma(conn, tx); + rc = lnet_parse(kranal_data.kra_ni, &msg->ram_u.get.ragm_hdr, + msg->ram_srcnid, conn, 1); + repost = rc < 0; break; case RANAL_MSG_GET_NAK: @@ -1862,12 +1820,20 @@ kranal_check_fma_rx (kra_conn_t *conn) LASSERT (tx->tx_buftype == RANAL_BUF_PHYS_MAPPED || tx->tx_buftype == RANAL_BUF_VIRT_MAPPED); +#if 0 + /* completion message should send rdma length if we ever allow + * GET truncation */ + lnet_set_reply_msg_len(kranal_data.kra_ni, tx->tx_lntmsg[1], ???); +#endif kranal_tx_done(tx, 0); break; } out: - if (conn->rac_rxmsg != NULL) + if (rc < 0) /* protocol/comms error */ + 
kranal_close_conn (conn, rc); + + if (repost && conn->rac_rxmsg != NULL) kranal_consume_rxmsg(conn, NULL, 0); /* check again later */ @@ -1901,8 +1867,8 @@ kranal_complete_closed_conn (kra_conn_t *conn) kranal_tx_done(tx, -ECONNABORTED); } - CWARN("Closed conn %p -> "LPX64": nmsg %d nreplies %d\n", - conn, conn->rac_peer->rap_nid, nfma, nreplies); + CWARN("Closed conn %p -> %s: nmsg %d nreplies %d\n", + conn, libcfs_nid2str(conn->rac_peer->rap_nid), nfma, nreplies); } int @@ -1944,8 +1910,8 @@ kranal_scheduler (void *arg) int busy_loops = 0; snprintf(name, sizeof(name), "kranal_sd_%02d", dev->rad_idx); - kportal_daemonize(name); - kportal_blockallsigs(); + cfs_daemonize(name); + cfs_block_allsigs(); dev->rad_scheduler = current; init_waitqueue_entry(&wait, current); @@ -2043,7 +2009,7 @@ kranal_scheduler (void *arg) continue; set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&dev->rad_waitq, &wait); + add_wait_queue_exclusive(&dev->rad_waitq, &wait); spin_unlock_irqrestore(&dev->rad_lock, flags); if (nsoonest == 0) { @@ -2068,13 +2034,3 @@ kranal_scheduler (void *arg) kranal_thread_fini(); return 0; } - - -lib_nal_t kranal_lib = { - libnal_data: &kranal_data, /* NAL private data */ - libnal_send: kranal_send, - libnal_send_pages: kranal_send_pages, - libnal_recv: kranal_recv, - libnal_recv_pages: kranal_recv_pages, - libnal_dist: kranal_dist -}; diff --git a/lnet/klnds/ralnd/ralnd_modparams.c b/lnet/klnds/ralnd/ralnd_modparams.c new file mode 100644 index 0000000..45f42e1 --- /dev/null +++ b/lnet/klnds/ralnd/ralnd_modparams.c @@ -0,0 +1,135 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "ralnd.h" + +static int n_connd = 4; +CFS_MODULE_PARM(n_connd, "i", int, 0444, + "# of connection daemons"); + +static int min_reconnect_interval = 1; +CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644, + "minimum connection retry interval (seconds)"); + +static int max_reconnect_interval = 60; +CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644, + "maximum connection retry interval (seconds)"); + +static int ntx = 256; +CFS_MODULE_PARM(ntx, "i", int, 0444, + "# of transmit descriptors"); + +static int credits = 128; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = 32; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, + "# concurrent sends to 1 peer"); + +static int fma_cq_size = 8192; +CFS_MODULE_PARM(fma_cq_size, "i", int, 0444, + "size of the completion queue"); + +static int timeout = 30; +CFS_MODULE_PARM(timeout, "i", int, 0644, + "communications timeout (seconds)"); + +static int max_immediate = (2<<10); +CFS_MODULE_PARM(max_immediate, "i", int, 0644, + "immediate/RDMA breakpoint"); + +kra_tunables_t kranal_tunables = { + .kra_n_connd = &n_connd, + .kra_min_reconnect_interval = &min_reconnect_interval, + .kra_max_reconnect_interval = &max_reconnect_interval, + .kra_ntx = &ntx, + .kra_credits = &credits, + .kra_peercredits = &peer_credits, + .kra_fma_cq_size = &fma_cq_size, + .kra_timeout = &timeout, + .kra_max_immediate = &max_immediate, +}; + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM +static ctl_table kranal_ctl_table[] = { + {1, "n_connd", &n_connd, + sizeof(int), 0444, NULL, &proc_dointvec}, + {2, "min_reconnect_interval", &min_reconnect_interval, + sizeof(int), 0644, NULL, &proc_dointvec}, + {3, "max_reconnect_interval", &max_reconnect_interval, + sizeof(int), 0644, NULL, &proc_dointvec}, + {4, "ntx", &ntx, + sizeof(int), 0444, NULL, &proc_dointvec}, + {5, "credits", &credits, + sizeof(int), 0444, NULL, &proc_dointvec}, + {6, "peer_credits", &peer_credits, + sizeof(int), 0444, NULL, &proc_dointvec}, + {7, "fma_cq_size", &fma_cq_size, + sizeof(int), 0444, NULL, &proc_dointvec}, + {8, "timeout", &timeout, + sizeof(int), 0644, NULL, &proc_dointvec}, + {9, "max_immediate", &max_immediate, + sizeof(int), 0644, NULL, &proc_dointvec}, + {0} +}; + +static ctl_table kranal_top_ctl_table[] = { + {202, "ranal", NULL, 0, 0555, kranal_ctl_table}, + {0} +}; + +int +kranal_tunables_init () +{ + kranal_tunables.kra_sysctl = + register_sysctl_table(kranal_top_ctl_table, 0); + + if (kranal_tunables.kra_sysctl == NULL) + CWARN("Can't setup /proc tunables\n"); + + return 0; +} + +void +kranal_tunables_fini () +{ + if (kranal_tunables.kra_sysctl != NULL) + unregister_sysctl_table(kranal_tunables.kra_sysctl); +} + +#else + +int +kranal_tunables_init () +{ + return 0; +} + +void +kranal_tunables_fini () +{ +} + +#endif + diff --git a/lnet/klnds/socklnd/Info.plist b/lnet/klnds/socklnd/Info.plist index 11be93d..f5a5460 100644 --- a/lnet/klnds/socklnd/Info.plist +++ b/lnet/klnds/socklnd/Info.plist @@ -5,11 +5,11 @@ CFBundleDevelopmentRegion English CFBundleExecutable - ksocknal + ksocklnd CFBundleIconFile CFBundleIdentifier - com.clusterfs.lustre.ksocknal + com.clusterfs.lustre.ksocklnd CFBundleInfoDictionaryVersion 6.0 CFBundlePackageType @@ -22,15 +22,17 @@ 1.0.0 OSBundleLibraries - com.apple.kernel.bsd - 1.1 - com.apple.kernel.iokit - 1.0.0b1 - com.apple.kernel.mach - 1.0.0b1 + com.apple.kpi.bsd + 8.0.0b1 + com.apple.kpi.libkern + 8.0.0b1 + com.apple.kpi.mach + 8.0.0b1 + com.apple.kpi.unsupported + 8.0.0b1 
com.clusterfs.lustre.libcfs 1.0.0 - com.clusterfs.lustre.portals + com.clusterfs.lustre.lnet 1.0.0 diff --git a/lnet/klnds/socklnd/Makefile.in b/lnet/klnds/socklnd/Makefile.in index 7fe9638..3a6c3f7 100644 --- a/lnet/klnds/socklnd/Makefile.in +++ b/lnet/klnds/socklnd/Makefile.in @@ -1,9 +1,5 @@ -MODULES := ksocknal +MODULES := ksocklnd -ksocknal-objs := socknal.o socknal_cb.o socknal_lib-linux.o - -# If you don't build with -O2, your modules won't insert, becahse htonl is -# just special that way. -EXTRA_POST_CFLAGS := -O2 +ksocklnd-objs := socklnd.o socklnd_cb.o socklnd_modparams.o socklnd_lib-linux.o @INCLUDE_RULES@ diff --git a/lnet/klnds/socklnd/autoMakefile.am b/lnet/klnds/socklnd/autoMakefile.am index 71a3633..0dbe697 100644 --- a/lnet/klnds/socklnd/autoMakefile.am +++ b/lnet/klnds/socklnd/autoMakefile.am @@ -1,25 +1,23 @@ if MODULES if LINUX -if !CRAY_PORTALS - modulenet_DATA := ksocknal$(KMODEXT) + modulenet_DATA := ksocklnd$(KMODEXT) -endif # !CRAY_PORTALS endif # LINUX endif # MODULES -DIST_SOURCES := $(ksocknal-objs:%.o=%.c) socknal_lib-linux.h socknal.h +DIST_SOURCES := $(ksocklnd-objs:%.o=%.c) socklnd_lib-linux.h socklnd.h if DARWIN - macos_PROGRAMS := ksocknal + macos_PROGRAMS := ksocklnd - nodist_ksocknal_SOURCES := socknal.c socknal_cb.c socknal_lib-darwin.c - DIST_SOURCES += socknal_lib-darwin.c socknal_lib-darwin.h + nodist_ksocklnd_SOURCES := socklnd.c socklnd_cb.c socklnd_modparams.c socklnd_lib-darwin.c + DIST_SOURCES += socklnd_lib-darwin.c socklnd_lib-darwin.h - ksocknal_CFLAGS := $(EXTRA_KCFLAGS) - ksocknal_LDFLAGS := $(EXTRA_KLDFLAGS) - ksocknal_LDADD := $(EXTRA_KLIBS) + ksocklnd_CFLAGS := $(EXTRA_KCFLAGS) + ksocklnd_LDFLAGS := $(EXTRA_KLDFLAGS) + ksocklnd_LDADD := $(EXTRA_KLIBS) plist_DATA := Info.plist install_data_hook := fix-kext-ownership @@ -29,4 +27,4 @@ endif # DARWIN EXTRA_DIST := $(plist_DATA) install-data-hook: $(install_data_hook) -MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ socknal_lib.c +MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ socklnd_lib.c diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 295ec35..9f6ba9c 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -23,47 +23,31 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include "socknal.h" - -nal_t ksocknal_api; -ksock_nal_data_t ksocknal_data; -ptl_handle_ni_t ksocknal_ni; -ksock_tunables_t ksocknal_tunables; - -kpr_nal_interface_t ksocknal_router_interface = { - kprni_nalid: SOCKNAL, - kprni_arg: &ksocknal_data, - kprni_fwd: ksocknal_fwd_packet, - kprni_notify: ksocknal_notify, +#include "socklnd.h" + +lnd_t the_ksocklnd = { + .lnd_type = SOCKLND, + .lnd_startup = ksocknal_startup, + .lnd_shutdown = ksocknal_shutdown, + .lnd_ctl = ksocknal_ctl, + .lnd_send = ksocknal_send, + .lnd_recv = ksocknal_recv, + .lnd_notify = ksocknal_notify, + .lnd_accept = ksocknal_accept, }; -int -ksocknal_set_mynid(ptl_nid_t nid) -{ - lib_ni_t *ni = &ksocknal_lib.libnal_ni; - - /* FIXME: we have to do this because we call lib_init() at module - * insertion time, which is before we have 'mynid' available. lib_init - * sets the NAL's nid, which it uses to tell other nodes where packets - * are coming from. This is not a very graceful solution to this - * problem. 
*/ - - CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", - nid, ni->ni_pid.nid); - - ni->ni_pid.nid = nid; - return (0); -} +ksock_nal_data_t ksocknal_data; ksock_interface_t * -ksocknal_ip2iface(__u32 ip) +ksocknal_ip2iface(lnet_ni_t *ni, __u32 ip) { + ksock_net_t *net = ni->ni_data; int i; ksock_interface_t *iface; - for (i = 0; i < ksocknal_data.ksnd_ninterfaces; i++) { - LASSERT(i < SOCKNAL_MAX_INTERFACES); - iface = &ksocknal_data.ksnd_interfaces[i]; + for (i = 0; i < net->ksnn_ninterfaces; i++) { + LASSERT(i < LNET_MAX_INTERFACES); + iface = &net->ksnn_interfaces[i]; if (iface->ksni_ipaddr == ip) return (iface); @@ -77,21 +61,22 @@ ksocknal_create_route (__u32 ipaddr, int port) { ksock_route_t *route; - PORTAL_ALLOC (route, sizeof (*route)); + LIBCFS_ALLOC (route, sizeof (*route)); if (route == NULL) return (NULL); atomic_set (&route->ksnr_refcount, 1); route->ksnr_peer = NULL; - route->ksnr_timeout = cfs_time_current(); - route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; + route->ksnr_retry_interval = 0; /* OK to connect at any time */ route->ksnr_ipaddr = ipaddr; route->ksnr_port = port; + route->ksnr_scheduled = 0; route->ksnr_connecting = 0; route->ksnr_connected = 0; route->ksnr_deleted = 0; route->ksnr_conn_count = 0; route->ksnr_share_count = 0; + route->ksnr_proto = &ksocknal_protocol_v2x; return (route); } @@ -99,86 +84,90 @@ ksocknal_create_route (__u32 ipaddr, int port) void ksocknal_destroy_route (ksock_route_t *route) { - if (route->ksnr_peer != NULL) - ksocknal_put_peer (route->ksnr_peer); + LASSERT (atomic_read(&route->ksnr_refcount) == 0); - PORTAL_FREE (route, sizeof (*route)); -} - -void -ksocknal_put_route (ksock_route_t *route) -{ - CDEBUG (D_OTHER, "putting route[%p] (%d)\n", - route, atomic_read (&route->ksnr_refcount)); - - LASSERT (atomic_read (&route->ksnr_refcount) > 0); - if (!atomic_dec_and_test (&route->ksnr_refcount)) - return; + if (route->ksnr_peer != NULL) + ksocknal_peer_decref(route->ksnr_peer); - ksocknal_destroy_route (route); + LIBCFS_FREE (route, sizeof (*route)); } -ksock_peer_t * -ksocknal_create_peer (ptl_nid_t nid) +int +ksocknal_create_peer (ksock_peer_t **peerp, lnet_ni_t *ni, lnet_process_id_t id) { - ksock_peer_t *peer; + ksock_net_t *net = ni->ni_data; + ksock_peer_t *peer; - LASSERT (nid != PTL_NID_ANY); + LASSERT (id.nid != LNET_NID_ANY); + LASSERT (id.pid != LNET_PID_ANY); + LASSERT (!in_interrupt()); - PORTAL_ALLOC (peer, sizeof (*peer)); + LIBCFS_ALLOC (peer, sizeof (*peer)); if (peer == NULL) - return (NULL); + return -ENOMEM; memset (peer, 0, sizeof (*peer)); /* NULL pointers/clear flags etc */ - peer->ksnp_nid = nid; + peer->ksnp_ni = ni; + peer->ksnp_id = id; atomic_set (&peer->ksnp_refcount, 1); /* 1 ref for caller */ peer->ksnp_closing = 0; + peer->ksnp_accepting = 0; + peer->ksnp_zc_next_cookie = 1; CFS_INIT_LIST_HEAD (&peer->ksnp_conns); CFS_INIT_LIST_HEAD (&peer->ksnp_routes); CFS_INIT_LIST_HEAD (&peer->ksnp_tx_queue); + CFS_INIT_LIST_HEAD (&peer->ksnp_zc_req_list); + spin_lock_init(&peer->ksnp_lock); - atomic_inc (&ksocknal_data.ksnd_npeers); - return (peer); + spin_lock_bh (&net->ksnn_lock); + + if (net->ksnn_shutdown) { + spin_unlock_bh (&net->ksnn_lock); + + LIBCFS_FREE(peer, sizeof(*peer)); + CERROR("Can't create peer: network shutdown\n"); + return -ESHUTDOWN; + } + + net->ksnn_npeers++; + + spin_unlock_bh (&net->ksnn_lock); + + *peerp = peer; + return 0; } void ksocknal_destroy_peer (ksock_peer_t *peer) { - CDEBUG (D_NET, "peer "LPX64" %p deleted\n", peer->ksnp_nid, peer); + ksock_net_t *net 
= peer->ksnp_ni->ni_data; + + CDEBUG (D_NET, "peer %s %p deleted\n", + libcfs_id2str(peer->ksnp_id), peer); LASSERT (atomic_read (&peer->ksnp_refcount) == 0); + LASSERT (peer->ksnp_accepting == 0); LASSERT (list_empty (&peer->ksnp_conns)); LASSERT (list_empty (&peer->ksnp_routes)); LASSERT (list_empty (&peer->ksnp_tx_queue)); + LASSERT (list_empty (&peer->ksnp_zc_req_list)); - PORTAL_FREE (peer, sizeof (*peer)); - - /* NB a peer's connections and autoconnect routes keep a reference - * on their peer until they are destroyed, so we can be assured - * that _all_ state to do with this peer has been cleaned up when - * its refcount drops to zero. */ - atomic_dec (&ksocknal_data.ksnd_npeers); -} - -void -ksocknal_put_peer (ksock_peer_t *peer) -{ - CDEBUG (D_OTHER, "putting peer[%p] -> "LPX64" (%d)\n", - peer, peer->ksnp_nid, - atomic_read (&peer->ksnp_refcount)); - - LASSERT (atomic_read (&peer->ksnp_refcount) > 0); - if (!atomic_dec_and_test (&peer->ksnp_refcount)) - return; + LIBCFS_FREE (peer, sizeof (*peer)); - ksocknal_destroy_peer (peer); + /* NB a peer's connections and routes keep a reference on their peer + * until they are destroyed, so we can be assured that _all_ state to + * do with this peer has been cleaned up when its refcount drops to + * zero. */ + spin_lock_bh (&net->ksnn_lock); + net->ksnn_npeers--; + spin_unlock_bh (&net->ksnn_lock); } ksock_peer_t * -ksocknal_find_peer_locked (ptl_nid_t nid) +ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id) { - struct list_head *peer_list = ksocknal_nid2peerlist (nid); + struct list_head *peer_list = ksocknal_nid2peerlist(id.nid); struct list_head *tmp; ksock_peer_t *peer; @@ -188,25 +177,30 @@ ksocknal_find_peer_locked (ptl_nid_t nid) LASSERT (!peer->ksnp_closing); - if (peer->ksnp_nid != nid) + if (peer->ksnp_ni != ni) continue; - CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n", - peer, nid, atomic_read (&peer->ksnp_refcount)); + if (peer->ksnp_id.nid != id.nid || + peer->ksnp_id.pid != id.pid) + continue; + + CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n", + peer, libcfs_id2str(id), + atomic_read(&peer->ksnp_refcount)); return (peer); } return (NULL); } ksock_peer_t * -ksocknal_get_peer (ptl_nid_t nid) +ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id) { ksock_peer_t *peer; read_lock (&ksocknal_data.ksnd_global_lock); - peer = ksocknal_find_peer_locked (nid); + peer = ksocknal_find_peer_locked (ni, id); if (peer != NULL) /* +1 ref for caller? 
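The create/destroy pair above replaces the old global ksnd_npeers with per-net accounting: ksnn_npeers is bumped only after the ksnn_shutdown check, inside the same ksnn_lock critical section, so shutdown can set the flag and then simply wait for the counter to drain. A compressed user-space sketch of that guard, with a pthread mutex standing in for the bh spinlock:

    #include <errno.h>
    #include <pthread.h>
    #include <stdlib.h>

    struct net {
            pthread_mutex_t lock;     /* stands in for ksnn_lock */
            int             shutdown; /* stands in for ksnn_shutdown */
            int             npeers;   /* stands in for ksnn_npeers */
    };

    struct peer { struct net *net; };

    /* The shutdown test and the increment share one critical
     * section, so no peer can slip in after teardown starts */
    int peer_create(struct net *net, struct peer **peerp)
    {
            struct peer *peer = calloc(1, sizeof(*peer));
            if (peer == NULL)
                    return -ENOMEM;

            peer->net = net;

            pthread_mutex_lock(&net->lock);
            if (net->shutdown) {
                    pthread_mutex_unlock(&net->lock);
                    free(peer);
                    return -ESHUTDOWN;
            }
            net->npeers++;            /* counted only after the guard */
            pthread_mutex_unlock(&net->lock);

            *peerp = peer;
            return 0;
    }

    void peer_destroy(struct peer *peer)
    {
            struct net *net = peer->net;

            free(peer);

            pthread_mutex_lock(&net->lock);
            net->npeers--;            /* shutdown polls this to zero */
            pthread_mutex_unlock(&net->lock);
    }

    int main(void)
    {
            struct net net = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };
            struct peer *peer;

            if (peer_create(&net, &peer) == 0)
                    peer_destroy(peer);

            net.shutdown = 1;         /* new peers now refused */
            return peer_create(&net, &peer) == -ESHUTDOWN ? 0 : 1;
    }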
*/ - atomic_inc (&peer->ksnp_refcount); + ksocknal_peer_addref(peer); read_unlock (&ksocknal_data.ksnd_global_lock); return (peer); @@ -219,10 +213,10 @@ ksocknal_unlink_peer_locked (ksock_peer_t *peer) __u32 ip; for (i = 0; i < peer->ksnp_n_passive_ips; i++) { - LASSERT (i < SOCKNAL_MAX_INTERFACES); + LASSERT (i < LNET_MAX_INTERFACES); ip = peer->ksnp_passive_ips[i]; - ksocknal_ip2iface(ip)->ksni_npeers--; + ksocknal_ip2iface(peer->ksnp_ni, ip)->ksni_npeers--; } LASSERT (list_empty(&peer->ksnp_conns)); @@ -231,12 +225,12 @@ ksocknal_unlink_peer_locked (ksock_peer_t *peer) peer->ksnp_closing = 1; list_del (&peer->ksnp_list); /* lose peerlist's ref */ - ksocknal_put_peer (peer); + ksocknal_peer_decref(peer); } int -ksocknal_get_peer_info (int index, ptl_nid_t *nid, - __u32 *myip, __u32 *peer_ip, int *port, +ksocknal_get_peer_info (lnet_ni_t *ni, int index, + lnet_process_id_t *id, __u32 *myip, __u32 *peer_ip, int *port, int *conn_count, int *share_count) { ksock_peer_t *peer; @@ -254,12 +248,15 @@ ksocknal_get_peer_info (int index, ptl_nid_t *nid, list_for_each (ptmp, &ksocknal_data.ksnd_peers[i]) { peer = list_entry (ptmp, ksock_peer_t, ksnp_list); + if (peer->ksnp_ni != ni) + continue; + if (peer->ksnp_n_passive_ips == 0 && list_empty(&peer->ksnp_routes)) { if (index-- > 0) continue; - *nid = peer->ksnp_nid; + *id = peer->ksnp_id; *myip = 0; *peer_ip = 0; *port = 0; @@ -273,7 +270,7 @@ ksocknal_get_peer_info (int index, ptl_nid_t *nid, if (index-- > 0) continue; - *nid = peer->ksnp_nid; + *id = peer->ksnp_id; *myip = peer->ksnp_passive_ips[j]; *peer_ip = 0; *port = 0; @@ -290,7 +287,7 @@ ksocknal_get_peer_info (int index, ptl_nid_t *nid, route = list_entry(rtmp, ksock_route_t, ksnr_list); - *nid = peer->ksnp_nid; + *id = peer->ksnp_id; *myip = route->ksnr_myipaddr; *peer_ip = route->ksnr_ipaddr; *port = route->ksnr_port; @@ -314,29 +311,31 @@ ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn) ksock_interface_t *iface; conn->ksnc_route = route; - atomic_inc (&route->ksnr_refcount); + ksocknal_route_addref(route); if (route->ksnr_myipaddr != conn->ksnc_myipaddr) { if (route->ksnr_myipaddr == 0) { /* route wasn't bound locally yet (the initial route) */ - CWARN("Binding "LPX64" %u.%u.%u.%u to %u.%u.%u.%u\n", - peer->ksnp_nid, - HIPQUAD(route->ksnr_ipaddr), - HIPQUAD(conn->ksnc_myipaddr)); + CDEBUG(D_NET, "Binding %s %u.%u.%u.%u to %u.%u.%u.%u\n", + libcfs_id2str(peer->ksnp_id), + HIPQUAD(route->ksnr_ipaddr), + HIPQUAD(conn->ksnc_myipaddr)); } else { - CWARN("Rebinding "LPX64" %u.%u.%u.%u from " - "%u.%u.%u.%u to %u.%u.%u.%u\n", - peer->ksnp_nid, - HIPQUAD(route->ksnr_ipaddr), - HIPQUAD(route->ksnr_myipaddr), - HIPQUAD(conn->ksnc_myipaddr)); - - iface = ksocknal_ip2iface(route->ksnr_myipaddr); + CDEBUG(D_NET, "Rebinding %s %u.%u.%u.%u from " + "%u.%u.%u.%u to %u.%u.%u.%u\n", + libcfs_id2str(peer->ksnp_id), + HIPQUAD(route->ksnr_ipaddr), + HIPQUAD(route->ksnr_myipaddr), + HIPQUAD(conn->ksnc_myipaddr)); + + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); if (iface != NULL) iface->ksni_nroutes--; } route->ksnr_myipaddr = conn->ksnc_myipaddr; - iface = ksocknal_ip2iface(route->ksnr_myipaddr); + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); if (iface != NULL) iface->ksni_nroutes++; } @@ -346,8 +345,7 @@ ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn) /* Successful connection => further attempts can * proceed immediately */ - route->ksnr_timeout = cfs_time_current(); - 
route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; + route->ksnr_retry_interval = 0; } void @@ -355,10 +353,11 @@ ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route) { struct list_head *tmp; ksock_conn_t *conn; - int type; ksock_route_t *route2; + LASSERT (!peer->ksnp_closing); LASSERT (route->ksnr_peer == NULL); + LASSERT (!route->ksnr_scheduled); LASSERT (!route->ksnr_connecting); LASSERT (route->ksnr_connected == 0); @@ -367,20 +366,20 @@ ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route) route2 = list_entry(tmp, ksock_route_t, ksnr_list); if (route2->ksnr_ipaddr == route->ksnr_ipaddr) { - CERROR ("Duplicate route "LPX64" %u.%u.%u.%u\n", - peer->ksnp_nid, HIPQUAD(route->ksnr_ipaddr)); + CERROR ("Duplicate route %s %u.%u.%u.%u\n", + libcfs_id2str(peer->ksnp_id), + HIPQUAD(route->ksnr_ipaddr)); LBUG(); } } route->ksnr_peer = peer; - atomic_inc (&peer->ksnp_refcount); + ksocknal_peer_addref(peer); /* peer's routelist takes over my ref on 'route' */ list_add_tail(&route->ksnr_list, &peer->ksnp_routes); list_for_each(tmp, &peer->ksnp_conns) { conn = list_entry(tmp, ksock_conn_t, ksnc_list); - type = conn->ksnc_type; if (conn->ksnc_ipaddr != route->ksnr_ipaddr) continue; @@ -412,57 +411,59 @@ ksocknal_del_route_locked (ksock_route_t *route) } if (route->ksnr_myipaddr != 0) { - iface = ksocknal_ip2iface(route->ksnr_myipaddr); + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); if (iface != NULL) iface->ksni_nroutes--; } route->ksnr_deleted = 1; list_del (&route->ksnr_list); - ksocknal_put_route (route); /* drop peer's ref */ + ksocknal_route_decref(route); /* drop peer's ref */ if (list_empty (&peer->ksnp_routes) && list_empty (&peer->ksnp_conns)) { - /* I've just removed the last autoconnect route of a peer - * with no active connections */ + /* I've just removed the last route to a peer with no active + * connections */ ksocknal_unlink_peer_locked (peer); } } int -ksocknal_add_peer (ptl_nid_t nid, __u32 ipaddr, int port) +ksocknal_add_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port) { - unsigned long flags; struct list_head *tmp; ksock_peer_t *peer; ksock_peer_t *peer2; ksock_route_t *route; ksock_route_t *route2; + int rc; - if (nid == PTL_NID_ANY) + if (id.nid == LNET_NID_ANY || + id.pid == LNET_PID_ANY) return (-EINVAL); /* Have a brand new peer ready... 
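Note the repeated route->ksnr_retry_interval = 0 above: zero now means "OK to connect at any time", both for a brand-new route and after a successful connection, while failed attempts are expected to back off between the min/max reconnect-interval tunables. A sketch of a clamped doubling scheme; the function and the exact doubling rule are an assumption for illustration, not lifted from the connd code:

    #include <stdio.h>

    /* interval == 0 means "OK to connect at any time" (a fresh or
     * just-successful route); each failure doubles the wait, clamped
     * to the tunable ceiling (hypothetical values shown). */
    static int min_reconnect_interval = 1;      /* seconds */
    static int max_reconnect_interval = 60;     /* seconds */

    static int next_retry_interval(int cur)
    {
            if (cur == 0)
                    return min_reconnect_interval;
            cur *= 2;
            return cur > max_reconnect_interval ? max_reconnect_interval : cur;
    }

    int main(void)
    {
            int iv = 0, i;

            for (i = 0; i < 8; i++) {
                    iv = next_retry_interval(iv);
                    printf("retry #%d after %ds\n", i + 1, iv); /* 1 2 4 ... 60 */
            }
            return 0;
    }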
*/ - peer = ksocknal_create_peer (nid); - if (peer == NULL) - return (-ENOMEM); + rc = ksocknal_create_peer(&peer, ni, id); + if (rc != 0) + return rc; route = ksocknal_create_route (ipaddr, port); if (route == NULL) { - ksocknal_put_peer (peer); + ksocknal_peer_decref(peer); return (-ENOMEM); } - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + write_lock_bh (&ksocknal_data.ksnd_global_lock); - peer2 = ksocknal_find_peer_locked (nid); + peer2 = ksocknal_find_peer_locked (ni, id); if (peer2 != NULL) { - ksocknal_put_peer (peer); + ksocknal_peer_decref(peer); peer = peer2; } else { /* peer table takes my ref on peer */ list_add_tail (&peer->ksnp_list, - ksocknal_nid2peerlist (nid)); + ksocknal_nid2peerlist (id.nid)); } route2 = NULL; @@ -478,17 +479,17 @@ ksocknal_add_peer (ptl_nid_t nid, __u32 ipaddr, int port) ksocknal_add_route_locked(peer, route); route->ksnr_share_count++; } else { - ksocknal_put_route(route); + ksocknal_route_decref(route); route2->ksnr_share_count++; } - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + write_unlock_bh (&ksocknal_data.ksnd_global_lock); return (0); } void -ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip, int single_share) +ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip) { ksock_conn_t *conn; ksock_route_t *route; @@ -499,30 +500,18 @@ ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip, int single_share) LASSERT (!peer->ksnp_closing); /* Extra ref prevents peer disappearing until I'm done with it */ - atomic_inc(&peer->ksnp_refcount); + ksocknal_peer_addref(peer); list_for_each_safe (tmp, nxt, &peer->ksnp_routes) { route = list_entry(tmp, ksock_route_t, ksnr_list); - if (single_share && route->ksnr_share_count == 0) - continue; - /* no match */ if (!(ip == 0 || route->ksnr_ipaddr == ip)) continue; - if (!single_share) - route->ksnr_share_count = 0; - else if (route->ksnr_share_count > 0) - route->ksnr_share_count--; - - if (route->ksnr_share_count == 0) { - /* This deletes associated conns too */ - ksocknal_del_route_locked (route); - } - - if (single_share) - break; + route->ksnr_share_count = 0; + /* This deletes associated conns too */ + ksocknal_del_route_locked (route); } nshared = 0; @@ -550,14 +539,14 @@ ksocknal_del_peer_locked (ksock_peer_t *peer, __u32 ip, int single_share) } } - ksocknal_put_peer(peer); + ksocknal_peer_decref(peer); /* NB peer unlinks itself when last conn/route is removed */ } int -ksocknal_del_peer (ptl_nid_t nid, __u32 ip, int single_share) +ksocknal_del_peer (lnet_ni_t *ni, lnet_process_id_t id, __u32 ip) { - unsigned long flags; + CFS_LIST_HEAD (zombies); struct list_head *ptmp; struct list_head *pnxt; ksock_peer_t *peer; @@ -566,10 +555,10 @@ ksocknal_del_peer (ptl_nid_t nid, __u32 ip, int single_share) int i; int rc = -ENOENT; - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + write_lock_bh (&ksocknal_data.ksnd_global_lock); - if (nid != PTL_NID_ANY) - lo = hi = ksocknal_nid2peerlist(nid) - ksocknal_data.ksnd_peers; + if (id.nid != LNET_NID_ANY) + lo = hi = ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers; else { lo = 0; hi = ksocknal_data.ksnd_peer_hash_size - 1; @@ -579,24 +568,39 @@ ksocknal_del_peer (ptl_nid_t nid, __u32 ip, int single_share) list_for_each_safe (ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) { peer = list_entry (ptmp, ksock_peer_t, ksnp_list); - if (!(nid == PTL_NID_ANY || peer->ksnp_nid == nid)) + if (peer->ksnp_ni != ni) continue; - ksocknal_del_peer_locked (peer, ip, single_share); - rc = 0; /* matched! 
*/ + if (!((id.nid == LNET_NID_ANY || peer->ksnp_id.nid == id.nid) && + (id.pid == LNET_PID_ANY || peer->ksnp_id.pid == id.pid))) + continue; - if (single_share) - break; + ksocknal_peer_addref(peer); /* a ref for me... */ + + ksocknal_del_peer_locked (peer, ip); + + if (peer->ksnp_closing && !list_empty(&peer->ksnp_tx_queue)) { + LASSERT (list_empty(&peer->ksnp_conns)); + LASSERT (list_empty(&peer->ksnp_routes)); + + list_splice_init(&peer->ksnp_tx_queue, &zombies); + } + + ksocknal_peer_decref(peer); /* ...till here */ + + rc = 0; /* matched! */ } } - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + write_unlock_bh (&ksocknal_data.ksnd_global_lock); + + ksocknal_txlist_done(ni, &zombies, 1); return (rc); } ksock_conn_t * -ksocknal_get_conn_by_idx (int index) +ksocknal_get_conn_by_idx (lnet_ni_t *ni, int index) { ksock_peer_t *peer; struct list_head *ptmp; @@ -612,12 +616,15 @@ ksocknal_get_conn_by_idx (int index) LASSERT (!peer->ksnp_closing); + if (peer->ksnp_ni != ni) + continue; + list_for_each (ctmp, &peer->ksnp_conns) { if (index-- > 0) continue; conn = list_entry (ctmp, ksock_conn_t, ksnc_list); - atomic_inc (&conn->ksnc_refcount); + ksocknal_conn_addref(conn); read_unlock (&ksocknal_data.ksnd_global_lock); return (conn); } @@ -663,18 +670,26 @@ ksocknal_choose_scheduler_locked (unsigned int irq) } int -ksocknal_local_ipvec (__u32 *ipaddrs) +ksocknal_local_ipvec (lnet_ni_t *ni, __u32 *ipaddrs) { + ksock_net_t *net = ni->ni_data; int i; int nip; read_lock (&ksocknal_data.ksnd_global_lock); - nip = ksocknal_data.ksnd_ninterfaces; - for (i = 0; i < nip; i++) { - LASSERT (i < SOCKNAL_MAX_INTERFACES); + nip = net->ksnn_ninterfaces; + LASSERT (nip < LNET_MAX_INTERFACES); - ipaddrs[i] = ksocknal_data.ksnd_interfaces[i].ksni_ipaddr; + /* Only offer interfaces for additional connections if I have + * more than one. */ + if (nip < 2) { + read_unlock (&ksocknal_data.ksnd_global_lock); + return 0; + } + + for (i = 0; i < nip; i++) { + ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr; LASSERT (ipaddrs[i] != 0); } @@ -718,7 +733,7 @@ int ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips) { rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - unsigned long flags; + ksock_net_t *net = peer->ksnp_ni->ni_data; ksock_interface_t *iface; ksock_interface_t *best_iface; int n_ips; @@ -739,12 +754,15 @@ ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips) /* Also note that I'm not going to return more than n_peerips * interfaces, even if I have more myself */ - write_lock_irqsave(global_lock, flags); + write_lock_bh (global_lock); - LASSERT (n_peerips <= SOCKNAL_MAX_INTERFACES); - LASSERT (ksocknal_data.ksnd_ninterfaces <= SOCKNAL_MAX_INTERFACES); + LASSERT (n_peerips <= LNET_MAX_INTERFACES); + LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES); - n_ips = MIN(n_peerips, ksocknal_data.ksnd_ninterfaces); + /* Only match interfaces for additional connections + * if I have > 1 interface */ + n_ips = (net->ksnn_ninterfaces < 2) ? 0 : + MIN(n_peerips, net->ksnn_ninterfaces); for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) { /* ^ yes really... */ @@ -758,7 +776,7 @@ ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips) if (i < peer->ksnp_n_passive_ips) { /* Old interface. 
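The test at the top of this hunk, and its twins later in ksocknal_close_matching_conns() and ksocknal_push(), all apply one wildcard rule: LNET_NID_ANY or LNET_PID_ANY in the request matches every peer on that axis. Factored out, with hypothetical NID_ANY/PID_ANY stand-ins, the predicate is just:

    #include <stdio.h>

    typedef unsigned long long nid_t;

    #define NID_ANY ((nid_t)-1)     /* stand-in for LNET_NID_ANY */
    #define PID_ANY ((unsigned)-1)  /* stand-in for LNET_PID_ANY */

    struct pid_nid { nid_t nid; unsigned pid; };

    /* ANY in either field of the query matches everything */
    static int id_matches(struct pid_nid q, struct pid_nid peer)
    {
            return (q.nid == NID_ANY || q.nid == peer.nid) &&
                   (q.pid == PID_ANY || q.pid == peer.pid);
    }

    int main(void)
    {
            struct pid_nid peer = { 42, 12345 };
            struct pid_nid all  = { NID_ANY, PID_ANY };
            struct pid_nid one  = { 42, PID_ANY };

            printf("%d %d\n", id_matches(all, peer), id_matches(one, peer)); /* 1 1 */
            return 0;
    }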
*/ ip = peer->ksnp_passive_ips[i]; - best_iface = ksocknal_ip2iface(ip); + best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip); /* peer passive ips are kept up to date */ LASSERT(best_iface != NULL); @@ -770,8 +788,8 @@ ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips) best_netmatch = 0; best_npeers = 0; - for (j = 0; j < ksocknal_data.ksnd_ninterfaces; j++) { - iface = &ksocknal_data.ksnd_interfaces[j]; + for (j = 0; j < net->ksnn_ninterfaces; j++) { + iface = &net->ksnn_interfaces[j]; ip = iface->ksni_ipaddr; for (k = 0; k < peer->ksnp_n_passive_ips; k++) @@ -812,7 +830,7 @@ ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips) /* Overwrite input peer IP addresses */ memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips)); - write_unlock_irqrestore(global_lock, flags); + write_unlock_bh (global_lock); return (n_ips); } @@ -823,7 +841,8 @@ ksocknal_create_routes(ksock_peer_t *peer, int port, { ksock_route_t *newroute = NULL; rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - unsigned long flags; + lnet_ni_t *ni = peer->ksnp_ni; + ksock_net_t *net = ni->ni_data; struct list_head *rtmp; ksock_route_t *route; ksock_interface_t *iface; @@ -839,21 +858,33 @@ ksocknal_create_routes(ksock_peer_t *peer, int port, * expecting to be dealing with small numbers of interfaces, so the * O(n**3)-ness here shouldn't matter */ - write_lock_irqsave(global_lock, flags); + write_lock_bh (global_lock); - LASSERT (npeer_ipaddrs <= SOCKNAL_MAX_INTERFACES); + if (net->ksnn_ninterfaces < 2) { + /* Only create additional connections + * if I have > 1 interface */ + write_unlock_bh (global_lock); + return; + } + + LASSERT (npeer_ipaddrs <= LNET_MAX_INTERFACES); for (i = 0; i < npeer_ipaddrs; i++) { if (newroute != NULL) { newroute->ksnr_ipaddr = peer_ipaddrs[i]; } else { - write_unlock_irqrestore(global_lock, flags); + write_unlock_bh (global_lock); newroute = ksocknal_create_route(peer_ipaddrs[i], port); if (newroute == NULL) return; - write_lock_irqsave(global_lock, flags); + write_lock_bh (global_lock); + } + + if (peer->ksnp_closing) { + /* peer got closed under me */ + break; } /* Already got a route? */ @@ -873,11 +904,11 @@ ksocknal_create_routes(ksock_peer_t *peer, int port, best_nroutes = 0; best_netmatch = 0; - LASSERT (ksocknal_data.ksnd_ninterfaces <= SOCKNAL_MAX_INTERFACES); + LASSERT (net->ksnn_ninterfaces <= LNET_MAX_INTERFACES); /* Select interface to connect from */ - for (j = 0; j < ksocknal_data.ksnd_ninterfaces; j++) { - iface = &ksocknal_data.ksnd_interfaces[j]; + for (j = 0; j < net->ksnn_ninterfaces; j++) { + iface = &net->ksnn_interfaces[j]; /* Using this interface already? 
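Both ksocknal_select_ips() above and ksocknal_create_routes() below score candidate interfaces by "netmatch", i.e. whether the interface and the peer address share a subnet under ksni_netmask, before tie-breaking on load (ksni_npeers / ksni_nroutes). The subnet test itself is a single XOR; a self-contained version:

    #include <stdio.h>

    typedef unsigned int u32;

    /* 1 if addresses a and b are on the same subnet under 'mask' */
    static int same_net(u32 a, u32 b, u32 mask)
    {
            return ((a ^ b) & mask) == 0;
    }

    int main(void)
    {
            u32 mask = 0xffffff00;          /* 255.255.255.0 */
            u32 ifip = 0xc0a80105;          /* 192.168.1.5 */
            u32 peer = 0xc0a80142;          /* 192.168.1.66 */

            printf("netmatch: %d\n", same_net(ifip, peer, mask)); /* 1 */
            return 0;
    }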
*/ list_for_each(rtmp, &peer->ksnp_routes) { @@ -916,143 +947,266 @@ ksocknal_create_routes(ksock_peer_t *peer, int port, newroute = NULL; } - write_unlock_irqrestore(global_lock, flags); + write_unlock_bh (global_lock); if (newroute != NULL) - ksocknal_put_route(newroute); + ksocknal_route_decref(newroute); } int -ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) +ksocknal_accept (lnet_ni_t *ni, cfs_socket_t *sock) +{ + ksock_connreq_t *cr; + int rc; + __u32 peer_ip; + int peer_port; + + rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); + LASSERT (rc == 0); /* we succeeded before */ + + LIBCFS_ALLOC(cr, sizeof(*cr)); + if (cr == NULL) { + LCONSOLE_ERROR("Dropping connection request from " + "%u.%u.%u.%u: memory exhausted\n", + HIPQUAD(peer_ip)); + return -ENOMEM; + } + + lnet_ni_addref(ni); + cr->ksncr_ni = ni; + cr->ksncr_sock = sock; + + spin_lock_bh (&ksocknal_data.ksnd_connd_lock); + + list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs); + cfs_waitq_signal(&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh (&ksocknal_data.ksnd_connd_lock); + return 0; +} + +int +ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route, + cfs_socket_t *sock, int type) { - int passive = (type == SOCKNAL_CONN_NONE); rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - __u32 ipaddrs[SOCKNAL_MAX_INTERFACES]; - int nipaddrs; - ptl_nid_t nid; + CFS_LIST_HEAD (zombies); + lnet_process_id_t peerid; struct list_head *tmp; __u64 incarnation; - unsigned long flags; ksock_conn_t *conn; ksock_conn_t *conn2; ksock_peer_t *peer = NULL; ksock_peer_t *peer2; ksock_sched_t *sched; + ksock_hello_msg_t *hello; unsigned int irq; ksock_tx_t *tx; int rc; + int active; + char *warn = NULL; - /* NB, sock has an associated file since (a) this connection might - * have been created in userland and (b) we need to refcount the - * socket so that we don't close it while I/O is being done on - * it, and sock->file has that pre-cooked... 
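The new ksocknal_accept() above is the producer half of a classic handoff: the listener merely wraps the accepted socket in a ksock_connreq_t, queues it under ksnd_connd_lock, and signals ksnd_connd_waitq so a connection daemon performs the blocking handshake elsewhere. The same shape in portable C (compile with -pthread; this toy list is LIFO where the real queue stays FIFO):

    #include <pthread.h>
    #include <stdlib.h>

    struct connreq {
            struct connreq *next;
            int             sock_fd;   /* stands in for the accepted socket */
    };

    static pthread_mutex_t connd_lock  = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  connd_waitq = PTHREAD_COND_INITIALIZER;
    static struct connreq *connreqs;   /* shared work list */

    /* accept path: package the socket and wake a daemon; never
     * blocks on the handshake itself */
    int queue_connreq(int sock_fd)
    {
            struct connreq *cr = malloc(sizeof(*cr));
            if (cr == NULL)
                    return -1;         /* caller drops the connection */

            cr->sock_fd = sock_fd;

            pthread_mutex_lock(&connd_lock);
            cr->next = connreqs;
            connreqs = cr;
            pthread_cond_signal(&connd_waitq);
            pthread_mutex_unlock(&connd_lock);
            return 0;
    }

    /* connection daemon: wait for work, do the slow part unlocked */
    struct connreq *dequeue_connreq(void)
    {
            struct connreq *cr;

            pthread_mutex_lock(&connd_lock);
            while (connreqs == NULL)
                    pthread_cond_wait(&connd_waitq, &connd_lock);
            cr = connreqs;
            connreqs = cr->next;
            pthread_mutex_unlock(&connd_lock);
            return cr;
    }

    int main(void)
    {
            struct connreq *cr;

            queue_connreq(3);          /* listener side */
            cr = dequeue_connreq();    /* daemon side */
            free(cr);
            return 0;
    }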
*/ - LASSERT (KSN_SOCK2FILE(sock) != NULL); - LASSERT (cfs_file_count(KSN_SOCK2FILE(sock)) > 0); - LASSERT (route == NULL || !passive); + active = (route != NULL); - rc = ksocknal_lib_setup_sock (sock); - if (rc != 0) - return (rc); + LASSERT (active == (type != SOCKLND_CONN_NONE)); + LASSERT (route == NULL || route->ksnr_proto != NULL); irq = ksocknal_lib_sock_irq (sock); - PORTAL_ALLOC(conn, sizeof(*conn)); - if (conn == NULL) - return (-ENOMEM); + LIBCFS_ALLOC(conn, sizeof(*conn)); + if (conn == NULL) { + rc = -ENOMEM; + goto failed_0; + } memset (conn, 0, sizeof (*conn)); conn->ksnc_peer = NULL; conn->ksnc_route = NULL; conn->ksnc_sock = sock; + atomic_set (&conn->ksnc_sock_refcount, 1); /* 1 ref for conn */ conn->ksnc_type = type; ksocknal_lib_save_callback(sock, conn); - atomic_set (&conn->ksnc_refcount, 1); /* 1 ref for me */ + atomic_set (&conn->ksnc_conn_refcount, 1); /* 1 ref for me */ + + conn->ksnc_zc_capable = ksocknal_lib_zc_capable(sock); conn->ksnc_rx_ready = 0; conn->ksnc_rx_scheduled = 0; - ksocknal_new_packet (conn, 0); CFS_INIT_LIST_HEAD (&conn->ksnc_tx_queue); conn->ksnc_tx_ready = 0; conn->ksnc_tx_scheduled = 0; + conn->ksnc_tx_mono = NULL; atomic_set (&conn->ksnc_tx_nob, 0); + LIBCFS_ALLOC(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); + if (hello == NULL) { + rc = -ENOMEM; + goto failed_1; + } + /* stash conn's local and remote addrs */ rc = ksocknal_lib_get_conn_addrs (conn); if (rc != 0) - goto failed_0; + goto failed_1; + + /* Find out/confirm peer's NID and connection type and get the + * vector of interfaces she's willing to let me connect to. + * Passive connections use the listener timeout since the peer sends + * eagerly */ + + if (active) { + LASSERT(ni == route->ksnr_peer->ksnp_ni); - if (!passive) { /* Active connection sends HELLO eagerly */ - rc = ksocknal_local_ipvec(ipaddrs); - if (rc < 0) - goto failed_0; - nipaddrs = rc; + hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips); + peerid = route->ksnr_peer->ksnp_id; + conn->ksnc_proto = route->ksnr_proto; - rc = ksocknal_send_hello (conn, ipaddrs, nipaddrs); + rc = ksocknal_send_hello (ni, conn, peerid.nid, hello); if (rc != 0) - goto failed_0; + goto failed_1; + } else { + peerid.nid = LNET_NID_ANY; + peerid.pid = LNET_PID_ANY; + + /* Passive, get protocol from peer */ + conn->ksnc_proto = NULL; } - /* Find out/confirm peer's NID and connection type and get the - * vector of interfaces she's willing to let me connect to */ - nid = (route == NULL) ? 
PTL_NID_ANY : route->ksnr_peer->ksnp_nid; - rc = ksocknal_recv_hello (conn, &nid, &incarnation, ipaddrs); - if (rc < 0) - goto failed_0; - nipaddrs = rc; - LASSERT (nid != PTL_NID_ANY); + rc = ksocknal_recv_hello (ni, conn, hello, &peerid, &incarnation); + if (rc < 0) { + if (rc == -EALREADY) { + /* only active connection loses conn race */ + LASSERT (active); - if (route != NULL) { + CDEBUG(D_NET, "Lost connection race with %s\n", + libcfs_id2str(peerid)); + /* Not an actual failure: return +ve RC so active + * connector can back off */ + rc = EALREADY; + } + goto failed_1; + } + + if (active && route->ksnr_proto != conn->ksnc_proto) { + /* Active connecting, and different protocol is returned */ + CDEBUG(D_NET, "Connecting by %d.x protocol is rejected," + " compatible version %d.x found.\n", + route->ksnr_proto->pro_version, + conn->ksnc_proto->pro_version); + /* Not an actual failure: return +ve RC so active + * connector can back off */ + rc = EPROTO; + + /* Retry with peer's protocol later */ + route->ksnr_proto = conn->ksnc_proto; + + goto failed_1; + } + + LASSERT (peerid.nid != LNET_NID_ANY); + + if (active) { peer = route->ksnr_peer; - atomic_inc(&peer->ksnp_refcount); + ksocknal_peer_addref(peer); + + /* additional routes after interface exchange? */ + ksocknal_create_routes(peer, conn->ksnc_port, + hello->kshm_ips, hello->kshm_nips); + + /* setup the socket AFTER I've received hello (it disables + * SO_LINGER). I might call back to the acceptor who may want + * to send a protocol version response and then close the + * socket; this ensures the socket only tears down after the + * response has been sent. */ + rc = ksocknal_lib_setup_sock(sock); + + write_lock_bh (global_lock); + + if (rc != 0) + goto failed_2; } else { - peer = ksocknal_create_peer(nid); - if (peer == NULL) { - rc = -ENOMEM; - goto failed_0; - } + rc = ksocknal_create_peer(&peer, ni, peerid); + if (rc != 0) + goto failed_1; - write_lock_irqsave(global_lock, flags); + write_lock_bh (global_lock); - peer2 = ksocknal_find_peer_locked(nid); + peer2 = ksocknal_find_peer_locked(ni, peerid); if (peer2 == NULL) { /* NB this puts an "empty" peer in the peer * table (which takes my ref) */ list_add_tail(&peer->ksnp_list, - ksocknal_nid2peerlist(nid)); - } else { - ksocknal_put_peer(peer); + ksocknal_nid2peerlist(peerid.nid)); + } else { + ksocknal_peer_decref(peer); peer = peer2; } + /* +1 ref for me */ - atomic_inc(&peer->ksnp_refcount); + ksocknal_peer_addref(peer); + peer->ksnp_accepting++; + + /* Am I already connecting to this guy? Resolve in + * favour of higher NID... 
*/ + rc = 0; + if (peerid.nid < ni->ni_nid) { + list_for_each(tmp, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, + ksnr_list); - write_unlock_irqrestore(global_lock, flags); - } + if (route->ksnr_ipaddr != conn->ksnc_ipaddr) + continue; + + if (route->ksnr_connecting) { + rc = EALREADY; /* not a failure */ + warn = "connection race"; + } - if (!passive) { - ksocknal_create_routes(peer, conn->ksnc_port, - ipaddrs, nipaddrs); - rc = 0; - } else { - rc = ksocknal_select_ips(peer, ipaddrs, nipaddrs); - LASSERT (rc >= 0); - rc = ksocknal_send_hello (conn, ipaddrs, rc); - } - if (rc < 0) - goto failed_1; + break; + } + } + route = NULL; + + write_unlock_bh (global_lock); - write_lock_irqsave (global_lock, flags); + if (rc != 0) { + /* set CONN_NONE makes returned HELLO acknowledge I + * lost a connection race */ + conn->ksnc_type = SOCKLND_CONN_NONE; + hello->kshm_nips = 0; + ksocknal_send_hello(ni, conn, peerid.nid, hello); + } else { + hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips, + hello->kshm_nips); + rc = ksocknal_send_hello(ni, conn, peerid.nid, hello); + + /* Setup the socket (it disables SO_LINGER). I don't + * do it if I'm sending a negative response to ensure + * the response isn't discarded when I close the socket + * immediately after sending it. */ + if (rc == 0) + rc = ksocknal_lib_setup_sock(sock); + } + + write_lock_bh (global_lock); + peer->ksnp_accepting--; + + if (rc != 0) + goto failed_2; + } if (peer->ksnp_closing || - (route != NULL && route->ksnr_deleted)) { - /* route/peer got closed under me */ + (active && route->ksnr_deleted)) { + /* peer/route got closed under me */ rc = -ESTALE; + warn = "peer/route removed"; goto failed_2; } - /* Refuse to duplicate an existing connection (both sides might - * autoconnect at once), unless this is a loopback connection */ + /* Refuse to duplicate an existing connection, unless this is a + * loopback connection */ if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) { list_for_each(tmp, &peer->ksnp_conns) { conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); @@ -1063,10 +1217,8 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) conn2->ksnc_incarnation != incarnation) continue; - CWARN("Not creating duplicate connection to " - "%u.%u.%u.%u type %d\n", - HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_type); - rc = -EALREADY; + rc = 0; /* more of a NOOP than a failure */ + warn = "duplicate"; goto failed_2; } } @@ -1074,10 +1226,10 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) /* If the connection created by this route didn't bind to the IP * address the route connected to, the connection/route matching * code below probably isn't going to work. 
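The peerid.nid < ni->ni_nid test above encodes the race rule from the comment: when two peers dial each other at once, the attempt initiated by the higher NID survives, and the losing side is answered with a CONN_NONE hello and a positive EALREADY. Reduced to its decision function:

    #include <stdio.h>

    typedef unsigned long long nid_t;

    /* Both nodes evaluate the same rule, so they agree on the winner:
     * on the passive side, an incoming connect from a lower-NID peer
     * loses if we are already dialling that peer ourselves. */
    static int reject_incoming(nid_t my_nid, nid_t peer_nid, int dialling)
    {
            return dialling && peer_nid < my_nid;
    }

    int main(void)
    {
            printf("%d\n", reject_incoming(7, 3, 1)); /* 1: lower NID's dial loses */
            printf("%d\n", reject_incoming(3, 7, 1)); /* 0: higher NID's dial wins */
            return 0;
    }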
*/ - if (route != NULL && + if (active && route->ksnr_ipaddr != conn->ksnc_ipaddr) { - CERROR("Route "LPX64" %u.%u.%u.%u connected to %u.%u.%u.%u\n", - peer->ksnp_nid, + CERROR("Route %s %u.%u.%u.%u connected to %u.%u.%u.%u\n", + libcfs_id2str(peer->ksnp_id), HIPQUAD(route->ksnr_ipaddr), HIPQUAD(conn->ksnc_ipaddr)); } @@ -1096,9 +1248,6 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) break; } - /* Give conn a ref on sock->file since we're going to return success */ - cfs_get_file(KSN_SOCK2FILE(sock)); - conn->ksnc_peer = peer; /* conn takes my ref on peer */ conn->ksnc_incarnation = incarnation; peer->ksnp_last_alive = cfs_time_current(); @@ -1110,11 +1259,13 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) /* Set the deadline for the outgoing HELLO to drain */ conn->ksnc_tx_bufnob = SOCK_WMEM_QUEUED(sock); - conn->ksnc_tx_deadline = cfs_time_shift(ksocknal_tunables.ksnd_io_timeout); + conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout); mb(); /* order with adding to peer's conn list */ list_add (&conn->ksnc_list, &peer->ksnp_conns); - atomic_inc (&conn->ksnc_refcount); + ksocknal_conn_addref(conn); + + ksocknal_new_packet(conn, 0); /* NB my callbacks block while I hold ksnd_global_lock */ ksocknal_lib_set_callback(sock, conn); @@ -1131,46 +1282,67 @@ ksocknal_create_conn (ksock_route_t *route, struct socket *sock, int type) } rc = ksocknal_close_stale_conns_locked(peer, incarnation); + write_unlock_bh (global_lock); + if (rc != 0) - CDEBUG(D_HA, - "Closed %d stale conns to nid "LPX64" ip %d.%d.%d.%d\n", - rc, conn->ksnc_peer->ksnp_nid, + CDEBUG(D_NET, "Closed %d stale conns to %s ip %d.%d.%d.%d\n", + rc, libcfs_id2str(conn->ksnc_peer->ksnp_id), HIPQUAD(conn->ksnc_ipaddr)); - write_unlock_irqrestore (global_lock, flags); - ksocknal_lib_bind_irq (irq); /* Call the callbacks right now to get things going. 
*/ - if (ksocknal_getconnsock(conn) == 0) { - ksocknal_lib_act_callback(sock, conn); - ksocknal_putconnsock(conn); + if (ksocknal_connsock_addref(conn) == 0) { + ksocknal_read_callback(conn); + ksocknal_write_callback(conn); + ksocknal_connsock_decref(conn); } - CDEBUG(D_HA, "New conn nid:"LPX64" %u.%u.%u.%u -> %u.%u.%u.%u/%d " - "incarnation:"LPX64" sched[%d]/%d\n", - nid, HIPQUAD(conn->ksnc_myipaddr), + CDEBUG(D_NET, "New conn %s %u.%u.%u.%u -> %u.%u.%u.%u/%d" + " incarnation:"LPD64" sched[%d]/%d\n", + libcfs_id2str(peerid), HIPQUAD(conn->ksnc_myipaddr), HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, incarnation, - (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers),irq); + (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq); + + LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); - ksocknal_put_conn (conn); + ksocknal_conn_decref(conn); return (0); failed_2: if (!peer->ksnp_closing && list_empty (&peer->ksnp_conns) && - list_empty (&peer->ksnp_routes)) + list_empty (&peer->ksnp_routes)) { + list_add(&zombies, &peer->ksnp_tx_queue); + list_del_init(&peer->ksnp_tx_queue); ksocknal_unlink_peer_locked(peer); - write_unlock_irqrestore(global_lock, flags); + } + + write_unlock_bh (global_lock); + + if (warn != NULL) { + if (rc < 0) + CERROR("Not creating conn %s type %d: %s\n", + libcfs_id2str(peerid), conn->ksnc_type, warn); + else + CDEBUG(D_NET, "Not creating conn %s type %d: %s\n", + libcfs_id2str(peerid), conn->ksnc_type, warn); + } + + ksocknal_txlist_done(ni, &zombies, 1); + ksocknal_peer_decref(peer); failed_1: - ksocknal_put_peer (peer); + if (hello != NULL) + LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); - failed_0: - PORTAL_FREE (conn, sizeof(*conn)); + LIBCFS_FREE (conn, sizeof(*conn)); - LASSERT (rc != 0); - return (rc); + failed_0: + libcfs_sock_release(sock); + return rc; } void @@ -1187,7 +1359,6 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error) LASSERT (peer->ksnp_error == 0); LASSERT (!conn->ksnc_closing); conn->ksnc_closing = 1; - atomic_inc (&ksocknal_data.ksnd_nclosing_conns); /* ksnd_deathrow_conns takes over peer's ref */ list_del (&conn->ksnc_list); @@ -1217,7 +1388,7 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error) list_del (&route->ksnr_list); /* make route least favourite */ list_add_tail (&route->ksnr_list, &peer->ksnp_routes); #endif - ksocknal_put_route (route); /* drop conn's ref on route */ + ksocknal_route_decref(route); /* drop conn's ref on route */ } if (list_empty (&peer->ksnp_conns)) { @@ -1227,17 +1398,46 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error) if (list_empty (&peer->ksnp_routes)) { /* I've just closed last conn belonging to a - * non-autoconnecting peer */ + * peer with no routes to it */ ksocknal_unlink_peer_locked (peer); } } - spin_lock (&ksocknal_data.ksnd_reaper_lock); + spin_lock_bh (&ksocknal_data.ksnd_reaper_lock); list_add_tail (&conn->ksnc_list, &ksocknal_data.ksnd_deathrow_conns); cfs_waitq_signal (&ksocknal_data.ksnd_reaper_waitq); - spin_unlock (&ksocknal_data.ksnd_reaper_lock); + spin_unlock_bh (&ksocknal_data.ksnd_reaper_lock); +} + +void +ksocknal_peer_failed (ksock_peer_t *peer) +{ + time_t last_alive = 0; + int notify = 0; + + /* There has been a connection failure or comms error; but I'll only + * tell LNET I think the peer is dead if it's to another kernel and + * there are no connections or connection attempts in existence. 
*/ + + read_lock (&ksocknal_data.ksnd_global_lock); + + if ((peer->ksnp_id.pid & LNET_PID_USERFLAG) == 0 && + list_empty(&peer->ksnp_conns) && + peer->ksnp_accepting == 0 && + ksocknal_find_connecting_route_locked(peer) == NULL) { + notify = 1; + last_alive = cfs_time_current_sec() - + cfs_duration_sec(cfs_time_current() - + peer->ksnp_last_alive); + } + + read_unlock (&ksocknal_data.ksnd_global_lock); + + if (notify) + lnet_notify (peer->ksnp_ni, peer->ksnp_id.nid, 0, + last_alive); } void @@ -1247,17 +1447,14 @@ ksocknal_terminate_conn (ksock_conn_t *conn) * disengage the socket from its callbacks and close it. * ksnc_refcount will eventually hit zero, and then the reaper will * destroy it. */ - unsigned long flags; ksock_peer_t *peer = conn->ksnc_peer; ksock_sched_t *sched = conn->ksnc_scheduler; - struct timeval now; - time_t then = 0; - int notify = 0; + int failed = 0; LASSERT(conn->ksnc_closing); /* wake up the scheduler to "send" all remaining packets to /dev/null */ - spin_lock_irqsave(&sched->kss_lock, flags); + spin_lock_bh (&sched->kss_lock); if (!conn->ksnc_tx_scheduled && !list_empty(&conn->ksnc_tx_queue)){ @@ -1267,15 +1464,43 @@ ksocknal_terminate_conn (ksock_conn_t *conn) conn->ksnc_tx_ready = 1; conn->ksnc_tx_scheduled = 1; /* extra ref for scheduler */ - atomic_inc (&conn->ksnc_refcount); + ksocknal_conn_addref(conn); cfs_waitq_signal (&sched->kss_waitq); } - spin_unlock_irqrestore (&sched->kss_lock, flags); + spin_unlock_bh (&sched->kss_lock); + + spin_lock(&peer->ksnp_lock); + if (!list_empty(&peer->ksnp_zc_req_list)) { + struct list_head *tmp; + struct list_head *nxt; + ksock_tx_t *tx; + LIST_HEAD (zlist); + + list_for_each_safe(tmp, nxt, &peer->ksnp_zc_req_list) { + tx = list_entry(tmp, ksock_tx_t, tx_zc_list); + + if (tx->tx_conn != conn) + continue; + list_del(&tx->tx_zc_list); + /* tell scheduler it's deleted */ + tx->tx_msg.ksm_zc_req_cookie = 0; + list_add(&tx->tx_zc_list, &zlist); + } + spin_unlock(&peer->ksnp_lock); + + list_for_each_safe(tmp, nxt, &zlist) { + tx = list_entry(tmp, ksock_tx_t, tx_zc_list); + list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } + } else { + spin_unlock(&peer->ksnp_lock); + } /* serialise with callbacks */ - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + write_lock_bh (&ksocknal_data.ksnd_global_lock); ksocknal_lib_reset_callback(conn->ksnc_sock, conn); @@ -1286,26 +1511,35 @@ ksocknal_terminate_conn (ksock_conn_t *conn) if (peer->ksnp_error != 0) { /* peer's last conn closed in error */ LASSERT (list_empty (&peer->ksnp_conns)); - - /* convert peer's last-known-alive timestamp from jiffies */ - do_gettimeofday (&now); - then = now.tv_sec - cfs_duration_sec(cfs_time_sub(cfs_time_current(), - peer->ksnp_last_alive)); - notify = 1; + failed = 1; + peer->ksnp_error = 0; /* avoid multiple notifications */ } - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + write_unlock_bh (&ksocknal_data.ksnd_global_lock); + + if (failed) + ksocknal_peer_failed(peer); /* The socket is closed on the final put; either here, or in * ksocknal_{send,recv}msg(). Since we set up the linger2 option * when the connection was established, this will close the socket * immediately, aborting anything buffered in it. Any hung * zero-copy transmits will therefore complete in finite time. 
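The zero-copy cleanup above follows the splice-then-free idiom: every tx belonging to the dying conn is unlinked from ksnp_zc_req_list onto a private zlist while ksnp_lock is held, and only freed after the lock is dropped, so the decref never runs inside the spinlock. The same pattern in user-space C with a singly linked list:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct tx {
            struct tx *next;
            int        conn_id;     /* which conn this tx belongs to */
    };

    static pthread_mutex_t peer_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct tx *zc_req_list;  /* shared, protected by peer_lock */

    /* Move this conn's entries to a private list under the lock,
     * then free them with the lock dropped. */
    void drain_conn(int conn_id)
    {
            struct tx **pp, *tx, *zlist = NULL;

            pthread_mutex_lock(&peer_lock);
            pp = &zc_req_list;
            while ((tx = *pp) != NULL) {
                    if (tx->conn_id == conn_id) {
                            *pp = tx->next;    /* unlink from shared list */
                            tx->next = zlist;  /* park on private list */
                            zlist = tx;
                    } else {
                            pp = &tx->next;
                    }
            }
            pthread_mutex_unlock(&peer_lock);

            while ((tx = zlist) != NULL) {     /* lock no longer held */
                    zlist = tx->next;
                    free(tx);
            }
    }

    int main(void)
    {
            struct tx *t1 = malloc(sizeof(*t1)), *t2 = malloc(sizeof(*t2));

            t1->conn_id = 1; t1->next = NULL;
            t2->conn_id = 2; t2->next = t1;
            zc_req_list = t2;

            drain_conn(2);                     /* frees t2, leaves t1 queued */
            printf("head conn_id: %d\n", zc_req_list->conn_id); /* 1 */
            return 0;
    }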
*/ - ksocknal_putconnsock (conn); + ksocknal_connsock_decref(conn); +} - if (notify) - kpr_notify (&ksocknal_data.ksnd_router, peer->ksnp_nid, - 0, then); +void +ksocknal_queue_zombie_conn (ksock_conn_t *conn) +{ + /* Queue the conn for the reaper to destroy */ + + LASSERT (atomic_read(&conn->ksnc_conn_refcount) == 0); + spin_lock_bh (&ksocknal_data.ksnd_reaper_lock); + + list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); + cfs_waitq_signal(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh (&ksocknal_data.ksnd_reaper_lock); } void @@ -1314,7 +1548,9 @@ ksocknal_destroy_conn (ksock_conn_t *conn) /* Final coup-de-grace of the reaper */ CDEBUG (D_NET, "connection %p\n", conn); - LASSERT (atomic_read (&conn->ksnc_refcount) == 0); + LASSERT (atomic_read (&conn->ksnc_conn_refcount) == 0); + LASSERT (atomic_read (&conn->ksnc_sock_refcount) == 0); + LASSERT (conn->ksnc_sock == NULL); LASSERT (conn->ksnc_route == NULL); LASSERT (!conn->ksnc_tx_scheduled); LASSERT (!conn->ksnc_rx_scheduled); @@ -1322,49 +1558,45 @@ ksocknal_destroy_conn (ksock_conn_t *conn) /* complete current receive if any */ switch (conn->ksnc_rx_state) { - case SOCKNAL_RX_BODY: - CERROR("Completing partial receive from "LPX64 + case SOCKNAL_RX_LNET_PAYLOAD: + CERROR("Completing partial receive from %s" ", ip %d.%d.%d.%d:%d, with error\n", - conn->ksnc_peer->ksnp_nid, + libcfs_id2str(conn->ksnc_peer->ksnp_id), HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); - lib_finalize (&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_FAIL); + lnet_finalize (conn->ksnc_peer->ksnp_ni, + conn->ksnc_cookie, -EIO); break; - case SOCKNAL_RX_BODY_FWD: - ksocknal_fmb_callback (conn->ksnc_cookie, -ECONNABORTED); + case SOCKNAL_RX_LNET_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of lnet header from %s" + ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, + conn->ksnc_proto->pro_version); break; - case SOCKNAL_RX_HEADER: - case SOCKNAL_RX_SLOP: + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of ksock message from %s" + ", ip %d.%d.%d.%d:%d, with error, protocol: %d.x.\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port, + conn->ksnc_proto->pro_version); break; + case SOCKNAL_RX_SLOP: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of slops from %s" + ", ip %d.%d.%d.%d:%d, with error\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); + break; default: LBUG (); break; } - ksocknal_put_peer (conn->ksnc_peer); + ksocknal_peer_decref(conn->ksnc_peer); - PORTAL_FREE (conn, sizeof (*conn)); - atomic_dec (&ksocknal_data.ksnd_nclosing_conns); -} - -void -ksocknal_put_conn (ksock_conn_t *conn) -{ - unsigned long flags; - - CDEBUG (D_OTHER, "putting conn[%p] -> "LPX64" (%d)\n", - conn, conn->ksnc_peer->ksnp_nid, - atomic_read (&conn->ksnc_refcount)); - - LASSERT (atomic_read (&conn->ksnc_refcount) > 0); - if (!atomic_dec_and_test (&conn->ksnc_refcount)) - return; - - spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); - - list_add (&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); - cfs_waitq_signal (&ksocknal_data.ksnd_reaper_waitq); - - spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); + LIBCFS_FREE (conn, sizeof (*conn)); } int @@ -1402,10 +1634,11 @@ ksocknal_close_stale_conns_locked (ksock_peer_t *peer, __u64 incarnation) if (conn->ksnc_incarnation == incarnation) continue; - 
CWARN("Closing stale conn nid:"LPX64" ip:%08x/%d " - "incarnation:"LPX64"("LPX64")\n", - peer->ksnp_nid, conn->ksnc_ipaddr, conn->ksnc_port, - conn->ksnc_incarnation, incarnation); + CDEBUG(D_NET, "Closing stale conn %s ip:%08x/%d " + "incarnation:"LPD64"("LPD64")\n", + libcfs_id2str(peer->ksnp_id), + conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_incarnation, incarnation); count++; ksocknal_close_conn_locked (conn, -ESTALE); @@ -1419,22 +1652,20 @@ ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why) { ksock_peer_t *peer = conn->ksnc_peer; __u32 ipaddr = conn->ksnc_ipaddr; - unsigned long flags; int count; - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + write_lock_bh (&ksocknal_data.ksnd_global_lock); count = ksocknal_close_peer_conns_locked (peer, ipaddr, why); - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + write_unlock_bh (&ksocknal_data.ksnd_global_lock); return (count); } int -ksocknal_close_matching_conns (ptl_nid_t nid, __u32 ipaddr) +ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr) { - unsigned long flags; ksock_peer_t *peer; struct list_head *ptmp; struct list_head *pnxt; @@ -1443,10 +1674,10 @@ ksocknal_close_matching_conns (ptl_nid_t nid, __u32 ipaddr) int i; int count = 0; - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + write_lock_bh (&ksocknal_data.ksnd_global_lock); - if (nid != PTL_NID_ANY) - lo = hi = ksocknal_nid2peerlist(nid) - ksocknal_data.ksnd_peers; + if (id.nid != LNET_NID_ANY) + lo = hi = ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers; else { lo = 0; hi = ksocknal_data.ksnd_peer_hash_size - 1; @@ -1457,33 +1688,36 @@ ksocknal_close_matching_conns (ptl_nid_t nid, __u32 ipaddr) peer = list_entry (ptmp, ksock_peer_t, ksnp_list); - if (!(nid == PTL_NID_ANY || nid == peer->ksnp_nid)) + if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) && + (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid))) continue; count += ksocknal_close_peer_conns_locked (peer, ipaddr, 0); } } - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + write_unlock_bh (&ksocknal_data.ksnd_global_lock); /* wildcards always succeed */ - if (nid == PTL_NID_ANY || ipaddr == 0) + if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0) return (0); return (count == 0 ? -ENOENT : 0); } void -ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive) +ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive) { /* The router is telling me she's been notified of a change in * gateway state.... */ + lnet_process_id_t id = {.nid = gw_nid, .pid = LNET_PID_ANY}; - CDEBUG (D_NET, "gw "LPX64" %s\n", gw_nid, alive ? "up" : "down"); + CDEBUG (D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid), + alive ? "up" : "down"); if (!alive) { /* If the gateway crashed, close all open connections... 
*/ - ksocknal_close_matching_conns (gw_nid, 0); + ksocknal_close_matching_conns (id, 0); return; } @@ -1508,7 +1742,7 @@ ksocknal_push_peer (ksock_peer_t *peer) list_for_each (tmp, &peer->ksnp_conns) { if (i++ == index) { conn = list_entry (tmp, ksock_conn_t, ksnc_list); - atomic_inc (&conn->ksnc_refcount); + ksocknal_conn_addref(conn); break; } } @@ -1519,12 +1753,12 @@ ksocknal_push_peer (ksock_peer_t *peer) break; ksocknal_lib_push_conn (conn); - ksocknal_put_conn (conn); + ksocknal_conn_decref(conn); } } int -ksocknal_push (ptl_nid_t nid) +ksocknal_push (lnet_ni_t *ni, lnet_process_id_t id) { ksock_peer_t *peer; struct list_head *tmp; @@ -1533,17 +1767,6 @@ ksocknal_push (ptl_nid_t nid) int j; int rc = -ENOENT; - if (nid != PTL_NID_ANY) { - peer = ksocknal_get_peer (nid); - - if (peer != NULL) { - rc = 0; - ksocknal_push_peer (peer); - ksocknal_put_peer (peer); - } - return (rc); - } - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { for (j = 0; ; j++) { read_lock (&ksocknal_data.ksnd_global_lock); @@ -1552,10 +1775,19 @@ ksocknal_push (ptl_nid_t nid) peer = NULL; list_for_each (tmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(tmp, ksock_peer_t, + ksnp_list); + + if (!((id.nid == LNET_NID_ANY || + id.nid == peer->ksnp_id.nid) && + (id.pid == LNET_PID_ANY || + id.pid == peer->ksnp_id.pid))) { + peer = NULL; + continue; + } + if (index++ == j) { - peer = list_entry(tmp, ksock_peer_t, - ksnp_list); - atomic_inc (&peer->ksnp_refcount); + ksocknal_peer_addref(peer); break; } } @@ -1565,7 +1797,7 @@ ksocknal_push (ptl_nid_t nid) if (peer != NULL) { rc = 0; ksocknal_push_peer (peer); - ksocknal_put_peer (peer); + ksocknal_peer_decref(peer); } } @@ -1575,9 +1807,9 @@ ksocknal_push (ptl_nid_t nid) } int -ksocknal_add_interface(__u32 ipaddress, __u32 netmask) +ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask) { - unsigned long flags; + ksock_net_t *net = ni->ni_data; ksock_interface_t *iface; int rc; int i; @@ -1591,16 +1823,16 @@ ksocknal_add_interface(__u32 ipaddress, __u32 netmask) netmask == 0) return (-EINVAL); - write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags); + write_lock_bh (&ksocknal_data.ksnd_global_lock); - iface = ksocknal_ip2iface(ipaddress); + iface = ksocknal_ip2iface(ni, ipaddress); if (iface != NULL) { /* silently ignore dups */ rc = 0; - } else if (ksocknal_data.ksnd_ninterfaces == SOCKNAL_MAX_INTERFACES) { + } else if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) { rc = -ENOSPC; } else { - iface = &ksocknal_data.ksnd_interfaces[ksocknal_data.ksnd_ninterfaces++]; + iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++]; iface->ksni_ipaddr = ipaddress; iface->ksni_netmask = netmask; @@ -1628,7 +1860,7 @@ ksocknal_add_interface(__u32 ipaddress, __u32 netmask) /* NB only new connections will pay attention to the new interface! 
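ksocknal_del_interface() above deletes a slot from the fixed ksnn_interfaces[] array by sliding the tail down one position, the usual order-preserving removal for a small bounded vector like this LNET_MAX_INTERFACES array. In isolation:

    #include <stdio.h>

    /* Remove ifs[i] from an in-place array of n entries by shifting
     * the tail down, preserving order; returns the new count. */
    static int remove_at(unsigned int ifs[], int n, int i)
    {
            int j;

            for (j = i + 1; j < n; j++)
                    ifs[j - 1] = ifs[j];
            return n - 1;
    }

    int main(void)
    {
            unsigned int ifs[] = { 10, 20, 30, 40 };
            int n = 4, i;

            n = remove_at(ifs, n, 1);       /* drop the second entry */
            for (i = 0; i < n; i++)
                    printf("%u ", ifs[i]);  /* 10 30 40 */
            printf("\n");
            return 0;
    }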
*/ } - write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags); + write_unlock_bh (&ksocknal_data.ksnd_global_lock); return (rc); } @@ -1675,10 +1907,10 @@ ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr) } int -ksocknal_del_interface(__u32 ipaddress) +ksocknal_del_interface(lnet_ni_t *ni, __u32 ipaddress) { + ksock_net_t *net = ni->ni_data; int rc = -ENOENT; - unsigned long flags; struct list_head *tmp; struct list_head *nxt; ksock_peer_t *peer; @@ -1686,10 +1918,10 @@ ksocknal_del_interface(__u32 ipaddress) int i; int j; - write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags); + write_lock_bh (&ksocknal_data.ksnd_global_lock); - for (i = 0; i < ksocknal_data.ksnd_ninterfaces; i++) { - this_ip = ksocknal_data.ksnd_interfaces[i].ksni_ipaddr; + for (i = 0; i < net->ksnn_ninterfaces; i++) { + this_ip = net->ksnn_interfaces[i].ksni_ipaddr; if (!(ipaddress == 0 || ipaddress == this_ip)) @@ -1697,266 +1929,217 @@ ksocknal_del_interface(__u32 ipaddress) rc = 0; - for (j = i+1; j < ksocknal_data.ksnd_ninterfaces; j++) - ksocknal_data.ksnd_interfaces[j-1] = - ksocknal_data.ksnd_interfaces[j]; + for (j = i+1; j < net->ksnn_ninterfaces; j++) + net->ksnn_interfaces[j-1] = + net->ksnn_interfaces[j]; - ksocknal_data.ksnd_ninterfaces--; + net->ksnn_ninterfaces--; for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) { list_for_each_safe(tmp, nxt, &ksocknal_data.ksnd_peers[j]) { peer = list_entry(tmp, ksock_peer_t, ksnp_list); + if (peer->ksnp_ni != ni) + continue; + ksocknal_peer_del_interface_locked(peer, this_ip); } } } - write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags); + write_unlock_bh (&ksocknal_data.ksnd_global_lock); return (rc); } int -ksocknal_cmd(struct portals_cfg *pcfg, void * private) +ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) { + struct libcfs_ioctl_data *data = arg; int rc; - switch(pcfg->pcfg_command) { - case NAL_CMD_GET_INTERFACE: { + switch(cmd) { + case IOC_LIBCFS_GET_INTERFACE: { + ksock_net_t *net = ni->ni_data; ksock_interface_t *iface; read_lock (&ksocknal_data.ksnd_global_lock); - if (pcfg->pcfg_count < 0 || - pcfg->pcfg_count >= ksocknal_data.ksnd_ninterfaces) { + if (data->ioc_count < 0 || + data->ioc_count >= net->ksnn_ninterfaces) { rc = -ENOENT; } else { rc = 0; - iface = &ksocknal_data.ksnd_interfaces[pcfg->pcfg_count]; + iface = &net->ksnn_interfaces[data->ioc_count]; - pcfg->pcfg_id = iface->ksni_ipaddr; - pcfg->pcfg_misc = iface->ksni_netmask; - pcfg->pcfg_fd = iface->ksni_npeers; - pcfg->pcfg_count = iface->ksni_nroutes; + data->ioc_u32[0] = iface->ksni_ipaddr; + data->ioc_u32[1] = iface->ksni_netmask; + data->ioc_u32[2] = iface->ksni_npeers; + data->ioc_u32[3] = iface->ksni_nroutes; } read_unlock (&ksocknal_data.ksnd_global_lock); - break; - } - case NAL_CMD_ADD_INTERFACE: { - rc = ksocknal_add_interface(pcfg->pcfg_id, /* IP address */ - pcfg->pcfg_misc); /* net mask */ - break; - } - case NAL_CMD_DEL_INTERFACE: { - rc = ksocknal_del_interface(pcfg->pcfg_id); /* IP address */ - break; + return rc; } - case NAL_CMD_GET_PEER: { - ptl_nid_t nid = 0; - __u32 myip = 0; - __u32 ip = 0; - int port = 0; - int conn_count = 0; - int share_count = 0; - rc = ksocknal_get_peer_info(pcfg->pcfg_count, &nid, - &myip, &ip, &port, - &conn_count, &share_count); - pcfg->pcfg_nid = nid; - pcfg->pcfg_size = myip; - pcfg->pcfg_id = ip; - pcfg->pcfg_misc = port; - pcfg->pcfg_count = conn_count; - pcfg->pcfg_wait = share_count; - break; - } - case NAL_CMD_ADD_PEER: { - rc = ksocknal_add_peer (pcfg->pcfg_nid, - pcfg->pcfg_id, /* 
IP */ - pcfg->pcfg_misc); /* port */ - break; - } - case NAL_CMD_DEL_PEER: { - rc = ksocknal_del_peer (pcfg->pcfg_nid, - pcfg->pcfg_id, /* IP */ - pcfg->pcfg_flags); /* single_share? */ - break; - } - case NAL_CMD_GET_CONN: { - ksock_conn_t *conn = ksocknal_get_conn_by_idx (pcfg->pcfg_count); + case IOC_LIBCFS_ADD_INTERFACE: + return ksocknal_add_interface(ni, + data->ioc_u32[0], /* IP address */ + data->ioc_u32[1]); /* net mask */ - if (conn == NULL) - rc = -ENOENT; - else { - int txmem; - int rxmem; - int nagle; + case IOC_LIBCFS_DEL_INTERFACE: + return ksocknal_del_interface(ni, + data->ioc_u32[0]); /* IP address */ - ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle); + case IOC_LIBCFS_GET_PEER: { + lnet_process_id_t id = {0,}; + __u32 myip = 0; + __u32 ip = 0; + int port = 0; + int conn_count = 0; + int share_count = 0; - rc = 0; - pcfg->pcfg_nid = conn->ksnc_peer->ksnp_nid; - pcfg->pcfg_id = conn->ksnc_ipaddr; - pcfg->pcfg_misc = conn->ksnc_port; - pcfg->pcfg_fd = conn->ksnc_myipaddr; - pcfg->pcfg_flags = conn->ksnc_type; - pcfg->pcfg_gw_nal = conn->ksnc_scheduler - - ksocknal_data.ksnd_schedulers; - pcfg->pcfg_count = txmem; - pcfg->pcfg_size = rxmem; - pcfg->pcfg_wait = nagle; - ksocknal_put_conn (conn); - } - break; - } - case NAL_CMD_REGISTER_PEER_FD: { - struct socket *sock = sockfd_lookup (pcfg->pcfg_fd, &rc); - int type = pcfg->pcfg_misc; - - if (sock == NULL) - break; + rc = ksocknal_get_peer_info(ni, data->ioc_count, + &id, &myip, &ip, &port, + &conn_count, &share_count); + if (rc != 0) + return rc; + + data->ioc_nid = id.nid; + data->ioc_count = share_count; + data->ioc_u32[0] = ip; + data->ioc_u32[1] = port; + data->ioc_u32[2] = myip; + data->ioc_u32[3] = conn_count; + data->ioc_u32[4] = id.pid; + return 0; + } + + case IOC_LIBCFS_ADD_PEER: { + lnet_process_id_t id = {.nid = data->ioc_nid, + .pid = LUSTRE_SRV_LNET_PID}; + return ksocknal_add_peer (ni, id, + data->ioc_u32[0], /* IP */ + data->ioc_u32[1]); /* port */ + } + case IOC_LIBCFS_DEL_PEER: { + lnet_process_id_t id = {.nid = data->ioc_nid, + .pid = LNET_PID_ANY}; + return ksocknal_del_peer (ni, id, + data->ioc_u32[0]); /* IP */ + } + case IOC_LIBCFS_GET_CONN: { + int txmem; + int rxmem; + int nagle; + ksock_conn_t *conn = ksocknal_get_conn_by_idx (ni, data->ioc_count); - switch (type) { - case SOCKNAL_CONN_NONE: - case SOCKNAL_CONN_ANY: - case SOCKNAL_CONN_CONTROL: - case SOCKNAL_CONN_BULK_IN: - case SOCKNAL_CONN_BULK_OUT: - rc = ksocknal_create_conn(NULL, sock, type); - break; - default: - rc = -EINVAL; - break; - } - cfs_put_file (KSN_SOCK2FILE(sock)); - break; - } - case NAL_CMD_CLOSE_CONNECTION: { - rc = ksocknal_close_matching_conns (pcfg->pcfg_nid, - pcfg->pcfg_id); - break; - } - case NAL_CMD_REGISTER_MYNID: { - rc = ksocknal_set_mynid (pcfg->pcfg_nid); - break; - } - case NAL_CMD_PUSH_CONNECTION: { - rc = ksocknal_push (pcfg->pcfg_nid); - break; + if (conn == NULL) + return -ENOENT; + + ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle); + + data->ioc_count = txmem; + data->ioc_nid = conn->ksnc_peer->ksnp_id.nid; + data->ioc_flags = nagle; + data->ioc_u32[0] = conn->ksnc_ipaddr; + data->ioc_u32[1] = conn->ksnc_port; + data->ioc_u32[2] = conn->ksnc_myipaddr; + data->ioc_u32[3] = conn->ksnc_type; + data->ioc_u32[4] = conn->ksnc_scheduler - + ksocknal_data.ksnd_schedulers; + data->ioc_u32[5] = rxmem; + data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid; + ksocknal_conn_decref(conn); + return 0; + } + + case IOC_LIBCFS_CLOSE_CONNECTION: { + lnet_process_id_t id = {.nid = data->ioc_nid, + .pid = 
LNET_PID_ANY}; + + return ksocknal_close_matching_conns (id, + data->ioc_u32[0]); + } + case IOC_LIBCFS_REGISTER_MYNID: + /* Ignore if this is a noop */ + if (data->ioc_nid == ni->ni_nid) + return 0; + + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); + return -EINVAL; + + case IOC_LIBCFS_PUSH_CONNECTION: { + lnet_process_id_t id = {.nid = data->ioc_nid, + .pid = LNET_PID_ANY}; + + return ksocknal_push(ni, id); } default: - rc = -EINVAL; - break; - } - - return rc; -} - -void -ksocknal_free_fmbs (ksock_fmb_pool_t *p) -{ - int npages = p->fmp_buff_pages; - ksock_fmb_t *fmb; - int i; - - LASSERT (list_empty(&p->fmp_blocked_conns)); - LASSERT (p->fmp_nactive_fmbs == 0); - - while (!list_empty(&p->fmp_idle_fmbs)) { - - fmb = list_entry(p->fmp_idle_fmbs.next, - ksock_fmb_t, fmb_list); - - for (i = 0; i < npages; i++) - if (fmb->fmb_kiov[i].kiov_page != NULL) - cfs_free_page(fmb->fmb_kiov[i].kiov_page); - - list_del(&fmb->fmb_list); - PORTAL_FREE(fmb, offsetof(ksock_fmb_t, fmb_kiov[npages])); + return -EINVAL; } + /* not reached */ } void ksocknal_free_buffers (void) { - ksocknal_free_fmbs(&ksocknal_data.ksnd_small_fmp); - ksocknal_free_fmbs(&ksocknal_data.ksnd_large_fmp); - - LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_ltxs) == 0); + LASSERT (atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0); if (ksocknal_data.ksnd_schedulers != NULL) - PORTAL_FREE (ksocknal_data.ksnd_schedulers, + LIBCFS_FREE (ksocknal_data.ksnd_schedulers, sizeof (ksock_sched_t) * ksocknal_data.ksnd_nschedulers); - PORTAL_FREE (ksocknal_data.ksnd_peers, + LIBCFS_FREE (ksocknal_data.ksnd_peers, sizeof (struct list_head) * ksocknal_data.ksnd_peer_hash_size); + + spin_lock(&ksocknal_data.ksnd_tx_lock); + + if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { + struct list_head zlist; + ksock_tx_t *tx; + + list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs); + list_del_init(&ksocknal_data.ksnd_idle_noop_txs); + spin_unlock(&ksocknal_data.ksnd_tx_lock); + + while(!list_empty(&zlist)) { + tx = list_entry(zlist.next, ksock_tx_t, tx_list); + list_del(&tx->tx_list); + LIBCFS_FREE(tx, tx->tx_desc_size); + } + } else { + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } } void -ksocknal_api_shutdown (nal_t *nal) +ksocknal_base_shutdown (void) { ksock_sched_t *sched; int i; - if (nal->nal_refct != 0) { - /* This module got the first ref */ - PORTAL_MODULE_UNUSE; - return; - } - CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); - - LASSERT(nal == &ksocknal_api); + atomic_read (&libcfs_kmemory)); + LASSERT (ksocknal_data.ksnd_nnets == 0); switch (ksocknal_data.ksnd_init) { default: LASSERT (0); case SOCKNAL_INIT_ALL: - libcfs_nal_cmd_unregister(SOCKNAL); - - ksocknal_data.ksnd_init = SOCKNAL_INIT_LIB; - /* fall through */ - - case SOCKNAL_INIT_LIB: - /* No more calls to ksocknal_cmd() to create new - * autoroutes/connections since we're being unloaded. */ - - /* Delete all peers */ - ksocknal_del_peer(PTL_NID_ANY, 0, 0); - - /* Wait for all peer state to clean up */ - i = 2; - while (atomic_read (&ksocknal_data.ksnd_npeers) != 0) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ - "waiting for %d peers to disconnect\n", - atomic_read (&ksocknal_data.ksnd_npeers)); - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (cfs_time_seconds(1)); - } - - /* Tell lib we've stopped calling into her. 
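Both shutdown wait loops in this file damp their logging with (i & (-i)) == i, which is true exactly when i is a power of two, so the "waiting for N ..." message fires at iterations 4, 8, 16 and so on instead of once per second forever. The check on its own:

    #include <stdio.h>

    /* i & -i isolates the lowest set bit; it equals i only when i has
     * exactly one bit set, i.e. i is a non-zero power of two. */
    static int is_pow2(int i)
    {
            return i > 0 && (i & (-i)) == i;
    }

    int main(void)
    {
            int i;

            /* same damping as the shutdown wait loops: noisy early,
             * then exponentially rarer */
            for (i = 1; i <= 64; i++)
                    if (is_pow2(i))
                            printf("iteration %d: would log a warning\n", i);
            return 0;
    }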
*/ - lib_fini(&ksocknal_lib); - - ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; - /* fall through */ - case SOCKNAL_INIT_DATA: - LASSERT (atomic_read (&ksocknal_data.ksnd_npeers) == 0); LASSERT (ksocknal_data.ksnd_peers != NULL); for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { LASSERT (list_empty (&ksocknal_data.ksnd_peers[i])); } LASSERT (list_empty (&ksocknal_data.ksnd_enomem_conns)); LASSERT (list_empty (&ksocknal_data.ksnd_zombie_conns)); - LASSERT (list_empty (&ksocknal_data.ksnd_autoconnectd_routes)); - LASSERT (list_empty (&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns)); - LASSERT (list_empty (&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns)); + LASSERT (list_empty (&ksocknal_data.ksnd_connd_connreqs)); + LASSERT (list_empty (&ksocknal_data.ksnd_connd_routes)); if (ksocknal_data.ksnd_schedulers != NULL) for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { @@ -1965,57 +2148,49 @@ ksocknal_api_shutdown (nal_t *nal) LASSERT (list_empty (&kss->kss_tx_conns)); LASSERT (list_empty (&kss->kss_rx_conns)); + LASSERT (list_empty (&kss->kss_zombie_noop_txs)); LASSERT (kss->kss_nconns == 0); } - /* stop router calling me */ - kpr_shutdown (&ksocknal_data.ksnd_router); - /* flag threads to terminate; wake and wait for them to die */ ksocknal_data.ksnd_shuttingdown = 1; - cfs_waitq_broadcast (&ksocknal_data.ksnd_autoconnectd_waitq); + cfs_waitq_broadcast (&ksocknal_data.ksnd_connd_waitq); cfs_waitq_broadcast (&ksocknal_data.ksnd_reaper_waitq); - for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { - sched = &ksocknal_data.ksnd_schedulers[i]; - cfs_waitq_broadcast(&sched->kss_waitq); - } + if (ksocknal_data.ksnd_schedulers != NULL) + for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { + sched = &ksocknal_data.ksnd_schedulers[i]; + cfs_waitq_broadcast(&sched->kss_waitq); + } i = 4; - read_lock(&ksocknal_data.ksnd_global_lock); + read_lock (&ksocknal_data.ksnd_global_lock); while (ksocknal_data.ksnd_nthreads != 0) { i++; CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ "waiting for %d threads to terminate\n", ksocknal_data.ksnd_nthreads); - read_unlock(&ksocknal_data.ksnd_global_lock); - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (cfs_time_seconds(1)); - read_lock(&ksocknal_data.ksnd_global_lock); + read_unlock (&ksocknal_data.ksnd_global_lock); + cfs_pause(cfs_time_seconds(1)); + read_lock (&ksocknal_data.ksnd_global_lock); } - read_unlock(&ksocknal_data.ksnd_global_lock); - - kpr_deregister (&ksocknal_data.ksnd_router); + read_unlock (&ksocknal_data.ksnd_global_lock); ksocknal_free_buffers(); ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING; - /* fall through */ - - case SOCKNAL_INIT_NOTHING: break; } CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); + atomic_read (&libcfs_kmemory)); - printk(KERN_INFO "Lustre: Routing socket NAL unloaded (final mem %d)\n", - atomic_read(&portal_kmemory)); + PORTAL_MODULE_UNUSE; } -void -ksocknal_init_incarnation (void) +__u64 +ksocknal_new_incarnation (void) { struct timeval tv; @@ -2026,81 +2201,57 @@ ksocknal_init_incarnation (void) do_gettimeofday(&tv); - ksocknal_data.ksnd_incarnation = - (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; } int -ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) +ksocknal_base_startup (void) { - ptl_process_id_t process_id; - int pkmem = atomic_read(&portal_kmemory); int rc; int i; - int j; - - LASSERT (nal == &ksocknal_api); - - if (nal->nal_refct != 0) { - if (actual_limits != NULL) - *actual_limits = ksocknal_lib.libnal_ni.ni_actual_limits; - /* This module got the first ref */ - PORTAL_MODULE_USE; - return (PTL_OK); - } LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + LASSERT (ksocknal_data.ksnd_nnets == 0); memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */ - ksocknal_init_incarnation(); - ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; - PORTAL_ALLOC (ksocknal_data.ksnd_peers, + LIBCFS_ALLOC (ksocknal_data.ksnd_peers, sizeof (struct list_head) * ksocknal_data.ksnd_peer_hash_size); if (ksocknal_data.ksnd_peers == NULL) - return (-ENOMEM); + return -ENOMEM; for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]); rwlock_init(&ksocknal_data.ksnd_global_lock); - spin_lock_init(&ksocknal_data.ksnd_small_fmp.fmp_lock); - CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_idle_fmbs); - CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_small_fmp.fmp_blocked_conns); - ksocknal_data.ksnd_small_fmp.fmp_buff_pages = SOCKNAL_SMALL_FWD_PAGES; - - spin_lock_init(&ksocknal_data.ksnd_large_fmp.fmp_lock); - CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_idle_fmbs); - CFS_INIT_LIST_HEAD(&ksocknal_data.ksnd_large_fmp.fmp_blocked_conns); - ksocknal_data.ksnd_large_fmp.fmp_buff_pages = SOCKNAL_LARGE_FWD_PAGES; - spin_lock_init (&ksocknal_data.ksnd_reaper_lock); CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_enomem_conns); CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_zombie_conns); CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_deathrow_conns); cfs_waitq_init(&ksocknal_data.ksnd_reaper_waitq); - spin_lock_init (&ksocknal_data.ksnd_autoconnectd_lock); - CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_autoconnectd_routes); - cfs_waitq_init(&ksocknal_data.ksnd_autoconnectd_waitq); + spin_lock_init (&ksocknal_data.ksnd_connd_lock); + CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_connreqs); + CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_connd_routes); + 
cfs_waitq_init(&ksocknal_data.ksnd_connd_waitq); + + spin_lock_init (&ksocknal_data.ksnd_tx_lock); + CFS_INIT_LIST_HEAD (&ksocknal_data.ksnd_idle_noop_txs); /* NB memset above zeros whole of ksocknal_data, including * ksocknal_data.ksnd_irqinfo[all].ksni_valid */ /* flag lists/ptrs/locks initialised */ ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; + PORTAL_MODULE_USE; ksocknal_data.ksnd_nschedulers = ksocknal_nsched(); - PORTAL_ALLOC(ksocknal_data.ksnd_schedulers, + LIBCFS_ALLOC(ksocknal_data.ksnd_schedulers, sizeof(ksock_sched_t) * ksocknal_data.ksnd_nschedulers); - if (ksocknal_data.ksnd_schedulers == NULL) { - ksocknal_api_shutdown (nal); - return (-ENOMEM); - } + if (ksocknal_data.ksnd_schedulers == NULL) + goto failed; for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { ksock_sched_t *kss = &ksocknal_data.ksnd_schedulers[i]; @@ -2108,197 +2259,252 @@ ksocknal_api_startup (nal_t *nal, ptl_pid_t requested_pid, spin_lock_init (&kss->kss_lock); CFS_INIT_LIST_HEAD (&kss->kss_rx_conns); CFS_INIT_LIST_HEAD (&kss->kss_tx_conns); -#if SOCKNAL_ZC - CFS_INIT_LIST_HEAD (&kss->kss_zctxdone_list); -#endif + CFS_INIT_LIST_HEAD (&kss->kss_zombie_noop_txs); cfs_waitq_init (&kss->kss_waitq); } - /* NB we have to wait to be told our true NID... */ - process_id.pid = requested_pid; - process_id.nid = 0; - - rc = lib_init(&ksocknal_lib, nal, process_id, - requested_limits, actual_limits); - if (rc != PTL_OK) { - CERROR("lib_init failed: error %d\n", rc); - ksocknal_api_shutdown (nal); - return (rc); - } - - ksocknal_data.ksnd_init = SOCKNAL_INIT_LIB; // flag lib_init() called - for (i = 0; i < ksocknal_data.ksnd_nschedulers; i++) { rc = ksocknal_thread_start (ksocknal_scheduler, &ksocknal_data.ksnd_schedulers[i]); if (rc != 0) { CERROR("Can't spawn socknal scheduler[%d]: %d\n", i, rc); - ksocknal_api_shutdown (nal); - return (rc); + goto failed; } } - for (i = 0; i < SOCKNAL_N_AUTOCONNECTD; i++) { - rc = ksocknal_thread_start (ksocknal_autoconnectd, (void *)((long)i)); + /* must have at least 2 connds to remain responsive to accepts while + * connecting */ + if (*ksocknal_tunables.ksnd_nconnds < 2) + *ksocknal_tunables.ksnd_nconnds = 2; + + for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) { + rc = ksocknal_thread_start (ksocknal_connd, (void *)((long)i)); if (rc != 0) { - CERROR("Can't spawn socknal autoconnectd: %d\n", rc); - ksocknal_api_shutdown (nal); - return (rc); + CERROR("Can't spawn socknal connd: %d\n", rc); + goto failed; } } rc = ksocknal_thread_start (ksocknal_reaper, NULL); if (rc != 0) { CERROR ("Can't spawn socknal reaper: %d\n", rc); - ksocknal_api_shutdown (nal); - return (rc); + goto failed; } - rc = kpr_register(&ksocknal_data.ksnd_router, - &ksocknal_router_interface); - if (rc != 0) { - CDEBUG(D_NET, "Can't initialise routing interface " - "(rc = %d): not routing\n", rc); - } else { - /* Only allocate forwarding buffers if there's a router */ + /* flag everything initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; - for (i = 0; i < (SOCKNAL_SMALL_FWD_NMSGS + - SOCKNAL_LARGE_FWD_NMSGS); i++) { - ksock_fmb_t *fmb; - ksock_fmb_pool_t *pool; + return 0; + failed: + ksocknal_base_shutdown(); + return -ENETDOWN; +} - if (i < SOCKNAL_SMALL_FWD_NMSGS) - pool = &ksocknal_data.ksnd_small_fmp; - else - pool = &ksocknal_data.ksnd_large_fmp; +void +ksocknal_shutdown (lnet_ni_t *ni) +{ + ksock_net_t *net = ni->ni_data; + int i; + lnet_process_id_t anyid = {.nid = LNET_NID_ANY, + .pid = LNET_PID_ANY}; - PORTAL_ALLOC(fmb, offsetof(ksock_fmb_t, - fmb_kiov[pool->fmp_buff_pages])); - if 
(fmb == NULL) { - ksocknal_api_shutdown(nal); - return (-ENOMEM); - } + LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL); + LASSERT(ksocknal_data.ksnd_nnets > 0); - fmb->fmb_pool = pool; + spin_lock_bh (&net->ksnn_lock); + net->ksnn_shutdown = 1; /* prevent new peers */ + spin_unlock_bh (&net->ksnn_lock); - for (j = 0; j < pool->fmp_buff_pages; j++) { - fmb->fmb_kiov[j].kiov_page = cfs_alloc_page(CFS_ALLOC_STD); + /* Delete all peers */ + ksocknal_del_peer(ni, anyid, 0); - if (fmb->fmb_kiov[j].kiov_page == NULL) { - ksocknal_api_shutdown (nal); - return (-ENOMEM); - } + /* Wait for all peer state to clean up */ + i = 2; + spin_lock_bh (&net->ksnn_lock); + while (net->ksnn_npeers != 0) { + spin_unlock_bh (&net->ksnn_lock); - LASSERT(cfs_page_address(fmb->fmb_kiov[j].kiov_page) != NULL); - } + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "waiting for %d peers to disconnect\n", + net->ksnn_npeers); + cfs_pause(cfs_time_seconds(1)); + + spin_lock_bh (&net->ksnn_lock); + } + spin_unlock_bh (&net->ksnn_lock); + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + LASSERT (net->ksnn_interfaces[i].ksni_npeers == 0); + LASSERT (net->ksnn_interfaces[i].ksni_nroutes == 0); + } + + LIBCFS_FREE(net, sizeof(*net)); + + ksocknal_data.ksnd_nnets--; + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); +} - list_add(&fmb->fmb_list, &pool->fmp_idle_fmbs); +int +ksocknal_enumerate_interfaces(ksock_net_t *net) +{ + char **names; + int i; + int j; + int rc; + int n; + + n = libcfs_ipif_enumerate(&names); + if (n <= 0) { + CERROR("Can't enumerate interfaces: %d\n", n); + return n; + } + + for (i = j = 0; i < n; i++) { + int up; + __u32 ip; + __u32 mask; + + if (!strcmp(names[i], "lo")) /* skip the loopback IF */ + continue; + + rc = libcfs_ipif_query(names[i], &up, &ip, &mask); + if (rc != 0) { + CWARN("Can't get interface %s info: %d\n", + names[i], rc); + continue; } + + if (!up) { + CWARN("Ignoring interface %s (down)\n", + names[i]); + continue; + } + + if (j == LNET_MAX_INTERFACES) { + CWARN("Ignoring interface %s (too many interfaces)\n", + names[i]); + continue; + } + + net->ksnn_interfaces[j].ksni_ipaddr = ip; + net->ksnn_interfaces[j].ksni_netmask = mask; + j++; } - rc = libcfs_nal_cmd_register(SOCKNAL, &ksocknal_cmd, NULL); - if (rc != 0) { - CERROR ("Can't initialise command interface (rc = %d)\n", rc); - ksocknal_api_shutdown (nal); - return (rc); + libcfs_ipif_free_enumeration(names, n); + + if (j == 0) + CERROR("Can't find any usable interfaces\n"); + + return j; +} + +int +ksocknal_startup (lnet_ni_t *ni) +{ + ksock_net_t *net; + int rc; + int i; + + LASSERT (ni->ni_lnd == &the_ksocklnd); + + if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) { + rc = ksocknal_base_startup(); + if (rc != 0) + return rc; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + if (net == NULL) + goto fail_0; + + memset(net, 0, sizeof(*net)); + spin_lock_init(&net->ksnn_lock); + net->ksnn_incarnation = ksocknal_new_incarnation(); + ni->ni_data = net; + ni->ni_maxtxcredits = *ksocknal_tunables.ksnd_credits; + ni->ni_peertxcredits = *ksocknal_tunables.ksnd_peercredits; + + if (ni->ni_interfaces[0] == NULL) { + rc = ksocknal_enumerate_interfaces(net); + if (rc <= 0) + goto fail_1; + + net->ksnn_ninterfaces = 1; + } else { + for (i = 0; i < LNET_MAX_INTERFACES; i++) { + int up; + + if (ni->ni_interfaces[i] == NULL) + break; + + rc = libcfs_ipif_query( + ni->ni_interfaces[i], &up, + &net->ksnn_interfaces[i].ksni_ipaddr, + &net->ksnn_interfaces[i].ksni_netmask); + + if (rc != 0) { + 
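                        /* an explicitly configured interface that cannot be
                         * queried is fatal, unlike in
                         * ksocknal_enumerate_interfaces() above, where an
                         * unusable interface is merely skipped with a
                         * warning */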
CERROR("Can't get interface %s info: %d\n", + ni->ni_interfaces[i], rc); + goto fail_1; + } + + if (!up) { + CERROR("Interface %s is down\n", + ni->ni_interfaces[i]); + goto fail_1; + } + } + net->ksnn_ninterfaces = i; } - /* flag everything initialised */ - ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), + net->ksnn_interfaces[0].ksni_ipaddr); - printk(KERN_INFO "Lustre: Routing socket NAL loaded " - "(Routing %s, initial mem %d, incarnation "LPX64")\n", - kpr_routing (&ksocknal_data.ksnd_router) ? - "enabled" : "disabled", pkmem, ksocknal_data.ksnd_incarnation); + ksocknal_data.ksnd_nnets++; - return (0); + return 0; + + fail_1: + LIBCFS_FREE(net, sizeof(*net)); + fail_0: + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); + + return -ENETDOWN; } + void __exit ksocknal_module_fini (void) { -#ifdef CONFIG_SYSCTL - if (ksocknal_tunables.ksnd_sysctl != NULL) - unregister_sysctl_table (ksocknal_tunables.ksnd_sysctl); -#endif - PtlNIFini(ksocknal_ni); - - ptl_unregister_nal(SOCKNAL); + lnet_unregister_lnd(&the_ksocklnd); + ksocknal_lib_tunables_fini(); } -extern cfs_sysctl_table_t ksocknal_top_ctl_table[]; - int __init ksocknal_module_init (void) { int rc; - /* packet descriptor must fit in a router descriptor's scratchpad */ - LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); - /* the following must be sizeof(int) for proc_dointvec() */ - LASSERT(sizeof (ksocknal_tunables.ksnd_io_timeout) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_eager_ack) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_typed_conns) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_min_bulk) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_buffer_size) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_nagle) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_idle) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_count) == sizeof (int)); - LASSERT(sizeof (ksocknal_tunables.ksnd_keepalive_intvl) == sizeof (int)); -#if CPU_AFFINITY - LASSERT(sizeof (ksocknal_tunables.ksnd_irq_affinity) == sizeof (int)); -#endif -#if SOCKNAL_ZC - LASSERT(sizeof (ksocknal_tunables.ksnd_zc_min_frag) == sizeof (int)); -#endif /* check ksnr_connected/connecting field large enough */ - LASSERT(SOCKNAL_CONN_NTYPES <= 4); - - ksocknal_api.nal_ni_init = ksocknal_api_startup; - ksocknal_api.nal_ni_fini = ksocknal_api_shutdown; - - /* Initialise dynamic tunables to defaults once only */ - ksocknal_tunables.ksnd_io_timeout = SOCKNAL_IO_TIMEOUT; - ksocknal_tunables.ksnd_eager_ack = SOCKNAL_EAGER_ACK; - ksocknal_tunables.ksnd_typed_conns = SOCKNAL_TYPED_CONNS; - ksocknal_tunables.ksnd_min_bulk = SOCKNAL_MIN_BULK; - ksocknal_tunables.ksnd_buffer_size = SOCKNAL_BUFFER_SIZE; - ksocknal_tunables.ksnd_nagle = SOCKNAL_NAGLE; - ksocknal_tunables.ksnd_keepalive_idle = SOCKNAL_KEEPALIVE_IDLE; - ksocknal_tunables.ksnd_keepalive_count = SOCKNAL_KEEPALIVE_COUNT; - ksocknal_tunables.ksnd_keepalive_intvl = SOCKNAL_KEEPALIVE_INTVL; -#if CPU_AFFINITY - ksocknal_tunables.ksnd_irq_affinity = SOCKNAL_IRQ_AFFINITY; -#endif -#if SOCKNAL_ZC - ksocknal_tunables.ksnd_zc_min_frag = SOCKNAL_ZC_MIN_FRAG; -#endif - - rc = ptl_register_nal(SOCKNAL, &ksocknal_api); - if (rc != PTL_OK) { - CERROR("Can't register SOCKNAL: %d\n", rc); - return (-ENOMEM); /* or something... 
*/ - } + CLASSERT(SOCKLND_CONN_NTYPES <= 4); + + rc = ksocknal_lib_tunables_init(); + if (rc != 0) + return rc; - /* Pure gateways want the NAL started up at module load time... */ - rc = PtlNIInit(SOCKNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &ksocknal_ni); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - ptl_unregister_nal(SOCKNAL); - return (-ENODEV); - } + lnet_register_lnd(&the_ksocklnd); -#ifdef CONFIG_SYSCTL - /* Press on regardless even if registering sysctl doesn't work */ - ksocknal_tunables.ksnd_sysctl = - register_sysctl_table (ksocknal_top_ctl_table, 0); -#endif - return (0); + return 0; } MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>"); -MODULE_DESCRIPTION("Kernel TCP Socket NAL v1.0.0"); +MODULE_DESCRIPTION("Kernel TCP Socket LND v2.0.0"); MODULE_LICENSE("GPL"); -cfs_module(ksocknal, "1.0.0", ksocknal_module_init, ksocknal_module_fini); +cfs_module(ksocknal, "2.0.0", ksocknal_module_init, ksocknal_module_fini); diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index 8c69aa0..a1f1861 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -28,56 +28,26 @@ # define EXPORT_SYMTAB #endif -#define DEBUG_SUBSYSTEM S_NAL +#define DEBUG_SUBSYSTEM S_LND #if defined(__linux__) -#include "socknal_lib-linux.h" +#include "socklnd_lib-linux.h" #elif defined(__APPLE__) -#include "socknal_lib-darwin.h" +#include "socklnd_lib-darwin.h" +#elif defined(__WINNT__) +#include "socklnd_lib-winnt.h" #else #error Unsupported Operating System #endif #include -#include -#include -#include -#include -#include - -#define SOCKNAL_N_AUTOCONNECTD 4 /* # socknal autoconnect daemons */ - -#define SOCKNAL_MIN_RECONNECT_INTERVAL cfs_time_seconds(1) /* first failed connection retry... */ -#define SOCKNAL_MAX_RECONNECT_INTERVAL cfs_time_seconds(60) /* ...exponentially increasing to this */ - -/* default vals for runtime tunables */ -#define SOCKNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ -#define SOCKNAL_EAGER_ACK SOCKNAL_ARCH_EAGER_ACK /* default eager ack (boolean) */ -#define SOCKNAL_TYPED_CONNS 1 /* unidirectional large, bidirectional small? */ -#define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */ -#define SOCKNAL_MIN_BULK (1<<10) /* smallest "large" message */ -#define SOCKNAL_BUFFER_SIZE (8<<20) /* default socket buffer size */ -#define SOCKNAL_NAGLE 0 /* enable/disable NAGLE? */ -#define SOCKNAL_IRQ_AFFINITY 1 /* enable/disable IRQ affinity? 
*/ -#define SOCKNAL_KEEPALIVE_IDLE 35 /* # seconds idle before 1st probe */ - -#define SOCKNAL_KEEPALIVE_COUNT 5 /* # unanswered probes to determine peer death */ -#define SOCKNAL_KEEPALIVE_INTVL 5 /* seconds between probes */ - -#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */ - -#define SOCKNAL_SMALL_FWD_NMSGS 128 /* # small messages I can be forwarding at any time */ -#define SOCKNAL_LARGE_FWD_NMSGS 64 /* # large messages I can be forwarding at any time */ - -#define SOCKNAL_SMALL_FWD_PAGES 1 /* # pages in a small message fwd buffer */ - -#define SOCKNAL_LARGE_FWD_PAGES (PAGE_ALIGN(PTL_MTU) >> PAGE_SHIFT) - /* # pages in a large message fwd buffer */ +#include +#include +#include +#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */ #define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ -#define SOCKNAL_ENOMEM_RETRY CFS_MIN_DELAY /* jiffies between retries */ - -#define SOCKNAL_MAX_INTERFACES 16 /* Largest number of interfaces we bind */ +#define SOCKNAL_ENOMEM_RETRY CFS_TICK /* jiffies between retries */ #define SOCKNAL_ROUND_ROBIN 0 /* round robin / load balance */ @@ -92,66 +62,79 @@ # define SOCKNAL_RISK_KMAP_DEADLOCK 1 #endif -typedef struct /* pool of forwarding buffers */ -{ - spinlock_t fmp_lock; /* serialise */ - struct list_head fmp_idle_fmbs; /* free buffers */ - struct list_head fmp_blocked_conns; /* connections waiting for a buffer */ - int fmp_nactive_fmbs; /* # buffers in use */ - int fmp_buff_pages; /* # pages per buffer */ -} ksock_fmb_pool_t; - - typedef struct /* per scheduler state */ { spinlock_t kss_lock; /* serialise */ struct list_head kss_rx_conns; /* conn waiting to be read */ struct list_head kss_tx_conns; /* conn waiting to be written */ -#if SOCKNAL_ZC - struct list_head kss_zctxdone_list; /* completed ZC transmits */ -#endif + struct list_head kss_zombie_noop_txs; /* zombie noop tx list */ cfs_waitq_t kss_waitq; /* where scheduler sleeps */ int kss_nconns; /* # connections assigned to this scheduler */ } ksock_sched_t; typedef struct { - int ksni_valid:1; /* been set yet? */ - int ksni_bound:1; /* bound to a cpu yet? */ - int ksni_sched:6; /* which scheduler (assumes < 64) */ + unsigned int ksni_valid:1; /* been set yet? */ + unsigned int ksni_bound:1; /* bound to a cpu yet? */ + unsigned int ksni_sched:6; /* which scheduler (assumes < 64) */ } ksock_irqinfo_t; -typedef struct +typedef struct /* in-use interface */ { __u32 ksni_ipaddr; /* interface's IP address */ __u32 ksni_netmask; /* interface's network mask */ int ksni_nroutes; /* # routes using (active) */ int ksni_npeers; /* # peers using (passive) */ + char ksni_name[16]; /* interface name */ } ksock_interface_t; typedef struct { - int ksnd_io_timeout; /* "stuck" socket timeout (seconds) */ - int ksnd_eager_ack; /* make TCP ack eagerly? */ - int ksnd_typed_conns; /* drive sockets by type? */ - int ksnd_min_bulk; /* smallest "large" message */ - int ksnd_buffer_size; /* socket buffer size */ - int ksnd_nagle; /* enable NAGLE? */ - int ksnd_irq_affinity; /* enable IRQ affinity? */ - int ksnd_keepalive_idle; /* # idle secs before 1st probe */ - int ksnd_keepalive_count; /* # probes */ - int ksnd_keepalive_intvl; /* time between probes */ -#if SOCKNAL_ZC - unsigned int ksnd_zc_min_frag; /* minimum zero copy frag size */ + int *ksnd_timeout; /* "stuck" socket timeout (seconds) */ + int *ksnd_nconnds; /* # connection daemons */ + int *ksnd_min_reconnectms; /* first connection retry after (ms)... 
*/ + int *ksnd_max_reconnectms; /* ...exponentially increasing to this */ + int *ksnd_eager_ack; /* make TCP ack eagerly? */ + int *ksnd_typed_conns; /* drive sockets by type? */ + int *ksnd_min_bulk; /* smallest "large" message */ + int *ksnd_tx_buffer_size; /* socket tx buffer size */ + int *ksnd_rx_buffer_size; /* socket rx buffer size */ + int *ksnd_nagle; /* enable NAGLE? */ + int *ksnd_keepalive_idle; /* # idle secs before 1st probe */ + int *ksnd_keepalive_count; /* # probes */ + int *ksnd_keepalive_intvl; /* time between probes */ + int *ksnd_credits; /* # concurrent sends */ + int *ksnd_peercredits; /* # concurrent sends to 1 peer */ + int *ksnd_enable_csum; /* enable check sum */ + int *ksnd_inject_csum_error; /* set non-zero to inject checksum error */ + unsigned int *ksnd_zc_min_frag; /* minimum zero copy frag size */ +#ifdef CPU_AFFINITY + int *ksnd_irq_affinity; /* enable IRQ affinity? */ #endif +#ifdef SOCKNAL_BACKOFF + int *ksnd_backoff_init; /* initial TCP backoff */ + int *ksnd_backoff_max; /* maximum TCP backoff */ +#endif +#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM cfs_sysctl_table_header_t *ksnd_sysctl; /* sysctl interface */ +#endif } ksock_tunables_t; typedef struct { + __u64 ksnn_incarnation; /* my epoch */ + spinlock_t ksnn_lock; /* serialise */ + int ksnn_npeers; /* # peers */ + int ksnn_shutdown; /* shutting down? */ + int ksnn_ninterfaces; /* IP interfaces */ + ksock_interface_t ksnn_interfaces[LNET_MAX_INTERFACES]; +} ksock_net_t; + +typedef struct +{ int ksnd_init; /* initialisation state */ - __u64 ksnd_incarnation; /* my epoch */ - + int ksnd_nnets; /* # networks set up */ + rwlock_t ksnd_global_lock; /* stabilize peer/conn ops */ struct list_head *ksnd_peers; /* hash table of all my known peers */ int ksnd_peer_hash_size; /* size of ksnd_peers */ @@ -161,15 +144,7 @@ typedef struct int ksnd_nschedulers; /* # schedulers */ ksock_sched_t *ksnd_schedulers; /* their state */ - atomic_t ksnd_npeers; /* total # peers extant */ - atomic_t ksnd_nclosing_conns; /* # closed conns extant */ - - kpr_router_t ksnd_router; /* THE router */ - - ksock_fmb_pool_t ksnd_small_fmp; /* small message forwarding buffers */ - ksock_fmb_pool_t ksnd_large_fmp; /* large message forwarding buffers */ - - atomic_t ksnd_nactive_ltxs; /* #active ltxs */ + atomic_t ksnd_nactive_txs; /* #active txs */ struct list_head ksnd_deathrow_conns; /* conns to close: reaper_lock*/ struct list_head ksnd_zombie_conns; /* conns to free: reaper_lock */ @@ -182,163 +157,148 @@ typedef struct int ksnd_stall_tx; /* test sluggish sender */ int ksnd_stall_rx; /* test sluggish receiver */ - struct list_head ksnd_autoconnectd_routes; /* routes waiting to be connected */ - cfs_waitq_t ksnd_autoconnectd_waitq; /* autoconnectds sleep here */ - spinlock_t ksnd_autoconnectd_lock; /* serialise */ + struct list_head ksnd_connd_connreqs; /* incoming connection requests */ + struct list_head ksnd_connd_routes; /* routes waiting to be connected */ + cfs_waitq_t ksnd_connd_waitq; /* connds sleep here */ + int ksnd_connd_connecting;/* # connds connecting */ + spinlock_t ksnd_connd_lock; /* serialise */ + + struct list_head ksnd_idle_noop_txs; /* list head for freed noop tx */ + spinlock_t ksnd_tx_lock; /* serialise, NOT safe in g_lock */ ksock_irqinfo_t ksnd_irqinfo[NR_IRQS];/* irq->scheduler lookup */ - int ksnd_ninterfaces; - ksock_interface_t ksnd_interfaces[SOCKNAL_MAX_INTERFACES]; /* published interfaces */ } ksock_nal_data_t; #define SOCKNAL_INIT_NOTHING 0 #define SOCKNAL_INIT_DATA 1 -#define 
SOCKNAL_INIT_LIB 2 -#define SOCKNAL_INIT_ALL 3 +#define SOCKNAL_INIT_ALL 2 /* A packet just assembled for transmission is represented by 1 or more * struct iovec fragments (the first frag contains the portals header), - * followed by 0 or more ptl_kiov_t fragments. + * followed by 0 or more lnet_kiov_t fragments. * * On the receive side, initially 1 struct iovec fragment is posted for * receive (the header). Once the header has been received, the payload is - * received into either struct iovec or ptl_kiov_t fragments, depending on + * received into either struct iovec or lnet_kiov_t fragments, depending on * what the header matched or whether the message needs forwarding. */ struct ksock_conn; /* forward ref */ struct ksock_peer; /* forward ref */ struct ksock_route; /* forward ref */ +struct ksock_protocol; /* forward ref */ typedef struct /* transmit packet */ { struct list_head tx_list; /* queue on conn for transmission etc */ - char tx_isfwd; /* forwarding / sourced here */ + struct list_head tx_zc_list; /* queue on peer for ZC request */ + atomic_t tx_refcount; /* tx reference count */ int tx_nob; /* # packet bytes */ int tx_resid; /* residual bytes */ int tx_niov; /* # packet iovec frags */ struct iovec *tx_iov; /* packet iovec frags */ int tx_nkiov; /* # packet page frags */ - ptl_kiov_t *tx_kiov; /* packet page frags */ + lnet_kiov_t *tx_kiov; /* packet page frags */ struct ksock_conn *tx_conn; /* owning conn */ - ptl_hdr_t *tx_hdr; /* packet header (for debug only) */ -#if SOCKNAL_ZC - zccd_t tx_zccd; /* zero copy callback descriptor */ -#endif + lnet_msg_t *tx_lnetmsg; /* lnet message for lnet_finalize() */ + ksock_msg_t tx_msg; /* socklnd message buffer */ + int tx_desc_size; /* size of this descriptor */ + union { + struct { + struct iovec iov; /* virt hdr */ + lnet_kiov_t kiov[0]; /* paged payload */ + } paged; + struct { + struct iovec iov[1]; /* virt hdr + payload */ + } virt; + } tx_frags; } ksock_tx_t; -typedef struct /* forwarded packet */ -{ - ksock_tx_t ftx_tx; /* send info */ - struct iovec ftx_iov; /* hdr iovec */ -} ksock_ftx_t; +#define KSOCK_NOOP_TX_SIZE offsetof(ksock_tx_t, tx_frags.paged.kiov[0]) -#define KSOCK_ZCCD_2_TX(ptr) list_entry (ptr, ksock_tx_t, tx_zccd) /* network zero copy callback descriptor embedded in ksock_tx_t */ -typedef struct /* locally transmitted packet */ -{ - ksock_tx_t ltx_tx; /* send info */ - void *ltx_private; /* lib_finalize() callback arg */ - void *ltx_cookie; /* lib_finalize() callback arg */ - ptl_hdr_t ltx_hdr; /* buffer for packet header */ - int ltx_desc_size; /* bytes allocated for this desc */ - struct iovec ltx_iov[1]; /* iov for hdr + payload */ - ptl_kiov_t ltx_kiov[0]; /* kiov for payload */ -} ksock_ltx_t; - -#define KSOCK_TX_2_KPR_FWD_DESC(ptr) list_entry ((kprfd_scratch_t *)ptr, kpr_fwd_desc_t, kprfd_scratch) -/* forwarded packets (router->socknal) embedded in kpr_fwd_desc_t::kprfd_scratch */ - -#define KSOCK_TX_2_KSOCK_LTX(ptr) list_entry (ptr, ksock_ltx_t, ltx_tx) -/* local packets (lib->socknal) embedded in ksock_ltx_t::ltx_tx */ - -/* NB list_entry() is used here as convenient macro for calculating a - * pointer to a struct from the address of a member. 
*/ - -typedef struct /* Kernel portals Socket Forwarding message buffer */ -{ /* (socknal->router) */ - struct list_head fmb_list; /* queue idle */ - kpr_fwd_desc_t fmb_fwd; /* router's descriptor */ - ksock_fmb_pool_t *fmb_pool; /* owning pool */ - struct ksock_peer *fmb_peer; /* peer received from */ - ptl_hdr_t fmb_hdr; /* message header */ - ptl_kiov_t fmb_kiov[0]; /* payload frags */ -} ksock_fmb_t; - /* space for the rx frag descriptors; we either read a single contiguous - * header, or up to PTL_MD_MAX_IOV frags of payload of either type. */ + * header, or up to LNET_MAX_IOV frags of payload of either type. */ typedef union { - struct iovec iov[PTL_MD_MAX_IOV]; - ptl_kiov_t kiov[PTL_MD_MAX_IOV]; + struct iovec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; } ksock_rxiovspace_t; -#define SOCKNAL_RX_HEADER 1 /* reading header */ -#define SOCKNAL_RX_BODY 2 /* reading body (to deliver here) */ -#define SOCKNAL_RX_BODY_FWD 3 /* reading body (to forward) */ -#define SOCKNAL_RX_SLOP 4 /* skipping body */ -#define SOCKNAL_RX_GET_FMB 5 /* scheduled for forwarding */ -#define SOCKNAL_RX_FMB_SLEEP 6 /* blocked waiting for a fwd desc */ +#define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */ +#define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */ +#define SOCKNAL_RX_PARSE 3 /* Calling lnet_parse() */ +#define SOCKNAL_RX_PARSE_WAIT 4 /* waiting to be told to read the body */ +#define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */ +#define SOCKNAL_RX_SLOP 6 /* skipping body */ typedef struct ksock_conn -{ +{ struct ksock_peer *ksnc_peer; /* owning peer */ struct ksock_route *ksnc_route; /* owning route */ struct list_head ksnc_list; /* stash on peer's conn list */ - struct socket *ksnc_sock; /* actual socket */ + cfs_socket_t *ksnc_sock; /* actual socket */ void *ksnc_saved_data_ready; /* socket's original data_ready() callback */ void *ksnc_saved_write_space; /* socket's original write_space() callback */ - atomic_t ksnc_refcount; /* # users */ + atomic_t ksnc_conn_refcount; /* conn refcount */ + atomic_t ksnc_sock_refcount; /* sock refcount */ ksock_sched_t *ksnc_scheduler; /* who schedules this connection */ __u32 ksnc_myipaddr; /* my IP */ __u32 ksnc_ipaddr; /* peer's IP */ int ksnc_port; /* peer's port */ - int ksnc_closing; /* being shut down */ - int ksnc_type; /* type of connection */ + int ksnc_type:3; /* type of connection, should be signed value */ + int ksnc_closing:1; /* being shut down */ + int ksnc_flip:1; /* flip or not, only for V2.x */ + int ksnc_zc_capable:1; /* enable to ZC */ __u64 ksnc_incarnation; /* peer's incarnation */ - + /* reader */ struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ cfs_time_t ksnc_rx_deadline; /* when (in jiffies) receive times out */ - int ksnc_rx_started; /* started receiving a message */ - int ksnc_rx_ready; /* data ready to read */ - int ksnc_rx_scheduled; /* being progressed */ - int ksnc_rx_state; /* what is being read */ + __u8 ksnc_rx_started; /* started receiving a message */ + __u8 ksnc_rx_ready; /* data ready to read */ + __u8 ksnc_rx_scheduled; /* being progressed */ + __u8 ksnc_rx_state; /* what is being read */ int ksnc_rx_nob_left; /* # bytes to next hdr/body */ int ksnc_rx_nob_wanted; /* bytes actually wanted */ int ksnc_rx_niov; /* # iovec frags */ struct iovec *ksnc_rx_iov; /* the iovec frags */ int ksnc_rx_nkiov; /* # page frags */ - ptl_kiov_t *ksnc_rx_kiov; /* the page frags */ + lnet_kiov_t *ksnc_rx_kiov; /* the page frags */ 
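        /* receive-path sketch, as the SOCKNAL_RX_* states above suggest:
         * a conn first reads the ksock message header (KSM_HEADER), then
         * for lnet messages the lnet header (LNET_HEADER), hands it to
         * lnet_parse() (PARSE/PARSE_WAIT), and finally reads the payload
         * (LNET_PAYLOAD) or skips unwanted bytes (SLOP); the rx iov/kiov
         * fields above describe the fragments of whichever stage is in
         * flight */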
ksock_rxiovspace_t ksnc_rx_iov_space; /* space for frag descriptors */ - void *ksnc_cookie; /* rx lib_finalize passthru arg */ - ptl_hdr_t ksnc_hdr; /* where I read headers into */ + __u32 ksnc_rx_csum; /* partial checksum for incoming data */ + void *ksnc_cookie; /* rx lnet_finalize passthru arg */ + ksock_msg_t ksnc_msg; /* incoming message buffer: + * V2.x message takes the whole struct + * V1.x message is a bare lnet_hdr_t, it's stored + * in ksnc_msg.ksm_u.lnetmsg */ /* WRITER */ struct list_head ksnc_tx_list; /* where I enq waiting for output space */ struct list_head ksnc_tx_queue; /* packets waiting to be sent */ + ksock_tx_t *ksnc_tx_mono; /* V2.x only, next mono-packet, mono-packet is : + * a. lnet packet without piggyback + * b. noop ZC-ACK packet */ cfs_time_t ksnc_tx_deadline; /* when (in jiffies) tx times out */ int ksnc_tx_bufnob; /* send buffer marker */ atomic_t ksnc_tx_nob; /* # bytes queued */ int ksnc_tx_ready; /* write space */ int ksnc_tx_scheduled; /* being progressed */ + struct ksock_protocol *ksnc_proto; /* protocol table for the connection */ + #if !SOCKNAL_SINGLE_FRAG_RX - struct iovec ksnc_rx_scratch_iov[PTL_MD_MAX_IOV]; + struct iovec ksnc_rx_scratch_iov[LNET_MAX_IOV]; #endif #if !SOCKNAL_SINGLE_FRAG_TX - struct iovec ksnc_tx_scratch_iov[PTL_MD_MAX_IOV]; + struct iovec ksnc_tx_scratch_iov[LNET_MAX_IOV]; #endif } ksock_conn_t; -#define KSNR_TYPED_ROUTES ((1 << SOCKNAL_CONN_CONTROL) | \ - (1 << SOCKNAL_CONN_BULK_IN) | \ - (1 << SOCKNAL_CONN_BULK_OUT)) - typedef struct ksock_route { struct list_head ksnr_list; /* chain on peer route list */ - struct list_head ksnr_connect_list; /* chain on autoconnect list */ + struct list_head ksnr_connd_list; /* chain on ksnr_connd_routes */ struct ksock_peer *ksnr_peer; /* owning peer */ atomic_t ksnr_refcount; /* # users */ cfs_time_t ksnr_timeout; /* when (in jiffies) reconnection can happen next */ @@ -346,51 +306,109 @@ typedef struct ksock_route __u32 ksnr_myipaddr; /* my IP */ __u32 ksnr_ipaddr; /* IP address to connect to */ int ksnr_port; /* port to connect to */ - unsigned int ksnr_connecting:1; /* autoconnect in progress */ + unsigned int ksnr_scheduled:1; /* scheduled for attention */ + unsigned int ksnr_connecting:1; /* connection establishment in progress */ unsigned int ksnr_connected:4; /* connections established by type */ unsigned int ksnr_deleted:1; /* been removed from peer? */ unsigned int ksnr_share_count; /* created explicitly? */ int ksnr_conn_count; /* # conns established by this route */ + struct ksock_protocol *ksnr_proto ; /* protocol table for connecting */ } ksock_route_t; typedef struct ksock_peer { struct list_head ksnp_list; /* stash on global peer list */ - ptl_nid_t ksnp_nid; /* who's on the other end(s) */ + lnet_process_id_t ksnp_id; /* who's on the other end(s) */ atomic_t ksnp_refcount; /* # users */ int ksnp_sharecount; /* lconf usage counter */ int ksnp_closing; /* being closed */ + int ksnp_accepting; /* # passive connections pending */ int ksnp_error; /* errno on closing last conn */ + __u64 ksnp_zc_next_cookie;/* ZC completion cookie */ struct list_head ksnp_conns; /* all active connections */ struct list_head ksnp_routes; /* routes */ struct list_head ksnp_tx_queue; /* waiting packets */ + spinlock_t ksnp_lock; /* serialize, NOT safe in g_lock */ + struct list_head ksnp_zc_req_list; /* zero copy requests wait for ACK */ cfs_time_t ksnp_last_alive; /* when (in jiffies) I was last alive */ + lnet_ni_t *ksnp_ni; /* which network */ int ksnp_n_passive_ips; /* # of... 
*/ - __u32 ksnp_passive_ips[SOCKNAL_MAX_INTERFACES]; /* preferred local interfaces */ + __u32 ksnp_passive_ips[LNET_MAX_INTERFACES]; /* preferred local interfaces */ } ksock_peer_t; +typedef struct ksock_connreq +{ + struct list_head ksncr_list; /* stash on ksnd_connd_connreqs */ + lnet_ni_t *ksncr_ni; /* chosen NI */ + cfs_socket_t *ksncr_sock; /* accepted socket */ +} ksock_connreq_t; -extern lib_nal_t ksocknal_lib; extern ksock_nal_data_t ksocknal_data; extern ksock_tunables_t ksocknal_tunables; +typedef struct ksock_protocol +{ + int pro_version; /* version number of protocol */ + int (*pro_send_hello)(ksock_conn_t *, ksock_hello_msg_t *); /* handshake function */ + int (*pro_recv_hello)(ksock_conn_t *, ksock_hello_msg_t *, int);/* handshake function */ + void (*pro_pack)(ksock_tx_t *); /* message pack */ + void (*pro_unpack)(ksock_msg_t *); /* message unpack */ +} ksock_protocol_t; + +extern ksock_protocol_t ksocknal_protocol_v1x; +extern ksock_protocol_t ksocknal_protocol_v2x; + +#define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR +#define KSOCK_PROTO_V1_MINOR LNET_PROTO_TCP_VERSION_MINOR +#define KSOCK_PROTO_V1 KSOCK_PROTO_V1_MAJOR +#define KSOCK_PROTO_V2 2 + +static inline int +ksocknal_route_mask(void) +{ + if (!*ksocknal_tunables.ksnd_typed_conns) + return (1 << SOCKLND_CONN_ANY); + + return ((1 << SOCKLND_CONN_CONTROL) | + (1 << SOCKLND_CONN_BULK_IN) | + (1 << SOCKLND_CONN_BULK_OUT)); +} + static inline struct list_head * -ksocknal_nid2peerlist (ptl_nid_t nid) +ksocknal_nid2peerlist (lnet_nid_t nid) { unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size; return (&ksocknal_data.ksnd_peers [hash]); } +static inline void +ksocknal_conn_addref (ksock_conn_t *conn) +{ + LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); + atomic_inc(&conn->ksnc_conn_refcount); +} + +extern void ksocknal_queue_zombie_conn (ksock_conn_t *conn); + +static inline void +ksocknal_conn_decref (ksock_conn_t *conn) +{ + LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); + if (atomic_dec_and_test(&conn->ksnc_conn_refcount)) + ksocknal_queue_zombie_conn(conn); +} + static inline int -ksocknal_getconnsock (ksock_conn_t *conn) +ksocknal_connsock_addref (ksock_conn_t *conn) { int rc = -ESHUTDOWN; read_lock (&ksocknal_data.ksnd_global_lock); if (!conn->ksnc_closing) { + LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0); + atomic_inc(&conn->ksnc_sock_refcount); rc = 0; - cfs_get_file (KSN_CONN2FILE(conn)); } read_unlock (&ksocknal_data.ksnd_global_lock); @@ -398,61 +416,127 @@ ksocknal_getconnsock (ksock_conn_t *conn) } static inline void -ksocknal_putconnsock (ksock_conn_t *conn) +ksocknal_connsock_decref (ksock_conn_t *conn) +{ + LASSERT (atomic_read(&conn->ksnc_sock_refcount) > 0); + if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) { + LASSERT (conn->ksnc_closing); + libcfs_sock_release(conn->ksnc_sock); + conn->ksnc_sock = NULL; + } +} + +static inline void +ksocknal_tx_addref (ksock_tx_t *tx) +{ + LASSERT (atomic_read(&tx->tx_refcount) > 0); + atomic_inc(&tx->tx_refcount); +} + +extern void ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx); + +static inline void +ksocknal_tx_decref (ksock_tx_t *tx) { - cfs_put_file (KSN_CONN2FILE(conn)); + LASSERT (atomic_read(&tx->tx_refcount) > 0); + if (atomic_dec_and_test(&tx->tx_refcount)) + ksocknal_tx_done(NULL, tx); } -extern void ksocknal_put_route (ksock_route_t *route); -extern void ksocknal_put_peer (ksock_peer_t *peer); -extern ksock_peer_t *ksocknal_find_peer_locked (ptl_nid_t nid); -extern ksock_peer_t *ksocknal_get_peer 
(ptl_nid_t nid); -extern int ksocknal_del_route (ptl_nid_t nid, __u32 ipaddr, - int single, int keep_conn); -extern int ksocknal_create_conn (ksock_route_t *route, - struct socket *sock, int type); +static inline void +ksocknal_route_addref (ksock_route_t *route) +{ + LASSERT (atomic_read(&route->ksnr_refcount) > 0); + atomic_inc(&route->ksnr_refcount); +} + +extern void ksocknal_destroy_route (ksock_route_t *route); + +static inline void +ksocknal_route_decref (ksock_route_t *route) +{ + LASSERT (atomic_read (&route->ksnr_refcount) > 0); + if (atomic_dec_and_test(&route->ksnr_refcount)) + ksocknal_destroy_route (route); +} + +static inline void +ksocknal_peer_addref (ksock_peer_t *peer) +{ + LASSERT (atomic_read (&peer->ksnp_refcount) > 0); + atomic_inc(&peer->ksnp_refcount); +} + +extern void ksocknal_destroy_peer (ksock_peer_t *peer); + +static inline void +ksocknal_peer_decref (ksock_peer_t *peer) +{ + LASSERT (atomic_read (&peer->ksnp_refcount) > 0); + if (atomic_dec_and_test(&peer->ksnp_refcount)) + ksocknal_destroy_peer (peer); +} + +int ksocknal_startup (lnet_ni_t *ni); +void ksocknal_shutdown (lnet_ni_t *ni); +int ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); +int ksocknal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +int ksocknal_accept(lnet_ni_t *ni, cfs_socket_t *sock); + +extern int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip, int port); +extern ksock_peer_t *ksocknal_find_peer_locked (lnet_ni_t *ni, lnet_process_id_t id); +extern ksock_peer_t *ksocknal_find_peer (lnet_ni_t *ni, lnet_process_id_t id); +extern void ksocknal_peer_failed (ksock_peer_t *peer); +extern int ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route, + cfs_socket_t *sock, int type); extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why); extern void ksocknal_terminate_conn (ksock_conn_t *conn); extern void ksocknal_destroy_conn (ksock_conn_t *conn); -extern void ksocknal_put_conn (ksock_conn_t *conn); extern int ksocknal_close_stale_conns_locked (ksock_peer_t *peer, __u64 incarnation); extern int ksocknal_close_conn_and_siblings (ksock_conn_t *conn, int why); -extern int ksocknal_close_matching_conns (ptl_nid_t nid, __u32 ipaddr); +extern int ksocknal_close_matching_conns (lnet_process_id_t id, __u32 ipaddr); extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn); -extern void ksocknal_tx_done (ksock_tx_t *tx, int asynch); -extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); -extern void ksocknal_fmb_callback (void *arg, int error); -extern void ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive); +extern void ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error); +extern void ksocknal_notify (lnet_ni_t *ni, lnet_nid_t gw_nid, int alive); extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg); +extern void ksocknal_thread_fini (void); +extern ksock_route_t *ksocknal_find_connecting_route_locked (ksock_peer_t *peer); extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); extern int ksocknal_scheduler (void *arg); -extern int ksocknal_autoconnectd (void *arg); +extern int ksocknal_connd (void *arg); extern int ksocknal_reaper (void *arg); -extern int ksocknal_setup_sock (struct socket *sock); -extern int ksocknal_send_hello (ksock_conn_t *conn, __u32 *ipaddrs, int 
nipaddrs); -extern int ksocknal_recv_hello (ksock_conn_t *conn, - ptl_nid_t *nid, __u64 *incarnation, __u32 *ipaddrs); - -extern void ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn); -extern void ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn); -extern void ksocknal_lib_act_callback(struct socket *sock, ksock_conn_t *conn); -extern void ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn); +extern ksock_protocol_t * ksocknal_compat_protocol(ksock_hello_msg_t *); +extern int ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn, + lnet_nid_t peer_nid, ksock_hello_msg_t *hello); +extern int ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn, + ksock_hello_msg_t *hello, lnet_process_id_t *id, + __u64 *incarnation); +extern void ksocknal_read_callback(ksock_conn_t *conn); +extern void ksocknal_write_callback(ksock_conn_t *conn); + +extern int ksocknal_lib_zc_capable(cfs_socket_t *sock); +extern void ksocknal_lib_save_callback(cfs_socket_t *sock, ksock_conn_t *conn); +extern void ksocknal_lib_set_callback(cfs_socket_t *sock, ksock_conn_t *conn); +extern void ksocknal_lib_reset_callback(cfs_socket_t *sock, ksock_conn_t *conn); extern void ksocknal_lib_push_conn (ksock_conn_t *conn); extern void ksocknal_lib_bind_irq (unsigned int irq); extern int ksocknal_lib_get_conn_addrs (ksock_conn_t *conn); -extern unsigned int ksocknal_lib_sock_irq (struct socket *sock); -extern int ksocknal_lib_setup_sock (struct socket *so); +extern unsigned int ksocknal_lib_sock_irq (cfs_socket_t *sock); +extern int ksocknal_lib_setup_sock (cfs_socket_t *so); extern int ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx); extern int ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx); extern void ksocknal_lib_eager_ack (ksock_conn_t *conn); extern int ksocknal_lib_recv_iov (ksock_conn_t *conn); extern int ksocknal_lib_recv_kiov (ksock_conn_t *conn); -extern int ksocknal_lib_sock_write (struct socket *sock, - void *buffer, int nob); -extern int ksocknal_lib_sock_read (struct socket *sock, - void *buffer, int nob); extern int ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle); -extern int ksocknal_lib_connect_sock(struct socket **sockp, int *may_retry, - ksock_route_t *route, int local_port); + +extern int ksocknal_lib_tunables_init(void); +extern void ksocknal_lib_tunables_fini(void); + +extern void ksocknal_lib_csum_tx(ksock_tx_t *tx); diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index bd26027..7ca80cd 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -23,31 +23,64 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ -#include "socknal.h" +#include "socklnd.h" -/* - * LIB functions follow - * - */ -int -ksocknal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) +ksock_tx_t * +ksocknal_alloc_tx (int size) { - /* I would guess that if ksocknal_get_peer (nid) == NULL, - and we're not routing, then 'nid' is very distant :) */ - if (nal->libnal_ni.ni_pid.nid == nid) { - *dist = 0; - } else { - *dist = 1; + ksock_tx_t *tx = NULL; + + if (size == KSOCK_NOOP_TX_SIZE) { + /* searching for a noop tx in free list */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { + tx = list_entry(ksocknal_data.ksnd_idle_noop_txs.next, + ksock_tx_t, tx_list); + LASSERT(tx->tx_desc_size == size); + list_del(&tx->tx_list); + } + + spin_unlock(&ksocknal_data.ksnd_tx_lock); } + + if (tx == NULL) + LIBCFS_ALLOC(tx, size); - return 0; + if (tx == NULL) + return NULL; + + atomic_set(&tx->tx_refcount, 1); + tx->tx_desc_size = size; + atomic_inc(&ksocknal_data.ksnd_nactive_txs); + + return tx; +} + +void +ksocknal_free_tx (ksock_tx_t *tx) +{ + atomic_dec(&ksocknal_data.ksnd_nactive_txs); + + if (tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) { + /* it's a noop tx */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs); + + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } else { + LIBCFS_FREE(tx, tx->tx_desc_size); + } } void -ksocknal_free_ltx (ksock_ltx_t *ltx) +ksocknal_init_msg(ksock_msg_t *msg, int type) { - atomic_dec(&ksocknal_data.ksnd_nactive_ltxs); - PORTAL_FREE(ltx, ltx->ltx_desc_size); + msg->ksm_type = type; + msg->ksm_csum = 0; + msg->ksm_zc_req_cookie = 0; + msg->ksm_zc_ack_cookie = 0; } int @@ -90,7 +123,7 @@ ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) int ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) { - ptl_kiov_t *kiov = tx->tx_kiov; + lnet_kiov_t *kiov = tx->tx_kiov; int nob; int rc; @@ -102,7 +135,7 @@ ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) if (rc <= 0) /* sent nothing? */ return (rc); - + nob = rc; LASSERT (nob <= tx->tx_resid); tx->tx_resid -= nob; @@ -130,15 +163,14 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx) { int rc; int bufnob; - + if (ksocknal_data.ksnd_stall_tx != 0) { - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (cfs_time_seconds(ksocknal_data.ksnd_stall_tx)); + cfs_pause(cfs_time_seconds(ksocknal_data.ksnd_stall_tx)); } LASSERT (tx->tx_resid != 0); - rc = ksocknal_getconnsock (conn); + rc = ksocknal_connsock_addref(conn); if (rc != 0) { LASSERT (conn->ksnc_closing); return (-ESHUTDOWN); @@ -158,18 +190,18 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx) bufnob = SOCK_WMEM_QUEUED(conn->ksnc_sock); if (rc > 0) /* sent something? */ conn->ksnc_tx_bufnob += rc; /* account it */ - + if (bufnob < conn->ksnc_tx_bufnob) { /* allocated send buffer bytes < computed; infer * something got ACKed */ - conn->ksnc_tx_deadline = cfs_time_shift(ksocknal_tunables.ksnd_io_timeout); + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); conn->ksnc_tx_bufnob = bufnob; mb(); } if (rc <= 0) { /* Didn't write anything? 
*/ - unsigned long flags; ksock_sched_t *sched; if (rc == 0) /* some stacks return 0 instead of -EAGAIN */ @@ -181,8 +213,8 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx) /* Check if EAGAIN is due to memory pressure */ sched = conn->ksnc_scheduler; - spin_lock_irqsave(&sched->kss_lock, flags); - + spin_lock_bh (&sched->kss_lock); + if (!SOCK_TEST_NOSPACE(conn->ksnc_sock) && !conn->ksnc_tx_ready) { /* SOCK_NOSPACE is set when the socket fills @@ -196,7 +228,7 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx) rc = -ENOMEM; } - spin_unlock_irqrestore(&sched->kss_lock, flags); + spin_unlock_bh (&sched->kss_lock); break; } @@ -206,7 +238,7 @@ ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx) } while (tx->tx_resid != 0); - ksocknal_putconnsock (conn); + ksocknal_connsock_decref(conn); return (rc); } @@ -228,12 +260,13 @@ ksocknal_recv_iov (ksock_conn_t *conn) /* received something... */ nob = rc; - + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); - conn->ksnc_rx_deadline = cfs_time_shift (ksocknal_tunables.ksnd_io_timeout); + conn->ksnc_rx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); mb(); /* order with setting rx_started */ conn->ksnc_rx_started = 1; - + conn->ksnc_rx_nob_wanted -= nob; conn->ksnc_rx_nob_left -= nob; @@ -257,7 +290,7 @@ ksocknal_recv_iov (ksock_conn_t *conn) int ksocknal_recv_kiov (ksock_conn_t *conn) { - ptl_kiov_t *kiov = conn->ksnc_rx_kiov; + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; int nob; int rc; LASSERT (conn->ksnc_rx_nkiov > 0); @@ -265,21 +298,22 @@ ksocknal_recv_kiov (ksock_conn_t *conn) /* Never touch conn->ksnc_rx_kiov or change connection * status inside ksocknal_lib_recv_iov */ rc = ksocknal_lib_recv_kiov(conn); - + if (rc <= 0) return (rc); - + /* received something... */ nob = rc; conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); - conn->ksnc_rx_deadline = cfs_time_shift (ksocknal_tunables.ksnd_io_timeout); + conn->ksnc_rx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); mb(); /* order with setting rx_started */ conn->ksnc_rx_started = 1; conn->ksnc_rx_nob_wanted -= nob; conn->ksnc_rx_nob_left -= nob; - + do { LASSERT (conn->ksnc_rx_nkiov > 0); @@ -305,13 +339,12 @@ ksocknal_receive (ksock_conn_t *conn) * progress/completion. */ int rc; ENTRY; - + if (ksocknal_data.ksnd_stall_rx != 0) { - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout(cfs_time_seconds (ksocknal_data.ksnd_stall_rx)); + cfs_pause(cfs_time_seconds (ksocknal_data.ksnd_stall_rx)); } - rc = ksocknal_getconnsock (conn); + rc = ksocknal_connsock_addref(conn); if (rc != 0) { LASSERT (conn->ksnc_closing); return (-ESHUTDOWN); @@ -337,107 +370,128 @@ ksocknal_receive (ksock_conn_t *conn) /* Completed a fragment */ if (conn->ksnc_rx_nob_wanted == 0) { - /* Completed a message segment (header or payload) */ - if ((ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0 && - (conn->ksnc_rx_state == SOCKNAL_RX_BODY || - conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD)) { - /* Remind the socket to ack eagerly... */ - ksocknal_lib_eager_ack(conn); - } rc = 1; break; } } - ksocknal_putconnsock (conn); + ksocknal_connsock_decref(conn); RETURN (rc); } -#if SOCKNAL_ZC void -ksocknal_zc_callback (zccd_t *zcd) +ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx) { - ksock_tx_t *tx = KSOCK_ZCCD_2_TX(zcd); - ksock_sched_t *sched = tx->tx_conn->ksnc_scheduler; - unsigned long flags; + lnet_msg_t *lnetmsg = tx->tx_lnetmsg; + int rc = (tx->tx_resid == 0) ? 
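                /* tx_resid counts the bytes still unsent: zero means the
                 * whole message went out and the send completes
                 * successfully, anything else completes with -EIO */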
0 : -EIO; ENTRY; - /* Schedule tx for cleanup (can't do it now due to lock conflicts) */ + LASSERT(ni != NULL || tx->tx_conn != NULL); + + if (tx->tx_conn != NULL) + ksocknal_conn_decref(tx->tx_conn); - spin_lock_irqsave (&sched->kss_lock, flags); + if (ni == NULL && tx->tx_conn != NULL) + ni = tx->tx_conn->ksnc_peer->ksnp_ni; - list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list); - cfs_waitq_signal (&sched->kss_waitq); + ksocknal_free_tx (tx); + if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */ + lnet_finalize (ni, lnetmsg, rc); - spin_unlock_irqrestore (&sched->kss_lock, flags); EXIT; } -#endif void -ksocknal_tx_done (ksock_tx_t *tx, int asynch) +ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error) { - ksock_ltx_t *ltx; - ENTRY; + ksock_tx_t *tx; + + while (!list_empty (txlist)) { + tx = list_entry (txlist->next, ksock_tx_t, tx_list); + + if (error && tx->tx_lnetmsg != NULL) { + CDEBUG (D_NETERROR, "Deleting packet type %d len %d %s->%s\n", + le32_to_cpu (tx->tx_lnetmsg->msg_hdr.type), + le32_to_cpu (tx->tx_lnetmsg->msg_hdr.payload_length), + libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)), + libcfs_nid2str(le64_to_cpu (tx->tx_lnetmsg->msg_hdr.dest_nid))); + } else if (error) { + CDEBUG (D_NETERROR, "Deleting noop packet\n"); + } - if (tx->tx_conn != NULL) { -#if SOCKNAL_ZC - /* zero copy completion isn't always from - * process_transmit() so it needs to keep a ref on - * tx_conn... */ - if (asynch) - ksocknal_put_conn (tx->tx_conn); -#else - LASSERT (!asynch); -#endif - } + list_del (&tx->tx_list); - if (tx->tx_isfwd) { /* was a forwarded packet? */ - kpr_fwd_done (&ksocknal_data.ksnd_router, - KSOCK_TX_2_KPR_FWD_DESC (tx), - (tx->tx_resid == 0) ? 0 : -ECONNABORTED); - EXIT; - return; + LASSERT (atomic_read(&tx->tx_refcount) == 1); + ksocknal_tx_done (ni, tx); } +} + +int +ksocknal_zc_req(ksock_tx_t *tx) +{ + lnet_kiov_t *kiov = tx->tx_kiov; + int nkiov = tx->tx_nkiov; - /* local send */ - ltx = KSOCK_TX_2_KSOCK_LTX (tx); + if (!tx->tx_conn->ksnc_zc_capable) + return 0; - lib_finalize (&ksocknal_lib, ltx->ltx_private, ltx->ltx_cookie, - (tx->tx_resid == 0) ? PTL_OK : PTL_FAIL); + while (nkiov > 0) { + if (kiov->kiov_len >= *ksocknal_tunables.ksnd_zc_min_frag) + return 1; + --nkiov; + ++kiov; + } - ksocknal_free_ltx (ltx); - EXIT; + return 0; } -void -ksocknal_tx_launched (ksock_tx_t *tx) +static void +ksocknal_queue_zc_req(ksock_tx_t *tx) { -#if SOCKNAL_ZC - if (atomic_read (&tx->tx_zccd.zccd_count) != 1) { - ksock_conn_t *conn = tx->tx_conn; + ksock_peer_t *peer = tx->tx_conn->ksnc_peer; - /* zccd skbufs are still in-flight. First take a ref on - * conn, so it hangs about for ksocknal_tx_done... 
*/ - atomic_inc (&conn->ksnc_refcount); + /* assign cookie and queue tx to pending list, it will be + * released while getting ack, see ksocknal_handle_zc_ack() */ - /* ...then drop the initial ref on zccd, so the zero copy - * callback can occur */ - zccd_put (&tx->tx_zccd); - return; - } -#endif - /* Any zero-copy-ness (if any) has completed; I can complete the - * transmit now, avoiding an extra schedule */ - ksocknal_tx_done (tx, 0); + ksocknal_tx_addref(tx); /* +1 ref */ + + spin_lock(&peer->ksnp_lock); + + tx->tx_msg.ksm_zc_req_cookie = peer->ksnp_zc_next_cookie++; + list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list); + + spin_unlock(&peer->ksnp_lock); } +static void +ksocknal_dequeue_zc_req(ksock_tx_t *tx) +{ + ksock_peer_t *peer = tx->tx_conn->ksnc_peer; + + spin_lock(&peer->ksnp_lock); + + if (tx->tx_msg.ksm_zc_req_cookie != 0) { + /* not deleted by ksocknal_terminate_conn() */ + list_del(&tx->tx_zc_list); + } + + spin_unlock(&peer->ksnp_lock); + + if (tx->tx_msg.ksm_zc_req_cookie != 0) + ksocknal_tx_decref(tx); /* -1 ref */ +} int ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) { - unsigned long flags; int rc; + if (conn->ksnc_proto == &ksocknal_protocol_v2x && + tx->tx_msg.ksm_zc_req_cookie == 0 && + ksocknal_zc_req(tx)) { + /* wait for ACK */ + ksocknal_queue_zc_req(tx); + } + rc = ksocknal_transmit (conn, tx); CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc); @@ -446,7 +500,6 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) /* Sent everything OK */ LASSERT (rc == 0); - ksocknal_tx_launched (tx); return (0); } @@ -458,11 +511,11 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) counter++; /* exponential backoff warnings */ if ((counter & (-counter)) == counter) - CWARN("%d ENOMEM tx %p (%u allocated)\n", - counter, conn, atomic_read(&portal_kmemory)); + CWARN("%u ENOMEM tx %p (%u allocated)\n", + counter, conn, atomic_read(&libcfs_kmemory)); /* Queue on ksnd_enomem_conns for retry after a timeout */ - spin_lock_irqsave(&ksocknal_data.ksnd_reaper_lock, flags); + spin_lock_bh (&ksocknal_data.ksnd_reaper_lock); /* enomem list takes over scheduler's ref... */ LASSERT (conn->ksnc_tx_scheduled); @@ -472,8 +525,8 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) SOCKNAL_ENOMEM_RETRY), ksocknal_data.ksnd_reaper_waketime)) cfs_waitq_signal (&ksocknal_data.ksnd_reaper_waitq); - - spin_unlock_irqrestore(&ksocknal_data.ksnd_reaper_lock, flags); + + spin_unlock_bh (&ksocknal_data.ksnd_reaper_lock); return (rc); } @@ -494,76 +547,47 @@ ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) HIPQUAD(conn->ksnc_ipaddr), rc); break; } - CDEBUG(D_HA, "[%p] Error %d on write to "LPX64 + CDEBUG(D_NET, "[%p] Error %d on write to %s" " ip %d.%d.%d.%d:%d\n", conn, rc, - conn->ksnc_peer->ksnp_nid, + libcfs_id2str(conn->ksnc_peer->ksnp_id), HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); + } else { + /* closed, dequeue the ZC request if needed */ + ksocknal_dequeue_zc_req(tx); } - ksocknal_close_conn_and_siblings (conn, rc); - ksocknal_tx_launched (tx); + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings (conn, + (conn->ksnc_closing) ? 
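        /* report 0 for a conn that was already closing so its teardown is
         * not counted as a fresh failure; a real error (rc != 0) ends up
         * recorded in the peer's ksnp_error */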
0 : rc); return (rc); } void -ksocknal_launch_autoconnect_locked (ksock_route_t *route) +ksocknal_launch_connection_locked (ksock_route_t *route) { - unsigned long flags; /* called holding write lock on ksnd_global_lock */ - LASSERT (!route->ksnr_connecting); - - route->ksnr_connecting = 1; /* scheduling conn for autoconnectd */ - atomic_inc (&route->ksnr_refcount); /* extra ref for autoconnectd */ - - spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags); - - list_add_tail (&route->ksnr_connect_list, - &ksocknal_data.ksnd_autoconnectd_routes); - cfs_waitq_signal (&ksocknal_data.ksnd_autoconnectd_waitq); - - spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags); -} - -ksock_peer_t * -ksocknal_find_target_peer_locked (ksock_tx_t *tx, ptl_nid_t nid) -{ - char ipbuf[PTL_NALFMT_SIZE]; - ptl_nid_t target_nid; - int rc; - ksock_peer_t *peer = ksocknal_find_peer_locked (nid); - - if (peer != NULL) - return (peer); - - if (tx->tx_isfwd) { - CERROR ("Can't send packet to "LPX64 - " %s: routed target is not a peer\n", - nid, portals_nid2str(SOCKNAL, nid, ipbuf)); - return (NULL); - } - - rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, tx->tx_nob, - &target_nid); - if (rc != 0) { - CERROR ("Can't route to "LPX64" %s: router error %d\n", - nid, portals_nid2str(SOCKNAL, nid, ipbuf), rc); - return (NULL); - } - - peer = ksocknal_find_peer_locked (target_nid); - if (peer != NULL) - return (peer); - CERROR ("Can't send packet to "LPX64" %s: no peer entry\n", - target_nid, portals_nid2str(SOCKNAL, target_nid, ipbuf)); - return (NULL); + LASSERT (!route->ksnr_scheduled); + LASSERT (!route->ksnr_connecting); + LASSERT ((ksocknal_route_mask() & ~route->ksnr_connected) != 0); + + route->ksnr_scheduled = 1; /* scheduling conn for connd */ + ksocknal_route_addref(route); /* extra ref for connd */ + + spin_lock_bh (&ksocknal_data.ksnd_connd_lock); + + list_add_tail (&route->ksnr_connd_list, + &ksocknal_data.ksnd_connd_routes); + cfs_waitq_signal (&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh (&ksocknal_data.ksnd_connd_lock); } ksock_conn_t * -ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer) +ksocknal_find_conn_locked (int payload_nob, ksock_peer_t *peer) { struct list_head *tmp; ksock_conn_t *typed = NULL; @@ -574,6 +598,7 @@ ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer) list_for_each (tmp, &peer->ksnp_conns) { ksock_conn_t *c = list_entry(tmp, ksock_conn_t, ksnc_list); + int hdr_nob = 0; #if SOCKNAL_ROUND_ROBIN const int nob = 0; #else @@ -581,29 +606,40 @@ ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer) SOCK_WMEM_QUEUED(c->ksnc_sock); #endif LASSERT (!c->ksnc_closing); + LASSERT(c->ksnc_proto != NULL); if (fallback == NULL || nob < fnob) { fallback = c; fnob = nob; } - if (!ksocknal_tunables.ksnd_typed_conns) + if (!*ksocknal_tunables.ksnd_typed_conns) continue; + if (payload_nob == 0) { + /* noop packet */ + hdr_nob = offsetof(ksock_msg_t, ksm_u); + } else { + /* lnet packet */ + hdr_nob = (c->ksnc_proto == &ksocknal_protocol_v2x)? 
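ksocknal_launch_connection_locked() above asserts (ksocknal_route_mask() & ~route->ksnr_connected) != 0, i.e. the route still lacks at least one of the connection types it is expected to provide; ksnr_connected is a bitmask with one bit per established type. A sketch of that bookkeeping, with invented bit names (typed configurations want three connections, untyped want one):

    #define CONN_ANY_BIT      (1 << 0)
    #define CONN_CONTROL_BIT  (1 << 1)
    #define CONN_BULK_IN_BIT  (1 << 2)
    #define CONN_BULK_OUT_BIT (1 << 3)

    /* Which connection types this configuration wants on every route. */
    static int route_mask(int typed_conns)
    {
            return typed_conns ?
                   (CONN_CONTROL_BIT | CONN_BULK_IN_BIT | CONN_BULK_OUT_BIT) :
                   CONN_ANY_BIT;
    }

    /* Nonzero while some wanted connection type is still missing. */
    static int needs_connecting(int connected_mask, int typed_conns)
    {
            return (route_mask(typed_conns) & ~connected_mask) != 0;
    }
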
+ offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_payload): + sizeof(lnet_hdr_t); + } + switch (c->ksnc_type) { default: CERROR("ksnc_type bad: %u\n", c->ksnc_type); LBUG(); - case SOCKNAL_CONN_ANY: + case SOCKLND_CONN_ANY: break; - case SOCKNAL_CONN_BULK_IN: + case SOCKLND_CONN_BULK_IN: continue; - case SOCKNAL_CONN_BULK_OUT: - if (tx->tx_nob < ksocknal_tunables.ksnd_min_bulk) + case SOCKLND_CONN_BULK_OUT: + if ((hdr_nob + payload_nob) < *ksocknal_tunables.ksnd_min_bulk) continue; break; - case SOCKNAL_CONN_CONTROL: - if (tx->tx_nob >= ksocknal_tunables.ksnd_min_bulk) + case SOCKLND_CONN_CONTROL: + if ((hdr_nob + payload_nob) >= *ksocknal_tunables.ksnd_min_bulk) continue; break; } @@ -628,54 +664,179 @@ ksocknal_find_conn_locked (ksock_tx_t *tx, ksock_peer_t *peer) } void +ksocknal_next_mono_tx(ksock_conn_t *conn) +{ + ksock_tx_t *tx = conn->ksnc_tx_mono; + + /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */ + LASSERT(conn->ksnc_proto == &ksocknal_protocol_v2x); + LASSERT(!list_empty(&conn->ksnc_tx_queue)); + LASSERT(tx != NULL); + + if (tx->tx_list.next == &conn->ksnc_tx_queue) { + /* no more packets queued */ + conn->ksnc_tx_mono = NULL; + } else { + conn->ksnc_tx_mono = list_entry(tx->tx_list.next, ksock_tx_t, tx_list); + LASSERT(conn->ksnc_tx_mono->tx_msg.ksm_type == tx->tx_msg.ksm_type); + } +} + +int +ksocknal_piggyback_zcack(ksock_conn_t *conn, __u64 cookie) +{ + ksock_tx_t *tx = conn->ksnc_tx_mono; + + /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */ + + if (tx == NULL) + return 0; + + if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) { + /* tx is noop zc-ack, can't piggyback zc-ack cookie */ + return 0; + } + + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET); + LASSERT(tx->tx_msg.ksm_zc_ack_cookie == 0); + + /* piggyback the zc-ack cookie */ + tx->tx_msg.ksm_zc_ack_cookie = cookie; + ksocknal_next_mono_tx(conn); + + return 1; +} + +void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) { - unsigned long flags; ksock_sched_t *sched = conn->ksnc_scheduler; + ksock_msg_t *msg = &tx->tx_msg; + ksock_tx_t *ztx; + int bufnob = 0; /* called holding global lock (read or irq-write) and caller may * not have dropped this lock between finding conn and calling me, * so we don't need the {get,put}connsock dance to deref * ksnc_sock... */ LASSERT(!conn->ksnc_closing); - LASSERT(tx->tx_resid == tx->tx_nob); - CDEBUG (D_NET, "Sending to "LPX64" ip %d.%d.%d.%d:%d\n", - conn->ksnc_peer->ksnp_nid, + CDEBUG (D_NET, "Sending to %s ip %d.%d.%d.%d:%d\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); + conn->ksnc_proto->pro_pack(tx); + + /* Ensure the frags we've been given EXACTLY match the number of + * bytes we want to send. Many TCP/IP stacks disregard any total + * size parameters passed to them and just look at the frags. + * + * We always expect at least 1 mapped fragment containing the + * complete ksocknal message header. */ + LASSERT (lnet_iov_nob (tx->tx_niov, tx->tx_iov) + + lnet_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob); + LASSERT (tx->tx_niov >= 1); + LASSERT (tx->tx_resid == tx->tx_nob); + + CDEBUG (D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n", + tx, (tx->tx_lnetmsg != NULL)? 
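ksocknal_find_conn_locked() sizes the message first (V2 LNET traffic pays for the ksock_msg_t header, NOOPs for just its prefix, V1 for a bare lnet_hdr_t) and then steers by connection type: CONTROL takes only small messages, BULK_OUT only large ones, ANY takes anything, with the least-backlogged connection kept as a fallback. A sketch of just the type filter, with invented names and a caller-supplied min_bulk threshold:

    enum conn_type { CONN_ANY, CONN_CONTROL, CONN_BULK_IN, CONN_BULK_OUT };

    /* May a message of 'nob' bytes (header + payload) use this
     * connection type?  Mirrors the switch in the patch. */
    static int type_usable(enum conn_type type, int nob, int min_bulk)
    {
            switch (type) {
            case CONN_ANY:
                    return 1;                  /* untyped: anything goes */
            case CONN_BULK_IN:
                    return 0;                  /* inbound-only, never send */
            case CONN_BULK_OUT:
                    return nob >= min_bulk;    /* large messages only */
            case CONN_CONTROL:
                    return nob < min_bulk;     /* small messages only */
            }
            return 0;
    }
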
tx->tx_lnetmsg->msg_hdr.type: + KSOCK_MSG_NOOP, + tx->tx_nob, tx->tx_niov, tx->tx_nkiov); + atomic_add (tx->tx_nob, &conn->ksnc_tx_nob); tx->tx_conn = conn; + ksocknal_conn_addref(conn); /* +1 ref for tx */ -#if SOCKNAL_ZC - zccd_init (&tx->tx_zccd, ksocknal_zc_callback); - /* NB this sets 1 ref on zccd, so the callback can only occur after - * I've released this ref. */ -#endif - spin_lock_irqsave (&sched->kss_lock, flags); + /* + * NB Darwin: SOCK_WMEM_QUEUED()->sock_getsockopt() will take + * a blockable lock(socket lock), so SOCK_WMEM_QUEUED can't be + * put in spinlock. + */ + bufnob = SOCK_WMEM_QUEUED(conn->ksnc_sock); + spin_lock_bh (&sched->kss_lock); - if (list_empty(&conn->ksnc_tx_queue) && - SOCK_WMEM_QUEUED(conn->ksnc_sock) == 0) { + if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) { /* First packet starts the timeout */ - conn->ksnc_tx_deadline = cfs_time_shift(ksocknal_tunables.ksnd_io_timeout); + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); conn->ksnc_tx_bufnob = 0; mb(); /* order with adding to tx_queue */ } - list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); + ztx = NULL; + + if (msg->ksm_type == KSOCK_MSG_NOOP) { + /* The packet is noop ZC ACK, try to piggyback the ack_cookie + * on a normal packet so I don't need to send it */ + LASSERT(msg->ksm_zc_req_cookie == 0); + LASSERT(msg->ksm_zc_ack_cookie != 0); + + if (conn->ksnc_tx_mono != NULL) { + if (ksocknal_piggyback_zcack(conn, msg->ksm_zc_ack_cookie)) { + /* zc-ack cookie is piggybacked */ + atomic_sub (tx->tx_nob, &conn->ksnc_tx_nob); + ztx = tx; /* Put to freelist later */ + } else { + /* no packet can piggyback zc-ack cookie */ + list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); + } + } else { + /* It's the first mono-packet */ + conn->ksnc_tx_mono = tx; + list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); + } + + } else { + /* It's a normal packet - can it piggback a noop zc-ack that + * has been queued already? 
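ksocknal_queue_tx_locked() tries hard never to send a dedicated NOOP: if the next unsent V2 message on the queue is a normal LNET message whose cookie slot is free, the pending ZC-ACK rides along in its ksm_zc_ack_cookie field. A userspace sketch of that decision, with invented field names:

    #include <stddef.h>
    #include <stdint.h>

    struct out_msg {
            int      is_noop;        /* NOOPs carry only an ack cookie */
            uint64_t zc_ack_cookie;  /* 0 => slot free */
    };

    /* Try to fold an ack cookie into the next queued message; returns 1
     * on success, 0 if the caller must send a dedicated NOOP instead. */
    static int piggyback_ack(struct out_msg *next_unsent, uint64_t cookie)
    {
            if (next_unsent == NULL || next_unsent->is_noop)
                    return 0;        /* nothing suitable queued */
            if (next_unsent->zc_ack_cookie != 0)
                    return 0;        /* cookie slot already taken */
            next_unsent->zc_ack_cookie = cookie;
            return 1;
    }
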
*/ + LASSERT(msg->ksm_zc_ack_cookie == 0); + + if (conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x packet */ + conn->ksnc_tx_mono != NULL) { + if (conn->ksnc_tx_mono->tx_msg.ksm_type == KSOCK_MSG_NOOP) { + /* There is a noop zc-ack can be piggybacked */ + ztx = conn->ksnc_tx_mono; + + msg->ksm_zc_ack_cookie = ztx->tx_msg.ksm_zc_ack_cookie; + ksocknal_next_mono_tx(conn); + + /* use tx to replace the noop zc-ack packet, ztx will + * be put to freelist later */ + list_add(&tx->tx_list, &ztx->tx_list); + list_del(&ztx->tx_list); + + atomic_sub (ztx->tx_nob, &conn->ksnc_tx_nob); + } else { + /* no noop zc-ack packet, just enqueue it */ + LASSERT(conn->ksnc_tx_mono->tx_msg.ksm_type == KSOCK_MSG_LNET); + list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); + } + + } else if (conn->ksnc_proto == &ksocknal_protocol_v2x) { + /* it's the first mono-packet, enqueue it */ + conn->ksnc_tx_mono = tx; + list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); + } else { + /* V1.x packet, just enqueue it */ + list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); + } + } + if (ztx != NULL) + list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs); + if (conn->ksnc_tx_ready && /* able to send */ !conn->ksnc_tx_scheduled) { /* not scheduled to send */ /* +1 ref for scheduler */ - atomic_inc (&conn->ksnc_refcount); + ksocknal_conn_addref(conn); list_add_tail (&conn->ksnc_tx_list, &sched->kss_tx_conns); conn->ksnc_tx_scheduled = 1; cfs_waitq_signal (&sched->kss_waitq); } - spin_unlock_irqrestore (&sched->kss_lock, flags); + spin_unlock_bh (&sched->kss_lock); } ksock_route_t * @@ -683,33 +844,28 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer) { struct list_head *tmp; ksock_route_t *route; - int bits; - + list_for_each (tmp, &peer->ksnp_routes) { route = list_entry (tmp, ksock_route_t, ksnr_list); - bits = route->ksnr_connected; - if (ksocknal_tunables.ksnd_typed_conns) { - /* All typed connections established? */ - if ((bits & KSNR_TYPED_ROUTES) == KSNR_TYPED_ROUTES) - continue; - } else { - /* Untyped connection established? */ - if ((bits & (1 << SOCKNAL_CONN_ANY)) != 0) - continue; - } + LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); - /* connection being established? */ - if (route->ksnr_connecting) + if (route->ksnr_scheduled) /* connections being established */ continue; - /* too soon to retry this guy? */ - if (!cfs_time_aftereq (cfs_time_current(), route->ksnr_timeout)) + /* all route types connected ? */ + if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0) continue; + /* too soon to retry this guy? */ + if (!(route->ksnr_retry_interval == 0 || /* first attempt */ + cfs_time_aftereq (cfs_time_current(), + route->ksnr_timeout))) + continue; + return (route); } - + return (NULL); } @@ -722,687 +878,560 @@ ksocknal_find_connecting_route_locked (ksock_peer_t *peer) list_for_each (tmp, &peer->ksnp_routes) { route = list_entry (tmp, ksock_route_t, ksnr_list); - if (route->ksnr_connecting) + LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); + + if (route->ksnr_scheduled) return (route); } - + return (NULL); } int -ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) +ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id) { - unsigned long flags; ksock_peer_t *peer; ksock_conn_t *conn; ksock_route_t *route; rwlock_t *g_lock; - - /* Ensure the frags we've been given EXACTLY match the number of - * bytes we want to send. Many TCP/IP stacks disregard any total - * size parameters passed to them and just look at the frags. 
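The converse case above: when a normal message arrives and a NOOP ZC-ACK is already queued, the message steals the NOOP's cookie and then takes its queue position with a list_add()/list_del() pair. The swap works because list_add() links the new entry immediately after the old one; in miniature, with a bare doubly-linked ring:

    struct node { struct node *prev, *next; };

    static void node_add_after(struct node *n, struct node *pos)
    {
            n->prev = pos;
            n->next = pos->next;
            pos->next->prev = n;
            pos->next = n;
    }

    static void node_del(struct node *n)
    {
            n->prev->next = n->next;
            n->next->prev = n->prev;
    }

    /* Put 'repl' exactly where 'old' sat, then unlink 'old'. */
    static void node_replace(struct node *old, struct node *repl)
    {
            node_add_after(repl, old);   /* repl now follows old */
            node_del(old);               /* old drops out; repl holds its slot */
    }
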
- * - * We always expect at least 1 mapped fragment containing the - * complete portals header. */ - LASSERT (lib_iov_nob (tx->tx_niov, tx->tx_iov) + - lib_kiov_nob (tx->tx_nkiov, tx->tx_kiov) == tx->tx_nob); - LASSERT (tx->tx_niov >= 1); - LASSERT (tx->tx_iov[0].iov_len >= sizeof (ptl_hdr_t)); - - CDEBUG (D_NET, "packet %p type %d, nob %d niov %d nkiov %d\n", - tx, ((ptl_hdr_t *)tx->tx_iov[0].iov_base)->type, - tx->tx_nob, tx->tx_niov, tx->tx_nkiov); - - tx->tx_conn = NULL; /* only set when assigned a conn */ - tx->tx_resid = tx->tx_nob; - tx->tx_hdr = (ptl_hdr_t *)tx->tx_iov[0].iov_base; + int retry; + int rc; + + LASSERT (tx->tx_conn == NULL); + LASSERT (tx->tx_lnetmsg != NULL); g_lock = &ksocknal_data.ksnd_global_lock; + + for (retry = 0;; retry = 1) { #if !SOCKNAL_ROUND_ROBIN - read_lock (g_lock); - - peer = ksocknal_find_target_peer_locked (tx, nid); - if (peer == NULL) { - read_unlock (g_lock); - return (-EHOSTUNREACH); - } - - if (ksocknal_find_connectable_route_locked(peer) == NULL) { - conn = ksocknal_find_conn_locked (tx, peer); - if (conn != NULL) { - /* I've got no autoconnect routes that need to be - * connecting and I do have an actual connection... */ - ksocknal_queue_tx_locked (tx, conn); - read_unlock (g_lock); - return (0); + read_lock (g_lock); + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) { + if (ksocknal_find_connectable_route_locked(peer) == NULL) { + conn = ksocknal_find_conn_locked (tx->tx_lnetmsg->msg_len, peer); + if (conn != NULL) { + /* I've got no routes that need to be + * connecting and I do have an actual + * connection... */ + ksocknal_queue_tx_locked (tx, conn); + read_unlock (g_lock); + return (0); + } + } } - } - - /* I'll need a write lock... */ - read_unlock (g_lock); + + /* I'll need a write lock... */ + read_unlock (g_lock); #endif - write_lock_irqsave(g_lock, flags); + write_lock_bh (g_lock); + + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) + break; + + write_unlock_bh (g_lock); - peer = ksocknal_find_target_peer_locked (tx, nid); - if (peer == NULL) { - write_unlock_irqrestore(g_lock, flags); - return (-EHOSTUNREACH); + if ((id.pid & LNET_PID_USERFLAG) != 0) { + CERROR("Refusing to create a connection to " + "userspace process %s\n", libcfs_id2str(id)); + return -EHOSTUNREACH; + } + + if (retry) { + CERROR("Can't find peer %s\n", libcfs_id2str(id)); + return -EHOSTUNREACH; + } + + rc = ksocknal_add_peer(ni, id, + LNET_NIDADDR(id.nid), + lnet_acceptor_port()); + if (rc != 0) { + CERROR("Can't add peer %s: %d\n", + libcfs_id2str(id), rc); + return rc; + } } for (;;) { - /* launch any/all autoconnections that need it */ + /* launch any/all connections that need it */ route = ksocknal_find_connectable_route_locked (peer); if (route == NULL) break; - ksocknal_launch_autoconnect_locked (route); + ksocknal_launch_connection_locked (route); } - conn = ksocknal_find_conn_locked (tx, peer); + conn = ksocknal_find_conn_locked (tx->tx_lnetmsg->msg_len, peer); if (conn != NULL) { /* Connection exists; queue message on it */ ksocknal_queue_tx_locked (tx, conn); - write_unlock_irqrestore (g_lock, flags); + write_unlock_bh (g_lock); return (0); } - route = ksocknal_find_connecting_route_locked (peer); - if (route != NULL) { - /* At least 1 connection is being established; queue the - * message... 
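ksocknal_launch_packet() takes ksnd_global_lock read-side for the common case and only falls back to the write lock when it may have to create the peer; because an rwlock cannot be upgraded in place, the peer is looked up again once the write lock is held. A pthreads sketch of that read-mostly pattern, with a toy peer table standing in for the real hash:

    #include <pthread.h>
    #include <stddef.h>

    static pthread_rwlock_t g_lock = PTHREAD_RWLOCK_INITIALIZER;
    static void *peer_table[16];                  /* toy peer store */

    static void *lookup_peer(int id)
    {
            return peer_table[id & 15];
    }

    static void *create_peer(int id)
    {
            return peer_table[id & 15] = &peer_table[id & 15];
    }

    static void *find_or_create_peer(int id)
    {
            void *peer;

            pthread_rwlock_rdlock(&g_lock);       /* concurrent fast path */
            peer = lookup_peer(id);
            pthread_rwlock_unlock(&g_lock);
            if (peer != NULL)
                    return peer;

            pthread_rwlock_wrlock(&g_lock);       /* exclusive slow path */
            peer = lookup_peer(id);               /* re-check: lost a race? */
            if (peer == NULL)
                    peer = create_peer(id);
            pthread_rwlock_unlock(&g_lock);
            return peer;
    }
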
*/ + if (peer->ksnp_accepting > 0 || + ksocknal_find_connecting_route_locked (peer) != NULL) { + /* Queue the message until a connection is established */ list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue); - write_unlock_irqrestore (g_lock, flags); - return (0); + write_unlock_bh (g_lock); + return 0; } + + write_unlock_bh (g_lock); - write_unlock_irqrestore (g_lock, flags); + /* NB Routes may be ignored if connections to them failed recently */ + CDEBUG(D_NETERROR, "No usable routes to %s\n", libcfs_id2str(id)); return (-EHOSTUNREACH); } -ptl_err_t -ksocknal_sendmsg(lib_nal_t *nal, - void *private, - lib_msg_t *cookie, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - struct iovec *payload_iov, - ptl_kiov_t *payload_kiov, - size_t payload_offset, - size_t payload_nob) +int +ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) { - ksock_ltx_t *ltx; - int desc_size; - int rc; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + unsigned int payload_niov = lntmsg->msg_niov; + struct iovec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + ksock_tx_t *tx; + int desc_size; + int rc; /* NB 'private' is different depending on what we're sending. * Just ignore it... */ - CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64 - " pid %d\n", payload_nob, payload_niov, nid , pid); + CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); LASSERT (payload_nob == 0 || payload_niov > 0); - LASSERT (payload_niov <= PTL_MD_MAX_IOV); - - /* It must be OK to kmap() if required */ - LASSERT (payload_kiov == NULL || !in_interrupt ()); + LASSERT (payload_niov <= LNET_MAX_IOV); /* payload is either all vaddrs or all pages */ LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); - + LASSERT (!in_interrupt ()); + if (payload_iov != NULL) - desc_size = offsetof(ksock_ltx_t, ltx_iov[1 + payload_niov]); + desc_size = offsetof(ksock_tx_t, + tx_frags.virt.iov[1 + payload_niov]); else - desc_size = offsetof(ksock_ltx_t, ltx_kiov[payload_niov]); - - if (in_interrupt() || - type == PTL_MSG_ACK || - type == PTL_MSG_REPLY) { - /* Can't block if in interrupt or responding to an incoming - * message */ - PORTAL_ALLOC_ATOMIC(ltx, desc_size); - } else { - PORTAL_ALLOC(ltx, desc_size); - } - - if (ltx == NULL) { - CERROR("Can't allocate tx desc type %d size %d %s\n", - type, desc_size, in_interrupt() ? 
"(intr)" : ""); - return (PTL_NO_SPACE); + desc_size = offsetof(ksock_tx_t, + tx_frags.paged.kiov[payload_niov]); + + tx = ksocknal_alloc_tx(desc_size); + if (tx == NULL) { + CERROR("Can't allocate tx desc type %d size %d\n", + type, desc_size); + return (-ENOMEM); } - atomic_inc(&ksocknal_data.ksnd_nactive_ltxs); - - ltx->ltx_desc_size = desc_size; - - /* We always have 1 mapped frag for the header */ - ltx->ltx_tx.tx_iov = ltx->ltx_iov; - ltx->ltx_iov[0].iov_base = <x->ltx_hdr; - ltx->ltx_iov[0].iov_len = sizeof(*hdr); - ltx->ltx_hdr = *hdr; - - ltx->ltx_private = private; - ltx->ltx_cookie = cookie; - - ltx->ltx_tx.tx_isfwd = 0; - ltx->ltx_tx.tx_nob = sizeof (*hdr) + payload_nob; + tx->tx_conn = NULL; /* set when assigned a conn */ + tx->tx_lnetmsg = lntmsg; if (payload_iov != NULL) { - /* payload is all mapped */ - ltx->ltx_tx.tx_kiov = NULL; - ltx->ltx_tx.tx_nkiov = 0; - - ltx->ltx_tx.tx_niov = - 1 + lib_extract_iov(payload_niov, <x->ltx_iov[1], - payload_niov, payload_iov, - payload_offset, payload_nob); + tx->tx_kiov = NULL; + tx->tx_nkiov = 0; + tx->tx_iov = tx->tx_frags.virt.iov; + tx->tx_niov = 1 + + lnet_extract_iov(payload_niov, &tx->tx_iov[1], + payload_niov, payload_iov, + payload_offset, payload_nob); } else { - /* payload is all pages */ - ltx->ltx_tx.tx_niov = 1; - - ltx->ltx_tx.tx_kiov = ltx->ltx_kiov; - ltx->ltx_tx.tx_nkiov = - lib_extract_kiov(payload_niov, ltx->ltx_kiov, - payload_niov, payload_kiov, - payload_offset, payload_nob); + tx->tx_niov = 1; + tx->tx_iov = &tx->tx_frags.paged.iov; + tx->tx_kiov = tx->tx_frags.paged.kiov; + tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov, + payload_niov, payload_kiov, + payload_offset, payload_nob); } - rc = ksocknal_launch_packet(<x->ltx_tx, nid); - if (rc == 0) - return (PTL_OK); - - ksocknal_free_ltx(ltx); - return (PTL_FAIL); -} - -ptl_err_t -ksocknal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int payload_niov, struct iovec *payload_iov, - size_t payload_offset, size_t payload_len) -{ - return (ksocknal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, payload_iov, NULL, - payload_offset, payload_len)); -} - -ptl_err_t -ksocknal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int payload_niov, ptl_kiov_t *payload_kiov, - size_t payload_offset, size_t payload_len) -{ - return (ksocknal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, NULL, payload_kiov, - payload_offset, payload_len)); -} - -void -ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd) -{ - ptl_nid_t nid = fwd->kprfd_gateway_nid; - ksock_ftx_t *ftx = (ksock_ftx_t *)&fwd->kprfd_scratch; - int rc; - - CDEBUG (D_NET, "Forwarding [%p] -> "LPX64" ("LPX64"))\n", fwd, - fwd->kprfd_gateway_nid, fwd->kprfd_target_nid); + ksocknal_init_msg(&tx->tx_msg, KSOCK_MSG_LNET); - /* I'm the gateway; must be the last hop */ - if (nid == ksocknal_lib.libnal_ni.ni_pid.nid) - nid = fwd->kprfd_target_nid; - - /* setup iov for hdr */ - ftx->ftx_iov.iov_base = fwd->kprfd_hdr; - ftx->ftx_iov.iov_len = sizeof(ptl_hdr_t); - - ftx->ftx_tx.tx_isfwd = 1; /* This is a forwarding packet */ - ftx->ftx_tx.tx_nob = sizeof(ptl_hdr_t) + fwd->kprfd_nob; - ftx->ftx_tx.tx_niov = 1; - ftx->ftx_tx.tx_iov = &ftx->ftx_iov; - ftx->ftx_tx.tx_nkiov = fwd->kprfd_niov; - ftx->ftx_tx.tx_kiov = fwd->kprfd_kiov; - - rc = ksocknal_launch_packet (&ftx->ftx_tx, nid); - if (rc != 0) - kpr_fwd_done 
(&ksocknal_data.ksnd_router, fwd, rc); + /* The first fragment will be set later in pro_pack */ + rc = ksocknal_launch_packet(ni, tx, target); + if (rc == 0) + return (0); + + ksocknal_free_tx(tx); + return (-EIO); } int ksocknal_thread_start (int (*fn)(void *arg), void *arg) { long pid = cfs_kernel_thread (fn, arg, 0); - unsigned long flags; if (pid < 0) return ((int)pid); - write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags); + write_lock_bh (&ksocknal_data.ksnd_global_lock); ksocknal_data.ksnd_nthreads++; - write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags); + write_unlock_bh (&ksocknal_data.ksnd_global_lock); return (0); } void ksocknal_thread_fini (void) { - unsigned long flags; - - write_lock_irqsave(&ksocknal_data.ksnd_global_lock, flags); + write_lock_bh (&ksocknal_data.ksnd_global_lock); ksocknal_data.ksnd_nthreads--; - write_unlock_irqrestore(&ksocknal_data.ksnd_global_lock, flags); + write_unlock_bh (&ksocknal_data.ksnd_global_lock); } -void -ksocknal_fmb_callback (void *arg, int error) +int +ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) { - ksock_fmb_t *fmb = (ksock_fmb_t *)arg; - ksock_fmb_pool_t *fmp = fmb->fmb_pool; - ptl_hdr_t *hdr = &fmb->fmb_hdr; - ksock_conn_t *conn = NULL; - ksock_sched_t *sched; - unsigned long flags; - char ipbuf[PTL_NALFMT_SIZE]; - char ipbuf2[PTL_NALFMT_SIZE]; - - if (error != 0) - CERROR("Failed to route packet from " - LPX64" %s to "LPX64" %s: %d\n", - le64_to_cpu(hdr->src_nid), - portals_nid2str(SOCKNAL, le64_to_cpu(hdr->src_nid), ipbuf), - le64_to_cpu(hdr->dest_nid), - portals_nid2str(SOCKNAL, le64_to_cpu(hdr->dest_nid), ipbuf2), - error); - else - CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": OK\n", - le64_to_cpu(hdr->src_nid), le64_to_cpu(hdr->dest_nid)); - - /* drop peer ref taken on init */ - ksocknal_put_peer (fmb->fmb_peer); + static char ksocknal_slop_buffer[4096]; - spin_lock_irqsave (&fmp->fmp_lock, flags); + int nob; + unsigned int niov; + int skipped; - list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs); - fmp->fmp_nactive_fmbs--; + LASSERT(conn->ksnc_proto != NULL); - if (!list_empty (&fmp->fmp_blocked_conns)) { - conn = list_entry (fmb->fmb_pool->fmp_blocked_conns.next, - ksock_conn_t, ksnc_rx_list); - list_del (&conn->ksnc_rx_list); + if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0) { + /* Remind the socket to ack eagerly... 
*/ + ksocknal_lib_eager_ack(conn); } - spin_unlock_irqrestore (&fmp->fmp_lock, flags); + if (nob_to_skip == 0) { /* right at next packet boundary now */ + conn->ksnc_rx_started = 0; + mb (); /* racing with timeout thread */ + + switch (conn->ksnc_proto->pro_version) { + case KSOCK_PROTO_V2: + conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER; + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg; + + if (conn->ksnc_type == SOCKLND_CONN_BULK_IN) { + /* always expect lnet_hdr_t to avoid extra-read for better performance */ + conn->ksnc_rx_nob_wanted = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_payload); + conn->ksnc_rx_nob_left = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_payload); + conn->ksnc_rx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_payload); + + } else { + /* can't make sure if it's noop or not */ + conn->ksnc_rx_nob_wanted = offsetof(ksock_msg_t, ksm_u); + conn->ksnc_rx_nob_left = offsetof(ksock_msg_t, ksm_u); + conn->ksnc_rx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u); + } + break; - if (conn == NULL) - return; + case KSOCK_PROTO_V1: + /* Receiving bare lnet_hdr_t */ + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(lnet_hdr_t); + conn->ksnc_rx_nob_left = sizeof(lnet_hdr_t); - CDEBUG (D_NET, "Scheduling conn %p\n", conn); - LASSERT (conn->ksnc_rx_scheduled); - LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP); + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg; + conn->ksnc_rx_iov[0].iov_len = sizeof (lnet_hdr_t); + break; - conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; + default: + LBUG (); + } + conn->ksnc_rx_niov = 1; - sched = conn->ksnc_scheduler; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_csum = ~0; + return (1); + } + + /* Set up to skip as much as possible now. 
If there's more left + * (ran out of iov entries) we'll get called again */ + + conn->ksnc_rx_state = SOCKNAL_RX_SLOP; + conn->ksnc_rx_nob_left = nob_to_skip; + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + skipped = 0; + niov = 0; + + do { + nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer)); - spin_lock_irqsave (&sched->kss_lock, flags); + conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer; + conn->ksnc_rx_iov[niov].iov_len = nob; + niov++; + skipped += nob; + nob_to_skip -=nob; - list_add_tail (&conn->ksnc_rx_list, &sched->kss_rx_conns); - cfs_waitq_signal (&sched->kss_waitq); + } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */ + niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec)); - spin_unlock_irqrestore (&sched->kss_lock, flags); + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_nob_wanted = skipped; + return (0); } -ksock_fmb_t * -ksocknal_get_idle_fmb (ksock_conn_t *conn) +/* (Sink) handle incoming ZC request from sender */ +static int +ksocknal_handle_zc_req(ksock_peer_t *peer, __u64 cookie) { - int payload_nob = conn->ksnc_rx_nob_left; - unsigned long flags; - ksock_fmb_pool_t *pool; - ksock_fmb_t *fmb; + ksock_conn_t *conn; + ksock_tx_t *tx; + ksock_sched_t *sched; + int rc; - LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); - LASSERT (kpr_routing(&ksocknal_data.ksnd_router)); + read_lock (&ksocknal_data.ksnd_global_lock); - if (payload_nob <= SOCKNAL_SMALL_FWD_PAGES * CFS_PAGE_SIZE) - pool = &ksocknal_data.ksnd_small_fmp; - else - pool = &ksocknal_data.ksnd_large_fmp; + conn = ksocknal_find_conn_locked (0, peer); + if (conn == NULL) { + read_unlock (&ksocknal_data.ksnd_global_lock); + CERROR("Can't find connection to send zcack.\n"); + return -ECONNRESET; + } - spin_lock_irqsave (&pool->fmp_lock, flags); + sched = conn->ksnc_scheduler; - if (!list_empty (&pool->fmp_idle_fmbs)) { - fmb = list_entry(pool->fmp_idle_fmbs.next, - ksock_fmb_t, fmb_list); - list_del (&fmb->fmb_list); - pool->fmp_nactive_fmbs++; - spin_unlock_irqrestore (&pool->fmp_lock, flags); + spin_lock_bh (&sched->kss_lock); + rc = ksocknal_piggyback_zcack(conn, cookie); + spin_unlock_bh (&sched->kss_lock); - return (fmb); + read_unlock (&ksocknal_data.ksnd_global_lock); + if (rc) { + /* Ack cookie is piggybacked */ + return 0; } - /* deschedule until fmb free */ + tx = ksocknal_alloc_tx(KSOCK_NOOP_TX_SIZE); + if (tx == NULL) { + CERROR("Can't allocate noop tx desc\n"); + return -ENOMEM; + } - conn->ksnc_rx_state = SOCKNAL_RX_FMB_SLEEP; + tx->tx_conn = NULL; + tx->tx_lnetmsg = NULL; + tx->tx_kiov = NULL; + tx->tx_nkiov = 0; + tx->tx_iov = tx->tx_frags.virt.iov; + tx->tx_niov = 1; - list_add_tail (&conn->ksnc_rx_list, - &pool->fmp_blocked_conns); + ksocknal_init_msg(&tx->tx_msg, KSOCK_MSG_NOOP); + tx->tx_msg.ksm_zc_ack_cookie = cookie; /* incoming cookie */ - spin_unlock_irqrestore (&pool->fmp_lock, flags); - return (NULL); -} - -int -ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) -{ - int payload_nob = conn->ksnc_rx_nob_left; - ptl_nid_t dest_nid = le64_to_cpu(conn->ksnc_hdr.dest_nid); - int niov = 0; - int nob = payload_nob; - - LASSERT (conn->ksnc_rx_scheduled); - LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB); - LASSERT (conn->ksnc_rx_nob_wanted == conn->ksnc_rx_nob_left); - LASSERT (payload_nob >= 0); - LASSERT (payload_nob <= fmb->fmb_pool->fmp_buff_pages * CFS_PAGE_SIZE); - LASSERT (sizeof (ptl_hdr_t) < CFS_PAGE_SIZE); - LASSERT (fmb->fmb_kiov[0].kiov_offset == 0); - - /* Take a ref on 
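The SLOP receive path above discards unwanted bytes without allocating anything: every rx iovec entry aliases the same static 4 KB slop buffer, so one receive call can throw away many buffers' worth of data whose contents are never examined. The same construction in userspace:

    #include <stddef.h>
    #include <sys/uio.h>

    #define SLOP_SIZE 4096
    static char slop[SLOP_SIZE];      /* contents are never looked at */

    /* Fill 'iov' so a single readv() discards up to 'nob_to_skip' bytes;
     * returns the number of entries used. */
    static int build_skip_iov(struct iovec *iov, int max_iov, size_t nob_to_skip)
    {
            int niov = 0;

            while (nob_to_skip != 0 && niov < max_iov) {
                    size_t nob = nob_to_skip < SLOP_SIZE ? nob_to_skip : SLOP_SIZE;

                    iov[niov].iov_base = slop;   /* every entry, same buffer */
                    iov[niov].iov_len  = nob;
                    niov++;
                    nob_to_skip -= nob;
            }
            return niov;
    }
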
the conn's peer to prevent module unload before - * forwarding completes. */ - fmb->fmb_peer = conn->ksnc_peer; - atomic_inc (&conn->ksnc_peer->ksnp_refcount); - - /* Copy the header we just read into the forwarding buffer. If - * there's payload, start reading reading it into the buffer, - * otherwise the forwarding buffer can be kicked off - * immediately. */ - fmb->fmb_hdr = conn->ksnc_hdr; - - while (nob > 0) { - LASSERT (niov < fmb->fmb_pool->fmp_buff_pages); - LASSERT (fmb->fmb_kiov[niov].kiov_offset == 0); - fmb->fmb_kiov[niov].kiov_len = MIN (CFS_PAGE_SIZE, nob); - nob -= CFS_PAGE_SIZE; - niov++; - } - - kpr_fwd_init(&fmb->fmb_fwd, dest_nid, &fmb->fmb_hdr, - payload_nob, niov, fmb->fmb_kiov, - ksocknal_fmb_callback, fmb); - - if (payload_nob == 0) { /* got complete packet already */ - CDEBUG (D_NET, "%p "LPX64"->"LPX64" fwd_start (immediate)\n", - conn, le64_to_cpu(conn->ksnc_hdr.src_nid), dest_nid); - - kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd); + read_lock (&ksocknal_data.ksnd_global_lock); - ksocknal_new_packet (conn, 0); /* on to next packet */ - return (1); + conn = ksocknal_find_conn_locked (0, peer); + if (conn == NULL) { + read_unlock (&ksocknal_data.ksnd_global_lock); + ksocknal_free_tx(tx); + CERROR("Can't find connection to send zcack.\n"); + return -ECONNRESET; } + ksocknal_queue_tx_locked(tx, conn); - conn->ksnc_cookie = fmb; /* stash fmb for later */ - conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ - - /* Set up conn->ksnc_rx_kiov to read the payload into fmb's kiov-ed - * buffer */ - LASSERT (niov <= sizeof(conn->ksnc_rx_iov_space)/sizeof(ptl_kiov_t)); - - conn->ksnc_rx_niov = 0; - conn->ksnc_rx_nkiov = niov; - conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; - memcpy(conn->ksnc_rx_kiov, fmb->fmb_kiov, niov * sizeof(ptl_kiov_t)); - - CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d reading body\n", conn, - le64_to_cpu(conn->ksnc_hdr.src_nid), dest_nid, payload_nob); - return (0); -} - -void -ksocknal_fwd_parse (ksock_conn_t *conn) -{ - ksock_peer_t *peer; - ptl_nid_t dest_nid = le64_to_cpu(conn->ksnc_hdr.dest_nid); - ptl_nid_t src_nid = le64_to_cpu(conn->ksnc_hdr.src_nid); - int body_len = le32_to_cpu(conn->ksnc_hdr.payload_length); - char str[PTL_NALFMT_SIZE]; - char str2[PTL_NALFMT_SIZE]; - - CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d parsing header\n", conn, - src_nid, dest_nid, conn->ksnc_rx_nob_left); - - LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER); - LASSERT (conn->ksnc_rx_scheduled); + read_unlock (&ksocknal_data.ksnd_global_lock); - if (body_len < 0) { /* length corrupt (overflow) */ - CERROR("dropping packet from "LPX64" (%s) for "LPX64" (%s): " - "packet size %d illegal\n", - src_nid, portals_nid2str(TCPNAL, src_nid, str), - dest_nid, portals_nid2str(TCPNAL, dest_nid, str2), - body_len); - - ksocknal_new_packet (conn, 0); /* on to new packet */ - return; - } - - if (!kpr_routing(&ksocknal_data.ksnd_router)) { /* not forwarding */ - CERROR("dropping packet from "LPX64" (%s) for "LPX64 - " (%s): not forwarding\n", - src_nid, portals_nid2str(TCPNAL, src_nid, str), - dest_nid, portals_nid2str(TCPNAL, dest_nid, str2)); - /* on to new packet (skip this one's body) */ - ksocknal_new_packet (conn, body_len); - return; - } - - if (body_len > PTL_MTU) { /* too big to forward */ - CERROR ("dropping packet from "LPX64" (%s) for "LPX64 - "(%s): packet size %d too big\n", - src_nid, portals_nid2str(TCPNAL, src_nid, str), - dest_nid, portals_nid2str(TCPNAL, dest_nid, str2), - body_len); - /* on to new packet (skip this one's body) */ - 
ksocknal_new_packet (conn, body_len); - return; - } - - /* should have gone direct */ - peer = ksocknal_get_peer (conn->ksnc_hdr.dest_nid); - if (peer != NULL) { - CERROR ("dropping packet from "LPX64" (%s) for "LPX64 - "(%s): target is a peer\n", - src_nid, portals_nid2str(TCPNAL, src_nid, str), - dest_nid, portals_nid2str(TCPNAL, dest_nid, str2)); - ksocknal_put_peer (peer); /* drop ref from get above */ - - /* on to next packet (skip this one's body) */ - ksocknal_new_packet (conn, body_len); - return; - } - - conn->ksnc_rx_state = SOCKNAL_RX_GET_FMB; /* Getting FMB now */ - conn->ksnc_rx_nob_left = body_len; /* stash packet size */ - conn->ksnc_rx_nob_wanted = body_len; /* (no slop) */ + return 0; } -int -ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) +/* (Sender) handle ZC_ACK from sink */ +static int +ksocknal_handle_zc_ack(ksock_peer_t *peer, __u64 cookie) { - static char ksocknal_slop_buffer[4096]; + ksock_tx_t *tx; + struct list_head *ctmp; - int nob; - int niov; - int skipped; + spin_lock(&peer->ksnp_lock); - if (nob_to_skip == 0) { /* right at next packet boundary now */ - conn->ksnc_rx_started = 0; - mb (); /* racing with timeout thread */ + list_for_each(ctmp, &peer->ksnp_zc_req_list) { + tx = list_entry (ctmp, ksock_tx_t, tx_zc_list); + if (tx->tx_msg.ksm_zc_req_cookie != cookie) + continue; - conn->ksnc_rx_state = SOCKNAL_RX_HEADER; - conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t); - conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t); + list_del(&tx->tx_zc_list); + spin_unlock(&peer->ksnp_lock); - conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; - conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_hdr; - conn->ksnc_rx_iov[0].iov_len = sizeof (ptl_hdr_t); - conn->ksnc_rx_niov = 1; + ksocknal_tx_decref(tx); - conn->ksnc_rx_kiov = NULL; - conn->ksnc_rx_nkiov = 0; - return (1); + return 0; } + spin_unlock(&peer->ksnp_lock); - /* Set up to skip as much a possible now. 
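ksocknal_handle_zc_ack() closes the zero-copy loop: under ksnp_lock it walks the peer's pending-request list for the cookie, and a match both unlinks the tx and drops the extra reference taken back in ksocknal_queue_zc_req(); an unknown cookie is treated as a protocol error. Sketched with a flat array standing in for the list (names invented):

    #include <stdint.h>

    struct zc_pending {
            uint64_t cookie;          /* 0 => slot unused */
            int      refs;
    };

    /* Release the request matching 'cookie'; -1 (-EPROTO in the driver)
     * if the peer acked something we never asked about. */
    static int complete_zc_req(struct zc_pending *reqs, int nreqs,
                               uint64_t cookie)
    {
            int i;

            for (i = 0; i < nreqs; i++) {
                    if (reqs[i].cookie != cookie)
                            continue;
                    reqs[i].refs--;   /* drop the queue's reference */
                    reqs[i].cookie = 0;
                    return 0;
            }
            return -1;
    }
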
If there's more left
- * (ran out of iov entries) we'll get called again */
-
-        conn->ksnc_rx_state = SOCKNAL_RX_SLOP;
-        conn->ksnc_rx_nob_left = nob_to_skip;
-        conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space;
-        skipped = 0;
-        niov = 0;
-
-        do {
-                nob = MIN (nob_to_skip, sizeof (ksocknal_slop_buffer));
-
-                conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer;
-                conn->ksnc_rx_iov[niov].iov_len = nob;
-                niov++;
-                skipped += nob;
-                nob_to_skip -=nob;
-
-        } while (nob_to_skip != 0 &&      /* mustn't overflow conn's rx iov */
-                 niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec));
-
-        conn->ksnc_rx_niov = niov;
-        conn->ksnc_rx_kiov = NULL;
-        conn->ksnc_rx_nkiov = 0;
-        conn->ksnc_rx_nob_wanted = skipped;
-        return (0);
+        return -EPROTO;
 }

 int
 ksocknal_process_receive (ksock_conn_t *conn)
 {
-        ksock_fmb_t  *fmb;
         int          rc;
+
+        LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0);
-        LASSERT (atomic_read (&conn->ksnc_refcount) > 0);
-
-        /* doesn't need a forwarding buffer */
-        if (conn->ksnc_rx_state != SOCKNAL_RX_GET_FMB)
-                goto try_read;
-
- get_fmb:
-        fmb = ksocknal_get_idle_fmb (conn);
-        if (fmb == NULL) {
-                /* conn descheduled waiting for idle fmb */
-                return (0);
-        }
-
-        if (ksocknal_init_fmb (conn, fmb)) {
-                /* packet forwarded */
-                return (0);
-        }
-
- try_read: /* NB: sched lock NOT held */
-        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_HEADER ||
-                 conn->ksnc_rx_state == SOCKNAL_RX_BODY ||
-                 conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD ||
+        /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */
+        LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD ||
+                 conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER ||
                  conn->ksnc_rx_state == SOCKNAL_RX_SLOP);
+ again:
+        if (conn->ksnc_rx_nob_wanted != 0) {
+                rc = ksocknal_receive(conn);
-        LASSERT (conn->ksnc_rx_nob_wanted > 0);
-
-        rc = ksocknal_receive(conn);
-
-        if (rc <= 0) {
-                LASSERT (rc != -EAGAIN);
-
-                if (rc == 0)
-                        CWARN ("[%p] EOF from "LPX64" ip %d.%d.%d.%d:%d\n",
-                               conn, conn->ksnc_peer->ksnp_nid,
-                               HIPQUAD(conn->ksnc_ipaddr),
-                               conn->ksnc_port);
-                else if (!conn->ksnc_closing)
-                        CERROR ("[%p] Error %d on read from "LPX64
-                                " ip %d.%d.%d.%d:%d\n",
-                                conn, rc, conn->ksnc_peer->ksnp_nid,
-                                HIPQUAD(conn->ksnc_ipaddr),
-                                conn->ksnc_port);
+                if (rc <= 0) {
+                        LASSERT (rc != -EAGAIN);
-                ksocknal_close_conn_and_siblings (conn, rc);
-                return (rc == 0 ? -ESHUTDOWN : rc);
+                        if (rc == 0)
+                                CDEBUG (D_NET, "[%p] EOF from %s"
+                                        " ip %d.%d.%d.%d:%d\n", conn,
+                                        libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                                        HIPQUAD(conn->ksnc_ipaddr),
+                                        conn->ksnc_port);
+                        else if (!conn->ksnc_closing)
+                                CERROR ("[%p] Error %d on read from %s"
+                                        " ip %d.%d.%d.%d:%d\n",
+                                        conn, rc,
+                                        libcfs_id2str(conn->ksnc_peer->ksnp_id),
+                                        HIPQUAD(conn->ksnc_ipaddr),
+                                        conn->ksnc_port);
+
+                        /* it's not an error if conn is being closed */
+                        ksocknal_close_conn_and_siblings (conn,
+                                               (conn->ksnc_closing) ? 0 : rc);
+                        return (rc == 0 ?
-ESHUTDOWN : rc); + } + + if (conn->ksnc_rx_nob_wanted != 0) { + /* short read */ + return (-EAGAIN); + } } + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_flip) { + __swab32s(&conn->ksnc_msg.ksm_type); + __swab32s(&conn->ksnc_msg.ksm_csum); + __swab64s(&conn->ksnc_msg.ksm_zc_req_cookie); + __swab64s(&conn->ksnc_msg.ksm_zc_ack_cookie); + } - if (conn->ksnc_rx_nob_wanted != 0) { - /* short read */ - return (-EAGAIN); - } + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP && + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + /* NOOP Checksum error */ + CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return (-EIO); + } - switch (conn->ksnc_rx_state) { - case SOCKNAL_RX_HEADER: - if (conn->ksnc_hdr.type != cpu_to_le32(PTL_MSG_HELLO) && - le64_to_cpu(conn->ksnc_hdr.dest_nid) != - ksocknal_lib.libnal_ni.ni_pid.nid) { - /* This packet isn't for me */ - ksocknal_fwd_parse (conn); - switch (conn->ksnc_rx_state) { - case SOCKNAL_RX_HEADER: /* skipped (zero payload) */ - return (0); /* => come back later */ - case SOCKNAL_RX_SLOP: /* skipping packet's body */ - goto try_read; /* => go read it */ - case SOCKNAL_RX_GET_FMB: /* forwarding */ - goto get_fmb; /* => go get a fwd msg buffer */ - default: - LBUG (); + if (conn->ksnc_msg.ksm_zc_ack_cookie != 0) { + LASSERT(conn->ksnc_proto == &ksocknal_protocol_v2x); + + rc = ksocknal_handle_zc_ack(conn->ksnc_peer, + conn->ksnc_msg.ksm_zc_ack_cookie); + if (rc != 0) { + CERROR("%s: Unknown zero copy ACK cookie: "LPU64"\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_zc_ack_cookie); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return (rc); } - /* Not Reached */ } - /* sets wanted_len, iovs etc */ - rc = lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn); + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) { + ksocknal_new_packet (conn, 0); + return 0; /* NOOP is done and just return */ + } + LASSERT (conn->ksnc_msg.ksm_type == KSOCK_MSG_LNET); + + if (conn->ksnc_type == SOCKLND_CONN_BULK_IN) { + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + /* has read lnet_hdr_t already (re ksocknal_new_packet), fall through */ + } else { + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(ksock_lnet_msg_t); + conn->ksnc_rx_nob_left = sizeof(ksock_lnet_msg_t); + + conn->ksnc_rx_iov = (struct iovec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = (char *)&conn->ksnc_msg.ksm_u.lnetmsg; + conn->ksnc_rx_iov[0].iov_len = sizeof(ksock_lnet_msg_t); + + conn->ksnc_rx_niov = 1; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + + goto again; /* read lnet header now */ + } + + case SOCKNAL_RX_LNET_HEADER: + /* unpack message header */ + conn->ksnc_proto->pro_unpack(&conn->ksnc_msg); + + if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) { + /* Userspace peer */ + lnet_process_id_t *id = &conn->ksnc_peer->ksnp_id; + lnet_hdr_t *lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; + + /* Substitute process ID assigned at connection time */ + lhdr->src_pid = cpu_to_le32(id->pid); + lhdr->src_nid = cpu_to_le64(id->nid); + } - if (rc != PTL_OK) { + conn->ksnc_rx_state = SOCKNAL_RX_PARSE; + ksocknal_conn_addref(conn); /* ++ref while parsing */ + + rc = lnet_parse(conn->ksnc_peer->ksnp_ni, + 
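In the KSM-header state above, a connection whose handshake magic arrived byte-reversed is flagged ksnc_flip, and every header word is byte-swapped before use; this lets mixed-endian nodes interoperate without fixing a canonical wire order for the V2 header. A minimal sketch of the detect-then-swab convention (the magic value here is illustrative):

    #include <stdint.h>

    #define PROTO_MAGIC 0x45726963u        /* illustrative value */

    static uint32_t swab32(uint32_t v)
    {
            return (v >> 24) | ((v >> 8) & 0xff00U) |
                   ((v << 8) & 0xff0000U) | (v << 24);
    }

    /* 0 on success; *flip tells the caller whether every subsequent
     * header field must be byte-swapped.  -1 => not our protocol. */
    static int check_magic(uint32_t wire_magic, int *flip)
    {
            if (wire_magic == PROTO_MAGIC) {
                    *flip = 0;
                    return 0;
            }
            if (wire_magic == swab32(PROTO_MAGIC)) {
                    *flip = 1;
                    return 0;
            }
            return -1;
    }
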
&conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr, + conn->ksnc_peer->ksnp_id.nid, conn, 0); + if (rc < 0) { /* I just received garbage: give up on this conn */ + ksocknal_new_packet(conn, 0); ksocknal_close_conn_and_siblings (conn, rc); + ksocknal_conn_decref(conn); return (-EPROTO); } - if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */ - conn->ksnc_rx_state = SOCKNAL_RX_BODY; - goto try_read; /* go read the payload */ + /* I'm racing with ksocknal_recv() */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_PARSE || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD); + + if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD) + return 0; + + /* ksocknal_recv() got called */ + goto again; + + case SOCKNAL_RX_LNET_PAYLOAD: + /* payload all received */ + rc = 0; + + if (conn->ksnc_rx_nob_left == 0 && /* not truncating */ + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + rc = -EIO; } - /* Fall through (completed packet for me) */ - case SOCKNAL_RX_BODY: - /* payload all received */ - lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie, PTL_OK); + lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc); + + if (rc == 0 && conn->ksnc_msg.ksm_zc_req_cookie != 0) { + LASSERT(conn->ksnc_proto == &ksocknal_protocol_v2x); + rc = ksocknal_handle_zc_req(conn->ksnc_peer, + conn->ksnc_msg.ksm_zc_req_cookie); + } + + if (rc != 0) { + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings (conn, rc); + return (-EPROTO); + } /* Fall through */ case SOCKNAL_RX_SLOP: /* starting new packet? */ if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left)) - return (0); /* come back later */ - goto try_read; /* try to finish reading slop now */ - - case SOCKNAL_RX_BODY_FWD: - /* payload all received */ - CDEBUG (D_NET, "%p "LPX64"->"LPX64" %d fwd_start (got body)\n", - conn, le64_to_cpu(conn->ksnc_hdr.src_nid), - le64_to_cpu(conn->ksnc_hdr.dest_nid), - conn->ksnc_rx_nob_left); - - /* forward the packet. 
NB ksocknal_init_fmb() put fmb into - * conn->ksnc_cookie */ - fmb = (ksock_fmb_t *)conn->ksnc_cookie; - kpr_fwd_start (&ksocknal_data.ksnd_router, &fmb->fmb_fwd); - - /* no slop in forwarded packets */ - LASSERT (conn->ksnc_rx_nob_left == 0); - - ksocknal_new_packet (conn, 0); /* on to next packet */ - return (0); /* (later) */ + return 0; /* come back later */ + goto again; /* try to finish reading slop now */ default: break; @@ -1413,78 +1442,76 @@ ksocknal_process_receive (ksock_conn_t *conn) return (-EINVAL); /* keep gcc happy */ } -ptl_err_t -ksocknal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen, size_t rlen) +int +ksocknal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, + unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) { - ksock_conn_t *conn = (ksock_conn_t *)private; + ksock_conn_t *conn = (ksock_conn_t *)private; + ksock_sched_t *sched = conn->ksnc_scheduler; LASSERT (mlen <= rlen); - LASSERT (niov <= PTL_MD_MAX_IOV); - + LASSERT (niov <= LNET_MAX_IOV); + conn->ksnc_cookie = msg; conn->ksnc_rx_nob_wanted = mlen; conn->ksnc_rx_nob_left = rlen; - conn->ksnc_rx_nkiov = 0; - conn->ksnc_rx_kiov = NULL; - conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; - conn->ksnc_rx_niov = - lib_extract_iov(PTL_MD_MAX_IOV, conn->ksnc_rx_iov, - niov, iov, offset, mlen); - + if (mlen == 0 || iov != NULL) { + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; + conn->ksnc_rx_niov = + lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov, + niov, iov, offset, mlen); + } else { + conn->ksnc_rx_niov = 0; + conn->ksnc_rx_iov = NULL; + conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; + conn->ksnc_rx_nkiov = + lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov, + niov, kiov, offset, mlen); + } + LASSERT (mlen == - lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + - lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); - - return (PTL_OK); -} - -ptl_err_t -ksocknal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) -{ - ksock_conn_t *conn = (ksock_conn_t *)private; - - LASSERT (mlen <= rlen); - LASSERT (niov <= PTL_MD_MAX_IOV); + lnet_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + + lnet_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); - conn->ksnc_cookie = msg; - conn->ksnc_rx_nob_wanted = mlen; - conn->ksnc_rx_nob_left = rlen; + LASSERT (conn->ksnc_rx_scheduled); - conn->ksnc_rx_niov = 0; - conn->ksnc_rx_iov = NULL; - conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; - conn->ksnc_rx_nkiov = - lib_extract_kiov(PTL_MD_MAX_IOV, conn->ksnc_rx_kiov, - niov, kiov, offset, mlen); + spin_lock_bh (&sched->kss_lock); - LASSERT (mlen == - lib_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + - lib_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_PARSE_WAIT: + list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns); + cfs_waitq_signal (&sched->kss_waitq); + LASSERT (conn->ksnc_rx_ready); + break; + + case SOCKNAL_RX_PARSE: + /* scheduler hasn't noticed I'm parsing yet */ + break; + } - return (PTL_OK); + conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD; + + spin_unlock_bh (&sched->kss_lock); + ksocknal_conn_decref(conn); + return (0); } static inline int ksocknal_sched_cansleep(ksock_sched_t *sched) { - unsigned long flags; int rc; - 
spin_lock_irqsave(&sched->kss_lock, flags); + spin_lock_bh (&sched->kss_lock); rc = (!ksocknal_data.ksnd_shuttingdown && -#if SOCKNAL_ZC - list_empty(&sched->kss_zctxdone_list) && -#endif list_empty(&sched->kss_rx_conns) && list_empty(&sched->kss_tx_conns)); - - spin_unlock_irqrestore(&sched->kss_lock, flags); + + spin_unlock_bh (&sched->kss_lock); return (rc); } @@ -1493,15 +1520,14 @@ int ksocknal_scheduler (void *arg) ksock_sched_t *sched = (ksock_sched_t *)arg; ksock_conn_t *conn; ksock_tx_t *tx; - unsigned long flags; int rc; int nloops = 0; int id = sched - ksocknal_data.ksnd_schedulers; char name[16]; - snprintf (name, sizeof (name),"ksocknald_%02d", id); - kportal_daemonize (name); - kportal_blockallsigs (); + snprintf (name, sizeof (name),"socknal_sd%02d", id); + cfs_daemonize (name); + cfs_block_allsigs (); #if (CONFIG_SMP && CPU_AFFINITY) id = ksocknal_sched2cpu(id); @@ -1514,7 +1540,7 @@ int ksocknal_scheduler (void *arg) } #endif /* CONFIG_SMP && CPU_AFFINITY */ - spin_lock_irqsave (&sched->kss_lock, flags); + spin_lock_bh (&sched->kss_lock); while (!ksocknal_data.ksnd_shuttingdown) { int did_something = 0; @@ -1534,11 +1560,11 @@ int ksocknal_scheduler (void *arg) * data_ready can set it any time after we release * kss_lock. */ conn->ksnc_rx_ready = 0; - spin_unlock_irqrestore(&sched->kss_lock, flags); + spin_unlock_bh (&sched->kss_lock); rc = ksocknal_process_receive(conn); - spin_lock_irqsave(&sched->kss_lock, flags); + spin_lock_bh (&sched->kss_lock); /* I'm the only one that can clear this flag */ LASSERT(conn->ksnc_rx_scheduled); @@ -1547,13 +1573,11 @@ int ksocknal_scheduler (void *arg) if (rc == 0) conn->ksnc_rx_ready = 1; - if (conn->ksnc_rx_state == SOCKNAL_RX_FMB_SLEEP || - conn->ksnc_rx_state == SOCKNAL_RX_GET_FMB) { - /* Conn blocked for a forwarding buffer. - * It will get queued for my attention when - * one becomes available (and it might just - * already have been!). Meanwhile my ref - * on it stays put. */ + if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) { + /* Conn blocked waiting for ksocknal_recv() + * I change its state (under lock) to signal + * it can be rescheduled */ + conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT; } else if (conn->ksnc_rx_ready) { /* reschedule for rx */ list_add_tail (&conn->ksnc_rx_list, @@ -1561,23 +1585,34 @@ int ksocknal_scheduler (void *arg) } else { conn->ksnc_rx_scheduled = 0; /* drop my ref */ - ksocknal_put_conn(conn); + ksocknal_conn_decref(conn); } did_something = 1; } if (!list_empty (&sched->kss_tx_conns)) { + CFS_LIST_HEAD (zlist); + + if (!list_empty(&sched->kss_zombie_noop_txs)) { + list_add(&zlist, &sched->kss_zombie_noop_txs); + list_del_init(&sched->kss_zombie_noop_txs); + } + conn = list_entry(sched->kss_tx_conns.next, ksock_conn_t, ksnc_tx_list); list_del (&conn->ksnc_tx_list); - + LASSERT(conn->ksnc_tx_scheduled); LASSERT(conn->ksnc_tx_ready); LASSERT(!list_empty(&conn->ksnc_tx_queue)); - + tx = list_entry(conn->ksnc_tx_queue.next, ksock_tx_t, tx_list); + + if (conn->ksnc_tx_mono == tx) + ksocknal_next_mono_tx(conn); + /* dequeue now so empty list => more to send */ list_del(&tx->tx_list); @@ -1586,17 +1621,26 @@ int ksocknal_scheduler (void *arg) * write_space can set it any time after we release * kss_lock. 
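ksocknal_sched_cansleep() evaluates its predicate with kss_lock held, and the scheduler blocks in wait_event_interruptible_exclusive() only while that predicate stays true, so a wakeup arriving between the check and the sleep cannot be lost. The same shape in userspace, with a mutex and condition variable:

    #include <pthread.h>

    static pthread_mutex_t lock  = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  waitq = PTHREAD_COND_INITIALIZER;
    static int shutting_down, rx_pending, tx_pending;

    static void scheduler_wait(void)
    {
            pthread_mutex_lock(&lock);
            /* predicate is re-checked under the lock after every wakeup */
            while (!shutting_down && !rx_pending && !tx_pending)
                    pthread_cond_wait(&waitq, &lock);
            pthread_mutex_unlock(&lock);
    }

    static void kick_rx(void)          /* e.g. from a data_ready callback */
    {
            pthread_mutex_lock(&lock);
            rx_pending = 1;
            pthread_cond_signal(&waitq);
            pthread_mutex_unlock(&lock);
    }
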
*/ conn->ksnc_tx_ready = 0; - spin_unlock_irqrestore (&sched->kss_lock, flags); + spin_unlock_bh (&sched->kss_lock); - rc = ksocknal_process_transmit(conn, tx); + if (!list_empty(&zlist)) { + /* free zombie noop txs, it's fast because + * noop txs are just put in freelist */ + ksocknal_txlist_done(NULL, &zlist, 0); + } - spin_lock_irqsave (&sched->kss_lock, flags); + rc = ksocknal_process_transmit(conn, tx); if (rc == -ENOMEM || rc == -EAGAIN) { /* Incomplete send: replace tx on HEAD of tx_queue */ + spin_lock_bh (&sched->kss_lock); list_add (&tx->tx_list, &conn->ksnc_tx_queue); } else { - /* Complete send; assume space for more */ + /* Complete send; tx -ref */ + ksocknal_tx_decref (tx); + + spin_lock_bh (&sched->kss_lock); + /* assume space for more */ conn->ksnc_tx_ready = 1; } @@ -1611,44 +1655,31 @@ int ksocknal_scheduler (void *arg) } else { conn->ksnc_tx_scheduled = 0; /* drop my ref */ - ksocknal_put_conn (conn); + ksocknal_conn_decref(conn); } - - did_something = 1; - } -#if SOCKNAL_ZC - if (!list_empty (&sched->kss_zctxdone_list)) { - ksock_tx_t *tx = - list_entry(sched->kss_zctxdone_list.next, - ksock_tx_t, tx_list); + did_something = 1; - - list_del (&tx->tx_list); - spin_unlock_irqrestore (&sched->kss_lock, flags); - - ksocknal_tx_done (tx, 1); - - spin_lock_irqsave (&sched->kss_lock, flags); } -#endif if (!did_something || /* nothing to do */ ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */ - spin_unlock_irqrestore (&sched->kss_lock, flags); + spin_unlock_bh (&sched->kss_lock); nloops = 0; if (!did_something) { /* wait for something to do */ - rc = wait_event_interruptible (sched->kss_waitq, - !ksocknal_sched_cansleep(sched)); + rc = wait_event_interruptible_exclusive( + sched->kss_waitq, + !ksocknal_sched_cansleep(sched)); LASSERT (rc == 0); - } else - our_cond_resched(); + } else { + our_cond_resched(); + } - spin_lock_irqsave (&sched->kss_lock, flags); + spin_lock_bh (&sched->kss_lock); } } - spin_unlock_irqrestore (&sched->kss_lock, flags); + spin_unlock_bh (&sched->kss_lock); ksocknal_thread_fini (); return (0); } @@ -1660,12 +1691,11 @@ int ksocknal_scheduler (void *arg) void ksocknal_read_callback (ksock_conn_t *conn) { ksock_sched_t *sched; - unsigned long flags; ENTRY; sched = conn->ksnc_scheduler; - spin_lock_irqsave (&sched->kss_lock, flags); + spin_lock_bh (&sched->kss_lock); conn->ksnc_rx_ready = 1; @@ -1674,11 +1704,11 @@ void ksocknal_read_callback (ksock_conn_t *conn) &sched->kss_rx_conns); conn->ksnc_rx_scheduled = 1; /* extra ref for scheduler */ - atomic_inc (&conn->ksnc_refcount); + ksocknal_conn_addref(conn); cfs_waitq_signal (&sched->kss_waitq); } - spin_unlock_irqrestore (&sched->kss_lock, flags); + spin_unlock_bh (&sched->kss_lock); EXIT; } @@ -1690,12 +1720,11 @@ void ksocknal_read_callback (ksock_conn_t *conn) void ksocknal_write_callback (ksock_conn_t *conn) { ksock_sched_t *sched; - unsigned long flags; ENTRY; - + sched = conn->ksnc_scheduler; - spin_lock_irqsave (&sched->kss_lock, flags); + spin_lock_bh (&sched->kss_lock); conn->ksnc_tx_ready = 1; @@ -1705,375 +1734,714 @@ void ksocknal_write_callback (ksock_conn_t *conn) &sched->kss_tx_conns); conn->ksnc_tx_scheduled = 1; /* extra ref for scheduler */ - atomic_inc (&conn->ksnc_refcount); + ksocknal_conn_addref(conn); cfs_waitq_signal (&sched->kss_waitq); } - spin_unlock_irqrestore (&sched->kss_lock, flags); + spin_unlock_bh (&sched->kss_lock); EXIT; } -int -ksocknal_sock_write (struct socket *sock, void *buffer, int nob) +ksock_protocol_t * +ksocknal_compat_protocol (ksock_hello_msg_t *hello) { 
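The zombie-NOOP handling above uses a constant-time splice: while kss_lock is held, the whole kss_zombie_noop_txs list is moved onto the stack-local zlist (by list_add()ing the local head into the ring and list_del_init()ing the shared one), and the entries are freed only after the lock is dropped. The splice in miniature:

    struct ring { struct ring *prev, *next; };

    #define RING_INIT(h) ((h)->prev = (h)->next = (h))

    static int ring_empty(const struct ring *h)
    {
            return h->next == h;
    }

    /* Move every entry of 'src' onto the local head 'dst': O(1), safe
     * to do inside a spinlock-style critical section. */
    static void ring_splice_init(struct ring *src, struct ring *dst)
    {
            if (ring_empty(src)) {
                    RING_INIT(dst);
                    return;
            }
            dst->next = src->next;  dst->next->prev = dst;
            dst->prev = src->prev;  dst->prev->next = dst;
            RING_INIT(src);         /* shared list is empty again */
    }
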
- return ksocknal_lib_sock_write(sock, buffer, nob); -} + if ((hello->kshm_magic == LNET_PROTO_MAGIC && + hello->kshm_version == KSOCK_PROTO_V2) || + (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC) && + hello->kshm_version == __swab32(KSOCK_PROTO_V2))) + return &ksocknal_protocol_v2x; -int -ksocknal_sock_read (struct socket *sock, void *buffer, int nob) -{ - return ksocknal_lib_sock_read(sock, buffer, nob); -} + if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) { + lnet_magicversion_t *hmv = (lnet_magicversion_t *)hello; -int -ksocknal_send_hello (ksock_conn_t *conn, __u32 *ipaddrs, int nipaddrs) -{ - /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ - struct socket *sock = conn->ksnc_sock; - ptl_hdr_t hdr; - ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid; - int i; - int rc; + CLASSERT (sizeof (lnet_magicversion_t) == + offsetof (ksock_hello_msg_t, kshm_src_nid)); - LASSERT (conn->ksnc_type != SOCKNAL_CONN_NONE); - LASSERT (nipaddrs <= SOCKNAL_MAX_INTERFACES); + if (hmv->version_major == cpu_to_le16 (KSOCK_PROTO_V1_MAJOR) && + hmv->version_minor == cpu_to_le16 (KSOCK_PROTO_V1_MINOR)) + return &ksocknal_protocol_v1x; + } - /* No need for getconnsock/putconnsock */ - LASSERT (!conn->ksnc_closing); + return NULL; +} - LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); - hmv->magic = cpu_to_le32 (PORTALS_PROTO_MAGIC); - hmv->version_major = cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR); - hmv->version_minor = cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR); +static int +ksocknal_send_hello_v1 (ksock_conn_t *conn, ksock_hello_msg_t *hello) +{ + cfs_socket_t *sock = conn->ksnc_sock; + lnet_hdr_t *hdr; + lnet_magicversion_t *hmv; + int rc; + int i; + + CLASSERT(sizeof(lnet_magicversion_t) == offsetof(lnet_hdr_t, src_nid)); + + LIBCFS_ALLOC(hdr, sizeof(*hdr)); + if (hdr == NULL) { + CERROR("Can't allocate lnet_hdr_t\n"); + return -ENOMEM; + } + + hmv = (lnet_magicversion_t *)&hdr->dest_nid; + + /* Re-organize V2.x message header to V1.x (lnet_hdr_t) + * header and send out */ + hmv->magic = cpu_to_le32 (LNET_PROTO_TCP_MAGIC); + hmv->version_major = cpu_to_le16 (KSOCK_PROTO_V1_MAJOR); + hmv->version_minor = cpu_to_le16 (KSOCK_PROTO_V1_MINOR); + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + hmv->version_major++; /* just different! 
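ksocknal_compat_protocol() is a tiny negotiation step: the first words of the incoming HELLO select a protocol ops table, V2 for LNET_PROTO_MAGIC in either byte order, V1 for the legacy TCP magic with a 1.x major/minor, NULL for anything else. A table-driven sketch of the same dispatch idea (the versions and hooks here are invented placeholders):

    #include <stddef.h>

    struct proto_ops {
            int version;
            int (*send_hello)(void *conn);   /* per-version handshake */
    };

    static int hello_v1(void *conn) { (void)conn; return 1; }
    static int hello_v2(void *conn) { (void)conn; return 2; }

    static struct proto_ops protos[] = {
            { 1, hello_v1 },
            { 2, hello_v2 },
    };

    /* NULL => no compatible protocol; the caller rejects the peer. */
    static struct proto_ops *pick_proto(int wire_version)
    {
            size_t i;

            for (i = 0; i < sizeof(protos) / sizeof(protos[0]); i++)
                    if (protos[i].version == wire_version)
                            return &protos[i];
            return NULL;
    }
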
*/ + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + hmv->magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); + } - hdr.src_nid = cpu_to_le64 (ksocknal_lib.libnal_ni.ni_pid.nid); - hdr.type = cpu_to_le32 (PTL_MSG_HELLO); - hdr.payload_length = cpu_to_le32 (nipaddrs * sizeof(*ipaddrs)); + hdr->src_nid = cpu_to_le64 (hello->kshm_src_nid); + hdr->src_pid = cpu_to_le32 (hello->kshm_src_pid); + hdr->type = cpu_to_le32 (LNET_MSG_HELLO); + hdr->payload_length = cpu_to_le32 (hello->kshm_nips * sizeof(__u32)); + hdr->msg.hello.type = cpu_to_le32 (hello->kshm_ctype); + hdr->msg.hello.incarnation = cpu_to_le64 (hello->kshm_src_incarnation); - hdr.msg.hello.type = cpu_to_le32 (conn->ksnc_type); - hdr.msg.hello.incarnation = - cpu_to_le64 (ksocknal_data.ksnd_incarnation); + rc = libcfs_sock_write(sock, hdr, sizeof(*hdr), lnet_acceptor_timeout()); - /* Receiver is eager */ - rc = ksocknal_sock_write (sock, &hdr, sizeof(hdr)); if (rc != 0) { - CERROR ("Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n", + CDEBUG (D_NETERROR, "Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n", rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); - return (rc); + goto out; } - if (nipaddrs == 0) - return (0); + if (hello->kshm_nips == 0) + goto out; - for (i = 0; i < nipaddrs; i++) { - ipaddrs[i] = __cpu_to_le32 (ipaddrs[i]); + for (i = 0; i < hello->kshm_nips; i++) { + hello->kshm_ips[i] = __cpu_to_le32 (hello->kshm_ips[i]); } - - rc = ksocknal_sock_write (sock, ipaddrs, nipaddrs * sizeof(*ipaddrs)); - if (rc != 0) - CERROR ("Error %d sending HELLO payload (%d)" - " to %u.%u.%u.%u/%d\n", rc, nipaddrs, + + rc = libcfs_sock_write(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), + lnet_acceptor_timeout()); + if (rc != 0) { + CDEBUG (D_NETERROR, "Error %d sending HELLO payload (%d)" + " to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); - return (rc); -} + } +out: + LIBCFS_FREE(hdr, sizeof(*hdr)); -int -ksocknal_invert_type(int type) + return rc; +} + +static int +ksocknal_send_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello) { - switch (type) - { - case SOCKNAL_CONN_ANY: - case SOCKNAL_CONN_CONTROL: - return (type); - case SOCKNAL_CONN_BULK_IN: - return SOCKNAL_CONN_BULK_OUT; - case SOCKNAL_CONN_BULK_OUT: - return SOCKNAL_CONN_BULK_IN; - default: - return (SOCKNAL_CONN_NONE); + cfs_socket_t *sock = conn->ksnc_sock; + int rc; + + hello->kshm_magic = LNET_PROTO_MAGIC; + hello->kshm_version = KSOCK_PROTO_V2; + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + hello->kshm_version++; /* just different! 
*/ + the_lnet.ln_testprotocompat &= ~1; + } + LNET_UNLOCK(); } -} -int -ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid, - __u64 *incarnation, __u32 *ipaddrs) -{ - struct socket *sock = conn->ksnc_sock; - int rc; - int nips; - int i; - int type; - ptl_hdr_t hdr; - ptl_magicversion_t *hmv; - char ipbuf[PTL_NALFMT_SIZE]; - - hmv = (ptl_magicversion_t *)&hdr.dest_nid; - LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); - - rc = ksocknal_sock_read (sock, hmv, sizeof (*hmv)); + rc = libcfs_sock_write(sock, hello, offsetof(ksock_hello_msg_t, kshm_ips), + lnet_acceptor_timeout()); + if (rc != 0) { - CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n", - rc, HIPQUAD(conn->ksnc_ipaddr)); - return (rc); + CDEBUG (D_NETERROR, "Error %d sending HELLO hdr to %u.%u.%u.%u/%d\n", + rc, HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); + return rc; } - if (hmv->magic != le32_to_cpu (PORTALS_PROTO_MAGIC)) { - CERROR ("Bad magic %#08x (%#08x expected) from %u.%u.%u.%u\n", - __cpu_to_le32 (hmv->magic), PORTALS_PROTO_MAGIC, - HIPQUAD(conn->ksnc_ipaddr)); - return (-EPROTO); - } + if (hello->kshm_nips == 0) + return 0; - if (hmv->version_major != cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) || - hmv->version_minor != cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) { - CERROR ("Incompatible protocol version %d.%d (%d.%d expected)" - " from %u.%u.%u.%u\n", - le16_to_cpu (hmv->version_major), - le16_to_cpu (hmv->version_minor), - PORTALS_PROTO_VERSION_MAJOR, - PORTALS_PROTO_VERSION_MINOR, - HIPQUAD(conn->ksnc_ipaddr)); - return (-EPROTO); + rc = libcfs_sock_write(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), + lnet_acceptor_timeout()); + if (rc != 0) { + CDEBUG (D_NETERROR, "Error %d sending HELLO payload (%d)" + " to %u.%u.%u.%u/%d\n", rc, hello->kshm_nips, + HIPQUAD(conn->ksnc_ipaddr), conn->ksnc_port); } -#if (PORTALS_PROTO_VERSION_MAJOR != 1) -# error "This code only understands protocol version 1.x" -#endif - /* version 1 sends magic/version as the dest_nid of a 'hello' - * header, followed by payload full of interface IP addresses. - * Read the rest of it in now... 
*/ + return rc; +} + +static int +ksocknal_recv_hello_v1(ksock_conn_t *conn, ksock_hello_msg_t *hello,int timeout) +{ + cfs_socket_t *sock = conn->ksnc_sock; + lnet_hdr_t *hdr; + int rc; + int i; + + LIBCFS_ALLOC(hdr, sizeof(*hdr)); + if (hdr == NULL) { + CERROR("Can't allocate lnet_hdr_t\n"); + return -ENOMEM; + } - rc = ksocknal_sock_read (sock, hmv + 1, sizeof (hdr) - sizeof (*hmv)); + rc = libcfs_sock_read(sock, &hdr->src_nid, + sizeof (*hdr) - offsetof (lnet_hdr_t, src_nid), + timeout); if (rc != 0) { CERROR ("Error %d reading rest of HELLO hdr from %u.%u.%u.%u\n", rc, HIPQUAD(conn->ksnc_ipaddr)); - return (rc); + LASSERT (rc < 0 && rc != -EALREADY); + goto out; } /* ...and check we got what we expected */ - if (hdr.type != cpu_to_le32 (PTL_MSG_HELLO)) { + if (hdr->type != cpu_to_le32 (LNET_MSG_HELLO)) { CERROR ("Expecting a HELLO hdr," " but got type %d from %u.%u.%u.%u\n", - le32_to_cpu (hdr.type), + le32_to_cpu (hdr->type), HIPQUAD(conn->ksnc_ipaddr)); - return (-EPROTO); + rc = -EPROTO; + goto out; } - if (le64_to_cpu(hdr.src_nid) == PTL_NID_ANY) { - CERROR("Expecting a HELLO hdr with a NID, but got PTL_NID_ANY" - "from %u.%u.%u.%u\n", HIPQUAD(conn->ksnc_ipaddr)); - return (-EPROTO); + hello->kshm_src_nid = le64_to_cpu (hdr->src_nid); + hello->kshm_src_pid = le32_to_cpu (hdr->src_pid); + hello->kshm_src_incarnation = le64_to_cpu (hdr->msg.hello.incarnation); + hello->kshm_ctype = le32_to_cpu (hdr->msg.hello.type); + hello->kshm_nips = le32_to_cpu (hdr->payload_length) / + sizeof (__u32); + + if (hello->kshm_nips > LNET_MAX_INTERFACES) { + CERROR("Bad nips %d from ip %u.%u.%u.%u\n", + hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr)); + rc = -EPROTO; + goto out; } - if (*nid == PTL_NID_ANY) { /* don't know peer's nid yet */ - *nid = le64_to_cpu(hdr.src_nid); - } else if (*nid != le64_to_cpu (hdr.src_nid)) { - LCONSOLE_ERROR("Connected successfully to nid "LPX64" on host " - "%u.%u.%u.%u, but they claimed they were nid " - LPX64" (%s); please check your Lustre " - "configuration.\n", - *nid, HIPQUAD(conn->ksnc_ipaddr), - le64_to_cpu(hdr.src_nid), - portals_nid2str(SOCKNAL, - le64_to_cpu(hdr.src_nid), - ipbuf)); + if (hello->kshm_nips == 0) + goto out; - return (-EPROTO); + rc = libcfs_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); + if (rc != 0) { + CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n", + rc, HIPQUAD(conn->ksnc_ipaddr)); + LASSERT (rc < 0 && rc != -EALREADY); + goto out; } - type = __le32_to_cpu(hdr.msg.hello.type); - - if (conn->ksnc_type == SOCKNAL_CONN_NONE) { - /* I've accepted this connection; peer determines type */ - conn->ksnc_type = ksocknal_invert_type(type); - if (conn->ksnc_type == SOCKNAL_CONN_NONE) { - CERROR ("Unexpected type %d from "LPX64"@%u.%u.%u.%u\n", - type, *nid, HIPQUAD(conn->ksnc_ipaddr)); - return (-EPROTO); + for (i = 0; i < hello->kshm_nips; i++) { + hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n", + i, HIPQUAD(conn->ksnc_ipaddr)); + rc = -EPROTO; + break; } - } else if (ksocknal_invert_type(type) != conn->ksnc_type) { - CERROR ("Mismatched types: me %d, "LPX64"@%u.%u.%u.%u %d\n", - conn->ksnc_type, *nid, HIPQUAD(conn->ksnc_ipaddr), - le32_to_cpu(hdr.msg.hello.type)); - return (-EPROTO); } +out: + LIBCFS_FREE(hdr, sizeof(*hdr)); + + return rc; +} - *incarnation = le64_to_cpu(hdr.msg.hello.incarnation); +static int +ksocknal_recv_hello_v2 (ksock_conn_t *conn, ksock_hello_msg_t *hello, int timeout) +{ + cfs_socket_t *sock = 
conn->ksnc_sock; + int rc; + int i; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + conn->ksnc_flip = 0; + else + conn->ksnc_flip = 1; - nips = __le32_to_cpu (hdr.payload_length) / sizeof (__u32); + rc = libcfs_sock_read(sock, &hello->kshm_src_nid, + offsetof(ksock_hello_msg_t, kshm_ips) - + offsetof(ksock_hello_msg_t, kshm_src_nid), + timeout); + if (rc != 0) { + CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n", + rc, HIPQUAD(conn->ksnc_ipaddr)); + LASSERT (rc < 0 && rc != -EALREADY); + return rc; + } - if (nips > SOCKNAL_MAX_INTERFACES || - nips * sizeof(__u32) != __le32_to_cpu (hdr.payload_length)) { - CERROR("Bad payload length %d from "LPX64"@%u.%u.%u.%u\n", - __le32_to_cpu (hdr.payload_length), - *nid, HIPQUAD(conn->ksnc_ipaddr)); + if (conn->ksnc_flip) { + __swab32s(&hello->kshm_src_pid); + __swab64s(&hello->kshm_src_nid); + __swab32s(&hello->kshm_dst_pid); + __swab64s(&hello->kshm_dst_nid); + __swab64s(&hello->kshm_src_incarnation); + __swab64s(&hello->kshm_dst_incarnation); + __swab32s(&hello->kshm_ctype); + __swab32s(&hello->kshm_nips); } - if (nips == 0) - return (0); + if (hello->kshm_nips > LNET_MAX_INTERFACES) { + CERROR("Bad nips %d from ip %u.%u.%u.%u\n", + hello->kshm_nips, HIPQUAD(conn->ksnc_ipaddr)); + return -EPROTO; + } - rc = ksocknal_sock_read (sock, ipaddrs, nips * sizeof(*ipaddrs)); + if (hello->kshm_nips == 0) + return 0; + + rc = libcfs_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); if (rc != 0) { - CERROR ("Error %d reading IPs from "LPX64"@%u.%u.%u.%u\n", - rc, *nid, HIPQUAD(conn->ksnc_ipaddr)); - return (rc); + CERROR ("Error %d reading IPs from ip %u.%u.%u.%u\n", + rc, HIPQUAD(conn->ksnc_ipaddr)); + LASSERT (rc < 0 && rc != -EALREADY); + return rc; + } + + for (i = 0; i < hello->kshm_nips; i++) { + if (conn->ksnc_flip) + __swab32s(&hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %u.%u.%u.%u\n", + i, HIPQUAD(conn->ksnc_ipaddr)); + return -EPROTO; + } } - for (i = 0; i < nips; i++) { - ipaddrs[i] = __le32_to_cpu(ipaddrs[i]); + return 0; +} - if (ipaddrs[i] == 0) { - CERROR("Zero IP[%d] from "LPX64"@%u.%u.%u.%u\n", - i, *nid, HIPQUAD(conn->ksnc_ipaddr)); - return (-EPROTO); - } +static void +ksocknal_pack_msg_v1(ksock_tx_t *tx) +{ + /* V1.x has no KSOCK_MSG_NOOP */ + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT(tx->tx_lnetmsg != NULL); + + tx->tx_iov[0].iov_base = (void *)&tx->tx_lnetmsg->msg_hdr; + tx->tx_iov[0].iov_len = sizeof(lnet_hdr_t); + + tx->tx_resid = tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(lnet_hdr_t); +} + +static void +ksocknal_pack_msg_v2(ksock_tx_t *tx) +{ + tx->tx_iov[0].iov_base = (void *)&tx->tx_msg; + + if (tx->tx_lnetmsg != NULL) { + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + + tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr; + tx->tx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_payload); + tx->tx_resid = tx->tx_nob = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_payload) + + tx->tx_lnetmsg->msg_len; + } else { + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + tx->tx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr); + tx->tx_resid = tx->tx_nob = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr); } + /* Don't checksum before start sending, because packet can be piggybacked with ACK */ +} + +static void +ksocknal_unpack_msg_v1(ksock_msg_t *msg) +{ + msg->ksm_type = KSOCK_MSG_LNET; + msg->ksm_csum = 0; + msg->ksm_zc_req_cookie = 0; + msg->ksm_zc_ack_cookie = 0; +} - return (nips); +static void 
+ksocknal_unpack_msg_v2(ksock_msg_t *msg) +{ + return; /* Do nothing */ } +ksock_protocol_t ksocknal_protocol_v1x = +{ + KSOCK_PROTO_V1, + ksocknal_send_hello_v1, + ksocknal_recv_hello_v1, + ksocknal_pack_msg_v1, + ksocknal_unpack_msg_v1 +}; + +ksock_protocol_t ksocknal_protocol_v2x = +{ + KSOCK_PROTO_V2, + ksocknal_send_hello_v2, + ksocknal_recv_hello_v2, + ksocknal_pack_msg_v2, + ksocknal_unpack_msg_v2 +}; + int -ksocknal_connect_peer (ksock_route_t *route, int type) +ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn, + lnet_nid_t peer_nid, ksock_hello_msg_t *hello) { - struct socket *sock; - int rc; - int port; - int may_retry; + /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ + ksock_net_t *net = (ksock_net_t *)ni->ni_data; + lnet_nid_t srcnid; - /* Iterate through reserved ports. When typed connections are - * used, we will need to bind to multiple ports, but we only know - * this at connect time. But, by that time we've already called - * bind() so we need a new socket. */ + LASSERT (0 <= hello->kshm_nips && hello->kshm_nips <= LNET_MAX_INTERFACES); - for (port = 1023; port > 512; --port) { + /* No need for getconnsock/putconnsock */ + LASSERT (!conn->ksnc_closing); + LASSERT (conn->ksnc_proto != NULL); - rc = ksocknal_lib_connect_sock(&sock, &may_retry, route, port); + srcnid = lnet_ptlcompat_srcnid(ni->ni_nid, peer_nid); - if (rc == 0) { - rc = ksocknal_create_conn(route, sock, type); - cfs_put_file(KSN_SOCK2FILE(sock)); - return rc; + hello->kshm_src_nid = srcnid; + hello->kshm_dst_nid = peer_nid; + hello->kshm_src_pid = the_lnet.ln_pid; + + hello->kshm_src_incarnation = net->ksnn_incarnation; + hello->kshm_ctype = conn->ksnc_type; + + return conn->ksnc_proto->pro_send_hello(conn, hello); +} + +int +ksocknal_invert_type(int type) +{ + switch (type) + { + case SOCKLND_CONN_ANY: + case SOCKLND_CONN_CONTROL: + return (type); + case SOCKLND_CONN_BULK_IN: + return SOCKLND_CONN_BULK_OUT; + case SOCKLND_CONN_BULK_OUT: + return SOCKLND_CONN_BULK_IN; + default: + return (SOCKLND_CONN_NONE); + } +} + +int +ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn, + ksock_hello_msg_t *hello, lnet_process_id_t *peerid, + __u64 *incarnation) +{ + cfs_socket_t *sock = conn->ksnc_sock; + int active; + int timeout; + int match = 0; + int rc; + ksock_protocol_t *proto; + lnet_process_id_t recv_id; + + active = (peerid->nid != LNET_NID_ANY); + timeout = active ? *ksocknal_tunables.ksnd_timeout : + lnet_acceptor_timeout(); + + rc = libcfs_sock_read(sock, &hello->kshm_magic, sizeof (hello->kshm_magic), timeout); + if (rc != 0) { + CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n", + rc, HIPQUAD(conn->ksnc_ipaddr)); + LASSERT (rc < 0 && rc != -EALREADY); + return rc; + } + + if (hello->kshm_magic != LNET_PROTO_MAGIC && + hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) && + hello->kshm_magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) { + /* Unexpected magic! */ + if (active || + the_lnet.ln_ptlcompat == 0) { + CERROR ("Bad magic(1) %#08x (%#08x expected) from " + "%u.%u.%u.%u\n", __cpu_to_le32 (hello->kshm_magic), + LNET_PROTO_TCP_MAGIC, + HIPQUAD(conn->ksnc_ipaddr)); + return -EPROTO; } - if (!may_retry) + /* When portals compatibility is set, I may be passed a new + * connection "blindly" by the acceptor, and I have to + * determine if my peer has sent an acceptor connection request + * or not. This isn't a 'hello', so I'll get the acceptor to + * look at it... 
 */
+                rc = lnet_accept(ni, sock, hello->kshm_magic);
+                if (rc != 0)
+                        return -EPROTO;
+
+                /* ...and if it's OK I'm back to looking for a 'hello'... */
+                rc = libcfs_sock_read(sock, &hello->kshm_magic,
+                                      sizeof (hello->kshm_magic), timeout);
+                if (rc != 0) {
+                        CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+                                rc, HIPQUAD(conn->ksnc_ipaddr));
+                        LASSERT (rc < 0 && rc != -EALREADY);
+                        return rc;
+                }
+
+                /* Only need to check V1.x magic */
+                if (hello->kshm_magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) {
+                        CERROR ("Bad magic(2) %#08x (%#08x expected) from "
+                                "%u.%u.%u.%u\n", __cpu_to_le32 (hello->kshm_magic),
+                                LNET_PROTO_TCP_MAGIC,
+                                HIPQUAD(conn->ksnc_ipaddr));
+                        return -EPROTO;
+                }
+        }
+
+        rc = libcfs_sock_read(sock, &hello->kshm_version,
+                              sizeof(hello->kshm_version), timeout);
+        if (rc != 0) {
+                CERROR ("Error %d reading HELLO from %u.%u.%u.%u\n",
+                        rc, HIPQUAD(conn->ksnc_ipaddr));
+                LASSERT (rc < 0 && rc != -EALREADY);
+                return rc;
+        }
+
+        proto = ksocknal_compat_protocol(hello);
+        if (proto == NULL) {
+                if (!active) {
+                        /* unknown protocol from peer, tell peer my protocol */
+                        conn->ksnc_proto = &ksocknal_protocol_v2x;
+                        hello->kshm_nips = 0;
+                        ksocknal_send_hello(ni, conn, ni->ni_nid, hello);
+                }
+
+                CERROR ("Unknown protocol version (%d.x expected)"
+                        " from %u.%u.%u.%u\n",
+                        conn->ksnc_proto->pro_version,
+                        HIPQUAD(conn->ksnc_ipaddr));
+
+                return -EPROTO;
+        }
+
+        if (conn->ksnc_proto == proto)
+                match = 1;
+
+        conn->ksnc_proto = proto;
+
+        /* receive the rest of hello message anyway */
+        rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout);
+        if (rc != 0) {
+                CERROR("Error %d reading or checking hello from %u.%u.%u.%u\n",
+                       rc, HIPQUAD(conn->ksnc_ipaddr));
+                return rc;
+        }
+
+        if (hello->kshm_src_nid == LNET_NID_ANY) {
+                CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY "
+                       "from %u.%u.%u.%u\n", HIPQUAD(conn->ksnc_ipaddr));
+                return -EPROTO;
+        }
+
+        if (conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+                /* Userspace NAL assigns peer process ID from socket */
+                recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG;
+                recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr);
+        } else {
+                recv_id.nid = hello->kshm_src_nid;
+
+                if (the_lnet.ln_ptlcompat > 1 && /* portals peers may exist */
+                    LNET_NIDNET(recv_id.nid) == 0) /* this is one */
+                        recv_id.pid = the_lnet.ln_pid; /* give it a sensible pid */
+                else
+                        recv_id.pid = hello->kshm_src_pid;
+        }
+
+        if (!active) {                          /* don't know peer's nid yet */
+                *peerid = recv_id;
+        } else if (peerid->pid != recv_id.pid ||
+                   !lnet_ptlcompat_matchnid(peerid->nid, recv_id.nid)) {
+                LCONSOLE_ERROR("Connected successfully to %s on host "
+                               "%u.%u.%u.%u, but they claimed they were "
+                               "%s; please check your Lustre "
+                               "configuration.\n",
+                               libcfs_id2str(*peerid),
+                               HIPQUAD(conn->ksnc_ipaddr),
+                               libcfs_id2str(recv_id));
+                return -EPROTO;
+        }
+
+        if (conn->ksnc_type == SOCKLND_CONN_NONE) {
+                /* I've accepted this connection; peer determines type */
+                conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype);
+                if (conn->ksnc_type == SOCKLND_CONN_NONE) {
+                        CERROR ("Unexpected type %d from %s ip %u.%u.%u.%u\n",
+                                hello->kshm_ctype, libcfs_id2str(*peerid),
+                                HIPQUAD(conn->ksnc_ipaddr));
+                        return -EPROTO;
+                }
+        } else if (hello->kshm_ctype == SOCKLND_CONN_NONE) {
+                if (match) {
+                        /* lost a connection race */
+                        return -EALREADY;
+                }
+                /* an unmatched protocol gets SOCKLND_CONN_NONE anyway */
+        } else if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) {
+                CERROR ("Mismatched types: me %d, %s ip %u.%u.%u.%u %d\n", +
conn->ksnc_type, libcfs_id2str(*peerid), + HIPQUAD(conn->ksnc_ipaddr), + hello->kshm_ctype); + return -EPROTO; } - CERROR("Out of ports trying to bind to a reserved port\n"); - return (-EADDRINUSE); + *incarnation = hello->kshm_src_incarnation; + + return 0; } void -ksocknal_autoconnect (ksock_route_t *route) +ksocknal_connect (ksock_route_t *route) { CFS_LIST_HEAD (zombies); - ksock_tx_t *tx; - ksock_peer_t *peer; - unsigned long flags; + ksock_peer_t *peer = route->ksnr_peer; int type; + int wanted; + cfs_socket_t *sock; + cfs_time_t deadline; + int retry_later = 0; int rc = 0; - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + deadline = cfs_time_add(cfs_time_current(), + cfs_time_seconds(*ksocknal_tunables.ksnd_timeout)); + + write_lock_bh (&ksocknal_data.ksnd_global_lock); + + LASSERT (route->ksnr_scheduled); + LASSERT (!route->ksnr_connecting); + + route->ksnr_connecting = 1; for (;;) { - if (!ksocknal_tunables.ksnd_typed_conns) { - if ((route->ksnr_connected & (1<ksnr_connected; + + /* stop connecting if peer/route got closed under me, or + * route got connected while queued */ + if (peer->ksnp_closing || route->ksnr_deleted || + wanted == 0) { + retry_later = 0; + break; + } + + /* reschedule if peer is connecting to me */ + if (peer->ksnp_accepting > 0) { + CDEBUG(D_NET, + "peer %s(%d) already connecting to me, retry later.\n", + libcfs_nid2str(peer->ksnp_id.nid), peer->ksnp_accepting); + retry_later = 1; + } + + if (retry_later) /* needs reschedule */ + break; + + if ((wanted & (1 << SOCKLND_CONN_ANY)) != 0) { + type = SOCKLND_CONN_ANY; + } else if ((wanted & (1 << SOCKLND_CONN_CONTROL)) != 0) { + type = SOCKLND_CONN_CONTROL; + } else if ((wanted & (1 << SOCKLND_CONN_BULK_IN)) != 0) { + type = SOCKLND_CONN_BULK_IN; } else { - if ((route->ksnr_connected & (1<ksnr_connected & (1<ksnr_connected & (1<ksnp_id.nid, + route->ksnr_ipaddr, + route->ksnr_port); + goto failed; + } + + rc = lnet_connect(&sock, peer->ksnp_id.nid, + route->ksnr_myipaddr, + route->ksnr_ipaddr, route->ksnr_port); if (rc != 0) goto failed; - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type); + + if (rc < 0) { + lnet_connect_console_error(rc, peer->ksnp_id.nid, + route->ksnr_ipaddr, + route->ksnr_port); + goto failed; + } + + /* rc == EALREADY means I lost a connection race and my + * peer is connecting to me. + * rc == EPROTO means my peer is speaking an older + * protocol version. 
*/ + LASSERT (rc == 0 || rc == EALREADY || rc == EPROTO); + + retry_later = rc != 0; + if (retry_later) + CDEBUG(D_NET, "peer %s: conn race, retry later.\n", + libcfs_nid2str(peer->ksnp_id.nid)); + + write_lock_bh (&ksocknal_data.ksnd_global_lock); } - LASSERT (route->ksnr_connecting); + route->ksnr_scheduled = 0; route->ksnr_connecting = 0; - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); - return; - failed: - switch (rc) { - /* "normal" errors */ - case -ECONNREFUSED: - LCONSOLE_ERROR("Connection was refused by host %u.%u.%u.%u on " - "port %d; check that Lustre is running on that " - "node.\n", - HIPQUAD(route->ksnr_ipaddr), route->ksnr_port); - break; - case -EHOSTUNREACH: - case -ENETUNREACH: - LCONSOLE_ERROR("Host %u.%u.%u.%u was unreachable; the network " - "or that node may be down, or Lustre may be " - "misconfigured.\n", - HIPQUAD(route->ksnr_ipaddr)); - break; - case -ETIMEDOUT: - LCONSOLE_ERROR("Connecting to host %u.%u.%u.%u on port %d took " - "too long; that node may be hung or " - "experiencing high load.\n", - HIPQUAD(route->ksnr_ipaddr), route->ksnr_port); - break; - /* errors that should be rare */ - case -EPROTO: - LCONSOLE_ERROR("Protocol error connecting to host %u.%u.%u.%u " - "on port %d: Is it running a compatible version" - " of Lustre?\n", - HIPQUAD(route->ksnr_ipaddr), route->ksnr_port); - break; - case -EADDRINUSE: - LCONSOLE_ERROR("No privileged ports available to connect to " - "host %u.%u.%u.%u on port %d\n", - HIPQUAD(route->ksnr_ipaddr), route->ksnr_port); - break; - default: - LCONSOLE_ERROR("Unexpected error %d connecting to " - "host %u.%u.%u.%u on port %d\n", rc, - HIPQUAD(route->ksnr_ipaddr), route->ksnr_port); - break; + if (retry_later) { + /* re-queue for attention; this frees me up to handle + * the peer's incoming connection request */ + ksocknal_launch_connection_locked(route); } - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + write_unlock_bh (&ksocknal_data.ksnd_global_lock); + return; + + failed: + write_lock_bh (&ksocknal_data.ksnd_global_lock); - peer = route->ksnr_peer; - LASSERT (route->ksnr_connecting); + route->ksnr_scheduled = 0; route->ksnr_connecting = 0; /* This is a retry rather than a new connection */ + route->ksnr_retry_interval *= 2; + route->ksnr_retry_interval = + MAX(route->ksnr_retry_interval, + cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000); + route->ksnr_retry_interval = + MIN(route->ksnr_retry_interval, + cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000); + LASSERT (route->ksnr_retry_interval != 0); route->ksnr_timeout = cfs_time_add(cfs_time_current(), route->ksnr_retry_interval); - route->ksnr_retry_interval = MIN (route->ksnr_retry_interval * 2, - SOCKNAL_MAX_RECONNECT_INTERVAL); - if (!list_empty (&peer->ksnp_tx_queue) && - ksocknal_find_connecting_route_locked (peer) == NULL) { + if (!list_empty(&peer->ksnp_tx_queue) && + peer->ksnp_accepting == 0 && + ksocknal_find_connecting_route_locked(peer) == NULL) { + /* ksnp_tx_queue is queued on a conn on successful + * connection */ LASSERT (list_empty (&peer->ksnp_conns)); - /* None of the connections that the blocked packets are - * waiting for have been successful. Complete them now... */ - do { - tx = list_entry (peer->ksnp_tx_queue.next, - ksock_tx_t, tx_list); - list_del (&tx->tx_list); - list_add_tail (&tx->tx_list, &zombies); - } while (!list_empty (&peer->ksnp_tx_queue)); + /* take all the blocked packets while I've got the lock and + * complete below... 
 */
+                list_add(&zombies, &peer->ksnp_tx_queue);
+                list_del_init(&peer->ksnp_tx_queue);
         }

 #if 0           /* irrelevant with only eager routes */
@@ -2083,154 +2451,180 @@ ksocknal_autoconnect (ksock_route_t *route)
                 list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
         }
 #endif
-        write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
-
-        while (!list_empty (&zombies)) {
-                char ipbuf[PTL_NALFMT_SIZE];
-                char ipbuf2[PTL_NALFMT_SIZE];
-                tx = list_entry (zombies.next, ksock_tx_t, tx_list);
-
-                CERROR ("Deleting packet type %d len %d ("LPX64" %s->"LPX64" %s)\n",
-                        le32_to_cpu (tx->tx_hdr->type),
-                        le32_to_cpu (tx->tx_hdr->payload_length),
-                        le64_to_cpu (tx->tx_hdr->src_nid),
-                        portals_nid2str(SOCKNAL,
-                                        le64_to_cpu(tx->tx_hdr->src_nid),
-                                        ipbuf),
-                        le64_to_cpu (tx->tx_hdr->dest_nid),
-                        portals_nid2str(SOCKNAL,
-                                        le64_to_cpu(tx->tx_hdr->src_nid),
-                                        ipbuf2));
+        write_unlock_bh (&ksocknal_data.ksnd_global_lock);

-                list_del (&tx->tx_list);
-                /* complete now */
-                ksocknal_tx_done (tx, 0);
-        }
+        ksocknal_peer_failed(peer);
+        ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1);
+}
+
+static inline int
+ksocknal_connd_connect_route_locked(void)
+{
+        /* Only handle an outgoing connection request if there is someone left
+         * to handle incoming connections */
+        return !list_empty(&ksocknal_data.ksnd_connd_routes) &&
+                ((ksocknal_data.ksnd_connd_connecting + 1) <
+                 *ksocknal_tunables.ksnd_nconnds);
+}
+
+static inline int
+ksocknal_connd_ready(void)
+{
+        int            rc;
+
+        spin_lock_bh (&ksocknal_data.ksnd_connd_lock);
+
+        rc = ksocknal_data.ksnd_shuttingdown ||
+             !list_empty(&ksocknal_data.ksnd_connd_connreqs) ||
+             ksocknal_connd_connect_route_locked();
+
+        spin_unlock_bh (&ksocknal_data.ksnd_connd_lock);
+
+        return rc;
+}

 int
-ksocknal_autoconnectd (void *arg)
+ksocknal_connd (void *arg)
 {
         long               id = (long)arg;
         char               name[16];
-        unsigned long      flags;
+        ksock_connreq_t   *cr;
         ksock_route_t     *route;
-        int                rc;

-        snprintf (name, sizeof (name), "ksocknal_ad%02ld", id);
-        kportal_daemonize (name);
-        kportal_blockallsigs ();
+        snprintf (name, sizeof (name), "socknal_cd%02ld", id);
+        cfs_daemonize (name);
+        cfs_block_allsigs ();

-        spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags);
+        spin_lock_bh (&ksocknal_data.ksnd_connd_lock);

         while (!ksocknal_data.ksnd_shuttingdown) {

-                if (!list_empty (&ksocknal_data.ksnd_autoconnectd_routes)) {
-                        route = list_entry (ksocknal_data.ksnd_autoconnectd_routes.next,
-                                            ksock_route_t, ksnr_connect_list);
+                if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) {
+                        /* Connection accepted by the listener */
+                        cr = list_entry(ksocknal_data.ksnd_connd_connreqs.next,
+                                        ksock_connreq_t, ksncr_list);
+
+                        list_del(&cr->ksncr_list);
+                        spin_unlock_bh (&ksocknal_data.ksnd_connd_lock);
+
+                        ksocknal_create_conn(cr->ksncr_ni, NULL,
+                                             cr->ksncr_sock, SOCKLND_CONN_NONE);
+                        lnet_ni_decref(cr->ksncr_ni);
+                        LIBCFS_FREE(cr, sizeof(*cr));
+
+                        spin_lock_bh (&ksocknal_data.ksnd_connd_lock);
+                }

-                        list_del (&route->ksnr_connect_list);
-                        spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags);
+                if (ksocknal_connd_connect_route_locked()) {
+                        /* Connection request */
+                        route = list_entry (ksocknal_data.ksnd_connd_routes.next,
+                                            ksock_route_t, ksnr_connd_list);

-                        ksocknal_autoconnect (route);
-                        ksocknal_put_route (route);
+                        list_del (&route->ksnr_connd_list);
+                        ksocknal_data.ksnd_connd_connecting++;
+                        spin_unlock_bh (&ksocknal_data.ksnd_connd_lock);

-                        spin_lock_irqsave(&ksocknal_data.ksnd_autoconnectd_lock,
-                                          flags);
-                        continue;
+                        ksocknal_connect (route);
+                        ksocknal_route_decref(route);
+
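The "+ 1" in ksocknal_connd_connect_route_locked() above is easy to misread, so here is a minimal standalone illustration (not part of the patch; `connecting`, `nconnds` and `can_connect_route` are invented stand-ins for ksnd_connd_connecting, *ksnd_nconnds and the real predicate, and the route-list test is omitted). The invariant it enforces: a daemon may start an outgoing connect only while at least one other daemon stays free to service incoming connection requests.

    #include <assert.h>

    static int connecting;          /* stands in for ksnd_connd_connecting */
    static int nconnds = 4;         /* stands in for *ksnd_nconnds */

    static int can_connect_route(void)
    {
            /* keep one daemon in reserve for incoming connection requests */
            return (connecting + 1) < nconnds;
    }

    int main(void)
    {
            connecting = 0;
            assert(can_connect_route());    /* 0 + 1 < 4: connect allowed */
            connecting = 3;
            assert(!can_connect_route());   /* 3 + 1 == 4: last daemon stays free */
            return 0;
    }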
+                        spin_lock_bh (&ksocknal_data.ksnd_connd_lock);
+                        ksocknal_data.ksnd_connd_connecting--;
                 }

-                spin_unlock_irqrestore(&ksocknal_data.ksnd_autoconnectd_lock,
-                                       flags);
+                spin_unlock_bh (&ksocknal_data.ksnd_connd_lock);

-                rc = wait_event_interruptible(ksocknal_data.ksnd_autoconnectd_waitq,
-                                              ksocknal_data.ksnd_shuttingdown ||
-                                              !list_empty(&ksocknal_data.ksnd_autoconnectd_routes));
+                wait_event_interruptible_exclusive(
+                        ksocknal_data.ksnd_connd_waitq,
+                        ksocknal_connd_ready());

-                spin_lock_irqsave(&ksocknal_data.ksnd_autoconnectd_lock, flags);
+                spin_lock_bh (&ksocknal_data.ksnd_connd_lock);
         }

-        spin_unlock_irqrestore (&ksocknal_data.ksnd_autoconnectd_lock, flags);
+        spin_unlock_bh (&ksocknal_data.ksnd_connd_lock);

         ksocknal_thread_fini ();
         return (0);
 }

 ksock_conn_t *
-ksocknal_find_timed_out_conn (ksock_peer_t *peer) 
+ksocknal_find_timed_out_conn (ksock_peer_t *peer)
 {
         /* We're called with a shared lock on ksnd_global_lock */
         ksock_conn_t      *conn;
         struct list_head  *ctmp;

         list_for_each (ctmp, &peer->ksnp_conns) {
+                int     error;
                 conn = list_entry (ctmp, ksock_conn_t, ksnc_list);

-                /* Don't need the {get,put}connsock dance to deref ksnc_sock... */
+                /* Don't need the {get,put}connsock dance to deref ksnc_sock */
                 LASSERT (!conn->ksnc_closing);

-                if (SOCK_ERROR(conn->ksnc_sock) != 0) {
-                        atomic_inc (&conn->ksnc_refcount);
+                /* SOCK_ERROR will reset the socket's error code on
+                 * some platforms (like Darwin 8.x) */
+                error = SOCK_ERROR(conn->ksnc_sock);
+                if (error != 0) {
+                        ksocknal_conn_addref(conn);

-                        switch (SOCK_ERROR(conn->ksnc_sock)) {
+                        switch (error) {
                         case ECONNRESET:
-                                LCONSOLE_WARN("A connection with %u.%u.%u.%u "
-                                              "was reset; they may have "
-                                              "rebooted.\n",
-                                              HIPQUAD(conn->ksnc_ipaddr));
+                                CDEBUG(D_NETERROR, "A connection with %s "
+                                       "(%u.%u.%u.%u:%d) was reset; "
+                                       "it may have rebooted.\n",
+                                       libcfs_id2str(peer->ksnp_id),
+                                       HIPQUAD(conn->ksnc_ipaddr),
+                                       conn->ksnc_port);
                                 break;
                         case ETIMEDOUT:
-                                LCONSOLE_WARN("A connection with %u.%u.%u.%u "
-                                              "timed out; the network or that "
-                                              "node may be down.\n",
-                                              HIPQUAD(conn->ksnc_ipaddr));
+                                CDEBUG(D_NETERROR, "A connection with %s "
+                                       "(%u.%u.%u.%u:%d) timed out; the "
+                                       "network or node may be down.\n",
+                                       libcfs_id2str(peer->ksnp_id),
+                                       HIPQUAD(conn->ksnc_ipaddr),
+                                       conn->ksnc_port);
                                 break;
                         default:
-                                LCONSOLE_WARN("An unexpected network error "
-                                              "occurred with %u.%u.%u.%u: %d\n",
-                                              HIPQUAD(conn->ksnc_ipaddr),
-                                              SOCK_ERROR(conn->ksnc_sock));
+                                CDEBUG(D_NETERROR, "An unexpected network error %d "
+                                       "occurred with %s "
+                                       "(%u.%u.%u.%u:%d)\n", error,
+                                       libcfs_id2str(peer->ksnp_id),
+                                       HIPQUAD(conn->ksnc_ipaddr),
+                                       conn->ksnc_port);
                                 break;
                         }

-                        /* Something (e.g. failed keepalive) set the socket error */
-                        CDEBUG(D_HA,"Socket error %d: "LPX64" %p %d.%d.%d.%d\n",
-                               SOCK_ERROR(conn->ksnc_sock), peer->ksnp_nid,
-                               conn, HIPQUAD(conn->ksnc_ipaddr));
-
                         return (conn);
                 }

                 if (conn->ksnc_rx_started &&
-                    cfs_time_aftereq (cfs_time_current(),
-                                      conn->ksnc_rx_deadline)) {
+                    cfs_time_aftereq(cfs_time_current(),
+                                     conn->ksnc_rx_deadline)) {
                         /* Timed out incomplete incoming message */
-                        atomic_inc (&conn->ksnc_refcount);
-                        LCONSOLE_ERROR("A timeout occurred receiving data from "
-                                       "%u.%u.%u.%u; the network or that node "
-                                       "may be down.\n",
-                                       HIPQUAD(conn->ksnc_ipaddr));
-                        CERROR ("Timed out RX from "LPX64" %p %d.%d.%d.%d\n",
-                                peer->ksnp_nid,conn,HIPQUAD(conn->ksnc_ipaddr));
+                        ksocknal_conn_addref(conn);
+                        CDEBUG(D_NETERROR, "Timeout receiving from %s "
+                               "(%u.%u.%u.%u:%d), state %d wanted %d left %d\n",
+                               libcfs_id2str(peer->ksnp_id),
+                               HIPQUAD(conn->ksnc_ipaddr),
+                               conn->ksnc_port,
+                               conn->ksnc_rx_state,
+                               conn->ksnc_rx_nob_wanted,
+                               conn->ksnc_rx_nob_left);
                         return (conn);
                 }

-                if ((!list_empty (&conn->ksnc_tx_queue) ||
+                if ((!list_empty(&conn->ksnc_tx_queue) ||
                      SOCK_WMEM_QUEUED(conn->ksnc_sock) != 0) &&
-                    cfs_time_aftereq (cfs_time_current(),
-                                      conn->ksnc_tx_deadline)) {
+                    cfs_time_aftereq(cfs_time_current(),
+                                     conn->ksnc_tx_deadline)) {
                         /* Timed out messages queued for sending or
                          * buffered in the socket's send buffer */
-                        atomic_inc (&conn->ksnc_refcount);
-                        LCONSOLE_ERROR("A timeout occurred sending data to "
-                                       "%u.%u.%u.%u; the network or that node "
-                                       "may be down.\n",
-                                       HIPQUAD(conn->ksnc_ipaddr));
-                        CERROR ("Timed out TX to "LPX64" %s%d %p %d.%d.%d.%d\n",
-                                peer->ksnp_nid,
-                                list_empty (&conn->ksnc_tx_queue) ? "" : "Q ",
-                                SOCK_WMEM_QUEUED(conn->ksnc_sock), conn,
-                                HIPQUAD(conn->ksnc_ipaddr));
+                        ksocknal_conn_addref(conn);
+                        CDEBUG(D_NETERROR, "Timeout sending data to %s "
+                               "(%u.%u.%u.%u:%d); the network or that "
+                               "node may be down.\n",
+                               libcfs_id2str(peer->ksnp_id),
+                               HIPQUAD(conn->ksnc_ipaddr),
+                               conn->ksnc_port);
                         return (conn);
                 }
         }

@@ -2259,15 +2653,12 @@ ksocknal_check_peer_timeouts (int idx)

                 if (conn != NULL) {
                         read_unlock (&ksocknal_data.ksnd_global_lock);

-                        CERROR("Timeout out conn->"LPX64" ip %d.%d.%d.%d:%d\n",
-                               peer->ksnp_nid, HIPQUAD(conn->ksnc_ipaddr),
-                               conn->ksnc_port);
                         ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);

                         /* NB we won't find this one again, but we can't
                          * just proceed with the next peer, since we dropped
                          * ksnd_global_lock and it might be dead already! */
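The rx/tx deadline tests above depend on cfs_time_aftereq() comparing timestamps wrap-safely; this sketch assumes it has jiffies-style semantics like the kernel's time_after_eq(). A minimal standalone illustration (not part of the patch; `tick_t` and `time_aftereq_sketch` are invented names, not the libcfs API):

    #include <assert.h>

    typedef unsigned long tick_t;           /* stand-in for cfs_time_t */

    /* wrap-safe "a >= b": the signed difference survives counter wrap-around */
    static int time_aftereq_sketch(tick_t a, tick_t b)
    {
            return (long)(a - b) >= 0;
    }

    int main(void)
    {
            tick_t deadline = (tick_t)-10;  /* set just before the counter wraps */
            tick_t now      = 5;            /* sampled shortly after the wrap */

            /* a naive "now >= deadline" would be false here; the wrap-safe
             * form correctly reports that the deadline has passed */
            assert(time_aftereq_sketch(now, deadline));
            assert(!time_aftereq_sketch(deadline, now));
            return 0;
    }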
-                        ksocknal_put_conn (conn);
+                        ksocknal_conn_decref(conn);
                         goto again;
                 }
         }

@@ -2279,7 +2670,6 @@ int
 ksocknal_reaper (void *arg)
 {
         cfs_waitlink_t     wait;
-        unsigned long      flags;
         ksock_conn_t      *conn;
         ksock_sched_t     *sched;
         struct list_head   enomem_conns;

@@ -2289,13 +2679,13 @@ ksocknal_reaper (void *arg)
         int                peer_index = 0;
         cfs_time_t         deadline = cfs_time_current();

-        kportal_daemonize ("ksocknal_reaper");
-        kportal_blockallsigs ();
+        cfs_daemonize ("socknal_reaper");
+        cfs_block_allsigs ();

         CFS_INIT_LIST_HEAD(&enomem_conns);
         cfs_waitlink_init (&wait);

-        spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+        spin_lock_bh (&ksocknal_data.ksnd_reaper_lock);

         while (!ksocknal_data.ksnd_shuttingdown) {

@@ -2303,13 +2693,13 @@ ksocknal_reaper (void *arg)
                         conn = list_entry (ksocknal_data.ksnd_deathrow_conns.next,
                                            ksock_conn_t, ksnc_list);
                         list_del (&conn->ksnc_list);
-
-                        spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+
+                        spin_unlock_bh (&ksocknal_data.ksnd_reaper_lock);

                         ksocknal_terminate_conn (conn);
-                        ksocknal_put_conn (conn);
+                        ksocknal_conn_decref(conn);

-                        spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+                        spin_lock_bh (&ksocknal_data.ksnd_reaper_lock);
                         continue;
                 }

@@ -2317,12 +2707,12 @@ ksocknal_reaper (void *arg)
                         conn = list_entry (ksocknal_data.ksnd_zombie_conns.next,
                                            ksock_conn_t, ksnc_list);
                         list_del (&conn->ksnc_list);
-
-                        spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+
+                        spin_unlock_bh (&ksocknal_data.ksnd_reaper_lock);

                         ksocknal_destroy_conn (conn);

-                        spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags);
+                        spin_lock_bh (&ksocknal_data.ksnd_reaper_lock);
                         continue;
                 }

@@ -2331,7 +2721,7 @@ ksocknal_reaper (void *arg)
                         list_del_init(&ksocknal_data.ksnd_enomem_conns);
                 }

-                spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
+                spin_unlock_bh (&ksocknal_data.ksnd_reaper_lock);

                 /* reschedule all the connections that stalled with ENOMEM... */
                 nenomem_conns = 0;
@@ -2342,14 +2732,14 @@ ksocknal_reaper (void *arg)

                         sched = conn->ksnc_scheduler;

-                        spin_lock_irqsave (&sched->kss_lock, flags);
+                        spin_lock_bh (&sched->kss_lock);

                         LASSERT (conn->ksnc_tx_scheduled);
                         conn->ksnc_tx_ready = 1;
-                        list_add_tail(&conn->ksnc_tx_list,&sched->kss_tx_conns);
+                        list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns);
                         cfs_waitq_signal (&sched->kss_waitq);

-                        spin_unlock_irqrestore (&sched->kss_lock, flags);
+                        spin_unlock_bh (&sched->kss_lock);
                         nenomem_conns++;
                 }

@@ -2367,9 +2757,9 @@ ksocknal_reaper (void *arg)
                  * timeout on any connection within (n+1)/n times the
                  * timeout interval.
*/ - if (ksocknal_tunables.ksnd_io_timeout > n * p) + if (*ksocknal_tunables.ksnd_timeout > n * p) chunk = (chunk * n * p) / - ksocknal_tunables.ksnd_io_timeout; + *ksocknal_tunables.ksnd_timeout; if (chunk == 0) chunk = 1; @@ -2397,25 +2787,16 @@ ksocknal_reaper (void *arg) if (!ksocknal_data.ksnd_shuttingdown && list_empty (&ksocknal_data.ksnd_deathrow_conns) && list_empty (&ksocknal_data.ksnd_zombie_conns)) - cfs_waitq_timedwait (&wait, timeout); + cfs_waitq_timedwait (&wait, CFS_TASK_INTERRUPTIBLE, timeout); set_current_state (TASK_RUNNING); cfs_waitq_del (&ksocknal_data.ksnd_reaper_waitq, &wait); - spin_lock_irqsave (&ksocknal_data.ksnd_reaper_lock, flags); + spin_lock_bh (&ksocknal_data.ksnd_reaper_lock); } - spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); + spin_unlock_bh (&ksocknal_data.ksnd_reaper_lock); ksocknal_thread_fini (); return (0); } - -lib_nal_t ksocknal_lib = { - libnal_data: &ksocknal_data, /* NAL private data */ - libnal_send: ksocknal_send, - libnal_send_pages: ksocknal_send_pages, - libnal_recv: ksocknal_recv, - libnal_recv_pages: ksocknal_recv_pages, - libnal_dist: ksocknal_dist -}; diff --git a/lnet/klnds/socklnd/socklnd_lib-darwin.c b/lnet/klnds/socklnd/socklnd_lib-darwin.c index ada5b64..25d6b45 100644 --- a/lnet/klnds/socklnd/socklnd_lib-darwin.c +++ b/lnet/klnds/socklnd/socklnd_lib-darwin.c @@ -28,118 +28,122 @@ #include #include -#include "socknal.h" +#include "socklnd.h" -#if 0 -#undef SOCKNAL_SINGLE_FRAG_TX -#define SOCKNAL_SINGLE_FRAG_TX 1 -#undef SOCKNAL_SINGLE_FRAG_RX -#define SOCKNAL_SINGLE_FRAG_RX 1 -#endif +# if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM -SYSCTL_DECL(_portals); +SYSCTL_DECL(_lnet); -SYSCTL_NODE (_portals, OID_AUTO, ksocknal, CTLFLAG_RW, - 0, "ksocknal_sysctl"); +SYSCTL_NODE (_lnet, OID_AUTO, ksocknal, CTLFLAG_RW, + 0, "ksocknal_sysctl"); -SYSCTL_INT(_portals_ksocknal, OID_AUTO, timeout, - CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_io_timeout, +SYSCTL_INT(_lnet_ksocknal, OID_AUTO, timeout, + CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_timeout, 0, "timeout"); -SYSCTL_INT(_portals_ksocknal, OID_AUTO, eager_ack, +SYSCTL_INT(_lnet_ksocknal, OID_AUTO, credits, + CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_credits, + 0, "credits"); +SYSCTL_INT(_lnet_ksocknal, OID_AUTO, peer_credits, + CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_peercredits, + 0, "peer_credits"); +SYSCTL_INT(_lnet_ksocknal, OID_AUTO, nconnds, + CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_nconnds, + 0, "nconnds"); +SYSCTL_INT(_lnet_ksocknal, OID_AUTO, min_reconnectms, + CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_min_reconnectms, + 0, "min_reconnectms"); +SYSCTL_INT(_lnet_ksocknal, OID_AUTO, max_reconnectms, + CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_max_reconnectms, + 0, "max_reconnectms"); +SYSCTL_INT(_lnet_ksocknal, OID_AUTO, eager_ack, CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_eager_ack, 0, "eager_ack"); -SYSCTL_INT(_portals_ksocknal, OID_AUTO, typed, +SYSCTL_INT(_lnet_ksocknal, OID_AUTO, typed, CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_typed_conns, 0, "typed"); -SYSCTL_INT(_portals_ksocknal, OID_AUTO, min_bulk, +SYSCTL_INT(_lnet_ksocknal, OID_AUTO, min_bulk, CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_min_bulk, 0, "min_bulk"); -SYSCTL_INT(_portals_ksocknal, OID_AUTO, buffer_size, - CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_buffer_size, - 0, "buffer_size"); -SYSCTL_INT(_portals_ksocknal, OID_AUTO, nagle, +SYSCTL_INT(_lnet_ksocknal, OID_AUTO, rx_buffer_size, + CTLTYPE_INT | 
CTLFLAG_RW , &ksocknal_tunables.ksnd_rx_buffer_size, + 0, "rx_buffer_size"); +SYSCTL_INT(_lnet_ksocknal, OID_AUTO, tx_buffer_size, + CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_tx_buffer_size, + 0, "tx_buffer_size"); +SYSCTL_INT(_lnet_ksocknal, OID_AUTO, nagle, CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_nagle, 0, "nagle"); +SYSCTL_INT(_lnet_ksocknal, OID_AUTO, keepalive_idle, + CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_keepalive_idle, + 0, "keepalive_idle"); +SYSCTL_INT(_lnet_ksocknal, OID_AUTO, keepalive_count, + CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_keepalive_count, + 0, "keepalive_count"); +SYSCTL_INT(_lnet_ksocknal, OID_AUTO, keepalive_intvl, + CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_keepalive_intvl, + 0, "keepalive_intvl"); cfs_sysctl_table_t ksocknal_top_ctl_table [] = { - &sysctl__portals_ksocknal, - &sysctl__portals_ksocknal_timeout, - &sysctl__portals_ksocknal_eager_ack, - &sysctl__portals_ksocknal_typed, - &sysctl__portals_ksocknal_min_bulk, - &sysctl__portals_ksocknal_buffer_size, - &sysctl__portals_ksocknal_nagle, + &sysctl__lnet_ksocknal, + &sysctl__lnet_ksocknal_timeout, + &sysctl__lnet_ksocknal_credits, + &sysctl__lnet_ksocknal_peer_credits, + &sysctl__lnet_ksocknal_nconnds, + &sysctl__lnet_ksocknal_min_reconnectms, + &sysctl__lnet_ksocknal_max_reconnectms, + &sysctl__lnet_ksocknal_eager_ack, + &sysctl__lnet_ksocknal_typed, + &sysctl__lnet_ksocknal_min_bulk, + &sysctl__lnet_ksocknal_rx_buffer_size, + &sysctl__lnet_ksocknal_tx_buffer_size, + &sysctl__lnet_ksocknal_nagle, + &sysctl__lnet_ksocknal_keepalive_idle, + &sysctl__lnet_ksocknal_keepalive_count, + &sysctl__lnet_ksocknal_keepalive_intvl, NULL }; -static unsigned long ksocknal_mbuf_size = (u_quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); - -struct socket * -sockfd_lookup(int fd, void *foo) +int +ksocknal_lib_tunables_init () { - struct socket *so; - struct file *fp; - CFS_DECL_FUNNEL_DATA; + ksocknal_tunables.ksnd_sysctl = + cfs_register_sysctl_table (ksocknal_top_ctl_table, 0); - CFS_NET_IN; - getsock(current_proc()->p_fd, fd, &fp); - CFS_NET_EX; - so = (struct socket *)fp->f_data; - so->reserved4 = fp; - CFS_CONE_IN; - fref(fp); - CFS_CONE_EX; - return so; -} + if (ksocknal_tunables.ksnd_sysctl == NULL) + return -ENOMEM; -extern struct fileops socketops; + return 0; +} -static int -sock_map_fd (struct socket *so) +void +ksocknal_lib_tunables_fini () { - struct file *fp; - int fd; - CFS_DECL_FUNNEL_DATA; - - CFS_CONE_IN; - falloc(current_proc(), &fp, &fd); - fp->f_flag = FREAD|FWRITE; - fp->f_type = DTYPE_SOCKET; - fp->f_ops = &socketops; - fp->f_data = (caddr_t)so; - so->reserved4 = fp; - *fdflags(current_proc(), fd) &= ~UF_RESERVED; - CFS_CONE_EX; - - return fd; + if (ksocknal_tunables.ksnd_sysctl != NULL) + cfs_unregister_sysctl_table (ksocknal_tunables.ksnd_sysctl); } - -static void -sock_release(struct socket *so) +#else +int +ksocknal_lib_tunables_init () { - struct file *fp; - CFS_DECL_FUNNEL_DATA; - - fp = (struct file *)so->reserved4; - so->reserved4 = NULL; - fp->f_data = NULL; - CFS_CONE_IN; - frele(fp); - CFS_CONE_EX; - CFS_NET_IN; - soshutdown(so, 0); - CFS_NET_EX; + return 0; } -static void -sock_fdrelse(int fd) -{ - CFS_DECL_FUNNEL_DATA; - - CFS_CONE_IN; - fdrelse(current_proc(), fd); - CFS_CONE_EX; +void +ksocknal_lib_tunables_fini () +{ } +#endif + +/* + * To use bigger buffer for socket: + * 1. Increase nmbclusters (Cannot increased by sysctl because it's ready only, so + * we must patch kernel). + * 2. Increase net.inet.tcp.reass.maxsegments + * 3. 
Increase net.inet.tcp.sendspace + * 4. Increase net.inet.tcp.recvspace + * 5. Increase kern.ipc.maxsockbuf + */ +#define KSOCKNAL_MAX_BUFFER (1152*1024) void ksocknal_lib_bind_irq (unsigned int irq) @@ -148,7 +152,7 @@ ksocknal_lib_bind_irq (unsigned int irq) } unsigned int -ksocknal_lib_sock_irq (struct socket *sock) +ksocknal_lib_sock_irq (cfs_socket_t *sock) { return 0; } @@ -156,46 +160,374 @@ ksocknal_lib_sock_irq (struct socket *sock) int ksocknal_lib_get_conn_addrs (ksock_conn_t *conn) { - struct sockaddr_in *sin; - struct sockaddr *sa; - int rc; - CFS_DECL_NET_DATA; + int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1, + &conn->ksnc_ipaddr, + &conn->ksnc_port); - CFS_NET_IN; - rc = conn->ksnc_sock->so_proto->pr_usrreqs->pru_peeraddr(conn->ksnc_sock, &sa); - LASSERT (!conn->ksnc_closing); - if (rc != 0) { - CFS_NET_EX; - if (sa) FREE(sa, M_SONAME); - CERROR ("Error %d getting sock peer IP\n", rc); - return rc; + /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ + LASSERT (!conn->ksnc_closing); + + if (rc != 0) { + CERROR ("Error %d getting sock peer IP\n", rc); + return rc; + } + + rc = libcfs_sock_getaddr(conn->ksnc_sock, 0, + &conn->ksnc_myipaddr, NULL); + if (rc != 0) { + CERROR ("Error %d getting sock local IP\n", rc); + return rc; + } + + return 0; +} + +#ifdef __DARWIN8__ + +int +ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) +{ + socket_t sock = C2B_SOCK(conn->ksnc_sock); + size_t sndlen; + int nob; + int rc; + +#if SOCKNAL_SINGLE_FRAG_TX + struct iovec scratch; + struct iovec *scratchiov = &scratch; + unsigned int niov = 1; +#else + struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; + unsigned int niov = tx->tx_niov; +#endif + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = scratchiov, + .msg_iovlen = niov, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = MSG_DONTWAIT + }; + + int i; + + for (nob = i = 0; i < niov; i++) { + scratchiov[i] = tx->tx_iov[i]; + nob += scratchiov[i].iov_len; } - sin = (struct sockaddr_in *)sa; - conn->ksnc_ipaddr = ntohl (sin->sin_addr.s_addr); - conn->ksnc_port = ntohs (sin->sin_port); - if (sa) FREE(sa, M_SONAME); - rc = conn->ksnc_sock->so_proto->pr_usrreqs->pru_sockaddr(conn->ksnc_sock, &sa); - CFS_NET_EX; - if (rc != 0) { - if (sa) FREE(sa, M_SONAME); - CERROR ("Error %d getting sock local IP\n", rc); - return rc; + + /* + * XXX Liang: + * Linux has MSG_MORE, do we have anything to + * reduce number of partial TCP segments sent? + */ + rc = -sock_send(sock, &msg, MSG_DONTWAIT, &sndlen); + if (rc == 0) + rc = sndlen; + return rc; +} + +int +ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) +{ + socket_t sock = C2B_SOCK(conn->ksnc_sock); + lnet_kiov_t *kiov = tx->tx_kiov; + int rc; + int nob; + size_t sndlen; + +#if SOCKNAL_SINGLE_FRAG_TX + struct iovec scratch; + struct iovec *scratchiov = &scratch; + unsigned int niov = 1; +#else + struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; + unsigned int niov = tx->tx_nkiov; +#endif + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = scratchiov, + .msg_iovlen = niov, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = MSG_DONTWAIT + }; + + int i; + + for (nob = i = 0; i < niov; i++) { + scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) + + kiov[i].kiov_offset; + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + } + + /* + * XXX Liang: + * Linux has MSG_MORE, do wen have anyting to + * reduce number of partial TCP segments sent? 
+ */ + rc = -sock_send(sock, &msg, MSG_DONTWAIT, &sndlen); + for (i = 0; i < niov; i++) + cfs_kunmap(kiov[i].kiov_page); + if (rc == 0) + rc = sndlen; + return rc; +} + +int +ksocknal_lib_recv_iov (ksock_conn_t *conn) +{ +#if SOCKNAL_SINGLE_FRAG_RX + struct iovec scratch; + struct iovec *scratchiov = &scratch; + unsigned int niov = 1; +#else + struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; + unsigned int niov = conn->ksnc_rx_niov; +#endif + struct iovec *iov = conn->ksnc_rx_iov; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = scratchiov, + .msg_iovlen = niov, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + size_t rcvlen; + int nob; + int i; + int rc; + + LASSERT (niov > 0); + + for (nob = i = 0; i < niov; i++) { + scratchiov[i] = iov[i]; + nob += scratchiov[i].iov_len; + } + LASSERT (nob <= conn->ksnc_rx_nob_wanted); + rc = -sock_receive (C2B_SOCK(conn->ksnc_sock), &msg, MSG_DONTWAIT, &rcvlen); + if (rc == 0) + rc = rcvlen; + + return rc; +} + +int +ksocknal_lib_recv_kiov (ksock_conn_t *conn) +{ +#if SOCKNAL_SINGLE_FRAG_RX + struct iovec scratch; + struct iovec *scratchiov = &scratch; + unsigned int niov = 1; +#else + struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; + unsigned int niov = conn->ksnc_rx_nkiov; +#endif + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = scratchiov, + .msg_iovlen = niov, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + int nob; + int i; + size_t rcvlen; + int rc; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + for (nob = i = 0; i < niov; i++) { + scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) + \ + kiov[i].kiov_offset; + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + } + LASSERT (nob <= conn->ksnc_rx_nob_wanted); + rc = -sock_receive(C2B_SOCK(conn->ksnc_sock), &msg, MSG_DONTWAIT, &rcvlen); + for (i = 0; i < niov; i++) + cfs_kunmap(kiov[i].kiov_page); + if (rc == 0) + rc = rcvlen; + return (rc); +} + +void +ksocknal_lib_eager_ack (ksock_conn_t *conn) +{ + /* XXX Liang: */ +} + +int +ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) +{ + socket_t sock = C2B_SOCK(conn->ksnc_sock); + int len; + int rc; + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT (conn->ksnc_closing); + *txmem = *rxmem = *nagle = 0; + return (-ESHUTDOWN); + } + rc = libcfs_sock_getbuf(conn->ksnc_sock, txmem, rxmem); + if (rc == 0) { + len = sizeof(*nagle); + rc = -sock_getsockopt(sock, IPPROTO_TCP, TCP_NODELAY, + nagle, &len); + } + ksocknal_connsock_decref(conn); + + if (rc == 0) + *nagle = !*nagle; + else + *txmem = *rxmem = *nagle = 0; + + return (rc); +} + +int +ksocknal_lib_setup_sock (cfs_socket_t *sock) +{ + int rc; + int option; + int keep_idle; + int keep_intvl; + int keep_count; + int do_keepalive; + socket_t so = C2B_SOCK(sock); + struct linger linger; + + /* Ensure this socket aborts active sends immediately when we close + * it. 
*/ + linger.l_onoff = 0; + linger.l_linger = 0; + rc = -sock_setsockopt(so, SOL_SOCKET, SO_LINGER, &linger, sizeof(linger)); + if (rc != 0) { + CERROR ("Can't set SO_LINGER: %d\n", rc); + return (rc); + } + + if (!*ksocknal_tunables.ksnd_nagle) { + option = 1; + rc = -sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &option, sizeof(option)); + if (rc != 0) { + CERROR ("Can't disable nagle: %d\n", rc); + return (rc); + } } - conn->ksnc_myipaddr = ntohl (sin->sin_addr.s_addr); - return 0; + rc = libcfs_sock_setbuf(sock, + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size); + if (rc != 0) { + CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n", + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size, rc); + return (rc); + } + + /* snapshot tunables */ + keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; + keep_count = *ksocknal_tunables.ksnd_keepalive_count; + keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; + + do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); + option = (do_keepalive ? 1 : 0); + + rc = -sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &option, sizeof(option)); + if (rc != 0) { + CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); + return (rc); + } + + if (!do_keepalive) + return (rc); + rc = -sock_setsockopt(so, IPPROTO_TCP, TCP_KEEPALIVE, + &keep_idle, sizeof(keep_idle)); + + return (rc); +} + +void +ksocknal_lib_push_conn(ksock_conn_t *conn) +{ + socket_t sock; + int val = 1; + int rc; + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) /* being shut down */ + return; + sock = C2B_SOCK(conn->ksnc_sock); + + rc = -sock_setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)); + LASSERT(rc == 0); + + ksocknal_connsock_decref(conn); + return; +} + +extern void ksocknal_read_callback (ksock_conn_t *conn); +extern void ksocknal_write_callback (ksock_conn_t *conn); + +static void +ksocknal_upcall(socket_t so, void *arg, int waitf) +{ + ksock_conn_t *conn = (ksock_conn_t *)arg; + ENTRY; + + read_lock (&ksocknal_data.ksnd_global_lock); + if (conn == NULL) + goto out; + + ksocknal_read_callback (conn); + /* XXX Liang */ + ksocknal_write_callback (conn); +out: + read_unlock (&ksocknal_data.ksnd_global_lock); + EXIT; +} + +void +ksocknal_lib_save_callback(cfs_socket_t *sock, ksock_conn_t *conn) +{ + /* No callback need to save in osx */ + return; +} + +void +ksocknal_lib_set_callback(cfs_socket_t *sock, ksock_conn_t *conn) +{ + libcfs_sock_set_cb(sock, ksocknal_upcall, (void *)conn); + return; +} + +void +ksocknal_lib_reset_callback(cfs_socket_t *sock, ksock_conn_t *conn) +{ + libcfs_sock_reset_cb(sock); } +#else /* !__DARWIN8__ */ + int ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) { #if SOCKNAL_SINGLE_FRAG_TX struct iovec scratch; struct iovec *scratchiov = &scratch; - int niov = 1; + unsigned int niov = 1; #else struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; - int niov = tx->tx_niov; + unsigned int niov = tx->tx_niov; #endif struct socket *sock = conn->ksnc_sock; int nob; @@ -248,13 +580,13 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) #if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK struct iovec scratch; struct iovec *scratchiov = &scratch; - int niov = 1; + unsigned int niov = 1; #else struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; - int niov = tx->tx_nkiov; + unsigned int niov = tx->tx_nkiov; #endif struct socket *sock = conn->ksnc_sock; - ptl_kiov_t *kiov = tx->tx_kiov; + lnet_kiov_t *kiov = tx->tx_kiov; int nob; int rc; int i; @@ -364,6 
+696,10 @@ ksocknal_lib_eager_ack (ksock_conn_t *conn) CFS_NET_IN; s = splnet(); + /* + * No TCP_QUICKACK supported in BSD, so I have to call tcp_fasttimo + * to send immediate ACK. + */ if (tp && tp->t_flags & TF_DELACK){ tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; @@ -371,14 +707,6 @@ ksocknal_lib_eager_ack (ksock_conn_t *conn) } splx(s); - /* - * No TCP_QUICKACK supported in BSD, so I have to call tcp_fasttimo - * to send immediate ACK. It's not the best resolution because - * tcp_fasttimo will send out ACK for all delayed-ack tcp socket. - * Anyway, it's working now. - * extern void tcp_fasttimo(); - * tcp_fasttimo(); - */ CFS_NET_EX; return; @@ -390,10 +718,10 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn) #if SOCKNAL_SINGLE_FRAG_RX struct iovec scratch; struct iovec *scratchiov = &scratch; - int niov = 1; + unsigned int niov = 1; #else struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; - int niov = conn->ksnc_rx_niov; + unsigned int niov = conn->ksnc_rx_niov; #endif struct iovec *iov = conn->ksnc_rx_iov; int nob; @@ -444,12 +772,12 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) #if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK struct iovec scratch; struct iovec *scratchiov = &scratch; - int niov = 1; + unsigned int niov = 1; #else struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; - int niov = conn->ksnc_rx_nkiov; + unsigned int niov = conn->ksnc_rx_nkiov; #endif - ptl_kiov_t *kiov = conn->ksnc_rx_kiov; + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; int nob; int rc; int i; @@ -497,138 +825,43 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) } int -ksocknal_lib_sock_write (struct socket *sock, void *buffer, int nob) -{ - int rc; - CFS_DECL_NET_DATA; - - while (nob > 0) { - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct uio suio = { - .uio_iov = &iov, - .uio_iovcnt = 1, - .uio_offset = 0, - .uio_resid = nob, - .uio_segflg = UIO_SYSSPACE, - .uio_rw = UIO_WRITE, - .uio_procp = NULL - }; - - CFS_NET_IN; - rc = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0, 0); - CFS_NET_EX; - - if (rc != 0) { - if ( suio.uio_resid != nob && ( rc == ERESTART || rc == EINTR ||\ - rc == EWOULDBLOCK)) - rc = 0; - if ( rc != 0 ) - return -rc; - rc = nob - suio.uio_resid; - buffer = ((char *)buffer) + rc; - nob = suio.uio_resid; - continue; - } - break; - } - - return (0); -} - -int -ksocknal_lib_sock_read (struct socket *sock, void *buffer, int nob) -{ - int rc; - CFS_DECL_NET_DATA; - - while (nob > 0) { - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct uio ruio = { - .uio_iov = &iov, - .uio_iovcnt = 1, - .uio_offset = 0, - .uio_resid = nob, - .uio_segflg = UIO_SYSSPACE, - .uio_rw = UIO_READ, - .uio_procp = NULL - }; - - CFS_NET_IN; - rc = soreceive(sock, (struct sockaddr **)0, &ruio, (struct mbuf **)0, (struct mbuf **)0, (int *)0); - CFS_NET_EX; - - if (rc != 0) { - if ( ruio.uio_resid != nob && ( rc == ERESTART || rc == EINTR ||\ - rc == EWOULDBLOCK)) - rc = 0; - if (rc != 0) - return -rc; - rc = nob - ruio.uio_resid; - buffer = ((char *)buffer) + rc; - nob = ruio.uio_resid; - continue; - } - break; - } - - return (0); -} - -int ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) { - struct sockopt sopt; struct socket *sock = conn->ksnc_sock; - int len; int rc; - CFS_DECL_NET_DATA; - rc = ksocknal_getconnsock (conn); + rc = ksocknal_connsock_addref(conn); if (rc != 0) { LASSERT (conn->ksnc_closing); *txmem = *rxmem = *nagle = 0; - rc = -ESHUTDOWN; - goto out; - } - len = sizeof(*txmem); - 
bzero(&sopt, sizeof sopt); - sopt.sopt_dir = SOPT_GET; - sopt.sopt_level = SOL_SOCKET; - sopt.sopt_name = SO_SNDBUF; - sopt.sopt_val = txmem; - sopt.sopt_valsize = len; - - CFS_NET_IN; - rc = sogetopt(sock, &sopt); - if (rc == 0) { - len = sizeof(*rxmem); - sopt.sopt_name = SO_RCVBUF; - sopt.sopt_val = rxmem; - rc = sogetopt(sock, &sopt); + return -ESHUTDOWN; } + rc = libcfs_sock_getbuf(sock, txmem, rxmem); if (rc == 0) { + struct sockopt sopt; + int len; + CFS_DECL_NET_DATA; + len = sizeof(*nagle); + bzero(&sopt, sizeof sopt); + sopt.sopt_dir = SOPT_GET; sopt.sopt_level = IPPROTO_TCP; sopt.sopt_name = TCP_NODELAY; sopt.sopt_val = nagle; - rc = sogetopt(sock, &sopt); + sopt.sopt_valsize = len; + + CFS_NET_IN; + rc = -sogetopt(sock, &sopt); + CFS_NET_EX; } - CFS_NET_EX; - ksocknal_putconnsock (conn); + ksocknal_connsock_decref(conn); if (rc == 0) *nagle = !*nagle; else *txmem = *rxmem = *nagle = 0; -out: - return (-rc); + return (rc); } int @@ -644,9 +877,18 @@ ksocknal_lib_setup_sock (struct socket *so) struct linger linger; CFS_DECL_NET_DATA; + rc = libcfs_sock_setbuf(so, + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size); + if (rc != 0) { + CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n", + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size, rc); + return (rc); + } + /* Ensure this socket aborts active sends immediately when we close * it. */ - bzero(&sopt, sizeof sopt); linger.l_onoff = 0; @@ -658,14 +900,13 @@ ksocknal_lib_setup_sock (struct socket *so) sopt.sopt_valsize = sizeof(linger); CFS_NET_IN; - rc = sosetopt(so, &sopt); + rc = -sosetopt(so, &sopt); if (rc != 0) { CERROR ("Can't set SO_LINGER: %d\n", rc); goto out; } - - if (!ksocknal_tunables.ksnd_nagle) { + if (!*ksocknal_tunables.ksnd_nagle) { option = 1; bzero(&sopt, sizeof sopt); sopt.sopt_dir = SOPT_SET; @@ -673,41 +914,17 @@ ksocknal_lib_setup_sock (struct socket *so) sopt.sopt_name = TCP_NODELAY; sopt.sopt_val = &option; sopt.sopt_valsize = sizeof(option); - rc = sosetopt(so, &sopt); + rc = -sosetopt(so, &sopt); if (rc != 0) { CERROR ("Can't disable nagle: %d\n", rc); goto out; } } - if (ksocknal_tunables.ksnd_buffer_size > 0) { - option = ksocknal_tunables.ksnd_buffer_size; - if (option > ksocknal_mbuf_size) - option = ksocknal_mbuf_size; - - sopt.sopt_dir = SOPT_SET; - sopt.sopt_level = SOL_SOCKET; - sopt.sopt_name = SO_SNDBUF; - sopt.sopt_val = &option; - sopt.sopt_valsize = sizeof(option); - rc = sosetopt(so, &sopt); - if (rc != 0) { - CERROR ("Can't set send buffer %d: %d\n", - option, rc); - goto out; - } - - sopt.sopt_name = SO_RCVBUF; - rc = sosetopt(so, &sopt); - if (rc != 0) { - CERROR ("Can't set receive buffer %d: %d\n", - option, rc); - goto out; - } - } + /* snapshot tunables */ - keep_idle = ksocknal_tunables.ksnd_keepalive_idle; - keep_count = ksocknal_tunables.ksnd_keepalive_count; - keep_intvl = ksocknal_tunables.ksnd_keepalive_intvl; + keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; + keep_count = *ksocknal_tunables.ksnd_keepalive_count; + keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); option = (do_keepalive ? 
1 : 0); @@ -717,7 +934,7 @@ ksocknal_lib_setup_sock (struct socket *so) sopt.sopt_name = SO_KEEPALIVE; sopt.sopt_val = &option; sopt.sopt_valsize = sizeof(option); - rc = sosetopt(so, &sopt); + rc = -sosetopt(so, &sopt); if (rc != 0) { CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); goto out; @@ -735,161 +952,14 @@ ksocknal_lib_setup_sock (struct socket *so) sopt.sopt_name = TCP_KEEPALIVE; sopt.sopt_val = &keep_idle; sopt.sopt_valsize = sizeof(keep_idle); - rc = sosetopt(so, &sopt); + rc = -sosetopt(so, &sopt); if (rc != 0) { CERROR ("Can't set TCP_KEEPALIVE : %d\n", rc); goto out; } out: CFS_NET_EX; - return (-rc); -} - -int -ksocknal_lib_connect_sock (struct socket **sockp, int *may_retry, - ksock_route_t *route, int local_port) -{ - struct sockaddr_in locaddr; - struct sockaddr_in srvaddr; - struct timeval tv; - int fd; - struct socket *so; - struct sockopt sopt; - int option; - int rc; - int s; - CFS_DECL_FUNNEL_DATA; - - ENTRY; - bzero (&locaddr, sizeof (locaddr)); - locaddr.sin_len = sizeof(struct sockaddr_in); - locaddr.sin_family = AF_INET; - locaddr.sin_port = htons (local_port); - locaddr.sin_addr.s_addr = - (route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr) - : INADDR_ANY; - bzero(&srvaddr, sizeof(srvaddr)); - srvaddr.sin_len = sizeof(struct sockaddr_in); - srvaddr.sin_family = AF_INET; - srvaddr.sin_port = htons (route->ksnr_port); - srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr); - - *may_retry = 0; - - CFS_NET_IN; - rc = socreate(PF_INET, &so, SOCK_STREAM, 0); - CFS_NET_EX; - *sockp = so; - if (rc != 0) { - CERROR ("Can't create autoconnect socket: %d\n", rc); - return (-rc); - } - - /* - * XXX - * Liang: what do we need here? - */ - fd = sock_map_fd (so); - if (fd < 0) { - sock_release (so); - CERROR ("sock_map_fd error %d\n", fd); - return (fd); - } - sock_fdrelse(fd); - - /* Set the socket timeouts, so our connection attempt completes in - * finite time */ - tv.tv_sec = ksocknal_tunables.ksnd_io_timeout; - tv.tv_usec = 0; - bzero(&sopt, sizeof sopt); - sopt.sopt_dir = SOPT_SET; - sopt.sopt_level = SOL_SOCKET; - sopt.sopt_name = SO_SNDTIMEO; - sopt.sopt_val = &tv; - sopt.sopt_valsize = sizeof(tv); - - CFS_NET_IN; - rc = sosetopt(so, &sopt); - if (rc != 0) { - CFS_NET_EX; - CERROR ("Can't set send timeout %d: %d\n", - ksocknal_tunables.ksnd_io_timeout, rc); - goto out; - } - sopt.sopt_level = SOL_SOCKET; - sopt.sopt_name = SO_RCVTIMEO; - rc = sosetopt(so, &sopt); - if (rc != 0) { - CFS_NET_EX; - CERROR ("Can't set receive timeout %d: %d\n", - ksocknal_tunables.ksnd_io_timeout, rc); - goto out; - } - option = 1; - sopt.sopt_level = SOL_SOCKET; - sopt.sopt_name = SO_REUSEADDR; - sopt.sopt_val = &option; - sopt.sopt_valsize = sizeof(option); - rc = sosetopt(so, &sopt); - if (rc != 0) { - CFS_NET_EX; - CERROR ("Can't set sock reuse address: %d\n", rc); - goto out; - } - rc = sobind(so, (struct sockaddr *)&locaddr); - if (rc == EADDRINUSE) { - CFS_NET_EX; - CDEBUG(D_NET, "Port %d already in use\n", local_port); - *may_retry = 1; - goto out; - } - if (rc != 0) { - CFS_NET_EX; - CERROR ("Can't bind to local IP Address %u.%u.%u.%u: %d\n", - HIPQUAD(route->ksnr_myipaddr), rc); - goto out; - } - rc = soconnect(so, (struct sockaddr *)&srvaddr); - *may_retry = (rc == EADDRNOTAVAIL || rc == EADDRINUSE); - if (rc != 0) { - CFS_NET_EX; - if (rc != EADDRNOTAVAIL && rc != EADDRINUSE) - CERROR ("Can't connect to nid "LPX64 - " local IP: %u.%u.%u.%u," - " remote IP: %u.%u.%u.%u/%d: %d\n", - route->ksnr_peer->ksnp_nid, - HIPQUAD(route->ksnr_myipaddr), - HIPQUAD(route->ksnr_ipaddr), - 
route->ksnr_port, rc); - goto out; - } - - s = splnet(); - while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { - CDEBUG(D_NET, "ksocknal sleep for waiting auto_connect.\n"); - (void) tsleep((caddr_t)&so->so_timeo, PSOCK, "ksocknal_conn", hz); - } - LASSERT((so->so_state & SS_ISCONNECTED)); - splx(s); - CFS_NET_EX; - - rc = so->so_error; - if (rc != 0) { - CERROR ("Error %d waiting for connection to nid "LPX64 - " local IP: %u.%u.%u.%u," - " remote IP: %u.%u.%u.%u/%d: %d\n", rc, - route->ksnr_peer->ksnp_nid, - HIPQUAD(route->ksnr_myipaddr), - HIPQUAD(route->ksnr_ipaddr), - route->ksnr_port, rc); - goto out; - } - return (-rc); - - out: - rele_file(KSN_SOCK2FILE(so)); - - return (-rc); + return (rc); } void @@ -901,7 +971,7 @@ ksocknal_lib_push_conn(ksock_conn_t *conn) int rc; CFS_DECL_NET_DATA; - rc = ksocknal_getconnsock (conn); + rc = ksocknal_connsock_addref(conn); if (rc != 0) /* being shut down */ return; sock = conn->ksnc_sock; @@ -916,47 +986,36 @@ ksocknal_lib_push_conn(ksock_conn_t *conn) sosetopt(sock, &sopt); CFS_NET_EX; - ksocknal_putconnsock (conn); + ksocknal_connsock_decref(conn); return; } + extern void ksocknal_read_callback (ksock_conn_t *conn); extern void ksocknal_write_callback (ksock_conn_t *conn); static void ksocknal_upcall(struct socket *so, caddr_t arg, int waitf) { - ksock_conn_t *conn; - CFS_DECL_NET_DATA; + ksock_conn_t *conn = (ksock_conn_t *)arg; ENTRY; read_lock (&ksocknal_data.ksnd_global_lock); - conn = so->reserved3; - - if (conn == NULL){ - /* More processing is needed? */ + if (conn == NULL) goto out; - } - if ((so->so_rcv.sb_flags & SB_UPCALL) || !arg ) { + + if (so->so_rcv.sb_flags & SB_UPCALL) { extern int soreadable(struct socket *so); - CFS_NET_IN; - if (conn->ksnc_rx_nob_wanted && soreadable(so)){ + if (conn->ksnc_rx_nob_wanted && soreadable(so)) /* To verify whether the upcall is for receive */ - CFS_NET_EX; ksocknal_read_callback (conn); - }else - CFS_NET_EX; } /* go foward? 
*/ - if ((so->so_snd.sb_flags & SB_UPCALL) || !arg){ + if (so->so_snd.sb_flags & SB_UPCALL){ extern int sowriteable(struct socket *so); - CFS_NET_IN; - if (sowriteable(so)){ + if (sowriteable(so)) /* socket is writable */ - CFS_NET_EX; ksocknal_write_callback(conn); - } else - CFS_NET_EX; } out: read_unlock (&ksocknal_data.ksnd_global_lock); @@ -977,22 +1036,24 @@ ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) CFS_DECL_NET_DATA; CFS_NET_IN; - sock->so_upcallarg = (void *)sock; /* anything not NULL */ + sock->so_upcallarg = (void *)conn; sock->so_upcall = ksocknal_upcall; sock->so_snd.sb_timeo = 0; - sock->so_rcv.sb_timeo = 2 * HZ; + sock->so_rcv.sb_timeo = cfs_time_seconds(2); sock->so_rcv.sb_flags |= SB_UPCALL; sock->so_snd.sb_flags |= SB_UPCALL; - sock->reserved3 = conn; CFS_NET_EX; return; } void -ksocknal_lib_act_callback(struct socket *sock) +ksocknal_lib_act_callback(struct socket *sock, ksock_conn_t *conn) { - /* upcall will take the network funnel */ - ksocknal_upcall (sock, 0, 0); + CFS_DECL_NET_DATA; + + CFS_NET_IN; + ksocknal_upcall (sock, (void *)conn, 0); + CFS_NET_EX; } void @@ -1001,11 +1062,11 @@ ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) CFS_DECL_NET_DATA; CFS_NET_IN; - sock->so_upcall = NULL; - sock->so_upcallarg = NULL; sock->so_rcv.sb_flags &= ~SB_UPCALL; sock->so_snd.sb_flags &= ~SB_UPCALL; + sock->so_upcall = NULL; + sock->so_upcallarg = NULL; CFS_NET_EX; } - +#endif /* !__DARWIN8__ */ diff --git a/lnet/klnds/socklnd/socklnd_lib-darwin.h b/lnet/klnds/socklnd/socklnd_lib-darwin.h index e3b286bc..9e7574a 100644 --- a/lnet/klnds/socklnd/socklnd_lib-darwin.h +++ b/lnet/klnds/socklnd/socklnd_lib-darwin.h @@ -25,25 +25,14 @@ #include #include #include -#include #include #include -#define SOCKNAL_ARCH_EAGER_ACK 1 - -#define KSN_SOCK2FILE(so) ((struct file *)(so)->reserved4) -#define KSN_CONN2FILE(conn) ((struct file *)(conn)->ksnc_sock->reserved4) - -#define SOCK_WMEM_QUEUED(so) ((so)->so_snd.sb_cc) -#define SOCK_ERROR(so) ((so)->so_error) - -#define SOCK_TEST_NOSPACE(so) (sbspace(&(so)->so_snd) < (so)->so_snd.sb_lowat) -extern struct socket * sockfd_lookup(int fd, void *foo); - static inline int ksocknal_nsched(void) { + /* XXX Liang: fix it */ return 1; } diff --git a/lnet/klnds/socklnd/socklnd_lib-linux.c b/lnet/klnds/socklnd/socklnd_lib-linux.c index 48a813e..b7e2f49 100644 --- a/lnet/klnds/socklnd/socklnd_lib-linux.c +++ b/lnet/klnds/socklnd/socklnd_lib-linux.c @@ -1,74 +1,122 @@ -#include "socknal.h" - -#ifdef CONFIG_SYSCTL -#define SOCKNAL_SYSCTL 200 - -#define SOCKNAL_SYSCTL_TIMEOUT 1 -#define SOCKNAL_SYSCTL_EAGER_ACK 2 -#define SOCKNAL_SYSCTL_ZERO_COPY 3 -#define SOCKNAL_SYSCTL_TYPED 4 -#define SOCKNAL_SYSCTL_MIN_BULK 5 -#define SOCKNAL_SYSCTL_BUFFER_SIZE 6 -#define SOCKNAL_SYSCTL_NAGLE 7 -#define SOCKNAL_SYSCTL_IRQ_AFFINITY 8 -#define SOCKNAL_SYSCTL_KEEPALIVE_IDLE 9 -#define SOCKNAL_SYSCTL_KEEPALIVE_COUNT 10 -#define SOCKNAL_SYSCTL_KEEPALIVE_INTVL 11 - -static ctl_table ksocknal_ctl_table[] = { - {SOCKNAL_SYSCTL_TIMEOUT, "timeout", - &ksocknal_tunables.ksnd_io_timeout, sizeof (int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_EAGER_ACK, "eager_ack", - &ksocknal_tunables.ksnd_eager_ack, sizeof (int), - 0644, NULL, &proc_dointvec}, -#if SOCKNAL_ZC - {SOCKNAL_SYSCTL_ZERO_COPY, "zero_copy", - &ksocknal_tunables.ksnd_zc_min_frag, sizeof (int), - 0644, NULL, &proc_dointvec}, -#endif - {SOCKNAL_SYSCTL_TYPED, "typed", - &ksocknal_tunables.ksnd_typed_conns, sizeof (int), - 0644, NULL, &proc_dointvec}, - 
{SOCKNAL_SYSCTL_MIN_BULK, "min_bulk", - &ksocknal_tunables.ksnd_min_bulk, sizeof (int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_BUFFER_SIZE, "buffer_size", - &ksocknal_tunables.ksnd_buffer_size, sizeof(int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_NAGLE, "nagle", - &ksocknal_tunables.ksnd_nagle, sizeof(int), - 0644, NULL, &proc_dointvec}, -#if CPU_AFFINITY - {SOCKNAL_SYSCTL_IRQ_AFFINITY, "irq_affinity", - &ksocknal_tunables.ksnd_irq_affinity, sizeof(int), - 0644, NULL, &proc_dointvec}, -#endif - {SOCKNAL_SYSCTL_KEEPALIVE_IDLE, "keepalive_idle", - &ksocknal_tunables.ksnd_keepalive_idle, sizeof(int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_KEEPALIVE_COUNT, "keepalive_count", - &ksocknal_tunables.ksnd_keepalive_count, sizeof(int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_KEEPALIVE_INTVL, "keepalive_intvl", - &ksocknal_tunables.ksnd_keepalive_intvl, sizeof(int), - 0644, NULL, &proc_dointvec}, - { 0 } -}; +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ + +#include "socklnd.h" + +# if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM +static ctl_table ksocknal_ctl_table[21]; ctl_table ksocknal_top_ctl_table[] = { - {SOCKNAL_SYSCTL, "socknal", NULL, 0, 0555, ksocknal_ctl_table}, + {200, "socknal", NULL, 0, 0555, ksocknal_ctl_table}, { 0 } }; + +int +ksocknal_lib_tunables_init () +{ + int i = 0; + int j = 1; + + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "timeout", ksocknal_tunables.ksnd_timeout, + sizeof (int), 0644, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "credits", ksocknal_tunables.ksnd_credits, + sizeof (int), 0444, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "peer_credits", ksocknal_tunables.ksnd_peercredits, + sizeof (int), 0444, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "nconnds", ksocknal_tunables.ksnd_nconnds, + sizeof (int), 0444, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "min_reconnectms", ksocknal_tunables.ksnd_min_reconnectms, + sizeof (int), 0444, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "max_reconnectms", ksocknal_tunables.ksnd_max_reconnectms, + sizeof (int), 0444, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "eager_ack", ksocknal_tunables.ksnd_eager_ack, + sizeof (int), 0644, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "zero_copy", ksocknal_tunables.ksnd_zc_min_frag, + sizeof (int), 0644, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "typed", ksocknal_tunables.ksnd_typed_conns, + sizeof (int), 0444, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "min_bulk", ksocknal_tunables.ksnd_min_bulk, + sizeof (int), 0644, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "rx_buffer_size", ksocknal_tunables.ksnd_rx_buffer_size, + sizeof(int), 0644, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "tx_buffer_size", ksocknal_tunables.ksnd_tx_buffer_size, + sizeof(int), 0644, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "nagle", ksocknal_tunables.ksnd_nagle, + sizeof(int), 0644, NULL, &proc_dointvec}; +#if CPU_AFFINITY + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "irq_affinity", ksocknal_tunables.ksnd_irq_affinity, + sizeof(int), 0644, NULL, &proc_dointvec}; +#endif + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "keepalive_idle", ksocknal_tunables.ksnd_keepalive_idle, + sizeof(int), 0644, 
NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "keepalive_count", ksocknal_tunables.ksnd_keepalive_count, + sizeof(int), 0644, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "keepalive_intvl", ksocknal_tunables.ksnd_keepalive_intvl, + sizeof(int), 0644, NULL, &proc_dointvec}; +#ifdef SOCKNAL_BACKOFF + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "backoff_init", ksocknal_tunables.ksnd_backoff_init, + sizeof(int), 0644, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "backoff_max", ksocknal_tunables.ksnd_backoff_max, + sizeof(int), 0644, NULL, &proc_dointvec}; #endif + LASSERT (j == i+1); + LASSERT (i < sizeof(ksocknal_ctl_table)/sizeof(ksocknal_ctl_table[0])); + + ksocknal_tunables.ksnd_sysctl = + register_sysctl_table(ksocknal_top_ctl_table, 0); + + if (ksocknal_tunables.ksnd_sysctl == NULL) + CWARN("Can't setup /proc tunables\n"); + + return 0; +} + +void +ksocknal_lib_tunables_fini () +{ + if (ksocknal_tunables.ksnd_sysctl != NULL) + unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl); +} +#else +int +ksocknal_lib_tunables_init () +{ + return 0; +} + +void +ksocknal_lib_tunables_fini () +{ +} +#endif /* # if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM */ + void ksocknal_lib_bind_irq (unsigned int irq) { #if (defined(CONFIG_SMP) && CPU_AFFINITY) int bind; int cpu; - unsigned long flags; char cmdline[64]; ksock_irqinfo_t *info; char *argv[] = {"/bin/sh", @@ -85,13 +133,13 @@ ksocknal_lib_bind_irq (unsigned int irq) info = &ksocknal_data.ksnd_irqinfo[irq]; - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + write_lock_bh (&ksocknal_data.ksnd_global_lock); LASSERT (info->ksni_valid); bind = !info->ksni_bound; info->ksni_bound = 1; - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + write_unlock_bh (&ksocknal_data.ksnd_global_lock); if (!bind) /* bound already */ return; @@ -100,8 +148,8 @@ ksocknal_lib_bind_irq (unsigned int irq) snprintf (cmdline, sizeof (cmdline), "echo %d > /proc/irq/%u/smp_affinity", 1 << cpu, irq); - printk (KERN_INFO "Lustre: Binding irq %u to CPU %d with cmd: %s\n", - irq, cpu, cmdline); + LCONSOLE_INFO("Binding irq %u to CPU %d with cmd: %s\n", + irq, cpu, cmdline); /* FIXME: Find a better method of setting IRQ affinity... */ @@ -113,12 +161,10 @@ ksocknal_lib_bind_irq (unsigned int irq) int ksocknal_lib_get_conn_addrs (ksock_conn_t *conn) { - struct sockaddr_in sin; - int len = sizeof (sin); - int rc; + int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1, + &conn->ksnc_ipaddr, + &conn->ksnc_port); - rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock, - (struct sockaddr *)&sin, &len, 2); /* Didn't need the {get,put}connsock dance to deref ksnc_sock... 
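The ctl_table is sized statically and filled at runtime so conditionally compiled entries (CPU_AFFINITY, SOCKNAL_BACKOFF) leave no holes: `i` tracks array slots while `j` issues dense ctl_name values, and the closing LASSERTs check both invariants. A reduced sketch of the same pattern, with hypothetical names:

    static int example_timeout = 50;
    static ctl_table example_table[3];   /* final slot stays zeroed as terminator */

    static void example_tunables_init(void)
    {
            int i = 0;
            int j = 1;

            example_table[i++] = (ctl_table)
                    {j++, "timeout", &example_timeout,
                     sizeof(int), 0644, NULL, &proc_dointvec};
    #ifdef EXAMPLE_OPTIONAL
            example_table[i++] = (ctl_table)
                    {j++, "optional", &example_optional,
                     sizeof(int), 0644, NULL, &proc_dointvec};
    #endif
            LASSERT (j == i + 1);
            LASSERT (i < sizeof(example_table)/sizeof(example_table[0]));
    }
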
*/
 LASSERT (!conn->ksnc_closing);

 if (rc != 0) {
 CERROR ("Error %d getting sock peer IP\n", rc);
@@ -127,18 +173,13 @@ ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
 return rc;
 }

- conn->ksnc_ipaddr = ntohl (sin.sin_addr.s_addr);
- conn->ksnc_port = ntohs (sin.sin_port);
-
- rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock,
- (struct sockaddr *)&sin, &len, 0);
+ rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
+ &conn->ksnc_myipaddr, NULL);
 if (rc != 0) {
 CERROR ("Error %d getting sock local IP\n", rc);
 return rc;
 }

- conn->ksnc_myipaddr = ntohl (sin.sin_addr.s_addr);
-
 return 0;
 }

@@ -146,9 +187,10 @@ unsigned int
 ksocknal_lib_sock_irq (struct socket *sock)
 {
 int irq = 0;
+#if CPU_AFFINITY
 struct dst_entry *dst;

- if (!ksocknal_tunables.ksnd_irq_affinity)
+ if (!*ksocknal_tunables.ksnd_irq_affinity)
 return 0;

 dst = sk_dst_get (sock->sk);
@@ -163,76 +205,45 @@ ksocknal_lib_sock_irq (struct socket *sock)
 dst_release (dst);
 }

- return (irq);
+#endif
+ return irq;
 }

-#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
-static struct page *
-ksocknal_kvaddr_to_page (unsigned long vaddr)
+int
+ksocknal_lib_zc_capable(struct socket *sock)
 {
- struct page *page;
-
- if (vaddr >= VMALLOC_START &&
- vaddr < VMALLOC_END)
- page = vmalloc_to_page ((void *)vaddr);
-#if CONFIG_HIGHMEM
- else if (vaddr >= PKMAP_BASE &&
- vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
- page = vmalloc_to_page ((void *)vaddr);
- /* in 2.4 ^ just walks the page tables */
-#endif
- else
- page = virt_to_page (vaddr);
-
- if (page == NULL ||
- !VALID_PAGE (page))
- return (NULL);
-
- return (page);
+ int caps = sock->sk->sk_route_caps;
+
+ /* ZC if the socket supports scatter/gather and doesn't need software
+ * checksums */
+ return ((caps & NETIF_F_SG) != 0 &&
+ (caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) != 0);
 }
-#endif

 int
 ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
 {
 struct socket *sock = conn->ksnc_sock;
-#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
- unsigned long vaddr = (unsigned long)iov->iov_base
- int offset = vaddr & (PAGE_SIZE - 1);
- int zcsize = MIN (iov->iov_len, PAGE_SIZE - offset);
- struct page *page;
-#endif
 int nob;
 int rc;

+ if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */
+ conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */
+ tx->tx_nob == tx->tx_resid && /* first sending */
+ tx->tx_msg.ksm_csum == 0) /* not checksummed */
+ ksocknal_lib_csum_tx(tx);
+
 /* NB we can't trust socket ops to either consume our iovs
 * or leave them alone.
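The four-way test above decides when a transmit gets checksummed. Restated as a predicate (the helper name is hypothetical; the conditions are exactly those in the code):

    /* Checksum a tx exactly once: checksums enabled, V2.x connection,
     * nothing sent yet, and no checksum computed so far. */
    static int tx_needs_csum(ksock_conn_t *conn, ksock_tx_t *tx)
    {
            return *ksocknal_tunables.ksnd_enable_csum &&
                   conn->ksnc_proto == &ksocknal_protocol_v2x &&
                   tx->tx_nob == tx->tx_resid &&    /* first send attempt */
                   tx->tx_msg.ksm_csum == 0;        /* not yet checksummed */
    }
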
*/ -#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC) - if (zcsize >= ksocknal_data.ksnd_zc_min_frag && - (sock->sk->route_caps & NETIF_F_SG) && - (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && - (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) { - int msgflg = MSG_DONTWAIT; - - CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n", - (void *)vaddr, page, page_address(page), offset, zcsize); - - if (!list_empty (&conn->ksnc_tx_queue) || - zcsize < tx->tx_resid) - msgflg |= MSG_MORE; - - rc = tcp_sendpage_zccd(sock, page, offset, zcsize, msgflg, &tx->tx_zccd); - } else -#endif { #if SOCKNAL_SINGLE_FRAG_TX struct iovec scratch; struct iovec *scratchiov = &scratch; - int niov = 1; + unsigned int niov = 1; #else struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; - int niov = tx->tx_niov; + unsigned int niov = tx->tx_niov; #endif struct msghdr msg = { .msg_name = NULL, @@ -266,17 +277,16 @@ int ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) { struct socket *sock = conn->ksnc_sock; - ptl_kiov_t *kiov = tx->tx_kiov; + lnet_kiov_t *kiov = tx->tx_kiov; int rc; int nob; /* NB we can't trust socket ops to either consume our iovs * or leave them alone. */ -#if SOCKNAL_ZC - if (kiov->kiov_len >= ksocknal_tunables.ksnd_zc_min_frag && - (sock->sk->route_caps & NETIF_F_SG) && - (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) { + if (kiov->kiov_len >= *ksocknal_tunables.ksnd_zc_min_frag && + tx->tx_msg.ksm_zc_req_cookie != 0) { + /* Zero copy is enabled */ struct page *page = kiov->kiov_page; int offset = kiov->kiov_offset; int fragsize = kiov->kiov_len; @@ -289,21 +299,18 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) fragsize < tx->tx_resid) msgflg |= MSG_MORE; - rc = tcp_sendpage_zccd(sock, page, offset, fragsize, msgflg, - &tx->tx_zccd); - } else -#endif - { + rc = tcp_sendpage(sock, page, offset, fragsize, msgflg); + } else { #if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK struct iovec scratch; struct iovec *scratchiov = &scratch; - int niov = 1; + unsigned int niov = 1; #else #ifdef CONFIG_HIGHMEM #warning "XXX risk of kmap deadlock on multiple frags..." #endif struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; - int niov = tx->tx_nkiov; + unsigned int niov = tx->tx_nkiov; #endif struct msghdr msg = { .msg_name = NULL, @@ -325,7 +332,7 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) if (!list_empty(&conn->ksnc_tx_queue) || nob < tx->tx_resid) - msg.msg_flags |= MSG_DONTWAIT; + msg.msg_flags |= MSG_MORE; set_fs (KERNEL_DS); rc = sock_sendmsg(sock, &msg, nob); @@ -361,10 +368,10 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn) #if SOCKNAL_SINGLE_FRAG_RX struct iovec scratch; struct iovec *scratchiov = &scratch; - int niov = 1; + unsigned int niov = 1; #else struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; - int niov = conn->ksnc_rx_niov; + unsigned int niov = conn->ksnc_rx_niov; #endif struct iovec *iov = conn->ksnc_rx_iov; struct msghdr msg = { @@ -380,6 +387,9 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn) int nob; int i; int rc; + int fragnob; + int sum; + __u32 saved_csum; /* NB we can't trust socket ops to either consume our iovs * or leave them alone. 
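Both send paths pick their flags the same way: always non-blocking, plus MSG_MORE whenever this fragment is known not to be the last thing queued, so TCP can coalesce segments. A compact restatement (hypothetical helper name):

    static int tx_send_flags(ksock_conn_t *conn, ksock_tx_t *tx, int nob)
    {
            int flags = MSG_DONTWAIT;

            if (!list_empty(&conn->ksnc_tx_queue) || nob < tx->tx_resid)
                    flags |= MSG_MORE;   /* more data follows this fragment */
            return flags;
    }
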
*/ @@ -396,6 +406,27 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn) /* NB this is just a boolean..........................^ */ set_fs (oldmm); + saved_csum = 0; + if (conn->ksnc_proto == &ksocknal_protocol_v2x) { + saved_csum = conn->ksnc_msg.ksm_csum; + conn->ksnc_msg.ksm_csum = 0; + } + + if (saved_csum != 0) { + /* accumulate checksum */ + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT (i < niov); + + fragnob = iov[i].iov_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + iov[i].iov_base, fragnob); + } + conn->ksnc_msg.ksm_csum = saved_csum; + } + return rc; } @@ -405,15 +436,15 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) #if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK struct iovec scratch; struct iovec *scratchiov = &scratch; - int niov = 1; + unsigned int niov = 1; #else #ifdef CONFIG_HIGHMEM #warning "XXX risk of kmap deadlock on multiple frags..." #endif struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; - int niov = conn->ksnc_rx_nkiov; + unsigned int niov = conn->ksnc_rx_nkiov; #endif - ptl_kiov_t *kiov = conn->ksnc_rx_kiov; + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, @@ -427,6 +458,9 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) int nob; int i; int rc; + void *base; + int sum; + int fragnob; /* NB we can't trust socket ops to either consume our iovs * or leave them alone. */ @@ -441,88 +475,67 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) /* NB this is just a boolean.......................^ */ set_fs (oldmm); + if (conn->ksnc_msg.ksm_csum != 0) { + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT (i < niov); + + /* Dang! have to kmap again because I have nowhere to stash the + * mapped address. But by doing it while the page is still + * mapped, the kernel just bumps the map count and returns me + * the address it stashed. 
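The receive-side checksum never trusts iov_len alone: only `rc` bytes actually arrived, so the final fragment is clamped to the remaining count. The same walk, isolated as a self-contained helper (name hypothetical):

    static __u32 csum_recv_frags(__u32 crc, struct iovec *iov,
                                 unsigned int niov, int rc)
    {
            int i;
            int sum;
            int fragnob;

            for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) {
                    LASSERT (i < niov);

                    fragnob = iov[i].iov_len;
                    if (fragnob > sum)          /* clamp to bytes received */
                            fragnob = sum;

                    crc = ksocknal_csum(crc, iov[i].iov_base, fragnob);
            }
            return crc;
    }
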
*/ + base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset; + fragnob = kiov[i].kiov_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + base, fragnob); + + kunmap(kiov[i].kiov_page); + } + } for (i = 0; i < niov; i++) kunmap(kiov[i].kiov_page); return (rc); } -int -ksocknal_lib_sock_write (struct socket *sock, void *buffer, int nob) +void ksocknal_lib_csum_tx(ksock_tx_t *tx) { - int rc; - mm_segment_t oldmm = get_fs(); - - while (nob > 0) { - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = 0 - }; - - set_fs (KERNEL_DS); - rc = sock_sendmsg (sock, &msg, iov.iov_len); - set_fs (oldmm); - - if (rc < 0) - return (rc); - - if (rc == 0) { - CERROR ("Unexpected zero rc\n"); - return (-ECONNABORTED); - } - - buffer = ((char *)buffer) + rc; - nob -= rc; - } + int i; + __u32 csum; + void *base; - return (0); -} + LASSERT(tx->tx_iov[0].iov_base == (void *)&tx->tx_msg); + LASSERT(tx->tx_conn != NULL); + LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x); -int -ksocknal_lib_sock_read (struct socket *sock, void *buffer, int nob) -{ - int rc; - mm_segment_t oldmm = get_fs(); + tx->tx_msg.ksm_csum = 0; - while (nob > 0) { - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = 0 - }; + csum = ksocknal_csum(~0, (void *)tx->tx_iov[0].iov_base, + tx->tx_iov[0].iov_len); - set_fs (KERNEL_DS); - rc = sock_recvmsg (sock, &msg, iov.iov_len, 0); - set_fs (oldmm); + if (tx->tx_kiov != NULL) { + for (i = 0; i < tx->tx_nkiov; i++) { + base = kmap(tx->tx_kiov[i].kiov_page) + + tx->tx_kiov[i].kiov_offset; - if (rc < 0) - return (rc); + csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len); - if (rc == 0) - return (-ECONNABORTED); + kunmap(tx->tx_kiov[i].kiov_page); + } + } else { + for (i = 1; i < tx->tx_niov; i++) + csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base, + tx->tx_iov[i].iov_len); + } - buffer = ((char *)buffer) + rc; - nob -= rc; + if (*ksocknal_tunables.ksnd_inject_csum_error) { + csum++; + *ksocknal_tunables.ksnd_inject_csum_error = 0; } - return (0); + tx->tx_msg.ksm_csum = csum; } int @@ -533,31 +546,23 @@ ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int int len; int rc; - rc = ksocknal_getconnsock (conn); + rc = ksocknal_connsock_addref(conn); if (rc != 0) { LASSERT (conn->ksnc_closing); *txmem = *rxmem = *nagle = 0; return (-ESHUTDOWN); } - set_fs (KERNEL_DS); - - len = sizeof(*txmem); - rc = sock_getsockopt(sock, SOL_SOCKET, SO_SNDBUF, - (char *)txmem, &len); - if (rc == 0) { - len = sizeof(*rxmem); - rc = sock_getsockopt(sock, SOL_SOCKET, SO_RCVBUF, - (char *)rxmem, &len); - } + rc = libcfs_sock_getbuf(sock, txmem, rxmem); if (rc == 0) { len = sizeof(*nagle); + set_fs(KERNEL_DS); rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)nagle, &len); + set_fs(oldmm); } - set_fs (oldmm); - ksocknal_putconnsock (conn); + ksocknal_connsock_decref(conn); if (rc == 0) *nagle = !*nagle; @@ -606,7 +611,7 @@ ksocknal_lib_setup_sock (struct socket *sock) return (rc); } - if (!ksocknal_tunables.ksnd_nagle) { + if (!*ksocknal_tunables.ksnd_nagle) { option = 1; set_fs (KERNEL_DS); @@ -619,34 +624,51 @@ ksocknal_lib_setup_sock (struct socket *sock) } } - if 
(ksocknal_tunables.ksnd_buffer_size > 0) { - option = ksocknal_tunables.ksnd_buffer_size; + rc = libcfs_sock_setbuf(sock, + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size); + if (rc != 0) { + CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n", + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size, rc); + return (rc); + } + +/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */ +#ifdef SOCKNAL_BACKOFF + if (*ksocknal_tunables.ksnd_backoff_init > 0) { + option = *ksocknal_tunables.ksnd_backoff_init; set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDBUF, - (char *)&option, sizeof (option)); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_BACKOFF_INIT, + (char *)&option, sizeof (option)); set_fs (oldmm); if (rc != 0) { - CERROR ("Can't set send buffer %d: %d\n", + CERROR ("Can't set initial tcp backoff %d: %d\n", option, rc); return (rc); } + } + + if (*ksocknal_tunables.ksnd_backoff_max > 0) { + option = *ksocknal_tunables.ksnd_backoff_max; set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF, - (char *)&option, sizeof (option)); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_BACKOFF_MAX, + (char *)&option, sizeof (option)); set_fs (oldmm); if (rc != 0) { - CERROR ("Can't set receive buffer %d: %d\n", + CERROR ("Can't set maximum tcp backoff %d: %d\n", option, rc); return (rc); } } +#endif /* snapshot tunables */ - keep_idle = ksocknal_tunables.ksnd_keepalive_idle; - keep_count = ksocknal_tunables.ksnd_keepalive_count; - keep_intvl = ksocknal_tunables.ksnd_keepalive_intvl; + keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; + keep_count = *ksocknal_tunables.ksnd_keepalive_count; + keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); @@ -693,135 +715,13 @@ ksocknal_lib_setup_sock (struct socket *sock) return (0); } -int -ksocknal_lib_connect_sock(struct socket **sockp, int *may_retry, - ksock_route_t *route, int local_port) -{ - struct sockaddr_in locaddr; - struct sockaddr_in srvaddr; - struct socket *sock; - int rc; - int option; - mm_segment_t oldmm = get_fs(); - struct timeval tv; - - memset(&locaddr, 0, sizeof(locaddr)); - locaddr.sin_family = AF_INET; - locaddr.sin_port = htons(local_port); - locaddr.sin_addr.s_addr = - (route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr) - : INADDR_ANY; - - memset (&srvaddr, 0, sizeof (srvaddr)); - srvaddr.sin_family = AF_INET; - srvaddr.sin_port = htons (route->ksnr_port); - srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr); - - *may_retry = 0; - - rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); - *sockp = sock; - if (rc != 0) { - CERROR ("Can't create autoconnect socket: %d\n", rc); - return (rc); - } - - /* Ugh; have to map_fd for compatibility with sockets passed in - * from userspace. 
And we actually need the sock->file refcounting - * that this gives you :) */ - - rc = sock_map_fd (sock); - if (rc < 0) { - sock_release (sock); - CERROR ("sock_map_fd error %d\n", rc); - return (rc); - } - - /* NB the file descriptor (rc) now owns the ref on sock->file */ - LASSERT (sock->file != NULL); - LASSERT (file_count(sock->file) == 1); - - get_file(sock->file); /* extra ref makes sock->file */ - sys_close(rc); /* survive this close */ - - /* Still got a single ref on sock->file */ - LASSERT (file_count(sock->file) == 1); - - /* Set the socket timeouts, so our connection attempt completes in - * finite time */ - tv.tv_sec = ksocknal_tunables.ksnd_io_timeout; - tv.tv_usec = 0; - - set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO, - (char *)&tv, sizeof (tv)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set send timeout %d: %d\n", - ksocknal_tunables.ksnd_io_timeout, rc); - goto failed; - } - - set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVTIMEO, - (char *)&tv, sizeof (tv)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set receive timeout %d: %d\n", - ksocknal_tunables.ksnd_io_timeout, rc); - goto failed; - } - - set_fs (KERNEL_DS); - option = 1; - rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&option, sizeof (option)); - set_fs (oldmm); - if (rc != 0) { - CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc); - goto failed; - } - - rc = sock->ops->bind(sock, - (struct sockaddr *)&locaddr, sizeof(locaddr)); - if (rc == -EADDRINUSE) { - CDEBUG(D_NET, "Port %d already in use\n", local_port); - *may_retry = 1; - goto failed; - } - if (rc != 0) { - CERROR("Error trying to bind to reserved port %d: %d\n", - local_port, rc); - goto failed; - } - - rc = sock->ops->connect(sock, - (struct sockaddr *)&srvaddr, sizeof(srvaddr), - sock->file->f_flags); - if (rc == 0) - return 0; - - /* EADDRNOTAVAIL probably means we're already connected to the same - * peer/port on the same local port on a differently typed - * connection. Let our caller retry with a different local - * port... */ - *may_retry = (rc == -EADDRNOTAVAIL); - - CDEBUG(*may_retry ? D_NET : D_ERROR, - "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc, - HIPQUAD(route->ksnr_myipaddr), local_port, - HIPQUAD(route->ksnr_ipaddr), route->ksnr_port); - - failed: - fput(sock->file); - return rc; -} - #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) struct tcp_opt *sock2tcp_opt(struct sock *sk) { return &(sk->tp_pinfo.af_tcp); } +#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)) +#define sock2tcp_opt(sk) tcp_sk(sk) #else struct tcp_opt *sock2tcp_opt(struct sock *sk) { @@ -834,13 +734,17 @@ void ksocknal_lib_push_conn (ksock_conn_t *conn) { struct sock *sk; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)) struct tcp_opt *tp; +#else + struct tcp_sock *tp; +#endif int nonagle; int val = 1; int rc; mm_segment_t oldmm; - rc = ksocknal_getconnsock (conn); + rc = ksocknal_connsock_addref(conn); if (rc != 0) /* being shut down */ return; @@ -865,7 +769,7 @@ ksocknal_lib_push_conn (ksock_conn_t *conn) tp->nonagle = nonagle; release_sock (sk); - ksocknal_putconnsock (conn); + ksocknal_connsock_decref(conn); } extern void ksocknal_read_callback (ksock_conn_t *conn); @@ -880,6 +784,7 @@ ksocknal_data_ready (struct sock *sk, int n) ENTRY; /* interleave correctly with closing sockets... 
*/ + LASSERT(!in_irq()); read_lock (&ksocknal_data.ksnd_global_lock); conn = sk->sk_user_data; @@ -894,22 +799,23 @@ ksocknal_data_ready (struct sock *sk, int n) EXIT; } -#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,7)) -#define tcp_wspace(sk) sk_stream_wspace(sk) -#endif - static void ksocknal_write_space (struct sock *sk) { ksock_conn_t *conn; + int wspace; + int min_wpace; /* interleave correctly with closing sockets... */ + LASSERT(!in_irq()); read_lock (&ksocknal_data.ksnd_global_lock); conn = sk->sk_user_data; + wspace = SOCKNAL_WSPACE(sk); + min_wpace = SOCKNAL_MIN_WSPACE(sk); CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", - sk, tcp_wspace(sk), SOCKNAL_TX_LOW_WATER(sk), conn, + sk, wspace, min_wpace, conn, (conn == NULL) ? "" : (conn->ksnc_tx_ready ? " ready" : " blocked"), (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? @@ -925,7 +831,7 @@ ksocknal_write_space (struct sock *sk) return; } - if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */ + if (wspace >= min_wpace) { /* got enough space */ ksocknal_write_callback(conn); /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the @@ -955,14 +861,6 @@ ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) } void -ksocknal_lib_act_callback(struct socket *sock, ksock_conn_t *conn) -{ - ksocknal_data_ready (sock->sk, 0); - ksocknal_write_space (sock->sk); - return; -} - -void ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) { /* Remove conn's network callbacks. diff --git a/lnet/klnds/socklnd/socklnd_lib-linux.h b/lnet/klnds/socklnd/socklnd_lib-linux.h index 6129fdc..594f29f 100644 --- a/lnet/klnds/socklnd/socklnd_lib-linux.h +++ b/lnet/klnds/socklnd/socklnd_lib-linux.h @@ -6,7 +6,9 @@ #ifndef __LINUX_SOCKNAL_LIB_H__ #define __LINUX_SOCKNAL_LIB_H__ +#ifdef HAVE_KERNEL_CONFIG_H #include +#endif #include #include #include @@ -19,11 +21,11 @@ #include #include #include - + #include #include #include - + #include #include #include @@ -38,35 +40,30 @@ #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) # include #endif - + #include #include -#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sk_sndbuf*8)/10) - -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72)) -# define sk_allocation allocation -# define sk_data_ready data_ready -# define sk_write_space write_space -# define sk_user_data user_data -# define sk_prot prot -# define sk_sndbuf sndbuf -# define sk_socket socket +#include +static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len) +{ +#if 1 + return crc32_le(crc, p, len); +#else + while (len-- > 0) + crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ; + return crc; #endif +} -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) -# define sk_wmem_queued wmem_queued -# define sk_err err +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,7)) +# define SOCKNAL_WSPACE(sk) sk_stream_wspace(sk) +# define SOCKNAL_MIN_WSPACE(sk) sk_stream_min_wspace(sk) +#else +# define SOCKNAL_WSPACE(sk) tcp_wspace(sk) +# define SOCKNAL_MIN_WSPACE(sk) (((sk)->sk_sndbuf*8)/10) #endif -#define SOCKNAL_ARCH_EAGER_ACK 0 -#define SOCK_WMEM_QUEUED(so) ((so)->sk->sk_wmem_queued) -#define SOCK_ERROR(so) ((so)->sk->sk_err) -#define SOCK_TEST_NOSPACE(so) test_bit(SOCK_NOSPACE, &(so)->flags) - -#define KSN_SOCK2FILE(so) ((so)->file) -#define KSN_CONN2FILE(conn) ((conn)->ksnc_sock->file) - #ifndef CONFIG_SMP static inline int ksocknal_nsched(void) diff --git a/lnet/klnds/socklnd/socklnd_lib-winnt.c b/lnet/klnds/socklnd/socklnd_lib-winnt.c new file mode 100755 index 0000000..7669c77 --- 
/dev/null +++ b/lnet/klnds/socklnd/socklnd_lib-winnt.c @@ -0,0 +1,832 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2006 Cluster File Systems, Inc, All rights reserved. + * Author: Matt Wu + * + * This file is part of Lustre, http://www.lustre.org. + * + * This Lustre Software is proprietary - please refer to the license + * agreement you received with your software. + * + * windows socknal library + * + */ + +#include "socklnd.h" + +# if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM +static ctl_table ksocknal_ctl_table[18]; + +ctl_table ksocknal_top_ctl_table[] = { + {200, "socknal", NULL, 0, 0555, ksocknal_ctl_table}, + { 0 } +}; + +int +ksocknal_lib_tunables_init () +{ + int i = 0; + int j = 1; + + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "timeout", ksocknal_tunables.ksnd_timeout, + sizeof (int), 0644, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "credits", ksocknal_tunables.ksnd_credits, + sizeof (int), 0444, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "peer_credits", ksocknal_tunables.ksnd_peercredits, + sizeof (int), 0444, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "nconnds", ksocknal_tunables.ksnd_nconnds, + sizeof (int), 0444, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "min_reconnectms", ksocknal_tunables.ksnd_min_reconnectms, + sizeof (int), 0444, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "max_reconnectms", ksocknal_tunables.ksnd_max_reconnectms, + sizeof (int), 0444, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "eager_ack", ksocknal_tunables.ksnd_eager_ack, + sizeof (int), 0644, NULL, &proc_dointvec}; +#if SOCKNAL_ZC + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "zero_copy", ksocknal_tunables.ksnd_zc_min_frag, + sizeof (int), 0644, NULL, &proc_dointvec}; +#endif + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "typed", ksocknal_tunables.ksnd_typed_conns, + sizeof (int), 0444, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "min_bulk", ksocknal_tunables.ksnd_min_bulk, + sizeof (int), 0644, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "buffer_size", ksocknal_tunables.ksnd_buffer_size, + sizeof(int), 0644, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "nagle", ksocknal_tunables.ksnd_nagle, + sizeof(int), 0644, NULL, &proc_dointvec}; +#if CPU_AFFINITY + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "irq_affinity", ksocknal_tunables.ksnd_irq_affinity, + sizeof(int), 0644, NULL, &proc_dointvec}; +#endif + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "keepalive_idle", ksocknal_tunables.ksnd_keepalive_idle, + sizeof(int), 0644, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "keepalive_count", ksocknal_tunables.ksnd_keepalive_count, + sizeof(int), 0644, NULL, &proc_dointvec}; + ksocknal_ctl_table[i++] = (ctl_table) + {j++, "keepalive_intvl", ksocknal_tunables.ksnd_keepalive_intvl, + sizeof(int), 0644, NULL, &proc_dointvec}; + + LASSERT (j == i+1); + LASSERT (i < sizeof(ksocknal_ctl_table)/sizeof(ksocknal_ctl_table[0])); + + ksocknal_tunables.ksnd_sysctl = + register_sysctl_table(ksocknal_top_ctl_table, 0); + + if (ksocknal_tunables.ksnd_sysctl == NULL) + CWARN("Can't setup /proc tunables\n"); + + return 0; +} + +void +ksocknal_lib_tunables_fini () +{ + if (ksocknal_tunables.ksnd_sysctl != NULL) + 
unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl);
+}
+#else
+int
+ksocknal_lib_tunables_init ()
+{
+ return 0;
+}
+
+void
+ksocknal_lib_tunables_fini ()
+{
+}
+#endif
+
+void
+ksocknal_lib_bind_irq (unsigned int irq)
+{
+}
+
+int
+ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
+{
+ int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1,
+ &conn->ksnc_ipaddr, &conn->ksnc_port);
+
+ /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
+ LASSERT (!conn->ksnc_closing);
+
+ if (rc != 0) {
+ CERROR ("Error %d getting sock peer IP\n", rc);
+ return rc;
+ }
+
+ rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
+ &conn->ksnc_myipaddr, NULL);
+ if (rc != 0) {
+ CERROR ("Error %d getting sock local IP\n", rc);
+ return rc;
+ }
+
+ return 0;
+}
+
+unsigned int
+ksocknal_lib_sock_irq (struct socket *sock)
+{
+ return 0;
+}
+
+#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC)
+static struct page *
+ksocknal_kvaddr_to_page (unsigned long vaddr)
+{
+ struct page *page;
+
+ if (vaddr >= VMALLOC_START &&
+ vaddr < VMALLOC_END)
+ page = vmalloc_to_page ((void *)vaddr);
+#if CONFIG_HIGHMEM
+ else if (vaddr >= PKMAP_BASE &&
+ vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
+ page = vmalloc_to_page ((void *)vaddr);
+ /* in 2.4 ^ just walks the page tables */
+#endif
+ else
+ page = virt_to_page (vaddr);
+
+ if (page == NULL ||
+ !VALID_PAGE (page))
+ return (NULL);
+
+ return (page);
+}
+#endif
+
+/*
+ * ks_lock_iovs
+ * Lock the i/o vector buffers into MDL structure
+ *
+ * Arguments:
+ * iov: the array of i/o vectors
+ * niov: number of i/o vectors to be locked
+ * len: the real length of the iov vectors
+ *
+ * Return Value:
+ * ksock_mdl_t *: the Mdl of the locked buffers or
+ * NULL pointer in failure case
+ *
+ * Notes:
+ * N/A
+ */
+
+ksock_mdl_t *
+ks_lock_iovs(
+ IN struct iovec *iov,
+ IN int niov,
+ IN int recving,
+ IN int * len )
+{
+ int rc = 0;
+
+ int i = 0;
+ int total = 0;
+ ksock_mdl_t * mdl = NULL;
+ ksock_mdl_t * tail = NULL;
+
+ LASSERT(iov != NULL);
+ LASSERT(niov > 0);
+ LASSERT(len != NULL);
+
+ for (i=0; i < niov; i++) {
+
+ ksock_mdl_t * Iovec = NULL;
+
+ rc = ks_lock_buffer(
+ iov[i].iov_base,
+ FALSE,
+ iov[i].iov_len,
+ recving ? IoWriteAccess : IoReadAccess,
+ &Iovec );
+
+ if (rc < 0) {
+ break;
+ }
+
+ if (tail) {
+ tail->Next = Iovec;
+ } else {
+ mdl = Iovec;
+ }
+
+ tail = Iovec;
+
+ total += iov[i].iov_len;
+ }
+
+ if (rc >= 0) {
+ *len = total;
+ } else {
+ if (mdl) {
+ ks_release_mdl(mdl, FALSE);
+ mdl = NULL;
+ }
+ }
+
+ return mdl;
+}
+
+/*
+ * ks_lock_kiovs
+ * Lock the kiov pages into MDL structure
+ *
+ * Arguments:
+ * kiov: the array of kiov pages
+ * niov: number of kiov to be locked
+ * len: the real length of the kiov array
+ *
+ * Return Value:
+ * PMDL: the Mdl of the locked buffers or NULL
+ * pointer in failure case
+ *
+ * Notes:
+ * N/A
+ */
+ksock_mdl_t *
+ks_lock_kiovs(
+ IN lnet_kiov_t * kiov,
+ IN int nkiov,
+ IN int recving,
+ IN int * len )
+{
+ int rc = 0;
+ int i = 0;
+ int total = 0;
+ ksock_mdl_t * mdl = NULL;
+ ksock_mdl_t * tail = NULL;
+
+ LASSERT(kiov != NULL);
+ LASSERT(nkiov > 0);
+ LASSERT(len != NULL);
+
+ for (i=0; i < nkiov; i++) {
+
+ ksock_mdl_t * Iovec = NULL;
+
+
+ //
+ // Lock the kiov page into Iovec ...
+ //
+
+ rc = ks_lock_buffer(
+ (PUCHAR)kiov[i].kiov_page->addr +
+ kiov[i].kiov_offset,
+ FALSE,
+ kiov[i].kiov_len,
+ recving ?
IoWriteAccess : IoReadAccess, + &Iovec + ); + + if (rc < 0) { + break; + } + + // + // Attach the Iovec to the mdl chain + // + + if (tail) { + tail->Next = Iovec; + } else { + mdl = Iovec; + } + + tail = Iovec; + + total += kiov[i].kiov_len; + + } + + if (rc >= 0) { + *len = total; + } else { + if (mdl) { + ks_release_mdl(mdl, FALSE); + mdl = NULL; + } + } + + return mdl; +} + + +int +ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) +{ + struct socket *sock = conn->ksnc_sock; +#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC) + unsigned long vaddr = (unsigned long)iov->iov_base + int offset = vaddr & (PAGE_SIZE - 1); + int zcsize = MIN (iov->iov_len, PAGE_SIZE - offset); + struct page *page; +#endif + int nob; + int rc; + ksock_mdl_t * mdl; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + +#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC) + if (zcsize >= ksocknal_data.ksnd_zc_min_frag && + (sock->sk->sk_route_caps & NETIF_F_SG) && + (sock->sk->sk_route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && + (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) { + int msgflg = MSG_DONTWAIT; + + CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n", + (void *)vaddr, page, page_address(page), offset, zcsize); + + if (!list_empty (&conn->ksnc_tx_queue) || + zcsize < tx->tx_resid) + msgflg |= MSG_MORE; + + rc = tcp_sendpage_zccd(sock, page, offset, zcsize, msgflg, &tx->tx_zccd); + } else +#endif + { + /* lock the whole tx iovs into a single mdl chain */ + mdl = ks_lock_iovs(tx->tx_iov, tx->tx_niov, FALSE, &nob); + + if (mdl) { + /* send the total mdl chain */ + rc = ks_send_mdl( conn->ksnc_sock, tx, mdl, nob, + (!list_empty (&conn->ksnc_tx_queue) || nob < tx->tx_resid) ? + (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT); + } else { + rc = -ENOMEM; + } + } + + return rc; +} + +int +ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) +{ + struct socket *sock = conn->ksnc_sock; + lnet_kiov_t *kiov = tx->tx_kiov; + int rc; + int nob; + ksock_mdl_t * mdl; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + +#if SOCKNAL_ZC + if (kiov->kiov_len >= *ksocknal_tunables.ksnd_zc_min_frag && + (sock->sk->sk_route_caps & NETIF_F_SG) && + (sock->sk->sk_route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) { + struct page *page = kiov->kiov_page; + int offset = kiov->kiov_offset; + int fragsize = kiov->kiov_len; + int msgflg = MSG_DONTWAIT; + + CDEBUG(D_NET, "page %p + offset %x for %d\n", + page, offset, kiov->kiov_len); + + if (!list_empty(&conn->ksnc_tx_queue) || + fragsize < tx->tx_resid) + msgflg |= MSG_MORE; + + rc = tcp_sendpage_zccd(sock, page, offset, fragsize, msgflg, + &tx->tx_zccd); + } else +#endif + { + /* lock the whole tx kiovs into a single mdl chain */ + mdl = ks_lock_kiovs(tx->tx_kiov, tx->tx_nkiov, FALSE, &nob); + + if (mdl) { + /* send the total mdl chain */ + rc = ks_send_mdl( + conn->ksnc_sock, tx, mdl, nob, + (!list_empty(&conn->ksnc_tx_queue) || nob < tx->tx_resid) ? 
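A hedged usage sketch of the MDL send path: lock a two-entry iovec (buffers hypothetical) into one MDL chain and hand the whole chain to the TDI layer in a single call, as ksocknal_lib_send_iov() does for a real tx:

    struct iovec iov[2] = {
            { .iov_base = hdr,     .iov_len = hdr_nob },      /* hypothetical */
            { .iov_base = payload, .iov_len = payload_nob },  /* hypothetical */
    };
    int          nob;
    ksock_mdl_t *mdl = ks_lock_iovs(iov, 2, FALSE, &nob);

    if (mdl == NULL)
            return -ENOMEM;

    /* the MDL chain owns the locked pages until ks_send_mdl() completes */
    rc = ks_send_mdl(conn->ksnc_sock, tx, mdl, nob, MSG_DONTWAIT);
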
+ (MSG_DONTWAIT | MSG_MORE) : MSG_DONTWAIT);
+ } else {
+ rc = -ENOMEM;
+ }
+ }
+
+ return rc;
+}
+
+
+int
+ksocknal_lib_recv_iov (ksock_conn_t *conn)
+{
+ struct iovec *iov = conn->ksnc_rx_iov;
+ int rc;
+ int size;
+ ksock_mdl_t * mdl;
+
+ /* lock the whole rx iovs into a single mdl chain */
+ mdl = ks_lock_iovs(iov, conn->ksnc_rx_niov, TRUE, &size);
+
+ if (!mdl) {
+ return (-ENOMEM);
+ }
+
+ LASSERT (size <= conn->ksnc_rx_nob_wanted);
+
+ /* try to request data for the whole mdl chain */
+ rc = ks_recv_mdl (conn->ksnc_sock, mdl, size, MSG_DONTWAIT);
+
+ return rc;
+}
+
+int
+ksocknal_lib_recv_kiov (ksock_conn_t *conn)
+{
+ lnet_kiov_t *kiov = conn->ksnc_rx_kiov;
+ int size;
+ int rc;
+ ksock_mdl_t * mdl;
+
+ /* NB we can't trust socket ops to either consume our iovs
+ * or leave them alone, so we only receive 1 frag at a time. */
+ LASSERT (conn->ksnc_rx_nkiov > 0);
+
+ /* lock the whole rx kiovs into a single mdl chain */
+ mdl = ks_lock_kiovs(kiov, conn->ksnc_rx_nkiov, TRUE, &size);
+
+ if (!mdl) {
+ rc = -ENOMEM;
+ return (rc);
+ }
+
+ LASSERT (size <= conn->ksnc_rx_nob_wanted);
+
+ /* try to request data for the whole mdl chain */
+ rc = ks_recv_mdl (conn->ksnc_sock, mdl, size, MSG_DONTWAIT);
+
+ return rc;
+}
+
+void
+ksocknal_lib_eager_ack (ksock_conn_t *conn)
+{
+ __u32 option = 1;
+ int rc = 0;
+
+ rc = ks_set_tcp_option(
+ conn->ksnc_sock, TCP_SOCKET_NODELAY,
+ &option, sizeof(option) );
+ if (rc != 0) {
+ CERROR("Can't disable nagle: %d\n", rc);
+ }
+}
+
+int
+ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
+{
+ ksock_tconn_t * tconn = conn->ksnc_sock;
+ int len;
+ int rc;
+
+ ks_get_tconn (tconn);
+
+ *txmem = *rxmem = 0;
+
+ len = sizeof(*nagle);
+
+ rc = ks_get_tcp_option(
+ tconn, TCP_SOCKET_NODELAY,
+ (__u32 *)nagle, &len);
+
+ ks_put_tconn (tconn);
+
+ printk("ksocknal_get_conn_tunables: nodelay = %d rc = %d\n", *nagle, rc);
+
+ if (rc == 0)
+ *nagle = !*nagle;
+ else
+ *txmem = *rxmem = *nagle = 0;
+
+ return (rc);
+}
+
+int
+ksocknal_lib_buffersize (int current_sz, int tunable_sz)
+{
+ /* ensure >= SOCKNAL_MIN_BUFFER */
+ if (current_sz < SOCKNAL_MIN_BUFFER)
+ return MAX(SOCKNAL_MIN_BUFFER, tunable_sz);
+
+ if (tunable_sz > SOCKNAL_MIN_BUFFER)
+ return tunable_sz;
+
+ /* leave alone */
+ return 0;
+}
+
+int
+ksocknal_lib_setup_sock (struct socket *sock)
+{
+ int rc;
+
+ int keep_idle;
+ int keep_count;
+ int keep_intvl;
+ int keep_alive;
+
+ __u32 option;
+
+ /* set the window size */
+
+#if 0
+ tconn->kstc_snd_wnd = ksocknal_tunables.ksnd_buffer_size;
+ tconn->kstc_rcv_wnd = ksocknal_tunables.ksnd_buffer_size;
+#endif
+
+ /* disable nagle */
+ if (!*ksocknal_tunables.ksnd_nagle) {
+ option = 1;
+
+ rc = ks_set_tcp_option(
+ sock, TCP_SOCKET_NODELAY,
+ &option, sizeof (option));
+ if (rc != 0) {
+ printk ("Can't disable nagle: %d\n", rc);
+ return (rc);
+ }
+ }
+
+ /* snapshot tunables */
+ keep_idle = *ksocknal_tunables.ksnd_keepalive_idle;
+ keep_count = *ksocknal_tunables.ksnd_keepalive_count;
+ keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
+
+ keep_alive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
+
+ option = (__u32)(keep_alive ? 1 : 0);
+
+ rc = ks_set_tcp_option(
+ sock, TCP_SOCKET_KEEPALIVE,
+ &option, sizeof (option));
+ if (rc != 0) {
+ CERROR ("Can't set keepalive: %d\n", rc);
+ return (rc);
+ }
+
+ return (0);
+}
+
+void
+ksocknal_lib_push_conn (ksock_conn_t *conn)
+{
+ ksock_tconn_t * tconn;
+ __u32 nagle;
+ __u32 val = 1;
+ int rc;
+
+ tconn = conn->ksnc_sock;
+
+ ks_get_tconn(tconn);
+
+ spin_lock(&tconn->kstc_lock);
+ if (tconn->kstc_type == kstt_sender) {
+ nagle = tconn->sender.kstc_info.nagle;
+ tconn->sender.kstc_info.nagle = 0;
+ } else {
+ LASSERT(tconn->kstc_type == kstt_child);
+ nagle = tconn->child.kstc_info.nagle;
+ tconn->child.kstc_info.nagle = 0;
+ }
+
+ spin_unlock(&tconn->kstc_lock);
+
+ val = 1;
+ rc = ks_set_tcp_option(
+ tconn,
+ TCP_SOCKET_NODELAY,
+ &(val),
+ sizeof(__u32)
+ );
+
+ LASSERT (rc == 0);
+ spin_lock(&tconn->kstc_lock);
+
+ if (tconn->kstc_type == kstt_sender) {
+ tconn->sender.kstc_info.nagle = nagle;
+ } else {
+ LASSERT(tconn->kstc_type == kstt_child);
+ tconn->child.kstc_info.nagle = nagle;
+ }
+ spin_unlock(&tconn->kstc_lock);
+
+ ks_put_tconn(tconn);
+}
+
+/* @mode: 0: receiving mode / 1: sending mode */
+void
+ksocknal_sched_conn (ksock_conn_t *conn, int mode, ksock_tx_t *tx)
+{
+ int flags;
+ ksock_sched_t * sched;
+ ENTRY;
+
+ /* interleave correctly with closing sockets... */
+ read_lock (&ksocknal_data.ksnd_global_lock);
+
+ sched = conn->ksnc_scheduler;
+
+ spin_lock_irqsave (&sched->kss_lock, flags);
+
+ if (mode) { /* transmission can continue ... */
+
+ conn->ksnc_tx_ready = 1;
+
+ if (tx) {
+ /* Incomplete send: place tx on HEAD of tx_queue */
+ list_add (&tx->tx_list, &conn->ksnc_tx_queue);
+ }
+
+ if ( !conn->ksnc_tx_scheduled &&
+ !list_empty(&conn->ksnc_tx_queue)) { //packets to send
+ list_add_tail (&conn->ksnc_tx_list,
+ &sched->kss_tx_conns);
+ conn->ksnc_tx_scheduled = 1;
+ /* extra ref for scheduler */
+ atomic_inc (&conn->ksnc_conn_refcount);
+
+ cfs_waitq_signal (&sched->kss_waitq);
+ }
+ } else { /* receiving can continue ... */
+
+ conn->ksnc_rx_ready = 1;
+
+ if ( !conn->ksnc_rx_scheduled) { /* not being progressed */
+ list_add_tail(&conn->ksnc_rx_list,
+ &sched->kss_rx_conns);
+ conn->ksnc_rx_scheduled = 1;
+ /* extra ref for scheduler */
+ atomic_inc (&conn->ksnc_conn_refcount);
+
+ cfs_waitq_signal (&sched->kss_waitq);
+ }
+ }
+
+ spin_unlock_irqrestore (&sched->kss_lock, flags);
+ read_unlock (&ksocknal_data.ksnd_global_lock);
+
+ EXIT;
+}
+
+void ksocknal_schedule_callback(struct socket *sock, int mode, void * tx, ulong_ptr bytes)
+{
+ ksock_conn_t * conn = (ksock_conn_t *) sock->kstc_conn;
+
+ if (mode) {
+ ksocknal_sched_conn(conn, mode, tx);
+ } else {
+ if ( CAN_BE_SCHED(bytes, (ulong_ptr)conn->ksnc_rx_nob_wanted )) {
+ ksocknal_sched_conn(conn, mode, tx);
+ }
+ }
+}
+
+extern void
+ksocknal_tx_launched (ksock_tx_t *tx);
+
+void
+ksocknal_fini_sending(ksock_tcpx_fini_t *tcpx)
+{
+ ksocknal_tx_launched(tcpx->tx);
+ cfs_free(tcpx);
+}
+
+void *
+ksocknal_update_tx(
+ struct socket* tconn,
+ void * txp,
+ ulong_ptr rc
+ )
+{
+ ksock_tx_t * tx = (ksock_tx_t *)txp;
+
+ /*
+ * a transmission completed: update the tx accordingly
+ */
+
+ LASSERT(tx->tx_resid >= (int)rc);
+ tx->tx_resid -= (int)rc;
+
+ /*
+ * if only part of the tx was sent, update its fields and
+ * schedule the remainder for later transmission.
+ */
+
+ if (tx->tx_resid) {
+
+ if (tx->tx_niov > 0) {
+
+ /* if there are iovs, process them first */
+ while (rc > 0 ) {
+ if (rc < tx->tx_iov->iov_len) {
+ /* didn't send whole iov entry...
*/ + tx->tx_iov->iov_base = + (char *)(tx->tx_iov->iov_base) + rc; + tx->tx_iov->iov_len -= rc; + rc = 0; + } else { + /* the whole of iov was sent out */ + rc -= tx->tx_iov->iov_len; + tx->tx_iov++; + tx->tx_niov--; + } + } + + } else { + + /* now we need process the kiov queues ... */ + + while (rc > 0 ) { + + if (rc < tx->tx_kiov->kiov_len) { + /* didn't send whole kiov entry... */ + tx->tx_kiov->kiov_offset += rc; + tx->tx_kiov->kiov_len -= rc; + rc = 0; + } else { + /* whole kiov was sent out */ + rc -= tx->tx_kiov->kiov_len; + tx->tx_kiov++; + tx->tx_nkiov--; + } + } + } + + } else { + + ksock_tcpx_fini_t * tcpx = + cfs_alloc(sizeof(ksock_tcpx_fini_t), CFS_ALLOC_ZERO); + + ASSERT(tx->tx_resid == 0); + + if (!tcpx) { + + ksocknal_tx_launched (tx); + + } else { + + tcpx->tx = tx; + ExInitializeWorkItem( + &(tcpx->item), + ksocknal_fini_sending, + tcpx + ); + ExQueueWorkItem( + &(tcpx->item), + CriticalWorkQueue + ); + } + + tx = NULL; + } + + return (void *)tx; +} + +void +ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn) +{ +} + +void +ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) +{ + sock->kstc_conn = conn; + sock->kstc_sched_cb = ksocknal_schedule_callback; + sock->kstc_update_tx = ksocknal_update_tx; +} + +void +ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) +{ + sock->kstc_conn = NULL; + sock->kstc_sched_cb = NULL; + sock->kstc_update_tx = NULL; +} + diff --git a/lnet/klnds/socklnd/socklnd_lib-winnt.h b/lnet/klnds/socklnd/socklnd_lib-winnt.h new file mode 100755 index 0000000..492c9f5 --- /dev/null +++ b/lnet/klnds/socklnd/socklnd_lib-winnt.h @@ -0,0 +1,42 @@ +#define DEBUG_PORTAL_ALLOC +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif + +#ifndef __WINNT_TDILND_LIB_H__ +#define __WINNT_TDILND_LIB_H__ + +#include +#include + +#ifndef CONFIG_SMP + +static inline +int ksocknal_nsched(void) +{ + return 1; +} + +#else + +static inline int +ksocknal_nsched(void) +{ + return num_online_cpus(); +} + +static inline int +ksocknal_sched2cpu(int i) +{ + return i; +} + +static inline int +ksocknal_irqsched2cpu(int i) +{ + return i; +} + +#endif + +#endif diff --git a/lnet/klnds/socklnd/socklnd_modparams.c b/lnet/klnds/socklnd/socklnd_modparams.c new file mode 100644 index 0000000..917d4d7 --- /dev/null +++ b/lnet/klnds/socklnd/socklnd_modparams.c @@ -0,0 +1,156 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * Author: Eric Barton + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
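The bookkeeping above is the classic partial-send advance: consume the sent byte count from the front of the vector, trimming the first incomplete entry in place. Isolated as a self-contained helper (name hypothetical):

    static void iov_consume(struct iovec **piov, unsigned int *pniov, int sent)
    {
            struct iovec *iov = *piov;
            unsigned int niov = *pniov;

            while (sent > 0) {
                    if ((size_t)sent < iov->iov_len) {
                            /* partial entry: advance base, shrink length */
                            iov->iov_base = (char *)iov->iov_base + sent;
                            iov->iov_len -= sent;
                            sent = 0;
                    } else {
                            /* whole entry consumed: drop it */
                            sent -= iov->iov_len;
                            iov++;
                            niov--;
                    }
            }
            *piov = iov;
            *pniov = niov;
    }
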
+ */ + +#include "socklnd.h" + +static int sock_timeout = 50; +CFS_MODULE_PARM(sock_timeout, "i", int, 0644, + "dead socket timeout (seconds)"); + +static int credits = 256; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = 8; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, + "# concurrent sends to 1 peer"); + +static int nconnds = 4; +CFS_MODULE_PARM(nconnds, "i", int, 0444, + "# connection daemons"); + +static int min_reconnectms = 1000; +CFS_MODULE_PARM(min_reconnectms, "i", int, 0644, + "min connection retry interval (mS)"); + +static int max_reconnectms = 60000; +CFS_MODULE_PARM(max_reconnectms, "i", int, 0644, + "max connection retry interval (mS)"); + +#if defined(__APPLE__) && !defined(__DARWIN8__) +# define DEFAULT_EAGER_ACK 1 +#else +# define DEFAULT_EAGER_ACK 0 +#endif +static int eager_ack = DEFAULT_EAGER_ACK; +CFS_MODULE_PARM(eager_ack, "i", int, 0644, + "send tcp ack packets eagerly"); + +static int typed_conns = 1; +CFS_MODULE_PARM(typed_conns, "i", int, 0444, + "use different sockets for bulk"); + +static int min_bulk = (1<<10); +CFS_MODULE_PARM(min_bulk, "i", int, 0644, + "smallest 'large' message"); + +#ifdef __APPLE__ +# ifdef __DARWIN8__ +# define DEFAULT_BUFFER_SIZE (224*1024) +# else +# define DEFAULT_BUFFER_SIZE (1152 * 1024) +# endif +#else +# define DEFAULT_BUFFER_SIZE 0 +#endif +static int tx_buffer_size = DEFAULT_BUFFER_SIZE; +CFS_MODULE_PARM(tx_buffer_size, "i", int, 0644, + "socket tx buffer size (0 for system default)"); + +static int rx_buffer_size = DEFAULT_BUFFER_SIZE; +CFS_MODULE_PARM(rx_buffer_size, "i", int, 0644, + "socket rx buffer size (0 for system default)"); + +static int nagle = 0; +CFS_MODULE_PARM(nagle, "i", int, 0644, + "enable NAGLE?"); + +static int keepalive_idle = 30; +CFS_MODULE_PARM(keepalive_idle, "i", int, 0644, + "# idle seconds before probe"); + +#ifdef HAVE_BGL_SUPPORT +#define DEFAULT_KEEPALIVE_COUNT 100 +#else +#define DEFAULT_KEEPALIVE_COUNT 5 +#endif +static int keepalive_count = DEFAULT_KEEPALIVE_COUNT; +CFS_MODULE_PARM(keepalive_count, "i", int, 0644, + "# missed probes == dead"); + +static int keepalive_intvl = 5; +CFS_MODULE_PARM(keepalive_intvl, "i", int, 0644, + "seconds between probes"); + +static int enable_csum = 0; +CFS_MODULE_PARM(enable_csum, "i", int, 0644, + "enable check sum"); + +static int inject_csum_error = 0; +CFS_MODULE_PARM(inject_csum_error, "i", int, 0644, + "set non-zero to inject a checksum error"); +#ifdef CPU_AFFINITY +static int enable_irq_affinity = 1; +CFS_MODULE_PARM(enable_irq_affinity, "i", int, 0644, + "enable IRQ affinity"); +#endif + +static unsigned int zc_min_frag = (2<<10); +CFS_MODULE_PARM(zc_min_frag, "i", int, 0644, + "minimum fragment to zero copy"); + +#ifdef SOCKNAL_BACKOFF +static int backoff_init = 3; +CFS_MODULE_PARM(backoff_init, "i", int, 0644, + "seconds for initial tcp backoff"); + +static int backoff_max = 3; +CFS_MODULE_PARM(backoff_max, "i", int, 0644, + "seconds for maximum tcp backoff"); +#endif + +ksock_tunables_t ksocknal_tunables = { + .ksnd_timeout = &sock_timeout, + .ksnd_credits = &credits, + .ksnd_peercredits = &peer_credits, + .ksnd_nconnds = &nconnds, + .ksnd_min_reconnectms = &min_reconnectms, + .ksnd_max_reconnectms = &max_reconnectms, + .ksnd_eager_ack = &eager_ack, + .ksnd_typed_conns = &typed_conns, + .ksnd_min_bulk = &min_bulk, + .ksnd_tx_buffer_size = &tx_buffer_size, + .ksnd_rx_buffer_size = &rx_buffer_size, + .ksnd_nagle = &nagle, + .ksnd_keepalive_idle = &keepalive_idle, + .ksnd_keepalive_count = 
&keepalive_count, + .ksnd_keepalive_intvl = &keepalive_intvl, + .ksnd_enable_csum = &enable_csum, + .ksnd_inject_csum_error = &inject_csum_error, + .ksnd_zc_min_frag = &zc_min_frag, +#ifdef CPU_AFFINITY + .ksnd_irq_affinity = &enable_irq_affinity, +#endif +#ifdef SOCKNAL_BACKOFF + .ksnd_backoff_init = &backoff_init, + .ksnd_backoff_max = &backoff_max, +#endif +}; + diff --git a/lnet/klnds/viblnd/Makefile.in b/lnet/klnds/viblnd/Makefile.in index 5287e70..5b5c2db 100644 --- a/lnet/klnds/viblnd/Makefile.in +++ b/lnet/klnds/viblnd/Makefile.in @@ -1,5 +1,5 @@ -MODULES := kvibnal -kvibnal-objs := vibnal.o vibnal_cb.o +MODULES := kviblnd +kviblnd-objs := viblnd.o viblnd_cb.o viblnd_modparams.o EXTRA_POST_CFLAGS := @VIBCPPFLAGS@ diff --git a/lnet/klnds/viblnd/autoMakefile.am b/lnet/klnds/viblnd/autoMakefile.am index f90fbf2..19861a9 100644 --- a/lnet/klnds/viblnd/autoMakefile.am +++ b/lnet/klnds/viblnd/autoMakefile.am @@ -4,12 +4,10 @@ # See the file COPYING in this distribution if MODULES -if !CRAY_PORTALS -if BUILD_VIBNAL -modulenet_DATA = kvibnal$(KMODEXT) -endif +if BUILD_VIBLND +modulenet_DATA = kviblnd$(KMODEXT) endif endif -MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES = $(kvibnal-objs:%.o=%.c) vibnal.h vibnal_wire.h +MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ +DIST_SOURCES = $(kviblnd-objs:%.o=%.c) viblnd.h viblnd_wire.h diff --git a/lnet/klnds/viblnd/viblnd.c b/lnet/klnds/viblnd/viblnd.c index 65cd89c..0d738a1 100644 --- a/lnet/klnds/viblnd/viblnd.c +++ b/lnet/klnds/viblnd/viblnd.c @@ -22,30 +22,19 @@ * */ -#include "vibnal.h" - -nal_t kibnal_api; -ptl_handle_ni_t kibnal_ni; -kib_data_t kibnal_data; -kib_tunables_t kibnal_tunables; - -#ifdef CONFIG_SYSCTL -#define IBNAL_SYSCTL 202 - -#define IBNAL_SYSCTL_TIMEOUT 1 - -static ctl_table kibnal_ctl_table[] = { - {IBNAL_SYSCTL_TIMEOUT, "timeout", - &kibnal_tunables.kib_io_timeout, sizeof (int), - 0644, NULL, &proc_dointvec}, - { 0 } +#include "viblnd.h" + +lnd_t the_kiblnd = { + .lnd_type = VIBLND, + .lnd_startup = kibnal_startup, + .lnd_shutdown = kibnal_shutdown, + .lnd_ctl = kibnal_ctl, + .lnd_send = kibnal_send, + .lnd_recv = kibnal_recv, + .lnd_eager_recv = kibnal_eager_recv, }; -static ctl_table kibnal_top_ctl_table[] = { - {IBNAL_SYSCTL, "vibnal", NULL, 0, 0555, kibnal_ctl_table}, - { 0 } -}; -#endif +kib_data_t kibnal_data; void vibnal_assert_wire_constants (void) { @@ -56,7 +45,7 @@ void vibnal_assert_wire_constants (void) /* Constants... */ CLASSERT (IBNAL_MSG_MAGIC == 0x0be91b91); - CLASSERT (IBNAL_MSG_VERSION == 0x10); + CLASSERT (IBNAL_MSG_VERSION == 0x11); CLASSERT (IBNAL_MSG_CONNREQ == 0xc0); CLASSERT (IBNAL_MSG_CONNACK == 0xc1); CLASSERT (IBNAL_MSG_NOOP == 0xd0); @@ -164,13 +153,6 @@ void vibnal_assert_wire_constants (void) CLASSERT ((int)sizeof(((kib_msg_t *)0)->ibm_u.completion) == 12); } -void -kibnal_pause(int ticks) -{ - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(ticks); -} - __u32 kibnal_cksum (void *ptr, int nob) { @@ -192,33 +174,36 @@ kibnal_init_msg(kib_msg_t *msg, int type, int body_nob) } void -kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid, - __u64 dststamp, __u64 seq) +kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits, + lnet_nid_t dstnid, __u64 dststamp, __u64 seq) { /* CAVEAT EMPTOR! all message fields not set here should have been * initialised previously. 
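
The pattern throughout the new socklnd_modparams.c: each tunable is an ordinary static int registered as a module parameter, and ksocknal_tunables then publishes a pointer to each one, so consumers always read *ksocknal_tunables.ksnd_foo no matter which backend (module option or, where configured, sysctl) owns the storage. A reduced sketch of the same pattern as a plain Linux module, assuming CFS_MODULE_PARM reduces to module_param() on Linux (all demo names invented):

    #include <linux/kernel.h>
    #include <linux/module.h>
    #include <linux/moduleparam.h>

    /* backing storage; module_param() lets values be set at load time */
    static int sock_timeout = 50;
    module_param(sock_timeout, int, 0644);
    MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)");

    static int credits = 256;
    module_param(credits, int, 0444);
    MODULE_PARM_DESC(credits, "# concurrent sends");

    /* the rest of the driver sees only this table of pointers, so a
     * different backend can be swapped in without touching consumers */
    struct tunables {
            int *timeout;
            int *credits;
    };

    static struct tunables tun = {
            .timeout = &sock_timeout,
            .credits = &credits,
    };

    static int __init tun_demo_init(void)
    {
            pr_info("timeout=%d credits=%d\n", *tun.timeout, *tun.credits);
            return 0;
    }

    static void __exit tun_demo_exit(void) { }

    module_init(tun_demo_init);
    module_exit(tun_demo_exit);
    MODULE_LICENSE("GPL");
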
  */
         msg->ibm_magic = IBNAL_MSG_MAGIC;
-        msg->ibm_version = IBNAL_MSG_VERSION;
+        msg->ibm_version = version;
         /* ibm_type */
         msg->ibm_credits = credits;
         /* ibm_nob */
         msg->ibm_cksum = 0;
-        msg->ibm_srcnid = kibnal_lib.libnal_ni.ni_pid.nid;
+        msg->ibm_srcnid = lnet_ptlcompat_srcnid(kibnal_data.kib_ni->ni_nid,
+                                                dstnid);
         msg->ibm_srcstamp = kibnal_data.kib_incarnation;
         msg->ibm_dstnid = dstnid;
         msg->ibm_dststamp = dststamp;
         msg->ibm_seq = seq;
-#if IBNAL_CKSUM
-        /* NB ibm_cksum zero while computing cksum */
-        msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
-#endif
+
+        if (*kibnal_tunables.kib_cksum) {
+                /* NB ibm_cksum zero while computing cksum */
+                msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
+        }
 }
 
 int
-kibnal_unpack_msg(kib_msg_t *msg, int nob)
+kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob)
 {
         const int hdr_size = offsetof(kib_msg_t, ibm_u);
         __u32     msg_cksum;
+        __u32     msg_version;
         int       flip;
         int       msg_nob;
 #if !IBNAL_USE_FMR
@@ -231,18 +216,35 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob)
                 return -EPROTO;
         }
 
+        /* Future protocol version compatibility support!
+         * If the viblnd-specific protocol changes, or when LNET unifies
+         * protocols over all LNDs, the initial connection will negotiate a
+         * protocol version.  If I find this magic, I avoid any console
+         * errors.  If my peer is doing connection establishment, the
+         * reject will tell the peer which version I'm running. */
+
         if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
                 flip = 0;
         } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
                 flip = 1;
         } else {
+                if (msg->ibm_magic == LNET_PROTO_MAGIC ||
+                    msg->ibm_magic == __swab32(LNET_PROTO_MAGIC))
+                        return -EPROTO;
+
+                /* Completely out to lunch */
                 CERROR("Bad magic: %08x\n", msg->ibm_magic);
                 return -EPROTO;
         }
 
-        if (msg->ibm_version !=
-            (flip ? __swab16(IBNAL_MSG_VERSION) : IBNAL_MSG_VERSION)) {
-                CERROR("Bad version: %d\n", msg->ibm_version);
+        msg_version = flip ?
__swab16(msg->ibm_version) : msg->ibm_version; + if (expected_version == 0) { + if (msg_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD && + msg_version != IBNAL_MSG_VERSION) + return -EPROTO; + } else if (msg_version != expected_version) { + CERROR("Bad version: %x(%x expected)\n", + msg_version, expected_version); return -EPROTO; } @@ -270,7 +272,7 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob) if (flip) { /* leave magic unflipped as a clue to peer endianness */ - __swab16s(&msg->ibm_version); + msg->ibm_version = msg_version; CLASSERT (sizeof(msg->ibm_type) == 1); CLASSERT (sizeof(msg->ibm_credits) == 1); msg->ibm_nob = msg_nob; @@ -281,8 +283,8 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob) __swab64s(&msg->ibm_seq); } - if (msg->ibm_srcnid == PTL_NID_ANY) { - CERROR("Bad src nid: "LPX64"\n", msg->ibm_srcnid); + if (msg->ibm_srcnid == LNET_NID_ANY) { + CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); return -EPROTO; } @@ -311,13 +313,12 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob) break; case IBNAL_MSG_PUT_ACK: -#if IBNAL_USE_FMR if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) { CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, (int)(hdr_size + sizeof(msg->ibm_u.putack))); return -EPROTO; } - +#if IBNAL_USE_FMR if (flip) { __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr); __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob); @@ -422,103 +423,75 @@ kibnal_unpack_msg(kib_msg_t *msg, int nob) } int -kibnal_set_mynid(ptl_nid_t nid) +kibnal_start_listener (lnet_ni_t *ni) { - static cm_listen_data_t info; /* protected by kib_nid_mutex */ + static cm_listen_data_t info; - lib_ni_t *ni = &kibnal_lib.libnal_ni; - int rc; cm_return_t cmrc; - CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", - nid, ni->ni_pid.nid); - - down (&kibnal_data.kib_nid_mutex); + LASSERT (kibnal_data.kib_listen_handle == NULL); - if (nid == ni->ni_pid.nid) { - /* no change of NID */ - up (&kibnal_data.kib_nid_mutex); - return (0); + kibnal_data.kib_listen_handle = + cm_create_cep(cm_cep_transp_rc); + if (kibnal_data.kib_listen_handle == NULL) { + CERROR ("Can't create listen CEP\n"); + return -ENOMEM; } - CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", ni->ni_pid.nid, nid); + CDEBUG(D_NET, "Created CEP %p for listening\n", + kibnal_data.kib_listen_handle); - if (kibnal_data.kib_listen_handle != NULL) { - cmrc = cm_cancel(kibnal_data.kib_listen_handle); - if (cmrc != cm_stat_success) - CERROR ("Error %d stopping listener\n", cmrc); + memset(&info, 0, sizeof(info)); + info.listen_addr.end_pt.sid = + (__u64)(*kibnal_tunables.kib_service_number); - kibnal_pause(HZ/10); /* ensure no more callbacks */ + cmrc = cm_listen(kibnal_data.kib_listen_handle, &info, + kibnal_listen_callback, NULL); + if (cmrc == cm_stat_success) + return 0; - cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle); - if (cmrc != vv_return_ok) - CERROR ("Error %d destroying CEP\n", cmrc); - - kibnal_data.kib_listen_handle = NULL; - } - - /* Change NID. NB queued passive connection requests (if any) will be - * rejected with an incorrect destination NID */ - ni->ni_pid.nid = nid; - kibnal_data.kib_incarnation++; - mb(); - - /* Delete all existing peers and their connections after new - * NID/incarnation set to ensure no old connections in our brave - * new world. 
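
kibnal_unpack_msg() above detects a peer of opposite endianness by testing the magic both raw and byte-swapped, and treats LNET_PROTO_MAGIC as a quietly-rejected protocol negotiation rather than corruption. The magic test in isolation, as a self-contained sketch (the magic constant is copied from the wire asserts above; swab32 stands in for the kernel's __swab32):

    #include <stdint.h>
    #include <stdio.h>

    #define MSG_MAGIC 0x0be91b91U

    /* byte-swap a 32-bit word, standing in for __swab32() */
    static uint32_t swab32(uint32_t v)
    {
            return ((v & 0x000000ffU) << 24) |
                   ((v & 0x0000ff00U) << 8)  |
                   ((v & 0x00ff0000U) >> 8)  |
                   ((v & 0xff000000U) >> 24);
    }

    /* Returns 0 for a same-endian peer, 1 for an opposite-endian peer
     * (flip every multi-byte field), -1 for an unrecognisable header. */
    static int check_magic(uint32_t magic)
    {
            if (magic == MSG_MAGIC)
                    return 0;
            if (magic == swab32(MSG_MAGIC))
                    return 1;
            return -1;                  /* completely out to lunch */
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   check_magic(0x0be91b91U),          /* 0 */
                   check_magic(swab32(0x0be91b91U)),  /* 1 */
                   check_magic(0xdeadbeefU));         /* -1 */
            return 0;
    }
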
*/ - kibnal_del_peer (PTL_NID_ANY, 0); - - if (ni->ni_pid.nid != PTL_NID_ANY) { /* got a new NID to install */ - kibnal_data.kib_listen_handle = - cm_create_cep(cm_cep_transp_rc); - if (kibnal_data.kib_listen_handle == NULL) { - CERROR ("Can't create listen CEP\n"); - rc = -ENOMEM; - goto failed_0; - } + CERROR ("cm_listen error: %d\n", cmrc); - CDEBUG(D_NET, "Created CEP %p for listening\n", - kibnal_data.kib_listen_handle); + cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle); + LASSERT (cmrc == cm_stat_success); - memset(&info, 0, sizeof(info)); - info.listen_addr.end_pt.sid = kibnal_data.kib_svc_id; + kibnal_data.kib_listen_handle = NULL; + return -EINVAL; +} - cmrc = cm_listen(kibnal_data.kib_listen_handle, &info, - kibnal_listen_callback, NULL); - if (cmrc != 0) { - CERROR ("cm_listen error: %d\n", cmrc); - rc = -EINVAL; - goto failed_1; - } - } +void +kibnal_stop_listener(lnet_ni_t *ni) +{ + cm_return_t cmrc; - up (&kibnal_data.kib_nid_mutex); - return (0); + LASSERT (kibnal_data.kib_listen_handle != NULL); + + cmrc = cm_cancel(kibnal_data.kib_listen_handle); + if (cmrc != cm_stat_success) + CERROR ("Error %d stopping listener\n", cmrc); - failed_1: + cfs_pause(cfs_time_seconds(1)/10); /* ensure no more callbacks */ + cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle); - LASSERT (cmrc == cm_stat_success); + if (cmrc != vv_return_ok) + CERROR ("Error %d destroying CEP\n", cmrc); + kibnal_data.kib_listen_handle = NULL; - failed_0: - ni->ni_pid.nid = PTL_NID_ANY; - kibnal_data.kib_incarnation++; - mb(); - kibnal_del_peer (PTL_NID_ANY, 0); - up (&kibnal_data.kib_nid_mutex); - return rc; } -kib_peer_t * -kibnal_create_peer (ptl_nid_t nid) +int +kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid) { - kib_peer_t *peer; + kib_peer_t *peer; + unsigned long flags; + int rc; - LASSERT (nid != PTL_NID_ANY); + LASSERT (nid != LNET_NID_ANY); - PORTAL_ALLOC(peer, sizeof (*peer)); + LIBCFS_ALLOC(peer, sizeof (*peer)); if (peer == NULL) { - CERROR("Canot allocate perr\n"); - return (NULL); + CERROR("Cannot allocate peer\n"); + return -ENOMEM; } memset(peer, 0, sizeof(*peer)); /* zero flags etc */ @@ -530,43 +503,62 @@ kibnal_create_peer (ptl_nid_t nid) INIT_LIST_HEAD (&peer->ibp_conns); INIT_LIST_HEAD (&peer->ibp_tx_queue); - peer->ibp_reconnect_time = jiffies; - peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + peer->ibp_error = 0; + peer->ibp_last_alive = cfs_time_current(); + peer->ibp_reconnect_interval = 0; /* OK to connect at any time */ + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + + if (atomic_read(&kibnal_data.kib_npeers) >= + *kibnal_tunables.kib_concurrent_peers) { + rc = -EOVERFLOW; /* !! but at least it distinguishes */ + } else if (kibnal_data.kib_listen_handle == NULL) { + rc = -ESHUTDOWN; /* shutdown has started */ + } else { + rc = 0; + /* npeers only grows with the global lock held */ + atomic_inc(&kibnal_data.kib_npeers); + } + + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - atomic_inc (&kibnal_data.kib_npeers); - if (atomic_read(&kibnal_data.kib_npeers) <= IBNAL_CONCURRENT_PEERS) - return peer; + if (rc != 0) { + CERROR("Can't create peer: %s\n", + (rc == -ESHUTDOWN) ? 
"shutting down" : + "too many peers"); + LIBCFS_FREE(peer, sizeof(*peer)); + } else { + *peerp = peer; + } - CERROR("Too many peers: CQ will overflow\n"); - kibnal_peer_decref(peer); - return NULL; + return rc; } void kibnal_destroy_peer (kib_peer_t *peer) { - LASSERT (atomic_read (&peer->ibp_refcount) == 0); LASSERT (peer->ibp_persistence == 0); LASSERT (!kibnal_peer_active(peer)); LASSERT (peer->ibp_connecting == 0); + LASSERT (peer->ibp_accepting == 0); LASSERT (list_empty (&peer->ibp_conns)); LASSERT (list_empty (&peer->ibp_tx_queue)); - PORTAL_FREE (peer, sizeof (*peer)); + LIBCFS_FREE (peer, sizeof (*peer)); /* NB a peer's connections keep a reference on their peer until * they are destroyed, so we can be assured that _all_ state to do * with this peer has been cleaned up when its refcount drops to * zero. */ - atomic_dec (&kibnal_data.kib_npeers); + atomic_dec(&kibnal_data.kib_npeers); } -/* the caller is responsible for accounting for the additional reference - * that this creates */ kib_peer_t * -kibnal_find_peer_locked (ptl_nid_t nid) +kibnal_find_peer_locked (lnet_nid_t nid) { + /* the caller is responsible for accounting the additional reference + * that this creates */ struct list_head *peer_list = kibnal_nid2peerlist (nid); struct list_head *tmp; kib_peer_t *peer; @@ -577,13 +569,15 @@ kibnal_find_peer_locked (ptl_nid_t nid) LASSERT (peer->ibp_persistence != 0 || /* persistent peer */ peer->ibp_connecting != 0 || /* creating conns */ + peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); /* active conn */ if (peer->ibp_nid != nid) continue; - CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n", - peer, nid, atomic_read (&peer->ibp_refcount)); + CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n", + peer, libcfs_nid2str(nid), + atomic_read (&peer->ibp_refcount)); return (peer); } return (NULL); @@ -602,7 +596,7 @@ kibnal_unlink_peer_locked (kib_peer_t *peer) } int -kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, +kibnal_get_peer_info (int index, lnet_nid_t *nidp, __u32 *ipp, int *persistencep) { kib_peer_t *peer; @@ -619,6 +613,7 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || + peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); if (index-- > 0) @@ -639,20 +634,22 @@ kibnal_get_peer_info (int index, ptl_nid_t *nidp, __u32 *ipp, } int -kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip) +kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip) { kib_peer_t *peer; kib_peer_t *peer2; unsigned long flags; + int rc; - CDEBUG(D_NET, LPX64"@%08x\n", nid, ip); + CDEBUG(D_NET, "%s at %u.%u.%u.%u\n", + libcfs_nid2str(nid), HIPQUAD(ip)); - if (nid == PTL_NID_ANY) + if (nid == LNET_NID_ANY) return (-EINVAL); - peer = kibnal_create_peer (nid); - if (peer == NULL) - return (-ENOMEM); + rc = kibnal_create_peer(&peer, nid); + if (rc != 0) + return rc; write_lock_irqsave(&kibnal_data.kib_global_lock, flags); @@ -674,19 +671,13 @@ kibnal_add_persistent_peer (ptl_nid_t nid, __u32 ip) } void -kibnal_del_peer_locked (kib_peer_t *peer, int single_share) +kibnal_del_peer_locked (kib_peer_t *peer) { struct list_head *ctmp; struct list_head *cnxt; kib_conn_t *conn; - if (!single_share) - peer->ibp_persistence = 0; - else if (peer->ibp_persistence > 0) - peer->ibp_persistence--; - - if (peer->ibp_persistence != 0) - return; + peer->ibp_persistence = 0; if (list_empty(&peer->ibp_conns)) { kibnal_unlink_peer_locked(peer); @@ -704,8 +695,9 @@ 
kibnal_del_peer_locked (kib_peer_t *peer, int single_share) } int -kibnal_del_peer (ptl_nid_t nid, int single_share) +kibnal_del_peer (lnet_nid_t nid) { + CFS_LIST_HEAD (zombies); struct list_head *ptmp; struct list_head *pnxt; kib_peer_t *peer; @@ -717,7 +709,7 @@ kibnal_del_peer (ptl_nid_t nid, int single_share) write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - if (nid != PTL_NID_ANY) + if (nid != LNET_NID_ANY) lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; else { lo = 0; @@ -729,20 +721,27 @@ kibnal_del_peer (ptl_nid_t nid, int single_share) peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || + peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); - if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid)) + if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid)) continue; - kibnal_del_peer_locked (peer, single_share); - rc = 0; /* matched something */ + if (!list_empty(&peer->ibp_tx_queue)) { + LASSERT (list_empty(&peer->ibp_conns)); - if (single_share) - goto out; + list_splice_init(&peer->ibp_tx_queue, &zombies); + } + + kibnal_del_peer_locked (peer); + rc = 0; /* matched something */ } } - out: + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + + kibnal_txlist_done(&zombies, -EIO); + return (rc); } @@ -764,6 +763,7 @@ kibnal_get_conn_by_idx (int index) peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence > 0 || peer->ibp_connecting != 0 || + peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); list_for_each (ctmp, &peer->ibp_conns) { @@ -783,6 +783,74 @@ kibnal_get_conn_by_idx (int index) return (NULL); } +void +kibnal_debug_rx (kib_rx_t *rx) +{ + CDEBUG(D_CONSOLE, " %p nob %d msg_type %x " + "cred %d seq "LPD64"\n", + rx, rx->rx_nob, rx->rx_msg->ibm_type, + rx->rx_msg->ibm_credits, rx->rx_msg->ibm_seq); +} + +void +kibnal_debug_tx (kib_tx_t *tx) +{ + CDEBUG(D_CONSOLE, " %p snd %d q %d w %d rc %d dl %lx " + "cookie "LPX64" msg %s%s type %x cred %d seq "LPD64"\n", + tx, tx->tx_sending, tx->tx_queued, tx->tx_waiting, + tx->tx_status, tx->tx_deadline, tx->tx_cookie, + tx->tx_lntmsg[0] == NULL ? "-" : "!", + tx->tx_lntmsg[1] == NULL ? 
"-" : "!", + tx->tx_msg->ibm_type, tx->tx_msg->ibm_credits, + tx->tx_msg->ibm_seq); +} + +void +kibnal_debug_conn (kib_conn_t *conn) +{ + struct list_head *tmp; + int i; + + spin_lock(&conn->ibc_lock); + + CDEBUG(D_CONSOLE, "conn[%d] %p -> %s: \n", + atomic_read(&conn->ibc_refcount), conn, + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + CDEBUG(D_CONSOLE, " txseq "LPD64" rxseq "LPD64" state %d \n", + conn->ibc_txseq, conn->ibc_rxseq, conn->ibc_state); + CDEBUG(D_CONSOLE, " nposted %d cred %d o_cred %d r_cred %d\n", + conn->ibc_nsends_posted, conn->ibc_credits, + conn->ibc_outstanding_credits, conn->ibc_reserved_credits); + CDEBUG(D_CONSOLE, " disc %d comms_err %d\n", + conn->ibc_disconnect, conn->ibc_comms_error); + + CDEBUG(D_CONSOLE, " early_rxs:\n"); + list_for_each(tmp, &conn->ibc_early_rxs) + kibnal_debug_rx(list_entry(tmp, kib_rx_t, rx_list)); + + CDEBUG(D_CONSOLE, " tx_queue_nocred:\n"); + list_for_each(tmp, &conn->ibc_tx_queue_nocred) + kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue_rsrvd:\n"); + list_for_each(tmp, &conn->ibc_tx_queue_rsrvd) + kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " tx_queue:\n"); + list_for_each(tmp, &conn->ibc_tx_queue) + kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " active_txs:\n"); + list_for_each(tmp, &conn->ibc_active_txs) + kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + + CDEBUG(D_CONSOLE, " rxs:\n"); + for (i = 0; i < IBNAL_RX_MSGS; i++) + kibnal_debug_rx(&conn->ibc_rxs[i]); + + spin_unlock(&conn->ibc_lock); +} + int kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state) { @@ -835,7 +903,7 @@ kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state) rtr->destanation_qp = cv->cv_remote_qpn; rtr->receive_psn = cv->cv_rxpsn; rtr->responder_rdma_r_atom_num = IBNAL_OUS_DST_RD; - rtr->opt_min_rnr_nak_timer = IBNAL_RNR_NAK_TIMER; + rtr->opt_min_rnr_nak_timer = *kibnal_tunables.kib_rnr_nak_timer; // XXX sdp sets VV_QP_AT_OP_F but no actual optional options @@ -851,9 +919,9 @@ kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state) struct vv_qp_modify_rts_st *rts = &attr.modify.params.rts; rts->send_psn = cv->cv_txpsn; - rts->local_ack_timeout = IBNAL_LOCAL_ACK_TIMEOUT; - rts->retry_num = IBNAL_RETRY_CNT; - rts->rnr_num = IBNAL_RNR_CNT; + rts->local_ack_timeout = *kibnal_tunables.kib_local_ack_timeout; + rts->retry_num = *kibnal_tunables.kib_retry_cnt; + rts->rnr_num = *kibnal_tunables.kib_rnr_cnt; rts->dest_out_rdma_r_atom_num = IBNAL_OUS_DST_RD; attr.modify.vv_qp_attr_mask = VV_QP_AT_S_PSN | @@ -874,8 +942,9 @@ kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state) vvrc = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &attr, NULL); if (vvrc != vv_return_ok) { - CERROR("Can't modify qp -> "LPX64" state to %d: %d\n", - conn->ibc_peer->ibp_nid, new_state, vvrc); + CERROR("Can't modify qp -> %s state to %d: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + new_state, vvrc); return -EIO; } @@ -899,7 +968,7 @@ kibnal_create_conn (cm_cep_handle_t cep) LASSERT(!in_interrupt()); LASSERT(current == kibnal_data.kib_connd); - PORTAL_ALLOC(conn, sizeof (*conn)); + LIBCFS_ALLOC(conn, sizeof (*conn)); if (conn == NULL) { CERROR ("Can't allocate connection\n"); return (NULL); @@ -908,8 +977,12 @@ kibnal_create_conn (cm_cep_handle_t cep) /* zero flags, NULL pointers etc... 
*/ memset (conn, 0, sizeof (*conn)); + conn->ibc_version = IBNAL_MSG_VERSION; /* Use latest version at first */ + INIT_LIST_HEAD (&conn->ibc_early_rxs); + INIT_LIST_HEAD (&conn->ibc_tx_queue_nocred); INIT_LIST_HEAD (&conn->ibc_tx_queue); + INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd); INIT_LIST_HEAD (&conn->ibc_active_txs); spin_lock_init (&conn->ibc_lock); @@ -918,7 +991,7 @@ kibnal_create_conn (cm_cep_handle_t cep) conn->ibc_cep = cep; - PORTAL_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + LIBCFS_ALLOC(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); if (conn->ibc_connvars == NULL) { CERROR("Can't allocate in-progress connection state\n"); goto failed; @@ -928,7 +1001,7 @@ kibnal_create_conn (cm_cep_handle_t cep) get_random_bytes(&conn->ibc_connvars->cv_rxpsn, sizeof(conn->ibc_connvars->cv_rxpsn)); - PORTAL_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); + LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); if (conn->ibc_rxs == NULL) { CERROR("Cannot allocate RX buffers\n"); goto failed; @@ -976,7 +1049,7 @@ kibnal_create_conn (cm_cep_handle_t cep) reqattr.create.cq_send_h = kibnal_data.kib_cq; reqattr.create.cq_receive_h = kibnal_data.kib_cq; reqattr.create.send_max_outstand_wr = (1 + IBNAL_MAX_RDMA_FRAGS) * - IBNAL_MSG_QUEUE_SIZE; + (*kibnal_tunables.kib_concurrent_sends); reqattr.create.receive_max_outstand_wr = IBNAL_RX_MSGS; reqattr.create.max_scatgat_per_send_wr = 1; reqattr.create.max_scatgat_per_receive_wr = 1; @@ -996,12 +1069,13 @@ kibnal_create_conn (cm_cep_handle_t cep) conn->ibc_connvars->cv_local_qpn = rspattr.create_return.qp_num; if (rspattr.create_return.receive_max_outstand_wr < - IBNAL_MSG_QUEUE_SIZE || + IBNAL_RX_MSGS || rspattr.create_return.send_max_outstand_wr < - (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE) { + (1 + IBNAL_MAX_RDMA_FRAGS) * (*kibnal_tunables.kib_concurrent_sends)) { CERROR("Insufficient rx/tx work items: wanted %d/%d got %d/%d\n", - IBNAL_MSG_QUEUE_SIZE, - (1 + IBNAL_MAX_RDMA_FRAGS) * IBNAL_MSG_QUEUE_SIZE, + IBNAL_RX_MSGS, + (1 + IBNAL_MAX_RDMA_FRAGS) * + (*kibnal_tunables.kib_concurrent_sends), rspattr.create_return.receive_max_outstand_wr, rspattr.create_return.send_max_outstand_wr); goto failed; @@ -1033,6 +1107,8 @@ kibnal_destroy_conn (kib_conn_t *conn) LASSERT (atomic_read (&conn->ibc_refcount) == 0); LASSERT (list_empty(&conn->ibc_early_rxs)); LASSERT (list_empty(&conn->ibc_tx_queue)); + LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd)); + LASSERT (list_empty(&conn->ibc_tx_queue_nocred)); LASSERT (list_empty(&conn->ibc_active_txs)); LASSERT (conn->ibc_nsends_posted == 0); @@ -1066,16 +1142,16 @@ kibnal_destroy_conn (kib_conn_t *conn) kibnal_free_pages(conn->ibc_rx_pages); if (conn->ibc_rxs != NULL) - PORTAL_FREE(conn->ibc_rxs, + LIBCFS_FREE(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof(kib_rx_t)); if (conn->ibc_connvars != NULL) - PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); + LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars)); if (conn->ibc_peer != NULL) kibnal_peer_decref(conn->ibc_peer); - PORTAL_FREE(conn, sizeof (*conn)); + LIBCFS_FREE(conn, sizeof (*conn)); atomic_dec(&kibnal_data.kib_nconns); } @@ -1112,8 +1188,9 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) if (conn->ibc_incarnation == incarnation) continue; - CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n", - peer->ibp_nid, conn->ibc_incarnation, incarnation); + CDEBUG(D_NET, "Closing stale conn -> %s incarnation:"LPX64"("LPX64")\n", + libcfs_nid2str(peer->ibp_nid), + 
conn->ibc_incarnation, incarnation); count++; kibnal_close_conn_locked (conn, -ESTALE); @@ -1123,7 +1200,7 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) } int -kibnal_close_matching_conns (ptl_nid_t nid) +kibnal_close_matching_conns (lnet_nid_t nid) { kib_peer_t *peer; struct list_head *ptmp; @@ -1136,7 +1213,7 @@ kibnal_close_matching_conns (ptl_nid_t nid) write_lock_irqsave(&kibnal_data.kib_global_lock, flags); - if (nid != PTL_NID_ANY) + if (nid != LNET_NID_ANY) lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; else { lo = 0; @@ -1149,9 +1226,10 @@ kibnal_close_matching_conns (ptl_nid_t nid) peer = list_entry (ptmp, kib_peer_t, ibp_list); LASSERT (peer->ibp_persistence != 0 || peer->ibp_connecting != 0 || + peer->ibp_accepting != 0 || !list_empty (&peer->ibp_conns)); - if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid)) + if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid)) continue; count += kibnal_close_peer_conns_locked (peer, 0); @@ -1161,70 +1239,69 @@ kibnal_close_matching_conns (ptl_nid_t nid) write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); /* wildcards always succeed */ - if (nid == PTL_NID_ANY) + if (nid == LNET_NID_ANY) return (0); return (count == 0 ? -ENOENT : 0); } int -kibnal_cmd(struct portals_cfg *pcfg, void * private) +kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) { - int rc = -EINVAL; + struct libcfs_ioctl_data *data = arg; + int rc = -EINVAL; - LASSERT (pcfg != NULL); + LASSERT (ni == kibnal_data.kib_ni); - switch(pcfg->pcfg_command) { - case NAL_CMD_GET_PEER: { - ptl_nid_t nid = 0; - __u32 ip = 0; - int share_count = 0; + switch(cmd) { + case IOC_LIBCFS_GET_PEER: { + lnet_nid_t nid = 0; + __u32 ip = 0; + int share_count = 0; - rc = kibnal_get_peer_info(pcfg->pcfg_count, + rc = kibnal_get_peer_info(data->ioc_count, &nid, &ip, &share_count); - pcfg->pcfg_nid = nid; - pcfg->pcfg_size = 0; - pcfg->pcfg_id = ip; - pcfg->pcfg_misc = IBNAL_SERVICE_NUMBER; /* port */ - pcfg->pcfg_count = 0; - pcfg->pcfg_wait = share_count; + data->ioc_nid = nid; + data->ioc_count = share_count; + data->ioc_u32[0] = ip; + data->ioc_u32[1] = *kibnal_tunables.kib_service_number; /* port */ break; } - case NAL_CMD_ADD_PEER: { - rc = kibnal_add_persistent_peer (pcfg->pcfg_nid, - pcfg->pcfg_id); /* IP */ + case IOC_LIBCFS_ADD_PEER: { + rc = kibnal_add_persistent_peer (data->ioc_nid, + data->ioc_u32[0]); /* IP */ break; } - case NAL_CMD_DEL_PEER: { - rc = kibnal_del_peer (pcfg->pcfg_nid, - /* flags == single_share */ - pcfg->pcfg_flags != 0); + case IOC_LIBCFS_DEL_PEER: { + rc = kibnal_del_peer (data->ioc_nid); break; } - case NAL_CMD_GET_CONN: { - kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count); + case IOC_LIBCFS_GET_CONN: { + kib_conn_t *conn = kibnal_get_conn_by_idx (data->ioc_count); if (conn == NULL) rc = -ENOENT; else { + // kibnal_debug_conn(conn); rc = 0; - pcfg->pcfg_nid = conn->ibc_peer->ibp_nid; - pcfg->pcfg_id = 0; - pcfg->pcfg_misc = 0; - pcfg->pcfg_flags = 0; + data->ioc_nid = conn->ibc_peer->ibp_nid; kibnal_conn_decref(conn); } break; } - case NAL_CMD_CLOSE_CONNECTION: { - rc = kibnal_close_matching_conns (pcfg->pcfg_nid); + case IOC_LIBCFS_CLOSE_CONNECTION: { + rc = kibnal_close_matching_conns (data->ioc_nid); break; } - case NAL_CMD_REGISTER_MYNID: { - if (pcfg->pcfg_nid == PTL_NID_ANY) + case IOC_LIBCFS_REGISTER_MYNID: { + if (ni->ni_nid == data->ioc_nid) { + rc = 0; + } else { + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); rc = -EINVAL; 
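
kibnal_ctl() above is the new-style control path: one handler per NI, switching on IOC_LIBCFS_* commands and passing results back through fields of the shared libcfs_ioctl_data. A stripped-down sketch of the same dispatch shape (struct layout and command values are invented for illustration):

    #include <stdio.h>
    #include <errno.h>
    #include <stdint.h>

    /* illustrative subset of an ioctl payload */
    struct ctl_data {
            uint64_t nid;
            uint32_t u32[2];
            int      count;
    };

    enum { CTL_GET_PEER = 1, CTL_ADD_PEER, CTL_DEL_PEER };

    static int
    demo_ctl(unsigned int cmd, struct ctl_data *data)
    {
            switch (cmd) {
            case CTL_GET_PEER:
                    /* out-parameters travel back in the same struct */
                    data->nid    = 0x12345ULL;
                    data->u32[0] = 0x0a000001;     /* peer IP */
                    return 0;

            case CTL_ADD_PEER:
                    printf("add peer %llx at %x\n",
                           (unsigned long long)data->nid, data->u32[0]);
                    return 0;

            case CTL_DEL_PEER:
                    printf("del peer %llx\n", (unsigned long long)data->nid);
                    return 0;

            default:
                    return -EINVAL;    /* unknown command */
            }
    }

    int main(void)
    {
            struct ctl_data d = { .nid = 0xabcULL, .u32 = { 0x0a000002, 0 } };

            demo_ctl(CTL_ADD_PEER, &d);
            demo_ctl(CTL_GET_PEER, &d);
            printf("got nid %llx\n", (unsigned long long)d.nid);
            return demo_ctl(999, &d) == -EINVAL ? 0 : 1;
    }
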
- else - rc = kibnal_set_mynid (pcfg->pcfg_nid); + } break; } } @@ -1242,7 +1319,7 @@ kibnal_free_pages (kib_pages_t *p) if (p->ibp_pages[i] != NULL) __free_page(p->ibp_pages[i]); - PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); + LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); } int @@ -1251,7 +1328,7 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) kib_pages_t *p; int i; - PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); + LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); if (p == NULL) { CERROR ("Can't allocate buffer %d\n", npages); return (-ENOMEM); @@ -1278,36 +1355,36 @@ kibnal_alloc_tx_descs (void) { int i; - PORTAL_ALLOC (kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); + LIBCFS_ALLOC (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS() * sizeof(kib_tx_t)); if (kibnal_data.kib_tx_descs == NULL) return -ENOMEM; memset(kibnal_data.kib_tx_descs, 0, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); + IBNAL_TX_MSGS() * sizeof(kib_tx_t)); - for (i = 0; i < IBNAL_TX_MSGS; i++) { + for (i = 0; i < IBNAL_TX_MSGS(); i++) { kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; #if IBNAL_USE_FMR - PORTAL_ALLOC(tx->tx_pages, PTL_MD_MAX_IOV * + LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV * sizeof(*tx->tx_pages)); if (tx->tx_pages == NULL) return -ENOMEM; #else - PORTAL_ALLOC(tx->tx_wrq, + LIBCFS_ALLOC(tx->tx_wrq, (1 + IBNAL_MAX_RDMA_FRAGS) * sizeof(*tx->tx_wrq)); if (tx->tx_wrq == NULL) return -ENOMEM; - PORTAL_ALLOC(tx->tx_gl, + LIBCFS_ALLOC(tx->tx_gl, (1 + IBNAL_MAX_RDMA_FRAGS) * sizeof(*tx->tx_gl)); if (tx->tx_gl == NULL) return -ENOMEM; - PORTAL_ALLOC(tx->tx_rd, + LIBCFS_ALLOC(tx->tx_rd, offsetof(kib_rdma_desc_t, rd_frags[IBNAL_MAX_RDMA_FRAGS])); if (tx->tx_rd == NULL) @@ -1326,33 +1403,33 @@ kibnal_free_tx_descs (void) if (kibnal_data.kib_tx_descs == NULL) return; - for (i = 0; i < IBNAL_TX_MSGS; i++) { + for (i = 0; i < IBNAL_TX_MSGS(); i++) { kib_tx_t *tx = &kibnal_data.kib_tx_descs[i]; #if IBNAL_USE_FMR if (tx->tx_pages != NULL) - PORTAL_FREE(tx->tx_pages, PTL_MD_MAX_IOV * + LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV * sizeof(*tx->tx_pages)); #else if (tx->tx_wrq != NULL) - PORTAL_FREE(tx->tx_wrq, + LIBCFS_FREE(tx->tx_wrq, (1 + IBNAL_MAX_RDMA_FRAGS) * sizeof(*tx->tx_wrq)); if (tx->tx_gl != NULL) - PORTAL_FREE(tx->tx_gl, + LIBCFS_FREE(tx->tx_gl, (1 + IBNAL_MAX_RDMA_FRAGS) * sizeof(*tx->tx_gl)); if (tx->tx_rd != NULL) - PORTAL_FREE(tx->tx_rd, + LIBCFS_FREE(tx->tx_rd, offsetof(kib_rdma_desc_t, rd_frags[IBNAL_MAX_RDMA_FRAGS])); #endif } - PORTAL_FREE(kibnal_data.kib_tx_descs, - IBNAL_TX_MSGS * sizeof(kib_tx_t)); + LIBCFS_FREE(kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS() * sizeof(kib_tx_t)); } #if IBNAL_USE_FMR @@ -1396,24 +1473,23 @@ kibnal_setup_tx_descs (void) /* No fancy arithmetic when we do the buffer calculations */ CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); - rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, - 0); + rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, + IBNAL_TX_MSG_PAGES(), 0); if (rc != 0) return (rc); - for (i = 0; i < IBNAL_TX_MSGS; i++) { + for (i = 0; i < IBNAL_TX_MSGS(); i++) { page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; tx = &kibnal_data.kib_tx_descs[i]; #if IBNAL_USE_FMR memset(&fmr_props, 0, sizeof(fmr_props)); fmr_props.pd_hndl = kibnal_data.kib_pd; - fmr_props.acl = (vv_acc_r_mem_read | - vv_acc_r_mem_write | + fmr_props.acl = (vv_acc_r_mem_write | vv_acc_l_mem_write); - fmr_props.max_pages = PTL_MD_MAX_IOV; + fmr_props.max_pages = LNET_MAX_IOV; fmr_props.log2_page_sz = PAGE_SHIFT; - 
fmr_props.max_outstanding_maps = IBNAL_FMR_NMAPS; + fmr_props.max_outstanding_maps = *kibnal_tunables.kib_fmr_remaps; vvrc = vv_alloc_fmr(kibnal_data.kib_hca, &fmr_props, @@ -1426,7 +1502,7 @@ kibnal_setup_tx_descs (void) return -ENOMEM; } - tx->tx_md.md_fmrcount = IBNAL_FMR_NMAPS; + tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps; tx->tx_md.md_active = 0; #endif tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + @@ -1440,17 +1516,10 @@ kibnal_setup_tx_descs (void) &rkey); LASSERT (vvrc == vv_return_ok); - tx->tx_isnblk = (i >= IBNAL_NTX); - CDEBUG(D_NET, "Tx[%d] %p->%p[%x]\n", i, tx, tx->tx_msg, tx->tx_lkey); - if (tx->tx_isnblk) - list_add (&tx->tx_list, - &kibnal_data.kib_idle_nblk_txs); - else - list_add (&tx->tx_list, - &kibnal_data.kib_idle_txs); + list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); page_offset += IBNAL_MSG_SIZE; LASSERT (page_offset <= PAGE_SIZE); @@ -1458,7 +1527,7 @@ kibnal_setup_tx_descs (void) if (page_offset == PAGE_SIZE) { page_offset = 0; ipage++; - LASSERT (ipage <= IBNAL_TX_MSG_PAGES); + LASSERT (ipage <= IBNAL_TX_MSG_PAGES()); } } @@ -1466,42 +1535,34 @@ kibnal_setup_tx_descs (void) } void -kibnal_api_shutdown (nal_t *nal) +kibnal_shutdown (lnet_ni_t *ni) { - int i; - vv_return_t vvrc; - - if (nal->nal_refct != 0) { - /* This module got the first ref */ - PORTAL_MODULE_UNUSE; - return; - } + int i; + vv_return_t vvrc; + LASSERT (ni == kibnal_data.kib_ni); + LASSERT (ni->ni_data == &kibnal_data); + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); - - LASSERT(nal == &kibnal_api); + atomic_read (&libcfs_kmemory)); switch (kibnal_data.kib_init) { case IBNAL_INIT_ALL: - /* stop calls to nal_cmd */ - libcfs_nal_cmd_unregister(VIBNAL); - /* No new peers */ + /* stop accepting connections and prevent new peers */ + kibnal_stop_listener(ni); - /* resetting my NID removes my listener and nukes all current - * peers and their connections */ - kibnal_set_mynid (PTL_NID_ANY); + /* nuke all existing peers */ + kibnal_del_peer(LNET_NID_ANY); /* Wait for all peer state to clean up */ i = 2; - while (atomic_read (&kibnal_data.kib_npeers) != 0) { + while (atomic_read(&kibnal_data.kib_npeers) != 0) { i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? 
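
The "(i & (-i)) == i" test in these shutdown wait loops isolates the lowest set bit of i, so it is true exactly when i is a power of two; the loop still polls every second but escalates to D_WARNING only on an exponentially thinning schedule. Demonstrated standalone:

    #include <stdio.h>

    /* (i & -i) keeps only the lowest set bit of i; it equals i exactly
     * when i is a power of two, so a once-per-second wait loop warns at
     * 2, 4, 8, 16... ticks instead of every tick. */
    static int warn_now(int i)
    {
            return (i & (-i)) == i;
    }

    int main(void)
    {
            int i;

            for (i = 2; i <= 20; i++)
                    if (warn_now(i))
                            printf("still waiting after %d ticks\n", i);
            return 0;
    }
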
*/ "waiting for %d peers to disconnect\n", - atomic_read (&kibnal_data.kib_npeers)); - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (HZ); + atomic_read(&kibnal_data.kib_npeers)); + cfs_pause(cfs_time_seconds(1)); } /* fall through */ @@ -1514,7 +1575,7 @@ kibnal_api_shutdown (nal_t *nal) case IBNAL_INIT_TXD: kibnal_free_pages (kibnal_data.kib_tx_pages); #if IBNAL_USE_FMR - kibnal_free_fmrs(IBNAL_TX_MSGS); + kibnal_free_fmrs(IBNAL_TX_MSGS()); #endif /* fall through */ @@ -1542,19 +1603,13 @@ kibnal_api_shutdown (nal_t *nal) CERROR ("Close HCA error: %d\n", vvrc); /* fall through */ - case IBNAL_INIT_LIB: - lib_fini(&kibnal_lib); - /* fall through */ - case IBNAL_INIT_DATA: - LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0); + LASSERT (atomic_read(&kibnal_data.kib_npeers) == 0); LASSERT (kibnal_data.kib_peers != NULL); for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { LASSERT (list_empty (&kibnal_data.kib_peers[i])); } LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0); - LASSERT (list_empty (&kibnal_data.kib_sched_rxq)); - LASSERT (list_empty (&kibnal_data.kib_sched_txq)); LASSERT (list_empty (&kibnal_data.kib_connd_zombies)); LASSERT (list_empty (&kibnal_data.kib_connd_conns)); LASSERT (list_empty (&kibnal_data.kib_connd_pcreqs)); @@ -1571,8 +1626,7 @@ kibnal_api_shutdown (nal_t *nal) CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ "Waiting for %d threads to terminate\n", atomic_read (&kibnal_data.kib_nthreads)); - set_current_state (TASK_INTERRUPTIBLE); - schedule_timeout (HZ); + cfs_pause(cfs_time_seconds(1)); } /* fall through */ @@ -1583,54 +1637,119 @@ kibnal_api_shutdown (nal_t *nal) kibnal_free_tx_descs(); if (kibnal_data.kib_peers != NULL) - PORTAL_FREE (kibnal_data.kib_peers, + LIBCFS_FREE (kibnal_data.kib_peers, sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", - atomic_read (&portal_kmemory)); - printk(KERN_INFO "Lustre: Voltaire IB NAL unloaded (final mem %d)\n", - atomic_read(&portal_kmemory)); + atomic_read (&libcfs_kmemory)); kibnal_data.kib_init = IBNAL_INIT_NOTHING; + PORTAL_MODULE_UNUSE; } int -kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) +kibnal_startup (lnet_ni_t *ni) { + char scratch[32]; + char ipif_name[32]; + char *hca_name; + __u32 ip; + __u32 netmask; + int up; + int nob; + int devno; struct timeval tv; - ptl_process_id_t process_id; - int pkmem = atomic_read(&portal_kmemory); int rc; int i; vv_request_event_record_t req_er; vv_return_t vvrc; - LASSERT (nal == &kibnal_api); + LASSERT (ni->ni_lnd == &the_kiblnd); + + /* Only 1 instance supported */ + if (kibnal_data.kib_init != IBNAL_INIT_NOTHING) { + CERROR ("Only 1 instance supported\n"); + return -EPERM; + } + + if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) { + CERROR ("Can't set credits(%d) > ntx(%d)\n", + *kibnal_tunables.kib_credits, + *kibnal_tunables.kib_ntx); + return -EINVAL; + } + + ni->ni_maxtxcredits = *kibnal_tunables.kib_credits; + ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits; + + CLASSERT (LNET_MAX_INTERFACES > 1); + + if (ni->ni_interfaces[0] != NULL) { + /* Use the HCA specified in 'networks=' */ + + if (ni->ni_interfaces[1] != NULL) { + CERROR("Multiple interfaces not supported\n"); + return -EPERM; + } + + /* Parse */ + hca_name = ni->ni_interfaces[0]; + nob = strlen(*kibnal_tunables.kib_hca_basename); + + if (strncmp(hca_name, *kibnal_tunables.kib_hca_basename, nob) || + sscanf(hca_name 
+ nob, "%d%n", &devno, &nob) < 1) { + CERROR("Unrecognised HCA %s\n", hca_name); + return -EINVAL; + } - if (nal->nal_refct != 0) { - if (actual_limits != NULL) - *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits; - /* This module got the first ref */ - PORTAL_MODULE_USE; - return (PTL_OK); + } else { + /* Use 0 */ + devno = 0; + + hca_name = scratch; + snprintf(hca_name, sizeof(scratch), "%s%d", + *kibnal_tunables.kib_hca_basename, devno); + if (strlen(hca_name) == sizeof(scratch) - 1) { + CERROR("HCA name %s truncated\n", hca_name); + return -EINVAL; + } } - LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING); + /* Find IP address from */ + snprintf(ipif_name, sizeof(ipif_name), "%s%d", + *kibnal_tunables.kib_ipif_basename, devno); + if (strlen(ipif_name) == sizeof(ipif_name - 1)) { + CERROR("IPoIB interface name %s truncated\n", ipif_name); + return -EINVAL; + } + + rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask); + if (rc != 0) { + CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc); + return -ENETDOWN; + } + + if (!up) { + CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name); + return -ENETDOWN; + } + + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip); + + PORTAL_MODULE_USE; memset (&kibnal_data, 0, sizeof (kibnal_data)); /* zero pointers, flags etc */ + + kibnal_data.kib_ni = ni; + ni->ni_data = &kibnal_data; do_gettimeofday(&tv); kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - kibnal_data.kib_svc_id = IBNAL_SERVICE_NUMBER; - - init_MUTEX (&kibnal_data.kib_nid_mutex); rwlock_init(&kibnal_data.kib_global_lock); kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE; - PORTAL_ALLOC (kibnal_data.kib_peers, + LIBCFS_ALLOC (kibnal_data.kib_peers, sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); if (kibnal_data.kib_peers == NULL) { goto failed; @@ -1646,14 +1765,10 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, init_waitqueue_head (&kibnal_data.kib_connd_waitq); spin_lock_init (&kibnal_data.kib_sched_lock); - INIT_LIST_HEAD (&kibnal_data.kib_sched_txq); - INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq); init_waitqueue_head (&kibnal_data.kib_sched_waitq); spin_lock_init (&kibnal_data.kib_tx_lock); INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); - INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs); - init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq); rc = kibnal_alloc_tx_descs(); if (rc != 0) { @@ -1665,20 +1780,6 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, kibnal_data.kib_init = IBNAL_INIT_DATA; /*****************************************************/ - process_id.pid = requested_pid; - process_id.nid = PTL_NID_ANY; - - rc = lib_init(&kibnal_lib, nal, process_id, - requested_limits, actual_limits); - if (rc != PTL_OK) { - CERROR("lib_init failed: error %d\n", rc); - goto failed; - } - - /* lib interface initialised */ - kibnal_data.kib_init = IBNAL_INIT_LIB; - /*****************************************************/ - for (i = 0; i < IBNAL_N_SCHED; i++) { rc = kibnal_thread_start (kibnal_scheduler, (void *)((long)i)); if (rc != 0) { @@ -1694,10 +1795,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, goto failed; } - /* TODO: apparently only one adapter is supported */ - vvrc = vv_hca_open("InfiniHost0", NULL, &kibnal_data.kib_hca); + vvrc = vv_hca_open(hca_name, NULL, &kibnal_data.kib_hca); if (vvrc != vv_return_ok) { - CERROR ("Can't open CA: %d\n", vvrc); + CERROR ("Can't open HCA %s: %d\n", hca_name, vvrc); goto failed; } @@ -1709,7 +1809,7 @@ kibnal_api_startup (nal_t *nal, 
ptl_pid_t requested_pid, vvrc = vv_set_async_event_cb (kibnal_data.kib_hca, req_er, kibnal_async_callback); if (vvrc != vv_return_ok) { - CERROR ("Can't open CA: %d\n", vvrc); + CERROR ("Can't set HCA %s callback: %d\n", hca_name, vvrc); goto failed; } @@ -1719,7 +1819,7 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, vvrc = vv_hca_query(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs); if (vvrc != vv_return_ok) { - CERROR ("Can't size port attrs: %d\n", vvrc); + CERROR ("Can't size port attrs for %s: %d\n", hca_name, vvrc); goto failed; } @@ -1733,8 +1833,8 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, vvrc = vv_port_query(kibnal_data.kib_hca, port_num, pattr); if (vvrc != vv_return_ok) { - CERROR("vv_port_query failed for port %d: %d\n", - port_num, vvrc); + CERROR("vv_port_query failed for %s port %d: %d\n", + hca_name, port_num, vvrc); continue; } @@ -1752,45 +1852,47 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, CDEBUG(D_NET, "port[%d] Active\n", port_num); /* Found a suitable port. Get its GUID and PKEY. */ - kibnal_data.kib_port = port_num; - tbl_count = 1; vvrc = vv_get_port_gid_tbl(kibnal_data.kib_hca, port_num, &tbl_count, &kibnal_data.kib_port_gid); if (vvrc != vv_return_ok) { CERROR("vv_get_port_gid_tbl failed " - "for port %d: %d\n", port_num, vvrc); + "for %s port %d: %d\n", + hca_name, port_num, vvrc); continue; } tbl_count = 1; vvrc = vv_get_port_partition_tbl(kibnal_data.kib_hca, - port_num, &tbl_count, - &kibnal_data.kib_port_pkey); + port_num, &tbl_count, + &kibnal_data.kib_port_pkey); if (vvrc != vv_return_ok) { CERROR("vv_get_port_partition_tbl failed " - "for port %d: %d\n", port_num, vvrc); + "for %s port %d: %d\n", + hca_name, port_num, vvrc); continue; } + kibnal_data.kib_port = port_num; + break; case vv_state_linkActDefer: /* TODO: correct? 
*/ case vv_state_linkNoChange: - CERROR("Unexpected port[%d] state %d\n", - i, pattr->port_state); + CERROR("Unexpected %s port[%d] state %d\n", + hca_name, i, pattr->port_state); continue; } break; } if (kibnal_data.kib_port == -1) { - CERROR ("Can't find an active port\n"); + CERROR ("Can't find an active port on %s\n", hca_name); goto failed; } - CDEBUG(D_NET, "Using port %d - GID="LPX64":"LPX64"\n", - kibnal_data.kib_port, + CDEBUG(D_NET, "Using %s port %d - GID="LPX64":"LPX64"\n", + hca_name, kibnal_data.kib_port, kibnal_data.kib_port_gid.scope.g.subnet, kibnal_data.kib_port_gid.scope.g.eui64); @@ -1820,10 +1922,11 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, /* flag TX descs initialised */ kibnal_data.kib_init = IBNAL_INIT_TXD; /*****************************************************/ + { uint32_t nentries; - vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES, + vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(), kibnal_cq_callback, NULL, /* context */ &kibnal_data.kib_cq, &nentries); @@ -1835,9 +1938,9 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, /* flag CQ initialised */ kibnal_data.kib_init = IBNAL_INIT_CQ; - if (nentries < IBNAL_CQ_ENTRIES) { + if (nentries < IBNAL_CQ_ENTRIES()) { CERROR ("CQ only has %d entries, need %d\n", - nentries, IBNAL_CQ_ENTRIES); + nentries, IBNAL_CQ_ENTRIES()); goto failed; } @@ -1849,40 +1952,30 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, goto failed; } } - - /*****************************************************/ - rc = libcfs_nal_cmd_register(VIBNAL, &kibnal_cmd, NULL); + rc = kibnal_start_listener(ni); if (rc != 0) { - CERROR ("Can't initialise command interface (rc = %d)\n", rc); + CERROR("Can't start listener: %d\n", rc); goto failed; } - + /* flag everything initialised */ kibnal_data.kib_init = IBNAL_INIT_ALL; /*****************************************************/ - printk(KERN_INFO "Lustre: Voltaire IB NAL loaded " - "(initial mem %d)\n", pkmem); - - return (PTL_OK); + return (0); failed: - CDEBUG(D_NET, "kibnal_api_startup failed\n"); - kibnal_api_shutdown (&kibnal_api); - return (PTL_FAIL); + CDEBUG(D_NET, "kibnal_startup failed\n"); + kibnal_shutdown (ni); + return (-ENETDOWN); } void __exit kibnal_module_fini (void) { -#ifdef CONFIG_SYSCTL - if (kibnal_tunables.kib_sysctl != NULL) - unregister_sysctl_table (kibnal_tunables.kib_sysctl); -#endif - PtlNIFini(kibnal_ni); - - ptl_unregister_nal(VIBNAL); + lnet_unregister_lnd(&the_kiblnd); + kibnal_tunables_fini(); } int __init @@ -1903,38 +1996,17 @@ kibnal_module_init (void) CLASSERT (offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[IBNAL_MAX_RDMA_FRAGS]) <= IBNAL_MSG_SIZE); #endif - /* the following must be sizeof(int) for proc_dointvec() */ - CLASSERT (sizeof (kibnal_tunables.kib_io_timeout) == sizeof (int)); - - kibnal_api.nal_ni_init = kibnal_api_startup; - kibnal_api.nal_ni_fini = kibnal_api_shutdown; - - /* Initialise dynamic tunables to defaults once only */ - kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT; + rc = kibnal_tunables_init(); + if (rc != 0) + return rc; - rc = ptl_register_nal(VIBNAL, &kibnal_api); - if (rc != PTL_OK) { - CERROR("Can't register IBNAL: %d\n", rc); - return (-ENOMEM); /* or something... */ - } + lnet_register_lnd(&the_kiblnd); - /* Pure gateways want the NAL started up at module load time... 
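
Under the new configuration model the module no longer brings up a network itself: kibnal_module_init() only registers the lnd_t descriptor, and LNET invokes lnd_startup/lnd_shutdown per lnet_ni_t as networks are configured and torn down. A user-space skeleton of that register/dispatch pattern (the struct here is a scaled-down stand-in for lnd_t; all demo_* names are invented):

    #include <stdio.h>

    /* scaled-down stand-in for lnd_t: a type tag plus lifecycle hooks
     * the core calls when a network of this type comes or goes */
    struct lnd {
            unsigned int type;
            int  (*startup)(void *ni);
            void (*shutdown)(void *ni);
    };

    static struct lnd *registry[8];    /* stand-in for the LND table */

    static void register_lnd(struct lnd *l)   { registry[l->type] = l; }
    static void unregister_lnd(struct lnd *l) { registry[l->type] = NULL; }

    static int  demo_startup(void *ni)  { printf("startup ni=%p\n", ni); return 0; }
    static void demo_shutdown(void *ni) { printf("shutdown ni=%p\n", ni); }

    static struct lnd the_demolnd = {
            .type     = 3,             /* illustrative type code */
            .startup  = demo_startup,
            .shutdown = demo_shutdown,
    };

    int main(void)
    {
            int dummy_ni;

            /* module_init(): publish the descriptor... */
            register_lnd(&the_demolnd);

            /* ...later the core starts/stops instances via the hooks */
            registry[3]->startup(&dummy_ni);
            registry[3]->shutdown(&dummy_ni);

            /* module_exit(): withdraw it */
            unregister_lnd(&the_demolnd);
            return 0;
    }
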
*/ - rc = PtlNIInit(VIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - ptl_unregister_nal(VIBNAL); - return (-ENODEV); - } - -#ifdef CONFIG_SYSCTL - /* Press on regardless even if registering sysctl doesn't work */ - kibnal_tunables.kib_sysctl = - register_sysctl_table (kibnal_top_ctl_table, 0); -#endif - return (0); + return 0; } MODULE_AUTHOR("Cluster File Systems, Inc. "); -MODULE_DESCRIPTION("Kernel Voltaire IB NAL v0.01"); +MODULE_DESCRIPTION("Kernel Voltaire IB LND v1.00"); MODULE_LICENSE("GPL"); module_init(kibnal_module_init); diff --git a/lnet/klnds/viblnd/viblnd.h b/lnet/klnds/viblnd/viblnd.h index 959c768..12c8df4 100644 --- a/lnet/klnds/viblnd/viblnd.h +++ b/lnet/klnds/viblnd/viblnd.h @@ -53,12 +53,11 @@ #include #include -#define DEBUG_SUBSYSTEM S_NAL +#define DEBUG_SUBSYSTEM S_LND #include -#include -#include -#include +#include +#include /* CPU_{L,B}E #defines needed by Voltaire headers */ #include @@ -88,15 +87,22 @@ # define IBNAL_N_SCHED 1 /* # schedulers */ #endif -/* sdp-connection.c */ +#define IBNAL_USE_FMR 1 + +/* tunables fixed at compile time */ +#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ +#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ +#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ +#define IBNAL_CREDIT_HIGHWATER 7 /* when eagerly to return credits */ +#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ + +/* constants derived from sdp-connection.c */ #define IBNAL_QKEY 0 #define IBNAL_PKEY 0xffff #define IBNAL_PKEY_IDX 0 #define IBNAL_SGID_IDX 0 #define IBNAL_SERVICE_LEVEL 0 #define IBNAL_STATIC_RATE 0 -#define IBNAL_RETRY_CNT 7 -#define IBNAL_RNR_CNT 6 #define IBNAL_EE_FLOW_CNT 1 #define IBNAL_LOCAL_SUB 1 #define IBNAL_TRAFFIC_CLASS 0 @@ -104,74 +110,68 @@ #define IBNAL_OUS_DST_RD 1 #define IBNAL_IB_MTU vv_mtu_1024 -/* sdp-hca-params.h */ +/* constants derived from sdp-hca-params.h */ #define PATH_RATE_2_5GB 2 #define MLX_IPD_1x 1 #define MLX_IPD_4x 0 #define IBNAL_R_2_STATIC_RATE(r) ((r) == PATH_RATE_2_5GB ? MLX_IPD_1x : MLX_IPD_4x) /* other low-level IB constants */ -#define IBNAL_LOCAL_ACK_TIMEOUT 0x12 -#define IBNAL_RNR_NAK_TIMER 0x10 #define IBNAL_PKT_LIFETIME 5 #define IBNAL_ARB_INITIATOR_DEPTH 0 #define IBNAL_ARB_RESP_RES 0 #define IBNAL_FAILOVER_ACCEPTED 0 -#define IBNAL_SERVICE_NUMBER 0x11b9a2 /* Fixed service number */ - -#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */ -#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ - -#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ - -#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ -#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */ - -#define IBNAL_ARP_RETRIES 3 /* How many times to retry ARP */ - -#define IBNAL_NTX 32 /* # tx descs */ -#define IBNAL_NTX_NBLK 256 /* # reserved tx descs */ - -#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ - -#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ - -#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ - -#define IBNAL_CKSUM 0 - -/* default vals for runtime tunables */ -#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ /************************/ /* derived constants... 
*/ /* TX messages (shared by all connections) */ -#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK) -#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE) -#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) - -#define IBNAL_USE_FMR 1 +#define IBNAL_TX_MSGS() (*kibnal_tunables.kib_ntx) +#define IBNAL_TX_MSG_BYTES() (IBNAL_TX_MSGS() * IBNAL_MSG_SIZE) +#define IBNAL_TX_MSG_PAGES() ((IBNAL_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE) #if IBNAL_USE_FMR # define IBNAL_MAX_RDMA_FRAGS 1 -# define IBNAL_FMR_NMAPS 1000 +# define IBNAL_CONCURRENT_SENDS IBNAL_RX_MSGS #else -# define IBNAL_MAX_RDMA_FRAGS PTL_MD_MAX_IOV +# define IBNAL_MAX_RDMA_FRAGS LNET_MAX_IOV +# define IBNAL_CONCURRENT_SENDS IBNAL_MSG_QUEUE_SIZE #endif /* RX messages (per connection) */ -#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE -#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) -#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) +#define IBNAL_RX_MSGS (IBNAL_MSG_QUEUE_SIZE*2) +#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) +#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) -#define IBNAL_CQ_ENTRIES (IBNAL_TX_MSGS * (1 + IBNAL_MAX_RDMA_FRAGS) + \ - IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS) +#define IBNAL_CQ_ENTRIES() (IBNAL_TX_MSGS() * (1 + IBNAL_MAX_RDMA_FRAGS) + \ + IBNAL_RX_MSGS * *kibnal_tunables.kib_concurrent_peers) typedef struct { - int kib_io_timeout; /* comms timeout (seconds) */ + unsigned int *kib_service_number; /* IB service number */ + int *kib_min_reconnect_interval; /* first failed connection retry... */ + int *kib_max_reconnect_interval; /* ...exponentially increasing to this */ + int *kib_concurrent_peers; /* max # nodes all talking to me */ + int *kib_cksum; /* checksum kib_msg_t? */ + int *kib_timeout; /* comms timeout (seconds) */ + int *kib_ntx; /* # tx descs */ + int *kib_credits; /* # concurrent sends */ + int *kib_peercredits; /* # concurrent sends to 1 peer */ + int *kib_arp_retries; /* # times to retry ARP */ + char **kib_hca_basename; /* HCA base name */ + char **kib_ipif_basename; /* IPoIB interface base name */ + int *kib_local_ack_timeout; /* IB RC QP ack timeout... */ + int *kib_retry_cnt; /* ...and retry */ + int *kib_rnr_cnt; /* RNR retries... */ + int *kib_rnr_nak_timer; /* ...and interval */ + int *kib_keepalive; /* keepalive interval */ + int *kib_concurrent_sends; /* send work queue sizing */ +#if IBNAL_USE_FMR + int *kib_fmr_remaps; /* # FMR maps before unmap required */ +#endif +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM struct ctl_table_header *kib_sysctl; /* sysctl interface */ +#endif } kib_tunables_t; typedef struct @@ -198,16 +198,14 @@ typedef struct __u64 kib_incarnation; /* which one am I */ int kib_shutdown; /* shut down? 
*/ atomic_t kib_nthreads; /* # live threads */ + lnet_ni_t *kib_ni; /* _the_ nal instance */ - __u64 kib_svc_id; /* service number I listen on */ vv_gid_t kib_port_gid; /* device/port GID */ vv_p_key_t kib_port_pkey; /* device/port pkey */ - struct semaphore kib_nid_mutex; /* serialise NID ops */ cm_cep_handle_t kib_listen_handle; /* IB listen handle */ rwlock_t kib_global_lock; /* stabilize peer/conn ops */ - spinlock_t kib_vverbs_lock; /* serialize vverbs calls */ int kib_ready; /* CQ callback fired */ int kib_checking_cq; /* a scheduler is checking the CQ */ @@ -225,16 +223,12 @@ typedef struct spinlock_t kib_connd_lock; /* serialise */ wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */ - struct list_head kib_sched_txq; /* tx requiring attention */ - struct list_head kib_sched_rxq; /* rx requiring attention */ spinlock_t kib_sched_lock; /* serialise */ struct kib_tx *kib_tx_descs; /* all the tx descriptors */ kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ struct list_head kib_idle_txs; /* idle tx descriptors */ - struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */ - wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */ __u64 kib_next_tx_cookie; /* RDMA completion cookie */ spinlock_t kib_tx_lock; /* serialise */ @@ -258,7 +252,7 @@ typedef struct #define IBNAL_INIT_CQ 7 #define IBNAL_INIT_ALL 8 -#include "vibnal_wire.h" +#include "viblnd_wire.h" /***********************************************************************/ @@ -266,8 +260,7 @@ typedef struct kib_rx /* receive message */ { struct list_head rx_list; /* queue for attention */ struct kib_conn *rx_conn; /* owning conn */ - int rx_responded; /* responded to peer? */ - int rx_posted; /* posted? */ + int rx_nob; /* # bytes received (-1 while posted) */ vv_l_key_t rx_lkey; /* local key */ kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ vv_wr_t rx_wrq; /* receive work item */ @@ -277,7 +270,6 @@ typedef struct kib_rx /* receive message */ typedef struct kib_tx /* transmit message */ { struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ - int tx_isnblk; /* I'm reserved for non-blocking sends */ struct kib_conn *tx_conn; /* owning conn */ int tx_sending; /* # tx callbacks outstanding */ int tx_queued; /* queued for sending */ @@ -285,7 +277,7 @@ typedef struct kib_tx /* transmit message */ int tx_status; /* completion status */ unsigned long tx_deadline; /* completion deadline */ __u64 tx_cookie; /* completion cookie */ - lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ + lnet_msg_t *tx_lntmsg[2]; /* lnet msgs to finalize on completion */ vv_l_key_t tx_lkey; /* local key for message buffer */ kib_msg_t *tx_msg; /* message buffer (host vaddr) */ int tx_nwrq; /* # send work items */ @@ -293,8 +285,8 @@ typedef struct kib_tx /* transmit message */ vv_wr_t tx_wrq[2]; /* send work items... */ vv_scatgat_t tx_gl[2]; /* ...and their memory */ kib_rdma_desc_t tx_rd[1]; /* rdma descriptor */ - kib_md_t tx_md; /* FMA mapping descriptor */ - __u64 *tx_pages; /* page array for mapping */ + kib_md_t tx_md; /* FMR mapping descriptor */ + __u64 *tx_pages; /* page phys addrs */ #else vv_wr_t *tx_wrq; /* send work items... 
*/ vv_scatgat_t *tx_gl; /* ...and their memory */ @@ -302,9 +294,6 @@ typedef struct kib_tx /* transmit message */ #endif } kib_tx_t; -#define KIB_TX_UNMAPPED 0 -#define KIB_TX_MAPPED 1 - /* Passive connection request (listener callback) queued for handling by connd */ typedef struct kib_pcreq { @@ -337,15 +326,19 @@ typedef struct kib_conn __u64 ibc_incarnation; /* which instance of the peer */ __u64 ibc_txseq; /* tx sequence number */ __u64 ibc_rxseq; /* rx sequence number */ + __u32 ibc_version; /* peer protocol version */ atomic_t ibc_refcount; /* # users */ int ibc_state; /* what's happening */ - atomic_t ibc_nob; /* # bytes buffered */ int ibc_nsends_posted; /* # uncompleted sends */ int ibc_credits; /* # credits I have */ int ibc_outstanding_credits; /* # credits to return */ + int ibc_reserved_credits; /* # credits for ACK/DONE msgs */ int ibc_disconnect; /* some disconnect callback fired */ int ibc_comms_error; /* set on comms error */ + unsigned long ibc_last_send; /* time of last send */ struct list_head ibc_early_rxs; /* rxs completed before ESTABLISHED */ + struct list_head ibc_tx_queue_nocred; /* sends that don't need a cred */ + struct list_head ibc_tx_queue_rsrvd; /* sends that need a reserved cred */ struct list_head ibc_tx_queue; /* send queue */ struct list_head ibc_active_txs; /* active tx awaiting completion */ spinlock_t ibc_lock; /* serialise */ @@ -373,7 +366,7 @@ typedef struct kib_peer { struct list_head ibp_list; /* stash on global peer list */ struct list_head ibp_connd_list; /* schedule on kib_connd_peers */ - ptl_nid_t ibp_nid; /* who's on the other end(s) */ + lnet_nid_t ibp_nid; /* who's on the other end(s) */ __u32 ibp_ip; /* IP to query for peer conn params */ int ibp_port; /* port to qery for peer conn params */ __u64 ibp_incarnation; /* peer's incarnation */ @@ -381,32 +374,46 @@ typedef struct kib_peer int ibp_persistence; /* "known" peer refs */ struct list_head ibp_conns; /* all active connections */ struct list_head ibp_tx_queue; /* msgs waiting for a conn */ - int ibp_connecting; /* connecting+accepting */ + int ibp_connecting; /* current active connection attempts */ + int ibp_accepting; /* current passive connection attempts */ int ibp_arp_count; /* # arp attempts */ unsigned long ibp_reconnect_time; /* when reconnect may be attempted */ unsigned long ibp_reconnect_interval; /* exponential backoff */ + int ibp_error; /* errno on closing this peer */ + cfs_time_t ibp_last_alive; /* when (in jiffies) I was last alive */ } kib_peer_t; -extern lib_nal_t kibnal_lib; extern kib_data_t kibnal_data; extern kib_tunables_t kibnal_tunables; +int kibnal_startup (lnet_ni_t *ni); +void kibnal_shutdown (lnet_ni_t *ni); +int kibnal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); +int kibnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +extern int kibnal_eager_recv (lnet_ni_t *ni, void *private, + lnet_msg_t *lntmsg, void **new_private); +int kibnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); extern void kibnal_init_msg(kib_msg_t *msg, int type, int body_nob); -extern void kibnal_pack_msg(kib_msg_t *msg, int credits, ptl_nid_t dstnid, - __u64 dststamp, __u64 seq); -extern int kibnal_unpack_msg(kib_msg_t *msg, int nob); -extern kib_peer_t *kibnal_create_peer(ptl_nid_t nid); +extern void kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits, + lnet_nid_t dstnid, __u64 dststamp, __u64 seq); +extern int 
kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob); +extern int kibnal_create_peer(kib_peer_t **peerp, lnet_nid_t nid); extern void kibnal_destroy_peer(kib_peer_t *peer); -extern int kibnal_del_peer(ptl_nid_t nid, int single_share); -extern kib_peer_t *kibnal_find_peer_locked(ptl_nid_t nid); +extern int kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip); +extern int kibnal_del_peer(lnet_nid_t nid); +extern kib_peer_t *kibnal_find_peer_locked(lnet_nid_t nid); extern void kibnal_unlink_peer_locked(kib_peer_t *peer); +extern void kibnal_peer_alive(kib_peer_t *peer); extern int kibnal_close_stale_conns_locked(kib_peer_t *peer, __u64 incarnation); extern kib_conn_t *kibnal_create_conn(cm_cep_handle_t cep); extern void kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg); -extern int kibnal_alloc_pages(kib_pages_t **pp, int npages, int access); +extern int kibnal_alloc_pages(kib_pages_t **pp, int npages, int access); extern void kibnal_free_pages(kib_pages_t *p); extern void kibnal_check_sends(kib_conn_t *conn); @@ -421,16 +428,12 @@ extern int kibnal_set_qp_state(kib_conn_t *conn, vv_qp_state_t new_state); extern void kibnal_async_callback(vv_event_record_t ev); extern void kibnal_cq_callback(unsigned long context); extern void kibnal_passive_connreq(kib_pcreq_t *pcr, int reject); -extern void kibnal_pause(int ticks); +extern void kibnal_txlist_done (struct list_head *txlist, int status); extern void kibnal_queue_tx(kib_tx_t *tx, kib_conn_t *conn); extern int kibnal_init_rdma(kib_tx_t *tx, int type, int nob, kib_rdma_desc_t *dstrd, __u64 dstcookie); - -static inline int -wrq_signals_completion (vv_wr_t *wrq) -{ - return wrq->completion_notification != 0; -} +extern int kibnal_tunables_init(void); +extern void kibnal_tunables_fini(void); #define kibnal_conn_addref(conn) \ do { \ @@ -458,8 +461,8 @@ do { \ #define kibnal_peer_addref(peer) \ do { \ - CDEBUG(D_NET, "peer[%p] -> "LPX64" (%d)++\n", \ - (peer), (peer)->ibp_nid, \ + CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n", \ + (peer), libcfs_nid2str((peer)->ibp_nid), \ atomic_read (&(peer)->ibp_refcount)); \ LASSERT(atomic_read(&(peer)->ibp_refcount) > 0); \ atomic_inc(&(peer)->ibp_refcount); \ @@ -467,8 +470,8 @@ do { \ #define kibnal_peer_decref(peer) \ do { \ - CDEBUG(D_NET, "peer[%p] -> "LPX64" (%d)--\n", \ - (peer), (peer)->ibp_nid, \ + CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n", \ + (peer), libcfs_nid2str((peer)->ibp_nid), \ atomic_read (&(peer)->ibp_refcount)); \ LASSERT(atomic_read(&(peer)->ibp_refcount) > 0); \ if (atomic_dec_and_test(&(peer)->ibp_refcount)) \ @@ -476,7 +479,7 @@ do { \ } while (0) static inline struct list_head * -kibnal_nid2peerlist (ptl_nid_t nid) +kibnal_nid2peerlist (lnet_nid_t nid) { unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size; @@ -493,38 +496,67 @@ kibnal_peer_active (kib_peer_t *peer) static inline void kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) { - /* CAVEAT EMPTOR: tx takes caller's ref on conn */ - + struct list_head *q; + LASSERT (tx->tx_nwrq > 0); /* work items set up */ LASSERT (!tx->tx_queued); /* not queued for sending already */ + tx->tx_queued = 1; + tx->tx_deadline = jiffies + (*kibnal_tunables.kib_timeout * HZ); + if (tx->tx_conn == NULL) { kibnal_conn_addref(conn); tx->tx_conn = conn; + LASSERT (tx->tx_msg->ibm_type != IBNAL_MSG_PUT_DONE); } else { LASSERT (tx->tx_conn == conn); LASSERT (tx->tx_msg->ibm_type == IBNAL_MSG_PUT_DONE); } - tx->tx_queued = 1; - tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ; - 
list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); + + if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) { + /* All messages have simple credit control */ + q = &conn->ibc_tx_queue; + } else { + LASSERT (conn->ibc_version == IBNAL_MSG_VERSION); + + switch (tx->tx_msg->ibm_type) { + case IBNAL_MSG_PUT_REQ: + case IBNAL_MSG_GET_REQ: + /* RDMA request: reserve a buffer for the RDMA reply + * before sending */ + q = &conn->ibc_tx_queue_rsrvd; + break; + + case IBNAL_MSG_PUT_NAK: + case IBNAL_MSG_PUT_ACK: + case IBNAL_MSG_PUT_DONE: + case IBNAL_MSG_GET_DONE: + /* RDMA reply/completion: no credits; peer has reserved + * a reply buffer */ + q = &conn->ibc_tx_queue_nocred; + break; + + case IBNAL_MSG_NOOP: + case IBNAL_MSG_IMMEDIATE: + /* Otherwise: consume a credit before sending */ + q = &conn->ibc_tx_queue; + break; + + default: + LBUG(); + q = NULL; + } + } + + list_add_tail(&tx->tx_list, q); } -static inline __u64 -kibnal_page2phys (struct page *p) +static inline int +kibnal_send_keepalive(kib_conn_t *conn) { -#if IBNAL_32BIT_PAGE2PHYS - CLASSERT (sizeof(typeof(page_to_phys(p))) == 4); - CLASSERT (sizeof(unsigned long) == 4); - /* page_to_phys returns a 32 bit physical address. This must be a 32 - * bit machine with <= 4G memory and we must ensure we don't sign - * extend when converting to 64 bits. */ - return (unsigned long)page_to_phys(p); -#else - CLASSERT (sizeof(typeof(page_to_phys(p))) == 8); - /* page_to_phys returns a 64 bit physical address :) */ - return page_to_phys(p); -#endif + return (*kibnal_tunables.kib_keepalive > 0) && + time_after(jiffies, conn->ibc_last_send + + *kibnal_tunables.kib_keepalive*HZ); } #if IBNAL_VOIDSTAR_SGADDR diff --git a/lnet/klnds/viblnd/viblnd_cb.c b/lnet/klnds/viblnd/viblnd_cb.c index 139c5ea..490a7e9 100644 --- a/lnet/klnds/viblnd/viblnd_cb.c +++ b/lnet/klnds/viblnd/viblnd_cb.c @@ -22,13 +22,14 @@ * */ -#include "vibnal.h" +#include "viblnd.h" void kibnal_tx_done (kib_tx_t *tx) { - ptl_err_t ptlrc = (tx->tx_status == 0) ? 
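Note: the queue selection above is the heart of the new flow control. An RDMA request must reserve a reply buffer at the peer before it is sent, an RDMA reply/completion rides on that reservation and consumes no credit, and everything else takes a normal send credit. A stand-alone restatement of the rule (user-space C sketch; the enum values are illustrative stand-ins for the IBNAL_MSG_* wire constants, not the real values):

#include <assert.h>

/* illustrative message types; the real values live in viblnd_wire.h */
enum msg_type { MSG_PUT_REQ, MSG_GET_REQ, MSG_PUT_NAK, MSG_PUT_ACK,
                MSG_PUT_DONE, MSG_GET_DONE, MSG_NOOP, MSG_IMMEDIATE };

enum tx_queue { Q_RSRVD,    /* RDMA request: reserve a peer buffer for the reply */
                Q_NOCRED,   /* RDMA reply: peer pre-reserved a buffer, no credit */
                Q_CRED };   /* everything else consumes a normal credit */

static enum tx_queue classify_tx(enum msg_type t)
{
        switch (t) {
        case MSG_PUT_REQ:
        case MSG_GET_REQ:
                return Q_RSRVD;
        case MSG_PUT_NAK:
        case MSG_PUT_ACK:
        case MSG_PUT_DONE:
        case MSG_GET_DONE:
                return Q_NOCRED;
        case MSG_NOOP:
        case MSG_IMMEDIATE:
                return Q_CRED;
        }
        assert(0);              /* LBUG() in the driver */
        return Q_CRED;
}

Old (RDMAREPLYNOTRSRVD) peers bypass the switch entirely and queue everything with simple credit control, as the version check above shows.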
PTL_OK : PTL_FAIL; - int i; + lnet_msg_t *lntmsg[2]; + int rc = tx->tx_status; + int i; LASSERT (!in_interrupt()); LASSERT (!tx->tx_queued); /* mustn't be queued for sending */ @@ -37,7 +38,7 @@ kibnal_tx_done (kib_tx_t *tx) #if IBNAL_USE_FMR if (tx->tx_md.md_fmrcount == 0 || - (ptlrc != PTL_OK && tx->tx_md.md_active)) { + (rc != 0 && tx->tx_md.md_active)) { vv_return_t vvrc; /* mapping must be active (it dropped fmrcount to 0) */ @@ -47,18 +48,14 @@ kibnal_tx_done (kib_tx_t *tx) 1, &tx->tx_md.md_fmrhandle); LASSERT (vvrc == vv_return_ok); - tx->tx_md.md_fmrcount = IBNAL_FMR_NMAPS; + tx->tx_md.md_fmrcount = *kibnal_tunables.kib_fmr_remaps; } tx->tx_md.md_active = 0; #endif - for (i = 0; i < 2; i++) { - /* tx may have up to 2 libmsgs to finalise */ - if (tx->tx_libmsg[i] == NULL) - continue; - lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc); - tx->tx_libmsg[i] = NULL; - } + /* tx may have up to 2 lnet msgs to finalise */ + lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; + lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; if (tx->tx_conn != NULL) { kibnal_conn_decref(tx->tx_conn); @@ -70,77 +67,71 @@ kibnal_tx_done (kib_tx_t *tx) spin_lock(&kibnal_data.kib_tx_lock); - if (tx->tx_isnblk) { - list_add (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); - } else { - list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); - wake_up (&kibnal_data.kib_idle_tx_waitq); - } + list_add (&tx->tx_list, &kibnal_data.kib_idle_txs); spin_unlock(&kibnal_data.kib_tx_lock); + + /* delay finalize until my descs have been freed */ + for (i = 0; i < 2; i++) { + if (lntmsg[i] == NULL) + continue; + + lnet_finalize (kibnal_data.kib_ni, lntmsg[i], rc); + } } -kib_tx_t * -kibnal_get_idle_tx (int may_block) +void +kibnal_txlist_done (struct list_head *txlist, int status) { - kib_tx_t *tx = NULL; - ENTRY; - - for (;;) { - spin_lock(&kibnal_data.kib_tx_lock); + kib_tx_t *tx; - /* "normal" descriptor is free */ - if (!list_empty (&kibnal_data.kib_idle_txs)) { - tx = list_entry (kibnal_data.kib_idle_txs.next, - kib_tx_t, tx_list); - break; - } + while (!list_empty (txlist)) { + tx = list_entry (txlist->next, kib_tx_t, tx_list); - if (!may_block) { - /* may dip into reserve pool */ - if (list_empty (&kibnal_data.kib_idle_nblk_txs)) { - CERROR ("reserved tx desc pool exhausted\n"); - break; - } + list_del (&tx->tx_list); + /* complete now */ + tx->tx_waiting = 0; + tx->tx_status = status; + kibnal_tx_done (tx); + } +} - tx = list_entry (kibnal_data.kib_idle_nblk_txs.next, - kib_tx_t, tx_list); - break; - } +kib_tx_t * +kibnal_get_idle_tx (void) +{ + kib_tx_t *tx; + + spin_lock(&kibnal_data.kib_tx_lock); - /* block for idle tx */ + if (list_empty (&kibnal_data.kib_idle_txs)) { spin_unlock(&kibnal_data.kib_tx_lock); - - wait_event (kibnal_data.kib_idle_tx_waitq, - !list_empty (&kibnal_data.kib_idle_txs) || - kibnal_data.kib_shutdown); + return NULL; } - if (tx != NULL) { - list_del (&tx->tx_list); - - /* Allocate a new completion cookie. It might not be needed, - * but we've got a lock right now and we're unlikely to - * wrap... */ - tx->tx_cookie = kibnal_data.kib_next_tx_cookie++; + tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list); + list_del (&tx->tx_list); - LASSERT (tx->tx_nwrq == 0); - LASSERT (!tx->tx_queued); - LASSERT (tx->tx_sending == 0); - LASSERT (!tx->tx_waiting); - LASSERT (tx->tx_status == 0); - LASSERT (tx->tx_conn == NULL); - LASSERT (tx->tx_libmsg[0] == NULL); - LASSERT (tx->tx_libmsg[1] == NULL); - } + /* Allocate a new completion cookie. 
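Note: kibnal_get_idle_tx() above no longer blocks. The reserved non-blocking pool and its waitqueue are gone, and every caller must now cope with a NULL return (typically failing the send with -ENOMEM). A condensed user-space sketch of the new contract, with hypothetical types and a pthread mutex standing in for kib_tx_lock:

#include <pthread.h>
#include <stddef.h>

struct tx { struct tx *next; unsigned long long cookie; };

static pthread_mutex_t tx_lock = PTHREAD_MUTEX_INITIALIZER;
static struct tx *idle_txs;                  /* singly-linked idle list */
static unsigned long long next_cookie = 1;

static struct tx *get_idle_tx(void)
{
        struct tx *tx;

        pthread_mutex_lock(&tx_lock);
        tx = idle_txs;
        if (tx != NULL) {                    /* pop or fail; never sleep */
                idle_txs = tx->next;
                tx->cookie = next_cookie++;  /* fresh completion cookie */
        }
        pthread_mutex_unlock(&tx_lock);
        return tx;                           /* NULL => caller maps to -ENOMEM */
}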
It might not be needed, + * but we've got a lock right now and we're unlikely to + * wrap... */ + tx->tx_cookie = kibnal_data.kib_next_tx_cookie++; spin_unlock(&kibnal_data.kib_tx_lock); - - RETURN(tx); + + LASSERT (tx->tx_nwrq == 0); + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_sending == 0); + LASSERT (!tx->tx_waiting); + LASSERT (tx->tx_status == 0); + LASSERT (tx->tx_conn == NULL); + LASSERT (tx->tx_lntmsg[0] == NULL); + LASSERT (tx->tx_lntmsg[1] == NULL); + + return tx; } int -kibnal_post_rx (kib_rx_t *rx, int credit) +kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit) { kib_conn_t *conn = rx->rx_conn; int rc = 0; @@ -148,6 +139,9 @@ kibnal_post_rx (kib_rx_t *rx, int credit) vv_return_t vvrc; LASSERT (!in_interrupt()); + /* old peers don't reserve rxs for RDMA replies */ + LASSERT (!rsrvd_credit || + conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); rx->rx_gl = (vv_scatgat_t) { .v_address = KIBNAL_ADDR2SG(addr), @@ -164,7 +158,7 @@ kibnal_post_rx (kib_rx_t *rx, int credit) }; LASSERT (conn->ibc_state >= IBNAL_CONN_INIT); - LASSERT (!rx->rx_posted); + LASSERT (rx->rx_nob >= 0); /* not posted */ CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n", rx->rx_wrq.scatgat_list->length, @@ -177,27 +171,31 @@ kibnal_post_rx (kib_rx_t *rx, int credit) return 0; } - rx->rx_posted = 1; - + rx->rx_nob = -1; /* flag posted */ + spin_lock(&conn->ibc_lock); /* Serialise vv_post_receive; it's not re-entrant on the same QP */ vvrc = vv_post_receive(kibnal_data.kib_hca, conn->ibc_qp, &rx->rx_wrq); - spin_unlock(&conn->ibc_lock); if (vvrc == vv_return_ok) { - if (credit) { - spin_lock(&conn->ibc_lock); + if (credit) conn->ibc_outstanding_credits++; - spin_unlock(&conn->ibc_lock); + if (rsrvd_credit) + conn->ibc_reserved_credits++; + + spin_unlock(&conn->ibc_lock); + if (credit || rsrvd_credit) kibnal_check_sends(conn); - } + return 0; } - CERROR ("post rx -> "LPX64" failed %d\n", - conn->ibc_peer->ibp_nid, vvrc); + spin_unlock(&conn->ibc_lock); + + CERROR ("post rx -> %s failed %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc); rc = -EIO; kibnal_close_conn(rx->rx_conn, rc); /* No more posts for this rx; so lose its ref */ @@ -218,7 +216,7 @@ kibnal_post_receives (kib_conn_t *conn) /* +1 ref for rx desc. This ref remains until kibnal_post_rx * fails (i.e. actual failure or we're disconnecting) */ kibnal_conn_addref(conn); - rc = kibnal_post_rx (&conn->ibc_rxs[i], 0); + rc = kibnal_post_rx (&conn->ibc_rxs[i], 0, 0); if (rc != 0) return rc; } @@ -263,9 +261,8 @@ kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) if (tx == NULL) { spin_unlock(&conn->ibc_lock); - CWARN("Unmatched completion type %x cookie "LPX64 - " from "LPX64"\n", - txtype, cookie, conn->ibc_peer->ibp_nid); + CWARN("Unmatched completion type %x cookie "LPX64" from %s\n", + txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); kibnal_close_conn (conn, -EPROTO); return; } @@ -274,12 +271,8 @@ kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) if (status < 0) { /* failed? 
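Note how the rework above pulls the credit bookkeeping into the same ibc_lock critical section as vv_post_receive(), where the old code dropped and retook the lock around the increment. A condensed sketch of the resulting shape (hypothetical helper names; a pthread mutex stands in for the spinlock):

#include <pthread.h>

struct conn {
        pthread_mutex_t lock;
        int outstanding_credits;
        int reserved_credits;
};

int hw_post_receive(struct conn *c, void *rx);   /* stands in for vv_post_receive */
void check_sends(struct conn *c);                /* stands in for kibnal_check_sends */

static int post_rx_sketch(struct conn *c, void *rx, int credit, int rsrvd)
{
        int ok;

        pthread_mutex_lock(&c->lock);
        ok = hw_post_receive(c, rx);        /* serialised: not re-entrant per QP */
        if (ok) {
                c->outstanding_credits += credit;   /* credit accounting stays */
                c->reserved_credits += rsrvd;       /* inside the same section */
        }
        pthread_mutex_unlock(&c->lock);

        if (ok && (credit || rsrvd))
                check_sends(c);             /* peer can be told about credits now */

        return ok ? 0 : -5;                 /* -EIO in the driver */
}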
*/ tx->tx_status = status; } else if (txtype == IBNAL_MSG_GET_REQ) { - /* XXX layering violation: set REPLY data length */ - LASSERT (tx->tx_libmsg[1] != NULL); - LASSERT (tx->tx_libmsg[1]->ev.type == - PTL_EVENT_REPLY_END); - - tx->tx_libmsg[1]->ev.mlength = status; + lnet_set_reply_msg_len(kibnal_data.kib_ni, + tx->tx_lntmsg[1], status); } } @@ -298,11 +291,11 @@ kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie) void kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie) { - kib_tx_t *tx = kibnal_get_idle_tx(0); + kib_tx_t *tx = kibnal_get_idle_tx(); if (tx == NULL) { - CERROR("Can't get tx for completion %x for "LPX64"\n", - type, conn->ibc_peer->ibp_nid); + CERROR("Can't get tx for completion %x for %s\n", + type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); return; } @@ -320,12 +313,15 @@ kibnal_handle_rx (kib_rx_t *rx) kib_conn_t *conn = rx->rx_conn; int credits = msg->ibm_credits; kib_tx_t *tx; - int rc; + int rc = 0; + int repost = 1; + int rsrvd_credit = 0; + int rc2; LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED); - CDEBUG (D_NET, "Received %x[%d] from "LPX64"\n", - msg->ibm_type, credits, conn->ibc_peer->ibp_nid); + CDEBUG (D_NET, "Received %x[%d] from %s\n", + msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid)); if (credits != 0) { /* Have I received credits that will let me send? */ @@ -338,37 +334,38 @@ kibnal_handle_rx (kib_rx_t *rx) switch (msg->ibm_type) { default: - CERROR("Bad IBNAL message type %x from "LPX64"\n", - msg->ibm_type, conn->ibc_peer->ibp_nid); + CERROR("Bad IBNAL message type %x from %s\n", + msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); + rc = -EPROTO; break; case IBNAL_MSG_NOOP: break; case IBNAL_MSG_IMMEDIATE: - lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx); + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr, + msg->ibm_srcnid, rx, 0); + repost = rc < 0; /* repost on error */ break; case IBNAL_MSG_PUT_REQ: - rx->rx_responded = 0; - lib_parse(&kibnal_lib, &msg->ibm_u.putreq.ibprm_hdr, rx); - if (rx->rx_responded) - break; - - /* I wasn't asked to transfer any payload data. This happens - * if the PUT didn't match, or got truncated. 
*/ - kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0, - msg->ibm_u.putreq.ibprm_cookie); + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr, + msg->ibm_srcnid, rx, 1); + repost = rc < 0; /* repost on error */ break; case IBNAL_MSG_PUT_NAK: - CWARN ("PUT_NACK from "LPX64"\n", conn->ibc_peer->ibp_nid); + rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ + + CWARN ("PUT_NACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ, msg->ibm_u.completion.ibcm_status, msg->ibm_u.completion.ibcm_cookie); break; case IBNAL_MSG_PUT_ACK: + rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ + spin_lock(&conn->ibc_lock); tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ, msg->ibm_u.putack.ibpam_src_cookie); @@ -377,9 +374,9 @@ kibnal_handle_rx (kib_rx_t *rx) spin_unlock(&conn->ibc_lock); if (tx == NULL) { - CERROR("Unmatched PUT_ACK from "LPX64"\n", - conn->ibc_peer->ibp_nid); - kibnal_close_conn(conn, -EPROTO); + CERROR("Unmatched PUT_ACK from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + rc = -EPROTO; break; } @@ -390,47 +387,55 @@ kibnal_handle_rx (kib_rx_t *rx) tx->tx_nwrq = 0; /* overwrite PUT_REQ */ - rc = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE, - kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd), - &msg->ibm_u.putack.ibpam_rd, - msg->ibm_u.putack.ibpam_dst_cookie); - if (rc < 0) - CERROR("Can't setup rdma for PUT to "LPX64": %d\n", - conn->ibc_peer->ibp_nid, rc); + rc2 = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE, + kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd), + &msg->ibm_u.putack.ibpam_rd, + msg->ibm_u.putack.ibpam_dst_cookie); + if (rc2 < 0) + CERROR("Can't setup rdma for PUT to %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2); spin_lock(&conn->ibc_lock); - if (tx->tx_status == 0 && rc < 0) - tx->tx_status = rc; + if (tx->tx_status == 0 && rc2 < 0) + tx->tx_status = rc2; tx->tx_waiting = 0; /* clear waiting and queue atomically */ kibnal_queue_tx_locked(tx, conn); spin_unlock(&conn->ibc_lock); break; case IBNAL_MSG_PUT_DONE: + /* This buffer was pre-reserved by not returning the credit + * when the PUT_REQ's buffer was reposted, so I just return it + * now */ kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK, msg->ibm_u.completion.ibcm_status, msg->ibm_u.completion.ibcm_cookie); break; case IBNAL_MSG_GET_REQ: - rx->rx_responded = 0; - lib_parse(&kibnal_lib, &msg->ibm_u.get.ibgm_hdr, rx); - if (rx->rx_responded) /* I responded to the GET_REQ */ - break; - /* NB GET didn't match (I'd have responded even with no payload - * data) */ - kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, -ENODATA, - msg->ibm_u.get.ibgm_cookie); + rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr, + msg->ibm_srcnid, rx, 1); + repost = rc < 0; /* repost on error */ break; case IBNAL_MSG_GET_DONE: + rsrvd_credit = 1; /* rdma reply (was pre-reserved) */ + kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ, msg->ibm_u.completion.ibcm_status, msg->ibm_u.completion.ibcm_cookie); break; } - kibnal_post_rx(rx, 1); + if (rc < 0) /* protocol error */ + kibnal_close_conn(conn, rc); + + if (repost) { + if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) + rsrvd_credit = 0; /* peer isn't pre-reserving */ + + kibnal_post_rx(rx, !rsrvd_credit, rsrvd_credit); + } } void @@ -441,42 +446,50 @@ kibnal_rx_complete (kib_rx_t *rx, vv_comp_status_t vvrc, int nob, __u64 rxseq) unsigned long flags; int rc; - CDEBUG (D_NET, "rx %p conn %p\n", rx, conn); - LASSERT (rx->rx_posted); - rx->rx_posted = 0; + CDEBUG(D_NET, "rx %p 
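Note: the repost call above encodes the invariant that exactly one credit class is returned per receive buffer. A buffer that carried an RDMA reply gives back the peer's pre-reserved credit, anything else gives back a normal one, and old peers (which never pre-reserve) always get the normal credit. Restated as a tiny pure function (hypothetical names):

struct post_flags {
        int credit;          /* return a normal credit */
        int rsrvd_credit;    /* return a reserved (RDMA-reply) credit */
};

static struct post_flags repost_flags(int was_rdma_reply, int old_version)
{
        struct post_flags f;

        if (old_version)
                was_rdma_reply = 0;        /* peer isn't pre-reserving */

        f.credit = !was_rdma_reply;        /* exactly one of the two... */
        f.rsrvd_credit = was_rdma_reply;   /* ...is returned per repost */
        return f;
}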
conn %p\n", rx, conn); + LASSERT (rx->rx_nob < 0); /* was posted */ + rx->rx_nob = 0; /* isn't now */ if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) goto ignore; if (vvrc != vv_comp_status_success) { - CERROR("Rx from "LPX64" failed: %d\n", - conn->ibc_peer->ibp_nid, vvrc); + CERROR("Rx from %s failed: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc); goto failed; } - rc = kibnal_unpack_msg(msg, nob); + rc = kibnal_unpack_msg(msg, conn->ibc_version, nob); if (rc != 0) { - CERROR ("Error %d unpacking rx from "LPX64"\n", - rc, conn->ibc_peer->ibp_nid); + CERROR ("Error %d unpacking rx from %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); goto failed; } - if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || + rx->rx_nob = nob; /* Can trust 'nob' now */ + + if (!lnet_ptlcompat_matchnid(conn->ibc_peer->ibp_nid, + msg->ibm_srcnid) || + !lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid, + msg->ibm_dstnid) || msg->ibm_srcstamp != conn->ibc_incarnation || - msg->ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid || msg->ibm_dststamp != kibnal_data.kib_incarnation) { - CERROR ("Stale rx from "LPX64"\n", - conn->ibc_peer->ibp_nid); + CERROR ("Stale rx from %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); goto failed; } if (msg->ibm_seq != rxseq) { - CERROR ("Out-of-sequence rx from "LPX64 + CERROR ("Out-of-sequence rx from %s" ": got "LPD64" but expected "LPD64"\n", - conn->ibc_peer->ibp_nid, msg->ibm_seq, rxseq); + libcfs_nid2str(conn->ibc_peer->ibp_nid), + msg->ibm_seq, rxseq); goto failed; } + /* set time last known alive */ + kibnal_peer_alive(conn->ibc_peer); + /* racing with connection establishment/teardown! */ if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) { @@ -546,8 +559,10 @@ kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, /* Try to create an address that adaptor-tavor will munge into a valid * network address, given how it maps all phys mem into 1 region */ - addr = kibnal_page2phys(page) + page_offset + PAGE_OFFSET; + addr = lnet_page2phys(page) + page_offset + PAGE_OFFSET; + /* NB this relies entirely on there being a single region for the whole + * of memory, since "high" memory will wrap in the (void *) cast! 
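Note: the receive path above now defers trusting 'nob' until the message unpacks cleanly, then applies a staleness gate: both NIDs must match (via the portals-compatibility matcher) and both incarnation stamps must equal the values captured at connection establishment, with an in-order sequence check on top. A plain-C sketch of the predicate (field and parameter names hypothetical; the real code uses lnet_ptlcompat_matchnid() rather than raw equality):

struct wire_hdr {
        unsigned long long src_nid, dst_nid;
        unsigned long long src_stamp, dst_stamp;
        unsigned long long seq;
};

static int rx_is_fresh(const struct wire_hdr *h,
                       unsigned long long peer_nid,
                       unsigned long long my_nid,
                       unsigned long long peer_incarnation,
                       unsigned long long my_incarnation,
                       unsigned long long expected_seq)
{
        return h->src_nid == peer_nid &&            /* right peer... */
               h->dst_nid == my_nid &&              /* ...talking to me... */
               h->src_stamp == peer_incarnation &&  /* ...same instances... */
               h->dst_stamp == my_incarnation &&
               h->seq == expected_seq;              /* ...and in order */
}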
*/ vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, (void *)((unsigned long)addr), len, &mem_h, &l_key, &r_key); @@ -585,7 +600,7 @@ kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page, int kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, vv_access_con_bit_mask_t access, - int niov, struct iovec *iov, int offset, int nob) + unsigned int niov, struct iovec *iov, int offset, int nob) { /* active if I'm sending */ @@ -643,7 +658,7 @@ kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, int kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, vv_access_con_bit_mask_t access, - int nkiov, ptl_kiov_t *kiov, int offset, int nob) + int nkiov, lnet_kiov_t *kiov, int offset, int nob) { /* active if I'm sending */ int active = ((access & vv_acc_r_mem_write) == 0); @@ -695,7 +710,7 @@ kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active, LASSERT (tx->tx_md.md_fmrcount > 0); LASSERT (page_offset < PAGE_SIZE); LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT))); - LASSERT (npages <= PTL_MD_MAX_IOV); + LASSERT (npages <= LNET_MAX_IOV); memset(&map_props, 0, sizeof(map_props)); @@ -730,7 +745,7 @@ kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active, int kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd, vv_access_con_bit_mask_t access, - int niov, struct iovec *iov, int offset, int nob) + unsigned int niov, struct iovec *iov, int offset, int nob) { /* active if I'm sending */ @@ -741,7 +756,7 @@ kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd, int npages; unsigned long page_offset; unsigned long vaddr; - + LASSERT (nob > 0); LASSERT (niov > 0); @@ -764,7 +779,7 @@ kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd, npages = 0; do { - LASSERT (npages < PTL_MD_MAX_IOV); + LASSERT (npages < LNET_MAX_IOV); page = kibnal_kvaddr_to_page(vaddr); if (page == NULL) { @@ -772,7 +787,7 @@ kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd, return -EFAULT; } - tx->tx_pages[npages++] = kibnal_page2phys(page); + tx->tx_pages[npages++] = lnet_page2phys(page); fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1)); vaddr += fragnob; @@ -786,7 +801,7 @@ kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd, int kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, vv_access_con_bit_mask_t access, - int nkiov, ptl_kiov_t *kiov, int offset, int nob) + int nkiov, lnet_kiov_t *kiov, int offset, int nob) { /* active if I'm sending */ int active = ((access & vv_acc_r_mem_write) == 0); @@ -798,7 +813,7 @@ kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, LASSERT (nob > 0); LASSERT (nkiov > 0); - LASSERT (nkiov <= PTL_MD_MAX_IOV); + LASSERT (nkiov <= LNET_MAX_IOV); LASSERT (!tx->tx_md.md_active); LASSERT ((rd != tx->tx_rd) == !active); @@ -815,7 +830,7 @@ kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, npages = 0; do { - LASSERT (npages < PTL_MD_MAX_IOV); + LASSERT (npages < LNET_MAX_IOV); LASSERT (nkiov > 0); if ((npages > 0 && kiov->kiov_offset != 0) || @@ -829,7 +844,7 @@ kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, return -EINVAL; } - tx->tx_pages[npages++] = kibnal_page2phys(kiov->kiov_page); + tx->tx_pages[npages++] = lnet_page2phys(kiov->kiov_page); resid -= kiov->kiov_len; kiov++; nkiov--; @@ -856,25 +871,42 @@ void kibnal_check_sends (kib_conn_t *conn) { kib_tx_t *tx; - vv_return_t vvrc; + vv_return_t vvrc; int rc; + int consume_cred; int done; /* Don't send anything until after the connection is established */ if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) { - CDEBUG(D_NET, LPX64"too soon\n", 
conn->ibc_peer->ibp_nid); + CDEBUG(D_NET, "%s too soon\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); return; } spin_lock(&conn->ibc_lock); - LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE); - + LASSERT (conn->ibc_nsends_posted <= + *kibnal_tunables.kib_concurrent_sends); + LASSERT (conn->ibc_reserved_credits >= 0); + + while (conn->ibc_reserved_credits > 0 && + !list_empty(&conn->ibc_tx_queue_rsrvd)) { + LASSERT (conn->ibc_version != + IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); + tx = list_entry(conn->ibc_tx_queue_rsrvd.next, + kib_tx_t, tx_list); + list_del(&tx->tx_list); + list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); + conn->ibc_reserved_credits--; + } + if (list_empty(&conn->ibc_tx_queue) && - conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) { + list_empty(&conn->ibc_tx_queue_nocred) && + (conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER || + kibnal_send_keepalive(conn))) { spin_unlock(&conn->ibc_lock); - tx = kibnal_get_idle_tx(0); /* don't block */ + tx = kibnal_get_idle_tx(); if (tx != NULL) kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); @@ -884,9 +916,22 @@ kibnal_check_sends (kib_conn_t *conn) kibnal_queue_tx_locked(tx, conn); } - while (!list_empty (&conn->ibc_tx_queue)) { - tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list); - + for (;;) { + if (!list_empty(&conn->ibc_tx_queue_nocred)) { + LASSERT (conn->ibc_version != + IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD); + tx = list_entry (conn->ibc_tx_queue_nocred.next, + kib_tx_t, tx_list); + consume_cred = 0; + } else if (!list_empty (&conn->ibc_tx_queue)) { + tx = list_entry (conn->ibc_tx_queue.next, + kib_tx_t, tx_list); + consume_cred = 1; + } else { + /* nothing waiting */ + break; + } + LASSERT (tx->tx_queued); /* We rely on this for QP sizing */ LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS); @@ -896,23 +941,27 @@ kibnal_check_sends (kib_conn_t *conn) LASSERT (conn->ibc_credits >= 0); LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE); - if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) { - CDEBUG(D_NET, LPX64": posted enough\n", - conn->ibc_peer->ibp_nid); - break; - } - - if (conn->ibc_credits == 0) { /* no credits */ - CDEBUG(D_NET, LPX64": no credits\n", - conn->ibc_peer->ibp_nid); + if (conn->ibc_nsends_posted == + *kibnal_tunables.kib_concurrent_sends) { + /* We've got some tx completions outstanding... 
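Note: the keepalive test folded into the NOOP decision above uses the kernel's wraparound-safe time_after() against ibc_last_send, so an idle but healthy connection emits a NOOP once the tunable interval elapses. A user-space equivalent of the arithmetic (assuming an unsigned tick counter that wraps like jiffies):

static int ticks_after(unsigned long a, unsigned long b)
{
        return (long)(b - a) < 0;      /* wrap-safe: true iff a is later */
}

static int needs_keepalive(unsigned long now, unsigned long last_send,
                           int keepalive_secs, unsigned long hz)
{
        return keepalive_secs > 0 &&   /* feature enabled... */
               ticks_after(now, last_send + keepalive_secs * hz);
}

The send path records conn->ibc_last_send = jiffies after every successful post, so NOOPs are suppressed while real traffic flows.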
*/ + CDEBUG(D_NET, "%s: posted enough\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); break; } - if (conn->ibc_credits == 1 && /* last credit reserved for */ - conn->ibc_outstanding_credits == 0) { /* giving back credits */ - CDEBUG(D_NET, LPX64": not using last credit\n", - conn->ibc_peer->ibp_nid); - break; + if (consume_cred) { + if (conn->ibc_credits == 0) { /* no credits */ + CDEBUG(D_NET, "%s: no credits\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + break; + } + + if (conn->ibc_credits == 1 && /* last credit reserved for */ + conn->ibc_outstanding_credits == 0) { /* giving back credits */ + CDEBUG(D_NET, "%s: not using last credit\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + break; + } } list_del (&tx->tx_list); @@ -922,24 +971,28 @@ kibnal_check_sends (kib_conn_t *conn) if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && (!list_empty(&conn->ibc_tx_queue) || - conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) { + !list_empty(&conn->ibc_tx_queue_nocred) || + (conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER && + !kibnal_send_keepalive(conn)))) { /* redundant NOOP */ spin_unlock(&conn->ibc_lock); kibnal_tx_done(tx); spin_lock(&conn->ibc_lock); - CDEBUG(D_NET, LPX64": redundant noop\n", - conn->ibc_peer->ibp_nid); + CDEBUG(D_NET, "%s: redundant noop\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); continue; } - kibnal_pack_msg(tx->tx_msg, conn->ibc_outstanding_credits, + kibnal_pack_msg(tx->tx_msg, conn->ibc_version, + conn->ibc_outstanding_credits, conn->ibc_peer->ibp_nid, conn->ibc_incarnation, conn->ibc_txseq); conn->ibc_txseq++; conn->ibc_outstanding_credits = 0; conn->ibc_nsends_posted++; - conn->ibc_credits--; + if (consume_cred) + conn->ibc_credits--; /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA * PUT. If so, it was first queued here as a PUT_REQ, sent and @@ -958,14 +1011,14 @@ kibnal_check_sends (kib_conn_t *conn) LASSERT (tx->tx_nwrq > 0); #if 0 if (tx->tx_wrq[0].wr_type == vv_wr_rdma_write) - CDEBUG(D_WARNING, "WORK[0]: RDMA gl %p for %d k %x -> "LPX64" k %x\n", + CDEBUG(D_NET, "WORK[0]: RDMA gl %p for %d k %x -> "LPX64" k %x\n", tx->tx_wrq[0].scatgat_list->v_address, tx->tx_wrq[0].scatgat_list->length, tx->tx_wrq[0].scatgat_list->l_key, tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_addr, tx->tx_wrq[0].type.send.send_qp_type.rc_type.r_r_key); else - CDEBUG(D_WARNING, "WORK[0]: %s gl %p for %d k %x\n", + CDEBUG(D_NET, "WORK[0]: %s gl %p for %d k %x\n", tx->tx_wrq[0].wr_type == vv_wr_send ? "SEND" : "????", tx->tx_wrq[0].scatgat_list->v_address, tx->tx_wrq[0].scatgat_list->length, @@ -973,14 +1026,14 @@ kibnal_check_sends (kib_conn_t *conn) if (tx->tx_nwrq > 1) { if (tx->tx_wrq[1].wr_type == vv_wr_rdma_write) - CDEBUG(D_WARNING, "WORK[1]: RDMA gl %p for %d k %x -> "LPX64" k %x\n", + CDEBUG(D_NET, "WORK[1]: RDMA gl %p for %d k %x -> "LPX64" k %x\n", tx->tx_wrq[1].scatgat_list->v_address, tx->tx_wrq[1].scatgat_list->length, tx->tx_wrq[1].scatgat_list->l_key, tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_addr, tx->tx_wrq[1].type.send.send_qp_type.rc_type.r_r_key); else - CDEBUG(D_WARNING, "WORK[1]: %s gl %p for %d k %x\n", + CDEBUG(D_NET, "WORK[1]: %s gl %p for %d k %x\n", tx->tx_wrq[1].wr_type == vv_wr_send ? "SEND" : "????", tx->tx_wrq[1].scatgat_list->v_address, tx->tx_wrq[1].scatgat_list->length, @@ -999,11 +1052,14 @@ kibnal_check_sends (kib_conn_t *conn) rc = (vvrc == vv_return_ok) ? 
0 : -EIO; } + conn->ibc_last_send = jiffies; + if (rc != 0) { /* NB credits are transferred in the actual * message, which can only be the last work item */ conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits; - conn->ibc_credits++; + if (consume_cred) + conn->ibc_credits++; conn->ibc_nsends_posted--; tx->tx_status = rc; @@ -1017,11 +1073,11 @@ kibnal_check_sends (kib_conn_t *conn) spin_unlock(&conn->ibc_lock); if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) - CERROR ("Error %d posting transmit to "LPX64"\n", - vvrc, conn->ibc_peer->ibp_nid); + CERROR ("Error %d posting transmit to %s\n", + vvrc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); else - CDEBUG (D_NET, "Error %d posting transmit to " - LPX64"\n", rc, conn->ibc_peer->ibp_nid); + CDEBUG (D_NET, "Error %d posting transmit to %s\n", + rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); kibnal_close_conn (conn, rc); @@ -1049,10 +1105,11 @@ kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc) if (failed && tx->tx_status == 0 && conn->ibc_state == IBNAL_CONN_ESTABLISHED) - CERROR("tx -> "LPX64" type %x cookie "LPX64 + CDEBUG(D_NETERROR, "tx -> %s type %x cookie "LPX64 "sending %d waiting %d: failed %d\n", - conn->ibc_peer->ibp_nid, tx->tx_msg->ibm_type, - tx->tx_cookie, tx->tx_sending, tx->tx_waiting, vvrc); + libcfs_nid2str(conn->ibc_peer->ibp_nid), + tx->tx_msg->ibm_type, tx->tx_cookie, + tx->tx_sending, tx->tx_waiting, vvrc); spin_lock(&conn->ibc_lock); @@ -1080,10 +1137,12 @@ kibnal_tx_complete (kib_tx_t *tx, vv_comp_status_t vvrc) if (idle) kibnal_tx_done (tx); - if (failed) + if (failed) { kibnal_close_conn (conn, -EIO); - else + } else { + kibnal_peer_alive(conn->ibc_peer); kibnal_check_sends(conn); + } kibnal_conn_decref(conn); /* ...until here */ } @@ -1276,12 +1335,14 @@ kibnal_schedule_peer_arp (kib_peer_t *peer) } void -kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) +kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid) { kib_peer_t *peer; kib_conn_t *conn; unsigned long flags; rwlock_t *g_lock = &kibnal_data.kib_global_lock; + int retry; + int rc; /* If I get here, I've committed to send, so I complete the tx with * failure on any problems */ @@ -1289,38 +1350,51 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ LASSERT (tx->tx_nwrq > 0); /* work items have been set up */ - read_lock_irqsave(g_lock, flags); + for (retry = 0; ; retry = 1) { + read_lock_irqsave(g_lock, flags); - peer = kibnal_find_peer_locked (nid); - if (peer == NULL) { - read_unlock_irqrestore(g_lock, flags); - tx->tx_status = -EHOSTUNREACH; - tx->tx_waiting = 0; - kibnal_tx_done (tx); - return; - } + peer = kibnal_find_peer_locked (nid); + if (peer != NULL) { + conn = kibnal_find_conn_locked (peer); + if (conn != NULL) { + kibnal_conn_addref(conn); /* 1 ref for me... */ + read_unlock_irqrestore(g_lock, flags); - conn = kibnal_find_conn_locked (peer); - if (conn != NULL) { - kibnal_conn_addref(conn); /* 1 ref for me... */ - read_unlock_irqrestore(g_lock, flags); + kibnal_queue_tx (tx, conn); + kibnal_conn_decref(conn); /* ...to here */ + return; + } + } - kibnal_queue_tx (tx, conn); - kibnal_conn_decref(conn); /* ...to here */ - return; - } - - /* Making one or more connections; I'll need a write lock... */ - read_unlock(g_lock); - write_lock(g_lock); + /* Making one or more connections; I'll need a write lock... 
*/ + read_unlock(g_lock); + write_lock(g_lock); + + peer = kibnal_find_peer_locked (nid); + if (peer != NULL) + break; - peer = kibnal_find_peer_locked (nid); - if (peer == NULL) { write_unlock_irqrestore(g_lock, flags); - tx->tx_status = -EHOSTUNREACH; - tx->tx_waiting = 0; - kibnal_tx_done (tx); - return; + + if (retry) { + CERROR("Can't find peer %s\n", libcfs_nid2str(nid)); + + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + kibnal_tx_done (tx); + return; + } + + rc = kibnal_add_persistent_peer(nid, LNET_NIDADDR(nid)); + if (rc != 0) { + CERROR("Can't add peer %s: %d\n", + libcfs_nid2str(nid), rc); + + tx->tx_status = -EHOSTUNREACH; + tx->tx_waiting = 0; + kibnal_tx_done (tx); + return; + } } conn = kibnal_find_conn_locked (peer); @@ -1334,17 +1408,19 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) return; } - if (peer->ibp_connecting == 0) { - if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) { + if (peer->ibp_connecting == 0 && + peer->ibp_accepting == 0) { + if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */ + time_after_eq(jiffies, peer->ibp_reconnect_time))) { write_unlock_irqrestore(g_lock, flags); tx->tx_status = -EHOSTUNREACH; tx->tx_waiting = 0; kibnal_tx_done (tx); return; } - + peer->ibp_connecting = 1; - peer->ibp_arp_count = 1 + IBNAL_ARP_RETRIES; + peer->ibp_arp_count = 1 + *kibnal_tunables.kib_arp_retries; kibnal_schedule_peer_arp(peer); } @@ -1355,45 +1431,30 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) } int -kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) -{ - /* I would guess that if kibnal_get_peer (nid) == NULL, - and we're not routing, then 'nid' is very distant :) */ - if ( nal->libnal_ni.ni_pid.nid == nid ) { - *dist = 0; - } else { - *dist = 1; - } - - return 0; -} - -ptl_err_t -kibnal_sendmsg(lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - struct iovec *payload_iov, - ptl_kiov_t *payload_kiov, - int payload_offset, - int payload_nob) +kibnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) { - kib_msg_t *ibmsg; - kib_tx_t *tx; - int nob; - int rc; + lnet_hdr_t *hdr = &lntmsg->msg_hdr; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + int target_is_router = lntmsg->msg_target_is_router; + int routing = lntmsg->msg_routing; + unsigned int payload_niov = lntmsg->msg_niov; + struct iovec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + kib_msg_t *ibmsg; + kib_tx_t *tx; + int nob; + int rc; /* NB 'private' is different depending on what we're sending.... 
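Note: kibnal_launch_tx() above turns peer lookup into a find-or-create loop: a read-locked fast path, a re-check under the write lock (the peer may have appeared while the lock was dropped), and at most one automatic kibnal_add_persistent_peer() before giving up with -EHOSTUNREACH. A simplified stand-alone sketch of the retry shape (pthread rwlock; helper names are stand-ins, and unlike the driver it does not return holding the write lock):

#include <pthread.h>
#include <stddef.h>

static pthread_rwlock_t g_lock = PTHREAD_RWLOCK_INITIALIZER;

void *find_peer_locked(unsigned long long nid);   /* assumed helpers */
int add_persistent_peer(unsigned long long nid);

static void *find_or_create_peer(unsigned long long nid)
{
        void *peer;
        int retry;

        for (retry = 0; ; retry = 1) {
                pthread_rwlock_rdlock(&g_lock);   /* fast path */
                peer = find_peer_locked(nid);
                pthread_rwlock_unlock(&g_lock);
                if (peer != NULL)
                        return peer;

                pthread_rwlock_wrlock(&g_lock);   /* slow path: re-check */
                peer = find_peer_locked(nid);
                pthread_rwlock_unlock(&g_lock);
                if (peer != NULL)
                        return peer;

                if (retry ||                      /* already created once */
                    add_persistent_peer(nid) != 0)
                        return NULL;              /* -EHOSTUNREACH upstairs */
        }
}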
*/ - CDEBUG(D_NET, "sending %d bytes in %d frags to nid:"LPX64 " pid %d\n", payload_nob, payload_niov, nid , pid); + CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); LASSERT (payload_nob == 0 || payload_niov > 0); - LASSERT (payload_niov <= PTL_MD_MAX_IOV); + LASSERT (payload_niov <= LNET_MAX_IOV); /* Thread context */ LASSERT (!in_interrupt()); @@ -1403,108 +1464,49 @@ kibnal_sendmsg(lib_nal_t *nal, switch (type) { default: LBUG(); - return (PTL_FAIL); + return (-EIO); - case PTL_MSG_REPLY: { - /* reply's 'private' is the incoming receive */ - kib_rx_t *rx = private; - - LASSERT(rx != NULL); - - if (rx->rx_msg->ibm_type == IBNAL_MSG_IMMEDIATE) { - /* RDMA not expected */ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob > IBNAL_MSG_SIZE) { - CERROR("REPLY for "LPX64" too big (RDMA not requested):" - "%d (max for message is %d)\n", - nid, payload_nob, IBNAL_MSG_SIZE); - CERROR("Can't REPLY IMMEDIATE %d to "LPX64"\n", - nob, nid); - return PTL_FAIL; - } - break; - } - - /* Incoming message consistent with RDMA? */ - if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_REQ) { - CERROR("REPLY to "LPX64" bad msg type %x!!!\n", - nid, rx->rx_msg->ibm_type); - return PTL_FAIL; - } + case LNET_MSG_ACK: + LASSERT (payload_nob == 0); + break; - /* NB rx_complete() will send GET_NAK when I return to it from - * here, unless I set rx_responded! */ + case LNET_MSG_GET: + if (routing || target_is_router) + break; /* send IMMEDIATE */ + + /* is the REPLY message too small for RDMA? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); + if (nob <= IBNAL_MSG_SIZE) + break; /* send IMMEDIATE */ - tx = kibnal_get_idle_tx(0); + tx = kibnal_get_idle_tx(); if (tx == NULL) { - CERROR("Can't get tx for REPLY to "LPX64"\n", nid); - return PTL_FAIL; - } - - if (payload_nob == 0) - rc = 0; - else if (payload_kiov == NULL) - rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, - payload_niov, payload_iov, - payload_offset, payload_nob); - else - rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0, - payload_niov, payload_kiov, - payload_offset, payload_nob); - if (rc != 0) { - CERROR("Can't setup GET src for "LPX64": %d\n", nid, rc); - kibnal_tx_done(tx); - return PTL_FAIL; + CERROR("Can't allocate txd for GET to %s\n", + libcfs_nid2str(target.nid)); + return -ENOMEM; } - rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, payload_nob, - &rx->rx_msg->ibm_u.get.ibgm_rd, - rx->rx_msg->ibm_u.get.ibgm_cookie); - if (rc < 0) { - CERROR("Can't setup rdma for GET from "LPX64": %d\n", - nid, rc); - } else if (rc == 0) { - /* No RDMA: local completion may happen now! */ - lib_finalize (&kibnal_lib, NULL, libmsg, PTL_OK); - } else { - /* RDMA: lib_finalize(libmsg) when it completes */ - tx->tx_libmsg[0] = libmsg; - } - - kibnal_queue_tx(tx, rx->rx_conn); - rx->rx_responded = 1; - return (rc >= 0) ? PTL_OK : PTL_FAIL; - } - - case PTL_MSG_GET: - /* will the REPLY message be small enough not to need RDMA?
*/ - nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]); - if (nob <= IBNAL_MSG_SIZE) - break; - - tx = kibnal_get_idle_tx(1); /* may block; caller is an app thread */ - LASSERT (tx != NULL); - ibmsg = tx->tx_msg; ibmsg->ibm_u.get.ibgm_hdr = *hdr; ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; - if ((libmsg->md->options & PTL_MD_KIOV) == 0) + if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd, vv_acc_r_mem_write, - libmsg->md->md_niov, - libmsg->md->md_iov.iov, - 0, libmsg->md->length); + lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.iov, + 0, lntmsg->msg_md->md_length); else rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd, vv_acc_r_mem_write, - libmsg->md->md_niov, - libmsg->md->md_iov.kiov, - 0, libmsg->md->length); + lntmsg->msg_md->md_niov, + lntmsg->msg_md->md_iov.kiov, + 0, lntmsg->msg_md->md_length); if (rc != 0) { - CERROR("Can't setup GET sink for "LPX64": %d\n", nid, rc); + CERROR("Can't setup GET sink for %s: %d\n", + libcfs_nid2str(target.nid), rc); kibnal_tx_done(tx); - return PTL_FAIL; + return -EIO; } #if IBNAL_USE_FMR @@ -1518,30 +1520,34 @@ kibnal_sendmsg(lib_nal_t *nal, #endif kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob); - tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, nid, libmsg); - if (tx->tx_libmsg[1] == NULL) { - CERROR("Can't create reply for GET -> "LPX64"\n", nid); + tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni, + lntmsg); + if (tx->tx_lntmsg[1] == NULL) { + CERROR("Can't create reply for GET -> %s\n", + libcfs_nid2str(target.nid)); kibnal_tx_done(tx); - return PTL_FAIL; + return -EIO; } - tx->tx_libmsg[0] = libmsg; /* finalise libmsg[0,1] on completion */ + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */ tx->tx_waiting = 1; /* waiting for GET_DONE */ - kibnal_launch_tx(tx, nid); - return PTL_OK; - - case PTL_MSG_ACK: - LASSERT (payload_nob == 0); - break; + kibnal_launch_tx(tx, target.nid); + return 0; - case PTL_MSG_PUT: + case LNET_MSG_REPLY: + case LNET_MSG_PUT: /* Is the payload small enough not to need RDMA? */ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); if (nob <= IBNAL_MSG_SIZE) - break; + break; /* send IMMEDIATE */ - tx = kibnal_get_idle_tx(1); /* may block: caller is app thread */ - LASSERT (tx != NULL); + tx = kibnal_get_idle_tx(); + if (tx == NULL) { + CERROR("Can't allocate %s txd for %s\n", + type == LNET_MSG_PUT ? 
"PUT" : "REPLY", + libcfs_nid2str(target.nid)); + return -ENOMEM; + } if (payload_kiov == NULL) rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, @@ -1552,9 +1558,10 @@ kibnal_sendmsg(lib_nal_t *nal, payload_niov, payload_kiov, payload_offset, payload_nob); if (rc != 0) { - CERROR("Can't setup PUT src for "LPX64": %d\n", nid, rc); + CERROR("Can't setup PUT src for %s: %d\n", + libcfs_nid2str(target.nid), rc); kibnal_tx_done(tx); - return PTL_FAIL; + return -EIO; } ibmsg = tx->tx_msg; @@ -1562,74 +1569,132 @@ kibnal_sendmsg(lib_nal_t *nal, ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t)); - tx->tx_libmsg[0] = libmsg; /* finalise libmsg on completion */ + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ - kibnal_launch_tx(tx, nid); - return PTL_OK; + kibnal_launch_tx(tx, target.nid); + return 0; } + /* send IMMEDIATE */ + LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]) <= IBNAL_MSG_SIZE); - tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK || - type == PTL_MSG_REPLY)); + tx = kibnal_get_idle_tx(); if (tx == NULL) { - CERROR ("Can't send %d to "LPX64": tx descs exhausted\n", type, nid); - return PTL_NO_SPACE; + CERROR ("Can't send %d to %s: tx descs exhausted\n", + type, libcfs_nid2str(target.nid)); + return -ENOMEM; } ibmsg = tx->tx_msg; ibmsg->ibm_u.immediate.ibim_hdr = *hdr; - if (payload_nob > 0) { - if (payload_kiov != NULL) - lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload, - payload_niov, payload_kiov, - payload_offset, payload_nob); - else - lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload, - payload_niov, payload_iov, - payload_offset, payload_nob); - } + if (payload_kiov != NULL) + lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_kiov, + payload_offset, payload_nob); + else + lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + payload_niov, payload_iov, + payload_offset, payload_nob); nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]); kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob); - tx->tx_libmsg[0] = libmsg; /* finalise libmsg on completion */ - kibnal_launch_tx(tx, nid); - return PTL_OK; + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ + kibnal_launch_tx(tx, target.nid); + return 0; } -ptl_err_t -kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int payload_niov, struct iovec *payload_iov, - size_t payload_offset, size_t payload_len) +void +kibnal_reply (lnet_ni_t *ni, kib_rx_t *rx, lnet_msg_t *lntmsg) { - CDEBUG(D_NET, " pid = %d, nid="LPU64"\n", - pid, nid); - return (kibnal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, payload_iov, NULL, - payload_offset, payload_len)); + lnet_process_id_t target = lntmsg->msg_target; + unsigned int niov = lntmsg->msg_niov; + struct iovec *iov = lntmsg->msg_iov; + lnet_kiov_t *kiov = lntmsg->msg_kiov; + unsigned int offset = lntmsg->msg_offset; + unsigned int nob = lntmsg->msg_len; + kib_tx_t *tx; + int rc; + + tx = kibnal_get_idle_tx(); + if (tx == NULL) { + CERROR("Can't get tx for REPLY to %s\n", + libcfs_nid2str(target.nid)); + goto failed_0; + } + + if (nob == 0) + rc = 0; + else if (kiov == NULL) + rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 0, + niov, iov, offset, nob); + else + rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 0, + niov, kiov, 
offset, nob); + + if (rc != 0) { + CERROR("Can't setup GET src for %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } + + rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE, nob, + &rx->rx_msg->ibm_u.get.ibgm_rd, + rx->rx_msg->ibm_u.get.ibgm_cookie); + if (rc < 0) { + CERROR("Can't setup rdma for GET from %s: %d\n", + libcfs_nid2str(target.nid), rc); + goto failed_1; + } + + if (rc == 0) { + /* No RDMA: local completion may happen now! */ + lnet_finalize(ni, lntmsg, 0); + } else { + /* RDMA: lnet_finalize(lntmsg) when it + * completes */ + tx->tx_lntmsg[0] = lntmsg; + } + + kibnal_queue_tx(tx, rx->rx_conn); + return; + + failed_1: + kibnal_tx_done(tx); + failed_0: + lnet_finalize(ni, lntmsg, -EIO); } -ptl_err_t -kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int payload_niov, ptl_kiov_t *payload_kiov, - size_t payload_offset, size_t payload_len) +int +kibnal_eager_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + void **new_private) { - return (kibnal_sendmsg(nal, private, cookie, - hdr, type, nid, pid, - payload_niov, NULL, payload_kiov, - payload_offset, payload_len)); + kib_rx_t *rx = private; + kib_conn_t *conn = rx->rx_conn; + + if (conn->ibc_version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) { + /* Can't block if RDMA completions need normal credits */ + LCONSOLE_ERROR("Dropping message from %s: no buffers free. " + "%s is running an old version of LNET that may " + "deadlock if messages wait for buffers)\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + return -EDEADLK; + } + + *new_private = private; + return 0; } -ptl_err_t -kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, - unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, - size_t offset, int mlen, int rlen) +int +kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, + unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) { kib_rx_t *rx = private; kib_msg_t *rxmsg = rx->rx_msg; @@ -1637,10 +1702,10 @@ kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, kib_tx_t *tx; kib_msg_t *txmsg; int nob; - int rc; + int post_cred = 1; + int rc = 0; LASSERT (mlen <= rlen); - LASSERT (mlen >= 0); LASSERT (!in_interrupt()); /* Either all pages or all vaddrs */ LASSERT (!(kiov != NULL && iov != NULL)); @@ -1651,38 +1716,42 @@ kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, case IBNAL_MSG_IMMEDIATE: nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); - if (nob > IBNAL_MSG_SIZE) { - CERROR ("Immediate message from "LPX64" too big: %d\n", - rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen); - return (PTL_FAIL); + if (nob > rx->rx_nob) { + CERROR ("Immediate message from %s too big: %d(%d)\n", + libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), + nob, rx->rx_nob); + rc = -EPROTO; + break; } if (kiov != NULL) - lib_copy_buf2kiov(niov, kiov, offset, - rxmsg->ibm_u.immediate.ibim_payload, - mlen); + lnet_copy_flat2kiov(niov, kiov, offset, + IBNAL_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); else - lib_copy_buf2iov(niov, iov, offset, - rxmsg->ibm_u.immediate.ibim_payload, - mlen); - - lib_finalize (nal, NULL, libmsg, PTL_OK); - return (PTL_OK); + lnet_copy_flat2iov(niov, iov, offset, + IBNAL_MSG_SIZE, rxmsg, + offsetof(kib_msg_t, ibm_u.immediate.ibim_payload), + mlen); + lnet_finalize (ni, lntmsg, 0); + break; case 
IBNAL_MSG_PUT_REQ: - /* NB rx_complete() will send PUT_NAK when I return to it from - * here, unless I set rx_responded! */ - - if (mlen == 0) { /* No payload to RDMA */ - lib_finalize(nal, NULL, libmsg, PTL_OK); - return PTL_OK; + if (mlen == 0) { + lnet_finalize(ni, lntmsg, 0); + kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0, + rxmsg->ibm_u.putreq.ibprm_cookie); + break; } - - tx = kibnal_get_idle_tx(0); + + tx = kibnal_get_idle_tx(); if (tx == NULL) { - CERROR("Can't allocate tx for "LPX64"\n", - conn->ibc_peer->ibp_nid); - return PTL_FAIL; + CERROR("Can't allocate tx for %s\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid)); + /* Not replying will break the connection */ + rc = -ENOMEM; + break; } txmsg = tx->tx_msg; @@ -1697,10 +1766,13 @@ kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, vv_acc_r_mem_write, niov, kiov, offset, mlen); if (rc != 0) { - CERROR("Can't setup PUT sink for "LPX64": %d\n", - conn->ibc_peer->ibp_nid, rc); + CERROR("Can't setup PUT sink for %s: %d\n", + libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); kibnal_tx_done(tx); - return PTL_FAIL; + /* tell peer it's over */ + kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, rc, + rxmsg->ibm_u.putreq.ibprm_cookie); + break; } txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; @@ -1716,39 +1788,29 @@ kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, #endif kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob); - tx->tx_libmsg[0] = libmsg; /* finalise libmsg on completion */ + tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ tx->tx_waiting = 1; /* waiting for PUT_DONE */ kibnal_queue_tx(tx, conn); - LASSERT (!rx->rx_responded); - rx->rx_responded = 1; - return PTL_OK; + if (conn->ibc_version != IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD) + post_cred = 0; /* peer still owns 'rx' for sending PUT_DONE */ + break; case IBNAL_MSG_GET_REQ: - /* We get called here just to discard any junk after the - * GET hdr. 
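Note: under the new protocol a PUT_REQ's receive buffer is not recredited when kibnal_recv() finishes with it; the peer keeps owning it until the matching PUT_DONE arrives, at which point it is reposted as a reserved credit (see kibnal_handle_rx() above). Whether the normal credit is returned immediately reduces to a two-input rule (hypothetical helper):

/* 1 => repost returns the normal credit now; 0 => the buffer stays
 * reserved for the in-flight PUT and is recredited by PUT_DONE */
static int return_credit_now(int msg_is_put_req, int old_version)
{
        /* old (RDMAREPLYNOTRSRVD) peers never pre-reserve, so their
         * buffers are always recredited immediately */
        return old_version || !msg_is_put_req;
}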
*/ - LASSERT (libmsg == NULL); - lib_finalize (nal, NULL, libmsg, PTL_OK); - return (PTL_OK); + if (lntmsg != NULL) { + /* Optimized GET; RDMA lntmsg's payload */ + kibnal_reply(ni, rx, lntmsg); + } else { + /* GET didn't match anything */ + kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, + -ENODATA, + rxmsg->ibm_u.get.ibgm_cookie); + } + break; } -} -ptl_err_t -kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, struct iovec *iov, - size_t offset, size_t mlen, size_t rlen) -{ - return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL, - offset, mlen, rlen)); -} - -ptl_err_t -kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, - unsigned int niov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) -{ - return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov, - offset, mlen, rlen)); + kibnal_post_rx(rx, post_cred, 0); + return rc; } int @@ -1770,6 +1832,41 @@ kibnal_thread_fini (void) } void +kibnal_peer_alive (kib_peer_t *peer) +{ + /* This is racy, but everyone's only writing cfs_time_current() */ + peer->ibp_last_alive = cfs_time_current(); + mb(); +} + +void +kibnal_peer_notify (kib_peer_t *peer) +{ + time_t last_alive = 0; + int error = 0; + unsigned long flags; + + read_lock_irqsave(&kibnal_data.kib_global_lock, flags); + + if (list_empty(&peer->ibp_conns) && + peer->ibp_accepting == 0 && + peer->ibp_connecting == 0 && + peer->ibp_error != 0) { + error = peer->ibp_error; + peer->ibp_error = 0; + + last_alive = cfs_time_current_sec() - + cfs_duration_sec(cfs_time_current() - + peer->ibp_last_alive); + } + + read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + + if (error != 0) + lnet_notify(kibnal_data.kib_ni, peer->ibp_nid, 0, last_alive); +} + +void kibnal_schedule_conn (kib_conn_t *conn) { unsigned long flags; @@ -1787,7 +1884,7 @@ kibnal_schedule_conn (kib_conn_t *conn) void kibnal_close_conn_locked (kib_conn_t *conn, int error) { - /* This just does the immmediate housekeeping. 'error' is zero for a + /* This just does the immediate housekeeping. 'error' is zero for a * normal shutdown which can happen only after the connection has been * established. If the connection is established, schedule the * connection to be finished off by the connd. Otherwise the connd is @@ -1808,48 +1905,33 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error) if (error == 0 && list_empty(&conn->ibc_tx_queue) && + list_empty(&conn->ibc_tx_queue_rsrvd) && + list_empty(&conn->ibc_tx_queue_nocred) && list_empty(&conn->ibc_active_txs)) { - CDEBUG(D_NET, "closing conn to "LPX64 + CDEBUG(D_NET, "closing conn to %s" " rx# "LPD64" tx# "LPD64"\n", - peer->ibp_nid, conn->ibc_txseq, conn->ibc_rxseq); + libcfs_nid2str(peer->ibp_nid), + conn->ibc_txseq, conn->ibc_rxseq); } else { - CERROR("Closing conn to "LPX64": error %d%s%s" + CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s" " rx# "LPD64" tx# "LPD64"\n", - peer->ibp_nid, error, + libcfs_nid2str(peer->ibp_nid), error, list_empty(&conn->ibc_tx_queue) ? "" : "(sending)", + list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)", + list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)", list_empty(&conn->ibc_active_txs) ? 
"" : "(waiting)", conn->ibc_txseq, conn->ibc_rxseq); - -#if 0 - /* can't skip down the queue without holding ibc_lock (see above) */ - list_for_each(tmp, &conn->ibc_tx_queue) { - kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); - - CERROR(" queued tx type %x cookie "LPX64 - " sending %d waiting %d ticks %ld/%d\n", - tx->tx_msg->ibm_type, tx->tx_cookie, - tx->tx_sending, tx->tx_waiting, - (long)(tx->tx_deadline - jiffies), HZ); - } - - list_for_each(tmp, &conn->ibc_active_txs) { - kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); - - CERROR(" active tx type %x cookie "LPX64 - " sending %d waiting %d ticks %ld/%d\n", - tx->tx_msg->ibm_type, tx->tx_cookie, - tx->tx_sending, tx->tx_waiting, - (long)(tx->tx_deadline - jiffies), HZ); - } -#endif } list_del (&conn->ibc_list); - - if (list_empty (&peer->ibp_conns) && /* no more conns */ - peer->ibp_persistence == 0 && /* non-persistent peer */ - kibnal_peer_active(peer)) { /* still in peer table */ - kibnal_unlink_peer_locked (peer); + + if (list_empty (&peer->ibp_conns)) { /* no more conns */ + if (peer->ibp_persistence == 0 && /* non-persistent peer */ + kibnal_peer_active(peer)) /* still in peer table */ + kibnal_unlink_peer_locked (peer); + + /* set/clear error on last conn */ + peer->ibp_error = conn->ibc_comms_error; } kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1); @@ -1894,84 +1976,76 @@ kibnal_handle_early_rxs(kib_conn_t *conn) } void -kibnal_conn_disconnected(kib_conn_t *conn) +kibnal_abort_txs(kib_conn_t *conn, struct list_head *txs) { - LIST_HEAD (zombies); - struct list_head *tmp; - struct list_head *nxt; - kib_tx_t *tx; - - /* I'm the connd */ - LASSERT (!in_interrupt()); - LASSERT (current == kibnal_data.kib_connd); - LASSERT (conn->ibc_state >= IBNAL_CONN_INIT); - - kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED); - - /* move QP to error state to make posted work items complete */ - kibnal_set_qp_state(conn, vv_qp_state_error); + LIST_HEAD (zombies); + struct list_head *tmp; + struct list_head *nxt; + kib_tx_t *tx; spin_lock(&conn->ibc_lock); - /* Complete all tx descs not waiting for sends to complete. 
- * NB we should be safe from RDMA now that the QP has changed state */ - - list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) { + list_for_each_safe (tmp, nxt, txs) { tx = list_entry (tmp, kib_tx_t, tx_list); - LASSERT (tx->tx_queued); - + if (txs == &conn->ibc_active_txs) { + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_waiting || tx->tx_sending != 0); + } else { + LASSERT (tx->tx_queued); + } + tx->tx_status = -ECONNABORTED; tx->tx_queued = 0; tx->tx_waiting = 0; - if (tx->tx_sending != 0) - continue; - - list_del (&tx->tx_list); - list_add (&tx->tx_list, &zombies); + if (tx->tx_sending == 0) { + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } } - list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) { - tx = list_entry (tmp, kib_tx_t, tx_list); - - LASSERT (!tx->tx_queued); - LASSERT (tx->tx_waiting || - tx->tx_sending != 0); + spin_unlock(&conn->ibc_lock); - tx->tx_status = -ECONNABORTED; - tx->tx_waiting = 0; - - if (tx->tx_sending != 0) - continue; + kibnal_txlist_done(&zombies, -ECONNABORTED); +} - list_del (&tx->tx_list); - list_add (&tx->tx_list, &zombies); - } +void +kibnal_conn_disconnected(kib_conn_t *conn) +{ + /* I'm the connd */ + LASSERT (!in_interrupt()); + LASSERT (current == kibnal_data.kib_connd); + LASSERT (conn->ibc_state >= IBNAL_CONN_INIT); - spin_unlock(&conn->ibc_lock); + kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED); - while (!list_empty(&zombies)) { - tx = list_entry (zombies.next, kib_tx_t, tx_list); + /* move QP to error state to make posted work items complete */ + kibnal_set_qp_state(conn, vv_qp_state_error); - list_del(&tx->tx_list); - kibnal_tx_done (tx); - } + /* Complete all tx descs not waiting for sends to complete. + * NB we should be safe from RDMA now that the QP has changed state */ + + kibnal_abort_txs(conn, &conn->ibc_tx_queue); + kibnal_abort_txs(conn, &conn->ibc_tx_queue_rsrvd); + kibnal_abort_txs(conn, &conn->ibc_tx_queue_nocred); + kibnal_abort_txs(conn, &conn->ibc_active_txs); kibnal_handle_early_rxs(conn); + + kibnal_peer_notify(conn->ibc_peer); } void -kibnal_peer_connect_failed (kib_peer_t *peer, int active) +kibnal_peer_connect_failed (kib_peer_t *peer, int active, int error) { - struct list_head zombies; - kib_tx_t *tx; + LIST_HEAD (zombies); unsigned long flags; /* Only the connd creates conns => single threaded */ + LASSERT (error != 0); LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); - LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL); write_lock_irqsave(&kibnal_data.kib_global_lock, flags); @@ -1979,10 +2053,12 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active) LASSERT (peer->ibp_connecting != 0); peer->ibp_connecting--; } else { - LASSERT (!kibnal_peer_active(peer)); + LASSERT (peer->ibp_accepting != 0); + peer->ibp_accepting--; } - if (peer->ibp_connecting != 0) { + if (peer->ibp_connecting != 0 || + peer->ibp_accepting != 0) { /* another connection attempt under way (loopback?)... 
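Note: kibnal_abort_txs() above collapses what used to be two near-identical loops (one for the send queue, one for active txs) into a single helper: every descriptor is failed with -ECONNABORTED, but only those with no send callback still outstanding can be moved to the zombie list and completed at once; the rest are finished later by kibnal_tx_complete(). A condensed restatement in the driver's own idiom (not stand-alone; <linux/list.h> primitives, same field names, with list_move() equivalent to the list_del/list_add pair in the patch):

static void abort_txs_sketch(kib_conn_t *conn, struct list_head *txs,
                             struct list_head *zombies)
{
        struct list_head *tmp;
        struct list_head *nxt;
        kib_tx_t *tx;

        spin_lock(&conn->ibc_lock);

        list_for_each_safe (tmp, nxt, txs) {
                tx = list_entry (tmp, kib_tx_t, tx_list);

                tx->tx_status = -ECONNABORTED;     /* fail unconditionally */
                tx->tx_queued = 0;
                tx->tx_waiting = 0;

                if (tx->tx_sending == 0)           /* complete only if no */
                        list_move (&tx->tx_list, zombies); /* send in flight */
        }

        spin_unlock(&conn->ibc_lock);
        /* caller: kibnal_txlist_done(zombies, -ECONNABORTED) */
}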
*/
write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
return;
@@ -1990,11 +2066,17 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active)
if (list_empty(&peer->ibp_conns)) {
/* Say when active connection can be re-attempted */
- peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval;
- /* Increase reconnection interval */
- peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2,
- IBNAL_MAX_RECONNECT_INTERVAL);
-
+ peer->ibp_reconnect_interval *= 2;
+ peer->ibp_reconnect_interval =
+ MAX(peer->ibp_reconnect_interval,
+ *kibnal_tunables.kib_min_reconnect_interval);
+ peer->ibp_reconnect_interval =
+ MIN(peer->ibp_reconnect_interval,
+ *kibnal_tunables.kib_max_reconnect_interval);
+
+ peer->ibp_reconnect_time = jiffies +
+ peer->ibp_reconnect_interval * HZ;
+
/* Take peer's blocked transmits to complete with error */
list_add(&zombies, &peer->ibp_tx_queue);
list_del_init(&peer->ibp_tx_queue);
@@ -2004,6 +2086,8 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active)
/* failed connection attempt on non-persistent peer */
kibnal_unlink_peer_locked (peer);
}
+
+ peer->ibp_error = error;
} else {
/* Can't have blocked transmits if there are connections */
LASSERT (list_empty(&peer->ibp_tx_queue));
@@ -2011,31 +2095,49 @@ kibnal_peer_connect_failed (kib_peer_t *peer, int active)
write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+ kibnal_peer_notify(peer);
+
if (list_empty (&zombies))
return;
- CERROR ("Deleting messages for "LPX64": connection failed\n", peer->ibp_nid);
- do {
- tx = list_entry (zombies.next, kib_tx_t, tx_list);
+ CDEBUG (D_NETERROR, "Deleting messages for %s: connection failed\n",
+ libcfs_nid2str(peer->ibp_nid));
- list_del (&tx->tx_list);
- /* complete now */
- tx->tx_status = -EHOSTUNREACH;
- kibnal_tx_done (tx);
- } while (!list_empty (&zombies));
+ kibnal_txlist_done(&zombies, -EHOSTUNREACH);
}
void
-kibnal_connreq_done(kib_conn_t *conn, int active, int status)
+kibnal_reject(cm_cep_handle_t cep, int why)
{
- static cm_reject_data_t rej;
+ static cm_reject_data_t rejs[3];
+ cm_reject_data_t *rej = &rejs[why];
+
+ LASSERT (why >= 0 && why < sizeof(rejs)/sizeof(rejs[0]));
+
+ /* If I wasn't so lazy, I'd initialise this only once; it's effectively
+ * read-only */
+ rej->reason = cm_rej_code_usr_rej;
+ rej->priv_data[0] = (IBNAL_MSG_MAGIC) & 0xff;
+ rej->priv_data[1] = (IBNAL_MSG_MAGIC >> 8) & 0xff;
+ rej->priv_data[2] = (IBNAL_MSG_MAGIC >> 16) & 0xff;
+ rej->priv_data[3] = (IBNAL_MSG_MAGIC >> 24) & 0xff;
+ rej->priv_data[4] = (IBNAL_MSG_VERSION) & 0xff;
+ rej->priv_data[5] = (IBNAL_MSG_VERSION >> 8) & 0xff;
+ rej->priv_data[6] = why;
+
+ cm_reject(cep, rej);
+}
+void
+kibnal_connreq_done(kib_conn_t *conn, int active, int status)
+{
struct list_head txs;
kib_peer_t *peer = conn->ibc_peer;
- kib_peer_t *peer2;
unsigned long flags;
kib_tx_t *tx;
+ CDEBUG(D_NET, "%d\n", status);
+
/* Only the connd creates conns => single threaded */
LASSERT (!in_interrupt());
LASSERT (current == kibnal_data.kib_connd);
@@ -2044,10 +2146,10 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status)
if (active) {
LASSERT (peer->ibp_connecting > 0);
} else {
- LASSERT (!kibnal_peer_active(peer));
+ LASSERT (peer->ibp_accepting > 0);
}
- PORTAL_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+ LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
conn->ibc_connvars = NULL;
if (status != 0) {
@@ -2059,15 +2161,13 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status)
case IBNAL_CONN_ACTIVE_CHECK_REPLY: /* got a 
connection reply but failed checks */ LASSERT (active); - memset(&rej, 0, sizeof(rej)); - rej.reason = cm_rej_code_usr_rej; - cm_reject(conn->ibc_cep, &rej); + kibnal_reject(conn->ibc_cep, IBNAL_REJECT_FATAL); break; case IBNAL_CONN_ACTIVE_CONNECT: LASSERT (active); cm_cancel(conn->ibc_cep); - kibnal_pause(HZ/10); + cfs_pause(cfs_time_seconds(1)/10); /* cm_connect() failed immediately or * callback returned failure */ break; @@ -2087,7 +2187,7 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status) break; } - kibnal_peer_connect_failed(conn->ibc_peer, active); + kibnal_peer_connect_failed(conn->ibc_peer, active, status); kibnal_conn_disconnected(conn); return; } @@ -2101,24 +2201,10 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status) LASSERT(conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT); } + conn->ibc_last_send = jiffies; kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED); + kibnal_peer_alive(peer); - if (!active) { - peer2 = kibnal_find_peer_locked(peer->ibp_nid); - if (peer2 != NULL) { - /* already in the peer table; swap */ - conn->ibc_peer = peer2; - kibnal_peer_addref(peer2); - kibnal_peer_decref(peer); - peer = conn->ibc_peer; - } else { - /* add 'peer' to the peer table */ - kibnal_peer_addref(peer); - list_add_tail(&peer->ibp_list, - kibnal_nid2peerlist(peer->ibp_nid)); - } - } - /* Add conn to peer's list and nuke any dangling conns from a different * peer instance... */ kibnal_conn_addref(conn); /* +1 ref for ibc_list */ @@ -2134,19 +2220,21 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status) kibnal_close_conn_locked(conn, -ECONNABORTED); write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - kibnal_peer_connect_failed(peer, active); + kibnal_peer_connect_failed(peer, active, -ECONNABORTED); return; } if (active) peer->ibp_connecting--; + else + peer->ibp_accepting--; /* grab pending txs while I have the lock */ list_add(&txs, &peer->ibp_tx_queue); list_del_init(&peer->ibp_tx_queue); - /* reset reconnect interval for next attempt */ - peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + peer->ibp_reconnect_interval = 0; /* OK to reconnect at any time */ + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); /* Schedule blocked txs */ @@ -2205,12 +2293,12 @@ kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg) break; case IBNAL_CONN_DISCONNECT1: - /* kibnal_terminate_conn is getting there; It'll see + /* kibnal_disconnect_conn is getting there; It'll see * ibc_disconnect set... */ break; case IBNAL_CONN_DISCONNECT2: - /* kibnal_terminate_conn got there already; complete + /* kibnal_disconnect_conn got there already; complete * the disconnect. */ kibnal_schedule_conn(conn); break; @@ -2225,7 +2313,7 @@ kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg) LASSERT (!conn->ibc_disconnect); conn->ibc_disconnect = 1; - /* kibnal_terminate_conn sent the disconnect request. */ + /* kibnal_disconnect_conn sent the disconnect request. 
*/ kibnal_schedule_conn(conn); write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); @@ -2279,13 +2367,16 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) static kib_msg_t txmsg; static kib_msg_t rxmsg; static cm_reply_data_t reply; - static cm_reject_data_t reject; kib_conn_t *conn = NULL; int rc = 0; + int reason; int rxmsgnob; + rwlock_t *g_lock = &kibnal_data.kib_global_lock; + kib_peer_t *peer; + kib_peer_t *peer2; + unsigned long flags; kib_connvars_t *cv; - kib_peer_t *tmp_peer; cm_return_t cmrc; vv_return_t vvrc; @@ -2294,9 +2385,10 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); - if (cmreq->sid != IBNAL_SERVICE_NUMBER) { + if (cmreq->sid != (__u64)(*kibnal_tunables.kib_service_number)) { CERROR(LPX64" != IBNAL_SERVICE_NUMBER("LPX64")\n", - cmreq->sid, (__u64)IBNAL_SERVICE_NUMBER); + cmreq->sid, (__u64)(*kibnal_tunables.kib_service_number)); + reason = IBNAL_REJECT_FATAL; goto reject; } @@ -2304,63 +2396,121 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) rxmsgnob = MIN(cm_REQ_priv_data_len, sizeof(rxmsg)); memcpy(&rxmsg, cmreq->priv_data, rxmsgnob); - rc = kibnal_unpack_msg(&rxmsg, rxmsgnob); + rc = kibnal_unpack_msg(&rxmsg, 0, rxmsgnob); if (rc != 0) { - CERROR("Can't parse connection request: %d\n", rc); + /* SILENT! kibnal_unpack_msg() complains if required */ + reason = IBNAL_REJECT_FATAL; goto reject; } + if (rxmsg.ibm_version != IBNAL_MSG_VERSION) + CWARN("Connection from %s: old protocol version 0x%x\n", + libcfs_nid2str(rxmsg.ibm_srcnid), rxmsg.ibm_version); + if (rxmsg.ibm_type != IBNAL_MSG_CONNREQ) { - CERROR("Unexpected connreq msg type: %x from "LPX64"\n", - rxmsg.ibm_type, rxmsg.ibm_srcnid); + CERROR("Unexpected connreq msg type: %x from %s\n", + rxmsg.ibm_type, libcfs_nid2str(rxmsg.ibm_srcnid)); + reason = IBNAL_REJECT_FATAL; goto reject; } - if (rxmsg.ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid) { - CERROR("Can't accept "LPX64": bad dst nid "LPX64"\n", - rxmsg.ibm_srcnid, rxmsg.ibm_dstnid); + if (!lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid, + rxmsg.ibm_dstnid)) { + CERROR("Can't accept %s: bad dst nid %s\n", + libcfs_nid2str(rxmsg.ibm_srcnid), + libcfs_nid2str(rxmsg.ibm_dstnid)); + reason = IBNAL_REJECT_FATAL; goto reject; } if (rxmsg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) { - CERROR("Can't accept "LPX64": incompatible queue depth %d (%d wanted)\n", - rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_queue_depth, + CERROR("Can't accept %s: incompatible queue depth %d (%d wanted)\n", + libcfs_nid2str(rxmsg.ibm_srcnid), + rxmsg.ibm_u.connparams.ibcp_queue_depth, IBNAL_MSG_QUEUE_SIZE); + reason = IBNAL_REJECT_FATAL; goto reject; } if (rxmsg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) { - CERROR("Can't accept "LPX64": message size %d too big (%d max)\n", - rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_max_msg_size, + CERROR("Can't accept %s: message size %d too big (%d max)\n", + libcfs_nid2str(rxmsg.ibm_srcnid), + rxmsg.ibm_u.connparams.ibcp_max_msg_size, IBNAL_MSG_SIZE); + reason = IBNAL_REJECT_FATAL; goto reject; } if (rxmsg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) { - CERROR("Can't accept "LPX64": max frags %d too big (%d max)\n", - rxmsg.ibm_srcnid, rxmsg.ibm_u.connparams.ibcp_max_frags, + CERROR("Can't accept %s: max frags %d too big (%d max)\n", + libcfs_nid2str(rxmsg.ibm_srcnid), + rxmsg.ibm_u.connparams.ibcp_max_frags, IBNAL_MAX_RDMA_FRAGS); + reason = 
IBNAL_REJECT_FATAL; + goto reject; + } + + /* assume 'rxmsg.ibm_srcnid' is a new peer; create */ + rc = kibnal_create_peer (&peer, rxmsg.ibm_srcnid); + if (rc != 0) { + CERROR("Can't create peer for %s\n", + libcfs_nid2str(rxmsg.ibm_srcnid)); + reason = IBNAL_REJECT_NO_RESOURCES; goto reject; } + + write_lock_irqsave(g_lock, flags); + + peer2 = kibnal_find_peer_locked(rxmsg.ibm_srcnid); + if (peer2 != NULL) { + /* tie-break connection race in favour of the higher NID */ + if (peer2->ibp_connecting != 0 && + rxmsg.ibm_srcnid < kibnal_data.kib_ni->ni_nid) { + write_unlock_irqrestore(g_lock, flags); + + CWARN("Conn race %s\n", + libcfs_nid2str(peer2->ibp_nid)); + + kibnal_peer_decref(peer); + reason = IBNAL_REJECT_CONN_RACE; + goto reject; + } + + peer2->ibp_accepting++; + kibnal_peer_addref(peer2); + + write_unlock_irqrestore(g_lock, flags); + kibnal_peer_decref(peer); + peer = peer2; + } else { + /* Brand new peer */ + LASSERT (peer->ibp_accepting == 0); + peer->ibp_accepting = 1; + + kibnal_peer_addref(peer); + list_add_tail(&peer->ibp_list, kibnal_nid2peerlist(rxmsg.ibm_srcnid)); + + write_unlock_irqrestore(g_lock, flags); + } conn = kibnal_create_conn(cep); if (conn == NULL) { - CERROR("Can't create conn for "LPX64"\n", rxmsg.ibm_srcnid); - goto reject; - } - - /* assume 'rxmsg.ibm_srcnid' is a new peer */ - tmp_peer = kibnal_create_peer (rxmsg.ibm_srcnid); - if (tmp_peer == NULL) { - CERROR("Can't create tmp peer for "LPX64"\n", rxmsg.ibm_srcnid); - kibnal_conn_decref(conn); - conn = NULL; + CERROR("Can't create conn for %s\n", + libcfs_nid2str(rxmsg.ibm_srcnid)); + kibnal_peer_connect_failed(peer, 0, -ENOMEM); + kibnal_peer_decref(peer); + reason = IBNAL_REJECT_NO_RESOURCES; goto reject; } - conn->ibc_peer = tmp_peer; /* conn takes over my ref */ + conn->ibc_version = rxmsg.ibm_version; + + conn->ibc_peer = peer; /* conn takes over my ref */ conn->ibc_incarnation = rxmsg.ibm_srcstamp; conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; + conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE; + LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + <= IBNAL_RX_MSGS); cv = conn->ibc_connvars; @@ -2373,25 +2523,43 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port, &cv->cv_path.sgid, &cv->cv_sgid_index); - LASSERT (vvrc == vv_return_ok); + if (vvrc != vv_return_ok) { + CERROR("gid2gid_index failed for %s: %d\n", + libcfs_nid2str(rxmsg.ibm_srcnid), vvrc); + rc = -EIO; + reason = IBNAL_REJECT_FATAL; + goto reject; + } vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port, cv->cv_path.pkey, &cv->cv_pkey_index); - LASSERT (vvrc == vv_return_ok); + if (vvrc != vv_return_ok) { + CERROR("pkey2pkey_index failed for %s: %d\n", + libcfs_nid2str(rxmsg.ibm_srcnid), vvrc); + rc = -EIO; + reason = IBNAL_REJECT_FATAL; + goto reject; + } rc = kibnal_set_qp_state(conn, vv_qp_state_init); - if (rc != 0) + if (rc != 0) { + reason = IBNAL_REJECT_FATAL; goto reject; + } rc = kibnal_post_receives(conn); if (rc != 0) { - CERROR("Can't post receives for "LPX64"\n", rxmsg.ibm_srcnid); + CERROR("Can't post receives for %s\n", + libcfs_nid2str(rxmsg.ibm_srcnid)); + reason = IBNAL_REJECT_FATAL; goto reject; } rc = kibnal_set_qp_state(conn, vv_qp_state_rtr); - if (rc != 0) + if (rc != 0) { + reason = IBNAL_REJECT_FATAL; goto reject; + } memset(&reply, 0, sizeof(reply)); reply.qpn = cv->cv_local_qpn; @@ -2411,7 +2579,8 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) txmsg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE; 
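/* The CONNACK being built here echoes the passive side's own limits
 * (queue depth, message size, max RDMA frags), so the active peer can
 * run the same compatibility checks in kibnal_check_connreply() that
 * were just applied to this CONNREQ; both sides therefore agree on the
 * limits before either one posts a send. */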
txmsg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE; txmsg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS; - kibnal_pack_msg(&txmsg, 0, rxmsg.ibm_srcnid, rxmsg.ibm_srcstamp, 0); + kibnal_pack_msg(&txmsg, conn->ibc_version, + 0, rxmsg.ibm_srcnid, rxmsg.ibm_srcstamp, 0); /* ...and copy into reply to avoid alignment issues */ memcpy(&reply.priv_data, &txmsg, txmsg.ibm_nob); @@ -2427,13 +2596,13 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) /* back out state change (no callback happening) */ kibnal_set_conn_state(conn, IBNAL_CONN_INIT); rc = -EIO; + reason = IBNAL_REJECT_FATAL; reject: - CERROR("Rejected connreq from "LPX64"\n", rxmsg.ibm_srcnid); + CDEBUG(D_NET, "Rejecting connreq from %s\n", + libcfs_nid2str(rxmsg.ibm_srcnid)); - memset(&reject, 0, sizeof(reject)); - reject.reason = cm_rej_code_usr_rej; - cm_reject(cep, &reject); + kibnal_reject(cep, reason); if (conn != NULL) { LASSERT (rc != 0); @@ -2458,12 +2627,11 @@ kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *data, void *arg) return; } - PORTAL_ALLOC_ATOMIC(pcr, sizeof(*pcr)); + LIBCFS_ALLOC_ATOMIC(pcr, sizeof(*pcr)); if (pcr == NULL) { CERROR("Can't allocate passive connreq\n"); - cm_reject(cep, &((cm_reject_data_t) /* NB RO struct */ - {.reason = cm_rej_code_no_res,})); + kibnal_reject(cep, IBNAL_REJECT_NO_RESOURCES); cm_destroy_cep(cep); return; } @@ -2504,7 +2672,7 @@ kibnal_connect_conn (kib_conn_t *conn) kib_connvars_t *cv = conn->ibc_connvars; kib_peer_t *peer = conn->ibc_peer; cm_return_t cmrc; - + /* Only called by connd => statics OK */ LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); @@ -2512,12 +2680,12 @@ kibnal_connect_conn (kib_conn_t *conn) memset(&cmreq, 0, sizeof(cmreq)); - cmreq.sid = IBNAL_SERVICE_NUMBER; + cmreq.sid = (__u64)(*kibnal_tunables.kib_service_number); cmreq.cep_data.ca_guid = kibnal_data.kib_hca_attrs.guid; cmreq.cep_data.qpn = cv->cv_local_qpn; - cmreq.cep_data.retry_cnt = IBNAL_RETRY_CNT; - cmreq.cep_data.rtr_retry_cnt = IBNAL_RNR_CNT; + cmreq.cep_data.retry_cnt = *kibnal_tunables.kib_retry_cnt; + cmreq.cep_data.rtr_retry_cnt = *kibnal_tunables.kib_rnr_cnt; cmreq.cep_data.start_psn = cv->cv_rxpsn; cmreq.cep_data.end_to_end_flow_ctrl = IBNAL_EE_FLOW_CNT; // XXX ack_timeout? 
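For reference, the user-level reject that kibnal_reject() sends and kibnal_check_connreply() parses uses a fixed 7-byte private-data layout: bytes 0-3 carry IBNAL_MSG_MAGIC least-significant byte first, bytes 4-5 carry IBNAL_MSG_VERSION the same way, and byte 6 carries the IBNAL_REJECT_* reason. A minimal round-trip sketch of that layout (the byte order is taken from this patch; the helper names are illustrative only, not from the source):

static void rej_pack(unsigned char *b, unsigned int magic,
                     unsigned int version, int why)
{
        b[0] = magic & 0xff;            /* LSB first, as in kibnal_reject() */
        b[1] = (magic >> 8) & 0xff;
        b[2] = (magic >> 16) & 0xff;
        b[3] = (magic >> 24) & 0xff;
        b[4] = version & 0xff;
        b[5] = (version >> 8) & 0xff;
        b[6] = why;                     /* IBNAL_REJECT_* code */
}

static void rej_unpack(const unsigned char *b, unsigned int *magic,
                       unsigned int *version, int *why)
{
        /* mirrors the decode in kibnal_check_connreply() */
        *magic   = (unsigned int)b[0] | ((unsigned int)b[1] << 8) |
                   ((unsigned int)b[2] << 16) | ((unsigned int)b[3] << 24);
        *version = (unsigned int)b[4] | ((unsigned int)b[5] << 8);
        *why     = b[6];
}

Because the encoding is defined byte by byte rather than by struct overlay, it is endian-neutral: the rejecting and rejected peers decode it identically whatever the byte order of either host.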
@@ -2534,12 +2702,27 @@ kibnal_connect_conn (kib_conn_t *conn) msg.ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE; msg.ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE; msg.ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS; - kibnal_pack_msg(&msg, 0, peer->ibp_nid, 0, 0); + kibnal_pack_msg(&msg, conn->ibc_version, 0, peer->ibp_nid, 0, 0); + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + msg.ibm_version++; + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + msg.ibm_magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); + } /* ...and copy into cmreq to avoid alignment issues */ memcpy(&cmreq.priv_data, &msg, msg.ibm_nob); - CDEBUG(D_NET, "Connecting %p to "LPX64"\n", conn, peer->ibp_nid); + CDEBUG(D_NET, "Connecting %p to %s\n", conn, + libcfs_nid2str(peer->ibp_nid)); kibnal_conn_addref(conn); /* ++ref for CM callback */ kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CONNECT); @@ -2547,17 +2730,67 @@ kibnal_connect_conn (kib_conn_t *conn) cmrc = cm_connect(conn->ibc_cep, &cmreq, kibnal_active_connect_callback, conn); if (cmrc == cm_stat_success) { - CDEBUG(D_NET, "connection REQ sent to "LPX64"\n", - peer->ibp_nid); + CDEBUG(D_NET, "connection REQ sent to %s\n", + libcfs_nid2str(peer->ibp_nid)); return; } - CERROR ("Connect "LPX64" failed: %d\n", peer->ibp_nid, cmrc); + CERROR ("Connect %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), cmrc); kibnal_conn_decref(conn); /* drop callback's ref */ kibnal_connreq_done(conn, 1, -EHOSTUNREACH); } void +kibnal_reconnect (kib_conn_t *conn, int why) +{ + kib_peer_t *peer = conn->ibc_peer; + int retry; + unsigned long flags; + cm_return_t cmrc; + cm_cep_handle_t cep; + + LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT); + + read_lock_irqsave(&kibnal_data.kib_global_lock, flags); + + LASSERT (peer->ibp_connecting > 0); /* 'conn' at least */ + + /* retry connection if it's still needed and no other connection + * attempts (active or passive) are in progress. 
+ * Immediate reconnect is required, so I don't even look at the + * reconnection timeout etc */ + + retry = (!list_empty(&peer->ibp_tx_queue) && + peer->ibp_connecting == 1 && + peer->ibp_accepting == 0); + + read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); + + if (!retry) { + kibnal_connreq_done(conn, 1, why); + return; + } + + cep = cm_create_cep(cm_cep_transp_rc); + if (cep == NULL) { + CERROR("Can't create new CEP\n"); + kibnal_connreq_done(conn, 1, -ENOMEM); + return; + } + + cmrc = cm_cancel(conn->ibc_cep); + LASSERT (cmrc == cm_stat_success); + cmrc = cm_destroy_cep(conn->ibc_cep); + LASSERT (cmrc == cm_stat_success); + + conn->ibc_cep = cep; + + /* reuse conn; no need to peer->ibp_connecting++ */ + kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP); + kibnal_connect_conn(conn); +} + +void kibnal_check_connreply (kib_conn_t *conn) { static cm_rtu_data_t rtu; @@ -2568,7 +2801,6 @@ kibnal_check_connreply (kib_conn_t *conn) kib_peer_t *peer = conn->ibc_peer; int msgnob; cm_return_t cmrc; - cm_cep_handle_t cep; unsigned long flags; int rc; @@ -2589,64 +2821,73 @@ kibnal_check_connreply (kib_conn_t *conn) msgnob = MIN(cm_REP_priv_data_len, sizeof(msg)); memcpy(&msg, &reply->priv_data, msgnob); - rc = kibnal_unpack_msg(&msg, msgnob); + rc = kibnal_unpack_msg(&msg, conn->ibc_version, msgnob); if (rc != 0) { - CERROR("Can't unpack reply from "LPX64"\n", - peer->ibp_nid); + CERROR("Can't unpack reply from %s\n", + libcfs_nid2str(peer->ibp_nid)); kibnal_connreq_done(conn, 1, rc); return; } if (msg.ibm_type != IBNAL_MSG_CONNACK ) { - CERROR("Unexpected message type %d from "LPX64"\n", - msg.ibm_type, peer->ibp_nid); + CERROR("Unexpected message type %d from %s\n", + msg.ibm_type, libcfs_nid2str(peer->ibp_nid)); kibnal_connreq_done(conn, 1, -EPROTO); return; } if (msg.ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE) { - CERROR(LPX64" has incompatible queue depth %d(%d wanted)\n", - peer->ibp_nid, msg.ibm_u.connparams.ibcp_queue_depth, + CERROR("%s has incompatible queue depth %d(%d wanted)\n", + libcfs_nid2str(peer->ibp_nid), + msg.ibm_u.connparams.ibcp_queue_depth, IBNAL_MSG_QUEUE_SIZE); kibnal_connreq_done(conn, 1, -EPROTO); return; } if (msg.ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE) { - CERROR(LPX64" max message size %d too big (%d max)\n", - peer->ibp_nid, msg.ibm_u.connparams.ibcp_max_msg_size, + CERROR("%s max message size %d too big (%d max)\n", + libcfs_nid2str(peer->ibp_nid), + msg.ibm_u.connparams.ibcp_max_msg_size, IBNAL_MSG_SIZE); kibnal_connreq_done(conn, 1, -EPROTO); return; } if (msg.ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) { - CERROR(LPX64" max frags %d too big (%d max)\n", - peer->ibp_nid, msg.ibm_u.connparams.ibcp_max_frags, + CERROR("%s max frags %d too big (%d max)\n", + libcfs_nid2str(peer->ibp_nid), + msg.ibm_u.connparams.ibcp_max_frags, IBNAL_MAX_RDMA_FRAGS); kibnal_connreq_done(conn, 1, -EPROTO); return; } read_lock_irqsave(&kibnal_data.kib_global_lock, flags); - rc = (msg.ibm_dstnid != kibnal_lib.libnal_ni.ni_pid.nid || - msg.ibm_dststamp != kibnal_data.kib_incarnation) ? 
- -ESTALE : 0; + if (lnet_ptlcompat_matchnid(kibnal_data.kib_ni->ni_nid, + msg.ibm_dstnid) && + msg.ibm_dststamp == kibnal_data.kib_incarnation) + rc = 0; + else + rc = -ESTALE; read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); if (rc != 0) { - CERROR("Stale connection reply from "LPX64"\n", - peer->ibp_nid); + CERROR("Stale connection reply from %s\n", + libcfs_nid2str(peer->ibp_nid)); kibnal_connreq_done(conn, 1, rc); return; } conn->ibc_incarnation = msg.ibm_srcstamp; conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; + conn->ibc_reserved_credits = IBNAL_MSG_QUEUE_SIZE; + LASSERT (conn->ibc_credits + conn->ibc_reserved_credits + <= IBNAL_RX_MSGS); rc = kibnal_post_receives(conn); if (rc != 0) { - CERROR("Can't post receives for "LPX64"\n", - peer->ibp_nid); + CERROR("Can't post receives for %s\n", + libcfs_nid2str(peer->ibp_nid)); kibnal_connreq_done(conn, 1, rc); return; } @@ -2676,7 +2917,8 @@ kibnal_check_connreply (kib_conn_t *conn) return; } - CERROR("cm_accept "LPX64" failed: %d\n", peer->ibp_nid, cmrc); + CERROR("cm_accept %s failed: %d\n", + libcfs_nid2str(peer->ibp_nid), cmrc); /* Back out of RTU: no callback coming */ kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_CHECK_REPLY); kibnal_conn_decref(conn); @@ -2686,37 +2928,72 @@ kibnal_check_connreply (kib_conn_t *conn) if (cv->cv_conndata.status == cm_event_conn_reject) { - if (cv->cv_conndata.data.reject.reason != cm_rej_code_stale_conn) { - CERROR("conn -> "LPX64" rejected: %d\n", peer->ibp_nid, - cv->cv_conndata.data.reject.reason); - kibnal_connreq_done(conn, 1, -ECONNREFUSED); - return; - } + if (cv->cv_conndata.data.reject.reason == cm_rej_code_usr_rej) { + unsigned char *bytes = + cv->cv_conndata.data.reject.priv_data; + int magic = (bytes[0]) | + (bytes[1] << 8) | + (bytes[2] << 16) | + (bytes[3] << 24); + int version = (bytes[4]) | + (bytes[5] << 8); + int why = (bytes[6]); + + /* Expected proto/version: she just doesn't like me (or + * ran out of resources) */ + if (magic == IBNAL_MSG_MAGIC && + version == conn->ibc_version) { + CERROR("conn -> %s rejected: fatal error %d\n", + libcfs_nid2str(peer->ibp_nid), why); + + if (why == IBNAL_REJECT_CONN_RACE) + kibnal_reconnect(conn, -EALREADY); + else + kibnal_connreq_done(conn, 1, -ECONNREFUSED); + return; + } + + /* Fail unless it's worth retrying with an old proto + * version */ + if (!(magic == IBNAL_MSG_MAGIC && + version == IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD && + conn->ibc_version == IBNAL_MSG_VERSION)) { + CERROR("conn -> %s rejected: bad protocol " + "magic/ver %08x/%x why %d\n", + libcfs_nid2str(peer->ibp_nid), + magic, version, why); + + kibnal_connreq_done(conn, 1, -ECONNREFUSED); + return; + } - CWARN ("conn -> "LPX64" stale: retrying\n", peer->ibp_nid); + conn->ibc_version = version; + CWARN ("Connection to %s refused: " + "retrying with old protocol version 0x%x\n", + libcfs_nid2str(peer->ibp_nid), version); - cep = cm_create_cep(cm_cep_transp_rc); - if (cep == NULL) { - CERROR("Can't create new CEP\n"); - kibnal_connreq_done(conn, 1, -ENOMEM); + kibnal_reconnect(conn, -ECONNREFUSED); return; - } - - cmrc = cm_cancel(conn->ibc_cep); - LASSERT (cmrc == cm_stat_success); - cmrc = cm_destroy_cep(conn->ibc_cep); - LASSERT (cmrc == cm_stat_success); - - conn->ibc_cep = cep; + } else if (cv->cv_conndata.data.reject.reason == + cm_rej_code_stale_conn) { + + CWARN ("conn -> %s stale: retrying\n", + libcfs_nid2str(peer->ibp_nid)); - /* retry connect */ - kibnal_set_conn_state(conn, IBNAL_CONN_ACTIVE_ARP); - kibnal_connect_conn(conn); - return; + 
kibnal_reconnect(conn, -ESTALE);
+ return;
+ } else {
+ CDEBUG(D_NETERROR, "conn -> %s rejected: reason %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ cv->cv_conndata.data.reject.reason);
+ kibnal_connreq_done(conn, 1, -ECONNREFUSED);
+ return;
+ }
+ /* NOT REACHED */
}
- CERROR("conn -> "LPX64" failed: %d\n", peer->ibp_nid,
- cv->cv_conndata.status);
+ CDEBUG(D_NETERROR, "conn -> %s failed: %d\n",
+ libcfs_nid2str(peer->ibp_nid), cv->cv_conndata.status);
kibnal_connreq_done(conn, 1, -ECONNABORTED);
}
@@ -2737,54 +3014,50 @@ kibnal_arp_done (kib_conn_t *conn)
LASSERT (peer->ibp_arp_count > 0);
if (cv->cv_arprc != ibat_stat_ok) {
- write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
- peer->ibp_arp_count--;
- if (peer->ibp_arp_count == 0) {
- /* final ARP attempt failed */
- write_unlock_irqrestore(&kibnal_data.kib_global_lock,
- flags);
- CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n",
- peer->ibp_nid, HIPQUAD(peer->ibp_ip),
- cv->cv_arprc);
- } else {
- /* Retry ARP: ibp_connecting++ so terminating conn
- * doesn't end peer's connection attempt */
- peer->ibp_connecting++;
- write_unlock_irqrestore(&kibnal_data.kib_global_lock,
- flags);
- CWARN("Arp "LPX64"@%u.%u.%u.%u failed: %d "
- "(%d attempts left)\n",
- peer->ibp_nid, HIPQUAD(peer->ibp_ip),
- cv->cv_arprc, peer->ibp_arp_count);
-
- kibnal_schedule_peer_arp(peer);
- }
- kibnal_connreq_done(conn, 1, -ENETUNREACH);
- return;
+ CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed: %d\n",
+ libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
+ cv->cv_arprc);
+ goto failed;
}
if ((arp->mask & IBAT_PRI_PATH_VALID) != 0) {
- CDEBUG(D_NET, "Got valid path for "LPX64"\n", peer->ibp_nid);
+ CDEBUG(D_NET, "Got valid path for %s\n",
+ libcfs_nid2str(peer->ibp_nid));
*path = *arp->primary_path;
vvrc = base_gid2port_num(kibnal_data.kib_hca, &path->sgid,
&cv->cv_port);
- LASSERT (vvrc == vv_return_ok);
+ if (vvrc != vv_return_ok) {
+ CWARN("base_gid2port_num failed for %s @ %u.%u.%u.%u: %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ HIPQUAD(peer->ibp_ip), vvrc);
+ goto failed;
+ }
vvrc = gid2gid_index(kibnal_data.kib_hca, cv->cv_port,
&path->sgid, &cv->cv_sgid_index);
- LASSERT (vvrc == vv_return_ok);
+ if (vvrc != vv_return_ok) {
+ CWARN("gid2gid_index failed for %s @ %u.%u.%u.%u: %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ HIPQUAD(peer->ibp_ip), vvrc);
+ goto failed;
+ }
vvrc = pkey2pkey_index(kibnal_data.kib_hca, cv->cv_port,
path->pkey, &cv->cv_pkey_index);
- LASSERT (vvrc == vv_return_ok);
+ if (vvrc != vv_return_ok) {
+ CWARN("pkey2pkey_index failed for %s @ %u.%u.%u.%u: %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ HIPQUAD(peer->ibp_ip), vvrc);
+ goto failed;
+ }
path->mtu = IBNAL_IB_MTU;
} else if ((arp->mask & IBAT_LID_VALID) != 0) {
- CWARN("Creating new path record for "LPX64"@%u.%u.%u.%u\n",
- peer->ibp_nid, HIPQUAD(peer->ibp_ip));
+ CWARN("Creating new path record for %s @ %u.%u.%u.%u\n",
+ libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
cv->cv_pkey_index = IBNAL_PKEY_IDX;
cv->cv_sgid_index = IBNAL_SGID_IDX;
@@ -2794,11 +3067,21 @@ kibnal_arp_done (kib_conn_t *conn)
vvrc = port_num2base_gid(kibnal_data.kib_hca, cv->cv_port,
&path->sgid);
- LASSERT (vvrc == vv_return_ok);
+ if (vvrc != vv_return_ok) {
+ CWARN("port_num2base_gid failed for %s @ %u.%u.%u.%u: %d\n",
+ libcfs_nid2str(peer->ibp_nid),
+ HIPQUAD(peer->ibp_ip), vvrc);
+ goto failed;
+ }
vvrc = port_num2base_lid(kibnal_data.kib_hca, cv->cv_port,
&path->slid);
- LASSERT (vvrc == vv_return_ok);
+ if (vvrc != vv_return_ok) {
+ CWARN("port_num2base_lid failed for %s @ %u.%u.%u.%u: %d\n",
+
libcfs_nid2str(peer->ibp_nid),
+ HIPQUAD(peer->ibp_ip), vvrc);
+ goto failed;
+ }
path->dgid = arp->gid;
path->sl = IBNAL_SERVICE_LEVEL;
@@ -2809,10 +3092,9 @@ kibnal_arp_done (kib_conn_t *conn)
path->pkey = IBNAL_PKEY;
path->traffic_class = IBNAL_TRAFFIC_CLASS;
} else {
- CERROR("Can't Arp "LPX64"@%u.%u.%u.%u: no PATH or LID\n",
- peer->ibp_nid, HIPQUAD(peer->ibp_ip));
- kibnal_connreq_done(conn, 1, -ENETUNREACH);
- return;
+ CWARN("Arp for %s @ %u.%u.%u.%u returned neither PATH nor LID\n",
+ libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
+ goto failed;
}
rc = kibnal_set_qp_state(conn, vv_qp_state_init);
@@ -2822,27 +3104,53 @@ kibnal_arp_done (kib_conn_t *conn)
/* do the actual connection request */
kibnal_connect_conn(conn);
+ return;
+
+ failed:
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+ peer->ibp_arp_count--;
+ if (peer->ibp_arp_count == 0) {
+ /* final ARP attempt failed */
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock,
+ flags);
+ CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (final attempt)\n",
+ libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip));
+ } else {
+ /* Retry ARP: ibp_connecting++ so terminating conn
+ * doesn't end peer's connection attempt */
+ peer->ibp_connecting++;
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock,
+ flags);
+ CDEBUG(D_NETERROR, "Arp %s @ %u.%u.%u.%u failed (%d attempts left)\n",
+ libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
+ peer->ibp_arp_count);
+
+ kibnal_schedule_peer_arp(peer);
+ }
+ kibnal_connreq_done(conn, 1, -ENETUNREACH);
}
void
kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg)
{
/* CAVEAT EMPTOR: tasklet context */
- kib_conn_t *conn = (kib_conn_t *)arg;
- kib_peer_t *peer = conn->ibc_peer;
+ kib_peer_t *peer;
+ kib_conn_t *conn = (kib_conn_t *)arg;
+
+ LASSERT (conn != NULL);
+ LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
+
+ peer = conn->ibc_peer;
if (arprc != ibat_stat_ok)
- CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n",
- peer->ibp_nid, HIPQUAD(peer->ibp_ip), arprc);
+ CDEBUG(D_NETERROR, "Arp %s at %u.%u.%u.%u failed: %d\n",
+ libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), arprc);
else
- CDEBUG(D_NET, "Arp "LPX64"@%u.%u.%u.%u OK: LID %s PATH %s\n",
- peer->ibp_nid, HIPQUAD(peer->ibp_ip),
+ CDEBUG(D_NET, "Arp %s at %u.%u.%u.%u OK: LID %s PATH %s\n",
+ libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip),
(arp_data->mask & IBAT_LID_VALID) == 0 ?
"invalid" : "valid",
(arp_data->mask & IBAT_PRI_PATH_VALID) == 0 ?
"invalid" : "valid"); - LASSERT (conn != NULL); - LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP); - conn->ibc_connvars->cv_arprc = arprc; if (arprc == ibat_stat_ok) conn->ibc_connvars->cv_arp = *arp_data; @@ -2865,18 +3173,18 @@ kibnal_arp_peer (kib_peer_t *peer) cep = cm_create_cep(cm_cep_transp_rc); if (cep == NULL) { - CERROR ("Can't create cep for conn->"LPX64"\n", - peer->ibp_nid); - kibnal_peer_connect_failed(peer, 1); + CERROR ("Can't create cep for conn->%s\n", + libcfs_nid2str(peer->ibp_nid)); + kibnal_peer_connect_failed(peer, 1, -ENOMEM); return; } conn = kibnal_create_conn(cep); if (conn == NULL) { - CERROR ("Can't allocate conn->"LPX64"\n", - peer->ibp_nid); + CERROR ("Can't allocate conn->%s\n", + libcfs_nid2str(peer->ibp_nid)); cm_destroy_cep(cep); - kibnal_peer_connect_failed(peer, 1); + kibnal_peer_connect_failed(peer, 1, -ENOMEM); return; } @@ -2912,39 +3220,41 @@ kibnal_arp_peer (kib_peer_t *peer) } int -kibnal_conn_timed_out (kib_conn_t *conn) +kibnal_check_txs (kib_conn_t *conn, struct list_head *txs) { kib_tx_t *tx; struct list_head *ttmp; + int timed_out = 0; spin_lock(&conn->ibc_lock); - list_for_each (ttmp, &conn->ibc_tx_queue) { + list_for_each (ttmp, txs) { tx = list_entry (ttmp, kib_tx_t, tx_list); - LASSERT (tx->tx_queued); - - if (time_after_eq (jiffies, tx->tx_deadline)) { - spin_unlock(&conn->ibc_lock); - return 1; + if (txs == &conn->ibc_active_txs) { + LASSERT (!tx->tx_queued); + LASSERT (tx->tx_waiting || tx->tx_sending != 0); + } else { + LASSERT (tx->tx_queued); } - } - - list_for_each (ttmp, &conn->ibc_active_txs) { - tx = list_entry (ttmp, kib_tx_t, tx_list); - - LASSERT (!tx->tx_queued); - LASSERT (tx->tx_waiting || - tx->tx_sending != 0); if (time_after_eq (jiffies, tx->tx_deadline)) { - spin_unlock(&conn->ibc_lock); - return 1; + timed_out = 1; + break; } } spin_unlock(&conn->ibc_lock); - return 0; + return timed_out; +} + +int +kibnal_conn_timed_out (kib_conn_t *conn) +{ + return kibnal_check_txs(conn, &conn->ibc_tx_queue) || + kibnal_check_txs(conn, &conn->ibc_tx_queue_rsrvd) || + kibnal_check_txs(conn, &conn->ibc_tx_queue_nocred) || + kibnal_check_txs(conn, &conn->ibc_active_txs); } void @@ -2985,11 +3295,11 @@ kibnal_check_conns (int idx) kibnal_conn_addref(conn); /* 1 ref for me... 
*/ - read_unlock_irqrestore(&kibnal_data.kib_global_lock, + read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - CERROR("Timed out RDMA with "LPX64"\n", - peer->ibp_nid); + CERROR("Timed out RDMA with %s\n", + libcfs_nid2str(peer->ibp_nid)); kibnal_close_conn (conn, -ETIMEDOUT); kibnal_conn_decref(conn); /* ...until here */ @@ -3037,7 +3347,7 @@ kibnal_disconnect_conn (kib_conn_t *conn) write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); cm_cancel(conn->ibc_cep); - kibnal_pause(HZ/10); + cfs_pause(cfs_time_seconds(1)/10); if (!conn->ibc_disconnect) /* CM callback will never happen now */ kibnal_conn_decref(conn); @@ -3062,13 +3372,13 @@ kibnal_connd (void *arg) int peer_index = 0; unsigned long deadline = jiffies; - kportal_daemonize ("kibnal_connd"); - kportal_blockallsigs (); + cfs_daemonize ("kibnal_connd"); + cfs_block_allsigs (); init_waitqueue_entry (&wait, current); kibnal_data.kib_connd = current; - spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); while (!kibnal_data.kib_shutdown) { @@ -3096,7 +3406,7 @@ kibnal_connd (void *arg) dropped_lock = 1; kibnal_recv_connreq(pcr->pcr_cep, &pcr->pcr_cmreq); - PORTAL_FREE(pcr, sizeof(*pcr)); + LIBCFS_FREE(pcr, sizeof(*pcr)); spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); } @@ -3167,9 +3477,9 @@ kibnal_connd (void *arg) * connection within (n+1)/n times the timeout * interval. */ - if (kibnal_tunables.kib_io_timeout > n * p) + if (*kibnal_tunables.kib_timeout > n * p) chunk = (chunk * n * p) / - kibnal_tunables.kib_io_timeout; + *kibnal_tunables.kib_timeout; if (chunk == 0) chunk = 1; @@ -3216,8 +3526,6 @@ kibnal_cq_callback (unsigned long unused_context) { unsigned long flags; - CDEBUG(D_NET, "!!\n"); - spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); kibnal_data.kib_ready = 1; wake_up(&kibnal_data.kib_sched_waitq); @@ -3239,8 +3547,8 @@ kibnal_scheduler(void *arg) int busy_loops = 0; snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); - kportal_daemonize(name); - kportal_blockallsigs(); + cfs_daemonize(name); + cfs_block_allsigs(); init_waitqueue_entry(&wait, current); @@ -3333,8 +3641,8 @@ kibnal_scheduler(void *arg) * I give a scheduler on another CPU a chance * to get the final SEND completion, so the tx * descriptor can get freed as I inspect it. */ - CERROR ("RDMA failed: %d\n", - wc.completion_status); + CDEBUG(D_NETERROR, "RDMA failed: %d\n", + wc.completion_status); break; default: @@ -3348,7 +3656,7 @@ kibnal_scheduler(void *arg) /* Nothing to do; sleep... */ set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kibnal_data.kib_sched_waitq, &wait); + add_wait_queue_exclusive(&kibnal_data.kib_sched_waitq, &wait); spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); @@ -3364,13 +3672,3 @@ kibnal_scheduler(void *arg) kibnal_thread_fini(); return (0); } - - -lib_nal_t kibnal_lib = { - .libnal_data = &kibnal_data, /* NAL private data */ - .libnal_send = kibnal_send, - .libnal_send_pages = kibnal_send_pages, - .libnal_recv = kibnal_recv, - .libnal_recv_pages = kibnal_recv_pages, - .libnal_dist = kibnal_dist -}; diff --git a/lnet/klnds/viblnd/viblnd_modparams.c b/lnet/klnds/viblnd/viblnd_modparams.c new file mode 100644 index 0000000..1179d72 --- /dev/null +++ b/lnet/klnds/viblnd/viblnd_modparams.c @@ -0,0 +1,237 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. 
+ * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "viblnd.h" + +static int service_number = 0x11b9a2; +CFS_MODULE_PARM(service_number, "i", int, 0444, + "IB service number"); + +static int min_reconnect_interval = 1; +CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644, + "minimum connection retry interval (seconds)"); + +static int max_reconnect_interval = 60; +CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644, + "maximum connection retry interval (seconds)"); + +static int concurrent_peers = 1152; +CFS_MODULE_PARM(concurrent_peers, "i", int, 0444, + "maximum number of peers that may connect"); + +static int cksum = 0; +CFS_MODULE_PARM(cksum, "i", int, 0644, + "set non-zero to enable message (not RDMA) checksums"); + +static int timeout = 50; +CFS_MODULE_PARM(timeout, "i", int, 0644, + "timeout (seconds)"); + +static int ntx = 256; +CFS_MODULE_PARM(ntx, "i", int, 0444, + "# of message descriptors"); + +static int credits = 128; +CFS_MODULE_PARM(credits, "i", int, 0444, + "# concurrent sends"); + +static int peer_credits = 8; +CFS_MODULE_PARM(peer_credits, "i", int, 0444, + "# concurrent sends to 1 peer"); + +static int arp_retries = 3; +CFS_MODULE_PARM(arp_retries, "i", int, 0644, + "# of times to retry ARP"); + +static char *hca_basename = "InfiniHost"; +CFS_MODULE_PARM(hca_basename, "s", charp, 0444, + "HCA base name"); + +static char *ipif_basename = "ipoib"; +CFS_MODULE_PARM(ipif_basename, "s", charp, 0444, + "IPoIB interface base name"); + +static int local_ack_timeout = 0x12; +CFS_MODULE_PARM(local_ack_timeout, "i", int, 0644, + "ACK timeout for low-level 'sends'"); + +static int retry_cnt = 7; +CFS_MODULE_PARM(retry_cnt, "i", int, 0644, + "Retransmissions when no ACK received"); + +static int rnr_cnt = 6; +CFS_MODULE_PARM(rnr_cnt, "i", int, 0644, + "RNR retransmissions"); + +static int rnr_nak_timer = 0x10; +CFS_MODULE_PARM(rnr_nak_timer, "i", int, 0644, + "RNR retransmission interval"); + +static int keepalive = 100; +CFS_MODULE_PARM(keepalive, "i", int, 0644, + "Idle time in seconds before sending a keepalive"); + +static int concurrent_sends = IBNAL_RX_MSGS; +CFS_MODULE_PARM(concurrent_sends, "i", int, 0644, + "send work-queue sizing"); + +#if IBNAL_USE_FMR +static int fmr_remaps = 1000; +CFS_MODULE_PARM(fmr_remaps, "i", int, 0444, + "FMR mappings allowed before unmap"); +#endif + +kib_tunables_t kibnal_tunables = { + .kib_service_number = &service_number, + .kib_min_reconnect_interval = &min_reconnect_interval, + .kib_max_reconnect_interval = &max_reconnect_interval, + .kib_concurrent_peers = &concurrent_peers, + .kib_cksum = &cksum, + .kib_timeout = &timeout, + .kib_ntx = &ntx, + .kib_credits = &credits, + .kib_peercredits = &peer_credits, + .kib_arp_retries = &arp_retries, + .kib_hca_basename = &hca_basename, + .kib_ipif_basename = &ipif_basename, + .kib_local_ack_timeout = &local_ack_timeout, 
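+ /* Every field in this table is a pointer to the corresponding module
+ * parameter above, so the module options, the sysctl entries below and
+ * any code that dereferences *kibnal_tunables.kib_xxx all share a
+ * single storage location; updating one updates them all. */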
+ .kib_retry_cnt = &retry_cnt, + .kib_rnr_cnt = &rnr_cnt, + .kib_rnr_nak_timer = &rnr_nak_timer, + .kib_keepalive = &keepalive, + .kib_concurrent_sends = &concurrent_sends, +#if IBNAL_USE_FMR + .kib_fmr_remaps = &fmr_remaps, +#endif +}; + +#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM + +static char hca_basename_space[32]; +static char ipif_basename_space[32]; + +static ctl_table kibnal_ctl_table[] = { + {1, "service_number", &service_number, + sizeof(int), 0444, NULL, &proc_dointvec}, + {2, "min_reconnect_interval", &min_reconnect_interval, + sizeof(int), 0644, NULL, &proc_dointvec}, + {3, "max_reconnect_interval", &max_reconnect_interval, + sizeof(int), 0644, NULL, &proc_dointvec}, + {4, "concurrent_peers", &concurrent_peers, + sizeof(int), 0444, NULL, &proc_dointvec}, + {5, "cksum", &cksum, + sizeof(int), 0644, NULL, &proc_dointvec}, + {6, "timeout", &timeout, + sizeof(int), 0644, NULL, &proc_dointvec}, + {7, "ntx", &ntx, + sizeof(int), 0444, NULL, &proc_dointvec}, + {8, "credits", &credits, + sizeof(int), 0444, NULL, &proc_dointvec}, + {9, "peer_credits", &peer_credits, + sizeof(int), 0444, NULL, &proc_dointvec}, + {10, "arp_retries", &arp_retries, + sizeof(int), 0644, NULL, &proc_dointvec}, + {11, "hca_basename", hca_basename_space, + sizeof(hca_basename_space), 0444, NULL, &proc_dostring}, + {12, "ipif_basename", ipif_basename_space, + sizeof(ipif_basename_space), 0444, NULL, &proc_dostring}, + {13, "local_ack_timeout", &local_ack_timeout, + sizeof(int), 0644, NULL, &proc_dointvec}, + {14, "retry_cnt", &retry_cnt, + sizeof(int), 0644, NULL, &proc_dointvec}, + {15, "rnr_cnt", &rnr_cnt, + sizeof(int), 0644, NULL, &proc_dointvec}, + {16, "rnr_nak_timer", &rnr_nak_timer, + sizeof(int), 0644, NULL, &proc_dointvec}, + {17, "keepalive", &keepalive, + sizeof(int), 0644, NULL, &proc_dointvec}, + {18, "concurrent_sends", &concurrent_sends, + sizeof(int), 0644, NULL, &proc_dointvec}, +#if IBNAL_USE_FMR + {19, "fmr_remaps", &fmr_remaps, + sizeof(int), 0444, NULL, &proc_dointvec}, +#endif + {0} +}; + +static ctl_table kibnal_top_ctl_table[] = { + {203, "vibnal", NULL, 0, 0555, kibnal_ctl_table}, + {0} +}; + +void +kibnal_initstrtunable(char *space, char *str, int size) +{ + strncpy(space, str, size); + space[size-1] = 0; +} + +int +kibnal_tunables_init () +{ + kibnal_initstrtunable(hca_basename_space, hca_basename, + sizeof(hca_basename_space)); + kibnal_initstrtunable(ipif_basename_space, ipif_basename, + sizeof(ipif_basename_space)); + + kibnal_tunables.kib_sysctl = + register_sysctl_table(kibnal_top_ctl_table, 0); + + if (kibnal_tunables.kib_sysctl == NULL) + CWARN("Can't setup /proc tunables\n"); + + if (*kibnal_tunables.kib_concurrent_sends > IBNAL_RX_MSGS) + *kibnal_tunables.kib_concurrent_sends = IBNAL_RX_MSGS; + if (*kibnal_tunables.kib_concurrent_sends < IBNAL_MSG_QUEUE_SIZE) + *kibnal_tunables.kib_concurrent_sends = IBNAL_MSG_QUEUE_SIZE; + + return 0; +} + +void +kibnal_tunables_fini () +{ + if (kibnal_tunables.kib_sysctl != NULL) + unregister_sysctl_table(kibnal_tunables.kib_sysctl); +} + +#else + +int +kibnal_tunables_init () +{ + return 0; +} + +void +kibnal_tunables_fini () +{ +} + +#endif + + + + + + diff --git a/lnet/klnds/viblnd/viblnd_wire.h b/lnet/klnds/viblnd/viblnd_wire.h index 6dacf6d..26242c18 100644 --- a/lnet/klnds/viblnd/viblnd_wire.h +++ b/lnet/klnds/viblnd/viblnd_wire.h @@ -12,7 +12,7 @@ typedef struct kib_connparams typedef struct { - ptl_hdr_t ibim_hdr; /* portals header */ + lnet_hdr_t ibim_hdr; /* portals header */ char ibim_payload[0]; /* piggy-backed payload */ } 
WIRE_ATTR kib_immediate_msg_t;
@@ -48,7 +48,7 @@ typedef struct
typedef struct
{
- ptl_hdr_t ibprm_hdr; /* portals header */
+ lnet_hdr_t ibprm_hdr; /* portals header */
__u64 ibprm_cookie; /* opaque completion cookie */
} WIRE_ATTR kib_putreq_msg_t;
@@ -61,7 +61,7 @@ typedef struct
typedef struct
{
- ptl_hdr_t ibgm_hdr; /* portals header */
+ lnet_hdr_t ibgm_hdr; /* portals header */
__u64 ibgm_cookie; /* opaque completion cookie */
kib_rdma_desc_t ibgm_rd; /* rdma descriptor */
} WIRE_ATTR kib_get_msg_t;
@@ -98,13 +98,11 @@ typedef struct
} WIRE_ATTR ibm_u;
} WIRE_ATTR kib_msg_t;
-#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */
+#define IBNAL_MSG_MAGIC LNET_PROTO_VIB_MAGIC /* unique magic */
-#if IBNAL_USE_FMA /* ensure version changes on FMA */
-#define IBNAL_MSG_VERSION 0x11
-#else
-#define IBNAL_MSG_VERSION 0x10
-#endif
+#define IBNAL_MSG_VERSION_RDMAREPLYNOTRSRVD 0x10 /* previous version */
+
+#define IBNAL_MSG_VERSION 0x11 /* current version */
#define IBNAL_MSG_CONNREQ 0xc0 /* connection request */
#define IBNAL_MSG_CONNACK 0xc1 /* connection acknowledge */
@@ -116,3 +114,8 @@ typedef struct
#define IBNAL_MSG_PUT_DONE 0xd5 /* completion (src->sink) */
#define IBNAL_MSG_GET_REQ 0xd6 /* getreq (sink->src) */
#define IBNAL_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */
+
+/* connection rejection reasons */
+#define IBNAL_REJECT_CONN_RACE 0 /* You lost connection race */
+#define IBNAL_REJECT_NO_RESOURCES 1 /* Out of memory/conns etc */
+#define IBNAL_REJECT_FATAL 2 /* Anything else */
diff --git a/lnet/klnds/viblnd/wirecheck.c b/lnet/klnds/viblnd/wirecheck.c
index d42171d..5a0e060 100644
--- a/lnet/klnds/viblnd/wirecheck.c
+++ b/lnet/klnds/viblnd/wirecheck.c
@@ -5,11 +5,19 @@
#include
#include
#include
-#include
-#include
+
+#include
+
+/* This ghastly hack allows me to include lib-types.h. It doesn't affect any
+ * assertions generated here (but fails safe if it ever does) */
+typedef struct {
+ int counter;
+} atomic_t;
+
+#include
#define IBNAL_USE_FMR 1
-#include "vibnal_wire.h"
+#include "viblnd_wire.h"
#ifndef HAVE_STRNLEN
#define strnlen(s, i) strlen(s)
@@ -146,6 +154,10 @@ main (int argc, char **argv)
CHECK_DEFINE (IBNAL_MSG_GET_REQ);
CHECK_DEFINE (IBNAL_MSG_GET_DONE);
+ CHECK_DEFINE (IBNAL_REJECT_CONN_RACE);
+ CHECK_DEFINE (IBNAL_REJECT_NO_RESOURCES);
+ CHECK_DEFINE (IBNAL_REJECT_FATAL);
+
CHECK_STRUCT (kib_connparams_t);
CHECK_MEMBER (kib_connparams_t, ibcp_queue_depth);
CHECK_MEMBER (kib_connparams_t, ibcp_max_msg_size);
diff --git a/lnet/libcfs/Info.plist b/lnet/libcfs/Info.plist
index 7e3cc08..aaf9b2f 100644
--- a/lnet/libcfs/Info.plist
+++ b/lnet/libcfs/Info.plist
@@ -22,12 +22,14 @@
1.0.0
OSBundleLibraries
- com.apple.kernel.bsd
- 1.1
- com.apple.kernel.iokit
- 1.0.0b1
- com.apple.kernel.mach
- 1.0.0b1
+ com.apple.kpi.bsd
+ 8.0.0b1
+ com.apple.kpi.libkern
+ 8.0.0b1
+ com.apple.kpi.mach
+ 8.0.0b1
+ com.apple.kpi.unsupported
+ 8.0.0b1
diff --git a/lnet/libcfs/Makefile.in b/lnet/libcfs/Makefile.in
index aaaad93..0940a56 100644
--- a/lnet/libcfs/Makefile.in
+++ b/lnet/libcfs/Makefile.in
@@ -2,7 +2,7 @@ MODULES = libcfs
libcfs-linux-objs := linux-tracefile.o linux-debug.o
libcfs-linux-objs += linux-prim.o linux-mem.o
-libcfs-linux-objs += linux-fs.o linux-sync.o
+libcfs-linux-objs += linux-fs.o linux-sync.o linux-tcpip.o
libcfs-linux-objs += linux-lwt.o linux-proc.o linux-curproc.o
libcfs-linux-objs += linux-utils.o linux-module.o
@@ -24,10 +24,10 @@ sources:
endif
-libcfs-all-objs := debug.o lwt.o module.o tracefile.o watchdog.o
+libcfs-all-objs := debug.o
nidstrings.o lwt.o module.o tracefile.o watchdog.o libcfs-objs := $(libcfs-linux-objs) $(libcfs-all-objs) -EXTRA_PRE_CFLAGS := -I@LUSTRE@/../portals/libcfs +EXTRA_PRE_CFLAGS := -I@LUSTRE@/../lnet/libcfs @INCLUDE_RULES@ diff --git a/lnet/libcfs/autoMakefile.am b/lnet/libcfs/autoMakefile.am index a818ab8..18381c1 100644 --- a/lnet/libcfs/autoMakefile.am +++ b/lnet/libcfs/autoMakefile.am @@ -9,6 +9,13 @@ SUBDIRS += darwin endif DIST_SUBDIRS := $(SUBDIRS) +if LIBLUSTRE +noinst_LIBRARIES= libcfs.a +libcfs_a_SOURCES= debug.c user-prim.c user-lock.c +libcfs_a_CPPFLAGS = $(LLCPPFLAGS) +libcfs_a_CFLAGS = $(LLCFLAGS) +endif + if MODULES if LINUX @@ -18,12 +25,12 @@ endif if DARWIN macos_PROGRAMS := libcfs -nodist_libcfs_SOURCES := debug.c module.c tracefile.c \ - darwin/darwin-debug.c darwin/darwin-fs.c darwin/darwin-mem.c \ - darwin/darwin-module.c darwin/darwin-prim.c \ - darwin/darwin-proc.c darwin/darwin-tracefile.c \ - darwin/darwin-utils.c darwin/darwin-sync.c \ - darwin/darwin-curproc.c user-prim.c user-lock.c +nodist_libcfs_SOURCES := darwin/darwin-sync.c darwin/darwin-mem.c \ + darwin/darwin-prim.c darwin/darwin-fs.c darwin/darwin-curproc.c \ + darwin/darwin-tcpip.c darwin/darwin-utils.c \ + darwin/darwin-debug.c darwin/darwin-proc.c \ + darwin/darwin-tracefile.c darwin/darwin-module.c \ + debug.c module.c tracefile.c nidstrings.c watchdog.c libcfs_CFLAGS := $(EXTRA_KCFLAGS) libcfs_LDFLAGS := $(EXTRA_KLDFLAGS) @@ -41,6 +48,5 @@ install-data-hook: $(install_data_hook) EXTRA_DIST := Info.plist -MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ linux-*.c -MOSTLYCLEANFILES += linux/*.o darwin/*.o libcfs -DIST_SOURCES := $(libcfs-all-objs:%.o=%.c) tracefile.h +MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ linux-*.c linux/*.o darwin/*.o libcfs +DIST_SOURCES := $(libcfs-all-objs:%.o=%.c) tracefile.h user-prim.c user-lock.c diff --git a/lnet/libcfs/darwin/Makefile.am b/lnet/libcfs/darwin/Makefile.am index 8e77294..3f2077b 100644 --- a/lnet/libcfs/darwin/Makefile.am +++ b/lnet/libcfs/darwin/Makefile.am @@ -8,4 +8,5 @@ EXTRA_DIST := \ darwin-fs.c \ darwin-prim.c \ darwin-tracefile.c \ - darwin-curproc.c + darwin-curproc.c \ + darwin-tcpip.c diff --git a/lnet/libcfs/darwin/darwin-curproc.c b/lnet/libcfs/darwin/darwin-curproc.c index d930051..e12394e 100644 --- a/lnet/libcfs/darwin/darwin-curproc.c +++ b/lnet/libcfs/darwin/darwin-curproc.c @@ -18,19 +18,23 @@ * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET #include #include /* - * Implementation of cfs_curproc API (see portals/include/libcfs/curproc.h) + * Implementation of cfs_curproc API (see lnet/include/libcfs/curproc.h) * for XNU kernel. 
*/
static inline struct ucred *curproc_ucred(void)
{
+#ifdef __DARWIN8__
+ return proc_ucred(current_proc());
+#else
return current_proc()->p_cred->pc_ucred;
+#endif
}
uid_t cfs_curproc_uid(void)
@@ -46,17 +50,30 @@ gid_t cfs_curproc_gid(void)
uid_t cfs_curproc_fsuid(void)
{
+#ifdef __DARWIN8__
+ return curproc_ucred()->cr_ruid;
+#else
return current_proc()->p_cred->p_ruid;
+#endif
}
gid_t cfs_curproc_fsgid(void)
{
+#ifdef __DARWIN8__
+ return curproc_ucred()->cr_rgid;
+#else
return current_proc()->p_cred->p_rgid;
+#endif
}
pid_t cfs_curproc_pid(void)
{
+#ifdef __DARWIN8__
+ /* no pid for each thread, return address of thread struct */
+ return (pid_t)current_thread();
+#else
return current_proc()->p_pid;
+#endif
}
int cfs_curproc_groups_nr(void)
@@ -94,17 +111,40 @@ void cfs_curproc_groups_dump(gid_t *array, int size)
mode_t cfs_curproc_umask(void)
{
+#ifdef __DARWIN8__
+ /*
+ * XXX Liang:
+ *
+ * fd_cmask is not available in kexts, so we just assume
+ * everything is permitted.
+ */
+ return -1;
+#else
return current_proc()->p_fd->fd_cmask;
+#endif
}
char *cfs_curproc_comm(void)
{
+#ifdef __DARWIN8__
+ /*
+ * Writing to proc->p_comm is not permitted in Darwin8;
+ * proc_selfname() only returns a copy of proc->p_comm,
+ * so this function does not really work if the caller
+ * tries to change the comm of the current process.
+ */
+ static char pcomm[MAXCOMLEN+1];
+
+ proc_selfname(pcomm, MAXCOMLEN+1);
+ return pcomm;
+#else
return current_proc()->p_comm;
+#endif
}
cfs_kernel_cap_t cfs_curproc_cap_get(void)
{
- return 0;
+ return -1;
}
void cfs_curproc_cap_set(cfs_kernel_cap_t cap)
diff --git a/lnet/libcfs/darwin/darwin-debug.c b/lnet/libcfs/darwin/darwin-debug.c
index 970c5b9..2152d40 100644
--- a/lnet/libcfs/darwin/darwin-debug.c
+++ b/lnet/libcfs/darwin/darwin-debug.c
@@ -1,25 +1,77 @@
-# define DEBUG_SUBSYSTEM S_PORTALS
+# define DEBUG_SUBSYSTEM S_LNET
#include
#include
#include "tracefile.h"
-void portals_debug_dumpstack(cfs_task_t *tsk)
+void libcfs_debug_dumpstack(cfs_task_t *tsk)
{
return;
}
-cfs_task_t *portals_current(void)
-{
- return cfs_current();
+void libcfs_run_lbug_upcall(char *file, const char *fn, const int line)
+{
+}
+
+void lbug_with_loc(char *file, const char *func, const int line)
+{
+ libcfs_catastrophe = 1;
+ CEMERG("LBUG: pid: %u thread: %#x\n",
+ (unsigned)cfs_curproc_pid(), (unsigned)current_thread());
+ libcfs_debug_dumplog();
+ libcfs_run_lbug_upcall(file, func, line);
+ while (1)
+ cfs_schedule();
+
+ /* panic("lbug_with_loc(%s, %s, %d)", file, func, line) */
}
-int portals_arch_debug_init(unsigned long bufsize)
+#if ENTRY_NESTING_SUPPORT
+
+static inline struct cfs_debug_data *__current_cdd(void)
{
- return 0;
+ struct cfs_debug_data *cdd;
+
+ cdd = (struct cfs_debug_data *)current_uthread()->uu_nlminfo;
+ if (cdd != NULL &&
+ cdd->magic1 == CDD_MAGIC1 && cdd->magic2 == CDD_MAGIC2 &&
+ cdd->nesting_level < 1000)
+ return cdd;
+ else
+ return NULL;
}
-int portals_arch_debug_cleanup(void)
+static inline void __current_cdd_set(struct cfs_debug_data *cdd)
{
- return 0;
+ current_uthread()->uu_nlminfo = (void *)cdd;
+}
+
+void __entry_nesting(struct cfs_debug_data *child)
+{
+ struct cfs_debug_data *parent;
+
+ parent = __current_cdd();
+ if (parent != NULL) {
+ child->parent = parent;
+ child->nesting_level = parent->nesting_level + 1;
+ }
+ __current_cdd_set(child);
+}
+
+void __exit_nesting(struct cfs_debug_data *child)
+{
+ __current_cdd_set(child->parent);
+}
+
+unsigned int __current_nesting_level(void)
+{
+ struct cfs_debug_data *cdd;
+
+ cdd = __current_cdd();
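+
+ /* __current_cdd() returns NULL unless this uthread carries a
+ * cfs_debug_data with valid magics, so a thread that never called
+ * __entry_nesting() simply reports nesting level 0 */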
+ if (cdd != NULL)
+ return cdd->nesting_level;
+ else
+ return 0;
}
+/* ENTRY_NESTING_SUPPORT */
+#endif
diff --git a/lnet/libcfs/darwin/darwin-fs.c b/lnet/libcfs/darwin/darwin-fs.c
index 5b0f44c..45f37df 100644
--- a/lnet/libcfs/darwin/darwin-fs.c
+++ b/lnet/libcfs/darwin/darwin-fs.c
@@ -27,13 +27,12 @@
#include
#include
#include
-#include
#include
#include
#include
#include
-#define DEBUG_SUBSYSTEM S_PORTALS
+#define DEBUG_SUBSYSTEM S_LNET
#include
#include
@@ -43,8 +42,144 @@
*
* Public functions
*/
+
+#ifdef __DARWIN8__
+#include
+
+extern int vn_rdwr(enum uio_rw, vnode_t, caddr_t, int, off_t, enum uio_seg, int, kauth_cred_t, int *, proc_t);
+
+/* vnode_size() is not exported */
+static errno_t
+vnode_size(vnode_t vp, off_t *sizep, vfs_context_t ctx)
+{
+ struct vnode_attr va;
+ int error;
+
+ VATTR_INIT(&va);
+ VATTR_WANTED(&va, va_data_size);
+ error = vnode_getattr(vp, &va, ctx);
+ if (!error)
+ *sizep = va.va_data_size;
+ return(error);
+}
+
+/*
+ * XXX Liang:
+ *
+ * kern_file_*() are not safe for multiple threads now;
+ * however, we need them only for tracefiled, so it's
+ * not so important to implement MT support yet.
+ */
+int
+kern_file_size(struct cfs_kern_file *fp, off_t *psize)
+{
+ int error;
+ off_t size;
+
+ error = vnode_size(fp->f_vp, &size, fp->f_ctxt);
+ if (error)
+ return error;
+
+ if (psize)
+ *psize = size;
+ return 0;
+}
+
+struct cfs_kern_file *
+kern_file_open(const char * filename, int uflags, int mode, int *err)
+{
+ struct cfs_kern_file *fp;
+ vnode_t vp;
+ int error;
+
+ fp = (struct cfs_kern_file *)_MALLOC(sizeof(struct cfs_kern_file), M_TEMP, M_WAITOK);
+ if (fp == NULL) {
+ if (err != NULL)
+ *err = -ENOMEM;
+ return NULL;
+ }
+ fp->f_flags = FFLAGS(uflags);
+ fp->f_ctxt = vfs_context_create(NULL);
+
+ if ((error = vnode_open(filename, fp->f_flags,
+ mode, 0, &vp, fp->f_ctxt))){
+ if (err != NULL)
+ *err = -error;
+ _FREE(fp, M_TEMP);
+ } else {
+ if (err != NULL)
+ *err = 0;
+ fp->f_vp = vp;
+ }
+
+ return fp;
+}
+
+int
+kern_file_close(struct cfs_kern_file *fp)
+{
+ vnode_close(fp->f_vp, fp->f_flags, fp->f_ctxt);
+ vfs_context_rele(fp->f_ctxt);
+ _FREE(fp, M_TEMP);
+
+ return 0;
+}
+
+int
+kern_file_read(struct cfs_kern_file *fp, void *buf, size_t nbytes, loff_t *pos)
+{
+ struct proc *p = current_proc();
+ int resid;
+ int error;
+
+ assert(buf != NULL);
+ assert(fp != NULL && fp->f_vp != NULL);
+
+ error = vn_rdwr(UIO_READ, fp->f_vp, buf, nbytes, *pos,
+ UIO_SYSSPACE32, 0, vfs_context_ucred(fp->f_ctxt), &resid, p);
+ if ((error) || (nbytes == resid)) {
+ if (!error)
+ error = -EINVAL;
+ return error;
+ }
+ *pos += nbytes - resid;
+
+ return (int)(nbytes - resid);
+}
+
int
-filp_node_size(struct file *fp, off_t *size)
+kern_file_write(struct cfs_kern_file *fp, void *buf, size_t nbytes, loff_t *pos)
+{
+ struct proc *p = current_proc();
+ int resid;
+ int error;
+
+ assert(buf != NULL);
+ assert(fp != NULL && fp->f_vp != NULL);
+
+ error = vn_rdwr(UIO_WRITE, fp->f_vp, buf, nbytes, *pos,
+ UIO_SYSSPACE32, 0, vfs_context_ucred(fp->f_ctxt), &resid, p);
+ if ((error) || (nbytes == resid)) {
+ if (!error)
+ error = -EINVAL;
+ return error;
+ }
+ *pos += nbytes - resid;
+
+ return (int)(nbytes - resid);
+
+}
+
+int
+kern_file_sync (struct cfs_kern_file *fp)
+{
+ return VNOP_FSYNC(fp->f_vp, MNT_WAIT, fp->f_ctxt);
+}
+
+#else /* !__DARWIN8__ */
+
+int
+kern_file_size(struct file *fp, off_t *size)
{
struct vnode *vp = (struct vnode *)fp->f_data;
struct stat sb;
@@ -60,12 +195,11 @@ filp_node_size(struct file *fp, off_t *size)
}
cfs_file_t *
-filp_open(const char *
filename, int flags, int mode, int *err) +kern_file_open(const char * filename, int flags, int mode, int *err) { struct nameidata nd; - register cfs_file_t *fp; + cfs_file_t *fp; register struct vnode *vp; - cfs_file_t *nfp; int rc; extern struct fileops vnops; extern int nfiles; @@ -73,16 +207,16 @@ filp_open(const char * filename, int flags, int mode, int *err) CFS_CONE_IN; nfiles++; - MALLOC_ZONE(nfp, cfs_file_t *, sizeof(cfs_file_t), M_FILE, M_WAITOK|M_ZERO); - bzero(nfp, sizeof(cfs_file_t)); - nfp->f_count = 1; - fp = nfp; + MALLOC_ZONE(fp, cfs_file_t *, sizeof(cfs_file_t), M_FILE, M_WAITOK|M_ZERO); + bzero(fp, sizeof(cfs_file_t)); + fp->f_count = 1; + LIST_CIRCLE(fp, f_list); NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, (char *)filename, current_proc()); if ((rc = vn_open(&nd, flags, mode)) != 0){ printf("filp_open failed at (%d)\n", rc); if (err != NULL) *err = rc; - ffree(fp); + FREE_ZONE(fp, sizeof *fp, M_FILE); CFS_CONE_EX; return NULL; } @@ -117,7 +251,7 @@ frele_internal(cfs_file_t *fp) } int -filp_close (cfs_file_t *fp) +kern_file_close (cfs_file_t *fp) { struct vnode *vp; CFS_DECL_CONE_DATA; @@ -159,21 +293,28 @@ extern void bwillwrite(void); * Write buffer to filp inside kernel */ int -filp_write (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos) +kern_file_write (cfs_file_t *fp, void *buf, size_t nbyte, loff_t *pos) { struct uio auio; struct iovec aiov; struct proc *p = current_proc(); long cnt, error = 0; + int flags = 0; CFS_DECL_CONE_DATA; aiov.iov_base = (void *)(uintptr_t)buf; aiov.iov_len = nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; - if (pos != NULL) + if (pos != NULL) { auio.uio_offset = *pos; - else + /* + * Liang: If we don't set FOF_OFFSET, vn_write() + * will use fp->f_offset as the real offset. + * Same in vn_read(). + */ + flags |= FOF_OFFSET; + } else auio.uio_offset = (off_t)-1; if (nbyte > INT_MAX) return (EINVAL); @@ -186,7 +327,7 @@ filp_write (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos) CFS_CONE_IN; if (fp->f_type == DTYPE_VNODE) bwillwrite(); /* empty stuff now */ - if ((error = fo_write(fp, &auio, fp->f_cred, 0, p))) { + if ((error = fo_write(fp, &auio, fp->f_cred, flags, p))) { if (auio.uio_resid != cnt && (error == ERESTART ||\ error == EINTR || error == EWOULDBLOCK)) error = 0; @@ -200,7 +341,7 @@ filp_write (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos) else cnt -= auio.uio_resid; if (pos != NULL) - *pos = auio.uio_offset; + *pos += cnt; return cnt; } @@ -208,21 +349,23 @@ filp_write (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos) * Read from filp inside kernel */ int -filp_read (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos) +kern_file_read (cfs_file_t *fp, void *buf, size_t nbyte, loff_t *pos) { struct uio auio; struct iovec aiov; struct proc *p = current_proc(); long cnt, error = 0; + int flags = 0; CFS_DECL_CONE_DATA; aiov.iov_base = (caddr_t)buf; aiov.iov_len = nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; - if (pos != NULL) + if (pos != NULL) { auio.uio_offset = *pos; - else + flags |= FOF_OFFSET; + } else auio.uio_offset = (off_t)-1; if (nbyte > INT_MAX) return (EINVAL); @@ -233,7 +376,7 @@ filp_read (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos) cnt = nbyte; CFS_CONE_IN; - if ((error = fo_read(fp, &auio, fp->f_cred, 0, p)) != 0) { + if ((error = fo_read(fp, &auio, fp->f_cred, flags, p)) != 0) { if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; @@ -244,13 +387,13 @@ filp_read (cfs_file_t *fp, void *buf, size_t nbyte, off_t *pos) else cnt -= 
auio.uio_resid; if (pos != NULL) - *pos = auio.uio_offset; + *pos += cnt; return cnt; } int -filp_fsync (cfs_file_t *fp) +kern_file_sync (cfs_file_t *fp) { struct vnode *vp = (struct vnode *)fp->f_data; struct proc *p = current_proc(); @@ -271,60 +414,53 @@ filp_fsync (cfs_file_t *fp) return error; } -int -ref_file(cfs_file_t *fp) +#endif /* !__DARWIN8__ */ + +cfs_rdev_t cfs_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor) { - CFS_DECL_CONE_DATA; + return makedev(major, minor); +} - CFS_CONE_IN; - fref(fp); - CFS_CONE_EX; - return 0; +cfs_major_nr_t cfs_rdev_major(cfs_rdev_t rdev) +{ + return major(rdev); } -int -rele_file(cfs_file_t *fp) +cfs_minor_nr_t cfs_rdev_minor(cfs_rdev_t rdev) { - CFS_DECL_CONE_DATA; + return minor(rdev); +} - CFS_CONE_IN; - frele(fp); - CFS_CONE_EX; - return 0; +struct posix_acl *posix_acl_alloc(int count, int flags) +{ + static struct posix_acl acl; + return &acl; } /* - * Private functions + * XXX Liang: I've not converted all of them, + * more is needed? */ -void vrele_safe(struct vnode *nd) -{ - CFS_DECL_CONE_DATA; - - CFS_CONE_IN; - vrele(nd); - CFS_CONE_EX; -} - -int -path_lookup(const char *path, unsigned int flags, struct nameidata *nd) +int cfs_oflags2univ(int flags) { - int ret = 0; - CFS_DECL_CONE_DATA; + int f; - CFS_CONE_IN; - NDINIT(nd, LOOKUP, FOLLOW, UIO_SYSSPACE, (char *)path, current_proc()); - if ((ret = namei(nd)) != 0){ - CERROR("path_lookup fail!\n"); - } - CFS_CONE_EX; - - return ret; + f = flags & O_ACCMODE; + f |= (flags & O_CREAT) ? CFS_O_CREAT: 0; + f |= (flags & O_TRUNC) ? CFS_O_TRUNC: 0; + f |= (flags & O_EXCL) ? CFS_O_EXCL: 0; + f |= (flags & O_NONBLOCK) ? CFS_O_NONBLOCK: 0; + f |= (flags & O_APPEND) ? CFS_O_APPEND: 0; + f |= (flags & O_NOFOLLOW) ? CFS_O_NOFOLLOW: 0; + f |= (flags & O_SYNC)? CFS_O_SYNC: 0; + return f; } -int -file_count(struct file *fp) +/* + * XXX Liang: we don't need it in OSX. + * But it should be implemented anyway. + */ +int cfs_univ2oflags(int flags) { - return fcount(fp); + return flags; } - - diff --git a/lnet/libcfs/darwin/darwin-internal.h b/lnet/libcfs/darwin/darwin-internal.h new file mode 100644 index 0000000..6c83577 --- /dev/null +++ b/lnet/libcfs/darwin/darwin-internal.h @@ -0,0 +1,22 @@ +#ifndef __LIBCFS_DARWIN_INTERNAL_H__ +#define __LIBCFS_DARWIN_INTERNAL_H__ + +#include +#include +#include +#include +#include + +int cfs_sysctl_isvalid(void); +struct sysctl_oid *cfs_alloc_sysctl_node(struct sysctl_oid_list *parent, int nbr, int access, + const char *name, int (*handler) SYSCTL_HANDLER_ARGS); +struct sysctl_oid *cfs_alloc_sysctl_int(struct sysctl_oid_list *parent, int n, + const char *name, int *ptr, int val); +struct sysctl_oid * cfs_alloc_sysctl_long(struct sysctl_oid_list *parent, int nbr, int access, + const char *name, int *ptr, int val); +struct sysctl_oid * cfs_alloc_sysctl_string(struct sysctl_oid_list *parent, int nbr, int access, + const char *name, char *ptr, int len); +struct sysctl_oid * cfs_alloc_sysctl_struct(struct sysctl_oid_list *parent, int nbr, int access, + const char *name, void *ptr, int size); + +#endif diff --git a/lnet/libcfs/darwin/darwin-mem.c b/lnet/libcfs/darwin/darwin-mem.c index 4cf16d7..3079a56 100644 --- a/lnet/libcfs/darwin/darwin-mem.c +++ b/lnet/libcfs/darwin/darwin-mem.c @@ -2,7 +2,8 @@ * vim:expandtab:shiftwidth=8:tabstop=8: * * Copyright (C) 2002 Cluster File Systems, Inc. - * Author: Phil Schwan + * Author: Liang Zhen + * Nikita Danilov * * This file is part of Lustre, http://www.lustre.org. 
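The cfs_oflags2univ() translation above maps native O_* open flags onto portable CFS_O_* bits one test at a time. The same idea, table-driven, as a user-space sketch (the CFS_O_-style values below are invented for illustration; the real constants live in the libcfs headers):

#include <fcntl.h>
#include <stdio.h>

/* Hypothetical portable values; the real CFS_O_* constants differ. */
#define CFS_O_CREAT     0x0100
#define CFS_O_TRUNC     0x0200
#define CFS_O_EXCL      0x0400

static const struct { int native; int univ; } o_map[] = {
        { O_CREAT, CFS_O_CREAT },
        { O_TRUNC, CFS_O_TRUNC },
        { O_EXCL,  CFS_O_EXCL  },
};

static int oflags2univ(int flags)
{
        int f = flags & O_ACCMODE;      /* access-mode bits pass through */
        unsigned int i;

        for (i = 0; i < sizeof(o_map) / sizeof(o_map[0]); i++)
                if (flags & o_map[i].native)
                        f |= o_map[i].univ;
        return f;
}

int main(void)
{
        printf("%#x\n", (unsigned)oflags2univ(O_WRONLY | O_CREAT | O_TRUNC));
        return 0;
}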
* @@ -22,78 +23,45 @@ * Darwin porting library * Make things easy to port */ -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include #include #include +#include "darwin-internal.h" -/* - * Definition of struct zone, copied from osfmk/kern/zalloc.h. - */ -struct zone_hack { - int count; /* Number of elements used now */ - vm_offset_t free_elements; - vm_size_t cur_size; /* current memory utilization */ - vm_size_t max_size; /* how large can this zone grow */ - vm_size_t elem_size; /* size of an element */ - vm_size_t alloc_size; /* size used for more memory */ - char *zone_name; /* a name for the zone */ - unsigned int - /* boolean_t */ exhaustible :1, /* (F) merely return if empty? */ - /* boolean_t */ collectable :1, /* (F) garbage collect empty pages */ - /* boolean_t */ expandable :1, /* (T) expand zone (with message)? */ - /* boolean_t */ allows_foreign :1,/* (F) allow non-zalloc space */ - /* boolean_t */ doing_alloc :1, /* is zone expanding now? */ - /* boolean_t */ waiting :1, /* is thread waiting for expansion? */ - /* boolean_t */ async_pending :1; /* asynchronous allocation pending? */ - struct zone_hack * next_zone; /* Link for all-zones list */ - /* - * more fields follow, but we don't need them. We only need - * offset from the beginning of struct zone to ->next_zone - * field: it allows us to scan the list of all zones. - */ +#if CFS_INDIVIDUAL_ZONE +extern zone_t zinit( vm_size_t, vm_size_t, vm_size_t, const char *); +extern void * zalloc(zone_t zone); +extern void *zalloc_noblock(zone_t zone); +extern void zfree(zone_t zone, void *addr); + +struct cfs_zone_nob { + struct list_head *z_nob; /* Pointer to z_link */ + struct list_head z_link; /* Do NOT access it directly */ }; -decl_simple_lock_data(extern, all_zones_lock) +static struct cfs_zone_nob cfs_zone_nob; +static spinlock_t cfs_zone_guard; -/* - * returns true iff zone with name @name already exists. - * - * XXX nikita: this function is defined in this file only because there is no - * better place to put it in. - */ -zone_t cfs_find_zone(const char *name) +cfs_mem_cache_t *mem_cache_find(const char *name, size_t objsize) { - struct zone_hack *scan; + cfs_mem_cache_t *walker = NULL; - /* from osfmk/kern/zalloc.c */ - extern zone_t first_zone; + LASSERT(cfs_zone_nob.z_nob != NULL); - LASSERT(name != NULL); + spin_lock(&cfs_zone_guard); + list_for_each_entry(walker, cfs_zone_nob.z_nob, mc_link) { + if (!strcmp(walker->mc_name, name) && \ + walker->mc_size == objsize) + break; + } + spin_unlock(&cfs_zone_guard); - simple_lock(&all_zones_lock); - for (scan = (struct zone_hack *)first_zone; - scan != NULL; scan = scan->next_zone) { - if (!strcmp(scan->zone_name, name)) - break; - } - simple_unlock(&all_zones_lock); - return((zone_t)scan); + return walker; } /* @@ -103,59 +71,120 @@ zone_t cfs_find_zone(const char *name) * survives kext unloading, so that @name cannot be just static string * embedded into kext image. 
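mem_cache_find() above reuses an existing zone only when both the cache name and the object size match, scanning a shared list under cfs_zone_guard. A user-space sketch of that lookup, with a pthread mutex in place of the spin-lock and a plain singly-linked list for the z_nob chain (names illustrative):

#include <pthread.h>
#include <stddef.h>
#include <string.h>

struct mem_cache {
        struct mem_cache *next;
        size_t            size;
        char              name[64];
};

static struct mem_cache *cache_list;            /* analogue of z_nob */
static pthread_mutex_t   cache_guard = PTHREAD_MUTEX_INITIALIZER;

/* Reuse a cache only when both the name and the object size match. */
static struct mem_cache *mem_cache_find(const char *name, size_t objsize)
{
        struct mem_cache *mc;

        pthread_mutex_lock(&cache_guard);
        for (mc = cache_list; mc != NULL; mc = mc->next)
                if (strcmp(mc->name, name) == 0 && mc->size == objsize)
                        break;
        pthread_mutex_unlock(&cache_guard);
        return mc;
}

int main(void)
{
        static struct mem_cache raw = { NULL, 4096, "raw-page" };

        cache_list = &raw;
        return mem_cache_find("raw-page", 4096) == &raw ? 0 : 1;
}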
*/ -zone_t cfs_zinit(vm_size_t size, vm_size_t max, int alloc, const char *name) +cfs_mem_cache_t *mem_cache_create(vm_size_t objsize, const char *name) { + cfs_mem_cache_t *mc = NULL; char *cname; + MALLOC(mc, cfs_mem_cache_t *, sizeof(cfs_mem_cache_t), M_TEMP, M_WAITOK|M_ZERO); + if (mc == NULL){ + CERROR("cfs_mem_cache creation failed!\n"); + return NULL; + } + cname = _MALLOC(strlen(name) + 1, M_TEMP, M_WAITOK); LASSERT(cname != NULL); - return zinit(size, max, alloc, strcpy(cname, name)); + mc->mc_cache = zinit(objsize, (KMEM_MAX_ZONE * objsize), 0, strcpy(cname, name)); + mc->mc_size = objsize; + CFS_INIT_LIST_HEAD(&mc->mc_link); + strncpy(mc->mc_name, name, 1 + strlen(name)); + return mc; +} + +void mem_cache_destroy(cfs_mem_cache_t *mc) +{ + /* + * a zone can NOT be destroyed after creation, + * so just keep it in the list. + * + * We will not lose a zone after we unload + * libcfs; it can be found from libcfs.zone + */ + return; } +#define mem_cache_alloc(mc) zalloc((mc)->mc_cache) +#ifdef __DARWIN8__ +# define mem_cache_alloc_nb(mc) zalloc((mc)->mc_cache) +#else +/* XXX Liang: Tiger doesn't export zalloc_noblock() */ +# define mem_cache_alloc_nb(mc) zalloc_noblock((mc)->mc_cache) +#endif +#define mem_cache_free(mc, p) zfree((mc)->mc_cache, p) + +#else /* !CFS_INDIVIDUAL_ZONE */ + cfs_mem_cache_t * -cfs_mem_cache_create (const char *name, size_t objsize, size_t off, unsigned long arg1, - void (*arg2)(void *, cfs_mem_cache_t *, unsigned long), - void (*arg3)(void *, cfs_mem_cache_t *, unsigned long)) +mem_cache_find(const char *name, size_t objsize) +{ + return NULL; +} + +cfs_mem_cache_t *mem_cache_create(vm_size_t size, const char *name) { - cfs_mem_cache_t *new = NULL; + cfs_mem_cache_t *mc = NULL; - MALLOC(new, cfs_mem_cache_t *, objsize, M_TEMP, M_WAITOK|M_ZERO); - if (new == NULL){ + MALLOC(mc, cfs_mem_cache_t *, sizeof(cfs_mem_cache_t), M_TEMP, M_WAITOK|M_ZERO); + if (mc == NULL){ CERROR("cfs_mem_cache created fail!\n"); return NULL; } - new->size = objsize; - CFS_INIT_LIST_HEAD(&new->link); - strncpy(new->name, name, 1 + strlen(name)); - new->zone = cfs_find_zone(name); - if (new->zone == NULL) { - new->zone = cfs_zinit (objsize, KMEM_MAX_ZONE * objsize, 0, name); - if (new->zone == NULL) { - CERROR("zone create fault!\n"); - FREE (new, M_TEMP); - return NULL; - } - } - return new; + mc->mc_cache = OSMalloc_Tagalloc(name, OSMT_DEFAULT); + mc->mc_size = size; + return mc; } -int -cfs_mem_cache_destroy (cfs_mem_cache_t *cachep) +void mem_cache_destroy(cfs_mem_cache_t *mc) { - FREE (cachep, M_TEMP); - return 0; + OSMalloc_Tagfree(mc->mc_cache); + FREE(mc, M_TEMP); } -void * -cfs_mem_cache_alloc (cfs_mem_cache_t *cachep, int flags) +#define mem_cache_alloc(mc) OSMalloc((mc)->mc_size, (mc)->mc_cache) +#define mem_cache_alloc_nb(mc) OSMalloc_noblock((mc)->mc_size, (mc)->mc_cache) +#define mem_cache_free(mc, p) OSFree(p, (mc)->mc_size, (mc)->mc_cache) + +#endif /* !CFS_INDIVIDUAL_ZONE */ + +cfs_mem_cache_t * +cfs_mem_cache_create (const char *name, + size_t objsize, size_t off, unsigned long arg1) +{ + cfs_mem_cache_t *mc; + + mc = mem_cache_find(name, objsize); + if (mc) + return mc; + mc = mem_cache_create(objsize, name); + return mc; +} + +int cfs_mem_cache_destroy (cfs_mem_cache_t *cachep) { - return (void *)zalloc(cachep->zone); + mem_cache_destroy(cachep); + return 0; } -void -cfs_mem_cache_free (cfs_mem_cache_t *cachep, void *objp) +void *cfs_mem_cache_alloc (cfs_mem_cache_t *cachep, int flags) { - zfree (cachep->zone, (vm_address_t)objp); + void *result; + + /* zalloc_canblock() 
is not exported... Emulate it. */ + if (flags & CFS_ALLOC_ATOMIC) { + result = (void *)mem_cache_alloc_nb(cachep); + } else { + LASSERT(get_preemption_level() == 0); + result = (void *)mem_cache_alloc(cachep); + } + if (result != NULL && (flags & CFS_ALLOC_ZERO)) + memset(result, 0, cachep->mc_size); + + return result; +} + +void cfs_mem_cache_free (cfs_mem_cache_t *cachep, void *objp) +{ + mem_cache_free(cachep, objp); } /* --------------------------------------------------------------------------- @@ -167,38 +196,15 @@ cfs_mem_cache_free (cfs_mem_cache_t *cachep, void *objp) * "Raw" pages */ -extern vm_map_t zone_map; -static inline vm_map_t page_map(struct xnu_raw_page *pg) -{ - LASSERT(pg != NULL); - - return pg->order == 0 ? zone_map : kernel_map; -} - -static int raw_page_init(struct xnu_raw_page *pg) -{ - vm_size_t size = (1UL << pg->order) * PAGE_SIZE; - int upl_flags = UPL_SET_INTERNAL | - UPL_SET_LITE | UPL_SET_IO_WIRE | UPL_COPYOUT_FROM; - int kr = 0; - - /* XXX is it necessary? */ - kr = vm_map_get_upl(page_map(pg), - pg->virtual, &size, &pg->upl, 0, 0, &upl_flags, 0); - return kr; -} - -static void raw_page_done(struct xnu_raw_page *pg) -{ - ubc_upl_abort(pg->upl, UPL_ABORT_FREE_ON_EMPTY); - return; -} +static unsigned int raw_pages = 0; +static cfs_mem_cache_t *raw_page_cache = NULL; static struct xnu_page_ops raw_page_ops; static struct xnu_page_ops *page_ops[XNU_PAGE_NTYPES] = { [XNU_PAGE_RAW] = &raw_page_ops }; +#if defined(LIBCFS_DEBUG) static int page_type_is_valid(cfs_page_t *page) { LASSERT(page != NULL); @@ -209,6 +215,7 @@ static int page_is_raw(cfs_page_t *page) { return page->type == XNU_PAGE_RAW; } +#endif static struct xnu_raw_page *as_raw(cfs_page_t *page) { @@ -236,120 +243,83 @@ static struct xnu_page_ops raw_page_ops = { .page_address = raw_page_address }; +extern int get_preemption_level(void); -extern vm_size_t kalloc_max; -extern vm_size_t kalloc_max_prerounded; -extern int first_k_zone; -extern struct zone *k_zone[16]; -extern vm_offset_t zalloc_canblock( register zone_t, boolean_t ); -extern vm_map_t zone_map; - -static inline vm_address_t -page_zone_alloc(int flags, int order) -{ - register int zindex; - register vm_size_t allocsize; - vm_size_t size = (1UL << order) * PAGE_SIZE; - vm_address_t addr; - kern_return_t kr; - - assert(order >= 0); - if (size > PAGE_SIZE){ - /* XXX Liang: - * zalloc_canblock() call kernel_memory_allocate to allocate - * pages, kernel_memory_allocate cannot guarantee contig pages! - * So any request bigger then PAGE_SIZE should not call zalloc() - * - * NB. kmem_alloc_contig could be very slow!!!! 
Anyway, I dont - * know what will happen if order >= 1 :-( - * */ - CDEBUG(D_MALLOC, "Allocate contig pages!\n"); - kr = kmem_alloc_contig(kernel_map, &addr, size, 0, 0); - if (kr) - return 0; - return addr; - } - allocsize = KALLOC_MINSIZE; - zindex = first_k_zone; - while (allocsize < size) { - allocsize <<= 1; - zindex++; - } - assert(allocsize < kalloc_max); - if (flags & M_NOWAIT != 0) - addr = zalloc_canblock(k_zone[zindex], FALSE); - else - addr = zalloc_canblock(k_zone[zindex], TRUE); - return addr; -} +struct list_head page_death_row; +spinlock_t page_death_row_phylax; -/* Allocate a "page", actually upl of darwin */ -struct xnu_raw_page *alloc_raw_pages(u_int32_t flags, u_int32_t order) +static void raw_page_finish(struct xnu_raw_page *pg) { - kern_return_t kr; - vm_size_t size = (1UL << order) * PAGE_SIZE; - u_int32_t mflags = 0; - struct xnu_raw_page *pg; - - if (flags & CFS_ALLOC_ATOMIC != 0) - mflags |= M_NOWAIT; - else - mflags |= M_WAITOK; - if (flags & CFS_ALLOC_ZERO != 0) - mflags |= M_ZERO; + -- raw_pages; + if (pg->virtual != NULL) + cfs_mem_cache_free(raw_page_cache, pg->virtual); + cfs_free(pg); +} - MALLOC (pg, struct xnu_raw_page *, sizeof *pg, M_TEMP, mflags); - if (pg == NULL) - return NULL; - pg->header.type = XNU_PAGE_RAW; - pg->order = order; - cfs_set_page_count(&pg->header, 1); - pg->virtual = page_zone_alloc(flags, order); - if (!pg->virtual) - /* - * XXX nikita: Liang, shouldn't pg be freed here? - */ - return NULL; +void raw_page_death_row_clean(void) +{ + struct xnu_raw_page *pg; - kr = raw_page_init(pg); - if (kr != 0) { - size = (1UL << order) * PAGE_SIZE; - kmem_free(page_map(pg), pg->virtual, size); - return NULL; - } - return pg; + spin_lock(&page_death_row_phylax); + while (!list_empty(&page_death_row)) { + pg = container_of(page_death_row.next, + struct xnu_raw_page, link); + list_del(&pg->link); + spin_unlock(&page_death_row_phylax); + raw_page_finish(pg); + spin_lock(&page_death_row_phylax); + } + spin_unlock(&page_death_row_phylax); } /* Free a "page" */ -void free_raw_pages(struct xnu_raw_page *pg, u_int32_t order) +void free_raw_page(struct xnu_raw_page *pg) { - vm_size_t size = (1UL << order) * PAGE_SIZE; - if (!atomic_dec_and_test(&pg->count)) return; - raw_page_done(pg); - kmem_free(page_map(pg), pg->virtual, size); - FREE(pg, M_TEMP); -} - -cfs_page_t *cfs_alloc_pages(u_int32_t flags, u_int32_t order) -{ - return &alloc_raw_pages(flags, order)->header; + /* + * kmem_free()->vm_map_remove()->vm_map_delete()->lock_write() may + * block. (raw_page_done()->upl_abort() can block too) On the other + * hand, cfs_free_page() may be called in non-blockable context. To + * work around this, park pages on global list when cannot block. + */ + if (get_preemption_level() > 0) { + spin_lock(&page_death_row_phylax); + list_add(&pg->link, &page_death_row); + spin_unlock(&page_death_row_phylax); + } else { + raw_page_finish(pg); + raw_page_death_row_clean(); + } } cfs_page_t *cfs_alloc_page(u_int32_t flags) { - return cfs_alloc_pages(flags, 0); -} - -void cfs_free_pages(cfs_page_t *pages, int order) -{ - free_raw_pages(as_raw(pages), order); + struct xnu_raw_page *page; + + /* + * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. 
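The death-row logic above exists because finishing a page free may block, while cfs_free_page() can be called from a context that must not; such pages are parked on a global list and reaped later from a blockable context. A condensed user-space sketch of the pattern, with a mutex in place of the spin-lock and a stubbed cannot_block() standing in for get_preemption_level() > 0:

#include <pthread.h>
#include <stdlib.h>

struct page {
        struct page *next;
        void        *virtual;
};

static struct page    *death_row;
static pthread_mutex_t death_row_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for get_preemption_level() > 0; always blockable here. */
static int cannot_block(void) { return 0; }

static void page_finish(struct page *pg)        /* may block */
{
        free(pg->virtual);
        free(pg);
}

static void death_row_clean(void)
{
        struct page *pg;

        pthread_mutex_lock(&death_row_lock);
        while ((pg = death_row) != NULL) {
                death_row = pg->next;
                /* drop the lock across the potentially blocking free */
                pthread_mutex_unlock(&death_row_lock);
                page_finish(pg);
                pthread_mutex_lock(&death_row_lock);
        }
        pthread_mutex_unlock(&death_row_lock);
}

static void free_page(struct page *pg)
{
        if (cannot_block()) {
                /* park the page; a blockable caller frees it later */
                pthread_mutex_lock(&death_row_lock);
                pg->next = death_row;
                death_row = pg;
                pthread_mutex_unlock(&death_row_lock);
        } else {
                page_finish(pg);
                death_row_clean();
        }
}

int main(void)
{
        struct page *pg = calloc(1, sizeof(*pg));

        pg->virtual = malloc(4096);
        free_page(pg);
        return 0;
}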
+ */ + + page = cfs_alloc(sizeof *page, flags); + if (page != NULL) { + page->virtual = cfs_mem_cache_alloc(raw_page_cache, flags); + if (page->virtual != NULL) { + ++ raw_pages; + page->header.type = XNU_PAGE_RAW; + atomic_set(&page->count, 1); + } else { + cfs_free(page); + page = NULL; + } + } + return page != NULL ? &page->header : NULL; } -void cfs_free_page(cfs_page_t *page) +void cfs_free_page(cfs_page_t *pages) { - cfs_free_pages(page, 0); + free_raw_page(as_raw(pages)); } void cfs_get_page(cfs_page_t *p) @@ -367,17 +337,16 @@ int cfs_page_count(cfs_page_t *p) return atomic_read(&as_raw(p)->count); } -void cfs_set_page_count(cfs_page_t *p, int v) -{ - atomic_set(&as_raw(p)->count, v); -} - /* * Generic page operations */ void *cfs_page_address(cfs_page_t *pg) { + /* + * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ LASSERT(page_type_is_valid(pg)); return page_ops[pg->type]->page_address(pg); } @@ -425,14 +394,14 @@ void *cfs_alloc(size_t nr_bytes, u_int32_t flags) int mflags; mflags = 0; - if (flags & CFS_ALLOC_ATOMIC != 0) { - mflags |= 0 /* M_NOWAIT */; + if (flags & CFS_ALLOC_ATOMIC) { + mflags |= M_NOWAIT; } else { LASSERT(get_preemption_level() == 0); mflags |= M_WAITOK; } - if (flags & CFS_ALLOC_ZERO != 0) + if (flags & CFS_ALLOC_ZERO) mflags |= M_ZERO; return _MALLOC(nr_bytes, M_TEMP, mflags); @@ -451,5 +420,61 @@ void *cfs_alloc_large(size_t nr_bytes) void cfs_free_large(void *addr) { + LASSERT(get_preemption_level() == 0); return _FREE(addr, M_TEMP); } + +/* + * Lookup cfs_zone_nob by sysctl.zone, if it cannot be + * found (first load of * libcfs since boot), allocate + * sysctl libcfs.zone. + */ +int cfs_mem_init(void) +{ +#if CFS_INDIVIDUAL_ZONE + int rc; + size_t len; + + len = sizeof(struct cfs_zone_nob); + rc = sysctlbyname("libcfs.zone", + (void *)&cfs_zone_nob, &len, NULL, 0); + if (rc == ENOENT) { + /* zone_nob is not register in libcfs_sysctl */ + struct cfs_zone_nob *nob; + struct sysctl_oid *oid; + + assert(cfs_sysctl_isvalid()); + + nob = _MALLOC(sizeof(struct cfs_zone_nob), + M_TEMP, M_WAITOK | M_ZERO); + CFS_INIT_LIST_HEAD(&nob->z_link); + nob->z_nob = &nob->z_link; + oid = cfs_alloc_sysctl_struct(NULL, OID_AUTO, CTLFLAG_RD | CTLFLAG_KERN, + "zone", nob, sizeof(struct cfs_zone_nob)); + if (oid == NULL) { + _FREE(nob, M_TEMP); + return -ENOMEM; + } + sysctl_register_oid(oid); + + cfs_zone_nob.z_nob = nob->z_nob; + } + spin_lock_init(&cfs_zone_guard); +#endif + CFS_INIT_LIST_HEAD(&page_death_row); + spin_lock_init(&page_death_row_phylax); + raw_page_cache = cfs_mem_cache_create("raw-page", CFS_PAGE_SIZE, 0, 0); + return 0; +} + +void cfs_mem_fini(void) +{ + raw_page_death_row_clean(); + spin_lock_done(&page_death_row_phylax); + cfs_mem_cache_destroy(raw_page_cache); + +#if CFS_INDIVIDUAL_ZONE + cfs_zone_nob.z_nob = NULL; + spin_lock_done(&cfs_zone_guard); +#endif +} diff --git a/lnet/libcfs/darwin/darwin-module.c b/lnet/libcfs/darwin/darwin-module.c index 4f858624..10cb7d8 100644 --- a/lnet/libcfs/darwin/darwin-module.c +++ b/lnet/libcfs/darwin/darwin-module.c @@ -4,156 +4,188 @@ #include #include -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET #include #include -int portal_ioctl_getdata(char *buf, char *end, void *arg) +int libcfs_ioctl_getdata(char *buf, char *end, void *arg) { - struct portal_ioctl_hdr *hdr; - struct portal_ioctl_data *data; + struct libcfs_ioctl_hdr *hdr; + struct libcfs_ioctl_data *data; int err = 0; ENTRY; - hdr = (struct portal_ioctl_hdr *)buf; 
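Note the cfs_alloc() hunk above: the old tests of the form 'flags & CFS_ALLOC_ATOMIC != 0' were operator-precedence bugs, since '!=' binds tighter than '&', which is why the patch drops the '!= 0'. A small stand-alone illustration of the corrected flag handling (plain malloc stands in for _MALLOC, and the flag values are invented):

#include <stdlib.h>
#include <string.h>

#define ALLOC_ATOMIC 0x1        /* caller cannot block */
#define ALLOC_ZERO   0x2        /* zero the memory */

static void *alloc_sketch(size_t nr_bytes, unsigned int flags)
{
        void *p = malloc(nr_bytes);

        /*
         * '(flags & ALLOC_ZERO)' is the correct test;
         * 'flags & ALLOC_ZERO != 0' parses as 'flags & (ALLOC_ZERO != 0)'
         * and would test bit 0 instead.
         */
        if (p != NULL && (flags & ALLOC_ZERO))
                memset(p, 0, nr_bytes);
        return p;
}

int main(void)
{
        free(alloc_sketch(16, ALLOC_ZERO));
        return 0;
}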
- data = (struct portal_ioctl_data *)buf; - /* portals_ioctl_data has been copied in by ioctl of osx */ - memcpy(buf, arg, sizeof(struct portal_ioctl_data)); + hdr = (struct libcfs_ioctl_hdr *)buf; + data = (struct libcfs_ioctl_data *)buf; + /* libcfs_ioctl_data has been copied in by ioctl of osx */ + memcpy(buf, arg, sizeof(struct libcfs_ioctl_data)); - if (hdr->ioc_version != PORTAL_IOCTL_VERSION) { - CERROR("PORTALS: version mismatch kernel vs application\n"); + if (hdr->ioc_version != LIBCFS_IOCTL_VERSION) { + CERROR("LIBCFS: version mismatch kernel vs application\n"); RETURN(-EINVAL); } if (hdr->ioc_len + buf >= end) { - CERROR("PORTALS: user buffer exceeds kernel buffer\n"); + CERROR("LIBCFS: user buffer exceeds kernel buffer\n"); RETURN(-EINVAL); } - if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) { - CERROR("PORTALS: user buffer too small for ioctl\n"); + if (hdr->ioc_len < sizeof(struct libcfs_ioctl_data)) { + CERROR("LIBCFS: user buffer too small for ioctl\n"); RETURN(-EINVAL); } buf += size_round(sizeof(*data)); - if (data->ioc_inllen1) { - err = copy_from_user(buf, data->ioc_inlbuf1, size_round(data->ioc_inllen1)); + if (data->ioc_inllen1) { + err = copy_from_user(buf, data->ioc_inlbuf1, size_round(data->ioc_inllen1)); if (err) RETURN(err); - data->ioc_inlbuf1 = buf; - buf += size_round(data->ioc_inllen1); - } - - if (data->ioc_inllen2) { - copy_from_user(buf, data->ioc_inlbuf2, size_round(data->ioc_inllen2)); + data->ioc_inlbuf1 = buf; + buf += size_round(data->ioc_inllen1); + } + + if (data->ioc_inllen2) { + copy_from_user(buf, data->ioc_inlbuf2, size_round(data->ioc_inllen2)); if (err) RETURN(err); - data->ioc_inlbuf2 = buf; - } + data->ioc_inlbuf2 = buf; + } RETURN(err); } +int libcfs_ioctl_popdata(void *arg, void *data, int size) +{ + /* + * system call will copy out ioctl arg to user space + */ + memcpy(arg, data, size); + return 0; +} + extern struct cfs_psdev_ops libcfs_psdev_ops; -struct portals_device_userstate *mdev_state[16]; +struct libcfs_device_userstate *mdev_state[16]; -static int +static int libcfs_psdev_open(dev_t dev, int flags, int devtype, struct proc *p) -{ - struct portals_device_userstate *mstat = NULL; +{ + struct libcfs_device_userstate *mstat = NULL; int rc = 0; - int devid; - devid = minor(dev); + int devid; + devid = minor(dev); - if (devid > 16) return (-ENXIO); + if (devid > 16) return (ENXIO); if (libcfs_psdev_ops.p_open != NULL) - rc = libcfs_psdev_ops.p_open(0, &mstat); + rc = -libcfs_psdev_ops.p_open(0, &mstat); else - rc = -EPERM; - if (!rc) - return rc; - mdev_state[devid] = mstat; + rc = EPERM; + if (rc == 0) + mdev_state[devid] = mstat; return rc; } -static int +static int libcfs_psdev_close(dev_t dev, int flags, int mode, struct proc *p) { - int devid; - devid = minor(dev); + int devid; + devid = minor(dev); int rc = 0; - if (devid > 16) return (-ENXIO); + if (devid > 16) return (ENXIO); if (libcfs_psdev_ops.p_close != NULL) - rc = libcfs_psdev_ops.p_close(0, mdev_state[devid]); + rc = -libcfs_psdev_ops.p_close(0, mdev_state[devid]); else - rc = -EPERM; - if (rc) - return rc; - mdev_state[devid] = NULL; + rc = EPERM; + if (rc == 0) + mdev_state[devid] = NULL; return rc; } -static int +static int libcfs_ioctl (dev_t dev, u_long cmd, caddr_t arg, int flag, struct proc *p) -{ - int rc = 0; - struct cfs_psdev_file pfile; - int devid; - devid = minor(dev); +{ + int rc = 0; + struct cfs_psdev_file pfile; + int devid; + devid = minor(dev); - if (devid > 16) return (-ENXIO); + if (devid > 16) return (ENXIO); - if (suser(p->p_ucred, 
&p->p_acflag)) - return (-EPERM); + if (!is_suser()) + return (EPERM); pfile.off = 0; pfile.private_data = mdev_state[devid]; - if (libcfs_psdev_ops.p_ioctl != NULL) - rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg); - else - rc = -EPERM; + if (libcfs_psdev_ops.p_ioctl != NULL) + rc = -libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg); + else + rc = EPERM; return rc; } static struct cdevsw libcfs_devsw = -{ - libcfs_psdev_open, /* open */ - libcfs_psdev_close, /* close */ - NULL, /* read */ - NULL, /* write */ - libcfs_ioctl, /* ioctl */ - NULL, /* stop */ - NULL, /* reset */ - NULL, /* tty's */ - NULL, /* select */ - NULL, /* mmap */ - NULL, /* strategy */ - NULL, /* getc */ - NULL, /* putc */ - 0 /* type */ +{ + .d_open = libcfs_psdev_open, + .d_close = libcfs_psdev_close, + .d_read = eno_rdwrt, + .d_write = eno_rdwrt, + .d_ioctl = libcfs_ioctl, + .d_stop = eno_stop, + .d_reset = eno_reset, + .d_ttys = NULL, + .d_select = eno_select, + .d_mmap = eno_mmap, + .d_strategy = eno_strat, + .d_getc = eno_getc, + .d_putc = eno_putc, + .d_type = 0 }; -cfs_psdev_t libcfs_dev = { - -1, - NULL, - "portals", - &libcfs_devsw, +cfs_psdev_t libcfs_dev = { + -1, + NULL, + "lnet", + &libcfs_devsw, NULL }; -void -kportal_daemonize (char *str) +extern spinlock_t trace_cpu_serializer; +extern void cfs_sync_init(void); +extern void cfs_sync_fini(void); +extern int cfs_sysctl_init(void); +extern void cfs_sysctl_fini(void); +extern int cfs_mem_init(void); +extern int cfs_mem_fini(void); +extern void raw_page_death_row_clean(void); +extern void cfs_thread_agent_init(void); +extern void cfs_thread_agent_fini(void); +extern void cfs_symbol_init(void); +extern void cfs_symbol_fini(void); + +int libcfs_arch_init(void) { - printf("Daemonize request: %s.\n", str); - return; + cfs_sync_init(); + cfs_sysctl_init(); + cfs_mem_init(); + cfs_thread_agent_init(); + cfs_symbol_init(); + + spin_lock_init(&trace_cpu_serializer); + + return 0; } -void -kportal_blockallsigs(void) +void libcfs_arch_cleanup(void) { - return; + spin_lock_done(&trace_cpu_serializer); + + cfs_symbol_fini(); + cfs_thread_agent_fini(); + cfs_mem_fini(); + cfs_sysctl_fini(); + cfs_sync_fini(); } + diff --git a/lnet/libcfs/darwin/darwin-prim.c b/lnet/libcfs/darwin/darwin-prim.c index fd2d120..cdcabd9 100644 --- a/lnet/libcfs/darwin/darwin-prim.c +++ b/lnet/libcfs/darwin/darwin-prim.c @@ -22,42 +22,34 @@ * Darwin porting library * Make things easy to port */ -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET #include #include -#include -#include #include #include -#include #include #include #include #include -#include -#include #include #include #include -void *darwin_current_journal_info = NULL; -int darwin_current_cap_effective = -1; - -/* - * cfs pseudo device, actually pseudo char device in darwin +/* + * cfs pseudo device, actually pseudo char device in darwin */ -#define KPORTAL_MAJOR -1 +#define KLNET_MAJOR -1 kern_return_t cfs_psdev_register(cfs_psdev_t *dev) { - dev->index = cdevsw_add(KPORTAL_MAJOR, dev->devsw); + dev->index = cdevsw_add(KLNET_MAJOR, dev->devsw); if (dev->index < 0) { - printf("portal_init: failed to allocate a major number!\n"); + printf("libcfs_init: failed to allocate a major number!\n"); return KERN_FAILURE; } - dev->handle = devfs_make_node(makedev (dev->index, 0), - DEVFS_CHAR, UID_ROOT, + dev->handle = devfs_make_node(makedev (dev->index, 0), + DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666, (char *)dev->name, 0); return KERN_SUCCESS; } @@ -68,11 +60,11 @@ kern_return_t cfs_psdev_deregister(cfs_psdev_t 
*dev) { return KERN_SUCCESS; } -/* - * KPortal symbol register / unregister support +/* + * KPortal symbol register / unregister support */ -static struct rw_semaphore cfs_symbol_lock; -struct list_head cfs_symbol_list; +struct rw_semaphore cfs_symbol_lock; +struct list_head cfs_symbol_list; void * cfs_symbol_get(const char *name) @@ -87,9 +79,9 @@ cfs_symbol_get(const char *name) sym->ref ++; break; } - } + } up_read(&cfs_symbol_lock); - if (sym != NULL) + if (sym != NULL) return sym->value; return NULL; } @@ -108,7 +100,7 @@ cfs_symbol_put(const char *name) LASSERT(sym->ref >= 0); break; } - } + } up_read(&cfs_symbol_lock); LASSERT(sym != NULL); @@ -167,7 +159,14 @@ cfs_symbol_unregister(const char *name) } void -cfs_symbol_clean() +cfs_symbol_init() +{ + CFS_INIT_LIST_HEAD(&cfs_symbol_list); + init_rwsem(&cfs_symbol_lock); +} + +void +cfs_symbol_fini() { struct list_head *walker; struct cfs_symbol *sym = NULL; @@ -180,77 +179,225 @@ cfs_symbol_clean() FREE(sym, M_TEMP); } up_write(&cfs_symbol_lock); + + fini_rwsem(&cfs_symbol_lock); return; } -/* - * Register sysctl table - */ -cfs_sysctl_table_header_t * -register_cfs_sysctl_table (cfs_sysctl_table_t *table, int arg) +struct kernel_thread_arg { - cfs_sysctl_table_t item; - int i = 0; + spinlock_t lock; + atomic_t inuse; + cfs_thread_t func; + void *arg; +}; - while ((item = table[i++]) != NULL) { - sysctl_register_oid(item); - } - return table; -} +struct kernel_thread_arg cfs_thread_arg; + +#define THREAD_ARG_FREE 0 +#define THREAD_ARG_HOLD 1 +#define THREAD_ARG_RECV 2 + +#define set_targ_stat(a, v) atomic_set(&(a)->inuse, v) +#define get_targ_stat(a) atomic_read(&(a)->inuse) /* - * Unregister sysctl table + * Hold the thread argument and set the status of thread_status + * to THREAD_ARG_HOLD, if the thread argument is held by other + * threads (It's THREAD_ARG_HOLD already), current-thread has to wait. */ -void -unregister_cfs_sysctl_table (cfs_sysctl_table_header_t *table) { - int i = 0; - cfs_sysctl_table_t item; +#define thread_arg_hold(pta, _func, _arg) \ + do { \ + spin_lock(&(pta)->lock); \ + if (get_targ_stat(pta) == THREAD_ARG_FREE) { \ + set_targ_stat((pta), THREAD_ARG_HOLD); \ + (pta)->arg = (void *)_arg; \ + (pta)->func = _func; \ + spin_unlock(&(pta)->lock); \ + break; \ + } \ + spin_unlock(&(pta)->lock); \ + cfs_schedule(); \ + } while(1); \ - while ((item = table[i++]) != NULL) { - sysctl_unregister_oid(item); - } - return; -} +/* + * Release the thread argument if the thread argument has been + * received by the child-thread (Status of thread_args is + * THREAD_ARG_RECV), otherwise current-thread has to wait. + * After release, the thread_args' status will be set to + * THREAD_ARG_FREE, and others can re-use the thread_args to + * create new kernel_thread. + */ +#define thread_arg_release(pta) \ + do { \ + spin_lock(&(pta)->lock); \ + if (get_targ_stat(pta) == THREAD_ARG_RECV) { \ + (pta)->arg = NULL; \ + (pta)->func = NULL; \ + set_targ_stat(pta, THREAD_ARG_FREE); \ + spin_unlock(&(pta)->lock); \ + break; \ + } \ + spin_unlock(&(pta)->lock); \ + cfs_schedule(); \ + } while(1) -struct kernel_thread_arg cfs_thread_arg; +/* + * Receive thread argument (Used in child thread), set the status + * of thread_args to THREAD_ARG_RECV. 
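The hold/release/recv macros above implement a three-state hand-off: the parent claims the single argument slot (FREE to HOLD), the newly created agent thread consumes it (HOLD to RECV), and the parent then recycles it (RECV to FREE). A user-space pthread sketch of the same protocol, with a mutex replacing the spin-lock (all names illustrative):

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

enum { ARG_FREE, ARG_HOLD, ARG_RECV };

static pthread_mutex_t guard = PTHREAD_MUTEX_INITIALIZER;
static int    state = ARG_FREE;
static void (*slot_func)(void *);
static void  *slot_arg;

/* Parent side of thread_arg_hold(): publish func/arg once the slot frees. */
static void arg_hold(void (*func)(void *), void *arg)
{
        for (;;) {
                pthread_mutex_lock(&guard);
                if (state == ARG_FREE) {
                        state = ARG_HOLD;
                        slot_func = func;
                        slot_arg = arg;
                        pthread_mutex_unlock(&guard);
                        return;
                }
                pthread_mutex_unlock(&guard);
                sched_yield();
        }
}

/* Parent side of thread_arg_release(): wait until the child consumed it. */
static void arg_release(void)
{
        for (;;) {
                pthread_mutex_lock(&guard);
                if (state == ARG_RECV) {
                        state = ARG_FREE;
                        pthread_mutex_unlock(&guard);
                        return;
                }
                pthread_mutex_unlock(&guard);
                sched_yield();
        }
}

/* Child side of thread_arg_recv(): fetch func/arg, mark slot consumed. */
static void *agent(void *unused)
{
        void (*func)(void *) = NULL;
        void *arg = NULL;
        int done = 0;

        (void)unused;
        while (!done) {
                pthread_mutex_lock(&guard);
                if (state == ARG_HOLD) {
                        func = slot_func;
                        arg = slot_arg;
                        state = ARG_RECV;
                        done = 1;
                }
                pthread_mutex_unlock(&guard);
                if (!done)
                        sched_yield();
        }
        func(arg);
        return NULL;
}

static void hello(void *msg)
{
        printf("%s\n", (const char *)msg);
}

int main(void)
{
        pthread_t th;

        arg_hold(hello, "hello from agent");
        pthread_create(&th, NULL, agent, NULL);
        arg_release();
        pthread_join(th, NULL);
        return 0;
}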
+ */ +#define __thread_arg_recv_fin(pta, _func, _arg, fin) \ + do { \ + spin_lock(&(pta)->lock); \ + if (get_targ_stat(pta) == THREAD_ARG_HOLD) { \ + if (fin) \ + set_targ_stat(pta, THREAD_ARG_RECV);\ + _arg = (pta)->arg; \ + _func = (pta)->func; \ + spin_unlock(&(pta)->lock); \ + break; \ + } \ + spin_unlock(&(pta)->lock); \ + cfs_schedule(); \ + } while (1); \ + +/* + * Just set the thread_args' status to THREAD_ARG_RECV + */ +#define thread_arg_fin(pta) \ + do { \ + spin_lock(&(pta)->lock); \ + assert( get_targ_stat(pta) == THREAD_ARG_HOLD); \ + set_targ_stat(pta, THREAD_ARG_RECV); \ + spin_unlock(&(pta)->lock); \ + } while(0) + +#define thread_arg_recv(pta, f, a) __thread_arg_recv_fin(pta, f, a, 1) +#define thread_arg_keep(pta, f, a) __thread_arg_recv_fin(pta, f, a, 0) void -cfs_thread_agent_init() -{ - set_targ_stat(&cfs_thread_arg, THREAD_ARG_FREE); - spin_lock_init(&cfs_thread_arg.lock); - cfs_thread_arg.arg = NULL; - cfs_thread_arg.func = NULL; +cfs_thread_agent_init(void) +{ + set_targ_stat(&cfs_thread_arg, THREAD_ARG_FREE); + spin_lock_init(&cfs_thread_arg.lock); + cfs_thread_arg.arg = NULL; + cfs_thread_arg.func = NULL; } void -cfs_thread_agent (void) +cfs_thread_agent_fini(void) +{ + assert(get_targ_stat(&cfs_thread_arg) == THREAD_ARG_FREE); + + spin_lock_done(&cfs_thread_arg.lock); +} + +/* + * + * All requests to create kernel thread will create a new + * thread instance of cfs_thread_agent, one by one. + * cfs_thread_agent will call the caller's thread function + * with argument supplied by caller. + */ +void +cfs_thread_agent (void) { cfs_thread_t func = NULL; void *arg = NULL; thread_arg_recv(&cfs_thread_arg, func, arg); - printf("entry of thread agent (func: %08lx).\n", (void *)func); + /* printf("entry of thread agent (func: %08lx).\n", (void *)func); */ assert(func != NULL); func(arg); - printf("thread agent exit. (func: %08lx)\n", (void *)func); - (void) thread_terminate(current_act()); + /* printf("thread agent exit. (func: %08lx)\n", (void *)func); */ + (void) thread_terminate(current_thread()); } +extern thread_t kernel_thread(task_t task, void (*start)(void)); + int cfs_kernel_thread(cfs_thread_t func, void *arg, int flag) -{ - int ret = 0; - thread_t th = NULL; - - thread_arg_hold(&cfs_thread_arg, func, arg); - th = kernel_thread(kernel_task, cfs_thread_agent); - thread_arg_release(&cfs_thread_arg); - if (th == THREAD_NULL) - ret = -1; +{ + int ret = 0; + thread_t th = NULL; + + thread_arg_hold(&cfs_thread_arg, func, arg); + th = kernel_thread(kernel_task, cfs_thread_agent); + thread_arg_release(&cfs_thread_arg); + if (th == THREAD_NULL) + ret = -1; return ret; } +void cfs_daemonize(char *str) +{ + snprintf(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX, "%s", str); + return; +} + +/* + * XXX Liang: kexts cannot access sigmask in Darwin8. + * it's almost impossible for us to get/set signal mask + * without patching kernel. + * Should we provide these functions in xnu? + * + * These signal functions almost do nothing now, we + * need to investigate more about signal in Darwin. 
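As the comment above notes, the signal stubs that follow do almost nothing because a kext cannot reach the proc sigmask on Darwin8. For comparison, the user-space shape of cfs_block_allsigs()/cfs_restore_sigs() is straightforward with POSIX calls; a sketch, not something the kext itself can do:

#include <pthread.h>
#include <signal.h>
#include <stdio.h>

/* Block every signal for the calling thread and return the old mask. */
static sigset_t block_allsigs(void)
{
        sigset_t all, old;

        sigfillset(&all);
        pthread_sigmask(SIG_BLOCK, &all, &old);
        return old;
}

static void restore_sigs(sigset_t old)
{
        pthread_sigmask(SIG_SETMASK, &old, NULL);
}

int main(void)
{
        sigset_t saved = block_allsigs();

        /* ... critical region that must not be interrupted ... */
        restore_sigs(saved);
        puts("signals restored");
        return 0;
}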
+ */ +cfs_sigset_t cfs_get_blockedsigs() +{ + return (cfs_sigset_t)0; +} + +extern int block_procsigmask(struct proc *p, int bit); + +cfs_sigset_t cfs_block_allsigs() +{ + cfs_sigset_t old = 0; +#ifdef __DARWIN8__ +#else + block_procsigmask(current_proc(), -1); +#endif + return old; +} + +cfs_sigset_t cfs_block_sigs(sigset_t bit) +{ + cfs_sigset_t old = 0; +#ifdef __DARWIN8__ +#else + block_procsigmask(current_proc(), bit); +#endif + return old; +} + +void cfs_restore_sigs(cfs_sigset_t old) +{ +} + +int cfs_signal_pending(void) + +{ +#ifdef __DARWIN8__ + extern int thread_issignal(proc_t, thread_t, sigset_t); + return thread_issignal(current_proc(), current_thread(), (sigset_t)-1); +#else + return SHOULDissignal(current_proc(), current_uthread()) +#endif +} + +void cfs_clear_sigpending(void) +{ +#ifdef __DARWIN8__ +#else + clear_procsiglist(current_proc(), -1); +#endif +} + +#ifdef __DARWIN8__ + +#else /* !__DARWIN8__ */ + void lustre_cone_in(boolean_t *state, funnel_t **cone) { *cone = thread_funnel_get(); @@ -284,7 +431,7 @@ void lustre_net_ex(boolean_t state, funnel_t *cone) else if (cone == NULL) (void) thread_funnel_set(network_flock, state); } - +#endif /* !__DARWIN8__ */ void cfs_waitq_init(struct cfs_waitq *waitq) { @@ -297,7 +444,7 @@ void cfs_waitlink_init(struct cfs_waitlink *link) } void cfs_waitq_add(struct cfs_waitq *waitq, struct cfs_waitlink *link) -{ +{ link->wl_waitq = waitq; ksleep_add(&waitq->wq_ksleep_chan, &link->wl_ksleep_link); } @@ -329,6 +476,10 @@ int cfs_waitq_active(struct cfs_waitq *waitq) void cfs_waitq_signal(struct cfs_waitq *waitq) { + /* + * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ ksleep_wake(&waitq->wq_ksleep_chan); } @@ -342,61 +493,89 @@ void cfs_waitq_broadcast(struct cfs_waitq *waitq) ksleep_wake_all(&waitq->wq_ksleep_chan); } -void cfs_waitq_wait(struct cfs_waitlink *link) -{ - ksleep_wait(&link->wl_waitq->wq_ksleep_chan); +void cfs_waitq_wait(struct cfs_waitlink *link, cfs_task_state_t state) +{ + ksleep_wait(&link->wl_waitq->wq_ksleep_chan, state); } -cfs_duration_t cfs_waitq_timedwait(struct cfs_waitlink *link, +cfs_duration_t cfs_waitq_timedwait(struct cfs_waitlink *link, + cfs_task_state_t state, cfs_duration_t timeout) -{ - CDEBUG(D_TRACE, "timeout: %llu\n", (long long unsigned)timeout); - return ksleep_timedwait(&link->chan->c, timeout); +{ + return ksleep_timedwait(&link->wl_waitq->wq_ksleep_chan, + state, timeout); } typedef void (*ktimer_func_t)(void *); void cfs_timer_init(cfs_timer_t *t, void (* func)(unsigned long), void *arg) -{ +{ ktimer_init(&t->t, (ktimer_func_t)func, arg); } void cfs_timer_done(struct cfs_timer *t) -{ +{ ktimer_done(&t->t); } void cfs_timer_arm(struct cfs_timer *t, cfs_time_t deadline) -{ +{ ktimer_arm(&t->t, deadline); } void cfs_timer_disarm(struct cfs_timer *t) -{ +{ ktimer_disarm(&t->t); } int cfs_timer_is_armed(struct cfs_timer *t) -{ +{ return ktimer_is_armed(&t->t); } cfs_time_t cfs_timer_deadline(struct cfs_timer *t) -{ +{ return ktimer_deadline(&t->t); } -int -libcfs_arch_init(void) +void cfs_enter_debugger(void) { - init_rwsem(&cfs_symbol_lock); - CFS_INIT_LIST_HEAD(&cfs_symbol_list); - cfs_thread_agent_init(); - return 0; +#ifdef __DARWIN8__ + extern void Debugger(const char * reason); + Debugger("CFS"); +#else + extern void PE_enter_debugger(char *cause); + PE_enter_debugger("CFS"); +#endif } -void -libcfs_arch_cleanup(void) +int cfs_online_cpus(void) { - cfs_symbol_clean(); -} + int activecpu; + size_t size; + +#ifdef 
__DARWIN8__ + size = sizeof(int); + sysctlbyname("hw.activecpu", &activecpu, &size, NULL, 0); + return activecpu; +#else + host_basic_info_data_t hinfo; + kern_return_t kret; + int count = HOST_BASIC_INFO_COUNT; +#define BSD_HOST 1 + kret = host_info(BSD_HOST, HOST_BASIC_INFO, &hinfo, &count); + if (kret == KERN_SUCCESS) + return (hinfo.avail_cpus); + return(-EINVAL); +#endif +} + +int cfs_ncpus(void) +{ + int ncpu; + size_t size; + size = sizeof(int); + + sysctlbyname("hw.ncpu", &ncpu, &size, NULL, 0); + return ncpu; +} diff --git a/lnet/libcfs/darwin/darwin-proc.c b/lnet/libcfs/darwin/darwin-proc.c index f2b48d5..a38902a 100644 --- a/lnet/libcfs/darwin/darwin-proc.c +++ b/lnet/libcfs/darwin/darwin-proc.c @@ -28,62 +28,246 @@ #include #include -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET + #include -static cfs_sysctl_table_header_t *portals_table_header = NULL; -extern unsigned int portal_debug; -extern char debug_file_path[1024]; -extern unsigned int portal_subsystem_debug; -extern unsigned int portal_printk; -extern unsigned int portals_catastrophe; -extern atomic_t portal_kmemory; +#define LIBCFS_SYSCTL "libcfs" +#define LIBCFS_SYSCTL_SPRITE "sprite" +#define LIBCFS_SYSCTL_MAGIC 0xbabeface + +static struct libcfs_sysctl_sprite { + int ss_magic; + struct sysctl_oid_list *ss_link; +} libcfs_sysctl_sprite = { 0, NULL }; + +static cfs_sysctl_table_header_t *libcfs_table_header = NULL; +extern unsigned int libcfs_debug; +extern unsigned int libcfs_subsystem_debug; +extern unsigned int libcfs_printk; +extern unsigned int libcfs_console_ratelimit; +extern unsigned int libcfs_catastrophe; +extern atomic_t libcfs_kmemory; extern long max_debug_mb; extern int cfs_trace_daemon SYSCTL_HANDLER_ARGS; extern int cfs_debug_mb SYSCTL_HANDLER_ARGS; /* - * sysctl table for portals + * sysctl table for lnet */ -SYSCTL_NODE (, OID_AUTO, portals, CTLFLAG_RW, - 0, "portals sysctl top"); -SYSCTL_INT(_portals, OID_AUTO, debug, - CTLTYPE_INT | CTLFLAG_RW , &portal_debug, +SYSCTL_NODE (, OID_AUTO, lnet, CTLFLAG_RW, + 0, "lnet sysctl top"); + +SYSCTL_INT(_lnet, OID_AUTO, debug, + CTLTYPE_INT | CTLFLAG_RW , &libcfs_debug, 0, "debug"); -SYSCTL_INT(_portals, OID_AUTO, subsystem_debug, - CTLTYPE_INT | CTLFLAG_RW, &portal_subsystem_debug, +SYSCTL_INT(_lnet, OID_AUTO, subsystem_debug, + CTLTYPE_INT | CTLFLAG_RW, &libcfs_subsystem_debug, 0, "subsystem debug"); -SYSCTL_INT(_portals, OID_AUTO, printk, - CTLTYPE_INT | CTLFLAG_RW, &portal_printk, +SYSCTL_INT(_lnet, OID_AUTO, printk, + CTLTYPE_INT | CTLFLAG_RW, &libcfs_printk, 0, "printk"); -SYSCTL_STRING(_portals, OID_AUTO, debug_path, - CTLTYPE_STRING | CTLFLAG_RW, debug_file_path, +SYSCTL_INT(_lnet, OID_AUTO, console_ratelimit, + CTLTYPE_INT | CTLFLAG_RW, &libcfs_console_ratelimit, + 0, "console_ratelimit"); +SYSCTL_STRING(_lnet, OID_AUTO, debug_path, + CTLTYPE_STRING | CTLFLAG_RW, debug_file_path, 1024, "debug path"); -SYSCTL_INT(_portals, OID_AUTO, memused, - CTLTYPE_INT | CTLFLAG_RW, (int *)&portal_kmemory.counter, +SYSCTL_INT(_lnet, OID_AUTO, memused, + CTLTYPE_INT | CTLFLAG_RW, (int *)&libcfs_kmemory.counter, 0, "memused"); -SYSCTL_PROC(_portals, OID_AUTO, trace_daemon, +SYSCTL_INT(_lnet, OID_AUTO, catastrophe, + CTLTYPE_INT | CTLFLAG_RW, (int *)&libcfs_catastrophe, + 0, "catastrophe"); +SYSCTL_PROC(_lnet, OID_AUTO, trace_daemon, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, &cfs_trace_daemon, "A", "trace daemon"); -SYSCTL_PROC(_portals, OID_AUTO, debug_mb, +SYSCTL_PROC(_lnet, OID_AUTO, debug_mb, CTLTYPE_INT | CTLFLAG_RW, &max_debug_mb, 0, &cfs_debug_mb, 
"L", "max debug size"); -#warning "add 'catastrophe' entry for LBUG detection" static cfs_sysctl_table_t top_table[] = { - &sysctl__portals, - &sysctl__portals_debug, - &sysctl__portals_subsystem_debug, - &sysctl__portals_printk, - &sysctl__portals_debug_path, - &sysctl__portals_memused, - &sysctl__portals_trace_daemon, - &sysctl__portals_debug_mb, + &sysctl__lnet, + &sysctl__lnet_debug, + &sysctl__lnet_subsystem_debug, + &sysctl__lnet_printk, + &sysctl__lnet_console_ratelimit, + &sysctl__lnet_debug_path, + &sysctl__lnet_memused, + &sysctl__lnet_catastrophe, + &sysctl__lnet_trace_daemon, + &sysctl__lnet_debug_mb, NULL }; +/* + * Register sysctl table + */ +cfs_sysctl_table_header_t * +cfs_register_sysctl_table (cfs_sysctl_table_t *table, int arg) +{ + cfs_sysctl_table_t item; + int i = 0; + + while ((item = table[i++]) != NULL) + sysctl_register_oid(item); + return table; +} + +/* + * Unregister sysctl table + */ +void +cfs_unregister_sysctl_table (cfs_sysctl_table_header_t *table) { + int i = 0; + cfs_sysctl_table_t item; + + while ((item = table[i++]) != NULL) + sysctl_unregister_oid(item); + return; +} + +/* + * Allocate a sysctl oid. + */ +static struct sysctl_oid * +cfs_alloc_sysctl(struct sysctl_oid_list *parent, int nbr, int access, + const char *name, void *arg1, int arg2, const char *fmt, + int (*handler) SYSCTL_HANDLER_ARGS) +{ + struct sysctl_oid *oid; + char *sname = NULL; + char *sfmt = NULL; + + if (strlen(name) + 1 > CTL_MAXNAME) { + printf("libcfs: sysctl name: %s is too long.\n", name); + return NULL; + } + oid = (struct sysctl_oid*)_MALLOC(sizeof(struct sysctl_oid), + M_TEMP, M_WAITOK | M_ZERO); + if (oid == NULL) + return NULL; + + sname = (char *)_MALLOC(sizeof(CTL_MAXNAME), + M_TEMP, M_WAITOK | M_ZERO); + if (sname == NULL) + goto error; + strcpy(sname, name); + + sfmt = (char *)_MALLOC(4, M_TEMP, M_WAITOK | M_ZERO); + if (sfmt == NULL) + goto error; + strcpy(sfmt, fmt); + + if (parent == NULL) + oid->oid_parent = &sysctl__children; + else + oid->oid_parent = parent; + oid->oid_number = nbr; + oid->oid_kind = access; + oid->oid_name = sname; + oid->oid_handler = handler; + oid->oid_fmt = sfmt; + + if ((access & CTLTYPE) == CTLTYPE_NODE){ + /* It's a sysctl node */ + struct sysctl_oid_list *link; + + link = (struct sysctl_oid_list *)_MALLOC(sizeof(struct sysctl_oid_list), + M_TEMP, M_WAITOK | M_ZERO); + if (link == NULL) + goto error; + oid->oid_arg1 = link; + oid->oid_arg2 = 0; + } else { + oid->oid_arg1 = arg1; + oid->oid_arg2 = arg2; + } + + return oid; +error: + if (sfmt != NULL) + _FREE(sfmt, M_TEMP); + if (sname != NULL) + _FREE(sname, M_TEMP); + if (oid != NULL) + _FREE(oid, M_TEMP); + return NULL; +} + +void cfs_free_sysctl(struct sysctl_oid *oid) +{ + if (oid->oid_name != NULL) + _FREE((void *)oid->oid_name, M_TEMP); + if (oid->oid_fmt != NULL) + _FREE((void *)oid->oid_fmt, M_TEMP); + if ((oid->oid_kind & CTLTYPE_NODE != 0) && oid->oid_arg1) + /* XXX Liang: need to assert the list is empty */ + _FREE(oid->oid_arg1, M_TEMP); + _FREE(oid, M_TEMP); +} + +#define CFS_SYSCTL_ISVALID ((libcfs_sysctl_sprite.ss_magic == LIBCFS_SYSCTL_MAGIC) && \ + (libcfs_sysctl_sprite.ss_link != NULL)) + +int +cfs_sysctl_isvalid(void) +{ + return CFS_SYSCTL_ISVALID; +} + +struct sysctl_oid * +cfs_alloc_sysctl_node(struct sysctl_oid_list *parent, int nbr, int access, + const char *name, int (*handler) SYSCTL_HANDLER_ARGS) +{ + if (parent == NULL && CFS_SYSCTL_ISVALID) + parent = libcfs_sysctl_sprite.ss_link; + return cfs_alloc_sysctl(parent, nbr, CTLTYPE_NODE | access, name, + NULL, 0, 
"N", handler); +} + +struct sysctl_oid * +cfs_alloc_sysctl_int(struct sysctl_oid_list *parent, int nbr, int access, + const char *name, int *ptr, int val) +{ + if (parent == NULL && CFS_SYSCTL_ISVALID) + parent = libcfs_sysctl_sprite.ss_link; + return cfs_alloc_sysctl(parent, nbr, CTLTYPE_INT | access, name, + ptr, val, "I", sysctl_handle_int); +} + +struct sysctl_oid * +cfs_alloc_sysctl_long(struct sysctl_oid_list *parent, int nbr, int access, + const char *name, int *ptr, int val) +{ + if (parent == NULL && CFS_SYSCTL_ISVALID) + parent = libcfs_sysctl_sprite.ss_link; + return cfs_alloc_sysctl(parent, nbr, CTLTYPE_INT | access, name, + ptr, val, "L", sysctl_handle_long); +} + +struct sysctl_oid * +cfs_alloc_sysctl_string(struct sysctl_oid_list *parent, int nbr, int access, + const char *name, char *ptr, int len) +{ + if (parent == NULL && CFS_SYSCTL_ISVALID) + parent = libcfs_sysctl_sprite.ss_link; + return cfs_alloc_sysctl(parent, nbr, CTLTYPE_STRING | access, name, + ptr, len, "A", sysctl_handle_string); +} + +struct sysctl_oid * +cfs_alloc_sysctl_struct(struct sysctl_oid_list *parent, int nbr, int access, + const char *name, void *ptr, int size) +{ + if (parent == NULL && CFS_SYSCTL_ISVALID) + parent = libcfs_sysctl_sprite.ss_link; + return cfs_alloc_sysctl(parent, nbr, CTLTYPE_OPAQUE | access, name, + ptr, size, "S", sysctl_handle_opaque); +} + /* no proc in osx */ cfs_proc_dir_entry_t * cfs_create_proc_entry(char *name, int mod, cfs_proc_dir_entry_t *parent) @@ -111,8 +295,8 @@ int insert_proc(void) { #if 1 - if (!portals_table_header) - portals_table_header = register_cfs_sysctl_table(top_table, 0); + if (!libcfs_table_header) + libcfs_table_header = cfs_register_sysctl_table(top_table, 0); #endif return 0; } @@ -121,11 +305,80 @@ void remove_proc(void) { #if 1 - if (portals_table_header != NULL) - unregister_cfs_sysctl_table(portals_table_header); - portals_table_header = NULL; + if (libcfs_table_header != NULL) + cfs_unregister_sysctl_table(libcfs_table_header); + libcfs_table_header = NULL; #endif return; } +int +cfs_sysctl_init(void) +{ + struct sysctl_oid *oid_root; + struct sysctl_oid *oid_sprite; + struct libcfs_sysctl_sprite *sprite; + size_t len; + int rc; + + len = sizeof(struct libcfs_sysctl_sprite); + rc = sysctlbyname("libcfs.sprite", + (void *)&libcfs_sysctl_sprite, &len, NULL, 0); + if (rc == 0) { + /* + * XXX Liang: assert (rc == 0 || rc == ENOENT) + * + * libcfs.sprite has been registered by previous + * loading of libcfs + */ + if (libcfs_sysctl_sprite.ss_magic != LIBCFS_SYSCTL_MAGIC) { + printf("libcfs: magic number of libcfs.sprite " + "is not right (%lx, %lx)\n", + libcfs_sysctl_sprite.ss_magic, + LIBCFS_SYSCTL_MAGIC); + return -1; + } + assert(libcfs_sysctl_sprite.ss_link != NULL); + printf("libcfs: registered libcfs.sprite found.\n"); + return 0; + } + oid_root = cfs_alloc_sysctl_node(NULL, OID_AUTO, CTLFLAG_RD | CTLFLAG_KERN, + LIBCFS_SYSCTL, 0); + if (oid_root == NULL) + return -1; + sysctl_register_oid(oid_root); + + sprite = (struct libcfs_sysctl_sprite *)_MALLOC(sizeof(struct libcfs_sysctl_sprite), + M_TEMP, M_WAITOK | M_ZERO); + if (sprite == NULL) { + sysctl_unregister_oid(oid_root); + cfs_free_sysctl(oid_root); + return -1; + } + sprite->ss_magic = LIBCFS_SYSCTL_MAGIC; + sprite->ss_link = (struct sysctl_oid_list *)oid_root->oid_arg1; + oid_sprite = cfs_alloc_sysctl_struct((struct sysctl_oid_list *)oid_root->oid_arg1, + OID_AUTO, CTLFLAG_RD | CTLFLAG_KERN, + LIBCFS_SYSCTL_SPRITE, sprite, + sizeof(struct libcfs_sysctl_sprite)); + if (oid_sprite == NULL) 
{ + cfs_free_sysctl(oid_sprite); + sysctl_unregister_oid(oid_root); + cfs_free_sysctl(oid_root); + return -1; + } + sysctl_register_oid(oid_sprite); + + libcfs_sysctl_sprite.ss_magic = sprite->ss_magic; + libcfs_sysctl_sprite.ss_link = sprite->ss_link; + + return 0; +} + +void +cfs_sysctl_fini(void) +{ + libcfs_sysctl_sprite.ss_magic = 0; + libcfs_sysctl_sprite.ss_link = NULL; +} diff --git a/lnet/libcfs/darwin/darwin-sync.c b/lnet/libcfs/darwin/darwin-sync.c index 7ac24f6..dc2af0f 100644 --- a/lnet/libcfs/darwin/darwin-sync.c +++ b/lnet/libcfs/darwin/darwin-sync.c @@ -23,7 +23,7 @@ * * Created by nikita on Sun Jul 18 2004. * - * Prototypes of XNU synchronization primitives. + * XNU synchronization primitives. */ /* @@ -45,11 +45,15 @@ * A lot can be optimized here. */ -#include -#include -#include +#define DEBUG_SUBSYSTEM S_LNET -#define DEBUG_SUBSYSTEM S_PORTALS +#ifdef __DARWIN8__ +# include +#else +# include +# include +# include +#endif #include #include @@ -62,14 +66,35 @@ extern int get_preemption_level(void); #define get_preemption_level() (0) #endif -/* - * Warning: low level portals debugging code (portals_debug_msg(), for - * example), uses spin-locks, so debugging output here may lead to nasty - * surprises. - */ - #if SMP +#ifdef __DARWIN8__ + +static lck_grp_t *cfs_lock_grp = NULL; +#warning "Verify definition of lck_spin_t hasn't been changed while building!" + +/* hw_lock_* are not exported by Darwin8 */ +static inline void xnu_spin_init(xnu_spin_t *s) +{ + SLASSERT(cfs_lock_grp != NULL); + //*s = lck_spin_alloc_init(cfs_lock_grp, LCK_ATTR_NULL); + lck_spin_init((lck_spin_t *)s, cfs_lock_grp, LCK_ATTR_NULL); +} + +static inline void xnu_spin_done(xnu_spin_t *s) +{ + SLASSERT(cfs_lock_grp != NULL); + //lck_spin_free(*s, cfs_lock_grp); + //*s = NULL; + lck_spin_destroy((lck_spin_t *)s, cfs_lock_grp); +} +#define xnu_spin_lock(s) lck_spin_lock((lck_spin_t *)(s)) +#define xnu_spin_unlock(s) lck_spin_unlock((lck_spin_t *)(s)) + +#warning "Darwin8 does not export lck_spin_try_lock" +#define xnu_spin_try(s) (1) + +#else /* DARWIN8 */ extern void hw_lock_init(hw_lock_t); extern void hw_lock_lock(hw_lock_t); extern void hw_lock_unlock(hw_lock_t); @@ -77,10 +102,33 @@ extern unsigned int hw_lock_to(hw_lock_t, unsigned int); extern unsigned int hw_lock_try(hw_lock_t); extern unsigned int hw_lock_held(hw_lock_t); +#define xnu_spin_init(s) hw_lock_init(s) +#define xnu_spin_done(s) do {} while (0) +#define xnu_spin_lock(s) hw_lock_lock(s) +#define xnu_spin_unlock(s) hw_lock_unlock(s) +#define xnu_spin_try(s) hw_lock_try(s) +#endif /* DARWIN8 */ + +#else /* SMP */ +#define xnu_spin_init(s) do {} while (0) +#define xnu_spin_done(s) do {} while (0) +#define xnu_spin_lock(s) do {} while (0) +#define xnu_spin_unlock(s) do {} while (0) +#define xnu_spin_try(s) (1) +#endif /* SMP */ + +/* + * Warning: low level libcfs debugging code (libcfs_debug_msg(), for + * example), uses spin-locks, so debugging output here may lead to nasty + * surprises. + * + * In uniprocessor version of spin-lock. Only checks. 
+ */ + void kspin_init(struct kspin *spin) { SLASSERT(spin != NULL); - hw_lock_init(&spin->lock); + xnu_spin_init(&spin->lock); ON_SYNC_DEBUG(spin->magic = KSPIN_MAGIC); ON_SYNC_DEBUG(spin->owner = NULL); } @@ -90,26 +138,37 @@ void kspin_done(struct kspin *spin) SLASSERT(spin != NULL); SLASSERT(spin->magic == KSPIN_MAGIC); SLASSERT(spin->owner == NULL); + xnu_spin_done(&spin->lock); } void kspin_lock(struct kspin *spin) { SLASSERT(spin != NULL); SLASSERT(spin->magic == KSPIN_MAGIC); - SLASSERT(spin->owner != current_thread); + SLASSERT(spin->owner != current_thread()); + + /* + * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ - hw_lock_lock(&spin->lock); + xnu_spin_lock(&spin->lock); SLASSERT(spin->owner == NULL); - ON_SYNC_DEBUG(spin->owner = current_thread); + ON_SYNC_DEBUG(spin->owner = current_thread()); } void kspin_unlock(struct kspin *spin) { + /* + * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + SLASSERT(spin != NULL); SLASSERT(spin->magic == KSPIN_MAGIC); - SLASSERT(spin->owner == current_thread); + SLASSERT(spin->owner == current_thread()); ON_SYNC_DEBUG(spin->owner = NULL); - hw_lock_unlock(&spin->lock); + xnu_spin_unlock(&spin->lock); } int kspin_trylock(struct kspin *spin) @@ -117,84 +176,133 @@ int kspin_trylock(struct kspin *spin) SLASSERT(spin != NULL); SLASSERT(spin->magic == KSPIN_MAGIC); - if (hw_lock_try(&spin->lock)) { + if (xnu_spin_try(&spin->lock)) { SLASSERT(spin->owner == NULL); - ON_SYNC_DEBUG(spin->owner = current_thread); + ON_SYNC_DEBUG(spin->owner = current_thread()); return 1; } else return 0; } -/* SMP */ -#else - -/* - * uniprocessor version of spin-lock. Only checks. 
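kspin above pairs the raw lock with an owner field kept purely for the ON_SYNC_DEBUG assertions: lock() must not be called by the current holder, and unlock() only by the holder. The same checking can be sketched in user space with a C11 atomic flag (illustrative; the unlocked read in the first assert is a best-effort debug check, as in the original):

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>

struct kspin {
        atomic_flag lock;
        pthread_t   owner;
        int         owned;
};

static void kspin_lock(struct kspin *spin)
{
        /* catch recursive locking by the same thread */
        assert(!(spin->owned && pthread_equal(spin->owner, pthread_self())));
        while (atomic_flag_test_and_set_explicit(&spin->lock,
                                                 memory_order_acquire))
                ;       /* spin */
        spin->owner = pthread_self();
        spin->owned = 1;
}

static void kspin_unlock(struct kspin *spin)
{
        /* only the owner may unlock */
        assert(spin->owned && pthread_equal(spin->owner, pthread_self()));
        spin->owned = 0;
        atomic_flag_clear_explicit(&spin->lock, memory_order_release);
}

int main(void)
{
        struct kspin spin = { .lock = ATOMIC_FLAG_INIT };

        kspin_lock(&spin);
        kspin_unlock(&spin);
        return 0;
}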
- */ - -void kspin_init(struct kspin *spin) +#if XNU_SYNC_DEBUG +int kspin_islocked(struct kspin *spin) { SLASSERT(spin != NULL); - ON_SYNC_DEBUG(spin->magic = KSPIN_MAGIC); - ON_SYNC_DEBUG(spin->owner = NULL); + SLASSERT(spin->magic == KSPIN_MAGIC); + return spin->owner == current_thread(); } -void kspin_done(struct kspin *spin) +int kspin_isnotlocked(struct kspin *spin) { SLASSERT(spin != NULL); SLASSERT(spin->magic == KSPIN_MAGIC); - SLASSERT(spin->owner == NULL); + return spin->owner != current_thread(); } +#endif -void kspin_lock(struct kspin *spin) +/* + * read/write spin-lock + */ +void krw_spin_init(struct krw_spin *rwspin) { - SLASSERT(spin != NULL); - SLASSERT(spin->magic == KSPIN_MAGIC); - SLASSERT(spin->owner == NULL); - ON_SYNC_DEBUG(spin->owner = current_thread); + SLASSERT(rwspin != NULL); + + kspin_init(&rwspin->guard); + rwspin->count = 0; + ON_SYNC_DEBUG(rwspin->magic = KRW_SPIN_MAGIC); } -void kspin_unlock(struct kspin *spin) +void krw_spin_done(struct krw_spin *rwspin) { - SLASSERT(spin != NULL); - SLASSERT(spin->magic == KSPIN_MAGIC); - SLASSERT(spin->owner == current_thread); - ON_SYNC_DEBUG(spin->owner = NULL); + SLASSERT(rwspin != NULL); + SLASSERT(rwspin->magic == KRW_SPIN_MAGIC); + SLASSERT(rwspin->count == 0); + kspin_done(&rwspin->guard); } -int kspin_trylock(struct kspin *spin) +void krw_spin_down_r(struct krw_spin *rwspin) { - SLASSERT(spin != NULL); - SLASSERT(spin->magic == KSPIN_MAGIC); - SLASSERT(spin->owner == NULL); - ON_SYNC_DEBUG(spin->owner = current_thread); - return 1; + int i; + SLASSERT(rwspin != NULL); + SLASSERT(rwspin->magic == KRW_SPIN_MAGIC); + + kspin_lock(&rwspin->guard); + while(rwspin->count < 0) { + i = -1; + kspin_unlock(&rwspin->guard); + while (--i != 0 && rwspin->count < 0) + continue; + kspin_lock(&rwspin->guard); + } + ++ rwspin->count; + kspin_unlock(&rwspin->guard); } -/* SMP */ -#endif +void krw_spin_down_w(struct krw_spin *rwspin) +{ + int i; + SLASSERT(rwspin != NULL); + SLASSERT(rwspin->magic == KRW_SPIN_MAGIC); + + kspin_lock(&rwspin->guard); + while (rwspin->count != 0) { + i = -1; + kspin_unlock(&rwspin->guard); + while (--i != 0 && rwspin->count != 0) + continue; + kspin_lock(&rwspin->guard); + } + rwspin->count = -1; + kspin_unlock(&rwspin->guard); +} -#if XNU_SYNC_DEBUG -int kspin_islocked(struct kspin *spin) +void krw_spin_up_r(struct krw_spin *rwspin) { - SLASSERT(spin != NULL); - SLASSERT(spin->magic == KSPIN_MAGIC); - return spin->owner == current_thread; + SLASSERT(rwspin != NULL); + SLASSERT(rwspin->magic == KRW_SPIN_MAGIC); + SLASSERT(rwspin->count > 0); + + kspin_lock(&rwspin->guard); + -- rwspin->count; + kspin_unlock(&rwspin->guard); } -int kspin_isnotlocked(struct kspin *spin) +void krw_spin_up_w(struct krw_spin *rwspin) { - SLASSERT(spin != NULL); - SLASSERT(spin->magic == KSPIN_MAGIC); - return spin->owner != current_thread; + SLASSERT(rwspin != NULL); + SLASSERT(rwspin->magic == KRW_SPIN_MAGIC); + SLASSERT(rwspin->count == -1); + + kspin_lock(&rwspin->guard); + rwspin->count = 0; + kspin_unlock(&rwspin->guard); } -#endif +/* + * semaphore + */ +#ifdef __DARWIN8__ + +#define xnu_waitq_init(q, a) do {} while (0) +#define xnu_waitq_done(q) do {} while (0) +#define xnu_waitq_wakeup_one(q, e, s) ({wakeup_one((void *)(e)); KERN_SUCCESS;}) +#define xnu_waitq_wakeup_all(q, e, s) ({wakeup((void *)(e)); KERN_SUCCESS;}) +#define xnu_waitq_assert_wait(q, e, s) assert_wait((e), s) + +#else /* DARWIN8 */ + +#define xnu_waitq_init(q, a) wait_queue_init((q), a) +#define xnu_waitq_done(q) do {} while (0) +#define 
xnu_waitq_wakeup_one(q, e, s) wait_queue_wakeup_one((q), (event_t)(e), s) +#define xnu_waitq_wakeup_all(q, e, s) wait_queue_wakeup_all((q), (event_t)(e), s) +#define xnu_waitq_assert_wait(q, e, s) wait_queue_assert_wait((q), (event_t)(e), s) + +#endif /* DARWIN8 */ void ksem_init(struct ksem *sem, int value) { SLASSERT(sem != NULL); kspin_init(&sem->guard); - wait_queue_init(&sem->q, SYNC_POLICY_FIFO); + xnu_waitq_init(&sem->q, SYNC_POLICY_FIFO); sem->value = value; ON_SYNC_DEBUG(sem->magic = KSEM_MAGIC); } @@ -221,11 +329,11 @@ int ksem_up(struct ksem *sem, int value) kspin_lock(&sem->guard); sem->value += value; if (sem->value == 0) - result = wait_queue_wakeup_one(&sem->q, (event_t)sem, - THREAD_AWAKENED); + result = xnu_waitq_wakeup_one(&sem->q, sem, + THREAD_AWAKENED); else - result = wait_queue_wakeup_all(&sem->q, (event_t)sem, - THREAD_AWAKENED); + result = xnu_waitq_wakeup_all(&sem->q, sem, + THREAD_AWAKENED); kspin_unlock(&sem->guard); SLASSERT(result == KERN_SUCCESS || result == KERN_NOT_WAITING); return (result == KERN_SUCCESS) ? 0 : 1; @@ -242,8 +350,8 @@ void ksem_down(struct ksem *sem, int value) kspin_lock(&sem->guard); while (sem->value < value) { - result = wait_queue_assert_wait(&sem->q, (event_t)sem, - THREAD_UNINT); + result = xnu_waitq_assert_wait(&sem->q, sem, + THREAD_UNINT); SLASSERT(result == THREAD_AWAKENED || result == THREAD_WAITING); kspin_unlock(&sem->guard); if (result == THREAD_WAITING) @@ -292,18 +400,18 @@ void kmut_lock(struct kmut *mut) { SLASSERT(mut != NULL); SLASSERT(mut->magic == KMUT_MAGIC); - SLASSERT(mut->owner != current_thread); + SLASSERT(mut->owner != current_thread()); SLASSERT(get_preemption_level() == 0); ksem_down(&mut->s, 1); - ON_SYNC_DEBUG(mut->owner = current_thread); + ON_SYNC_DEBUG(mut->owner = current_thread()); } void kmut_unlock(struct kmut *mut) { SLASSERT(mut != NULL); SLASSERT(mut->magic == KMUT_MAGIC); - SLASSERT(mut->owner == current_thread); + SLASSERT(mut->owner == current_thread()); ON_SYNC_DEBUG(mut->owner = NULL); ksem_up(&mut->s, 1); @@ -321,14 +429,14 @@ int kmut_islocked(struct kmut *mut) { SLASSERT(mut != NULL); SLASSERT(mut->magic == KMUT_MAGIC); - return mut->owner == current_thread; + return mut->owner == current_thread(); } int kmut_isnotlocked(struct kmut *mut) { SLASSERT(mut != NULL); SLASSERT(mut->magic == KMUT_MAGIC); - return mut->owner != current_thread; + return mut->owner != current_thread(); } #endif @@ -560,7 +668,7 @@ void ksleep_link_init(struct ksleep_link *link) CFS_INIT_LIST_HEAD(&link->linkage); link->flags = 0; - link->event = current_thread; + link->event = current_thread(); link->hits = 0; link->forward = NULL; ON_SYNC_DEBUG(link->magic = KSLEEP_LINK_MAGIC); @@ -620,6 +728,11 @@ static void add_hit(struct ksleep_chan *chan, event_t event) { struct ksleep_link *scan; + /* + * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. 
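ksem_up() above wakes exactly one waiter when the post brings the count to zero and broadcasts otherwise, and the guard spinlock makes the value update and the wait-queue call a single atomic step. A hedged user-space sketch of the same shape (struct xsem and its names are illustrative):

    #include <pthread.h>

    /* Illustrative analogue of ksem: guard + wait queue + counter. */
    struct xsem {
            pthread_mutex_t guard;
            pthread_cond_t  q;
            int             value;
    };

    static void xsem_down(struct xsem *s, int value)
    {
            pthread_mutex_lock(&s->guard);
            while (s->value < value)
                    pthread_cond_wait(&s->q, &s->guard); /* assert_wait + thread_block */
            s->value -= value;
            pthread_mutex_unlock(&s->guard);
    }

    static void xsem_up(struct xsem *s, int value)
    {
            pthread_mutex_lock(&s->guard);
            s->value += value;
            /* kernel version: wake one waiter at exactly zero, all otherwise */
            if (s->value == 0)
                    pthread_cond_signal(&s->q);
            else
                    pthread_cond_broadcast(&s->q);
            pthread_mutex_unlock(&s->guard);
    }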
+ */ + SLASSERT(kspin_islocked(&chan->guard)); list_for_each_entry(scan, &chan->waiters, linkage) { if (scan->event == event) { @@ -629,7 +742,7 @@ static void add_hit(struct ksleep_chan *chan, event_t event) } } -void ksleep_wait(struct ksleep_chan *chan) +void ksleep_wait(struct ksleep_chan *chan, cfs_task_state_t state) { event_t event; int result; @@ -640,10 +753,10 @@ void ksleep_wait(struct ksleep_chan *chan) SLASSERT(chan->magic == KSLEEP_CHAN_MAGIC); SLASSERT(get_preemption_level() == 0); - event = current_thread; + event = current_thread(); kspin_lock(&chan->guard); if (!has_hits(chan, event)) { - result = assert_wait(event, THREAD_UNINT); + result = assert_wait(event, state); kspin_unlock(&chan->guard); SLASSERT(result == THREAD_AWAKENED || result == THREAD_WAITING); if (result == THREAD_WAITING) @@ -653,12 +766,16 @@ void ksleep_wait(struct ksleep_chan *chan) EXIT; } -int64_t ksleep_timedwait(struct ksleep_chan *chan, uint64_t timeout) +/* + * Sleep on @chan for no longer than @timeout nano-seconds. Return remaining + * sleep time (non-zero only if thread was waken by a signal (not currently + * implemented), or waitq was already in the "signalled" state). + */ +int64_t ksleep_timedwait(struct ksleep_chan *chan, + cfs_task_state_t state, + uint64_t timeout) { event_t event; - int64_t result; - AbsoluteTime clock_current; - AbsoluteTime clock_delay; ENTRY; @@ -666,22 +783,20 @@ int64_t ksleep_timedwait(struct ksleep_chan *chan, uint64_t timeout) SLASSERT(chan->magic == KSLEEP_CHAN_MAGIC); SLASSERT(get_preemption_level() == 0); - CDEBUG(D_TRACE, "timeout: %llu\n", (long long unsigned)timeout); - - event = current_thread; - result = 0; + event = current_thread(); kspin_lock(&chan->guard); if (!has_hits(chan, event)) { - result = assert_wait(event, THREAD_UNINT); + int result; + uint64_t expire; + result = assert_wait(event, state); if (timeout > 0) { /* * arm a timer. thread_set_timer()'s first argument is * uint32_t, so we have to cook deadline ourselves. */ - clock_get_uptime(&clock_current); - nanoseconds_to_absolutetime(timeout, &clock_delay); - ADD_ABSOLUTETIME(&clock_current, &clock_delay); - thread_set_timer_deadline(clock_current); + nanoseconds_to_absolutetime(timeout, &expire); + clock_absolutetime_interval_to_deadline(expire, &expire); + thread_set_timer_deadline(expire); } kspin_unlock(&chan->guard); SLASSERT(result == THREAD_AWAKENED || result == THREAD_WAITING); @@ -689,19 +804,22 @@ int64_t ksleep_timedwait(struct ksleep_chan *chan, uint64_t timeout) result = thread_block(THREAD_CONTINUE_NULL); thread_cancel_timer(); - clock_get_uptime(&clock_delay); - SUB_ABSOLUTETIME(&clock_delay, &clock_current); - if (result == THREAD_TIMED_OUT) - result = 0; - else { - absolutetime_to_nanoseconds(clock_delay, &result); - if (result < 0) - result = 0; - } - } else + if (result == THREAD_TIMED_OUT) + timeout = 0; + else { + uint64_t now; + clock_get_uptime(&now); + if (expire > now) + absolutetime_to_nanoseconds(expire - now, &timeout); + else + timeout = 0; + } + } else { + /* just return timeout, because I've got event and don't need to wait */ kspin_unlock(&chan->guard); + } - RETURN(result); + RETURN(timeout); } /* @@ -710,9 +828,11 @@ int64_t ksleep_timedwait(struct ksleep_chan *chan, uint64_t timeout) */ void ksleep_wake(struct ksleep_chan *chan) { - ENTRY; + /* + * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. 
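The rewritten ksleep_timedwait() converts the nanosecond timeout to an absolute deadline up front and, after waking, reports what is left of the timeout as deadline minus current uptime, clamped at zero. The same arithmetic in a compact user-space sketch (clock choice and names are illustrative):

    #include <stdint.h>
    #include <time.h>

    static uint64_t uptime_ns(void)
    {
            struct timespec ts;
            clock_gettime(CLOCK_MONOTONIC, &ts);
            return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
    }

    /* Wait (elided) against an absolute deadline; report unconsumed timeout. */
    static uint64_t remaining_after_wait(uint64_t timeout_ns)
    {
            uint64_t expire = uptime_ns() + timeout_ns; /* absolute deadline */
            uint64_t now;

            /* ... block until woken or until the deadline fires ... */

            now = uptime_ns();
            return (expire > now) ? expire - now : 0;
    }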
+ */ ksleep_wake_nr(chan, 1); - EXIT; } /* @@ -734,7 +854,10 @@ void ksleep_wake_nr(struct ksleep_chan *chan, int nr) struct ksleep_link *scan; int result; - ENTRY; + /* + * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ SLASSERT(chan != NULL); SLASSERT(chan->magic == KSLEEP_CHAN_MAGIC); @@ -747,8 +870,6 @@ void ksleep_wake_nr(struct ksleep_chan *chan, int nr) if (forward != NULL) kspin_lock(&forward->guard); result = thread_wakeup(scan->event); - CDEBUG(D_INFO, "waking 0x%x: %d\n", - (unsigned int)scan->event, result); SLASSERT(result == KERN_SUCCESS || result == KERN_NOT_WAITING); if (result == KERN_NOT_WAITING) { ++ scan->hits; @@ -761,7 +882,6 @@ void ksleep_wake_nr(struct ksleep_chan *chan, int nr) break; } kspin_unlock(&chan->guard); - EXIT; } void ktimer_init(struct ktimer *t, void (*func)(void *), void *arg) @@ -807,6 +927,9 @@ static void ktimer_actor(void *arg0, void *arg1) t->func(t->arg); } +extern boolean_t thread_call_func_cancel(thread_call_func_t, thread_call_param_t, boolean_t); +extern void thread_call_func_delayed(thread_call_func_t, thread_call_param_t, uint64_t); + static void ktimer_disarm_locked(struct ktimer *t) { SLASSERT(t != NULL); @@ -815,15 +938,29 @@ static void ktimer_disarm_locked(struct ktimer *t) thread_call_func_cancel(ktimer_actor, t, FALSE); } +/* + * Received deadline is nanoseconds, but time checked by + * thread_call is absolute time (The abstime unit is equal to + * the length of one bus cycle, so the duration is dependent + * on the bus speed of the computer), so we need to convert + * nanotime to abstime by nanoseconds_to_absolutetime(). + * + * Refer to _delayed_call_timer(...) + * + * if thread_call_func_delayed is not exported in the future, + * we can use timeout() or bsd_timeout() to replace it. + */ void ktimer_arm(struct ktimer *t, u_int64_t deadline) { + cfs_time_t abstime; SLASSERT(t != NULL); SLASSERT(t->magic == KTIMER_MAGIC); kspin_lock(&t->guard); ktimer_disarm_locked(t); t->armed = 1; - thread_call_func_delayed(ktimer_actor, t, *(AbsoluteTime *)&deadline); + nanoseconds_to_absolutetime(deadline, &abstime); + thread_call_func_delayed(ktimer_actor, t, deadline); kspin_unlock(&t->guard); } @@ -857,6 +994,26 @@ u_int64_t ktimer_deadline(struct ktimer *t) return t->deadline; } +void cfs_sync_init(void) +{ +#ifdef __DARWIN8__ + /* Initialize lock group */ + cfs_lock_grp = lck_grp_alloc_init("libcfs sync", LCK_GRP_ATTR_NULL); +#endif +} + +void cfs_sync_fini(void) +{ +#ifdef __DARWIN8__ + /* + * XXX Liang: destroy lock group. As we haven't called lock_done + * for all locks, cfs_lock_grp may not be freed by kernel(reference + * count > 1). + */ + lck_grp_free(cfs_lock_grp); + cfs_lock_grp = NULL; +#endif +} /* * Local variables: * c-indentation-style: "K&R" diff --git a/lnet/libcfs/darwin/darwin-tcpip.c b/lnet/libcfs/darwin/darwin-tcpip.c new file mode 100644 index 0000000..c6609a7 --- /dev/null +++ b/lnet/libcfs/darwin/darwin-tcpip.c @@ -0,0 +1,1339 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Phil Schwan + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Darwin porting library + * Make things easy to port + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +static __inline__ struct sockaddr_in +blank_sin() +{ + struct sockaddr_in blank = { sizeof(struct sockaddr_in), AF_INET }; + return (blank); +} + +void +libcfs_ipif_free_enumeration (char **names, int n) +{ + int i; + + LASSERT (n > 0); + + for (i = 0; i < n && names[i] != NULL; i++) + LIBCFS_FREE(names[i], IFNAMSIZ); + + LIBCFS_FREE(names, n * sizeof(*names)); +} + +#ifdef __DARWIN8__ +/* + * Darwin 8.x + * + * No hack kernel structre, all using KPI. + */ + +int +libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask) +{ + struct ifreq ifr; + socket_t so; + __u32 val; + int nob; + int rc; + + rc = -sock_socket(PF_INET, SOCK_STREAM, 0, + NULL, NULL, &so); + if (rc != 0) { + CERROR ("Can't create socket: %d\n", rc); + return rc; + } + + nob = strnlen(name, IFNAMSIZ); + if (nob == IFNAMSIZ) { + CERROR("Interface name %s too long\n", name); + rc = -EINVAL; + goto out; + } + + CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ); + bzero(&ifr, sizeof(ifr)); + strcpy(ifr.ifr_name, name); + rc = -sock_ioctl (so, SIOCGIFFLAGS, &ifr); + + if (rc != 0) { + CERROR("Can't get flags for interface %s\n", name); + goto out; + } + + if ((ifr.ifr_flags & IFF_UP) == 0) { + CDEBUG(D_NET, "Interface %s down\n", name); + *up = 0; + *ip = *mask = 0; + goto out; + } + + *up = 1; + + bzero(&ifr, sizeof(ifr)); + strcpy(ifr.ifr_name, name); + *((struct sockaddr_in *)&ifr.ifr_addr) = blank_sin(); + rc = -sock_ioctl(so, SIOCGIFADDR, &ifr); + + if (rc != 0) { + CERROR("Can't get IP address for interface %s\n", name); + goto out; + } + + val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr; + *ip = ntohl(val); + + bzero(&ifr, sizeof(ifr)); + strcpy(ifr.ifr_name, name); + *((struct sockaddr_in *)&ifr.ifr_addr) = blank_sin(); + rc = -sock_ioctl(so, SIOCGIFNETMASK, &ifr); + + if (rc != 0) { + CERROR("Can't get netmask for interface %s\n", name); + goto out; + } + + val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr; + *mask = ntohl(val); +out: + sock_close(so); + return rc; +} + +int +libcfs_ipif_enumerate (char ***namesp) +{ + /* Allocate and fill in 'names', returning # interfaces/error */ + char **names; + int toobig; + int nalloc; + int nfound; + socket_t so; + struct ifreq *ifr; + struct ifconf ifc; + int rc; + int nob; + int i; + + rc = -sock_socket(PF_INET, SOCK_STREAM, 0, + NULL, NULL, &so); + if (rc != 0) { + CERROR ("Can't create socket: %d\n", rc); + return (rc); + } + + nalloc = 16; /* first guess at max interfaces */ + toobig = 0; + for (;;) { + if (nalloc * sizeof(*ifr) > CFS_PAGE_SIZE) { + toobig = 1; + nalloc = CFS_PAGE_SIZE/sizeof(*ifr); + CWARN("Too many interfaces: only enumerating first %d\n", + nalloc); + } + + LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr)); + if (ifr == NULL) { + CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc); + rc = -ENOMEM; + goto out0; + } + + ifc.ifc_buf = (char *)ifr; + ifc.ifc_len = nalloc 
* sizeof(*ifr);
+
+#if 1
+                /*
+                 * XXX Liang:
+                 * sock_ioctl(..., SIOCGIFCONF, ...) is not supposed to be
+                 * used in kernel space, because it always tries to copy the
+                 * result out to userspace. So we can't get interface names
+                 * via sock_ioctl(..., SIOCGIFCONF, ...). I've filed a bug
+                 * with Apple; let's wait...
+                 */
+                nfound = 0;
+                for (i = 0; i < 16; i++) {
+                        struct ifreq en;
+                        bzero(&en, sizeof(en));
+                        snprintf(en.ifr_name, IFNAMSIZ, "en%d", i);
+                        rc = -sock_ioctl (so, SIOCGIFFLAGS, &en);
+                        if (rc != 0)
+                                continue;
+                        strcpy(ifr[nfound++].ifr_name, en.ifr_name);
+                }
+
+#else /* not in use now */
+                rc = -sock_ioctl(so, SIOCGIFCONF, (caddr_t)&ifc);
+
+                if (rc < 0) {
+                        CERROR ("Error %d enumerating interfaces\n", rc);
+                        goto out1;
+                }
+
+                nfound = ifc.ifc_len/sizeof(*ifr);
+                LASSERT (nfound <= nalloc);
+#endif
+
+                if (nfound < nalloc || toobig)
+                        break;
+
+                LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+                nalloc *= 2;
+        }
+        if (nfound == 0)
+                goto out1;
+
+        LIBCFS_ALLOC(names, nfound * sizeof(*names));
+        if (names == NULL) {
+                rc = -ENOMEM;
+                goto out1;
+        }
+        /* NULL out all names[i] */
+        memset (names, 0, nfound * sizeof(*names));
+
+        for (i = 0; i < nfound; i++) {
+
+                nob = strnlen (ifr[i].ifr_name, IFNAMSIZ);
+                if (nob == IFNAMSIZ) {
+                        /* no space for terminating NULL */
+                        CERROR("interface name %.*s too long (%d max)\n",
+                               nob, ifr[i].ifr_name, IFNAMSIZ);
+                        rc = -ENAMETOOLONG;
+                        goto out2;
+                }
+
+                LIBCFS_ALLOC(names[i], IFNAMSIZ);
+                if (names[i] == NULL) {
+                        rc = -ENOMEM;
+                        goto out2;
+                }
+
+                memcpy(names[i], ifr[i].ifr_name, nob);
+                names[i][nob] = 0;
+        }
+
+        *namesp = names;
+        rc = nfound;
+
+out2:
+        if (rc < 0)
+                libcfs_ipif_free_enumeration(names, nfound);
+out1:
+        LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
+out0:
+        sock_close(so);
+        return rc;
+
+}
+
+/*
+ * Public entry point for the socket upcall.
+ *
+ * On Darwin 8.0 an so_upcall can only be installed when a socket is
+ * created or accepted, so we install libcfs_sock_upcall() as the upcall
+ * for every socket at create/accept time; it forwards to the upcall
+ * provided by the user, which can be set up after the socket has been
+ * created or accepted.
+ */ +static void libcfs_sock_upcall(socket_t so, void* arg, int waitf) +{ + cfs_socket_t *sock; + + sock = (cfs_socket_t *)arg; + LASSERT(sock->s_magic == CFS_SOCK_MAGIC); + + if ((sock->s_flags & CFS_SOCK_UPCALL) != 0 && sock->s_upcall != NULL) + sock->s_upcall(so, sock->s_upcallarg, waitf); + return; +} + +void libcfs_sock_set_cb(cfs_socket_t *sock, so_upcall callback, void *arg) +{ + sock->s_upcall = callback; + sock->s_upcallarg = arg; + sock->s_flags |= CFS_SOCK_UPCALL; + return; +} + +void libcfs_sock_reset_cb(cfs_socket_t *sock) +{ + sock->s_flags &= ~CFS_SOCK_UPCALL; + sock->s_upcall = NULL; + sock->s_upcallarg = NULL; + return; +} + +static int +libcfs_sock_create (cfs_socket_t **sockp, int *fatal, + __u32 local_ip, int local_port) +{ + struct sockaddr_in locaddr; + cfs_socket_t *sock; + int option; + int optlen; + int rc; + + /* All errors are fatal except bind failure if the port is in use */ + *fatal = 1; + + sock = _MALLOC(sizeof(cfs_socket_t), M_TEMP, M_WAITOK|M_ZERO); + if (!sock) { + CERROR("Can't allocate cfs_socket.\n"); + return -ENOMEM; + } + *sockp = sock; + sock->s_magic = CFS_SOCK_MAGIC; + + rc = -sock_socket(PF_INET, SOCK_STREAM, 0, + libcfs_sock_upcall, sock, &C2B_SOCK(sock)); + if (rc != 0) + goto out; + option = 1; + optlen = sizeof(option); + rc = -sock_setsockopt(C2B_SOCK(sock), SOL_SOCKET, + SO_REUSEADDR, &option, optlen); + if (rc != 0) + goto out; + + /* can't specify a local port without a local IP */ + LASSERT (local_ip == 0 || local_port != 0); + + if (local_ip != 0 || local_port != 0) { + bzero (&locaddr, sizeof (locaddr)); + locaddr.sin_len = sizeof(struct sockaddr_in); + locaddr.sin_family = AF_INET; + locaddr.sin_port = htons (local_port); + locaddr.sin_addr.s_addr = (local_ip != 0) ? htonl(local_ip) : INADDR_ANY; + rc = -sock_bind(C2B_SOCK(sock), (struct sockaddr *)&locaddr); + if (rc == -EADDRINUSE) { + CDEBUG(D_NET, "Port %d already in use\n", local_port); + *fatal = 0; + goto out; + } + if (rc != 0) { + CERROR("Error trying to bind to port %d: %d\n", + local_port, rc); + goto out; + } + } + return 0; +out: + if (C2B_SOCK(sock) != NULL) + sock_close(C2B_SOCK(sock)); + FREE(sock, M_TEMP); + return rc; +} + +int +libcfs_sock_listen (cfs_socket_t **sockp, + __u32 local_ip, int local_port, int backlog) +{ + cfs_socket_t *sock; + int fatal; + int rc; + + rc = libcfs_sock_create(&sock, &fatal, local_ip, local_port); + if (rc != 0) { + if (!fatal) + CERROR("Can't create socket: port %d already in use\n", + local_port); + return rc; + + } + rc = -sock_listen(C2B_SOCK(sock), backlog); + if (rc == 0) { + *sockp = sock; + return 0; + } + + if (C2B_SOCK(sock) != NULL) + sock_close(C2B_SOCK(sock)); + FREE(sock, M_TEMP); + return rc; +} + +int +libcfs_sock_accept (cfs_socket_t **newsockp, cfs_socket_t *sock) +{ + cfs_socket_t *newsock; + int rc; + + newsock = _MALLOC(sizeof(cfs_socket_t), M_TEMP, M_WAITOK|M_ZERO); + if (!newsock) { + CERROR("Can't allocate cfs_socket.\n"); + return -ENOMEM; + } + newsock->s_magic = CFS_SOCK_MAGIC; + /* + * thread will sleep in sock_accept by calling of msleep(), + * it can be interrupted because msleep() use PCATCH as argument. 
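libcfs_sock_create() above reports a bind failure on an in-use port as non-fatal (*fatal == 0) so that callers can retry another port rather than give up. A hedged sketch of such a caller; pick_listen_port() is illustrative and not part of this patch (libcfs_sock_create() is static to this file, so a real caller would live alongside it):

    /* Illustrative caller: walk a port range until bind succeeds. */
    static int pick_listen_port(cfs_socket_t **sockp, __u32 ip,
                                int first, int last)
    {
            int port;
            int fatal;
            int rc = -EADDRINUSE;

            for (port = first; port <= last; port++) {
                    rc = libcfs_sock_create(sockp, &fatal, ip, port);
                    if (rc == 0)
                            return port;    /* bound and ready */
                    if (fatal)
                            return rc;      /* real error: give up */
                    /* non-fatal: port in use, try the next one */
            }
            return rc;
    }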
+ */
+        rc = -sock_accept(C2B_SOCK(sock), NULL, 0, 0,
+                          libcfs_sock_upcall, newsock, &C2B_SOCK(newsock));
+        if (rc) {
+                if (C2B_SOCK(newsock) != NULL)
+                        sock_close(C2B_SOCK(newsock));
+                FREE(newsock, M_TEMP);
+                if ((sock->s_flags & CFS_SOCK_DOWN) != 0)
+                        /* shut down by libcfs_sock_abort_accept(); fake an
+                         * error number for lnet_acceptor() */
+                        rc = -EAGAIN;
+                return rc;
+        }
+        *newsockp = newsock;
+        return 0;
+}
+
+void
+libcfs_sock_abort_accept (cfs_socket_t *sock)
+{
+        /*
+         * XXX Liang:
+         *
+         * We want to wake up the thread blocked in sock_accept, but we
+         * don't know the address it is sleeping on, so we cannot wake it
+         * up directly.
+         * The thread sleeping in sock_accept will be woken up when:
+         * 1. it is interrupted by a signal
+         * 2. a new connection arrives (sonewconn)
+         * 3. the socket is disconnected (soisconnected)
+         *
+         * Because there is no KPI for sending a signal to a thread
+         * directly, the only thing that can be done here is to disconnect
+         * the socket (via sock_shutdown() or something similar).
+         *
+         * A shutdown request with SHUT_WR or SHUT_RDWR is issued to the
+         * protocol:
+         * sock_shutdown()->tcp_usr_shutdown()->tcp_usrclosed()->
+         * tcp_close()->soisdisconnected(), which wakes the thread via
+         * wakeup((caddr_t)&so->so_timeo);
+         */
+        sock->s_flags |= CFS_SOCK_DOWN;
+        sock_shutdown(C2B_SOCK(sock), SHUT_RDWR);
+}
+
+int
+libcfs_sock_read (cfs_socket_t *sock, void *buffer, int nob, int timeout)
+{
+        size_t  rcvlen;
+        int     rc;
+        cfs_duration_t to = cfs_time_seconds(timeout);
+        cfs_time_t then;
+        struct timeval tv;
+
+        LASSERT(nob > 0);
+
+        for (;;) {
+                struct iovec iov = {
+                        .iov_base = buffer,
+                        .iov_len  = nob
+                };
+                struct msghdr msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &iov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = 0,
+                };
+                cfs_duration_usec(to, &tv);
+                rc = -sock_setsockopt(C2B_SOCK(sock), SOL_SOCKET, SO_RCVTIMEO,
+                                      &tv, sizeof(tv));
+                if (rc != 0) {
+                        CERROR("Can't set socket recv timeout "
+                               "%ld.%06d: %d\n",
+                               (long)tv.tv_sec, (int)tv.tv_usec, rc);
+                        return rc;
+                }
+
+                then = cfs_time_current();
+                rc = -sock_receive(C2B_SOCK(sock), &msg, 0, &rcvlen);
+                to -= cfs_time_current() - then;
+
+                if (rc != 0 && rc != -EWOULDBLOCK)
+                        return rc;
+                if (rcvlen == nob)
+                        return 0;
+
+                if (to <= 0)
+                        return -EAGAIN;
+
+                buffer = ((char *)buffer) + rcvlen;
+                nob -= rcvlen;
+        }
+        return 0;
+}
+
+int
+libcfs_sock_write (cfs_socket_t *sock, void *buffer, int nob, int timeout)
+{
+        size_t  sndlen;
+        int     rc;
+        cfs_duration_t to = cfs_time_seconds(timeout);
+        cfs_time_t then;
+        struct timeval tv;
+
+        LASSERT(nob > 0);
+
+        for (;;) {
+                struct iovec iov = {
+                        .iov_base = buffer,
+                        .iov_len  = nob
+                };
+                struct msghdr msg = {
+                        .msg_name       = NULL,
+                        .msg_namelen    = 0,
+                        .msg_iov        = &iov,
+                        .msg_iovlen     = 1,
+                        .msg_control    = NULL,
+                        .msg_controllen = 0,
+                        .msg_flags      = (timeout == 0) ? MSG_DONTWAIT : 0,
+                };
+
+                if (timeout != 0) {
+                        cfs_duration_usec(to, &tv);
+                        rc = -sock_setsockopt(C2B_SOCK(sock), SOL_SOCKET, SO_SNDTIMEO,
+                                              &tv, sizeof(tv));
+                        if (rc != 0) {
+                                CERROR("Can't set socket send timeout "
+                                       "%ld.%06d: %d\n",
+                                       (long)tv.tv_sec, (int)tv.tv_usec, rc);
+                                return rc;
+                        }
+                }
+
+                then = cfs_time_current();
+                rc = -sock_send(C2B_SOCK(sock), &msg,
+                                ((timeout == 0) ?
MSG_DONTWAIT : 0), &sndlen); + to -= cfs_time_current() - then; + + if (rc != 0 && rc != -EWOULDBLOCK) + return rc; + if (sndlen == nob) + return 0; + + if (to <= 0) + return -EAGAIN; + buffer = ((char *)buffer) + sndlen; + nob -= sndlen; + } + return 0; + +} + +int +libcfs_sock_getaddr (cfs_socket_t *sock, int remote, __u32 *ip, int *port) +{ + struct sockaddr_in sin; + int rc; + + if (remote != 0) + /* Get remote address */ + rc = -sock_getpeername(C2B_SOCK(sock), (struct sockaddr *)&sin, sizeof(sin)); + else + /* Get local address */ + rc = -sock_getsockname(C2B_SOCK(sock), (struct sockaddr *)&sin, sizeof(sin)); + if (rc != 0) { + CERROR ("Error %d getting sock %s IP/port\n", + rc, remote ? "peer" : "local"); + return rc; + } + + if (ip != NULL) + *ip = ntohl (sin.sin_addr.s_addr); + + if (port != NULL) + *port = ntohs (sin.sin_port); + return 0; +} + +int +libcfs_sock_setbuf (cfs_socket_t *sock, int txbufsize, int rxbufsize) +{ + int option; + int rc; + + if (txbufsize != 0) { + option = txbufsize; + rc = -sock_setsockopt(C2B_SOCK(sock), SOL_SOCKET, SO_SNDBUF, + (char *)&option, sizeof (option)); + if (rc != 0) { + CERROR ("Can't set send buffer %d: %d\n", + option, rc); + return (rc); + } + } + + if (rxbufsize != 0) { + option = rxbufsize; + rc = -sock_setsockopt (C2B_SOCK(sock), SOL_SOCKET, SO_RCVBUF, + (char *)&option, sizeof (option)); + if (rc != 0) { + CERROR ("Can't set receive buffer %d: %d\n", + option, rc); + return (rc); + } + } + return 0; +} + +int +libcfs_sock_getbuf (cfs_socket_t *sock, int *txbufsize, int *rxbufsize) +{ + int option; + int optlen; + int rc; + + if (txbufsize != NULL) { + optlen = sizeof(option); + rc = -sock_getsockopt(C2B_SOCK(sock), SOL_SOCKET, SO_SNDBUF, + (char *)&option, &optlen); + if (rc != 0) { + CERROR ("Can't get send buffer size: %d\n", rc); + return (rc); + } + *txbufsize = option; + } + + if (rxbufsize != NULL) { + optlen = sizeof(option); + rc = -sock_getsockopt (C2B_SOCK(sock), SOL_SOCKET, SO_RCVBUF, + (char *)&option, &optlen); + if (rc != 0) { + CERROR ("Can't get receive buffer size: %d\n", rc); + return (rc); + } + *rxbufsize = option; + } + return 0; +} + +void +libcfs_sock_release (cfs_socket_t *sock) +{ + if (C2B_SOCK(sock) != NULL) { + sock_shutdown(C2B_SOCK(sock), 2); + sock_close(C2B_SOCK(sock)); + } + FREE(sock, M_TEMP); +} + +int +libcfs_sock_connect (cfs_socket_t **sockp, int *fatal, + __u32 local_ip, int local_port, + __u32 peer_ip, int peer_port) +{ + cfs_socket_t *sock; + struct sockaddr_in srvaddr; + int rc; + + rc = libcfs_sock_create(&sock, fatal, local_ip, local_port); + if (rc != 0) + return rc; + + bzero(&srvaddr, sizeof(srvaddr)); + srvaddr.sin_len = sizeof(struct sockaddr_in); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons(peer_port); + srvaddr.sin_addr.s_addr = htonl(peer_ip); + + rc = -sock_connect(C2B_SOCK(sock), (struct sockaddr *)&srvaddr, 0); + if (rc == 0) { + *sockp = sock; + return 0; + } + + *fatal = !(rc == -EADDRNOTAVAIL || rc == -EADDRINUSE); + CDEBUG(*fatal ? D_NETERROR : D_NET, + "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc, + HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port); + + libcfs_sock_release(sock); + return rc; +} + +#else /* !__DARWIN8__ */ + +/* + * To use bigger buffer for socket: + * 1. Increase nmbclusters (Cannot increased by sysctl because it's ready only, so + * we must patch kernel). + * 2. Increase net.inet.tcp.reass.maxsegments + * 3. Increase net.inet.tcp.sendspace + * 4. Increase net.inet.tcp.recvspace + * 5. 
Increase kern.ipc.maxsockbuf + */ +#define KSOCK_MAX_BUF (1152*1024) + +int +libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask) +{ + struct socket *so; + struct ifreq ifr; + int nob; + int rc; + __u32 val; + CFS_DECL_FUNNEL_DATA; + + CFS_NET_IN; + rc = socreate(PF_INET, &so, SOCK_STREAM, 0); + CFS_NET_EX; + if (rc != 0) { + CERROR ("Can't create socket: %d\n", rc); + return (-rc); + } + nob = strnlen(name, IFNAMSIZ); + if (nob == IFNAMSIZ) { + CERROR("Interface name %s too long\n", name); + rc = -EINVAL; + goto out; + } + + CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ); + strcpy(ifr.ifr_name, name); + CFS_NET_IN; + rc = ifioctl(so, SIOCGIFFLAGS, (caddr_t)&ifr, current_proc()); + CFS_NET_EX; + + if (rc != 0) { + CERROR("Can't get flags for interface %s\n", name); + goto out; + } + if ((ifr.ifr_flags & IFF_UP) == 0) { + CDEBUG(D_NET, "Interface %s down\n", name); + *up = 0; + *ip = *mask = 0; + goto out; + } + + *up = 1; + strcpy(ifr.ifr_name, name); + *((struct sockaddr_in *)&ifr.ifr_addr) = blank_sin(); + CFS_NET_IN; + rc = ifioctl(so, SIOCGIFADDR, (caddr_t)&ifr, current_proc()); + CFS_NET_EX; + + if (rc != 0) { + CERROR("Can't get IP address for interface %s\n", name); + goto out; + } + + val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr; + *ip = ntohl(val); + + strcpy(ifr.ifr_name, name); + *((struct sockaddr_in *)&ifr.ifr_addr) = blank_sin(); + CFS_NET_IN; + rc = ifioctl(so, SIOCGIFNETMASK, (caddr_t)&ifr, current_proc()); + CFS_NET_EX; + + if (rc != 0) { + CERROR("Can't get netmask for interface %s\n", name); + goto out; + } + + val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr; + *mask = ntohl(val); +out: + CFS_NET_IN; + soclose(so); + CFS_NET_EX; + return -rc; +} + +int +libcfs_ipif_enumerate (char ***namesp) +{ + /* Allocate and fill in 'names', returning # interfaces/error */ + char **names; + int toobig; + int nalloc; + int nfound; + struct socket *so; + struct ifreq *ifr; + struct ifconf ifc; + int rc; + int nob; + int i; + CFS_DECL_FUNNEL_DATA; + + CFS_NET_IN; + rc = socreate(PF_INET, &so, SOCK_STREAM, 0); + CFS_NET_EX; + if (rc != 0) { + CERROR ("Can't create socket: %d\n", rc); + return (-rc); + } + + nalloc = 16; /* first guess at max interfaces */ + toobig = 0; + for (;;) { + if (nalloc * sizeof(*ifr) > CFS_PAGE_SIZE) { + toobig = 1; + nalloc = CFS_PAGE_SIZE/sizeof(*ifr); + CWARN("Too many interfaces: only enumerating first %d\n", + nalloc); + } + + LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr)); + if (ifr == NULL) { + CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc); + rc = -ENOMEM; + goto out0; + } + + ifc.ifc_buf = (char *)ifr; + ifc.ifc_len = nalloc * sizeof(*ifr); + + CFS_NET_IN; + rc = -ifioctl(so, SIOCGIFCONF, (caddr_t)&ifc, current_proc()); + CFS_NET_EX; + + if (rc < 0) { + CERROR ("Error %d enumerating interfaces\n", rc); + goto out1; + } + + nfound = ifc.ifc_len/sizeof(*ifr); + LASSERT (nfound <= nalloc); + + if (nfound < nalloc || toobig) + break; + + LIBCFS_FREE(ifr, nalloc * sizeof(*ifr)); + nalloc *= 2; + } + if (nfound == 0) + goto out1; + + LIBCFS_ALLOC(names, nfound * sizeof(*names)); + if (names == NULL) { + rc = -ENOMEM; + goto out1; + } + /* NULL out all names[i] */ + memset (names, 0, nfound * sizeof(*names)); + + for (i = 0; i < nfound; i++) { + + nob = strnlen (ifr[i].ifr_name, IFNAMSIZ); + if (nob == IFNAMSIZ) { + /* no space for terminating NULL */ + CERROR("interface name %.*s too long (%d max)\n", + nob, ifr[i].ifr_name, IFNAMSIZ); + rc = -ENAMETOOLONG; + goto out2; + } + + LIBCFS_ALLOC(names[i], IFNAMSIZ); + if 
(names[i] == NULL) { + rc = -ENOMEM; + goto out2; + } + + memcpy(names[i], ifr[i].ifr_name, nob); + names[i][nob] = 0; + } + + *namesp = names; + rc = nfound; + +out2: + if (rc < 0) + libcfs_ipif_free_enumeration(names, nfound); +out1: + LIBCFS_FREE(ifr, nalloc * sizeof(*ifr)); +out0: + CFS_NET_IN; + soclose(so); + CFS_NET_EX; + return rc; +} + +static int +libcfs_sock_create (struct socket **sockp, int *fatal, + __u32 local_ip, int local_port) +{ + struct sockaddr_in locaddr; + struct socket *so; + struct sockopt sopt; + int option; + int rc; + CFS_DECL_FUNNEL_DATA; + + *fatal = 1; + CFS_NET_IN; + rc = socreate(PF_INET, &so, SOCK_STREAM, 0); + CFS_NET_EX; + if (rc != 0) { + CERROR ("Can't create socket: %d\n", rc); + return (-rc); + } + + bzero(&sopt, sizeof sopt); + option = 1; + sopt.sopt_level = SOL_SOCKET; + sopt.sopt_name = SO_REUSEADDR; + sopt.sopt_val = &option; + sopt.sopt_valsize = sizeof(option); + CFS_NET_IN; + rc = sosetopt(so, &sopt); + if (rc != 0) { + CFS_NET_EX; + CERROR ("Can't set sock reuse address: %d\n", rc); + goto out; + } + /* can't specify a local port without a local IP */ + LASSERT (local_ip == 0 || local_port != 0); + + if (local_ip != 0 || local_port != 0) { + bzero (&locaddr, sizeof (locaddr)); + locaddr.sin_len = sizeof(struct sockaddr_in); + locaddr.sin_family = AF_INET; + locaddr.sin_port = htons (local_port); + locaddr.sin_addr.s_addr = (local_ip != 0) ? htonl(local_ip) : + INADDR_ANY; + + rc = sobind(so, (struct sockaddr *)&locaddr); + if (rc == EADDRINUSE) { + CFS_NET_EX; + CDEBUG(D_NET, "Port %d already in use\n", local_port); + *fatal = 0; + goto out; + } + if (rc != 0) { + CFS_NET_EX; + CERROR ("Can't bind to local IP Address %u.%u.%u.%u: %d\n", + HIPQUAD(local_ip), rc); + goto out; + } + } + *sockp = so; + return 0; +out: + CFS_NET_IN; + soclose(so); + CFS_NET_EX; + return -rc; +} + +int +libcfs_sock_listen (struct socket **sockp, + __u32 local_ip, int local_port, int backlog) +{ + int fatal; + int rc; + CFS_DECL_FUNNEL_DATA; + + rc = libcfs_sock_create(sockp, &fatal, local_ip, local_port); + if (rc != 0) { + if (!fatal) + CERROR("Can't create socket: port %d already in use\n", + local_port); + return rc; + } + CFS_NET_IN; + rc = solisten(*sockp, backlog); + CFS_NET_EX; + if (rc == 0) + return 0; + CERROR("Can't set listen backlog %d: %d\n", backlog, rc); + CFS_NET_IN; + soclose(*sockp); + CFS_NET_EX; + return -rc; +} + +int +libcfs_sock_accept (struct socket **newsockp, struct socket *sock) +{ + struct socket *so; + struct sockaddr *sa; + int error, s; + CFS_DECL_FUNNEL_DATA; + + CFS_NET_IN; + s = splnet(); + if ((sock->so_options & SO_ACCEPTCONN) == 0) { + splx(s); + CFS_NET_EX; + return (-EINVAL); + } + + if ((sock->so_state & SS_NBIO) && sock->so_comp.tqh_first == NULL) { + splx(s); + CFS_NET_EX; + return (-EWOULDBLOCK); + } + + error = 0; + while (TAILQ_EMPTY(&sock->so_comp) && sock->so_error == 0) { + if (sock->so_state & SS_CANTRCVMORE) { + sock->so_error = ECONNABORTED; + break; + } + error = tsleep((caddr_t)&sock->so_timeo, PSOCK | PCATCH, + "accept", 0); + if (error) { + splx(s); + CFS_NET_EX; + return (-error); + } + } + if (sock->so_error) { + error = sock->so_error; + sock->so_error = 0; + splx(s); + CFS_NET_EX; + return (-error); + } + + /* + * At this point we know that there is at least one connection + * ready to be accepted. Remove it from the queue prior to + * allocating the file descriptor for it since falloc() may + * block allowing another process to accept the connection + * instead. 
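On pre-8.x Darwin the patch talks to the BSD socket layer directly, packaging every option change in a struct sockopt rather than going through a syscall path. The shape of that call, reduced to a sketch (funnel enter/exit and error logging elided; set_reuseaddr is an illustrative name):

    /* Sketch: one option change via the BSD sockopt interface, as the
     * pre-Darwin8 branch does it. */
    static int set_reuseaddr(struct socket *so)
    {
            struct sockopt sopt;
            int option = 1;

            bzero(&sopt, sizeof sopt);
            sopt.sopt_dir     = SOPT_SET;
            sopt.sopt_level   = SOL_SOCKET;
            sopt.sopt_name    = SO_REUSEADDR;
            sopt.sopt_val     = &option;
            sopt.sopt_valsize = sizeof(option);
            return sosetopt(so, &sopt);     /* returns a positive errno */
    }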
+ */ + so = TAILQ_FIRST(&sock->so_comp); + TAILQ_REMOVE(&sock->so_comp, so, so_list); + sock->so_qlen--; + + so->so_state &= ~SS_COMP; + so->so_head = NULL; + sa = 0; + (void) soaccept(so, &sa); + + *newsockp = so; + FREE(sa, M_SONAME); + splx(s); + CFS_NET_EX; + return (-error); +} + +void +libcfs_sock_abort_accept (struct socket *sock) +{ + wakeup(&sock->so_timeo); +} + +/* + * XXX Liang: timeout for write is not supported yet. + */ +int +libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + CFS_DECL_NET_DATA; + + while (nob > 0) { + struct iovec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct uio suio = { + .uio_iov = &iov, + .uio_iovcnt = 1, + .uio_offset = 0, + .uio_resid = nob, + .uio_segflg = UIO_SYSSPACE, + .uio_rw = UIO_WRITE, + .uio_procp = NULL + }; + + CFS_NET_IN; + rc = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0, 0); + CFS_NET_EX; + + if (rc != 0) { + if ( suio.uio_resid != nob && ( rc == ERESTART || rc == EINTR ||\ + rc == EWOULDBLOCK)) + rc = 0; + if ( rc != 0 ) + return -rc; + rc = nob - suio.uio_resid; + buffer = ((char *)buffer) + rc; + nob = suio.uio_resid; + continue; + } + break; + } + return (0); +} + +/* + * XXX Liang: timeout for read is not supported yet. + */ +int +libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + CFS_DECL_NET_DATA; + + while (nob > 0) { + struct iovec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct uio ruio = { + .uio_iov = &iov, + .uio_iovcnt = 1, + .uio_offset = 0, + .uio_resid = nob, + .uio_segflg = UIO_SYSSPACE, + .uio_rw = UIO_READ, + .uio_procp = NULL + }; + + CFS_NET_IN; + rc = soreceive(sock, (struct sockaddr **)0, &ruio, (struct mbuf **)0, (struct mbuf **)0, (int *)0); + CFS_NET_EX; + + if (rc != 0) { + if ( ruio.uio_resid != nob && ( rc == ERESTART || rc == EINTR ||\ + rc == EWOULDBLOCK)) + rc = 0; + if (rc != 0) + return -rc; + rc = nob - ruio.uio_resid; + buffer = ((char *)buffer) + rc; + nob = ruio.uio_resid; + continue; + } + break; + } + return (0); +} + +int +libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize) +{ + struct sockopt sopt; + int rc = 0; + int option; + CFS_DECL_NET_DATA; + + bzero(&sopt, sizeof sopt); + sopt.sopt_dir = SOPT_SET; + sopt.sopt_level = SOL_SOCKET; + sopt.sopt_val = &option; + sopt.sopt_valsize = sizeof(option); + + if (txbufsize != 0) { + option = txbufsize; + if (option > KSOCK_MAX_BUF) + option = KSOCK_MAX_BUF; + + sopt.sopt_name = SO_SNDBUF; + CFS_NET_IN; + rc = sosetopt(sock, &sopt); + CFS_NET_EX; + if (rc != 0) { + CERROR ("Can't set send buffer %d: %d\n", + option, rc); + + return -rc; + } + } + + if (rxbufsize != 0) { + option = rxbufsize; + sopt.sopt_name = SO_RCVBUF; + CFS_NET_IN; + rc = sosetopt(sock, &sopt); + CFS_NET_EX; + if (rc != 0) { + CERROR ("Can't set receive buffer %d: %d\n", + option, rc); + return -rc; + } + } + return 0; +} + +int +libcfs_sock_getaddr (struct socket *sock, int remote, __u32 *ip, int *port) +{ + struct sockaddr_in *sin; + struct sockaddr *sa = NULL; + int rc; + CFS_DECL_NET_DATA; + + if (remote != 0) { + CFS_NET_IN; + rc = sock->so_proto->pr_usrreqs->pru_peeraddr(sock, &sa); + CFS_NET_EX; + + if (rc != 0) { + if (sa) FREE(sa, M_SONAME); + CERROR ("Error %d getting sock peer IP\n", rc); + return -rc; + } + } else { + CFS_NET_IN; + rc = sock->so_proto->pr_usrreqs->pru_sockaddr(sock, &sa); + CFS_NET_EX; + if (rc != 0) { + if (sa) FREE(sa, M_SONAME); + CERROR ("Error %d getting sock local IP\n", rc); + return -rc; + } + } + if (sa != 
NULL) { + sin = (struct sockaddr_in *)sa; + if (ip != NULL) + *ip = ntohl (sin->sin_addr.s_addr); + if (port != NULL) + *port = ntohs (sin->sin_port); + if (sa) + FREE(sa, M_SONAME); + } + return 0; +} + +int +libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize) +{ + struct sockopt sopt; + int rc; + CFS_DECL_NET_DATA; + + bzero(&sopt, sizeof sopt); + sopt.sopt_dir = SOPT_GET; + sopt.sopt_level = SOL_SOCKET; + + if (txbufsize != NULL) { + sopt.sopt_val = txbufsize; + sopt.sopt_valsize = sizeof(*txbufsize); + sopt.sopt_name = SO_SNDBUF; + CFS_NET_IN; + rc = sogetopt(sock, &sopt); + CFS_NET_EX; + if (rc != 0) { + CERROR ("Can't get send buffer size: %d\n", rc); + return -rc; + } + } + + if (rxbufsize != NULL) { + sopt.sopt_val = rxbufsize; + sopt.sopt_valsize = sizeof(*rxbufsize); + sopt.sopt_name = SO_RCVBUF; + CFS_NET_IN; + rc = sogetopt(sock, &sopt); + CFS_NET_EX; + if (rc != 0) { + CERROR ("Can't get receive buffer size: %d\n", rc); + return -rc; + } + } + return 0; +} + +int +libcfs_sock_connect (struct socket **sockp, int *fatal, + __u32 local_ip, int local_port, + __u32 peer_ip, int peer_port) +{ + struct sockaddr_in srvaddr; + struct socket *so; + int s; + int rc; + CFS_DECL_FUNNEL_DATA; + + rc = libcfs_sock_create(sockp, fatal, local_ip, local_port); + if (rc != 0) + return rc; + so = *sockp; + bzero(&srvaddr, sizeof(srvaddr)); + srvaddr.sin_len = sizeof(struct sockaddr_in); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons (peer_port); + srvaddr.sin_addr.s_addr = htonl (peer_ip); + + CFS_NET_IN; + rc = soconnect(so, (struct sockaddr *)&srvaddr); + if (rc != 0) { + CFS_NET_EX; + if (rc != EADDRNOTAVAIL && rc != EADDRINUSE) + CDEBUG(D_NETERROR, + "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc, + HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port); + goto out; + } + s = splnet(); + while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { + CDEBUG(D_NET, "ksocknal sleep for waiting auto_connect.\n"); + (void) tsleep((caddr_t)&so->so_timeo, PSOCK, "ksocknal_conn", hz); + } + if ((rc = so->so_error) != 0) { + so->so_error = 0; + splx(s); + CFS_NET_EX; + CDEBUG(D_NETERROR, + "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc, + HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port); + goto out; + } + LASSERT(so->so_state & SS_ISCONNECTED); + splx(s); + CFS_NET_EX; + if (sockp) + *sockp = so; + return (0); +out: + CFS_NET_IN; + soshutdown(so, 2); + soclose(so); + CFS_NET_EX; + return (-rc); +} + +void +libcfs_sock_release (struct socket *sock) +{ + CFS_DECL_FUNNEL_DATA; + CFS_NET_IN; + soshutdown(sock, 0); + CFS_NET_EX; +} + +#endif diff --git a/lnet/libcfs/darwin/darwin-tracefile.c b/lnet/libcfs/darwin/darwin-tracefile.c index c621129..bb1dc72 100644 --- a/lnet/libcfs/darwin/darwin-tracefile.c +++ b/lnet/libcfs/darwin/darwin-tracefile.c @@ -1,5 +1,5 @@ -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET #define LUSTRE_TRACEFILE_PRIVATE #include #include @@ -14,122 +14,239 @@ extern union trace_data_union trace_data[NR_CPUS]; extern char *tracefile; extern long long tracefile_size; -extern struct rw_semaphore tracefile_sem; extern int trace_start_thread(void); extern void trace_stop_thread(void); long max_debug_mb = M_TCD_MAX_PAGES; static long max_permit_mb = (64 * 1024); -inline struct trace_cpu_data * -__trace_get_tcd (unsigned long *flags) +spinlock_t trace_cpu_serializer; + +/* + * thread currently executing tracefile code or NULL if none does. 
Used to + * detect recursive calls to libcfs_debug_msg(). + */ +static thread_t trace_owner = NULL; + +extern int get_preemption_level(void); +extern atomic_t tage_allocated; + +struct rw_semaphore tracefile_sem; + +int tracefile_init_arch() { + init_rwsem(&tracefile_sem); +#error "Todo: initialise per-cpu console buffers" + return 0; +} + +void tracefile_fini_arch() { +} + +void tracefile_read_lock() { + down_read(&tracefile_sem); +} + +void tracefile_read_unlock() { + up_read(&tracefile_sem); +} + +void tracefile_write_lock() { + down_write(&tracefile_sem); +} + +void tracefile_write_unlock() { + up_write(&tracefile_sem); +} + +char *trace_get_console_buffer(void) +{ +#error "todo: return a per-cpu/interrupt console buffer and disable pre-emption" +} + +void trace_put_console_buffer(char *buffer) +{ +#error "todo: re-enable pre-emption" +} + +struct trace_cpu_data *trace_get_tcd(void) +{ + struct trace_cpu_data *tcd; + int nr_pages; + struct list_head pages; + + /* + * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + + /* + * debugging check for recursive call to libcfs_debug_msg() + */ + if (trace_owner == current_thread()) { + /* + * Cannot assert here. + */ + printk(KERN_EMERG "recursive call to %s", __FUNCTION__); + /* + * "The death of God left the angels in a strange position." + */ + cfs_enter_debugger(); + } + tcd = &trace_data[0].tcd; + CFS_INIT_LIST_HEAD(&pages); + if (get_preemption_level() == 0) + nr_pages = trace_refill_stock(tcd, CFS_ALLOC_STD, &pages); + else + nr_pages = 0; + spin_lock(&trace_cpu_serializer); + trace_owner = current_thread(); + tcd->tcd_cur_stock_pages += nr_pages; + list_splice(&pages, &tcd->tcd_stock_pages); + return tcd; +} + +extern void raw_page_death_row_clean(void); + +void __trace_put_tcd(struct trace_cpu_data *tcd) { - return &trace_data[0].tcd; + /* + * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + LASSERT(trace_owner == current_thread()); + trace_owner = NULL; + spin_unlock(&trace_cpu_serializer); + if (get_preemption_level() == 0) + /* purge all pending pages */ + raw_page_death_row_clean(); } -inline void -__trace_put_tcd (struct trace_cpu_data *tcd, unsigned long flags) +int tcd_owns_tage(struct trace_cpu_data *tcd, struct trace_page *tage) { - return; + /* + * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. 
+ */ + /* XNU has global tcd, and all pages are owned by it */ + return 1; } void -set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask, +set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask, const int line, unsigned long stack) -{ - struct timeval tv; +{ + struct timeval tv; - do_gettimeofday(&tv); - header->ph_subsys = subsys; - header->ph_mask = mask; - header->ph_cpu_id = smp_processor_id(); - header->ph_sec = (__u32)tv.tv_sec; - header->ph_usec = tv.tv_usec; - header->ph_stack = stack; - header->ph_pid = 0; - header->ph_line_num = line; - header->ph_extern_pid = 0; -} - -void print_to_console(struct ptldebug_header *hdr, int mask, char *buf, - int len, char *file, const char *fn) -{ - char *prefix = NULL, *ptype = NULL; - - if ((mask & D_EMERG) != 0) { - prefix = "LustreError"; - ptype = KERN_EMERG; - } else if ((mask & D_ERROR) != 0) { - prefix = "LustreError"; - ptype = KERN_ERR; - } else if ((mask & D_WARNING) != 0) { - prefix = "Lustre"; - ptype = KERN_WARNING; - } else if (portal_printk != 0 || (mask & D_CONSOLE)) { - prefix = "Lustre"; - ptype = KERN_INFO; - } + /* + * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + do_gettimeofday(&tv); + header->ph_subsys = subsys; + header->ph_mask = mask; + header->ph_cpu_id = smp_processor_id(); + header->ph_sec = (__u32)tv.tv_sec; + header->ph_usec = tv.tv_usec; + header->ph_stack = stack; + header->ph_pid = cfs_curproc_pid(); + header->ph_line_num = line; + header->ph_extern_pid = (__u32)current_thread(); +} + +void print_to_console(struct ptldebug_header *hdr, int mask, const char *buf, + int len, const char *file, const char *fn) +{ + char *prefix = "Lustre", *ptype = KERN_INFO; + + /* + * XXX nikita: do NOT call libcfs_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. 
+ */ + if ((mask & D_EMERG) != 0) { + prefix = "LustreError"; + ptype = KERN_EMERG; + } else if ((mask & D_ERROR) != 0) { + prefix = "LustreError"; + ptype = KERN_ERR; + } else if ((mask & D_WARNING) != 0) { + prefix = "Lustre"; + ptype = KERN_WARNING; + } else if ((mask & libcfs_printk) != 0 || (mask & D_CONSOLE)) { + prefix = "Lustre"; + ptype = KERN_INFO; + } if ((mask & D_CONSOLE) != 0) { printk("%s%s: %.*s", ptype, prefix, len, buf); } else { - printk("%s%s: %d:%d:(%s:%d:%s()) %*s", ptype, prefix, hdr->ph_pid, - hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf); + printk("%s%s: %d:%d:(%s:%d:%s()) %*s", + ptype, prefix, hdr->ph_pid, hdr->ph_extern_pid, + file, hdr->ph_line_num, fn, len, buf); } } /* * Sysctl handle of libcfs */ +#define MAX_TRACEFILE_PATH_LEN 256 int cfs_trace_daemon SYSCTL_HANDLER_ARGS { int error = 0; char *name = NULL; - MALLOC(name, char *, req->newlen + 1, M_TEMP, M_WAITOK | M_ZERO); + if (req->newptr == USER_ADDR_NULL) { + /* a read */ + if (tracefile) + error = sysctl_handle_string(oidp, tracefile, 0, req); + else + error = sysctl_handle_string(oidp, "NA", 0, req); + + return error; + } + + /* now hanle write requests */ + MALLOC(name, char *, MAX_TRACEFILE_PATH_LEN + 1, M_TEMP, M_WAITOK | M_ZERO); if (name == NULL) return -ENOMEM; - down_write(&tracefile_sem); - error = sysctl_handle_string(oidp, name, req->newlen + 1, req); - if (!error || req->newptr != NULL) { - /* write */ + name[0] = '\0'; + tracefile_write_lock(); + error = sysctl_handle_string(oidp, name, MAX_TRACEFILE_PATH_LEN + 1, req); + if (!error) { if (strcmp(name, "stop") == 0) { /* stop tracefile daemon */ tracefile = NULL; trace_stop_thread(); - goto out; - }else if (strncmp(name, "size=", 5) == 0) { - tracefile_size = simple_strtoul(name + 5, NULL, 0); - if (tracefile_size < 10 || tracefile_size > 20480) - tracefile_size = TRACEFILE_SIZE; - else - tracefile_size <<= 20; + goto out; + }else if (strncmp(name, "size=", 5) == 0) { + tracefile_size = simple_strtoul(name + 5, NULL, 0); + if (tracefile_size < 10 || tracefile_size > 20480) + tracefile_size = TRACEFILE_SIZE; + else + tracefile_size <<= 20; goto out; } - if (name[0] != '/') { - error = -EINVAL; - goto out; - } - if (tracefile != NULL) + if (name[0] != '/') { + error = -EINVAL; + goto out; + } + if (tracefile != NULL) cfs_free(tracefile); - tracefile = name; - name = NULL; + tracefile = name; + name = NULL; trace_start_thread(); - } else if (req->newptr != NULL) { + } else { /* Something was wrong with the write request */ printf("sysctl debug daemon failed: %d.\n", error); goto out; - } else { - /* Read request */ - SYSCTL_OUT(req, tracefile, sizeof(tracefile)); } out: - if (name != NULL) + if (name != NULL) FREE(name, M_TEMP); - up_write(&tracefile_sem); + tracefile_write_unlock(); return error; } +#undef MAX_TRACEFILE_PATH_LEN int cfs_debug_mb SYSCTL_HANDLER_ARGS @@ -138,27 +255,29 @@ int cfs_debug_mb SYSCTL_HANDLER_ARGS int error = 0; error = sysctl_handle_long(oidp, oidp->oid_arg1, oidp->oid_arg2, req); - if (!error && req->newptr != NULL) { + if (!error && req->newptr != USER_ADDR_NULL) { /* We have a new value stored in the standard location */ if (max_debug_mb <= 0) return -EINVAL; if (max_debug_mb > max_permit_mb) { printf("sysctl debug_mb is too big: %d.\n", max_debug_mb); return 0; - } - for (i = 0; i < NR_CPUS; i++) { - struct trace_cpu_data *tcd; - tcd = &trace_data[i].tcd; + } + for (i = 0; i < NR_CPUS; i++) { + struct trace_cpu_data *tcd; + tcd = &trace_data[i].tcd; tcd->tcd_max_pages = max_debug_mb; } - } else if 
(req->newptr != NULL) { + } else if (req->newptr != USER_ADDR_NULL) { /* Something was wrong with the write request */ printf ("sysctl debug_mb fault: %d.\n", error); - } else { - /* Read request */ - error = SYSCTL_OUT(req, &max_debug_mb, sizeof max_debug_mb); } + return error; } - +void +trace_call_on_all_cpus(void (*fn)(void *arg), void *arg) +{ +#error "tbd" +} diff --git a/lnet/libcfs/darwin/darwin-utils.c b/lnet/libcfs/darwin/darwin-utils.c index 630db6b..cfd7a2d 100644 --- a/lnet/libcfs/darwin/darwin-utils.c +++ b/lnet/libcfs/darwin/darwin-utils.c @@ -22,19 +22,21 @@ * Darwin porting library * Make things easy to port */ -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET #include #include #include #include #include -#include +#include + +#include #ifndef isspace inline int isspace(char c) -{ +{ return (c == ' ' || c == '\t' || c == '\n' || c == '\12'); } #endif @@ -98,12 +100,12 @@ strstr(const char *in, const char *str) char * strrchr(const char *p, int ch) -{ - const char *end = p + strlen(p); - do { - if (*end == (char)ch) - return (char *)end; - } while (--end >= p); +{ + const char *end = p + strlen(p); + do { + if (*end == (char)ch) + return (char *)end; + } while (--end >= p); return NULL; } @@ -273,7 +275,7 @@ int convert_server_error(__u64 ecode) int sign; int code; - static int errno_xlate[] = { + static int errno_xlate[] = { /* success is always success */ [0] = 0, [LINUX_EPERM] = EPERM, @@ -358,7 +360,8 @@ int convert_server_error(__u64 ecode) [LINUX_ELIBMAX] = EINVAL /* ELIBMAX */, [LINUX_ELIBEXEC] = EINVAL /* ELIBEXEC */, [LINUX_EILSEQ] = EILSEQ, - [LINUX_ERESTART] = ERESTART, + [LINUX_ERESTART] = EINVAL /* because ERESTART is + * negative in XNU */, [LINUX_ESTRPIPE] = EINVAL /* ESTRPIPE */, [LINUX_EUSERS] = EUSERS, [LINUX_ENOTSOCK] = ENOTSOCK, @@ -398,22 +401,19 @@ int convert_server_error(__u64 ecode) [LINUX_EDQUOT] = EDQUOT, [LINUX_ENOMEDIUM] = EINVAL /* ENOMEDIUM */, [LINUX_EMEDIUMTYPE] = EINVAL /* EMEDIUMTYPE */, - }; + }; code = (int)ecode; - if (code >= 0) { + if (code >= 0) { sign = +1; } else { sign = -1; code = -code; } - if (code < (sizeof errno_xlate) / (sizeof errno_xlate[0])) + if (code < (sizeof errno_xlate) / (sizeof errno_xlate[0])) { code = errno_xlate[code]; - else - /* - * Unknown error. Reserved for the future. 
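convert_server_error() keeps the sign of the incoming code and translates only its magnitude through the table, so both the positive and negative errno conventions survive the mapping. Reduced to its skeleton (the three-entry table here is illustrative; the real table covers the full Linux errno range):

    /* Sketch: sign-preserving errno translation. */
    static int xlate_errno(int code)
    {
            static const int table[] = {
                    [0] = 0,        /* success maps to success */
                    [1] = 1,        /* EPERM */
                    [2] = 2,        /* ENOENT */
            };
            int sign = 1;

            if (code < 0) {
                    sign = -1;
                    code = -code;
            }
            if (code < (int)(sizeof table / sizeof table[0]))
                    code = table[code];
            return sign * code;
    }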
- */ - code = EINVAL; - return sign * code; + LASSERT(code >= 0); + } + return sign * code; } enum { @@ -448,7 +448,7 @@ static inline void obit_convert(int *cflag, int *sflag, */ int convert_client_oflag(int cflag, int *result) { - int sflag; + int sflag = 0; cflag = 0; obit_convert(&cflag, &sflag, O_RDONLY, LINUX_O_RDONLY); @@ -480,3 +480,99 @@ int convert_client_oflag(int cflag, int *result) } else return -EINVAL; } + +#ifdef __DARWIN8__ +#else /* !__DARWIN8__ */ +extern int unix_syscall(); +extern int unix_syscall_return(); + +extern int ktrsysret(); +extern int ktrace(); + +extern int ast_taken(); +extern int ast_check(); + +extern int trap(); +extern int syscall_trace(); + +static int is_addr_in_range(void *addr, void *start, void *end) +{ + return start <= addr && addr <= end; +} + +extern void cfs_thread_agent (void); + +static int is_last_frame(void *addr) +{ + if (addr == NULL) + return 1; + else if (is_addr_in_range(addr, unix_syscall, unix_syscall_return)) + return 1; + else if (is_addr_in_range(addr, ktrsysret, ktrace)) + return 1; + else if (is_addr_in_range(addr, ast_taken, ast_check)) + return 1; + else if (is_addr_in_range(addr, trap, syscall_trace)) + return 1; + else if (is_addr_in_range(addr, cfs_thread_agent, cfs_kernel_thread)) + return 1; + else + return 0; +} + +static void *get_frame(int i) +{ + void *result; + +#define CASE(i) case (i): result = __builtin_return_address(i); break + switch (i + 1) { + CASE(1); + CASE(2); + CASE(3); + CASE(4); + CASE(5); + CASE(6); + CASE(7); + CASE(8); + CASE(9); + CASE(10); + CASE(11); + CASE(12); + CASE(13); + CASE(14); + CASE(15); + CASE(16); + CASE(17); + CASE(18); + CASE(19); + CASE(20); + default: + panic("impossible frame number: %d\n", i); + result = NULL; + } + return result; +} + +void cfs_stack_trace_fill(struct cfs_stack_trace *trace) +{ + int i; + + memset(trace, 0, sizeof *trace); + for (i = 0; i < sizeof_array(trace->frame); ++ i) { + void *addr; + + addr = get_frame(i); + trace->frame[i] = addr; + if (is_last_frame(addr)) + break; + } +} + +void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no) +{ + if (0 <= frame_no && frame_no < sizeof_array(trace->frame)) + return trace->frame[frame_no]; + else + return NULL; +} +#endif /* !__DARWIN8__ */ diff --git a/lnet/libcfs/debug.c b/lnet/libcfs/debug.c index 3ef33d8..18bc5d5 100644 --- a/lnet/libcfs/debug.c +++ b/lnet/libcfs/debug.c @@ -24,64 +24,348 @@ # define EXPORT_SYMTAB #endif -# define DEBUG_SUBSYSTEM S_PORTALS +# define DEBUG_SUBSYSTEM S_LNET #include #include - #include "tracefile.h" -unsigned int portal_subsystem_debug = ~0 - (S_PORTALS); -EXPORT_SYMBOL(portal_subsystem_debug); +static char debug_file_name[1024]; -unsigned int portal_debug = (D_WARNING | D_DLMTRACE | D_ERROR | D_EMERG | D_HA | - D_RPCTRACE | D_VFSTRACE | D_CONFIG | D_IOCTL | - D_CONSOLE); -EXPORT_SYMBOL(portal_debug); +#ifdef __KERNEL__ +unsigned int libcfs_subsystem_debug = ~0; +EXPORT_SYMBOL(libcfs_subsystem_debug); -unsigned int portal_printk; -EXPORT_SYMBOL(portal_printk); +unsigned int libcfs_debug = (D_EMERG | D_ERROR | D_WARNING | D_CONSOLE | + D_NETERROR | D_HA | D_CONFIG | D_IOCTL | + D_DLMTRACE | D_RPCTRACE | D_VFSTRACE); +EXPORT_SYMBOL(libcfs_debug); -unsigned int portal_stack; -EXPORT_SYMBOL(portal_stack); +unsigned int libcfs_printk; +EXPORT_SYMBOL(libcfs_printk); -unsigned int portals_catastrophe; -EXPORT_SYMBOL(portals_catastrophe); +unsigned int libcfs_console_ratelimit = 1; +EXPORT_SYMBOL(libcfs_console_ratelimit); -#ifdef __KERNEL__ -atomic_t portal_kmemory = 
ATOMIC_INIT(0); -EXPORT_SYMBOL(portal_kmemory); -#endif +unsigned int libcfs_debug_binary = 1; +EXPORT_SYMBOL(libcfs_debug_binary); + +unsigned int libcfs_stack; +EXPORT_SYMBOL(libcfs_stack); + +unsigned int portal_enter_debugger; +EXPORT_SYMBOL(portal_enter_debugger); + +unsigned int libcfs_catastrophe; +EXPORT_SYMBOL(libcfs_catastrophe); + +atomic_t libcfs_kmemory = ATOMIC_INIT(0); +EXPORT_SYMBOL(libcfs_kmemory); static cfs_waitq_t debug_ctlwq; char debug_file_path[1024] = "/tmp/lustre-log"; -static char debug_file_name[1024]; -void portals_debug_dumplog_internal(void *arg) +int libcfs_panic_in_progress; + +/* libcfs_debug_token2mask() expects the returned + * string in lower-case */ +const char * +libcfs_debug_subsys2str(int subsys) +{ + switch (subsys) { + default: + return NULL; + case S_UNDEFINED: + return "undefined"; + case S_MDC: + return "mdc"; + case S_MDS: + return "mds"; + case S_OSC: + return "osc"; + case S_OST: + return "ost"; + case S_CLASS: + return "class"; + case S_LOG: + return "log"; + case S_LLITE: + return "llite"; + case S_RPC: + return "rpc"; + case S_LNET: + return "lnet"; + case S_LND: + return "lnd"; + case S_PINGER: + return "pinger"; + case S_FILTER: + return "filter"; + case S_ECHO: + return "echo"; + case S_LDLM: + return "ldlm"; + case S_LOV: + return "lov"; + case S_LMV: + return "lmv"; + case S_SEC: + return "sec"; + case S_GSS: + return "gss"; + case S_MGC: + return "mgc"; + case S_MGS: + return "mgs"; + case S_FID: + return "fid"; + case S_FLD: + return "fld"; + } +} + +/* libcfs_debug_token2mask() expects the returned + * string in lower-case */ +const char * +libcfs_debug_dbg2str(int debug) +{ + switch (debug) { + default: + return NULL; + case D_TRACE: + return "trace"; + case D_INODE: + return "inode"; + case D_SUPER: + return "super"; + case D_EXT2: + return "ext2"; + case D_MALLOC: + return "malloc"; + case D_CACHE: + return "cache"; + case D_INFO: + return "info"; + case D_IOCTL: + return "ioctl"; + case D_NETERROR: + return "neterror"; + case D_NET: + return "net"; + case D_WARNING: + return "warning"; + case D_BUFFS: + return "buffs"; + case D_OTHER: + return "other"; + case D_DENTRY: + return "dentry"; + case D_PAGE: + return "page"; + case D_DLMTRACE: + return "dlmtrace"; + case D_ERROR: + return "error"; + case D_EMERG: + return "emerg"; + case D_HA: + return "ha"; + case D_RPCTRACE: + return "rpctrace"; + case D_VFSTRACE: + return "vfstrace"; + case D_READA: + return "reada"; + case D_MMAP: + return "mmap"; + case D_CONFIG: + return "config"; + case D_CONSOLE: + return "console"; + case D_QUOTA: + return "quota"; + case D_SEC: + return "sec"; + } +} + +int +libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys) +{ + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int len = 0; + const char *token; + int bit; + int i; + + if (mask == 0) { /* "0" */ + if (size > 0) + str[0] = '0'; + len = 1; + } else { /* space-separated tokens */ + for (i = 0; i < 32; i++) { + bit = 1 << i; + + if ((mask & bit) == 0) + continue; + + token = fn(bit); + if (token == NULL) /* unused bit */ + continue; + + if (len > 0) { /* separator? 
*/ + if (len < size) + str[len] = ' '; + len++; + } + + while (*token != 0) { + if (len < size) + str[len] = *token; + token++; + len++; + } + } + } + + /* terminate 'str' */ + if (len < size) + str[len] = 0; + else + str[size - 1] = 0; + + return len; +} + +int +libcfs_debug_token2mask(int *mask, const char *str, int len, int is_subsys) +{ + const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : + libcfs_debug_dbg2str; + int i; + int j; + int bit; + const char *token; + + /* match against known tokens */ + for (i = 0; i < 32; i++) { + bit = 1 << i; + + token = fn(bit); + if (token == NULL) /* unused? */ + continue; + + /* strcasecmp */ + for (j = 0; ; j++) { + if (j == len) { /* end of token */ + if (token[j] == 0) { + *mask = bit; + return 0; + } + break; + } + + if (token[j] == 0) + break; + + if (str[j] == token[j]) + continue; + + if (str[j] < 'A' || 'Z' < str[j]) + break; + + if (str[j] - 'A' + 'a' != token[j]) + break; + } + } + + return -EINVAL; /* no match */ +} + +int +libcfs_debug_str2mask(int *mask, const char *str, int is_subsys) +{ + int m = 0; + int matched = 0; + char op = 0; + int n; + int t; + + /* must be a list of debug tokens or numbers separated by + * whitespace and optionally an operator ('+' or '-'). If an operator + * appears first in , '*mask' is used as the starting point + * (relative), otherwise 0 is used (absolute). An operator applies to + * all following tokens up to the next operator. */ + + while (*str != 0) { + while (isspace(*str)) /* skip whitespace */ + str++; + + if (*str == 0) + break; + + if (*str == '+' || *str == '-') { + op = *str++; + + /* op on first token == relative */ + if (!matched) + m = *mask; + + while (isspace(*str)) /* skip whitespace */ + str++; + + if (*str == 0) /* trailing op */ + return -EINVAL; + } + + /* find token length */ + for (n = 0; str[n] != 0 && !isspace(str[n]); n++); + + /* match token */ + if (libcfs_debug_token2mask(&t, str, n, is_subsys) != 0) + return -EINVAL; + + matched = 1; + if (op == '-') + m &= ~t; + else + m |= t; + + str += n; + } + + if (!matched) + return -EINVAL; + + *mask = m; + return 0; +} + +void libcfs_debug_dumplog_internal(void *arg) { CFS_DECL_JOURNAL_DATA; CFS_PUSH_JOURNAL; - snprintf(debug_file_name, sizeof(debug_file_path) - 1, - "%s.%ld.%ld", debug_file_path, cfs_time_current_sec(), (long)arg); + snprintf(debug_file_name, sizeof(debug_file_path) - 1, "%s.%ld.%ld", + debug_file_path, cfs_time_current_sec(), (long)arg); printk(KERN_ALERT "LustreError: dumping log to %s\n", debug_file_name); tracefile_dump_all_pages(debug_file_name); CFS_POP_JOURNAL; } -int portals_debug_dumplog_thread(void *arg) +int libcfs_debug_dumplog_thread(void *arg) { - kportal_daemonize(""); - reparent_to_init(); - portals_debug_dumplog_internal(arg); + cfs_daemonize(""); + libcfs_debug_dumplog_internal(arg); cfs_waitq_signal(&debug_ctlwq); return 0; } -void portals_debug_dumplog(void) +void libcfs_debug_dumplog(void) { int rc; cfs_waitlink_t wait; @@ -94,90 +378,51 @@ void portals_debug_dumplog(void) set_current_state(TASK_INTERRUPTIBLE); cfs_waitq_add(&debug_ctlwq, &wait); - rc = cfs_kernel_thread(portals_debug_dumplog_thread, + rc = cfs_kernel_thread(libcfs_debug_dumplog_thread, (void *)(long)cfs_curproc_pid(), CLONE_VM | CLONE_FS | CLONE_FILES); if (rc < 0) printk(KERN_ERR "LustreError: cannot start log dump thread: " "%d\n", rc); else - schedule(); + cfs_waitq_wait(&wait, CFS_TASK_INTERRUPTIBLE); /* be sure to teardown if kernel_thread() failed */ cfs_waitq_del(&debug_ctlwq, &wait); 
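As the comment in libcfs_debug_str2mask() explains, a leading '+' or '-' makes the edit relative to the current mask, while a bare token list replaces it outright. A usage sketch of both modes (mask values are whatever the build defines; return codes ignored here):

    /* Sketch: absolute vs. relative forms accepted by
     * libcfs_debug_str2mask(). */
    static void tune_debug_mask(void)
    {
            int mask = D_ERROR | D_WARNING;

            /* no leading operator: mask becomes exactly these bits */
            (void)libcfs_debug_str2mask(&mask, "net ioctl", 0);

            /* leading operator: edit starts from the current mask */
            (void)libcfs_debug_str2mask(&mask, "-warning +trace", 0);
    }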
set_current_state(TASK_RUNNING); } -#ifdef PORTALS_DUMP_ON_PANIC -static int panic_dumplog(struct notifier_block *self, unsigned long unused1, - void *unused2) +int libcfs_debug_init(unsigned long bufsize) { - static int handled_panic; /* to avoid recursive calls to notifiers */ + int rc; - if (handled_panic) - return 0; - else - handled_panic = 1; - - if (in_interrupt()) { - trace_debug_print(); - return 0; - } - - while (current->lock_depth >= 0) - unlock_kernel(); - portals_debug_dumplog(); - return 0; -} - -static struct notifier_block lustre_panic_notifier = { - notifier_call : panic_dumplog, - next : NULL, - priority : 10000 -}; -#endif + cfs_waitq_init(&debug_ctlwq); + rc = tracefile_init(); -#ifdef CRAY_PORTALS -extern void *lus_portals_debug; -#endif + if (rc == 0) + libcfs_register_panic_notifier(); -int portals_debug_init(unsigned long bufsize) -{ - cfs_waitq_init(&debug_ctlwq); -#ifdef CRAY_PORTALS - lus_portals_debug = &portals_debug_msg; -#endif -#ifdef PORTALS_DUMP_ON_PANIC - /* This is currently disabled because it spews far too much to the - * console on the rare cases it is ever triggered. */ - notifier_chain_register(&panic_notifier_list, &lustre_panic_notifier); -#endif - return tracefile_init(); + return rc; } -int portals_debug_cleanup(void) +int libcfs_debug_cleanup(void) { + libcfs_unregister_panic_notifier(); tracefile_exit(); -#ifdef PORTALS_DUMP_ON_PANIC - notifier_chain_unregister(&panic_notifier_list, &lustre_panic_notifier); -#endif -#ifdef CRAY_PORTALS - lus_portals_debug = NULL; -#endif return 0; } -int portals_debug_clear_buffer(void) +int libcfs_debug_clear_buffer(void) { trace_flush_pages(); return 0; } -/* Debug markers, although printed by S_PORTALS +/* Debug markers, although printed by S_LNET * should not be marked as such.
*/ #undef DEBUG_SUBSYSTEM #define DEBUG_SUBSYSTEM S_UNDEFINED -int portals_debug_mark_buffer(char *text) +int libcfs_debug_mark_buffer(char *text) { CDEBUG(D_TRACE,"***************************************************\n"); CDEBUG(D_WARNING, "DEBUG MARKER: %s\n", text); @@ -186,75 +431,293 @@ int portals_debug_mark_buffer(char *text) return 0; } #undef DEBUG_SUBSYSTEM -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET -void portals_debug_set_level(unsigned int debug_level) +void libcfs_debug_set_level(unsigned int debug_level) { printk(KERN_WARNING "Lustre: Setting portals debug level to %08x\n", debug_level); - portal_debug = debug_level; + libcfs_debug = debug_level; +} + +EXPORT_SYMBOL(libcfs_debug_dumplog); +EXPORT_SYMBOL(libcfs_debug_set_level); + + +#else /* !__KERNEL__ */ + +#include + +#ifdef HAVE_SYS_USER_H +# include +#endif + +#ifdef HAVE_CATAMOUNT_DATA_H +#include +#include + +static char source_nid[16]; +/* 0 indicates no messages to console, 1 is errors, > 1 is all debug messages */ +static int toconsole = 1; +unsigned int libcfs_console_ratelimit = 1; +#else /* !HAVE_CATAMOUNT_DATA_H */ +#ifdef HAVE_NETDB_H +#include +#endif /* HAVE_CATAMOUNT_DATA_H */ +struct utsname *tmp_utsname; +static char source_nid[sizeof(tmp_utsname->nodename)]; +#endif /* __KERNEL__ */ + +static int source_pid; +int smp_processor_id = 1; +char debug_file_path[1024]; +FILE *debug_file_fd; + +int portals_do_debug_dumplog(void *arg) +{ + printf("Look in %s\n", debug_file_name); + return 0; } -char *portals_nid2str(int nal, ptl_nid_t nid, char *str) + +void portals_debug_print(void) { - if (nid == PTL_NID_ANY) { - snprintf(str, PTL_NALFMT_SIZE, "%s", "PTL_NID_ANY"); - return str; - } + return; +} - switch(NALID_FROM_IFACE(nal)){ -/* XXX this could be a nal method of some sort, 'cept it's config - * dependent whether (say) socknal NIDs are actually IP addresses... 
*/ -#if !CRAY_PORTALS - case TCPNAL: - /* userspace NAL */ - case IIBNAL: - case VIBNAL: - case OPENIBNAL: - case RANAL: - case SOCKNAL: { - /* HIPQUAD requires __u32, but we can't cast in it */ - __u32 nid32 = (__u32)nid; - if ((__u32)(nid >> 32)) { - snprintf(str, PTL_NALFMT_SIZE, "%u:%u.%u.%u.%u", - (__u32)(nid >> 32), HIPQUAD(nid32)); - } else { - snprintf(str, PTL_NALFMT_SIZE, "%u.%u.%u.%u", - HIPQUAD(nid32)); - } - break; + +void libcfs_debug_dumplog(void) +{ + printf("Look in %s\n", debug_file_name); + return; +} + +int libcfs_debug_init(unsigned long bufsize) +{ + char *debug_mask = NULL; + char *debug_subsys = NULL; + char *debug_filename; + +#ifdef HAVE_CATAMOUNT_DATA_H + char *debug_console = NULL; + char *debug_ratelimit = NULL; + + snprintf(source_nid, sizeof(source_nid) - 1, "%u", _my_pnid); + source_pid = _my_pid; + + debug_console = getenv("LIBLUSTRE_DEBUG_CONSOLE"); + if (debug_console != NULL) { + toconsole = strtoul(debug_console, NULL, 0); + CDEBUG(D_INFO, "set liblustre toconsole to %u\n", toconsole); + } + debug_ratelimit = getenv("LIBLUSTRE_DEBUG_CONSOLE_RATELIMIT"); + if (debug_ratelimit != NULL) { + libcfs_console_ratelimit = strtoul(debug_ratelimit, NULL, 0); + CDEBUG(D_INFO, "set liblustre console ratelimit to %u\n", libcfs_console_ratelimit); } - case QSWNAL: - case GMNAL: - case LONAL: - snprintf(str, PTL_NALFMT_SIZE, "%u:%u", - (__u32)(nid >> 32), (__u32)nid); - break; #else - case PTL_IFACE_SS: - case PTL_IFACE_SS_ACCEL: - snprintf(str, PTL_NALFMT_SIZE, "%u", (__u32)nid); - break; + struct utsname myname; + + if (uname(&myname) == 0) + strcpy(source_nid, myname.nodename); + source_pid = getpid(); #endif - default: - snprintf(str, PTL_NALFMT_SIZE, "?%x? %llx", - nal, (long long)nid); - break; + /* debug masks */ + debug_mask = getenv("LIBLUSTRE_DEBUG_MASK"); + if (debug_mask) + libcfs_debug = (unsigned int) strtol(debug_mask, NULL, 0); + + debug_subsys = getenv("LIBLUSTRE_DEBUG_SUBSYS"); + if (debug_subsys) + libcfs_subsystem_debug = + (unsigned int) strtol(debug_subsys, NULL, 0); + + debug_filename = getenv("LIBLUSTRE_DEBUG_BASE"); + if (debug_filename) + strncpy(debug_file_path,debug_filename,sizeof(debug_file_path)); + + debug_filename = getenv("LIBLUSTRE_DEBUG_FILE"); + if (debug_filename) + strncpy(debug_file_name,debug_filename,sizeof(debug_file_path)); + + if (debug_file_name[0] == '\0' && debug_file_path[0] != '\0') + snprintf(debug_file_name, sizeof(debug_file_name) - 1, + "%s-%s-%lu.log", debug_file_path, source_nid, time(0)); + + if (strcmp(debug_file_name, "stdout") == 0 || + strcmp(debug_file_name, "-") == 0) { + debug_file_fd = stdout; + } else if (strcmp(debug_file_name, "stderr") == 0) { + debug_file_fd = stderr; + } else if (debug_file_name[0] != '\0') { + debug_file_fd = fopen(debug_file_name, "w"); + if (debug_file_fd == NULL) + fprintf(stderr, "%s: unable to open '%s': %s\n", + source_nid, debug_file_name, strerror(errno)); } - return str; + + if (debug_file_fd == NULL) + debug_file_fd = stdout; + + return 0; } -char *portals_id2str(int nal, ptl_process_id_t id, char *str) +int libcfs_debug_cleanup(void) { - int len; + if (debug_file_fd != stdout && debug_file_fd != stderr) + fclose(debug_file_fd); + return 0; +} - portals_nid2str(nal, id.nid, str); - len = strlen(str); - snprintf(str + len, PTL_NALFMT_SIZE - len, "-%u", id.pid); - return str; +int libcfs_debug_clear_buffer(void) +{ + return 0; +} + +int libcfs_debug_mark_buffer(char *text) +{ + + fprintf(debug_file_fd, 
"*******************************************************************************\n"); + fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text); + fprintf(debug_file_fd, "*******************************************************************************\n"); + + return 0; +} + +#ifdef HAVE_CATAMOUNT_DATA_H +#define CATAMOUNT_MAXLINE (256-4) +void catamount_printline(char *buf, size_t size) +{ + char *pos = buf; + int prsize = size; + + while (prsize > 0){ + lputs(pos); + pos += CATAMOUNT_MAXLINE; + prsize -= CATAMOUNT_MAXLINE; + } +} +#endif + +int +libcfs_debug_vmsg2(cfs_debug_limit_state_t *cdls, + int subsys, int mask, + const char *file, const char *fn, const int line, + const char *format1, va_list args, + const char *format2, ...) +{ + struct timeval tv; + int nob; + int remain; + va_list ap; + char buf[PAGE_SIZE]; /* size 4096 used for compatimble with linux, + * where message can`t be exceed PAGE_SIZE */ + int console = 0; + char *prefix = "Lustre"; + +#ifdef HAVE_CATAMOUNT_DATA_H + /* toconsole == 0 - all messages to debug_file_fd + * toconsole == 1 - warnings to console, all to debug_file_fd + * toconsole > 1 - all debug to console */ + if ( ((mask & D_CANTMASK) && + (toconsole == 1)) || (toconsole > 1)) { + console = 1; + } +#endif + + if ((!console) && (!debug_file_fd)) { + return 0; + } + + if (mask & (D_EMERG | D_ERROR)) + prefix = "LustreError"; + + nob = snprintf(buf, sizeof(buf), "%s: %u-%s:(%s:%d:%s()): ", prefix, + source_pid, source_nid, file, line, fn); + + remain = sizeof(buf) - nob; + if (format1) { + nob += vsnprintf(&buf[nob], remain, format1, args); + } + + remain = sizeof(buf) - nob; + if ((format2) && (remain > 0)) { + va_start(ap, format2); + nob += vsnprintf(&buf[nob], remain, format2, ap); + va_end(ap); + } + +#ifdef HAVE_CATAMOUNT_DATA_H + if (console) { + /* check rate limit for console */ + if (cdls != NULL) { + cfs_time_t t = cdls->cdls_next + + cfs_time_seconds(CDEBUG_MAX_LIMIT + 10); + cfs_duration_t dmax = cfs_time_seconds(CDEBUG_MAX_LIMIT); + + if (libcfs_console_ratelimit && + cdls->cdls_next != 0 && /* not first time ever */ + !cfs_time_after(cfs_time_current(), cdls->cdls_next)) { + + /* skipping a console message */ + cdls->cdls_count++; + goto out_file; + } + + if (cfs_time_after(cfs_time_current(), t)) { + /* last timeout was a long time ago */ + cdls->cdls_delay /= 8; + } else { + cdls->cdls_delay *= 2; + + if (cdls->cdls_delay < CFS_TICK) + cdls->cdls_delay = CFS_TICK; + else if (cdls->cdls_delay > dmax) + cdls->cdls_delay = dmax; + } + + /* ensure cdls_next is never zero after it's been seen */ + cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1; + } + + if (cdls != NULL && cdls->cdls_count != 0) { + char buf2[100]; + + nob = snprintf(buf2, sizeof(buf2), + "Skipped %d previous similar message%s\n", + cdls->cdls_count, (cdls->cdls_count > 1) ? 
"s" : ""); + + catamount_printline(buf2, nob); + cdls->cdls_count = 0; + goto out_file; + } + catamount_printline(buf, nob); + } +out_file: + /* return on toconsole > 1, as we don't want the user getting + * spammed by the debug data */ + if (toconsole > 1) + return 0; +#endif + if (debug_file_fd == NULL) + return 0; + + gettimeofday(&tv, NULL); + + fprintf(debug_file_fd, "%lu.%06lu:%u:%s:(%s:%d:%s()): %s", + tv.tv_sec, tv.tv_usec, source_pid, source_nid, + file, line, fn, buf); + + return 0; +} + +void +libcfs_assertion_failed(const char *expr, const char *file, const char *func, + const int line) +{ + libcfs_debug_msg(NULL, 0, D_EMERG, file, func, line, + "ASSERTION(%s) failed\n", expr); + abort(); } -EXPORT_SYMBOL(portals_debug_dumplog); -EXPORT_SYMBOL(portals_debug_set_level); -EXPORT_SYMBOL(portals_nid2str); -EXPORT_SYMBOL(portals_id2str); +#endif /* __KERNEL__ */ diff --git a/lnet/libcfs/linux/Makefile.am b/lnet/libcfs/linux/Makefile.am index 49f8e87..8bf35cc 100644 --- a/lnet/libcfs/linux/Makefile.am +++ b/lnet/libcfs/linux/Makefile.am @@ -1,4 +1,4 @@ EXTRA_DIST := linux-debug.c linux-lwt.c linux-prim.c linux-tracefile.c \ linux-fs.c linux-mem.c linux-proc.c linux-utils.c linux-lock.c \ - linux-module.c linux-sync.c linux-curproc.c + linux-module.c linux-sync.c linux-curproc.c linux-tcpip.c diff --git a/lnet/libcfs/linux/linux-curproc.c b/lnet/libcfs/linux/linux-curproc.c index 719e48b..e446169 100644 --- a/lnet/libcfs/linux/linux-curproc.c +++ b/lnet/libcfs/linux/linux-curproc.c @@ -20,7 +20,7 @@ #include -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET #include #include diff --git a/lnet/libcfs/linux/linux-debug.c b/lnet/libcfs/linux/linux-debug.c index 62779de..abc07b2 100644 --- a/lnet/libcfs/linux/linux-debug.c +++ b/lnet/libcfs/linux/linux-debug.c @@ -24,7 +24,9 @@ # define EXPORT_SYMTAB #endif +#ifdef HAVE_KERNEL_CONFIG_H #include +#endif #include #include #include @@ -47,7 +49,7 @@ #include #include -# define DEBUG_SUBSYSTEM S_PORTALS +# define DEBUG_SUBSYSTEM S_LNET #include #include @@ -59,9 +61,9 @@ #include #endif -char portals_upcall[1024] = "/usr/lib/lustre/portals_upcall"; +char lnet_upcall[1024] = "/usr/lib/lustre/lnet_upcall"; -void portals_run_upcall(char **argv) +void libcfs_run_upcall(char **argv) { int rc; int argc; @@ -71,7 +73,7 @@ void portals_run_upcall(char **argv) NULL}; ENTRY; - argv[0] = portals_upcall; + argv[0] = lnet_upcall; argc = 1; while (argv[argc] != NULL) argc++; @@ -80,15 +82,15 @@ void portals_run_upcall(char **argv) rc = USERMODEHELPER(argv[0], argv, envp); if (rc < 0) { - CERROR("Error %d invoking portals upcall %s %s%s%s%s%s%s%s%s; " - "check /proc/sys/portals/upcall\n", + CERROR("Error %d invoking LNET upcall %s %s%s%s%s%s%s%s%s; " + "check /proc/sys/lnet/upcall\n", rc, argv[0], argv[1], argc < 3 ? "" : ",", argc < 3 ? "" : argv[2], argc < 4 ? "" : ",", argc < 4 ? "" : argv[3], argc < 5 ? "" : ",", argc < 5 ? "" : argv[4], argc < 6 ? "" : ",..."); } else { - CWARN("Invoked portals upcall %s %s%s%s%s%s%s%s%s\n", + CWARN("Invoked LNET upcall %s %s%s%s%s%s%s%s%s\n", argv[0], argv[1], argc < 3 ? "" : ",", argc < 3 ? "" : argv[2], argc < 4 ? "" : ",", argc < 4 ? 
"" : argv[3], @@ -97,7 +99,7 @@ void portals_run_upcall(char **argv) } } -void portals_run_lbug_upcall(char *file, const char *fn, const int line) +void libcfs_run_lbug_upcall(char *file, const char *fn, const int line) { char *argv[6]; char buf[32]; @@ -111,18 +113,50 @@ void portals_run_lbug_upcall(char *file, const char *fn, const int line) argv[4] = buf; argv[5] = NULL; - portals_run_upcall (argv); + libcfs_run_upcall (argv); +} + +#ifdef __arch_um__ +void lbug_with_loc(char *file, const char *func, const int line) +{ + libcfs_catastrophe = 1; + libcfs_debug_msg(NULL, 0, D_EMERG, file, func, line, + "LBUG - trying to dump log to /tmp/lustre-log\n"); + libcfs_debug_dumplog(); + libcfs_run_lbug_upcall(file, func, line); + asm("int $3"); + panic("LBUG"); } +#else +/* coverity[+kill] */ +void lbug_with_loc(char *file, const char *func, const int line) +{ + libcfs_catastrophe = 1; + libcfs_debug_msg(NULL, 0, D_EMERG, file, func, line, "LBUG\n"); + + if (in_interrupt()) { + panic("LBUG in interrupt.\n"); + /* not reached */ + } + + libcfs_debug_dumpstack(NULL); + libcfs_debug_dumplog(); + libcfs_run_lbug_upcall(file, func, line); + set_task_state(current, TASK_UNINTERRUPTIBLE); + while (1) + schedule(); +} +#endif /* __arch_um__ */ #ifdef __KERNEL__ -void portals_debug_dumpstack(struct task_struct *tsk) +void libcfs_debug_dumpstack(struct task_struct *tsk) { #if defined(__arch_um__) if (tsk != NULL) CWARN("stack dump for pid %d (%d) requested; wake up gdb.\n", tsk->pid, UML_PID(tsk)); - asm("int $3"); + //asm("int $3"); #elif defined(HAVE_SHOW_TASK) /* this is exported by lustre kernel version 42 */ extern void show_task(struct task_struct *); @@ -133,18 +167,71 @@ void portals_debug_dumpstack(struct task_struct *tsk) show_task(tsk); #else CWARN("can't show stack: kernel doesn't export show_task\n"); + if ((tsk == NULL) || (tsk == current)) + dump_stack(); #endif } -cfs_task_t *portals_current(void) +cfs_task_t *libcfs_current(void) { CWARN("current task struct is %p\n", current); return current; } -EXPORT_SYMBOL(portals_debug_dumpstack); -EXPORT_SYMBOL(portals_current); + +static int panic_notifier(struct notifier_block *self, unsigned long unused1, + void *unused2) +{ + if (libcfs_panic_in_progress) + return 0; + + libcfs_panic_in_progress = 1; + mb(); + +#ifdef LNET_DUMP_ON_PANIC + /* This is currently disabled because it spews far too much to the + * console on the rare cases it is ever triggered. 
*/ + + if (in_interrupt()) { + trace_debug_print(); + } else { + while (current->lock_depth >= 0) + unlock_kernel(); + + libcfs_debug_dumplog_internal((void *)(long)cfs_curproc_pid()); + } +#endif + return 0; +} + +static struct notifier_block libcfs_panic_notifier = { + notifier_call : panic_notifier, + next : NULL, + priority : 10000 +}; + +void libcfs_register_panic_notifier(void) +{ +#ifdef HAVE_ATOMIC_PANIC_NOTIFIER + atomic_notifier_chain_register(&panic_notifier_list, &libcfs_panic_notifier); +#else + notifier_chain_register(&panic_notifier_list, &libcfs_panic_notifier); +#endif +} + +void libcfs_unregister_panic_notifier(void) +{ +#ifdef HAVE_ATOMIC_PANIC_NOTIFIER + atomic_notifier_chain_unregister(&panic_notifier_list, &libcfs_panic_notifier); +#else + notifier_chain_unregister(&panic_notifier_list, &libcfs_panic_notifier); +#endif +} + +EXPORT_SYMBOL(libcfs_debug_dumpstack); +EXPORT_SYMBOL(libcfs_current); #endif /* __KERNEL__ */ -EXPORT_SYMBOL(portals_run_upcall); -EXPORT_SYMBOL(portals_run_lbug_upcall); +EXPORT_SYMBOL(libcfs_run_upcall); +EXPORT_SYMBOL(libcfs_run_lbug_upcall); +EXPORT_SYMBOL(lbug_with_loc); diff --git a/lnet/libcfs/linux/linux-fs.c b/lnet/libcfs/linux/linux-fs.c index 61b7166..061944c 100644 --- a/lnet/libcfs/linux/linux-fs.c +++ b/lnet/libcfs/linux/linux-fs.c @@ -1,6 +1,7 @@ -# define DEBUG_SUBSYSTEM S_PORTALS +# define DEBUG_SUBSYSTEM S_LNET #include +#include #include #include @@ -14,12 +15,12 @@ cfs_filp_open (const char *name, int flags, int mode, int *err) */ cfs_file_t *filp = NULL; - filp = filp_open(name, flags, mode); - if (IS_ERR(filp)) { + filp = filp_open(name, flags, mode); + if (IS_ERR(filp)) { int rc; - rc = PTR_ERR(filp); - printk(KERN_ERR "LustreError: can't open %s file: err %d\n", + rc = PTR_ERR(filp); + printk(KERN_ERR "LustreError: can't open %s file: err %d\n", name, rc); if (err) *err = rc; @@ -28,4 +29,87 @@ cfs_filp_open (const char *name, int flags, int mode, int *err) return filp; } +/* write a userspace buffer to disk. + * NOTE: this returns 0 on success, not the number of bytes written. */ +ssize_t +cfs_user_write (cfs_file_t *filp, const char *buf, size_t count, loff_t *offset) +{ + mm_segment_t fs; + ssize_t size = 0; + + fs = get_fs(); + set_fs(KERNEL_DS); + while (count > 0) { + size = filp->f_op->write(filp, (char *)buf, count, offset); + if (size < 0) + break; + count -= size; + size = 0; + } + set_fs(fs); + + return size; +} + +cfs_rdev_t cfs_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor) +{ + return MKDEV(major, minor); +} + +cfs_major_nr_t cfs_rdev_major(cfs_rdev_t rdev) +{ + return MAJOR(rdev); +} + +cfs_minor_nr_t cfs_rdev_minor(cfs_rdev_t rdev) +{ + return MINOR(rdev); +} + +#if !(CFS_O_CREAT == O_CREAT && CFS_O_EXCL == O_EXCL && \ + CFS_O_TRUNC == O_TRUNC && CFS_O_APPEND == O_APPEND &&\ + CFS_O_NONBLOCK == O_NONBLOCK && CFS_O_NDELAY == O_NDELAY &&\ + CFS_O_SYNC == O_SYNC && CFS_O_ASYNC == FASYNC &&\ + CFS_O_DIRECT == O_DIRECT && CFS_O_LARGEFILE == O_LARGEFILE &&\ + CFS_O_DIRECTORY == O_DIRECTORY && CFS_O_NOFOLLOW == O_NOFOLLOW) + +int cfs_oflags2univ(int flags) +{ + int f; + + f = flags & O_ACCMODE; + f |= (flags & O_CREAT) ? CFS_O_CREAT: 0; + f |= (flags & O_EXCL) ? CFS_O_EXCL: 0; + f |= (flags & O_NOCTTY) ? CFS_O_NOCTTY: 0; + f |= (flags & O_TRUNC) ? CFS_O_TRUNC: 0; + f |= (flags & O_APPEND) ? CFS_O_APPEND: 0; + f |= (flags & O_NONBLOCK) ? CFS_O_NONBLOCK: 0; + f |= (flags & O_SYNC)? CFS_O_SYNC: 0; + f |= (flags & FASYNC)? CFS_O_ASYNC: 0; + f |= (flags & O_DIRECTORY)? 
CFS_O_DIRECTORY: 0; + f |= (flags & O_DIRECT)? CFS_O_DIRECT: 0; + f |= (flags & O_LARGEFILE)? CFS_O_LARGEFILE: 0; + f |= (flags & O_NOFOLLOW)? CFS_O_NOFOLLOW: 0; + f |= (flags & O_NOATIME)? CFS_O_NOATIME: 0; + return f; +} +#else + +int cfs_oflags2univ(int flags) +{ + return (flags); +} +#endif + +/* + * XXX Liang: we don't need cfs_univ2oflags() now. + */ +int cfs_univ2oflags(int flags) +{ + return (flags); +} + EXPORT_SYMBOL(cfs_filp_open); +EXPORT_SYMBOL(cfs_user_write); +EXPORT_SYMBOL(cfs_oflags2univ); +EXPORT_SYMBOL(cfs_univ2oflags); diff --git a/lnet/libcfs/linux/linux-lock.c b/lnet/libcfs/linux/linux-lock.c index a1d713e..01511d6 100644 --- a/lnet/libcfs/linux/linux-lock.c +++ b/lnet/libcfs/linux/linux-lock.c @@ -1,4 +1,4 @@ -# define DEBUG_SUBSYSTEM S_PORTALS +# define DEBUG_SUBSYSTEM S_LNET #include #include diff --git a/lnet/libcfs/linux/linux-lwt.c b/lnet/libcfs/linux/linux-lwt.c index 32adc80..520c54c 100644 --- a/lnet/libcfs/linux/linux-lwt.c +++ b/lnet/libcfs/linux/linux-lwt.c @@ -1,2 +1,2 @@ -# define DEBUG_SUBSYSTEM S_PORTALS +# define DEBUG_SUBSYSTEM S_LNET diff --git a/lnet/libcfs/linux/linux-mem.c b/lnet/libcfs/linux/linux-mem.c index fb2c6a0..f327814 100644 --- a/lnet/libcfs/linux/linux-mem.c +++ b/lnet/libcfs/linux/linux-mem.c @@ -18,7 +18,7 @@ * along with Lustre; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET #include #include @@ -26,36 +26,40 @@ #include #include -void * -cfs_alloc(size_t nr_bytes, u_int32_t flags) +static unsigned int cfs_alloc_flags_to_gfp(u_int32_t flags) { - void *ptr = NULL; unsigned int mflags = 0; #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - if (flags & CFS_ALLOC_ATOMIC) - mflags |= __GFP_HIGH; + if (flags & CFS_ALLOC_ATOMIC) + mflags |= __GFP_HIGH; else if (flags & CFS_ALLOC_WAIT) mflags |= __GFP_WAIT; - else - mflags |= (__GFP_HIGH | __GFP_WAIT); - - if (flags & CFS_ALLOC_FS) - mflags |= __GFP_FS; - if (flags & CFS_ALLOC_IO) - mflags |= __GFP_IO | __GFP_HIGHIO; + else + mflags |= (__GFP_HIGH | __GFP_WAIT); + if (flags & CFS_ALLOC_IO) + mflags |= __GFP_IO | __GFP_HIGHIO; #else if (flags & CFS_ALLOC_ATOMIC) mflags |= __GFP_HIGH; else mflags |= __GFP_WAIT; - if (flags & CFS_ALLOC_FS) - mflags |= __GFP_FS; + if (flags & CFS_ALLOC_NOWARN) + mflags |= __GFP_NOWARN; if (flags & CFS_ALLOC_IO) mflags |= __GFP_IO; #endif + if (flags & CFS_ALLOC_FS) + mflags |= __GFP_FS; + return mflags; +} + +void * +cfs_alloc(size_t nr_bytes, u_int32_t flags) +{ + void *ptr = NULL; - ptr = kmalloc(nr_bytes, mflags); + ptr = kmalloc(nr_bytes, cfs_alloc_flags_to_gfp(flags)); if (ptr != NULL && (flags & CFS_ALLOC_ZERO)) memset(ptr, 0, nr_bytes); return ptr; @@ -79,83 +83,37 @@ cfs_free_large(void *addr) vfree(addr); } -cfs_page_t * -cfs_alloc_pages(unsigned int flags, unsigned int order) +cfs_page_t *cfs_alloc_page(unsigned int flags) { - unsigned int mflags = 0; - -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - if (flags & CFS_ALLOC_ATOMIC) - mflags |= __GFP_HIGH; - else if (flags & CFS_ALLOC_WAIT) - mflags |= __GFP_WAIT; - else - mflags |= (__GFP_HIGH | __GFP_WAIT); - - if (flags & CFS_ALLOC_FS) - mflags |= __GFP_FS; - if (flags & CFS_ALLOC_IO) - mflags |= __GFP_IO | __GFP_HIGHIO; - if (flags & CFS_ALLOC_HIGH) - mflags |= __GFP_HIGHMEM; -#else - if (flags & CFS_ALLOC_ATOMIC) - mflags |= __GFP_HIGH; - else - mflags |= __GFP_WAIT; - if (flags & CFS_ALLOC_FS) - mflags |= __GFP_FS; - if (flags & CFS_ALLOC_IO) - mflags |= __GFP_IO; - if (flags & 
CFS_ALLOC_HIGH) - mflags |= __GFP_HIGHMEM; -#endif - - return alloc_pages(mflags, order); + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + return alloc_pages(cfs_alloc_flags_to_gfp(flags), 0); } cfs_mem_cache_t * cfs_mem_cache_create (const char *name, size_t size, size_t offset, - unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long), - void (*dtor)(void*, cfs_mem_cache_t *, unsigned long)) + unsigned long flags) { - return kmem_cache_create(name, size, offset, flags, ctor, dtor); + return kmem_cache_create(name, size, offset, flags, NULL, NULL); } int cfs_mem_cache_destroy (cfs_mem_cache_t * cachep) { +#ifdef HAVE_KMEM_CACHE_DESTROY_INT return kmem_cache_destroy(cachep); +#else + kmem_cache_destroy(cachep); + return 0; +#endif } void * cfs_mem_cache_alloc(cfs_mem_cache_t *cachep, int flags) { - unsigned int mflags = 0; -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - if (flags & CFS_SLAB_ATOMIC) - mflags |= __GFP_HIGH; - else if (flags & CFS_ALLOC_WAIT) - mflags |= __GFP_WAIT; - else - mflags |= (__GFP_HIGH | __GFP_WAIT); - - if (flags & CFS_SLAB_FS) - mflags |= __GFP_FS; - if (flags & CFS_SLAB_IO) - mflags |= __GFP_IO | __GFP_HIGHIO; -#else - if (flags & CFS_SLAB_ATOMIC) - mflags |= __GFP_HIGH; - else - mflags |= __GFP_WAIT; - if (flags & CFS_SLAB_FS) - mflags |= __GFP_FS; - if (flags & CFS_SLAB_IO) - mflags |= __GFP_IO; -#endif - - return kmem_cache_alloc(cachep, mflags); + return kmem_cache_alloc(cachep, cfs_alloc_flags_to_gfp(flags)); } void @@ -168,7 +126,7 @@ EXPORT_SYMBOL(cfs_alloc); EXPORT_SYMBOL(cfs_free); EXPORT_SYMBOL(cfs_alloc_large); EXPORT_SYMBOL(cfs_free_large); -EXPORT_SYMBOL(cfs_alloc_pages); +EXPORT_SYMBOL(cfs_alloc_page); EXPORT_SYMBOL(cfs_mem_cache_create); EXPORT_SYMBOL(cfs_mem_cache_destroy); EXPORT_SYMBOL(cfs_mem_cache_alloc); diff --git a/lnet/libcfs/linux/linux-module.c b/lnet/libcfs/linux/linux-module.c index 4b2558b..6f21853 100644 --- a/lnet/libcfs/linux/linux-module.c +++ b/lnet/libcfs/linux/linux-module.c @@ -1,48 +1,25 @@ -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET #include #include -#define PORTAL_MINOR 240 +#define LNET_MINOR 240 - -void -kportal_daemonize (char *str) +int libcfs_ioctl_getdata(char *buf, char *end, void *arg) { -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,63)) - daemonize(str); -#else - daemonize(); - snprintf (current->comm, sizeof (current->comm), "%s", str); -#endif -} - -void -kportal_blockallsigs () -{ - unsigned long flags; - - SIGNAL_MASK_LOCK(current, flags); - sigfillset(¤t->blocked); - RECALC_SIGPENDING; - SIGNAL_MASK_UNLOCK(current, flags); -} - -int portal_ioctl_getdata(char *buf, char *end, void *arg) -{ - struct portal_ioctl_hdr *hdr; - struct portal_ioctl_data *data; + struct libcfs_ioctl_hdr *hdr; + struct libcfs_ioctl_data *data; int err; ENTRY; - hdr = (struct portal_ioctl_hdr *)buf; - data = (struct portal_ioctl_data *)buf; + hdr = (struct libcfs_ioctl_hdr *)buf; + data = (struct libcfs_ioctl_data *)buf; err = copy_from_user(buf, (void *)arg, sizeof(*hdr)); if (err) RETURN(err); - if (hdr->ioc_version != PORTAL_IOCTL_VERSION) { + if (hdr->ioc_version != LIBCFS_IOCTL_VERSION) { CERROR("PORTALS: version mismatch kernel vs application\n"); RETURN(-EINVAL); } @@ -53,7 +30,7 @@ int portal_ioctl_getdata(char *buf, char *end, void *arg) } - if (hdr->ioc_len < sizeof(struct portal_ioctl_data)) { + if (hdr->ioc_len < sizeof(struct libcfs_ioctl_data)) { CERROR("PORTALS: user buffer too small for 
ioctl\n"); RETURN(-EINVAL); } @@ -62,7 +39,7 @@ int portal_ioctl_getdata(char *buf, char *end, void *arg) if (err) RETURN(err); - if (portal_ioctl_is_invalid(data)) { + if (libcfs_ioctl_is_invalid(data)) { CERROR("PORTALS: ioctl not correctly formatted\n"); RETURN(-EINVAL); } @@ -76,18 +53,25 @@ int portal_ioctl_getdata(char *buf, char *end, void *arg) RETURN(0); } - + +int libcfs_ioctl_popdata(void *arg, void *data, int size) +{ + if (copy_to_user((char *)arg, data, size)) + return -EFAULT; + return 0; +} + extern struct cfs_psdev_ops libcfs_psdev_ops; -static int +static int libcfs_psdev_open(struct inode * inode, struct file * file) -{ - struct portals_device_userstate **pdu = NULL; +{ + struct libcfs_device_userstate **pdu = NULL; int rc = 0; - if (!inode) + if (!inode) return (-EINVAL); - pdu = (struct portals_device_userstate **)&file->private_data; + pdu = (struct libcfs_device_userstate **)&file->private_data; if (libcfs_psdev_ops.p_open != NULL) rc = libcfs_psdev_ops.p_open(0, (void *)pdu); else @@ -96,13 +80,13 @@ libcfs_psdev_open(struct inode * inode, struct file * file) } /* called when closing /dev/device */ -static int +static int libcfs_psdev_release(struct inode * inode, struct file * file) { - struct portals_device_userstate *pdu; + struct libcfs_device_userstate *pdu; int rc = 0; - if (!inode) + if (!inode) return (-EINVAL); pdu = file->private_data; if (libcfs_psdev_ops.p_close != NULL) @@ -112,59 +96,56 @@ libcfs_psdev_release(struct inode * inode, struct file * file) return rc; } -static int -libcfs_ioctl(struct inode *inode, struct file *file, +static int +libcfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) -{ +{ struct cfs_psdev_file pfile; int rc = 0; - if (current->fsuid != 0) - return -EACCES; - - if ( _IOC_TYPE(cmd) != IOC_PORTAL_TYPE || - _IOC_NR(cmd) < IOC_PORTAL_MIN_NR || - _IOC_NR(cmd) > IOC_PORTAL_MAX_NR ) { - CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n", - _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); - return (-EINVAL); - } - + if (current->fsuid != 0) + return -EACCES; + + if ( _IOC_TYPE(cmd) != IOC_LIBCFS_TYPE || + _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR || + _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR ) { + CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n", + _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); + return (-EINVAL); + } + /* Handle platform-dependent IOC requests */ - switch (cmd) { - case IOC_PORTAL_PANIC: - if (!capable (CAP_SYS_BOOT)) - return (-EPERM); - panic("debugctl-invoked panic"); + switch (cmd) { + case IOC_LIBCFS_PANIC: + if (!capable (CAP_SYS_BOOT)) + return (-EPERM); + panic("debugctl-invoked panic"); return (0); - case IOC_PORTAL_MEMHOG: - if (!capable (CAP_SYS_ADMIN)) + case IOC_LIBCFS_MEMHOG: + if (!capable (CAP_SYS_ADMIN)) return -EPERM; /* go thought */ } pfile.off = 0; pfile.private_data = file->private_data; - if (libcfs_psdev_ops.p_ioctl != NULL) - rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg); + if (libcfs_psdev_ops.p_ioctl != NULL) + rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg); else rc = -EPERM; return (rc); } -static struct file_operations libcfs_fops = { - ioctl: libcfs_ioctl, - open: libcfs_psdev_open, +static struct file_operations libcfs_fops = { + ioctl: libcfs_ioctl, + open: libcfs_psdev_open, release: libcfs_psdev_release }; -cfs_psdev_t libcfs_dev = { - PORTAL_MINOR, - "portals", +cfs_psdev_t libcfs_dev = { + LNET_MINOR, + "lnet", &libcfs_fops }; -EXPORT_SYMBOL(kportal_blockallsigs); -EXPORT_SYMBOL(kportal_daemonize); - diff --git 
a/lnet/libcfs/linux/linux-prim.c b/lnet/libcfs/linux/linux-prim.c index 95365ee..fe5d61f 100644 --- a/lnet/libcfs/linux/linux-prim.c +++ b/lnet/libcfs/linux/linux-prim.c @@ -1,19 +1,154 @@ -#define DEBUG_SUBSYSTEM S_PORTALS +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002, 2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#ifdef HAVE_KERNEL_CONFIG_H #include +#endif #include #include +#include + +#if defined(CONFIG_KGDB) +#include +#endif + +void cfs_enter_debugger(void) +{ +#if defined(CONFIG_KGDB) + BREAKPOINT(); +#elif defined(__arch_um__) + asm("int $3"); +#else + /* nothing */ +#endif +} + +void cfs_daemonize(char *str) { + unsigned long flags; + + lock_kernel(); +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,63)) + daemonize(str); +#else + daemonize(); + exit_files(current); + reparent_to_init(); + snprintf (current->comm, sizeof (current->comm), "%s", str); +#endif + SIGNAL_MASK_LOCK(current, flags); + sigfillset(¤t->blocked); + RECALC_SIGPENDING; + SIGNAL_MASK_UNLOCK(current, flags); + unlock_kernel(); +} + +int cfs_daemonize_ctxt(char *str) { + struct task_struct *tsk = current; + struct fs_struct *fs = NULL; + + cfs_daemonize(str); + fs = copy_fs_struct(tsk->fs); + if (fs == NULL) + return -ENOMEM; + exit_fs(tsk); + tsk->fs = fs; + return 0; +} + + +sigset_t +cfs_get_blockedsigs(void) +{ + unsigned long flags; + sigset_t old; + + SIGNAL_MASK_LOCK(current, flags); + old = current->blocked; + SIGNAL_MASK_UNLOCK(current, flags); + return old; +} + +sigset_t +cfs_block_allsigs(void) +{ + unsigned long flags; + sigset_t old; + + SIGNAL_MASK_LOCK(current, flags); + old = current->blocked; + sigfillset(¤t->blocked); + RECALC_SIGPENDING; + SIGNAL_MASK_UNLOCK(current, flags); + + return old; +} + +sigset_t +cfs_block_sigs(sigset_t bits) +{ + unsigned long flags; + sigset_t old; + + SIGNAL_MASK_LOCK(current, flags); + old = current->blocked; + current->blocked = bits; + RECALC_SIGPENDING; + SIGNAL_MASK_UNLOCK(current, flags); + return old; +} + +void +cfs_restore_sigs (cfs_sigset_t old) +{ + unsigned long flags; + + SIGNAL_MASK_LOCK(current, flags); + current->blocked = old; + RECALC_SIGPENDING; + SIGNAL_MASK_UNLOCK(current, flags); +} + +int +cfs_signal_pending(void) +{ + return signal_pending(current); +} + +void +cfs_clear_sigpending(void) +{ + unsigned long flags; + + SIGNAL_MASK_LOCK(current, flags); + CLEAR_SIGPENDING; + SIGNAL_MASK_UNLOCK(current, flags); +} int libcfs_arch_init(void) -{ - return 0; +{ + return 0; } void libcfs_arch_cleanup(void) { - return; + return; } EXPORT_SYMBOL(libcfs_arch_init); EXPORT_SYMBOL(libcfs_arch_cleanup); +EXPORT_SYMBOL(cfs_daemonize); +EXPORT_SYMBOL(cfs_daemonize_ctxt); +EXPORT_SYMBOL(cfs_block_allsigs); +EXPORT_SYMBOL(cfs_block_sigs); +EXPORT_SYMBOL(cfs_get_blockedsigs); +EXPORT_SYMBOL(cfs_restore_sigs); +EXPORT_SYMBOL(cfs_signal_pending); +EXPORT_SYMBOL(cfs_clear_sigpending); diff --git a/lnet/libcfs/linux/linux-proc.c b/lnet/libcfs/linux/linux-proc.c index 77277ba..3efdd46 100644 --- a/lnet/libcfs/linux/linux-proc.c +++ b/lnet/libcfs/linux/linux-proc.c @@ -26,7 +26,9 @@ # define EXPORT_SYMTAB #endif +#ifdef HAVE_KERNEL_CONFIG_H #include +#endif #include #include #include @@ -51,231 +53,138 @@ #include #include -# define DEBUG_SUBSYSTEM S_PORTALS +# define DEBUG_SUBSYSTEM S_LNET #include #include #include "tracefile.h" -static struct ctl_table_header *portals_table_header = NULL; 
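/*
 * Annotation (not part of the original patch): the signal helpers added
 * to linux-prim.c above are meant to bracket non-interruptible regions
 * in service threads -- save the caller's mask, block everything, do
 * the work, restore. A minimal sketch using only the functions the
 * patch exports (illustrative only, names hypothetical):
 */
#if 0
static int example_service_thread(void *arg)
{
        sigset_t old;

        cfs_daemonize("example_svc");   /* detach; also blocks all signals */
        old = cfs_block_allsigs();      /* explicit save so it can be restored */

        /* ... work that must not be interrupted ... */

        cfs_restore_sigs(old);          /* accept signals again */
        if (cfs_signal_pending())
                cfs_clear_sigpending(); /* swallow anything queued meanwhile */
        return 0;
}
#endif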
-extern char debug_file_path[1024]; -extern char portals_upcall[1024]; +static struct ctl_table_header *lnet_table_header = NULL; +extern char lnet_upcall[1024]; -#define PSDEV_PORTALS (0x100) +#define PSDEV_LNET (0x100) enum { PSDEV_DEBUG = 1, /* control debugging */ PSDEV_SUBSYSTEM_DEBUG, /* control debugging */ - PSDEV_PRINTK, /* force all errors to console */ - PSDEV_CONSOLE, /* allow _any_ messages to console */ + PSDEV_PRINTK, /* force all messages to console */ + PSDEV_CONSOLE_RATELIMIT, /* ratelimit console messages */ PSDEV_DEBUG_PATH, /* crashdump log location */ PSDEV_DEBUG_DUMP_PATH, /* crashdump tracelog location */ - PSDEV_PORTALS_UPCALL, /* User mode upcall script */ - PSDEV_PORTALS_MEMUSED, /* bytes currently PORTAL_ALLOCated */ - PSDEV_PORTALS_CATASTROPHE,/* if we have LBUGged or panic'd */ + PSDEV_LNET_UPCALL, /* User mode upcall script */ + PSDEV_LNET_MEMUSED, /* bytes currently PORTAL_ALLOCated */ + PSDEV_LNET_CATASTROPHE, /* if we have LBUGged or panic'd */ }; -static struct ctl_table portals_table[] = { - {PSDEV_DEBUG, "debug", &portal_debug, sizeof(int), 0644, NULL, - &proc_dointvec}, - {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &portal_subsystem_debug, +int LL_PROC_PROTO(proc_dobitmasks); + +static struct ctl_table lnet_table[] = { + {PSDEV_DEBUG, "debug", &libcfs_debug, sizeof(int), 0644, NULL, + &proc_dobitmasks}, + {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &libcfs_subsystem_debug, + sizeof(int), 0644, NULL, &proc_dobitmasks}, + {PSDEV_PRINTK, "printk", &libcfs_printk, sizeof(int), 0644, NULL, + &proc_dobitmasks}, + {PSDEV_CONSOLE_RATELIMIT, "console_ratelimit",&libcfs_console_ratelimit, sizeof(int), 0644, NULL, &proc_dointvec}, - {PSDEV_PRINTK, "printk", &portal_printk, sizeof(int), 0644, NULL, - &proc_dointvec}, {PSDEV_DEBUG_PATH, "debug_path", debug_file_path, sizeof(debug_file_path), 0644, NULL, &proc_dostring, &sysctl_string}, - {PSDEV_PORTALS_UPCALL, "upcall", portals_upcall, - sizeof(portals_upcall), 0644, NULL, &proc_dostring, + {PSDEV_LNET_UPCALL, "upcall", lnet_upcall, + sizeof(lnet_upcall), 0644, NULL, &proc_dostring, &sysctl_string}, - {PSDEV_PORTALS_MEMUSED, "memused", (int *)&portal_kmemory.counter, + {PSDEV_LNET_MEMUSED, "memused", (int *)&libcfs_kmemory.counter, sizeof(int), 0444, NULL, &proc_dointvec}, - {PSDEV_PORTALS_CATASTROPHE, "catastrophe", &portals_catastrophe, + {PSDEV_LNET_CATASTROPHE, "catastrophe", &libcfs_catastrophe, sizeof(int), 0444, NULL, &proc_dointvec}, {0} }; static struct ctl_table top_table[2] = { - {PSDEV_PORTALS, "portals", NULL, 0, 0555, portals_table}, + {PSDEV_LNET, "lnet", NULL, 0, 0555, lnet_table}, {0} }; +int LL_PROC_PROTO(proc_dobitmasks) +{ + const int tmpstrlen = 512; + char *str; + int rc = 0; + /* the proc filling api stumps me always, coax proc_dointvec + * and proc_dostring into doing the drudgery by cheating + * with a dummy ctl_table + */ + struct ctl_table dummy = *table; + unsigned int *mask = (unsigned int *)table->data; + int is_subsys = (mask == &libcfs_subsystem_debug) ? 1 : 0; + + str = kmalloc(tmpstrlen, GFP_USER); + if (str == NULL) + return -ENOMEM; + + if (write) { + size_t oldlen = *lenp; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,8) + loff_t oldpos = *ppos; +#endif -#ifdef PORTALS_PROFILING -/* - * profiling stuff. we do this statically for now 'cause its simple, - * but we could do some tricks with elf sections to have this array - * automatically built. 
- */ -#define def_prof(FOO) [PROF__##FOO] = {#FOO, 0, } - -struct prof_ent prof_ents[] = { - def_prof(our_recvmsg), - def_prof(our_sendmsg), - def_prof(socknal_recv), - def_prof(lib_parse), - def_prof(conn_list_walk), - def_prof(memcpy), - def_prof(lib_finalize), - def_prof(pingcli_time), - def_prof(gmnal_send), - def_prof(gmnal_recv), -}; + dummy.proc_handler = &proc_dointvec; -EXPORT_SYMBOL(prof_ents); + /* old proc interface allows user to specify just an int + * value; be compatible and don't break userland. + */ + rc = ll_proc_dointvec(&dummy, write, filp, buffer, lenp, ppos); -/* - * this function is as crazy as the proc filling api - * requires. - * - * buffer: page allocated for us to scribble in. the - * data returned to the user will be taken from here. - * *start: address of the pointer that will tell the - * caller where in buffer the data the user wants is. - * ppos: offset in the entire /proc file that the user - * currently wants. - * wanted: the amount of data the user wants. - * - * while going, 'curpos' is the offset in the entire - * file where we currently are. We only actually - * start filling buffer when we get to a place in - * the file that the user cares about. - * - * we take care to only sprintf when the user cares because - * we're holding a lock while we do this. - * - * we're smart and know that we generate fixed size lines. - * we only start writing to the buffer when the user cares. - * This is unpredictable because we don't snapshot the - * list between calls that are filling in a file from - * the list. The list could change mid read and the - * output will look very weird indeed. oh well. - */ + if (rc != -EINVAL) + goto out; -static int prof_read_proc(char *buffer, char **start, off_t ppos, int wanted, - int *eof, void *data) -{ - int len = 0, i; - int curpos; - char *header = "Interval Cycles_per (Starts Finishes Total)\n"; - int header_len = strlen(header); - char *format = "%-15s %.12Ld (%.12d %.12d %.12Ld)"; - int line_len = (15 + 1 + 12 + 2 + 12 + 1 + 12 + 1 + 12 + 1); - - *start = buffer; - - if (ppos < header_len) { - int diff = MIN(header_len, wanted); - memcpy(buffer, header + ppos, diff); - len += diff; - ppos += diff; - } + /* using new interface */ + dummy.data = str; + dummy.maxlen = tmpstrlen; + dummy.proc_handler = &proc_dostring; - if (len >= wanted) - goto out; + /* proc_dointvec might have changed these */ + *lenp = oldlen; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,8) + *ppos = oldpos; +#endif - curpos = header_len; + rc = ll_proc_dostring(&dummy, write, filp, buffer, lenp, ppos); - for ( i = 0; i < MAX_PROFS ; i++) { - int copied; - struct prof_ent *pe = &prof_ents[i]; - long long cycles_per; - /* - * find the part of the array that the buffer wants - */ - if (ppos >= (curpos + line_len)) { - curpos += line_len; - continue; - } - /* the clever caller split a line */ - if (ppos > curpos) { - *start = buffer + (ppos - curpos); - } - - if (pe->finishes == 0) - cycles_per = 0; - else - { - cycles_per = pe->total_cycles; - do_div (cycles_per, pe->finishes); - } - - copied = sprintf(buffer + len, format, pe->str, cycles_per, - pe->starts, pe->finishes, pe->total_cycles); - - len += copied; - - /* pad to line len, -1 for \n */ - if ((copied < line_len-1)) { - int diff = (line_len-1) - copied; - memset(buffer + len, ' ', diff); - len += diff; - copied += diff; - } - - buffer[len++]= '\n'; - - /* bail if we have enough */ - if (((buffer + len) - *start) >= wanted) - break; - - curpos += line_len; - } + if (rc != 0) + goto out; - /* 
lameness */ - if (i == MAX_PROFS) - *eof = 1; - out: + rc = libcfs_debug_str2mask(mask, dummy.data, is_subsys); + } else { + dummy.data = str; + dummy.maxlen = tmpstrlen; + dummy.proc_handler = &proc_dostring; - return MIN(((buffer + len) - *start), wanted); -} + libcfs_debug_mask2str(dummy.data, dummy.maxlen,*mask,is_subsys); -/* - * all kids love /proc :/ - */ -static unsigned char basedir[]="net/portals"; -#endif /* PORTALS_PROFILING */ + rc = ll_proc_dostring(&dummy, write, filp, buffer, lenp, ppos); + } + +out: + kfree(str); + return rc; +} int insert_proc(void) { struct proc_dir_entry *ent; -#if PORTALS_PROFILING - unsigned char dir[128]; - - if (ARRAY_SIZE(prof_ents) != MAX_PROFS) { - CERROR("profiling enum and array are out of sync.\n"); - return -1; - } - - /* - * This is pretty lame. assuming that failure just - * means that they already existed. - */ - strcat(dir, basedir); - create_proc_entry(dir, S_IFDIR, 0); - - strcat(dir, "/cycles"); - ent = create_proc_entry(dir, 0, 0); - if (!ent) { - CERROR("couldn't register %s?\n", dir); - return -1; - } - - ent->data = NULL; - ent->read_proc = prof_read_proc; -#endif /* PORTALS_PROFILING */ #ifdef CONFIG_SYSCTL - if (!portals_table_header) - portals_table_header = register_sysctl_table(top_table, 0); + if (!lnet_table_header) + lnet_table_header = register_sysctl_table(top_table, 0); #endif - ent = create_proc_entry("sys/portals/dump_kernel", 0, NULL); + ent = create_proc_entry("sys/lnet/dump_kernel", 0, NULL); if (ent == NULL) { CERROR("couldn't register dump_kernel\n"); return -1; } ent->write_proc = trace_dk; - ent = create_proc_entry("sys/portals/daemon_file", 0, NULL); + ent = create_proc_entry("sys/lnet/daemon_file", 0, NULL); if (ent == NULL) { CERROR("couldn't register daemon_file\n"); return -1; @@ -283,7 +192,7 @@ int insert_proc(void) ent->write_proc = trace_write_daemon_file; ent->read_proc = trace_read_daemon_file; - ent = create_proc_entry("sys/portals/debug_mb", 0, NULL); + ent = create_proc_entry("sys/lnet/debug_mb", 0, NULL); if (ent == NULL) { CERROR("couldn't register debug_mb\n"); return -1; @@ -296,29 +205,13 @@ int insert_proc(void) void remove_proc(void) { -#if PORTALS_PROFILING - unsigned char dir[128]; - int end; - - dir[0]='\0'; - strcat(dir, basedir); - - end = strlen(dir); - - strcat(dir, "/cycles"); - remove_proc_entry(dir, 0); - - dir[end] = '\0'; - remove_proc_entry(dir, 0); -#endif /* PORTALS_PROFILING */ - - remove_proc_entry("sys/portals/dump_kernel", NULL); - remove_proc_entry("sys/portals/daemon_file", NULL); - remove_proc_entry("sys/portals/debug_mb", NULL); + remove_proc_entry("sys/lnet/dump_kernel", NULL); + remove_proc_entry("sys/lnet/daemon_file", NULL); + remove_proc_entry("sys/lnet/debug_mb", NULL); #ifdef CONFIG_SYSCTL - if (portals_table_header) - unregister_sysctl_table(portals_table_header); - portals_table_header = NULL; + if (lnet_table_header) + unregister_sysctl_table(lnet_table_header); + lnet_table_header = NULL; #endif } diff --git a/lnet/libcfs/linux/linux-sync.c b/lnet/libcfs/linux/linux-sync.c index 32adc80..520c54c 100644 --- a/lnet/libcfs/linux/linux-sync.c +++ b/lnet/libcfs/linux/linux-sync.c @@ -1,2 +1,2 @@ -# define DEBUG_SUBSYSTEM S_PORTALS +# define DEBUG_SUBSYSTEM S_LNET diff --git a/lnet/libcfs/linux/linux-tcpip.c b/lnet/libcfs/linux/linux-tcpip.c new file mode 100644 index 0000000..9cb85ef --- /dev/null +++ b/lnet/libcfs/linux/linux-tcpip.c @@ -0,0 +1,687 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * 
Copyright (C) 2005 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + +#include +#include +#include +/* For sys_open & sys_close */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) +#include +#else +#include +#endif + +int +libcfs_sock_ioctl(int cmd, unsigned long arg) +{ + mm_segment_t oldmm = get_fs(); + struct socket *sock; + int fd; + int rc; + struct file *sock_filp; + + rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); + if (rc != 0) { + CERROR ("Can't create socket: %d\n", rc); + return rc; + } + + fd = sock_map_fd(sock); + if (fd < 0) { + rc = fd; + sock_release(sock); + goto out; + } + + sock_filp = fget(fd); + if (!sock_filp) { + rc = -ENOMEM; + goto out_fd; + } + + set_fs(KERNEL_DS); +#ifdef HAVE_UNLOCKED_IOCTL + if (sock_filp->f_op->unlocked_ioctl) + rc = sock_filp->f_op->unlocked_ioctl(sock_filp, cmd, arg); + else +#endif + { + lock_kernel(); + rc =sock_filp->f_op->ioctl(sock_filp->f_dentry->d_inode, + sock_filp, cmd, arg); + unlock_kernel(); + } + set_fs(oldmm); + + fput(sock_filp); + + out_fd: + sys_close(fd); + out: + return rc; +} + +int +libcfs_ipif_query (char *name, int *up, __u32 *ip, __u32 *mask) +{ + struct ifreq ifr; + int nob; + int rc; + __u32 val; + + nob = strnlen(name, IFNAMSIZ); + if (nob == IFNAMSIZ) { + CERROR("Interface name %s too long\n", name); + rc = -EINVAL; + goto out; + } + + CLASSERT (sizeof(ifr.ifr_name) >= IFNAMSIZ); + + strcpy(ifr.ifr_name, name); + rc = libcfs_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr); + + if (rc != 0) { + CERROR("Can't get flags for interface %s\n", name); + goto out; + } + + if ((ifr.ifr_flags & IFF_UP) == 0) { + CDEBUG(D_NET, "Interface %s down\n", name); + *up = 0; + *ip = *mask = 0; + goto out; + } + + *up = 1; + + strcpy(ifr.ifr_name, name); + ifr.ifr_addr.sa_family = AF_INET; + rc = libcfs_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr); + + if (rc != 0) { + CERROR("Can't get IP address for interface %s\n", name); + goto out; + } + + val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr; + *ip = ntohl(val); + + strcpy(ifr.ifr_name, name); + ifr.ifr_addr.sa_family = AF_INET; + rc = libcfs_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr); + + if (rc != 0) { + CERROR("Can't get netmask for interface %s\n", name); + goto out; + } + + val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr; + *mask = ntohl(val); + + out: + return rc; +} + +EXPORT_SYMBOL(libcfs_ipif_query); + +int +libcfs_ipif_enumerate (char ***namesp) +{ + /* Allocate and fill in 'names', returning # interfaces/error */ + char **names; + int toobig; + int nalloc; + int nfound; + struct ifreq *ifr; + struct ifconf ifc; + int rc; + int nob; + int i; + + + nalloc = 16; /* first guess at max interfaces */ + toobig = 0; + for (;;) { + if (nalloc * sizeof(*ifr) > CFS_PAGE_SIZE) { + toobig = 1; + nalloc = CFS_PAGE_SIZE/sizeof(*ifr); + 
CWARN("Too many interfaces: only enumerating first %d\n", + nalloc); + } + + LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr)); + if (ifr == NULL) { + CERROR ("ENOMEM enumerating up to %d interfaces\n", nalloc); + rc = -ENOMEM; + goto out0; + } + + ifc.ifc_buf = (char *)ifr; + ifc.ifc_len = nalloc * sizeof(*ifr); + + rc = libcfs_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc); + + if (rc < 0) { + CERROR ("Error %d enumerating interfaces\n", rc); + goto out1; + } + + LASSERT (rc == 0); + + nfound = ifc.ifc_len/sizeof(*ifr); + LASSERT (nfound <= nalloc); + + if (nfound < nalloc || toobig) + break; + + LIBCFS_FREE(ifr, nalloc * sizeof(*ifr)); + nalloc *= 2; + } + + if (nfound == 0) + goto out1; + + LIBCFS_ALLOC(names, nfound * sizeof(*names)); + if (names == NULL) { + rc = -ENOMEM; + goto out1; + } + /* NULL out all names[i] */ + memset (names, 0, nfound * sizeof(*names)); + + for (i = 0; i < nfound; i++) { + + nob = strnlen (ifr[i].ifr_name, IFNAMSIZ); + if (nob == IFNAMSIZ) { + /* no space for terminating NULL */ + CERROR("interface name %.*s too long (%d max)\n", + nob, ifr[i].ifr_name, IFNAMSIZ); + rc = -ENAMETOOLONG; + goto out2; + } + + LIBCFS_ALLOC(names[i], IFNAMSIZ); + if (names[i] == NULL) { + rc = -ENOMEM; + goto out2; + } + + memcpy(names[i], ifr[i].ifr_name, nob); + names[i][nob] = 0; + } + + *namesp = names; + rc = nfound; + + out2: + if (rc < 0) + libcfs_ipif_free_enumeration(names, nfound); + out1: + LIBCFS_FREE(ifr, nalloc * sizeof(*ifr)); + out0: + return rc; +} + +EXPORT_SYMBOL(libcfs_ipif_enumerate); + +void +libcfs_ipif_free_enumeration (char **names, int n) +{ + int i; + + LASSERT (n > 0); + + for (i = 0; i < n && names[i] != NULL; i++) + LIBCFS_FREE(names[i], IFNAMSIZ); + + LIBCFS_FREE(names, n * sizeof(*names)); +} + +EXPORT_SYMBOL(libcfs_ipif_free_enumeration); + +int +libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + mm_segment_t oldmm = get_fs(); + long ticks = timeout * HZ; + unsigned long then; + struct timeval tv; + + LASSERT (nob > 0); + /* Caller may pass a zero timeout if she thinks the socket buffer is + * empty enough to take the whole message immediately */ + + for (;;) { + struct iovec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = (timeout == 0) ? 
MSG_DONTWAIT : 0 + }; + + if (timeout != 0) { + /* Set send timeout to remaining time */ + tv = (struct timeval) { + .tv_sec = ticks / HZ, + .tv_usec = ((ticks % HZ) * 1000000) / HZ + }; + set_fs(KERNEL_DS); + rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, + (char *)&tv, sizeof(tv)); + set_fs(oldmm); + if (rc != 0) { + CERROR("Can't set socket send timeout " + "%ld.%06d: %d\n", + (long)tv.tv_sec, (int)tv.tv_usec, rc); + return rc; + } + } + + set_fs (KERNEL_DS); + then = jiffies; + rc = sock_sendmsg (sock, &msg, iov.iov_len); + ticks -= jiffies - then; + set_fs (oldmm); + + if (rc == nob) + return 0; + + if (rc < 0) + return rc; + + if (rc == 0) { + CERROR ("Unexpected zero rc\n"); + return (-ECONNABORTED); + } + + if (ticks <= 0) + return -EAGAIN; + + buffer = ((char *)buffer) + rc; + nob -= rc; + } + + return (0); +} +EXPORT_SYMBOL(libcfs_sock_write); + +int +libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + mm_segment_t oldmm = get_fs(); + long ticks = timeout * HZ; + unsigned long then; + struct timeval tv; + + LASSERT (nob > 0); + LASSERT (ticks > 0); + + for (;;) { + struct iovec iov = { + .iov_base = buffer, + .iov_len = nob + }; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + + /* Set receive timeout to remaining time */ + tv = (struct timeval) { + .tv_sec = ticks / HZ, + .tv_usec = ((ticks % HZ) * 1000000) / HZ + }; + set_fs(KERNEL_DS); + rc = sock_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, + (char *)&tv, sizeof(tv)); + set_fs(oldmm); + if (rc != 0) { + CERROR("Can't set socket recv timeout %ld.%06d: %d\n", + (long)tv.tv_sec, (int)tv.tv_usec, rc); + return rc; + } + + set_fs(KERNEL_DS); + then = jiffies; + rc = sock_recvmsg(sock, &msg, iov.iov_len, 0); + ticks -= jiffies - then; + set_fs(oldmm); + + if (rc < 0) + return rc; + + if (rc == 0) + return -ECONNRESET; + + buffer = ((char *)buffer) + rc; + nob -= rc; + + if (nob == 0) + return 0; + + if (ticks <= 0) + return -ETIMEDOUT; + } +} + +EXPORT_SYMBOL(libcfs_sock_read); + +static int +libcfs_sock_create (struct socket **sockp, int *fatal, + __u32 local_ip, int local_port) +{ + struct sockaddr_in locaddr; + struct socket *sock; + int rc; + int option; + mm_segment_t oldmm = get_fs(); + + /* All errors are fatal except bind failure if the port is in use */ + *fatal = 1; + + rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); + *sockp = sock; + if (rc != 0) { + CERROR ("Can't create socket: %d\n", rc); + return (rc); + } + + set_fs (KERNEL_DS); + option = 1; + rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc); + goto failed; + } + + if (local_ip != 0 || local_port != 0) { + memset(&locaddr, 0, sizeof(locaddr)); + locaddr.sin_family = AF_INET; + locaddr.sin_port = htons(local_port); + locaddr.sin_addr.s_addr = (local_ip == 0) ? 
+ INADDR_ANY : htonl(local_ip); + + rc = sock->ops->bind(sock, (struct sockaddr *)&locaddr, + sizeof(locaddr)); + if (rc == -EADDRINUSE) { + CDEBUG(D_NET, "Port %d already in use\n", local_port); + *fatal = 0; + goto failed; + } + if (rc != 0) { + CERROR("Error trying to bind to port %d: %d\n", + local_port, rc); + goto failed; + } + } + + return 0; + + failed: + sock_release(sock); + return rc; +} + +int +libcfs_sock_setbuf (struct socket *sock, int txbufsize, int rxbufsize) +{ + mm_segment_t oldmm = get_fs(); + int option; + int rc; + + if (txbufsize != 0) { + option = txbufsize; + set_fs (KERNEL_DS); + rc = sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set send buffer %d: %d\n", + option, rc); + return (rc); + } + } + + if (rxbufsize != 0) { + option = rxbufsize; + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set receive buffer %d: %d\n", + option, rc); + return (rc); + } + } + + return 0; +} + +EXPORT_SYMBOL(libcfs_sock_setbuf); + +int +libcfs_sock_getaddr (struct socket *sock, int remote, __u32 *ip, int *port) +{ + struct sockaddr_in sin; + int len = sizeof (sin); + int rc; + + rc = sock->ops->getname (sock, (struct sockaddr *)&sin, &len, + remote ? 2 : 0); + if (rc != 0) { + CERROR ("Error %d getting sock %s IP/port\n", + rc, remote ? "peer" : "local"); + return rc; + } + + if (ip != NULL) + *ip = ntohl (sin.sin_addr.s_addr); + + if (port != NULL) + *port = ntohs (sin.sin_port); + + return 0; +} + +EXPORT_SYMBOL(libcfs_sock_getaddr); + +int +libcfs_sock_getbuf (struct socket *sock, int *txbufsize, int *rxbufsize) +{ + + if (txbufsize != NULL) { + *txbufsize = sock->sk->sk_sndbuf; + } + + if (rxbufsize != NULL) { + *rxbufsize = sock->sk->sk_rcvbuf; + } + + return 0; +} + +EXPORT_SYMBOL(libcfs_sock_getbuf); + +int +libcfs_sock_listen (struct socket **sockp, + __u32 local_ip, int local_port, int backlog) +{ + int fatal; + int rc; + + rc = libcfs_sock_create(sockp, &fatal, local_ip, local_port); + if (rc != 0) { + if (!fatal) + CERROR("Can't create socket: port %d already in use\n", + local_port); + return rc; + } + + rc = (*sockp)->ops->listen(*sockp, backlog); + if (rc == 0) + return 0; + + CERROR("Can't set listen backlog %d: %d\n", backlog, rc); + sock_release(*sockp); + return rc; +} + +EXPORT_SYMBOL(libcfs_sock_listen); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,12) +int sock_create_lite(int family, int type, int protocol, struct socket **res) +{ + int err = 0; + struct socket *sock; + + sock = sock_alloc(); + if (!sock) { + err = -ENOMEM; + goto out; + } + sock->type = type; +out: + *res = sock; + return err; +} +#endif + +int +libcfs_sock_accept (struct socket **newsockp, struct socket *sock) +{ + wait_queue_t wait; + struct socket *newsock; + int rc; + + init_waitqueue_entry(&wait, current); + + /* XXX this should add a ref to sock->ops->owner, if + * TCP could be a module */ + rc = sock_create_lite(PF_PACKET, sock->type, IPPROTO_TCP, &newsock); + if (rc) { + CERROR("Can't allocate socket\n"); + return rc; + } + + newsock->ops = sock->ops; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(sock->sk->sk_sleep, &wait); + + rc = sock->ops->accept(sock, newsock, O_NONBLOCK); + if (rc == -EAGAIN) { + /* Nothing ready, so wait for activity */ + schedule(); + rc = sock->ops->accept(sock, newsock, O_NONBLOCK); + } + + remove_wait_queue(sock->sk->sk_sleep, &wait); + 
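/*
 * Annotation (not part of the original patch): the wait above follows
 * the classic prepare-to-wait idiom -- queue on sk_sleep, attempt a
 * non-blocking accept(), and only schedule() on -EAGAIN, so a
 * connection arriving between the two accept() calls is not lost.
 * libcfs_sock_abort_accept() below relies on this: waking sk_sleep is
 * enough to break the wait. A sketch of a caller (illustrative only,
 * names hypothetical):
 */
#if 0
static int example_acceptor(__u32 local_ip, int local_port)
{
        struct socket *listener;
        struct socket *conn;
        int rc;

        rc = libcfs_sock_listen(&listener, local_ip, local_port, 127);
        if (rc != 0)
                return rc;

        rc = libcfs_sock_accept(&conn, listener);   /* blocks as above */
        if (rc == 0)
                libcfs_sock_release(conn);

        libcfs_sock_release(listener);
        return rc;
}
#endif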
set_current_state(TASK_RUNNING); + + if (rc != 0) + goto failed; + + *newsockp = newsock; + return 0; + + failed: + sock_release(newsock); + return rc; +} + +EXPORT_SYMBOL(libcfs_sock_accept); + +void +libcfs_sock_abort_accept (struct socket *sock) +{ + wake_up_all(sock->sk->sk_sleep); +} + +EXPORT_SYMBOL(libcfs_sock_abort_accept); + +int +libcfs_sock_connect (struct socket **sockp, int *fatal, + __u32 local_ip, int local_port, + __u32 peer_ip, int peer_port) +{ + struct sockaddr_in srvaddr; + int rc; + + rc = libcfs_sock_create(sockp, fatal, local_ip, local_port); + if (rc != 0) + return rc; + + memset (&srvaddr, 0, sizeof (srvaddr)); + srvaddr.sin_family = AF_INET; + srvaddr.sin_port = htons(peer_port); + srvaddr.sin_addr.s_addr = htonl(peer_ip); + + rc = (*sockp)->ops->connect(*sockp, + (struct sockaddr *)&srvaddr, sizeof(srvaddr), + 0); + if (rc == 0) + return 0; + + /* EADDRNOTAVAIL probably means we're already connected to the same + * peer/port on the same local port on a differently typed + * connection. Let our caller retry with a different local + * port... */ + *fatal = !(rc == -EADDRNOTAVAIL); + + CDEBUG(*fatal ? D_NETERROR : D_NET, + "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc, + HIPQUAD(local_ip), local_port, HIPQUAD(peer_ip), peer_port); + + sock_release(*sockp); + return rc; +} + +EXPORT_SYMBOL(libcfs_sock_connect); + +void +libcfs_sock_release (struct socket *sock) +{ + sock_release(sock); +} + +EXPORT_SYMBOL(libcfs_sock_release); diff --git a/lnet/libcfs/linux/linux-tracefile.c b/lnet/libcfs/linux/linux-tracefile.c index daba696..1fb38cf 100644 --- a/lnet/libcfs/linux/linux-tracefile.c +++ b/lnet/libcfs/linux/linux-tracefile.c @@ -1,4 +1,4 @@ -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET #define LUSTRE_TRACEFILE_PRIVATE #include @@ -13,198 +13,315 @@ extern union trace_data_union trace_data[NR_CPUS]; extern char *tracefile; extern long long tracefile_size; -extern struct rw_semaphore tracefile_sem; -inline struct trace_cpu_data * -__trace_get_tcd(unsigned long *flags) +char *trace_console_buffers[NR_CPUS][3]; + +struct rw_semaphore tracefile_sem; + +int tracefile_init_arch() +{ + int i; + int j; + + init_rwsem(&tracefile_sem); + + for (i = 0; i < NR_CPUS; i++) + for (j = 0; j < 3; j++) { + trace_console_buffers[i][j] = + kmalloc(TRACE_CONSOLE_BUFFER_SIZE, + GFP_KERNEL); + + if (trace_console_buffers[i][j] == NULL) { + tracefile_fini_arch(); + printk(KERN_ERR + "Can't allocate " + "console message buffer\n"); + return -ENOMEM; + } + } + + return 0; +} + +void tracefile_fini_arch() +{ + int i; + int j; + + for (i = 0; i < NR_CPUS; i++) + for (j = 0; j < 3; j++) + if (trace_console_buffers[i][j] != NULL) { + kfree(trace_console_buffers[i][j]); + trace_console_buffers[i][j] = NULL; + } +} + +void tracefile_read_lock() +{ + down_read(&tracefile_sem); +} + +void tracefile_read_unlock() +{ + up_read(&tracefile_sem); +} + +void tracefile_write_lock() +{ + down_write(&tracefile_sem); +} + +void tracefile_write_unlock() +{ + up_write(&tracefile_sem); +} + +char * +trace_get_console_buffer(void) { - struct trace_cpu_data *ret; + int cpu = get_cpu(); + int idx; - int cpu = get_cpu(); - local_irq_save(*flags); - ret = &trace_data[cpu].tcd; + if (in_irq()) { + idx = 0; + } else if (in_softirq()) { + idx = 1; + } else { + idx = 2; + } - return ret; + return trace_console_buffers[cpu][idx]; } -inline void -trace_put_tcd (struct trace_cpu_data *tcd, unsigned long flags) +void +trace_put_console_buffer(char *buffer) { - local_irq_restore(flags); - 
put_cpu(); + put_cpu(); +} + +struct trace_cpu_data * +trace_get_tcd(void) +{ + int cpu; + + if (in_interrupt()) /* no logging in IRQ context */ + return NULL; + + cpu = get_cpu(); + return &trace_data[cpu].tcd; +} + +void +trace_put_tcd (struct trace_cpu_data *tcd) +{ + __LASSERT (!in_interrupt()); + put_cpu(); +} + +int tcd_owns_tage(struct trace_cpu_data *tcd, struct trace_page *tage) +{ + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + return tcd->tcd_cpu == tage->cpu; } void -set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask, +set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask, const int line, unsigned long stack) -{ - struct timeval tv; - - do_gettimeofday(&tv); - - header->ph_subsys = subsys; - header->ph_mask = mask; - header->ph_cpu_id = smp_processor_id(); - header->ph_sec = (__u32)tv.tv_sec; - header->ph_usec = tv.tv_usec; - header->ph_stack = stack; - header->ph_pid = current->pid; - header->ph_line_num = line; -#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)) +{ + struct timeval tv; + + do_gettimeofday(&tv); + + header->ph_subsys = subsys; + header->ph_mask = mask; + header->ph_cpu_id = smp_processor_id(); + header->ph_sec = (__u32)tv.tv_sec; + header->ph_usec = tv.tv_usec; + header->ph_stack = stack; + header->ph_pid = current->pid; + header->ph_line_num = line; +#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)) header->ph_extern_pid = current->thread.extern_pid; -#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) header->ph_extern_pid = current->thread.mode.tt.extern_pid; -#else +#else header->ph_extern_pid = 0; #endif return; } -void print_to_console(struct ptldebug_header *hdr, int mask, char *buf, - int len, char *file, const char *fn) -{ - char *prefix = NULL, *ptype = NULL; - - if ((mask & D_EMERG) != 0) { - prefix = "LustreError"; - ptype = KERN_EMERG; - } else if ((mask & D_ERROR) != 0) { - prefix = "LustreError"; - ptype = KERN_ERR; - } else if ((mask & D_WARNING) != 0) { - prefix = "Lustre"; - ptype = KERN_WARNING; - } else if (portal_printk != 0 || (mask & D_CONSOLE)) { - prefix = "Lustre"; - ptype = KERN_INFO; - } +void print_to_console(struct ptldebug_header *hdr, int mask, const char *buf, + int len, const char *file, const char *fn) +{ + char *prefix = "Lustre", *ptype = NULL; + + if ((mask & D_EMERG) != 0) { + prefix = "LustreError"; + ptype = KERN_EMERG; + } else if ((mask & D_ERROR) != 0) { + prefix = "LustreError"; + ptype = KERN_ERR; + } else if ((mask & D_WARNING) != 0) { + prefix = "Lustre"; + ptype = KERN_WARNING; + } else if ((mask & libcfs_printk) != 0 || (mask & D_CONSOLE)) { + prefix = "Lustre"; + ptype = KERN_INFO; + } if ((mask & D_CONSOLE) != 0) { printk("%s%s: %.*s", ptype, prefix, len, buf); } else { - printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid, + printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid, hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf); } return; } -int trace_write_daemon_file(struct file *file, const char *buffer, +int trace_write_daemon_file(struct file *file, const char *buffer, unsigned long count, void *data) -{ - char *name; - unsigned long off; - int rc; - - name = kmalloc(count + 1, GFP_KERNEL); - if (name == NULL) - return -ENOMEM; - - if (copy_from_user(name, buffer, count)) { - rc = -EFAULT; - goto out; 
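
Taken together, the primitives above define a strict calling convention: trace_get_tcd() returns NULL in interrupt context, where callers must fall back to the per-CPU, per-context console buffers, and both paths pin the CPU via get_cpu(), so nothing between a get and the matching put may sleep. A sketch of that discipline, assuming only the declarations in this patch; the function name and the message handling are illustrative:

/* Sketch (not in this patch): the intended get/put discipline. */
static void sketch_log_line(const char *msg)
{
        struct trace_cpu_data *tcd;
        char *buf;

        tcd = trace_get_tcd();
        if (tcd != NULL) {
                /* process context: append to the per-CPU trace pages */
                /* ... copy msg into the current tage ... */
                trace_put_tcd(tcd);
                return;
        }

        /* IRQ/softirq: format into the context-specific console buffer
         * (trace_get_console_buffer() picks index 0/1/2 by context) */
        buf = trace_get_console_buffer();
        snprintf(buf, TRACE_CONSOLE_BUFFER_SIZE, "%s", msg);
        printk(KERN_INFO "%s", buf);
        trace_put_console_buffer(buf);
}
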
- } - - /* be nice and strip out trailing '\n' */ - for (off = count ; off > 2 && isspace(name[off - 1]); off--) - ; - - name[off] = '\0'; - - down_write(&tracefile_sem); - if (strcmp(name, "stop") == 0) { - tracefile = NULL; - trace_stop_thread(); - goto out_sem; - } else if (strncmp(name, "size=", 5) == 0) { - tracefile_size = simple_strtoul(name + 5, NULL, 0); - if (tracefile_size < 10 || tracefile_size > 20480) - tracefile_size = TRACEFILE_SIZE; - else - tracefile_size <<= 20; - goto out_sem; - } - - if (name[0] != '/') { - rc = -EINVAL; - goto out_sem; - } - - if (tracefile != NULL) - kfree(tracefile); - - tracefile = name; - name = NULL; - printk(KERN_INFO "Lustre: debug daemon will attempt to start writing " - "to %s (%lukB max)\n", tracefile, (long)(tracefile_size >> 10)); - - trace_start_thread(); -out_sem: - up_write(&tracefile_sem); -out: +{ + char *name; + unsigned long off; + int rc; + + name = kmalloc(count + 1, GFP_KERNEL); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user(name, buffer, count)) { + rc = -EFAULT; + goto out; + } + + /* be nice and strip out trailing '\n' */ + for (off = count ; off > 2 && isspace(name[off - 1]); off--) + ; + + name[off] = '\0'; + + tracefile_write_lock(); + if (strcmp(name, "stop") == 0) { + tracefile = NULL; + trace_stop_thread(); + goto out_sem; + } else if (strncmp(name, "size=", 5) == 0) { + tracefile_size = simple_strtoul(name + 5, NULL, 0); + if (tracefile_size < 10 || tracefile_size > 20480) + tracefile_size = TRACEFILE_SIZE; + else + tracefile_size <<= 20; + goto out_sem; + } + + if (name[0] != '/') { + rc = -EINVAL; + goto out_sem; + } + + if (tracefile != NULL) + kfree(tracefile); + + tracefile = name; + name = NULL; + printk(KERN_INFO "Lustre: debug daemon will attempt to start writing " + "to %s (%lukB max)\n", tracefile, (long)(tracefile_size >> 10)); + + trace_start_thread(); +out_sem: + tracefile_write_unlock(); +out: kfree(name); return count; } -int trace_read_daemon_file(char *page, char **start, off_t off, int count, +int trace_read_daemon_file(char *page, char **start, off_t off, int count, int *eof, void *data) -{ - int rc; - - down_read(&tracefile_sem); - rc = snprintf(page, count, "%s", tracefile); - up_read(&tracefile_sem); +{ + int rc; + + tracefile_read_lock(); + rc = snprintf(page, count, "%s", tracefile); + tracefile_read_unlock(); return rc; } -int trace_write_debug_mb(struct file *file, const char *buffer, +int trace_write_debug_mb(struct file *file, const char *buffer, unsigned long count, void *data) -{ - char string[32]; - int i; - unsigned max; - - if (count >= sizeof(string)) { - printk(KERN_ERR "Lustre: value too large (length %lu bytes)\n", - count); - return -EOVERFLOW; - } - - if (copy_from_user(string, buffer, count)) - return -EFAULT; - - max = simple_strtoul(string, NULL, 0); - if (max == 0) +{ + char string[32]; + int i; + unsigned max; + + if (count >= sizeof(string)) { + printk(KERN_ERR "Lustre: value too large (length %lu bytes)\n", + count); + return -EOVERFLOW; + } + + if (copy_from_user(string, buffer, count)) + return -EFAULT; + + max = simple_strtoul(string, NULL, 0); + if (max == 0) return -EINVAL; - if (max > (num_physpages >> (20 - 2 - PAGE_SHIFT)) / 5 || max >= 512) { - printk(KERN_ERR "Lustre: Refusing to set debug buffer size to " - "%dMB, which is more than 80%% of available RAM (%lu)\n", - max, (num_physpages >> (20 - 2 - PAGE_SHIFT)) / 5); - return -EINVAL; - } - - max /= smp_num_cpus; - - for (i = 0; i < NR_CPUS; i++) { - struct trace_cpu_data *tcd; - tcd = 
&trace_data[i].tcd; - tcd->tcd_max_pages = max << (20 - PAGE_SHIFT); - } + if (max > (num_physpages >> (20 - 2 - CFS_PAGE_SHIFT)) / 5 || max >= 512) { + printk(KERN_ERR "Lustre: Refusing to set debug buffer size to " + "%dMB, which is more than 80%% of available RAM (%lu)\n", + max, (num_physpages >> (20 - 2 - CFS_PAGE_SHIFT)) / 5); + return -EINVAL; + } + + max /= smp_num_cpus; + + for (i = 0; i < NR_CPUS; i++) { + struct trace_cpu_data *tcd; + tcd = &trace_data[i].tcd; + tcd->tcd_max_pages = max << (20 - CFS_PAGE_SHIFT); + } return count; } int trace_read_debug_mb(char *page, char **start, off_t off, int count, int *eof, void *data) -{ - struct trace_cpu_data *tcd; - unsigned long flags; +{ + struct trace_cpu_data *tcd; int rc; - - tcd = trace_get_tcd(flags); - rc = snprintf(page, count, "%lu\n", - (tcd->tcd_max_pages >> (20 - PAGE_SHIFT)) * smp_num_cpus); - trace_put_tcd(tcd, flags); + + tcd = trace_get_tcd(); + __LASSERT (tcd != NULL); + + rc = snprintf(page, count, "%lu\n", + (tcd->tcd_max_pages >> (20 - CFS_PAGE_SHIFT)) * smp_num_cpus); + + trace_put_tcd(tcd); return rc; } +void +trace_call_on_all_cpus(void (*fn)(void *arg), void *arg) +{ + cpumask_t cpus_allowed = current->cpus_allowed; + /* use cpus_allowed to quiet 2.4 UP kernel warning only */ + cpumask_t m = cpus_allowed; + int cpu; + + /* Run the given routine on every CPU in thread context */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + if (!cpu_online(cpu)) + continue; + + cpus_clear(m); + cpu_set(cpu, m); + set_cpus_allowed(current, m); + + fn(arg); + + set_cpus_allowed(current, cpus_allowed); + } +} diff --git a/lnet/libcfs/linux/linux-utils.c b/lnet/libcfs/linux/linux-utils.c index 67ecb0c..60f7cb8 100644 --- a/lnet/libcfs/linux/linux-utils.c +++ b/lnet/libcfs/linux/linux-utils.c @@ -24,8 +24,8 @@ /* * miscellaneous libcfs stuff */ -#define DEBUG_SUBSYSTEM S_PORTALS -#include +#define DEBUG_SUBSYSTEM S_LNET +#include /* * Convert server error code to client format. Error codes are from @@ -35,13 +35,26 @@ int convert_server_error(__u64 ecode) { return ecode; } +EXPORT_SYMBOL(convert_server_error); /* * convert flag from client to server. 
*/ -int convert_client_oflag(int cflag) +int convert_client_oflag(int cflag, int *result) { - return cflag; + *result = cflag; + return 0; } +EXPORT_SYMBOL(convert_client_oflag); +void cfs_stack_trace_fill(struct cfs_stack_trace *trace) +{} + +EXPORT_SYMBOL(cfs_stack_trace_fill); + +void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no) +{ + return NULL; +} +EXPORT_SYMBOL(cfs_stack_trace_frame); diff --git a/lnet/libcfs/lwt.c b/lnet/libcfs/lwt.c index b4ae10f5..3ed5d45 100644 --- a/lnet/libcfs/lwt.c +++ b/lnet/libcfs/lwt.c @@ -24,7 +24,9 @@ # define EXPORT_SYMTAB #endif +#ifdef HAVE_KERNEL_CONFIG_H #include +#endif #include #include #include @@ -39,7 +41,7 @@ #include #include -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET #include @@ -118,7 +120,7 @@ lwt_control (int enable, int clear) continue; for (j = 0; j < lwt_pages_per_cpu; j++) { - memset (p->lwtp_events, 0, PAGE_SIZE); + memset (p->lwtp_events, 0, CFS_PAGE_SIZE); p = list_entry (p->lwtp_list.next, lwt_page_t, lwtp_list); @@ -138,7 +140,7 @@ int lwt_snapshot (cycles_t *now, int *ncpu, int *total_size, void *user_ptr, int user_size) { - const int events_per_page = PAGE_SIZE / sizeof(lwt_event_t); + const int events_per_page = CFS_PAGE_SIZE / sizeof(lwt_event_t); const int bytes_per_page = events_per_page * sizeof(lwt_event_t); lwt_page_t *p; int i; @@ -189,7 +191,7 @@ lwt_init () /* NULL pointers, zero scalars */ memset (lwt_cpus, 0, sizeof (lwt_cpus)); - lwt_pages_per_cpu = LWT_MEMORY / (num_online_cpus() * PAGE_SIZE); + lwt_pages_per_cpu = LWT_MEMORY / (num_online_cpus() * CFS_PAGE_SIZE); for (i = 0; i < num_online_cpus(); i++) for (j = 0; j < lwt_pages_per_cpu; j++) { @@ -202,7 +204,7 @@ lwt_init () return (-ENOMEM); } - PORTAL_ALLOC(lwtp, sizeof (*lwtp)); + LIBCFS_ALLOC(lwtp, sizeof (*lwtp)); if (lwtp == NULL) { CERROR ("Can't allocate lwtp\n"); __free_page(page); @@ -212,7 +214,7 @@ lwt_init () lwtp->lwtp_page = page; lwtp->lwtp_events = page_address(page); - memset (lwtp->lwtp_events, 0, PAGE_SIZE); + memset (lwtp->lwtp_events, 0, CFS_PAGE_SIZE); if (j == 0) { INIT_LIST_HEAD (&lwtp->lwtp_list); @@ -253,7 +255,7 @@ lwt_fini () } __free_page (lwtp->lwtp_page); - PORTAL_FREE (lwtp, sizeof (*lwtp)); + LIBCFS_FREE (lwtp, sizeof (*lwtp)); } } diff --git a/lnet/libcfs/misc.c b/lnet/libcfs/misc.c new file mode 100644 index 0000000..0ace40d --- /dev/null +++ b/lnet/libcfs/misc.c @@ -0,0 +1,53 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2005 Cluster File Systems, Inc. + * Author: Nikita Danilov + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif + +# define DEBUG_SUBSYSTEM S_LNET + +#include + +/* + * On-wire format is native kdev_t format of Linux kernel 2.6 + */ +enum { + WIRE_RDEV_MINORBITS = 20, + WIRE_RDEV_MINORMASK = ((1U << WIRE_RDEV_MINORBITS) - 1) +}; + +cfs_wire_rdev_t cfs_wire_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor) +{ + return (major << WIRE_RDEV_MINORBITS) | minor; +} + +cfs_major_nr_t cfs_wire_rdev_major(cfs_wire_rdev_t rdev) +{ + return rdev >> WIRE_RDEV_MINORBITS; +} + +cfs_minor_nr_t cfs_wire_rdev_minor(cfs_wire_rdev_t rdev) +{ + return rdev & WIRE_RDEV_MINORMASK; +} + diff --git a/lnet/libcfs/module.c b/lnet/libcfs/module.c index 7da61f4..5e273cb 100644 --- a/lnet/libcfs/module.c +++ b/lnet/libcfs/module.c @@ -22,35 +22,17 @@ #ifndef EXPORT_SYMTAB # define EXPORT_SYMTAB #endif -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET -#include -#include +#include +#include #include - -struct nal_cmd_handler { - int nch_number; - nal_cmd_handler_fn *nch_handler; - void *nch_private; -}; - -static struct nal_cmd_handler nal_cmd[16]; -struct semaphore nal_cmd_mutex; - -#ifdef PORTAL_DEBUG -void kportal_assertion_failed(char *expr, char *file, const char *func, - const int line) -{ - portals_debug_msg(0, D_EMERG, file, func, line, CDEBUG_STACK, - "ASSERTION(%s) failed\n", expr); - LBUG_WITH_LOC(file, func, line); -} -#endif +#include "tracefile.h" void -kportal_memhog_free (struct portals_device_userstate *pdu) +kportal_memhog_free (struct libcfs_device_userstate *ldu) { - cfs_page_t **level0p = &pdu->pdu_memhog_root_page; + cfs_page_t **level0p = &ldu->ldu_memhog_root_page; cfs_page_t **level1p; cfs_page_t **level2p; int count1; @@ -71,28 +53,28 @@ kportal_memhog_free (struct portals_device_userstate *pdu) *level2p != NULL) { cfs_free_page(*level2p); - pdu->pdu_memhog_pages--; + ldu->ldu_memhog_pages--; level2p++; count2++; } cfs_free_page(*level1p); - pdu->pdu_memhog_pages--; + ldu->ldu_memhog_pages--; level1p++; count1++; } cfs_free_page(*level0p); - pdu->pdu_memhog_pages--; + ldu->ldu_memhog_pages--; *level0p = NULL; } - LASSERT (pdu->pdu_memhog_pages == 0); + LASSERT (ldu->ldu_memhog_pages == 0); } int -kportal_memhog_alloc (struct portals_device_userstate *pdu, int npages, int flags) +kportal_memhog_alloc (struct libcfs_device_userstate *ldu, int npages, int flags) { cfs_page_t **level0p; cfs_page_t **level1p; @@ -100,8 +82,8 @@ kportal_memhog_alloc (struct portals_device_userstate *pdu, int npages, int flag int count1; int count2; - LASSERT (pdu->pdu_memhog_pages == 0); - LASSERT (pdu->pdu_memhog_root_page == NULL); + LASSERT (ldu->ldu_memhog_pages == 0); + LASSERT (ldu->ldu_memhog_root_page == NULL); if (npages < 0) return -EINVAL; @@ -109,41 +91,41 @@ kportal_memhog_alloc (struct portals_device_userstate *pdu, int npages, int flag if (npages == 0) return 0; - level0p = &pdu->pdu_memhog_root_page; + level0p = &ldu->ldu_memhog_root_page; *level0p = cfs_alloc_page(flags); if (*level0p == NULL) return -ENOMEM; - pdu->pdu_memhog_pages++; + ldu->ldu_memhog_pages++; level1p = (cfs_page_t **)cfs_page_address(*level0p); count1 = 0; memset(level1p, 0, CFS_PAGE_SIZE); - while (pdu->pdu_memhog_pages < npages && + while (ldu->ldu_memhog_pages < npages && count1 < CFS_PAGE_SIZE/sizeof(cfs_page_t *)) { - if (cfs_signal_pending(cfs_current())) + if (cfs_signal_pending()) return (-EINTR); *level1p = cfs_alloc_page(flags); if (*level1p == NULL) return -ENOMEM; - pdu->pdu_memhog_pages++; + ldu->ldu_memhog_pages++; level2p = (cfs_page_t 
**)cfs_page_address(*level1p); count2 = 0; memset(level2p, 0, CFS_PAGE_SIZE); - while (pdu->pdu_memhog_pages < npages && + while (ldu->ldu_memhog_pages < npages && count2 < CFS_PAGE_SIZE/sizeof(cfs_page_t *)) { - if (cfs_signal_pending(cfs_current())) + if (cfs_signal_pending()) return (-EINTR); *level2p = cfs_alloc_page(flags); if (*level2p == NULL) return (-ENOMEM); - pdu->pdu_memhog_pages++; + ldu->ldu_memhog_pages++; level2p++; count2++; @@ -159,17 +141,17 @@ kportal_memhog_alloc (struct portals_device_userstate *pdu, int npages, int flag /* called when opening /dev/device */ static int libcfs_psdev_open(unsigned long flags, void *args) { - struct portals_device_userstate *pdu; + struct libcfs_device_userstate *ldu; ENTRY; PORTAL_MODULE_USE; - PORTAL_ALLOC(pdu, sizeof(*pdu)); - if (pdu != NULL) { - pdu->pdu_memhog_pages = 0; - pdu->pdu_memhog_root_page = NULL; + LIBCFS_ALLOC(ldu, sizeof(*ldu)); + if (ldu != NULL) { + ldu->ldu_memhog_pages = 0; + ldu->ldu_memhog_root_page = NULL; } - *(struct portals_device_userstate **)args = pdu; + *(struct libcfs_device_userstate **)args = ldu; RETURN(0); } @@ -177,157 +159,49 @@ static int libcfs_psdev_open(unsigned long flags, void *args) /* called when closing /dev/device */ static int libcfs_psdev_release(unsigned long flags, void *args) { - struct portals_device_userstate *pdu; + struct libcfs_device_userstate *ldu; ENTRY; - pdu = (struct portals_device_userstate *)args; - if (pdu != NULL) { - kportal_memhog_free(pdu); - PORTAL_FREE(pdu, sizeof(*pdu)); + ldu = (struct libcfs_device_userstate *)args; + if (ldu != NULL) { + kportal_memhog_free(ldu); + LIBCFS_FREE(ldu, sizeof(*ldu)); } PORTAL_MODULE_UNUSE; RETURN(0); } -static inline void freedata(void *data, int len) -{ - PORTAL_FREE(data, len); -} - -struct nal_cmd_handler * -libcfs_find_nal_cmd_handler(int nal) -{ - int i; - - for (i = 0; i < sizeof(nal_cmd)/sizeof(nal_cmd[0]); i++) - if (nal_cmd[i].nch_handler != NULL && - nal_cmd[i].nch_number == nal) - return (&nal_cmd[i]); - - return (NULL); -} - -int -libcfs_nal_cmd_register(int nal, nal_cmd_handler_fn *handler, void *private) -{ - struct nal_cmd_handler *cmd; - int i; - int rc; - - CDEBUG(D_IOCTL, "Register NAL %x, handler: %p\n", nal, handler); - - mutex_down(&nal_cmd_mutex); - - if (libcfs_find_nal_cmd_handler(nal) != NULL) { - mutex_up (&nal_cmd_mutex); - return (-EBUSY); - } - - cmd = NULL; - for (i = 0; i < sizeof(nal_cmd)/sizeof(nal_cmd[0]); i++) - if (nal_cmd[i].nch_handler == NULL) { - cmd = &nal_cmd[i]; - break; - } - - if (cmd == NULL) { - rc = -EBUSY; - } else { - rc = 0; - cmd->nch_number = nal; - cmd->nch_handler = handler; - cmd->nch_private = private; - } - - mutex_up(&nal_cmd_mutex); - - return rc; -} -EXPORT_SYMBOL(libcfs_nal_cmd_register); - -void -libcfs_nal_cmd_unregister(int nal) -{ - struct nal_cmd_handler *cmd; - - CDEBUG(D_IOCTL, "Unregister NAL %x\n", nal); - - mutex_down(&nal_cmd_mutex); - cmd = libcfs_find_nal_cmd_handler(nal); - LASSERT (cmd != NULL); - cmd->nch_handler = NULL; - cmd->nch_private = NULL; - mutex_up(&nal_cmd_mutex); -} -EXPORT_SYMBOL(libcfs_nal_cmd_unregister); - -int -libcfs_nal_cmd(struct portals_cfg *pcfg) -{ -#if CRAY_PORTALS - /* pretend success */ - RETURN(0); -#else - struct nal_cmd_handler *cmd; - __u32 nal = pcfg->pcfg_nal; - int rc = -EINVAL; - ENTRY; - - if (pcfg->pcfg_version != PORTALS_CFG_VERSION) { - RETURN(-EINVAL); - } - - mutex_down(&nal_cmd_mutex); - cmd = libcfs_find_nal_cmd_handler(nal); - if (cmd != NULL) { - CDEBUG(D_IOCTL, "calling handler nal: %x, cmd: %d\n", nal, - 
pcfg->pcfg_command); - rc = cmd->nch_handler(pcfg, cmd->nch_private); - } else { - CERROR("invalid nal: %x, cmd: %d\n", nal, pcfg->pcfg_command); - } - mutex_up(&nal_cmd_mutex); - - RETURN(rc); -#endif -} -EXPORT_SYMBOL(libcfs_nal_cmd); - static struct rw_semaphore ioctl_list_sem; static struct list_head ioctl_list; int libcfs_register_ioctl(struct libcfs_ioctl_handler *hand) { int rc = 0; - down_read(&ioctl_list_sem); + + down_write(&ioctl_list_sem); if (!list_empty(&hand->item)) rc = -EBUSY; - up_read(&ioctl_list_sem); - - if (rc == 0) { - down_write(&ioctl_list_sem); + else list_add_tail(&hand->item, &ioctl_list); - up_write(&ioctl_list_sem); - } - RETURN(0); + up_write(&ioctl_list_sem); + + return rc; } EXPORT_SYMBOL(libcfs_register_ioctl); int libcfs_deregister_ioctl(struct libcfs_ioctl_handler *hand) { int rc = 0; - down_read(&ioctl_list_sem); + + down_write(&ioctl_list_sem); if (list_empty(&hand->item)) rc = -ENOENT; - up_read(&ioctl_list_sem); - - if (rc == 0) { - down_write(&ioctl_list_sem); + else list_del_init(&hand->item); - up_write(&ioctl_list_sem); - } - RETURN(0); + up_write(&ioctl_list_sem); + + return rc; } EXPORT_SYMBOL(libcfs_deregister_ioctl); @@ -335,112 +209,67 @@ static int libcfs_ioctl(struct cfs_psdev_file *pfile, unsigned long cmd, void *a { char buf[1024]; int err = -EINVAL; - struct portal_ioctl_data *data; + struct libcfs_ioctl_data *data; ENTRY; /* 'cmd' and permissions get checked in our arch-specific caller */ - if (portal_ioctl_getdata(buf, buf + 800, (void *)arg)) { + if (libcfs_ioctl_getdata(buf, buf + 800, (void *)arg)) { CERROR("PORTALS ioctl: data error\n"); RETURN(-EINVAL); } - data = (struct portal_ioctl_data *)buf; + data = (struct libcfs_ioctl_data *)buf; switch (cmd) { - case IOC_PORTAL_CLEAR_DEBUG: - portals_debug_clear_buffer(); + case IOC_LIBCFS_CLEAR_DEBUG: + libcfs_debug_clear_buffer(); RETURN(0); /* - * case IOC_PORTAL_PANIC: + * case IOC_LIBCFS_PANIC: * Handled in arch/cfs_module.c */ - case IOC_PORTAL_MARK_DEBUG: - if (data->ioc_inlbuf1 == NULL || - data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0') - RETURN(-EINVAL); - portals_debug_mark_buffer(data->ioc_inlbuf1); - RETURN(0); - case IOC_PORTAL_DMSG: + case IOC_LIBCFS_MARK_DEBUG: if (data->ioc_inlbuf1 == NULL || data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0') RETURN(-EINVAL); - printk("%s", data->ioc_inlbuf1); + libcfs_debug_mark_buffer(data->ioc_inlbuf1); RETURN(0); #if LWT_SUPPORT - case IOC_PORTAL_LWT_CONTROL: - err = lwt_control (data->ioc_flags, data->ioc_misc); + case IOC_LIBCFS_LWT_CONTROL: + err = lwt_control ((data->ioc_flags & 1) != 0, + (data->ioc_flags & 2) != 0); break; - case IOC_PORTAL_LWT_SNAPSHOT: { + case IOC_LIBCFS_LWT_SNAPSHOT: { cycles_t now; int ncpu; int total_size; err = lwt_snapshot (&now, &ncpu, &total_size, data->ioc_pbuf1, data->ioc_plen1); - data->ioc_nid = now; - data->ioc_count = ncpu; - data->ioc_misc = total_size; + data->ioc_u64[0] = now; + data->ioc_u32[0] = ncpu; + data->ioc_u32[1] = total_size; /* Hedge against broken user/kernel typedefs (e.g. 
cycles_t) */ - data->ioc_nid2 = sizeof(lwt_event_t); - data->ioc_nid3 = offsetof(lwt_event_t, lwte_where); + data->ioc_u32[2] = sizeof(lwt_event_t); + data->ioc_u32[3] = offsetof(lwt_event_t, lwte_where); if (err == 0 && - copy_to_user((char *)arg, data, sizeof (*data))) + libcfs_ioctl_popdata(arg, data, sizeof (*data))) err = -EFAULT; break; } - case IOC_PORTAL_LWT_LOOKUP_STRING: + case IOC_LIBCFS_LWT_LOOKUP_STRING: err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1, data->ioc_pbuf2, data->ioc_plen2); if (err == 0 && - copy_to_user((char *)arg, data, sizeof (*data))) + libcfs_ioctl_popdata(arg, data, sizeof (*data))) err = -EFAULT; break; #endif - case IOC_PORTAL_NAL_CMD: { - struct portals_cfg pcfg; - - if (data->ioc_plen1 != sizeof(pcfg)) { - CERROR("Bad ioc_plen1 %d (wanted "LPSZ")\n", - data->ioc_plen1, sizeof(pcfg)); - err = -EINVAL; - break; - } - - if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1, - sizeof(pcfg))) { - err = -EFAULT; - break; - } - - CDEBUG (D_IOCTL, "nal command nal %x cmd %d\n", pcfg.pcfg_nal, - pcfg.pcfg_command); - if (pcfg.pcfg_version != PORTALS_CFG_VERSION) { - /* set this so userspace can tell when they - * have an incompatible version and print a - * decent message to the user - */ - pcfg.pcfg_version = PORTALS_CFG_VERSION; - if (copy_to_user((char *)data->ioc_pbuf1, &pcfg, - sizeof (pcfg))) - err = -EFAULT; - else - err = -EINVAL; - } else { - err = libcfs_nal_cmd(&pcfg); - - if (err == 0 && - copy_to_user((char *)data->ioc_pbuf1, &pcfg, - sizeof (pcfg))) - err = -EFAULT; - } - break; - } - - case IOC_PORTAL_MEMHOG: + case IOC_LIBCFS_MEMHOG: if (pfile->private_data == NULL) { err = -EINVAL; } else { @@ -454,17 +283,39 @@ static int libcfs_ioctl(struct cfs_psdev_file *pfile, unsigned long cmd, void *a } break; + case IOC_LIBCFS_PING_TEST: { + extern void (kping_client)(struct libcfs_ioctl_data *); + void (*ping)(struct libcfs_ioctl_data *); + + CDEBUG(D_IOCTL, "doing %d pings to nid %s (%s)\n", + data->ioc_count, libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(data->ioc_nid)); + ping = PORTAL_SYMBOL_GET(kping_client); + if (!ping) + CERROR("PORTAL_SYMBOL_GET failed\n"); + else { + ping(data); + PORTAL_SYMBOL_PUT(kping_client); + } + RETURN(0); + } + default: { struct libcfs_ioctl_handler *hand; err = -EINVAL; down_read(&ioctl_list_sem); list_for_each_entry(hand, &ioctl_list, item) { - err = hand->handle_ioctl(data, cmd, (unsigned long)arg); - if (err != -EINVAL) + err = hand->handle_ioctl(cmd, data); + if (err != -EINVAL) { + if (err == 0) + err = libcfs_ioctl_popdata(arg, + data, sizeof (*data)); break; + } } up_read(&ioctl_list_sem); - } break; + break; + } } RETURN(err); @@ -488,6 +339,7 @@ extern cfs_psdev_t libcfs_dev; extern struct rw_semaphore tracefile_sem; extern struct semaphore trace_thread_sem; +extern void libcfs_init_nidstrings(void); extern int libcfs_arch_init(void); extern void libcfs_arch_cleanup(void); @@ -496,15 +348,15 @@ static int init_libcfs_module(void) int rc; libcfs_arch_init(); + libcfs_init_nidstrings(); init_rwsem(&tracefile_sem); init_mutex(&trace_thread_sem); - init_mutex(&nal_cmd_mutex); init_rwsem(&ioctl_list_sem); CFS_INIT_LIST_HEAD(&ioctl_list); - rc = portals_debug_init(5 * 1024 * 1024); + rc = libcfs_debug_init(5 * 1024 * 1024); if (rc < 0) { - printk(KERN_ERR "LustreError: portals_debug_init: %d\n", rc); + printk(KERN_ERR "LustreError: libcfs_debug_init: %d\n", rc); return (rc); } @@ -537,7 +389,7 @@ static int init_libcfs_module(void) lwt_fini(); cleanup_debug: #endif - portals_debug_cleanup(); + 
libcfs_debug_cleanup(); return rc; } @@ -548,7 +400,7 @@ static void exit_libcfs_module(void) remove_proc(); CDEBUG(D_MALLOC, "before Portals cleanup: kmem %d\n", - atomic_read(&portal_kmemory)); + atomic_read(&libcfs_kmemory)); rc = cfs_psdev_deregister(&libcfs_dev); if (rc) @@ -558,16 +410,14 @@ static void exit_libcfs_module(void) lwt_fini(); #endif - if (atomic_read(&portal_kmemory) != 0) + if (atomic_read(&libcfs_kmemory) != 0) CERROR("Portals memory leaked: %d bytes\n", - atomic_read(&portal_kmemory)); + atomic_read(&libcfs_kmemory)); - rc = portals_debug_cleanup(); + rc = libcfs_debug_cleanup(); if (rc) - printk(KERN_ERR "LustreError: portals_debug_cleanup: %d\n", rc); + printk(KERN_ERR "LustreError: libcfs_debug_cleanup: %d\n", rc); libcfs_arch_cleanup(); } -EXPORT_SYMBOL(kportal_assertion_failed); - cfs_module(libcfs, "1.0.0", init_libcfs_module, exit_libcfs_module); diff --git a/lnet/libcfs/nidstrings.c b/lnet/libcfs/nidstrings.c new file mode 100644 index 0000000..78a255d --- /dev/null +++ b/lnet/libcfs/nidstrings.c @@ -0,0 +1,533 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Phil Schwan + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#ifndef __KERNEL__ +#ifdef HAVE_GETHOSTBYNAME +# include +#endif +#endif + +/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids + * consistent in all conversion functions. Some code fragments are copied + * around for the sake of clarity... + */ + +/* CAVEAT EMPTOR! Racey temporary buffer allocation! + * Choose the number of nidstrings to support the MAXIMUM expected number of + * concurrent users. If there are more, the returned string will be volatile. + * NB this number must allow for a process to be descheduled for a timeslice + * between getting its string and using it. 
+ */ + +#define LNET_NIDSTR_COUNT 128 /* # of nidstrings */ +#define LNET_NIDSTR_SIZE 32 /* size of each one (see below for usage) */ + +static char libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE]; +static int libcfs_nidstring_idx = 0; + +#ifdef __KERNEL__ +static spinlock_t libcfs_nidstring_lock; + +void libcfs_init_nidstrings (void) +{ + spin_lock_init(&libcfs_nidstring_lock); +} + +# define NIDSTR_LOCK(f) spin_lock_irqsave(&libcfs_nidstring_lock, f) +# define NIDSTR_UNLOCK(f) spin_unlock_irqrestore(&libcfs_nidstring_lock, f) +#else +# define NIDSTR_LOCK(f) (f=0) /* avoid unused var warnings */ +# define NIDSTR_UNLOCK(f) (f=0) +#endif + +static char * +libcfs_next_nidstring (void) +{ + char *str; + unsigned long flags; + + NIDSTR_LOCK(flags); + + str = libcfs_nidstrings[libcfs_nidstring_idx++]; + if (libcfs_nidstring_idx == + sizeof(libcfs_nidstrings)/sizeof(libcfs_nidstrings[0])) + libcfs_nidstring_idx = 0; + + NIDSTR_UNLOCK(flags); + return str; +} + +static int libcfs_lo_str2addr(char *str, int nob, __u32 *addr); +static void libcfs_ip_addr2str(__u32 addr, char *str); +static int libcfs_ip_str2addr(char *str, int nob, __u32 *addr); +static void libcfs_decnum_addr2str(__u32 addr, char *str); +static void libcfs_hexnum_addr2str(__u32 addr, char *str); +static int libcfs_num_str2addr(char *str, int nob, __u32 *addr); + +struct netstrfns { + int nf_type; + char *nf_name; + char *nf_modname; + void (*nf_addr2str)(__u32 addr, char *str); + int (*nf_str2addr)(char *str, int nob, __u32 *addr); +}; + +static struct netstrfns libcfs_netstrfns[] = { + {/* .nf_type */ LOLND, + /* .nf_name */ "lo", + /* .nf_modname */ "klolnd", + /* .nf_addr2str */ libcfs_decnum_addr2str, + /* .nf_str2addr */ libcfs_lo_str2addr}, + {/* .nf_type */ SOCKLND, + /* .nf_name */ "tcp", + /* .nf_modname */ "ksocklnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr}, + {/* .nf_type */ O2IBLND, + /* .nf_name */ "o2ib", + /* .nf_modname */ "ko2iblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr}, + {/* .nf_type */ CIBLND, + /* .nf_name */ "cib", + /* .nf_modname */ "kciblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr}, + {/* .nf_type */ OPENIBLND, + /* .nf_name */ "openib", + /* .nf_modname */ "kopeniblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr}, + {/* .nf_type */ IIBLND, + /* .nf_name */ "iib", + /* .nf_modname */ "kiiblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr}, + {/* .nf_type */ VIBLND, + /* .nf_name */ "vib", + /* .nf_modname */ "kviblnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr}, + {/* .nf_type */ RALND, + /* .nf_name */ "ra", + /* .nf_modname */ "kralnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr}, + {/* .nf_type */ QSWLND, + /* .nf_name */ "elan", + /* .nf_modname */ "kqswlnd", + /* .nf_addr2str */ libcfs_decnum_addr2str, + /* .nf_str2addr */ libcfs_num_str2addr}, + {/* .nf_type */ GMLND, + /* .nf_name */ "gm", + /* .nf_modname */ "kgmlnd", + /* .nf_addr2str */ libcfs_hexnum_addr2str, + /* .nf_str2addr */ libcfs_num_str2addr}, + {/* .nf_type */ MXLND, + /* .nf_name */ "mx", + /* .nf_modname */ "kmxlnd", + /* .nf_addr2str */ libcfs_ip_addr2str, + /* .nf_str2addr */ libcfs_ip_str2addr}, + {/* .nf_type */ PTLLND, + /* .nf_name */ "ptl", + /* .nf_modname */ "kptllnd", + /* .nf_addr2str */ libcfs_decnum_addr2str, + /* .nf_str2addr */ 
libcfs_num_str2addr}, + /* placeholder for net0 alias. It MUST BE THE LAST ENTRY */ + {/* .nf_type */ -1}, +}; + +const int libcfs_nnetstrfns = sizeof(libcfs_netstrfns)/sizeof(libcfs_netstrfns[0]); + +int +libcfs_lo_str2addr(char *str, int nob, __u32 *addr) +{ + *addr = 0; + return 1; +} + +void +libcfs_ip_addr2str(__u32 addr, char *str) +{ +#if 0 /* never lookup */ +#if !defined(__KERNEL__) && defined HAVE_GETHOSTBYNAME + __u32 netip = htonl(addr); + struct hostent *he = gethostbyaddr(&netip, sizeof(netip), AF_INET); + + if (he != NULL) { + snprintf(str, LNET_NIDSTR_SIZE, "%s", he->h_name); + return; + } +#endif +#endif + snprintf(str, LNET_NIDSTR_SIZE, "%u.%u.%u.%u", + (addr >> 24) & 0xff, (addr >> 16) & 0xff, + (addr >> 8) & 0xff, addr & 0xff); +} + +/* CAVEAT EMPTOR XscanfX + * I use "%n" at the end of a sscanf format to detect trailing junk. However + * sscanf may return immediately if it sees the terminating '0' in a string, so + * I initialise the %n variable to the expected length. If sscanf sets it; + * fine, if it doesn't, then the scan ended at the end of the string, which is + * fine too :) */ + +int +libcfs_ip_str2addr(char *str, int nob, __u32 *addr) +{ + int a; + int b; + int c; + int d; + int n = nob; /* XscanfX */ + + /* numeric IP? */ + if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 && + n == nob && + (a & ~0xff) == 0 && (b & ~0xff) == 0 && + (c & ~0xff) == 0 && (d & ~0xff) == 0) { + *addr = ((a<<24)|(b<<16)|(c<<8)|d); + return 1; + } + +#if !defined(__KERNEL__) && defined HAVE_GETHOSTBYNAME + /* known hostname? */ + if (('a' <= str[0] && str[0] <= 'z') || + ('A' <= str[0] && str[0] <= 'Z')) { + char *tmp; + + LIBCFS_ALLOC(tmp, nob + 1); + if (tmp != NULL) { + struct hostent *he; + + memcpy(tmp, str, nob); + tmp[nob] = 0; + + he = gethostbyname(tmp); + + LIBCFS_FREE(tmp, nob); + + if (he != NULL) { + __u32 ip = *(__u32 *)he->h_addr; + + *addr = ntohl(ip); + return 1; + } + } + } +#endif + return 0; +} + +void +libcfs_decnum_addr2str(__u32 addr, char *str) +{ + snprintf(str, LNET_NIDSTR_SIZE, "%u", addr); +} + +void +libcfs_hexnum_addr2str(__u32 addr, char *str) +{ + snprintf(str, LNET_NIDSTR_SIZE, "0x%x", addr); +} + +int +libcfs_num_str2addr(char *str, int nob, __u32 *addr) +{ + int n; + + n = nob; + if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob) + return 1; + + n = nob; + if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob) + return 1; + + return 0; +} + +struct netstrfns * +libcfs_lnd2netstrfns(int lnd) +{ + int i; + + if (lnd >= 0) + for (i = 0; i < libcfs_nnetstrfns; i++) + if (lnd == libcfs_netstrfns[i].nf_type) + return &libcfs_netstrfns[i]; + + return NULL; +} + +struct netstrfns * +libcfs_name2netstrfns(char *name) +{ + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) + if (libcfs_netstrfns[i].nf_type >= 0 && + !strcmp(libcfs_netstrfns[i].nf_name, name)) + return &libcfs_netstrfns[i]; + + return NULL; +} + +int +libcfs_isknown_lnd(int type) +{ + return libcfs_lnd2netstrfns(type) != NULL; +} + +char * +libcfs_lnd2modname(int lnd) +{ + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + + return (nf == NULL) ? 
NULL : nf->nf_modname; +} + +char * +libcfs_lnd2str(int lnd) +{ + char *str; + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + + if (nf != NULL) + return nf->nf_name; + + str = libcfs_next_nidstring(); + snprintf(str, LNET_NIDSTR_SIZE, "?%u?", lnd); + return str; +} + +int +libcfs_str2lnd(char *str) +{ + struct netstrfns *nf = libcfs_name2netstrfns(str); + + if (nf != NULL) + return nf->nf_type; + + return -1; +} + +char * +libcfs_net2str(__u32 net) +{ + int lnd = LNET_NETTYP(net); + int num = LNET_NETNUM(net); + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + char *str = libcfs_next_nidstring(); + + if (nf == NULL) + snprintf(str, LNET_NIDSTR_SIZE, "<%u:%u>", lnd, num); + else if (num == 0) + snprintf(str, LNET_NIDSTR_SIZE, "%s", nf->nf_name); + else + snprintf(str, LNET_NIDSTR_SIZE, "%s%u", nf->nf_name, num); + + return str; +} + +char * +libcfs_nid2str(lnet_nid_t nid) +{ + __u32 addr = LNET_NIDADDR(nid); + __u32 net = LNET_NIDNET(nid); + int lnd = LNET_NETTYP(net); + int nnum = LNET_NETNUM(net); + struct netstrfns *nf; + char *str; + int nob; + + if (nid == LNET_NID_ANY) + return "LNET_NID_ANY"; + + nf = libcfs_lnd2netstrfns(lnd); + str = libcfs_next_nidstring(); + + if (nf == NULL) + snprintf(str, LNET_NIDSTR_SIZE, "%x@<%u:%u>", addr, lnd, nnum); + else { + nf->nf_addr2str(addr, str); + nob = strlen(str); + if (nnum == 0) + snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s", + nf->nf_name); + else + snprintf(str + nob, LNET_NIDSTR_SIZE - nob, "@%s%u", + nf->nf_name, nnum); + } + + return str; +} + +static struct netstrfns * +libcfs_str2net_internal(char *str, __u32 *net) +{ + struct netstrfns *nf; + int nob; + int netnum; + int i; + + for (i = 0; i < libcfs_nnetstrfns; i++) { + nf = &libcfs_netstrfns[i]; + if (nf->nf_type >= 0 && + !strncmp(str, nf->nf_name, strlen(nf->nf_name))) + break; + } + + if (i == libcfs_nnetstrfns) + return NULL; + + nob = strlen(nf->nf_name); + + if (strlen(str) == (unsigned int)nob) { + netnum = 0; + } else { + if (nf->nf_type == LOLND) /* net number not allowed */ + return NULL; + + str += nob; + i = strlen(str); + if (sscanf(str, "%u%n", &netnum, &i) < 1 || + i != (int)strlen(str)) + return NULL; + } + + *net = LNET_MKNET(nf->nf_type, netnum); + return nf; +} + +__u32 +libcfs_str2net(char *str) +{ + __u32 net; + + if (libcfs_str2net_internal(str, &net) != NULL) + return net; + + return LNET_NIDNET(LNET_NID_ANY); +} + +lnet_nid_t +libcfs_str2nid(char *str) +{ + char *sep = strchr(str, '@'); + struct netstrfns *nf; + __u32 net; + __u32 addr; + + if (sep != NULL) { + nf = libcfs_str2net_internal(sep + 1, &net); + if (nf == NULL) + return LNET_NID_ANY; + } else { + sep = str + strlen(str); + net = LNET_MKNET(SOCKLND, 0); + nf = libcfs_lnd2netstrfns(SOCKLND); + LASSERT (nf != NULL); + } + + if (!nf->nf_str2addr(str, sep - str, &addr)) + return LNET_NID_ANY; + + return LNET_MKNID(net, addr); +} + +char * +libcfs_id2str(lnet_process_id_t id) +{ + char *str = libcfs_next_nidstring(); + + snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s", + ((id.pid & LNET_PID_USERFLAG) != 0) ? 
"U" : "", + (id.pid & ~LNET_PID_USERFLAG), libcfs_nid2str(id.nid)); + return str; +} + +int +libcfs_str2anynid(lnet_nid_t *nidp, char *str) +{ + if (!strcmp(str, "*")) { + *nidp = LNET_NID_ANY; + return 1; + } + + *nidp = libcfs_str2nid(str); + return *nidp != LNET_NID_ANY; +} + +#ifdef __KERNEL__ +void +libcfs_setnet0alias(int lnd) +{ + struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); + struct netstrfns *nf0 = &libcfs_netstrfns[libcfs_nnetstrfns - 1]; + + /* Ghastly hack to allow LNET to inter-operate with portals. + * NET type 0 becomes an alias for whatever local network we have, and + * this assignment here means we can parse and print its NIDs */ + + LASSERT (nf != NULL); + LASSERT (nf0->nf_type < 0); + + nf0->nf_name = "zero";//nf->nf_name; + nf0->nf_modname = nf->nf_modname; + nf0->nf_addr2str = nf->nf_addr2str; + nf0->nf_str2addr = nf->nf_str2addr; + mb(); + nf0->nf_type = 0; +} + +EXPORT_SYMBOL(libcfs_isknown_lnd); +EXPORT_SYMBOL(libcfs_lnd2modname); +EXPORT_SYMBOL(libcfs_lnd2str); +EXPORT_SYMBOL(libcfs_str2lnd); +EXPORT_SYMBOL(libcfs_net2str); +EXPORT_SYMBOL(libcfs_nid2str); +EXPORT_SYMBOL(libcfs_str2net); +EXPORT_SYMBOL(libcfs_str2nid); +EXPORT_SYMBOL(libcfs_id2str); +EXPORT_SYMBOL(libcfs_str2anynid); +EXPORT_SYMBOL(libcfs_setnet0alias); +#else /* __KERNEL__ */ +void +libcfs_setnet0alias(int lnd) +{ + LCONSOLE_ERROR("Liblustre cannot interoperate with old Portals.\n" + "portals_compatibility must be set to 'none'.\n"); +} +#endif diff --git a/lnet/libcfs/tracefile.c b/lnet/libcfs/tracefile.c index e93ff1b..0b8e61e 100644 --- a/lnet/libcfs/tracefile.c +++ b/lnet/libcfs/tracefile.c @@ -22,7 +22,7 @@ */ -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET #define LUSTRE_TRACEFILE_PRIVATE #include "tracefile.h" @@ -32,14 +32,16 @@ /* XXX move things up to the top, comment */ union trace_data_union trace_data[NR_CPUS] __cacheline_aligned; -struct rw_semaphore tracefile_sem; char *tracefile = NULL; -long long tracefile_size = TRACEFILE_SIZE; +int64_t tracefile_size = TRACEFILE_SIZE; static struct tracefiled_ctl trace_tctl; struct semaphore trace_thread_sem; static int thread_running = 0; -static void put_pages_on_daemon_list_on_cpu(void *info); +atomic_t tage_allocated = ATOMIC_INIT(0); + +static void put_pages_on_tcd_daemon_list(struct page_collection *pc, + struct trace_cpu_data *tcd); static inline struct trace_page *tage_from_list(struct list_head *list) { @@ -51,71 +53,91 @@ static struct trace_page *tage_alloc(int gfp) cfs_page_t *page; struct trace_page *tage; + /* + * Don't spam console with allocation failures: they will be reported + * by upper layer anyway. 
+ */ + gfp |= CFS_ALLOC_NOWARN; page = cfs_alloc_page(gfp); if (page == NULL) return NULL; - + tage = cfs_alloc(sizeof(*tage), gfp); if (tage == NULL) { cfs_free_page(page); return NULL; } - + tage->page = page; + atomic_inc(&tage_allocated); return tage; } static void tage_free(struct trace_page *tage) { - LASSERT(tage != NULL); - LASSERT(tage->page != NULL); + __LASSERT(tage != NULL); + __LASSERT(tage->page != NULL); cfs_free_page(tage->page); cfs_free(tage); + atomic_dec(&tage_allocated); } static void tage_to_tail(struct trace_page *tage, struct list_head *queue) { - LASSERT(tage != NULL); - LASSERT(queue != NULL); + __LASSERT(tage != NULL); + __LASSERT(queue != NULL); list_move_tail(&tage->linkage, queue); } -static void LASSERT_TAGE_INVARIANT(struct trace_page *tage) +int trace_refill_stock(struct trace_cpu_data *tcd, int gfp, + struct list_head *stock) { - LASSERT(tage != NULL); - LASSERT(tage->page != NULL); - LASSERTF(tage->used <= CFS_PAGE_SIZE, "used = %u, PAGE_SIZE %lu\n", - tage->used, CFS_PAGE_SIZE); - LASSERTF(cfs_page_count(tage->page) > 0, "count = %d\n", - cfs_page_count(tage->page)); + int i; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ + + for (i = 0; i + tcd->tcd_cur_stock_pages < TCD_STOCK_PAGES ; ++ i) { + struct trace_page *tage; + + tage = tage_alloc(gfp); + if (tage == NULL) + break; + list_add_tail(&tage->linkage, stock); + } + return i; } /* return a page that has 'len' bytes left at the end */ -static struct trace_page *trace_get_tage(struct trace_cpu_data *tcd, - unsigned long len) +static struct trace_page *trace_get_tage_try(struct trace_cpu_data *tcd, + unsigned long len) { struct trace_page *tage; - if (len > CFS_PAGE_SIZE) { - printk(KERN_ERR "cowardly refusing to write %lu bytes in a " - "page\n", len); - return NULL; - } - - if (!list_empty(&tcd->tcd_pages)) { + if (tcd->tcd_cur_pages > 0) { + __LASSERT(!list_empty(&tcd->tcd_pages)); tage = tage_from_list(tcd->tcd_pages.prev); if (tage->used + len <= CFS_PAGE_SIZE) return tage; } if (tcd->tcd_cur_pages < tcd->tcd_max_pages) { - tage = tage_alloc(CFS_ALLOC_ATOMIC); - if (tage == NULL) { - /* the kernel should print a message for us. fall back - * to using the last page in the ring buffer. */ - goto ring_buffer; + if (tcd->tcd_cur_stock_pages > 0) { + tage = tage_from_list(tcd->tcd_stock_pages.prev); + -- tcd->tcd_cur_stock_pages; + list_del_init(&tage->linkage); + } else { + tage = tage_alloc(CFS_ALLOC_ATOMIC); + if (tage == NULL) { + printk(KERN_WARNING + "failure to allocate a tage (%ld)\n", + tcd->tcd_cur_pages); + return NULL; + } } tage->used = 0; @@ -125,131 +147,346 @@ static struct trace_page *trace_get_tage(struct trace_cpu_data *tcd, if (tcd->tcd_cur_pages > 8 && thread_running) { struct tracefiled_ctl *tctl = &trace_tctl; + /* + * wake up tracefiled to process some pages. 
+ */ cfs_waitq_signal(&tctl->tctl_waitq); } return tage; } + return NULL; +} - ring_buffer: - if (thread_running) { - int pgcount = tcd->tcd_cur_pages / 10; - struct page_collection pc; - struct trace_page *tage; - struct trace_page *tmp; +static void tcd_shrink(struct trace_cpu_data *tcd) +{ + int pgcount = tcd->tcd_cur_pages / 10; + struct page_collection pc; + struct trace_page *tage; + struct trace_page *tmp; - printk(KERN_WARNING "debug daemon buffer overflowed; discarding" - " 10%% of pages (%d)\n", pgcount + 1); + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ - CFS_INIT_LIST_HEAD(&pc.pc_pages); - spin_lock_init(&pc.pc_lock); + printk(KERN_WARNING "debug daemon buffer overflowed; discarding" + " 10%% of pages (%d of %ld)\n", pgcount + 1, tcd->tcd_cur_pages); - list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) { - if (pgcount-- == 0) - break; + CFS_INIT_LIST_HEAD(&pc.pc_pages); + spin_lock_init(&pc.pc_lock); - list_move_tail(&tage->linkage, &pc.pc_pages); - tcd->tcd_cur_pages--; - } - put_pages_on_daemon_list_on_cpu(&pc); + list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) { + if (pgcount-- == 0) + break; - LASSERT(!list_empty(&tcd->tcd_pages)); + list_move_tail(&tage->linkage, &pc.pc_pages); + tcd->tcd_cur_pages--; } + put_pages_on_tcd_daemon_list(&pc, tcd); +} - if (list_empty(&tcd->tcd_pages)) - return NULL; +/* return a page that has 'len' bytes left at the end */ +static struct trace_page *trace_get_tage(struct trace_cpu_data *tcd, + unsigned long len) +{ + struct trace_page *tage; + + /* + * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) + * from here: this will lead to infinite recursion. + */ - tage = tage_from_list(tcd->tcd_pages.next); - tage->used = 0; - tage_to_tail(tage, &tcd->tcd_pages); + if (len > CFS_PAGE_SIZE) { + printk(KERN_ERR + "cowardly refusing to write %lu bytes in a page\n", len); + return NULL; + } + tage = trace_get_tage_try(tcd, len); + if (tage != NULL) + return tage; + if (thread_running) + tcd_shrink(tcd); + if (tcd->tcd_cur_pages > 0) { + tage = tage_from_list(tcd->tcd_pages.next); + tage->used = 0; + tage_to_tail(tage, &tcd->tcd_pages); + } return tage; } -void portals_debug_msg(int subsys, int mask, char *file, const char *fn, - const int line, unsigned long stack, char *format, ...) +int libcfs_debug_vmsg2(cfs_debug_limit_state_t *cdls, int subsys, int mask, + const char *file, const char *fn, const int line, + const char *format1, va_list args, + const char *format2, ...) 
{ - struct trace_cpu_data *tcd; - struct ptldebug_header header; - struct trace_page *tage; - char *debug_buf = format; - int known_size, needed = 85 /* average message length */, max_nob; - va_list ap; - unsigned long flags; - -#ifdef CRAY_PORTALS - if (mask == D_PORTALS && !(portal_debug & D_PORTALS)) - return; -#endif + struct trace_cpu_data *tcd = NULL; + struct ptldebug_header header; + struct trace_page *tage; + /* string_buf is used only if tcd != NULL, and is always set then */ + char *string_buf = NULL; + char *debug_buf; + int known_size; + int needed = 85; /* average message length */ + int max_nob; + va_list ap; + int depth; + int i; + int remain; + if (strchr(file, '/')) file = strrchr(file, '/') + 1; - if (*(format + strlen(format) - 1) != '\n') - printk(KERN_INFO "format at %s:%d:%s doesn't end in newline\n", - file, line, fn); - tcd = trace_get_tcd(flags); - if (tcd->tcd_shutting_down) - goto out; + set_ptldebug_header(&header, subsys, mask, line, CDEBUG_STACK()); - set_ptldebug_header(&header, subsys, mask, line, stack); - known_size = sizeof(header) + strlen(file) + strlen(fn) + 2; // nulls + tcd = trace_get_tcd(); + if (tcd == NULL) /* arch may not log in IRQ context */ + goto console; - retry: - tage = trace_get_tage(tcd, needed + known_size); - if (tage == NULL) { - debug_buf = format; - if (needed + known_size > CFS_PAGE_SIZE) - mask |= D_ERROR; - needed = strlen(format); - goto out; + if (tcd->tcd_shutting_down) { + trace_put_tcd(tcd); + tcd = NULL; + goto console; } - debug_buf = cfs_page_address(tage->page) + tage->used + known_size; + depth = __current_nesting_level(); + known_size = strlen(file) + 1 + depth; + if (fn) + known_size += strlen(fn) + 1; + + if (libcfs_debug_binary) + known_size += sizeof(header); + + /* + * '2' is used because vsnprintf returns the real size required for + * the output _without_ the terminating NULL. + * We retry once if 'needed' turns out to be too small for this format. + */ + for (i = 0; i < 2; i++) { + tage = trace_get_tage(tcd, needed + known_size + 1); + if (tage == NULL) { + if (needed + known_size > CFS_PAGE_SIZE) + mask |= D_ERROR; - max_nob = CFS_PAGE_SIZE - tage->used - known_size; - LASSERT(max_nob > 0); - va_start(ap, format); - needed = vsnprintf(debug_buf, max_nob, format, ap); - va_end(ap); + trace_put_tcd(tcd); + tcd = NULL; + goto console; + } - if (needed > max_nob) /* overflow. oh poop. */ - goto retry; + string_buf = (char *)cfs_page_address(tage->page)+tage->used+known_size; + + max_nob = CFS_PAGE_SIZE - tage->used - known_size; + if (max_nob <= 0) { + printk(KERN_EMERG "negative max_nob: %i\n", max_nob); + mask |= D_ERROR; + trace_put_tcd(tcd); + tcd = NULL; + goto console; + } + + needed = 0; + if (format1) { + va_copy(ap, args); + needed = vsnprintf(string_buf, max_nob, format1, ap); + va_end(ap); + } + + + if (format2) { + remain = max_nob - needed; + if (remain < 0) + remain = 0; + + va_start(ap, format2); + needed += vsnprintf(string_buf+needed, remain, format2, ap); + va_end(ap); + } + if (needed < max_nob) /* output fits; done 
*/ + break; + } + + if (*(string_buf+needed-1) != '\n') + printk(KERN_INFO "format at %s:%d:%s doesn't end in newline\n", + file, line, fn); + header.ph_len = known_size + needed; - debug_buf = cfs_page_address(tage->page) + tage->used; + debug_buf = (char *)cfs_page_address(tage->page) + tage->used; - memcpy(debug_buf, &header, sizeof(header)); - tage->used += sizeof(header); - debug_buf += sizeof(header); + if (libcfs_debug_binary) { + memcpy(debug_buf, &header, sizeof(header)); + tage->used += sizeof(header); + debug_buf += sizeof(header); + } + + /* indent message according to the nesting level */ + while (depth-- > 0) { + *(debug_buf++) = '.'; + ++ tage->used; + } strcpy(debug_buf, file); tage->used += strlen(file) + 1; debug_buf += strlen(file) + 1; - strcpy(debug_buf, fn); - tage->used += strlen(fn) + 1; - debug_buf += strlen(fn) + 1; + if (fn) { + strcpy(debug_buf, fn); + tage->used += strlen(fn) + 1; + debug_buf += strlen(fn) + 1; + } + + __LASSERT(debug_buf == string_buf); tage->used += needed; - if (tage->used > CFS_PAGE_SIZE) - printk(KERN_EMERG - "tage->used == %u in portals_debug_msg\n", tage->used); + __LASSERT (tage->used <= CFS_PAGE_SIZE); + +console: + if (!((mask & D_CANTMASK) != 0 || (mask & libcfs_printk) != 0)) { + /* no console output requested */ + if (tcd != NULL) + trace_put_tcd(tcd); + return 1; + } - out: - if ((mask & (D_EMERG | D_ERROR | D_WARNING | D_CONSOLE)) || portal_printk) - print_to_console(&header, mask, debug_buf, needed, file, fn); + if (cdls != NULL) { + cfs_time_t t = cdls->cdls_next + + cfs_time_seconds(CDEBUG_MAX_LIMIT + 10); + cfs_duration_t dmax = cfs_time_seconds(CDEBUG_MAX_LIMIT); + + if (libcfs_console_ratelimit && + cdls->cdls_next != 0 && /* not first time ever */ + !cfs_time_after(cfs_time_current(), cdls->cdls_next)) { + /* skipping a console message */ + cdls->cdls_count++; + if (tcd != NULL) + trace_put_tcd(tcd); + return 1; + } + + if (cfs_time_after(cfs_time_current(), t)) { + /* last timeout was a long time ago */ + cdls->cdls_delay /= 8; + } else { + cdls->cdls_delay *= 2; + + if (cdls->cdls_delay < CFS_TICK) + cdls->cdls_delay = CFS_TICK; + else if (cdls->cdls_delay > dmax) + cdls->cdls_delay = dmax; + } + + /* ensure cdls_next is never zero after it's been seen */ + cdls->cdls_next = (cfs_time_current() + cdls->cdls_delay) | 1; + } + + if (tcd != NULL) { + print_to_console(&header, mask, string_buf, needed, file, fn); + trace_put_tcd(tcd); + } else { + string_buf = trace_get_console_buffer(); + + needed = 0; + if (format1 != NULL) { + va_copy(ap, args); + needed = vsnprintf(string_buf, TRACE_CONSOLE_BUFFER_SIZE, format1, ap); + va_end(ap); + } + if (format2 != NULL) { + remain = TRACE_CONSOLE_BUFFER_SIZE - needed; + if (remain > 0) { + va_start(ap, format2); + needed += vsnprintf(string_buf+needed, remain, format2, ap); + va_end(ap); + } + } + print_to_console(&header, mask, + string_buf, needed, file, fn); + + trace_put_console_buffer(string_buf); + } + + if (cdls != NULL && cdls->cdls_count != 0) { + string_buf = trace_get_console_buffer(); + + needed = snprintf(string_buf, TRACE_CONSOLE_BUFFER_SIZE, + "Skipped %d previous similar message%s\n", + cdls->cdls_count, (cdls->cdls_count > 1) ? 
"s" : ""); + + print_to_console(&header, mask, + string_buf, needed, file, fn); - trace_put_tcd(tcd, flags); + trace_put_console_buffer(string_buf); + cdls->cdls_count = 0; + } + + return 0; +} +EXPORT_SYMBOL(libcfs_debug_vmsg2); + +void +libcfs_assertion_failed(const char *expr, const char *file, + const char *func, const int line) +{ + libcfs_debug_msg(NULL, 0, D_EMERG, file, func, line, + "ASSERTION(%s) failed\n", expr); + LBUG(); +} +EXPORT_SYMBOL(libcfs_assertion_failed); + +void +trace_assertion_failed(const char *str, + const char *fn, const char *file, int line) +{ + struct ptldebug_header hdr; + + libcfs_panic_in_progress = 1; + libcfs_catastrophe = 1; + mb(); + + set_ptldebug_header(&hdr, DEBUG_SUBSYSTEM, D_EMERG, line, + CDEBUG_STACK()); + + print_to_console(&hdr, D_EMERG, str, strlen(str), file, fn); + + LIBCFS_PANIC("Lustre debug assertion failure\n"); + + /* not reached */ +} + +static void +panic_collect_pages(struct page_collection *pc) +{ + /* Do the collect_pages job on a single CPU: assumes that all other + * CPUs have been stopped during a panic. If this isn't true for some + * arch, this will have to be implemented separately in each arch. */ + int i; + struct trace_cpu_data *tcd; + + CFS_INIT_LIST_HEAD(&pc->pc_pages); + + for (i = 0; i < NR_CPUS; i++) { + tcd = &trace_data[i].tcd; + + list_splice(&tcd->tcd_pages, &pc->pc_pages); + CFS_INIT_LIST_HEAD(&tcd->tcd_pages); + tcd->tcd_cur_pages = 0; + + if (pc->pc_want_daemon_pages) { + list_splice(&tcd->tcd_daemon_pages, &pc->pc_pages); + CFS_INIT_LIST_HEAD(&tcd->tcd_daemon_pages); + tcd->tcd_cur_daemon_pages = 0; + } + } } -EXPORT_SYMBOL(portals_debug_msg); static void collect_pages_on_cpu(void *info) { struct trace_cpu_data *tcd; - unsigned long flags; struct page_collection *pc = info; - tcd = trace_get_tcd(flags); + tcd = trace_get_tcd(); + __LASSERT (tcd != NULL); spin_lock(&pc->pc_lock); list_splice(&tcd->tcd_pages, &pc->pc_pages); @@ -262,15 +499,17 @@ static void collect_pages_on_cpu(void *info) } spin_unlock(&pc->pc_lock); - trace_put_tcd(tcd, flags); + trace_put_tcd(tcd); } static void collect_pages(struct page_collection *pc) { - /* needs to be fixed up for preempt */ CFS_INIT_LIST_HEAD(&pc->pc_pages); - collect_pages_on_cpu(pc); - smp_call_function(collect_pages_on_cpu, pc, 0, 1); + + if (libcfs_panic_in_progress) + panic_collect_pages(pc); + else + trace_call_on_all_cpus(collect_pages_on_cpu, pc); } static void put_pages_back_on_cpu(void *info) @@ -278,18 +517,18 @@ static void put_pages_back_on_cpu(void *info) struct page_collection *pc = info; struct trace_cpu_data *tcd; struct list_head *cur_head; - unsigned long flags; struct trace_page *tage; struct trace_page *tmp; - tcd = trace_get_tcd(flags); + tcd = trace_get_tcd(); + __LASSERT (tcd != NULL); cur_head = tcd->tcd_pages.next; spin_lock(&pc->pc_lock); list_for_each_entry_safe(tage, tmp, &pc->pc_pages, linkage) { - LASSERT_TAGE_INVARIANT(tage); + __LASSERT_TAGE_INVARIANT(tage); if (tage->cpu != smp_processor_id()) continue; @@ -299,34 +538,29 @@ static void put_pages_back_on_cpu(void *info) } spin_unlock(&pc->pc_lock); - trace_put_tcd(tcd, flags); + trace_put_tcd(tcd); } static void put_pages_back(struct page_collection *pc) { - /* needs to be fixed up for preempt */ - put_pages_back_on_cpu(pc); - smp_call_function(put_pages_back_on_cpu, pc, 0, 1); + if (!libcfs_panic_in_progress) + trace_call_on_all_cpus(put_pages_back_on_cpu, pc); } /* Add pages to a per-cpu debug daemon ringbuffer. 
This buffer makes sure that * we have a good amount of data at all times for dumping during an LBUG, even * if we have been steadily writing (and otherwise discarding) pages via the * debug daemon. */ -static void put_pages_on_daemon_list_on_cpu(void *info) +static void put_pages_on_tcd_daemon_list(struct page_collection *pc, + struct trace_cpu_data *tcd) { - struct page_collection *pc = info; - struct trace_cpu_data *tcd; struct trace_page *tage; struct trace_page *tmp; - unsigned long flags; - - tcd = trace_get_tcd(flags); spin_lock(&pc->pc_lock); list_for_each_entry_safe(tage, tmp, &pc->pc_pages, linkage) { - LASSERT_TAGE_INVARIANT(tage); + __LASSERT_TAGE_INVARIANT(tage); if (tage->cpu != smp_processor_id()) continue; @@ -337,10 +571,10 @@ static void put_pages_on_daemon_list_on_cpu(void *info) if (tcd->tcd_cur_daemon_pages > tcd->tcd_max_pages) { struct trace_page *victim; - LASSERT(!list_empty(&tcd->tcd_daemon_pages)); + __LASSERT(!list_empty(&tcd->tcd_daemon_pages)); victim = tage_from_list(tcd->tcd_daemon_pages.next); - LASSERT_TAGE_INVARIANT(victim); + __LASSERT_TAGE_INVARIANT(victim); list_del(&victim->linkage); tage_free(victim); @@ -348,14 +582,23 @@ static void put_pages_on_daemon_list_on_cpu(void *info) } } spin_unlock(&pc->pc_lock); +} - trace_put_tcd(tcd, flags); +static void put_pages_on_daemon_list_on_cpu(void *info) +{ + struct trace_cpu_data *tcd; + + tcd = trace_get_tcd(); + __LASSERT (tcd != NULL); + + put_pages_on_tcd_daemon_list(info, tcd); + + trace_put_tcd(tcd); } static void put_pages_on_daemon_list(struct page_collection *pc) { - put_pages_on_daemon_list_on_cpu(pc); - smp_call_function(put_pages_on_daemon_list_on_cpu, pc, 0, 1); + trace_call_on_all_cpus(put_pages_on_daemon_list_on_cpu, pc); } void trace_debug_print(void) @@ -372,11 +615,11 @@ void trace_debug_print(void) char *p, *file, *fn; cfs_page_t *page; - LASSERT_TAGE_INVARIANT(tage); + __LASSERT_TAGE_INVARIANT(tage); page = tage->page; p = cfs_page_address(page); - while (p < ((char *)cfs_page_address(page) + CFS_PAGE_SIZE)) { + while (p < ((char *)cfs_page_address(page) + tage->used)) { struct ptldebug_header *hdr; int len; hdr = (void *)p; @@ -388,6 +631,8 @@ void trace_debug_print(void) len = hdr->ph_len - (p - (char *)hdr); print_to_console(hdr, D_EMERG, p, len, file, fn); + + p += len; } list_del(&tage->linkage); @@ -401,13 +646,14 @@ int tracefile_dump_all_pages(char *filename) cfs_file_t *filp; struct trace_page *tage; struct trace_page *tmp; - CFS_DECL_MMSPACE; int rc; - down_write(&tracefile_sem); + CFS_DECL_MMSPACE; + + tracefile_write_lock(); filp = cfs_filp_open(filename, - O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0666, &rc); + O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600, &rc); if (!filp) { printk(KERN_ERR "LustreError: can't open %s for dump: rc %d\n", filename, rc); @@ -427,14 +673,15 @@ int tracefile_dump_all_pages(char *filename) CFS_MMSPACE_OPEN; list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { - LASSERT_TAGE_INVARIANT(tage); + __LASSERT_TAGE_INVARIANT(tage); rc = cfs_filp_write(filp, cfs_page_address(tage->page), tage->used, cfs_filp_poff(filp)); - if (rc != tage->used) { + if (rc != (int)tage->used) { printk(KERN_WARNING "wanted to write %u but wrote " "%d\n", tage->used, rc); put_pages_back(&pc); + __LASSERT(list_empty(&pc.pc_pages)); break; } list_del(&tage->linkage); @@ -447,7 +694,7 @@ int tracefile_dump_all_pages(char *filename) close: cfs_filp_close(filp); out: - up_write(&tracefile_sem); + tracefile_write_unlock(); return rc; } @@ -463,7 +710,7 @@ void 
trace_flush_pages(void) collect_pages(&pc); list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { - LASSERT_TAGE_INVARIANT(tage); + __LASSERT_TAGE_INVARIANT(tage); list_del(&tage->linkage); tage_free(tage); @@ -481,15 +728,17 @@ int trace_dk(struct file *file, const char *buffer, unsigned long count, if (name == NULL) return -ENOMEM; - if (copy_from_user(name, buffer, count)) { + if (copy_from_user((void *)name, (void *)buffer, count)) { rc = -EFAULT; goto out; } +#if !defined(__WINNT__) if (name[0] != '/') { rc = -EINVAL; goto out; } +#endif /* be nice and strip out trailing '\n' */ for (off = count ; off > 2 && isspace(name[off - 1]); off--) @@ -512,13 +761,13 @@ static int tracefiled(void *arg) struct trace_page *tmp; struct ptldebug_header *hdr; cfs_file_t *filp; - CFS_DECL_MMSPACE; int rc; + CFS_DECL_MMSPACE; + /* we're started late enough that we pick up init's fs context */ /* this is so broken in uml? what on earth is going on? */ - kportal_daemonize("ktracefiled"); - reparent_to_init(); + cfs_daemonize("ktracefiled"); spin_lock_init(&pc.pc_lock); complete(&tctl->tctl_start); @@ -529,7 +778,8 @@ static int tracefiled(void *arg) cfs_waitlink_init(&__wait); cfs_waitq_add(&tctl->tctl_waitq, &__wait); set_current_state(TASK_INTERRUPTIBLE); - cfs_waitq_timedwait(&__wait, cfs_time_seconds(1)); + cfs_waitq_timedwait(&__wait, CFS_TASK_INTERRUPTIBLE, + cfs_time_seconds(1)); cfs_waitq_del(&tctl->tctl_waitq, &__wait); if (atomic_read(&tctl->tctl_shutdown)) @@ -541,16 +791,18 @@ static int tracefiled(void *arg) continue; filp = NULL; - down_read(&tracefile_sem); + tracefile_read_lock(); if (tracefile != NULL) { - filp = cfs_filp_open(tracefile, O_CREAT|O_RDWR|O_LARGEFILE, - 0600, &rc); + filp = cfs_filp_open(tracefile, + O_CREAT | O_RDWR | O_LARGEFILE, + 0600, &rc); if (!(filp)) printk("couldn't open %s: %d\n", tracefile, rc); } - up_read(&tracefile_sem); + tracefile_read_unlock(); if (filp == NULL) { put_pages_on_daemon_list(&pc); + __LASSERT(list_empty(&pc.pc_pages)); continue; } @@ -558,7 +810,7 @@ static int tracefiled(void *arg) /* mark the first header, so we can sort in chunks */ tage = tage_from_list(pc.pc_pages.next); - LASSERT_TAGE_INVARIANT(tage); + __LASSERT_TAGE_INVARIANT(tage); hdr = cfs_page_address(tage->page); hdr->ph_flags |= PH_FLAG_FIRST_RECORD; @@ -566,25 +818,27 @@ static int tracefiled(void *arg) list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { static loff_t f_pos; - LASSERT_TAGE_INVARIANT(tage); + __LASSERT_TAGE_INVARIANT(tage); - if (f_pos >= tracefile_size) + if (f_pos >= (off_t)tracefile_size) f_pos = 0; else if (f_pos > cfs_filp_size(filp)) f_pos = cfs_filp_size(filp); rc = cfs_filp_write(filp, cfs_page_address(tage->page), tage->used, &f_pos); - if (rc != tage->used) { + if (rc != (int)tage->used) { printk(KERN_WARNING "wanted to write %u but " "wrote %d\n", tage->used, rc); put_pages_back(&pc); + __LASSERT(list_empty(&pc.pc_pages)); } } CFS_MMSPACE_CLOSE; cfs_filp_close(filp); put_pages_on_daemon_list(&pc); + __LASSERT(list_empty(&pc.pc_pages)); } complete(&tctl->tctl_stop); return 0; @@ -633,17 +887,26 @@ void trace_stop_thread(void) int tracefile_init(void) { struct trace_cpu_data *tcd; - int i; + int i; + int rc; + + rc = tracefile_init_arch(); + if (rc != 0) + return rc; for (i = 0; i < NR_CPUS; i++) { tcd = &trace_data[i].tcd; CFS_INIT_LIST_HEAD(&tcd->tcd_pages); + CFS_INIT_LIST_HEAD(&tcd->tcd_stock_pages); CFS_INIT_LIST_HEAD(&tcd->tcd_daemon_pages); tcd->tcd_cur_pages = 0; + tcd->tcd_cur_stock_pages = 0; tcd->tcd_cur_daemon_pages = 0; 
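The tracefiled() write loop in the preceding hunk treats the output file as a ring: the static f_pos wraps to zero once it reaches tracefile_size, and is otherwise clamped to the current file size so records stay contiguous. A standalone POSIX sketch of just the wrap rule (MAX_SIZE, ring.log and the fixed-size record are invented for illustration):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    #define MAX_SIZE 4096          /* stands in for tracefile_size */

    int main(void)
    {
        int fd = open("ring.log", O_CREAT | O_RDWR, 0600);
        if (fd < 0) { perror("open"); return 1; }

        off_t pos = 0;
        char rec[512];
        memset(rec, 'x', sizeof(rec));

        for (int i = 0; i < 20; i++) {
            if (pos >= MAX_SIZE)   /* wrap: overwrite the oldest records */
                pos = 0;
            if (pwrite(fd, rec, sizeof(rec), pos) != (ssize_t)sizeof(rec)) {
                perror("pwrite");  /* cf. the "wanted to write" warning */
                break;
            }
            pos += sizeof(rec);
        }
        close(fd);
        return 0;
    }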
tcd->tcd_max_pages = TCD_MAX_PAGES; tcd->tcd_shutting_down = 0; + tcd->tcd_cpu = i; } + return 0; } @@ -652,21 +915,21 @@ static void trace_cleanup_on_cpu(void *info) struct trace_cpu_data *tcd; struct trace_page *tage; struct trace_page *tmp; - unsigned long flags; - tcd = trace_get_tcd(flags); + tcd = trace_get_tcd(); + __LASSERT (tcd != NULL); tcd->tcd_shutting_down = 1; list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) { - LASSERT_TAGE_INVARIANT(tage); + __LASSERT_TAGE_INVARIANT(tage); list_del(&tage->linkage); tage_free(tage); } tcd->tcd_cur_pages = 0; - trace_put_tcd(tcd, flags); + trace_put_tcd(tcd); } static void trace_cleanup(void) @@ -676,8 +939,9 @@ static void trace_cleanup(void) CFS_INIT_LIST_HEAD(&pc.pc_pages); spin_lock_init(&pc.pc_lock); - trace_cleanup_on_cpu(&pc); - smp_call_function(trace_cleanup_on_cpu, &pc, 0, 1); + trace_call_on_all_cpus(trace_cleanup_on_cpu, &pc); + + tracefile_fini_arch(); } void tracefile_exit(void) diff --git a/lnet/libcfs/tracefile.h b/lnet/libcfs/tracefile.h index 4e7fdde..f3568e9 100644 --- a/lnet/libcfs/tracefile.h +++ b/lnet/libcfs/tracefile.h @@ -3,6 +3,16 @@ #include +/* trace file lock routines */ + +int tracefile_init_arch(void); +void tracefile_fini_arch(void); + +void tracefile_read_lock(void); +void tracefile_read_unlock(void); +void tracefile_write_lock(void); +void tracefile_write_unlock(void); + int tracefile_dump_all_pages(char *filename); void trace_debug_print(void); void trace_flush_pages(void); @@ -21,38 +31,112 @@ int trace_read_debug_mb(char *page, char **start, off_t off, int count, int trace_dk(struct file *file, const char *buffer, unsigned long count, void *data); +extern void libcfs_debug_dumplog_internal(void *arg); +extern void libcfs_register_panic_notifier(void); +extern void libcfs_unregister_panic_notifier(void); +extern int libcfs_panic_in_progress; + #ifdef LUSTRE_TRACEFILE_PRIVATE /* * Private declare for tracefile */ -#define TCD_MAX_PAGES (5 << (20 - PAGE_SHIFT)) +#define TCD_MAX_PAGES (5 << (20 - CFS_PAGE_SHIFT)) +#define TCD_STOCK_PAGES (TCD_MAX_PAGES) #define TRACEFILE_SIZE (500 << 20) +/* Size of a buffer for sprinting console messages to in IRQ context (no + * logging in IRQ context) */ +#define TRACE_CONSOLE_BUFFER_SIZE 1024 + union trace_data_union { struct trace_cpu_data { + /* + * pages with trace records not yet processed by tracefiled. + */ struct list_head tcd_pages; + /* number of pages on ->tcd_pages */ unsigned long tcd_cur_pages; + /* + * pages with trace records already processed by + * tracefiled. These pages are kept in memory, so that some + * portion of log can be written in the event of LBUG. This + * list is maintained in LRU order. + * + * Pages are moved to ->tcd_daemon_pages by tracefiled() + * (put_pages_on_daemon_list()). LRU pages from this list are + * discarded when list grows too large. + */ struct list_head tcd_daemon_pages; + /* number of pages on ->tcd_daemon_pages */ unsigned long tcd_cur_daemon_pages; + /* + * Maximal number of pages allowed on ->tcd_pages and + * ->tcd_daemon_pages each. Always TCD_MAX_PAGES in current + * implementation. + */ unsigned long tcd_max_pages; + + /* + * preallocated pages to write trace records into. Pages from + * ->tcd_stock_pages are moved to ->tcd_pages by + * portals_debug_msg(). + * + * This list is necessary, because on some platforms it's + * impossible to perform efficient atomic page allocation in a + * non-blockable context. 
+ * + * Such platforms fill ->tcd_stock_pages "on occasion", when + * tracing code is entered in blockable context. + * + * trace_get_tage_try() tries to get a page from + * ->tcd_stock_pages first and resorts to atomic page + * allocation only if this queue is empty. ->tcd_stock_pages + * is replenished when tracing code is entered in blocking + * context (darwin-tracefile.c:trace_get_tcd()). We try to + * maintain TCD_STOCK_PAGES (40 by default) pages in this + * queue. Atomic allocation is only required if more than + * TCD_STOCK_PAGES pagesful are consumed by trace records all + * emitted in non-blocking contexts. Which is quite unlikely. + */ + struct list_head tcd_stock_pages; + /* number of pages on ->tcd_stock_pages */ + unsigned long tcd_cur_stock_pages; + int tcd_shutting_down; + int tcd_cpu; } tcd; char __pad[SMP_CACHE_BYTES]; }; +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ struct page_collection { struct list_head pc_pages; + /* + * spin-lock protecting ->pc_pages. It is taken by smp_call_function() + * call-back functions. XXX nikita: Which is horrible: all processors + * receive NMI at the same time only to be serialized by this + * lock. Probably ->pc_pages should be replaced with an array of + * NR_CPUS elements accessed locklessly. + */ spinlock_t pc_lock; + /* + * if this flag is set, collect_pages() will spill both + * ->tcd_daemon_pages and ->tcd_pages to the ->pc_pages. Otherwise, + * only ->tcd_pages are spilled. + */ int pc_want_daemon_pages; }; +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ struct tracefiled_ctl { struct completion tctl_start; struct completion tctl_stop; - cfs_waitq_t tctl_waitq; + cfs_waitq_t tctl_waitq; pid_t tctl_pid; atomic_t tctl_shutdown; }; @@ -60,6 +144,8 @@ struct tracefiled_ctl { /* * small data-structure for each page owned by tracefiled. 
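The tcd_stock_pages comment above describes a common pattern: pre-allocate while in a blockable context, consume in a non-blockable one, and fall back to atomic allocation only when the stock runs dry. A minimal userspace model of that pattern (refill_stock and get_page_nonblocking are invented names; malloc stands in for both the blocking and the atomic allocator):

    #include <stdlib.h>

    struct page_model { struct page_model *next; char buf[4096]; };

    static struct page_model *stock;   /* stands in for tcd_stock_pages */
    static int stock_count;

    /* Called only from a context that may block: top the stock back up
     * (cf. trace_refill_stock()). */
    static void refill_stock(int target)
    {
        while (stock_count < target) {
            struct page_model *p = malloc(sizeof(*p));
            if (p == NULL)
                break;
            p->next = stock;
            stock = p;
            stock_count++;
        }
    }

    /* Allocation path that must not block: prefer the stock, and only
     * fall back to the (possibly failing) allocator when it is empty
     * (cf. trace_get_tage_try()). */
    static struct page_model *get_page_nonblocking(void)
    {
        if (stock != NULL) {
            struct page_model *p = stock;
            stock = p->next;
            stock_count--;
            return p;
        }
        return malloc(sizeof(struct page_model)); /* "atomic" fallback */
    }

    int main(void)
    {
        refill_stock(8);
        return get_page_nonblocking() == NULL;
    }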
*/ +/* XXX nikita: this declaration is internal to tracefile.c and should probably + * be moved there */ struct trace_page { /* * page itself @@ -83,14 +169,42 @@ struct trace_page { extern void set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask, const int line, unsigned long stack); -extern void print_to_console(struct ptldebug_header *hdr, int mask, - char *buf, int len, char *file, const char *fn); -extern struct trace_cpu_data * __trace_get_tcd (unsigned long *flags); -extern void __trace_put_tcd (struct trace_cpu_data *tcd, unsigned long flags); +extern void print_to_console(struct ptldebug_header *hdr, int mask, const char *buf, + int len, const char *file, const char *fn); + +extern struct trace_cpu_data *trace_get_tcd(void); +extern void trace_put_tcd(struct trace_cpu_data *tcd); +extern char *trace_get_console_buffer(void); +extern void trace_put_console_buffer(char *buffer); + +extern void trace_call_on_all_cpus(void (*fn)(void *arg), void *arg); + +int trace_refill_stock(struct trace_cpu_data *tcd, int gfp, + struct list_head *stock); + + +int tcd_owns_tage(struct trace_cpu_data *tcd, struct trace_page *tage); + +extern void trace_assertion_failed(const char *str, const char *fn, + const char *file, int line); + +/* ASSERTION that is safe to use within the debug system */ +#define __LASSERT(cond) \ +({ \ + if (unlikely(!(cond))) { \ + trace_assertion_failed("ASSERTION("#cond") failed", \ + __FUNCTION__, __FILE__, __LINE__); \ + } \ +}) -#define trace_get_tcd(f) __trace_get_tcd(&(f)) -#define trace_put_tcd(t, f) __trace_put_tcd(t, f) +#define __LASSERT_TAGE_INVARIANT(tage) \ +({ \ + __LASSERT(tage != NULL); \ + __LASSERT(tage->page != NULL); \ + __LASSERT(tage->used <= CFS_PAGE_SIZE); \ + __LASSERT(cfs_page_count(tage->page) > 0); \ +}) #endif /* LUSTRE_TRACEFILE_PRIVATE */ -#endif /* __PORTALS_TRACEFILE_H */ +#endif /* __LIBCFS_TRACEFILE_H__ */ diff --git a/lnet/libcfs/user-lock.c b/lnet/libcfs/user-lock.c index 99dcd7f..a1a6779 100644 --- a/lnet/libcfs/user-lock.c +++ b/lnet/libcfs/user-lock.c @@ -27,14 +27,21 @@ /* * liblustre is single-threaded, so most "synchronization" APIs are trivial. + * + * XXX Liang: several branches share lnet with b_hd_newconfig; + * if we define lock APIs here, they will conflict with liblustre + * on those branches. */ #ifndef __KERNEL__ +#include +#include /* * Optional debugging (magic stamping and checking ownership) can be added. */ +#if 0 /* * spin_lock * @@ -89,19 +96,6 @@ void spin_unlock_bh(spinlock_t *lock) (void)lock; } -void spin_lock_irqsave(spinlock_t *lock, unsigned long flags) -{ - LASSERT(lock != NULL); - (void)lock; -} - -void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) -{ - LASSERT(lock != NULL); - (void)lock; -} - - /* * Semaphore * @@ -227,6 +221,7 @@ void up_write(struct rw_semaphore *s) LASSERT(s != NULL); (void)s; } +#endif /* !__KERNEL__ */ #endif diff --git a/lnet/libcfs/user-prim.c b/lnet/libcfs/user-prim.c index ddc994c..8d968a0 100644 --- a/lnet/libcfs/user-prim.c +++ b/lnet/libcfs/user-prim.c @@ -34,17 +34,24 @@ #include #ifndef __CYGWIN__ #include +#ifdef HAVE_ASM_PAGE_H #include +#endif +#ifdef HAVE_SYS_USER_H +#include +#endif #else #include #endif #include #include +#include #include #include #include #include +#include /* * Sleep channel. No-op implementation.
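The __LASSERT() macro defined in tracefile.h above exists because an assertion inside the tracing machinery must not recurse back into it: it reports through trace_assertion_failed(), which prints straight to the console. A userspace analogue of the same layering (all names invented for the sketch; abort() stands in for LIBCFS_PANIC()):

    #include <stdio.h>
    #include <stdlib.h>

    /* Imagine this is the normal logging path; asserting from inside it
     * with a logger-based assert would recurse if the logger itself is
     * what failed. */
    static void logger(const char *msg)
    {
        fprintf(stdout, "LOG: %s\n", msg);
    }

    /* Safe assert: reports directly to the console, never through
     * logger(), mirroring __LASSERT() -> trace_assertion_failed() ->
     * print_to_console(). */
    #define SAFE_ASSERT(cond)                                           \
        do {                                                            \
            if (!(cond)) {                                              \
                fprintf(stderr, "ASSERTION(%s) failed at %s:%d\n",      \
                        #cond, __FILE__, __LINE__);                     \
                abort();                                                \
            }                                                           \
        } while (0)

    int main(void)
    {
        logger("hello");
        SAFE_ASSERT(1 + 1 == 2);
        return 0;
    }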
@@ -98,6 +105,7 @@ int cfs_waitq_active(struct cfs_waitq *waitq) { LASSERT(waitq != NULL); (void)waitq; + return 0; } void cfs_waitq_signal(struct cfs_waitq *waitq) @@ -112,7 +120,7 @@ void cfs_waitq_signal_nr(struct cfs_waitq *waitq, int nr) (void)waitq; } -void cfs_waitq_broadcast(struct cfs_waitq *waitq) +void cfs_waitq_broadcast(struct cfs_waitq *waitq, int state) { LASSERT(waitq != NULL); (void)waitq; @@ -124,27 +132,24 @@ void cfs_waitq_wait(struct cfs_waitlink *link) (void)link; } -int64_t cfs_waitq_timedwait(struct cfs_waitlink *link, int64_t timeout) +int64_t cfs_waitq_timedwait(struct cfs_waitlink *link, int state, int64_t timeout) { LASSERT(link != NULL); (void)link; + return 0; } /* * Allocator */ -cfs_page_t *cfs_alloc_pages(unsigned int flags, unsigned int order) +cfs_page_t *cfs_alloc_page(unsigned int flags) { cfs_page_t *pg = malloc(sizeof(*pg)); if (!pg) return NULL; -#if 0 //#ifdef MAP_ANONYMOUS - pg->addr = mmap(0, PAGE_SIZE << order, PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); -#else - pg->addr = malloc(PAGE_SIZE << order); -#endif + pg->addr = malloc(CFS_PAGE_SIZE); if (!pg->addr) { free(pg); @@ -153,26 +158,12 @@ cfs_page_t *cfs_alloc_pages(unsigned int flags, unsigned int order) return pg; } -void cfs_free_pages(struct page *pg, int what) +void cfs_free_page(cfs_page_t *pg) { -#if 0 //#ifdef MAP_ANONYMOUS - munmap(pg->addr, PAGE_SIZE); -#else free(pg->addr); -#endif free(pg); } -cfs_page_t *cfs_alloc_page(unsigned int flags) -{ - return cfs_alloc_pages(flags, 0); -} - -void cfs_free_page(cfs_page_t *pg, int what) -{ - cfs_free_page(pg, what); -} - void *cfs_page_address(cfs_page_t *pg) { return pg->addr; @@ -188,40 +179,11 @@ void cfs_kunmap(cfs_page_t *pg) } /* - * Memory allocator - */ -void *cfs_alloc(size_t nr_bytes, u_int32_t flags) -{ - void *result; - - result = malloc(nr_bytes); - if (result != NULL && (flags & CFS_ALLOC_ZERO)) - memset(result, 0, nr_bytes); -} - -void cfs_free(void *addr) -{ - free(addr); -} - -void *cfs_alloc_large(size_t nr_bytes) -{ - return cfs_alloc(nr_bytes, 0); -} - -void cfs_free_large(void *addr) -{ - return cfs_free(addr); -} - -/* * SLAB allocator */ cfs_mem_cache_t * -cfs_mem_cache_create(const char *, size_t, size_t, unsigned long, - void (*)(void *, cfs_mem_cache_t *, unsigned long), - void (*)(void *, cfs_mem_cache_t *, unsigned long)) +cfs_mem_cache_create(const char *name, size_t objsize, size_t off, unsigned long flags) { cfs_mem_cache_t *c; @@ -243,7 +205,7 @@ int cfs_mem_cache_destroy(cfs_mem_cache_t *c) void *cfs_mem_cache_alloc(cfs_mem_cache_t *c, int gfp) { - return cfs_alloc(c, gfp); + return cfs_alloc(c->size, gfp); } void cfs_mem_cache_free(cfs_mem_cache_t *c, void *addr) @@ -251,6 +213,138 @@ void cfs_mem_cache_free(cfs_mem_cache_t *c, void *addr) cfs_free(addr); } +/* + * This uses user-visible declarations from + */ +#ifdef __LINUX__ +#include +#endif + +#ifndef MKDEV + +#define MAJOR(dev) ((dev)>>8) +#define MINOR(dev) ((dev) & 0xff) +#define MKDEV(ma,mi) ((ma)<<8 | (mi)) + +#endif + +cfs_rdev_t cfs_rdev_build(cfs_major_nr_t major, cfs_minor_nr_t minor) +{ + return MKDEV(major, minor); +} + +cfs_major_nr_t cfs_rdev_major(cfs_rdev_t rdev) +{ + return MAJOR(rdev); +} + +cfs_minor_nr_t cfs_rdev_minor(cfs_rdev_t rdev) +{ + return MINOR(rdev); +} + +void cfs_enter_debugger(void) +{ + /* + * nothing for now. 
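The user-prim.c hunk above collapses the order-based cfs_alloc_pages()/cfs_free_pages() pair into a single-page API backed by plain malloc(). A self-contained model of that wrapper shape (page_model_t and PAGE_SIZE_MODEL are stand-ins for cfs_page_t and CFS_PAGE_SIZE):

    #include <stdlib.h>

    #define PAGE_SIZE_MODEL 4096

    typedef struct { void *addr; } page_model_t;   /* cf. cfs_page_t */

    static page_model_t *alloc_page_model(void)
    {
        page_model_t *pg = malloc(sizeof(*pg));
        if (pg == NULL)
            return NULL;
        pg->addr = malloc(PAGE_SIZE_MODEL);
        if (pg->addr == NULL) {
            free(pg);                  /* don't leak the wrapper */
            return NULL;
        }
        return pg;
    }

    static void free_page_model(page_model_t *pg)
    {
        free(pg->addr);
        free(pg);
    }

    int main(void)
    {
        page_model_t *pg = alloc_page_model();
        if (pg != NULL)
            free_page_model(pg);
        return 0;
    }

Note also that the same hunk quietly fixes a real bug in the old SLAB emulation: cfs_mem_cache_alloc() used to pass the cache pointer itself to cfs_alloc() where it now passes c->size.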
*/ +} + +void cfs_daemonize(char *str) +{ + return; +} + +cfs_sigset_t cfs_block_allsigs(void) +{ + cfs_sigset_t all; + cfs_sigset_t old; + int rc; + + sigfillset(&all); + rc = sigprocmask(SIG_SETMASK, &all, &old); + LASSERT(rc == 0); + + return old; +} + +cfs_sigset_t cfs_block_sigs(cfs_sigset_t blocks) +{ + cfs_sigset_t old; + int rc; + + rc = sigprocmask(SIG_SETMASK, &blocks, &old); + LASSERT (rc == 0); + + return old; +} + +void cfs_restore_sigs(cfs_sigset_t old) +{ + int rc = sigprocmask(SIG_SETMASK, &old, NULL); + + LASSERT (rc == 0); +} + +int cfs_signal_pending(void) +{ + cfs_sigset_t empty; + cfs_sigset_t set; + int rc; + + rc = sigpending(&set); + LASSERT (rc == 0); + + sigemptyset(&empty); + + return memcmp(&empty, &set, sizeof(set)) != 0; /* non-empty set => a signal is pending */ +} + +void cfs_clear_sigpending(void) +{ + return; +} + +#ifdef __LINUX__ + +/* + * glibc (not Linux itself, so the check above is not quite right) implements + * stack back-tracing through the backtrace() function. + */ +#include + +void cfs_stack_trace_fill(struct cfs_stack_trace *trace) +{ + backtrace(trace->frame, sizeof_array(trace->frame)); +} + +void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no) +{ + if (0 <= frame_no && frame_no < sizeof_array(trace->frame)) + return trace->frame[frame_no]; + else + return NULL; +} + +#else + +void cfs_stack_trace_fill(struct cfs_stack_trace *trace) +{} +void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no) +{ + return NULL; +} + +/* __LINUX__ */ +#endif + +void lbug_with_loc(char *file, const char *func, const int line) +{ + /* No libcfs_catastrophe in userspace! */ + libcfs_debug_msg(NULL, 0, D_EMERG, file, func, line, "LBUG\n"); + abort(); +} + /* !__KERNEL__ */ #endif diff --git a/lnet/libcfs/watchdog.c b/lnet/libcfs/watchdog.c index c9be01a..3000e8f 100644 --- a/lnet/libcfs/watchdog.c +++ b/lnet/libcfs/watchdog.c @@ -20,27 +20,23 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET #include #include -#include - - +#include "tracefile.h" struct lc_watchdog { - struct timer_list lcw_timer; /* kernel timer */ + cfs_timer_t lcw_timer; /* kernel timer */ struct list_head lcw_list; struct timeval lcw_last_touched; - struct task_struct *lcw_task; + cfs_task_t *lcw_task; - void (*lcw_callback)(struct lc_watchdog *, - struct task_struct *, - void *data); - void *lcw_data; + void (*lcw_callback)(pid_t, void *); + void *lcw_data; - int lcw_pid; - int lcw_time; /* time until watchdog fires, in ms */ + pid_t lcw_pid; + int lcw_time; /* time until watchdog fires, in ms */ enum { LC_WATCHDOG_DISABLED, @@ -49,6 +45,7 @@ struct lc_watchdog { } lcw_state; }; +#ifdef WITH_WATCHDOG /* * The dispatcher will complete lcw_start_completion when it starts, * and lcw_stop_completion when it exits. @@ -78,36 +75,44 @@ static DECLARE_MUTEX(lcw_refcount_sem); * List of timers that have fired that need their callbacks run by the * dispatcher. */ -static spinlock_t lcw_pending_timers_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t lcw_pending_timers_lock = SPIN_LOCK_UNLOCKED; /* BH lock!
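The userspace signal helpers above are thin wrappers over sigprocmask(): block everything, do the critical work, then restore the exact previous mask rather than unblocking blindly. A usage sketch of that pairing in plain POSIX (do_critical_work is a placeholder):

    #include <signal.h>
    #include <stdio.h>

    static void do_critical_work(void)
    {
        puts("working with signals blocked");
    }

    int main(void)
    {
        sigset_t all, old;

        sigfillset(&all);
        if (sigprocmask(SIG_SETMASK, &all, &old) != 0) /* cfs_block_allsigs() */
            return 1;

        do_critical_work();   /* no async signal delivery in here */

        sigprocmask(SIG_SETMASK, &old, NULL);          /* cfs_restore_sigs() */
        return 0;
    }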
*/ static struct list_head lcw_pending_timers = \ LIST_HEAD_INIT(lcw_pending_timers); -static struct task_struct *lcw_lookup_task(struct lc_watchdog *lcw) +#ifdef HAVE_TASKLIST_LOCK +static void +lcw_dump(struct lc_watchdog *lcw) { - struct task_struct *tsk; - unsigned long flags; + cfs_task_t *tsk; ENTRY; - read_lock_irqsave(&tasklist_lock, flags); + read_lock(&tasklist_lock); tsk = find_task_by_pid(lcw->lcw_pid); - read_unlock_irqrestore(&tasklist_lock, flags); - if (!tsk) { + + if (tsk == NULL) { CWARN("Process %d was not found in the task list; " - "watchdog callback may be incomplete\n", lcw->lcw_pid); + "watchdog callback may be incomplete\n", (int)lcw->lcw_pid); } else if (tsk != lcw->lcw_task) { - tsk = NULL; CWARN("The current process %d did not set the watchdog; " - "watchdog callback may be incomplete\n", lcw->lcw_pid); + "watchdog callback may be incomplete\n", (int)lcw->lcw_pid); + } else { + libcfs_debug_dumpstack(tsk); } - - RETURN(tsk); + + read_unlock(&tasklist_lock); + EXIT; } +#else +static void +lcw_dump(struct lc_watchdog *lcw) +{ + CERROR("unable to dump stack because of missing export\n"); +} +#endif static void lcw_cb(unsigned long data) { struct lc_watchdog *lcw = (struct lc_watchdog *)data; - struct task_struct *tsk; - unsigned long flags; ENTRY; @@ -118,47 +123,47 @@ static void lcw_cb(unsigned long data) lcw->lcw_state = LC_WATCHDOG_EXPIRED; - CWARN("Watchdog triggered for pid %d: it was inactive for %dms\n", - lcw->lcw_pid, (lcw->lcw_time * 1000) / HZ); + /* NB this warning should appear on the console, but may not get into + * the logs since we're running in a softirq handler */ + + CWARN("Watchdog triggered for pid %d: it was inactive for %ldms\n", + (int)lcw->lcw_pid, cfs_duration_sec(lcw->lcw_time) * 1000); + lcw_dump(lcw); - tsk = lcw_lookup_task(lcw); - if (tsk != NULL) - portals_debug_dumpstack(tsk); + spin_lock_bh(&lcw_pending_timers_lock); - spin_lock_irqsave(&lcw_pending_timers_lock, flags); if (list_empty(&lcw->lcw_list)) { list_add(&lcw->lcw_list, &lcw_pending_timers); wake_up(&lcw_event_waitq); } - spin_unlock_irqrestore(&lcw_pending_timers_lock, flags); + + spin_unlock_bh(&lcw_pending_timers_lock); EXIT; } static int is_watchdog_fired(void) { - unsigned long flags; int rc; if (test_bit(LCW_FLAG_STOP, &lcw_flags)) return 1; - spin_lock_irqsave(&lcw_pending_timers_lock, flags); + spin_lock_bh(&lcw_pending_timers_lock); rc = !list_empty(&lcw_pending_timers); - spin_unlock_irqrestore(&lcw_pending_timers_lock, flags); + spin_unlock_bh(&lcw_pending_timers_lock); return rc; } static int lcw_dispatch_main(void *data) { - int rc = 0; - unsigned long flags; + int rc = 0; + unsigned long flags; struct lc_watchdog *lcw; - struct task_struct *tsk; ENTRY; - kportal_daemonize("lc_watchdogd"); + cfs_daemonize("lc_watchdogd"); SIGNAL_MASK_LOCK(current, flags); sigfillset(¤t->blocked); @@ -173,9 +178,9 @@ static int lcw_dispatch_main(void *data) if (test_bit(LCW_FLAG_STOP, &lcw_flags)) { CDEBUG(D_INFO, "LCW_FLAG_STOP was set, shutting down...\n"); - spin_lock_irqsave(&lcw_pending_timers_lock, flags); + spin_lock_bh(&lcw_pending_timers_lock); rc = !list_empty(&lcw_pending_timers); - spin_unlock_irqrestore(&lcw_pending_timers_lock, flags); + spin_unlock_bh(&lcw_pending_timers_lock); if (rc) { CERROR("pending timers list was not empty at " "time of watchdog dispatch shutdown\n"); @@ -183,29 +188,24 @@ static int lcw_dispatch_main(void *data) break; } - spin_lock_irqsave(&lcw_pending_timers_lock, flags); + spin_lock_bh(&lcw_pending_timers_lock); while 
(!list_empty(&lcw_pending_timers)) { lcw = list_entry(lcw_pending_timers.next, struct lc_watchdog, lcw_list); list_del_init(&lcw->lcw_list); - spin_unlock_irqrestore(&lcw_pending_timers_lock, flags); + spin_unlock_bh(&lcw_pending_timers_lock); - CDEBUG(D_INFO, "found lcw for pid %d\n", lcw->lcw_pid); + CDEBUG(D_INFO, "found lcw for pid %d: inactive for %ldms\n", + (int)lcw->lcw_pid, cfs_duration_sec(lcw->lcw_time) * 1000); - if (lcw->lcw_state != LC_WATCHDOG_DISABLED) { - /* - * sanity check the task against our - * watchdog - */ - tsk = lcw_lookup_task(lcw); - lcw->lcw_callback(lcw, tsk, lcw->lcw_data); - } + if (lcw->lcw_state != LC_WATCHDOG_DISABLED) + lcw->lcw_callback(lcw->lcw_pid, lcw->lcw_data); - spin_lock_irqsave(&lcw_pending_timers_lock, flags); + spin_lock_bh(&lcw_pending_timers_lock); } - spin_unlock_irqrestore(&lcw_pending_timers_lock, flags); + spin_unlock_bh(&lcw_pending_timers_lock); } complete(&lcw_stop_completion); @@ -255,26 +255,24 @@ static void lcw_dispatch_stop(void) } struct lc_watchdog *lc_watchdog_add(int timeout_ms, - void (*callback)(struct lc_watchdog *, - struct task_struct *, - void *), + void (*callback)(pid_t, void *), void *data) { struct lc_watchdog *lcw = NULL; ENTRY; - PORTAL_ALLOC(lcw, sizeof(*lcw)); - if (!lcw) { + LIBCFS_ALLOC(lcw, sizeof(*lcw)); + if (lcw == NULL) { CDEBUG(D_INFO, "Could not allocate new lc_watchdog\n"); RETURN(ERR_PTR(-ENOMEM)); } - lcw->lcw_task = cfs_current(); - lcw->lcw_pid = cfs_curproc_pid(); - lcw->lcw_time = (timeout_ms * HZ) / 1000; - lcw->lcw_callback = callback ? callback : lc_watchdog_dumplog; - lcw->lcw_data = data; - lcw->lcw_state = LC_WATCHDOG_DISABLED; + lcw->lcw_task = cfs_current(); + lcw->lcw_pid = cfs_curproc_pid(); + lcw->lcw_time = cfs_time_seconds(timeout_ms) / 1000; + lcw->lcw_callback = (callback != NULL) ? 
callback : lc_watchdog_dumplog; + lcw->lcw_data = data; + lcw->lcw_state = LC_WATCHDOG_DISABLED; INIT_LIST_HEAD(&lcw->lcw_list); @@ -298,40 +296,31 @@ struct lc_watchdog *lc_watchdog_add(int timeout_ms, } EXPORT_SYMBOL(lc_watchdog_add); -static long -timeval_sub(struct timeval *large, struct timeval *small) -{ - return (large->tv_sec - small->tv_sec) * 1000000 + - (large->tv_usec - small->tv_usec); -} - static void lcw_update_time(struct lc_watchdog *lcw, const char *message) { struct timeval newtime; - unsigned long timediff; + struct timeval timediff; do_gettimeofday(&newtime); if (lcw->lcw_state == LC_WATCHDOG_EXPIRED) { - timediff = timeval_sub(&newtime, &lcw->lcw_last_touched); + cfs_timeval_sub(&newtime, &lcw->lcw_last_touched, &timediff); CWARN("Expired watchdog for pid %d %s after %lu.%.4lus\n", lcw->lcw_pid, message, - timediff / 1000000, - (timediff % 1000000) / 100); + timediff.tv_sec, + timediff.tv_usec / 100); } lcw->lcw_last_touched = newtime; } void lc_watchdog_touch(struct lc_watchdog *lcw) { - unsigned long flags; ENTRY; LASSERT(lcw != NULL); - spin_lock_irqsave(&lcw_pending_timers_lock, flags); - if (!list_empty(&lcw->lcw_list)) - list_del_init(&lcw->lcw_list); - spin_unlock_irqrestore(&lcw_pending_timers_lock, flags); + spin_lock_bh(&lcw_pending_timers_lock); + list_del_init(&lcw->lcw_list); + spin_unlock_bh(&lcw_pending_timers_lock); lcw_update_time(lcw, "touched"); lcw->lcw_state = LC_WATCHDOG_ENABLED; @@ -344,14 +333,13 @@ EXPORT_SYMBOL(lc_watchdog_touch); void lc_watchdog_disable(struct lc_watchdog *lcw) { - unsigned long flags; ENTRY; LASSERT(lcw != NULL); - spin_lock_irqsave(&lcw_pending_timers_lock, flags); + spin_lock_bh(&lcw_pending_timers_lock); if (!list_empty(&lcw->lcw_list)) list_del_init(&lcw->lcw_list); - spin_unlock_irqrestore(&lcw_pending_timers_lock, flags); + spin_unlock_bh(&lcw_pending_timers_lock); lcw_update_time(lcw, "disabled"); lcw->lcw_state = LC_WATCHDOG_DISABLED; @@ -362,7 +350,6 @@ EXPORT_SYMBOL(lc_watchdog_disable); void lc_watchdog_delete(struct lc_watchdog *lcw) { - unsigned long flags; ENTRY; LASSERT(lcw != NULL); @@ -370,17 +357,17 @@ void lc_watchdog_delete(struct lc_watchdog *lcw) lcw_update_time(lcw, "deleted"); - spin_lock_irqsave(&lcw_pending_timers_lock, flags); + spin_lock_bh(&lcw_pending_timers_lock); if (!list_empty(&lcw->lcw_list)) list_del_init(&lcw->lcw_list); - spin_unlock_irqrestore(&lcw_pending_timers_lock, flags); + spin_unlock_bh(&lcw_pending_timers_lock); down(&lcw_refcount_sem); if (--lcw_refcount == 0) lcw_dispatch_stop(); up(&lcw_refcount_sem); - PORTAL_FREE(lcw, sizeof(*lcw)); + LIBCFS_FREE(lcw, sizeof(*lcw)); EXIT; } @@ -390,13 +377,37 @@ EXPORT_SYMBOL(lc_watchdog_delete); * Provided watchdog handlers */ -extern void portals_debug_dumplog_internal(void *arg); - -void lc_watchdog_dumplog(struct lc_watchdog *lcw, - struct task_struct *tsk, - void *data) +void lc_watchdog_dumplog(pid_t pid, void *data) { - tsk = tsk ? 
tsk : current; - portals_debug_dumplog_internal((void *)(long)tsk->pid); + libcfs_debug_dumplog_internal((void *)((unsigned long)pid)); } EXPORT_SYMBOL(lc_watchdog_dumplog); + +#else /* !defined(WITH_WATCHDOG) */ + +struct lc_watchdog *lc_watchdog_add(int timeout_ms, + void (*callback)(pid_t pid, void *), + void *data) +{ + static struct lc_watchdog watchdog; + return &watchdog; +} +EXPORT_SYMBOL(lc_watchdog_add); + +void lc_watchdog_touch(struct lc_watchdog *lcw) +{ +} +EXPORT_SYMBOL(lc_watchdog_touch); + +void lc_watchdog_disable(struct lc_watchdog *lcw) +{ +} +EXPORT_SYMBOL(lc_watchdog_disable); + +void lc_watchdog_delete(struct lc_watchdog *lcw) +{ +} +EXPORT_SYMBOL(lc_watchdog_delete); + +#endif + diff --git a/lnet/libcfs/winnt/winnt-curproc.c b/lnet/libcfs/winnt/winnt-curproc.c new file mode 100644 index 0000000..e21c5c9 --- /dev/null +++ b/lnet/libcfs/winnt/winnt-curproc.c @@ -0,0 +1,453 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: + * + * Copyright (c) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or modify it under + * the terms of version 2 of the GNU General Public License as published by + * the Free Software Foundation. Lustre is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. You should have received a + * copy of the GNU General Public License along with Lustre; if not, write + * to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + * Implementation of winnt curproc routines. + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + + +/* + * Implementation of cfs_curproc API (see portals/include/libcfs/curproc.h) + * for the Windows NT kernel.
+ */ + +cfs_task_t this_task = + { 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 0, + "system\0" }; + + +uid_t cfs_curproc_uid(void) +{ + return this_task.uid; +} + +gid_t cfs_curproc_gid(void) +{ + return this_task.gid; +} + +uid_t cfs_curproc_fsuid(void) +{ + return this_task.fsuid; +} + +gid_t cfs_curproc_fsgid(void) +{ + return this_task.fsgid; +} + +pid_t cfs_curproc_pid(void) +{ + return cfs_current()->pid; +} + +int cfs_curproc_groups_nr(void) +{ + return this_task.ngroups; +} + +void cfs_curproc_groups_dump(gid_t *array, int size) +{ + LASSERT(size <= NGROUPS); + size = min_t(int, size, this_task.ngroups); + memcpy(array, this_task.groups, size * sizeof(__u32)); +} + +int cfs_curproc_is_in_groups(gid_t gid) +{ + return in_group_p(gid); +} + +mode_t cfs_curproc_umask(void) +{ + return this_task.umask; +} + +char *cfs_curproc_comm(void) +{ + return this_task.comm; +} + +cfs_kernel_cap_t cfs_curproc_cap_get(void) +{ + return this_task.cap_effective; +} + +void cfs_curproc_cap_set(cfs_kernel_cap_t cap) +{ + this_task.cap_effective = cap; +} + + +/* + * Implementation of Linux-style task management routines + */ + + +/* global task manager structure */ + +TASK_MAN TaskMan; + + +/* + * task slot routines + */ + +PTASK_SLOT +alloc_task_slot() +{ + PTASK_SLOT task = NULL; + + if (TaskMan.slab) { + task = cfs_mem_cache_alloc(TaskMan.slab, 0); + } else { + task = cfs_alloc(sizeof(TASK_SLOT), 0); + } + + return task; +} + +void +init_task_slot(PTASK_SLOT task) +{ + memset(task, 0, sizeof(TASK_SLOT)); + task->Magic = TASKSLT_MAGIC; + task->task = this_task; + task->task.pid = (pid_t)PsGetCurrentThreadId(); + cfs_init_event(&task->Event, TRUE, FALSE); +} + + +void +cleanup_task_slot(PTASK_SLOT task) +{ + if (TaskMan.slab) { + cfs_mem_cache_free(TaskMan.slab, task); + } else { + cfs_free(task); + } +} + +/* + * task manager related routines + */ + +VOID +task_manager_notify( + IN HANDLE ProcessId, + IN HANDLE ThreadId, + IN BOOLEAN Create + ) +{ + PLIST_ENTRY ListEntry = NULL; + PTASK_SLOT TaskSlot = NULL; + + spin_lock(&(TaskMan.Lock)); + + ListEntry = TaskMan.TaskList.Flink; + + while (ListEntry != (&(TaskMan.TaskList))) { + + TaskSlot = CONTAINING_RECORD(ListEntry, TASK_SLOT, Link); + + if (TaskSlot->Pid == ProcessId && TaskSlot->Tid == ThreadId) { + + if (Create) { +/* + DbgPrint("task_manager_notify: Pid=%xh Tid %xh reused (TaskSlot->Tet = %xh)...\n", + ProcessId, ThreadId, TaskSlot->Tet); +*/ + } else { + /* remove the taskslot */ + RemoveEntryList(&(TaskSlot->Link)); + TaskMan.NumOfTasks--; + + /* now free the task slot */ + cleanup_task_slot(TaskSlot); + } + } + + ListEntry = ListEntry->Flink; + } + + spin_unlock(&(TaskMan.Lock)); +} + +int +init_task_manager() +{ + NTSTATUS status; + + /* initialize the content and magic */ + memset(&TaskMan, 0, sizeof(TASK_MAN)); + TaskMan.Magic = TASKMAN_MAGIC; + + /* initialize the spinlock protection */ + spin_lock_init(&TaskMan.Lock); + + /* create slab memory cache */ + TaskMan.slab = cfs_mem_cache_create( + "TSLT", sizeof(TASK_SLOT), 0, 0); + + /* initialize the list header */ + InitializeListHead(&(TaskMan.TaskList)); + + /* set the thread creation/destruction notify routine */ + status = PsSetCreateThreadNotifyRoutine(task_manager_notify); + + if (!NT_SUCCESS(status)) { + cfs_enter_debugger(); + } + + return 0; +} + +void +cleanup_task_manager() +{ + PLIST_ENTRY ListEntry = NULL; + PTASK_SLOT TaskSlot = NULL; + + /* we must stay in the system, since we succeeded in registering the + CreateThreadNotifyRoutine: task_manager_notify */
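cfs_current(), shown in the next chunk, resolves the calling thread to a task slot by searching a list kept sorted on (Pid, Tid), inserting a fresh slot at the sort position on a miss. A compact userspace model of that lookup-or-insert (struct slot and lookup_or_insert are invented for the sketch; the kernel code additionally holds TaskMan.Lock and refreshes the cached PETHREAD when a Tid is reused):

    #include <stdlib.h>

    struct slot { unsigned pid, tid; struct slot *next; };

    static struct slot *slots;   /* kept sorted by (pid, tid) */

    static struct slot *lookup_or_insert(unsigned pid, unsigned tid)
    {
        struct slot **pp = &slots;

        while (*pp != NULL) {
            struct slot *s = *pp;
            if (s->pid == pid && s->tid == tid)
                return s;                      /* found the existing slot */
            if (s->pid > pid || (s->pid == pid && s->tid > tid))
                break;                         /* insertion point found */
            pp = &s->next;
        }

        struct slot *n = malloc(sizeof(*n));   /* miss: insert right here */
        if (n == NULL)
            return NULL;
        n->pid = pid;
        n->tid = tid;
        n->next = *pp;
        *pp = n;
        return n;
    }

    int main(void)
    {
        /* a second lookup for the same (pid, tid) must hit the same slot */
        return lookup_or_insert(2, 7) == lookup_or_insert(2, 7) ? 0 : 1;
    }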
cfs_enter_debugger(); + + + /* cleanup all the taskslots attached to the list */ + spin_lock(&(TaskMan.Lock)); + + while (!IsListEmpty(&(TaskMan.TaskList))) { + + ListEntry = TaskMan.TaskList.Flink; + TaskSlot = CONTAINING_RECORD(ListEntry, TASK_SLOT, Link); + + RemoveEntryList(ListEntry); + cleanup_task_slot(TaskSlot); + } + + spin_unlock(&TaskMan.Lock); + + /* destroy the taskslot cache slab */ + cfs_mem_cache_destroy(TaskMan.slab); + memset(&TaskMan, 0, sizeof(TASK_MAN)); +} + + +/* + * schedule routines (task slot list) + */ + + +cfs_task_t * +cfs_current() +{ + HANDLE Pid = PsGetCurrentProcessId(); + HANDLE Tid = PsGetCurrentThreadId(); + PETHREAD Tet = PsGetCurrentThread(); + + PLIST_ENTRY ListEntry = NULL; + PTASK_SLOT TaskSlot = NULL; + + spin_lock(&(TaskMan.Lock)); + + ListEntry = TaskMan.TaskList.Flink; + + while (ListEntry != (&(TaskMan.TaskList))) { + + TaskSlot = CONTAINING_RECORD(ListEntry, TASK_SLOT, Link); + + if (TaskSlot->Pid == Pid && TaskSlot->Tid == Tid) { + if (TaskSlot->Tet != Tet) { + +/* + DbgPrint("cfs_current: Pid=%xh Tid %xh Tet = %xh reused (TaskSlot->Tet = %xh)...\n", + Pid, Tid, Tet, TaskSlot->Tet); +*/ + // + // The old thread has already exited. This must be a + // new thread which got the same Tid as the previous one. + // + + TaskSlot->Tet = Tet; + } + break; + + } else { + + if ((ULONG)TaskSlot->Pid > (ULONG)Pid) { + TaskSlot = NULL; + break; + } else if ((ULONG)TaskSlot->Pid == (ULONG)Pid) { + if ((ULONG)TaskSlot->Tid > (ULONG)Tid) { + TaskSlot = NULL; + break; + } + } + + TaskSlot = NULL; + } + + ListEntry = ListEntry->Flink; + } + + if (!TaskSlot) { + + TaskSlot = alloc_task_slot(); + + if (!TaskSlot) { + cfs_enter_debugger(); + goto errorout; + } + + init_task_slot(TaskSlot); + + TaskSlot->Pid = Pid; + TaskSlot->Tid = Tid; + TaskSlot->Tet = Tet; + + if (ListEntry == (&(TaskMan.TaskList))) { + // + // Empty list, or ours is the largest key: put it at the tail. + // + InsertTailList(&(TaskMan.TaskList), &(TaskSlot->Link)); + } else { + // + // Found a slot with a larger key; insert the new one just before it. + // + InsertHeadList(ListEntry->Blink, &(TaskSlot->Link)); + } + + TaskMan.NumOfTasks++; + } + + // + // Check whether the task structures are arranged in the expected order.
+ // + + { + PTASK_SLOT Prev = NULL, Curr = NULL; + + ListEntry = TaskMan.TaskList.Flink; + + while (ListEntry != (&(TaskMan.TaskList))) { + + Curr = CONTAINING_RECORD(ListEntry, TASK_SLOT, Link); + ListEntry = ListEntry->Flink; + + if (Prev) { + if ((ULONG)Prev->Pid > (ULONG)Curr->Pid) { + cfs_enter_debugger(); + } else if ((ULONG)Prev->Pid == (ULONG)Curr->Pid) { + if ((ULONG)Prev->Tid > (ULONG)Curr->Tid) { + cfs_enter_debugger(); + } + } + } + + Prev = Curr; + } + } + +errorout: + + spin_unlock(&(TaskMan.Lock)); + + if (!TaskSlot) { + cfs_enter_debugger(); + return NULL; + } + + return (&(TaskSlot->task)); +} + +int +schedule_timeout(int64_t time) +{ + cfs_task_t * task = cfs_current(); + PTASK_SLOT slot = NULL; + + if (!task) { + cfs_enter_debugger(); + return 0; + } + + slot = CONTAINING_RECORD(task, TASK_SLOT, task); + cfs_assert(slot->Magic == TASKSLT_MAGIC); + + if (time == MAX_SCHEDULE_TIMEOUT) { + time = 0; + } + + return (cfs_wait_event(&(slot->Event), time) != 0); +} + +int +schedule() +{ + return schedule_timeout(0); +} + +int +wake_up_process( + cfs_task_t * task + ) +{ + PTASK_SLOT slot = NULL; + + if (!task) { + cfs_enter_debugger(); + return 0; + } + + slot = CONTAINING_RECORD(task, TASK_SLOT, task); + cfs_assert(slot->Magic == TASKSLT_MAGIC); + + cfs_wake_event(&(slot->Event)); + + return TRUE; +} + +void +sleep_on( + cfs_waitq_t *waitq + ) +{ + cfs_waitlink_t link; + + cfs_waitlink_init(&link); + cfs_waitq_add(waitq, &link); + cfs_waitq_wait(&link, CFS_TASK_INTERRUPTIBLE); + cfs_waitq_del(waitq, &link); +} + +EXPORT_SYMBOL(cfs_curproc_uid); +EXPORT_SYMBOL(cfs_curproc_pid); +EXPORT_SYMBOL(cfs_curproc_gid); +EXPORT_SYMBOL(cfs_curproc_fsuid); +EXPORT_SYMBOL(cfs_curproc_fsgid); +EXPORT_SYMBOL(cfs_curproc_umask); +EXPORT_SYMBOL(cfs_curproc_comm); +EXPORT_SYMBOL(cfs_curproc_groups_nr); +EXPORT_SYMBOL(cfs_curproc_groups_dump); +EXPORT_SYMBOL(cfs_curproc_is_in_groups); +EXPORT_SYMBOL(cfs_curproc_cap_get); +EXPORT_SYMBOL(cfs_curproc_cap_set); diff --git a/lnet/libcfs/winnt/winnt-debug.c b/lnet/libcfs/winnt/winnt-debug.c new file mode 100644 index 0000000..9e94f84 --- /dev/null +++ b/lnet/libcfs/winnt/winnt-debug.c @@ -0,0 +1,1057 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: + * + * Copyright (c) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or modify it under + * the terms of version 2 of the GNU General Public License as published by + * the Free Software Foundation. Lustre is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. You should have received a + * copy of the GNU General Public License along with Lustre; if not, write + * to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, + * USA. 
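The scheduler shims above reduce schedule_timeout() to a wait on the slot's event, with MAX_SCHEDULE_TIMEOUT translated to 0, presumably the encoding cfs_wait_event() treats as "wait forever". A userspace analogue using a condition variable (wait_event_model is invented; here a timeout of 0 plays the "forever" role, mirroring that mapping):

    #include <pthread.h>
    #include <stdint.h>
    #include <time.h>

    static pthread_mutex_t mtx  = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
    static int woken;

    /* timeout_sec == 0 means "wait forever". Returns non-zero if
     * signalled, 0 on timeout (cf. schedule_timeout()'s return). */
    static int wait_event_model(int64_t timeout_sec)
    {
        int rc;

        pthread_mutex_lock(&mtx);
        if (timeout_sec == 0) {
            while (!woken)
                pthread_cond_wait(&cond, &mtx);
            rc = 1;
        } else {
            struct timespec ts;
            clock_gettime(CLOCK_REALTIME, &ts);
            ts.tv_sec += timeout_sec;
            while (!woken) {
                if (pthread_cond_timedwait(&cond, &mtx, &ts) != 0)
                    break;                  /* timed out */
            }
            rc = woken;
        }
        woken = 0;                          /* consume the wakeup */
        pthread_mutex_unlock(&mtx);
        return rc;
    }

    int main(void)
    {
        /* nobody signals, so expect a timeout after ~1 second */
        return wait_event_model(1) == 0 ? 0 : 1;
    }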
+ */ + +# define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include "tracefile.h" + +void lnet_debug_dumpstack(cfs_task_t *tsk) +{ + return; +} + +cfs_task_t *lnet_current(void) +{ + return cfs_current(); +} + +int lnet_arch_debug_init(unsigned long bufsize) +{ + return 0; +} + +int lnet_arch_debug_cleanup(void) +{ + return 0; +} + +void lnet_run_lbug_upcall(char *file, const char *fn, const int line) +{ +} + +void lbug_with_loc(char *file, const char *func, const int line) +{ + libcfs_catastrophe = 1; + CEMERG("LBUG: pid: %u thread: %#x\n", + (unsigned)cfs_curproc_pid(), (unsigned)PsGetCurrentThread()); + // portals_debug_dumplog(); + // portals_run_lbug_upcall(file, func, line); +} + +#if TDI_LIBCFS_DBG + +/* + * Definitions + */ + +LONG KsDebugLevel = 0x5; + + +/* + * Routines + */ + + +/* + * KsNtStatusToString + * Get the error message for a specified nt status + * + * Arguments: + * Status - nt status code + * + * Return Value: + * PUCHAR - message string for the status code + * + * NOTES: + * N/A + */ + +PUCHAR +KsNtStatusToString (IN NTSTATUS Status) +{ + switch (Status) { + + case 0x00000000: return "STATUS_SUCCESS"; + case 0x00000001: return "STATUS_WAIT_1"; + case 0x00000002: return "STATUS_WAIT_2"; + case 0x00000003: return "STATUS_WAIT_3"; + case 0x0000003F: return "STATUS_WAIT_63"; + case 0x00000080: return "STATUS_ABANDONED_WAIT_0"; + case 0x000000BF: return "STATUS_ABANDONED_WAIT_63"; + case 0x000000C0: return "STATUS_USER_APC"; + case 0x00000100: return "STATUS_KERNEL_APC"; + case 0x00000101: return "STATUS_ALERTED"; + case 0x00000102: return "STATUS_TIMEOUT"; + case 0x00000103: return "STATUS_PENDING"; + case 0x00000104: return "STATUS_REPARSE"; + case 0x00000105: return "STATUS_MORE_ENTRIES"; + case 0x00000106: return "STATUS_NOT_ALL_ASSIGNED"; + case 0x00000107: return "STATUS_SOME_NOT_MAPPED"; + case 0x00000108: return "STATUS_OPLOCK_BREAK_IN_PROGRESS"; + case 0x00000109: return "STATUS_VOLUME_MOUNTED"; + case 0x0000010A: return "STATUS_RXACT_COMMITTED"; + case 0x0000010B: return "STATUS_NOTIFY_CLEANUP"; + case 0x0000010C: return "STATUS_NOTIFY_ENUM_DIR"; + case 0x0000010D: return "STATUS_NO_QUOTAS_FOR_ACCOUNT"; + case 0x0000010E: return "STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED"; + case 0x00000110: return "STATUS_PAGE_FAULT_TRANSITION"; + case 0x00000111: return "STATUS_PAGE_FAULT_DEMAND_ZERO"; + case 0x00000112: return "STATUS_PAGE_FAULT_COPY_ON_WRITE"; + case 0x00000113: return "STATUS_PAGE_FAULT_GUARD_PAGE"; + case 0x00000114: return "STATUS_PAGE_FAULT_PAGING_FILE"; + case 0x00000115: return "STATUS_CACHE_PAGE_LOCKED"; + case 0x00000116: return "STATUS_CRASH_DUMP"; + case 0x00000117: return "STATUS_BUFFER_ALL_ZEROS"; + case 0x00000118: return "STATUS_REPARSE_OBJECT"; + case 0x00000119: return "STATUS_RESOURCE_REQUIREMENTS_CHANGED"; + case 0x00000120: return "STATUS_TRANSLATION_COMPLETE"; + case 0x00000121: return "STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY"; + case 0x00010001: return "DBG_EXCEPTION_HANDLED"; + case 0x00010002: return "DBG_CONTINUE"; + case 0x40000000: return "STATUS_OBJECT_NAME_EXISTS"; + case 0x40000001: return "STATUS_THREAD_WAS_SUSPENDED"; + case 0x40000002: return "STATUS_WORKING_SET_LIMIT_RANGE"; + case 0x40000003: return "STATUS_IMAGE_NOT_AT_BASE"; + case 0x40000004: return "STATUS_RXACT_STATE_CREATED"; + case 0x40000005: return "STATUS_SEGMENT_NOTIFICATION"; + case 0x40000006: return "STATUS_LOCAL_USER_SESSION_KEY"; + case 0x40000007: return "STATUS_BAD_CURRENT_DIRECTORY"; + case 0x40000008: return "STATUS_SERIAL_MORE_WRITES"; + case 
0x40000009: return "STATUS_REGISTRY_RECOVERED"; + case 0x4000000A: return "STATUS_FT_READ_RECOVERY_FROM_BACKUP"; + case 0x4000000B: return "STATUS_FT_WRITE_RECOVERY"; + case 0x4000000C: return "STATUS_SERIAL_COUNTER_TIMEOUT"; + case 0x4000000D: return "STATUS_NULL_LM_PASSWORD"; + case 0x4000000E: return "STATUS_IMAGE_MACHINE_TYPE_MISMATCH"; + case 0x4000000F: return "STATUS_RECEIVE_PARTIAL"; + case 0x40000010: return "STATUS_RECEIVE_EXPEDITED"; + case 0x40000011: return "STATUS_RECEIVE_PARTIAL_EXPEDITED"; + case 0x40000012: return "STATUS_EVENT_DONE"; + case 0x40000013: return "STATUS_EVENT_PENDING"; + case 0x40000014: return "STATUS_CHECKING_FILE_SYSTEM"; + case 0x40000015: return "STATUS_FATAL_APP_EXIT"; + case 0x40000016: return "STATUS_PREDEFINED_HANDLE"; + case 0x40000017: return "STATUS_WAS_UNLOCKED"; + case 0x40000018: return "STATUS_SERVICE_NOTIFICATION"; + case 0x40000019: return "STATUS_WAS_LOCKED"; + case 0x4000001A: return "STATUS_LOG_HARD_ERROR"; + case 0x4000001B: return "STATUS_ALREADY_WIN32"; + case 0x4000001C: return "STATUS_WX86_UNSIMULATE"; + case 0x4000001D: return "STATUS_WX86_CONTINUE"; + case 0x4000001E: return "STATUS_WX86_SINGLE_STEP"; + case 0x4000001F: return "STATUS_WX86_BREAKPOINT"; + case 0x40000020: return "STATUS_WX86_EXCEPTION_CONTINUE"; + case 0x40000021: return "STATUS_WX86_EXCEPTION_LASTCHANCE"; + case 0x40000022: return "STATUS_WX86_EXCEPTION_CHAIN"; + case 0x40000023: return "STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE"; + case 0x40000024: return "STATUS_NO_YIELD_PERFORMED"; + case 0x40000025: return "STATUS_TIMER_RESUME_IGNORED"; + case 0x40000026: return "STATUS_ARBITRATION_UNHANDLED"; + case 0x40000027: return "STATUS_CARDBUS_NOT_SUPPORTED"; + case 0x40000028: return "STATUS_WX86_CREATEWX86TIB"; + case 0x40000029: return "STATUS_MP_PROCESSOR_MISMATCH"; + case 0x40010001: return "DBG_REPLY_LATER"; + case 0x40010002: return "DBG_UNABLE_TO_PROVIDE_HANDLE"; + case 0x40010003: return "DBG_TERMINATE_THREAD"; + case 0x40010004: return "DBG_TERMINATE_PROCESS"; + case 0x40010005: return "DBG_CONTROL_C"; + case 0x40010006: return "DBG_PRINTEXCEPTION_C"; + case 0x40010007: return "DBG_RIPEXCEPTION"; + case 0x40010008: return "DBG_CONTROL_BREAK"; + case 0x80000001: return "STATUS_GUARD_PAGE_VIOLATION"; + case 0x80000002: return "STATUS_DATATYPE_MISALIGNMENT"; + case 0x80000003: return "STATUS_BREAKPOINT"; + case 0x80000004: return "STATUS_SINGLE_STEP"; + case 0x80000005: return "STATUS_BUFFER_OVERFLOW"; + case 0x80000006: return "STATUS_NO_MORE_FILES"; + case 0x80000007: return "STATUS_WAKE_SYSTEM_DEBUGGER"; + case 0x8000000A: return "STATUS_HANDLES_CLOSED"; + case 0x8000000B: return "STATUS_NO_INHERITANCE"; + case 0x8000000C: return "STATUS_GUID_SUBSTITUTION_MADE"; + case 0x8000000D: return "STATUS_PARTIAL_COPY"; + case 0x8000000E: return "STATUS_DEVICE_PAPER_EMPTY"; + case 0x8000000F: return "STATUS_DEVICE_POWERED_OFF"; + case 0x80000010: return "STATUS_DEVICE_OFF_LINE"; + case 0x80000011: return "STATUS_DEVICE_BUSY"; + case 0x80000012: return "STATUS_NO_MORE_EAS"; + case 0x80000013: return "STATUS_INVALID_EA_NAME"; + case 0x80000014: return "STATUS_EA_LIST_INCONSISTENT"; + case 0x80000015: return "STATUS_INVALID_EA_FLAG"; + case 0x80000016: return "STATUS_VERIFY_REQUIRED"; + case 0x80000017: return "STATUS_EXTRANEOUS_INFORMATION"; + case 0x80000018: return "STATUS_RXACT_COMMIT_NECESSARY"; + case 0x8000001A: return "STATUS_NO_MORE_ENTRIES"; + case 0x8000001B: return "STATUS_FILEMARK_DETECTED"; + case 0x8000001C: return "STATUS_MEDIA_CHANGED"; + case 0x8000001D: 
return "STATUS_BUS_RESET"; + case 0x8000001E: return "STATUS_END_OF_MEDIA"; + case 0x8000001F: return "STATUS_BEGINNING_OF_MEDIA"; + case 0x80000020: return "STATUS_MEDIA_CHECK"; + case 0x80000021: return "STATUS_SETMARK_DETECTED"; + case 0x80000022: return "STATUS_NO_DATA_DETECTED"; + case 0x80000023: return "STATUS_REDIRECTOR_HAS_OPEN_HANDLES"; + case 0x80000024: return "STATUS_SERVER_HAS_OPEN_HANDLES"; + case 0x80000025: return "STATUS_ALREADY_DISCONNECTED"; + case 0x80000026: return "STATUS_LONGJUMP"; + case 0x80010001: return "DBG_EXCEPTION_NOT_HANDLED"; + case 0xC0000001: return "STATUS_UNSUCCESSFUL"; + case 0xC0000002: return "STATUS_NOT_IMPLEMENTED"; + case 0xC0000003: return "STATUS_INVALID_INFO_CLASS"; + case 0xC0000004: return "STATUS_INFO_LENGTH_MISMATCH"; + case 0xC0000005: return "STATUS_ACCESS_VIOLATION"; + case 0xC0000006: return "STATUS_IN_PAGE_ERROR"; + case 0xC0000007: return "STATUS_PAGEFILE_QUOTA"; + case 0xC0000008: return "STATUS_INVALID_HANDLE"; + case 0xC0000009: return "STATUS_BAD_INITIAL_STACK"; + case 0xC000000A: return "STATUS_BAD_INITIAL_PC"; + case 0xC000000B: return "STATUS_INVALID_CID"; + case 0xC000000C: return "STATUS_TIMER_NOT_CANCELED"; + case 0xC000000D: return "STATUS_INVALID_PARAMETER"; + case 0xC000000E: return "STATUS_NO_SUCH_DEVICE"; + case 0xC000000F: return "STATUS_NO_SUCH_FILE"; + case 0xC0000010: return "STATUS_INVALID_DEVICE_REQUEST"; + case 0xC0000011: return "STATUS_END_OF_FILE"; + case 0xC0000012: return "STATUS_WRONG_VOLUME"; + case 0xC0000013: return "STATUS_NO_MEDIA_IN_DEVICE"; + case 0xC0000014: return "STATUS_UNRECOGNIZED_MEDIA"; + case 0xC0000015: return "STATUS_NONEXISTENT_SECTOR"; + case 0xC0000016: return "STATUS_MORE_PROCESSING_REQUIRED"; + case 0xC0000017: return "STATUS_NO_MEMORY"; + case 0xC0000018: return "STATUS_CONFLICTING_ADDRESSES"; + case 0xC0000019: return "STATUS_NOT_MAPPED_VIEW"; + case 0xC000001A: return "STATUS_UNABLE_TO_FREE_VM"; + case 0xC000001B: return "STATUS_UNABLE_TO_DELETE_SECTION"; + case 0xC000001C: return "STATUS_INVALID_SYSTEM_SERVICE"; + case 0xC000001D: return "STATUS_ILLEGAL_INSTRUCTION"; + case 0xC000001E: return "STATUS_INVALID_LOCK_SEQUENCE"; + case 0xC000001F: return "STATUS_INVALID_VIEW_SIZE"; + case 0xC0000020: return "STATUS_INVALID_FILE_FOR_SECTION"; + case 0xC0000021: return "STATUS_ALREADY_COMMITTED"; + case 0xC0000022: return "STATUS_ACCESS_DENIED"; + case 0xC0000023: return "STATUS_BUFFER_TOO_SMALL"; + case 0xC0000024: return "STATUS_OBJECT_TYPE_MISMATCH"; + case 0xC0000025: return "STATUS_NONCONTINUABLE_EXCEPTION"; + case 0xC0000026: return "STATUS_INVALID_DISPOSITION"; + case 0xC0000027: return "STATUS_UNWIND"; + case 0xC0000028: return "STATUS_BAD_STACK"; + case 0xC0000029: return "STATUS_INVALID_UNWIND_TARGET"; + case 0xC000002A: return "STATUS_NOT_LOCKED"; + case 0xC000002B: return "STATUS_PARITY_ERROR"; + case 0xC000002C: return "STATUS_UNABLE_TO_DECOMMIT_VM"; + case 0xC000002D: return "STATUS_NOT_COMMITTED"; + case 0xC000002E: return "STATUS_INVALID_PORT_ATTRIBUTES"; + case 0xC000002F: return "STATUS_PORT_MESSAGE_TOO_LONG"; + case 0xC0000030: return "STATUS_INVALID_PARAMETER_MIX"; + case 0xC0000031: return "STATUS_INVALID_QUOTA_LOWER"; + case 0xC0000032: return "STATUS_DISK_CORRUPT_ERROR"; + case 0xC0000033: return "STATUS_OBJECT_NAME_INVALID"; + case 0xC0000034: return "STATUS_OBJECT_NAME_NOT_FOUND"; + case 0xC0000035: return "STATUS_OBJECT_NAME_COLLISION"; + case 0xC0000037: return "STATUS_PORT_DISCONNECTED"; + case 0xC0000038: return "STATUS_DEVICE_ALREADY_ATTACHED"; + case 
0xC0000039: return "STATUS_OBJECT_PATH_INVALID"; + case 0xC000003A: return "STATUS_OBJECT_PATH_NOT_FOUND"; + case 0xC000003B: return "STATUS_OBJECT_PATH_SYNTAX_BAD"; + case 0xC000003C: return "STATUS_DATA_OVERRUN"; + case 0xC000003D: return "STATUS_DATA_LATE_ERROR"; + case 0xC000003E: return "STATUS_DATA_ERROR"; + case 0xC000003F: return "STATUS_CRC_ERROR"; + case 0xC0000040: return "STATUS_SECTION_TOO_BIG"; + case 0xC0000041: return "STATUS_PORT_CONNECTION_REFUSED"; + case 0xC0000042: return "STATUS_INVALID_PORT_HANDLE"; + case 0xC0000043: return "STATUS_SHARING_VIOLATION"; + case 0xC0000044: return "STATUS_QUOTA_EXCEEDED"; + case 0xC0000045: return "STATUS_INVALID_PAGE_PROTECTION"; + case 0xC0000046: return "STATUS_MUTANT_NOT_OWNED"; + case 0xC0000047: return "STATUS_SEMAPHORE_LIMIT_EXCEEDED"; + case 0xC0000048: return "STATUS_PORT_ALREADY_SET"; + case 0xC0000049: return "STATUS_SECTION_NOT_IMAGE"; + case 0xC000004A: return "STATUS_SUSPEND_COUNT_EXCEEDED"; + case 0xC000004B: return "STATUS_THREAD_IS_TERMINATING"; + case 0xC000004C: return "STATUS_BAD_WORKING_SET_LIMIT"; + case 0xC000004D: return "STATUS_INCOMPATIBLE_FILE_MAP"; + case 0xC000004E: return "STATUS_SECTION_PROTECTION"; + case 0xC000004F: return "STATUS_EAS_NOT_SUPPORTED"; + case 0xC0000050: return "STATUS_EA_TOO_LARGE"; + case 0xC0000051: return "STATUS_NONEXISTENT_EA_ENTRY"; + case 0xC0000052: return "STATUS_NO_EAS_ON_FILE"; + case 0xC0000053: return "STATUS_EA_CORRUPT_ERROR"; + case 0xC0000054: return "STATUS_FILE_LOCK_CONFLICT"; + case 0xC0000055: return "STATUS_LOCK_NOT_GRANTED"; + case 0xC0000056: return "STATUS_DELETE_PENDING"; + case 0xC0000057: return "STATUS_CTL_FILE_NOT_SUPPORTED"; + case 0xC0000058: return "STATUS_UNKNOWN_REVISION"; + case 0xC0000059: return "STATUS_REVISION_MISMATCH"; + case 0xC000005A: return "STATUS_INVALID_OWNER"; + case 0xC000005B: return "STATUS_INVALID_PRIMARY_GROUP"; + case 0xC000005C: return "STATUS_NO_IMPERSONATION_TOKEN"; + case 0xC000005D: return "STATUS_CANT_DISABLE_MANDATORY"; + case 0xC000005E: return "STATUS_NO_LOGON_SERVERS"; + case 0xC000005F: return "STATUS_NO_SUCH_LOGON_SESSION"; + case 0xC0000060: return "STATUS_NO_SUCH_PRIVILEGE"; + case 0xC0000061: return "STATUS_PRIVILEGE_NOT_HELD"; + case 0xC0000062: return "STATUS_INVALID_ACCOUNT_NAME"; + case 0xC0000063: return "STATUS_USER_EXISTS"; + case 0xC0000064: return "STATUS_NO_SUCH_USER"; + case 0xC0000065: return "STATUS_GROUP_EXISTS"; + case 0xC0000066: return "STATUS_NO_SUCH_GROUP"; + case 0xC0000067: return "STATUS_MEMBER_IN_GROUP"; + case 0xC0000068: return "STATUS_MEMBER_NOT_IN_GROUP"; + case 0xC0000069: return "STATUS_LAST_ADMIN"; + case 0xC000006A: return "STATUS_WRONG_PASSWORD"; + case 0xC000006B: return "STATUS_ILL_FORMED_PASSWORD"; + case 0xC000006C: return "STATUS_PASSWORD_RESTRICTION"; + case 0xC000006D: return "STATUS_LOGON_FAILURE"; + case 0xC000006E: return "STATUS_ACCOUNT_RESTRICTION"; + case 0xC000006F: return "STATUS_INVALID_LOGON_HOURS"; + case 0xC0000070: return "STATUS_INVALID_WORKSTATION"; + case 0xC0000071: return "STATUS_PASSWORD_EXPIRED"; + case 0xC0000072: return "STATUS_ACCOUNT_DISABLED"; + case 0xC0000073: return "STATUS_NONE_MAPPED"; + case 0xC0000074: return "STATUS_TOO_MANY_LUIDS_REQUESTED"; + case 0xC0000075: return "STATUS_LUIDS_EXHAUSTED"; + case 0xC0000076: return "STATUS_INVALID_SUB_AUTHORITY"; + case 0xC0000077: return "STATUS_INVALID_ACL"; + case 0xC0000078: return "STATUS_INVALID_SID"; + case 0xC0000079: return "STATUS_INVALID_SECURITY_DESCR"; + case 0xC000007A: return 
"STATUS_PROCEDURE_NOT_FOUND"; + case 0xC000007B: return "STATUS_INVALID_IMAGE_FORMAT"; + case 0xC000007C: return "STATUS_NO_TOKEN"; + case 0xC000007D: return "STATUS_BAD_INHERITANCE_ACL"; + case 0xC000007E: return "STATUS_RANGE_NOT_LOCKED"; + case 0xC000007F: return "STATUS_DISK_FULL"; + case 0xC0000080: return "STATUS_SERVER_DISABLED"; + case 0xC0000081: return "STATUS_SERVER_NOT_DISABLED"; + case 0xC0000082: return "STATUS_TOO_MANY_GUIDS_REQUESTED"; + case 0xC0000083: return "STATUS_GUIDS_EXHAUSTED"; + case 0xC0000084: return "STATUS_INVALID_ID_AUTHORITY"; + case 0xC0000085: return "STATUS_AGENTS_EXHAUSTED"; + case 0xC0000086: return "STATUS_INVALID_VOLUME_LABEL"; + case 0xC0000087: return "STATUS_SECTION_NOT_EXTENDED"; + case 0xC0000088: return "STATUS_NOT_MAPPED_DATA"; + case 0xC0000089: return "STATUS_RESOURCE_DATA_NOT_FOUND"; + case 0xC000008A: return "STATUS_RESOURCE_TYPE_NOT_FOUND"; + case 0xC000008B: return "STATUS_RESOURCE_NAME_NOT_FOUND"; + case 0xC000008C: return "STATUS_ARRAY_BOUNDS_EXCEEDED"; + case 0xC000008D: return "STATUS_FLOAT_DENORMAL_OPERAND"; + case 0xC000008E: return "STATUS_FLOAT_DIVIDE_BY_ZERO"; + case 0xC000008F: return "STATUS_FLOAT_INEXACT_RESULT"; + case 0xC0000090: return "STATUS_FLOAT_INVALID_OPERATION"; + case 0xC0000091: return "STATUS_FLOAT_OVERFLOW"; + case 0xC0000092: return "STATUS_FLOAT_STACK_CHECK"; + case 0xC0000093: return "STATUS_FLOAT_UNDERFLOW"; + case 0xC0000094: return "STATUS_INTEGER_DIVIDE_BY_ZERO"; + case 0xC0000095: return "STATUS_INTEGER_OVERFLOW"; + case 0xC0000096: return "STATUS_PRIVILEGED_INSTRUCTION"; + case 0xC0000097: return "STATUS_TOO_MANY_PAGING_FILES"; + case 0xC0000098: return "STATUS_FILE_INVALID"; + case 0xC0000099: return "STATUS_ALLOTTED_SPACE_EXCEEDED"; + case 0xC000009A: return "STATUS_INSUFFICIENT_RESOURCES"; + case 0xC000009B: return "STATUS_DFS_EXIT_PATH_FOUND"; + case 0xC000009C: return "STATUS_DEVICE_DATA_ERROR"; + case 0xC000009D: return "STATUS_DEVICE_NOT_CONNECTED"; + case 0xC000009E: return "STATUS_DEVICE_POWER_FAILURE"; + case 0xC000009F: return "STATUS_FREE_VM_NOT_AT_BASE"; + case 0xC00000A0: return "STATUS_MEMORY_NOT_ALLOCATED"; + case 0xC00000A1: return "STATUS_WORKING_SET_QUOTA"; + case 0xC00000A2: return "STATUS_MEDIA_WRITE_PROTECTED"; + case 0xC00000A3: return "STATUS_DEVICE_NOT_READY"; + case 0xC00000A4: return "STATUS_INVALID_GROUP_ATTRIBUTES"; + case 0xC00000A5: return "STATUS_BAD_IMPERSONATION_LEVEL"; + case 0xC00000A6: return "STATUS_CANT_OPEN_ANONYMOUS"; + case 0xC00000A7: return "STATUS_BAD_VALIDATION_CLASS"; + case 0xC00000A8: return "STATUS_BAD_TOKEN_TYPE"; + case 0xC00000A9: return "STATUS_BAD_MASTER_BOOT_RECORD"; + case 0xC00000AA: return "STATUS_INSTRUCTION_MISALIGNMENT"; + case 0xC00000AB: return "STATUS_INSTANCE_NOT_AVAILABLE"; + case 0xC00000AC: return "STATUS_PIPE_NOT_AVAILABLE"; + case 0xC00000AD: return "STATUS_INVALID_PIPE_STATE"; + case 0xC00000AE: return "STATUS_PIPE_BUSY"; + case 0xC00000AF: return "STATUS_ILLEGAL_FUNCTION"; + case 0xC00000B0: return "STATUS_PIPE_DISCONNECTED"; + case 0xC00000B1: return "STATUS_PIPE_CLOSING"; + case 0xC00000B2: return "STATUS_PIPE_CONNECTED"; + case 0xC00000B3: return "STATUS_PIPE_LISTENING"; + case 0xC00000B4: return "STATUS_INVALID_READ_MODE"; + case 0xC00000B5: return "STATUS_IO_TIMEOUT"; + case 0xC00000B6: return "STATUS_FILE_FORCED_CLOSED"; + case 0xC00000B7: return "STATUS_PROFILING_NOT_STARTED"; + case 0xC00000B8: return "STATUS_PROFILING_NOT_STOPPED"; + case 0xC00000B9: return "STATUS_COULD_NOT_INTERPRET"; + case 0xC00000BA: return 
"STATUS_FILE_IS_A_DIRECTORY"; + case 0xC00000BB: return "STATUS_NOT_SUPPORTED"; + case 0xC00000BC: return "STATUS_REMOTE_NOT_LISTENING"; + case 0xC00000BD: return "STATUS_DUPLICATE_NAME"; + case 0xC00000BE: return "STATUS_BAD_NETWORK_PATH"; + case 0xC00000BF: return "STATUS_NETWORK_BUSY"; + case 0xC00000C0: return "STATUS_DEVICE_DOES_NOT_EXIST"; + case 0xC00000C1: return "STATUS_TOO_MANY_COMMANDS"; + case 0xC00000C2: return "STATUS_ADAPTER_HARDWARE_ERROR"; + case 0xC00000C3: return "STATUS_INVALID_NETWORK_RESPONSE"; + case 0xC00000C4: return "STATUS_UNEXPECTED_NETWORK_ERROR"; + case 0xC00000C5: return "STATUS_BAD_REMOTE_ADAPTER"; + case 0xC00000C6: return "STATUS_PRINT_QUEUE_FULL"; + case 0xC00000C7: return "STATUS_NO_SPOOL_SPACE"; + case 0xC00000C8: return "STATUS_PRINT_CANCELLED"; + case 0xC00000C9: return "STATUS_NETWORK_NAME_DELETED"; + case 0xC00000CA: return "STATUS_NETWORK_ACCESS_DENIED"; + case 0xC00000CB: return "STATUS_BAD_DEVICE_TYPE"; + case 0xC00000CC: return "STATUS_BAD_NETWORK_NAME"; + case 0xC00000CD: return "STATUS_TOO_MANY_NAMES"; + case 0xC00000CE: return "STATUS_TOO_MANY_SESSIONS"; + case 0xC00000CF: return "STATUS_SHARING_PAUSED"; + case 0xC00000D0: return "STATUS_REQUEST_NOT_ACCEPTED"; + case 0xC00000D1: return "STATUS_REDIRECTOR_PAUSED"; + case 0xC00000D2: return "STATUS_NET_WRITE_FAULT"; + case 0xC00000D3: return "STATUS_PROFILING_AT_LIMIT"; + case 0xC00000D4: return "STATUS_NOT_SAME_DEVICE"; + case 0xC00000D5: return "STATUS_FILE_RENAMED"; + case 0xC00000D6: return "STATUS_VIRTUAL_CIRCUIT_CLOSED"; + case 0xC00000D7: return "STATUS_NO_SECURITY_ON_OBJECT"; + case 0xC00000D8: return "STATUS_CANT_WAIT"; + case 0xC00000D9: return "STATUS_PIPE_EMPTY"; + case 0xC00000DA: return "STATUS_CANT_ACCESS_DOMAIN_INFO"; + case 0xC00000DB: return "STATUS_CANT_TERMINATE_SELF"; + case 0xC00000DC: return "STATUS_INVALID_SERVER_STATE"; + case 0xC00000DD: return "STATUS_INVALID_DOMAIN_STATE"; + case 0xC00000DE: return "STATUS_INVALID_DOMAIN_ROLE"; + case 0xC00000DF: return "STATUS_NO_SUCH_DOMAIN"; + case 0xC00000E0: return "STATUS_DOMAIN_EXISTS"; + case 0xC00000E1: return "STATUS_DOMAIN_LIMIT_EXCEEDED"; + case 0xC00000E2: return "STATUS_OPLOCK_NOT_GRANTED"; + case 0xC00000E3: return "STATUS_INVALID_OPLOCK_PROTOCOL"; + case 0xC00000E4: return "STATUS_INTERNAL_DB_CORRUPTION"; + case 0xC00000E5: return "STATUS_INTERNAL_ERROR"; + case 0xC00000E6: return "STATUS_GENERIC_NOT_MAPPED"; + case 0xC00000E7: return "STATUS_BAD_DESCRIPTOR_FORMAT"; + case 0xC00000E8: return "STATUS_INVALID_USER_BUFFER"; + case 0xC00000E9: return "STATUS_UNEXPECTED_IO_ERROR"; + case 0xC00000EA: return "STATUS_UNEXPECTED_MM_CREATE_ERR"; + case 0xC00000EB: return "STATUS_UNEXPECTED_MM_MAP_ERROR"; + case 0xC00000EC: return "STATUS_UNEXPECTED_MM_EXTEND_ERR"; + case 0xC00000ED: return "STATUS_NOT_LOGON_PROCESS"; + case 0xC00000EE: return "STATUS_LOGON_SESSION_EXISTS"; + case 0xC00000EF: return "STATUS_INVALID_PARAMETER_1"; + case 0xC00000F0: return "STATUS_INVALID_PARAMETER_2"; + case 0xC00000F1: return "STATUS_INVALID_PARAMETER_3"; + case 0xC00000F2: return "STATUS_INVALID_PARAMETER_4"; + case 0xC00000F3: return "STATUS_INVALID_PARAMETER_5"; + case 0xC00000F4: return "STATUS_INVALID_PARAMETER_6"; + case 0xC00000F5: return "STATUS_INVALID_PARAMETER_7"; + case 0xC00000F6: return "STATUS_INVALID_PARAMETER_8"; + case 0xC00000F7: return "STATUS_INVALID_PARAMETER_9"; + case 0xC00000F8: return "STATUS_INVALID_PARAMETER_10"; + case 0xC00000F9: return "STATUS_INVALID_PARAMETER_11"; + case 0xC00000FA: return 
"STATUS_INVALID_PARAMETER_12"; + case 0xC00000FB: return "STATUS_REDIRECTOR_NOT_STARTED"; + case 0xC00000FC: return "STATUS_REDIRECTOR_STARTED"; + case 0xC00000FD: return "STATUS_STACK_OVERFLOW"; + case 0xC00000FE: return "STATUS_NO_SUCH_PACKAGE"; + case 0xC00000FF: return "STATUS_BAD_FUNCTION_TABLE"; + case 0xC0000100: return "STATUS_VARIABLE_NOT_FOUND"; + case 0xC0000101: return "STATUS_DIRECTORY_NOT_EMPTY"; + case 0xC0000102: return "STATUS_FILE_CORRUPT_ERROR"; + case 0xC0000103: return "STATUS_NOT_A_DIRECTORY"; + case 0xC0000104: return "STATUS_BAD_LOGON_SESSION_STATE"; + case 0xC0000105: return "STATUS_LOGON_SESSION_COLLISION"; + case 0xC0000106: return "STATUS_NAME_TOO_LONG"; + case 0xC0000107: return "STATUS_FILES_OPEN"; + case 0xC0000108: return "STATUS_CONNECTION_IN_USE"; + case 0xC0000109: return "STATUS_MESSAGE_NOT_FOUND"; + case 0xC000010A: return "STATUS_PROCESS_IS_TERMINATING"; + case 0xC000010B: return "STATUS_INVALID_LOGON_TYPE"; + case 0xC000010C: return "STATUS_NO_GUID_TRANSLATION"; + case 0xC000010D: return "STATUS_CANNOT_IMPERSONATE"; + case 0xC000010E: return "STATUS_IMAGE_ALREADY_LOADED"; + case 0xC000010F: return "STATUS_ABIOS_NOT_PRESENT"; + case 0xC0000110: return "STATUS_ABIOS_LID_NOT_EXIST"; + case 0xC0000111: return "STATUS_ABIOS_LID_ALREADY_OWNED"; + case 0xC0000112: return "STATUS_ABIOS_NOT_LID_OWNER"; + case 0xC0000113: return "STATUS_ABIOS_INVALID_COMMAND"; + case 0xC0000114: return "STATUS_ABIOS_INVALID_LID"; + case 0xC0000115: return "STATUS_ABIOS_SELECTOR_NOT_AVAILABLE"; + case 0xC0000116: return "STATUS_ABIOS_INVALID_SELECTOR"; + case 0xC0000117: return "STATUS_NO_LDT"; + case 0xC0000118: return "STATUS_INVALID_LDT_SIZE"; + case 0xC0000119: return "STATUS_INVALID_LDT_OFFSET"; + case 0xC000011A: return "STATUS_INVALID_LDT_DESCRIPTOR"; + case 0xC000011B: return "STATUS_INVALID_IMAGE_NE_FORMAT"; + case 0xC000011C: return "STATUS_RXACT_INVALID_STATE"; + case 0xC000011D: return "STATUS_RXACT_COMMIT_FAILURE"; + case 0xC000011E: return "STATUS_MAPPED_FILE_SIZE_ZERO"; + case 0xC000011F: return "STATUS_TOO_MANY_OPENED_FILES"; + case 0xC0000120: return "STATUS_CANCELLED"; + case 0xC0000121: return "STATUS_CANNOT_DELETE"; + case 0xC0000122: return "STATUS_INVALID_COMPUTER_NAME"; + case 0xC0000123: return "STATUS_FILE_DELETED"; + case 0xC0000124: return "STATUS_SPECIAL_ACCOUNT"; + case 0xC0000125: return "STATUS_SPECIAL_GROUP"; + case 0xC0000126: return "STATUS_SPECIAL_USER"; + case 0xC0000127: return "STATUS_MEMBERS_PRIMARY_GROUP"; + case 0xC0000128: return "STATUS_FILE_CLOSED"; + case 0xC0000129: return "STATUS_TOO_MANY_THREADS"; + case 0xC000012A: return "STATUS_THREAD_NOT_IN_PROCESS"; + case 0xC000012B: return "STATUS_TOKEN_ALREADY_IN_USE"; + case 0xC000012C: return "STATUS_PAGEFILE_QUOTA_EXCEEDED"; + case 0xC000012D: return "STATUS_COMMITMENT_LIMIT"; + case 0xC000012E: return "STATUS_INVALID_IMAGE_LE_FORMAT"; + case 0xC000012F: return "STATUS_INVALID_IMAGE_NOT_MZ"; + case 0xC0000130: return "STATUS_INVALID_IMAGE_PROTECT"; + case 0xC0000131: return "STATUS_INVALID_IMAGE_WIN_16"; + case 0xC0000132: return "STATUS_LOGON_SERVER_CONFLICT"; + case 0xC0000133: return "STATUS_TIME_DIFFERENCE_AT_DC"; + case 0xC0000134: return "STATUS_SYNCHRONIZATION_REQUIRED"; + case 0xC0000135: return "STATUS_DLL_NOT_FOUND"; + case 0xC0000136: return "STATUS_OPEN_FAILED"; + case 0xC0000137: return "STATUS_IO_PRIVILEGE_FAILED"; + case 0xC0000138: return "STATUS_ORDINAL_NOT_FOUND"; + case 0xC0000139: return "STATUS_ENTRYPOINT_NOT_FOUND"; + case 0xC000013A: return "STATUS_CONTROL_C_EXIT"; 
+ case 0xC000013B: return "STATUS_LOCAL_DISCONNECT"; + case 0xC000013C: return "STATUS_REMOTE_DISCONNECT"; + case 0xC000013D: return "STATUS_REMOTE_RESOURCES"; + case 0xC000013E: return "STATUS_LINK_FAILED"; + case 0xC000013F: return "STATUS_LINK_TIMEOUT"; + case 0xC0000140: return "STATUS_INVALID_CONNECTION"; + case 0xC0000141: return "STATUS_INVALID_ADDRESS"; + case 0xC0000142: return "STATUS_DLL_INIT_FAILED"; + case 0xC0000143: return "STATUS_MISSING_SYSTEMFILE"; + case 0xC0000144: return "STATUS_UNHANDLED_EXCEPTION"; + case 0xC0000145: return "STATUS_APP_INIT_FAILURE"; + case 0xC0000146: return "STATUS_PAGEFILE_CREATE_FAILED"; + case 0xC0000147: return "STATUS_NO_PAGEFILE"; + case 0xC0000148: return "STATUS_INVALID_LEVEL"; + case 0xC0000149: return "STATUS_WRONG_PASSWORD_CORE"; + case 0xC000014A: return "STATUS_ILLEGAL_FLOAT_CONTEXT"; + case 0xC000014B: return "STATUS_PIPE_BROKEN"; + case 0xC000014C: return "STATUS_REGISTRY_CORRUPT"; + case 0xC000014D: return "STATUS_REGISTRY_IO_FAILED"; + case 0xC000014E: return "STATUS_NO_EVENT_PAIR"; + case 0xC000014F: return "STATUS_UNRECOGNIZED_VOLUME"; + case 0xC0000150: return "STATUS_SERIAL_NO_DEVICE_INITED"; + case 0xC0000151: return "STATUS_NO_SUCH_ALIAS"; + case 0xC0000152: return "STATUS_MEMBER_NOT_IN_ALIAS"; + case 0xC0000153: return "STATUS_MEMBER_IN_ALIAS"; + case 0xC0000154: return "STATUS_ALIAS_EXISTS"; + case 0xC0000155: return "STATUS_LOGON_NOT_GRANTED"; + case 0xC0000156: return "STATUS_TOO_MANY_SECRETS"; + case 0xC0000157: return "STATUS_SECRET_TOO_LONG"; + case 0xC0000158: return "STATUS_INTERNAL_DB_ERROR"; + case 0xC0000159: return "STATUS_FULLSCREEN_MODE"; + case 0xC000015A: return "STATUS_TOO_MANY_CONTEXT_IDS"; + case 0xC000015B: return "STATUS_LOGON_TYPE_NOT_GRANTED"; + case 0xC000015C: return "STATUS_NOT_REGISTRY_FILE"; + case 0xC000015D: return "STATUS_NT_CROSS_ENCRYPTION_REQUIRED"; + case 0xC000015E: return "STATUS_DOMAIN_CTRLR_CONFIG_ERROR"; + case 0xC000015F: return "STATUS_FT_MISSING_MEMBER"; + case 0xC0000160: return "STATUS_ILL_FORMED_SERVICE_ENTRY"; + case 0xC0000161: return "STATUS_ILLEGAL_CHARACTER"; + case 0xC0000162: return "STATUS_UNMAPPABLE_CHARACTER"; + case 0xC0000163: return "STATUS_UNDEFINED_CHARACTER"; + case 0xC0000164: return "STATUS_FLOPPY_VOLUME"; + case 0xC0000165: return "STATUS_FLOPPY_ID_MARK_NOT_FOUND"; + case 0xC0000166: return "STATUS_FLOPPY_WRONG_CYLINDER"; + case 0xC0000167: return "STATUS_FLOPPY_UNKNOWN_ERROR"; + case 0xC0000168: return "STATUS_FLOPPY_BAD_REGISTERS"; + case 0xC0000169: return "STATUS_DISK_RECALIBRATE_FAILED"; + case 0xC000016A: return "STATUS_DISK_OPERATION_FAILED"; + case 0xC000016B: return "STATUS_DISK_RESET_FAILED"; + case 0xC000016C: return "STATUS_SHARED_IRQ_BUSY"; + case 0xC000016D: return "STATUS_FT_ORPHANING"; + case 0xC000016E: return "STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT"; + case 0xC0000172: return "STATUS_PARTITION_FAILURE"; + case 0xC0000173: return "STATUS_INVALID_BLOCK_LENGTH"; + case 0xC0000174: return "STATUS_DEVICE_NOT_PARTITIONED"; + case 0xC0000175: return "STATUS_UNABLE_TO_LOCK_MEDIA"; + case 0xC0000176: return "STATUS_UNABLE_TO_UNLOAD_MEDIA"; + case 0xC0000177: return "STATUS_EOM_OVERFLOW"; + case 0xC0000178: return "STATUS_NO_MEDIA"; + case 0xC000017A: return "STATUS_NO_SUCH_MEMBER"; + case 0xC000017B: return "STATUS_INVALID_MEMBER"; + case 0xC000017C: return "STATUS_KEY_DELETED"; + case 0xC000017D: return "STATUS_NO_LOG_SPACE"; + case 0xC000017E: return "STATUS_TOO_MANY_SIDS"; + case 0xC000017F: return "STATUS_LM_CROSS_ENCRYPTION_REQUIRED"; + case 
0xC0000180: return "STATUS_KEY_HAS_CHILDREN"; + case 0xC0000181: return "STATUS_CHILD_MUST_BE_VOLATILE"; + case 0xC0000182: return "STATUS_DEVICE_CONFIGURATION_ERROR"; + case 0xC0000183: return "STATUS_DRIVER_INTERNAL_ERROR"; + case 0xC0000184: return "STATUS_INVALID_DEVICE_STATE"; + case 0xC0000185: return "STATUS_IO_DEVICE_ERROR"; + case 0xC0000186: return "STATUS_DEVICE_PROTOCOL_ERROR"; + case 0xC0000187: return "STATUS_BACKUP_CONTROLLER"; + case 0xC0000188: return "STATUS_LOG_FILE_FULL"; + case 0xC0000189: return "STATUS_TOO_LATE"; + case 0xC000018A: return "STATUS_NO_TRUST_LSA_SECRET"; + case 0xC000018B: return "STATUS_NO_TRUST_SAM_ACCOUNT"; + case 0xC000018C: return "STATUS_TRUSTED_DOMAIN_FAILURE"; + case 0xC000018D: return "STATUS_TRUSTED_RELATIONSHIP_FAILURE"; + case 0xC000018E: return "STATUS_EVENTLOG_FILE_CORRUPT"; + case 0xC000018F: return "STATUS_EVENTLOG_CANT_START"; + case 0xC0000190: return "STATUS_TRUST_FAILURE"; + case 0xC0000191: return "STATUS_MUTANT_LIMIT_EXCEEDED"; + case 0xC0000192: return "STATUS_NETLOGON_NOT_STARTED"; + case 0xC0000193: return "STATUS_ACCOUNT_EXPIRED"; + case 0xC0000194: return "STATUS_POSSIBLE_DEADLOCK"; + case 0xC0000195: return "STATUS_NETWORK_CREDENTIAL_CONFLICT"; + case 0xC0000196: return "STATUS_REMOTE_SESSION_LIMIT"; + case 0xC0000197: return "STATUS_EVENTLOG_FILE_CHANGED"; + case 0xC0000198: return "STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT"; + case 0xC0000199: return "STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT"; + case 0xC000019A: return "STATUS_NOLOGON_SERVER_TRUST_ACCOUNT"; + case 0xC000019B: return "STATUS_DOMAIN_TRUST_INCONSISTENT"; + case 0xC000019C: return "STATUS_FS_DRIVER_REQUIRED"; + case 0xC0000202: return "STATUS_NO_USER_SESSION_KEY"; + case 0xC0000203: return "STATUS_USER_SESSION_DELETED"; + case 0xC0000204: return "STATUS_RESOURCE_LANG_NOT_FOUND"; + case 0xC0000205: return "STATUS_INSUFF_SERVER_RESOURCES"; + case 0xC0000206: return "STATUS_INVALID_BUFFER_SIZE"; + case 0xC0000207: return "STATUS_INVALID_ADDRESS_COMPONENT"; + case 0xC0000208: return "STATUS_INVALID_ADDRESS_WILDCARD"; + case 0xC0000209: return "STATUS_TOO_MANY_ADDRESSES"; + case 0xC000020A: return "STATUS_ADDRESS_ALREADY_EXISTS"; + case 0xC000020B: return "STATUS_ADDRESS_CLOSED"; + case 0xC000020C: return "STATUS_CONNECTION_DISCONNECTED"; + case 0xC000020D: return "STATUS_CONNECTION_RESET"; + case 0xC000020E: return "STATUS_TOO_MANY_NODES"; + case 0xC000020F: return "STATUS_TRANSACTION_ABORTED"; + case 0xC0000210: return "STATUS_TRANSACTION_TIMED_OUT"; + case 0xC0000211: return "STATUS_TRANSACTION_NO_RELEASE"; + case 0xC0000212: return "STATUS_TRANSACTION_NO_MATCH"; + case 0xC0000213: return "STATUS_TRANSACTION_RESPONDED"; + case 0xC0000214: return "STATUS_TRANSACTION_INVALID_ID"; + case 0xC0000215: return "STATUS_TRANSACTION_INVALID_TYPE"; + case 0xC0000216: return "STATUS_NOT_SERVER_SESSION"; + case 0xC0000217: return "STATUS_NOT_CLIENT_SESSION"; + case 0xC0000218: return "STATUS_CANNOT_LOAD_REGISTRY_FILE"; + case 0xC0000219: return "STATUS_DEBUG_ATTACH_FAILED"; + case 0xC000021A: return "STATUS_SYSTEM_PROCESS_TERMINATED"; + case 0xC000021B: return "STATUS_DATA_NOT_ACCEPTED"; + case 0xC000021C: return "STATUS_NO_BROWSER_SERVERS_FOUND"; + case 0xC000021D: return "STATUS_VDM_HARD_ERROR"; + case 0xC000021E: return "STATUS_DRIVER_CANCEL_TIMEOUT"; + case 0xC000021F: return "STATUS_REPLY_MESSAGE_MISMATCH"; + case 0xC0000220: return "STATUS_MAPPED_ALIGNMENT"; + case 0xC0000221: return "STATUS_IMAGE_CHECKSUM_MISMATCH"; + case 0xC0000222: return 
"STATUS_LOST_WRITEBEHIND_DATA"; + case 0xC0000223: return "STATUS_CLIENT_SERVER_PARAMETERS_INVALID"; + case 0xC0000224: return "STATUS_PASSWORD_MUST_CHANGE"; + case 0xC0000225: return "STATUS_NOT_FOUND"; + case 0xC0000226: return "STATUS_NOT_TINY_STREAM"; + case 0xC0000227: return "STATUS_RECOVERY_FAILURE"; + case 0xC0000228: return "STATUS_STACK_OVERFLOW_READ"; + case 0xC0000229: return "STATUS_FAIL_CHECK"; + case 0xC000022A: return "STATUS_DUPLICATE_OBJECTID"; + case 0xC000022B: return "STATUS_OBJECTID_EXISTS"; + case 0xC000022C: return "STATUS_CONVERT_TO_LARGE"; + case 0xC000022D: return "STATUS_RETRY"; + case 0xC000022E: return "STATUS_FOUND_OUT_OF_SCOPE"; + case 0xC000022F: return "STATUS_ALLOCATE_BUCKET"; + case 0xC0000230: return "STATUS_PROPSET_NOT_FOUND"; + case 0xC0000231: return "STATUS_MARSHALL_OVERFLOW"; + case 0xC0000232: return "STATUS_INVALID_VARIANT"; + case 0xC0000233: return "STATUS_DOMAIN_CONTROLLER_NOT_FOUND"; + case 0xC0000234: return "STATUS_ACCOUNT_LOCKED_OUT"; + case 0xC0000235: return "STATUS_HANDLE_NOT_CLOSABLE"; + case 0xC0000236: return "STATUS_CONNECTION_REFUSED"; + case 0xC0000237: return "STATUS_GRACEFUL_DISCONNECT"; + case 0xC0000238: return "STATUS_ADDRESS_ALREADY_ASSOCIATED"; + case 0xC0000239: return "STATUS_ADDRESS_NOT_ASSOCIATED"; + case 0xC000023A: return "STATUS_CONNECTION_INVALID"; + case 0xC000023B: return "STATUS_CONNECTION_ACTIVE"; + case 0xC000023C: return "STATUS_NETWORK_UNREACHABLE"; + case 0xC000023D: return "STATUS_HOST_UNREACHABLE"; + case 0xC000023E: return "STATUS_PROTOCOL_UNREACHABLE"; + case 0xC000023F: return "STATUS_PORT_UNREACHABLE"; + case 0xC0000240: return "STATUS_REQUEST_ABORTED"; + case 0xC0000241: return "STATUS_CONNECTION_ABORTED"; + case 0xC0000242: return "STATUS_BAD_COMPRESSION_BUFFER"; + case 0xC0000243: return "STATUS_USER_MAPPED_FILE"; + case 0xC0000244: return "STATUS_AUDIT_FAILED"; + case 0xC0000245: return "STATUS_TIMER_RESOLUTION_NOT_SET"; + case 0xC0000246: return "STATUS_CONNECTION_COUNT_LIMIT"; + case 0xC0000247: return "STATUS_LOGIN_TIME_RESTRICTION"; + case 0xC0000248: return "STATUS_LOGIN_WKSTA_RESTRICTION"; + case 0xC0000249: return "STATUS_IMAGE_MP_UP_MISMATCH"; + case 0xC0000250: return "STATUS_INSUFFICIENT_LOGON_INFO"; + case 0xC0000251: return "STATUS_BAD_DLL_ENTRYPOINT"; + case 0xC0000252: return "STATUS_BAD_SERVICE_ENTRYPOINT"; + case 0xC0000253: return "STATUS_LPC_REPLY_LOST"; + case 0xC0000254: return "STATUS_IP_ADDRESS_CONFLICT1"; + case 0xC0000255: return "STATUS_IP_ADDRESS_CONFLICT2"; + case 0xC0000256: return "STATUS_REGISTRY_QUOTA_LIMIT"; + case 0xC0000257: return "STATUS_PATH_NOT_COVERED"; + case 0xC0000258: return "STATUS_NO_CALLBACK_ACTIVE"; + case 0xC0000259: return "STATUS_LICENSE_QUOTA_EXCEEDED"; + case 0xC000025A: return "STATUS_PWD_TOO_SHORT"; + case 0xC000025B: return "STATUS_PWD_TOO_RECENT"; + case 0xC000025C: return "STATUS_PWD_HISTORY_CONFLICT"; + case 0xC000025E: return "STATUS_PLUGPLAY_NO_DEVICE"; + case 0xC000025F: return "STATUS_UNSUPPORTED_COMPRESSION"; + case 0xC0000260: return "STATUS_INVALID_HW_PROFILE"; + case 0xC0000261: return "STATUS_INVALID_PLUGPLAY_DEVICE_PATH"; + case 0xC0000262: return "STATUS_DRIVER_ORDINAL_NOT_FOUND"; + case 0xC0000263: return "STATUS_DRIVER_ENTRYPOINT_NOT_FOUND"; + case 0xC0000264: return "STATUS_RESOURCE_NOT_OWNED"; + case 0xC0000265: return "STATUS_TOO_MANY_LINKS"; + case 0xC0000266: return "STATUS_QUOTA_LIST_INCONSISTENT"; + case 0xC0000267: return "STATUS_FILE_IS_OFFLINE"; + case 0xC0000268: return "STATUS_EVALUATION_EXPIRATION"; + case 
0xC0000269: return "STATUS_ILLEGAL_DLL_RELOCATION"; + case 0xC000026A: return "STATUS_LICENSE_VIOLATION"; + case 0xC000026B: return "STATUS_DLL_INIT_FAILED_LOGOFF"; + case 0xC000026C: return "STATUS_DRIVER_UNABLE_TO_LOAD"; + case 0xC000026D: return "STATUS_DFS_UNAVAILABLE"; + case 0xC000026E: return "STATUS_VOLUME_DISMOUNTED"; + case 0xC000026F: return "STATUS_WX86_INTERNAL_ERROR"; + case 0xC0000270: return "STATUS_WX86_FLOAT_STACK_CHECK"; + case 0xC0000271: return "STATUS_VALIDATE_CONTINUE"; + case 0xC0000272: return "STATUS_NO_MATCH"; + case 0xC0000273: return "STATUS_NO_MORE_MATCHES"; + case 0xC0000275: return "STATUS_NOT_A_REPARSE_POINT"; + case 0xC0000276: return "STATUS_IO_REPARSE_TAG_INVALID"; + case 0xC0000277: return "STATUS_IO_REPARSE_TAG_MISMATCH"; + case 0xC0000278: return "STATUS_IO_REPARSE_DATA_INVALID"; + case 0xC0000279: return "STATUS_IO_REPARSE_TAG_NOT_HANDLED"; + case 0xC0000280: return "STATUS_REPARSE_POINT_NOT_RESOLVED"; + case 0xC0000281: return "STATUS_DIRECTORY_IS_A_REPARSE_POINT"; + case 0xC0000282: return "STATUS_RANGE_LIST_CONFLICT"; + case 0xC0000283: return "STATUS_SOURCE_ELEMENT_EMPTY"; + case 0xC0000284: return "STATUS_DESTINATION_ELEMENT_FULL"; + case 0xC0000285: return "STATUS_ILLEGAL_ELEMENT_ADDRESS"; + case 0xC0000286: return "STATUS_MAGAZINE_NOT_PRESENT"; + case 0xC0000287: return "STATUS_REINITIALIZATION_NEEDED"; + case 0x80000288: return "STATUS_DEVICE_REQUIRES_CLEANING"; + case 0x80000289: return "STATUS_DEVICE_DOOR_OPEN"; + case 0xC000028A: return "STATUS_ENCRYPTION_FAILED"; + case 0xC000028B: return "STATUS_DECRYPTION_FAILED"; + case 0xC000028C: return "STATUS_RANGE_NOT_FOUND"; + case 0xC000028D: return "STATUS_NO_RECOVERY_POLICY"; + case 0xC000028E: return "STATUS_NO_EFS"; + case 0xC000028F: return "STATUS_WRONG_EFS"; + case 0xC0000290: return "STATUS_NO_USER_KEYS"; + case 0xC0000291: return "STATUS_FILE_NOT_ENCRYPTED"; + case 0xC0000292: return "STATUS_NOT_EXPORT_FORMAT"; + case 0xC0000293: return "STATUS_FILE_ENCRYPTED"; + case 0x40000294: return "STATUS_WAKE_SYSTEM"; + case 0xC0000295: return "STATUS_WMI_GUID_NOT_FOUND"; + case 0xC0000296: return "STATUS_WMI_INSTANCE_NOT_FOUND"; + case 0xC0000297: return "STATUS_WMI_ITEMID_NOT_FOUND"; + case 0xC0000298: return "STATUS_WMI_TRY_AGAIN"; + case 0xC0000299: return "STATUS_SHARED_POLICY"; + case 0xC000029A: return "STATUS_POLICY_OBJECT_NOT_FOUND"; + case 0xC000029B: return "STATUS_POLICY_ONLY_IN_DS"; + case 0xC000029C: return "STATUS_VOLUME_NOT_UPGRADED"; + case 0xC000029D: return "STATUS_REMOTE_STORAGE_NOT_ACTIVE"; + case 0xC000029E: return "STATUS_REMOTE_STORAGE_MEDIA_ERROR"; + case 0xC000029F: return "STATUS_NO_TRACKING_SERVICE"; + case 0xC00002A0: return "STATUS_SERVER_SID_MISMATCH"; + case 0xC00002A1: return "STATUS_DS_NO_ATTRIBUTE_OR_VALUE"; + case 0xC00002A2: return "STATUS_DS_INVALID_ATTRIBUTE_SYNTAX"; + case 0xC00002A3: return "STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED"; + case 0xC00002A4: return "STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS"; + case 0xC00002A5: return "STATUS_DS_BUSY"; + case 0xC00002A6: return "STATUS_DS_UNAVAILABLE"; + case 0xC00002A7: return "STATUS_DS_NO_RIDS_ALLOCATED"; + case 0xC00002A8: return "STATUS_DS_NO_MORE_RIDS"; + case 0xC00002A9: return "STATUS_DS_INCORRECT_ROLE_OWNER"; + case 0xC00002AA: return "STATUS_DS_RIDMGR_INIT_ERROR"; + case 0xC00002AB: return "STATUS_DS_OBJ_CLASS_VIOLATION"; + case 0xC00002AC: return "STATUS_DS_CANT_ON_NON_LEAF"; + case 0xC00002AD: return "STATUS_DS_CANT_ON_RDN"; + case 0xC00002AE: return "STATUS_DS_CANT_MOD_OBJ_CLASS"; + case 0xC00002AF: return 
"STATUS_DS_CROSS_DOM_MOVE_FAILED"; + case 0xC00002B0: return "STATUS_DS_GC_NOT_AVAILABLE"; + case 0xC00002B1: return "STATUS_DIRECTORY_SERVICE_REQUIRED"; + case 0xC00002B2: return "STATUS_REPARSE_ATTRIBUTE_CONFLICT"; + case 0xC00002B3: return "STATUS_CANT_ENABLE_DENY_ONLY"; + case 0xC00002B4: return "STATUS_FLOAT_MULTIPLE_FAULTS"; + case 0xC00002B5: return "STATUS_FLOAT_MULTIPLE_TRAPS"; + case 0xC00002B6: return "STATUS_DEVICE_REMOVED"; + case 0xC00002B7: return "STATUS_JOURNAL_DELETE_IN_PROGRESS"; + case 0xC00002B8: return "STATUS_JOURNAL_NOT_ACTIVE"; + case 0xC00002B9: return "STATUS_NOINTERFACE"; + case 0xC00002C1: return "STATUS_DS_ADMIN_LIMIT_EXCEEDED"; + case 0xC00002C2: return "STATUS_DRIVER_FAILED_SLEEP"; + case 0xC00002C3: return "STATUS_MUTUAL_AUTHENTICATION_FAILED"; + case 0xC00002C4: return "STATUS_CORRUPT_SYSTEM_FILE"; + case 0xC00002C5: return "STATUS_DATATYPE_MISALIGNMENT_ERROR"; + case 0xC00002C6: return "STATUS_WMI_READ_ONLY"; + case 0xC00002C7: return "STATUS_WMI_SET_FAILURE"; + case 0xC00002C8: return "STATUS_COMMITMENT_MINIMUM"; + case 0xC00002C9: return "STATUS_REG_NAT_CONSUMPTION"; + case 0xC00002CA: return "STATUS_TRANSPORT_FULL"; + case 0xC00002CB: return "STATUS_DS_SAM_INIT_FAILURE"; + case 0xC00002CC: return "STATUS_ONLY_IF_CONNECTED"; + case 0xC00002CD: return "STATUS_DS_SENSITIVE_GROUP_VIOLATION"; + case 0xC00002CE: return "STATUS_PNP_RESTART_ENUMERATION"; + case 0xC00002CF: return "STATUS_JOURNAL_ENTRY_DELETED"; + case 0xC00002D0: return "STATUS_DS_CANT_MOD_PRIMARYGROUPID"; + case 0xC00002D1: return "STATUS_SYSTEM_IMAGE_BAD_SIGNATURE"; + case 0xC00002D2: return "STATUS_PNP_REBOOT_REQUIRED"; + case 0xC00002D3: return "STATUS_POWER_STATE_INVALID"; + case 0xC00002D4: return "STATUS_DS_INVALID_GROUP_TYPE"; + case 0xC00002D5: return "STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN"; + case 0xC00002D6: return "STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN"; + case 0xC00002D7: return "STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER"; + case 0xC00002D8: return "STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER"; + case 0xC00002D9: return "STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER"; + case 0xC00002DA: return "STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER"; + case 0xC00002DB: return "STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER"; + case 0xC00002DC: return "STATUS_DS_HAVE_PRIMARY_MEMBERS"; + case 0xC00002DD: return "STATUS_WMI_NOT_SUPPORTED"; + case 0xC00002DE: return "STATUS_INSUFFICIENT_POWER"; + case 0xC00002DF: return "STATUS_SAM_NEED_BOOTKEY_PASSWORD"; + case 0xC00002E0: return "STATUS_SAM_NEED_BOOTKEY_FLOPPY"; + case 0xC00002E1: return "STATUS_DS_CANT_START"; + case 0xC00002E2: return "STATUS_DS_INIT_FAILURE"; + case 0xC00002E3: return "STATUS_SAM_INIT_FAILURE"; + case 0xC00002E4: return "STATUS_DS_GC_REQUIRED"; + case 0xC00002E5: return "STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY"; + case 0xC00002E6: return "STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS"; + case 0xC00002E7: return "STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED"; + case 0xC00002E8: return "STATUS_MULTIPLE_FAULT_VIOLATION"; + case 0xC0000300: return "STATUS_NOT_SUPPORTED_ON_SBS"; + case 0xC0009898: return "STATUS_WOW_ASSERTION"; + case 0xC0010001: return "DBG_NO_STATE_CHANGE"; + case 0xC0010002: return "DBG_APP_NOT_IDLE"; + case 0xC0020001: return "RPC_NT_INVALID_STRING_BINDING"; + case 0xC0020002: return "RPC_NT_WRONG_KIND_OF_BINDING"; + case 0xC0020003: return "RPC_NT_INVALID_BINDING"; + case 0xC0020004: return "RPC_NT_PROTSEQ_NOT_SUPPORTED"; + case 0xC0020005: return "RPC_NT_INVALID_RPC_PROTSEQ"; + case 0xC0020006: return 
"RPC_NT_INVALID_STRING_UUID"; + case 0xC0020007: return "RPC_NT_INVALID_ENDPOINT_FORMAT"; + case 0xC0020008: return "RPC_NT_INVALID_NET_ADDR"; + case 0xC0020009: return "RPC_NT_NO_ENDPOINT_FOUND"; + case 0xC002000A: return "RPC_NT_INVALID_TIMEOUT"; + case 0xC002000B: return "RPC_NT_OBJECT_NOT_FOUND"; + case 0xC002000C: return "RPC_NT_ALREADY_REGISTERED"; + case 0xC002000D: return "RPC_NT_TYPE_ALREADY_REGISTERED"; + case 0xC002000E: return "RPC_NT_ALREADY_LISTENING"; + case 0xC002000F: return "RPC_NT_NO_PROTSEQS_REGISTERED"; + case 0xC0020010: return "RPC_NT_NOT_LISTENING"; + case 0xC0020011: return "RPC_NT_UNKNOWN_MGR_TYPE"; + case 0xC0020012: return "RPC_NT_UNKNOWN_IF"; + case 0xC0020013: return "RPC_NT_NO_BINDINGS"; + case 0xC0020014: return "RPC_NT_NO_PROTSEQS"; + case 0xC0020015: return "RPC_NT_CANT_CREATE_ENDPOINT"; + case 0xC0020016: return "RPC_NT_OUT_OF_RESOURCES"; + case 0xC0020017: return "RPC_NT_SERVER_UNAVAILABLE"; + case 0xC0020018: return "RPC_NT_SERVER_TOO_BUSY"; + case 0xC0020019: return "RPC_NT_INVALID_NETWORK_OPTIONS"; + case 0xC002001A: return "RPC_NT_NO_CALL_ACTIVE"; + case 0xC002001B: return "RPC_NT_CALL_FAILED"; + case 0xC002001C: return "RPC_NT_CALL_FAILED_DNE"; + case 0xC002001D: return "RPC_NT_PROTOCOL_ERROR"; + case 0xC002001F: return "RPC_NT_UNSUPPORTED_TRANS_SYN"; + case 0xC0020021: return "RPC_NT_UNSUPPORTED_TYPE"; + case 0xC0020022: return "RPC_NT_INVALID_TAG"; + case 0xC0020023: return "RPC_NT_INVALID_BOUND"; + case 0xC0020024: return "RPC_NT_NO_ENTRY_NAME"; + case 0xC0020025: return "RPC_NT_INVALID_NAME_SYNTAX"; + case 0xC0020026: return "RPC_NT_UNSUPPORTED_NAME_SYNTAX"; + case 0xC0020028: return "RPC_NT_UUID_NO_ADDRESS"; + case 0xC0020029: return "RPC_NT_DUPLICATE_ENDPOINT"; + case 0xC002002A: return "RPC_NT_UNKNOWN_AUTHN_TYPE"; + case 0xC002002B: return "RPC_NT_MAX_CALLS_TOO_SMALL"; + case 0xC002002C: return "RPC_NT_STRING_TOO_LONG"; + case 0xC002002D: return "RPC_NT_PROTSEQ_NOT_FOUND"; + case 0xC002002E: return "RPC_NT_PROCNUM_OUT_OF_RANGE"; + case 0xC002002F: return "RPC_NT_BINDING_HAS_NO_AUTH"; + case 0xC0020030: return "RPC_NT_UNKNOWN_AUTHN_SERVICE"; + case 0xC0020031: return "RPC_NT_UNKNOWN_AUTHN_LEVEL"; + case 0xC0020032: return "RPC_NT_INVALID_AUTH_IDENTITY"; + case 0xC0020033: return "RPC_NT_UNKNOWN_AUTHZ_SERVICE"; + case 0xC0020034: return "EPT_NT_INVALID_ENTRY"; + case 0xC0020035: return "EPT_NT_CANT_PERFORM_OP"; + case 0xC0020036: return "EPT_NT_NOT_REGISTERED"; + case 0xC0020037: return "RPC_NT_NOTHING_TO_EXPORT"; + case 0xC0020038: return "RPC_NT_INCOMPLETE_NAME"; + case 0xC0020039: return "RPC_NT_INVALID_VERS_OPTION"; + case 0xC002003A: return "RPC_NT_NO_MORE_MEMBERS"; + case 0xC002003B: return "RPC_NT_NOT_ALL_OBJS_UNEXPORTED"; + case 0xC002003C: return "RPC_NT_INTERFACE_NOT_FOUND"; + case 0xC002003D: return "RPC_NT_ENTRY_ALREADY_EXISTS"; + case 0xC002003E: return "RPC_NT_ENTRY_NOT_FOUND"; + case 0xC002003F: return "RPC_NT_NAME_SERVICE_UNAVAILABLE"; + case 0xC0020040: return "RPC_NT_INVALID_NAF_ID"; + case 0xC0020041: return "RPC_NT_CANNOT_SUPPORT"; + case 0xC0020042: return "RPC_NT_NO_CONTEXT_AVAILABLE"; + case 0xC0020043: return "RPC_NT_INTERNAL_ERROR"; + case 0xC0020044: return "RPC_NT_ZERO_DIVIDE"; + case 0xC0020045: return "RPC_NT_ADDRESS_ERROR"; + case 0xC0020046: return "RPC_NT_FP_DIV_ZERO"; + case 0xC0020047: return "RPC_NT_FP_UNDERFLOW"; + case 0xC0020048: return "RPC_NT_FP_OVERFLOW"; + case 0xC0030001: return "RPC_NT_NO_MORE_ENTRIES"; + case 0xC0030002: return "RPC_NT_SS_CHAR_TRANS_OPEN_FAIL"; + case 0xC0030003: return 
"RPC_NT_SS_CHAR_TRANS_SHORT_FILE"; + case 0xC0030004: return "RPC_NT_SS_IN_NULL_CONTEXT"; + case 0xC0030005: return "RPC_NT_SS_CONTEXT_MISMATCH"; + case 0xC0030006: return "RPC_NT_SS_CONTEXT_DAMAGED"; + case 0xC0030007: return "RPC_NT_SS_HANDLES_MISMATCH"; + case 0xC0030008: return "RPC_NT_SS_CANNOT_GET_CALL_HANDLE"; + case 0xC0030009: return "RPC_NT_NULL_REF_POINTER"; + case 0xC003000A: return "RPC_NT_ENUM_VALUE_OUT_OF_RANGE"; + case 0xC003000B: return "RPC_NT_BYTE_COUNT_TOO_SMALL"; + case 0xC003000C: return "RPC_NT_BAD_STUB_DATA"; + case 0xC0020049: return "RPC_NT_CALL_IN_PROGRESS"; + case 0xC002004A: return "RPC_NT_NO_MORE_BINDINGS"; + case 0xC002004B: return "RPC_NT_GROUP_MEMBER_NOT_FOUND"; + case 0xC002004C: return "EPT_NT_CANT_CREATE"; + case 0xC002004D: return "RPC_NT_INVALID_OBJECT"; + case 0xC002004F: return "RPC_NT_NO_INTERFACES"; + case 0xC0020050: return "RPC_NT_CALL_CANCELLED"; + case 0xC0020051: return "RPC_NT_BINDING_INCOMPLETE"; + case 0xC0020052: return "RPC_NT_COMM_FAILURE"; + case 0xC0020053: return "RPC_NT_UNSUPPORTED_AUTHN_LEVEL"; + case 0xC0020054: return "RPC_NT_NO_PRINC_NAME"; + case 0xC0020055: return "RPC_NT_NOT_RPC_ERROR"; + case 0x40020056: return "RPC_NT_UUID_LOCAL_ONLY"; + case 0xC0020057: return "RPC_NT_SEC_PKG_ERROR"; + case 0xC0020058: return "RPC_NT_NOT_CANCELLED"; + case 0xC0030059: return "RPC_NT_INVALID_ES_ACTION"; + case 0xC003005A: return "RPC_NT_WRONG_ES_VERSION"; + case 0xC003005B: return "RPC_NT_WRONG_STUB_VERSION"; + case 0xC003005C: return "RPC_NT_INVALID_PIPE_OBJECT"; + case 0xC003005D: return "RPC_NT_INVALID_PIPE_OPERATION"; + case 0xC003005E: return "RPC_NT_WRONG_PIPE_VERSION"; + case 0xC003005F: return "RPC_NT_PIPE_CLOSED"; + case 0xC0030060: return "RPC_NT_PIPE_DISCIPLINE_ERROR"; + case 0xC0030061: return "RPC_NT_PIPE_EMPTY"; + case 0xC0020062: return "RPC_NT_INVALID_ASYNC_HANDLE"; + case 0xC0020063: return "RPC_NT_INVALID_ASYNC_CALL"; + case 0x400200AF: return "RPC_NT_SEND_INCOMPLETE"; + case 0xC0140001: return "STATUS_ACPI_INVALID_OPCODE"; + case 0xC0140002: return "STATUS_ACPI_STACK_OVERFLOW"; + case 0xC0140003: return "STATUS_ACPI_ASSERT_FAILED"; + case 0xC0140004: return "STATUS_ACPI_INVALID_INDEX"; + case 0xC0140005: return "STATUS_ACPI_INVALID_ARGUMENT"; + case 0xC0140006: return "STATUS_ACPI_FATAL"; + case 0xC0140007: return "STATUS_ACPI_INVALID_SUPERNAME"; + case 0xC0140008: return "STATUS_ACPI_INVALID_ARGTYPE"; + case 0xC0140009: return "STATUS_ACPI_INVALID_OBJTYPE"; + case 0xC014000A: return "STATUS_ACPI_INVALID_TARGETTYPE"; + case 0xC014000B: return "STATUS_ACPI_INCORRECT_ARGUMENT_COUNT"; + case 0xC014000C: return "STATUS_ACPI_ADDRESS_NOT_MAPPED"; + case 0xC014000D: return "STATUS_ACPI_INVALID_EVENTTYPE"; + case 0xC014000E: return "STATUS_ACPI_HANDLER_COLLISION"; + case 0xC014000F: return "STATUS_ACPI_INVALID_DATA"; + case 0xC0140010: return "STATUS_ACPI_INVALID_REGION"; + case 0xC0140011: return "STATUS_ACPI_INVALID_ACCESS_SIZE"; + case 0xC0140012: return "STATUS_ACPI_ACQUIRE_GLOBAL_LOCK"; + case 0xC0140013: return "STATUS_ACPI_ALREADY_INITIALIZED"; + case 0xC0140014: return "STATUS_ACPI_NOT_INITIALIZED"; + case 0xC0140015: return "STATUS_ACPI_INVALID_MUTEX_LEVEL"; + case 0xC0140016: return "STATUS_ACPI_MUTEX_NOT_OWNED"; + case 0xC0140017: return "STATUS_ACPI_MUTEX_NOT_OWNER"; + case 0xC0140018: return "STATUS_ACPI_RS_ACCESS"; + case 0xC0140019: return "STATUS_ACPI_INVALID_TABLE"; + case 0xC0140020: return "STATUS_ACPI_REG_HANDLER_FAILED"; + case 0xC0140021: return "STATUS_ACPI_POWER_REQUEST_FAILED"; + case 0xC00A0001: return 
"STATUS_CTX_WINSTATION_NAME_INVALID"; + case 0xC00A0002: return "STATUS_CTX_INVALID_PD"; + case 0xC00A0003: return "STATUS_CTX_PD_NOT_FOUND"; + case 0x400A0004: return "STATUS_CTX_CDM_CONNECT"; + case 0x400A0005: return "STATUS_CTX_CDM_DISCONNECT"; + case 0xC00A0006: return "STATUS_CTX_CLOSE_PENDING"; + case 0xC00A0007: return "STATUS_CTX_NO_OUTBUF"; + case 0xC00A0008: return "STATUS_CTX_MODEM_INF_NOT_FOUND"; + case 0xC00A0009: return "STATUS_CTX_INVALID_MODEMNAME"; + case 0xC00A000A: return "STATUS_CTX_RESPONSE_ERROR"; + case 0xC00A000B: return "STATUS_CTX_MODEM_RESPONSE_TIMEOUT"; + case 0xC00A000C: return "STATUS_CTX_MODEM_RESPONSE_NO_CARRIER"; + case 0xC00A000D: return "STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE"; + case 0xC00A000E: return "STATUS_CTX_MODEM_RESPONSE_BUSY"; + case 0xC00A000F: return "STATUS_CTX_MODEM_RESPONSE_VOICE"; + case 0xC00A0010: return "STATUS_CTX_TD_ERROR"; + case 0xC00A0012: return "STATUS_CTX_LICENSE_CLIENT_INVALID"; + case 0xC00A0013: return "STATUS_CTX_LICENSE_NOT_AVAILABLE"; + case 0xC00A0014: return "STATUS_CTX_LICENSE_EXPIRED"; + case 0xC00A0015: return "STATUS_CTX_WINSTATION_NOT_FOUND"; + case 0xC00A0016: return "STATUS_CTX_WINSTATION_NAME_COLLISION"; + case 0xC00A0017: return "STATUS_CTX_WINSTATION_BUSY"; + case 0xC00A0018: return "STATUS_CTX_BAD_VIDEO_MODE"; + case 0xC00A0022: return "STATUS_CTX_GRAPHICS_INVALID"; + case 0xC00A0024: return "STATUS_CTX_NOT_CONSOLE"; + case 0xC00A0026: return "STATUS_CTX_CLIENT_QUERY_TIMEOUT"; + case 0xC00A0027: return "STATUS_CTX_CONSOLE_DISCONNECT"; + case 0xC00A0028: return "STATUS_CTX_CONSOLE_CONNECT"; + case 0xC00A002A: return "STATUS_CTX_SHADOW_DENIED"; + case 0xC00A002B: return "STATUS_CTX_WINSTATION_ACCESS_DENIED"; + case 0xC00A002E: return "STATUS_CTX_INVALID_WD"; + case 0xC00A002F: return "STATUS_CTX_WD_NOT_FOUND"; + case 0xC00A0030: return "STATUS_CTX_SHADOW_INVALID"; + case 0xC00A0031: return "STATUS_CTX_SHADOW_DISABLED"; + case 0xC00A0032: return "STATUS_RDP_PROTOCOL_ERROR"; + case 0xC00A0033: return "STATUS_CTX_CLIENT_LICENSE_NOT_SET"; + case 0xC00A0034: return "STATUS_CTX_CLIENT_LICENSE_IN_USE"; + case 0xC0040035: return "STATUS_PNP_BAD_MPS_TABLE"; + case 0xC0040036: return "STATUS_PNP_TRANSLATION_FAILED"; + case 0xC0040037: return "STATUS_PNP_IRQ_TRANSLATION_FAILED"; + default: return "STATUS_UNKNOWN"; + } +} + + +/* + * KsPrintf + * This function is variable-argument, level-sensitive debug print routine. + * If the specified debug level for the print statement is lower or equal + * to the current debug level, the message will be printed. + * + * Arguments: + * DebugPrintLevel - Specifies at which debugging level the string should + * be printed + * DebugMessage - Variable argument ascii c string + * + * Return Value: + * N/A + * + * NOTES: + * N/A + */ + +VOID +KsPrintf( + LONG DebugPrintLevel, + PCHAR DebugMessage, + ... + ) +{ + va_list ap; + + va_start(ap, DebugMessage); + + if (DebugPrintLevel <= KsDebugLevel) + { + CHAR buffer[0x200]; + + vsprintf(buffer, DebugMessage, ap); + + KdPrint(("TID:%8.8x: %s", PsGetCurrentThread(), buffer)); + } + + va_end(ap); + +} // KsPrint() + +#endif diff --git a/lnet/libcfs/winnt/winnt-fs.c b/lnet/libcfs/winnt/winnt-fs.c new file mode 100644 index 0000000..128781b --- /dev/null +++ b/lnet/libcfs/winnt/winnt-fs.c @@ -0,0 +1,541 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: + * + * Copyright (C) 2001 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. 
diff --git a/lnet/libcfs/winnt/winnt-fs.c b/lnet/libcfs/winnt/winnt-fs.c
new file mode 100644
index 0000000..128781b
--- /dev/null
+++ b/lnet/libcfs/winnt/winnt-fs.c
@@ -0,0 +1,541 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2001 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <libcfs/libcfs.h>
+
+const CHAR *dos_file_prefix = "\\??\\";
+
+/*
+ * cfs_filp_open
+ *   To open or create a file in kernel mode
+ *
+ * Arguments:
+ *   name:  name of the file to be opened or created, no dos path prefix
+ *   flags: open/creation attribute options
+ *   mode:  access mode/permission to open or create
+ *   err:   error code
+ *
+ * Return Value:
+ *   the pointer to the cfs_file_t or NULL if it fails
+ *
+ * Notes:
+ *   N/A
+ */
+
+cfs_file_t *cfs_filp_open(const char *name, int flags, int mode, int *err)
+{
+    cfs_file_t *fp = NULL;
+
+    NTSTATUS            Status;
+
+    OBJECT_ATTRIBUTES   ObjectAttributes;
+    HANDLE              FileHandle;
+    IO_STATUS_BLOCK     IoStatus;
+    ACCESS_MASK         DesiredAccess;
+    ULONG               CreateDisposition;
+    ULONG               ShareAccess;
+    ULONG               CreateOptions;
+
+    USHORT              NameLength = 0;
+    USHORT              PrefixLength = 0;
+
+    UNICODE_STRING      UnicodeName;
+    PWCHAR              UnicodeString = NULL;
+
+    ANSI_STRING         AnsiName;
+    PUCHAR              AnsiString = NULL;
+
+    /* Analyze the flags settings */
+
+    if (cfs_is_flag_set(flags, O_WRONLY)) {
+        DesiredAccess = (GENERIC_WRITE | SYNCHRONIZE);
+        ShareAccess = 0;
+    } else if (cfs_is_flag_set(flags, O_RDWR)) {
+        DesiredAccess = (GENERIC_READ | GENERIC_WRITE | SYNCHRONIZE);
+        ShareAccess = FILE_SHARE_READ | FILE_SHARE_WRITE;
+    } else {
+        DesiredAccess = (GENERIC_READ | SYNCHRONIZE);
+        ShareAccess = FILE_SHARE_READ;
+    }
+
+    if (cfs_is_flag_set(flags, O_CREAT)) {
+        if (cfs_is_flag_set(flags, O_EXCL)) {
+            CreateDisposition = FILE_CREATE;
+        } else {
+            CreateDisposition = FILE_OPEN_IF;
+        }
+    } else {
+        CreateDisposition = FILE_OPEN;
+    }
+
+    if (cfs_is_flag_set(flags, O_TRUNC)) {
+        if (cfs_is_flag_set(flags, O_EXCL)) {
+            CreateDisposition = FILE_OVERWRITE;
+        } else {
+            CreateDisposition = FILE_OVERWRITE_IF;
+        }
+    }
+
+    CreateOptions = 0;
+
+    if (cfs_is_flag_set(flags, O_DIRECTORY)) {
+        cfs_set_flag(CreateOptions, FILE_DIRECTORY_FILE);
+    }
+
+    if (cfs_is_flag_set(flags, O_SYNC)) {
+        cfs_set_flag(CreateOptions, FILE_WRITE_THROUGH);
+    }
+
+    if (cfs_is_flag_set(flags, O_DIRECT)) {
+        cfs_set_flag(CreateOptions, FILE_NO_INTERMEDIATE_BUFFERING);
+    }
+
+    /* Initialize the unicode path name for the specified file */
+
+    NameLength = (USHORT)strlen(name);
+
+    if (name[0] != '\\') {
+        PrefixLength = (USHORT)strlen(dos_file_prefix);
+    }
+
+    AnsiString = cfs_alloc(sizeof(CHAR) * (NameLength + PrefixLength + 1),
+                           CFS_ALLOC_ZERO);
+    if (NULL == AnsiString) {
+        if (err) *err = -ENOMEM;
+        return NULL;
+    }
+
+    UnicodeString = cfs_alloc(sizeof(WCHAR) * (NameLength + PrefixLength + 1),
+                              CFS_ALLOC_ZERO);
+
+    if (NULL == UnicodeString) {
+        if (err) *err = -ENOMEM;
+        cfs_free(AnsiString);
+        return NULL;
+    }
+
+    if (PrefixLength) {
+        RtlCopyMemory(&AnsiString[0], dos_file_prefix, PrefixLength);
+    }
+
+    RtlCopyMemory(&AnsiString[PrefixLength], name, NameLength);
+    NameLength += PrefixLength;
+
+    AnsiName.MaximumLength = NameLength + 1;
+    AnsiName.Length = NameLength;
+    AnsiName.Buffer = AnsiString;
+
+    UnicodeName.MaximumLength = (NameLength + 1) * sizeof(WCHAR);
+    UnicodeName.Length = 0;
+    UnicodeName.Buffer = (PWSTR)UnicodeString;
+
+    RtlAnsiStringToUnicodeString(&UnicodeName, &AnsiName, FALSE);
+
+    /* Setup the object attributes structure for the file. */
+
+    InitializeObjectAttributes(
+        &ObjectAttributes,
+        &UnicodeName,
+        OBJ_CASE_INSENSITIVE |
+        OBJ_KERNEL_HANDLE,
+        NULL,
+        NULL );
+
+    /* Now open or create the file */
+
+    Status = ZwCreateFile(
+        &FileHandle,
+        DesiredAccess,
+        &ObjectAttributes,
+        &IoStatus,
+        0,
+        FILE_ATTRIBUTE_NORMAL,
+        ShareAccess,
+        CreateDisposition,
+        CreateOptions,
+        NULL,
+        0 );
+
+    /* Check the returned status of IoStatus... */
+
+    if (!NT_SUCCESS(IoStatus.Status)) {
+        if (err) *err = cfs_error_code(IoStatus.Status);
+        cfs_free(UnicodeString);
+        cfs_free(AnsiString);
+        return NULL;
+    }
+
+    /* Allocate the cfs_file_t: libcfs file object */
+
+    fp = cfs_alloc(sizeof(cfs_file_t) + NameLength, CFS_ALLOC_ZERO);
+
+    if (NULL == fp) {
+        Status = ZwClose(FileHandle);
+        ASSERT(NT_SUCCESS(Status));
+        if (err) *err = -ENOMEM;
+        cfs_free(UnicodeString);
+        cfs_free(AnsiString);
+        return NULL;
+    }
+
+    fp->f_handle = FileHandle;
+    strcpy(fp->f_name, name);
+    fp->f_flags = flags;
+    fp->f_mode = (mode_t)mode;
+    fp->f_count = 1;
+    if (err) *err = 0;
+
+    /* free the memory of temporary name strings */
+    cfs_free(UnicodeString);
+    cfs_free(AnsiString);
+
+    return fp;
+}
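A hypothetical caller, to make the flag mapping concrete (path and flags are illustrative, not from the patch); note the routine prepends "\??\" itself when the name is not already an NT path:

    /* Illustrative only: open-or-create a log file for writing.
     * O_CREAT|O_WRONLY maps to FILE_OPEN_IF + GENERIC_WRITE above. */
    int err = 0;
    cfs_file_t *fp = cfs_filp_open("C:\\temp\\lnet.log",
                                   O_CREAT | O_WRONLY, 0644, &err);
    if (fp == NULL)
        CERROR(("open failed: %d\n", err));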
+
+/*
+ * cfs_filp_close
+ *   To close the opened file and release the filp structure
+ *
+ * Arguments:
+ *   fp: the pointer of the cfs_file_t structure
+ *
+ * Return Value:
+ *   ZERO:     on success
+ *   Non-Zero: on failure
+ *
+ * Notes:
+ *   N/A
+ */
+
+int cfs_filp_close(cfs_file_t *fp)
+{
+    NTSTATUS Status;
+
+    ASSERT(fp != NULL);
+    ASSERT(fp->f_handle != NULL);
+
+    /* release the file handle */
+    Status = ZwClose(fp->f_handle);
+    ASSERT(NT_SUCCESS(Status));
+
+    /* free the file filp structure */
+    cfs_free(fp);
+    return 0;
+}
+
+
+/*
+ * cfs_filp_read
+ *   To read data from the opened file
+ *
+ * Arguments:
+ *   fp:     the pointer of the cfs_file_t structure
+ *   buf:    pointer to the buffer to contain the data
+ *   nbytes: size in bytes to be read from the file
+ *   pos:    offset in file where reading starts; if pos is
+ *           NULL, then read from the current file offset
+ *
+ * Return Value:
+ *   Actual size read into the buffer in success case
+ *   Error code in failure case
+ *
+ * Notes:
+ *   N/A
+ */
+
+int cfs_filp_read(cfs_file_t *fp, void *buf, size_t nbytes, loff_t *pos)
+{
+    LARGE_INTEGER   address;
+    NTSTATUS        Status;
+    IO_STATUS_BLOCK IoStatus;
+
+    int rc = 0;
+
+    /* Read data from the file into the specified buffer */
+
+    if (pos != NULL) {
+        address.QuadPart = *pos;
+    } else {
+        address.QuadPart = fp->f_pos;
+    }
+
+    Status = ZwReadFile( fp->f_handle,
+                         0,
+                         NULL,
+                         NULL,
+                         &IoStatus,
+                         buf,
+                         nbytes,
+                         &address,
+                         NULL );
+
+    if (!NT_SUCCESS(IoStatus.Status)) {
+        rc = cfs_error_code(IoStatus.Status);
+    } else {
+        rc = (int)IoStatus.Information;
+        fp->f_pos = address.QuadPart + rc;
+
+        if (pos != NULL) {
+            *pos = fp->f_pos;
+        }
+    }
+
+    return rc;
+}
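To show the positional semantics (pos both selects the starting offset and is advanced on return), a hypothetical helper that drains a file in one loop; the name and buffer handling are illustrative:

    /* Illustrative only: read up to "len" bytes starting at offset 0. */
    static int slurp_file(cfs_file_t *fp, char *buf, int len)
    {
        loff_t pos = 0;
        int    nr;

        while (len > 0) {
            nr = cfs_filp_read(fp, buf, len, &pos); /* pos advances */
            if (nr < 0)
                return nr;    /* negative errno via cfs_error_code() */
            if (nr == 0)
                break;        /* end of file */
            buf += nr;
            len -= nr;
        }
        return (int)pos;      /* bytes consumed */
    }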
+
+/*
+ * cfs_filp_write
+ *   To write specified data to the opened file
+ *
+ * Arguments:
+ *   fp:     the pointer of the cfs_file_t structure
+ *   buf:    pointer to the buffer containing the data
+ *   nbytes: size in bytes to be written to the file
+ *   pos:    offset in file where writing starts; if pos is
+ *           NULL, then write to the current file offset
+ *
+ * Return Value:
+ *   Actual size written into the buffer in success case
+ *   Error code in failure case
+ *
+ * Notes:
+ *   N/A
+ */
+
+int cfs_filp_write(cfs_file_t *fp, void *buf, size_t nbytes, loff_t *pos)
+{
+    LARGE_INTEGER   address;
+    NTSTATUS        Status;
+    IO_STATUS_BLOCK IoStatus;
+    int             rc = 0;
+
+    /* Write user specified data into the file */
+
+    if (pos != NULL) {
+        address.QuadPart = *pos;
+    } else {
+        address.QuadPart = fp->f_pos;
+    }
+
+    Status = ZwWriteFile( fp->f_handle,
+                          0,
+                          NULL,
+                          NULL,
+                          &IoStatus,
+                          buf,
+                          nbytes,
+                          &address,
+                          NULL );
+
+    if (!NT_SUCCESS(Status)) {
+        rc = cfs_error_code(Status);
+    } else {
+        rc = (int)IoStatus.Information;
+        fp->f_pos = address.QuadPart + rc;
+
+        if (pos != NULL) {
+            *pos = fp->f_pos;
+        }
+    }
+
+    return rc;
+}
+
+
+NTSTATUS
+CompletionRoutine(
+    PDEVICE_OBJECT DeviceObject,
+    PIRP Irp,
+    PVOID Context)
+{
+    /* copy the IoStatus result */
+    *Irp->UserIosb = Irp->IoStatus;
+
+    /* signal the event we set */
+    KeSetEvent(Irp->UserEvent, 0, FALSE);
+
+    /* free the Irp we allocated */
+    IoFreeIrp(Irp);
+
+    return STATUS_MORE_PROCESSING_REQUIRED;
+}
+
+
+/*
+ * cfs_filp_fsync
+ *   To sync the dirty data of the file to disk
+ *
+ * Arguments:
+ *   fp: the pointer of the cfs_file_t structure
+ *
+ * Return Value:
+ *   Zero:       in success case
+ *   Error code: in failure case
+ *
+ * Notes:
+ *   Nt kernel doesn't export such a routine to flush a file,
+ *   we must allocate our own Irp and issue it to the file
+ *   system driver.
+ */
+
+int cfs_filp_fsync(cfs_file_t *fp)
+{
+
+    PFILE_OBJECT       FileObject;
+    PDEVICE_OBJECT     DeviceObject;
+
+    NTSTATUS           Status;
+    PIRP               Irp;
+    KEVENT             Event;
+    IO_STATUS_BLOCK    IoSb;
+    PIO_STACK_LOCATION IrpSp;
+
+    /* get the FileObject and the DeviceObject */
+
+    Status = ObReferenceObjectByHandle(
+                 fp->f_handle,
+                 FILE_WRITE_DATA,
+                 NULL,
+                 KernelMode,
+                 (PVOID*)&FileObject,
+                 NULL );
+
+    if (!NT_SUCCESS(Status)) {
+        return cfs_error_code(Status);
+    }
+
+    DeviceObject = IoGetRelatedDeviceObject(FileObject);
+
+    /* allocate a new Irp */
+
+    Irp = IoAllocateIrp(DeviceObject->StackSize, FALSE);
+
+    if (!Irp) {
+        ObDereferenceObject(FileObject);
+        return -ENOMEM;
+    }
+
+    /* initialize the event */
+    KeInitializeEvent(&Event, SynchronizationEvent, FALSE);
+
+    /* setup the Irp */
+    Irp->UserEvent = &Event;
+    Irp->UserIosb = &IoSb;
+    Irp->RequestorMode = KernelMode;
+
+    Irp->Tail.Overlay.Thread = PsGetCurrentThread();
+    Irp->Tail.Overlay.OriginalFileObject = FileObject;
+
+    /* setup the Irp stack location */
+    IrpSp = IoGetNextIrpStackLocation(Irp);
+
+    IrpSp->MajorFunction = IRP_MJ_FLUSH_BUFFERS;
+    IrpSp->DeviceObject = DeviceObject;
+    IrpSp->FileObject = FileObject;
+
+    IoSetCompletionRoutine(Irp, CompletionRoutine, 0, TRUE, TRUE, TRUE);
+
+    /* issue the Irp to the underlying file system driver */
+    IoCallDriver(DeviceObject, Irp);
+
+    /* wait until it is finished */
+    KeWaitForSingleObject(&Event, Executive, KernelMode, TRUE, 0);
+
+    /* cleanup our reference on it */
+    ObDereferenceObject(FileObject);
+
+    Status = IoSb.Status;
+
+    return cfs_error_code(Status);
+}
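For contrast, on DDKs that do export a kernel flush call the same operation collapses to one line; a sketch under that assumption (the hand-built IRP above is exactly what avoids this dependency on older targets):

    /* Assumes ZwFlushBuffersFile(HANDLE, PIO_STATUS_BLOCK) is available
     * to kernel code, which the Notes above say could not be relied on. */
    int cfs_filp_fsync_alt(cfs_file_t *fp)
    {
        IO_STATUS_BLOCK iosb;
        NTSTATUS        status;

        status = ZwFlushBuffersFile(fp->f_handle, &iosb);
        return cfs_error_code(status);
    }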
+
+/*
+ * cfs_get_file
+ *   To increase the reference of the file object
+ *
+ * Arguments:
+ *   fp: the pointer of the cfs_file_t structure
+ *
+ * Return Value:
+ *   Zero:     in success case
+ *   Non-Zero: in failure case
+ *
+ * Notes:
+ *   N/A
+ */
+
+int cfs_get_file(cfs_file_t *fp)
+{
+    InterlockedIncrement(&(fp->f_count));
+    return 0;
+}
+
+
+/*
+ * cfs_put_file
+ *   To decrease the reference of the file object
+ *
+ * Arguments:
+ *   fp: the pointer of the cfs_file_t structure
+ *
+ * Return Value:
+ *   Zero:     in success case
+ *   Non-Zero: in failure case
+ *
+ * Notes:
+ *   N/A
+ */
+
+int cfs_put_file(cfs_file_t *fp)
+{
+    if (InterlockedDecrement(&(fp->f_count)) == 0) {
+        cfs_filp_close(fp);
+    }
+
+    return 0;
+}
+
+
+/*
+ * cfs_file_count
+ *   To query the reference count of the file object
+ *
+ * Arguments:
+ *   fp: the pointer of the cfs_file_t structure
+ *
+ * Return Value:
+ *   the reference count of the file object
+ *
+ * Notes:
+ *   N/A
+ */
+
+int cfs_file_count(cfs_file_t *fp)
+{
+    return (int)(fp->f_count);
+}
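The three routines above give cfs_file_t plain reference-count lifetime: the open sets f_count to 1, and the final cfs_put_file() is what actually closes the handle. A hypothetical handoff illustrating the pairing:

    /* Illustrative only: keep the file alive across an async handoff. */
    cfs_get_file(fp);       /* f_count: 1 -> 2, the worker owns a ref  */
    /* ... hand fp to another thread; each side finishes with: ...     */
    cfs_put_file(fp);       /* 2 -> 1, file stays open                 */
    cfs_put_file(fp);       /* 1 -> 0, cfs_filp_close() runs here      */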
diff --git a/lnet/libcfs/winnt/winnt-lock.c b/lnet/libcfs/winnt/winnt-lock.c
new file mode 100644
index 0000000..12dbc67
--- /dev/null
+++ b/lnet/libcfs/winnt/winnt-lock.c
@@ -0,0 +1,353 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or modify it under
+ * the terms of version 2 of the GNU General Public License as published by
+ * the Free Software Foundation. Lustre is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details. You should have received a
+ * copy of the GNU General Public License along with Lustre; if not, write
+ * to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ */
+
+
+# define DEBUG_SUBSYSTEM S_LNET
+
+#include <libcfs/libcfs.h>
+
+
+#if _X86_
+
+void __declspec (naked) FASTCALL
+atomic_add(
+    int i,
+    atomic_t *v
+    )
+{
+    // ECX = i
+    // EDX = v ; [EDX][0] = v->counter
+
+    __asm {
+        lock add dword ptr [edx][0], ecx
+        ret
+    }
+}
+
+void __declspec (naked) FASTCALL
+atomic_sub(
+    int i,
+    atomic_t *v
+    )
+{
+    // ECX = i
+    // EDX = v ; [EDX][0] = v->counter
+
+    __asm {
+        lock sub dword ptr [edx][0], ecx
+        ret
+    }
+}
+
+void __declspec (naked) FASTCALL
+atomic_inc(
+    atomic_t *v
+    )
+{
+    //InterlockedIncrement((PULONG)(&((v)->counter)));
+
+    // ECX = v ; [ECX][0] = v->counter
+
+    __asm {
+        lock inc dword ptr [ecx][0]
+        ret
+    }
+}
+
+void __declspec (naked) FASTCALL
+atomic_dec(
+    atomic_t *v
+    )
+{
+    // ECX = v ; [ECX][0] = v->counter
+
+    __asm {
+        lock dec dword ptr [ecx][0]
+        ret
+    }
+}
+
+int __declspec (naked) FASTCALL
+atomic_sub_and_test(
+    int i,
+    atomic_t *v
+    )
+{
+
+    // ECX = i
+    // EDX = v ; [EDX][0] = v->counter
+
+    __asm {
+        xor eax, eax
+        lock sub dword ptr [edx][0], ecx
+        sete al
+        ret
+    }
+}
+
+int __declspec (naked) FASTCALL
+atomic_inc_and_test(
+    atomic_t *v
+    )
+{
+    // ECX = v ; [ECX][0] = v->counter
+
+    __asm {
+        xor eax, eax
+        lock inc dword ptr [ecx][0]
+        sete al
+        ret
+    }
+}
+
+int __declspec (naked) FASTCALL
+atomic_dec_and_test(
+    atomic_t *v
+    )
+{
+    // ECX = v ; [ECX][0] = v->counter
+
+    __asm {
+        xor eax, eax
+        lock dec dword ptr [ecx][0]
+        sete al
+        ret
+    }
+}
+
+#else
+
+void FASTCALL
+atomic_add(
+    int i,
+    atomic_t *v
+    )
+{
+    InterlockedExchangeAdd( (PULONG)(&((v)->counter)) , (LONG) (i));
+}
+
+void FASTCALL
+atomic_sub(
+    int i,
+    atomic_t *v
+    )
+{
+    InterlockedExchangeAdd( (PULONG)(&((v)->counter)) , (LONG) (-1*i));
+}
+
+void FASTCALL
+atomic_inc(
+    atomic_t *v
+    )
+{
+    InterlockedIncrement((PULONG)(&((v)->counter)));
+}
+
+void FASTCALL
+atomic_dec(
+    atomic_t *v
+    )
+{
+    InterlockedDecrement((PULONG)(&((v)->counter)));
+}
+
+int FASTCALL
+atomic_sub_and_test(
+    int i,
+    atomic_t *v
+    )
+{
+    int counter, result;
+
+    do {
+        counter = v->counter;
+        result = counter - i;
+    } while ( InterlockedCompareExchange(
+                  &(v->counter),
+                  result,
+                  counter) != counter);
+
+    return (result == 0);
+}
+
+int FASTCALL
+atomic_inc_and_test(
+    atomic_t *v
+    )
+{
+    int counter, result;
+
+    do {
+        counter = v->counter;
+        result = counter + 1;
+    } while ( InterlockedCompareExchange(
+                  &(v->counter),
+                  result,
+                  counter) != counter);
+
+    return (result == 0);
+}
+
+int FASTCALL
+atomic_dec_and_test(
+    atomic_t *v
+    )
+{
+    int counter, result;
+
+    do {
+        counter = v->counter;
+        result = counter - 1;
+    } while ( InterlockedCompareExchange(
+                  &(v->counter),
+                  result,
+                  counter) != counter);
+
+    return (result == 0);
+}
+
+#endif
+
+
+/*
+ * rw spinlock
+ */
+
+
+void
+rwlock_init(rwlock_t * rwlock)
+{
+    spin_lock_init(&rwlock->guard);
+    rwlock->count = 0;
+}
+
+void
+rwlock_fini(rwlock_t * rwlock)
+{
+}
+
+void
+read_lock(rwlock_t * rwlock)
+{
+    cfs_task_t * task = cfs_current();
+    PTASK_SLOT   slot = NULL;
+
+    if (!task) {
+        /* should bugchk here */
+        cfs_enter_debugger();
+        return;
+    }
+
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    ASSERT(slot->Magic == TASKSLT_MAGIC);
+
+    slot->irql = KeRaiseIrqlToDpcLevel();
+
+    while (TRUE) {
+        spin_lock(&rwlock->guard);
+        if (rwlock->count >= 0)
+            break;
+        spin_unlock(&rwlock->guard);
+    }
+
+    rwlock->count++;
+    spin_unlock(&rwlock->guard);
+}
+
+void
+read_unlock(rwlock_t * rwlock)
+{
+    cfs_task_t * task = cfs_current();
+    PTASK_SLOT   slot = NULL;
+
+    if (!task) {
+        /* should bugchk here */
+        cfs_enter_debugger();
+        return;
+    }
+
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    ASSERT(slot->Magic == TASKSLT_MAGIC);
+
+    spin_lock(&rwlock->guard);
+    ASSERT(rwlock->count > 0);
+    rwlock->count--;
+    if (rwlock->count < 0) {
+        cfs_enter_debugger();
+    }
+    spin_unlock(&rwlock->guard);
+
+    KeLowerIrql(slot->irql);
+}
+
+void
+write_lock(rwlock_t * rwlock)
+{
+    cfs_task_t * task = cfs_current();
+    PTASK_SLOT   slot = NULL;
+
+    if (!task) {
+        /* should bugchk here */
+        cfs_enter_debugger();
+        return;
+    }
+
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    ASSERT(slot->Magic == TASKSLT_MAGIC);
+
+    slot->irql = KeRaiseIrqlToDpcLevel();
+
+    while (TRUE) {
+        spin_lock(&rwlock->guard);
+        if (rwlock->count == 0)
+            break;
+        spin_unlock(&rwlock->guard);
+    }
+
+    rwlock->count = -1;
+    spin_unlock(&rwlock->guard);
+}
+
+void
+write_unlock(rwlock_t * rwlock)
+{
+    cfs_task_t * task = cfs_current();
+    PTASK_SLOT   slot = NULL;
+
+    if (!task) {
+        /* should bugchk here */
+        cfs_enter_debugger();
+        return;
+    }
+
+    slot = CONTAINING_RECORD(task, TASK_SLOT, task);
+    ASSERT(slot->Magic == TASKSLT_MAGIC);
+
+    spin_lock(&rwlock->guard);
+    ASSERT(rwlock->count == -1);
+    rwlock->count = 0;
+    spin_unlock(&rwlock->guard);
+
+    KeLowerIrql(slot->irql);
+}
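The counter protocol in this rwlock: count > 0 means that many readers hold the lock, 0 means free, and -1 marks the single writer; IRQL is raised to DISPATCH_LEVEL for the whole hold, so critical sections must not block or touch pageable data. A usage sketch under those rules (the lock name is illustrative):

    rwlock_t peer_lock;          /* hypothetical table lock */

    rwlock_init(&peer_lock);

    read_lock(&peer_lock);       /* several CPUs may be here at once   */
    /* lookup only: no blocking waits, no pageable memory              */
    read_unlock(&peer_lock);

    write_lock(&peer_lock);      /* spins until count == 0, then sets -1 */
    /* insert or remove entries */
    write_unlock(&peer_lock);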
diff --git a/lnet/libcfs/winnt/winnt-lwt.c b/lnet/libcfs/winnt/winnt-lwt.c
new file mode 100644
index 0000000..272cbcf
--- /dev/null
+++ b/lnet/libcfs/winnt/winnt-lwt.c
@@ -0,0 +1,20 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or modify it under
+ * the terms of version 2 of the GNU General Public License as published by
+ * the Free Software Foundation. Lustre is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details. You should have received a
+ * copy of the GNU General Public License along with Lustre; if not, write
+ * to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ */
+
+# define DEBUG_SUBSYSTEM S_LNET
+
diff --git a/lnet/libcfs/winnt/winnt-mem.c b/lnet/libcfs/winnt/winnt-mem.c
new file mode 100644
index 0000000..6b66a95
--- /dev/null
+++ b/lnet/libcfs/winnt/winnt-mem.c
@@ -0,0 +1,332 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+#include <libcfs/libcfs.h>
+
+
+cfs_mem_cache_t *cfs_page_t_slab = NULL;
+cfs_mem_cache_t *cfs_page_p_slab = NULL;
+
+/*
+ * cfs_alloc_page
+ *   To allocate the cfs_page_t and also 1 page of memory
+ *
+ * Arguments:
+ *   flags: the allocation options
+ *
+ * Return Value:
+ *   pointer to the cfs_page_t structure in success or
+ *   NULL in failure case
+ *
+ * Notes:
+ *   N/A
+ */
+
+cfs_page_t * cfs_alloc_page(int flags)
+{
+    cfs_page_t *pg;
+    pg = cfs_mem_cache_alloc(cfs_page_t_slab, 0);
+
+    if (NULL == pg) {
+        cfs_enter_debugger();
+        return NULL;
+    }
+
+    memset(pg, 0, sizeof(cfs_page_t));
+    pg->addr = cfs_mem_cache_alloc(cfs_page_p_slab, 0);
+    atomic_set(&pg->count, 1);
+
+    if (pg->addr) {
+        if (cfs_is_flag_set(flags, CFS_ALLOC_ZERO)) {
+            memset(pg->addr, 0, CFS_PAGE_SIZE);
+        }
+    } else {
+        cfs_enter_debugger();
+        cfs_mem_cache_free(cfs_page_t_slab, pg);
+        pg = NULL;
+    }
+
+    return pg;
+}
+
+/*
+ * cfs_free_page
+ *   To free the cfs_page_t including the page
+ *
+ * Arguments:
+ *   pg: pointer to the cfs_page_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+void cfs_free_page(cfs_page_t *pg)
+{
+    ASSERT(pg != NULL);
+    ASSERT(pg->addr != NULL);
+    ASSERT(atomic_read(&pg->count) <= 1);
+
+    cfs_mem_cache_free(cfs_page_p_slab, pg->addr);
+    cfs_mem_cache_free(cfs_page_t_slab, pg);
+}
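A hypothetical round trip through the pair above; CFS_ALLOC_ZERO is the only flag the page allocator actually honours here, and payload/nob are illustrative names:

    cfs_page_t *pg = cfs_alloc_page(CFS_ALLOC_ZERO);
    if (pg != NULL) {
        /* pg->addr points at one CFS_PAGE_SIZE buffer, already zeroed */
        memcpy(pg->addr, payload, nob);
        /* ... hand the page to a consumer, then release it: ... */
        cfs_free_page(pg);          /* returns both slab objects */
    }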
+
+
+/*
+ * cfs_alloc
+ *   To allocate memory from system pool
+ *
+ * Arguments:
+ *   nr_bytes: length in bytes of the requested buffer
+ *   flags:    allocation flags indication
+ *
+ * Return Value:
+ *   NULL: if there is not enough memory in the system
+ *   the address of the allocated memory in success.
+ *
+ * Notes:
+ *   This operation can be treated as atomic.
+ */
+
+void *
+cfs_alloc(size_t nr_bytes, u_int32_t flags)
+{
+    void *ptr;
+
+    /* Ignore the flags: always allocate from NonPagedPool */
+
+    ptr = ExAllocatePoolWithTag(NonPagedPool, nr_bytes, 'Lufs');
+
+    if (ptr != NULL && (flags & CFS_ALLOC_ZERO)) {
+        memset(ptr, 0, nr_bytes);
+    }
+
+    if (!ptr) {
+        cfs_enter_debugger();
+    }
+
+    return ptr;
+}
+
+/*
+ * cfs_free
+ *   To free the specified memory to system pool
+ *
+ * Arguments:
+ *   addr: pointer to the buffer to be freed
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   This operation can be treated as atomic.
+ */
+
+void
+cfs_free(void *addr)
+{
+    ExFreePool(addr);
+}
+
+/*
+ * cfs_alloc_large
+ *   To allocate large block of memory from system pool
+ *
+ * Arguments:
+ *   nr_bytes: length in bytes of the requested buffer
+ *
+ * Return Value:
+ *   NULL: if there is not enough memory in the system
+ *   the address of the allocated memory in success.
+ *
+ * Notes:
+ *   N/A
+ */
+
+void *
+cfs_alloc_large(size_t nr_bytes)
+{
+    return cfs_alloc(nr_bytes, 0);
+}
+
+/*
+ * cfs_free_large
+ *   To free the specified memory to system pool
+ *
+ * Arguments:
+ *   addr: pointer to the buffer to be freed
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void
+cfs_free_large(void *addr)
+{
+    cfs_free(addr);
+}
+
+
+/*
+ * cfs_mem_cache_create
+ *   To create a SLAB cache
+ *
+ * Arguments:
+ *   name:   name string of the SLAB cache to be created
+ *   size:   size in bytes of SLAB entry buffer
+ *   offset: offset in the page
+ *   flags:  SLAB creation flags
+ *
+ * Return Value:
+ *   The pointer of the cfs_mem_cache_t structure in success.
+ *   NULL pointer in failure case.
+ *
+ * Notes:
+ *   1, offset won't be used here.
+ *   2, it could be better to introduce a lock to protect the access of the
+ *      SLAB structure on SMP if there's not outside lock protection.
+ *   3, parameters C/D are removed.
+ */
+
+cfs_mem_cache_t *
+cfs_mem_cache_create(
+    const char * name,
+    size_t size,
+    size_t offset,
+    unsigned long flags
+    )
+{
+    cfs_mem_cache_t * kmc = NULL;
+
+    /* The name of the SLAB must not exceed 20 chars */
+
+    if (name && strlen(name) >= 20) {
+        goto errorout;
+    }
+
+    /* Allocate and initialize the SLAB structure */
+
+    kmc = cfs_alloc (sizeof(cfs_mem_cache_t), 0);
+
+    if (NULL == kmc) {
+        goto errorout;
+    }
+
+    memset(kmc, 0, sizeof(cfs_mem_cache_t));
+
+    kmc->flags = flags;
+
+    if (name) {
+        strcpy(&kmc->name[0], name);
+    }
+
+    /* Initialize the corresponding LookAside list */
+
+    ExInitializeNPagedLookasideList(
+        &(kmc->npll),
+        NULL,
+        NULL,
+        0,
+        size,
+        'pnmk',
+        0);
+
+errorout:
+
+    return kmc;
+}
+
+/*
+ * cfs_mem_cache_destroy
+ *   To destroy the unused SLAB cache
+ *
+ * Arguments:
+ *   kmc: the SLAB cache to be destroyed.
+ *
+ * Return Value:
+ *   0: in success case.
+ *   1: in failure case.
+ *
+ * Notes:
+ *   N/A
+ */
+
+int cfs_mem_cache_destroy (cfs_mem_cache_t * kmc)
+{
+    ASSERT(kmc != NULL);
+
+    ExDeleteNPagedLookasideList(&(kmc->npll));
+
+    cfs_free(kmc);
+
+    return 0;
+}
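Putting the cache API together with the alloc/free pair defined just below, a hypothetical fixed-size object pool (struct conn_sketch and the cache name are illustrative, not from the patch):

    struct conn_sketch { int state; void *peer; };   /* illustrative */

    cfs_mem_cache_t *conn_slab;

    conn_slab = cfs_mem_cache_create("conn", sizeof(struct conn_sketch), 0, 0);
    if (conn_slab != NULL) {
        struct conn_sketch *c = cfs_mem_cache_alloc(conn_slab, 0);
        if (c != NULL) {
            /* entries come from the NPaged lookaside list above */
            cfs_mem_cache_free(conn_slab, c);
        }
        cfs_mem_cache_destroy(conn_slab);
    }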
+
+/*
+ * cfs_mem_cache_alloc
+ *   To allocate an object (LookAside entry) from the SLAB
+ *
+ * Arguments:
+ *   kmc:   the SLAB cache to be allocated from.
+ *   flags: flags for allocation options
+ *
+ * Return Value:
+ *   object buffer address: in success case.
+ *   NULL: in failure case.
+ *
+ * Notes:
+ *   N/A
+ */
+
+void *cfs_mem_cache_alloc(cfs_mem_cache_t * kmc, int flags)
+{
+    void *buf = NULL;
+
+    buf = ExAllocateFromNPagedLookasideList(&(kmc->npll));
+
+    return buf;
+}
+
+/*
+ * cfs_mem_cache_free
+ *   To free an object (LookAside entry) to the SLAB cache
+ *
+ * Arguments:
+ *   kmc: the SLAB cache to be freed to.
+ *   buf: the pointer to the object to be freed.
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void cfs_mem_cache_free(cfs_mem_cache_t * kmc, void * buf)
+{
+    ExFreeToNPagedLookasideList(&(kmc->npll), buf);
+}
diff --git a/lnet/libcfs/winnt/winnt-module.c b/lnet/libcfs/winnt/winnt-module.c
new file mode 100644
index 0000000..2b6b008
--- /dev/null
+++ b/lnet/libcfs/winnt/winnt-module.c
@@ -0,0 +1,160 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *
+ * Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or modify it under
+ * the terms of version 2 of the GNU General Public License as published by
+ * the Free Software Foundation. Lustre is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details. You should have received a
+ * copy of the GNU General Public License along with Lustre; if not, write
+ * to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ */
+
+
+#define DEBUG_SUBSYSTEM S_LIBCFS
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+
+#define LIBCFS_MINOR 240
+
+int libcfs_ioctl_getdata(char *buf, char *end, void *arg)
+{
+        struct libcfs_ioctl_hdr *hdr;
+        struct libcfs_ioctl_data *data;
+        int err;
+        ENTRY;
+
+        hdr = (struct libcfs_ioctl_hdr *)buf;
+        data = (struct libcfs_ioctl_data *)buf;
+
+        err = copy_from_user(buf, (void *)arg, sizeof(*hdr));
+        if (err)
+                RETURN(err);
+
+        if (hdr->ioc_version != LIBCFS_IOCTL_VERSION) {
+                CERROR(("LIBCFS: version mismatch kernel vs application\n"));
+                RETURN(-EINVAL);
+        }
+
+        if (hdr->ioc_len + buf >= end) {
+                CERROR(("LIBCFS: user buffer exceeds kernel buffer\n"));
+                RETURN(-EINVAL);
+        }
+
+        if (hdr->ioc_len < sizeof(struct libcfs_ioctl_data)) {
+                CERROR(("LIBCFS: user buffer too small for ioctl\n"));
+                RETURN(-EINVAL);
+        }
+
+        err = copy_from_user(buf, (void *)arg, hdr->ioc_len);
+        if (err)
+                RETURN(err);
+
+        if (libcfs_ioctl_is_invalid(data)) {
+                CERROR(("LIBCFS: ioctl not correctly formatted\n"));
+                RETURN(-EINVAL);
+        }
+
+        if (data->ioc_inllen1)
+                data->ioc_inlbuf1 = &data->ioc_bulk[0];
+
+        if (data->ioc_inllen2)
+                data->ioc_inlbuf2 = &data->ioc_bulk[0] +
+                        size_round(data->ioc_inllen1);
+
+        RETURN(0);
+}
+
+extern struct cfs_psdev_ops libcfs_psdev_ops;
+
+static int
+libcfs_psdev_open(cfs_file_t * file)
+{
+        struct libcfs_device_userstate **pdu = NULL;
+        int    rc = 0;
+
+        pdu = (struct libcfs_device_userstate **)&file->private_data;
+        if (libcfs_psdev_ops.p_open != NULL)
+                rc = libcfs_psdev_ops.p_open(0, (void *)pdu);
+        else
+                return (-EPERM);
+        return rc;
+}
+
+/* called when closing /dev/device */
+static int
+libcfs_psdev_release(cfs_file_t * file)
+{
+        struct libcfs_device_userstate *pdu;
+        int    rc = 0;
+
+        pdu = file->private_data;
+        if (libcfs_psdev_ops.p_close != NULL)
+                rc = libcfs_psdev_ops.p_close(0, (void *)pdu);
+        else
+                rc = -EPERM;
+        return rc;
+}
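The two inlbuf fix-ups at the end of libcfs_ioctl_getdata() define the buffer layout: both inline buffers travel inside ioc_bulk, with the second one starting at the rounded-up end of the first. Assuming size_round() rounds up to an 8-byte boundary (its usual definition elsewhere in Lustre; treat that as an assumption here), the arithmetic works out like this:

    /* ioc_bulk layout, illustrative sizes:
     *
     *   ioc_inllen1 = 5   ->  inlbuf1 occupies bulk[0..4]
     *   size_round(5) = 8 ->  inlbuf2 begins at bulk[8], 8-byte aligned
     */
    data->ioc_inlbuf1 = &data->ioc_bulk[0];
    data->ioc_inlbuf2 = &data->ioc_bulk[0] + size_round(data->ioc_inllen1);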
struct cfs_psdev_file pfile; + int rc = 0; + + if ( _IOC_TYPE(cmd) != IOC_LIBCFS_TYPE || + _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR || + _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR ) { + CDEBUG(D_IOCTL, ("invalid ioctl ( type %d, nr %d, size %d )\n", + _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd))); + return (-EINVAL); + } + + /* Handle platform-dependent IOC requests */ + switch (cmd) { + case IOC_LIBCFS_PANIC: + if (!capable (CAP_SYS_BOOT)) + return (-EPERM); + CERROR(("debugctl-invoked panic")); + KeBugCheckEx('LUFS', (ULONG_PTR)libcfs_ioctl, (ULONG_PTR)NULL, (ULONG_PTR)NULL, (ULONG_PTR)NULL); + + return (0); + case IOC_LIBCFS_MEMHOG: + + if (!capable (CAP_SYS_ADMIN)) + return -EPERM; + break; + } + + pfile.off = 0; + pfile.private_data = file->private_data; + if (libcfs_psdev_ops.p_ioctl != NULL) + rc = libcfs_psdev_ops.p_ioctl(&pfile, cmd, (void *)arg); + else + rc = -EPERM; + return (rc); +} + +static struct file_operations libcfs_fops = { + /* lseek: */ NULL, + /* read: */ NULL, + /* write: */ NULL, + /* ioctl: */ libcfs_ioctl, + /* open: */ libcfs_psdev_open, + /* release:*/ libcfs_psdev_release +}; + +cfs_psdev_t libcfs_dev = { + LIBCFS_MINOR, + "lnet", + &libcfs_fops +}; + diff --git a/lnet/libcfs/winnt/winnt-prim.c b/lnet/libcfs/winnt/winnt-prim.c new file mode 100644 index 0000000..064b071 --- /dev/null +++ b/lnet/libcfs/winnt/winnt-prim.c @@ -0,0 +1,650 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: + * + * + * Copyright (c) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or modify it under + * the terms of version 2 of the GNU General Public License as published by + * the Free Software Foundation. Lustre is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. You should have received a + * copy of the GNU General Public License along with Lustre; if not, write + * to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, + * USA. + */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include +#include + + +/* + * Thread routines + */ + +/* + * cfs_thread_proc + * Lustre thread procedure wrapper routine (It's an internal routine) + * + * Arguments: + * context: a structure of cfs_thread_context_t, containing + * all the necessary parameters + * + * Return Value: + * void: N/A + * + * Notes: + * N/A + */ + +void +cfs_thread_proc( + void * context + ) +{ + cfs_thread_context_t * thread_context = + (cfs_thread_context_t *) context; + + /* Execute the specified function ... */ + + if (thread_context->func) { + (thread_context->func)(thread_context->arg); + } + + /* Free the context memory */ + + cfs_free(context); + + /* Terminate this system thread */ + + PsTerminateSystemThread(STATUS_SUCCESS); +} + +/* + * cfs_kernel_thread + * Create a system thread to execute the routine specified + * + * Arguments: + * func: function to be executed in the thread + * arg: argument transferred to func function + * flag: thread creation flags. 
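+ *             (accepted for interface compatibility with the Linux
+ *             code only; this port never consults it, since
+ *             PsCreateSystemThread has no equivalent of the clone flags)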
+ *
+ * Return Value:
+ *   int: 0 on success or error codes
+ *
+ * Notes:
+ *   N/A
+ */
+
+int cfs_kernel_thread(int (*func)(void *), void *arg, int flag)
+{
+    cfs_handle_t thread = NULL;
+    NTSTATUS     status;
+    cfs_thread_context_t *context = NULL;
+
+    /* Allocate the context to be transferred to system thread */
+
+    context = cfs_alloc(sizeof(cfs_thread_context_t), CFS_ALLOC_ZERO);
+
+    if (!context) {
+        return -ENOMEM;
+    }
+
+    context->func = func;
+    context->arg  = arg;
+
+    /* Create system thread with the cfs_thread_proc wrapper */
+
+    status = PsCreateSystemThread(
+                 &thread,
+                 (ACCESS_MASK)0L,
+                 0, 0, 0,
+                 cfs_thread_proc,
+                 context);
+
+    if (!NT_SUCCESS(status)) {
+
+        cfs_free(context);
+
+        /* We need to translate the NT status to a Linux error code */
+
+        return cfs_error_code(status);
+    }
+
+    /*
+     * We have no use for the thread handle, so close it right away;
+     * the system thread itself keeps running detached.
+     */
+
+    ZwClose(thread);
+
+    return 0;
+}
+
+
+/*
+ * Symbols routines
+ */
+
+
+static CFS_DECL_RWSEM(cfs_symbol_lock);
+CFS_LIST_HEAD(cfs_symbol_list);
+
+int MPSystem = FALSE;
+
+/*
+ * cfs_symbol_get
+ *   To query the specified symbol from the symbol table
+ *
+ * Arguments:
+ *   name: the symbol name to be queried
+ *
+ * Return Value:
+ *   If the symbol is in the table, return the address of it.
+ *   If not, return NULL.
+ *
+ * Notes:
+ *   N/A
+ */
+
+void *
+cfs_symbol_get(const char *name)
+{
+    struct list_head  *walker;
+    struct cfs_symbol *sym = NULL;
+
+    down_read(&cfs_symbol_lock);
+    list_for_each(walker, &cfs_symbol_list) {
+        sym = list_entry(walker, struct cfs_symbol, sym_list);
+        if (!strcmp(sym->name, name)) {
+            sym->ref++;
+            break;
+        }
+    }
+    up_read(&cfs_symbol_lock);
+
+    if (sym != NULL)
+        return sym->value;
+
+    return NULL;
+}
+
+/*
+ * cfs_symbol_put
+ *   To decrease the reference count of the specified symbol
+ *
+ * Arguments:
+ *   name: the symbol name to be dereferenced
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void
+cfs_symbol_put(const char *name)
+{
+    struct list_head  *walker;
+    struct cfs_symbol *sym = NULL;
+
+    down_read(&cfs_symbol_lock);
+    list_for_each(walker, &cfs_symbol_list) {
+        sym = list_entry(walker, struct cfs_symbol, sym_list);
+        if (!strcmp(sym->name, name)) {
+            LASSERT(sym->ref > 0);
+            sym->ref--;
+            break;
+        }
+    }
+    up_read(&cfs_symbol_lock);
+
+    LASSERT(sym != NULL);
+}
+
+
+/*
+ * cfs_symbol_register
+ *   To register the specified symbol information
+ *
+ * Arguments:
+ *   name:  the symbol name to be registered
+ *   value: the value that the symbol stands for
+ *
+ * Return Value:
+ *   Zero:     Succeeded to register
+ *   Non-Zero: Failed to register the symbol
+ *
+ * Notes:
+ *   N/A
+ */
+
+int
+cfs_symbol_register(const char *name, const void *value)
+{
+    struct list_head  *walker;
+    struct cfs_symbol *sym = NULL;
+    struct cfs_symbol *new = NULL;
+
+    new = cfs_alloc(sizeof(struct cfs_symbol), CFS_ALLOC_ZERO);
+    if (!new) {
+        return (-ENOMEM);
+    }
+    strncpy(new->name, name, CFS_SYMBOL_LEN);
+    new->value = (void *)value;
+    new->ref = 0;
+    CFS_INIT_LIST_HEAD(&new->sym_list);
+
+    down_write(&cfs_symbol_lock);
+    list_for_each(walker, &cfs_symbol_list) {
+        sym = list_entry(walker, struct cfs_symbol, sym_list);
+        if (!strcmp(sym->name, name)) {
+            up_write(&cfs_symbol_lock);
+            cfs_free(new);
+            return 0; /* already registered */
+        }
+    }
+    list_add_tail(&new->sym_list, &cfs_symbol_list);
+    up_write(&cfs_symbol_lock);
+
+    return 0;
+}
+
+/*
+ * cfs_symbol_unregister
+ *   To unregister/remove the specified symbol
+ *
+ * Arguments:
+ *   name: the symbol name to be unregistered
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
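+ *   The symbol must no longer be referenced (ref == 0) when it is
+ *   removed; the routine asserts this before freeing the node.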
+ */
+
+void
+cfs_symbol_unregister(const char *name)
+{
+    struct list_head  *walker;
+    struct list_head  *nxt;
+    struct cfs_symbol *sym = NULL;
+
+    down_write(&cfs_symbol_lock);
+    list_for_each_safe(walker, nxt, &cfs_symbol_list) {
+        sym = list_entry(walker, struct cfs_symbol, sym_list);
+        if (!strcmp(sym->name, name)) {
+            LASSERT(sym->ref == 0);
+            list_del(&sym->sym_list);
+            cfs_free(sym);
+            break;
+        }
+    }
+    up_write(&cfs_symbol_lock);
+}
+
+/*
+ * cfs_symbol_clean
+ *   To clean up all the symbols
+ *
+ * Arguments:
+ *   N/A
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void
+cfs_symbol_clean()
+{
+    struct list_head  *walker;
+    struct list_head  *nxt;
+    struct cfs_symbol *sym = NULL;
+
+    down_write(&cfs_symbol_lock);
+    /* we delete while walking, so use the safe iterator */
+    list_for_each_safe(walker, nxt, &cfs_symbol_list) {
+        sym = list_entry(walker, struct cfs_symbol, sym_list);
+        LASSERT(sym->ref == 0);
+        list_del(&sym->sym_list);
+        cfs_free(sym);
+    }
+    up_write(&cfs_symbol_lock);
+    return;
+}
+
+
+
+/*
+ * Timer routines
+ */
+
+
+/* Timer dpc procedure */
+
+static void
+cfs_timer_dpc_proc (
+    IN PKDPC Dpc,
+    IN PVOID DeferredContext,
+    IN PVOID SystemArgument1,
+    IN PVOID SystemArgument2)
+{
+    cfs_timer_t *timer;
+    KIRQL        Irql;
+
+    timer = (cfs_timer_t *) DeferredContext;
+
+    /* clear the flag */
+    KeAcquireSpinLock(&(timer->Lock), &Irql);
+    cfs_clear_flag(timer->Flags, CFS_TIMER_FLAG_TIMERED);
+    KeReleaseSpinLock(&(timer->Lock), Irql);
+
+    /* call the user specified timer procedure */
+    timer->proc((unsigned long)(timer->arg));
+}
+
+/*
+ * cfs_timer_init
+ *   To initialize the cfs_timer_t
+ *
+ * Arguments:
+ *   timer: the cfs_timer to be initialized
+ *   func:  the timer callback procedure
+ *   arg:   argument for the callback proc
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void cfs_timer_init(cfs_timer_t *timer, void (*func)(unsigned long), void *arg)
+{
+    memset(timer, 0, sizeof(cfs_timer_t));
+
+    timer->proc = func;
+    timer->arg  = arg;
+
+    KeInitializeSpinLock(&(timer->Lock));
+    KeInitializeTimer(&timer->Timer);
+    KeInitializeDpc(&timer->Dpc, cfs_timer_dpc_proc, timer);
+
+    cfs_set_flag(timer->Flags, CFS_TIMER_FLAG_INITED);
+}
+
+/*
+ * cfs_timer_done
+ *   To finalize the cfs_timer_t (unused)
+ *
+ * Arguments:
+ *   timer: the cfs_timer to be cleaned up
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void cfs_timer_done(cfs_timer_t *timer)
+{
+    return;
+}
+
+/*
+ * cfs_timer_arm
+ *   To arm the timer so that it fires at @deadline
+ *
+ * Arguments:
+ *   timer:    the cfs_timer to be armed
+ *   deadline: the time at which the timer is to fire
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void cfs_timer_arm(cfs_timer_t *timer, cfs_time_t deadline)
+{
+    LARGE_INTEGER timeout;
+    KIRQL         Irql;
+
+    KeAcquireSpinLock(&(timer->Lock), &Irql);
+    if (!cfs_is_flag_set(timer->Flags, CFS_TIMER_FLAG_TIMERED)) {
+
+        timeout.QuadPart = (LONGLONG)-1*1000*1000*10/HZ*deadline;
+
+        if (KeSetTimer(&timer->Timer, timeout, &timer->Dpc)) {
+            cfs_set_flag(timer->Flags, CFS_TIMER_FLAG_TIMERED);
+        }
+
+        timer->deadline = deadline;
+    }
+
+    KeReleaseSpinLock(&(timer->Lock), Irql);
+}
+
+/*
+ * cfs_timer_disarm
+ *   To cancel a timer that has been scheduled
+ *
+ * Arguments:
+ *   timer: the cfs_timer to be discarded
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void cfs_timer_disarm(cfs_timer_t *timer)
+{
+    KIRQL Irql;
+
+    KeAcquireSpinLock(&(timer->Lock), &Irql);
+    KeCancelTimer(&(timer->Timer));
+    cfs_clear_flag(timer->Flags, CFS_TIMER_FLAG_TIMERED);
+    KeReleaseSpinLock(&(timer->Lock), Irql);
+}
+
+
+/*
+ * cfs_timer_is_armed
+ *   To check whether the timer is scheduled or not
+ *
+ * Arguments:
+ *   timer: the cfs_timer to be checked
+ *
+ * Return Value:
+ *   1: if it's armed.
+ *   0: if it's not.
+ *
+ * Notes:
+ *   N/A
+ */
+
+int cfs_timer_is_armed(cfs_timer_t *timer)
+{
+    int   rc = 0;
+    KIRQL Irql;
+
+    KeAcquireSpinLock(&(timer->Lock), &Irql);
+    if (cfs_is_flag_set(timer->Flags, CFS_TIMER_FLAG_TIMERED)) {
+        rc = 1;
+    }
+    KeReleaseSpinLock(&(timer->Lock), Irql);
+
+    return rc;
+}
+
+/*
+ * cfs_timer_deadline
+ *   To query the deadline of the timer
+ *
+ * Arguments:
+ *   timer: the cfs_timer to be queried
+ *
+ * Return Value:
+ *   the deadline value
+ *
+ * Notes:
+ *   N/A
+ */
+
+cfs_time_t cfs_timer_deadline(cfs_timer_t * timer)
+{
+    return timer->deadline;
+}
+
+/*
+ * daemonize routine stub
+ */
+
+void cfs_daemonize(char *str)
+{
+    return;
+}
+
+/*
+ * routines related to signals
+ */
+
+cfs_sigset_t cfs_get_blockedsigs()
+{
+    return 0;
+}
+
+cfs_sigset_t cfs_block_allsigs()
+{
+    return 0;
+}
+
+cfs_sigset_t cfs_block_sigs(sigset_t bit)
+{
+    return 0;
+}
+
+void cfs_restore_sigs(cfs_sigset_t old)
+{
+}
+
+int cfs_signal_pending(void)
+{
+    return 0;
+}
+
+void cfs_clear_sigpending(void)
+{
+    return;
+}
+
+/**
+ ** Initialize routines
+ **/
+
+int
+libcfs_arch_init(void)
+{
+    int rc;
+
+    spinlock_t lock;
+    /* Workaround to check whether the system is an MP or a UP build */
+    spin_lock_init(&lock);
+    spin_lock(&lock);
+    MPSystem = (int)lock.lock;
+    /* On an MP build this is a real spin; on a UP build it only
+       raises the IRQL to DISPATCH_LEVEL */
+    spin_unlock(&lock);
+
+    /* create slab memory caches for the page allocators */
+    cfs_page_t_slab = cfs_mem_cache_create(
+        "CPGT", sizeof(cfs_page_t), 0, 0);
+
+    cfs_page_p_slab = cfs_mem_cache_create(
+        "CPGP", CFS_PAGE_SIZE, 0, 0);
+
+    if (cfs_page_t_slab == NULL ||
+        cfs_page_p_slab == NULL) {
+        rc = -ENOMEM;
+        goto errorout;
+    }
+
+    rc = init_task_manager();
+
+    if (rc != 0) {
+        cfs_enter_debugger();
+        KdPrint(("winnt-prim.c:libcfs_arch_init: error initializing task manager ...\n"));
+        goto errorout;
+    }
+
+    /* initialize the proc file system */
+    rc = proc_init_fs();
+
+    if (rc != 0) {
+        cfs_enter_debugger();
+        KdPrint(("winnt-prim.c:libcfs_arch_init: error initializing proc fs ...\n"));
+        cleanup_task_manager();
+        goto errorout;
+    }
+
+    /* initialize the tdi data */
+    rc = ks_init_tdi_data();
+
+    if (rc != 0) {
+        cfs_enter_debugger();
+        KdPrint(("winnt-prim.c:libcfs_arch_init: error initializing tdi ...\n"));
+        proc_destroy_fs();
+        cleanup_task_manager();
+        goto errorout;
+    }
+
+errorout:
+
+    if (rc != 0) {
+        /* destroy the page cache slabs */
+        if (cfs_page_t_slab) {
+            cfs_mem_cache_destroy(cfs_page_t_slab);
+        }
+        if (cfs_page_p_slab) {
+            cfs_mem_cache_destroy(cfs_page_p_slab);
+        }
+    }
+
+    return rc;
+}
+
+void
+libcfs_arch_cleanup(void)
+{
+    /* finalize the tdi data */
+    ks_fini_tdi_data();
+
+    /* destroy the whole proc fs tree and nodes */
+    proc_destroy_fs();
+
+    /* destroy the page cache slabs */
+    if (cfs_page_t_slab) {
+        cfs_mem_cache_destroy(cfs_page_t_slab);
+    }
+
+    if (cfs_page_p_slab) {
+        cfs_mem_cache_destroy(cfs_page_p_slab);
+    }
+
+    return;
+}
+
+EXPORT_SYMBOL(libcfs_arch_init);
+EXPORT_SYMBOL(libcfs_arch_cleanup);
diff --git a/lnet/libcfs/winnt/winnt-proc.c b/lnet/libcfs/winnt/winnt-proc.c
new file mode 100644
index 0000000..ebce30d
--- /dev/null
+++ b/lnet/libcfs/winnt/winnt-proc.c
@@ -0,0 +1,1990 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ *
+ *  Copyright (c) 2004 Cluster File Systems, Inc.
+ * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or modify it under + * the terms of version 2 of the GNU General Public License as published by + * the Free Software Foundation. Lustre is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. You should have received a + * copy of the GNU General Public License along with Lustre; if not, write + * to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, + * USA. + */ + + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif + +# define DEBUG_SUBSYSTEM S_LNET + +#include +#include +#include "tracefile.h" + +#ifdef __KERNEL__ + + +/* + * /proc emulator routines ... + */ + +/* The root node of the proc fs emulation: /proc */ +cfs_proc_entry_t * proc_fs_root = NULL; + + +/* The sys root: /proc/sys */ +cfs_proc_entry_t * proc_sys_root = NULL; + + +/* The sys root: /proc/dev | to implement misc device */ + +cfs_proc_entry_t * proc_dev_root = NULL; + + +/* SLAB object for cfs_proc_entry_t allocation */ + +cfs_mem_cache_t * proc_entry_cache = NULL; + +/* root node for sysctl table */ + +cfs_sysctl_table_header_t root_table_header; + +/* The global lock to protect all the access */ + +#if LIBCFS_PROCFS_SPINLOCK +spinlock_t proc_fs_lock; + +#define INIT_PROCFS_LOCK() spin_lock_init(&proc_fs_lock) +#define LOCK_PROCFS() spin_lock(&proc_fs_lock) +#define UNLOCK_PROCFS() spin_unlock(&proc_fs_lock) + +#else + +mutex_t proc_fs_lock; + +#define INIT_PROCFS_LOCK() init_mutex(&proc_fs_lock) +#define LOCK_PROCFS() mutex_down(&proc_fs_lock) +#define UNLOCK_PROCFS() mutex_up(&proc_fs_lock) + +#endif + +static ssize_t +proc_file_read(struct file * file, const char * buf, size_t nbytes, loff_t *ppos) +{ + char *page; + ssize_t retval=0; + int eof=0; + ssize_t n, count; + char *start; + cfs_proc_entry_t * dp; + + dp = (cfs_proc_entry_t *) file->private_data; + if (!(page = (char*) cfs_alloc(CFS_PAGE_SIZE, 0))) + return -ENOMEM; + + while ((nbytes > 0) && !eof) { + + count = min_t(size_t, PROC_BLOCK_SIZE, nbytes); + + start = NULL; + if (dp->read_proc) { + n = dp->read_proc( page, &start, (long)*ppos, + count, &eof, dp->data); + } else + break; + + if (!start) { + /* + * For proc files that are less than 4k + */ + start = page + *ppos; + n -= (ssize_t)(*ppos); + if (n <= 0) + break; + if (n > count) + n = count; + } + if (n == 0) + break; /* End of file */ + if (n < 0) { + if (retval == 0) + retval = n; + break; + } + + n -= copy_to_user((void *)buf, start, n); + if (n == 0) { + if (retval == 0) + retval = -EFAULT; + break; + } + + *ppos += n; + nbytes -= n; + buf += n; + retval += n; + } + cfs_free(page); + + return retval; +} + +static ssize_t +proc_file_write(struct file * file, const char * buffer, + size_t count, loff_t *ppos) +{ + cfs_proc_entry_t * dp; + + dp = (cfs_proc_entry_t *) file->private_data; + + if (!dp->write_proc) + return -EIO; + + /* FIXME: does this routine need ppos? probably... 
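+     *        The Linux write_proc callbacks mirrored here only take
+     *        (file, buffer, count, data) and keep their own notion of
+     *        the file position, so ppos is not forwarded for now.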
*/ + return dp->write_proc(file, buffer, count, dp->data); +} + +struct file_operations proc_file_operations = { + /*lseek:*/ NULL, //proc_file_lseek, + /*read:*/ proc_file_read, + /*write:*/ proc_file_write, + /*ioctl:*/ NULL, + /*open:*/ NULL, + /*release:*/ NULL +}; + +/* allocate proc entry block */ + +cfs_proc_entry_t * +proc_alloc_entry() +{ + cfs_proc_entry_t * entry = NULL; + + entry = cfs_mem_cache_alloc(proc_entry_cache, 0); + if (!entry) { + return NULL; + } + + memset(entry, 0, sizeof(cfs_proc_entry_t)); + + entry->magic = CFS_PROC_ENTRY_MAGIC; + RtlInitializeSplayLinks(&(entry->s_link)); + entry->proc_fops = &proc_file_operations; + + return entry; +} + +/* free the proc entry block */ + +void +proc_free_entry(cfs_proc_entry_t * entry) + +{ + ASSERT(entry->magic == CFS_PROC_ENTRY_MAGIC); + + cfs_mem_cache_free(proc_entry_cache, entry); +} + +/* dissect the path string for a given full proc path */ + +void +proc_dissect_name( + char *path, + char **first, + int *first_len, + char **remain + ) +{ + int i = 0, j = 0, len = 0; + + *first = *remain = NULL; + *first_len = 0; + + len = strlen(path); + + while (i < len && (path[i] == '/')) i++; + + if (i < len) { + + *first = path + i; + while (i < len && (path[i] != '/')) i++; + *first_len = (path + i - *first); + + if (i + 1 < len) { + *remain = path + i + 1; + } + } +} + +/* search the children entries of the parent entry */ + +cfs_proc_entry_t * +proc_search_splay ( + cfs_proc_entry_t * parent, + char * name + ) +{ + cfs_proc_entry_t * node; + PRTL_SPLAY_LINKS link; + + ASSERT(parent->magic == CFS_PROC_ENTRY_MAGIC); + ASSERT(cfs_is_flag_set(parent->flags, CFS_PROC_FLAG_DIRECTORY)); + + link = parent->root; + + while (link) { + + ANSI_STRING ename,nname; + long result; + + node = CONTAINING_RECORD(link, cfs_proc_entry_t, s_link); + + ASSERT(node->magic == CFS_PROC_ENTRY_MAGIC); + + /* Compare the prefix in the tree with the full name */ + + RtlInitAnsiString(&ename, name); + RtlInitAnsiString(&nname, node->name); + + result = RtlCompareString(&nname, &ename,TRUE); + + if (result > 0) { + + /* The prefix is greater than the full name + so we go down the left child */ + + link = RtlLeftChild(link); + + } else if (result < 0) { + + /* The prefix is less than the full name + so we go down the right child */ + // + + link = RtlRightChild(link); + + } else { + + /* We got the entry in the splay tree and + make it root node instead */ + + parent->root = RtlSplay(link); + + return node; + } + + /* we need continue searching down the tree ... 
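+         * until either a matching node is splayed to the root above,
+         * or we walk off a leaf and fall out of the loop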
*/ + } + + /* There's no the exptected entry in the splay tree */ + + return NULL; +} + +int +proc_insert_splay ( + cfs_proc_entry_t * parent, + cfs_proc_entry_t * child + ) +{ + cfs_proc_entry_t * entry; + + ASSERT(parent != NULL && child != NULL); + ASSERT(parent->magic == CFS_PROC_ENTRY_MAGIC); + ASSERT(child->magic == CFS_PROC_ENTRY_MAGIC); + ASSERT(cfs_is_flag_set(parent->flags, CFS_PROC_FLAG_DIRECTORY)); + + if (!parent->root) { + parent->root = &(child->s_link); + } else { + entry = CONTAINING_RECORD(parent->root, cfs_proc_entry_t, s_link); + while (TRUE) { + long result; + ANSI_STRING ename, cname; + + ASSERT(entry->magic == CFS_PROC_ENTRY_MAGIC); + + RtlInitAnsiString(&ename, entry->name); + RtlInitAnsiString(&cname, child->name); + + result = RtlCompareString(&ename, &cname,TRUE); + + if (result == 0) { + cfs_enter_debugger(); + if (entry == child) { + break; + } + return FALSE; + } + + if (result > 0) { + if (RtlLeftChild(&entry->s_link) == NULL) { + RtlInsertAsLeftChild(&entry->s_link, &child->s_link); + break; + } else { + entry = CONTAINING_RECORD( RtlLeftChild(&entry->s_link), + cfs_proc_entry_t, s_link); + } + } else { + if (RtlRightChild(&entry->s_link) == NULL) { + RtlInsertAsRightChild(&entry->s_link, &child->s_link); + break; + } else { + entry = CONTAINING_RECORD( RtlRightChild(&entry->s_link), + cfs_proc_entry_t, s_link ); + } + } + } + } + + cfs_set_flag(child->flags, CFS_PROC_FLAG_ATTACHED); + parent->nlink++; + + return TRUE; +} + + +/* remove a child entry from the splay tree */ +int +proc_remove_splay ( + cfs_proc_entry_t * parent, + cfs_proc_entry_t * child + ) +{ + cfs_proc_entry_t * entry = NULL; + + ASSERT(parent != NULL && child != NULL); + ASSERT(parent->magic == CFS_PROC_ENTRY_MAGIC); + ASSERT(child->magic == CFS_PROC_ENTRY_MAGIC); + ASSERT(cfs_is_flag_set(parent->flags, CFS_PROC_FLAG_DIRECTORY)); + ASSERT(cfs_is_flag_set(child->flags, CFS_PROC_FLAG_ATTACHED)); + + entry = proc_search_splay(parent, child->name); + + if (entry) { + ASSERT(entry == child); + parent->root = RtlDelete(&(entry->s_link)); + parent->nlink--; + } else { + cfs_enter_debugger(); + return FALSE; + } + + return TRUE; +} + + +/* search a node inside the proc fs tree */ + +cfs_proc_entry_t * +proc_search_entry( + char * name, + cfs_proc_entry_t * root + ) +{ + cfs_proc_entry_t * entry; + cfs_proc_entry_t * parent; + char *first, *remain; + int flen; + char *ename = NULL; + + parent = root; + entry = NULL; + + ename = cfs_alloc(0x21, CFS_ALLOC_ZERO); + + if (ename == NULL) { + goto errorout; + } + +again: + + /* dissect the file name string */ + proc_dissect_name(name, &first, &flen, &remain); + + if (first) { + + if (flen >= 0x20) { + cfs_enter_debugger(); + entry = NULL; + goto errorout; + } + + memset(ename, 0, 0x20); + memcpy(ename, first, flen); + + entry = proc_search_splay(parent, ename); + + if (!entry) { + goto errorout; + } + + if (remain) { + name = remain; + parent = entry; + + goto again; + } + } + +errorout: + + if (ename) { + cfs_free(ename); + } + + return entry; +} + +/* insert the path nodes to the proc fs tree */ + +cfs_proc_entry_t * +proc_insert_entry( + char * name, + cfs_proc_entry_t * root + ) +{ + cfs_proc_entry_t *entry; + cfs_proc_entry_t *parent; + char *first, *remain; + int flen; + char ename[0x20]; + + parent = root; + entry = NULL; + +again: + + proc_dissect_name(name, &first, &flen, &remain); + + if (first) { + + if (flen >= 0x20) { + return NULL; + } + + memset(ename, 0, 0x20); + memcpy(ename, first, flen); + + entry = proc_search_splay(parent, ename); + 
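+        /* Not found under this parent: allocate a fresh node and
+         * splice it into the parent's splay tree below. */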
+        if (!entry) {
+            entry = proc_alloc_entry();
+
+            if (entry) {
+                /* copy the name in only after the allocation is
+                 * known to have succeeded */
+                memcpy(entry->name, ename, flen);
+
+                if (!proc_insert_splay(parent, entry)) {
+                    proc_free_entry(entry);
+                    entry = NULL;
+                }
+            }
+        }
+
+        if (!entry) {
+            return NULL;
+        }
+
+        if (remain) {
+            entry->mode |= S_IFDIR | S_IRUGO | S_IXUGO;
+            cfs_set_flag(entry->flags, CFS_PROC_FLAG_DIRECTORY);
+            name = remain;
+            parent = entry;
+            goto again;
+        }
+    }
+
+    return entry;
+}
+
+/* remove the path nodes from the proc fs tree */
+
+void
+proc_remove_entry(
+    char *             name,
+    cfs_proc_entry_t * root
+    )
+{
+    cfs_proc_entry_t *entry;
+    char             *first, *remain;
+    int               flen;
+    char              ename[0x20];
+
+    entry = NULL;
+
+    proc_dissect_name(name, &first, &flen, &remain);
+
+    if (first) {
+
+        memset(ename, 0, 0x20);
+        memcpy(ename, first, flen);
+
+        entry = proc_search_splay(root, ename);
+
+        if (entry) {
+
+            if (remain) {
+                ASSERT(S_ISDIR(entry->mode));
+                proc_remove_entry(remain, entry);
+            }
+
+            if (!entry->nlink) {
+                proc_remove_splay(root, entry);
+                proc_free_entry(entry);
+            }
+        }
+    } else {
+        cfs_enter_debugger();
+    }
+}
+
+/* create proc entry and insert it into the proc fs */
+
+cfs_proc_entry_t *
+create_proc_entry (
+    char *             name,
+    mode_t             mode,
+    cfs_proc_entry_t * root
+    )
+{
+    cfs_proc_entry_t *parent = root;
+    cfs_proc_entry_t *entry  = NULL;
+
+    if (S_ISDIR(mode)) {
+        if ((mode & S_IALLUGO) == 0)
+            mode |= S_IRUGO | S_IXUGO;
+    } else {
+        if ((mode & S_IFMT) == 0)
+            mode |= S_IFREG;
+        if ((mode & S_IALLUGO) == 0)
+            mode |= S_IRUGO;
+    }
+
+    LOCK_PROCFS();
+
+    ASSERT(NULL != proc_fs_root);
+
+    if (!parent) {
+        parent = proc_fs_root;
+    }
+
+    entry = proc_search_entry(name, parent);
+
+    if (!entry) {
+        entry = proc_insert_entry(name, parent);
+        if (!entry) {
+            /* Failed to create/insert the splay node ... */
+            cfs_enter_debugger();
+            goto errorout;
+        }
+        /* Initializing entry ...
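+         * (mode and the directory flag only; callers are expected to
+         * hook up read_proc/write_proc/data afterwards, as
+         * insert_proc() does further down in this file)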
*/ + entry->mode = mode; + + if (S_ISDIR(mode)) { + cfs_set_flag(entry->flags, CFS_PROC_FLAG_DIRECTORY); + } + } + +errorout: + + UNLOCK_PROCFS(); + + return entry; +} + + +/* search the specified entry form the proc fs */ + +cfs_proc_entry_t * +search_proc_entry( + char * name, + cfs_proc_entry_t * root + ) +{ + cfs_proc_entry_t * entry; + + LOCK_PROCFS(); + if (root == NULL) { + root = proc_fs_root; + } + entry = proc_search_entry(name, root); + UNLOCK_PROCFS(); + + return entry; +} + +/* remove the entry from the proc fs */ + +void +remove_proc_entry( + char * name, + cfs_proc_entry_t * parent + ) +{ + LOCK_PROCFS(); + if (parent == NULL) { + parent = proc_fs_root; + } + proc_remove_entry(name, parent); + UNLOCK_PROCFS(); +} + + +void proc_destroy_splay(cfs_proc_entry_t * entry) +{ + cfs_proc_entry_t * node; + + if (S_ISDIR(entry->mode)) { + + while (entry->root) { + node = CONTAINING_RECORD(entry->root, cfs_proc_entry_t, s_link); + entry->root = RtlDelete(&(node->s_link)); + proc_destroy_splay(node); + } + } + + proc_free_entry(entry); +} + + +/* destory the whole proc fs tree */ + +void proc_destroy_fs() +{ + LOCK_PROCFS(); + + if (proc_fs_root) { + proc_destroy_splay(proc_fs_root); + } + + if (proc_entry_cache) { + cfs_mem_cache_destroy(proc_entry_cache); + } + + UNLOCK_PROCFS(); +} + +/* initilaize / build the proc fs tree */ + +int proc_init_fs() +{ + cfs_proc_entry_t * root = NULL; + + memset(&(root_table_header), 0, sizeof(struct ctl_table_header)); + INIT_LIST_HEAD(&(root_table_header.ctl_entry)); + + INIT_PROCFS_LOCK(); + proc_entry_cache = cfs_mem_cache_create( + NULL, + sizeof(cfs_proc_entry_t), + 0, + 0 + ); + + if (!proc_entry_cache) { + return (-ENOMEM); + } + + root = proc_alloc_entry(); + + if (!root) { + proc_destroy_fs(); + return (-ENOMEM); + } + + root->magic = CFS_PROC_ENTRY_MAGIC; + root->flags = CFS_PROC_FLAG_DIRECTORY; + root->mode = S_IFDIR | S_IRUGO | S_IXUGO; + root->nlink = 3; // root should never be deleted. + + root->name[0]='p'; + root->name[1]='r'; + root->name[2]='o'; + root->name[3]='c'; + + proc_fs_root = root; + + proc_sys_root = create_proc_entry("sys", S_IFDIR, root); + + if (!proc_sys_root) { + proc_free_entry(root); + proc_fs_root = NULL; + proc_destroy_fs(); + return (-ENOMEM); + } + + proc_sys_root->nlink = 1; + + proc_dev_root = create_proc_entry("dev", S_IFDIR, root); + + if (!proc_dev_root) { + proc_free_entry(proc_sys_root); + proc_sys_root = NULL; + proc_free_entry(proc_fs_root); + proc_fs_root = NULL; + proc_destroy_fs(); + return (-ENOMEM); + } + + proc_dev_root->nlink = 1; + + return 0; +} + + +static ssize_t do_rw_proc(int write, struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + int op; + cfs_proc_entry_t *de; + struct ctl_table *table; + size_t res; + ssize_t error; + + de = (cfs_proc_entry_t *) file->proc_dentry; + + if (!de || !de->data) + return -ENOTDIR; + table = (struct ctl_table *) de->data; + if (!table || !table->proc_handler) + return -ENOTDIR; + op = (write ? 002 : 004); + +// if (ctl_perm(table, op)) +// return -EPERM; + + res = count; + + /* + * FIXME: we need to pass on ppos to the handler. 
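+     *
+     * For reference, the handler invoked below is expected to match
+     * the call site; a sketch (hypothetical name) would be
+     *
+     *     int my_handler(cfs_sysctl_table_t *table, int write,
+     *                    struct file *filp, void *buffer, size_t *lenp);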
+ */ + + error = (*table->proc_handler) (table, write, file, buf, &res); + if (error) + return error; + return res; +} + +static ssize_t proc_readsys(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + return do_rw_proc(0, file, buf, count, ppos); +} + +static ssize_t proc_writesys(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + return do_rw_proc(1, file, (char *) buf, count, ppos); +} + + +struct file_operations proc_sys_file_operations = { + /*lseek:*/ NULL, + /*read:*/ proc_readsys, + /*write:*/ proc_writesys, + /*ioctl:*/ NULL, + /*open:*/ NULL, + /*release:*/ NULL +}; + + +/* Scan the sysctl entries in table and add them all into /proc */ +void register_proc_table(cfs_sysctl_table_t * table, cfs_proc_entry_t * root) +{ + cfs_proc_entry_t * de; + int len; + mode_t mode; + + for (; table->ctl_name; table++) { + /* Can't do anything without a proc name. */ + if (!table->procname) + continue; + /* Maybe we can't do anything with it... */ + if (!table->proc_handler && !table->child) { + printk(KERN_WARNING "SYSCTL: Can't register %s\n", + table->procname); + continue; + } + + len = strlen(table->procname); + mode = table->mode; + + de = NULL; + if (table->proc_handler) + mode |= S_IFREG; + else { + de = search_proc_entry(table->procname, root); + if (de) { + break; + } + /* If the subdir exists already, de is non-NULL */ + } + + if (!de) { + + de = create_proc_entry((char *)table->procname, mode, root); + if (!de) + continue; + de->data = (void *) table; + if (table->proc_handler) { + de->proc_fops = &proc_sys_file_operations; + } + } + table->de = de; + if (de->mode & S_IFDIR) + register_proc_table(table->child, de); + } +} + + +/* + * Unregister a /proc sysctl table and any subdirectories. + */ +void unregister_proc_table(cfs_sysctl_table_t * table, cfs_proc_entry_t *root) +{ + cfs_proc_entry_t *de; + for (; table->ctl_name; table++) { + if (!(de = table->de)) + continue; + if (de->mode & S_IFDIR) { + if (!table->child) { + printk (KERN_ALERT "Help - malformed sysctl tree on free\n"); + continue; + } + unregister_proc_table(table->child, de); + + /* Don't unregister directories which still have entries.. */ + if (de->nlink) + continue; + } + + /* Don't unregister proc entries that are still being used.. 
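+         * (nlink stays non-zero while, e.g., lustre_open_file() holds
+         * the entry open; it drops again on the matching close)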
*/ + if (de->nlink) + continue; + + table->de = NULL; + remove_proc_entry((char *)table->procname, root); + } +} + +/* The generic string strategy routine: */ +int sysctl_string(cfs_sysctl_table_t *table, int *name, int nlen, + void *oldval, size_t *oldlenp, + void *newval, size_t newlen, void **context) +{ + int l, len; + + if (!table->data || !table->maxlen) + return -ENOTDIR; + + if (oldval && oldlenp) { + if(get_user(len, oldlenp)) + return -EFAULT; + if (len) { + l = strlen(table->data); + if (len > l) len = l; + if (len >= table->maxlen) + len = table->maxlen; + if(copy_to_user(oldval, table->data, len)) + return -EFAULT; + if(put_user(0, ((char *) oldval) + len)) + return -EFAULT; + if(put_user(len, oldlenp)) + return -EFAULT; + } + } + if (newval && newlen) { + len = newlen; + if (len > table->maxlen) + len = table->maxlen; + if(copy_from_user(table->data, newval, len)) + return -EFAULT; + if (len == table->maxlen) + len--; + ((char *) table->data)[len] = 0; + } + return 0; +} + +/** + * simple_strtoul - convert a string to an unsigned long + * @cp: The start of the string + * @endp: A pointer to the end of the parsed string will be placed here + * @base: The number base to use + */ +unsigned long simple_strtoul(const char *cp,char **endp,unsigned int base) +{ + unsigned long result = 0, value; + + if (!base) { + base = 10; + if (*cp == '0') { + base = 8; + cp++; + if ((*cp == 'x') && isxdigit(cp[1])) { + cp++; + base = 16; + } + } + } + while (isxdigit(*cp) && + (value = isdigit(*cp) ? *cp-'0' : toupper(*cp)-'A'+10) < base) { + result = result*base + value; + cp++; + } + if (endp) + *endp = (char *)cp; + return result; +} + +#define OP_SET 0 +#define OP_AND 1 +#define OP_OR 2 +#define OP_MAX 3 +#define OP_MIN 4 + + +static int do_proc_dointvec(cfs_sysctl_table_t *table, int write, struct file *filp, + void *buffer, size_t *lenp, int conv, int op) +{ + int *i, vleft, first=1, neg, val; + size_t left, len; + + #define TMPBUFLEN 20 + char buf[TMPBUFLEN], *p; + + if (!table->data || !table->maxlen || !*lenp) + { + *lenp = 0; + return 0; + } + + i = (int *) table->data; + vleft = table->maxlen / sizeof(int); + left = *lenp; + + for (; left && vleft--; i++, first=0) { + if (write) { + while (left) { + char c; + if(get_user(c,(char *) buffer)) + return -EFAULT; + if (!isspace(c)) + break; + left--; + ((char *) buffer)++; + } + if (!left) + break; + neg = 0; + len = left; + if (len > TMPBUFLEN-1) + len = TMPBUFLEN-1; + if(copy_from_user(buf, buffer, len)) + return -EFAULT; + buf[len] = 0; + p = buf; + if (*p == '-' && left > 1) { + neg = 1; + left--, p++; + } + if (*p < '0' || *p > '9') + break; + val = simple_strtoul(p, &p, 0) * conv; + len = p-buf; + if ((len < left) && *p && !isspace(*p)) + break; + if (neg) + val = -val; + (char *)buffer += len; + left -= len; + switch(op) { + case OP_SET: *i = val; break; + case OP_AND: *i &= val; break; + case OP_OR: *i |= val; break; + case OP_MAX: if(*i < val) + *i = val; + break; + case OP_MIN: if(*i > val) + *i = val; + break; + } + } else { + p = buf; + if (!first) + *p++ = '\t'; + sprintf(p, "%d", (*i) / conv); + len = strlen(buf); + if (len > left) + len = left; + if(copy_to_user(buffer, buf, len)) + return -EFAULT; + left -= len; + (char *)buffer += len; + } + } + + if (!write && !first && left) { + if(put_user('\n', (char *) buffer)) + return -EFAULT; + left--, ((char *)buffer)++; + } + if (write) { + p = (char *) buffer; + while (left) { + char c; + if(get_user(c, p++)) + return -EFAULT; + if (!isspace(c)) + break; + left--; + } + } + if 
(write && first) + return -EINVAL; + *lenp -= left; + memset(&(filp->f_pos) , 0, sizeof(loff_t)); + filp->f_pos += (loff_t)(*lenp); + return 0; +} + +/** + * proc_dointvec - read a vector of integers + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_dointvec(cfs_sysctl_table_t *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + return do_proc_dointvec(table,write,filp,buffer,lenp,1,OP_SET); +} + + +/** + * proc_dostring - read a string sysctl + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * + * Reads/writes a string from/to the user buffer. If the kernel + * buffer provided is not large enough to hold the string, the + * string is truncated. The copied string is %NULL-terminated. + * If the string is being read by the user process, it is copied + * and a newline '\n' is added. It is truncated if the buffer is + * not large enough. + * + * Returns 0 on success. + */ +int proc_dostring(cfs_sysctl_table_t *table, int write, struct file *filp, + void *buffer, size_t *lenp) +{ + size_t len; + char *p, c; + + if (!table->data || !table->maxlen || !*lenp || + (filp->f_pos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + len = 0; + p = buffer; + while (len < *lenp) { + if(get_user(c, p++)) + return -EFAULT; + if (c == 0 || c == '\n') + break; + len++; + } + if (len >= (size_t)table->maxlen) + len = (size_t)table->maxlen-1; + if(copy_from_user(table->data, buffer, len)) + return -EFAULT; + ((char *) table->data)[len] = 0; + filp->f_pos += *lenp; + } else { + len = (size_t)strlen(table->data); + if (len > (size_t)table->maxlen) + len = (size_t)table->maxlen; + if (len > *lenp) + len = *lenp; + if (len) + if(copy_to_user(buffer, table->data, len)) + return -EFAULT; + if (len < *lenp) { + if(put_user('\n', ((char *) buffer) + len)) + return -EFAULT; + len++; + } + *lenp = len; + filp->f_pos += len; + } + return 0; +} + +/* Perform the actual read/write of a sysctl table entry. 
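+ *
+ * When a strategy routine is present it is consulted first; as in
+ * the Linux sysctl code its return value means
+ *     < 0 - error, propagated to the caller
+ *       0 - fall through to the automatic copy below
+ *     > 0 - the strategy routine did the transfer itself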
*/ +int do_sysctl_strategy (cfs_sysctl_table_t *table, + int *name, int nlen, + void *oldval, size_t *oldlenp, + void *newval, size_t newlen, void **context) +{ + int op = 0, rc; + size_t len; + + if (oldval) + op |= 004; + if (newval) + op |= 002; + + if (table->strategy) { + rc = table->strategy(table, name, nlen, oldval, oldlenp, + newval, newlen, context); + if (rc < 0) + return rc; + if (rc > 0) + return 0; + } + + /* If there is no strategy routine, or if the strategy returns + * zero, proceed with automatic r/w */ + if (table->data && table->maxlen) { + if (oldval && oldlenp) { + get_user(len, oldlenp); + if (len) { + if (len > (size_t)table->maxlen) + len = (size_t)table->maxlen; + if(copy_to_user(oldval, table->data, len)) + return -EFAULT; + if(put_user(len, oldlenp)) + return -EFAULT; + } + } + if (newval && newlen) { + len = newlen; + if (len > (size_t)table->maxlen) + len = (size_t)table->maxlen; + if(copy_from_user(table->data, newval, len)) + return -EFAULT; + } + } + return 0; +} + +static int parse_table(int *name, int nlen, + void *oldval, size_t *oldlenp, + void *newval, size_t newlen, + cfs_sysctl_table_t *table, void **context) +{ + int n; + +repeat: + + if (!nlen) + return -ENOTDIR; + if (get_user(n, name)) + return -EFAULT; + for ( ; table->ctl_name; table++) { + if (n == table->ctl_name || table->ctl_name == CTL_ANY) { + int error; + if (table->child) { +/* + if (ctl_perm(table, 001)) + return -EPERM; +*/ + if (table->strategy) { + error = table->strategy( + table, name, nlen, + oldval, oldlenp, + newval, newlen, context); + if (error) + return error; + } + name++; + nlen--; + table = table->child; + goto repeat; + } + error = do_sysctl_strategy(table, name, nlen, + oldval, oldlenp, + newval, newlen, context); + return error; + } + } + return -ENOTDIR; +} + +int do_sysctl(int *name, int nlen, void *oldval, size_t *oldlenp, + void *newval, size_t newlen) +{ + struct list_head *tmp; + + if (nlen <= 0 || nlen >= CTL_MAXNAME) + return -ENOTDIR; + if (oldval) { + int old_len; + if (!oldlenp || get_user(old_len, oldlenp)) + return -EFAULT; + } + tmp = &root_table_header.ctl_entry; + do { + struct ctl_table_header *head = + list_entry(tmp, struct ctl_table_header, ctl_entry); + void *context = NULL; + int error = parse_table(name, nlen, oldval, oldlenp, + newval, newlen, head->ctl_table, + &context); + if (context) + cfs_free(context); + if (error != -ENOTDIR) + return error; + tmp = tmp->next; + } while (tmp != &root_table_header.ctl_entry); + return -ENOTDIR; +} + +/** + * register_sysctl_table - register a sysctl heirarchy + * @table: the top-level table structure + * @insert_at_head: whether the entry should be inserted in front or at the end + * + * Register a sysctl table heirarchy. @table should be a filled in ctl_table + * array. An entry with a ctl_name of 0 terminates the table. + * + * The members of the &ctl_table structure are used as follows: + * + * ctl_name - This is the numeric sysctl value used by sysctl(2). The number + * must be unique within that level of sysctl + * + * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not + * enter a sysctl file + * + * data - a pointer to data for use by proc_handler + * + * maxlen - the maximum size in bytes of the data + * + * mode - the file permissions for the /proc/sys file, and for sysctl(2) + * + * child - a pointer to the child sysctl table if this entry is a directory, or + * %NULL. 
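+ *
+ * For instance, a minimal single-entry table (hypothetical names,
+ * shaped like the lnet_table defined later in this file) would be
+ *
+ *     static cfs_sysctl_table_t my_table[] = {
+ *         {1, "my_value", &my_int, sizeof(int), 0644, NULL,
+ *          &proc_dointvec},
+ *         {0}
+ *     };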
+ * + * proc_handler - the text handler routine (described below) + * + * strategy - the strategy routine (described below) + * + * de - for internal use by the sysctl routines + * + * extra1, extra2 - extra pointers usable by the proc handler routines + * + * Leaf nodes in the sysctl tree will be represented by a single file + * under /proc; non-leaf nodes will be represented by directories. + * + * sysctl(2) can automatically manage read and write requests through + * the sysctl table. The data and maxlen fields of the ctl_table + * struct enable minimal validation of the values being written to be + * performed, and the mode field allows minimal authentication. + * + * More sophisticated management can be enabled by the provision of a + * strategy routine with the table entry. This will be called before + * any automatic read or write of the data is performed. + * + * The strategy routine may return + * + * < 0 - Error occurred (error is passed to user process) + * + * 0 - OK - proceed with automatic read or write. + * + * > 0 - OK - read or write has been done by the strategy routine, so + * return immediately. + * + * There must be a proc_handler routine for any terminal nodes + * mirrored under /proc/sys (non-terminals are handled by a built-in + * directory handler). Several default handlers are available to + * cover common cases - + * + * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), + * proc_dointvec_minmax(), proc_doulongvec_ms_jiffies_minmax(), + * proc_doulongvec_minmax() + * + * It is the handler's job to read the input buffer from user memory + * and process it. The handler should return 0 on success. + * + * This routine returns %NULL on a failure to register, and a pointer + * to the table header on success. + */ +struct ctl_table_header *register_sysctl_table(cfs_sysctl_table_t * table, + int insert_at_head) +{ + struct ctl_table_header *tmp; + tmp = cfs_alloc(sizeof(struct ctl_table_header), 0); + if (!tmp) + return NULL; + tmp->ctl_table = table; + + INIT_LIST_HEAD(&tmp->ctl_entry); + if (insert_at_head) + list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); + else + list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); +#ifdef CONFIG_PROC_FS + register_proc_table(table, proc_sys_root); +#endif + return tmp; +} + +/** + * unregister_sysctl_table - unregister a sysctl table heirarchy + * @header: the header returned from register_sysctl_table + * + * Unregisters the sysctl table and all children. proc entries may not + * actually be removed until they are no longer used by anyone. 
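+ *
+ * Registration and unregistration are expected to pair up, e.g.
+ * (continuing the hypothetical table above):
+ *
+ *     struct ctl_table_header *h = register_sysctl_table(my_table, 0);
+ *     ...
+ *     if (h != NULL)
+ *         unregister_sysctl_table(h);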
+ */ +void unregister_sysctl_table(struct ctl_table_header * header) +{ + list_del(&header->ctl_entry); +#ifdef CONFIG_PROC_FS + unregister_proc_table(header->ctl_table, proc_sys_root); +#endif + cfs_free(header); +} + + +int cfs_psdev_register(cfs_psdev_t * psdev) +{ + cfs_proc_entry_t * entry; + + entry = create_proc_entry ( + (char *)psdev->name, + S_IFREG, + proc_dev_root + ); + + if (!entry) { + return -ENOMEM; + } + + entry->flags |= CFS_PROC_FLAG_MISCDEV; + + entry->proc_fops = psdev->fops; + entry->data = (void *)psdev; + + return 0; +} + +int cfs_psdev_deregister(cfs_psdev_t * psdev) +{ + cfs_proc_entry_t * entry; + + entry = search_proc_entry ( + (char *)psdev->name, + proc_dev_root + ); + + if (entry) { + + ASSERT(entry->data == (void *)psdev); + ASSERT(entry->flags & CFS_PROC_FLAG_MISCDEV); + + remove_proc_entry( + (char *)psdev->name, + proc_dev_root + ); + } + + return 0; +} + +extern char debug_file_path[1024]; + +#define PSDEV_LNET (0x100) +enum { + PSDEV_DEBUG = 1, /* control debugging */ + PSDEV_SUBSYSTEM_DEBUG, /* control debugging */ + PSDEV_PRINTK, /* force all messages to console */ + PSDEV_CONSOLE_RATELIMIT, /* rate limit console messages */ + PSDEV_DEBUG_PATH, /* crashdump log location */ + PSDEV_DEBUG_DUMP_PATH, /* crashdump tracelog location */ + PSDEV_LIBCFS_MEMUSED, /* bytes currently PORTAL_ALLOCated */ +}; + +static struct ctl_table lnet_table[] = { + {PSDEV_DEBUG, "debug", &libcfs_debug, sizeof(int), 0644, NULL, + &proc_dointvec}, + {PSDEV_SUBSYSTEM_DEBUG, "subsystem_debug", &libcfs_subsystem_debug, + sizeof(int), 0644, NULL, &proc_dointvec}, + {PSDEV_PRINTK, "printk", &libcfs_printk, sizeof(int), 0644, NULL, + &proc_dointvec}, + {PSDEV_CONSOLE_RATELIMIT, "console_ratelimit", &libcfs_console_ratelimit, + sizeof(int), 0644, NULL, &proc_dointvec}, + {PSDEV_DEBUG_PATH, "debug_path", debug_file_path, + sizeof(debug_file_path), 0644, NULL, &proc_dostring, &sysctl_string}, +/* + {PSDEV_PORTALS_UPCALL, "upcall", portals_upcall, + sizeof(portals_upcall), 0644, NULL, &proc_dostring, + &sysctl_string}, +*/ + {PSDEV_LIBCFS_MEMUSED, "memused", (int *)&libcfs_kmemory.counter, + sizeof(int), 0644, NULL, &proc_dointvec}, + {0} +}; + +static struct ctl_table top_table[2] = { + {PSDEV_LNET, "lnet", NULL, 0, 0555, lnet_table}, + {0} +}; + +int insert_proc(void) +{ + cfs_proc_entry_t *ent; + + ent = create_proc_entry("sys/lnet/dump_kernel", 0, NULL); + if (ent == NULL) { + CERROR(("couldn't register dump_kernel\n")); + return -1; + } + ent->write_proc = trace_dk; + + ent = create_proc_entry("sys/lnet/daemon_file", 0, NULL); + if (ent == NULL) { + CERROR(("couldn't register daemon_file\n")); + return -1; + } + ent->write_proc = trace_write_daemon_file; + ent->read_proc = trace_read_daemon_file; + + ent = create_proc_entry("sys/lnet/debug_mb", 0, NULL); + if (ent == NULL) { + CERROR(("couldn't register debug_mb\n")); + return -1; + } + ent->write_proc = trace_write_debug_mb; + ent->read_proc = trace_read_debug_mb; + + return 0; +} + +void remove_proc(void) +{ + remove_proc_entry("sys/portals/dump_kernel", NULL); + remove_proc_entry("sys/portals/daemon_file", NULL); + remove_proc_entry("sys/portals/debug_mb", NULL); + +#ifdef CONFIG_SYSCTL + if (portals_table_header) + unregister_sysctl_table(portals_table_header); + portals_table_header = NULL; +#endif +} + + +/* + * proc process routines of kernel space + */ + +cfs_file_t * +lustre_open_file(char * filename) +{ + int rc = 0; + cfs_file_t * fh = NULL; + cfs_proc_entry_t * fp = NULL; + + fp = search_proc_entry(filename, 
proc_fs_root); + + if (!fp) { + rc = -ENOENT; + return NULL; + } + + fh = cfs_alloc(sizeof(cfs_file_t), CFS_ALLOC_ZERO); + + if (!fh) { + rc = -ENOMEM; + return NULL; + } + + fh->private_data = (void *)fp; + fh->f_op = fp->proc_fops; + + if (fh->f_op->open) { + rc = (fh->f_op->open)(fh); + } else { + fp->nlink++; + } + + if (0 != rc) { + cfs_free(fh); + return NULL; + } + + return fh; +} + +int +lustre_close_file(cfs_file_t * fh) +{ + int rc = 0; + cfs_proc_entry_t * fp = NULL; + + fp = (cfs_proc_entry_t *) fh->private_data; + + if (fh->f_op->release) { + rc = (fh->f_op->release)(fh); + } else { + fp->nlink--; + } + + cfs_free(fh); + + return rc; +} + +int +lustre_do_ioctl( cfs_file_t * fh, + unsigned long cmd, + ulong_ptr arg ) +{ + int rc = 0; + + if (fh->f_op->ioctl) { + rc = (fh->f_op->ioctl)(fh, cmd, arg); + } + + if (rc != 0) { + printk("lustre_do_ioctl: fialed: cmd = %xh arg = %xh rc = %d\n", + cmd, arg, rc); + } + + return rc; +} + +int +lustre_ioctl_file(cfs_file_t * fh, PCFS_PROC_IOCTL devctl) +{ + int rc = 0; + ulong_ptr data; + + data = (ulong_ptr)devctl + sizeof(CFS_PROC_IOCTL); + + /* obd ioctl code */ + if (_IOC_TYPE(devctl->cmd) == 'f') { +#if 0 + struct obd_ioctl_data * obd = (struct obd_ioctl_data *) data; + + if ( devctl->cmd != (ULONG)OBD_IOC_BRW_WRITE && + devctl->cmd != (ULONG)OBD_IOC_BRW_READ ) { + + unsigned long off = obd->ioc_len; + + if (obd->ioc_pbuf1) { + obd->ioc_pbuf1 = (char *)(data + off); + off += size_round(obd->ioc_plen1); + } + + if (obd->ioc_pbuf2) { + obd->ioc_pbuf2 = (char *)(data + off); + } + } + #endif + } + + rc = lustre_do_ioctl(fh, devctl->cmd, data); + + return rc; +} + + +size_t +lustre_read_file( + cfs_file_t * fh, + loff_t off, + size_t size, + char * buf + ) +{ + size_t rc = 0; + + if (fh->f_op->read) { + rc = (fh->f_op->read) (fh, buf, size, &off); + } + + return rc; +} + + +size_t +lustre_write_file( + cfs_file_t * fh, + loff_t off, + size_t size, + char * buf + ) +{ + size_t rc = 0; + + if (fh->f_op->write) { + rc = (fh->f_op->write)(fh, buf, size, &off); + } + + return rc; +} + +#else /* !__KERNEL__ */ + +#include +#include +#include + +/* + * proc process routines of user space + */ + +HANDLE cfs_proc_open (char * filename, int oflag) +{ + NTSTATUS status; + IO_STATUS_BLOCK iosb; + int rc; + + HANDLE FileHandle = INVALID_HANDLE_VALUE; + OBJECT_ATTRIBUTES ObjectAttributes; + ACCESS_MASK DesiredAccess; + ULONG CreateDisposition; + ULONG ShareAccess; + ULONG CreateOptions; + UNICODE_STRING UnicodeName; + USHORT NameLength; + + PFILE_FULL_EA_INFORMATION Ea = NULL; + ULONG EaLength; + UCHAR EaBuffer[EA_MAX_LENGTH]; + + /* Check the filename: should start with "/proc" or "/dev" */ + NameLength = (USHORT)strlen(filename); + if (NameLength > 0x05) { + if (_strnicmp(filename, "/proc/", 6) == 0) { + filename += 6; + NameLength -=6; + if (NameLength <= 0) { + rc = -EINVAL; + goto errorout; + } + } else if (_strnicmp(filename, "/dev/", 5) == 0) { + } else { + rc = -EINVAL; + goto errorout; + } + } else { + rc = -EINVAL; + goto errorout; + } + + /* Analyze the flags settings */ + + if (cfs_is_flag_set(oflag, O_WRONLY)) { + DesiredAccess = (GENERIC_WRITE | SYNCHRONIZE); + ShareAccess = 0; + } else if (cfs_is_flag_set(oflag, O_RDWR)) { + DesiredAccess = (GENERIC_READ | GENERIC_WRITE | SYNCHRONIZE); + ShareAccess = FILE_SHARE_READ | FILE_SHARE_WRITE; + } else { + DesiredAccess = (GENERIC_READ | SYNCHRONIZE); + ShareAccess = FILE_SHARE_READ; + } + + if (cfs_is_flag_set(oflag, O_CREAT)) { + if (cfs_is_flag_set(oflag, O_EXCL)) { + CreateDisposition = 
FILE_CREATE; + rc = -EINVAL; + goto errorout; + } else { + CreateDisposition = FILE_OPEN_IF; + } + } else { + CreateDisposition = FILE_OPEN; + } + + if (cfs_is_flag_set(oflag, O_TRUNC)) { + if (cfs_is_flag_set(oflag, O_EXCL)) { + CreateDisposition = FILE_OVERWRITE; + } else { + CreateDisposition = FILE_OVERWRITE_IF; + } + } + + CreateOptions = 0; + + if (cfs_is_flag_set(oflag, O_DIRECTORY)) { + cfs_set_flag(CreateOptions, FILE_DIRECTORY_FILE); + } + + if (cfs_is_flag_set(oflag, O_SYNC)) { + cfs_set_flag(CreateOptions, FILE_WRITE_THROUGH); + } + + if (cfs_is_flag_set(oflag, O_DIRECT)) { + cfs_set_flag(CreateOptions, FILE_NO_INTERMEDIATE_BUFFERING); + } + + /* Initialize the unicode path name for the specified file */ + RtlInitUnicodeString(&UnicodeName, LUSTRE_PROC_SYMLNK); + + /* Setup the object attributes structure for the file. */ + InitializeObjectAttributes( + &ObjectAttributes, + &UnicodeName, + OBJ_CASE_INSENSITIVE, + NULL, + NULL ); + + /* building EA for the proc entry ... */ + Ea = (PFILE_FULL_EA_INFORMATION)EaBuffer; + Ea->NextEntryOffset = 0; + Ea->Flags = 0; + Ea->EaNameLength = (UCHAR)NameLength; + Ea->EaValueLength = 0; + RtlCopyMemory( + &(Ea->EaName), + filename, + NameLength + 1 + ); + EaLength = sizeof(FILE_FULL_EA_INFORMATION) - 1 + + Ea->EaNameLength + 1; + + /* Now to open or create the file now */ + status = ZwCreateFile( + &FileHandle, + DesiredAccess, + &ObjectAttributes, + &iosb, + 0, + FILE_ATTRIBUTE_NORMAL, + ShareAccess, + CreateDisposition, + CreateOptions, + Ea, + EaLength ); + + /* Check the returned status of Iosb ... */ + + if (!NT_SUCCESS(status)) { + rc = cfs_error_code(status); + goto errorout; + } + +errorout: + + return FileHandle; +} + +int cfs_proc_close(HANDLE handle) +{ + if (handle) { + NtClose((HANDLE)handle); + } + + return 0; +} + +int cfs_proc_read(HANDLE handle, void *buffer, unsigned int count) +{ + NTSTATUS status; + IO_STATUS_BLOCK iosb; + LARGE_INTEGER offset; + + + offset.QuadPart = 0; + + /* read file data */ + status = NtReadFile( + (HANDLE)handle, + 0, + NULL, + NULL, + &iosb, + buffer, + count, + &offset, + NULL); + + /* check the return status */ + if (!NT_SUCCESS(status)) { + printf("NtReadFile request failed 0x%0x\n", status); + goto errorout; + } + +errorout: + + if (NT_SUCCESS(status)) { + return iosb.Information; + } + + return cfs_error_code(status); +} + + +int cfs_proc_write(HANDLE handle, void *buffer, unsigned int count) +{ + NTSTATUS status; + IO_STATUS_BLOCK iosb; + LARGE_INTEGER offset; + + offset.QuadPart = -1; + + /* write buffer to the opened file */ + status = NtWriteFile( + (HANDLE)handle, + 0, + NULL, + NULL, + &iosb, + buffer, + count, + &offset, + NULL); + + /* check the return status */ + if (!NT_SUCCESS(status)) { + printf("NtWriteFile request failed 0x%0x\n", status); + goto errorout; + } + +errorout: + + if (NT_SUCCESS(status)) { + return iosb.Information; + } + + return cfs_error_code(status); +} + +int cfs_proc_ioctl(HANDLE handle, int cmd, void *buffer) +{ + PUCHAR procdat = NULL; + CFS_PROC_IOCTL procctl; + ULONG length = 0; + ULONG extra = 0; + + NTSTATUS status; + IO_STATUS_BLOCK iosb; + + procctl.cmd = cmd; + + if(_IOC_TYPE(cmd) == IOC_LIBCFS_TYPE) { + struct libcfs_ioctl_data * portal; + portal = (struct libcfs_ioctl_data *) buffer; + length = portal->ioc_len; + } else if (_IOC_TYPE(cmd) == 'f') { + struct obd_ioctl_data * obd; + obd = (struct obd_ioctl_data *) buffer; + length = obd->ioc_len; + extra = size_round(obd->ioc_plen1) + size_round(obd->ioc_plen2); + } else if(_IOC_TYPE(cmd) == 'u') { + 
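+        /* 'u' type ioctls are assumed here to carry a single 32-bit
+         * value, hence the fixed 4-byte payload and no extra data */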
length = 4; + extra = 0; + } else { + printf("user:winnt-proc:cfs_proc_ioctl: un-supported ioctl type ...\n"); + cfs_enter_debugger(); + status = STATUS_INVALID_PARAMETER; + goto errorout; + } + + procctl.len = length + extra; + procdat = malloc(length + extra + sizeof(CFS_PROC_IOCTL)); + + if (NULL == procdat) { + printf("user:winnt-proc:cfs_proc_ioctl: no enough memory ...\n"); + status = STATUS_INSUFFICIENT_RESOURCES; + cfs_enter_debugger(); + goto errorout; + } + memset(procdat, 0, length + extra + sizeof(CFS_PROC_IOCTL)); + memcpy(procdat, &procctl, sizeof(CFS_PROC_IOCTL)); + memcpy(&procdat[sizeof(CFS_PROC_IOCTL)], buffer, length); + length += sizeof(CFS_PROC_IOCTL); + + if (_IOC_TYPE(cmd) == 'f') { + + char *ptr; + struct obd_ioctl_data * data; + struct obd_ioctl_data * obd; + + data = (struct obd_ioctl_data *) buffer; + obd = (struct obd_ioctl_data *) (procdat + sizeof(CFS_PROC_IOCTL)); + ptr = obd->ioc_bulk; + + if (data->ioc_inlbuf1) { + obd->ioc_inlbuf1 = ptr; + LOGL(data->ioc_inlbuf1, data->ioc_inllen1, ptr); + } + + if (data->ioc_inlbuf2) { + obd->ioc_inlbuf2 = ptr; + LOGL(data->ioc_inlbuf2, data->ioc_inllen2, ptr); + } + if (data->ioc_inlbuf3) { + obd->ioc_inlbuf3 = ptr; + LOGL(data->ioc_inlbuf3, data->ioc_inllen3, ptr); + } + if (data->ioc_inlbuf4) { + obd->ioc_inlbuf4 = ptr; + LOGL(data->ioc_inlbuf4, data->ioc_inllen4, ptr); + } + + if ( cmd != (ULONG)OBD_IOC_BRW_WRITE && + cmd != (ULONG)OBD_IOC_BRW_READ ) { + + if (data->ioc_pbuf1 && data->ioc_plen1) { + obd->ioc_pbuf1 = &procdat[length]; + memcpy(obd->ioc_pbuf1, data->ioc_pbuf1, data->ioc_plen1); + length += size_round(data->ioc_plen1); + } + + if (data->ioc_pbuf2 && data->ioc_plen2) { + obd->ioc_pbuf2 = &procdat[length]; + memcpy(obd->ioc_pbuf2, data->ioc_pbuf2, data->ioc_plen2); + length += size_round(data->ioc_plen2); + } + } + + if (obd_ioctl_is_invalid(obd)) { + cfs_enter_debugger(); + } + } + + status = NtDeviceIoControlFile( + (HANDLE)handle, + NULL, NULL, NULL, &iosb, + IOCTL_LIBCFS_ENTRY, + procdat, length, + procdat, length ); + + + if (NT_SUCCESS(status)) { + memcpy(buffer, &procdat[sizeof(CFS_PROC_IOCTL)], procctl.len); + } + +errorout: + + if (procdat) { + free(procdat); + } + + return cfs_error_code(status); +} + +#endif /* __KERNEL__ */ diff --git a/lnet/libcfs/winnt/winnt-sync.c b/lnet/libcfs/winnt/winnt-sync.c new file mode 100644 index 0000000..5094bef --- /dev/null +++ b/lnet/libcfs/winnt/winnt-sync.c @@ -0,0 +1,449 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: + * + * Copyright (c) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or modify it under + * the terms of version 2 of the GNU General Public License as published by + * the Free Software Foundation. Lustre is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. You should have received a + * copy of the GNU General Public License along with Lustre; if not, write + * to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, + * USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_LIBCFS + +#include +#include + + +/* + * Wait queue routines + */ + +/* + * cfs_waitq_init + * To initialize the wait queue + * + * Arguments: + * waitq: pointer to the cfs_waitq_t structure + * + * Return Value: + * N/A + * + * Notes: + * N/A + */ + +void cfs_waitq_init(cfs_waitq_t *waitq) +{ + waitq->magic = CFS_WAITQ_MAGIC; + waitq->flags = 0; + INIT_LIST_HEAD(&(waitq->waiters)); + spin_lock_init(&(waitq->guard)); +} + +/* + * cfs_waitlink_init + * To initialize the wake link node + * + * Arguments: + * link: pointer to the cfs_waitlink_t structure + * + * Return Value: + * N/A + * + * Notes: + * N/A + */ + +void cfs_waitlink_init(cfs_waitlink_t *link) +{ + cfs_task_t * task = cfs_current(); + PTASK_SLOT slot = NULL; + + if (!task) { + /* should bugchk here */ + cfs_enter_debugger(); + return; + } + + slot = CONTAINING_RECORD(task, TASK_SLOT, task); + cfs_assert(slot->Magic == TASKSLT_MAGIC); + + memset(link, 0, sizeof(cfs_waitlink_t)); + + link->magic = CFS_WAITLINK_MAGIC; + link->flags = 0; + + link->event = &(slot->Event); + link->hits = &(slot->hits); + + atomic_inc(&slot->count); + + INIT_LIST_HEAD(&(link->waitq[0].link)); + INIT_LIST_HEAD(&(link->waitq[1].link)); + + link->waitq[0].waitl = link->waitq[1].waitl = link; +} + + +/* + * cfs_waitlink_fini + * To finilize the wake link node + * + * Arguments: + * link: pointer to the cfs_waitlink_t structure + * + * Return Value: + * N/A + * + * Notes: + * N/A + */ + +void cfs_waitlink_fini(cfs_waitlink_t *link) +{ + cfs_task_t * task = cfs_current(); + PTASK_SLOT slot = NULL; + + if (!task) { + /* should bugchk here */ + cfs_enter_debugger(); + return; + } + + slot = CONTAINING_RECORD(task, TASK_SLOT, task); + cfs_assert(slot->Magic == TASKSLT_MAGIC); + cfs_assert(link->magic == CFS_WAITLINK_MAGIC); + cfs_assert(link->waitq[0].waitq == NULL); + cfs_assert(link->waitq[1].waitq == NULL); + + atomic_dec(&slot->count); +} + + +/* + * cfs_waitq_add_internal + * To queue the wait link node to the wait queue + * + * Arguments: + * waitq: pointer to the cfs_waitq_t structure + * link: pointer to the cfs_waitlink_t structure + * int: queue no (Normal or Forward waitq) + * + * Return Value: + * N/A + * + * Notes: + * N/A + */ + +void cfs_waitq_add_internal(cfs_waitq_t *waitq, + cfs_waitlink_t *link, + __u32 waitqid ) +{ + LASSERT(waitq != NULL); + LASSERT(link != NULL); + LASSERT(waitq->magic == CFS_WAITQ_MAGIC); + LASSERT(link->magic == CFS_WAITLINK_MAGIC); + LASSERT(waitqid < CFS_WAITQ_CHANNELS); + + spin_lock(&(waitq->guard)); + LASSERT(link->waitq[waitqid].waitq == NULL); + link->waitq[waitqid].waitq = waitq; + if (link->flags & CFS_WAITQ_EXCLUSIVE) { + list_add_tail(&link->waitq[waitqid].link, &waitq->waiters); + } else { + list_add(&link->waitq[waitqid].link, &waitq->waiters); + } + spin_unlock(&(waitq->guard)); +} +/* + * cfs_waitq_add + * To queue the wait link node to the wait queue + * + * Arguments: + * waitq: pointer to the cfs_waitq_t structure + * link: pointer to the cfs_waitlink_t structure + * + * Return Value: + * N/A + * + * Notes: + * N/A + */ + +void cfs_waitq_add(cfs_waitq_t *waitq, + cfs_waitlink_t *link) +{ + cfs_waitq_add_internal(waitq, link, CFS_WAITQ_CHAN_NORMAL); +} + +/* + * cfs_waitq_add_exclusive + * To set the wait link node to exclusive mode + * and queue it to the wait queue + * + * Arguments: + * waitq: pointer to the cfs_waitq_t structure + * link: pointer to the cfs_wait_link structure + * + * Return Value: + * N/A + * + * Notes: + * N/A + */ + +void cfs_waitq_add_exclusive( 
+
+/*
+ * cfs_waitq_add_exclusive
+ *   To set the wait link node to exclusive mode
+ *   and queue it to the wait queue
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *   link:   pointer to the cfs_waitlink_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void cfs_waitq_add_exclusive( cfs_waitq_t *waitq,
+                              cfs_waitlink_t *link)
+{
+    LASSERT(waitq != NULL);
+    LASSERT(link != NULL);
+    LASSERT(waitq->magic == CFS_WAITQ_MAGIC);
+    LASSERT(link->magic == CFS_WAITLINK_MAGIC);
+
+    link->flags |= CFS_WAITQ_EXCLUSIVE;
+    cfs_waitq_add(waitq, link);
+}
+
+/*
+ * cfs_waitq_forward
+ *   To be determined.
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *   link:   pointer to the cfs_waitlink_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void cfs_waitq_forward( cfs_waitlink_t *link,
+                        cfs_waitq_t *waitq)
+{
+    cfs_waitq_add_internal(waitq, link, CFS_WAITQ_CHAN_FORWARD);
+}
+
+/*
+ * cfs_waitq_del
+ *   To remove the wait link node from the waitq
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *   link:   pointer to the cfs_waitlink_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void cfs_waitq_del( cfs_waitq_t *waitq,
+                    cfs_waitlink_t *link)
+{
+    int i = 0;
+
+    LASSERT(waitq != NULL);
+    LASSERT(link != NULL);
+
+    LASSERT(waitq->magic == CFS_WAITQ_MAGIC);
+    LASSERT(link->magic == CFS_WAITLINK_MAGIC);
+
+    spin_lock(&(waitq->guard));
+
+    for (i=0; i < CFS_WAITQ_CHANNELS; i++) {
+        if (link->waitq[i].waitq == waitq)
+            break;
+    }
+
+    if (i < CFS_WAITQ_CHANNELS) {
+        link->waitq[i].waitq = NULL;
+        list_del_init(&link->waitq[i].link);
+    } else {
+        cfs_enter_debugger();
+    }
+
+    spin_unlock(&(waitq->guard));
+}
+
+/*
+ * cfs_waitq_active
+ *   Is the waitq active (not empty) ?
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *
+ * Return Value:
+ *   Zero:     the waitq is empty
+ *   Non-Zero: the waitq is active
+ *
+ * Notes:
+ *   We always return TRUE here, as the Darwin port does.
+ */
+
+int cfs_waitq_active(cfs_waitq_t *waitq)
+{
+    LASSERT(waitq != NULL);
+    LASSERT(waitq->magic == CFS_WAITQ_MAGIC);
+
+    return (1);
+}
+
+/*
+ * cfs_waitq_signal_nr
+ *   To wake up all the non-exclusive tasks plus nr exclusive
+ *   ones in the waitq
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *   nr:     number of exclusive tasks to be woken up
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+
+void cfs_waitq_signal_nr(cfs_waitq_t *waitq, int nr)
+{
+    int     result;
+    cfs_waitlink_channel_t * scan;
+
+    LASSERT(waitq != NULL);
+    LASSERT(waitq->magic == CFS_WAITQ_MAGIC);
+
+    spin_lock(&waitq->guard);
+
+    list_for_each_entry(scan, &waitq->waiters, cfs_waitlink_channel_t, link) {
+
+        cfs_waitlink_t *waitl = scan->waitl;
+
+        result = cfs_wake_event(waitl->event);
+        LASSERT( result == FALSE || result == TRUE );
+
+        if (result) {
+            atomic_inc(waitl->hits);
+        }
+
+        if ((waitl->flags & CFS_WAITQ_EXCLUSIVE) && --nr == 0)
+            break;
+    }
+
+    spin_unlock(&waitq->guard);
+    return;
+}
+
+/*
+ * cfs_waitq_signal
+ *   To wake up all the non-exclusive tasks and 1 exclusive
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void cfs_waitq_signal(cfs_waitq_t *waitq)
+{
+    cfs_waitq_signal_nr(waitq, 1);
+}
+
+
+/*
+ * cfs_waitq_broadcast
+ *   To wake up all the tasks in the waitq
+ *
+ * Arguments:
+ *   waitq:  pointer to the cfs_waitq_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void cfs_waitq_broadcast(cfs_waitq_t *waitq)
+{
+    LASSERT(waitq != NULL);
+    LASSERT(waitq->magic == CFS_WAITQ_MAGIC);
+
+    cfs_waitq_signal_nr(waitq, 0);
+}
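+
+/*
+ * Waker-side sketch (illustrative only).  Non-exclusive waiters sit at
+ * the head of the waiter list and exclusive ones at the tail, so a
+ * signal wakes every non-exclusive waiter plus at most nr exclusive
+ * ones; nr == 0 (cfs_waitq_broadcast) wakes everybody:
+ *
+ *    cfs_waitq_signal(&waitq);         at most 1 exclusive waiter
+ *    cfs_waitq_signal_nr(&waitq, 2);   at most 2 exclusive waiters
+ *    cfs_waitq_broadcast(&waitq);      every waiter
+ */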
+
+/*
+ * cfs_waitq_wait
+ *   To wait on the link node until it is signaled
+ *
+ * Arguments:
+ *   link:  pointer to the cfs_waitlink_t structure
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void cfs_waitq_wait(cfs_waitlink_t *link, cfs_task_state_t state)
+{
+    LASSERT(link != NULL);
+    LASSERT(link->magic == CFS_WAITLINK_MAGIC);
+
+    if (atomic_read(link->hits) > 0) {
+        atomic_dec(link->hits);
+        LASSERT((__u32)atomic_read(link->hits) < (__u32)0xFFFFFF00);
+    } else {
+        cfs_wait_event(link->event, 0);
+    }
+}
+
+/*
+ * cfs_waitq_timedwait
+ *   To wait on the link node until it is signaled or the timeout
+ *   limit expires
+ *
+ * Arguments:
+ *   link:     pointer to the cfs_waitlink_t structure
+ *   timeout:  the timeout limitation
+ *
+ * Return Value:
+ *   Woken up: return the difference of the current time and
+ *             the timeout
+ *   Timeout:  return 0
+ *
+ * Notes:
+ *   A wakeup that arrives exactly at the timeout expiry cannot be
+ *   distinguished from a timeout.
+ */
+
+cfs_duration_t cfs_waitq_timedwait( cfs_waitlink_t *link,
+                                    cfs_task_state_t state,
+                                    cfs_duration_t timeout)
+{
+
+    if (atomic_read(link->hits) > 0) {
+        atomic_dec(link->hits);
+        LASSERT((__u32)atomic_read(link->hits) < (__u32)0xFFFFFF00);
+        return TRUE;
+    }
+
+    return (cfs_duration_t)cfs_wait_event(link->event, timeout);
+}
+
+
diff --git a/lnet/libcfs/winnt/winnt-tcpip.c b/lnet/libcfs/winnt/winnt-tcpip.c
new file mode 100644
index 0000000..d0c725c
--- /dev/null
+++ b/lnet/libcfs/winnt/winnt-tcpip.c
@@ -0,0 +1,6706 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LIBCFS
+
+#include <libcfs/libcfs.h>
+#include <libcfs/kp30.h>
+#include <lnet/lnet.h>
+
+#define TDILND_MODULE_NAME L"Tdilnd"
+
+ks_data_t ks_data;
+
+ULONG
+ks_tdi_send_flags(ULONG SockFlags)
+{
+    ULONG TdiFlags = 0;
+
+    if (cfs_is_flag_set(SockFlags, MSG_OOB)) {
+        cfs_set_flag(TdiFlags, TDI_SEND_EXPEDITED);
+    }
+
+    if (cfs_is_flag_set(SockFlags, MSG_MORE)) {
+        cfs_set_flag(TdiFlags, TDI_SEND_PARTIAL);
+    }
+
+    if (cfs_is_flag_set(SockFlags, MSG_DONTWAIT)) {
+        cfs_set_flag(TdiFlags, TDI_SEND_NON_BLOCKING);
+    }
+
+    return TdiFlags;
+}
+
+NTSTATUS
+KsIrpCompletionRoutine(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    )
+{
+    UNREFERENCED_PARAMETER(DeviceObject);
+    UNREFERENCED_PARAMETER(Irp);
+
+    if (NULL != Context) {
+        KeSetEvent((PKEVENT)Context, IO_NETWORK_INCREMENT, FALSE);
+    }
+
+    return STATUS_MORE_PROCESSING_REQUIRED;
+}
+
+
+/*
+ * KsBuildTdiIrp
+ *   Allocate a new IRP and initialize it to be issued to tdi
+ *
+ * Arguments:
+ *   DeviceObject:  device object created by the underlying
+ *                  TDI transport driver
+ *
+ * Return Value:
+ *   PIRP: the allocated Irp on success or NULL on failure.
+ *
+ * NOTES:
+ *   N/A
+ */
+
+PIRP
+KsBuildTdiIrp(
+    IN PDEVICE_OBJECT    DeviceObject
+    )
+{
+    PIRP                Irp;
+    PIO_STACK_LOCATION  IrpSp;
+
+    //
+    // Allocating the IRP ...
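+    // (Typical pairing, sketched here for orientation: a caller
+    //  allocates with Irp = KsBuildTdiIrp(DeviceObject), fills the
+    //  Irp in with one of the TdiBuildXxx() macros, then hands it to
+    //  KsSubmitTdiIrp(), which for synchronous submission also waits
+    //  for completion and frees the Irp.  See KsAssociateAddress()
+    //  below for a complete instance.)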
+ // + + Irp = IoAllocateIrp(DeviceObject->StackSize, FALSE); + + if (NULL != Irp) { + + // + // Getting the Next Stack Location ... + // + + IrpSp = IoGetNextIrpStackLocation(Irp); + + // + // Initializing Irp ... + // + + IrpSp->MajorFunction = IRP_MJ_INTERNAL_DEVICE_CONTROL; + IrpSp->Parameters.DeviceIoControl.IoControlCode = 0; + } + + return Irp; +} + +/* + * KsSubmitTdiIrp + * Issue the Irp to the underlying tdi driver + * + * Arguments: + * DeviceObject: the device object created by TDI driver + * Irp: the I/O request packet to be processed + * bSynchronous: synchronous or not. If true, we need wait + * until the process is finished. + * Information: returned info + * + * Return Value: + * NTSTATUS: kernel status code + * + * NOTES: + * N/A + */ + +NTSTATUS +KsSubmitTdiIrp( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp, + IN BOOLEAN bSynchronous, + OUT PULONG Information + ) +{ + NTSTATUS Status; + KEVENT Event; + + if (bSynchronous) { + + KeInitializeEvent( + &Event, + SynchronizationEvent, + FALSE + ); + + + IoSetCompletionRoutine( + Irp, + KsIrpCompletionRoutine, + &Event, + TRUE, + TRUE, + TRUE + ); + } + + Status = IoCallDriver(DeviceObject, Irp); + + if (bSynchronous) { + + if (STATUS_PENDING == Status) { + + Status = KeWaitForSingleObject( + &Event, + Executive, + KernelMode, + FALSE, + NULL + ); + } + + Status = Irp->IoStatus.Status; + + if (Information) { + *Information = (ULONG)(Irp->IoStatus.Information); + } + + Irp->MdlAddress = NULL; + IoFreeIrp(Irp); + } + + if (!NT_SUCCESS(Status)) { + + KsPrint((2, "KsSubmitTdiIrp: Error when submitting the Irp: Status = %xh (%s) ...\n", + Status, KsNtStatusToString(Status))); + } + + return (Status); +} + + + +/* + * KsOpenControl + * Open the Control Channel Object ... + * + * Arguments: + * DeviceName: the device name to be opened + * Handle: opened handle in success case + * FileObject: the fileobject of the device + * + * Return Value: + * NTSTATUS: kernel status code (STATUS_SUCCESS + * or other error code) + * + * Notes: + * N/A + */ + +NTSTATUS +KsOpenControl( + IN PUNICODE_STRING DeviceName, + OUT HANDLE * Handle, + OUT PFILE_OBJECT * FileObject + ) +{ + NTSTATUS Status = STATUS_SUCCESS; + + OBJECT_ATTRIBUTES ObjectAttributes; + IO_STATUS_BLOCK IoStatus; + + + LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL ); + + // + // Initializing ... + // + + InitializeObjectAttributes( + &ObjectAttributes, + DeviceName, + OBJ_CASE_INSENSITIVE | + OBJ_KERNEL_HANDLE, + NULL, + NULL + ); + + LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL ); + + // + // Creating the Transport Address Object ... + // + + Status = ZwCreateFile( + Handle, + FILE_READ_DATA | FILE_WRITE_DATA, + &ObjectAttributes, + &IoStatus, + 0, + FILE_ATTRIBUTE_NORMAL, + FILE_SHARE_READ | FILE_SHARE_WRITE, + FILE_OPEN, + 0, + NULL, + 0 + ); + + + if (NT_SUCCESS(Status)) { + + // + // Now Obtaining the FileObject of the Transport Address ... 
+ // + + Status = ObReferenceObjectByHandle( + *Handle, + FILE_ANY_ACCESS, + NULL, + KernelMode, + FileObject, + NULL + ); + + if (!NT_SUCCESS(Status)) { + + cfs_enter_debugger(); + ZwClose(*Handle); + } + + } else { + + cfs_enter_debugger(); + } + + return (Status); +} + + +/* + * KsCloseControl + * Release the Control Channel Handle and FileObject + * + * Arguments: + * Handle: the channel handle to be released + * FileObject: the fileobject to be released + * + * Return Value: + * NTSTATUS: kernel status code (STATUS_SUCCESS + * or other error code) + * + * Notes: + * N/A + */ + +NTSTATUS +KsCloseControl( + IN HANDLE Handle, + IN PFILE_OBJECT FileObject + ) +{ + NTSTATUS Status = STATUS_SUCCESS; + + LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL ); + + if (FileObject) { + + ObDereferenceObject(FileObject); + } + + if (Handle) { + + Status = ZwClose(Handle); + } + + ASSERT(NT_SUCCESS(Status)); + + return (Status); +} + + +/* + * KsOpenAddress + * Open the tdi address object + * + * Arguments: + * DeviceName: device name of the address object + * pAddress: tdi address of the address object + * AddressLength: length in bytes of the tdi address + * Handle: the newly opened handle + * FileObject: the newly opened fileobject + * + * Return Value: + * NTSTATUS: kernel status code (STATUS_SUCCESS + * or other error code) + * + * Notes: + * N/A + */ + +NTSTATUS +KsOpenAddress( + IN PUNICODE_STRING DeviceName, + IN PTRANSPORT_ADDRESS pAddress, + IN ULONG AddressLength, + OUT HANDLE * Handle, + OUT PFILE_OBJECT * FileObject + ) +{ + NTSTATUS Status = STATUS_SUCCESS; + + PFILE_FULL_EA_INFORMATION Ea = NULL; + ULONG EaLength; + UCHAR EaBuffer[EA_MAX_LENGTH]; + + OBJECT_ATTRIBUTES ObjectAttributes; + IO_STATUS_BLOCK IoStatus; + + // + // Building EA for the Address Object to be Opened ... + // + + Ea = (PFILE_FULL_EA_INFORMATION)EaBuffer; + Ea->NextEntryOffset = 0; + Ea->Flags = 0; + Ea->EaNameLength = TDI_TRANSPORT_ADDRESS_LENGTH; + Ea->EaValueLength = (USHORT)AddressLength; + RtlCopyMemory( + &(Ea->EaName), + TdiTransportAddress, + Ea->EaNameLength + 1 + ); + RtlMoveMemory( + &(Ea->EaName[Ea->EaNameLength + 1]), + pAddress, + AddressLength + ); + EaLength = sizeof(FILE_FULL_EA_INFORMATION) + + Ea->EaNameLength + AddressLength; + + LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL ); + + + // + // Initializing ... + // + + InitializeObjectAttributes( + &ObjectAttributes, + DeviceName, + OBJ_CASE_INSENSITIVE | + OBJ_KERNEL_HANDLE, + NULL, + NULL + ); + + LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL ); + + // + // Creating the Transport Address Object ... + // + + Status = ZwCreateFile( + Handle, + FILE_READ_DATA | FILE_WRITE_DATA, + &ObjectAttributes, + &IoStatus, + 0, + FILE_ATTRIBUTE_NORMAL, + FILE_SHARE_READ | FILE_SHARE_WRITE, /* 0: DON'T REUSE */ + FILE_OPEN, + 0, + Ea, + EaLength + ); + + + if (NT_SUCCESS(Status)) { + + // + // Now Obtaining the FileObject of the Transport Address ... 
+ // + + Status = ObReferenceObjectByHandle( + *Handle, + FILE_ANY_ACCESS, + NULL, + KernelMode, + FileObject, + NULL + ); + + if (!NT_SUCCESS(Status)) { + + cfs_enter_debugger(); + ZwClose(*Handle); + } + + } else { + + cfs_enter_debugger(); + } + + return (Status); +} + +/* + * KsCloseAddress + * Release the Hanlde and FileObject of an opened tdi + * address object + * + * Arguments: + * Handle: the handle to be released + * FileObject: the fileobject to be released + * + * Return Value: + * NTSTATUS: kernel status code (STATUS_SUCCESS + * or other error code) + * + * Notes: + * N/A + */ + +NTSTATUS +KsCloseAddress( + IN HANDLE Handle, + IN PFILE_OBJECT FileObject +) +{ + NTSTATUS Status = STATUS_SUCCESS; + + if (FileObject) { + + ObDereferenceObject(FileObject); + } + + if (Handle) { + + Status = ZwClose(Handle); + } + + ASSERT(NT_SUCCESS(Status)); + + return (Status); +} + + +/* + * KsOpenConnection + * Open a tdi connection object + * + * Arguments: + * DeviceName: device name of the connection object + * ConnectionContext: the connection context + * Handle: the newly opened handle + * FileObject: the newly opened fileobject + * + * Return Value: + * NTSTATUS: kernel status code (STATUS_SUCCESS + * or other error code) + * + * Notes: + * N/A + */ + +NTSTATUS +KsOpenConnection( + IN PUNICODE_STRING DeviceName, + IN CONNECTION_CONTEXT ConnectionContext, + OUT HANDLE * Handle, + OUT PFILE_OBJECT * FileObject + ) +{ + NTSTATUS Status = STATUS_SUCCESS; + + PFILE_FULL_EA_INFORMATION Ea = NULL; + ULONG EaLength; + UCHAR EaBuffer[EA_MAX_LENGTH]; + + OBJECT_ATTRIBUTES ObjectAttributes; + IO_STATUS_BLOCK IoStatus; + + // + // Building EA for the Address Object to be Opened ... + // + + Ea = (PFILE_FULL_EA_INFORMATION)EaBuffer; + Ea->NextEntryOffset = 0; + Ea->Flags = 0; + Ea->EaNameLength = TDI_CONNECTION_CONTEXT_LENGTH; + Ea->EaValueLength = (USHORT)sizeof(CONNECTION_CONTEXT); + RtlCopyMemory( + &(Ea->EaName), + TdiConnectionContext, + Ea->EaNameLength + 1 + ); + RtlMoveMemory( + &(Ea->EaName[Ea->EaNameLength + 1]), + &ConnectionContext, + sizeof(CONNECTION_CONTEXT) + ); + EaLength = sizeof(FILE_FULL_EA_INFORMATION) - 1 + + Ea->EaNameLength + 1 + sizeof(CONNECTION_CONTEXT); + + LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL ); + + + // + // Initializing ... + // + + InitializeObjectAttributes( + &ObjectAttributes, + DeviceName, + OBJ_CASE_INSENSITIVE | + OBJ_KERNEL_HANDLE, + NULL, + NULL + ); + + LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL ); + + // + // Creating the Connection Object ... + // + + Status = ZwCreateFile( + Handle, + FILE_READ_DATA | FILE_WRITE_DATA, + &ObjectAttributes, + &IoStatus, + NULL, + FILE_ATTRIBUTE_NORMAL, + 0, + FILE_OPEN, + 0, + Ea, + EaLength + ); + + + if (NT_SUCCESS(Status)) { + + // + // Now Obtaining the FileObject of the Transport Address ... 
+        //
+
+        Status = ObReferenceObjectByHandle(
+                    *Handle,
+                    FILE_ANY_ACCESS,
+                    NULL,
+                    KernelMode,
+                    FileObject,
+                    NULL
+                    );
+
+        if (!NT_SUCCESS(Status)) {
+
+            cfs_enter_debugger();
+            ZwClose(*Handle);
+        }
+
+    } else {
+
+        cfs_enter_debugger();
+    }
+
+    return (Status);
+}
+
+/*
+ * KsCloseConnection
+ *   Release the Handle and FileObject of an opened tdi
+ *   connection object
+ *
+ * Arguments:
+ *   Handle:      the handle to be released
+ *   FileObject:  the fileobject to be released
+ *
+ * Return Value:
+ *   NTSTATUS:  kernel status code (STATUS_SUCCESS
+ *              or other error code)
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsCloseConnection(
+    IN HANDLE        Handle,
+    IN PFILE_OBJECT  FileObject
+    )
+{
+    NTSTATUS  Status = STATUS_SUCCESS;
+
+    if (FileObject) {
+
+        ObDereferenceObject(FileObject);
+    }
+
+    if (Handle) {
+
+        Status = ZwClose(Handle);
+    }
+
+    ASSERT(NT_SUCCESS(Status));
+
+    return (Status);
+}
+
+
+/*
+ * KsAssociateAddress
+ *   Associate an address object with a connection object
+ *
+ * Arguments:
+ *   AddressHandle:     the handle of the address object
+ *   ConnectionObject:  the FileObject of the connection
+ *
+ * Return Value:
+ *   NTSTATUS:  kernel status code (STATUS_SUCCESS
+ *              or other error code)
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsAssociateAddress(
+    IN HANDLE           AddressHandle,
+    IN PFILE_OBJECT     ConnectionObject
+    )
+{
+    NTSTATUS            Status;
+    PDEVICE_OBJECT      DeviceObject;
+    PIRP                Irp;
+
+    //
+    // Getting the DeviceObject from Connection FileObject
+    //
+
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    //
+    // Building Tdi Internal Irp ...
+    //
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+
+    } else {
+
+        //
+        // Associating the Address Object with the Connection Object
+        //
+
+        TdiBuildAssociateAddress(
+            Irp,
+            DeviceObject,
+            ConnectionObject,
+            NULL,
+            NULL,
+            AddressHandle
+            );
+
+        //
+        // Calling the Transport Driver with the Prepared Irp
+        //
+
+        Status = KsSubmitTdiIrp(DeviceObject, Irp, TRUE, NULL);
+    }
+
+    return (Status);
+}
+
+
+/*
+ * KsDisassociateAddress
+ *   Disassociate the connection object (the association with
+ *   the corresponding address object is dismissed)
+ *
+ * Arguments:
+ *   ConnectionObject:  the FileObject of the connection
+ *
+ * Return Value:
+ *   NTSTATUS:  kernel status code (STATUS_SUCCESS
+ *              or other error code)
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsDisassociateAddress(
+    IN PFILE_OBJECT     ConnectionObject
+    )
+{
+    NTSTATUS            Status;
+    PDEVICE_OBJECT      DeviceObject;
+    PIRP                Irp;
+
+    //
+    // Getting the DeviceObject from Connection FileObject
+    //
+
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    //
+    // Building Tdi Internal Irp ...
+ // + + Irp = KsBuildTdiIrp(DeviceObject); + + if (NULL == Irp) { + + Status = STATUS_INSUFFICIENT_RESOURCES; + + } else { + + // + // Disassocating the Address Object with the Connection Object + // + + TdiBuildDisassociateAddress( + Irp, + DeviceObject, + ConnectionObject, + NULL, + NULL + ); + + // + // Calling the Transprot Driver with the Prepared Irp + // + + Status = KsSubmitTdiIrp(DeviceObject, Irp, TRUE, NULL); + } + + return (Status); +} + + +/* + +// +// Connection Control Event Callbacks +// + +TDI_EVENT_CONNECT +TDI_EVENT_DISCONNECT +TDI_EVENT_ERROR + +// +// Tcp Event Callbacks +// + +TDI_EVENT_RECEIVE +TDI_EVENT_RECEIVE_EXPEDITED +TDI_EVENT_CHAINED_RECEIVE +TDI_EVENT_CHAINED_RECEIVE_EXPEDITED + +// +// Udp Event Callbacks +// + +TDI_EVENT_RECEIVE_DATAGRAM +TDI_EVENT_CHAINED_RECEIVE_DATAGRAM + +*/ + + +/* + * KsSetEventHandlers + * Set the tdi event callbacks with an address object + * + * Arguments: + * AddressObject: the FileObject of the address object + * EventContext: the parameter for the callbacks + * Handlers: the handlers indictor array + * + * Return Value: + * NTSTATUS: kernel status code (STATUS_SUCCESS + * or other error code) + * + * NOTES: + * N/A + */ + +NTSTATUS +KsSetEventHandlers( + IN PFILE_OBJECT AddressObject, // Address File Object + IN PVOID EventContext, // Context for Handlers + IN PKS_EVENT_HANDLERS Handlers // Handlers Indictor + ) +{ + NTSTATUS Status = STATUS_SUCCESS; + PDEVICE_OBJECT DeviceObject; + USHORT i = 0; + + DeviceObject = IoGetRelatedDeviceObject(AddressObject); + + for (i=0; i < TDI_EVENT_MAXIMUM_HANDLER; i++) { + + // + // Setup the tdi event callback handler if requested. + // + + if (Handlers->IsActive[i]) { + + PIRP Irp; + + // + // Building Tdi Internal Irp ... + // + + Irp = KsBuildTdiIrp(DeviceObject); + + if (NULL == Irp) { + + Status = STATUS_INSUFFICIENT_RESOURCES; + + } else { + + // + // Building the Irp to set the Event Handler ... 
+ // + + TdiBuildSetEventHandler( + Irp, + DeviceObject, + AddressObject, + NULL, + NULL, + i, /* tdi event type */ + Handlers->Handler[i], /* tdi event handler */ + EventContext /* context for the handler */ + ); + + // + // Calling the Transprot Driver with the Prepared Irp + // + + Status = KsSubmitTdiIrp(DeviceObject, Irp, TRUE, NULL); + + // + // tcp/ip tdi does not support these two event callbacks + // + + if ((!NT_SUCCESS(Status)) && ( i == TDI_EVENT_SEND_POSSIBLE || + i == TDI_EVENT_CHAINED_RECEIVE_EXPEDITED )) { + cfs_enter_debugger(); + Status = STATUS_SUCCESS; + } + } + + if (!NT_SUCCESS(Status)) { + cfs_enter_debugger(); + goto errorout; + } + } + } + + +errorout: + + if (!NT_SUCCESS(Status)) { + + KsPrint((2, "KsSetEventHandlers: Error Status = %xh (%s)\n", + Status, KsNtStatusToString(Status) )); + } + + return (Status); +} + + + +/* + * KsQueryAddressInfo + * Query the address of the FileObject specified + * + * Arguments: + * FileObject: the FileObject to be queried + * AddressInfo: buffer to contain the address info + * AddressSize: length of the AddressInfo buffer + * + * Return Value: + * NTSTATUS: kernel status code (STATUS_SUCCESS + * or other error code) + * + * Notes: + * N/A + */ + +NTSTATUS +KsQueryAddressInfo( + PFILE_OBJECT FileObject, + PTDI_ADDRESS_INFO AddressInfo, + PULONG AddressSize + ) +{ + NTSTATUS Status = STATUS_UNSUCCESSFUL; + PIRP Irp = NULL; + PMDL Mdl; + PDEVICE_OBJECT DeviceObject; + + LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL ); + + DeviceObject = IoGetRelatedDeviceObject(FileObject); + + RtlZeroMemory(AddressInfo, *(AddressSize)); + + // + // Allocating the Tdi Setting Irp ... + // + + Irp = KsBuildTdiIrp(DeviceObject); + + if (NULL == Irp) { + + Status = STATUS_INSUFFICIENT_RESOURCES; + + } else { + + // + // Locking the User Buffer / Allocating a MDL for it + // + + Status = KsLockUserBuffer( + AddressInfo, + FALSE, + *(AddressSize), + IoModifyAccess, + &Mdl + ); + + if (!NT_SUCCESS(Status)) { + + IoFreeIrp(Irp); + Irp = NULL; + } + } + + if (Irp) { + + LASSERT(NT_SUCCESS(Status)); + + TdiBuildQueryInformation( + Irp, + DeviceObject, + FileObject, + NULL, + NULL, + TDI_QUERY_ADDRESS_INFO, + Mdl + ); + + Status = KsSubmitTdiIrp( + DeviceObject, + Irp, + TRUE, + AddressSize + ); + + KsReleaseMdl(Mdl, FALSE); + } + + if (!NT_SUCCESS(Status)) { + + cfs_enter_debugger(); + //TDI_BUFFER_OVERFLOW + } + + return (Status); +} + +/* + * KsQueryProviderInfo + * Query the underlying transport device's information + * + * Arguments: + * TdiDeviceName: the transport device's name string + * ProviderInfo: TDI_PROVIDER_INFO struncture + * + * Return Value: + * NTSTATUS: Nt system status code + * + * NOTES: + * N/A + */ + +NTSTATUS +KsQueryProviderInfo( + PWSTR TdiDeviceName, + PTDI_PROVIDER_INFO ProviderInfo + ) +{ + NTSTATUS Status = STATUS_SUCCESS; + + PIRP Irp = NULL; + PMDL Mdl = NULL; + + UNICODE_STRING ControlName; + + HANDLE Handle; + PFILE_OBJECT FileObject; + PDEVICE_OBJECT DeviceObject; + + ULONG ProviderSize = 0; + + RtlInitUnicodeString(&ControlName, TdiDeviceName); + + // + // Open the Tdi Control Channel + // + + Status = KsOpenControl( + &ControlName, + &Handle, + &FileObject + ); + + if (!NT_SUCCESS(Status)) { + + KsPrint((2, "KsQueryProviderInfo: Fail to open the tdi control channel.\n")); + return (Status); + } + + // + // Obtain The Related Device Object + // + + DeviceObject = IoGetRelatedDeviceObject(FileObject); + + ProviderSize = sizeof(TDI_PROVIDER_INFO); + RtlZeroMemory(ProviderInfo, ProviderSize); + + // + // Allocating the Tdi 
Query Irp ...
+    //
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+
+    } else {
+
+        //
+        // Locking the User Buffer / Allocating a MDL for it
+        //
+
+        Status = KsLockUserBuffer(
+                    ProviderInfo,
+                    FALSE,
+                    ProviderSize,
+                    IoModifyAccess,
+                    &Mdl
+                    );
+
+        if (!NT_SUCCESS(Status)) {
+
+            IoFreeIrp(Irp);
+            Irp = NULL;
+        }
+    }
+
+    if (Irp) {
+
+        LASSERT(NT_SUCCESS(Status));
+
+        TdiBuildQueryInformation(
+            Irp,
+            DeviceObject,
+            FileObject,
+            NULL,
+            NULL,
+            TDI_QUERY_PROVIDER_INFO,
+            Mdl
+            );
+
+        Status = KsSubmitTdiIrp(
+                    DeviceObject,
+                    Irp,
+                    TRUE,
+                    &ProviderSize
+                    );
+
+        KsReleaseMdl(Mdl, FALSE);
+    }
+
+    if (!NT_SUCCESS(Status)) {
+
+        cfs_enter_debugger();
+        //TDI_BUFFER_OVERFLOW
+    }
+
+    KsCloseControl(Handle, FileObject);
+
+    return (Status);
+}
+
+/*
+ * KsQueryConnectionInfo
+ *   Query the connection info of the FileObject specified
+ *   (some statistics of the traffic)
+ *
+ * Arguments:
+ *   ConnectionObject:  the FileObject to be queried
+ *   ConnectionInfo:    buffer to contain the connection info
+ *   ConnectionSize:    length of the ConnectionInfo buffer
+ *
+ * Return Value:
+ *   NTSTATUS:  kernel status code (STATUS_SUCCESS
+ *              or other error code)
+ *
+ * NOTES:
+ *   N/A
+ */
+
+NTSTATUS
+KsQueryConnectionInfo(
+    PFILE_OBJECT            ConnectionObject,
+    PTDI_CONNECTION_INFO    ConnectionInfo,
+    PULONG                  ConnectionSize
+    )
+{
+    NTSTATUS          Status = STATUS_UNSUCCESSFUL;
+    PIRP              Irp = NULL;
+    PMDL              Mdl;
+    PDEVICE_OBJECT    DeviceObject;
+
+    LASSERT( KeGetCurrentIrql() < DISPATCH_LEVEL );
+
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    RtlZeroMemory(ConnectionInfo, *(ConnectionSize));
+
+    //
+    // Allocating the Tdi Query Irp ...
+    //
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+
+    } else {
+
+        //
+        // Locking the User Buffer / Allocating a MDL for it
+        //
+
+        Status = KsLockUserBuffer(
+                    ConnectionInfo,
+                    FALSE,
+                    *(ConnectionSize),
+                    IoModifyAccess,
+                    &Mdl
+                    );
+
+        if (!NT_SUCCESS(Status)) {
+
+            IoFreeIrp(Irp);
+            Irp = NULL;
+        }
+    }
+
+    if (Irp) {
+
+        LASSERT(NT_SUCCESS(Status));
+
+        TdiBuildQueryInformation(
+            Irp,
+            DeviceObject,
+            ConnectionObject,
+            NULL,
+            NULL,
+            TDI_QUERY_CONNECTION_INFO,
+            Mdl
+            );
+
+        Status = KsSubmitTdiIrp(
+                    DeviceObject,
+                    Irp,
+                    TRUE,
+                    ConnectionSize
+                    );
+
+        KsReleaseMdl(Mdl, FALSE);
+    }
+
+    return (Status);
+}
+
+
+/*
+ * KsInitializeTdiAddress
+ *   Initialize the tdi address
+ *
+ * Arguments:
+ *   pTransportAddress:  tdi address to be initialized
+ *   IpAddress:          the ip address of the object
+ *   IpPort:             the ip port of the object
+ *
+ * Return Value:
+ *   ULONG: the total size of the tdi address
+ *
+ * NOTES:
+ *   N/A
+ */
+
+ULONG
+KsInitializeTdiAddress(
+    IN OUT PTA_IP_ADDRESS   pTransportAddress,
+    IN ULONG                IpAddress,
+    IN USHORT               IpPort
+    )
+{
+    pTransportAddress->TAAddressCount = 1;
+    pTransportAddress->Address[ 0 ].AddressLength = TDI_ADDRESS_LENGTH_IP;
+    pTransportAddress->Address[ 0 ].AddressType   = TDI_ADDRESS_TYPE_IP;
+    pTransportAddress->Address[ 0 ].Address[ 0 ].sin_port = IpPort;
+    pTransportAddress->Address[ 0 ].Address[ 0 ].in_addr  = IpAddress;
+
+    return (FIELD_OFFSET(TRANSPORT_ADDRESS, Address->Address) + TDI_ADDRESS_LENGTH_IP);
+}
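+
+/*
+ * Usage sketch for KsInitializeTdiAddress() (illustrative only; port 88
+ * is an arbitrary example): callers fill a TA_IP_ADDRESS on the stack
+ * and pass it on to KsOpenAddress() as a generic TRANSPORT_ADDRESS:
+ *
+ *    TA_IP_ADDRESS   ta;
+ *    ULONG           len;
+ *
+ *    len = KsInitializeTdiAddress(&ta, htonl(IpAddress), htons(88));
+ *    Status = KsOpenAddress(&DeviceName, (PTRANSPORT_ADDRESS)&ta,
+ *                           len, &Handle, &FileObject);
+ */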
+
+/*
+ * KsQueryTdiAddressLength
+ *   Query the total size of the tdi address
+ *
+ * Arguments:
+ *   pTransportAddress:  tdi address to be queried
+ *
+ * Return Value:
+ *   ULONG: the total size of the tdi address
+ *
+ * NOTES:
+ *   N/A
+ */
+
+ULONG
+KsQueryTdiAddressLength(
+    PTRANSPORT_ADDRESS      pTransportAddress
+    )
+{
+    ULONG                   TotalLength = 0;
+    LONG                    i;
+
+    PTA_ADDRESS UNALIGNED   pTaAddress = NULL;
+
+    ASSERT (NULL != pTransportAddress);
+
+    TotalLength  = FIELD_OFFSET(TRANSPORT_ADDRESS, Address) +
+                   FIELD_OFFSET(TA_ADDRESS, Address) * pTransportAddress->TAAddressCount;
+
+    pTaAddress = (TA_ADDRESS UNALIGNED *)pTransportAddress->Address;
+
+    for (i = 0; i < pTransportAddress->TAAddressCount; i++)
+    {
+        TotalLength += pTaAddress->AddressLength;
+        pTaAddress = (TA_ADDRESS UNALIGNED *)((PCHAR)pTaAddress +
+                                              FIELD_OFFSET(TA_ADDRESS,Address) +
+                                              pTaAddress->AddressLength );
+    }
+
+    return (TotalLength);
+}
+
+
+/*
+ * KsQueryIpAddress
+ *   Query the ip address of the tdi object
+ *
+ * Arguments:
+ *   FileObject:     tdi object to be queried
+ *   TdiAddress:     TdiAddress buffer, to store the queried
+ *                   tdi ip address
+ *   AddressLength:  buffer length of the TdiAddress
+ *
+ * Return Value:
+ *   NTSTATUS:  kernel status code (STATUS_SUCCESS
+ *              or other error code)
+ *
+ * NOTES:
+ *   N/A
+ */
+
+NTSTATUS
+KsQueryIpAddress(
+    PFILE_OBJECT    FileObject,
+    PVOID           TdiAddress,
+    ULONG*          AddressLength
+    )
+{
+    NTSTATUS          Status;
+
+    PTDI_ADDRESS_INFO TdiAddressInfo;
+    ULONG             Length;
+
+
+    //
+    // Maximum length of TDI_ADDRESS_INFO with one TRANSPORT_ADDRESS
+    //
+
+    Length = MAX_ADDRESS_LENGTH;
+
+    TdiAddressInfo = (PTDI_ADDRESS_INFO)
+                     ExAllocatePoolWithTag(
+                            NonPagedPool,
+                            Length,
+                            'KSAI' );
+
+    if (NULL == TdiAddressInfo) {
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+
+    Status = KsQueryAddressInfo(
+                FileObject,
+                TdiAddressInfo,
+                &Length
+                );
+
+errorout:
+
+    if (NT_SUCCESS(Status))
+    {
+        if (*AddressLength < Length) {
+
+            Status = STATUS_BUFFER_TOO_SMALL;
+
+        } else {
+
+            *AddressLength = Length;
+            RtlCopyMemory(
+                TdiAddress,
+                &(TdiAddressInfo->Address),
+                Length
+                );
+
+            Status = STATUS_SUCCESS;
+        }
+    }
+
+    if (NULL != TdiAddressInfo) {
+
+        ExFreePool(TdiAddressInfo);
+    }
+
+    return Status;
+}
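+
+/*
+ * Usage sketch for KsQueryIpAddress() above (illustrative only):
+ *
+ *    UCHAR  buffer[MAX_ADDRESS_LENGTH];
+ *    ULONG  length = sizeof(buffer);
+ *
+ *    if (NT_SUCCESS(KsQueryIpAddress(FileObject, buffer, &length)))
+ *        buffer now holds the object's TRANSPORT_ADDRESS,
+ *        length bytes long;
+ */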
+
+/*
+ * KsErrorEventHandler
+ *   the common error event handler callback
+ *
+ * Arguments:
+ *   TdiEventContext:  should be the socket
+ *   Status:           the error code
+ *
+ * Return Value:
+ *   Status: STATUS_SUCCESS
+ *
+ * NOTES:
+ *   We need not do anything in such a severe error case:
+ *   the system will process it for us.
+ */
+
+NTSTATUS
+KsErrorEventHandler(
+    IN PVOID        TdiEventContext,
+    IN NTSTATUS     Status
+    )
+{
+    KsPrint((2, "KsErrorEventHandler called at Irql = %xh ...\n",
+                KeGetCurrentIrql()));
+
+    cfs_enter_debugger();
+
+    return (STATUS_SUCCESS);
+}
+
+
+/*
+ * ks_set_handlers
+ *   setup all the event handler callbacks
+ *
+ * Arguments:
+ *   tconn:  the tdi connection object
+ *
+ * Return Value:
+ *   int: ks error code
+ *
+ * NOTES:
+ *   N/A
+ */
+
+int
+ks_set_handlers(
+    ksock_tconn_t *     tconn
+    )
+{
+    NTSTATUS            status = STATUS_SUCCESS;
+    KS_EVENT_HANDLERS   handlers;
+
+    /* to make sure the address object is opened already */
+    if (tconn->kstc_addr.FileObject == NULL) {
+        goto errorout;
+    }
+
+    /* initialize the handlers indicator array: the sender and the
+       listener need different sets of callbacks; for a child
+       connection we just return. */
+
+    memset(&handlers, 0, sizeof(KS_EVENT_HANDLERS));
+
+    SetEventHandler(handlers, TDI_EVENT_ERROR, KsErrorEventHandler);
+    SetEventHandler(handlers, TDI_EVENT_DISCONNECT, KsDisconnectEventHandler);
+    SetEventHandler(handlers, TDI_EVENT_RECEIVE, KsTcpReceiveEventHandler);
+    SetEventHandler(handlers, TDI_EVENT_RECEIVE_EXPEDITED, KsTcpReceiveExpeditedEventHandler);
+    SetEventHandler(handlers, TDI_EVENT_CHAINED_RECEIVE, KsTcpChainedReceiveEventHandler);
+
+    // SetEventHandler(handlers, TDI_EVENT_CHAINED_RECEIVE_EXPEDITED, KsTcpChainedReceiveExpeditedEventHandler);
+
+    if (tconn->kstc_type == kstt_listener) {
+        SetEventHandler(handlers, TDI_EVENT_CONNECT, KsConnectEventHandler);
+    } else if (tconn->kstc_type == kstt_child) {
+        goto errorout;
+    }
+
+    /* set all the event callbacks */
+    status = KsSetEventHandlers(
+                tconn->kstc_addr.FileObject, /* Address File Object     */
+                tconn,                       /* Event Context           */
+                &handlers                    /* Event callback handlers */
+                );
+
+errorout:
+
+    return cfs_error_code(status);
+}
+
+
+/*
+ * ks_reset_handlers
+ *   disable all the event handler callbacks (set to NULL)
+ *
+ * Arguments:
+ *   tconn:  the tdi connection object
+ *
+ * Return Value:
+ *   int: ks error code
+ *
+ * NOTES:
+ *   N/A
+ */
+
+int
+ks_reset_handlers(
+    ksock_tconn_t *     tconn
+    )
+{
+    NTSTATUS            status = STATUS_SUCCESS;
+    KS_EVENT_HANDLERS   handlers;
+
+    /* to make sure the address object is opened already */
+    if (tconn->kstc_addr.FileObject == NULL) {
+        goto errorout;
+    }
+
+    /* initialize the handlers indicator array: the sender and the
+       listener need different sets of callbacks; for a child
+       connection we just return. */
+
+    memset(&handlers, 0, sizeof(KS_EVENT_HANDLERS));
+
+    SetEventHandler(handlers, TDI_EVENT_ERROR, NULL);
+    SetEventHandler(handlers, TDI_EVENT_DISCONNECT, NULL);
+    SetEventHandler(handlers, TDI_EVENT_RECEIVE, NULL);
+    SetEventHandler(handlers, TDI_EVENT_RECEIVE_EXPEDITED, NULL);
+    SetEventHandler(handlers, TDI_EVENT_CHAINED_RECEIVE, NULL);
+    // SetEventHandler(handlers, TDI_EVENT_CHAINED_RECEIVE_EXPEDITED, NULL);
+
+    if (tconn->kstc_type == kstt_listener) {
+        SetEventHandler(handlers, TDI_EVENT_CONNECT, NULL);
+    } else if (tconn->kstc_type == kstt_child) {
+        goto errorout;
+    }
+
+    /* set all the event callbacks */
+    status = KsSetEventHandlers(
+                tconn->kstc_addr.FileObject, /* Address File Object     */
+                tconn,                       /* Event Context           */
+                &handlers                    /* Event callback handlers */
+                );
+
+errorout:
+
+    return cfs_error_code(status);
+}
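+
+/*
+ * Lifecycle sketch (illustrative only): a tconn installs its event
+ * callbacks once its address object has been opened, and clears them
+ * again before the address object is closed:
+ *
+ *    rc = ks_set_handlers(tconn);      callbacks start firing
+ *    ... connection traffic ...
+ *    rc = ks_reset_handlers(tconn);    callbacks disabled again
+ */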
+
+/*
+ * KsAcceptCompletionRoutine
+ *   Irp completion routine for TdiBuildAccept (KsConnectEventHandler)
+ *
+ *   Here the system gives us a chance to check whether the
+ *   connection has been established or not.
+ *
+ * Arguments:
+ *   DeviceObject:  the device object of the transport driver
+ *   Irp:           the Irp being completed
+ *   Context:       the context we specified when issuing the Irp
+ *
+ * Return Value:
+ *   Nt status code
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsAcceptCompletionRoutine(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    )
+{
+    ksock_tconn_t * child  = (ksock_tconn_t *) Context;
+    ksock_tconn_t * parent = child->child.kstc_parent;
+
+    KsPrint((2, "KsAcceptCompletionRoutine: called at Irql: %xh\n",
+                KeGetCurrentIrql() ));
+
+    KsPrint((2, "KsAcceptCompletionRoutine: Context = %xh Status = %xh\n",
+                 Context, Irp->IoStatus.Status));
+
+    LASSERT(child->kstc_type == kstt_child);
+
+    spin_lock(&(child->kstc_lock));
+
+    LASSERT(parent->kstc_state == ksts_listening);
+    LASSERT(child->kstc_state == ksts_connecting);
+
+    if (NT_SUCCESS(Irp->IoStatus.Status)) {
+
+        child->child.kstc_accepted = TRUE;
+
+        child->kstc_state = ksts_connected;
+
+        /* wake up the daemon thread which waits on this event */
+        KeSetEvent(
+            &(parent->listener.kstc_accept_event),
+            0,
+            FALSE
+            );
+
+        spin_unlock(&(child->kstc_lock));
+
+        KsPrint((2, "KsAcceptCompletionRoutine: Got %xh, now signaling the event ...\n", parent));
+
+    } else {
+
+        /* re-use this child connection */
+        child->child.kstc_accepted = FALSE;
+        child->child.kstc_busy = FALSE;
+        child->kstc_state = ksts_associated;
+
+        spin_unlock(&(child->kstc_lock));
+    }
+
+    /* now free the Irp */
+    IoFreeIrp(Irp);
+
+    /* drop the reference count of the child */
+    ks_put_tconn(child);
+
+    return (STATUS_MORE_PROCESSING_REQUIRED);
+}
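+
+/*
+ * Sketch (illustrative only) of how the accept path consumes the
+ * backlog with the helper defined just below, mirroring what
+ * KsConnectEventHandler does later in this file:
+ *
+ *    spin_lock(&parent->kstc_lock);
+ *    child = ks_get_vacancy_backlog(parent);
+ *    if (child == NULL)
+ *        the incoming connection must be refused;
+ *    spin_unlock(&parent->kstc_lock);
+ */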
+
+/*
+ * ks_get_vacancy_backlog
+ *   Get a vacant listening child from the backlog list
+ *
+ * Arguments:
+ *   parent:  the listener daemon connection
+ *
+ * Return Value:
+ *   the child listening connection or NULL on failure
+ *
+ * Notes:
+ *   Parent's lock should be acquired before calling.
+ */
+
+ksock_tconn_t *
+ks_get_vacancy_backlog(
+    ksock_tconn_t *  parent
+    )
+{
+    ksock_tconn_t * child;
+
+    LASSERT(parent->kstc_type == kstt_listener);
+    LASSERT(parent->kstc_state == ksts_listening);
+
+    if (list_empty(&(parent->listener.kstc_listening.list))) {
+
+        child = NULL;
+
+    } else {
+
+        struct list_head * tmp;
+
+        /* check the listening queue and try to get a free connection */
+
+        list_for_each(tmp, &(parent->listener.kstc_listening.list)) {
+            child = list_entry (tmp, ksock_tconn_t, child.kstc_link);
+            spin_lock(&(child->kstc_lock));
+
+            if (!child->child.kstc_busy) {
+                LASSERT(child->kstc_state == ksts_associated);
+                child->child.kstc_busy = TRUE;
+                spin_unlock(&(child->kstc_lock));
+                break;
+            } else {
+                spin_unlock(&(child->kstc_lock));
+                child = NULL;
+            }
+        }
+    }
+
+    return child;
+}
+
+ks_addr_slot_t *
+KsSearchIpAddress(PUNICODE_STRING  DeviceName)
+{
+    ks_addr_slot_t *  slot = NULL;
+    PLIST_ENTRY       list = NULL;
+
+    spin_lock(&ks_data.ksnd_addrs_lock);
+
+    list = ks_data.ksnd_addrs_list.Flink;
+    while (list != &ks_data.ksnd_addrs_list) {
+        slot = CONTAINING_RECORD(list, ks_addr_slot_t, link);
+        if (RtlCompareUnicodeString(
+                    DeviceName,
+                    &slot->devname,
+                    TRUE) == 0) {
+            break;
+        }
+        list = list->Flink;
+        slot = NULL;
+    }
+
+    spin_unlock(&ks_data.ksnd_addrs_lock);
+
+    return slot;
+}
+
+void
+KsCleanupIpAddresses()
+{
+    spin_lock(&ks_data.ksnd_addrs_lock);
+
+    while (!IsListEmpty(&ks_data.ksnd_addrs_list)) {
+
+        ks_addr_slot_t * slot = NULL;
+        PLIST_ENTRY      list = NULL;
+
+        list = RemoveHeadList(&ks_data.ksnd_addrs_list);
+        slot = CONTAINING_RECORD(list, ks_addr_slot_t, link);
+        cfs_free(slot);
+        ks_data.ksnd_naddrs--;
+    }
+
+    cfs_assert(ks_data.ksnd_naddrs == 0);
+    spin_unlock(&ks_data.ksnd_addrs_lock);
+}
+
+VOID
+KsAddAddressHandler(
+    IN PTA_ADDRESS      Address,
+    IN PUNICODE_STRING  DeviceName,
+    IN PTDI_PNP_CONTEXT Context
+    )
+{
+    PTDI_ADDRESS_IP IpAddress = NULL;
+
+    if ( Address->AddressType == TDI_ADDRESS_TYPE_IP &&
+         Address->AddressLength == TDI_ADDRESS_LENGTH_IP ) {
+
+        ks_addr_slot_t * slot = NULL;
+
+        IpAddress = (PTDI_ADDRESS_IP) &Address->Address[0];
+        KsPrint((1, "KsAddAddressHandler: Device=%wZ Context=%xh IpAddress=%xh(%d.%d.%d.%d)\n",
+                    DeviceName, Context, IpAddress->in_addr,
+                    (IpAddress->in_addr & 0xFF000000) >> 24,
+                    (IpAddress->in_addr & 0x00FF0000) >> 16,
+                    (IpAddress->in_addr & 0x0000FF00) >> 8,
+                    (IpAddress->in_addr & 0x000000FF) >> 0 ));
+
+        slot = KsSearchIpAddress(DeviceName);
+
+        if (slot != NULL) {
+            slot->up = TRUE;
+            slot->ip_addr = ntohl(IpAddress->in_addr);
+        } else {
+            slot = cfs_alloc(sizeof(ks_addr_slot_t) + DeviceName->Length, CFS_ALLOC_ZERO);
+            if (slot != NULL) {
+                spin_lock(&ks_data.ksnd_addrs_lock);
+                InsertTailList(&ks_data.ksnd_addrs_list, &slot->link);
+                sprintf(slot->iface, "eth%d", ks_data.ksnd_naddrs++);
+                slot->ip_addr = ntohl(IpAddress->in_addr);
+                slot->up = TRUE;
+                RtlMoveMemory(&slot->buffer[0], DeviceName->Buffer, DeviceName->Length);
+                slot->devname.Length = DeviceName->Length;
+                slot->devname.MaximumLength = DeviceName->Length + sizeof(WCHAR);
+                slot->devname.Buffer = slot->buffer;
+                spin_unlock(&ks_data.ksnd_addrs_lock);
+            }
+        }
+    }
+}
+
+VOID
+KsDelAddressHandler(
+    IN PTA_ADDRESS      Address,
+    IN PUNICODE_STRING  DeviceName,
+    IN PTDI_PNP_CONTEXT Context
+    )
+{
+    PTDI_ADDRESS_IP IpAddress = NULL;
+
+    if ( Address->AddressType == TDI_ADDRESS_TYPE_IP &&
+         Address->AddressLength == TDI_ADDRESS_LENGTH_IP ) {
+
+        ks_addr_slot_t * slot = NULL;
+
+        slot = KsSearchIpAddress(DeviceName);
+
+        if (slot != NULL) {
+            slot->up = FALSE;
+
} + + IpAddress = (PTDI_ADDRESS_IP) &Address->Address[0]; + KsPrint((1, "KsDelAddressHandle: Device=%wZ Context=%xh IpAddress=%xh(%d.%d.%d.%d)\n", + DeviceName, Context, IpAddress->in_addr, + (IpAddress->in_addr & 0xFF000000) >> 24, + (IpAddress->in_addr & 0x00FF0000) >> 16, + (IpAddress->in_addr & 0x0000FF00) >> 8, + (IpAddress->in_addr & 0x000000FF) >> 0 )); + } +} + +NTSTATUS +KsRegisterPnpHandlers() +{ + TDI20_CLIENT_INTERFACE_INFO ClientInfo; + + /* initialize the global ks_data members */ + RtlInitUnicodeString(&ks_data.ksnd_client_name, TDILND_MODULE_NAME); + spin_lock_init(&ks_data.ksnd_addrs_lock); + InitializeListHead(&ks_data.ksnd_addrs_list); + + /* register the pnp handlers */ + RtlZeroMemory(&ClientInfo, sizeof(ClientInfo)); + ClientInfo.TdiVersion = TDI_CURRENT_VERSION; + + ClientInfo.ClientName = &ks_data.ksnd_client_name; + ClientInfo.AddAddressHandlerV2 = KsAddAddressHandler; + ClientInfo.DelAddressHandlerV2 = KsDelAddressHandler; + + return TdiRegisterPnPHandlers(&ClientInfo, sizeof(ClientInfo), + &ks_data.ksnd_pnp_handle); +} + +VOID +KsDeregisterPnpHandlers() +{ + if (ks_data.ksnd_pnp_handle) { + + /* De-register the pnp handlers */ + + TdiDeregisterPnPHandlers(ks_data.ksnd_pnp_handle); + ks_data.ksnd_pnp_handle = NULL; + + /* cleanup all the ip address slots */ + KsCleanupIpAddresses(); + } +} + +/* + * KsConnectEventHandler + * Connect event handler event handler, called by the underlying TDI + * transport in response to an incoming request to the listening daemon. + * + * it will grab a vacancy backlog from the children tconn list, and + * build an acception Irp with it, then transfer the Irp to TDI driver. + * + * Arguments: + * TdiEventContext: the tdi connnection object of the listening daemon + * ...... + * + * Return Value: + * Nt kernel status code + * + * Notes: + * N/A + */ + +NTSTATUS +KsConnectEventHandler( + IN PVOID TdiEventContext, + IN LONG RemoteAddressLength, + IN PVOID RemoteAddress, + IN LONG UserDataLength, + IN PVOID UserData, + IN LONG OptionsLength, + IN PVOID Options, + OUT CONNECTION_CONTEXT * ConnectionContext, + OUT PIRP * AcceptIrp + ) +{ + ksock_tconn_t * parent; + ksock_tconn_t * child; + + PFILE_OBJECT FileObject; + PDEVICE_OBJECT DeviceObject; + NTSTATUS Status; + + PIRP Irp = NULL; + PTDI_CONNECTION_INFORMATION ConnectionInfo = NULL; + + KsPrint((2,"KsConnectEventHandler: call at Irql: %u\n", KeGetCurrentIrql())); + parent = (ksock_tconn_t *) TdiEventContext; + + LASSERT(parent->kstc_type == kstt_listener); + + spin_lock(&(parent->kstc_lock)); + + if (parent->kstc_state == ksts_listening) { + + /* allocate a new ConnectionInfo to backup the peer's info */ + + ConnectionInfo = (PTDI_CONNECTION_INFORMATION)ExAllocatePoolWithTag( + NonPagedPool, sizeof(TDI_CONNECTION_INFORMATION) + + RemoteAddressLength, 'iCsK' ); + + if (NULL == ConnectionInfo) { + + Status = STATUS_INSUFFICIENT_RESOURCES; + cfs_enter_debugger(); + goto errorout; + } + + /* initializing ConnectionInfo structure ... 
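       (layout note: the TDI_CONNECTION_INFORMATION header and the
       copy of the remote TRANSPORT_ADDRESS share one allocation,
       which is why RemoteAddress is simply pointed at
       ConnectionInfo + 1 below)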
*/
+
+        ConnectionInfo->UserDataLength = UserDataLength;
+        ConnectionInfo->UserData = UserData;
+        ConnectionInfo->OptionsLength = OptionsLength;
+        ConnectionInfo->Options = Options;
+        ConnectionInfo->RemoteAddressLength = RemoteAddressLength;
+        ConnectionInfo->RemoteAddress = ConnectionInfo + 1;
+
+        RtlCopyMemory(
+            ConnectionInfo->RemoteAddress,
+            RemoteAddress,
+            RemoteAddressLength
+            );
+
+        /* get a vacant listening child tdi connection */
+
+        child = ks_get_vacancy_backlog(parent);
+
+        if (child) {
+
+            spin_lock(&(child->kstc_lock));
+            child->child.kstc_info.ConnectionInfo = ConnectionInfo;
+            child->child.kstc_info.Remote = ConnectionInfo->RemoteAddress;
+            child->kstc_state = ksts_connecting;
+            spin_unlock(&(child->kstc_lock));
+
+        } else {
+
+            KsPrint((2, "KsConnectEventHandler: Not enough backlogs: refused the connection: %xh\n", parent));
+
+            Status = STATUS_INSUFFICIENT_RESOURCES;
+
+            goto errorout;
+        }
+
+        FileObject = child->child.kstc_info.FileObject;
+        DeviceObject = IoGetRelatedDeviceObject (FileObject);
+
+        Irp = KsBuildTdiIrp(DeviceObject);
+
+        TdiBuildAccept(
+            Irp,
+            DeviceObject,
+            FileObject,
+            KsAcceptCompletionRoutine,
+            child,
+            NULL,
+            NULL
+            );
+
+        IoSetNextIrpStackLocation(Irp);
+
+        /* grab a reference on the child tdi connection */
+        ks_get_tconn(child);
+
+        Status = STATUS_MORE_PROCESSING_REQUIRED;
+
+        *AcceptIrp = Irp;
+        *ConnectionContext = child;
+
+    } else {
+
+        Status = STATUS_CONNECTION_REFUSED;
+        goto errorout;
+    }
+
+    spin_unlock(&(parent->kstc_lock));
+
+    return Status;
+
+errorout:
+
+    spin_unlock(&(parent->kstc_lock));
+
+    {
+        *AcceptIrp = NULL;
+        *ConnectionContext = NULL;
+
+        if (ConnectionInfo) {
+
+            ExFreePool(ConnectionInfo);
+        }
+
+        if (Irp) {
+
+            IoFreeIrp (Irp);
+        }
+    }
+
+    return Status;
+}
+
+/*
+ * KsDisconectCompletionRoutine
+ *   the Irp completion routine for TdiBuildDisconnect
+ *
+ *   We just signal the event and return STATUS_MORE_PROCESSING_REQUIRED
+ *   so the caller keeps responsibility for the Irp.
+ *
+ * Arguments:
+ *   DeviceObject:  the device object of the transport
+ *   Irp:           the Irp being completed
+ *   Context:       the event specified by the caller
+ *
+ * Return Value:
+ *   Nt status code
+ *
+ * Notes:
+ *   N/A
+ */
+
+NTSTATUS
+KsDisconectCompletionRoutine (
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp,
+    IN PVOID            Context
+    )
+{
+    UNREFERENCED_PARAMETER(DeviceObject);
+
+    KeSetEvent((PKEVENT) Context, 0, FALSE);
+
+    return STATUS_MORE_PROCESSING_REQUIRED;
+}
+
+
+/*
+ * KsDisconnectHelper
+ *   the routine to be executed in the WorkItem procedure;
+ *   it disconnects a tdi connection
+ *
+ * Arguments:
+ *   WorkItem:  the context transferred to the workitem
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   tconn is already referenced in abort_connection ...
+ */
+
+VOID
+KsDisconnectHelper(PKS_DISCONNECT_WORKITEM WorkItem)
+{
+    ksock_tconn_t * tconn = WorkItem->tconn;
+
+    DbgPrint("KsDisconnectHelper: disconnecting tconn=%p\n", tconn);
+    ks_disconnect_tconn(tconn, WorkItem->Flags);
+
+    KeSetEvent(&(WorkItem->Event), 0, FALSE);
+
+    spin_lock(&(tconn->kstc_lock));
+    cfs_clear_flag(tconn->kstc_flags, KS_TCONN_DISCONNECT_BUSY);
+    spin_unlock(&(tconn->kstc_lock));
+    ks_put_tconn(tconn);
+}
+
+
+/*
+ * KsDisconnectEventHandler
+ *   Disconnect event handler, called by the underlying TDI transport
+ *   in response to an incoming disconnection notification from a
+ *   remote node.
+ *
+ * Arguments:
+ *   ConnectionContext:  tdi connection object
+ *   DisconnectFlags:    specifies the nature of the disconnection
+ *   ......
+ * + * Return Value: + * Nt kernel status code + * + * Notes: + * N/A + */ + + +NTSTATUS +KsDisconnectEventHandler( + IN PVOID TdiEventContext, + IN CONNECTION_CONTEXT ConnectionContext, + IN LONG DisconnectDataLength, + IN PVOID DisconnectData, + IN LONG DisconnectInformationLength, + IN PVOID DisconnectInformation, + IN ULONG DisconnectFlags + ) +{ + ksock_tconn_t * tconn; + NTSTATUS Status; + PKS_DISCONNECT_WORKITEM WorkItem; + + tconn = (ksock_tconn_t *)ConnectionContext; + + KsPrint((2, "KsTcpDisconnectEventHandler: called at Irql: %xh\n", + KeGetCurrentIrql() )); + + KsPrint((2, "tconn = %x DisconnectFlags= %xh\n", + tconn, DisconnectFlags)); + + ks_get_tconn(tconn); + spin_lock(&(tconn->kstc_lock)); + + WorkItem = &(tconn->kstc_disconnect); + + if (tconn->kstc_state != ksts_connected) { + + Status = STATUS_SUCCESS; + + } else { + + if (cfs_is_flag_set(DisconnectFlags, TDI_DISCONNECT_ABORT)) { + + Status = STATUS_REMOTE_DISCONNECT; + + } else if (cfs_is_flag_set(DisconnectFlags, TDI_DISCONNECT_RELEASE)) { + + Status = STATUS_GRACEFUL_DISCONNECT; + } + + if (!cfs_is_flag_set(tconn->kstc_flags, KS_TCONN_DISCONNECT_BUSY)) { + + ks_get_tconn(tconn); + + WorkItem->Flags = DisconnectFlags; + WorkItem->tconn = tconn; + + cfs_set_flag(tconn->kstc_flags, KS_TCONN_DISCONNECT_BUSY); + + /* queue the workitem to call */ + ExQueueWorkItem(&(WorkItem->WorkItem), DelayedWorkQueue); + } + } + + spin_unlock(&(tconn->kstc_lock)); + ks_put_tconn(tconn); + + return (Status); +} + +NTSTATUS +KsTcpReceiveCompletionRoutine( + IN PIRP Irp, + IN PKS_TCP_COMPLETION_CONTEXT Context + ) +{ + NTSTATUS Status = Irp->IoStatus.Status; + + if (NT_SUCCESS(Status)) { + + ksock_tconn_t *tconn = Context->tconn; + + PKS_TSDU_DAT KsTsduDat = Context->CompletionContext; + PKS_TSDU_BUF KsTsduBuf = Context->CompletionContext; + + KsPrint((1, "KsTcpReceiveCompletionRoutine: Total %xh bytes.\n", + Context->KsTsduMgr->TotalBytes )); + + spin_lock(&(tconn->kstc_lock)); + + if (TSDU_TYPE_DAT == KsTsduDat->TsduType) { + if (cfs_is_flag_set(KsTsduDat->TsduFlags, KS_TSDU_DAT_RECEIVING)) { + cfs_clear_flag(KsTsduDat->TsduFlags, KS_TSDU_DAT_RECEIVING); + } else { + cfs_enter_debugger(); + } + } else { + ASSERT(TSDU_TYPE_BUF == KsTsduBuf->TsduType); + if (cfs_is_flag_set(KsTsduBuf->TsduFlags, KS_TSDU_BUF_RECEIVING)) { + cfs_clear_flag(KsTsduBuf->TsduFlags, KS_TSDU_BUF_RECEIVING); + } else { + cfs_enter_debugger(); + } + } + + spin_unlock(&(tconn->kstc_lock)); + + /* wake up the thread waiting for the completion of this Irp */ + KeSetEvent(Context->Event, 0, FALSE); + + /* re-active the ks connection and wake up the scheduler */ + if (tconn->kstc_conn && tconn->kstc_sched_cb) { + tconn->kstc_sched_cb( tconn, FALSE, NULL, + Context->KsTsduMgr->TotalBytes ); + } + + } else { + + /* un-expected errors occur, we must abort the connection */ + ks_abort_tconn(Context->tconn); + } + + if (Context) { + + /* Freeing the Context structure... */ + ExFreePool(Context); + Context = NULL; + } + + + /* free the Irp */ + if (Irp) { + IoFreeIrp(Irp); + } + + return (Status); +} + + +/* + * KsTcpCompletionRoutine + * the Irp completion routine for TdiBuildSend and TdiBuildReceive ... + * We need call the use's own CompletionRoutine if specified. Or + * it's a synchronous case, we need signal the event. + * + * Arguments: + * DeviceObject: the device object of the transport + * Irp: the Irp is being completed. 
+ * Context: the context we specified when issuing the Irp + * + * Return Value: + * Nt status code + * + * Notes: + * N/A + */ + +NTSTATUS +KsTcpCompletionRoutine( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp, + IN PVOID Context + ) +{ + if (Context) { + + PKS_TCP_COMPLETION_CONTEXT CompletionContext = NULL; + ksock_tconn_t * tconn = NULL; + + CompletionContext = (PKS_TCP_COMPLETION_CONTEXT) Context; + tconn = CompletionContext->tconn; + + /* release the chained mdl */ + KsReleaseMdl(Irp->MdlAddress, FALSE); + Irp->MdlAddress = NULL; + + if (CompletionContext->CompletionRoutine) { + + if ( CompletionContext->bCounted && + InterlockedDecrement(&CompletionContext->ReferCount) != 0 ) { + goto errorout; + } + + // + // Giving control to user specified CompletionRoutine ... + // + + CompletionContext->CompletionRoutine( + Irp, + CompletionContext + ); + + } else { + + // + // Signaling the Event ... + // + + KeSetEvent(CompletionContext->Event, 0, FALSE); + } + + /* drop the reference count of the tconn object */ + ks_put_tconn(tconn); + + } else { + + cfs_enter_debugger(); + } + +errorout: + + return STATUS_MORE_PROCESSING_REQUIRED; +} + +/* + * KsTcpSendCompletionRoutine + * the user specified Irp completion routine for asynchronous + * data transmission requests. + * + * It will do th cleanup job of the ksock_tx_t and wake up the + * ks scheduler thread + * + * Arguments: + * Irp: the Irp is being completed. + * Context: the context we specified when issuing the Irp + * + * Return Value: + * Nt status code + * + * Notes: + * N/A + */ + +NTSTATUS +KsTcpSendCompletionRoutine( + IN PIRP Irp, + IN PKS_TCP_COMPLETION_CONTEXT Context + ) +{ + NTSTATUS Status = Irp->IoStatus.Status; + ULONG rc = Irp->IoStatus.Information; + ksock_tconn_t * tconn = Context->tconn; + PKS_TSDUMGR KsTsduMgr = Context->KsTsduMgr; + + ENTRY; + + LASSERT(tconn) ; + + if (NT_SUCCESS(Status)) { + + if (Context->bCounted) { + PVOID tx = Context->CompletionContext; + + ASSERT(tconn->kstc_update_tx != NULL); + + /* update the tx, rebasing the kiov or iov pointers */ + tx = tconn->kstc_update_tx(tconn, tx, rc); + + /* update the KsTsudMgr total bytes */ + spin_lock(&tconn->kstc_lock); + KsTsduMgr->TotalBytes -= rc; + spin_unlock(&tconn->kstc_lock); + + /* + * now it's time to re-queue the conns into the + * scheduler queue and wake the scheduler thread. + */ + + if (tconn->kstc_conn && tconn->kstc_sched_cb) { + tconn->kstc_sched_cb( tconn, TRUE, tx, 0); + } + + } else { + + PKS_TSDU KsTsdu = Context->CompletionContext; + PKS_TSDU_BUF KsTsduBuf = Context->CompletionContext2; + PKS_TSDU_DAT KsTsduDat = Context->CompletionContext2; + + spin_lock(&tconn->kstc_lock); + /* This is bufferred sending ... */ + ASSERT(KsTsduBuf->StartOffset == 0); + + if (KsTsduBuf->DataLength > Irp->IoStatus.Information) { + /* not fully sent .... 
we have to abort the connection */
+                spin_unlock(&tconn->kstc_lock);
+                ks_abort_tconn(tconn);
+                goto errorout;
+            }
+
+            if (KsTsduBuf->TsduType == TSDU_TYPE_BUF) {
+                /* free the buffer */
+                ExFreePool(KsTsduBuf->UserBuffer);
+                KsTsduMgr->TotalBytes -=  KsTsduBuf->DataLength;
+                KsTsdu->StartOffset   +=  sizeof(KS_TSDU_BUF);
+            } else if (KsTsduDat->TsduType == TSDU_TYPE_DAT) {
+                KsTsduMgr->TotalBytes -=  KsTsduDat->DataLength;
+                KsTsdu->StartOffset   +=  KsTsduDat->TotalLength;
+            } else {
+                cfs_enter_debugger(); /* should not get here */
+            }
+
+            if (KsTsdu->StartOffset == KsTsdu->LastOffset) {
+
+                list_del(&KsTsdu->Link);
+                KsTsduMgr->NumOfTsdu--;
+                KsPutKsTsdu(KsTsdu);
+            }
+
+            spin_unlock(&tconn->kstc_lock);
+        }
+
+    } else {
+
+        /* cfs_enter_debugger(); */
+
+        /*
+         * when the transmission is unsuccessful, we need to abort
+         * the tdi connection, but not destroy it.  the socknal conn
+         * will drop its reference count, and then the tdi connection
+         * will be freed.
+         */
+
+        ks_abort_tconn(tconn);
+    }
+
+errorout:
+
+    /* freeing the Context structure... */
+
+    if (Context) {
+        ExFreePool(Context);
+        Context = NULL;
+    }
+
+    /* it's our duty to free the Irp. */
+
+    if (Irp) {
+        IoFreeIrp(Irp);
+        Irp = NULL;
+    }
+
+    EXIT;
+
+    return Status;
+}
+
+/*
+ * Normal receive event handler
+ *
+ * It will move data from the system Tsdu to our TsduList
+ */
+
+NTSTATUS
+KsTcpReceiveEventHandler(
+    IN PVOID                TdiEventContext,
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN ULONG                ReceiveFlags,
+    IN ULONG                BytesIndicated,
+    IN ULONG                BytesAvailable,
+    OUT ULONG *             BytesTaken,
+    IN PVOID                Tsdu,
+    OUT PIRP *              IoRequestPacket
+    )
+{
+    NTSTATUS            Status;
+
+    ksock_tconn_t *     tconn;
+
+    PKS_CHAIN           KsChain;
+    PKS_TSDUMGR         KsTsduMgr;
+    PKS_TSDU            KsTsdu;
+    PKS_TSDU_DAT        KsTsduDat;
+    PKS_TSDU_BUF        KsTsduBuf;
+
+    BOOLEAN             bIsExpedited;
+    BOOLEAN             bIsCompleteTsdu;
+
+    BOOLEAN             bNewTsdu = FALSE;
+    BOOLEAN             bNewBuff = FALSE;
+
+    PCHAR               Buffer = NULL;
+
+    PIRP                Irp = NULL;
+    PMDL                Mdl = NULL;
+    PFILE_OBJECT        FileObject;
+    PDEVICE_OBJECT      DeviceObject;
+
+    ULONG               BytesReceived = 0;
+
+    PKS_TCP_COMPLETION_CONTEXT context = NULL;
+
+
+    tconn = (ksock_tconn_t *) ConnectionContext;
+
+    ks_get_tconn(tconn);
+
+    /* check whether the whole body of payload is received or not */
+    if ( (cfs_is_flag_set(ReceiveFlags, TDI_RECEIVE_ENTIRE_MESSAGE)) &&
+         (BytesIndicated == BytesAvailable) ) {
+        bIsCompleteTsdu = TRUE;
+    } else {
+        bIsCompleteTsdu = FALSE;
+    }
+
+    bIsExpedited = cfs_is_flag_set(ReceiveFlags, TDI_RECEIVE_EXPEDITED);
+
+    KsPrint((2, "KsTcpReceiveEventHandler BytesIndicated = %d BytesAvailable = %d ...\n", BytesIndicated, BytesAvailable));
+    KsPrint((2, "bIsCompleteTsdu = %d bIsExpedited = %d\n", bIsCompleteTsdu, bIsExpedited ));
+
+    spin_lock(&(tconn->kstc_lock));
+
+    /* check that we are connected (sender or child), not a listener ... */
+    if ( !((tconn->kstc_state == ksts_connected) &&
+           (tconn->kstc_type == kstt_sender ||
+            tconn->kstc_type == kstt_child))) {
+
+        *BytesTaken = BytesIndicated;
+
+        spin_unlock(&(tconn->kstc_lock));
+        ks_put_tconn(tconn);
+
+        return (STATUS_SUCCESS);
+    }
+
+    if (tconn->kstc_type == kstt_sender) {
+        KsChain = &(tconn->sender.kstc_recv);
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        KsChain = &(tconn->child.kstc_recv);
+    }
+
+    if (bIsExpedited) {
+        KsTsduMgr = &(KsChain->Expedited);
+    } else {
+        KsTsduMgr = &(KsChain->Normal);
+    }
+
+    /* if the Tsdu is even larger than the biggest Tsdu, we have
+       to allocate a new buffer and use TSDU_TYPE_BUF to store it */
+
+    if ( KS_TSDU_STRU_SIZE(BytesAvailable) > ks_data.ksnd_tsdu_size -
KS_DWORD_ALIGN(sizeof(KS_TSDU))) { + bNewBuff = TRUE; + } + + /* retrieve the latest Tsdu buffer form TsduMgr + list if the list is not empty. */ + + if (list_empty(&(KsTsduMgr->TsduList))) { + + LASSERT(KsTsduMgr->NumOfTsdu == 0); + KsTsdu = NULL; + + } else { + + LASSERT(KsTsduMgr->NumOfTsdu > 0); + KsTsdu = list_entry(KsTsduMgr->TsduList.prev, KS_TSDU, Link); + + /* if this Tsdu does not contain enough space, we need + allocate a new Tsdu queue. */ + + if (bNewBuff) { + if ( KsTsdu->LastOffset + sizeof(KS_TSDU_BUF) > + KsTsdu->TotalLength ) { + KsTsdu = NULL; + } + } else { + if ( KS_TSDU_STRU_SIZE(BytesAvailable) > + KsTsdu->TotalLength - KsTsdu->LastOffset ) { + KsTsdu = NULL; + } + } + } + + /* allocating the buffer for TSDU_TYPE_BUF */ + if (bNewBuff) { + Buffer = ExAllocatePool(NonPagedPool, BytesAvailable); + if (NULL == Buffer) { + /* there's no enough memory for us. We just try to + receive maximum bytes with a new Tsdu */ + bNewBuff = FALSE; + KsTsdu = NULL; + } + } + + /* allocate a new Tsdu in case we are not statisfied. */ + + if (NULL == KsTsdu) { + + KsTsdu = KsAllocateKsTsdu(); + + if (NULL == KsTsdu) { + goto errorout; + } else { + bNewTsdu = TRUE; + } + } + + KsTsduBuf = (PKS_TSDU_BUF)((PUCHAR)KsTsdu + KsTsdu->LastOffset); + KsTsduDat = (PKS_TSDU_DAT)((PUCHAR)KsTsdu + KsTsdu->LastOffset); + + if (bNewBuff) { + + /* setup up the KS_TSDU_BUF record */ + + KsTsduBuf->TsduType = TSDU_TYPE_BUF; + KsTsduBuf->TsduFlags = 0; + KsTsduBuf->StartOffset = 0; + KsTsduBuf->UserBuffer = Buffer; + KsTsduBuf->DataLength = BytesReceived = BytesAvailable; + + KsTsdu->LastOffset += sizeof(KS_TSDU_BUF); + + } else { + + /* setup the KS_TSDU_DATA to contain all the messages */ + + KsTsduDat->TsduType = TSDU_TYPE_DAT; + KsTsduDat->TsduFlags = 0; + + if ( KsTsdu->TotalLength - KsTsdu->LastOffset >= + KS_TSDU_STRU_SIZE(BytesAvailable) ) { + BytesReceived = BytesAvailable; + } else { + BytesReceived = KsTsdu->TotalLength - KsTsdu->LastOffset - + FIELD_OFFSET(KS_TSDU_DAT, Data); + BytesReceived &= (~((ULONG)3)); + } + KsTsduDat->DataLength = BytesReceived; + KsTsduDat->TotalLength = KS_TSDU_STRU_SIZE(BytesReceived); + KsTsduDat->StartOffset = 0; + + Buffer = &KsTsduDat->Data[0]; + + KsTsdu->LastOffset += KsTsduDat->TotalLength; + } + + KsTsduMgr->TotalBytes += BytesReceived; + + if (bIsCompleteTsdu) { + + /* It's a complete receive, we just move all + the data from system to our Tsdu */ + + RtlMoveMemory( + Buffer, + Tsdu, + BytesReceived + ); + + *BytesTaken = BytesReceived; + Status = STATUS_SUCCESS; + + if (bNewTsdu) { + list_add_tail(&(KsTsdu->Link), &(KsTsduMgr->TsduList)); + KsTsduMgr->NumOfTsdu++; + } + + KeSetEvent(&(KsTsduMgr->Event), 0, FALSE); + + /* re-active the ks connection and wake up the scheduler */ + if (tconn->kstc_conn && tconn->kstc_sched_cb) { + tconn->kstc_sched_cb( tconn, FALSE, NULL, + KsTsduMgr->TotalBytes ); + } + + } else { + + /* there's still data in tdi internal queue, we need issue a new + Irp to receive all of them. first allocate the tcp context */ + + context = ExAllocatePoolWithTag( + NonPagedPool, + sizeof(KS_TCP_COMPLETION_CONTEXT), + 'cTsK'); + + if (!context) { + + Status = STATUS_INSUFFICIENT_RESOURCES; + goto errorout; + } + + /* setup the context */ + RtlZeroMemory(context, sizeof(KS_TCP_COMPLETION_CONTEXT)); + + context->tconn = tconn; + context->CompletionRoutine = KsTcpReceiveCompletionRoutine; + context->CompletionContext = KsTsdu; + context->CompletionContext = bNewBuff ? 
+
+/*
+ * Expedited receive event handler
+ */
+
+NTSTATUS
+KsTcpReceiveExpeditedEventHandler(
+    IN PVOID                TdiEventContext,
+    IN CONNECTION_CONTEXT   ConnectionContext,
+    IN ULONG                ReceiveFlags,
+    IN ULONG                BytesIndicated,
+    IN ULONG                BytesAvailable,
+    OUT ULONG *             BytesTaken,
+    IN PVOID                Tsdu,
+    OUT PIRP *              IoRequestPacket
+    )
+{
+    return KsTcpReceiveEventHandler(
+                TdiEventContext,
+                ConnectionContext,
+                ReceiveFlags | TDI_RECEIVE_EXPEDITED,
+                BytesIndicated,
+                BytesAvailable,
+                BytesTaken,
+                Tsdu,
+                IoRequestPacket
+                );
+}
+
+
+/*
+ * Bulk receive event handler
+ *
+ * It will queue all the system Tsdus to our TsduList.
+ * Then later ks_recv_mdl will release them.
+ */
+
+NTSTATUS
+KsTcpChainedReceiveEventHandler (
+    IN PVOID TdiEventContext,       // the event context
+    IN CONNECTION_CONTEXT ConnectionContext,
+    IN ULONG ReceiveFlags,
+    IN ULONG ReceiveLength,
+    IN ULONG StartingOffset,        // offset of start of client data in TSDU
+    IN PMDL  Tsdu,                  // TSDU data chain
+    IN PVOID TsduDescriptor         // for call to TdiReturnChainedReceives
+    )
+{
+
+    NTSTATUS            Status;
+
+    ksock_tconn_t *     tconn;
+
+    PKS_CHAIN           KsChain;
+    PKS_TSDUMGR         KsTsduMgr;
+    PKS_TSDU            KsTsdu;
+    PKS_TSDU_MDL        KsTsduMdl;
+
+    BOOLEAN             bIsExpedited;
+    BOOLEAN             bNewTsdu = FALSE;
+
+    tconn = (ksock_tconn_t *) ConnectionContext;
+
+    bIsExpedited = cfs_is_flag_set(ReceiveFlags, TDI_RECEIVE_EXPEDITED);
+
+    KsPrint((2, "KsTcpChainedReceive: ReceiveLength = %xh bIsExpedited = %d\n", ReceiveLength, bIsExpedited));
+
+    ks_get_tconn(tconn);
+    spin_lock(&(tconn->kstc_lock));
+
+    /* we must be connected (a sender or a child), not a listener */
+    if ( !((tconn->kstc_state == ksts_connected) &&
+           (tconn->kstc_type == kstt_sender ||
+            tconn->kstc_type == kstt_child))) {
+
+        spin_unlock(&(tconn->kstc_lock));
+        ks_put_tconn(tconn);
+
+        return (STATUS_SUCCESS);
+    }
+
+    /* get the latest Tsdu buffer from the TsduMgr list.
+       just set NULL if the list is empty. */
+
+    if (tconn->kstc_type == kstt_sender) {
+        KsChain = &(tconn->sender.kstc_recv);
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        KsChain = &(tconn->child.kstc_recv);
+    }
+
+    if (bIsExpedited) {
+        KsTsduMgr = &(KsChain->Expedited);
+    } else {
+        KsTsduMgr = &(KsChain->Normal);
+    }
+
+    if (list_empty(&(KsTsduMgr->TsduList))) {
+
+        LASSERT(KsTsduMgr->NumOfTsdu == 0);
+        KsTsdu = NULL;
+
+    } else {
+
+        LASSERT(KsTsduMgr->NumOfTsdu > 0);
+        KsTsdu = list_entry(KsTsduMgr->TsduList.prev, KS_TSDU, Link);
+        LASSERT(KsTsdu->Magic == KS_TSDU_MAGIC);
+
+        if (sizeof(KS_TSDU_MDL) > KsTsdu->TotalLength - KsTsdu->LastOffset) {
+            KsTsdu = NULL;
+        }
+    }
+
+    /* if there's no Tsdu or the free size is not enough for this
+       KS_TSDU_MDL structure, we need to allocate a new Tsdu. */
+
+    if (NULL == KsTsdu) {
+
+        KsTsdu = KsAllocateKsTsdu();
+
+        if (NULL == KsTsdu) {
+            goto errorout;
+        } else {
+            bNewTsdu = TRUE;
+        }
+    }
+
+    /* just queue the KS_TSDU_MDL to the Tsdu buffer */
+
+    KsTsduMdl = (PKS_TSDU_MDL)((PUCHAR)KsTsdu + KsTsdu->LastOffset);
+
+    KsTsduMdl->TsduType    = TSDU_TYPE_MDL;
+    KsTsduMdl->DataLength  = ReceiveLength;
+    KsTsduMdl->StartOffset = StartingOffset;
+    KsTsduMdl->Mdl         = Tsdu;
+    KsTsduMdl->Descriptor  = TsduDescriptor;
+
+    KsTsdu->LastOffset    += sizeof(KS_TSDU_MDL);
+    KsTsduMgr->TotalBytes += ReceiveLength;
+
+    KsPrint((2, "KsTcpChainedReceiveEventHandler: Total %xh bytes.\n",
+                KsTsduMgr->TotalBytes ));
+
+    Status = STATUS_PENDING;
+
+    /* attach it to the TsduMgr list if the Tsdu is newly created. */
+    if (bNewTsdu) {
+
+        list_add_tail(&(KsTsdu->Link), &(KsTsduMgr->TsduList));
+        KsTsduMgr->NumOfTsdu++;
+    }
+
+    spin_unlock(&(tconn->kstc_lock));
+
+    /* wake up the threads waiting in ks_recv_mdl */
+    KeSetEvent(&(KsTsduMgr->Event), 0, FALSE);
+
+    if (tconn->kstc_conn && tconn->kstc_sched_cb) {
+        tconn->kstc_sched_cb( tconn, FALSE, NULL,
+                              KsTsduMgr->TotalBytes );
+    }
+
+    ks_put_tconn(tconn);
+
+    /* Return STATUS_PENDING to the system because we still own
+       the MDL resources. ks_recv_mdl is expected to free them. */
+
+    return (Status);
+
+errorout:
+
+    spin_unlock(&(tconn->kstc_lock));
+
+    if (bNewTsdu && (KsTsdu != NULL)) {
+        KsFreeKsTsdu(KsTsdu);
+    }
+
+    /* abort the tdi connection */
+    ks_abort_tconn(tconn);
+    ks_put_tconn(tconn);
+
+
+    Status = STATUS_SUCCESS;
+
+    return (Status);
+}
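+
+/*
+ * Illustrative sketch (not part of the driver): the MDL ownership rule the
+ * chained-receive path above relies on. Because the handler returns
+ * STATUS_PENDING, the driver keeps the system Tsdu MDL until the consumer
+ * has drained the slot, and only then returns the descriptor to TDI.
+ */
+#if 0
+static VOID
+KsExampleDrainMdlSlot(PKS_TSDU KsTsdu, PKS_TSDU_MDL KsTsduMdl)
+{
+    if (KsTsduMdl->DataLength == 0) {
+        /* all payload consumed: give the buffer back to the transport */
+        TdiReturnChainedReceives(&(KsTsduMdl->Descriptor), 1);
+        KsTsdu->StartOffset += sizeof(KS_TSDU_MDL);
+    }
+}
+#endif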
+
+
+/*
+ * Expedited & Bulk receive event handler
+ */
+
+NTSTATUS
+KsTcpChainedReceiveExpeditedEventHandler (
+    IN PVOID TdiEventContext,       // the event context
+    IN CONNECTION_CONTEXT ConnectionContext,
+    IN ULONG ReceiveFlags,
+    IN ULONG ReceiveLength,
+    IN ULONG StartingOffset,        // offset of start of client data in TSDU
+    IN PMDL  Tsdu,                  // TSDU data chain
+    IN PVOID TsduDescriptor         // for call to TdiReturnChainedReceives
+    )
+{
+    return KsTcpChainedReceiveEventHandler(
+                TdiEventContext,
+                ConnectionContext,
+                ReceiveFlags | TDI_RECEIVE_EXPEDITED,
+                ReceiveLength,
+                StartingOffset,
+                Tsdu,
+                TsduDescriptor );
+}
+
+
+VOID
+KsPrintProviderInfo(
+    PWSTR DeviceName,
+    PTDI_PROVIDER_INFO ProviderInfo
+    )
+{
+    KsPrint((2, "%ws ProviderInfo:\n", DeviceName));
+
+    KsPrint((2, "  Version              : 0x%4.4X\n", ProviderInfo->Version ));
+    KsPrint((2, "  MaxSendSize          : %d\n", ProviderInfo->MaxSendSize ));
+    KsPrint((2, "  MaxConnectionUserData: %d\n", ProviderInfo->MaxConnectionUserData ));
+    KsPrint((2, "  MaxDatagramSize      : %d\n", ProviderInfo->MaxDatagramSize ));
+    KsPrint((2, "  ServiceFlags         : 0x%8.8X\n", ProviderInfo->ServiceFlags ));
+
+    if (ProviderInfo->ServiceFlags & TDI_SERVICE_CONNECTION_MODE) {
+        KsPrint((2, "  CONNECTION_MODE\n"));
+    }
+
+    if (ProviderInfo->ServiceFlags & TDI_SERVICE_ORDERLY_RELEASE) {
+        KsPrint((2, "  ORDERLY_RELEASE\n"));
+    }
+
+    if (ProviderInfo->ServiceFlags & TDI_SERVICE_CONNECTIONLESS_MODE) {
+        KsPrint((2, "  CONNECTIONLESS_MODE\n"));
+    }
+
+    if (ProviderInfo->ServiceFlags & TDI_SERVICE_ERROR_FREE_DELIVERY) {
+        KsPrint((2, "  ERROR_FREE_DELIVERY\n"));
+    }
+
+    if (ProviderInfo->ServiceFlags & TDI_SERVICE_SECURITY_LEVEL) {
+        KsPrint((2, "  SECURITY_LEVEL\n"));
+    }
+
+    if (ProviderInfo->ServiceFlags & TDI_SERVICE_BROADCAST_SUPPORTED) {
+        KsPrint((2, "  BROADCAST_SUPPORTED\n"));
+    }
+
+    if (ProviderInfo->ServiceFlags & TDI_SERVICE_MULTICAST_SUPPORTED) {
+        KsPrint((2, "  MULTICAST_SUPPORTED\n"));
+    }
+
+    if (ProviderInfo->ServiceFlags & TDI_SERVICE_DELAYED_ACCEPTANCE) {
+        KsPrint((2, "  DELAYED_ACCEPTANCE\n"));
+    }
+
+    if (ProviderInfo->ServiceFlags & TDI_SERVICE_EXPEDITED_DATA) {
+        KsPrint((2, "  EXPEDITED_DATA\n"));
+    }
+
+    if (ProviderInfo->ServiceFlags & TDI_SERVICE_INTERNAL_BUFFERING) {
+        KsPrint((2, "  INTERNAL_BUFFERING\n"));
+    }
+
+    if (ProviderInfo->ServiceFlags & TDI_SERVICE_ROUTE_DIRECTED) {
+        KsPrint((2, "  ROUTE_DIRECTED\n"));
+    }
+
+    if (ProviderInfo->ServiceFlags & TDI_SERVICE_NO_ZERO_LENGTH) {
+        KsPrint((2, "  NO_ZERO_LENGTH\n"));
+    }
+
+    if (ProviderInfo->ServiceFlags & TDI_SERVICE_POINT_TO_POINT) {
+        KsPrint((2, "  POINT_TO_POINT\n"));
+    }
+
+    if (ProviderInfo->ServiceFlags & TDI_SERVICE_MESSAGE_MODE) {
+        KsPrint((2, "  MESSAGE_MODE\n"));
+    }
+
+    if (ProviderInfo->ServiceFlags & TDI_SERVICE_HALF_DUPLEX) {
+        KsPrint((2, "  HALF_DUPLEX\n"));
+    }
+
+    KsPrint((2, "  MinimumLookaheadData : %d\n", ProviderInfo->MinimumLookaheadData ));
+    KsPrint((2, "  MaximumLookaheadData : %d\n", ProviderInfo->MaximumLookaheadData ));
+    KsPrint((2, "  NumberOfResources    : %d\n", ProviderInfo->NumberOfResources ));
+}
+
+
+/*
+ * KsAllocateKsTsdu
+ *   Reuse a Tsdu from the freelist or allocate a new Tsdu
+ *   from the LookAsideList table or the NonPagedPool
+ *
+ * Arguments:
+ *   N/A
+ *
+ * Return Value:
+ *   PKS_TSDU: the new Tsdu
+ *             or NULL if it fails
+ *
+ * Notes:
+ *   N/A
+ */
+
+PKS_TSDU
+KsAllocateKsTsdu()
+{
+    PKS_TSDU    KsTsdu = NULL;
+
+    spin_lock(&(ks_data.ksnd_tsdu_lock));
+
+    if (!list_empty (&(ks_data.ksnd_freetsdus))) {
+
+        LASSERT(ks_data.ksnd_nfreetsdus > 0);
+
+        KsTsdu = list_entry(ks_data.ksnd_freetsdus.next, KS_TSDU, Link);
+        list_del(&(KsTsdu->Link));
+        ks_data.ksnd_nfreetsdus--;
+
+    } else {
+
+        KsTsdu = (PKS_TSDU) cfs_mem_cache_alloc(
+                        ks_data.ksnd_tsdu_slab, 0);
+    }
+
+    spin_unlock(&(ks_data.ksnd_tsdu_lock));
+
+    if (NULL != KsTsdu) {
+        KsInitializeKsTsdu(KsTsdu, ks_data.ksnd_tsdu_size);
+    }
+
+    return (KsTsdu);
+}
+
+
+/*
+ * KsPutKsTsdu
+ *   Move the Tsdu to the free tsdu list in ks_data.
+ *
+ * Arguments:
+ *   KsTsdu: Tsdu to be moved.
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+VOID
+KsPutKsTsdu(
+    PKS_TSDU  KsTsdu
+    )
+{
+    spin_lock(&(ks_data.ksnd_tsdu_lock));
+
+    list_add_tail( &(KsTsdu->Link), &(ks_data.ksnd_freetsdus));
+    ks_data.ksnd_nfreetsdus++;
+
+    spin_unlock(&(ks_data.ksnd_tsdu_lock));
+}
+
+
+/*
+ * KsFreeKsTsdu
+ *   Release a Tsdu: uninitialize then free it.
+ *
+ * Arguments:
+ *   KsTsdu: Tsdu to be freed.
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+VOID
+KsFreeKsTsdu(
+    PKS_TSDU  KsTsdu
+    )
+{
+    cfs_mem_cache_free(
+            ks_data.ksnd_tsdu_slab,
+            KsTsdu );
+}
+
+
+/*
+ * KsInitializeKsTsdu
+ *   Initialize the Tsdu buffer header
+ *
+ * Arguments:
+ *   KsTsdu: the Tsdu to be initialized
+ *   Length: the total length of the Tsdu
+ *
+ * Return Value:
+ *   VOID
+ *
+ * NOTES:
+ *   N/A
+ */
+
+VOID
+KsInitializeKsTsdu(
+    PKS_TSDU    KsTsdu,
+    ULONG       Length
+    )
+{
+    RtlZeroMemory(KsTsdu, Length);
+    KsTsdu->Magic = KS_TSDU_MAGIC;
+    KsTsdu->TotalLength = Length;
+    KsTsdu->StartOffset = KsTsdu->LastOffset =
+    KS_DWORD_ALIGN(sizeof(KS_TSDU));
+}
+
+
+/*
+ * KsInitializeKsTsduMgr
+ *   Initialize the management structure of
+ *   Tsdu buffers
+ *
+ * Arguments:
+ *   TsduMgr: the TsduMgr to be initialized
+ *
+ * Return Value:
+ *   VOID
+ *
+ * NOTES:
+ *   N/A
+ */
+
+VOID
+KsInitializeKsTsduMgr(
+    PKS_TSDUMGR     TsduMgr
+    )
+{
+    KeInitializeEvent(
+            &(TsduMgr->Event),
+            NotificationEvent,
+            FALSE
+            );
+
+    CFS_INIT_LIST_HEAD(
+            &(TsduMgr->TsduList)
+            );
+
+    TsduMgr->NumOfTsdu  = 0;
+    TsduMgr->TotalBytes = 0;
+}
+
+
+/*
+ * KsInitializeKsChain
+ *   Initialize the chain structure for receiving
+ *   or transmitting
+ *
+ * Arguments:
+ *   KsChain: the KsChain to be initialized
+ *
+ * Return Value:
+ *   VOID
+ *
+ * NOTES:
+ *   N/A
+ */
+
+VOID
+KsInitializeKsChain(
+    PKS_CHAIN       KsChain
+    )
+{
+    KsInitializeKsTsduMgr(&(KsChain->Normal));
+    KsInitializeKsTsduMgr(&(KsChain->Expedited));
+}
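+
+/*
+ * Illustrative sketch (not part of the driver): the invariants a Tsdu
+ * buffer keeps after KsInitializeKsTsdu. Payload slots live between
+ * StartOffset (consumer side) and LastOffset (producer side), both bounded
+ * by TotalLength; an empty Tsdu has StartOffset == LastOffset. The helper
+ * name below is hypothetical.
+ */
+#if 0
+static BOOLEAN
+KsExampleTsduIsConsistent(PKS_TSDU KsTsdu)
+{
+    return (KsTsdu->Magic == KS_TSDU_MAGIC) &&
+           (KsTsdu->StartOffset <= KsTsdu->LastOffset) &&
+           (KsTsdu->LastOffset  <= KsTsdu->TotalLength);
+}
+#endif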
+
+
+/*
+ * KsCleanupTsduMgr
+ *   Clean up all the Tsdus in the TsduMgr list
+ *
+ * Arguments:
+ *   KsTsduMgr: the Tsdu list manager
+ *
+ * Return Value:
+ *   NTSTATUS: nt status code
+ *
+ * NOTES:
+ *   N/A
+ */
+
+NTSTATUS
+KsCleanupTsduMgr(
+    PKS_TSDUMGR     KsTsduMgr
+    )
+{
+    PKS_TSDU        KsTsdu;
+    PKS_TSDU_DAT    KsTsduDat;
+    PKS_TSDU_BUF    KsTsduBuf;
+    PKS_TSDU_MDL    KsTsduMdl;
+
+    LASSERT(NULL != KsTsduMgr);
+
+    KeSetEvent(&(KsTsduMgr->Event), 0, FALSE);
+
+    while (!list_empty(&KsTsduMgr->TsduList)) {
+
+        KsTsdu = list_entry(KsTsduMgr->TsduList.next, KS_TSDU, Link);
+        LASSERT(KsTsdu->Magic == KS_TSDU_MAGIC);
+
+        if (KsTsdu->StartOffset == KsTsdu->LastOffset) {
+
+            //
+            // KsTsdu is empty now, we need to free it ...
+            //
+
+            list_del(&(KsTsdu->Link));
+            KsTsduMgr->NumOfTsdu--;
+
+            KsFreeKsTsdu(KsTsdu);
+
+        } else {
+
+            KsTsduDat = (PKS_TSDU_DAT)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+            KsTsduBuf = (PKS_TSDU_BUF)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+            KsTsduMdl = (PKS_TSDU_MDL)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+
+            if (TSDU_TYPE_DAT == KsTsduDat->TsduType) {
+
+                KsTsdu->StartOffset += KsTsduDat->TotalLength;
+
+            } else if (TSDU_TYPE_BUF == KsTsduBuf->TsduType) {
+
+                ASSERT(KsTsduBuf->UserBuffer != NULL);
+
+                if (KsTsduBuf->DataLength > KsTsduBuf->StartOffset) {
+                    ExFreePool(KsTsduBuf->UserBuffer);
+                } else {
+                    cfs_enter_debugger();
+                }
+
+                KsTsdu->StartOffset += sizeof(KS_TSDU_BUF);
+
+            } else if (TSDU_TYPE_MDL == KsTsduMdl->TsduType) {
+
+                //
+                // MDL Tsdu Unit ...
+                //
+
+                TdiReturnChainedReceives(
+                    &(KsTsduMdl->Descriptor),
+                    1 );
+
+                KsTsdu->StartOffset += sizeof(KS_TSDU_MDL);
+            }
+        }
+    }
+
+    return STATUS_SUCCESS;
+}
+
+
+/*
+ * KsCleanupKsChain
+ *   Clean up the TsduMgrs of the KsChain
+ *
+ * Arguments:
+ *   KsChain: the chain managing TsduMgr
+ *
+ * Return Value:
+ *   NTSTATUS: nt status code
+ *
+ * NOTES:
+ *   N/A
+ */
+
+NTSTATUS
+KsCleanupKsChain(
+    PKS_CHAIN   KsChain
+    )
+{
+    NTSTATUS    Status;
+
+    LASSERT(NULL != KsChain);
+
+    Status = KsCleanupTsduMgr(
+                &(KsChain->Normal)
+                );
+
+    if (!NT_SUCCESS(Status)) {
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    Status = KsCleanupTsduMgr(
+                &(KsChain->Expedited)
+                );
+
+    if (!NT_SUCCESS(Status)) {
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+errorout:
+
+    return Status;
+}
+
+
+/*
+ * KsCleanupTsdu
+ *   Clean up all the Tsdus of a tdi connected object
+ *
+ * Arguments:
+ *   tconn: the tdi connection which is connected already.
+ *
+ * Return Value:
+ *   Nt status code
+ *
+ * NOTES:
+ *   N/A
+ */
+
+NTSTATUS
+KsCleanupTsdu(
+    ksock_tconn_t * tconn
+    )
+{
+    NTSTATUS    Status = STATUS_SUCCESS;
+
+
+    if (tconn->kstc_type != kstt_sender &&
+        tconn->kstc_type != kstt_child ) {
+
+        goto errorout;
+    }
+
+    if (tconn->kstc_type == kstt_sender) {
+
+        Status = KsCleanupKsChain(
+                    &(tconn->sender.kstc_recv)
+                    );
+
+        if (!NT_SUCCESS(Status)) {
+            cfs_enter_debugger();
+            goto errorout;
+        }
+
+        Status = KsCleanupKsChain(
+                    &(tconn->sender.kstc_send)
+                    );
+
+        if (!NT_SUCCESS(Status)) {
+            cfs_enter_debugger();
+            goto errorout;
+        }
+
+    } else {
+
+        Status = KsCleanupKsChain(
+                    &(tconn->child.kstc_recv)
+                    );
+
+        if (!NT_SUCCESS(Status)) {
+            cfs_enter_debugger();
+            goto errorout;
+        }
+
+        Status = KsCleanupKsChain(
+                    &(tconn->child.kstc_send)
+                    );
+
+        if (!NT_SUCCESS(Status)) {
+            cfs_enter_debugger();
+            goto errorout;
+        }
+
+    }
+
+errorout:
+
+    return (Status);
+}
+
+
+/*
+ * KsCopyMdlChainToMdlChain
+ *   Copy data from a [chained] Mdl to another [chained] Mdl.
+ *   The Tdi library does not provide this function, so we have
+ *   to implement it ourselves.
+ *
+ * Arguments:
+ *   SourceMdlChain:      the source mdl
+ *   SourceOffset:        start offset of the source
+ *   DestinationMdlChain: the destination mdl
+ *   DestinationOffset:   the offset where data are to be copied.
+ *   BytesTobecopied:     the expected number of bytes to be copied
+ *   BytesCopied:         receives the number of bytes actually copied
+ *
+ * Return Value:
+ *   NTSTATUS: STATUS_SUCCESS or other error code
+ *
+ * NOTES:
+ *   The length of the source mdl must be >= SourceOffset + BytesTobecopied
+ */
+
+NTSTATUS
+KsCopyMdlChainToMdlChain(
+    IN PMDL     SourceMdlChain,
+    IN ULONG    SourceOffset,
+    IN PMDL     DestinationMdlChain,
+    IN ULONG    DestinationOffset,
+    IN ULONG    BytesTobecopied,
+    OUT PULONG  BytesCopied
+    )
+{
+    PMDL        SrcMdl = SourceMdlChain;
+    PMDL        DstMdl = DestinationMdlChain;
+
+    PUCHAR      SrcBuf = NULL;
+    PUCHAR      DstBuf = NULL;
+
+    ULONG       dwBytes = 0;
+
+    NTSTATUS    Status = STATUS_SUCCESS;
+
+
+    while (dwBytes < BytesTobecopied) {
+
+        ULONG   Length = 0;
+
+        while (MmGetMdlByteCount(SrcMdl) <= SourceOffset) {
+
+            SourceOffset -= MmGetMdlByteCount(SrcMdl);
+
+            SrcMdl = SrcMdl->Next;
+
+            if (NULL == SrcMdl) {
+
+                Status = STATUS_INVALID_PARAMETER;
+                goto errorout;
+            }
+        }
+
+        while (MmGetMdlByteCount(DstMdl) <= DestinationOffset) {
+
+            DestinationOffset -= MmGetMdlByteCount(DstMdl);
+
+            DstMdl = DstMdl->Next;
+
+            if (NULL == DstMdl) {
+
+                Status = STATUS_INVALID_PARAMETER;
+                goto errorout;
+            }
+        }
+
+        DstBuf = (PUCHAR)KsMapMdlBuffer(DstMdl);
+
+        if (NULL == DstBuf) {
+            Status = STATUS_INSUFFICIENT_RESOURCES;
+            goto errorout;
+        }
+
+        //
+        // Here we need to skip the OVERFLOW case via RtlCopyMemory :-(
+        //
+
+        if ( KsQueryMdlsSize(SrcMdl) - SourceOffset >
+             MmGetMdlByteCount(DstMdl) - DestinationOffset ) {
+
+            Length = BytesTobecopied - dwBytes;
+
+            if (Length > KsQueryMdlsSize(SrcMdl) - SourceOffset) {
+                Length = KsQueryMdlsSize(SrcMdl) - SourceOffset;
+            }
+
+            if (Length > MmGetMdlByteCount(DstMdl) - DestinationOffset) {
+                Length = MmGetMdlByteCount(DstMdl) - DestinationOffset;
+            }
+
+            SrcBuf = (PUCHAR)KsMapMdlBuffer(SrcMdl);
+
+            if (NULL == SrcBuf) {
+                Status = STATUS_INSUFFICIENT_RESOURCES;
+                goto errorout;
+            }
+
+            RtlCopyMemory(
+                DstBuf + DestinationOffset,
+                SrcBuf + SourceOffset,
+                Length
+                );
+
+        } else {
+
+            Status = TdiCopyMdlToBuffer(
+                        SrcMdl,
+                        SourceOffset,
+                        DstBuf,
+                        DestinationOffset,
+                        MmGetMdlByteCount(DstMdl),
+                        &Length
+                        );
+
+            if (STATUS_BUFFER_OVERFLOW == Status) {
+                cfs_enter_debugger();
+            } else if (!NT_SUCCESS(Status)) {
+                cfs_enter_debugger();
+                goto errorout;
+            }
+        }
+
+        SourceOffset      += Length;
+        DestinationOffset += Length;
+        dwBytes           += Length;
+    }
+
+errorout:
+
+    if (NT_SUCCESS(Status)) {
+        *BytesCopied = dwBytes;
+    } else {
+        *BytesCopied = 0;
+    }
+
+    return Status;
+}
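+
+/*
+ * Illustrative sketch (not part of the driver): how a caller might use
+ * KsCopyMdlChainToMdlChain, checking both the status and the returned
+ * byte count, since a short copy indicates a malformed chain. The helper
+ * name is hypothetical.
+ */
+#if 0
+static NTSTATUS
+KsExampleCopyChains(PMDL Src, PMDL Dst, ULONG Bytes)
+{
+    ULONG    Copied = 0;
+    NTSTATUS Status;
+
+    Status = KsCopyMdlChainToMdlChain(Src, 0, Dst, 0, Bytes, &Copied);
+    if (NT_SUCCESS(Status) && Copied != Bytes) {
+        Status = STATUS_INVALID_PARAMETER;  /* short copy */
+    }
+    return Status;
+}
+#endif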
+
+
+/*
+ * KsQueryMdlsSize
+ *   Query the whole size of a MDL (may be chained)
+ *
+ * Arguments:
+ *   Mdl: the Mdl to be queried
+ *
+ * Return Value:
+ *   ULONG: the total size of the mdl
+ *
+ * NOTES:
+ *   N/A
+ */
+
+ULONG
+KsQueryMdlsSize (PMDL Mdl)
+{
+    PMDL    Next = Mdl;
+    ULONG   Length = 0;
+
+
+    //
+    // Walking the MDL chain ...
+    //
+
+    while (Next) {
+        Length += MmGetMdlByteCount(Next);
+        Next = Next->Next;
+    }
+
+    return (Length);
+}
+
+
+/*
+ * KsLockUserBuffer
+ *   Allocate a MDL for the buffer and lock the pages into
+ *   nonpaged pool
+ *
+ * Arguments:
+ *   UserBuffer: the user buffer to be locked
+ *   bPaged:     TRUE if the buffer is pageable, FALSE if it
+ *               comes from the nonpaged pool
+ *   Length:     length in bytes of the buffer
+ *   Operation:  read or write access
+ *   pMdl:       the result of the created mdl
+ *
+ * Return Value:
+ *   NTSTATUS: kernel status code (STATUS_SUCCESS
+ *             or other error code)
+ *
+ * NOTES:
+ *   N/A
+ */
+
+NTSTATUS
+KsLockUserBuffer (
+    IN PVOID            UserBuffer,
+    IN BOOLEAN          bPaged,
+    IN ULONG            Length,
+    IN LOCK_OPERATION   Operation,
+    OUT PMDL *          pMdl
+    )
+{
+    NTSTATUS    Status;
+    PMDL        Mdl = NULL;
+
+    LASSERT(UserBuffer != NULL);
+
+    *pMdl = NULL;
+
+    Mdl = IoAllocateMdl(
+                UserBuffer,
+                Length,
+                FALSE,
+                FALSE,
+                NULL
+                );
+
+    if (Mdl == NULL) {
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+
+    } else {
+
+        __try {
+
+            if (bPaged) {
+                MmProbeAndLockPages(
+                    Mdl,
+                    KernelMode,
+                    Operation
+                    );
+            } else {
+                MmBuildMdlForNonPagedPool(
+                    Mdl
+                    );
+            }
+
+            Status = STATUS_SUCCESS;
+
+            *pMdl = Mdl;
+
+        } __except (EXCEPTION_EXECUTE_HANDLER) {
+
+            IoFreeMdl(Mdl);
+
+            Mdl = NULL;
+
+            cfs_enter_debugger();
+
+            Status = STATUS_INVALID_USER_BUFFER;
+        }
+    }
+
+    return Status;
+}
+
+/*
+ * KsMapMdlBuffer
+ *   Map the mdl into a buffer in kernel space
+ *
+ * Arguments:
+ *   Mdl: the mdl to be mapped
+ *
+ * Return Value:
+ *   PVOID: the buffer mapped or NULL in failure
+ *
+ * NOTES:
+ *   N/A
+ */
+
+PVOID
+KsMapMdlBuffer (PMDL    Mdl)
+{
+    LASSERT(Mdl != NULL);
+
+    return MmGetSystemAddressForMdlSafe(
+                Mdl,
+                NormalPagePriority
+                );
+}
+
+
+/*
+ * KsReleaseMdl
+ *   Unlock all the pages in the mdl
+ *
+ * Arguments:
+ *   Mdl: memory description list to be released
+ *
+ * Return Value:
+ *   N/A
+ *
+ * NOTES:
+ *   N/A
+ */
+
+VOID
+KsReleaseMdl (IN PMDL   Mdl,
+              IN int    Paged )
+{
+    LASSERT(Mdl != NULL);
+
+    while (Mdl) {
+
+        PMDL    Next;
+
+        Next = Mdl->Next;
+
+        if (Paged) {
+            MmUnlockPages(Mdl);
+        }
+
+        IoFreeMdl(Mdl);
+
+        Mdl = Next;
+    }
+}
+
+
+/*
+ * ks_lock_buffer
+ *   allocate a MDL for the user-specified buffer and lock (page in)
+ *   all the pages of the buffer into system memory
+ *
+ * Arguments:
+ *   buffer: the user buffer to be locked
+ *   paged:  whether the buffer is pageable
+ *   length: length in bytes of the buffer
+ *   access: read or write access
+ *   kmdl:   the result of the created mdl
+ *
+ * Return Value:
+ *   int: the ks error code: 0: success / -x: failure
+ *
+ * Notes:
+ *   N/A
+ */
+
+int
+ks_lock_buffer (
+    void *            buffer,
+    int               paged,
+    int               length,
+    LOCK_OPERATION    access,
+    ksock_mdl_t **    kmdl
+    )
+{
+    NTSTATUS    status;
+
+    status = KsLockUserBuffer(
+                    buffer,
+                    paged != 0,
+                    length,
+                    access,
+                    kmdl
+                    );
+
+    return cfs_error_code(status);
+}
+
+
+/*
+ * ks_map_mdl
+ *   Map the mdl pages into kernel space
+ *
+ * Arguments:
+ *   mdl: the mdl to be mapped
+ *
+ * Return Value:
+ *   void *: the buffer mapped or NULL in failure
+ *
+ * Notes:
+ *   N/A
+ */
+
+void *
+ks_map_mdl (ksock_mdl_t * mdl)
+{
+    LASSERT(mdl != NULL);
+
+    return KsMapMdlBuffer(mdl);
+}
+
+/*
+ * ks_release_mdl
+ *   Unlock all the pages in the mdl and release the mdl
+ *
+ * Arguments:
+ *   mdl: memory description list to be released
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void
+ks_release_mdl (ksock_mdl_t *mdl, int paged)
+{
+    LASSERT(mdl != NULL);
+
+    KsReleaseMdl(mdl, paged);
+}
+
+
+/*
+ * ks_create_tconn
+ *   allocate a new tconn structure from the SLAB cache or
+ *   NonPaged system pool
+ *
+ * Arguments:
+ *   N/A
+ *
+ * Return Value:
+ *   ksock_tconn_t *: the address of tconn or NULL if it fails
+ *
+ * NOTES:
+ *   N/A
+ */
+
+ksock_tconn_t *
+ks_create_tconn()
+{
+    ksock_tconn_t * tconn = NULL;
+
+    /* allocate ksock_tconn_t from the slab cache memory */
+
+    tconn = (ksock_tconn_t *)cfs_mem_cache_alloc(
+                ks_data.ksnd_tconn_slab, CFS_ALLOC_ZERO);
+
+    if (tconn) {
+
+        /* zero tconn elements */
+        memset(tconn, 0, sizeof(ksock_tconn_t));
+
+        /* initialize the tconn ... */
+        tconn->kstc_magic = KS_TCONN_MAGIC;
+
+        ExInitializeWorkItem(
+                &(tconn->kstc_disconnect.WorkItem),
+                KsDisconnectHelper,
+                &(tconn->kstc_disconnect)
+                );
+
+        KeInitializeEvent(
+                &(tconn->kstc_disconnect.Event),
+                SynchronizationEvent,
+                FALSE );
+
+        ExInitializeWorkItem(
+                &(tconn->kstc_destroy),
+                ks_destroy_tconn,
+                tconn
+                );
+
+        spin_lock_init(&(tconn->kstc_lock));
+
+        ks_get_tconn(tconn);
+
+        spin_lock(&(ks_data.ksnd_tconn_lock));
+
+        /* attach it into the global list in ks_data */
+
+        list_add(&(tconn->kstc_list), &(ks_data.ksnd_tconns));
+        ks_data.ksnd_ntconns++;
+        spin_unlock(&(ks_data.ksnd_tconn_lock));
+
+        tconn->kstc_rcv_wnd = tconn->kstc_snd_wnd = 0x10000;
+    }
+
+    return (tconn);
+}
+
+
+/*
+ * ks_free_tconn
+ *   free the tconn structure to the SLAB cache or NonPaged
+ *   system pool
+ *
+ * Arguments:
+ *   tconn: the tconn to be freed
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void
+ks_free_tconn(ksock_tconn_t * tconn)
+{
+    LASSERT(atomic_read(&(tconn->kstc_refcount)) == 0);
+
+    spin_lock(&(ks_data.ksnd_tconn_lock));
+
+    /* remove it from the global list */
+    list_del(&tconn->kstc_list);
+    ks_data.ksnd_ntconns--;
+
+    /* if this is the last tconn, it would be safe for
+       ks_tdi_fini_data to quit ... */
+    if (ks_data.ksnd_ntconns == 0) {
+        cfs_wake_event(&ks_data.ksnd_tconn_exit);
+    }
+    spin_unlock(&(ks_data.ksnd_tconn_lock));
+
+    /* free the structure memory */
+    cfs_mem_cache_free(ks_data.ksnd_tconn_slab, tconn);
+}
+
+
+/*
+ * ks_init_listener
+ *   Initialize the tconn as a listener (daemon)
+ *
+ * Arguments:
+ *   tconn: the listener tconn
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void
+ks_init_listener(
+    ksock_tconn_t * tconn
+    )
+{
+    /* preparation: initialize the tconn members */
+
+    tconn->kstc_type = kstt_listener;
+
+    RtlInitUnicodeString(&(tconn->kstc_dev), TCP_DEVICE_NAME);
+
+    CFS_INIT_LIST_HEAD(&(tconn->listener.kstc_listening.list));
+    CFS_INIT_LIST_HEAD(&(tconn->listener.kstc_accepted.list));
+
+    cfs_init_event( &(tconn->listener.kstc_accept_event),
+                    TRUE,
+                    FALSE );
+
+    cfs_init_event( &(tconn->listener.kstc_destroy_event),
+                    TRUE,
+                    FALSE );
+
+    tconn->kstc_state = ksts_inited;
+}
+
+
+/*
+ * ks_init_sender
+ *   Initialize the tconn as a sender
+ *
+ * Arguments:
+ *   tconn: the sender tconn
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void
+ks_init_sender(
+    ksock_tconn_t * tconn
+    )
+{
+    tconn->kstc_type = kstt_sender;
+    RtlInitUnicodeString(&(tconn->kstc_dev), TCP_DEVICE_NAME);
+
+    KsInitializeKsChain(&(tconn->sender.kstc_recv));
+    KsInitializeKsChain(&(tconn->sender.kstc_send));
+
+    tconn->kstc_snd_wnd = TDINAL_WINDOW_DEFAULT_SIZE;
+    tconn->kstc_rcv_wnd = TDINAL_WINDOW_DEFAULT_SIZE;
+
+    tconn->kstc_state = ksts_inited;
+}
+
+/*
+ * ks_init_child
+ *   Initialize the tconn as a child
+ *
+ * Arguments:
+ *   tconn: the child tconn
+ *
+ * Return Value:
+ *   N/A
+ *
+ * NOTES:
+ *   N/A
+ */
+
+void
+ks_init_child(
+    ksock_tconn_t * tconn
+    )
+{
+    tconn->kstc_type = kstt_child;
+    RtlInitUnicodeString(&(tconn->kstc_dev), TCP_DEVICE_NAME);
+
+    KsInitializeKsChain(&(tconn->child.kstc_recv));
+    KsInitializeKsChain(&(tconn->child.kstc_send));
+
+    tconn->kstc_snd_wnd = TDINAL_WINDOW_DEFAULT_SIZE;
+    tconn->kstc_rcv_wnd = TDINAL_WINDOW_DEFAULT_SIZE;
+
+    tconn->kstc_state = ksts_inited;
+}
+
+/*
+ * ks_get_tconn
+ *   increase the reference count of the tconn by 1
+ *
+ * Arguments:
+ *   tconn: the tdi connection to be referenced
+ *
+ * Return Value:
+ *   N/A
+ *
+ * NOTES:
+ *   N/A
+ */
+
+void
+ks_get_tconn(
+    ksock_tconn_t * tconn
+    )
+{
+    atomic_inc(&(tconn->kstc_refcount));
+}
+
+/*
+ * ks_put_tconn
+ *   decrease the reference count of the tconn and destroy
+ *   it if the reference count becomes 0.
+ *
+ * Arguments:
+ *   tconn: the tdi connection to be dereferenced
+ *
+ * Return Value:
+ *   N/A
+ *
+ * NOTES:
+ *   N/A
+ */
+
+void
+ks_put_tconn(
+    ksock_tconn_t *tconn
+    )
+{
+    if (atomic_dec_and_test(&(tconn->kstc_refcount))) {
+
+        spin_lock(&(tconn->kstc_lock));
+
+        if ( ( tconn->kstc_type == kstt_child ||
+               tconn->kstc_type == kstt_sender ) &&
+             ( tconn->kstc_state == ksts_connected ) ) {
+
+            spin_unlock(&(tconn->kstc_lock));
+
+            ks_abort_tconn(tconn);
+
+        } else {
+
+            if (cfs_is_flag_set(tconn->kstc_flags, KS_TCONN_DESTROY_BUSY)) {
+                cfs_enter_debugger();
+            } else {
+                ExQueueWorkItem(
+                        &(tconn->kstc_destroy),
+                        DelayedWorkQueue
+                        );
+
+                cfs_set_flag(tconn->kstc_flags, KS_TCONN_DESTROY_BUSY);
+            }
+
+            spin_unlock(&(tconn->kstc_lock));
+        }
+    }
+}
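+
+/*
+ * Illustrative sketch (not part of the driver): every code path that takes
+ * a tconn reference with ks_get_tconn must pair it with ks_put_tconn; the
+ * final put queues ks_destroy_tconn (below) on a system work item instead
+ * of destroying the tconn inline. The helper name is hypothetical.
+ */
+#if 0
+static void
+KsExampleUseTconn(ksock_tconn_t *tconn)
+{
+    ks_get_tconn(tconn);    /* pin the connection */
+
+    /* ... operate on tconn while holding the reference ... */
+
+    ks_put_tconn(tconn);    /* may trigger deferred destruction */
+}
+#endif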
+
+/*
+ * ks_destroy_tconn
+ *   cleanup the tdi connection and free it
+ *
+ * Arguments:
+ *   tconn: the tdi connection to be cleaned.
+ *
+ * Return Value:
+ *   N/A
+ *
+ * NOTES:
+ *   N/A
+ */
+
+void
+ks_destroy_tconn(
+    ksock_tconn_t * tconn
+    )
+{
+    LASSERT(tconn->kstc_refcount.counter == 0);
+
+    if (tconn->kstc_type == kstt_listener) {
+
+        ks_reset_handlers(tconn);
+
+        /* for a listener, we just need to close the address object */
+        KsCloseAddress(
+                tconn->kstc_addr.Handle,
+                tconn->kstc_addr.FileObject
+                );
+
+        tconn->kstc_state = ksts_inited;
+
+    } else if (tconn->kstc_type == kstt_child) {
+
+        /* for child tdi connections */
+
+        /* disassociate its connection object
+           from the address object */
+
+        if (tconn->kstc_state == ksts_associated) {
+            KsDisassociateAddress(
+                tconn->child.kstc_info.FileObject
+                );
+        }
+
+        /* release the connection object */
+
+        KsCloseConnection(
+                tconn->child.kstc_info.Handle,
+                tconn->child.kstc_info.FileObject
+                );
+
+        /* release its reference to its parent's address object */
+        KsCloseAddress(
+                NULL,
+                tconn->kstc_addr.FileObject
+                );
+
+        spin_lock(&tconn->child.kstc_parent->kstc_lock);
+        spin_lock(&tconn->kstc_lock);
+
+        tconn->kstc_state = ksts_inited;
+
+        /* remove it from its parent's queues */
+
+        if (tconn->child.kstc_queued) {
+
+            list_del(&(tconn->child.kstc_link));
+
+            if (tconn->child.kstc_queueno) {
+
+                LASSERT(tconn->child.kstc_parent->listener.kstc_accepted.num > 0);
+                tconn->child.kstc_parent->listener.kstc_accepted.num -= 1;
+
+            } else {
+
+                LASSERT(tconn->child.kstc_parent->listener.kstc_listening.num > 0);
+                tconn->child.kstc_parent->listener.kstc_listening.num -= 1;
+            }
+
+            tconn->child.kstc_queued = FALSE;
+        }
+
+        spin_unlock(&tconn->kstc_lock);
+        spin_unlock(&tconn->child.kstc_parent->kstc_lock);
+
+        /* drop the reference of the parent tconn */
+        ks_put_tconn(tconn->child.kstc_parent);
+
+    } else if (tconn->kstc_type == kstt_sender) {
+
+        ks_reset_handlers(tconn);
+
+        /* release the connection object */
+
+        KsCloseConnection(
+                tconn->sender.kstc_info.Handle,
+                tconn->sender.kstc_info.FileObject
+                );
+
+        /* release its reference to its parent's address object */
+        KsCloseAddress(
+                tconn->kstc_addr.Handle,
+                tconn->kstc_addr.FileObject
+                );
+
+        tconn->kstc_state = ksts_inited;
+
+    } else {
+        cfs_enter_debugger();
+    }
+
+    /* free the tconn structure ... */
+
+    ks_free_tconn(tconn);
+}
+
+int
+ks_query_data(
+    ksock_tconn_t * tconn,
+    size_t *        size,
+    int             bIsExpedited )
+{
+    int             rc = 0;
+
+    PKS_CHAIN       KsChain;
+    PKS_TSDUMGR     KsTsduMgr;
+
+    *size = 0;
+
+    ks_get_tconn(tconn);
+    spin_lock(&(tconn->kstc_lock));
+
+    if ( tconn->kstc_type != kstt_sender &&
+         tconn->kstc_type != kstt_child) {
+        rc = -EINVAL;
+        spin_unlock(&(tconn->kstc_lock));
+        goto errorout;
+    }
+
+    if (tconn->kstc_state != ksts_connected) {
+        rc = -ENOTCONN;
+        spin_unlock(&(tconn->kstc_lock));
+        goto errorout;
+    }
+
+    if (tconn->kstc_type == kstt_sender) {
+        KsChain = &(tconn->sender.kstc_recv);
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        KsChain = &(tconn->child.kstc_recv);
+    }
+
+    if (bIsExpedited) {
+        KsTsduMgr = &(KsChain->Expedited);
+    } else {
+        KsTsduMgr = &(KsChain->Normal);
+    }
+
+    *size = KsTsduMgr->TotalBytes;
+    spin_unlock(&(tconn->kstc_lock));
+
+errorout:
+
+    ks_put_tconn(tconn);
+
+    return (rc);
+}
+
+/*
+ * ks_get_tcp_option
+ *   Query the options of the tcp stream connection
+ *
+ * Arguments:
+ *   tconn:       the tdi connection
+ *   ID:          option id
+ *   OptionValue: buffer to store the option value
+ *   Length:      the length of the value, to be returned
+ *
+ * Return Value:
+ *   int: ks return code
+ *
+ * NOTES:
+ *   N/A
+ */
+
+int
+ks_get_tcp_option (
+    ksock_tconn_t * tconn,
+    ULONG           ID,
+    PVOID           OptionValue,
+    PULONG          Length
+    )
+{
+    NTSTATUS            Status = STATUS_SUCCESS;
+
+    IO_STATUS_BLOCK     IoStatus;
+
+    TCP_REQUEST_QUERY_INFORMATION_EX QueryInfoEx;
+
+    PFILE_OBJECT        ConnectionObject;
+    PDEVICE_OBJECT      DeviceObject = NULL;
+
+    PIRP                Irp = NULL;
+    PIO_STACK_LOCATION  IrpSp = NULL;
+
+    KEVENT              Event;
+
+    /* make sure the tdi connection is connected */
+
+    ks_get_tconn(tconn);
+
+    if (tconn->kstc_state != ksts_connected) {
+        Status = STATUS_INVALID_PARAMETER;
+        goto errorout;
+    }
+
+    LASSERT(tconn->kstc_type == kstt_sender ||
+            tconn->kstc_type == kstt_child);
+
+    if (tconn->kstc_type == kstt_sender) {
+        ConnectionObject = tconn->sender.kstc_info.FileObject;
+    } else {
+        ConnectionObject = tconn->child.kstc_info.FileObject;
+    }
+
+    QueryInfoEx.ID.toi_id     = ID;
+    QueryInfoEx.ID.toi_type   = INFO_TYPE_CONNECTION;
+    QueryInfoEx.ID.toi_class  = INFO_CLASS_PROTOCOL;
+    QueryInfoEx.ID.toi_entity.tei_entity   = CO_TL_ENTITY;
+    QueryInfoEx.ID.toi_entity.tei_instance = 0;
+
+    RtlZeroMemory(&(QueryInfoEx.Context), CONTEXT_SIZE);
+
+    KeInitializeEvent(&Event, NotificationEvent, FALSE);
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    Irp = IoBuildDeviceIoControlRequest(
+                IOCTL_TCP_QUERY_INFORMATION_EX,
+                DeviceObject,
+                &QueryInfoEx,
+                sizeof(TCP_REQUEST_QUERY_INFORMATION_EX),
+                OptionValue,
+                *Length,
+                FALSE,
+                &Event,
+                &IoStatus
+                );
+
+    if (Irp == NULL) {
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    IrpSp = IoGetNextIrpStackLocation(Irp);
+
+    if (IrpSp == NULL) {
+
+        IoFreeIrp(Irp);
+        Irp = NULL;
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    IrpSp->FileObject   = ConnectionObject;
+    IrpSp->DeviceObject = DeviceObject;
+
+    Status = IoCallDriver(DeviceObject, Irp);
+
+    if (Status == STATUS_PENDING) {
+
+        KeWaitForSingleObject(
+            &Event,
+            Executive,
+            KernelMode,
+            FALSE,
+            NULL
+            );
+
+        Status = IoStatus.Status;
+    }
+
+
+    if (NT_SUCCESS(Status)) {
+        *Length = IoStatus.Information;
+    } else {
+        cfs_enter_debugger();
+        memset(OptionValue, 0, *Length);
+        Status = STATUS_SUCCESS;
+    }
+
+errorout:
+
+    ks_put_tconn(tconn);
+
+    return cfs_error_code(Status);
+}
+
+/*
+ * ks_set_tcp_option
+ *   Set the options for the tcp stream connection
+ *
+ * Arguments:
+ *   tconn:       the tdi connection
+ *   ID:          option id
+ *   OptionValue: buffer containing the new option value
+ *   Length:      the length of the value
+ *
+ * Return Value:
+ *   int: ks return code
+ *
+ * NOTES:
+ *   N/A
+ */
+
+int
+ks_set_tcp_option (
+    ksock_tconn_t * tconn,
+    ULONG           ID,
+    PVOID           OptionValue,
+    ULONG           Length
+    )
+{
+    NTSTATUS            Status = STATUS_SUCCESS;
+
+    IO_STATUS_BLOCK     IoStatus;
+
+    ULONG               SetInfoExLength;
+    PTCP_REQUEST_SET_INFORMATION_EX SetInfoEx = NULL;
+
+    PFILE_OBJECT        ConnectionObject;
+    PDEVICE_OBJECT      DeviceObject = NULL;
+
+    PIRP                Irp = NULL;
+    PIO_STACK_LOCATION  IrpSp = NULL;
+
+    PKEVENT             Event;
+
+    /* make sure the tdi connection is connected */
+
+    ks_get_tconn(tconn);
+
+    if (tconn->kstc_state != ksts_connected) {
+        Status = STATUS_INVALID_PARAMETER;
+        goto errorout;
+    }
+
+    LASSERT(tconn->kstc_type == kstt_sender ||
+            tconn->kstc_type == kstt_child);
+
+    if (tconn->kstc_type == kstt_sender) {
+        ConnectionObject = tconn->sender.kstc_info.FileObject;
+    } else {
+        ConnectionObject = tconn->child.kstc_info.FileObject;
+    }
+
+    SetInfoExLength = sizeof(TCP_REQUEST_SET_INFORMATION_EX) - 1 + Length + sizeof(KEVENT);
+
+    SetInfoEx = ExAllocatePoolWithTag(
+                    NonPagedPool,
+                    SetInfoExLength,
+                    'TSSK'
+                    );
+
+    if (SetInfoEx == NULL) {
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    SetInfoEx->ID.toi_id = ID;
+
+    SetInfoEx->ID.toi_type   = INFO_TYPE_CONNECTION;
+    SetInfoEx->ID.toi_class  = INFO_CLASS_PROTOCOL;
+    SetInfoEx->ID.toi_entity.tei_entity   = CO_TL_ENTITY;
+    SetInfoEx->ID.toi_entity.tei_instance = TL_INSTANCE;
+
+    SetInfoEx->BufferSize = Length;
+    RtlCopyMemory(&(SetInfoEx->Buffer[0]), OptionValue, Length);
+
+    Event = (PKEVENT)(&(SetInfoEx->Buffer[Length]));
+    KeInitializeEvent(Event, NotificationEvent, FALSE);
+
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    Irp = IoBuildDeviceIoControlRequest(
+                IOCTL_TCP_SET_INFORMATION_EX,
+                DeviceObject,
+                SetInfoEx,
+                SetInfoExLength,
+                NULL,
+                0,
+                FALSE,
+                Event,
+                &IoStatus
+                );
+
+    if (Irp == NULL) {
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    IrpSp = IoGetNextIrpStackLocation(Irp);
+
+    if (IrpSp == NULL) {
+        IoFreeIrp(Irp);
+        Irp = NULL;
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    IrpSp->FileObject   = ConnectionObject;
+    IrpSp->DeviceObject = DeviceObject;
+
+    Status = IoCallDriver(DeviceObject, Irp);
+
+    if (Status == STATUS_PENDING) {
+
+        KeWaitForSingleObject(
+            Event,
+            Executive,
+            KernelMode,
+            FALSE,
+            NULL
+            );
+
+        Status = IoStatus.Status;
+    }
+
+errorout:
+
+    if (SetInfoEx) {
+        ExFreePool(SetInfoEx);
+    }
+
+    if (!NT_SUCCESS(Status)) {
+        printk("ks_set_tcp_option: error setting tcp option: ID (%d), Status = %xh\n",
+               ID, Status);
+        Status = STATUS_SUCCESS;
+    }
+
+    ks_put_tconn(tconn);
+
+    return cfs_error_code(Status);
+}
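+
+/*
+ * Illustrative sketch (not part of the driver): disabling the Nagle
+ * algorithm on a connected tconn via ks_set_tcp_option. The TDI
+ * information id TCP_SOCKET_NODELAY is an assumption here; any other
+ * connection-class option id would follow the same pattern.
+ */
+#if 0
+static int
+KsExampleDisableNagle(ksock_tconn_t *tconn)
+{
+    ULONG value = 1;    /* non-zero disables Nagle's algorithm */
+
+    return ks_set_tcp_option(tconn, TCP_SOCKET_NODELAY,
+                             &value, sizeof(value));
+}
+#endif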
+
+/*
+ * ks_bind_tconn
+ *   bind the tdi connection object with an address
+ *
+ * Arguments:
+ *   tconn:  tconn to be bound
+ *   parent: the parent tconn object
+ *   addr:   the ip address
+ *   port:   the port number
+ *
+ * Return Value:
+ *   int: 0 for success or ks error codes.
+ *
+ * NOTES:
+ *   N/A
+ */
+
+int
+ks_bind_tconn (
+    ksock_tconn_t * tconn,
+    ksock_tconn_t * parent,
+    ulong_ptr       addr,
+    unsigned short  port
+    )
+{
+    NTSTATUS        status;
+    int             rc = 0;
+
+    ksock_tdi_addr_t taddr;
+
+    memset(&taddr, 0, sizeof(ksock_tdi_addr_t));
+
+    if (tconn->kstc_state != ksts_inited) {
+
+        status = STATUS_INVALID_PARAMETER;
+        rc = cfs_error_code(status);
+
+        goto errorout;
+
+    } else if (tconn->kstc_type == kstt_child) {
+
+        if (NULL == parent) {
+            status = STATUS_INVALID_PARAMETER;
+            rc = cfs_error_code(status);
+
+            goto errorout;
+        }
+
+        /* reference its parent's address object */
+
+        taddr = parent->kstc_addr;
+        ObReferenceObject(taddr.FileObject);
+
+        ks_get_tconn(parent);
+
+    } else {
+
+        PTRANSPORT_ADDRESS TdiAddress = &(taddr.Tdi);
+        ULONG              AddrLen = 0;
+
+        /* initialize the tdi address */
+
+        TdiAddress->TAAddressCount = 1;
+        TdiAddress->Address[0].AddressLength = TDI_ADDRESS_LENGTH_IP;
+        TdiAddress->Address[0].AddressType   = TDI_ADDRESS_TYPE_IP;
+
+        ((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->sin_port = htons(port);
+        ((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->in_addr = htonl(addr);
+
+        memset(&(((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->sin_zero[0]),0,8);
+
+
+        /* open the transport address object */
+
+        AddrLen = FIELD_OFFSET(TRANSPORT_ADDRESS, Address->Address) +
+                  TDI_ADDRESS_LENGTH_IP;
+
+        status = KsOpenAddress(
+                    &(tconn->kstc_dev),
+                    &(taddr.Tdi),
+                    AddrLen,
+                    &(taddr.Handle),
+                    &(taddr.FileObject)
+                    );
+
+        if (!NT_SUCCESS(status)) {
+
+            KsPrint((0, "ks_bind_tconn: failed to open ip addr object (%x:%d), status = %xh\n",
+                        addr, port, status ));
+            rc = cfs_error_code(status);
+            goto errorout;
+        }
+    }
+
+    if (tconn->kstc_type == kstt_child) {
+        tconn->child.kstc_parent = parent;
+    }
+
+    tconn->kstc_state = ksts_bind;
+    tconn->kstc_addr  = taddr;
+
+errorout:
+
+    return (rc);
+}
+
+/*
+ * ks_build_tconn
+ *   build a tcp/streaming connection to the remote peer
+ *
+ * Arguments:
+ *   tconn: tconn to be connected to the peer
+ *   addr:  the peer's ip address
+ *   port:  the peer's port number
+ *
+ * Return Value:
+ *   int: 0 for success or ks error codes.
+ *
+ * Notes:
+ *   N/A
+ */
+
+int
+ks_build_tconn(
+    ksock_tconn_t * tconn,
+    ulong_ptr       addr,
+    unsigned short  port
+    )
+{
+    int             rc = 0;
+    NTSTATUS        status = STATUS_SUCCESS;
+
+
+    PFILE_OBJECT    ConnectionObject = NULL;
+    PDEVICE_OBJECT  DeviceObject = NULL;
+
+    PTDI_CONNECTION_INFORMATION ConnectionInfo = NULL;
+    ULONG           AddrLength;
+
+    PIRP            Irp = NULL;
+
+    LASSERT(tconn->kstc_type == kstt_sender);
+    LASSERT(tconn->kstc_state == ksts_bind);
+
+    ks_get_tconn(tconn);
+
+    {
+        /* set the event callbacks */
+        rc = ks_set_handlers(tconn);
+
+        if (rc < 0) {
+            cfs_enter_debugger();
+            goto errorout;
+        }
+    }
+
+    /* create the connection file handle / object */
+    status = KsOpenConnection(
+                &(tconn->kstc_dev),
+                (CONNECTION_CONTEXT)tconn,
+                &(tconn->sender.kstc_info.Handle),
+                &(tconn->sender.kstc_info.FileObject)
+                );
+
+    if (!NT_SUCCESS(status)) {
+        rc = cfs_error_code(status);
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    /* associate the connection with the address object of the tconn */
+
+    status = KsAssociateAddress(
+                tconn->kstc_addr.Handle,
+                tconn->sender.kstc_info.FileObject
+                );
+
+    if (!NT_SUCCESS(status)) {
+        rc = cfs_error_code(status);
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    tconn->kstc_state = ksts_associated;
+
+    /* Allocating Connection Info Together with the Address */
+    AddrLength = FIELD_OFFSET(TRANSPORT_ADDRESS, Address->Address)
+                 + TDI_ADDRESS_LENGTH_IP;
+
+    ConnectionInfo = (PTDI_CONNECTION_INFORMATION)ExAllocatePoolWithTag(
+        NonPagedPool, sizeof(TDI_CONNECTION_INFORMATION) + AddrLength, 'iCsK');
+
+    if (NULL == ConnectionInfo) {
+
+        status = STATUS_INSUFFICIENT_RESOURCES;
+        rc = cfs_error_code(status);
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    /* Initializing ConnectionInfo ... */
+    {
+        PTRANSPORT_ADDRESS TdiAddress;
+
+        /* ConnectionInfo settings */
+
+        ConnectionInfo->UserDataLength = 0;
+        ConnectionInfo->UserData = NULL;
+        ConnectionInfo->OptionsLength = 0;
+        ConnectionInfo->Options = NULL;
+        ConnectionInfo->RemoteAddressLength = AddrLength;
+        ConnectionInfo->RemoteAddress = ConnectionInfo + 1;
+
+
+        /* initialize the tdi address */
+
+        TdiAddress = ConnectionInfo->RemoteAddress;
+
+        TdiAddress->TAAddressCount = 1;
+        TdiAddress->Address[0].AddressLength = TDI_ADDRESS_LENGTH_IP;
+        TdiAddress->Address[0].AddressType   = TDI_ADDRESS_TYPE_IP;
+
+        ((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->sin_port = htons(port);
+        ((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->in_addr = htonl(addr);
+
+        memset(&(((PTDI_ADDRESS_IP)&(TdiAddress->Address[0].Address))->sin_zero[0]),0,8);
+    }
+
+    /* Now prepare to connect the remote peer ... */
+
+    ConnectionObject = tconn->sender.kstc_info.FileObject;
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    /* allocate a new Irp */
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        status = STATUS_INSUFFICIENT_RESOURCES;
+        rc = cfs_error_code(status);
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    /* setup the Irp */
+
+    TdiBuildConnect(
+        Irp,
+        DeviceObject,
+        ConnectionObject,
+        NULL,
+        NULL,
+        NULL,
+        ConnectionInfo,
+        NULL
+        );
+
+
+    /* submit the Irp to the underlying transport driver */
+    status = KsSubmitTdiIrp(
+                DeviceObject,
+                Irp,
+                TRUE,
+                NULL
+                );
+
+    spin_lock(&(tconn->kstc_lock));
+
+    if (NT_SUCCESS(status)) {
+
+        /* Connected! the connection is built successfully. */
+
+        tconn->kstc_state = ksts_connected;
+
+        tconn->sender.kstc_info.ConnectionInfo = ConnectionInfo;
+        tconn->sender.kstc_info.Remote         = ConnectionInfo->RemoteAddress;
+
+        spin_unlock(&(tconn->kstc_lock));
+
+    } else {
+
+        /* Not connected! Abort it ... */
+
+        if (rc != 0) {
+            cfs_enter_debugger();
+        }
+
+        Irp = NULL;
+        rc = cfs_error_code(status);
+
+        tconn->kstc_state = ksts_associated;
+        spin_unlock(&(tconn->kstc_lock));
+
+        /* disassociate the connection and the address object;
+           after cleanup, it's safe to set the state to abort ... */
+
+        if ( NT_SUCCESS(KsDisassociateAddress(
+                        tconn->sender.kstc_info.FileObject))) {
+            tconn->kstc_state = ksts_aborted;
+        }
+
+        /* reset the event callbacks */
+        rc = ks_reset_handlers(tconn);
+
+        goto errorout;
+    }
+
+errorout:
+
+    if (NT_SUCCESS(status)) {
+
+        ks_query_local_ipaddr(tconn);
+
+    } else {
+
+        if (ConnectionInfo) {
+            ExFreePool(ConnectionInfo);
+        }
+        if (Irp) {
+            IoFreeIrp(Irp);
+        }
+    }
+
+    ks_put_tconn(tconn);
+
+    return (rc);
+}
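+
+/*
+ * Illustrative sketch (not part of the driver): the active-connect flow the
+ * routines above implement. Error handling (NULL tconn, non-zero return
+ * codes) is elided; real callers must check each step.
+ */
+#if 0
+static int
+KsExampleConnect(ulong_ptr peer_ip, unsigned short peer_port)
+{
+    ksock_tconn_t *tconn = ks_create_tconn();
+
+    ks_init_sender(tconn);              /* mark it as the active side */
+    ks_bind_tconn(tconn, NULL, 0, 0);   /* bind any local address/port */
+
+    return ks_build_tconn(tconn, peer_ip, peer_port);
+}
+#endif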
+
+
+/*
+ * ks_disconnect_tconn
+ *   disconnect the tconn from a connection
+ *
+ * Arguments:
+ *   tconn: the tdi connection object, connected already
+ *   flags: flags & options for disconnecting
+ *
+ * Return Value:
+ *   int: ks error code
+ *
+ * Notes:
+ *   N/A
+ */
+
+int
+ks_disconnect_tconn(
+    ksock_tconn_t * tconn,
+    ulong_ptr       flags
+    )
+{
+    NTSTATUS        status = STATUS_SUCCESS;
+
+    ksock_tconn_info_t * info;
+
+    PFILE_OBJECT    ConnectionObject;
+    PDEVICE_OBJECT  DeviceObject = NULL;
+
+    PIRP            Irp = NULL;
+
+    KEVENT          Event;
+
+    ks_get_tconn(tconn);
+
+    /* make sure it's connected already and it
+       must be a sender or a child ... */
+
+    LASSERT(tconn->kstc_state == ksts_connected);
+    LASSERT( tconn->kstc_type == kstt_sender ||
+             tconn->kstc_type == kstt_child);
+
+    /* reset all the event handlers to NULL */
+
+    if (tconn->kstc_type != kstt_child) {
+        ks_reset_handlers (tconn);
+    }
+
+    /* Disconnecting from the remote peer ... */
+
+    if (tconn->kstc_type == kstt_sender) {
+        info = &(tconn->sender.kstc_info);
+    } else {
+        info = &(tconn->child.kstc_info);
+    }
+
+    ConnectionObject = info->FileObject;
+    DeviceObject = IoGetRelatedDeviceObject(ConnectionObject);
+
+    /* allocate an Irp and set it up */
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        status = STATUS_INSUFFICIENT_RESOURCES;
+        cfs_enter_debugger();
+        goto errorout;
+    }
+
+    KeInitializeEvent(
+            &Event,
+            SynchronizationEvent,
+            FALSE
+            );
+
+    TdiBuildDisconnect(
+            Irp,
+            DeviceObject,
+            ConnectionObject,
+            KsDisconectCompletionRoutine,
+            &Event,
+            NULL,
+            flags,
+            NULL,
+            NULL
+            );
+
+    /* issue the Irp to the underlying transport
+       driver to disconnect the connection */
+
+    status = IoCallDriver(DeviceObject, Irp);
+
+    if (STATUS_PENDING == status) {
+
+        status = KeWaitForSingleObject(
+                     &Event,
+                     Executive,
+                     KernelMode,
+                     FALSE,
+                     NULL
+                     );
+
+        status = Irp->IoStatus.Status;
+    }
+
+    KsPrint((2, "KsDisconnect: Disconnection is done with Status = %xh (%s) ...\n",
+                status, KsNtStatusToString(status)));
+
+    IoFreeIrp(Irp);
+
+    if (info->ConnectionInfo) {
+
+        /* disassociate the connection and address objects */
+
+        status = KsDisassociateAddress(ConnectionObject);
+
+        if (!NT_SUCCESS(status)) {
+            cfs_enter_debugger();
+        }
+
+        spin_lock(&(tconn->kstc_lock));
+
+        /* cleanup the tsdumgr lists */
+        KsCleanupTsdu (tconn);
+
+        /* set the state of the tconn */
+        if (NT_SUCCESS(status)) {
+            tconn->kstc_state = ksts_disconnected;
+        } else {
+            tconn->kstc_state = ksts_associated;
+        }
+
+        /* free the connection info to the system pool */
+        ExFreePool(info->ConnectionInfo);
+        info->ConnectionInfo = NULL;
+        info->Remote = NULL;
+
+        spin_unlock(&(tconn->kstc_lock));
+    }
+
+    status = STATUS_SUCCESS;
+
+errorout:
+
+    ks_put_tconn(tconn);
+
+    return cfs_error_code(status);
+}
+
+
+/*
+ * ks_abort_tconn
+ *   The connection is broken unexpectedly. We need to do
+ *   some cleanup.
+ *
+ * Arguments:
+ *   tconn: the tdi connection
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void
+ks_abort_tconn(
+    ksock_tconn_t * tconn
+    )
+{
+    PKS_DISCONNECT_WORKITEM WorkItem = NULL;
+
+    WorkItem = &(tconn->kstc_disconnect);
+
+    ks_get_tconn(tconn);
+    spin_lock(&(tconn->kstc_lock));
+
+    if (tconn->kstc_state != ksts_connected) {
+        ks_put_tconn(tconn);
+    } else {
+
+        if (!cfs_is_flag_set(tconn->kstc_flags, KS_TCONN_DISCONNECT_BUSY)) {
+
+            WorkItem->Flags = TDI_DISCONNECT_ABORT;
+            WorkItem->tconn = tconn;
+
+            cfs_set_flag(tconn->kstc_flags, KS_TCONN_DISCONNECT_BUSY);
+
+            ExQueueWorkItem(
+                    &(WorkItem->WorkItem),
+                    DelayedWorkQueue
+                    );
+        }
+    }
+
+    spin_unlock(&(tconn->kstc_lock));
+}
+
+
+/*
+ * ks_query_local_ipaddr
+ *   query the local connection ip address
+ *
+ * Arguments:
+ *   tconn: the tconn which is connected
+ *
+ * Return Value:
+ *   int: ks error code
+ *
+ * Notes:
+ *   N/A
+ */
+
+int
+ks_query_local_ipaddr(
+    ksock_tconn_t * tconn
+    )
+{
+    PFILE_OBJECT    FileObject = NULL;
+    NTSTATUS        status;
+
+    PTRANSPORT_ADDRESS TdiAddress;
+    ULONG              AddressLength;
+
+    if (tconn->kstc_type == kstt_sender) {
+        FileObject = tconn->sender.kstc_info.FileObject;
+    } else if (tconn->kstc_type == kstt_child) {
+        FileObject = tconn->child.kstc_info.FileObject;
+    } else {
+        status = STATUS_INVALID_PARAMETER;
+        goto errorout;
+    }
+
+    TdiAddress = &(tconn->kstc_addr.Tdi);
+    AddressLength = MAX_ADDRESS_LENGTH;
+
+    status = KsQueryIpAddress(FileObject, TdiAddress, &AddressLength);
+
+    if (NT_SUCCESS(status)) {
+
+        KsPrint((0, "ks_query_local_ipaddr: Local ip address = %xh port = %xh\n",
+                ((PTDI_ADDRESS_IP)(&(TdiAddress->Address[0].Address)))->in_addr,
+                ((PTDI_ADDRESS_IP)(&(TdiAddress->Address[0].Address)))->sin_port ));
+    } else {
+        KsPrint((0, "ks_query_local_ipaddr: Failed to query the connection's local ip address.\n"));
+    }
+
+errorout:
+
+    return cfs_error_code(status);
+}
+
+/*
+ * ks_send_mdl
+ *   send a MDL chain to the peer for a stream connection
+ *
+ * Arguments:
+ *   tconn: tdi connection object
+ *   tx:    the transmit context
+ *   mdl:   the mdl chain containing the data
+ *   len:   length of the data
+ *   flags: flags of the transmission
+ *
+ * Return Value:
+ *   ks return code
+ *
+ * Notes:
+ *   N/A
+ */
+
+int
+ks_send_mdl(
+    ksock_tconn_t * tconn,
+    void *          tx,
+    ksock_mdl_t *   mdl,
+    int             len,
+    int             flags
+    )
+{
+    NTSTATUS        Status;
+    int             rc = 0;
+    ulong_ptr       length;
+    ulong_ptr       tflags;
+    ksock_tdi_tx_t * context;
+
+    PKS_CHAIN       KsChain;
+    PKS_TSDUMGR     KsTsduMgr;
+    PKS_TSDU        KsTsdu;
+    PKS_TSDU_BUF    KsTsduBuf;
+    PKS_TSDU_DAT    KsTsduDat;
+
+    BOOLEAN         bNewTsdu = FALSE;   /* newly allocated */
+    BOOLEAN         bNewBuff = FALSE;   /* newly allocated */
+
+    BOOLEAN         bBuffed;            /* buffered sending */
+
+    PUCHAR          Buffer = NULL;
+    ksock_mdl_t *   NewMdl = NULL;
+
+    PIRP            Irp = NULL;
+    PFILE_OBJECT    ConnObject;
+    PDEVICE_OBJECT  DeviceObject;
+
+    BOOLEAN         bIsNonBlock;
+
+    ks_get_tconn(tconn);
+
+    tflags = ks_tdi_send_flags(flags);
+    bIsNonBlock = cfs_is_flag_set(flags, MSG_DONTWAIT);
+
+    spin_lock(&tconn->kstc_lock);
+
+    LASSERT( tconn->kstc_type == kstt_sender ||
+             tconn->kstc_type == kstt_child );
+
+    if (tconn->kstc_state != ksts_connected) {
+        spin_unlock(&tconn->kstc_lock);
+        ks_put_tconn(tconn);
+        return -ENOTCONN;
+    }
+
+    /* get the latest Tsdu buffer from the TsduMgr list.
+       just set NULL if the list is empty. */
+
+    if (tconn->kstc_type == kstt_sender) {
+        KsChain = &(tconn->sender.kstc_send);
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        KsChain = &(tconn->child.kstc_send);
+    }
+
+    if (cfs_is_flag_set(tflags, TDI_SEND_EXPEDITED)) {
+        KsTsduMgr = &(KsChain->Expedited);
+    } else {
+        KsTsduMgr = &(KsChain->Normal);
+    }
+
+    if (KsTsduMgr->TotalBytes + len <= tconn->kstc_snd_wnd) {
+        bBuffed = TRUE;
+    } else {
+        bBuffed = FALSE;
+    }
+
+    /* do the preparation work for buffered sending */
+
+    if (bBuffed) {
+
+        /* if the data is even larger than the biggest Tsdu, we have
+           to allocate a new buffer and use TSDU_TYPE_BUF to store it */
+
+        if ( KS_TSDU_STRU_SIZE((ULONG)len) > ks_data.ksnd_tsdu_size
+             - KS_DWORD_ALIGN(sizeof(KS_TSDU))) {
+            bNewBuff = TRUE;
+        }
+
+        if (list_empty(&(KsTsduMgr->TsduList))) {
+
+            LASSERT(KsTsduMgr->NumOfTsdu == 0);
+            KsTsdu = NULL;
+
+        } else {
+
+            LASSERT(KsTsduMgr->NumOfTsdu > 0);
+            KsTsdu = list_entry(KsTsduMgr->TsduList.prev, KS_TSDU, Link);
+            LASSERT(KsTsdu->Magic == KS_TSDU_MAGIC);
+
+
+            /* check whether the KsTsdu free space is enough, or we need
+               to allocate a new Tsdu */
+            if (bNewBuff) {
+                if (sizeof(KS_TSDU_BUF) + KsTsdu->LastOffset > KsTsdu->TotalLength) {
+                    KsTsdu = NULL;
+                }
+            } else {
+                if ( KS_TSDU_STRU_SIZE((ULONG)len) >
+                     KsTsdu->TotalLength - KsTsdu->LastOffset ) {
+                    KsTsdu = NULL;
+                }
+            }
+        }
+
+        /* if there's no Tsdu or the free size is not enough for the
+           KS_TSDU_BUF or KS_TSDU_DAT, we need to allocate a new Tsdu. */
+
+        if (NULL == KsTsdu) {
+
+            KsTsdu = KsAllocateKsTsdu();
+
+            if (NULL == KsTsdu) {
+                bBuffed  = FALSE;
+                bNewBuff = FALSE;
+            } else {
+                bNewTsdu = TRUE;
+            }
+        }
+
+        /* process the case that a new buffer is to be allocated from system memory */
+        if (bNewBuff) {
+
+            /* now allocating an internal buffer to contain the payload */
+            Buffer = ExAllocatePool(NonPagedPool, len);
+
+            if (NULL == Buffer) {
+                bBuffed = FALSE;
+            }
+        }
+    }
+
+    if (bBuffed) {
+
+        if (bNewBuff) {
+
+            /* queue a new KS_TSDU_BUF to the Tsdu buffer */
+            KsTsduBuf = (PKS_TSDU_BUF)((PUCHAR)KsTsdu + KsTsdu->LastOffset);
+
+            KsTsduBuf->TsduFlags   = 0;
+            KsTsduBuf->DataLength  = (ULONG)len;
+            KsTsduBuf->StartOffset = 0;
+            KsTsduBuf->UserBuffer  = Buffer;
+        } else {
+            /* queue a new KS_TSDU_DAT to the Tsdu buffer */
+            KsTsduDat = (PKS_TSDU_DAT)((PUCHAR)KsTsdu + KsTsdu->LastOffset);
+
+            KsTsduDat->TsduFlags   = 0;
+            KsTsduDat->DataLength  = (ULONG)len;
+            KsTsduDat->StartOffset = 0;
+            KsTsduDat->TotalLength = KS_TSDU_STRU_SIZE((ULONG)len);
+
+            Buffer = &KsTsduDat->Data[0];
+        }
+
+        /* now lock the Buffer and copy the user payload into it */
+        ASSERT(Buffer != NULL);
+
+        rc = ks_lock_buffer(Buffer, FALSE, len, IoReadAccess, &NewMdl);
+        if (rc != 0) {
+            printk("ks_send_mdl: buffered: error allocating mdl.\n");
+            bBuffed = FALSE;
+        } else {
+            ULONG BytesCopied = 0;
+            TdiCopyMdlToBuffer(mdl, 0, Buffer, 0, (ULONG)len, &BytesCopied);
+            if (BytesCopied != (ULONG) len) {
+                bBuffed = FALSE;
+            }
+        }
+
+        /* Do the finalizing job if we succeeded in locking the buffer and
+           moving the user data. Otherwise we need to clean up ... */
+        if (bBuffed) {
+
+            if (bNewBuff) {
+                KsTsduBuf->TsduType = TSDU_TYPE_BUF;
+                KsTsdu->LastOffset += sizeof(KS_TSDU_BUF);
+
+            } else {
+                KsTsduDat->TsduType = TSDU_TYPE_DAT;
+                KsTsdu->LastOffset += KsTsduDat->TotalLength;
+            }
+
+            /* attach it to the TsduMgr list if the Tsdu is newly created. */
+            if (bNewTsdu) {
+
+                list_add_tail(&(KsTsdu->Link), &(KsTsduMgr->TsduList));
+                KsTsduMgr->NumOfTsdu++;
+            }
+
+        } else {
+
+            if (NewMdl) {
+                ks_release_mdl(NewMdl, FALSE);
+                NewMdl = NULL;
+            }
+
+            if (bNewBuff) {
+                ExFreePool(Buffer);
+                Buffer = NULL;
+                bNewBuff = FALSE;
+            }
+        }
+    }
+
+    /* update the TotalBytes currently being sent */
+    KsTsduMgr->TotalBytes += (ULONG)len;
+
+    spin_unlock(&tconn->kstc_lock);
+
+    /* cleanup the Tsdu if not successful */
+    if (!bBuffed && bNewTsdu) {
+        KsPutKsTsdu(KsTsdu);
+        bNewTsdu = FALSE;
+        KsTsdu = NULL;
+    }
+
+    /* we need to allocate the ksock_tdi_tx_t structure from the memory pool. */
+
+    context = cfs_alloc(sizeof(ksock_tdi_tx_t) + sizeof(KEVENT),0);
+    if (!context) {
+        /* release the chained mdl */
+        ks_release_mdl(mdl, FALSE);
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    /* initialize the TcpContext */
+
+    memset(context,0, sizeof(ksock_tdi_tx_t) + sizeof(KEVENT));
+
+    context->tconn = tconn;
+    context->Event = (PKEVENT) ((PUCHAR)context + sizeof(ksock_tdi_tx_t));
+
+    KeInitializeEvent(context->Event, SynchronizationEvent, FALSE);
+
+    if (bBuffed) {
+
+        /* for buffered transmission, we need to set
+           the internal completion routine. */
+
+        context->CompletionRoutine  = KsTcpSendCompletionRoutine;
+        context->KsTsduMgr          = KsTsduMgr;
+        context->CompletionContext  = KsTsdu;
+        context->CompletionContext2 = (bNewBuff ? (PVOID)KsTsduBuf : (PVOID)KsTsduDat);
+        context->bCounted           = FALSE;
+
+    } else if (bIsNonBlock) {
+
+        /* for non-blocking transmission, we need to set
+           the internal completion routine too. */
+
+        context->CompletionRoutine = KsTcpSendCompletionRoutine;
+        context->CompletionContext = tx;
+        context->KsTsduMgr         = KsTsduMgr;
+        context->bCounted          = TRUE;
+        context->ReferCount        = 2;
+    }
+
+    if (tconn->kstc_type == kstt_sender) {
+        ConnObject = tconn->sender.kstc_info.FileObject;
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        ConnObject = tconn->child.kstc_info.FileObject;
+    }
+
+    DeviceObject = IoGetRelatedDeviceObject(ConnObject);
+
+    Irp = KsBuildTdiIrp(DeviceObject);
+
+    if (NULL == Irp) {
+
+        /* release the chained mdl */
+        ks_release_mdl(mdl, FALSE);
+
+        Status = STATUS_INSUFFICIENT_RESOURCES;
+        goto errorout;
+    }
+
+    length = KsQueryMdlsSize(mdl);
+
+    LASSERT((ULONG)len <= length);
+
+    ks_get_tconn(tconn);
+
+    TdiBuildSend(
+        Irp,
+        DeviceObject,
+        ConnObject,
+        KsTcpCompletionRoutine,
+        context,
+        (bBuffed ? NewMdl : mdl),
+        (bBuffed ? (tflags | TDI_SEND_NON_BLOCKING) : tflags),
+        (ULONG)len
+        );
+
+    Status = IoCallDriver(DeviceObject, Irp);
+
+    if (bBuffed) {
+        ks_release_mdl(mdl, FALSE);
+        NewMdl = NULL;
+    }
+
+    if (!NT_SUCCESS(Status)) {
+        cfs_enter_debugger();
+        rc = cfs_error_code(Status);
+        goto errorout;
+    }
+
+    if (bBuffed) {
+        Status = STATUS_SUCCESS;
+        rc  = len;
+        context = NULL;
+    } else {
+        if (bIsNonBlock) {
+            if (InterlockedDecrement(&context->ReferCount) == 0) {
+                Status = Irp->IoStatus.Status;
+            } else {
+                Status = STATUS_PENDING;
+                context = NULL;
+            }
+        } else {
+            if (STATUS_PENDING == Status) {
+                Status = KeWaitForSingleObject(
+                             context->Event,
+                             Executive,
+                             KernelMode,
+                             FALSE,
+                             NULL
+                             );
+
+                if (NT_SUCCESS(Status)) {
+                    Status = Irp->IoStatus.Status;
+                }
+            }
+        }
+
+        if (Status == STATUS_SUCCESS) {
+            rc = (int)(Irp->IoStatus.Information);
+
+            spin_lock(&tconn->kstc_lock);
+            KsTsduMgr->TotalBytes -= rc;
+            spin_unlock(&tconn->kstc_lock);
+
+        } else {
+            rc = cfs_error_code(Status);
+        }
+    }
+
+errorout:
+
+    if (bBuffed) {
+
+        if (NewMdl) {
+            ks_release_mdl(NewMdl, FALSE);
+            NewMdl = NULL;
+        }
+
+        if (bNewBuff) {
+            if (!NT_SUCCESS(Status)) {
+                ExFreePool(Buffer);
+                Buffer = NULL;
+            }
+        }
+
+    } else {
+
+        if (Status != STATUS_PENDING) {
+
+            if (Irp) {
+
+                /* Freeing the Irp ... */
+
+                IoFreeIrp(Irp);
+                Irp = NULL;
+            }
+        }
+    }
+
+    if (!NT_SUCCESS(Status)) {
+
+        spin_lock(&tconn->kstc_lock);
+
+        KsTsduMgr->TotalBytes -= (ULONG)len;
+
+        if (bBuffed) {
+
+            /* detach it from the TsduMgr list if the Tsdu was newly created. */
+            if (bNewTsdu) {
+
+                list_del(&(KsTsdu->Link));
+                KsTsduMgr->NumOfTsdu--;
+
+                KsPutKsTsdu(KsTsdu);
+            } else {
+                if (bNewBuff) {
+                    if ( (ulong_ptr)KsTsduBuf + sizeof(KS_TSDU_BUF) ==
+                         (ulong_ptr)KsTsdu + KsTsdu->LastOffset) {
+                        KsTsdu->LastOffset -= sizeof(KS_TSDU_BUF);
+                        KsTsduBuf->TsduType = 0;
+                    } else {
+                        cfs_enter_debugger();
+                        KsTsduBuf->StartOffset = KsTsduBuf->DataLength;
+                    }
+                } else {
+                    if ( (ulong_ptr)KsTsduDat + KsTsduDat->TotalLength ==
+                         (ulong_ptr)KsTsdu + KsTsdu->LastOffset) {
+                        KsTsdu->LastOffset -= KsTsduDat->TotalLength;
+                        KsTsduDat->TsduType = 0;
+                    } else {
+                        cfs_enter_debugger();
+                        KsTsduDat->StartOffset = KsTsduDat->DataLength;
+                    }
+                }
+            }
+        }
+
+        spin_unlock(&tconn->kstc_lock);
+    }
+
+    /* free the context if it is not used at all */
+    if (context) {
+        cfs_free(context);
+    }
+
+    ks_put_tconn(tconn);
+
+    return rc;
+}
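+
+/*
+ * Illustrative sketch (not part of the driver): a non-blocking send with
+ * ks_send_mdl. With MSG_DONTWAIT set the call never sleeps; a byte count
+ * or a negative ks error code comes back immediately. The tx completion
+ * cookie is elided (NULL) for the sketch.
+ */
+#if 0
+static int
+KsExampleSendNonBlock(ksock_tconn_t *tconn, ksock_mdl_t *mdl, int len)
+{
+    int rc = ks_send_mdl(tconn, NULL, mdl, len, MSG_DONTWAIT);
+
+    if (rc < 0) {
+        /* transport window full or error: caller should retry later */
+    }
+    return rc;
+}
+#endif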
+        goto errorout;
+    }
+
+    if (tconn->kstc_type == kstt_sender) {
+        KsChain = &(tconn->sender.kstc_recv);
+    } else {
+        LASSERT(tconn->kstc_type == kstt_child);
+        KsChain = &(tconn->child.kstc_recv);
+    }
+
+    if (bIsExpedited) {
+        KsTsduMgr = &(KsChain->Expedited);
+    } else {
+        KsTsduMgr = &(KsChain->Normal);
+    }
+
+NextTsdu:
+
+    if (list_empty(&(KsTsduMgr->TsduList))) {
+
+        //
+        // The manager's Event is a notification event: reset it to the
+        // non-signaled state while there are no tsdus queued.
+        //
+
+        KeResetEvent(&(KsTsduMgr->Event));
+
+    } else {
+
+        KsTsdu = list_entry(KsTsduMgr->TsduList.next, KS_TSDU, Link);
+        LASSERT(KsTsdu->Magic == KS_TSDU_MAGIC);
+
+        /* remove the KsTsdu from the TsduMgr list so the lock can be dropped */
+        list_del(&(KsTsdu->Link));
+        KsTsduMgr->NumOfTsdu--;
+
+        spin_unlock(&(tconn->kstc_lock));
+
+        while ((ULONG)size > BytesRecved) {
+
+            ULONG BytesCopied = 0;
+            ULONG BytesToCopy = 0;
+            ULONG StartOffset = 0;
+
+            KsTsduDat = (PKS_TSDU_DAT)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+            KsTsduBuf = (PKS_TSDU_BUF)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+            KsTsduMdl = (PKS_TSDU_MDL)((PUCHAR)KsTsdu + KsTsdu->StartOffset);
+
+            if ( TSDU_TYPE_DAT == KsTsduDat->TsduType ||
+                 TSDU_TYPE_BUF == KsTsduBuf->TsduType ) {
+
+                //
+                // Data Tsdu Unit ...
+                //
+
+                if (TSDU_TYPE_DAT == KsTsduDat->TsduType) {
+
+                    if (cfs_is_flag_set(KsTsduDat->TsduFlags, KS_TSDU_DAT_RECEIVING)) {
+                        /* data is not ready yet */
+                        KeResetEvent(&(KsTsduMgr->Event));
+                        printk("ks_recv_mdl: KsTsduDat (%xh) is not ready yet!\n",
+                               KsTsduDat);
+                        break;
+                    }
+
+                    Buffer = &KsTsduDat->Data[0];
+                    StartOffset = KsTsduDat->StartOffset;
+                    if (KsTsduDat->DataLength - KsTsduDat->StartOffset > size - BytesRecved) {
+                        /* the recvmsg request can be satisfied ... */
+                        BytesToCopy = size - BytesRecved;
+                    } else {
+                        BytesToCopy = KsTsduDat->DataLength - KsTsduDat->StartOffset;
+                    }
+
+                } else {
+
+                    if (cfs_is_flag_set(KsTsduBuf->TsduFlags, KS_TSDU_BUF_RECEIVING)) {
+                        /* data is not ready yet */
+                        KeResetEvent(&(KsTsduMgr->Event));
+                        DbgPrint("ks_recv_mdl: KsTsduBuf (%xh) is not ready yet!\n",
+                                 KsTsduBuf);
+                        break;
+                    }
+
+                    ASSERT(TSDU_TYPE_BUF == KsTsduBuf->TsduType);
+                    Buffer = KsTsduBuf->UserBuffer;
+                    StartOffset = KsTsduBuf->StartOffset;
+
+                    if (KsTsduBuf->DataLength - KsTsduBuf->StartOffset > size - BytesRecved) {
+                        /* the recvmsg request can be satisfied ... */
+                        BytesToCopy = size - BytesRecved;
+                    } else {
+                        BytesToCopy = KsTsduBuf->DataLength - KsTsduBuf->StartOffset;
+                    }
+                }
+
+                if (BytesToCopy > 0) {
+                    Status = TdiCopyBufferToMdl(
+                                Buffer,
+                                StartOffset,
+                                BytesToCopy,
+                                mdl,
+                                BytesRecved,
+                                &BytesCopied
+                                );
+
+                    if (NT_SUCCESS(Status)) {
+
+                        if (BytesToCopy != BytesCopied) {
+                            cfs_enter_debugger();
+                        }
+
+                        BytesRecved += BytesCopied;
+                        RecvedOnce  += BytesCopied;
+
+                    } else {
+
+                        cfs_enter_debugger();
+
+                        if (STATUS_BUFFER_OVERFLOW == Status) {
+                        }
+                    }
+                }
+
+                if (TSDU_TYPE_DAT == KsTsduDat->TsduType) {
+
+                    KsTsduDat->StartOffset += BytesCopied;
+
+                    if (KsTsduDat->StartOffset == KsTsduDat->DataLength) {
+                        KsTsdu->StartOffset += KsTsduDat->TotalLength;
+                    }
+
+                } else {
+
+                    ASSERT(TSDU_TYPE_BUF == KsTsduBuf->TsduType);
+                    KsTsduBuf->StartOffset += BytesCopied;
+                    if (KsTsduBuf->StartOffset == KsTsduBuf->DataLength) {
+                        KsTsdu->StartOffset += sizeof(KS_TSDU_BUF);
+                        /* now we need to release the buffer back to the system pool */
+                        ExFreePool(KsTsduBuf->UserBuffer);
+                    }
+                }
+
+            } else if (TSDU_TYPE_MDL == KsTsduMdl->TsduType) {
+
+                //
+                // MDL Tsdu Unit ...
+                //
+
+                if (KsTsduMdl->DataLength > size - BytesRecved) {
+                    /* the recvmsg request can be satisfied ... */
+                    BytesToCopy = size - BytesRecved;
+                } else {
+                    BytesToCopy = KsTsduMdl->DataLength;
+                }
+
+                Status = KsCopyMdlChainToMdlChain(
+                            KsTsduMdl->Mdl,
+                            KsTsduMdl->StartOffset,
+                            mdl,
+                            BytesRecved,
+                            BytesToCopy,
+                            &BytesCopied
+                            );
+
+                if (NT_SUCCESS(Status)) {
+
+                    if (BytesToCopy != BytesCopied) {
+                        cfs_enter_debugger();
+                    }
+
+                    KsTsduMdl->StartOffset += BytesCopied;
+                    KsTsduMdl->DataLength -= BytesCopied;
+
+                    BytesRecved += BytesCopied;
+                    RecvedOnce  += BytesCopied;
+                } else {
+                    cfs_enter_debugger();
+                }
+
+                if (0 == KsTsduMdl->DataLength) {
+
+                    //
+                    // Call TdiReturnChainedReceives to release the Tsdu memory
+                    //
+
+                    TdiReturnChainedReceives(
+                        &(KsTsduMdl->Descriptor),
+                        1 );
+
+                    KsTsdu->StartOffset += sizeof(KS_TSDU_MDL);
+                }
+
+            } else {
+                printk("ks_recv_mdl: unknown tsdu slot: slot = %x type = %x "
+                       "Start = %x Length = %x\n", KsTsduDat,
+                       KsTsduDat->TsduType, KsTsduDat->StartOffset,
+                       KsTsduDat->DataLength);
+                printk("             Tsdu = %x Magic = %x: Start = %x "
+                       "Last = %x Length = %x\n", KsTsdu, KsTsdu->Magic,
+                       KsTsdu->StartOffset, KsTsdu->LastOffset,
+                       KsTsdu->TotalLength);
+                cfs_enter_debugger();
+            }
+
+            if (KsTsdu->StartOffset == KsTsdu->LastOffset) {
+
+                //
+                // KsTsdu is empty now; we need to free it ...
+                //
+
+                KsPutKsTsdu(KsTsdu);
+                KsTsdu = NULL;
+
+                break;
+            }
+        }
+
+        spin_lock(&(tconn->kstc_lock));
+
+        /* re-attach the KsTsdu (if not yet drained) at the head of the list */
+        if (KsTsdu) {
+            KsTsduMgr->NumOfTsdu++;
+            list_add(&(KsTsdu->Link), &(KsTsduMgr->TsduList));
+        } else if ((ULONG)size > BytesRecved) {
+            goto NextTsdu;
+        }
+    }
+
+    if (KsTsduMgr->TotalBytes < RecvedOnce) {
+        cfs_enter_debugger();
+        KsTsduMgr->TotalBytes = 0;
+    } else {
+        KsTsduMgr->TotalBytes -= RecvedOnce;
+    }
+
+    spin_unlock(&(tconn->kstc_lock));
+
+    if (NT_SUCCESS(Status)) {
+
+        if ((BytesRecved < (ulong_ptr)size) && (!bIsNonBlock)) {
+
+            KeWaitForSingleObject(
+                &(KsTsduMgr->Event),
+                Executive,
+                KernelMode,
+                FALSE,
+                NULL
+                );
+
+            goto Again;
+        }
+
+        if (bIsNonBlock && (BytesRecved == 0)) {
+            rc = -EAGAIN;
+        } else {
+            rc = BytesRecved;
+        }
+    }
+
+errorout:
+
+    ks_put_tconn(tconn);
+
+    if (rc > 0) {
+        KsPrint((1, "ks_recv_mdl: received %d bytes ...\n", rc));
+    } else {
+        KsPrint((0, "ks_recv_mdl: receive error: code = %d Status = %xh ...\n",
+                 rc, Status));
+    }
+
+    /* release the chained mdl */
+    ks_release_mdl(mdl, FALSE);
+
+    return (rc);
+}
+
+
+/*
+ * ks_init_tdi_data
+ *   initialize the tdi-related global data in ks_data
+ *
+ * Arguments:
+ *   N/A
+ *
+ * Return Value:
+ *   int: ks error code
+ *
+ * Notes:
+ *   N/A
+ */
+
+int
+ks_init_tdi_data()
+{
+    int rc = 0;
+
+    /* initialize tconn related globals */
+    RtlZeroMemory(&ks_data, sizeof(ks_data_t));
+
+    spin_lock_init(&ks_data.ksnd_tconn_lock);
+    CFS_INIT_LIST_HEAD(&ks_data.ksnd_tconns);
+    cfs_init_event(&ks_data.ksnd_tconn_exit, TRUE, FALSE);
+
+    ks_data.ksnd_tconn_slab = cfs_mem_cache_create(
+        "tcon", sizeof(ksock_tconn_t), 0, 0);
+
+    if (!ks_data.ksnd_tconn_slab) {
+        rc = -ENOMEM;
+        goto errorout;
+    }
+
+    /* initialize tsdu related globals */
+
+    spin_lock_init(&ks_data.ksnd_tsdu_lock);
+    CFS_INIT_LIST_HEAD(&ks_data.ksnd_freetsdus);
+    ks_data.ksnd_tsdu_size = TDINAL_TSDU_DEFAULT_SIZE; /* 64k */
+    ks_data.ksnd_tsdu_slab = cfs_mem_cache_create(
+        "tsdu", ks_data.ksnd_tsdu_size, 0, 0);
+
+    if (!ks_data.ksnd_tsdu_slab) {
+        rc = -ENOMEM;
+        cfs_mem_cache_destroy(ks_data.ksnd_tconn_slab);
+        ks_data.ksnd_tconn_slab = NULL;
+        goto errorout;
+    }
+
+    /* initialize daemon related globals */
+
+    spin_lock_init(&ks_data.ksnd_daemon_lock);
+    CFS_INIT_LIST_HEAD(&ks_data.ksnd_daemons);
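+    /* exit notification event for daemon shutdown (mirrors ksnd_tconn_exit above) */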
cfs_init_event(&ks_data.ksnd_daemon_exit, TRUE, FALSE); + + KsRegisterPnpHandlers(); + +errorout: + + return rc; +} + + +/* + * ks_fini_tdi_data + * finalize the global data in ksockal_data + * + * Arguments: + * N/A + * + * Return Value: + * int: ks error code + * + * Notes: + * N/A + */ + +void +ks_fini_tdi_data() +{ + PKS_TSDU KsTsdu = NULL; + struct list_head * list = NULL; + + /* clean up the pnp handler and address slots */ + KsDeregisterPnpHandlers(); + + /* we need wait until all the tconn are freed */ + spin_lock(&(ks_data.ksnd_tconn_lock)); + + if (list_empty(&(ks_data.ksnd_tconns))) { + cfs_wake_event(&ks_data.ksnd_tconn_exit); + } + spin_unlock(&(ks_data.ksnd_tconn_lock)); + + /* now wait on the tconn exit event */ + cfs_wait_event(&ks_data.ksnd_tconn_exit, 0); + + /* it's safe to delete the tconn slab ... */ + cfs_mem_cache_destroy(ks_data.ksnd_tconn_slab); + ks_data.ksnd_tconn_slab = NULL; + + /* clean up all the tsud buffers in the free list */ + spin_lock(&(ks_data.ksnd_tsdu_lock)); + list_for_each (list, &ks_data.ksnd_freetsdus) { + KsTsdu = list_entry (list, KS_TSDU, Link); + + cfs_mem_cache_free( + ks_data.ksnd_tsdu_slab, + KsTsdu ); + } + spin_unlock(&(ks_data.ksnd_tsdu_lock)); + + /* it's safe to delete the tsdu slab ... */ + cfs_mem_cache_destroy(ks_data.ksnd_tsdu_slab); + ks_data.ksnd_tsdu_slab = NULL; + + /* good! it's smooth to do the cleaning up...*/ +} + +/* + * ks_create_child_tconn + * Create the backlog child connection for a listener + * + * Arguments: + * parent: the listener daemon connection + * + * Return Value: + * the child connection or NULL in failure + * + * Notes: + * N/A + */ + +ksock_tconn_t * +ks_create_child_tconn( + ksock_tconn_t * parent + ) +{ + NTSTATUS status; + ksock_tconn_t * backlog; + + /* allocate the tdi connecton object */ + backlog = ks_create_tconn(); + + if (!backlog) { + goto errorout; + } + + /* initialize the tconn as a child */ + ks_init_child(backlog); + + + /* now bind it */ + if (ks_bind_tconn(backlog, parent, 0, 0) < 0) { + ks_free_tconn(backlog); + backlog = NULL; + goto errorout; + } + + /* open the connection object */ + status = KsOpenConnection( + &(backlog->kstc_dev), + (PVOID)backlog, + &(backlog->child.kstc_info.Handle), + &(backlog->child.kstc_info.FileObject) + ); + + if (!NT_SUCCESS(status)) { + + ks_put_tconn(backlog); + backlog = NULL; + cfs_enter_debugger(); + goto errorout; + } + + /* associate it now ... */ + status = KsAssociateAddress( + backlog->kstc_addr.Handle, + backlog->child.kstc_info.FileObject + ); + + if (!NT_SUCCESS(status)) { + + ks_put_tconn(backlog); + backlog = NULL; + cfs_enter_debugger(); + goto errorout; + } + + backlog->kstc_state = ksts_associated; + +errorout: + + return backlog; +} + +/* + * ks_replenish_backlogs( + * to replenish the backlogs listening... 
+ *
+ * Arguments:
+ *   parent:   the parent listening tdi connection
+ *   nbacklog: number of child connections to keep queued
+ *
+ * Return Value:
+ *   N/A
+ *
+ * Notes:
+ *   N/A
+ */
+
+void
+ks_replenish_backlogs(
+    ksock_tconn_t * parent,
+    int             nbacklog
+    )
+{
+    ksock_tconn_t * backlog;
+    int             n = 0;
+
+    /* calculate how many backlogs are needed */
+    if ( ( parent->listener.kstc_listening.num +
+           parent->listener.kstc_accepted.num ) < nbacklog ) {
+        n = nbacklog - ( parent->listener.kstc_listening.num +
+                         parent->listener.kstc_accepted.num );
+    } else {
+        n = 0;
+    }
+
+    while (n--) {
+
+        /* create the backlog child tconn */
+        backlog = ks_create_child_tconn(parent);
+
+        spin_lock(&(parent->kstc_lock));
+
+        if (backlog) {
+            spin_lock(&backlog->kstc_lock);
+            /* attach it to the daemon's listening list */
+            list_add( &backlog->child.kstc_link,
+                      &parent->listener.kstc_listening.list );
+            parent->listener.kstc_listening.num++;
+
+            backlog->child.kstc_queued = TRUE;
+            spin_unlock(&backlog->kstc_lock);
+        } else {
+            cfs_enter_debugger();
+        }
+
+        spin_unlock(&(parent->kstc_lock));
+    }
+}
+
+/*
+ * ks_start_listen
+ *   set up the listener tdi connection and make it listen
+ *   on the user-specified ip address and port.
+ *
+ * Arguments:
+ *   tconn:    the parent listening tdi connection
+ *   nbacklog: number of child connections to keep queued
+ *
+ * Return Value:
+ *   ks error code: >= 0 success; otherwise error.
+ *
+ * Notes:
+ *   N/A
+ */
+
+int
+ks_start_listen(ksock_tconn_t *tconn, int nbacklog)
+{
+    int rc = 0;
+
+    /* now replenish the backlogs */
+    ks_replenish_backlogs(tconn, nbacklog);
+
+    /* set the event callback handlers */
+    rc = ks_set_handlers(tconn);
+
+    if (rc < 0) {
+        return rc;
+    }
+
+    spin_lock(&(tconn->kstc_lock));
+    tconn->listener.nbacklog = nbacklog;
+    tconn->kstc_state = ksts_listening;
+    cfs_set_flag(tconn->kstc_flags, KS_TCONN_DAEMON_STARTED);
+    spin_unlock(&(tconn->kstc_lock));
+
+    return rc;
+}
+
+void
+ks_stop_listen(ksock_tconn_t *tconn)
+{
+    struct list_head * list;
+    ksock_tconn_t *    backlog;
+
+    /* reset all tdi event callbacks to NULL */
+    ks_reset_handlers (tconn);
+
+    spin_lock(&tconn->kstc_lock);
+
+    cfs_clear_flag(tconn->kstc_flags, KS_TCONN_DAEMON_STARTED);
+
+    /* clean up all the listening backlog child connections */
+    list_for_each (list, &(tconn->listener.kstc_listening.list)) {
+        backlog = list_entry(list, ksock_tconn_t, child.kstc_link);
+
+        /* destroy and free it */
+        ks_put_tconn(backlog);
+    }
+
+    spin_unlock(&tconn->kstc_lock);
+
+    /* wake it up from waiting on new incoming connections */
+    KeSetEvent(&tconn->listener.kstc_accept_event, 0, FALSE);
+
+    /* free the listening daemon tconn */
+    ks_put_tconn(tconn);
+}
+
+
+/*
+ * ks_wait_child_tconn
+ *   accept a child connection from a peer
+ *
+ * Arguments:
+ *   parent: the listening daemon tdi connection
+ *   child:  to contain the accepted connection
+ *
+ * Return Value:
+ *   ks error code;
+ *
+ * Notes:
+ *   N/A
+ */
+
+int
+ks_wait_child_tconn(
+    ksock_tconn_t *  parent,
+    ksock_tconn_t ** child
+    )
+{
+    struct list_head * tmp;
+    ksock_tconn_t *    backlog = NULL;
+
+    ks_replenish_backlogs(parent, parent->listener.nbacklog);
+
+    spin_lock(&(parent->kstc_lock));
+
+    if (parent->listener.kstc_listening.num <= 0) {
+        spin_unlock(&(parent->kstc_lock));
+        return -1;
+    }
+
+again:
+
+    /* scan the listening queue for an already-accepted connection */
+
+    list_for_each(tmp, &(parent->listener.kstc_listening.list)) {
+        backlog = list_entry (tmp, ksock_tconn_t, child.kstc_link);
+
+        spin_lock(&(backlog->kstc_lock));
+
+        if
(backlog->child.kstc_accepted) { + + LASSERT(backlog->kstc_state == ksts_connected); + LASSERT(backlog->child.kstc_busy); + + list_del(&(backlog->child.kstc_link)); + list_add(&(backlog->child.kstc_link), + &(parent->listener.kstc_accepted.list)); + parent->listener.kstc_accepted.num++; + parent->listener.kstc_listening.num--; + backlog->child.kstc_queueno = 1; + + spin_unlock(&(backlog->kstc_lock)); + + break; + } else { + spin_unlock(&(backlog->kstc_lock)); + backlog = NULL; + } + } + + spin_unlock(&(parent->kstc_lock)); + + /* we need wait until new incoming connections are requested + or the case of shuting down the listenig daemon thread */ + if (backlog == NULL) { + + NTSTATUS Status; + + Status = KeWaitForSingleObject( + &(parent->listener.kstc_accept_event), + Executive, + KernelMode, + FALSE, + NULL + ); + + spin_lock(&(parent->kstc_lock)); + + /* check whether it's exptected to exit ? */ + if (!cfs_is_flag_set(parent->kstc_flags, KS_TCONN_DAEMON_STARTED)) { + spin_unlock(&(parent->kstc_lock)); + } else { + goto again; + } + } + + if (backlog) { + /* query the local ip address of the connection */ + ks_query_local_ipaddr(backlog); + } + + *child = backlog; + + return 0; +} + +int libcfs_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask) +{ + ks_addr_slot_t * slot = NULL; + PLIST_ENTRY list = NULL; + + spin_lock(&ks_data.ksnd_addrs_lock); + + list = ks_data.ksnd_addrs_list.Flink; + while (list != &ks_data.ksnd_addrs_list) { + slot = CONTAINING_RECORD(list, ks_addr_slot_t, link); + if (_stricmp(name, &slot->iface[0]) == 0) { + *up = slot->up; + *ip = slot->ip_addr; + *mask = slot->netmask; + break; + } + list = list->Flink; + slot = NULL; + } + + spin_unlock(&ks_data.ksnd_addrs_lock); + + return (int)(slot == NULL); +} + +int libcfs_ipif_enumerate(char ***names) +{ + ks_addr_slot_t * slot = NULL; + PLIST_ENTRY list = NULL; + int nips = 0; + + spin_lock(&ks_data.ksnd_addrs_lock); + + *names = cfs_alloc(sizeof(char *) * ks_data.ksnd_naddrs, CFS_ALLOC_ZERO); + if (*names == NULL) { + goto errorout; + } + + list = ks_data.ksnd_addrs_list.Flink; + while (list != &ks_data.ksnd_addrs_list) { + slot = CONTAINING_RECORD(list, ks_addr_slot_t, link); + list = list->Flink; + (*names)[nips++] = slot->iface; + cfs_assert(nips <= ks_data.ksnd_naddrs); + } + + cfs_assert(nips == ks_data.ksnd_naddrs); + +errorout: + + spin_unlock(&ks_data.ksnd_addrs_lock); + return nips; +} + +void libcfs_ipif_free_enumeration(char **names, int n) +{ + if (names) { + cfs_free(names); + } +} + +int libcfs_sock_listen(struct socket **sockp, __u32 ip, int port, int backlog) +{ + int rc = 0; + ksock_tconn_t * parent; + + parent = ks_create_tconn(); + if (!parent) { + rc = -ENOMEM; + goto errorout; + } + + /* initialize the tconn as a listener */ + ks_init_listener(parent); + + /* bind the daemon->tconn */ + rc = ks_bind_tconn(parent, NULL, ip, (unsigned short)port); + + if (rc < 0) { + ks_free_tconn(parent); + goto errorout; + } + + /* create listening children and make it to listen state*/ + rc = ks_start_listen(parent, backlog); + if (rc < 0) { + ks_stop_listen(parent); + goto errorout; + } + + *sockp = parent; + +errorout: + + return rc; +} + +int libcfs_sock_accept(struct socket **newsockp, struct socket *sock) +{ + /* wait for incoming connecitons */ + return ks_wait_child_tconn(sock, newsockp); +} + +void libcfs_sock_abort_accept(struct socket *sock) +{ + LASSERT(sock->kstc_type == kstt_listener); + + spin_lock(&(sock->kstc_lock)); + + /* clear the daemon flag */ + cfs_clear_flag(sock->kstc_flags, 
KS_TCONN_DAEMON_STARTED); + + /* wake up it from the waiting on new incoming connections */ + KeSetEvent(&sock->listener.kstc_accept_event, 0, FALSE); + + spin_unlock(&(sock->kstc_lock)); +} + +/* + * libcfs_sock_connect + * build a conntion between local ip/port and the peer ip/port. + * + * Arguments: + * laddr: local ip address + * lport: local port number + * paddr: peer's ip address + * pport: peer's port number + * + * Return Value: + * int: return code ... + * + * Notes: + * N/A + */ + + +int libcfs_sock_connect(struct socket **sockp, int *fatal, + __u32 local_ip, int local_port, + __u32 peer_ip, int peer_port) +{ + ksock_tconn_t * tconn = NULL; + int rc = 0; + + *sockp = NULL; + + KsPrint((1, "libcfs_sock_connect: connecting to %x:%d with %x:%d...\n", + peer_ip, peer_port, local_ip, local_port )); + + /* create the tdi connecion structure */ + tconn = ks_create_tconn(); + if (!tconn) { + rc = -ENOMEM; + goto errorout; + } + + /* initialize the tdi sender connection */ + ks_init_sender(tconn); + + /* bind the local ip address with the tconn */ + rc = ks_bind_tconn(tconn, NULL, local_ip, (unsigned short)local_port); + if (rc < 0) { + KsPrint((0, "libcfs_sock_connect: failed to bind address %x:%d...\n", + local_ip, local_port )); + ks_free_tconn(tconn); + goto errorout; + } + + /* connect to the remote peer */ + rc = ks_build_tconn(tconn, peer_ip, (unsigned short)peer_port); + if (rc < 0) { + KsPrint((0, "libcfs_sock_connect: failed to connect %x:%d ...\n", + peer_ip, peer_port )); + + ks_put_tconn(tconn); + goto errorout; + } + + *sockp = tconn; + +errorout: + + return rc; +} + +int libcfs_sock_setbuf(struct socket *socket, int txbufsize, int rxbufsize) +{ + return 0; +} + +int libcfs_sock_getbuf(struct socket *socket, int *txbufsize, int *rxbufsize) +{ + return 0; +} + +int libcfs_sock_getaddr(struct socket *socket, int remote, __u32 *ip, int *port) +{ + PTRANSPORT_ADDRESS taddr = NULL; + + spin_lock(&socket->kstc_lock); + if (remote) { + if (socket->kstc_type == kstt_sender) { + taddr = socket->sender.kstc_info.Remote; + } else if (socket->kstc_type == kstt_child) { + taddr = socket->child.kstc_info.Remote; + } + } else { + taddr = &(socket->kstc_addr.Tdi); + } + + if (taddr) { + PTDI_ADDRESS_IP addr = (PTDI_ADDRESS_IP)(&(taddr->Address[0].Address)); + if (ip != NULL) + *ip = ntohl (addr->in_addr); + if (port != NULL) + *port = ntohs (addr->sin_port); + } else { + spin_unlock(&socket->kstc_lock); + return -ENOTCONN; + } + + spin_unlock(&socket->kstc_lock); + return 0; +} + +int libcfs_sock_write(struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + ksock_mdl_t * mdl; + + int offset = 0; + + while (nob > offset) { + + /* lock the user buffer */ + rc = ks_lock_buffer( (char *)buffer + offset, + FALSE, nob - offset, IoReadAccess, &mdl ); + + if (rc < 0) { + return (rc); + } + + /* send out the whole mdl */ + rc = ks_send_mdl( sock, NULL, mdl, nob - offset, 0 ); + + if (rc > 0) { + offset += rc; + } else { + return (rc); + } + } + + return (0); +} + +int libcfs_sock_read(struct socket *sock, void *buffer, int nob, int timeout) +{ + int rc; + ksock_mdl_t * mdl; + + int offset = 0; + + while (nob > offset) { + + /* lock the user buffer */ + rc = ks_lock_buffer( (char *)buffer + offset, + FALSE, nob - offset, IoWriteAccess, &mdl ); + + if (rc < 0) { + return (rc); + } + + /* recv the requested buffer */ + rc = ks_recv_mdl( sock, mdl, nob - offset, 0 ); + + if (rc > 0) { + offset += rc; + } else { + return (rc); + } + } + + return (0); +} + +void 
libcfs_sock_release(struct socket *sock)
+{
+    if (sock->kstc_type == kstt_listener &&
+        sock->kstc_state == ksts_listening) {
+        ks_stop_listen(sock);
+    } else {
+        ks_put_tconn(sock);
+    }
+}
diff --git a/lnet/libcfs/winnt/winnt-tracefile.c b/lnet/libcfs/winnt/winnt-tracefile.c
new file mode 100644
index 0000000..d172bff
--- /dev/null
+++ b/lnet/libcfs/winnt/winnt-tracefile.c
@@ -0,0 +1,300 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=4:tabstop=4:
+ *
+ * Copyright (c) 2004 Cluster File Systems, Inc.
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or modify it under
+ * the terms of version 2 of the GNU General Public License as published by
+ * the Free Software Foundation. Lustre is distributed in the hope that it
+ * will be useful, but WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details. You should have received a
+ * copy of the GNU General Public License along with Lustre; if not, write
+ * to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+#define LUSTRE_TRACEFILE_PRIVATE
+
+#include
+#include
+#include "tracefile.h"
+
+#ifndef get_cpu
+#define get_cpu() smp_processor_id()
+#define put_cpu() do { } while (0)
+#endif
+
+extern union trace_data_union trace_data[NR_CPUS];
+extern char *tracefile;
+extern int64_t tracefile_size;
+
+event_t tracefile_event;
+
+int tracefile_init_arch()
+{
+    int i;
+    int j;
+
+    cfs_init_event(&tracefile_event, TRUE, TRUE);
+
+    memset(trace_console_buffers, 0, sizeof(trace_console_buffers));
+
+    for (i = 0; i < NR_CPUS; i++) {
+        for (j = 0; j < 1; j++) {
+            trace_console_buffers[i][j] =
+                cfs_alloc(TRACE_CONSOLE_BUFFER_SIZE,
+                          CFS_ALLOC_ZERO);
+
+            if (trace_console_buffers[i][j] == NULL) {
+                tracefile_fini_arch();
+                KsPrint((0, "Can't allocate console message buffer\n"));
+                return -ENOMEM;
+            }
+        }
+    }
+
+    return 0;
+}
+
+void tracefile_fini_arch()
+{
+    int i;
+    int j;
+
+    for (i = 0; i < NR_CPUS; i++) {
+        for (j = 0; j < 2; j++) {
+            if (trace_console_buffers[i][j] != NULL) {
+                cfs_free(trace_console_buffers[i][j]);
+                trace_console_buffers[i][j] = NULL;
+            }
+        }
+    }
+}
+
+void tracefile_read_lock()
+{
+    cfs_wait_event(&tracefile_event, 0);
+}
+
+void tracefile_read_unlock()
+{
+    cfs_wake_event(&tracefile_event);
+}
+
+void tracefile_write_lock()
+{
+    cfs_wait_event(&tracefile_event, 0);
+}
+
+void tracefile_write_unlock()
+{
+    cfs_wake_event(&tracefile_event);
+}
+
+char *
+trace_get_console_buffer(void)
+{
+#pragma message ("is there a possible problem with pre-emption?")
+    int cpu = (int) KeGetCurrentProcessorNumber();
+    return trace_console_buffers[cpu][0];
+}
+
+void
+trace_put_console_buffer(char *buffer)
+{
+}
+
+struct trace_cpu_data *
+trace_get_tcd(void)
+{
+#pragma message("todo: return NULL if in interrupt context")
+
+    int cpu = (int) KeGetCurrentProcessorNumber();
+    return &trace_data[cpu].tcd;
+}
+
+void
+trace_put_tcd (struct trace_cpu_data *tcd, unsigned long flags)
+{
+}
+
+void
+set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask,
+                    const int line, unsigned long stack)
+{
+    struct timeval tv;
+
+    do_gettimeofday(&tv);
+
+    header->ph_subsys = subsys;
+    header->ph_mask = mask;
+    header->ph_cpu_id = smp_processor_id();
+    header->ph_sec = (__u32)tv.tv_sec;
+    header->ph_usec = tv.tv_usec;
+    header->ph_stack = stack;
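+    /* tag the record with the logging thread's pid so log lines can be attributed */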
header->ph_pid = current->pid; + header->ph_line_num = line; + header->ph_extern_pid = 0; + return; +} + +void print_to_console(struct ptldebug_header *hdr, int mask, const char *buf, + int len, const char *file, const char *fn) +{ + char *prefix = NULL, *ptype = NULL; + + if ((mask & D_EMERG) != 0) { + prefix = "LustreError"; + ptype = KERN_EMERG; + } else if ((mask & D_ERROR) != 0) { + prefix = "LustreError"; + ptype = KERN_ERR; + } else if ((mask & D_WARNING) != 0) { + prefix = "Lustre"; + ptype = KERN_WARNING; + } else if ((mask & libcfs_printk) != 0 || (mask & D_CONSOLE)) { + prefix = "Lustre"; + ptype = KERN_INFO; + } + + if ((mask & D_CONSOLE) != 0) { + printk("%s%s: %s", ptype, prefix, buf); + } else { + printk("%s%s: %d:%d:(%s:%d:%s()) %s", ptype, prefix, hdr->ph_pid, + hdr->ph_extern_pid, file, hdr->ph_line_num, fn, buf); + } + return; +} + +int tcd_owns_tage(struct trace_cpu_data *tcd, struct trace_page *tage) +{ + return 1; +} + + +int trace_write_daemon_file(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + char *name; + unsigned long off; + int rc; + + name =cfs_alloc(count + 1, 0); + if (name == NULL) + return -ENOMEM; + + if (copy_from_user((void *)name, (void*)buffer, count)) { + rc = -EFAULT; + goto out; + } + + /* be nice and strip out trailing '\n' */ + for (off = count ; off > 2 && isspace(name[off - 1]); off--) + ; + + name[off] = '\0'; + + tracefile_write_lock(); + if (strcmp(name, "stop") == 0) { + tracefile = NULL; + trace_stop_thread(); + goto out_sem; + } else if (strncmp(name, "size=", 5) == 0) { + tracefile_size = simple_strtoul(name + 5, NULL, 0); + if (tracefile_size < 10 || tracefile_size > 20480) + tracefile_size = TRACEFILE_SIZE; + else + tracefile_size <<= 20; + goto out_sem; + } + + if (tracefile != NULL) + cfs_free(tracefile); + + tracefile = name; + name = NULL; + printk(KERN_INFO "Lustre: debug daemon will attempt to start writing " + "to %s (%lukB max)\n", tracefile, (long)(tracefile_size >> 10)); + + trace_start_thread(); +out_sem: + tracefile_write_unlock(); +out: + if (name != NULL) + cfs_free(name); + return count; +} + +int trace_read_daemon_file(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int rc; + + tracefile_read_lock(); + rc = snprintf(page, count, "%s", tracefile); + tracefile_read_unlock(); + + return rc; +} + +int trace_write_debug_mb(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + char string[32]; + int i; + unsigned max; + + if (count >= sizeof(string)) { + printk(KERN_ERR "Lustre: value too large (length %lu bytes)\n", + count); + return -EOVERFLOW; + } + + if (copy_from_user((void *)string, (void *)buffer, count)) + return -EFAULT; + + max = simple_strtoul(string, NULL, 0); + if (max == 0) + return -EINVAL; + + if (max > (num_physpages >> (20 - 2 - CFS_PAGE_SHIFT)) / 5 || max >= 512) { + printk(KERN_ERR "Lustre: Refusing to set debug buffer size to " + "%dMB, which is more than 80%% of available RAM (%lu)\n", + max, (num_physpages >> (20 - 2 - CFS_PAGE_SHIFT)) / 5); + return -EINVAL; + } + + max /= smp_num_cpus; + + for (i = 0; i < NR_CPUS; i++) { + struct trace_cpu_data *tcd; + tcd = &trace_data[i].tcd; + tcd->tcd_max_pages = max << (20 - CFS_PAGE_SHIFT); + } + return count; +} + +int trace_read_debug_mb(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct trace_cpu_data *tcd; + int rc; + + tcd = trace_get_tcd(); + LASSERT (tcd != NULL); + rc = snprintf(page, count, "%lu\n", + (tcd->tcd_max_pages >> (20 - 
CFS_PAGE_SHIFT)) * smp_num_cpus); + trace_put_tcd(tcd); + return rc; +} + +void +trace_call_on_all_cpus(void (*fn)(void *arg), void *arg) +{ +#error "tbd" +} + diff --git a/lnet/libcfs/winnt/winnt-usr.c b/lnet/libcfs/winnt/winnt-usr.c new file mode 100644 index 0000000..f79347b --- /dev/null +++ b/lnet/libcfs/winnt/winnt-usr.c @@ -0,0 +1,85 @@ + +#ifndef __KERNEL__ + +#include +#include +#include +#include +#include + +void portals_debug_msg(int subsys, int mask, char *file, const char *fn, + const int line, unsigned long stack, + char *format, ...) { + } + +int cfs_proc_mknod(const char *path, unsigned short mode, unsigned int dev) +{ + return 0; +} + + +void print_last_error(char* Prefix) +{ + LPVOID lpMsgBuf; + + FormatMessage( + FORMAT_MESSAGE_ALLOCATE_BUFFER | + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, + GetLastError(), + 0, + (LPTSTR) &lpMsgBuf, + 0, + NULL + ); + + printf("%s %s", Prefix, (LPTSTR) lpMsgBuf); + + LocalFree(lpMsgBuf); +} + +// +// The following declarations are defined in io.h of VC +// sys/types.h will conflict with io.h, so we need place +// these declartions here. + +#ifdef __cplusplus +extern "C" { +#endif + void + __declspec (naked) __cdecl _chkesp(void) + { +#if _X86_ + __asm { jz exit_chkesp }; + __asm { int 3 }; + exit_chkesp: + __asm { ret }; +#endif + } +#ifdef __cplusplus +} +#endif + +unsigned int sleep (unsigned int seconds) +{ + Sleep(seconds * 1000); + return 0; +} + +int gethostname(char * name, int namelen) +{ + return 0; +} + +int ioctl ( + int handle, + int cmd, + void *buffer + ) +{ + printf("hello, world\n"); + return 0; +} + +#endif /* __KERNEL__ */ \ No newline at end of file diff --git a/lnet/libcfs/winnt/winnt-utils.c b/lnet/libcfs/winnt/winnt-utils.c new file mode 100644 index 0000000..cd33aa2 --- /dev/null +++ b/lnet/libcfs/winnt/winnt-utils.c @@ -0,0 +1,158 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: + * + * Copyright (c) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or modify it under + * the terms of version 2 of the GNU General Public License as published by + * the Free Software Foundation. Lustre is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. You should have received a + * copy of the GNU General Public License along with Lustre; if not, write + * to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, + * USA. + */ + + +/* + * miscellaneous libcfs stuff + */ +#define DEBUG_SUBSYSTEM S_LNET +#include + +/* + * Convert server error code to client format. Error codes are from + * Linux errno.h, so for Linux client---identity. + */ +int convert_server_error(__u64 ecode) +{ + return cfs_error_code((NTSTATUS)ecode); +} + +/* + * convert flag from client to server. + * + * nt kernel uses several members to describe the open flags + * such as DesiredAccess/ShareAccess/CreateDisposition/CreateOptions + * so it's better to convert when using, not here. 
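+ * (e.g. the read/write mode bits would map onto DesiredAccess, and
+ * O_CREAT/O_EXCL style semantics onto CreateDisposition values such as
+ * FILE_CREATE; illustrative mapping only).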
+ */ + +int convert_client_oflag(int cflag, int *result) +{ + *result = 0; + return 0; +} + + +int cfs_error_code(NTSTATUS Status) +{ + switch (Status) { + + case STATUS_ACCESS_DENIED: + return (-EACCES); + + case STATUS_ACCESS_VIOLATION: + return (-EFAULT); + + case STATUS_BUFFER_TOO_SMALL: + return (-ETOOSMALL); + + case STATUS_INVALID_PARAMETER: + return (-EINVAL); + + case STATUS_NOT_IMPLEMENTED: + case STATUS_NOT_SUPPORTED: + return (-EOPNOTSUPP); + + case STATUS_INVALID_ADDRESS: + case STATUS_INVALID_ADDRESS_COMPONENT: + return (-EADDRNOTAVAIL); + + case STATUS_NO_SUCH_DEVICE: + case STATUS_NO_SUCH_FILE: + case STATUS_OBJECT_NAME_NOT_FOUND: + case STATUS_OBJECT_PATH_NOT_FOUND: + case STATUS_NETWORK_BUSY: + case STATUS_INVALID_NETWORK_RESPONSE: + case STATUS_UNEXPECTED_NETWORK_ERROR: + return (-ENETDOWN); + + case STATUS_BAD_NETWORK_PATH: + case STATUS_NETWORK_UNREACHABLE: + case STATUS_PROTOCOL_UNREACHABLE: + return (-ENETUNREACH); + + case STATUS_LOCAL_DISCONNECT: + case STATUS_TRANSACTION_ABORTED: + case STATUS_CONNECTION_ABORTED: + return (-ECONNABORTED); + + case STATUS_REMOTE_DISCONNECT: + case STATUS_LINK_FAILED: + case STATUS_CONNECTION_DISCONNECTED: + case STATUS_CONNECTION_RESET: + case STATUS_PORT_UNREACHABLE: + return (-ECONNRESET); + + case STATUS_PAGEFILE_QUOTA: + case STATUS_NO_MEMORY: + case STATUS_CONFLICTING_ADDRESSES: + case STATUS_QUOTA_EXCEEDED: + case STATUS_TOO_MANY_PAGING_FILES: + case STATUS_INSUFFICIENT_RESOURCES: + case STATUS_WORKING_SET_QUOTA: + case STATUS_COMMITMENT_LIMIT: + case STATUS_TOO_MANY_ADDRESSES: + case STATUS_REMOTE_RESOURCES: + return (-ENOBUFS); + + case STATUS_INVALID_CONNECTION: + return (-ENOTCONN); + + case STATUS_PIPE_DISCONNECTED: + return (-ESHUTDOWN); + + case STATUS_TIMEOUT: + case STATUS_IO_TIMEOUT: + case STATUS_LINK_TIMEOUT: + return (-ETIMEDOUT); + + case STATUS_REMOTE_NOT_LISTENING: + case STATUS_CONNECTION_REFUSED: + return (-ECONNREFUSED); + + case STATUS_HOST_UNREACHABLE: + return (-EHOSTUNREACH); + + case STATUS_PENDING: + case STATUS_DEVICE_NOT_READY: + return (-EAGAIN); + + case STATUS_CANCELLED: + case STATUS_REQUEST_ABORTED: + return (-EINTR); + + case STATUS_BUFFER_OVERFLOW: + case STATUS_INVALID_BUFFER_SIZE: + return (-EMSGSIZE); + + } + + if (NT_SUCCESS(Status)) + return 0; + + return (-EINVAL); +} + + +void cfs_stack_trace_fill(struct cfs_stack_trace *trace) +{ +} + +void *cfs_stack_trace_frame(struct cfs_stack_trace *trace, int frame_no) +{ + return NULL; +} diff --git a/lnet/lnet/Info.plist b/lnet/lnet/Info.plist index 60c304b..2b3967f 100644 --- a/lnet/lnet/Info.plist +++ b/lnet/lnet/Info.plist @@ -5,11 +5,11 @@ CFBundleDevelopmentRegion English CFBundleExecutable - portals + lnet CFBundleIconFile CFBundleIdentifier - com.clusterfs.lustre.portals + com.clusterfs.lustre.lnet CFBundleInfoDictionaryVersion 6.0 CFBundlePackageType @@ -22,12 +22,14 @@ 1.0.0 OSBundleLibraries - com.apple.kernel.bsd - 1.1 - com.apple.kernel.iokit - 1.0.0b1 - com.apple.kernel.mach - 1.0.0b1 + com.apple.kpi.bsd + 8.0.0b1 + com.apple.kpi.libkern + 8.0.0b1 + com.apple.kpi.mach + 8.0.0b1 + com.apple.kpi.unsupported + 8.0.0b1 com.clusterfs.lustre.libcfs 1.0.0 diff --git a/lnet/lnet/Makefile.in b/lnet/lnet/Makefile.in index c0f2e71..3bc86f6 100644 --- a/lnet/lnet/Makefile.in +++ b/lnet/lnet/Makefile.in @@ -1,6 +1,10 @@ -MODULES := portals -portals-objs := api-errno.o api-ni.o api-wrap.o -portals-objs += lib-init.o lib-me.o lib-msg.o lib-eq.o lib-md.o -portals-objs += lib-move.o lib-ni.o lib-pid.o module.o +MODULES := lnet + +lnet-objs := 
api-errno.o api-ni.o config.o +lnet-objs += lib-me.o lib-msg.o lib-eq.o lib-md.o +lnet-objs += lib-move.o module.o lo.o +lnet-objs += router.o router_proc.o acceptor.o peer.o + +default: all @INCLUDE_RULES@ diff --git a/lnet/lnet/acceptor.c b/lnet/lnet/acceptor.c new file mode 100644 index 0000000..1968f59 --- /dev/null +++ b/lnet/lnet/acceptor.c @@ -0,0 +1,537 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2005 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + +#ifdef __KERNEL__ +static char *accept = "secure"; +CFS_MODULE_PARM(accept, "s", charp, 0444, + "Accept connections (secure|all|none)"); + +static int accept_port = 988; +CFS_MODULE_PARM(accept_port, "i", int, 0444, + "Acceptor's port (same on all nodes)"); + +static int accept_backlog = 127; +CFS_MODULE_PARM(accept_backlog, "i", int, 0444, + "Acceptor's listen backlog"); + +static int accept_timeout = 5; +CFS_MODULE_PARM(accept_timeout, "i", int, 0644, + "Acceptor's timeout (seconds)"); + +struct { + int pta_shutdown; + cfs_socket_t *pta_sock; + struct semaphore pta_signal; +} lnet_acceptor_state; + +int +lnet_acceptor_timeout(void) +{ + return accept_timeout; +} +EXPORT_SYMBOL(lnet_acceptor_timeout); + +int +lnet_acceptor_port(void) +{ + return accept_port; +} +EXPORT_SYMBOL(lnet_acceptor_port); + +void +lnet_connect_console_error (int rc, lnet_nid_t peer_nid, + __u32 peer_ip, int peer_port) +{ + switch (rc) { + /* "normal" errors */ + case -ECONNREFUSED: + CDEBUG(D_NETERROR, "Connection to %s at host %u.%u.%u.%u " + "on port %d was refused: " + "check that Lustre is running on that node.\n", + libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port); + break; + case -EHOSTUNREACH: + case -ENETUNREACH: + CDEBUG(D_NETERROR, "Connection to %s at host %u.%u.%u.%u " + "was unreachable: the network or that node may " + "be down, or Lustre may be misconfigured.\n", + libcfs_nid2str(peer_nid), HIPQUAD(peer_ip)); + break; + case -ETIMEDOUT: + LCONSOLE_ERROR("Connection to %s at host %u.%u.%u.%u on " + "port %d took too long: that node may be hung " + "or experiencing high load.\n", + libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port); + break; + case -ECONNRESET: + LCONSOLE_ERROR("Connection to %s at host %u.%u.%u.%u on " + "port %d was reset: " + "is it running a compatible version of Lustre " + "and is %s one of its NIDs?\n", + libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port, + libcfs_nid2str(peer_nid)); + break; + case -EPROTO: + LCONSOLE_ERROR("Protocol error connecting to %s at host " + "%u.%u.%u.%u on port %d: " + "is it running a compatible version of Lustre?\n", + libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port); + break; + case -EADDRINUSE: + LCONSOLE_ERROR("No privileged ports available to connect to " + "%s at 
host %u.%u.%u.%u on port %d\n", + libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port); + break; + default: + LCONSOLE_ERROR("Unexpected error %d connecting to %s at " + "host %u.%u.%u.%u on port %d\n", rc, + libcfs_nid2str(peer_nid), + HIPQUAD(peer_ip), peer_port); + break; + } +} +EXPORT_SYMBOL(lnet_connect_console_error); + +int +lnet_connect(cfs_socket_t **sockp, lnet_nid_t peer_nid, + __u32 local_ip, __u32 peer_ip, int peer_port) +{ + lnet_acceptor_connreq_t cr; + cfs_socket_t *sock; + int rc; + int port; + int fatal; + + CLASSERT (sizeof(cr) <= 16); /* not too big to be on the stack */ + + for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT; + port >= LNET_ACCEPTOR_MIN_RESERVED_PORT; + --port) { + /* Iterate through reserved ports. */ + + rc = libcfs_sock_connect(&sock, &fatal, + local_ip, port, + peer_ip, peer_port); + if (rc != 0) { + if (fatal) + goto failed; + continue; + } + + CLASSERT (LNET_PROTO_ACCEPTOR_VERSION == 1); + + if (the_lnet.ln_ptlcompat != 2) { + /* When portals compatibility is "strong", simply + * connect (i.e. send no acceptor connection request). + * Othewise send an acceptor connection request. I can + * have no portals peers so everyone else should + * understand my protocol. */ + cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; + cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; + cr.acr_nid = peer_nid; + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 4) != 0) { + cr.acr_version++; + the_lnet.ln_testprotocompat &= ~4; + } + if ((the_lnet.ln_testprotocompat & 8) != 0) { + cr.acr_magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~8; + } + LNET_UNLOCK(); + } + + rc = libcfs_sock_write(sock, &cr, sizeof(cr), + accept_timeout); + if (rc != 0) + goto failed_sock; + } + + *sockp = sock; + return 0; + } + + rc = -EADDRINUSE; + goto failed; + + failed_sock: + libcfs_sock_release(sock); + failed: + lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port); + return rc; +} +EXPORT_SYMBOL(lnet_connect); + +static inline int +lnet_accept_magic(__u32 magic, __u32 constant) +{ + return (magic == constant || + magic == __swab32(constant)); +} + +int +lnet_accept(lnet_ni_t *blind_ni, cfs_socket_t *sock, __u32 magic) +{ + lnet_acceptor_connreq_t cr; + __u32 peer_ip; + int peer_port; + int rc; + int flip; + lnet_ni_t *ni; + char *str; + + /* CAVEAT EMPTOR: I may be called by an LND in any thread's context if + * I passed the new socket "blindly" to the single NI that needed an + * acceptor. If so, blind_ni != NULL... */ + + LASSERT (sizeof(cr) <= 16); /* not too big for the stack */ + + rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); + LASSERT (rc == 0); /* we succeeded before */ + + if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) { + + if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) { + /* future version compatibility! + * When LNET unifies protocols over all LNDs, the first + * thing sent will be a version query. 
I send back
+                 * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" */
+
+                memset (&cr, 0, sizeof(cr));
+                cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+                cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+                rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+                                       accept_timeout);
+
+                if (rc != 0)
+                        CERROR("Error sending magic+version in response "
+                               "to LNET magic from %u.%u.%u.%u: %d\n",
+                               HIPQUAD(peer_ip), rc);
+                return -EPROTO;
+        }
+
+        if (magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC))
+                str = "'old' socknal/tcpnal";
+        else if (lnet_accept_magic(magic, LNET_PROTO_RA_MAGIC))
+                str = "'old' ranal";
+        else if (lnet_accept_magic(magic, LNET_PROTO_OPENIB_MAGIC))
+                str = "'old' openibnal";
+        else
+                str = "unrecognised";
+
+        LCONSOLE_ERROR("Refusing connection from %u.%u.%u.%u magic %08x: "
+                       "%s acceptor protocol\n",
+                       HIPQUAD(peer_ip), magic, str);
+        return -EPROTO;
+    }
+
+    flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC);
+
+    rc = libcfs_sock_read(sock, &cr.acr_version,
+                          sizeof(cr.acr_version),
+                          accept_timeout);
+    if (rc != 0) {
+            CERROR("Error %d reading connection request version from "
+                   "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+            return -EIO;
+    }
+
+    if (flip)
+            __swab32s(&cr.acr_version);
+
+    if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) {
+            /* future version compatibility!
+             * An acceptor-specific protocol rev will first send a version
+             * query.  I send back my current version to tell her I'm
+             * "old". */
+            int peer_version = cr.acr_version;
+
+            memset (&cr, 0, sizeof(cr));
+            cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC;
+            cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+
+            rc = libcfs_sock_write(sock, &cr, sizeof(cr),
+                                   accept_timeout);
+
+            if (rc != 0)
+                    CERROR("Error sending magic+version in response "
+                           "to version %d from %u.%u.%u.%u: %d\n",
+                           peer_version, HIPQUAD(peer_ip), rc);
+            return -EPROTO;
+    }
+
+    rc = libcfs_sock_read(sock, &cr.acr_nid,
+                          sizeof(cr) -
+                          offsetof(lnet_acceptor_connreq_t, acr_nid),
+                          accept_timeout);
+    if (rc != 0) {
+            CERROR("Error %d reading connection request from "
+                   "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip));
+            return -EIO;
+    }
+
+    if (flip)
+            __swab64s(&cr.acr_nid);
+
+    ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid));
+    if (ni == NULL ||                   /* no matching net */
+        ni->ni_nid != cr.acr_nid) {     /* right NET, wrong NID! */
+            if (ni != NULL)
+                    lnet_ni_decref(ni);
+            LCONSOLE_ERROR("Refusing connection from %u.%u.%u.%u for %s: "
+                           "no matching NI\n",
+                           HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+            return -EPERM;
+    }
+
+    if (ni->ni_lnd->lnd_accept == NULL) {
+            /* This catches a request for the loopback LND */
+            lnet_ni_decref(ni);
+            LCONSOLE_ERROR("Refusing connection from %u.%u.%u.%u for %s: "
+                           "NI does not accept IP connections\n",
+                           HIPQUAD(peer_ip), libcfs_nid2str(cr.acr_nid));
+            return -EPERM;
+    }
+
+    CDEBUG(D_NET, "Accept %s from %u.%u.%u.%u%s\n",
+           libcfs_nid2str(cr.acr_nid), HIPQUAD(peer_ip),
+           blind_ni == NULL ? "" : " (blind)");
+
+    if (blind_ni == NULL) {
+            /* called by the acceptor: call into the requested NI... */
+            rc = ni->ni_lnd->lnd_accept(ni, sock);
+    } else {
+            /* portals_compatibility is set and the (only) NI called me to
+             * verify and skip the connection request...
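+             * (ln_ptlcompat != 0 guarantees there is exactly one NI;
+             * both facts are asserted below)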
*/ + LASSERT (the_lnet.ln_ptlcompat != 0); + LASSERT (ni == blind_ni); + rc = 0; + } + + lnet_ni_decref(ni); + return rc; +} +EXPORT_SYMBOL(lnet_accept); + +int +lnet_acceptor(void *arg) +{ + char name[16]; + cfs_socket_t *newsock; + int rc; + int n_acceptor_nis; + __u32 magic; + __u32 peer_ip; + int peer_port; + lnet_ni_t *blind_ni = NULL; + int secure = (int)((unsigned long)arg); + + LASSERT (lnet_acceptor_state.pta_sock == NULL); + + if (the_lnet.ln_ptlcompat != 0) { + /* When portals_compatibility is enabled, peers may connect + * without sending an acceptor connection request. There is no + * ambiguity about which network the peer wants to connect to + * since there can only be 1 network, so I pass connections + * "blindly" to it. */ + n_acceptor_nis = lnet_count_acceptor_nis(&blind_ni); + LASSERT (n_acceptor_nis == 1); + LASSERT (blind_ni != NULL); + } + + snprintf(name, sizeof(name), "acceptor_%03d", accept_port); + cfs_daemonize(name); + cfs_block_allsigs(); + + rc = libcfs_sock_listen(&lnet_acceptor_state.pta_sock, + 0, accept_port, accept_backlog); + if (rc != 0) { + if (rc == -EADDRINUSE) + LCONSOLE_ERROR("Can't start acceptor on port %d: " + "port already in use\n", + accept_port); + else + LCONSOLE_ERROR("Can't start acceptor on port %d: " + "unexpected error %d\n", + accept_port, rc); + + lnet_acceptor_state.pta_sock = NULL; + } else { + LCONSOLE(0, "Accept %s, port %d%s\n", + accept, accept_port, + blind_ni == NULL ? "" : " (proto compatible)"); + } + + /* set init status and unblock parent */ + lnet_acceptor_state.pta_shutdown = rc; + mutex_up(&lnet_acceptor_state.pta_signal); + + if (rc != 0) + return rc; + + while (lnet_acceptor_state.pta_shutdown == 0) { + + rc = libcfs_sock_accept(&newsock, lnet_acceptor_state.pta_sock); + if (rc != 0) { + if (rc != -EAGAIN) { + CWARN("Accept error %d: pausing...\n", rc); + cfs_pause(cfs_time_seconds(1)); + } + continue; + } + + rc = libcfs_sock_getaddr(newsock, 1, &peer_ip, &peer_port); + if (rc != 0) { + CERROR("Can't determine new connection's address\n"); + goto failed; + } + + if (secure && peer_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) { + CERROR("Refusing connection from %u.%u.%u.%u: " + "insecure port %d\n", + HIPQUAD(peer_ip), peer_port); + goto failed; + } + + if (blind_ni != NULL) { + rc = blind_ni->ni_lnd->lnd_accept(blind_ni, newsock); + if (rc != 0) { + CERROR("NI %s refused 'blind' connection from " + "%u.%u.%u.%u\n", + libcfs_nid2str(blind_ni->ni_nid), + HIPQUAD(peer_ip)); + goto failed; + } + continue; + } + + rc = libcfs_sock_read(newsock, &magic, sizeof(magic), + accept_timeout); + if (rc != 0) { + CERROR("Error %d reading connection request from " + "%u.%u.%u.%u\n", rc, HIPQUAD(peer_ip)); + goto failed; + } + + rc = lnet_accept(NULL, newsock, magic); + if (rc != 0) + goto failed; + + continue; + + failed: + libcfs_sock_release(newsock); + } + + libcfs_sock_release(lnet_acceptor_state.pta_sock); + lnet_acceptor_state.pta_sock = NULL; + + if (blind_ni != NULL) + lnet_ni_decref(blind_ni); + + LCONSOLE(0,"Acceptor stopping\n"); + + /* unblock lnet_acceptor_stop() */ + mutex_up(&lnet_acceptor_state.pta_signal); + return 0; +} + +int +lnet_acceptor_start(void) +{ + long pid; + long secure; + + LASSERT (lnet_acceptor_state.pta_sock == NULL); + init_mutex_locked(&lnet_acceptor_state.pta_signal); + + if (!strcmp(accept, "secure")) { + secure = 1; + } else if (!strcmp(accept, "all")) { + secure = 0; + } else if (!strcmp(accept, "none")) { + return 0; + } else { + LCONSOLE_ERROR ("Can't parse 'accept=\"%s\"'\n", + accept); + 
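+                /* unrecognised 'accept' token: refuse to start rather than guess */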
return -EINVAL; + } + + if (lnet_count_acceptor_nis(NULL) == 0) /* not required */ + return 0; + + pid = cfs_kernel_thread(lnet_acceptor, (void *)secure, 0); + if (pid < 0) { + CERROR("Can't start acceptor thread: %ld\n", pid); + return -ESRCH; + } + + mutex_down(&lnet_acceptor_state.pta_signal); /* wait for acceptor to startup */ + + if (lnet_acceptor_state.pta_shutdown == 0) { + /* started OK */ + LASSERT (lnet_acceptor_state.pta_sock != NULL); + return 0; + } + + LASSERT (lnet_acceptor_state.pta_sock == NULL); + return -ENETDOWN; +} + +void +lnet_acceptor_stop(void) +{ + if (lnet_acceptor_state.pta_sock == NULL) /* not running */ + return; + + lnet_acceptor_state.pta_shutdown = 1; + libcfs_sock_abort_accept(lnet_acceptor_state.pta_sock); + + /* block until acceptor signals exit */ + mutex_down(&lnet_acceptor_state.pta_signal); +} + +#else /* __KERNEL__ */ + +int +lnet_acceptor_start(void) +{ + return 0; +} + +void +lnet_acceptor_stop(void) +{ +} + +#endif /* !__KERNEL__ */ diff --git a/lnet/lnet/api-errno.c b/lnet/lnet/api-errno.c index 2f32cbf..a158d6e 100644 --- a/lnet/lnet/api-errno.c +++ b/lnet/lnet/api-errno.c @@ -9,41 +9,3 @@ */ /* If you change these, you must update the number table in portals/errno.h */ -const char *ptl_err_str[] = { - "PTL_OK", - "PTL_SEGV", - - "PTL_NO_SPACE", - "PTL_ME_IN_USE", - "PTL_VAL_FAILED", - - "PTL_NAL_FAILED", - "PTL_NO_INIT", - "PTL_IFACE_DUP", - "PTL_IFACE_INVALID", - - "PTL_HANDLE_INVALID", - "PTL_MD_INVALID", - "PTL_ME_INVALID", -/* If you change these, you must update the number table in portals/errno.h */ - "PTL_PROCESS_INVALID", - "PTL_PT_INDEX_INVALID", - - "PTL_SR_INDEX_INVALID", - "PTL_EQ_INVALID", - "PTL_EQ_DROPPED", - - "PTL_EQ_EMPTY", - "PTL_MD_NO_UPDATE", - "PTL_FAIL", - - "PTL_IOV_INVALID", - - "PTL_EQ_IN_USE", - - "PTL_NI_INVALID", - "PTL_MD_ILLEGAL", - - "PTL_MAX_ERRNO" -}; -/* If you change these, you must update the number table in portals/errno.h */ diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 91a307a..82c1d75 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -19,243 +19,1712 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#define DEBUG_SUBSYSTEM S_PORTALS -#include +#define DEBUG_SUBSYSTEM S_LNET +#include -int ptl_init; - -/* Put some magic in the NI handle so uninitialised/zeroed handles are easy - * to spot */ -#define NI_HANDLE_MAGIC 0xebc0de00 -#define NI_HANDLE_MASK 0x000000ff +#ifdef __KERNEL__ +#define D_LNI D_CONSOLE +#else +#define D_LNI D_CONFIG +#endif -static struct nal_t *ptl_nal_table[NAL_MAX_NR + 1]; +lnet_t the_lnet; /* THE state of the network */ #ifdef __KERNEL__ -struct semaphore ptl_mutex; -static void ptl_mutex_enter (void) +static char *ip2nets = ""; +CFS_MODULE_PARM(ip2nets, "s", charp, 0444, + "LNET network <- IP table"); + +static char *networks = ""; +CFS_MODULE_PARM(networks, "s", charp, 0444, + "local networks"); + +static char *routes = ""; +CFS_MODULE_PARM(routes, "s", charp, 0444, + "routes to non-local networks"); + +static char *portals_compatibility = "none"; +CFS_MODULE_PARM(portals_compatibility, "s", charp, 0444, + "wire protocol compatibility: 'strong'|'weak'|'none'"); + +char * +lnet_get_routes(void) +{ + return routes; +} + +char * +lnet_get_networks(void) +{ + char *nets; + int rc; + + if (*networks != 0 && *ip2nets != 0) { + LCONSOLE_ERROR("Please specify EITHER 'networks' or 'ip2nets'" + " but not both at once\n"); + return NULL; + } + + if (*ip2nets != 0) { + rc = lnet_parse_ip2nets(&nets, ip2nets); + return (rc == 0) ? 
nets : NULL; + } + + if (*networks != 0) + return networks; + + return "tcp"; +} + +int +lnet_get_portals_compatibility(void) { - mutex_down (&ptl_mutex); + if (!strcmp(portals_compatibility, "none")) { + return 0; + } + + if (!strcmp(portals_compatibility, "weak")) { + return 1; + LCONSOLE_WARN("Starting in weak portals-compatible mode\n"); + } + + if (!strcmp(portals_compatibility, "strong")) { + return 2; + LCONSOLE_WARN("Starting in strong portals-compatible mode\n"); + } + + LCONSOLE_ERROR("portals_compatibility=\"%s\" not supported\n", + portals_compatibility); + return -EINVAL; } -static void ptl_mutex_exit (void) +void +lnet_init_locks(void) { - mutex_up (&ptl_mutex); + spin_lock_init (&the_lnet.ln_lock); + cfs_waitq_init (&the_lnet.ln_waitq); + init_mutex(&the_lnet.ln_lnd_mutex); + init_mutex(&the_lnet.ln_api_mutex); } + +void +lnet_fini_locks(void) +{ +} + #else -static void ptl_mutex_enter (void) + +char * +lnet_get_routes(void) { + char *str = getenv("LNET_ROUTES"); + + return (str == NULL) ? "" : str; } -static void ptl_mutex_exit (void) +char * +lnet_get_networks (void) { + static char default_networks[256]; + char *networks = getenv ("LNET_NETWORKS"); + char *ip2nets = getenv ("LNET_IP2NETS"); + char *str; + char *sep; + int len; + int nob; + int rc; + struct list_head *tmp; + +#ifdef NOT_YET + if (networks != NULL && ip2nets != NULL) { + LCONSOLE_ERROR("Please set EITHER 'LNET_NETWORKS' or " + "'LNET_IP2NETS' but not both at once\n"); + return NULL; + } + + if (ip2nets != NULL) { + rc = lnet_parse_ip2nets(&networks, ip2nets); + return (rc == 0) ? networks : NULL; + } +#else + ip2nets = NULL; + rc = 0; +#endif + if (networks != NULL) + return networks; + + /* In userland, the default 'networks=' is the list of known net types */ + + len = sizeof(default_networks); + str = default_networks; + *str = 0; + sep = ""; + + list_for_each (tmp, &the_lnet.ln_lnds) { + lnd_t *lnd = list_entry(tmp, lnd_t, lnd_list); + + nob = snprintf(str, len, "%s%s", sep, + libcfs_lnd2str(lnd->lnd_type)); + len -= nob; + if (len < 0) { + /* overflowed the string; leave it where it was */ + *str = 0; + break; + } + + str += nob; + sep = ","; + } + + return default_networks; +} + +int +lnet_get_portals_compatibility(void) +{ + return 0; +} + +# if !HAVE_LIBPTHREAD + +void lnet_init_locks(void) +{ + the_lnet.ln_lock = 0; + the_lnet.ln_lnd_mutex = 0; + the_lnet.ln_api_mutex = 0; } + +void lnet_fini_locks(void) +{ + LASSERT (the_lnet.ln_api_mutex == 0); + LASSERT (the_lnet.ln_lnd_mutex == 0); + LASSERT (the_lnet.ln_lock == 0); +} + +# else + +void lnet_init_locks(void) +{ + pthread_cond_init(&the_lnet.ln_cond, NULL); + pthread_mutex_init(&the_lnet.ln_lock, NULL); + pthread_mutex_init(&the_lnet.ln_lnd_mutex, NULL); + pthread_mutex_init(&the_lnet.ln_api_mutex, NULL); +} + +void lnet_fini_locks(void) +{ + pthread_mutex_destroy(&the_lnet.ln_api_mutex); + pthread_mutex_destroy(&the_lnet.ln_lnd_mutex); + pthread_mutex_destroy(&the_lnet.ln_lock); + pthread_cond_destroy(&the_lnet.ln_cond); +} + +# endif #endif -nal_t *ptl_hndl2nal(ptl_handle_any_t *handle) +void lnet_assert_wire_constants (void) { - unsigned int idx = handle->nal_idx; + /* Wire protocol assertions generated by 'wirecheck' + * running on Linux robert.bartonsoftware.com 2.6.8-1.521 + * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux + * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) */ - /* XXX we really rely on the caller NOT racing with interface - * setup/teardown. 
That ensures her NI handle can't get - * invalidated out from under her (or worse, swapped for a - * completely different interface!) */ + /* Constants... */ + CLASSERT (LNET_PROTO_TCP_MAGIC == 0xeebc0ded); + CLASSERT (LNET_PROTO_TCP_VERSION_MAJOR == 1); + CLASSERT (LNET_PROTO_TCP_VERSION_MINOR == 0); + CLASSERT (LNET_MSG_ACK == 0); + CLASSERT (LNET_MSG_PUT == 1); + CLASSERT (LNET_MSG_GET == 2); + CLASSERT (LNET_MSG_REPLY == 3); + CLASSERT (LNET_MSG_HELLO == 4); - LASSERT (ptl_init); + /* Checks for struct ptl_handle_wire_t */ + CLASSERT ((int)sizeof(lnet_handle_wire_t) == 16); + CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_interface_cookie) == 0); + CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_interface_cookie) == 8); + CLASSERT ((int)offsetof(lnet_handle_wire_t, wh_object_cookie) == 8); + CLASSERT ((int)sizeof(((lnet_handle_wire_t *)0)->wh_object_cookie) == 8); - if (((idx ^ NI_HANDLE_MAGIC) & ~NI_HANDLE_MASK) != 0) - return NULL; + /* Checks for struct lnet_magicversion_t */ + CLASSERT ((int)sizeof(lnet_magicversion_t) == 8); + CLASSERT ((int)offsetof(lnet_magicversion_t, magic) == 0); + CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->magic) == 4); + CLASSERT ((int)offsetof(lnet_magicversion_t, version_major) == 4); + CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_major) == 2); + CLASSERT ((int)offsetof(lnet_magicversion_t, version_minor) == 6); + CLASSERT ((int)sizeof(((lnet_magicversion_t *)0)->version_minor) == 2); + + /* Checks for struct lnet_hdr_t */ + CLASSERT ((int)sizeof(lnet_hdr_t) == 72); + CLASSERT ((int)offsetof(lnet_hdr_t, dest_nid) == 0); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_nid) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, src_nid) == 8); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_nid) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, dest_pid) == 16); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->dest_pid) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, src_pid) == 20); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->src_pid) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, type) == 24); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->type) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, payload_length) == 28); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->payload_length) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, msg) == 32); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg) == 40); - idx &= NI_HANDLE_MASK; + /* Ack */ + CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.dst_wmd) == 32); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.dst_wmd) == 16); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.match_bits) == 48); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.match_bits) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.ack.mlength) == 56); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.ack.mlength) == 4); + + /* Put */ + CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ack_wmd) == 32); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ack_wmd) == 16); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.match_bits) == 48); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.match_bits) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.hdr_data) == 56); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.hdr_data) == 8); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.ptl_index) == 64); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.ptl_index) == 4); + CLASSERT ((int)offsetof(lnet_hdr_t, msg.put.offset) == 68); + CLASSERT ((int)sizeof(((lnet_hdr_t *)0)->msg.put.offset) == 4); + + /* Get */ + CLASSERT ((int)offsetof(lnet_hdr_t, msg.get.return_wmd) == 32); + CLASSERT 
+
+lnd_t *
+lnet_find_lnd_by_type (int type)
+{
+        lnd_t *lnd;
+        struct list_head *tmp;
+
+        /* holding lnd mutex */
+        list_for_each (tmp, &the_lnet.ln_lnds) {
+                lnd = list_entry(tmp, lnd_t, lnd_list);
+
+                if (lnd->lnd_type == type)
+                        return lnd;
+        }
 
-        if (idx > NAL_MAX_NR ||
-            ptl_nal_table[idx] == NULL ||
-            ptl_nal_table[idx]->nal_refct == 0)
-                return NULL;
+        return NULL;
+}
+
+void
+lnet_register_lnd (lnd_t *lnd)
+{
+        LNET_MUTEX_DOWN(&the_lnet.ln_lnd_mutex);
 
-        return ptl_nal_table[idx];
+        LASSERT (the_lnet.ln_init);
+        LASSERT (libcfs_isknown_lnd(lnd->lnd_type));
+        LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == NULL);
+
+        list_add_tail (&lnd->lnd_list, &the_lnet.ln_lnds);
+        lnd->lnd_refcount = 0;
+
+        CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type));
+
+        LNET_MUTEX_UP(&the_lnet.ln_lnd_mutex);
 }
 
-int ptl_register_nal (ptl_interface_t interface, nal_t *nal)
+void
+lnet_unregister_lnd (lnd_t *lnd)
 {
-        int rc;
+        LNET_MUTEX_DOWN(&the_lnet.ln_lnd_mutex);
+
+        LASSERT (the_lnet.ln_init);
+        LASSERT (lnet_find_lnd_by_type(lnd->lnd_type) == lnd);
+        LASSERT (lnd->lnd_refcount == 0);
 
-        ptl_mutex_enter();
+        list_del (&lnd->lnd_list);
+        CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type));
+
+        LNET_MUTEX_UP(&the_lnet.ln_lnd_mutex);
+}
+
+#ifndef LNET_USE_LIB_FREELIST
+
+int
+lnet_descriptor_setup (void)
+{
+        return 0;
+}
+
+void
+lnet_descriptor_cleanup (void)
+{
+}
+
+#else
+
+int
+lnet_freelist_init (lnet_freelist_t *fl, int n, int size)
+{
+        char *space;
+
+        LASSERT (n > 0);
+
+        size += offsetof (lnet_freeobj_t, fo_contents);
+
+        LIBCFS_ALLOC(space, n * size);
+        if (space == NULL)
+                return (-ENOMEM);
+
+        CFS_INIT_LIST_HEAD (&fl->fl_list);
+        fl->fl_objs = space;
+        fl->fl_nobjs = n;
+        fl->fl_objsize = size;
+
+        do
+        {
+                memset (space, 0, size);
+                list_add ((struct list_head *)space, &fl->fl_list);
+                space += size;
+        } while (--n != 0);
+
+        return (0);
+}
+
+void
+lnet_freelist_fini (lnet_freelist_t *fl)
+{
+        struct list_head *el;
+        int count;
+
+        if (fl->fl_nobjs == 0)
+                return;
+
+        count = 0;
+        for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next)
+                count++;
+
+        LASSERT (count == fl->fl_nobjs);
+
+        LIBCFS_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize);
+        memset (fl, 0, sizeof (*fl));
+}
+
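/*
 * [Example sketch, not part of the patch.]  lnet_freelist_init() above
 * carves one contiguous allocation into n fixed-size objects and threads
 * each chunk onto a list through the struct list_head at its start (the
 * extra offsetof(lnet_freeobj_t, fo_contents) bytes reserve room for that
 * link, so allocating later is just unlinking the first entry).  The same
 * idea with plain malloc and a singly-linked list; like the original, it
 * assumes n > 0 and that 'size' can hold the link:
 */
#include <stdlib.h>
#include <string.h>

struct free_node { struct free_node *next; };

struct free_list {
        struct free_node *head;         /* first free chunk */
        char             *objs;         /* base of the one allocation */
};

static int
free_list_init(struct free_list *fl, int n, size_t size)
{
        char *space = malloc(n * size);

        if (space == NULL)
                return -1;

        memset(space, 0, n * size);
        fl->objs = space;
        fl->head = NULL;
        do {
                struct free_node *node = (struct free_node *)space;

                node->next = fl->head;  /* push chunk on the free list */
                fl->head = node;
                space += size;
        } while (--n != 0);

        return 0;
}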
+int
+lnet_descriptor_setup (void)
+{
+        /* NB on failure caller must still call lnet_descriptor_cleanup */
+        /* ****** */
+        int rc;
+
+        memset (&the_lnet.ln_free_mes, 0, sizeof (the_lnet.ln_free_mes));
+        memset (&the_lnet.ln_free_msgs, 0, sizeof (the_lnet.ln_free_msgs));
+        memset (&the_lnet.ln_free_mds, 0, sizeof (the_lnet.ln_free_mds));
+        memset (&the_lnet.ln_free_eqs, 0, sizeof (the_lnet.ln_free_eqs));
+
+        rc = lnet_freelist_init(&the_lnet.ln_free_mes,
+                                MAX_MES, sizeof (lnet_me_t));
+        if (rc != 0)
+                return (rc);
+
+        rc = lnet_freelist_init(&the_lnet.ln_free_msgs,
+                                MAX_MSGS, sizeof (lnet_msg_t));
+        if (rc != 0)
+                return (rc);
+
+        rc = lnet_freelist_init(&the_lnet.ln_free_mds,
+                                MAX_MDS, sizeof (lnet_libmd_t));
+        if (rc != 0)
+                return (rc);
+
+        rc = lnet_freelist_init(&the_lnet.ln_free_eqs,
+                                MAX_EQS, sizeof (lnet_eq_t));
+        return (rc);
+}
+
+void
+lnet_descriptor_cleanup (void)
+{
+        lnet_freelist_fini (&the_lnet.ln_free_mes);
+        lnet_freelist_fini (&the_lnet.ln_free_msgs);
+        lnet_freelist_fini (&the_lnet.ln_free_mds);
+        lnet_freelist_fini (&the_lnet.ln_free_eqs);
+}
+
+#endif
+
+__u64
+lnet_create_interface_cookie (void)
+{
+        /* NB the interface cookie in wire handles guards against delayed
+         * replies and ACKs appearing valid after reboot. Initialisation time,
+         * even if it's only implemented to millisecond resolution is probably
+         * easily good enough. */
+        struct timeval tv;
+        __u64 cookie;
+#ifndef __KERNEL__
+        int rc = gettimeofday (&tv, NULL);
+        LASSERT (rc == 0);
+#else
+        do_gettimeofday(&tv);
+#endif
+        cookie = tv.tv_sec;
+        cookie *= 1000000;
+        cookie += tv.tv_usec;
+        return cookie;
+}
+
+int
+lnet_setup_handle_hash (void)
+{
+        int i;
+
+        /* Arbitrary choice of hash table size */
+#ifdef __KERNEL__
+        the_lnet.ln_lh_hash_size = CFS_PAGE_SIZE / sizeof (struct list_head);
+#else
+        the_lnet.ln_lh_hash_size = (MAX_MES + MAX_MDS + MAX_EQS)/4;
+#endif
+        LIBCFS_ALLOC(the_lnet.ln_lh_hash_table,
+                     the_lnet.ln_lh_hash_size * sizeof (struct list_head));
+        if (the_lnet.ln_lh_hash_table == NULL)
+                return (-ENOMEM);
+
+        for (i = 0; i < the_lnet.ln_lh_hash_size; i++)
+                CFS_INIT_LIST_HEAD (&the_lnet.ln_lh_hash_table[i]);
+
+        the_lnet.ln_next_object_cookie = LNET_COOKIE_TYPES;
+
+        return (0);
+}
+
+void
+lnet_cleanup_handle_hash (void)
+{
+        if (the_lnet.ln_lh_hash_table == NULL)
+                return;
+
+        LIBCFS_FREE(the_lnet.ln_lh_hash_table,
+                    the_lnet.ln_lh_hash_size * sizeof (struct list_head));
+}
+
+lnet_libhandle_t *
+lnet_lookup_cookie (__u64 cookie, int type)
+{
+        /* ALWAYS called with LNET_LOCK held */
+        struct list_head *list;
+        struct list_head *el;
+        unsigned int hash;
+
+        if ((cookie & (LNET_COOKIE_TYPES - 1)) != type)
+                return (NULL);
 
-        if (interface < 0 || interface > NAL_MAX_NR)
-                rc = PTL_IFACE_INVALID;
-        else if (ptl_nal_table[interface] != NULL)
-                rc = PTL_IFACE_DUP;
-        else {
-                rc = PTL_OK;
-                ptl_nal_table[interface] = nal;
-                LASSERT(nal->nal_refct == 0);
+        hash = ((unsigned int)cookie) % the_lnet.ln_lh_hash_size;
+        list = &the_lnet.ln_lh_hash_table[hash];
+
+        list_for_each (el, list) {
+                lnet_libhandle_t *lh = list_entry (el, lnet_libhandle_t,
+                                                   lh_hash_chain);
+
+                if (lh->lh_cookie == cookie)
+                        return (lh);
         }
+
+        return (NULL);
+}
 
-        ptl_mutex_exit();
-        return (rc);
+void
+lnet_initialise_handle (lnet_libhandle_t *lh, int type)
+{
+        /* ALWAYS called with LNET_LOCK held */
+        unsigned int hash;
+
+        LASSERT (type >= 0 && type < LNET_COOKIE_TYPES);
+        lh->lh_cookie = the_lnet.ln_next_object_cookie | type;
+        the_lnet.ln_next_object_cookie += LNET_COOKIE_TYPES;
+
+        hash = ((unsigned int)lh->lh_cookie) % the_lnet.ln_lh_hash_size;
+        list_add (&lh->lh_hash_chain, &the_lnet.ln_lh_hash_table[hash]);
+}
+
+void
+lnet_invalidate_handle (lnet_libhandle_t *lh)
+{
+        /* ALWAYS called with LNET_LOCK held */
+        list_del (&lh->lh_hash_chain);
+}
+
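/*
 * [Example sketch, not part of the patch.]  The cookie scheme above keeps
 * the object type in the low bits of every cookie and advances the counter
 * in steps of LNET_COOKIE_TYPES, so no two live objects ever share a cookie
 * and a stale or mistyped handle fails the cheap mask test before any hash
 * walk.  Restated with an illustrative power-of-two type count:
 */
#define COOKIE_TYPES 4ULL               /* must be a power of 2 */

static unsigned long long next_cookie = COOKIE_TYPES;

static unsigned long long
cookie_alloc(unsigned int type)         /* type < COOKIE_TYPES */
{
        unsigned long long cookie = next_cookie | type;

        next_cookie += COOKIE_TYPES;    /* low bits stay clear */
        return cookie;
}

static int
cookie_type_ok(unsigned long long cookie, unsigned int type)
{
        /* cheap reject, as in lnet_lookup_cookie() above */
        return (cookie & (COOKIE_TYPES - 1)) == type;
}

static unsigned int
cookie_bucket(unsigned long long cookie, unsigned int hash_size)
{
        return (unsigned int)cookie % hash_size;
}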
+int +lnet_init_finalizers(void) +{ +#ifdef __KERNEL__ + int i; + + the_lnet.ln_nfinalizers = num_online_cpus(); + + LIBCFS_ALLOC(the_lnet.ln_finalizers, + the_lnet.ln_nfinalizers * + sizeof(*the_lnet.ln_finalizers)); + if (the_lnet.ln_finalizers == NULL) { + CERROR("Can't allocate ln_finalizers\n"); + return -ENOMEM; + } + + for (i = 0; i < the_lnet.ln_nfinalizers; i++) + the_lnet.ln_finalizers[i] = NULL; +#else + the_lnet.ln_finalizing = 0; +#endif + + CFS_INIT_LIST_HEAD(&the_lnet.ln_finalizeq); + return 0; +} + +void +lnet_fini_finalizers(void) +{ +#ifdef __KERNEL__ + int i; + + for (i = 0; i < the_lnet.ln_nfinalizers; i++) + LASSERT (the_lnet.ln_finalizers[i] == NULL); + + LIBCFS_FREE(the_lnet.ln_finalizers, + the_lnet.ln_nfinalizers * + sizeof(*the_lnet.ln_finalizers)); +#else + LASSERT (!the_lnet.ln_finalizing); +#endif + LASSERT (list_empty(&the_lnet.ln_finalizeq)); } -void ptl_unregister_nal (ptl_interface_t interface) +int +lnet_prepare(lnet_pid_t requested_pid) { - LASSERT(interface >= 0 && interface <= NAL_MAX_NR); - LASSERT(ptl_nal_table[interface] != NULL); - LASSERT(ptl_nal_table[interface]->nal_refct == 0); + /* Prepare to bring up the network */ + int rc = 0; + int i; + + LASSERT (the_lnet.ln_refcount == 0); + + the_lnet.ln_routing = 0; + +#ifdef __KERNEL__ + LASSERT ((requested_pid & LNET_PID_USERFLAG) == 0); + the_lnet.ln_pid = requested_pid; +#else + /* My PID must be unique on this node and flag I'm userspace */ + the_lnet.ln_pid = getpid() | LNET_PID_USERFLAG; +#endif + + rc = lnet_descriptor_setup(); + if (rc != 0) + goto failed0; + + memset(&the_lnet.ln_counters, 0, + sizeof(the_lnet.ln_counters)); + + CFS_INIT_LIST_HEAD (&the_lnet.ln_active_msgs); + CFS_INIT_LIST_HEAD (&the_lnet.ln_active_mds); + CFS_INIT_LIST_HEAD (&the_lnet.ln_active_eqs); + CFS_INIT_LIST_HEAD (&the_lnet.ln_test_peers); + CFS_INIT_LIST_HEAD (&the_lnet.ln_nis); + CFS_INIT_LIST_HEAD (&the_lnet.ln_zombie_nis); + CFS_INIT_LIST_HEAD (&the_lnet.ln_remote_nets); + CFS_INIT_LIST_HEAD (&the_lnet.ln_routers); + + the_lnet.ln_interface_cookie = lnet_create_interface_cookie(); + + lnet_init_rtrpools(); + + rc = lnet_setup_handle_hash (); + if (rc != 0) + goto failed0; + + rc = lnet_create_peer_table(); + if (rc != 0) + goto failed1; + + rc = lnet_init_finalizers(); + if (rc != 0) + goto failed2; + + the_lnet.ln_nportals = MAX_PORTALS; + LIBCFS_ALLOC(the_lnet.ln_portals, + the_lnet.ln_nportals * + sizeof(*the_lnet.ln_portals)); + if (the_lnet.ln_portals == NULL) { + rc = -ENOMEM; + goto failed3; + } + + for (i = 0; i < the_lnet.ln_nportals; i++) { + CFS_INIT_LIST_HEAD(&(the_lnet.ln_portals[i].ptl_ml)); + CFS_INIT_LIST_HEAD(&(the_lnet.ln_portals[i].ptl_msgq)); + the_lnet.ln_portals[i].ptl_options = 0; + } + + return 0; - ptl_mutex_enter(); + failed3: + lnet_fini_finalizers(); + failed2: + lnet_destroy_peer_table(); + failed1: + lnet_cleanup_handle_hash(); + failed0: + lnet_descriptor_cleanup(); + return rc; +} + +int +lnet_unprepare (void) +{ + int idx; - ptl_nal_table[interface] = NULL; + /* NB no LNET_LOCK since this is the last reference. 
All LND instances + * have shut down already, so it is safe to unlink and free all + * descriptors, even those that appear committed to a network op (eg MD + * with non-zero pending count) */ - ptl_mutex_exit(); + lnet_fail_nid(LNET_NID_ANY, 0); + + LASSERT (list_empty(&the_lnet.ln_test_peers)); + LASSERT (the_lnet.ln_refcount == 0); + LASSERT (list_empty(&the_lnet.ln_nis)); + LASSERT (list_empty(&the_lnet.ln_zombie_nis)); + LASSERT (the_lnet.ln_nzombie_nis == 0); + + for (idx = 0; idx < the_lnet.ln_nportals; idx++) { + + LNetClearLazyPortal(idx); + LASSERT (list_empty(&the_lnet.ln_portals[idx].ptl_msgq)); + + while (!list_empty (&the_lnet.ln_portals[idx].ptl_ml)) { + lnet_me_t *me = list_entry (the_lnet.ln_portals[idx].ptl_ml.next, + lnet_me_t, me_list); + + CERROR ("Active me %p on exit\n", me); + list_del (&me->me_list); + lnet_me_free (me); + } + } + + while (!list_empty (&the_lnet.ln_active_mds)) { + lnet_libmd_t *md = list_entry (the_lnet.ln_active_mds.next, + lnet_libmd_t, md_list); + + CERROR ("Active md %p on exit\n", md); + list_del (&md->md_list); + lnet_md_free (md); + } + + while (!list_empty (&the_lnet.ln_active_eqs)) { + lnet_eq_t *eq = list_entry (the_lnet.ln_active_eqs.next, + lnet_eq_t, eq_list); + + CERROR ("Active eq %p on exit\n", eq); + list_del (&eq->eq_list); + lnet_eq_free (eq); + } + + while (!list_empty (&the_lnet.ln_active_msgs)) { + lnet_msg_t *msg = list_entry (the_lnet.ln_active_msgs.next, + lnet_msg_t, msg_activelist); + + CERROR ("Active msg %p on exit\n", msg); + LASSERT (msg->msg_onactivelist); + list_del (&msg->msg_activelist); + lnet_msg_free (msg); + } + + LIBCFS_FREE(the_lnet.ln_portals, + the_lnet.ln_nportals * sizeof(*the_lnet.ln_portals)); + + lnet_free_rtrpools(); + lnet_fini_finalizers(); + lnet_destroy_peer_table(); + lnet_cleanup_handle_hash(); + lnet_descriptor_cleanup(); + + return (0); } -int PtlInit(int *max_interfaces) +lnet_ni_t * +lnet_net2ni_locked (__u32 net) { - LASSERT(!strcmp(ptl_err_str[PTL_MAX_ERRNO], "PTL_MAX_ERRNO")); + struct list_head *tmp; + lnet_ni_t *ni; + + list_for_each (tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (lnet_ptlcompat_matchnet(LNET_NIDNET(ni->ni_nid), net)) { + lnet_ni_addref_locked(ni); + return ni; + } + } + + return NULL; +} - /* If this assertion fails, we need more bits in NI_HANDLE_MASK and - * to shift NI_HANDLE_MAGIC left appropriately */ - LASSERT (NAL_MAX_NR < (NI_HANDLE_MASK + 1)); +int +lnet_islocalnet (__u32 net) +{ + lnet_ni_t *ni; - if (max_interfaces != NULL) - *max_interfaces = NAL_MAX_NR + 1; + LNET_LOCK(); + ni = lnet_net2ni_locked(net); + if (ni != NULL) + lnet_ni_decref_locked(ni); + LNET_UNLOCK(); - ptl_mutex_enter(); + return ni != NULL; +} - if (!ptl_init) { - /* NULL pointers, clear flags */ - memset(ptl_nal_table, 0, sizeof(ptl_nal_table)); -#ifndef __KERNEL__ - /* Kernel NALs register themselves when their module loads, - * and unregister themselves when their module is unloaded. - * Userspace NALs, are plugged in explicitly here... */ - { - extern nal_t procapi_nal; - - /* XXX pretend it's socknal to keep liblustre happy... 
*/ - ptl_nal_table[SOCKNAL] = &procapi_nal; - LASSERT (procapi_nal.nal_refct == 0); +lnet_ni_t * +lnet_nid2ni_locked (lnet_nid_t nid) +{ + struct list_head *tmp; + lnet_ni_t *ni; + + list_for_each (tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (lnet_ptlcompat_matchnid(ni->ni_nid, nid)) { + lnet_ni_addref_locked(ni); + return ni; } -#endif - ptl_init = 1; } + + return NULL; +} - ptl_mutex_exit(); +int +lnet_islocalnid (lnet_nid_t nid) +{ + lnet_ni_t *ni; - return PTL_OK; + LNET_LOCK(); + ni = lnet_nid2ni_locked(nid); + if (ni != NULL) + lnet_ni_decref_locked(ni); + LNET_UNLOCK(); + + return ni != NULL; } -void PtlFini(void) +int +lnet_count_acceptor_nis (lnet_ni_t **first_ni) { - nal_t *nal; - int i; + /* Return the # of NIs that need the acceptor. Return the first one in + * *first_ni so the acceptor can pass it connections "blind" to retain + * binary compatibility. */ + int count = 0; +#ifdef __KERNEL__ + struct list_head *tmp; + lnet_ni_t *ni; - ptl_mutex_enter(); + LNET_LOCK(); + list_for_each (tmp, &the_lnet.ln_nis) { + ni = list_entry(tmp, lnet_ni_t, ni_list); - if (ptl_init) { - for (i = 0; i <= NAL_MAX_NR; i++) { + if (ni->ni_lnd->lnd_accept != NULL) { + /* This LND uses the acceptor */ + if (count == 0 && first_ni != NULL) { + lnet_ni_addref_locked(ni); + *first_ni = ni; + } + count++; + } + } + + LNET_UNLOCK(); +#endif + return count; +} - nal = ptl_nal_table[i]; - if (nal == NULL) - continue; - - if (nal->nal_refct != 0) { - CWARN("NAL %x has outstanding refcount %d\n", - i, nal->nal_refct); - nal->nal_ni_fini(nal); +void +lnet_shutdown_lndnis (void) +{ + int i; + int islo; + lnet_ni_t *ni; + + /* NB called holding the global mutex */ + + /* All quiet on the API front */ + LASSERT (!the_lnet.ln_shutdown); + LASSERT (the_lnet.ln_refcount == 0); + LASSERT (list_empty(&the_lnet.ln_zombie_nis)); + LASSERT (the_lnet.ln_nzombie_nis == 0); + LASSERT (list_empty(&the_lnet.ln_remote_nets)); + + LNET_LOCK(); + the_lnet.ln_shutdown = 1; /* flag shutdown */ + + /* Unlink NIs from the global table */ + while (!list_empty(&the_lnet.ln_nis)) { + ni = list_entry(the_lnet.ln_nis.next, + lnet_ni_t, ni_list); + list_del (&ni->ni_list); + + the_lnet.ln_nzombie_nis++; + lnet_ni_decref_locked(ni); /* drop apini's ref */ + } + + /* Drop the cached eqwait NI. */ + if (the_lnet.ln_eqwaitni != NULL) { + lnet_ni_decref_locked(the_lnet.ln_eqwaitni); + the_lnet.ln_eqwaitni = NULL; + } + + /* Drop the cached loopback NI. */ + if (the_lnet.ln_loni != NULL) { + lnet_ni_decref_locked(the_lnet.ln_loni); + the_lnet.ln_loni = NULL; + } + + LNET_UNLOCK(); + /* Clear the peer table and wait for all peers to go (they hold refs on + * their NIs) */ + + lnet_clear_peer_table(); + + LNET_LOCK(); + /* Now wait for the NI's I just nuked to show up on apini_zombie_nis + * and shut them down in guaranteed thread context */ + i = 2; + while (the_lnet.ln_nzombie_nis != 0) { + + while (list_empty(&the_lnet.ln_zombie_nis)) { + LNET_UNLOCK(); + ++i; + if ((i & (-i)) == i) + CDEBUG(D_WARNING,"Waiting for %d zombie NIs\n", + the_lnet.ln_nzombie_nis); + cfs_pause(cfs_time_seconds(1)); + LNET_LOCK(); + } + + ni = list_entry(the_lnet.ln_zombie_nis.next, + lnet_ni_t, ni_list); + list_del(&ni->ni_list); + ni->ni_lnd->lnd_refcount--; + + LNET_UNLOCK(); + + islo = ni->ni_lnd->lnd_type == LOLND; + + LASSERT (!in_interrupt ()); + (ni->ni_lnd->lnd_shutdown)(ni); + + /* can't deref lnd anymore now; it might have unregistered + * itself... 
*/ + + if (!islo) + CDEBUG(D_LNI, "Removed LNI %s\n", + libcfs_nid2str(ni->ni_nid)); + + LIBCFS_FREE(ni, sizeof(*ni)); + + LNET_LOCK(); + the_lnet.ln_nzombie_nis--; + } + + the_lnet.ln_shutdown = 0; + LNET_UNLOCK(); + + if (the_lnet.ln_network_tokens != NULL) { + LIBCFS_FREE(the_lnet.ln_network_tokens, + the_lnet.ln_network_tokens_nob); + the_lnet.ln_network_tokens = NULL; + } +} + +int +lnet_startup_lndnis (void) +{ + lnd_t *lnd; + lnet_ni_t *ni; + struct list_head nilist; + int rc = 0; + int lnd_type; + int nicount = 0; + char *nets = lnet_get_networks(); + + INIT_LIST_HEAD(&nilist); + + if (nets == NULL) + goto failed; + + rc = lnet_parse_networks(&nilist, nets); + if (rc != 0) + goto failed; + + while (!list_empty(&nilist)) { + ni = list_entry(nilist.next, lnet_ni_t, ni_list); + lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid)); + + LASSERT (libcfs_isknown_lnd(lnd_type)); + + LNET_MUTEX_DOWN(&the_lnet.ln_lnd_mutex); + lnd = lnet_find_lnd_by_type(lnd_type); + +#ifdef __KERNEL__ + if (lnd == NULL) { + LNET_MUTEX_UP(&the_lnet.ln_lnd_mutex); + rc = request_module(libcfs_lnd2modname(lnd_type)); + LNET_MUTEX_DOWN(&the_lnet.ln_lnd_mutex); + + lnd = lnet_find_lnd_by_type(lnd_type); + if (lnd == NULL) { + LNET_MUTEX_UP(&the_lnet.ln_lnd_mutex); + CERROR("Can't load LND %s, module %s, rc=%d\n", + libcfs_lnd2str(lnd_type), + libcfs_lnd2modname(lnd_type), rc); +#ifndef CONFIG_KMOD + LCONSOLE_ERROR("Your kernel must be compiled " + "with CONFIG_KMOD set for " + "automatic module loading."); +#endif + goto failed; } - - ptl_nal_table[i] = NULL; + } +#else + if (lnd == NULL) { + LNET_MUTEX_UP(&the_lnet.ln_lnd_mutex); + CERROR("LND %s not supported\n", + libcfs_lnd2str(lnd_type)); + goto failed; + } +#endif + + ni->ni_refcount = 1; + + LNET_LOCK(); + lnd->lnd_refcount++; + LNET_UNLOCK(); + + ni->ni_lnd = lnd; + + rc = (lnd->lnd_startup)(ni); + + LNET_MUTEX_UP(&the_lnet.ln_lnd_mutex); + + if (rc != 0) { + LCONSOLE_ERROR("Error %d starting up LNI %s\n", + rc, libcfs_lnd2str(lnd->lnd_type)); + LNET_LOCK(); + lnd->lnd_refcount--; + LNET_UNLOCK(); + goto failed; } - ptl_init = 0; + list_del(&ni->ni_list); + + LNET_LOCK(); + list_add_tail(&ni->ni_list, &the_lnet.ln_nis); + LNET_UNLOCK(); + + if (lnd->lnd_type == LOLND) { + lnet_ni_addref(ni); + LASSERT (the_lnet.ln_loni == NULL); + the_lnet.ln_loni = ni; + continue; + } + +#ifndef __KERNEL__ + if (lnd->lnd_wait != NULL) { + if (the_lnet.ln_eqwaitni == NULL) { + lnet_ni_addref(ni); + the_lnet.ln_eqwaitni = ni; + } + } else { +# if !HAVE_LIBPTHREAD + LCONSOLE_ERROR("LND %s not supported in a " + "single-threaded runtime\n", + libcfs_lnd2str(lnd_type)); + goto failed; +# endif + } +#endif + if (ni->ni_peertxcredits == 0 || + ni->ni_maxtxcredits == 0) { + LCONSOLE_ERROR("LNI %s has no %scredits\n", + libcfs_lnd2str(lnd->lnd_type), + ni->ni_peertxcredits == 0 ? 
+ "" : "per-peer "); + goto failed; + } + + ni->ni_txcredits = ni->ni_mintxcredits = ni->ni_maxtxcredits; + + CDEBUG(D_LNI, "Added LNI %s [%d/%d]\n", + libcfs_nid2str(ni->ni_nid), + ni->ni_peertxcredits, ni->ni_txcredits); + + /* Handle nidstrings for network 0 just like this one */ + if (the_lnet.ln_ptlcompat > 0) { + if (nicount > 0) { + LCONSOLE_ERROR("Can't run > 1 network when " + "portals_compatibility is set\n"); + goto failed; + } + libcfs_setnet0alias(lnd->lnd_type); + } + + nicount++; } - - ptl_mutex_exit(); + + if (the_lnet.ln_eqwaitni != NULL && nicount > 1) { + lnd_type = the_lnet.ln_eqwaitni->ni_lnd->lnd_type; + LCONSOLE_ERROR("LND %s can only run single-network\n", + libcfs_lnd2str(lnd_type)); + goto failed; + } + + return 0; + + failed: + lnet_shutdown_lndnis(); + + while (!list_empty(&nilist)) { + ni = list_entry(nilist.next, lnet_ni_t, ni_list); + list_del(&ni->ni_list); + LIBCFS_FREE(ni, sizeof(*ni)); + } + + return -ENETDOWN; } -int PtlNIInit(ptl_interface_t interface, ptl_pid_t requested_pid, - ptl_ni_limits_t *desired_limits, ptl_ni_limits_t *actual_limits, - ptl_handle_ni_t *handle) +int +LNetInit(void) { - nal_t *nal; - int i; int rc; - if (!ptl_init) - return PTL_NO_INIT; + lnet_assert_wire_constants (); + LASSERT (!the_lnet.ln_init); - ptl_mutex_enter (); + memset(&the_lnet, 0, sizeof(the_lnet)); - if (interface == PTL_IFACE_DEFAULT) { - for (i = 0; i <= NAL_MAX_NR; i++) - if (ptl_nal_table[i] != NULL) { - interface = i; - break; + rc = lnet_get_portals_compatibility(); + if (rc < 0) + return rc; + + lnet_init_locks(); + CFS_INIT_LIST_HEAD(&the_lnet.ln_lnds); + the_lnet.ln_ptlcompat = rc; + the_lnet.ln_refcount = 0; + the_lnet.ln_init = 1; + +#ifdef __KERNEL__ + /* All LNDs apart from the LOLND are in separate modules. They + * register themselves when their module loads, and unregister + * themselves when their module is unloaded. */ +#else + /* Register LNDs + * NB the order here determines default 'networks=' order */ +# ifdef CRAY_XT3 + LNET_REGISTER_ULND(the_ptllnd); +# endif +# if HAVE_LIBPTHREAD + LNET_REGISTER_ULND(the_tcplnd); +# endif +#endif + lnet_register_lnd(&the_lolnd); + return 0; +} + +void +LNetFini(void) +{ + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount == 0); + + while (!list_empty(&the_lnet.ln_lnds)) + lnet_unregister_lnd(list_entry(the_lnet.ln_lnds.next, + lnd_t, lnd_list)); + lnet_fini_locks(); + + the_lnet.ln_init = 0; +} + +int +LNetNIInit(lnet_pid_t requested_pid) +{ + int im_a_router = 0; + int rc; + + LNET_MUTEX_DOWN(&the_lnet.ln_api_mutex); + + LASSERT (the_lnet.ln_init); + CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount); + + if (the_lnet.ln_refcount > 0) { + rc = the_lnet.ln_refcount++; + goto out; + } + + if (requested_pid == LNET_PID_ANY) { + /* Don't instantiate LNET just for me */ + rc = -ENETDOWN; + goto failed0; + } + + rc = lnet_prepare(requested_pid); + if (rc != 0) + goto failed0; + + rc = lnet_startup_lndnis(); + if (rc != 0) + goto failed1; + + rc = lnet_parse_routes(lnet_get_routes(), &im_a_router); + if (rc != 0) + goto failed2; + + rc = lnet_check_routes(); + if (rc != 0) + goto failed2; + + rc = lnet_alloc_rtrpools(im_a_router); + if (rc != 0) + goto failed2; + + rc = lnet_acceptor_start(); + if (rc != 0) + goto failed2; + + the_lnet.ln_refcount = 1; + /* Now I may use my own API functions... 
*/ + + rc = lnet_router_checker_start(); + if (rc != 0) + goto failed3; + + rc = lnet_ping_target_init(); + if (rc != 0) + goto failed4; + + lnet_proc_init(); + goto out; + + failed4: + lnet_router_checker_stop(); + failed3: + the_lnet.ln_refcount = 0; + lnet_acceptor_stop(); + failed2: + lnet_destroy_routes(); + lnet_shutdown_lndnis(); + failed1: + lnet_unprepare(); + failed0: + LASSERT (rc < 0); + out: + LNET_MUTEX_UP(&the_lnet.ln_api_mutex); + return rc; +} + +int +LNetNIFini() +{ + LNET_MUTEX_DOWN(&the_lnet.ln_api_mutex); + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if (the_lnet.ln_refcount != 1) { + the_lnet.ln_refcount--; + } else { + LASSERT (!the_lnet.ln_niinit_self); + + lnet_proc_fini(); + lnet_ping_target_fini(); + lnet_router_checker_stop(); + + /* Teardown fns that use my own API functions BEFORE here */ + the_lnet.ln_refcount = 0; + + lnet_acceptor_stop(); + lnet_destroy_routes(); + lnet_shutdown_lndnis(); + lnet_unprepare(); + } + + LNET_MUTEX_UP(&the_lnet.ln_api_mutex); + return 0; +} + +int +LNetCtl(unsigned int cmd, void *arg) +{ + struct libcfs_ioctl_data *data = arg; + lnet_process_id_t id; + lnet_ni_t *ni; + int rc; + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + switch (cmd) { + case IOC_LIBCFS_GET_NI: + rc = LNetGetId(data->ioc_count, &id); + data->ioc_nid = id.nid; + return rc; + + case IOC_LIBCFS_FAIL_NID: + return lnet_fail_nid(data->ioc_nid, data->ioc_count); + + case IOC_LIBCFS_ADD_ROUTE: + rc = lnet_add_route(data->ioc_net, data->ioc_count, + data->ioc_nid); + return (rc != 0) ? rc : lnet_check_routes(); + + case IOC_LIBCFS_DEL_ROUTE: + return lnet_del_route(data->ioc_net, data->ioc_nid); + + case IOC_LIBCFS_GET_ROUTE: + return lnet_get_route(data->ioc_count, + &data->ioc_net, &data->ioc_count, + &data->ioc_nid, &data->ioc_flags); + case IOC_LIBCFS_NOTIFY_ROUTER: + return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, + (time_t)data->ioc_u64[0]); + + case IOC_LIBCFS_PORTALS_COMPATIBILITY: + return the_lnet.ln_ptlcompat; + + case IOC_LIBCFS_LNET_DIST: + rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]); + if (rc < 0 && rc != -EHOSTUNREACH) + return rc; + + data->ioc_u32[0] = rc; + return 0; + + case IOC_LIBCFS_TESTPROTOCOMPAT: + LNET_LOCK(); + the_lnet.ln_testprotocompat = data->ioc_flags; + LNET_UNLOCK(); + return 0; + + case IOC_LIBCFS_PING: + rc = lnet_ping((lnet_process_id_t) {.nid = data->ioc_nid, + .pid = data->ioc_u32[0]}, + data->ioc_u32[1], /* timeout */ + (lnet_process_id_t *)data->ioc_pbuf1, + data->ioc_plen1/sizeof(lnet_process_id_t)); + if (rc < 0) + return rc; + data->ioc_count = rc; + return 0; + + case IOC_LIBCFS_DEBUG_PEER: { + /* CAVEAT EMPTOR: this one designed for calling directly; not + * via an ioctl */ + lnet_process_id_t *id = arg; + + lnet_debug_peer(id->nid); + + ni = lnet_net2ni(LNET_NIDNET(id->nid)); + if (ni == NULL) { + CDEBUG(D_WARNING, "No NI for %s\n", libcfs_id2str(*id)); + } else { + if (ni->ni_lnd->lnd_ctl == NULL) { + CDEBUG(D_WARNING, "No ctl for %s\n", + libcfs_id2str(*id)); + } else { + (void)ni->ni_lnd->lnd_ctl(ni, cmd, arg); } - /* NB if no interfaces are registered, 'interface' will - * fail the valid test below */ + + lnet_ni_decref(ni); + } + return 0; } - - if (interface < 0 || - interface > NAL_MAX_NR || - ptl_nal_table[interface] == NULL) { - GOTO(out, rc = PTL_IFACE_INVALID); + + default: + ni = lnet_net2ni(data->ioc_net); + if (ni == NULL) + return -EINVAL; + + if (ni->ni_lnd->lnd_ctl == NULL) + rc = -EINVAL; + else + rc = 
ni->ni_lnd->lnd_ctl(ni, cmd, arg); + + lnet_ni_decref(ni); + return rc; } + /* not reached */ +} - nal = ptl_nal_table[interface]; - nal->nal_handle.nal_idx = (NI_HANDLE_MAGIC & ~NI_HANDLE_MASK) | interface; - nal->nal_handle.cookie = 0; - - CDEBUG(D_OTHER, "Starting up NAL (%x) refs %d\n", interface, nal->nal_refct); - rc = nal->nal_ni_init(nal, requested_pid, desired_limits, actual_limits); +int +LNetGetId(unsigned int index, lnet_process_id_t *id) +{ + lnet_ni_t *ni; + struct list_head *tmp; + int rc = -ENOENT; + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + LNET_LOCK(); + + list_for_each(tmp, &the_lnet.ln_nis) { + if (index-- != 0) + continue; + + ni = list_entry(tmp, lnet_ni_t, ni_list); - if (rc != PTL_OK) { - CERROR("Error %d starting up NAL %x, refs %d\n", rc, - interface, nal->nal_refct); - GOTO(out, rc); + id->nid = ni->ni_nid; + id->pid = the_lnet.ln_pid; + rc = 0; + break; } + + LNET_UNLOCK(); + + return rc; +} + +void +LNetSnprintHandle(char *str, int len, lnet_handle_any_t h) +{ + snprintf(str, len, LPX64, h.cookie); +} + + +int +lnet_ping_target_init(void) +{ + lnet_handle_me_t meh; + lnet_process_id_t id; + int rc; + int rc2; + int n; + int infosz; + int i; - if (nal->nal_refct != 0) { - /* Caller gets to know if this was the first ref or not */ - rc = PTL_IFACE_DUP; + for (n = 0; ; n++) { + rc = LNetGetId(n, &id); + if (rc == -ENOENT) + break; + + LASSERT (rc == 0); + } + + infosz = offsetof(lnet_ping_info_t, pi_nid[n]); + LIBCFS_ALLOC(the_lnet.ln_ping_info, infosz); + if (the_lnet.ln_ping_info == NULL) { + CERROR("Can't allocate ping info[%d]\n", n); + return -ENOMEM; + } + + the_lnet.ln_ping_info->pi_magic = LNET_PROTO_PING_MAGIC; + the_lnet.ln_ping_info->pi_version = LNET_PROTO_PING_VERSION; + the_lnet.ln_ping_info->pi_pid = the_lnet.ln_pid; + the_lnet.ln_ping_info->pi_nnids = n; + + for (i = 0; i < n; i++) { + rc = LNetGetId(i, &id); + LASSERT (rc == 0); + the_lnet.ln_ping_info->pi_nid[i] = id.nid; } - nal->nal_refct++; - *handle = nal->nal_handle; + /* We can have a tiny EQ since we only need to see the unlink event on + * teardown, which by definition is the last one! 
*/ + rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &the_lnet.ln_ping_target_eq); + if (rc != 0) { + CERROR("Can't allocate ping EQ: %d\n", rc); + goto failed_0; + } - out: - ptl_mutex_exit (); + rc = LNetMEAttach(LNET_RESERVED_PORTAL, + (lnet_process_id_t){.nid = LNET_NID_ANY, + .pid = LNET_PID_ANY}, + LNET_PROTO_PING_MATCHBITS, 0LL, + LNET_UNLINK, LNET_INS_AFTER, + &meh); + if (rc != 0) { + CERROR("Can't create ping ME: %d\n", rc); + goto failed_1; + } + + rc = LNetMDAttach(meh, + (lnet_md_t){.start = the_lnet.ln_ping_info, + .length = infosz, + .threshold = LNET_MD_THRESH_INF, + .options = (LNET_MD_OP_GET | + LNET_MD_TRUNCATE | + LNET_MD_MANAGE_REMOTE), + .eq_handle = the_lnet.ln_ping_target_eq}, + LNET_RETAIN, + &the_lnet.ln_ping_target_md); + if (rc != 0) { + CERROR("Can't attach ping MD: %d\n", rc); + goto failed_2; + } + + return 0; + + failed_2: + rc2 = LNetMEUnlink(meh); + LASSERT (rc2 == 0); + failed_1: + rc2 = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT (rc2 == 0); + failed_0: + LIBCFS_FREE(the_lnet.ln_ping_info, infosz); return rc; } -int PtlNIFini(ptl_handle_ni_t ni) +void +lnet_ping_target_fini(void) { - nal_t *nal; - int idx; + lnet_event_t event; + int rc; + int which; + int timeout_ms = 1000; + cfs_sigset_t blocked = cfs_block_allsigs(); + + LNetMDUnlink(the_lnet.ln_ping_target_md); + /* NB md could be busy; this just starts the unlink */ + + for (;;) { + rc = LNetEQPoll(&the_lnet.ln_ping_target_eq, 1, + timeout_ms, &event, &which); + + /* I expect overflow... */ + LASSERT (rc >= 0 || rc == -EOVERFLOW); + + if (rc == 0) { + /* timed out: provide a diagnostic */ + CWARN("Still waiting for ping MD to unlink\n"); + timeout_ms *= 2; + continue; + } + + /* Got a valid event */ + if (event.unlinked) + break; + } + + rc = LNetEQFree(the_lnet.ln_ping_target_eq); + LASSERT (rc == 0); + + LIBCFS_FREE(the_lnet.ln_ping_info, + offsetof(lnet_ping_info_t, + pi_nid[the_lnet.ln_ping_info->pi_nnids])); + + cfs_restore_sigs(blocked); +} + +int +lnet_ping (lnet_process_id_t id, int timeout_ms, lnet_process_id_t *ids, int n_ids) +{ + lnet_handle_eq_t eqh; + lnet_handle_md_t mdh; + lnet_event_t event; + int which; + int unlinked = 0; + int replied = 0; + const int a_long_time = 60000; /* mS */ + int infosz = offsetof(lnet_ping_info_t, pi_nid[n_ids]); + lnet_ping_info_t *info; + lnet_process_id_t tmpid; + int i; + int nob; + int rc; + int rc2; + cfs_sigset_t blocked; + + if (n_ids <= 0 || + id.nid == LNET_NID_ANY || + timeout_ms > 500000 || /* arbitrary limit! */ + n_ids > 20) /* arbitrary limit! */ + return -EINVAL; + + if (id.pid == LNET_PID_ANY) + id.pid = LUSTRE_SRV_LNET_PID; + + LIBCFS_ALLOC(info, infosz); + if (info == NULL) + return -ENOMEM; + + /* NB 2 events max (including any unlink event) */ + rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh); + if (rc != 0) { + CERROR("Can't allocate EQ: %d\n", rc); + goto out_0; + } + + rc = LNetMDBind((lnet_md_t){.start = info, + .length = infosz, + .threshold = 2, /* GET/REPLY */ + .options = LNET_MD_TRUNCATE, + .eq_handle = eqh}, + LNET_UNLINK, + &mdh); + if (rc != 0) { + CERROR("Can't bind MD: %d\n", rc); + goto out_1; + } + + rc = LNetGet(LNET_NID_ANY, mdh, id, + LNET_RESERVED_PORTAL, + LNET_PROTO_PING_MATCHBITS, 0); + + if (rc != 0) { + /* Don't CERROR; this could be deliberate! */ + + rc2 = LNetMDUnlink(mdh); + LASSERT (rc2 == 0); - if (!ptl_init) - return PTL_NO_INIT; + /* NB must wait for the UNLINK event below... 
*/ + unlinked = 1; + timeout_ms = a_long_time; + } + + do { + /* MUST block for unlink to complete */ + if (unlinked) + blocked = cfs_block_allsigs(); + + rc2 = LNetEQPoll(&eqh, 1, timeout_ms, &event, &which); + + if (unlinked) + cfs_restore_sigs(blocked); - ptl_mutex_enter (); + CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2, + (rc2 <= 0) ? -1 : event.type, + (rc2 <= 0) ? -1 : event.status, + (rc2 > 0 && event.unlinked) ? " unlinked" : ""); + + LASSERT (rc2 != -EOVERFLOW); /* can't miss anything */ + + if (rc2 <= 0 || event.status != 0) { + /* timeout or error */ + if (!replied && rc == 0) + rc = (rc2 < 0) ? rc2 : + (rc2 == 0) ? -ETIMEDOUT : + event.status; + + if (!unlinked) { + /* Ensure completion in finite time... */ + LNetMDUnlink(mdh); + /* No assertion (racing with network) */ + unlinked = 1; + timeout_ms = a_long_time; + } else if (rc2 == 0) { + /* timed out waiting for unlink */ + CWARN("ping %s: late network completion\n", + libcfs_id2str(id)); + } - nal = ptl_hndl2nal (&ni); - if (nal == NULL) { - ptl_mutex_exit (); - return PTL_HANDLE_INVALID; + } else if (event.type == LNET_EVENT_REPLY) { + replied = 1; + rc = event.mlength; + } + + } while (rc2 <= 0 || !event.unlinked); + + if (!replied) { + if (rc >= 0) + CWARN("%s: Unexpected rc >= 0 but no reply!\n", + libcfs_id2str(id)); + rc = -EIO; + goto out_1; } - idx = ni.nal_idx & NI_HANDLE_MASK; + nob = rc; + LASSERT (nob >= 0 && nob <= infosz); - LASSERT(nal->nal_refct > 0); + rc = -EPROTO; /* if I can't parse... */ - nal->nal_refct--; + if (nob < 8) { + /* can't check magic/version */ + CERROR("%s: ping info too short %d\n", + libcfs_id2str(id), nob); + goto out_1; + } - /* nal_refct == 0 tells nal->shutdown to really shut down */ - nal->nal_ni_fini(nal); + if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { + /* NB I might be swabbing garbage until I check below, but it + * doesn't matter */ + __swab32s(&info->pi_version); + __swab32s(&info->pi_pid); + __swab32s(&info->pi_nnids); + for (i = 0; i < info->pi_nnids && i < n_ids; i++) + __swab64s(&info->pi_nid[i]); - ptl_mutex_exit (); - return PTL_OK; + } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) { + CERROR("%s: Unexpected magic %08x\n", + libcfs_id2str(id), info->pi_magic); + goto out_1; + } + + if (info->pi_version != LNET_PROTO_PING_VERSION) { + CERROR("%s: Unexpected version 0x%x\n", + libcfs_id2str(id), info->pi_version); + goto out_1; + } + + if (nob < offsetof(lnet_ping_info_t, pi_nid[0])) { + CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id), + nob, (int)offsetof(lnet_ping_info_t, pi_nid[0])); + goto out_1; + } + + if (info->pi_nnids < n_ids) + n_ids = info->pi_nnids; + + if (nob < offsetof(lnet_ping_info_t, pi_nid[n_ids])) { + CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id), + nob, (int)offsetof(lnet_ping_info_t, pi_nid[n_ids])); + goto out_1; + } + + rc = -EFAULT; /* If I SEGV... 
*/ + + for (i = 0; i < n_ids; i++) { + tmpid.pid = info->pi_pid; + tmpid.nid = info->pi_nid[i]; +#ifdef __KERNEL__ + if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid))) + goto out_1; +#else + ids[i] = tmpid; +#endif + } + rc = info->pi_nnids; + + out_1: + rc2 = LNetEQFree(eqh); + if (rc2 != 0) + CERROR("rc2 %d\n", rc2); + LASSERT (rc2 == 0); + + out_0: + LIBCFS_FREE(info, infosz); + return rc; } diff --git a/lnet/lnet/api-wrap.c b/lnet/lnet/api-wrap.c deleted file mode 100644 index 92f495e..0000000 --- a/lnet/lnet/api-wrap.c +++ /dev/null @@ -1,379 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * api/api-wrap.c - * User-level wrappers that dispatch across the protection boundaries - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.sf.net/projects/lustre/ - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#define DEBUG_SUBSYSTEM S_PORTALS -#include - -void PtlSnprintHandle(char *str, int len, ptl_handle_any_t h) -{ - snprintf(str, len, "0x%lx."LPX64, h.nal_idx, h.cookie); -} - -int PtlNIHandle(ptl_handle_any_t handle_in, ptl_handle_ni_t *ni_out) -{ - if (!ptl_init) - return PTL_NO_INIT; - - if (ptl_hndl2nal(&handle_in) == NULL) - return PTL_HANDLE_INVALID; - - *ni_out = handle_in; - return PTL_OK; -} - -int PtlGetId(ptl_handle_ni_t ni_handle, ptl_process_id_t *id) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&ni_handle); - if (nal == NULL) - return PTL_NI_INVALID; - - return nal->nal_get_id(nal, id); -} - -int PtlGetUid(ptl_handle_ni_t ni_handle, ptl_uid_t *uid) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&ni_handle); - if (nal == NULL) - return PTL_NI_INVALID; - - /* We don't support different uids yet */ - *uid = 0; - return PTL_OK; -} - -int PtlFailNid (ptl_handle_ni_t interface, ptl_nid_t nid, unsigned int threshold) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&interface); - if (nal == NULL) - return PTL_NI_INVALID; - - return nal->nal_fail_nid(nal, nid, threshold); -} - -int PtlLoopback (ptl_handle_ni_t interface, int set, int *enabled) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&interface); - if (nal == NULL) - return PTL_NI_INVALID; - - return nal->nal_loopback(nal, set, enabled); -} - -int PtlNIStatus(ptl_handle_ni_t interface_in, ptl_sr_index_t register_in, - ptl_sr_value_t *status_out) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&interface_in); - if (nal == NULL) - return PTL_NI_INVALID; - - return nal->nal_ni_status(nal, register_in, status_out); -} - -int PtlNIDist(ptl_handle_ni_t interface_in, ptl_process_id_t process_in, - unsigned long *distance_out) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&interface_in); - if (nal == NULL) - return PTL_NI_INVALID; - - 
return nal->nal_ni_dist(nal, &process_in, distance_out);
-}
-
-int PtlMEAttach(ptl_handle_ni_t interface_in, ptl_pt_index_t index_in,
-                ptl_process_id_t match_id_in, ptl_match_bits_t match_bits_in,
-                ptl_match_bits_t ignore_bits_in, ptl_unlink_t unlink_in,
-                ptl_ins_pos_t pos_in, ptl_handle_me_t *handle_out)
-{
-        nal_t *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&interface_in);
-        if (nal == NULL)
-                return PTL_NI_INVALID;
-
-        return nal->nal_me_attach(nal, index_in, match_id_in,
-                                  match_bits_in, ignore_bits_in,
-                                  unlink_in, pos_in, handle_out);
-}
-
-int PtlMEInsert(ptl_handle_me_t current_in, ptl_process_id_t match_id_in,
-                ptl_match_bits_t match_bits_in, ptl_match_bits_t ignore_bits_in,
-                ptl_unlink_t unlink_in, ptl_ins_pos_t position_in,
-                ptl_handle_me_t * handle_out)
-{
-        nal_t *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&current_in);
-        if (nal == NULL)
-                return PTL_ME_INVALID;
-
-        return nal->nal_me_insert(nal, &current_in, match_id_in,
-                                  match_bits_in, ignore_bits_in,
-                                  unlink_in, position_in, handle_out);
-}
-
-int PtlMEUnlink(ptl_handle_me_t current_in)
-{
-        nal_t *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&current_in);
-        if (nal == NULL)
-                return PTL_ME_INVALID;
-
-        return nal->nal_me_unlink(nal, &current_in);
-}
-
-int PtlMDAttach(ptl_handle_me_t me_in, ptl_md_t md_in,
-                ptl_unlink_t unlink_in, ptl_handle_md_t * handle_out)
-{
-        nal_t *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&me_in);
-        if (nal == NULL)
-                return PTL_ME_INVALID;
-
-        if (!PtlHandleIsEqual(md_in.eq_handle, PTL_EQ_NONE) &&
-            ptl_hndl2nal(&md_in.eq_handle) != nal)
-                return PTL_MD_ILLEGAL;
-
-        return (nal->nal_md_attach)(nal, &me_in, &md_in,
-                                    unlink_in, handle_out);
-}
-
-int PtlMDBind(ptl_handle_ni_t ni_in, ptl_md_t md_in,
-              ptl_unlink_t unlink_in, ptl_handle_md_t *handle_out)
-{
-        nal_t *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&ni_in);
-        if (nal == NULL)
-                return PTL_NI_INVALID;
-
-        if (!PtlHandleIsEqual(md_in.eq_handle, PTL_EQ_NONE) &&
-            ptl_hndl2nal(&md_in.eq_handle) != nal)
-                return PTL_MD_ILLEGAL;
-
-        return (nal->nal_md_bind)(nal, &md_in, unlink_in, handle_out);
-}
-
-int PtlMDUpdate(ptl_handle_md_t md_in, ptl_md_t *old_inout,
-                ptl_md_t *new_inout, ptl_handle_eq_t testq_in)
-{
-        nal_t *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&md_in);
-        if (nal == NULL)
-                return PTL_MD_INVALID;
-
-        if (!PtlHandleIsEqual(testq_in, PTL_EQ_NONE) &&
-            ptl_hndl2nal(&testq_in) != nal)
-                return PTL_EQ_INVALID;
-
-        return (nal->nal_md_update)(nal, &md_in,
-                                    old_inout, new_inout, &testq_in);
-}
-
-int PtlMDUnlink(ptl_handle_md_t md_in)
-{
-        nal_t *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&md_in);
-        if (nal == NULL)
-                return PTL_MD_INVALID;
-
-        return (nal->nal_md_unlink)(nal, &md_in);
-}
-
-int PtlEQAlloc(ptl_handle_ni_t interface, ptl_size_t count,
-               ptl_eq_handler_t callback,
-               ptl_handle_eq_t *handle_out)
-{
-        nal_t *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&interface);
-        if (nal == NULL)
-                return PTL_NI_INVALID;
-
-        return (nal->nal_eq_alloc)(nal, count, callback, handle_out);
-}
-
-int PtlEQFree(ptl_handle_eq_t eventq)
-{
-        nal_t *nal;
-
-        if (!ptl_init)
-                return PTL_NO_INIT;
-
-        nal = ptl_hndl2nal(&eventq);
-        if (nal == NULL)
-                return PTL_EQ_INVALID;
-
-        return (nal->nal_eq_free)(nal, &eventq);
-}
-
-int PtlEQGet(ptl_handle_eq_t eventq, ptl_event_t *ev)
-{
-        int which;
-
-        return (PtlEQPoll (&eventq, 1, 0, ev, &which));
-}
-
-int 
PtlEQWait(ptl_handle_eq_t eventq_in, ptl_event_t *event_out) -{ - int which; - - return (PtlEQPoll (&eventq_in, 1, PTL_TIME_FOREVER, - event_out, &which)); -} - -int PtlEQPoll(ptl_handle_eq_t *eventqs_in, int neq_in, int timeout, - ptl_event_t *event_out, int *which_out) -{ - int i; - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - if (neq_in < 1) - return PTL_EQ_INVALID; - - nal = ptl_hndl2nal(&eventqs_in[0]); - if (nal == NULL) - return PTL_EQ_INVALID; - - for (i = 1; i < neq_in; i++) - if (ptl_hndl2nal(&eventqs_in[i]) != nal) - return PTL_EQ_INVALID; - - return (nal->nal_eq_poll)(nal, eventqs_in, neq_in, timeout, - event_out, which_out); -} - - -int PtlACEntry(ptl_handle_ni_t ni_in, ptl_ac_index_t index_in, - ptl_process_id_t match_id_in, ptl_pt_index_t portal_in) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&ni_in); - if (nal == NULL) - return PTL_NI_INVALID; - - return (nal->nal_ace_entry)(nal, index_in, match_id_in, portal_in); -} - -int PtlPut(ptl_handle_md_t md_in, ptl_ack_req_t ack_req_in, - ptl_process_id_t target_in, ptl_pt_index_t portal_in, - ptl_ac_index_t ac_in, ptl_match_bits_t match_bits_in, - ptl_size_t offset_in, ptl_hdr_data_t hdr_data_in) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&md_in); - if (nal == NULL) - return PTL_MD_INVALID; - - return (nal->nal_put)(nal, &md_in, ack_req_in, - &target_in, portal_in, ac_in, - match_bits_in, offset_in, hdr_data_in); -} - -int PtlGet(ptl_handle_md_t md_in, ptl_process_id_t target_in, - ptl_pt_index_t portal_in, ptl_ac_index_t ac_in, - ptl_match_bits_t match_bits_in, ptl_size_t offset_in) -{ - nal_t *nal; - - if (!ptl_init) - return PTL_NO_INIT; - - nal = ptl_hndl2nal(&md_in); - if (nal == NULL) - return PTL_MD_INVALID; - - return (nal->nal_get)(nal, &md_in, - &target_in, portal_in, ac_in, - match_bits_in, offset_in); -} - diff --git a/lnet/lnet/autoMakefile.am b/lnet/lnet/autoMakefile.am index bd05e93..9ce40fe 100644 --- a/lnet/lnet/autoMakefile.am +++ b/lnet/lnet/autoMakefile.am @@ -1,32 +1,34 @@ -my_sources = api-errno.c api-ni.c api-wrap.c \ - lib-init.c lib-me.c lib-msg.c lib-eq.c \ - lib-md.c lib-move.c lib-ni.c lib-pid.c +my_sources = api-errno.c api-ni.c config.c \ + lib-me.c lib-msg.c lib-eq.c \ + lib-md.c lib-move.c lo.c \ + router.c router_proc.c \ + acceptor.c peer.c -if !CRAY_PORTALS if LIBLUSTRE -noinst_LIBRARIES= libportals.a -libportals_a_SOURCES= $(my_sources) -libportals_a_CPPFLAGS = $(LLCPPFLAGS) -libportals_a_CFLAGS = $(LLCFLAGS) +noinst_LIBRARIES= liblnet.a +liblnet_a_SOURCES= $(my_sources) +liblnet_a_CPPFLAGS = $(LLCPPFLAGS) +liblnet_a_CFLAGS = $(LLCFLAGS) endif if MODULES if LINUX -modulenet_DATA = portals$(KMODEXT) +modulenet_DATA = lnet$(KMODEXT) endif # LINUX if DARWIN -macos_PROGRAMS := portals +macos_PROGRAMS := lnet -portals_SOURCES := api-errno.c api-ni.c api-wrap.c -portals_SOURCES += lib-init.c lib-me.c lib-msg.c lib-eq.c lib-md.c -portals_SOURCES += lib-move.c lib-ni.c lib-pid.c module.c +lnet_SOURCES := api-errno.c api-ni.c config.c +lnet_SOURCES += lib-me.c lib-msg.c lib-eq.c lib-md.c +lnet_SOURCES += lib-move.c module.c lo.c router.c router_proc.c +lnet_SOURCES += acceptor.c peer.c -portals_CFLAGS := $(EXTRA_KCFLAGS) -portals_LDFLAGS := $(EXTRA_KLDFLAGS) -portals_LDADD := $(EXTRA_KLIBS) +lnet_CFLAGS := $(EXTRA_KCFLAGS) +lnet_LDFLAGS := $(EXTRA_KLDFLAGS) +lnet_LDADD := $(EXTRA_KLIBS) plist_DATA := Info.plist @@ -36,11 +38,9 @@ endif # DARWIN endif # MODULES -endif # CRAY_PORTALS - install-data-hook: 
$(install_data_hook) EXTRA_DIST := Info.plist -MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ portals -DIST_SOURCES = $(portals-objs:%.o=%.c) +MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ lnet +DIST_SOURCES = $(lnet-objs:%.o=%.c) diff --git a/lnet/lnet/config.c b/lnet/lnet/config.c new file mode 100644 index 0000000..cd5e211 --- /dev/null +++ b/lnet/lnet/config.c @@ -0,0 +1,1386 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2005 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.sf.net/projects/lustre/ + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + +typedef struct { /* tmp struct for parsing routes */ + struct list_head ltb_list; /* stash on lists */ + int ltb_size; /* allocated size */ + char ltb_text[0]; /* text buffer */ +} lnet_text_buf_t; + +static int lnet_tbnob = 0; /* track text buf allocation */ +#define LNET_MAX_TEXTBUF_NOB (64<<10) /* bound allocation */ +#define LNET_SINGLE_TEXTBUF_NOB (4<<10) + +typedef struct { + struct list_head lre_list; /* stash in a list */ + int lre_min; /* min value */ + int lre_max; /* max value */ + int lre_stride; /* stride */ +} lnet_range_expr_t; + +static int lnet_re_alloc = 0; /* track expr allocation */ + +void +lnet_syntax(char *name, char *str, int offset, int width) +{ + static char dots[LNET_SINGLE_TEXTBUF_NOB]; + static char dashes[LNET_SINGLE_TEXTBUF_NOB]; + + memset(dots, '.', sizeof(dots)); + dots[sizeof(dots)-1] = 0; + memset(dashes, '-', sizeof(dashes)); + dashes[sizeof(dashes)-1] = 0; + + LCONSOLE_ERROR("Error parsing '%s=\"%s\"'\n", name, str); + LCONSOLE_ERROR("here...........%.*s..%.*s|%.*s|\n", + (int)strlen(name), dots, offset, dots, + (width < 1) ? 
0 : width - 1, dashes); +} + +int +lnet_issep (char c) +{ + switch (c) { + case '\n': + case '\r': + case ';': + return 1; + default: + return 0; + } +} + +int +lnet_iswhite (char c) +{ + switch (c) { + case ' ': + case '\t': + case '\n': + case '\r': + return 1; + default: + return 0; + } +} + +char * +lnet_trimwhite(char *str) +{ + char *end; + + while (lnet_iswhite(*str)) + str++; + + end = str + strlen(str); + while (end > str) { + if (!lnet_iswhite(end[-1])) + break; + end--; + } + + *end = 0; + return str; +} + +int +lnet_net_unique(__u32 net, struct list_head *nilist) +{ + struct list_head *tmp; + lnet_ni_t *ni; + + list_for_each (tmp, nilist) { + ni = list_entry(tmp, lnet_ni_t, ni_list); + + if (LNET_NIDNET(ni->ni_nid) == net) + return 0; + } + + return 1; +} + +lnet_ni_t * +lnet_new_ni(__u32 net, struct list_head *nilist) +{ + lnet_ni_t *ni; + + if (!lnet_net_unique(net, nilist)) { + LCONSOLE_ERROR("Duplicate network specified: %s\n", + libcfs_net2str(net)); + return NULL; + } + + LIBCFS_ALLOC(ni, sizeof(*ni)); + if (ni == NULL) { + CERROR("Out of memory creating network %s\n", + libcfs_net2str(net)); + return NULL; + } + + /* zero counters/flags, NULL pointers... */ + memset(ni, 0, sizeof(*ni)); + + /* LND will fill in the address part of the NID */ + ni->ni_nid = LNET_MKNID(net, 0); + CFS_INIT_LIST_HEAD(&ni->ni_txq); + + list_add_tail(&ni->ni_list, nilist); + return ni; +} + +int +lnet_parse_networks(struct list_head *nilist, char *networks) +{ + int tokensize = strlen(networks) + 1; + char *tokens; + char *str; + lnet_ni_t *ni; + __u32 net; + int nnets = 0; + + if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) { + /* _WAY_ conservative */ + LCONSOLE_ERROR("Can't parse networks: string too long\n"); + return -EINVAL; + } + + LIBCFS_ALLOC(tokens, tokensize); + if (tokens == NULL) { + CERROR("Can't allocate net tokens\n"); + return -ENOMEM; + } + + the_lnet.ln_network_tokens = tokens; + the_lnet.ln_network_tokens_nob = tokensize; + memcpy (tokens, networks, tokensize); + str = tokens; + + /* Add in the loopback network */ + ni = lnet_new_ni(LNET_MKNET(LOLND, 0), nilist); + if (ni == NULL) + goto failed; + + while (str != NULL && *str != 0) { + char *comma = strchr(str, ','); + char *bracket = strchr(str, '('); + int niface; + char *iface; + + /* NB we don't check interface conflicts here; it's the LNDs + * responsibility (if it cares at all) */ + + if (bracket == NULL || + (comma != NULL && comma < bracket)) { + + /* no interface list specified */ + + if (comma != NULL) + *comma++ = 0; + net = libcfs_str2net(lnet_trimwhite(str)); + + if (net == LNET_NIDNET(LNET_NID_ANY)) { + lnet_syntax("networks", networks, + str - tokens, strlen(str)); + LCONSOLE_ERROR("Unrecognised network type\n"); + goto failed; + } + + if (LNET_NETTYP(net) != LOLND && /* loopback is implicit */ + lnet_new_ni(net, nilist) == NULL) + goto failed; + + str = comma; + continue; + } + + *bracket = 0; + net = libcfs_str2net(lnet_trimwhite(str)); + if (net == LNET_NIDNET(LNET_NID_ANY)) { + lnet_syntax("networks", networks, + str - tokens, strlen(str)); + goto failed; + } + + if (nnets > 0 && + the_lnet.ln_ptlcompat > 0) { + LCONSOLE_ERROR("Only 1 network supported when " + "'portals_compatible' is set\n"); + goto failed; + } + + nnets++; + ni = lnet_new_ni(net, nilist); + if (ni == NULL) + goto failed; + + niface = 0; + iface = bracket + 1; + + bracket = strchr(iface, ')'); + if (bracket == NULL) { + lnet_syntax("networks", networks, + iface - tokens, strlen(iface)); + goto failed; + } + + *bracket = 0; + do { + comma 
= strchr(iface, ',');
+                        if (comma != NULL)
+                                *comma++ = 0;
+
+                        iface = lnet_trimwhite(iface);
+                        if (*iface == 0) {
+                                lnet_syntax("networks", networks,
+                                            iface - tokens, strlen(iface));
+                                goto failed;
+                        }
+
+                        if (niface == LNET_MAX_INTERFACES) {
+                                LCONSOLE_ERROR("Too many interfaces for net %s\n",
+                                               libcfs_net2str(net));
+                                goto failed;
+                        }
+
+                        ni->ni_interfaces[niface++] = iface;
+                        iface = comma;
+                } while (iface != NULL);
+
+                str = bracket + 1;
+                comma = strchr(bracket + 1, ',');
+                if (comma != NULL) {
+                        *comma = 0;
+                        str = lnet_trimwhite(str);
+                        if (*str != 0) {
+                                lnet_syntax("networks", networks,
+                                            str - tokens, strlen(str));
+                                goto failed;
+                        }
+                        str = comma + 1;
+                        continue;
+                }
+
+                str = lnet_trimwhite(str);
+                if (*str != 0) {
+                        lnet_syntax("networks", networks,
+                                    str - tokens, strlen(str));
+                        goto failed;
+                }
+        }
+
+        LASSERT (!list_empty(nilist));
+        return 0;
+
+ failed:
+        while (!list_empty(nilist)) {
+                ni = list_entry(nilist->next, lnet_ni_t, ni_list);
+
+                list_del(&ni->ni_list);
+                LIBCFS_FREE(ni, sizeof(*ni));
+        }
+        LIBCFS_FREE(tokens, tokensize);
+        the_lnet.ln_network_tokens = NULL;
+
+        return -EINVAL;
+}
+
+lnet_text_buf_t *
+lnet_new_text_buf (int str_len)
+{
+        lnet_text_buf_t *ltb;
+        int nob;
+
+        /* NB allocate space for the terminating 0 */
+        nob = offsetof(lnet_text_buf_t, ltb_text[str_len + 1]);
+        if (nob > LNET_SINGLE_TEXTBUF_NOB) {
+                /* _way_ conservative for "route net gateway..." */
+                CERROR("text buffer too big\n");
+                return NULL;
+        }
+
+        if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) {
+                CERROR("Too many text buffers\n");
+                return NULL;
+        }
+
+        LIBCFS_ALLOC(ltb, nob);
+        if (ltb == NULL)
+                return NULL;
+
+        ltb->ltb_size = nob;
+        ltb->ltb_text[0] = 0;
+        lnet_tbnob += nob;
+        return ltb;
+}
+
+void
+lnet_free_text_buf (lnet_text_buf_t *ltb)
+{
+        lnet_tbnob -= ltb->ltb_size;
+        LIBCFS_FREE(ltb, ltb->ltb_size);
+}
+
+void
+lnet_free_text_bufs(struct list_head *tbs)
+{
+        lnet_text_buf_t *ltb;
+
+        while (!list_empty(tbs)) {
+                ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list);
+
+                list_del(&ltb->ltb_list);
+                lnet_free_text_buf(ltb);
+        }
+}
+
+void
+lnet_print_text_bufs(struct list_head *tbs)
+{
+        struct list_head *tmp;
+        lnet_text_buf_t *ltb;
+
+        list_for_each (tmp, tbs) {
+                ltb = list_entry(tmp, lnet_text_buf_t, ltb_list);
+
+                CDEBUG(D_WARNING, "%s\n", ltb->ltb_text);
+        }
+
+        CDEBUG(D_WARNING, "%d allocated\n", lnet_tbnob);
+}
+
+int
+lnet_str2tbs_sep (struct list_head *tbs, char *str)
+{
+        struct list_head pending;
+        char *sep;
+        int nob;
+        int i;
+        lnet_text_buf_t *ltb;
+
+        INIT_LIST_HEAD(&pending);
+
+        /* Split 'str' into separate commands */
+        for (;;) {
+                /* skip leading whitespace */
+                while (lnet_iswhite(*str))
+                        str++;
+
+                /* scan for separator or comment */
+                for (sep = str; *sep != 0; sep++)
+                        if (lnet_issep(*sep) || *sep == '#')
+                                break;
+
+                nob = sep - str;
+                if (nob > 0) {
+                        ltb = lnet_new_text_buf(nob);
+                        if (ltb == NULL) {
+                                lnet_free_text_bufs(&pending);
+                                return -1;
+                        }
+
+                        for (i = 0; i < nob; i++)
+                                if (lnet_iswhite(str[i]))
+                                        ltb->ltb_text[i] = ' ';
+                                else
+                                        ltb->ltb_text[i] = str[i];
+
+                        ltb->ltb_text[nob] = 0;
+
+                        list_add_tail(&ltb->ltb_list, &pending);
+                }
+
+                if (*sep == '#') {
+                        /* scan for separator */
+                        do {
+                                sep++;
+                        } while (*sep != 0 && !lnet_issep(*sep));
+                }
+
+                if (*sep == 0)
+                        break;
+
+                str = sep + 1;
+        }
+
+        list_splice(&pending, tbs->prev);
+        return 0;
+}
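/*
 * [Worked example, not part of the patch.]  lnet_str2tbs_sep() above splits
 * an option string into one text buffer per command: '\n', '\r' and ';'
 * separate commands, '#' discards text up to the next separator, and
 * whitespace inside a command is normalised to plain spaces.  So an
 * (illustrative) setting such as
 *
 *      routes="o2ib 10.10.0.[2-3]@tcp; # two gateways
 *              tcp 192.168.0.1@o2ib"
 *
 * produces two buffers, "o2ib 10.10.0.[2-3]@tcp" and
 * "tcp 192.168.0.1@o2ib", which lnet_parse_route() then expands and parses
 * one at a time.
 */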
+
+int
+lnet_expand1tb (struct list_head *list,
+                char *str, char *sep1, char *sep2,
+                char *item, int itemlen)
+{
+        int len1 = sep1 - str;
+        int len2 = strlen(sep2 + 1);
+        lnet_text_buf_t *ltb;
+
+        LASSERT (*sep1 == '[');
+        LASSERT (*sep2 == ']');
+
+        ltb = lnet_new_text_buf(len1 + itemlen + len2);
+        if (ltb == NULL)
+                return -ENOMEM;
+
+        memcpy(ltb->ltb_text, str, len1);
+        memcpy(&ltb->ltb_text[len1], item, itemlen);
+        memcpy(&ltb->ltb_text[len1+itemlen], sep2 + 1, len2);
+        ltb->ltb_text[len1 + itemlen + len2] = 0;
+
+        list_add_tail(&ltb->ltb_list, list);
+        return 0;
+}
+
+int
+lnet_str2tbs_expand (struct list_head *tbs, char *str)
+{
+        char num[16];
+        struct list_head pending;
+        char *sep;
+        char *sep2;
+        char *parsed;
+        char *enditem;
+        int lo;
+        int hi;
+        int stride;
+        int i;
+        int nob;
+        int scanned;
+
+        INIT_LIST_HEAD(&pending);
+
+        sep = strchr(str, '[');
+        if (sep == NULL)                        /* nothing to expand */
+                return 0;
+
+        sep2 = strchr(sep, ']');
+        if (sep2 == NULL)
+                goto failed;
+
+        for (parsed = sep; parsed < sep2; parsed = enditem) {
+
+                enditem = ++parsed;
+                while (enditem < sep2 && *enditem != ',')
+                        enditem++;
+
+                if (enditem == parsed)          /* no empty items */
+                        goto failed;
+
+                if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, &stride, &scanned) < 3) {
+
+                        if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) {
+
+                                /* simple string enumeration */
+                                if (lnet_expand1tb(&pending, str, sep, sep2,
+                                                   parsed, enditem - parsed) != 0)
+                                        goto failed;
+
+                                continue;
+                        }
+
+                        stride = 1;
+                }
+
+                /* range expansion */
+
+                if (enditem != parsed + scanned) /* no trailing junk */
+                        goto failed;
+
+                if (hi < 0 || lo < 0 || stride <= 0 || hi < lo ||
+                    (hi - lo) % stride != 0)
+                        goto failed;
+
+                for (i = lo; i <= hi; i += stride) {
+
+                        snprintf(num, sizeof(num), "%d", i);
+                        nob = strlen(num);
+                        if (nob + 1 == sizeof(num))
+                                goto failed;
+
+                        if (lnet_expand1tb(&pending, str, sep, sep2,
+                                           num, nob) != 0)
+                                goto failed;
+                }
+        }
+
+        list_splice(&pending, tbs->prev);
+        return 1;
+
+ failed:
+        lnet_free_text_bufs(&pending);
+        return -1;
+}
+
+int
+lnet_parse_hops (char *str, unsigned int *hops)
+{
+        int len = strlen(str);
+        int nob = len;
+
+        return (sscanf(str, "%u%n", hops, &nob) >= 1 &&
+                nob == len &&
+                *hops > 0 && *hops < 256);
+}
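/*
 * [Example sketch, not part of the patch.]  lnet_str2tbs_expand() above
 * gives bracket expressions the grammar "[item,item,...]", where an item is
 * either a literal string or "lo-hi" / "lo-hi/stride", so an (illustrative)
 * "tcp[0-6/2]" names tcp0, tcp2, tcp4 and tcp6.  A standalone loop applying
 * the same range rules:
 */
#include <stdio.h>

static void
expand_range(const char *prefix, int lo, int hi, int stride,
             const char *suffix)
{
        int i;

        /* same sanity rules as above: non-negative bounds, positive
         * stride, and the stride must land exactly on 'hi' */
        if (lo < 0 || hi < lo || stride <= 0 || (hi - lo) % stride != 0)
                return;

        for (i = lo; i <= hi; i += stride)
                printf("%s%d%s\n", prefix, i, suffix);
}

/* expand_range("tcp", 0, 6, 2, "") prints tcp0 tcp2 tcp4 tcp6. */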
+ + +int +lnet_parse_route (char *str, int *im_a_router) +{ + /* static scratch buffer OK (single threaded) */ + static char cmd[LNET_SINGLE_TEXTBUF_NOB]; + + struct list_head nets; + struct list_head gateways; + struct list_head *tmp1; + struct list_head *tmp2; + __u32 net; + lnet_nid_t nid; + lnet_text_buf_t *ltb; + int rc; + char *sep; + char *token = str; + int ntokens = 0; + int myrc = -1; + unsigned int hops; + int got_hops = 0; + + CFS_INIT_LIST_HEAD(&gateways); + CFS_INIT_LIST_HEAD(&nets); + + /* save a copy of the string for error messages */ + strncpy(cmd, str, sizeof(cmd) - 1); + cmd[sizeof(cmd) - 1] = 0; + + sep = str; + for (;;) { + /* scan for token start */ + while (lnet_iswhite(*sep)) + sep++; + if (*sep == 0) { + if (ntokens < (got_hops ? 3 : 2)) + goto token_error; + break; + } + + ntokens++; + token = sep++; + + /* scan for token end */ + while (*sep != 0 && !lnet_iswhite(*sep)) + sep++; + if (*sep != 0) + *sep++ = 0; + + if (ntokens == 1) { + tmp2 = &nets; /* expanding nets */ + } else if (ntokens == 2 && + lnet_parse_hops(token, &hops)) { + got_hops = 1; /* got a hop count */ + continue; + } else { + tmp2 = &gateways; /* expanding gateways */ + } + + ltb = lnet_new_text_buf(strlen(token)); + if (ltb == NULL) + goto out; + + strcpy(ltb->ltb_text, token); + tmp1 = &ltb->ltb_list; + list_add_tail(tmp1, tmp2); + + while (tmp1 != tmp2) { + ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list); + + rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text); + if (rc < 0) + goto token_error; + + tmp1 = tmp1->next; + + if (rc > 0) { /* expanded! */ + list_del(&ltb->ltb_list); + lnet_free_text_buf(ltb); + continue; + } + + if (ntokens == 1) { + net = libcfs_str2net(ltb->ltb_text); + if (net == LNET_NIDNET(LNET_NID_ANY) || + LNET_NETTYP(net) == LOLND) + goto token_error; + } else { + nid = libcfs_str2nid(ltb->ltb_text); + if (nid == LNET_NID_ANY || + LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) + goto token_error; + } + } + } + + if (!got_hops) + hops = 1; + + LASSERT (!list_empty(&nets)); + LASSERT (!list_empty(&gateways)); + + list_for_each (tmp1, &nets) { + ltb = list_entry(tmp1, lnet_text_buf_t, ltb_list); + net = libcfs_str2net(ltb->ltb_text); + LASSERT (net != LNET_NIDNET(LNET_NID_ANY)); + + list_for_each (tmp2, &gateways) { + ltb = list_entry(tmp2, lnet_text_buf_t, ltb_list); + nid = libcfs_str2nid(ltb->ltb_text); + LASSERT (nid != LNET_NID_ANY); + + if (lnet_islocalnid(nid)) { + *im_a_router = 1; + continue; + } + + rc = lnet_add_route (net, hops, nid); + if (rc != 0) { + CERROR("Can't create route " + "to %s via %s\n", + libcfs_net2str(net), + libcfs_nid2str(nid)); + goto out; + } + } + } + + myrc = 0; + goto out; + + token_error: + lnet_syntax("routes", cmd, token - str, strlen(token)); + out: + lnet_free_text_bufs(&nets); + lnet_free_text_bufs(&gateways); + return myrc; +} + +int +lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router) +{ + lnet_text_buf_t *ltb; + + while (!list_empty(tbs)) { + ltb = list_entry(tbs->next, lnet_text_buf_t, ltb_list); + + if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) { + lnet_free_text_bufs(tbs); + return -EINVAL; + } + + list_del(&ltb->ltb_list); + lnet_free_text_buf(ltb); + } + + return 0; +} + +int +lnet_parse_routes (char *routes, int *im_a_router) +{ + struct list_head tbs; + int rc = 0; + + *im_a_router = 0; + + if (the_lnet.ln_ptlcompat > 0 && + routes[0] != 0) { + /* Can't route when running in compatibility mode */ + LCONSOLE_ERROR("Route tables are not supported when " + "'portals_compatible' is set\n"); + return -EINVAL; + } + + CFS_INIT_LIST_HEAD(&tbs); + + if (lnet_str2tbs_sep(&tbs, routes) < 0) { + CERROR("Error parsing routes\n"); + rc = -EINVAL; + } else { + rc = lnet_parse_route_tbs(&tbs, im_a_router); + } + + LASSERT (lnet_tbnob == 0); + return rc; +} + +void +lnet_print_range_exprs(struct list_head *exprs) +{ + struct list_head *e; + lnet_range_expr_t *lre; + + list_for_each(e, exprs) { + lre = list_entry(e, lnet_range_expr_t, lre_list); + + CDEBUG(D_WARNING, "%d-%d/%d\n", + lre->lre_min, lre->lre_max, lre->lre_stride); + } + + CDEBUG(D_WARNING, "%d allocated\n", lnet_re_alloc); +} + +int +lnet_new_range_expr(struct list_head *exprs, int min, int max, int stride) +{ + lnet_range_expr_t *lre; + + CDEBUG(D_NET, "%d-%d/%d\n", min, max, stride); + + if (min < 0 || min > 
255 || min > max || stride < 0) + return -EINVAL; + + LIBCFS_ALLOC(lre, sizeof(*lre)); + if (lre == NULL) + return -ENOMEM; + + lnet_re_alloc++; + + lre->lre_min = min; + lre->lre_max = max; + lre->lre_stride = stride; + + list_add(&lre->lre_list, exprs); + return 0; +} + +void +lnet_destroy_range_exprs(struct list_head *exprs) +{ + lnet_range_expr_t *lre; + + while (!list_empty(exprs)) { + lre = list_entry(exprs->next, lnet_range_expr_t, lre_list); + + list_del(&lre->lre_list); + LIBCFS_FREE(lre, sizeof(*lre)); + lnet_re_alloc--; + } +} + +int +lnet_parse_range_expr(struct list_head *exprs, char *str) +{ + int nob = strlen(str); + char *sep; + int n; + int x; + int y; + int z; + int rc; + + if (nob == 0) + return -EINVAL; + + if (!strcmp(str, "*")) /* match all */ + return lnet_new_range_expr(exprs, 0, 255, 1); + + n = nob; + if (sscanf(str, "%u%n", &x, &n) >= 1 && n == nob) { + /* simple number */ + return lnet_new_range_expr(exprs, x, x, 1); + } + + /* Has to be an expansion */ + if (!(str[0] == '[' && nob > 2 && str[nob-1] == ']')) + return -EINVAL; + + nob -= 2; + str++; + str[nob] = 0; + + do { + /* Comma separated list of expressions... */ + sep = strchr(str, ','); + if (sep != NULL) + *sep++ = 0; + + nob = strlen(str); + n = nob; + if (sscanf(str, "%u%n", &x, &n) >= 1 && n == nob) { + /* simple number */ + rc = lnet_new_range_expr(exprs, x, x, 1); + if (rc != 0) + return rc; + + continue; + } + + n = nob; + if (sscanf(str, "%u-%u%n", &x, &y, &n) >= 2 && n == nob) { + /* simple range */ + rc = lnet_new_range_expr(exprs, x, y, 1); + if (rc != 0) + return rc; + continue; + } + + n = nob; + if (sscanf(str, "%u-%u/%u%n", &x, &y, &z, &n) >= 3 && n == nob) { + /* strided range */ + rc = lnet_new_range_expr(exprs, x, y, z); + if (rc != 0) + return rc; + continue; + } + + return -EINVAL; + + } while ((str = sep) != NULL); + + return 0; +} + +int +lnet_match_network_token(char *token, __u32 *ipaddrs, int nip) +{ + struct list_head exprs[4]; + struct list_head *e; + lnet_range_expr_t *re; + char *str; + int i; + int j; + __u32 ip; + int n; + int match; + int rc; + + for (i = 0; i < 4; i++) + CFS_INIT_LIST_HEAD(&exprs[i]); + + for (i = 0; i < 4; i++) { + str = token; + if (i != 3) { + token = strchr(token, '.'); + if (token == NULL) { + rc = -EINVAL; + goto out; + } + *token++ = 0; + } + + rc = lnet_parse_range_expr(&exprs[i], str); + if (rc != 0) { + LASSERT (rc < 0); + goto out; + } + } + + for (match = i = 0; !match && i < nip; i++) { + ip = ipaddrs[i]; + + for (match = 1, j = 0; match && j < 4; j++) { + n = (ip >> (8 * (3 - j))) & 0xff; + match = 0; + + list_for_each(e, &exprs[j]) { + re = list_entry(e, lnet_range_expr_t, lre_list); + + if (re->lre_min <= n && + re->lre_max >= n && + (n - re->lre_min) % re->lre_stride == 0) { + match = 1; + break; + } + } + } + } + + rc = match ? 
1 : 0; + + out: + for (i = 0; i < 4; i++) + lnet_destroy_range_exprs(&exprs[i]); + LASSERT (lnet_re_alloc == 0); + + return rc; +} + +int +lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip) +{ + static char tokens[LNET_SINGLE_TEXTBUF_NOB]; + + int matched = 0; + int ntokens = 0; + int len; + char *net = NULL; + char *sep; + char *token; + int rc; + + LASSERT (strlen(net_entry) < sizeof(tokens)); + + /* work on a copy of the string */ + strcpy(tokens, net_entry); + sep = tokens; + for (;;) { + /* scan for token start */ + while (lnet_iswhite(*sep)) + sep++; + if (*sep == 0) + break; + + token = sep++; + + /* scan for token end */ + while (*sep != 0 && !lnet_iswhite(*sep)) + sep++; + if (*sep != 0) + *sep++ = 0; + + if (ntokens++ == 0) { + net = token; + continue; + } + + len = strlen(token); + + rc = lnet_match_network_token(token, ipaddrs, nip); + if (rc < 0) { + lnet_syntax("ip2nets", net_entry, + token - tokens, len); + return rc; + } + + matched |= (rc != 0); + } + + if (!matched) + return 0; + + strcpy(net_entry, net); /* replace with matched net */ + return 1; +} + +__u32 +lnet_netspec2net(char *netspec) +{ + char *bracket = strchr(netspec, '('); + __u32 net; + + if (bracket != NULL) + *bracket = 0; + + net = libcfs_str2net(netspec); + + if (bracket != NULL) + *bracket = '('; + + return net; +} + +int +lnet_splitnets(char *source, struct list_head *nets) +{ + int offset = 0; + int offset2; + int len; + lnet_text_buf_t *tb; + lnet_text_buf_t *tb2; + struct list_head *t; + char *sep; + char *bracket; + __u32 net; + + LASSERT (!list_empty(nets)); + LASSERT (nets->next == nets->prev); /* single entry */ + + tb = list_entry(nets->next, lnet_text_buf_t, ltb_list); + + for (;;) { + sep = strchr(tb->ltb_text, ','); + bracket = strchr(tb->ltb_text, '('); + + if (sep != NULL && + bracket != NULL && + bracket < sep) { + /* netspec lists interfaces... */ + + offset2 = offset + (bracket - tb->ltb_text); + len = strlen(bracket); + + bracket = strchr(bracket + 1, ')'); + + if (bracket == NULL || + !(bracket[1] == ',' || bracket[1] == 0)) { + lnet_syntax("ip2nets", source, offset2, len); + return -EINVAL; + } + + sep = (bracket[1] == 0) ? 
NULL : bracket + 1; + } + + if (sep != NULL) + *sep++ = 0; + + net = lnet_netspec2net(tb->ltb_text); + if (net == LNET_NIDNET(LNET_NID_ANY)) { + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + + list_for_each(t, nets) { + tb2 = list_entry(t, lnet_text_buf_t, ltb_list); + + if (tb2 == tb) + continue; + + if (net == lnet_netspec2net(tb2->ltb_text)) { + /* duplicate network */ + lnet_syntax("ip2nets", source, offset, + strlen(tb->ltb_text)); + return -EINVAL; + } + } + + if (sep == NULL) + return 0; + + offset += sep - tb->ltb_text; + tb2 = lnet_new_text_buf(strlen(sep)); + if (tb2 == NULL) + return -ENOMEM; + + strcpy(tb2->ltb_text, sep); + list_add_tail(&tb2->ltb_list, nets); + + tb = tb2; + } +}
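lnet_match_networks() below is the consumer of all this machinery: lnet_match_network_tokens() shrinks each ip2nets entry to its net spec when every IP pattern matches a local address, and lnet_splitnets() above separates multi-net entries. The octet test underneath (built by lnet_parse_range_expr() and applied per dotted-quad position in lnet_match_network_token() earlier) amounts to the following sketch, which assumes a single (min, max, stride) expression per octet where the kernel code keeps a list:

#include <stdio.h>

struct range_expr { int min, max, stride; };

static int octet_matches(const struct range_expr *re, int n)
{
        return re->min <= n && n <= re->max &&
               (n - re->min) % re->stride == 0;
}

static int ip_matches(const struct range_expr expr[4], unsigned int ip)
{
        int i;

        for (i = 0; i < 4; i++) {
                int n = (ip >> (8 * (3 - i))) & 0xff;  /* octet i, MSB first */

                if (!octet_matches(&expr[i], n))
                        return 0;
        }
        return 1;
}

int main(void)
{
        /* pattern "192.168.[0-254/2].*" */
        struct range_expr pat[4] = {
                {192, 192, 1}, {168, 168, 1}, {0, 254, 2}, {0, 255, 1}
        };
        unsigned int ip = (192u << 24) | (168u << 16) | (4u << 8) | 7u;

        printf("%s\n", ip_matches(pat, ip) ? "match" : "no match");
        return 0;
}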
"" : ",", + tb->ltb_text); + + if (len >= sizeof(networks)) { + CERROR("Too many matched networks\n"); + rc = -E2BIG; + goto out; + } + } + + count++; + } + + out: + lnet_free_text_bufs(&raw_entries); + lnet_free_text_bufs(&matched_nets); + lnet_free_text_bufs(¤t_nets); + LASSERT (lnet_tbnob == 0); + + if (rc < 0) + return rc; + + *networksp = networks; + return count; +} + +#ifdef __KERNEL__ +void +lnet_ipaddr_free_enumeration(__u32 *ipaddrs, int nip) +{ + LIBCFS_FREE(ipaddrs, nip * sizeof(*ipaddrs)); +} + +int +lnet_ipaddr_enumerate (__u32 **ipaddrsp) +{ + int up; + __u32 netmask; + __u32 *ipaddrs; + __u32 *ipaddrs2; + int nip; + char **ifnames; + int nif = libcfs_ipif_enumerate(&ifnames); + int i; + int rc; + + if (nif <= 0) + return nif; + + LIBCFS_ALLOC(ipaddrs, nif * sizeof(*ipaddrs)); + if (ipaddrs == NULL) { + CERROR("Can't allocate ipaddrs[%d]\n", nif); + libcfs_ipif_free_enumeration(ifnames, nif); + return -ENOMEM; + } + + for (i = nip = 0; i < nif; i++) { + if (!strcmp(ifnames[i], "lo")) + continue; + + rc = libcfs_ipif_query(ifnames[i], &up, + &ipaddrs[nip], &netmask); + if (rc != 0) { + CWARN("Can't query interface %s: %d\n", + ifnames[i], rc); + continue; + } + + if (!up) { + CWARN("Ignoring interface %s: it's down\n", + ifnames[i]); + continue; + } + + nip++; + } + + libcfs_ipif_free_enumeration(ifnames, nif); + + if (nip == nif) { + *ipaddrsp = ipaddrs; + } else { + if (nip > 0) { + LIBCFS_ALLOC(ipaddrs2, nip * sizeof(*ipaddrs2)); + if (ipaddrs2 == NULL) { + CERROR("Can't allocate ipaddrs[%d]\n", nip); + nip = -ENOMEM; + } else { + memcpy(ipaddrs2, ipaddrs, + nip * sizeof(*ipaddrs)); + *ipaddrsp = ipaddrs2; + rc = nip; + } + } + lnet_ipaddr_free_enumeration(ipaddrs, nif); + } + return nip; +} + +int +lnet_parse_ip2nets (char **networksp, char *ip2nets) +{ + __u32 *ipaddrs; + int nip = lnet_ipaddr_enumerate(&ipaddrs); + int rc; + + if (nip < 0) { + LCONSOLE_ERROR("Error %d enumerating local IP interfaces " + "for ip2nets to match\n", nip); + return nip; + } + + if (nip == 0) { + LCONSOLE_ERROR("No local IP interfaces " + "for ip2nets to match\n"); + return -ENOENT; + } + + rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip); + lnet_ipaddr_free_enumeration(ipaddrs, nip); + + if (rc < 0) { + LCONSOLE_ERROR("Error %d parsing ip2nets\n", rc); + return rc; + } + + if (rc == 0) { + LCONSOLE_ERROR("ip2nets does not match " + "any local IP interfaces\n"); + return -ENOENT; + } + + return 0; +} + +int +lnet_set_ip_niaddr (lnet_ni_t *ni) +{ + __u32 net = LNET_NIDNET(ni->ni_nid); + char **names; + int n; + __u32 ip; + __u32 netmask; + int up; + int i; + int rc; + + /* Convenience for LNDs that use the IP address of a local interface as + * the local address part of their NID */ + + if (ni->ni_interfaces[0] != NULL) { + + CLASSERT (LNET_MAX_INTERFACES > 1); + + if (ni->ni_interfaces[1] != NULL) { + CERROR("Net %s doesn't support multiple interfaces\n", + libcfs_net2str(net)); + return -EPERM; + } + + rc = libcfs_ipif_query(ni->ni_interfaces[0], + &up, &ip, &netmask); + if (rc != 0) { + CERROR("Net %s can't query interface %s: %d\n", + libcfs_net2str(net), ni->ni_interfaces[0], rc); + return -EPERM; + } + + if (!up) { + CERROR("Net %s can't use interface %s: it's down\n", + libcfs_net2str(net), ni->ni_interfaces[0]); + return -ENETDOWN; + } + + ni->ni_nid = LNET_MKNID(net, ip); + return 0; + } + + n = libcfs_ipif_enumerate(&names); + if (n <= 0) { + CERROR("Net %s can't enumerate interfaces: %d\n", + libcfs_net2str(net), n); + return 0; + } + + for (i = 0; i < n; i++) { + if 
+ +int +lnet_set_ip_niaddr (lnet_ni_t *ni) +{ + __u32 net = LNET_NIDNET(ni->ni_nid); + char **names; + int n; + __u32 ip; + __u32 netmask; + int up; + int i; + int rc; + + /* Convenience for LNDs that use the IP address of a local interface as + * the local address part of their NID */ + + if (ni->ni_interfaces[0] != NULL) { + + CLASSERT (LNET_MAX_INTERFACES > 1); + + if (ni->ni_interfaces[1] != NULL) { + CERROR("Net %s doesn't support multiple interfaces\n", + libcfs_net2str(net)); + return -EPERM; + } + + rc = libcfs_ipif_query(ni->ni_interfaces[0], + &up, &ip, &netmask); + if (rc != 0) { + CERROR("Net %s can't query interface %s: %d\n", + libcfs_net2str(net), ni->ni_interfaces[0], rc); + return -EPERM; + } + + if (!up) { + CERROR("Net %s can't use interface %s: it's down\n", + libcfs_net2str(net), ni->ni_interfaces[0]); + return -ENETDOWN; + } + + ni->ni_nid = LNET_MKNID(net, ip); + return 0; + } + + n = libcfs_ipif_enumerate(&names); + if (n <= 0) { + CERROR("Net %s can't enumerate interfaces: %d\n", + libcfs_net2str(net), n); + return 0; + } + + for (i = 0; i < n; i++) { + if (!strcmp(names[i], "lo")) /* skip the loopback IF */ + continue; + + rc = libcfs_ipif_query(names[i], &up, &ip, &netmask); + + if (rc != 0) { + CWARN("Net %s can't query interface %s: %d\n", + libcfs_net2str(net), names[i], rc); + continue; + } + + if (!up) { + CWARN("Net %s ignoring interface %s (down)\n", + libcfs_net2str(net), names[i]); + continue; + } + + libcfs_ipif_free_enumeration(names, n); + ni->ni_nid = LNET_MKNID(net, ip); + return 0; + } + + CERROR("Net %s can't find any interfaces\n", libcfs_net2str(net)); + libcfs_ipif_free_enumeration(names, n); + return -ENOENT; +} +EXPORT_SYMBOL(lnet_set_ip_niaddr); + +#endif diff --git a/lnet/lnet/lib-eq.c b/lnet/lnet/lib-eq.c index 4992fce..98adecc 100644 --- a/lnet/lnet/lib-eq.c +++ b/lnet/lnet/lib-eq.c @@ -22,19 +22,18 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#define DEBUG_SUBSYSTEM S_PORTALS -#include +#define DEBUG_SUBSYSTEM S_LNET +#include -int -lib_api_eq_alloc (nal_t *apinal, ptl_size_t count, - ptl_eq_handler_t callback, - ptl_handle_eq_t *handle) +int +LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, + lnet_handle_eq_t *handle) { - lib_nal_t *nal = apinal->nal_data; - lib_eq_t *eq; - unsigned long flags; - int rc; + lnet_eq_t *eq; + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + /* We need count to be a power of 2 so that when eq_{enq,deq}_seq * overflow, they don't skip entries, so the queue has the same * apparent capacity at all times */ @@ -48,36 +47,24 @@ lib_api_eq_alloc (nal_t *apinal, ptl_size_t count, } if (count == 0) /* catch bad parameter / overflow on roundup */ - return (PTL_VAL_FAILED); + return (-EINVAL); - eq = lib_eq_alloc (nal); + eq = lnet_eq_alloc(); if (eq == NULL) - return (PTL_NO_SPACE); + return (-ENOMEM); - PORTAL_ALLOC(eq->eq_events, count * sizeof(ptl_event_t)); + LIBCFS_ALLOC(eq->eq_events, count * sizeof(lnet_event_t)); if (eq->eq_events == NULL) { - LIB_LOCK(nal, flags); - lib_eq_free (nal, eq); - LIB_UNLOCK(nal, flags); - } + LNET_LOCK(); + lnet_eq_free (eq); + LNET_UNLOCK(); - if (nal->libnal_map != NULL) { - struct iovec iov = { - .iov_base = eq->eq_events, - .iov_len = count * sizeof(ptl_event_t)}; - - rc = nal->libnal_map(nal, 1, &iov, &eq->eq_addrkey); - if (rc != PTL_OK) { - LIB_LOCK(nal, flags); - lib_eq_free (nal, eq); - LIB_UNLOCK(nal, flags); - return (rc); - } + return -ENOMEM; } /* NB this resets all event sequence numbers to 0, to be earlier * than eq_deq_seq */ - memset(eq->eq_events, 0, count * sizeof(ptl_event_t)); + memset(eq->eq_events, 0, count * sizeof(lnet_event_t)); eq->eq_deq_seq = 1; eq->eq_enq_seq = 1; @@ -85,77 +72,68 @@ lib_api_eq_alloc (nal_t *apinal, ptl_size_t count, eq->eq_refcount = 0; eq->eq_callback = callback; - LIB_LOCK(nal, flags); + LNET_LOCK(); - lib_initialise_handle (nal, &eq->eq_lh, PTL_COOKIE_TYPE_EQ); - list_add (&eq->eq_list, &nal->libnal_ni.ni_active_eqs); + lnet_initialise_handle (&eq->eq_lh, LNET_COOKIE_TYPE_EQ); + list_add (&eq->eq_list, &the_lnet.ln_active_eqs); - LIB_UNLOCK(nal, flags); + LNET_UNLOCK(); - ptl_eq2handle(handle, nal, eq); - return (PTL_OK); + lnet_eq2handle(handle, eq); + return (0); } -int -lib_api_eq_free(nal_t *apinal, ptl_handle_eq_t *eqh) +int +LNetEQFree(lnet_handle_eq_t eqh) { - lib_nal_t *nal = apinal->nal_data; - lib_eq_t *eq; + lnet_eq_t *eq; int size; - ptl_event_t *events; - void *addrkey; - unsigned long flags; + lnet_event_t *events; - LIB_LOCK(nal, flags); + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + LNET_LOCK(); - eq = 
ptl_handle2eq(eqh, nal); + eq = lnet_handle2eq(&eqh); if (eq == NULL) { - LIB_UNLOCK(nal, flags); - return (PTL_EQ_INVALID); + LNET_UNLOCK(); + return (-ENOENT); } if (eq->eq_refcount != 0) { - LIB_UNLOCK(nal, flags); - return (PTL_EQ_IN_USE); + LNET_UNLOCK(); + return (-EBUSY); } /* stash for free after lock dropped */ events = eq->eq_events; size = eq->eq_size; - addrkey = eq->eq_addrkey; - lib_invalidate_handle (nal, &eq->eq_lh); + lnet_invalidate_handle (&eq->eq_lh); list_del (&eq->eq_list); - lib_eq_free (nal, eq); - - LIB_UNLOCK(nal, flags); + lnet_eq_free (eq); - if (nal->libnal_unmap != NULL) { - struct iovec iov = { - .iov_base = events, - .iov_len = size * sizeof(ptl_event_t)}; - - nal->libnal_unmap(nal, 1, &iov, &addrkey); - } + LNET_UNLOCK(); - PORTAL_FREE(events, size * sizeof (ptl_event_t)); + LIBCFS_FREE(events, size * sizeof (lnet_event_t)); - return (PTL_OK); + return 0; } int -lib_get_event (lib_eq_t *eq, ptl_event_t *ev) +lib_get_event (lnet_eq_t *eq, lnet_event_t *ev) { - int new_index = eq->eq_deq_seq & (eq->eq_size - 1); - ptl_event_t *new_event = &eq->eq_events[new_index]; - int rc; + int new_index = eq->eq_deq_seq & (eq->eq_size - 1); + lnet_event_t *new_event = &eq->eq_events[new_index]; + int rc; ENTRY; CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n", new_event, eq->eq_deq_seq, eq->eq_size); - if (PTL_SEQ_GT (eq->eq_deq_seq, new_event->sequence)) { - RETURN(PTL_EQ_EMPTY); + if (LNET_SEQ_GT (eq->eq_deq_seq, new_event->sequence)) { + RETURN(0); } /* We've got a new event... */ @@ -163,11 +141,13 @@ lib_get_event (lib_eq_t *eq, ptl_event_t *ev) /* ...but did it overwrite an event we've not seen yet? */ if (eq->eq_deq_seq == new_event->sequence) { - rc = PTL_OK; + rc = 1; } else { - CERROR("Event Queue Overflow: eq seq %lu ev seq %lu\n", + /* don't complain with CERROR: some EQs are sized small + * anyway; if it's important, the caller should complain */ + CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n", eq->eq_deq_seq, new_event->sequence); - rc = PTL_EQ_DROPPED; + rc = -EOVERFLOW; } eq->eq_deq_seq = new_event->sequence + 1; @@ -176,13 +156,27 @@ lib_get_event (lib_eq_t *eq, ptl_event_t *ev) int -lib_api_eq_poll (nal_t *apinal, - ptl_handle_eq_t *eventqs, int neq, int timeout_ms, - ptl_event_t *event, int *which) +LNetEQGet (lnet_handle_eq_t eventq, lnet_event_t *event) +{ + int which; + + return LNetEQPoll(&eventq, 1, 0, + event, &which); +} + +int +LNetEQWait (lnet_handle_eq_t eventq, lnet_event_t *event) +{ + int which; + + return LNetEQPoll(&eventq, 1, LNET_TIME_FOREVER, + event, &which); +} + +int +LNetEQPoll (lnet_handle_eq_t *eventqs, int neq, int timeout_ms, + lnet_event_t *event, int *which) { - lib_nal_t *nal = apinal->nal_data; - lib_ni_t *ni = &nal->libnal_ni; - unsigned long flags; int i; int rc; #ifdef __KERNEL__ @@ -191,57 +185,112 @@ lib_api_eq_poll (nal_t *apinal, #else struct timeval then; struct timeval now; +# if HAVE_LIBPTHREAD struct timespec ts; +# endif + lnet_ni_t *eqwaitni = the_lnet.ln_eqwaitni; #endif ENTRY; - LIB_LOCK(nal, flags); + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if (neq < 1) + RETURN(-ENOENT); + + LNET_LOCK(); for (;;) { for (i = 0; i < neq; i++) { - lib_eq_t *eq = ptl_handle2eq(&eventqs[i], nal); + lnet_eq_t *eq = lnet_handle2eq(&eventqs[i]); + + if (eq == NULL) { + LNET_UNLOCK(); + RETURN(-ENOENT); + } rc = lib_get_event (eq, event); - if (rc != PTL_EQ_EMPTY) { - LIB_UNLOCK(nal, flags); + if (rc != 0) { + LNET_UNLOCK(); *which = i; RETURN(rc); } } +#ifdef __KERNEL__ if 
(timeout_ms == 0) { - LIB_UNLOCK (nal, flags); - RETURN (PTL_EQ_EMPTY); + LNET_UNLOCK (); + RETURN (0); } - /* Some architectures force us to do spin locking/unlocking - * in the same stack frame, means we can abstract the - * locking here */ -#ifdef __KERNEL__ cfs_waitlink_init(&wl); set_current_state(TASK_INTERRUPTIBLE); - cfs_waitq_add(&ni->ni_waitq, &wl); + cfs_waitq_add(&the_lnet.ln_waitq, &wl); - LIB_UNLOCK(nal, flags); + LNET_UNLOCK(); if (timeout_ms < 0) { - cfs_waitq_wait (&wl); + cfs_waitq_wait (&wl, CFS_TASK_INTERRUPTIBLE); } else { struct timeval tv; now = cfs_time_current(); - cfs_waitq_timedwait(&wl, cfs_time_seconds(timeout_ms)/1000); - cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), &tv); + cfs_waitq_timedwait(&wl, CFS_TASK_INTERRUPTIBLE, + cfs_time_seconds(timeout_ms)/1000); + cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), + &tv); timeout_ms -= tv.tv_sec * 1000 + tv.tv_usec / 1000; if (timeout_ms < 0) timeout_ms = 0; } - LIB_LOCK(nal, flags); - cfs_waitq_del(&ni->ni_waitq, &wl); + LNET_LOCK(); + cfs_waitq_del(&the_lnet.ln_waitq, &wl); #else + if (eqwaitni != NULL) { + /* I have a single NI that I have to call into, to get + * events queued, or to block. */ + lnet_ni_addref_locked(eqwaitni); + LNET_UNLOCK(); + + if (timeout_ms <= 0) { + (eqwaitni->ni_lnd->lnd_wait)(eqwaitni, timeout_ms); + } else { + gettimeofday(&then, NULL); + + (eqwaitni->ni_lnd->lnd_wait)(eqwaitni, timeout_ms); + + gettimeofday(&now, NULL); + timeout_ms -= (now.tv_sec - then.tv_sec) * 1000 + + (now.tv_usec - then.tv_usec) / 1000; + if (timeout_ms < 0) + timeout_ms = 0; + } + + LNET_LOCK(); + lnet_ni_decref_locked(eqwaitni); + + /* don't call into eqwaitni again if timeout has + * expired */ + if (timeout_ms == 0) + eqwaitni = NULL; + + continue; /* go back and check for events */ + } + + if (timeout_ms == 0) { + LNET_UNLOCK(); + RETURN (0); + } + +# if !HAVE_LIBPTHREAD + /* If I'm single-threaded, LNET fails at startup if it can't + * set the_lnet.ln_eqwaitni correctly. */ + LBUG(); +# else if (timeout_ms < 0) { - pthread_cond_wait(&ni->ni_cond, &ni->ni_mutex); + pthread_cond_wait(&the_lnet.ln_cond, + &the_lnet.ln_lock); } else { gettimeofday(&then, NULL); @@ -253,8 +302,8 @@ lib_api_eq_poll (nal_t *apinal, ts.tv_nsec -= 1000000000; } - pthread_cond_timedwait(&ni->ni_cond, - &ni->ni_mutex, &ts); + pthread_cond_timedwait(&the_lnet.ln_cond, + &the_lnet.ln_lock, &ts); gettimeofday(&now, NULL); timeout_ms -= (now.tv_sec - then.tv_sec) * 1000 + @@ -263,6 +312,7 @@ lib_api_eq_poll (nal_t *apinal, if (timeout_ms < 0) timeout_ms = 0; } +# endif #endif } } diff --git a/lnet/lnet/lib-init.c b/lnet/lnet/lib-init.c deleted file mode 100644 index 6d0099c..0000000 --- a/lnet/lnet/lib-init.c +++ /dev/null @@ -1,433 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * lib/lib-init.c - * Start up the internal library and clear all structures - * Called by the NAL when it initializes. Safe to call multiple times. - * - * Copyright (c) 2001-2003 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -# define DEBUG_SUBSYSTEM S_PORTALS -#include - -#ifdef __KERNEL__ -# include -#else -# include -# include -#endif - -#ifndef PTL_USE_LIB_FREELIST - -int -kportal_descriptor_setup (lib_nal_t *nal, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) -{ - /* Ignore requested limits! */ - actual_limits->max_mes = INT_MAX; - actual_limits->max_mds = INT_MAX; - actual_limits->max_eqs = INT_MAX; - - return PTL_OK; -} - -void -kportal_descriptor_cleanup (lib_nal_t *nal) -{ -} -#else - -int -lib_freelist_init (lib_nal_t *nal, lib_freelist_t *fl, int n, int size) -{ - char *space; - - LASSERT (n > 0); - - size += offsetof (lib_freeobj_t, fo_contents); - - PORTAL_ALLOC(space, n * size); - if (space == NULL) - return (PTL_NO_SPACE); - - CFS_INIT_LIST_HEAD (&fl->fl_list); - fl->fl_objs = space; - fl->fl_nobjs = n; - fl->fl_objsize = size; - - do - { - memset (space, 0, size); - list_add ((struct list_head *)space, &fl->fl_list); - space += size; - } while (--n != 0); - - return (PTL_OK); -} - -void -lib_freelist_fini (lib_nal_t *nal, lib_freelist_t *fl) -{ - struct list_head *el; - int count; - - if (fl->fl_nobjs == 0) - return; - - count = 0; - for (el = fl->fl_list.next; el != &fl->fl_list; el = el->next) - count++; - - LASSERT (count == fl->fl_nobjs); - - PORTAL_FREE(fl->fl_objs, fl->fl_nobjs * fl->fl_objsize); - memset (fl, 0, sizeof (fl)); -} - -int -kportal_descriptor_setup (lib_nal_t *nal, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) -{ - /* NB on failure caller must still call kportal_descriptor_cleanup */ - /* ****** */ - lib_ni_t *ni = &nal->libnal_ni; - int rc; - - memset (&ni->ni_free_mes, 0, sizeof (ni->ni_free_mes)); - memset (&ni->ni_free_msgs, 0, sizeof (ni->ni_free_msgs)); - memset (&ni->ni_free_mds, 0, sizeof (ni->ni_free_mds)); - memset (&ni->ni_free_eqs, 0, sizeof (ni->ni_free_eqs)); - - /* Ignore requested limits! */ - actual_limits->max_mes = MAX_MES; - actual_limits->max_mds = MAX_MDS; - actual_limits->max_eqs = MAX_EQS; - /* Hahahah what a load of bollocks. There's nowhere to - * specify the max # messages in-flight */ - - rc = lib_freelist_init (nal, &ni->ni_free_mes, - MAX_MES, sizeof (lib_me_t)); - if (rc != PTL_OK) - return (rc); - - rc = lib_freelist_init (nal, &ni->ni_free_msgs, - MAX_MSGS, sizeof (lib_msg_t)); - if (rc != PTL_OK) - return (rc); - - rc = lib_freelist_init (nal, &ni->ni_free_mds, - MAX_MDS, sizeof (lib_md_t)); - if (rc != PTL_OK) - return (rc); - - rc = lib_freelist_init (nal, &ni->ni_free_eqs, - MAX_EQS, sizeof (lib_eq_t)); - return (rc); -} - -void -kportal_descriptor_cleanup (lib_nal_t *nal) -{ - lib_ni_t *ni = &nal->libnal_ni; - - lib_freelist_fini (nal, &ni->ni_free_mes); - lib_freelist_fini (nal, &ni->ni_free_msgs); - lib_freelist_fini (nal, &ni->ni_free_mds); - lib_freelist_fini (nal, &ni->ni_free_eqs); -} - -#endif - -__u64 -lib_create_interface_cookie (lib_nal_t *nal) -{ - /* NB the interface cookie in wire handles guards against delayed - * replies and ACKs appearing valid in a new instance of the same - * interface. Initialisation time, even if it's only implemented - * to millisecond resolution is probably easily good enough. 
*/ - struct timeval tv; - __u64 cookie; -#ifndef __KERNEL__ - int rc = gettimeofday (&tv, NULL); - LASSERT (rc == 0); -#else - do_gettimeofday(&tv); -#endif - cookie = tv.tv_sec; - cookie *= 1000000; - cookie += tv.tv_usec; - return (cookie); -} - -int -lib_setup_handle_hash (lib_nal_t *nal) -{ - lib_ni_t *ni = &nal->libnal_ni; - int i; - - /* Arbitrary choice of hash table size */ -#ifdef __KERNEL__ - ni->ni_lh_hash_size = PAGE_SIZE / sizeof (struct list_head); -#else - ni->ni_lh_hash_size = (MAX_MES + MAX_MDS + MAX_EQS)/4; -#endif - PORTAL_ALLOC(ni->ni_lh_hash_table, - ni->ni_lh_hash_size * sizeof (struct list_head)); - if (ni->ni_lh_hash_table == NULL) - return (PTL_NO_SPACE); - - for (i = 0; i < ni->ni_lh_hash_size; i++) - CFS_INIT_LIST_HEAD (&ni->ni_lh_hash_table[i]); - - ni->ni_next_object_cookie = PTL_COOKIE_TYPES; - - return (PTL_OK); -} - -void -lib_cleanup_handle_hash (lib_nal_t *nal) -{ - lib_ni_t *ni = &nal->libnal_ni; - - if (ni->ni_lh_hash_table == NULL) - return; - - PORTAL_FREE(ni->ni_lh_hash_table, - ni->ni_lh_hash_size * sizeof (struct list_head)); -} - -lib_handle_t * -lib_lookup_cookie (lib_nal_t *nal, __u64 cookie, int type) -{ - /* ALWAYS called with statelock held */ - lib_ni_t *ni = &nal->libnal_ni; - struct list_head *list; - struct list_head *el; - unsigned int hash; - - if ((cookie & (PTL_COOKIE_TYPES - 1)) != type) - return (NULL); - - hash = ((unsigned int)cookie) % ni->ni_lh_hash_size; - list = &ni->ni_lh_hash_table[hash]; - - list_for_each (el, list) { - lib_handle_t *lh = list_entry (el, lib_handle_t, lh_hash_chain); - - if (lh->lh_cookie == cookie) - return (lh); - } - - return (NULL); -} - -void -lib_initialise_handle (lib_nal_t *nal, lib_handle_t *lh, int type) -{ - /* ALWAYS called with statelock held */ - lib_ni_t *ni = &nal->libnal_ni; - unsigned int hash; - - LASSERT (type >= 0 && type < PTL_COOKIE_TYPES); - lh->lh_cookie = ni->ni_next_object_cookie | type; - ni->ni_next_object_cookie += PTL_COOKIE_TYPES; - - hash = ((unsigned int)lh->lh_cookie) % ni->ni_lh_hash_size; - list_add (&lh->lh_hash_chain, &ni->ni_lh_hash_table[hash]); -} - -void -lib_invalidate_handle (lib_nal_t *nal, lib_handle_t *lh) -{ - list_del (&lh->lh_hash_chain); -} - -int -lib_init(lib_nal_t *libnal, nal_t *apinal, - ptl_process_id_t process_id, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) -{ - int rc = PTL_OK; - lib_ni_t *ni = &libnal->libnal_ni; - int ptl_size; - int i; - ENTRY; - - /* NB serialised in PtlNIInit() */ - - lib_assert_wire_constants (); - - /* Setup the API nal with the lib API handling functions */ - apinal->nal_get_id = lib_api_get_id; - apinal->nal_ni_status = lib_api_ni_status; - apinal->nal_ni_dist = lib_api_ni_dist; - apinal->nal_fail_nid = lib_api_fail_nid; - apinal->nal_loopback = lib_api_loopback; - apinal->nal_me_attach = lib_api_me_attach; - apinal->nal_me_insert = lib_api_me_insert; - apinal->nal_me_unlink = lib_api_me_unlink; - apinal->nal_md_attach = lib_api_md_attach; - apinal->nal_md_bind = lib_api_md_bind; - apinal->nal_md_unlink = lib_api_md_unlink; - apinal->nal_md_update = lib_api_md_update; - apinal->nal_eq_alloc = lib_api_eq_alloc; - apinal->nal_eq_free = lib_api_eq_free; - apinal->nal_eq_poll = lib_api_eq_poll; - apinal->nal_put = lib_api_put; - apinal->nal_get = lib_api_get; - - apinal->nal_data = libnal; - ni->ni_api = apinal; - - rc = kportal_descriptor_setup (libnal, requested_limits, - &ni->ni_actual_limits); - if (rc != PTL_OK) - goto out; - - memset(&ni->ni_counters, 0, sizeof(lib_counters_t)); - - 
CFS_INIT_LIST_HEAD (&ni->ni_active_msgs); - CFS_INIT_LIST_HEAD (&ni->ni_active_mds); - CFS_INIT_LIST_HEAD (&ni->ni_active_eqs); - CFS_INIT_LIST_HEAD (&ni->ni_test_peers); - -#ifdef __KERNEL__ - spin_lock_init (&ni->ni_lock); - cfs_waitq_init (&ni->ni_waitq); -#else - pthread_mutex_init(&ni->ni_mutex, NULL); - pthread_cond_init(&ni->ni_cond, NULL); -#endif - - ni->ni_interface_cookie = lib_create_interface_cookie (libnal); - ni->ni_next_object_cookie = 0; - rc = lib_setup_handle_hash (libnal); - if (rc != PTL_OK) - goto out; - - ni->ni_pid = process_id; - - if (requested_limits != NULL) - ptl_size = requested_limits->max_pt_index + 1; - else - ptl_size = 64; - - ni->ni_portals.size = ptl_size; - PORTAL_ALLOC(ni->ni_portals.tbl, - ptl_size * sizeof(struct list_head)); - if (ni->ni_portals.tbl == NULL) { - rc = PTL_NO_SPACE; - goto out; - } - - for (i = 0; i < ptl_size; i++) - CFS_INIT_LIST_HEAD(&(ni->ni_portals.tbl[i])); - - /* max_{mes,mds,eqs} set in kportal_descriptor_setup */ - - /* We don't have an access control table! */ - ni->ni_actual_limits.max_ac_index = -1; - - ni->ni_actual_limits.max_pt_index = ptl_size - 1; - ni->ni_actual_limits.max_md_iovecs = PTL_MD_MAX_IOV; - ni->ni_actual_limits.max_me_list = INT_MAX; - - /* We don't support PtlGetPut! */ - ni->ni_actual_limits.max_getput_md = 0; - - if (actual_limits != NULL) - *actual_limits = ni->ni_actual_limits; - - /* disable loopback optimisation by default */ - ni->ni_loopback = 0; - - out: - if (rc != PTL_OK) { - lib_cleanup_handle_hash (libnal); - kportal_descriptor_cleanup (libnal); - } - - RETURN (rc); -} - -int -lib_fini(lib_nal_t *nal) -{ - lib_ni_t *ni = &nal->libnal_ni; - int idx; - - /* NB no state_lock() since this is the last reference. The NAL - * should have shut down already, so it should be safe to unlink - * and free all descriptors, even those that appear committed to a - * network op (eg MD with non-zero pending count) - */ - - for (idx = 0; idx < ni->ni_portals.size; idx++) - while (!list_empty (&ni->ni_portals.tbl[idx])) { - lib_me_t *me = list_entry (ni->ni_portals.tbl[idx].next, - lib_me_t, me_list); - - CERROR ("Active me %p on exit\n", me); - list_del (&me->me_list); - lib_me_free (nal, me); - } - - while (!list_empty (&ni->ni_active_mds)) { - lib_md_t *md = list_entry (ni->ni_active_mds.next, - lib_md_t, md_list); - - CERROR ("Active md %p on exit\n", md); - list_del (&md->md_list); - lib_md_free (nal, md); - } - - while (!list_empty (&ni->ni_active_eqs)) { - lib_eq_t *eq = list_entry (ni->ni_active_eqs.next, - lib_eq_t, eq_list); - - CERROR ("Active eq %p on exit\n", eq); - list_del (&eq->eq_list); - lib_eq_free (nal, eq); - } - - while (!list_empty (&ni->ni_active_msgs)) { - lib_msg_t *msg = list_entry (ni->ni_active_msgs.next, - lib_msg_t, msg_list); - - CERROR ("Active msg %p on exit\n", msg); - list_del (&msg->msg_list); - lib_msg_free (nal, msg); - } - - PORTAL_FREE(ni->ni_portals.tbl, - ni->ni_portals.size * sizeof(struct list_head)); - - lib_cleanup_handle_hash (nal); - kportal_descriptor_cleanup (nal); - -#ifndef __KERNEL__ - pthread_mutex_destroy(&ni->ni_mutex); - pthread_cond_destroy(&ni->ni_cond); -#endif - - return (PTL_OK); -} diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c index f188e2a..0e8524c 100644 --- a/lnet/lnet/lib-md.c +++ b/lnet/lnet/lib-md.c @@ -22,109 +22,92 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET -#ifndef __KERNEL__ -# include -#else -# include -#endif +#include -#include - -/* must be called with state lock held */ +/* must be called with LNET_LOCK held */ void -lib_md_unlink(lib_nal_t *nal, lib_md_t *md) +lnet_md_unlink(lnet_libmd_t *md) { - if ((md->md_flags & PTL_MD_FLAG_ZOMBIE) == 0) { + if ((md->md_flags & LNET_MD_FLAG_ZOMBIE) == 0) { /* first unlink attempt... */ - lib_me_t *me = md->me; + lnet_me_t *me = md->md_me; - md->md_flags |= PTL_MD_FLAG_ZOMBIE; + md->md_flags |= LNET_MD_FLAG_ZOMBIE; /* Disassociate from ME (if any), and unlink it if it was created - * with PTL_UNLINK */ + * with LNET_UNLINK */ if (me != NULL) { - me->md = NULL; - if (me->unlink == PTL_UNLINK) - lib_me_unlink(nal, me); + me->me_md = NULL; + if (me->me_unlink == LNET_UNLINK) + lnet_me_unlink(me); } /* ensure all future handle lookups fail */ - lib_invalidate_handle(nal, &md->md_lh); + lnet_invalidate_handle(&md->md_lh); } - if (md->pending != 0) { + if (md->md_refcount != 0) { CDEBUG(D_NET, "Queueing unlink of md %p\n", md); return; } CDEBUG(D_NET, "Unlinking md %p\n", md); - if ((md->options & PTL_MD_KIOV) != 0) { - if (nal->libnal_unmap_pages != NULL) - nal->libnal_unmap_pages (nal, - md->md_niov, - md->md_iov.kiov, - &md->md_addrkey); - } else if (nal->libnal_unmap != NULL) { - nal->libnal_unmap (nal, - md->md_niov, md->md_iov.iov, - &md->md_addrkey); - } - - if (md->eq != NULL) { - md->eq->eq_refcount--; - LASSERT (md->eq->eq_refcount >= 0); + if (md->md_eq != NULL) { + md->md_eq->eq_refcount--; + LASSERT (md->md_eq->eq_refcount >= 0); } list_del (&md->md_list); - lib_md_free(nal, md); + lnet_md_free(md); } -/* must be called with state lock held */ +/* must be called with LNET_LOCK held */ static int -lib_md_build(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd, int unlink) +lib_md_build(lnet_libmd_t *lmd, lnet_md_t *umd, int unlink) { - lib_eq_t *eq = NULL; - int rc; - int i; - int niov; - int total_length = 0; + lnet_eq_t *eq = NULL; + int i; + unsigned int niov; + int total_length = 0; /* NB we are passed an allocated, but uninitialised/active md. - * if we return success, caller may lib_md_unlink() it. - * otherwise caller may only lib_md_free() it. + * if we return success, caller may lnet_md_unlink() it. + * otherwise caller may only lnet_md_free() it. */ - if (!PtlHandleIsEqual (umd->eq_handle, PTL_EQ_NONE)) { - eq = ptl_handle2eq(&umd->eq_handle, nal); + if (!LNetHandleIsEqual (umd->eq_handle, LNET_EQ_NONE)) { + eq = lnet_handle2eq(&umd->eq_handle); if (eq == NULL) - return PTL_EQ_INVALID; + return -ENOENT; } /* This implementation doesn't know how to create START events or * disable END events. Best to LASSERT our caller is compliant so * we find out quickly... */ - LASSERT (eq == NULL || - ((umd->options & PTL_MD_EVENT_START_DISABLE) != 0 && - (umd->options & PTL_MD_EVENT_END_DISABLE) == 0)); - - lmd->me = NULL; - lmd->start = umd->start; - lmd->offset = 0; - lmd->max_size = umd->max_size; - lmd->options = umd->options; - lmd->user_ptr = umd->user_ptr; - lmd->eq = eq; - lmd->threshold = umd->threshold; - lmd->pending = 0; - lmd->md_flags = (unlink == PTL_UNLINK) ? PTL_MD_FLAG_AUTO_UNLINK : 0; - - if ((umd->options & PTL_MD_IOVEC) != 0) { - - if ((umd->options & PTL_MD_KIOV) != 0) /* Can't specify both */ - return PTL_MD_ILLEGAL; + /* TODO - reevaluate what should be here in light of + * the removal of the start and end events + * maybe there we shouldn't even allow LNET_EQ_NONE!) 
+ LASSERT (eq == NULL); + */ + + lmd->md_me = NULL; + lmd->md_start = umd->start; + lmd->md_offset = 0; + lmd->md_max_size = umd->max_size; + lmd->md_options = umd->options; + lmd->md_user_ptr = umd->user_ptr; + lmd->md_eq = eq; + lmd->md_threshold = umd->threshold; + lmd->md_refcount = 0; + lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0; + + if ((umd->options & LNET_MD_IOVEC) != 0) { + + if ((umd->options & LNET_MD_KIOV) != 0) /* Can't specify both */ + return -EINVAL; lmd->md_niov = niov = umd->length; memcpy(lmd->md_iov.iov, umd->start, @@ -133,33 +116,22 @@ lib_md_build(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd, int unlink) for (i = 0; i < niov; i++) { /* We take the base address on trust */ if (lmd->md_iov.iov[i].iov_len <= 0) /* invalid length */ - return PTL_MD_ILLEGAL; + return -EINVAL; total_length += lmd->md_iov.iov[i].iov_len; } - lmd->length = total_length; + lmd->md_length = total_length; - if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */ + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ (umd->max_size < 0 || umd->max_size > total_length)) // illegal max_size - return PTL_MD_ILLEGAL; + return -EINVAL; - if (nal->libnal_map != NULL) { - rc = nal->libnal_map (nal, niov, lmd->md_iov.iov, - &lmd->md_addrkey); - if (rc != PTL_OK) - return (rc); - } - } else if ((umd->options & PTL_MD_KIOV) != 0) { + } else if ((umd->options & LNET_MD_KIOV) != 0) { #ifndef __KERNEL__ - return PTL_MD_ILLEGAL; + return -EINVAL; #else - /* Trap attempt to use paged I/O if unsupported early. */ - if (nal->libnal_send_pages == NULL || - nal->libnal_recv_pages == NULL) - return PTL_MD_INVALID; - lmd->md_niov = niov = umd->length; memcpy(lmd->md_iov.kiov, umd->start, niov * sizeof (lmd->md_iov.kiov[0])); @@ -167,260 +139,179 @@ lib_md_build(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd, int unlink) for (i = 0; i < niov; i++) { /* We take the page pointer on trust */ if (lmd->md_iov.kiov[i].kiov_offset + - lmd->md_iov.kiov[i].kiov_len > PAGE_SIZE ) - return PTL_VAL_FAILED; /* invalid length */ + lmd->md_iov.kiov[i].kiov_len > CFS_PAGE_SIZE ) + return -EINVAL; /* invalid length */ total_length += lmd->md_iov.kiov[i].kiov_len; } - lmd->length = total_length; + lmd->md_length = total_length; - if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */ + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ (umd->max_size < 0 || umd->max_size > total_length)) // illegal max_size - return PTL_MD_ILLEGAL; - - if (nal->libnal_map_pages != NULL) { - rc = nal->libnal_map_pages (nal, niov, lmd->md_iov.kiov, - &lmd->md_addrkey); - if (rc != PTL_OK) - return (rc); - } + return -EINVAL; #endif } else { /* contiguous */ - lmd->length = umd->length; + lmd->md_length = umd->length; lmd->md_niov = niov = 1; lmd->md_iov.iov[0].iov_base = umd->start; lmd->md_iov.iov[0].iov_len = umd->length; - if ((umd->options & PTL_MD_MAX_SIZE) != 0 && /* max size used */ + if ((umd->options & LNET_MD_MAX_SIZE) != 0 && /* max size used */ (umd->max_size < 0 || umd->max_size > umd->length)) // illegal max_size - return PTL_MD_ILLEGAL; - - if (nal->libnal_map != NULL) { - rc = nal->libnal_map (nal, niov, lmd->md_iov.iov, - &lmd->md_addrkey); - if (rc != PTL_OK) - return (rc); - } + return -EINVAL; } if (eq != NULL) eq->eq_refcount++; /* It's good; let handle2md succeed and add to active mds */ - lib_initialise_handle (nal, &lmd->md_lh, PTL_COOKIE_TYPE_MD); - list_add (&lmd->md_list, &nal->libnal_ni.ni_active_mds); + lnet_initialise_handle (&lmd->md_lh, 
LNET_COOKIE_TYPE_MD); + list_add (&lmd->md_list, &the_lnet.ln_active_mds); - return PTL_OK; + return 0; } -/* must be called with state lock held */ +/* must be called with LNET_LOCK held */ void -lib_md_deconstruct(lib_nal_t *nal, lib_md_t *lmd, ptl_md_t *umd) +lnet_md_deconstruct(lnet_libmd_t *lmd, lnet_md_t *umd) { /* NB this doesn't copy out all the iov entries so when a * discontiguous MD is copied out, the target gets to know the * original iov pointer (in start) and the number of entries it had * and that's all. */ - umd->start = lmd->start; - umd->length = ((lmd->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) == 0) ? - lmd->length : lmd->md_niov; - umd->threshold = lmd->threshold; - umd->max_size = lmd->max_size; - umd->options = lmd->options; - umd->user_ptr = lmd->user_ptr; - ptl_eq2handle(&umd->eq_handle, nal, lmd->eq); + umd->start = lmd->md_start; + umd->length = ((lmd->md_options & (LNET_MD_IOVEC | LNET_MD_KIOV)) == 0) ? + lmd->md_length : lmd->md_niov; + umd->threshold = lmd->md_threshold; + umd->max_size = lmd->md_max_size; + umd->options = lmd->md_options; + umd->user_ptr = lmd->md_user_ptr; + lnet_eq2handle(&umd->eq_handle, lmd->md_eq); } int -lib_api_md_attach(nal_t *apinal, ptl_handle_me_t *meh, - ptl_md_t *umd, ptl_unlink_t unlink, - ptl_handle_md_t *handle) +LNetMDAttach(lnet_handle_me_t meh, lnet_md_t umd, + lnet_unlink_t unlink, lnet_handle_md_t *handle) { - lib_nal_t *nal = apinal->nal_data; - lib_me_t *me; - lib_md_t *md; - unsigned long flags; - int rc; - - if ((umd->options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 && - umd->length > PTL_MD_MAX_IOV) /* too many fragments */ - return PTL_IOV_INVALID; - - md = lib_md_alloc(nal, umd); + lnet_me_t *me; + lnet_libmd_t *md; + int rc; + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if ((umd.options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 && + umd.length > LNET_MAX_IOV) /* too many fragments */ + return -EINVAL; + + md = lnet_md_alloc(&umd); if (md == NULL) - return PTL_NO_SPACE; + return -ENOMEM; - LIB_LOCK(nal, flags); + LNET_LOCK(); - me = ptl_handle2me(meh, nal); + me = lnet_handle2me(&meh); if (me == NULL) { - rc = PTL_ME_INVALID; - } else if (me->md != NULL) { - rc = PTL_ME_IN_USE; + rc = -ENOENT; + } else if (me->me_md != NULL) { + rc = -EBUSY; } else { - rc = lib_md_build(nal, md, umd, unlink); - if (rc == PTL_OK) { - me->md = md; - md->me = me; + rc = lib_md_build(md, &umd, unlink); + if (rc == 0) { + me->me_md = md; + md->md_me = me; + + lnet_md2handle(handle, md); - ptl_md2handle(handle, nal, md); + /* check if this MD matches any blocked msgs */ + lnet_match_blocked_msg(md); /* expects LNET_LOCK held */ - LIB_UNLOCK(nal, flags); - return (PTL_OK); + LNET_UNLOCK(); + return (0); } } - lib_md_free (nal, md); + lnet_md_free (md); - LIB_UNLOCK(nal, flags); + LNET_UNLOCK(); return (rc); } int -lib_api_md_bind(nal_t *apinal, - ptl_md_t *umd, ptl_unlink_t unlink, - ptl_handle_md_t *handle) +LNetMDBind(lnet_md_t umd, lnet_unlink_t unlink, lnet_handle_md_t *handle) { - lib_nal_t *nal = apinal->nal_data; - lib_md_t *md; - unsigned long flags; - int rc; + lnet_libmd_t *md; + int rc; - if ((umd->options & (PTL_MD_KIOV | PTL_MD_IOVEC)) != 0 && - umd->length > PTL_MD_MAX_IOV) /* too many fragments */ - return PTL_IOV_INVALID; + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if ((umd.options & (LNET_MD_KIOV | LNET_MD_IOVEC)) != 0 && + umd.length > LNET_MAX_IOV) /* too many fragments */ + return -EINVAL; - md = lib_md_alloc(nal, umd); + md = lnet_md_alloc(&umd); if (md == NULL) - return 
PTL_NO_SPACE; + return -ENOMEM; - LIB_LOCK(nal, flags); + LNET_LOCK(); - rc = lib_md_build(nal, md, umd, unlink); + rc = lib_md_build(md, &umd, unlink); - if (rc == PTL_OK) { - ptl_md2handle(handle, nal, md); + if (rc == 0) { + lnet_md2handle(handle, md); - LIB_UNLOCK(nal, flags); - return (PTL_OK); + LNET_UNLOCK(); + return (0); } - lib_md_free (nal, md); + lnet_md_free (md); - LIB_UNLOCK(nal, flags); + LNET_UNLOCK(); return (rc); } int -lib_api_md_unlink (nal_t *apinal, ptl_handle_md_t *mdh) +LNetMDUnlink (lnet_handle_md_t mdh) { - lib_nal_t *nal = apinal->nal_data; - ptl_event_t ev; - lib_md_t *md; - unsigned long flags; + lnet_event_t ev; + lnet_libmd_t *md; - LIB_LOCK(nal, flags); + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + LNET_LOCK(); - md = ptl_handle2md(mdh, nal); + md = lnet_handle2md(&mdh); if (md == NULL) { - LIB_UNLOCK(nal, flags); - return PTL_MD_INVALID; + LNET_UNLOCK(); + return -ENOENT; } - /* If the MD is busy, lib_md_unlink just marks it for deletion, and + /* If the MD is busy, lnet_md_unlink just marks it for deletion, and * when the NAL is done, the completion event flags that the MD was * unlinked. Otherwise, we enqueue an event now... */ - if (md->eq != NULL && - md->pending == 0) { + if (md->md_eq != NULL && + md->md_refcount == 0) { memset(&ev, 0, sizeof(ev)); - ev.type = PTL_EVENT_UNLINK; - ev.ni_fail_type = PTL_OK; + ev.type = LNET_EVENT_UNLINK; + ev.status = 0; ev.unlinked = 1; - lib_md_deconstruct(nal, md, &ev.md); - ptl_md2handle(&ev.md_handle, nal, md); + lnet_md_deconstruct(md, &ev.md); + lnet_md2handle(&ev.md_handle, md); - lib_enq_event_locked(nal, NULL, md->eq, &ev); + lnet_enq_event_locked(md->md_eq, &ev); } - lib_md_unlink(nal, md); + lnet_md_unlink(md); - LIB_UNLOCK(nal, flags); - return PTL_OK; + LNET_UNLOCK(); + return 0; } -int -lib_api_md_update (nal_t *apinal, - ptl_handle_md_t *mdh, - ptl_md_t *oldumd, ptl_md_t *newumd, - ptl_handle_eq_t *testqh) -{ - lib_nal_t *nal = apinal->nal_data; - lib_md_t *md; - lib_eq_t *test_eq = NULL; - unsigned long flags; - int rc; - - LIB_LOCK(nal, flags); - - md = ptl_handle2md(mdh, nal); - if (md == NULL) { - rc = PTL_MD_INVALID; - goto out; - } - - if (oldumd != NULL) - lib_md_deconstruct(nal, md, oldumd); - - if (newumd == NULL) { - rc = PTL_OK; - goto out; - } - - /* XXX fttb, the new MD must be the same "shape" wrt fragmentation, - * since we simply overwrite the old lib-md */ - if ((((newumd->options ^ md->options) & - (PTL_MD_IOVEC | PTL_MD_KIOV)) != 0) || - ((newumd->options & (PTL_MD_IOVEC | PTL_MD_KIOV)) != 0 && - newumd->length != md->md_niov)) { - rc = PTL_IOV_INVALID; - goto out; - } - - if (!PtlHandleIsEqual (*testqh, PTL_EQ_NONE)) { - test_eq = ptl_handle2eq(testqh, nal); - if (test_eq == NULL) { - rc = PTL_EQ_INVALID; - goto out; - } - } - - if (md->pending != 0) { - rc = PTL_MD_NO_UPDATE; - goto out; - } - - if (test_eq == NULL || - test_eq->eq_deq_seq == test_eq->eq_enq_seq) { - lib_me_t *me = md->me; - int unlink = (md->md_flags & PTL_MD_FLAG_AUTO_UNLINK) ? - PTL_UNLINK : PTL_RETAIN; - - // #warning this does not track eq refcounts properly - rc = lib_md_build(nal, md, newumd, unlink); - - md->me = me; - } else { - rc = PTL_MD_NO_UPDATE; - } - - out: - LIB_UNLOCK(nal, flags); - - return rc; -} diff --git a/lnet/lnet/lib-me.c b/lnet/lnet/lib-me.c index cbc7c53..fb72c6d 100644 --- a/lnet/lnet/lib-me.c +++ b/lnet/lnet/lib-me.c @@ -22,160 +22,152 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET -#ifndef __KERNEL__ -# include -#else -# include -#endif - -#include +#include int -lib_api_me_attach(nal_t *apinal, - ptl_pt_index_t portal, - ptl_process_id_t match_id, - ptl_match_bits_t match_bits, - ptl_match_bits_t ignore_bits, - ptl_unlink_t unlink, ptl_ins_pos_t pos, - ptl_handle_me_t *handle) +LNetMEAttach(unsigned int portal, + lnet_process_id_t match_id, + __u64 match_bits, __u64 ignore_bits, + lnet_unlink_t unlink, lnet_ins_pos_t pos, + lnet_handle_me_t *handle) { - lib_nal_t *nal = apinal->nal_data; - lib_ni_t *ni = &nal->libnal_ni; - lib_ptl_t *tbl = &ni->ni_portals; - lib_me_t *me; - unsigned long flags; + lnet_me_t *me; - if (portal >= tbl->size) - return PTL_PT_INDEX_INVALID; + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if (portal >= the_lnet.ln_nportals) + return -EINVAL; - me = lib_me_alloc (nal); + me = lnet_me_alloc(); if (me == NULL) - return PTL_NO_SPACE; + return -ENOMEM; - LIB_LOCK(nal, flags); + LNET_LOCK(); - me->match_id = match_id; - me->match_bits = match_bits; - me->ignore_bits = ignore_bits; - me->unlink = unlink; - me->md = NULL; + me->me_portal = portal; + me->me_match_id = match_id; + me->me_match_bits = match_bits; + me->me_ignore_bits = ignore_bits; + me->me_unlink = unlink; + me->me_md = NULL; - lib_initialise_handle (nal, &me->me_lh, PTL_COOKIE_TYPE_ME); + lnet_initialise_handle (&me->me_lh, LNET_COOKIE_TYPE_ME); - if (pos == PTL_INS_AFTER) - list_add_tail(&me->me_list, &(tbl->tbl[portal])); + if (pos == LNET_INS_AFTER) + list_add_tail(&me->me_list, &(the_lnet.ln_portals[portal].ptl_ml)); else - list_add(&me->me_list, &(tbl->tbl[portal])); + list_add(&me->me_list, &(the_lnet.ln_portals[portal].ptl_ml)); - ptl_me2handle(handle, nal, me); + lnet_me2handle(handle, me); - LIB_UNLOCK(nal, flags); + LNET_UNLOCK(); - return PTL_OK; + return 0; }
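A hypothetical caller's view of the new API (posting a single-use receive buffer with LNetMEAttach() above and LNetMDAttach() from lib-md.c; the portal number, match bits and helper name are illustrative only, and this compiles only against the LNET headers this patch introduces):

static int post_recv_buffer(void *buf, unsigned int len, __u64 mbits,
                            lnet_handle_md_t *mdh)
{
        lnet_process_id_t anyid;
        lnet_handle_me_t  meh;
        lnet_md_t         md;
        int               rc;

        anyid.nid = LNET_NID_ANY;              /* accept any peer */
        anyid.pid = LNET_PID_ANY;

        rc = LNetMEAttach(4, anyid, mbits, 0, LNET_UNLINK,
                          LNET_INS_AFTER, &meh);
        if (rc != 0)
                return rc;

        memset(&md, 0, sizeof(md));
        md.start     = buf;
        md.length    = len;
        md.threshold = 1;                      /* single use */
        md.options   = LNET_MD_OP_PUT;
        md.eq_handle = LNET_EQ_NONE;           /* no event queue */

        rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh);
        if (rc != 0)
                LNetMEUnlink(meh);             /* roll back the bare ME */
        return rc;
}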
-int -lib_api_me_insert(nal_t *apinal, - ptl_handle_me_t *current_meh, - ptl_process_id_t match_id, - ptl_match_bits_t match_bits, - ptl_match_bits_t ignore_bits, - ptl_unlink_t unlink, ptl_ins_pos_t pos, - ptl_handle_me_t *handle) +int +LNetMEInsert(lnet_handle_me_t current_meh, + lnet_process_id_t match_id, + __u64 match_bits, __u64 ignore_bits, + lnet_unlink_t unlink, lnet_ins_pos_t pos, + lnet_handle_me_t *handle) { - lib_nal_t *nal = apinal->nal_data; - lib_me_t *current_me; - lib_me_t *new_me; - unsigned long flags; + lnet_me_t *current_me; + lnet_me_t *new_me; - new_me = lib_me_alloc (nal); + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + new_me = lnet_me_alloc(); if (new_me == NULL) - return PTL_NO_SPACE; + return -ENOMEM; - LIB_LOCK(nal, flags); + LNET_LOCK(); - current_me = ptl_handle2me(current_meh, nal); + current_me = lnet_handle2me(&current_meh); if (current_me == NULL) { - lib_me_free (nal, new_me); + lnet_me_free (new_me); - LIB_UNLOCK(nal, flags); - return PTL_ME_INVALID; + LNET_UNLOCK(); + return -ENOENT; } - new_me->match_id = match_id; - new_me->match_bits = match_bits; - new_me->ignore_bits = ignore_bits; - new_me->unlink = unlink; - new_me->md = NULL; + new_me->me_match_id = match_id; + new_me->me_match_bits = match_bits; + new_me->me_ignore_bits = ignore_bits; + new_me->me_unlink = unlink; + new_me->me_md = NULL; - lib_initialise_handle (nal, &new_me->me_lh, PTL_COOKIE_TYPE_ME); + lnet_initialise_handle (&new_me->me_lh, LNET_COOKIE_TYPE_ME); - if (pos == PTL_INS_AFTER) + if (pos == LNET_INS_AFTER) list_add_tail(&new_me->me_list, &current_me->me_list); else list_add(&new_me->me_list, &current_me->me_list); - ptl_me2handle(handle, nal, new_me); + lnet_me2handle(handle, new_me); - LIB_UNLOCK(nal, flags); + LNET_UNLOCK(); - return PTL_OK; + return 0; } int -lib_api_me_unlink (nal_t *apinal, ptl_handle_me_t *meh) +LNetMEUnlink(lnet_handle_me_t meh) { - lib_nal_t *nal = apinal->nal_data; - unsigned long flags; - lib_me_t *me; + lnet_me_t *me; int rc; - LIB_LOCK(nal, flags); + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + LNET_LOCK(); - me = ptl_handle2me(meh, nal); + me = lnet_handle2me(&meh); if (me == NULL) { - rc = PTL_ME_INVALID; + rc = -ENOENT; } else { - lib_me_unlink(nal, me); - rc = PTL_OK; + lnet_me_unlink(me); + rc = 0; } - LIB_UNLOCK(nal, flags); + LNET_UNLOCK(); return (rc); } -/* call with state_lock please */ +/* call with LNET_LOCK please */ void -lib_me_unlink(lib_nal_t *nal, lib_me_t *me) +lnet_me_unlink(lnet_me_t *me) { list_del (&me->me_list); - if (me->md) { - me->md->me = NULL; - lib_md_unlink(nal, me->md); + if (me->me_md) { + me->me_md->md_me = NULL; + lnet_md_unlink(me->me_md); } - lib_invalidate_handle (nal, &me->me_lh); - lib_me_free(nal, me); + lnet_invalidate_handle (&me->me_lh); + lnet_me_free(me); } #if 0 static void -lib_me_dump(lib_nal_t *nal, lib_me_t * me) +lib_me_dump(lnet_me_t *me) { CWARN("Match Entry %p ("LPX64")\n", me, me->me_lh.lh_cookie); CWARN("\tMatch/Ignore\t= %016lx / %016lx\n", - me->match_bits, me->ignore_bits); + me->me_match_bits, me->me_ignore_bits); CWARN("\tMD\t= %p\n", me->md); CWARN("\tprev\t= %p\n", - list_entry(me->me_list.prev, lib_me_t, me_list)); + list_entry(me->me_list.prev, lnet_me_t, me_list)); CWARN("\tnext\t= %p\n", - list_entry(me->me_list.next, lib_me_t, me_list)); + list_entry(me->me_list.next, lnet_me_t, me_list)); } #endif diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 5339b6d..b7c6e51 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -22,181 +22,210 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
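Before the rewritten matching path, it may help to isolate the length negotiation that lnet_try_match_md() below performs once the match bits agree; this condensed, runnable sketch uses stand-in values for the LNET_MD_* option flags and returns -1 where the code below returns LNET_MATCHMD_DROP:

#include <stdio.h>

#define MD_MANAGE_REMOTE 0x1            /* stand-ins for LNET_MD_* flags */
#define MD_MAX_SIZE      0x2
#define MD_TRUNCATE      0x4

static int match_length(unsigned int rlength, unsigned int roffset,
                        unsigned int options, unsigned int md_offset,
                        unsigned int md_length, unsigned int md_max_size,
                        unsigned int *offset, unsigned int *mlength)
{
        /* remote-managed MDs take the sender's offset */
        *offset = (options & MD_MANAGE_REMOTE) ? roffset : md_offset;

        /* max_size caps each match; otherwise use the space remaining */
        *mlength = (options & MD_MAX_SIZE) ? md_max_size
                                           : md_length - *offset;

        if (rlength <= *mlength)                /* fits in allowed space */
                *mlength = rlength;
        else if ((options & MD_TRUNCATE) == 0)  /* too big, not truncating */
                return -1;

        return 0;
}

int main(void)
{
        unsigned int off, mlen;

        /* 4K request against a 64K MD capped at 1K per match */
        if (match_length(4096, 0, MD_MAX_SIZE | MD_TRUNCATE,
                         0, 65536, 1024, &off, &mlen) == 0)
                printf("deliver %u bytes at offset %u\n", mlen, off);
        return 0;
}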
-#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET -#ifndef __KERNEL__ -# include -#else -# include -#endif -#include -#include +#include + +static int local_nid_dist_zero = 1; +CFS_MODULE_PARM(local_nid_dist_zero, "i", int, 0444, + "Reserved"); /* forward ref */ -static void lib_commit_md (lib_nal_t *nal, lib_md_t *md, lib_msg_t *msg); -static ptl_err_t do_lib_parse(lib_nal_t *nal, ptl_hdr_t *hdr, - void *private, int loopback); - -static lib_md_t * -lib_match_md(lib_nal_t *nal, int index, int op_mask, - ptl_nid_t src_nid, ptl_pid_t src_pid, - ptl_size_t rlength, ptl_size_t roffset, - ptl_match_bits_t match_bits, lib_msg_t *msg, - ptl_size_t *mlength_out, ptl_size_t *offset_out) +static void lnet_commit_md (lnet_libmd_t *md, lnet_msg_t *msg); +static void lnet_drop_delayed_put(lnet_msg_t *msg, char *reason); + +#define LNET_MATCHMD_NONE 0 /* Didn't match */ +#define LNET_MATCHMD_OK 1 /* Matched OK */ +#define LNET_MATCHMD_DROP 2 /* Must be discarded */ + +static int +lnet_try_match_md (int index, int op_mask, lnet_process_id_t src, + unsigned int rlength, unsigned int roffset, + __u64 match_bits, lnet_libmd_t *md, lnet_msg_t *msg, + unsigned int *mlength_out, unsigned int *offset_out) { - lib_ni_t *ni = &nal->libnal_ni; - struct list_head *match_list = &ni->ni_portals.tbl[index]; - struct list_head *tmp; - lib_me_t *me; - lib_md_t *md; - ptl_size_t mlength; - ptl_size_t offset; - ENTRY; + /* ALWAYS called holding the LNET_LOCK, and can't LNET_UNLOCK; + * lnet_match_blocked_msg() relies on this to avoid races */ + unsigned int offset; + unsigned int mlength; + lnet_me_t *me = md->md_me; - CDEBUG (D_NET, "Request from "LPU64".%d of length %d into portal %d " - "MB="LPX64"\n", src_nid, src_pid, rlength, index, match_bits); + /* mismatched MD op */ + if ((md->md_options & op_mask) == 0) + return LNET_MATCHMD_NONE; - if (index < 0 || index >= ni->ni_portals.size) { - CERROR("Invalid portal %d not in [0-%d]\n", - index, ni->ni_portals.size); - goto failed; - } + /* MD exhausted */ + if (lnet_md_exhausted(md)) + return LNET_MATCHMD_NONE; - list_for_each (tmp, match_list) { - me = list_entry(tmp, lib_me_t, me_list); - md = me->md; + /* mismatched ME nid/pid? */ + if (me->me_match_id.nid != LNET_NID_ANY && + me->me_match_id.nid != src.nid) + return LNET_MATCHMD_NONE; - /* ME attached but MD not attached yet */ - if (md == NULL) - continue; + if (me->me_match_id.pid != LNET_PID_ANY && + me->me_match_id.pid != src.pid) + return LNET_MATCHMD_NONE; - LASSERT (me == md->me); + /* mismatched ME matchbits? */ + if (((me->me_match_bits ^ match_bits) & ~me->me_ignore_bits) != 0) + return LNET_MATCHMD_NONE; - /* mismatched MD op */ - if ((md->options & op_mask) == 0) - continue; + /* Hurrah! This _is_ a match; check it out... */ - /* MD exhausted */ - if (lib_md_exhausted(md)) - continue; + if ((md->md_options & LNET_MD_MANAGE_REMOTE) == 0) + offset = md->md_offset; + else + offset = roffset; - /* mismatched ME nid/pid? 
*/ - if (me->match_id.nid != PTL_NID_ANY && - me->match_id.nid != src_nid) - continue; + if ((md->md_options & LNET_MD_MAX_SIZE) != 0) { + mlength = md->md_max_size; + LASSERT (md->md_offset + mlength <= md->md_length); + } else { + mlength = md->md_length - offset; + } - CDEBUG(D_NET, "match_id.pid [%x], src_pid [%x]\n", - me->match_id.pid, src_pid); + if (rlength <= mlength) { /* fits in allowed space */ + mlength = rlength; + } else if ((md->md_options & LNET_MD_TRUNCATE) == 0) { + /* this packet _really_ is too big */ + CERROR("Matching packet from %s, match "LPU64 + " length %d too big: %d left, %d allowed\n", + libcfs_id2str(src), match_bits, rlength, + md->md_length - offset, mlength); - if (me->match_id.pid != PTL_PID_ANY && - me->match_id.pid != src_pid) - continue; + return LNET_MATCHMD_DROP; + } - /* mismatched ME matchbits? */ - if (((me->match_bits ^ match_bits) & ~me->ignore_bits) != 0) - continue; + /* Commit to this ME/MD */ + CDEBUG(D_NET, "Incoming %s index %x from %s of " + "length %d/%d into md "LPX64" [%d] + %d\n", + (op_mask == LNET_MD_OP_PUT) ? "put" : "get", + index, libcfs_id2str(src), mlength, rlength, + md->md_lh.lh_cookie, md->md_niov, offset); + + lnet_commit_md(md, msg); + md->md_offset = offset + mlength; + + /* NB Caller will set ev.type and ev.hdr_data */ + msg->msg_ev.initiator = src; + msg->msg_ev.pt_index = index; + msg->msg_ev.match_bits = match_bits; + msg->msg_ev.rlength = rlength; + msg->msg_ev.mlength = mlength; + msg->msg_ev.offset = offset; + + lnet_md_deconstruct(md, &msg->msg_ev.md); + lnet_md2handle(&msg->msg_ev.md_handle, md); + + *offset_out = offset; + *mlength_out = mlength; + + /* Auto-unlink NOW, so the ME gets unlinked if required. + * We bumped md->md_refcount above so the MD just gets flagged + * for unlink when it is finalized. */ + if ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) != 0 && + lnet_md_exhausted(md)) { + lnet_md_unlink(md); + } - /* Hurrah! This _is_ a match; check it out... 
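The masked comparison a few lines above is the whole of the matching semantics: an incoming match_bits value matches an ME when the two agree on every bit that is not set in me_ignore_bits. A standalone illustration with hypothetical bit values (plain C, not LNET code):

        #include <assert.h>
        #include <stdint.h>

        /* Any differing, non-ignored bit means no match. */
        static int me_matches(uint64_t me_bits, uint64_t ignore_bits,
                              uint64_t incoming_bits)
        {
                return ((me_bits ^ incoming_bits) & ~ignore_bits) == 0;
        }

        int main(void)
        {
                /* ignore bits 0xff: the low byte is a wildcard */
                assert( me_matches(0x1200, 0xff, 0x1234));  /* low byte ignored */
                assert(!me_matches(0x1200, 0xff, 0x5634));  /* high bits differ */
                return 0;
        }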
*/ + return LNET_MATCHMD_OK; +} - if ((md->options & PTL_MD_MANAGE_REMOTE) == 0) - offset = md->offset; - else - offset = roffset; +static int +lnet_match_md(int index, int op_mask, lnet_process_id_t src, + unsigned int rlength, unsigned int roffset, + __u64 match_bits, lnet_msg_t *msg, + unsigned int *mlength_out, unsigned int *offset_out, + lnet_libmd_t **md_out) +{ + lnet_portal_t *ptl = &the_lnet.ln_portals[index]; + struct list_head *tmp; + lnet_me_t *me; + lnet_libmd_t *md; + int rc; - if ((md->options & PTL_MD_MAX_SIZE) != 0) { - mlength = md->max_size; - LASSERT (md->offset + mlength <= md->length); - } else { - mlength = md->length - offset; - } + CDEBUG (D_NET, "Request from %s of length %d into portal %d " + "MB="LPX64"\n", libcfs_id2str(src), rlength, index, match_bits); - if (rlength <= mlength) { /* fits in allowed space */ - mlength = rlength; - } else if ((md->options & PTL_MD_TRUNCATE) == 0) { - /* this packet _really_ is too big */ - CERROR("Matching packet %d too big: %d left, " - "%d allowed\n", rlength, md->length - offset, - mlength); - goto failed; + if (index < 0 || index >= the_lnet.ln_nportals) { + CERROR("Invalid portal %d not in [0-%d]\n", + index, the_lnet.ln_nportals); + return LNET_MATCHMD_DROP; + } + + list_for_each (tmp, &ptl->ptl_ml) { + me = list_entry(tmp, lnet_me_t, me_list); + md = me->me_md; + + /* ME attached but MD not attached yet */ + if (md == NULL) + continue; + + LASSERT (me == md->md_me); + + rc = lnet_try_match_md(index, op_mask, src, rlength, + roffset, match_bits, md, msg, + mlength_out, offset_out); + switch (rc) { + default: + LBUG(); + + case LNET_MATCHMD_NONE: + continue; + + case LNET_MATCHMD_OK: + *md_out = md; + return LNET_MATCHMD_OK; + + case LNET_MATCHMD_DROP: + return LNET_MATCHMD_DROP; } + /* not reached */ + } - /* Commit to this ME/MD */ - CDEBUG(D_NET, "Incoming %s index %x from "LPU64"/%u of " - "length %d/%d into md "LPX64" [%d] + %d\n", - (op_mask == PTL_MD_OP_PUT) ? "put" : "get", - index, src_nid, src_pid, mlength, rlength, - md->md_lh.lh_cookie, md->md_niov, offset); - - lib_commit_md(nal, md, msg); - md->offset = offset + mlength; - - /* NB Caller sets ev.type and ev.hdr_data */ - msg->ev.initiator.nid = src_nid; - msg->ev.initiator.pid = src_pid; - msg->ev.pt_index = index; - msg->ev.match_bits = match_bits; - msg->ev.rlength = rlength; - msg->ev.mlength = mlength; - msg->ev.offset = offset; - - lib_md_deconstruct(nal, md, &msg->ev.md); - ptl_md2handle(&msg->ev.md_handle, nal, md); - - *offset_out = offset; - *mlength_out = mlength; - - /* Auto-unlink NOW, so the ME gets unlinked if required. - * We bumped md->pending above so the MD just gets flagged - * for unlink when it is finalized. */ - if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINK) != 0 && - lib_md_exhausted(md)) - lib_md_unlink(nal, md); - - RETURN (md); - } - - failed: - CERROR (LPU64": Dropping %s from "LPU64".%d portal %d match "LPX64 - " offset %d length %d: no match\n", - ni->ni_pid.nid, (op_mask == PTL_MD_OP_GET) ? 
"GET" : "PUT", - src_nid, src_pid, index, match_bits, roffset, rlength); - RETURN(NULL); + if (op_mask == LNET_MD_OP_GET || + (ptl->ptl_options & LNET_PTL_LAZY) == 0) + return LNET_MATCHMD_DROP; + + return LNET_MATCHMD_NONE; } -int lib_api_fail_nid (nal_t *apinal, ptl_nid_t nid, unsigned int threshold) +int +lnet_fail_nid (lnet_nid_t nid, unsigned int threshold) { - lib_nal_t *nal = apinal->nal_data; - lib_test_peer_t *tp; - unsigned long flags; + lnet_test_peer_t *tp; struct list_head *el; struct list_head *next; struct list_head cull; + LASSERT (the_lnet.ln_init); + if (threshold != 0) { /* Adding a new entry */ - PORTAL_ALLOC(tp, sizeof(*tp)); + LIBCFS_ALLOC(tp, sizeof(*tp)); if (tp == NULL) - return PTL_NO_SPACE; + return -ENOMEM; tp->tp_nid = nid; tp->tp_threshold = threshold; - LIB_LOCK(nal, flags); - list_add_tail (&tp->tp_list, &nal->libnal_ni.ni_test_peers); - LIB_UNLOCK(nal, flags); - return PTL_OK; + LNET_LOCK(); + list_add_tail (&tp->tp_list, &the_lnet.ln_test_peers); + LNET_UNLOCK(); + return 0; } /* removing entries */ CFS_INIT_LIST_HEAD (&cull); - LIB_LOCK(nal, flags); + LNET_LOCK(); - list_for_each_safe (el, next, &nal->libnal_ni.ni_test_peers) { - tp = list_entry (el, lib_test_peer_t, tp_list); + list_for_each_safe (el, next, &the_lnet.ln_test_peers) { + tp = list_entry (el, lnet_test_peer_t, tp_list); if (tp->tp_threshold == 0 || /* needs culling anyway */ - nid == PTL_NID_ANY || /* removing all entries */ + nid == LNET_NID_ANY || /* removing all entries */ tp->tp_nid == nid) /* matched this one */ { list_del (&tp->tp_list); @@ -204,46 +233,32 @@ int lib_api_fail_nid (nal_t *apinal, ptl_nid_t nid, unsigned int threshold) } } - LIB_UNLOCK(nal, flags); + LNET_UNLOCK(); while (!list_empty (&cull)) { - tp = list_entry (cull.next, lib_test_peer_t, tp_list); + tp = list_entry (cull.next, lnet_test_peer_t, tp_list); list_del (&tp->tp_list); - PORTAL_FREE(tp, sizeof (*tp)); + LIBCFS_FREE(tp, sizeof (*tp)); } - return PTL_OK; -} - -int -lib_api_loopback (nal_t *apinal, int set, int *enabled) -{ - lib_nal_t *nal = apinal->nal_data; - - if (set) - nal->libnal_ni.ni_loopback = *enabled; - else - *enabled = nal->libnal_ni.ni_loopback; - - return PTL_OK; + return 0; } static int -fail_peer (lib_nal_t *nal, ptl_nid_t nid, int outgoing) +fail_peer (lnet_nid_t nid, int outgoing) { - lib_test_peer_t *tp; + lnet_test_peer_t *tp; struct list_head *el; struct list_head *next; - unsigned long flags; struct list_head cull; int fail = 0; CFS_INIT_LIST_HEAD (&cull); - LIB_LOCK (nal, flags); + LNET_LOCK(); - list_for_each_safe (el, next, &nal->libnal_ni.ni_test_peers) { - tp = list_entry (el, lib_test_peer_t, tp_list); + list_for_each_safe (el, next, &the_lnet.ln_test_peers) { + tp = list_entry (el, lnet_test_peer_t, tp_list); if (tp->tp_threshold == 0) { /* zombie entry */ @@ -257,11 +272,11 @@ fail_peer (lib_nal_t *nal, ptl_nid_t nid, int outgoing) continue; } - if (tp->tp_nid == PTL_NID_ANY || /* fail every peer */ + if (tp->tp_nid == LNET_NID_ANY || /* fail every peer */ nid == tp->tp_nid) { /* fail this peer */ fail = 1; - if (tp->tp_threshold != PTL_MD_THRESH_INF) { + if (tp->tp_threshold != LNET_MD_THRESH_INF) { tp->tp_threshold--; if (outgoing && tp->tp_threshold == 0) { @@ -274,22 +289,22 @@ fail_peer (lib_nal_t *nal, ptl_nid_t nid, int outgoing) } } - LIB_UNLOCK (nal, flags); + LNET_UNLOCK (); while (!list_empty (&cull)) { - tp = list_entry (cull.next, lib_test_peer_t, tp_list); + tp = list_entry (cull.next, lnet_test_peer_t, tp_list); list_del (&tp->tp_list); - PORTAL_FREE(tp, sizeof 
(*tp)); + LIBCFS_FREE(tp, sizeof (*tp)); } return (fail); } -ptl_size_t -lib_iov_nob (int niov, struct iovec *iov) +unsigned int +lnet_iov_nob (unsigned int niov, struct iovec *iov) { - ptl_size_t nob = 0; + unsigned int nob = 0; while (niov-- > 0) nob += (iov++)->iov_len; @@ -298,77 +313,73 @@ lib_iov_nob (int niov, struct iovec *iov) } void -lib_copy_iov2buf (char *dest, int niov, struct iovec *iov, - ptl_size_t offset, ptl_size_t len) +lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, unsigned int doffset, + unsigned int nsiov, struct iovec *siov, unsigned int soffset, + unsigned int nob) { - ptl_size_t nob; + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; - if (len == 0) + if (nob == 0) return; - /* skip complete frags before 'offset' */ - LASSERT (niov > 0); - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - niov--; - LASSERT (niov > 0); + /* skip complete frags before 'doffset' */ + LASSERT (ndiov > 0); + while (doffset >= diov->iov_len) { + doffset -= diov->iov_len; + diov++; + ndiov--; + LASSERT (ndiov > 0); } - - do { - LASSERT (niov > 0); - nob = MIN (iov->iov_len - offset, len); - memcpy (dest, iov->iov_base + offset, nob); - - len -= nob; - dest += nob; - niov--; - iov++; - offset = 0; - } while (len > 0); -} - -void -lib_copy_buf2iov (int niov, struct iovec *iov, ptl_size_t offset, - char *src, ptl_size_t len) -{ - ptl_size_t nob; - - if (len == 0) - return; - - /* skip complete frags before 'offset' */ - LASSERT (niov > 0); - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - niov--; - LASSERT (niov > 0); + + /* skip complete frags before 'soffset' */ + LASSERT (nsiov > 0); + while (soffset >= siov->iov_len) { + soffset -= siov->iov_len; + siov++; + nsiov--; + LASSERT (nsiov > 0); } do { - LASSERT (niov > 0); - nob = MIN (iov->iov_len - offset, len); - memcpy (iov->iov_base + offset, src, nob); - - len -= nob; - src += nob; - niov--; - iov++; - offset = 0; - } while (len > 0); + LASSERT (ndiov > 0); + LASSERT (nsiov > 0); + this_nob = MIN(diov->iov_len - doffset, + siov->iov_len - soffset); + this_nob = MIN(this_nob, nob); + + memcpy ((char *)diov->iov_base + doffset, + (char *)siov->iov_base + soffset, this_nob); + nob -= this_nob; + + if (diov->iov_len > doffset + this_nob) { + doffset += this_nob; + } else { + diov++; + ndiov--; + doffset = 0; + } + + if (siov->iov_len > soffset + this_nob) { + soffset += this_nob; + } else { + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); } int -lib_extract_iov (int dst_niov, struct iovec *dst, - int src_niov, struct iovec *src, - ptl_size_t offset, ptl_size_t len) +lnet_extract_iov (int dst_niov, struct iovec *dst, + int src_niov, struct iovec *src, + unsigned int offset, unsigned int len) { /* Initialise 'dst' to the subset of 'src' starting at 'offset', * for exactly 'len' bytes, and return the number of entries. 
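lnet_copy_iov2iov above walks the destination and source fragment lists with independent cursors, copying on each pass the largest run that fits in both current fragments. The same two-cursor idiom reduced to plain buffers (the minimal fragment type here is an assumption, not the LNET one; the caller must supply enough fragments to cover nob bytes):

        #include <stddef.h>
        #include <string.h>

        struct frag { char *base; size_t len; };

        /* Copy nob bytes from src frags (starting at soff) into dst frags
         * (starting at doff); whichever fragment is exhausted first
         * advances its cursor to the next fragment. */
        static void frag_copy(struct frag *dst, size_t doff,
                              struct frag *src, size_t soff, size_t nob)
        {
                while (nob > 0) {
                        size_t n = dst->len - doff;

                        if (src->len - soff < n)
                                n = src->len - soff;
                        if (nob < n)
                                n = nob;

                        memcpy(dst->base + doff, src->base + soff, n);
                        nob  -= n;
                        doff += n;
                        soff += n;

                        if (doff == dst->len) { dst++; doff = 0; }
                        if (soff == src->len) { src++; soff = 0; }
                }
        }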
* NB not destructive to 'src' */ - ptl_size_t frag_len; - int niov; + unsigned int frag_len; + unsigned int niov; if (len == 0) /* no data => */ return (0); /* no frags */ @@ -406,58 +417,51 @@ lib_extract_iov (int dst_niov, struct iovec *dst, } #ifndef __KERNEL__ -ptl_size_t -lib_kiov_nob (int niov, ptl_kiov_t *kiov) +unsigned int +lnet_kiov_nob (unsigned int niov, lnet_kiov_t *kiov) { LASSERT (0); return (0); } void -lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, - ptl_size_t offset, ptl_size_t len) +lnet_copy_kiov2kiov (unsigned int ndkiov, lnet_kiov_t *dkiov, unsigned int doffset, + unsigned int nskiov, lnet_kiov_t *skiov, unsigned int soffset, + unsigned int nob) { LASSERT (0); } void -lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset, - char *src, ptl_size_t len) +lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset, + unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, + unsigned int nob) { LASSERT (0); } -int -lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, - int src_niov, ptl_kiov_t *src, - ptl_size_t offset, ptl_size_t len) -{ - LASSERT (0); -} - -ptl_err_t -lib_lo_rxkiov(lib_nal_t *nal, void *private, lib_msg_t *libmsg, - unsigned int niov, ptl_kiov_t *kiov, - size_t offset, size_t mlen, size_t rlen) +void +lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, + unsigned int niov, struct iovec *iov, unsigned int iovoffset, + unsigned int nob) { LASSERT (0); } -ptl_err_t -lib_lo_txkiov (lib_nal_t *nal, void *private, lib_msg_t *libmsg, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - unsigned int payload_niov, ptl_kiov_t *payload_kiov, - size_t payload_offset, size_t payload_nob) +int +lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst, + int src_niov, lnet_kiov_t *src, + unsigned int offset, unsigned int len) { LASSERT (0); } #else /* __KERNEL__ */ -ptl_size_t -lib_kiov_nob (int niov, ptl_kiov_t *kiov) +unsigned int +lnet_kiov_nob (unsigned int niov, lnet_kiov_t *kiov) { - ptl_size_t nob = 0; + unsigned int nob = 0; while (niov-- > 0) nob += (kiov++)->kiov_len; @@ -466,89 +470,233 @@ lib_kiov_nob (int niov, ptl_kiov_t *kiov) } void -lib_copy_kiov2buf (char *dest, int niov, ptl_kiov_t *kiov, - ptl_size_t offset, ptl_size_t len) +lnet_copy_kiov2kiov (unsigned int ndiov, lnet_kiov_t *diov, unsigned int doffset, + unsigned int nsiov, lnet_kiov_t *siov, unsigned int soffset, + unsigned int nob) { - ptl_size_t nob; - char *addr; + /* NB diov, siov are READ-ONLY */ + unsigned int this_nob; + char *daddr = NULL; + char *saddr = NULL; - if (len == 0) + if (nob == 0) + return; + + LASSERT (!in_interrupt ()); + + LASSERT (ndiov > 0); + while (doffset > diov->kiov_len) { + doffset -= diov->kiov_len; + diov++; + ndiov--; + LASSERT (ndiov > 0); + } + + LASSERT (nsiov > 0); + while (soffset > siov->kiov_len) { + soffset -= siov->kiov_len; + siov++; + nsiov--; + LASSERT (nsiov > 0); + } + + do { + LASSERT (ndiov > 0); + LASSERT (nsiov > 0); + this_nob = MIN(diov->kiov_len - doffset, + siov->kiov_len - soffset); + this_nob = MIN(this_nob, nob); + + if (daddr == NULL) + daddr = ((char *)cfs_kmap(diov->kiov_page)) + + diov->kiov_offset + doffset; + if (saddr == NULL) + saddr = ((char *)cfs_kmap(siov->kiov_page)) + + siov->kiov_offset + soffset; + + /* Vanishing risk of kmap deadlock when mapping 2 pages. 
+ * However in practice at least one of the kiovs will be mapped + * kernel pages and the map/unmap will be NOOPs */ + + memcpy (daddr, saddr, this_nob); + nob -= this_nob; + + if (diov->kiov_len > doffset + this_nob) { + daddr += this_nob; + doffset += this_nob; + } else { + cfs_kunmap(diov->kiov_page); + daddr = NULL; + diov++; + ndiov--; + doffset = 0; + } + + if (siov->kiov_len > soffset + this_nob) { + saddr += this_nob; + soffset += this_nob; + } else { + cfs_kunmap(siov->kiov_page); + saddr = NULL; + siov++; + nsiov--; + soffset = 0; + } + } while (nob > 0); + + if (daddr != NULL) + cfs_kunmap(diov->kiov_page); + if (saddr != NULL) + cfs_kunmap(siov->kiov_page); +} + +void +lnet_copy_kiov2iov (unsigned int niov, struct iovec *iov, unsigned int iovoffset, + unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, + unsigned int nob) +{ + /* NB iov, kiov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; + + if (nob == 0) return; LASSERT (!in_interrupt ()); LASSERT (niov > 0); - while (offset > kiov->kiov_len) { - offset -= kiov->kiov_len; - kiov++; + while (iovoffset > iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; niov--; LASSERT (niov > 0); } + LASSERT (nkiov > 0); + while (kiovoffset > kiov->kiov_len) { + kiovoffset -= kiov->kiov_len; + kiov++; + nkiov--; + LASSERT (nkiov > 0); + } + do { LASSERT (niov > 0); - nob = MIN (kiov->kiov_len - offset, len); + LASSERT (nkiov > 0); + this_nob = MIN(iov->iov_len - iovoffset, + kiov->kiov_len - kiovoffset); + this_nob = MIN(this_nob, nob); - addr = ((char *)cfs_kmap(kiov->kiov_page)) + kiov->kiov_offset + - offset; - memcpy (dest, addr, nob); - cfs_kunmap (kiov->kiov_page); + if (addr == NULL) + addr = ((char *)cfs_kmap(kiov->kiov_page)) + + kiov->kiov_offset + kiovoffset; - len -= nob; - dest += nob; - niov--; - kiov++; - offset = 0; - } while (len > 0); + memcpy ((char *)iov->iov_base + iovoffset, addr, this_nob); + nob -= this_nob; + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + + if (kiov->kiov_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + cfs_kunmap(kiov->kiov_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + } while (nob > 0); + + if (addr != NULL) + cfs_kunmap(kiov->kiov_page); } void -lib_copy_buf2kiov (int niov, ptl_kiov_t *kiov, ptl_size_t offset, - char *src, ptl_size_t len) +lnet_copy_iov2kiov (unsigned int nkiov, lnet_kiov_t *kiov, unsigned int kiovoffset, + unsigned int niov, struct iovec *iov, unsigned int iovoffset, + unsigned int nob) { - ptl_size_t nob; - char *addr; + /* NB kiov, iov are READ-ONLY */ + unsigned int this_nob; + char *addr = NULL; - if (len == 0) + if (nob == 0) return; LASSERT (!in_interrupt ()); - LASSERT (niov > 0); - while (offset >= kiov->kiov_len) { - offset -= kiov->kiov_len; + LASSERT (nkiov > 0); + while (kiovoffset > kiov->kiov_len) { + kiovoffset -= kiov->kiov_len; kiov++; + nkiov--; + LASSERT (nkiov > 0); + } + + LASSERT (niov > 0); + while (iovoffset > iov->iov_len) { + iovoffset -= iov->iov_len; + iov++; niov--; LASSERT (niov > 0); } do { + LASSERT (nkiov > 0); LASSERT (niov > 0); - nob = MIN (kiov->kiov_len - offset, len); + this_nob = MIN(kiov->kiov_len - kiovoffset, + iov->iov_len - iovoffset); + this_nob = MIN(this_nob, nob); - addr = ((char *)cfs_kmap(kiov->kiov_page)) + kiov->kiov_offset + - offset; - memcpy (addr, src, nob); - cfs_kunmap (kiov->kiov_page); + if (addr == NULL) + addr = ((char *)cfs_kmap(kiov->kiov_page)) + + 
kiov->kiov_offset + kiovoffset; - len -= nob; - src += nob; - niov--; - kiov++; - offset = 0; - } while (len > 0); + memcpy (addr, (char *)iov->iov_base + iovoffset, this_nob); + nob -= this_nob; + + if (kiov->kiov_len > kiovoffset + this_nob) { + addr += this_nob; + kiovoffset += this_nob; + } else { + cfs_kunmap(kiov->kiov_page); + addr = NULL; + kiov++; + nkiov--; + kiovoffset = 0; + } + + if (iov->iov_len > iovoffset + this_nob) { + iovoffset += this_nob; + } else { + iov++; + niov--; + iovoffset = 0; + } + } while (nob > 0); + + if (addr != NULL) + cfs_kunmap(kiov->kiov_page); } int -lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, - int src_niov, ptl_kiov_t *src, - ptl_size_t offset, ptl_size_t len) +lnet_extract_kiov (int dst_niov, lnet_kiov_t *dst, + int src_niov, lnet_kiov_t *src, + unsigned int offset, unsigned int len) { /* Initialise 'dst' to the subset of 'src' starting at 'offset', * for exactly 'len' bytes, and return the number of entries. * NB not destructive to 'src' */ - ptl_size_t frag_len; - int niov; + unsigned int frag_len; + unsigned int niov; if (len == 0) /* no data => */ return (0); /* no frags */ @@ -572,12 +720,12 @@ lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, if (len <= frag_len) { dst->kiov_len = len; - LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + LASSERT (dst->kiov_offset + dst->kiov_len <= CFS_PAGE_SIZE); return (niov); } dst->kiov_len = frag_len; - LASSERT (dst->kiov_offset + dst->kiov_len <= PAGE_SIZE); + LASSERT (dst->kiov_offset + dst->kiov_len <= CFS_PAGE_SIZE); len -= frag_len; dst++; @@ -587,442 +735,931 @@ lib_extract_kiov (int dst_niov, ptl_kiov_t *dst, offset = 0; } } - -#ifndef __KERNEL__ -#if !defined(kmap) -#define kmap(page) ((page)->addr) -#endif -#if !defined(kunmap) -#define kunmap(page) do {} while(0) -#endif -#if !defined(page_address) -#define page_address(page) ((page)->page_address) -#endif #endif -ptl_err_t -lib_lo_rxkiov(lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - unsigned int niov, - ptl_kiov_t *kiov, - size_t offset, - size_t mlen, - size_t rlen) +void +lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, + unsigned int offset, unsigned int mlen, unsigned int rlen) { - void *srcaddr = NULL; - void *dstaddr = NULL; - unsigned long srcfrag = 0; - unsigned long dstfrag = 0; - unsigned long fraglen; - lo_desc_t *lod = (lo_desc_t *)private; + unsigned int niov = 0; + struct iovec *iov = NULL; + lnet_kiov_t *kiov = NULL; + int rc; - /* I only handle unmapped->unmapped matches */ - LASSERT(lod->lod_type == LOD_KIOV); + LASSERT (!in_interrupt ()); + LASSERT (mlen == 0 || msg != NULL); + + if (msg != NULL) { + LASSERT(msg->msg_receiving); + LASSERT(!msg->msg_sending); + LASSERT(rlen == msg->msg_len); + LASSERT(mlen <= msg->msg_len); + + msg->msg_wanted = mlen; + msg->msg_offset = offset; + msg->msg_receiving = 0; + + if (mlen != 0) { + niov = msg->msg_niov; + iov = msg->msg_iov; + kiov = msg->msg_kiov; + + LASSERT (niov > 0); + LASSERT ((iov == NULL) != (kiov == NULL)); + } + } + + rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed, + niov, iov, kiov, offset, mlen, rlen); + if (rc < 0) + lnet_finalize(ni, msg, rc); +} - if (mlen == 0) - return PTL_OK; +int +lnet_compare_routers(lnet_peer_t *p1, lnet_peer_t *p2) +{ + if (p1->lp_txqnob < p2->lp_txqnob) + return 1; + + if (p1->lp_txqnob > p2->lp_txqnob) + return -1; + + if (p1->lp_txcredits > p2->lp_txcredits) + return 1; + + if (p1->lp_txcredits < p2->lp_txcredits) + return -1; + + return 0; +} - while (offset >= kiov->kiov_len) { - offset 
-= kiov->kiov_len; - kiov++; - niov--; - LASSERT(niov > 0); - } - while (lod->lod_offset >= lod->lod_iov.kiov->kiov_len) { - lod->lod_offset -= lod->lod_iov.kiov->kiov_len; - lod->lod_iov.kiov++; - lod->lod_niov--; - LASSERT(lod->lod_niov > 0); - } +void +lnet_setpayloadbuffer(lnet_msg_t *msg) +{ + lnet_libmd_t *md = msg->msg_md; + + LASSERT (msg->msg_len > 0); + LASSERT (!msg->msg_routing); + LASSERT (md != NULL); + LASSERT (msg->msg_niov == 0); + LASSERT (msg->msg_iov == NULL); + LASSERT (msg->msg_kiov == NULL); + + msg->msg_niov = md->md_niov; + if ((md->md_options & LNET_MD_KIOV) != 0) + msg->msg_kiov = md->md_iov.kiov; + else + msg->msg_iov = md->md_iov.iov; +} - do { - /* CAVEAT EMPTOR: - * I kmap 2 pages at once == slight risk of deadlock */ - LASSERT(niov > 0); - if (dstaddr == NULL) { - dstaddr = (void *) - ((unsigned long)cfs_kmap(kiov->kiov_page) + - kiov->kiov_offset + offset); - dstfrag = kiov->kiov_len - offset; - } +void +lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, + unsigned int offset, unsigned int len) +{ + msg->msg_type = type; + msg->msg_target = target; + msg->msg_len = len; + msg->msg_offset = offset; + + if (len != 0) + lnet_setpayloadbuffer(msg); + + memset (&msg->msg_hdr, 0, sizeof (msg->msg_hdr)); + msg->msg_hdr.type = cpu_to_le32(type); + msg->msg_hdr.dest_nid = cpu_to_le64(target.nid); + msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); + /* src_nid will be set later */ + msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid); + msg->msg_hdr.payload_length = cpu_to_le32(len); +} - LASSERT(lod->lod_niov > 0); - if (srcaddr == NULL) { - srcaddr = (void *) - ((unsigned long)cfs_kmap(lod->lod_iov.kiov->kiov_page)+ - lod->lod_iov.kiov->kiov_offset + lod->lod_offset); - srcfrag = lod->lod_iov.kiov->kiov_len - lod->lod_offset; +void +lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg) +{ + void *priv = msg->msg_private; + int rc; + + LASSERT (!in_interrupt ()); + LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || + (msg->msg_txcredit && msg->msg_peertxcredit)); + + rc = (ni->ni_lnd->lnd_send)(ni, priv, msg); + if (rc < 0) + lnet_finalize(ni, msg, rc); +} + +int +lnet_eager_recv_locked(lnet_msg_t *msg) +{ + lnet_peer_t *peer; + lnet_ni_t *ni; + int rc = 0; + + LASSERT (!msg->msg_delayed); + msg->msg_delayed = 1; + + LASSERT (msg->msg_receiving); + LASSERT (!msg->msg_sending); + + peer = msg->msg_rxpeer; + ni = peer->lp_ni; + + if (ni->ni_lnd->lnd_eager_recv != NULL) { + LNET_UNLOCK(); + + rc = (ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg, + &msg->msg_private); + if (rc != 0) { + CERROR("recv from %s / send to %s aborted: " + "eager_recv failed %d\n", + libcfs_nid2str(peer->lp_nid), + libcfs_id2str(msg->msg_target), rc); + LASSERT (rc < 0); /* required by my callers */ } - fraglen = MIN(srcfrag, dstfrag); - if (fraglen > mlen) - fraglen = mlen; + LNET_LOCK(); + } - memcpy(dstaddr, srcaddr, fraglen); + return rc; +} - if (fraglen < dstfrag) { - dstfrag -= fraglen; - dstaddr = (void *)((unsigned long)dstaddr + fraglen); - } else { - cfs_kunmap(kiov->kiov_page); - dstaddr = NULL; - offset = 0; - kiov++; - niov--; +int +lnet_post_send_locked (lnet_msg_t *msg, int do_send) +{ + /* lnet_send is going to LNET_UNLOCK immediately after this, so it sets + * do_send FALSE and I don't do the unlock/send/lock bit. 
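lnet_post_send_locked below throttles on two counters: a per-peer credit and a per-NI credit. Taking a credit drives the count down; a negative balance parks the message on the matching queue and the caller sees EAGAIN until a completed send returns the credit. The gate in isolation (a sketch with simplified fields standing in for lp_txcredits/lp_txq and ni_txcredits/ni_txq, using the kernel list API already used throughout this file):

        /* Simplified credit gate; called under the lock, like the
         * real code below. */
        static int take_credit(int *credits, struct list_head *queue,
                               struct list_head *msg_link)
        {
                (*credits)--;
                if (*credits < 0) {
                        /* over-committed: queue and retry on completion */
                        list_add_tail(msg_link, queue);
                        return EAGAIN;
                }
                return 0;       /* credit granted; OK to send now */
        }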
I return + * EAGAIN if msg blocked and 0 if sent or OK to send */ + lnet_peer_t *lp = msg->msg_txpeer; + lnet_ni_t *ni = lp->lp_ni; + + /* non-lnet_send() callers have checked before */ + LASSERT (!do_send || msg->msg_delayed); + LASSERT (!msg->msg_receiving); + + if (!msg->msg_peertxcredit) { + LASSERT ((lp->lp_txcredits < 0) == !list_empty(&lp->lp_txq)); + + msg->msg_peertxcredit = 1; + lp->lp_txqnob += msg->msg_len + sizeof(lnet_hdr_t); + lp->lp_txcredits--; + + if (lp->lp_txcredits < lp->lp_mintxcredits) + lp->lp_mintxcredits = lp->lp_txcredits; + + if (lp->lp_txcredits < 0) { + msg->msg_delayed = 1; + list_add_tail (&msg->msg_list, &lp->lp_txq); + return EAGAIN; } + } + + if (!msg->msg_txcredit) { + LASSERT ((ni->ni_txcredits < 0) == !list_empty(&ni->ni_txq)); - if (fraglen < srcfrag) { - srcfrag -= fraglen; - srcaddr = (void *)((unsigned long)srcaddr + fraglen); - } else { - cfs_kunmap(lod->lod_iov.kiov->kiov_page); - srcaddr = NULL; - lod->lod_offset = 0; - lod->lod_iov.kiov++; - lod->lod_niov--; + msg->msg_txcredit = 1; + ni->ni_txcredits--; + + if (ni->ni_txcredits < ni->ni_mintxcredits) + ni->ni_mintxcredits = ni->ni_txcredits; + + if (ni->ni_txcredits < 0) { + msg->msg_delayed = 1; + list_add_tail (&msg->msg_list, &ni->ni_txq); + return EAGAIN; } + } - mlen -= fraglen; - } while (mlen > 0); + if (do_send) { + LNET_UNLOCK(); + lnet_ni_send(ni, msg); + LNET_LOCK(); + } + return 0; +} - if (dstaddr != NULL) - cfs_kunmap(kiov->kiov_page); +#ifdef __KERNEL__ +static void +lnet_commit_routedmsg (lnet_msg_t *msg) +{ + /* ALWAYS called holding the LNET_LOCK */ + LASSERT (msg->msg_routing); + + the_lnet.ln_counters.msgs_alloc++; + if (the_lnet.ln_counters.msgs_alloc > + the_lnet.ln_counters.msgs_max) + the_lnet.ln_counters.msgs_max = + the_lnet.ln_counters.msgs_alloc; + + the_lnet.ln_counters.route_count++; + the_lnet.ln_counters.route_length += msg->msg_len; + + LASSERT (!msg->msg_onactivelist); + msg->msg_onactivelist = 1; + list_add (&msg->msg_activelist, &the_lnet.ln_active_msgs); +} - if (srcaddr != NULL) - cfs_kunmap(lod->lod_iov.kiov->kiov_page); +lnet_rtrbufpool_t * +lnet_msg2bufpool(lnet_msg_t *msg) +{ + lnet_rtrbufpool_t *rbp = &the_lnet.ln_rtrpools[0]; - lib_finalize(nal, private, libmsg, PTL_OK); - return PTL_OK; + LASSERT (msg->msg_len <= LNET_MTU); + while (msg->msg_len > rbp->rbp_npages * CFS_PAGE_SIZE) { + rbp++; + LASSERT (rbp < &the_lnet.ln_rtrpools[LNET_NRBPOOLS]); + } + + return rbp; } -ptl_err_t -lib_lo_txkiov (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - ptl_kiov_t *payload_kiov, - size_t payload_offset, - size_t payload_nob) +int +lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) { - lo_desc_t lod = { - .lod_type = LOD_KIOV, - .lod_niov = payload_niov, - .lod_offset = payload_offset, - .lod_nob = payload_nob, - .lod_iov = { .kiov = payload_kiov } }; - ptl_err_t rc; - - rc = do_lib_parse(nal, hdr, &lod, 1); - if (rc == PTL_OK) - lib_finalize(nal, private, libmsg, PTL_OK); + /* lnet_parse is going to LNET_UNLOCK immediately after this, so it + * sets do_recv FALSE and I don't do the unlock/send/lock bit. 
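lnet_msg2bufpool above sizes a routed message to the smallest router buffer pool whose pages can hold it. A standalone rendering of that walk (the pool-table shape and page size are assumptions for illustration):

        #include <assert.h>
        #include <stddef.h>

        #define PAGE_SZ 4096
        #define NPOOLS  3

        /* npages[] grows with the index, e.g. {0, 1, 256} pages per
         * buffer; return the first pool whose buffers fit msg_len. */
        static int msg2pool(size_t msg_len, const unsigned int npages[NPOOLS])
        {
                int i = 0;

                while (msg_len > (size_t)npages[i] * PAGE_SZ) {
                        i++;
                        assert(i < NPOOLS);  /* msg_len <= largest pool */
                }
                return i;
        }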
I + * return EAGAIN if msg blocked and 0 if sent or OK to send */ + lnet_peer_t *lp = msg->msg_rxpeer; + lnet_rtrbufpool_t *rbp; + lnet_rtrbuf_t *rb; + + LASSERT (msg->msg_iov == NULL); + LASSERT (msg->msg_kiov == NULL); + LASSERT (msg->msg_niov == 0); + LASSERT (msg->msg_routing); + LASSERT (msg->msg_receiving); + LASSERT (!msg->msg_sending); + + /* non-lnet_parse callers only send delayed messages */ + LASSERT (!do_recv || msg->msg_delayed); + + if (!msg->msg_peerrtrcredit) { + LASSERT ((lp->lp_rtrcredits < 0) == !list_empty(&lp->lp_rtrq)); + + msg->msg_peerrtrcredit = 1; + lp->lp_rtrcredits--; + if (lp->lp_rtrcredits < lp->lp_minrtrcredits) + lp->lp_minrtrcredits = lp->lp_rtrcredits; + + if (lp->lp_rtrcredits < 0) { + /* must have checked eager_recv before here */ + LASSERT (msg->msg_delayed); + list_add_tail(&msg->msg_list, &lp->lp_rtrq); + return EAGAIN; + } + } + + rbp = lnet_msg2bufpool(msg); - return rc; + if (!msg->msg_rtrcredit) { + LASSERT ((rbp->rbp_credits < 0) == !list_empty(&rbp->rbp_msgs)); + + msg->msg_rtrcredit = 1; + rbp->rbp_credits--; + if (rbp->rbp_credits < rbp->rbp_mincredits) + rbp->rbp_mincredits = rbp->rbp_credits; + + if (rbp->rbp_credits < 0) { + /* must have checked eager_recv before here */ + LASSERT (msg->msg_delayed); + list_add_tail(&msg->msg_list, &rbp->rbp_msgs); + return EAGAIN; + } + } + + LASSERT (!list_empty(&rbp->rbp_bufs)); + rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list); + list_del(&rb->rb_list); + + msg->msg_niov = rbp->rbp_npages; + msg->msg_kiov = &rb->rb_kiov[0]; + + if (do_recv) { + LNET_UNLOCK(); + lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1, + 0, msg->msg_len, msg->msg_len); + LNET_LOCK(); + } + return 0; } #endif -ptl_err_t -lib_lo_rxiov(lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - unsigned int niov, - struct iovec *iov, - size_t offset, - size_t mlen, - size_t rlen) +void +lnet_return_credits_locked (lnet_msg_t *msg) { - lo_desc_t *lod = (lo_desc_t *)private; + lnet_peer_t *txpeer = msg->msg_txpeer; + lnet_peer_t *rxpeer = msg->msg_rxpeer; + lnet_msg_t *msg2; + lnet_ni_t *ni; - /* I only handle mapped->mapped matches */ - LASSERT(lod->lod_type == LOD_IOV); - LASSERT(mlen > 0); + if (msg->msg_txcredit) { + /* give back NI txcredits */ + msg->msg_txcredit = 0; + ni = txpeer->lp_ni; - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - iov++; - niov--; - LASSERT(niov > 0); + LASSERT((ni->ni_txcredits < 0) == !list_empty(&ni->ni_txq)); + + ni->ni_txcredits++; + if (ni->ni_txcredits <= 0) { + msg2 = list_entry(ni->ni_txq.next, lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + LASSERT(msg2->msg_txpeer->lp_ni == ni); + LASSERT(msg2->msg_delayed); + + (void) lnet_post_send_locked(msg2, 1); + } } - while (lod->lod_offset >= lod->lod_iov.iov->iov_len) { - lod->lod_offset -= lod->lod_iov.iov->iov_len; - lod->lod_iov.iov++; - lod->lod_niov--; - LASSERT(lod->lod_niov > 0); + if (msg->msg_peertxcredit) { + /* give back peer txcredits */ + msg->msg_peertxcredit = 0; + + LASSERT((txpeer->lp_txcredits < 0) == !list_empty(&txpeer->lp_txq)); + + txpeer->lp_txqnob -= msg->msg_len + sizeof(lnet_hdr_t); + LASSERT (txpeer->lp_txqnob >= 0); + + txpeer->lp_txcredits++; + if (txpeer->lp_txcredits <= 0) { + msg2 = list_entry(txpeer->lp_txq.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + LASSERT (msg2->msg_txpeer == txpeer); + LASSERT (msg2->msg_delayed); + + (void) lnet_post_send_locked(msg2, 1); + } } - do { - int fraglen = MIN(iov->iov_len - offset, - lod->lod_iov.iov->iov_len - lod->lod_offset); + if 
(txpeer != NULL) { + msg->msg_txpeer = NULL; + lnet_peer_decref_locked(txpeer); + } + +#ifdef __KERNEL__ + if (msg->msg_rtrcredit) { + /* give back global router credits */ + lnet_rtrbuf_t *rb; + lnet_rtrbufpool_t *rbp; + + /* NB If a msg ever blocks for a buffer in rbp_msgs, it stays + * there until it gets one allocated, or aborts the wait + * itself */ + LASSERT (msg->msg_kiov != NULL); + + rb = list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]); + rbp = rb->rb_pool; + LASSERT (rbp == lnet_msg2bufpool(msg)); + + msg->msg_kiov = NULL; + msg->msg_rtrcredit = 0; + + LASSERT((rbp->rbp_credits < 0) == !list_empty(&rbp->rbp_msgs)); + LASSERT((rbp->rbp_credits > 0) == !list_empty(&rbp->rbp_bufs)); + + list_add(&rb->rb_list, &rbp->rbp_bufs); + rbp->rbp_credits++; + if (rbp->rbp_credits <= 0) { + msg2 = list_entry(rbp->rbp_msgs.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + (void) lnet_post_routed_recv_locked(msg2, 1); + } + } + + if (msg->msg_peerrtrcredit) { + /* give pack peer router credits */ + msg->msg_peerrtrcredit = 0; + + LASSERT((rxpeer->lp_rtrcredits < 0) == !list_empty(&rxpeer->lp_rtrq)); + + rxpeer->lp_rtrcredits++; + if (rxpeer->lp_rtrcredits <= 0) { + msg2 = list_entry(rxpeer->lp_rtrq.next, + lnet_msg_t, msg_list); + list_del(&msg2->msg_list); + + (void) lnet_post_routed_recv_locked(msg2, 1); + } + } +#else + LASSERT (!msg->msg_rtrcredit); + LASSERT (!msg->msg_peerrtrcredit); +#endif + if (rxpeer != NULL) { + msg->msg_rxpeer = NULL; + lnet_peer_decref_locked(rxpeer); + } +} + +int +lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg) +{ + lnet_nid_t dst_nid = msg->msg_target.nid; + lnet_ni_t *src_ni; + lnet_ni_t *local_ni; + lnet_remotenet_t *rnet; + lnet_route_t *route; + lnet_route_t *best_route; + struct list_head *tmp; + lnet_peer_t *lp; + lnet_peer_t *lp2; + int rc; - LASSERT(niov > 0); - LASSERT(lod->lod_niov > 0); + LASSERT (msg->msg_txpeer == NULL); + LASSERT (!msg->msg_sending); + LASSERT (!msg->msg_target_is_router); + LASSERT (!msg->msg_receiving); - if (fraglen > mlen) - fraglen = mlen; + msg->msg_sending = 1; - memcpy((void *)((unsigned long)iov->iov_base + offset), - (void *)((unsigned long)lod->lod_iov.iov->iov_base + - lod->lod_offset), - fraglen); + /* NB! ni != NULL == interface pre-determined (ACK/REPLY) */ - if (offset + fraglen < iov->iov_len) { - offset += fraglen; + LNET_LOCK(); + + if (the_lnet.ln_shutdown) { + LNET_UNLOCK(); + return -ESHUTDOWN; + } + + if (src_nid == LNET_NID_ANY) { + src_ni = NULL; + } else { + src_ni = lnet_nid2ni_locked(src_nid); + if (src_ni == NULL) { + LNET_UNLOCK(); + CERROR("Can't send to %s: src %s is not a local nid\n", + libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid)); + return -EINVAL; + } + LASSERT (!msg->msg_routing); + } + + /* Is this for someone on a local network? 
*/ + local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid)); + + if (local_ni != NULL) { + if (src_ni == NULL) { + src_ni = local_ni; + src_nid = src_ni->ni_nid; + } else if (src_ni == local_ni) { + lnet_ni_decref_locked(local_ni); } else { - offset = 0; - iov++; - niov--; + lnet_ni_decref_locked(local_ni); + lnet_ni_decref_locked(src_ni); + LNET_UNLOCK(); + CERROR("no route to %s via from %s\n", + libcfs_nid2str(dst_nid), libcfs_nid2str(src_nid)); + return -EINVAL; } - if (lod->lod_offset + fraglen < lod->lod_iov.iov->iov_len ) { - lod->lod_offset += fraglen; + LASSERT (src_nid != LNET_NID_ANY); + + if (!msg->msg_routing) { + src_nid = lnet_ptlcompat_srcnid(src_nid, dst_nid); + msg->msg_hdr.src_nid = cpu_to_le64(src_nid); + } + + if (src_ni == the_lnet.ln_loni) { + /* No send credit hassles with LOLND */ + LNET_UNLOCK(); + lnet_ni_send(src_ni, msg); + lnet_ni_decref(src_ni); + return 0; + } + + rc = lnet_nid2peer_locked(&lp, dst_nid); + lnet_ni_decref_locked(src_ni); /* lp has ref on src_ni; lose mine */ + if (rc != 0) { + LNET_UNLOCK(); + CERROR("Error %d finding peer %s\n", rc, + libcfs_nid2str(dst_nid)); + /* ENOMEM or shutting down */ + return rc; + } + LASSERT (lp->lp_ni == src_ni); + } else { + /* sending to a remote network */ + rnet = lnet_find_net_locked(LNET_NIDNET(dst_nid)); + if (rnet == NULL) { + if (src_ni != NULL) + lnet_ni_decref_locked(src_ni); + LNET_UNLOCK(); + CERROR("No route to %s\n", libcfs_id2str(msg->msg_target)); + return -EHOSTUNREACH; + } + + /* Find the best gateway I can use */ + lp = NULL; + best_route = NULL; + list_for_each(tmp, &rnet->lrn_routes) { + route = list_entry(tmp, lnet_route_t, lr_list); + lp2 = route->lr_gateway; + + if (lp2->lp_alive && + (src_ni == NULL || lp2->lp_ni == src_ni) && + (lp == NULL || lnet_compare_routers(lp2, lp) > 0)) { + best_route = route; + lp = lp2; + } + } + + if (lp == NULL) { + if (src_ni != NULL) + lnet_ni_decref_locked(src_ni); + LNET_UNLOCK(); + CERROR("No route to %s (all routers down)\n", + libcfs_id2str(msg->msg_target)); + return -EHOSTUNREACH; + } + + /* Place selected route at the end of the route list to ensure + * fairness; everything else being equal... 
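The selection loop above prefers the gateway with the least queued data and then the most free send credits (lnet_compare_routers), and moving the winner to the list tail makes equally scored gateways rotate round-robin across successive sends. The preference order in isolation (a reduced router type with assumed field names):

        /* Reduced gateway score: less queued data wins, then more
         * credits; list order, rotated after each pick, breaks ties. */
        struct rtr { long txqnob; int txcredits; };

        static int rtr_better(const struct rtr *a, const struct rtr *b)
        {
                if (a->txqnob != b->txqnob)
                        return a->txqnob < b->txqnob;
                return a->txcredits > b->txcredits;
        }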
*/ + list_del(&best_route->lr_list); + list_add_tail(&best_route->lr_list, &rnet->lrn_routes); + + if (src_ni == NULL) { + src_ni = lp->lp_ni; + src_nid = src_ni->ni_nid; } else { - lod->lod_offset = 0; - lod->lod_iov.iov++; - lod->lod_niov--; + LASSERT (src_ni == lp->lp_ni); + lnet_ni_decref_locked(src_ni); } - mlen -= fraglen; - } while (mlen > 0); + lnet_peer_addref_locked(lp); - lib_finalize(nal, private, libmsg, PTL_OK); - return PTL_OK; -} + LASSERT (src_nid != LNET_NID_ANY); -ptl_err_t -lib_lo_txiov (lib_nal_t *nal, - void *private, - lib_msg_t *libmsg, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int payload_niov, - struct iovec *payload_iov, - size_t payload_offset, - size_t payload_nob) -{ - lo_desc_t lod = { - .lod_type = LOD_IOV, - .lod_niov = payload_niov, - .lod_offset = payload_offset, - .lod_nob = payload_nob, - .lod_iov = { .iov = payload_iov } }; - ptl_err_t rc; - - rc = do_lib_parse(nal, hdr, &lod, 1); - if (rc == PTL_OK) - lib_finalize(nal, private, libmsg, PTL_OK); + if (!msg->msg_routing) { + /* I'm the source and now I know which NI to send on */ + src_nid = lnet_ptlcompat_srcnid(src_nid, dst_nid); + msg->msg_hdr.src_nid = cpu_to_le64(src_nid); + } - return rc; + msg->msg_target_is_router = 1; + msg->msg_target.nid = lp->lp_nid; + msg->msg_target.pid = LUSTRE_SRV_LNET_PID; + } + + /* 'lp' is our best choice of peer */ + + LASSERT (!msg->msg_peertxcredit); + LASSERT (!msg->msg_txcredit); + LASSERT (msg->msg_txpeer == NULL); + + msg->msg_txpeer = lp; /* msg takes my ref on lp */ + + rc = lnet_post_send_locked(msg, 0); + LNET_UNLOCK(); + + if (rc == 0) + lnet_ni_send(src_ni, msg); + + return 0; } -ptl_err_t -lib_lo_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, - ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen) +static void +lnet_commit_md (lnet_libmd_t *md, lnet_msg_t *msg) { - if (mlen == 0) { - lib_finalize(nal, private, msg, PTL_OK); - return PTL_OK; + /* ALWAYS called holding the LNET_LOCK */ + /* Here, we commit the MD to a network OP by marking it busy and + * decrementing its threshold. Come what may, the network "owns" + * the MD until a call to lnet_finalize() signals completion. 
*/ + LASSERT (!msg->msg_routing); + + msg->msg_md = md; + + md->md_refcount++; + if (md->md_threshold != LNET_MD_THRESH_INF) { + LASSERT (md->md_threshold > 0); + md->md_threshold--; } - if ((md->options & PTL_MD_KIOV) == 0) - return lib_lo_rxiov(nal, private, msg, - md->md_niov, md->md_iov.iov, - offset, mlen, rlen); + the_lnet.ln_counters.msgs_alloc++; + if (the_lnet.ln_counters.msgs_alloc > + the_lnet.ln_counters.msgs_max) + the_lnet.ln_counters.msgs_max = + the_lnet.ln_counters.msgs_alloc; - return lib_lo_rxkiov(nal, private, msg, - md->md_niov, md->md_iov.kiov, - offset, mlen, rlen); + LASSERT (!msg->msg_onactivelist); + msg->msg_onactivelist = 1; + list_add (&msg->msg_activelist, &the_lnet.ln_active_msgs); } -ptl_err_t -lib_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, lib_md_t *md, - ptl_size_t offset, ptl_size_t mlen, ptl_size_t rlen) +static void +lnet_drop_message (lnet_ni_t *ni, void *private, unsigned int nob) { - if (mlen == 0) - return (nal->libnal_recv(nal, private, msg, - 0, NULL, - offset, mlen, rlen)); - - if ((md->options & PTL_MD_KIOV) == 0) - return (nal->libnal_recv(nal, private, msg, - md->md_niov, md->md_iov.iov, - offset, mlen, rlen)); - - return (nal->libnal_recv_pages(nal, private, msg, - md->md_niov, md->md_iov.kiov, - offset, mlen, rlen)); + LNET_LOCK(); + the_lnet.ln_counters.drop_count++; + the_lnet.ln_counters.drop_length += nob; + LNET_UNLOCK(); + + lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob); } -ptl_err_t -lib_send (lib_nal_t *nal, void *private, lib_msg_t *msg, - ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, - lib_md_t *md, ptl_size_t offset, ptl_size_t len) +static void +lnet_drop_delayed_put(lnet_msg_t *msg, char *reason) { - int loopback = (nal->libnal_ni.ni_loopback && - (nid == nal->libnal_ni.ni_pid.nid)); - - if (len == 0) { - if (loopback) - return lib_lo_txiov(nal, private, msg, - hdr, type, nid, pid, - 0, NULL, - offset, len); - else - return nal->libnal_send(nal, private, msg, - hdr, type, nid, pid, - 0, NULL, - offset, len); - } - - if ((md->options & PTL_MD_KIOV) == 0) { - if (loopback) - return lib_lo_txiov(nal, private, msg, - hdr, type, nid, pid, - md->md_niov, md->md_iov.iov, - offset, len); - else - return nal->libnal_send(nal, private, msg, - hdr, type, nid, pid, - md->md_niov, md->md_iov.iov, - offset, len); - } - - if (loopback) - return lib_lo_txkiov(nal, private, msg, - hdr, type, nid, pid, - md->md_niov, md->md_iov.kiov, - offset, len); - else - return nal->libnal_send_pages(nal, private, msg, - hdr, type, nid, pid, - md->md_niov, md->md_iov.kiov, - offset, len); + LASSERT (msg->msg_md == NULL); + LASSERT (msg->msg_delayed); + LASSERT (msg->msg_rxpeer != NULL); + LASSERT (msg->msg_hdr.type == LNET_MSG_PUT); + + CWARN("Dropping delayed PUT from %s portal %d match "LPU64 + " offset %d length %d: %s\n", + libcfs_id2str((lnet_process_id_t){ + .nid = msg->msg_hdr.src_nid, + .pid = msg->msg_hdr.src_pid}), + msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.offset, + msg->msg_hdr.payload_length, + reason); + + /* NB I can't drop msg's ref on msg_rxpeer until after I've + * called lnet_drop_message(), so I just hang onto msg as well + * until that's done */ + + lnet_drop_message(msg->msg_rxpeer->lp_ni, + msg->msg_private, msg->msg_len); + + LNET_LOCK(); + + lnet_peer_decref_locked(msg->msg_rxpeer); + msg->msg_rxpeer = NULL; + + lnet_msg_free(msg); + + LNET_UNLOCK(); } -static void -lib_commit_md (lib_nal_t *nal, lib_md_t *md, lib_msg_t *msg) +int +LNetSetLazyPortal(int portal) { - 
/* ALWAYS called holding the LIB_LOCK */ - lib_counters_t *counters = &nal->libnal_ni.ni_counters; + lnet_portal_t *ptl = &the_lnet.ln_portals[portal]; - /* Here, we commit the MD to a network OP by marking it busy and - * decrementing its threshold. Come what may, the network "owns" - * the MD until a call to lib_finalize() signals completion. */ - msg->md = md; + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + CDEBUG(D_NET, "Setting portal %d lazy\n", portal); + + LNET_LOCK(); + + ptl->ptl_options |= LNET_PTL_LAZY; + + LNET_UNLOCK(); + + return 0; +} + +int +LNetClearLazyPortal(int portal) +{ + struct list_head zombies; + lnet_portal_t *ptl = &the_lnet.ln_portals[portal]; + lnet_msg_t *msg; + + if (portal < 0 || portal >= the_lnet.ln_nportals) + return -EINVAL; + + LNET_LOCK(); - md->pending++; - if (md->threshold != PTL_MD_THRESH_INF) { - LASSERT (md->threshold > 0); - md->threshold--; + if ((ptl->ptl_options & LNET_PTL_LAZY) == 0) { + LNET_UNLOCK(); + return 0; } - counters->msgs_alloc++; - if (counters->msgs_alloc > counters->msgs_max) - counters->msgs_max = counters->msgs_alloc; + CDEBUG(D_NET, "clearing portal %d lazy\n", portal); - list_add (&msg->msg_list, &nal->libnal_ni.ni_active_msgs); + /* grab all the blocked messages atomically */ + list_add(&zombies, &ptl->ptl_msgq); + list_del_init(&ptl->ptl_msgq); + + ptl->ptl_msgq_version++; + ptl->ptl_options &= ~LNET_PTL_LAZY; + + LNET_UNLOCK(); + + while (!list_empty(&zombies)) { + msg = list_entry(zombies.next, lnet_msg_t, msg_list); + list_del(&msg->msg_list); + + lnet_drop_delayed_put(msg, "Clearing lazy portal attr"); + } + + return 0; } static void -lib_drop_message (lib_nal_t *nal, void *private, ptl_hdr_t *hdr, int loopback) +lnet_recv_put(lnet_libmd_t *md, lnet_msg_t *msg, int delayed, + unsigned int offset, unsigned int mlength) { - unsigned long flags; + lnet_hdr_t *hdr = &msg->msg_hdr; + + LNET_LOCK(); + + the_lnet.ln_counters.recv_count++; + the_lnet.ln_counters.recv_length += mlength; - /* CAVEAT EMPTOR: this only drops messages that we've not committed - * to receive (init_msg() not called) and therefore can't cause an - * event. */ + LNET_UNLOCK(); - LIB_LOCK(nal, flags); - nal->libnal_ni.ni_counters.drop_count++; - nal->libnal_ni.ni_counters.drop_length += hdr->payload_length; - LIB_UNLOCK(nal, flags); + if (mlength != 0) + lnet_setpayloadbuffer(msg); - /* NULL msg => if NAL calls lib_finalize it will be a noop */ - if (!loopback) - (void) lib_recv(nal, private, NULL, NULL, 0, 0, - hdr->payload_length); + msg->msg_ev.type = LNET_EVENT_PUT; + msg->msg_ev.target.pid = hdr->dest_pid; + msg->msg_ev.target.nid = hdr->dest_nid; + msg->msg_ev.hdr_data = hdr->msg.put.hdr_data; + + /* Must I ACK? If so I'll grab the ack_wmd out of the header and put + * it back into the ACK during lnet_finalize() */ + msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) && + (md->md_options & LNET_MD_ACK_DISABLE) == 0); + + lnet_ni_recv(msg->msg_rxpeer->lp_ni, + msg->msg_private, + msg, delayed, offset, mlength, + hdr->payload_length); } -/* - * Incoming messages have a ptl_msg_t object associated with them - * by the library. This object encapsulates the state of the - * message and allows the NAL to do non-blocking receives or sends - * of long messages. 
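LNetSetLazyPortal above arms a portal to queue unmatched PUTs on ptl_msgq instead of dropping them; LNetClearLazyPortal drains whatever is still queued. A hypothetical server-side call sequence (the portal number and the ordering are illustrative only, not taken from this patch):

        /* Hypothetical service startup: mark the request portal lazy
         * before clients can send, so early PUTs wait for MDs. */
        static void server_setup(void)
        {
                int rc = LNetSetLazyPortal(12);   /* 12: illustrative */

                LASSERT(rc == 0);
                /* ... post request MEs/MDs; queued PUTs match as MDs
                 * appear ... */
        }

        static void server_teardown(void)
        {
                /* drop anything still parked on the portal */
                int rc = LNetClearLazyPortal(12);

                LASSERT(rc == 0);
        }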
- * - */ -static ptl_err_t -parse_put(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, - lib_msg_t *msg, int loopback) +/* called with LNET_LOCK held */ +void +lnet_match_blocked_msg(lnet_libmd_t *md) { - lib_ni_t *ni = &nal->libnal_ni; - ptl_size_t mlength = 0; - ptl_size_t offset = 0; - ptl_err_t rc; - lib_md_t *md; - unsigned long flags; + CFS_LIST_HEAD (drops); + CFS_LIST_HEAD (matches); + struct list_head *tmp; + struct list_head *entry; + lnet_msg_t *msg; + lnet_me_t *me = md->md_me; + lnet_portal_t *ptl = &the_lnet.ln_portals[me->me_portal]; - /* Convert put fields to host byte order */ - hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits); - hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index); - hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset); + LASSERT (me->me_portal < the_lnet.ln_nportals); + + if ((ptl->ptl_options & LNET_PTL_LAZY) == 0) { + LASSERT (list_empty(&ptl->ptl_msgq)); + return; + } + + LASSERT (md->md_refcount == 0); /* a brand new MD */ + + list_for_each_safe (entry, tmp, &ptl->ptl_msgq) { + int rc; + int index; + unsigned int mlength; + unsigned int offset; + lnet_hdr_t *hdr; + lnet_process_id_t src; + + msg = list_entry(entry, lnet_msg_t, msg_list); + + LASSERT (msg->msg_delayed); + + hdr = &msg->msg_hdr; + index = hdr->msg.put.ptl_index; + + src.nid = hdr->src_nid; + src.pid = hdr->src_pid; + + rc = lnet_try_match_md(index, LNET_MD_OP_PUT, src, + hdr->payload_length, + hdr->msg.put.offset, + hdr->msg.put.match_bits, + md, msg, &mlength, &offset); + + if (rc == LNET_MATCHMD_NONE) + continue; + + /* Hurrah! This _is_ a match */ + list_del(&msg->msg_list); + ptl->ptl_msgq_version++; + + if (rc == LNET_MATCHMD_OK) { + list_add_tail(&msg->msg_list, &matches); + + CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d " + "match "LPU64" offset %d length %d.\n", + libcfs_id2str(src), + hdr->msg.put.ptl_index, + hdr->msg.put.match_bits, + hdr->msg.put.offset, + hdr->payload_length); + } else { + LASSERT (rc == LNET_MATCHMD_DROP); - LIB_LOCK(nal, flags); + list_add_tail(&msg->msg_list, &drops); + } - md = lib_match_md(nal, hdr->msg.put.ptl_index, PTL_MD_OP_PUT, - hdr->src_nid, hdr->src_pid, - hdr->payload_length, hdr->msg.put.offset, - hdr->msg.put.match_bits, msg, - &mlength, &offset); - if (md == NULL) { - LIB_UNLOCK(nal, flags); - return (PTL_FAIL); + if (lnet_md_exhausted(md)) + break; } - msg->ev.type = PTL_EVENT_PUT_END; - msg->ev.hdr_data = hdr->msg.put.hdr_data; + LNET_UNLOCK(); - if (!ptl_is_wire_handle_none(&hdr->msg.put.ack_wmd) && - !(md->options & PTL_MD_ACK_DISABLE)) { - msg->ack_wmd = hdr->msg.put.ack_wmd; + list_for_each_safe (entry, tmp, &drops) { + msg = list_entry(entry, lnet_msg_t, msg_list); + + list_del(&msg->msg_list); + + lnet_drop_delayed_put(msg, "Bad match"); } - ni->ni_counters.recv_count++; - ni->ni_counters.recv_length += mlength; + list_for_each_safe (entry, tmp, &matches) { + msg = list_entry(entry, lnet_msg_t, msg_list); - LIB_UNLOCK(nal, flags); + list_del(&msg->msg_list); - if (loopback) - rc = lib_lo_recv(nal, private, msg, md, offset, mlength, - hdr->payload_length); - else - rc = lib_recv(nal, private, msg, md, offset, mlength, - hdr->payload_length); + /* md won't disappear under me, since each msg + * holds a ref on it */ + lnet_recv_put(md, msg, 1, + msg->msg_ev.offset, + msg->msg_ev.mlength); + } - if (rc != PTL_OK) - CERROR(LPU64": error on receiving PUT from "LPU64": %d\n", - ni->ni_pid.nid, hdr->src_nid, rc); + LNET_LOCK(); +} - return (rc); +static int +lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) +{ + 
int rc; + int index; + lnet_hdr_t *hdr = &msg->msg_hdr; + unsigned int rlength = hdr->payload_length; + unsigned int mlength = 0; + unsigned int offset = 0; + lnet_process_id_t src = {/* .nid = */ hdr->src_nid, + /* .pid = */ hdr->src_pid}; + lnet_libmd_t *md; + + /* Convert put fields to host byte order */ + hdr->msg.put.match_bits = le64_to_cpu(hdr->msg.put.match_bits); + hdr->msg.put.ptl_index = le32_to_cpu(hdr->msg.put.ptl_index); + hdr->msg.put.offset = le32_to_cpu(hdr->msg.put.offset); + + index = hdr->msg.put.ptl_index; + + LNET_LOCK(); + + rc = lnet_match_md(index, LNET_MD_OP_PUT, src, + rlength, hdr->msg.put.offset, + hdr->msg.put.match_bits, msg, + &mlength, &offset, &md); + switch (rc) { + default: + LBUG(); + + case LNET_MATCHMD_OK: + LNET_UNLOCK(); + lnet_recv_put(md, msg, 0, offset, mlength); + return 0; + + case LNET_MATCHMD_NONE: + rc = lnet_eager_recv_locked(msg); + if (rc == 0) { + list_add_tail(&msg->msg_list, + &the_lnet.ln_portals[index].ptl_msgq); + + the_lnet.ln_portals[index].ptl_msgq_version++; + + CDEBUG(D_NET, "Delaying PUT from %s portal %d match " + LPU64" offset %d length %d: no match \n", + libcfs_id2str(src), index, + hdr->msg.put.match_bits, + hdr->msg.put.offset, rlength); + + LNET_UNLOCK(); + return 0; + } + /* fall through */ + + case LNET_MATCHMD_DROP: + CWARN("Dropping PUT from %s portal %d match "LPU64 + " offset %d length %d: %d\n", + libcfs_id2str(src), index, + hdr->msg.put.match_bits, + hdr->msg.put.offset, rlength, rc); + LNET_UNLOCK(); + + return ENOENT; /* +ve: OK but no match */ + + } } -static ptl_err_t -parse_get(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, - lib_msg_t *msg, int loopback) +static int +lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) { - lib_ni_t *ni = &nal->libnal_ni; - ptl_size_t mlength = 0; - ptl_size_t offset = 0; - lib_md_t *md; - ptl_hdr_t reply; - unsigned long flags; - int rc; + lnet_hdr_t *hdr = &msg->msg_hdr; + unsigned int mlength = 0; + unsigned int offset = 0; + lnet_process_id_t src = {/* .nid = */ hdr->src_nid, + /* .pid = */ hdr->src_pid}; + lnet_handle_wire_t reply_wmd; + lnet_libmd_t *md; + int rc; /* Convert get fields to host byte order */ hdr->msg.get.match_bits = le64_to_cpu(hdr->msg.get.match_bits); @@ -1030,221 +1667,228 @@ parse_get(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, hdr->msg.get.sink_length = le32_to_cpu(hdr->msg.get.sink_length); hdr->msg.get.src_offset = le32_to_cpu(hdr->msg.get.src_offset); - LIB_LOCK(nal, flags); - - md = lib_match_md(nal, hdr->msg.get.ptl_index, PTL_MD_OP_GET, - hdr->src_nid, hdr->src_pid, - hdr->msg.get.sink_length, hdr->msg.get.src_offset, - hdr->msg.get.match_bits, msg, - &mlength, &offset); - if (md == NULL) { - LIB_UNLOCK(nal, flags); - return (PTL_FAIL); + LNET_LOCK(); + + rc = lnet_match_md(hdr->msg.get.ptl_index, LNET_MD_OP_GET, src, + hdr->msg.get.sink_length, hdr->msg.get.src_offset, + hdr->msg.get.match_bits, msg, + &mlength, &offset, &md); + if (rc == LNET_MATCHMD_DROP) { + CWARN("Dropping GET from %s portal %d match "LPU64 + " offset %d length %d\n", + libcfs_id2str(src), + hdr->msg.get.ptl_index, + hdr->msg.get.match_bits, + hdr->msg.get.src_offset, + hdr->msg.get.sink_length); + LNET_UNLOCK(); + return ENOENT; /* +ve: OK but no match */ } - msg->ev.type = PTL_EVENT_GET_END; - msg->ev.hdr_data = 0; + LASSERT (rc == LNET_MATCHMD_OK); + + the_lnet.ln_counters.send_count++; + the_lnet.ln_counters.send_length += mlength; + + LNET_UNLOCK(); - ni->ni_counters.send_count++; - ni->ni_counters.send_length += mlength; + reply_wmd = 
hdr->msg.get.return_wmd; - LIB_UNLOCK(nal, flags); + lnet_prep_send(msg, LNET_MSG_REPLY, src, offset, mlength); - memset (&reply, 0, sizeof (reply)); - reply.type = cpu_to_le32(PTL_MSG_REPLY); - reply.dest_nid = cpu_to_le64(hdr->src_nid); - reply.dest_pid = cpu_to_le32(hdr->src_pid); - reply.src_nid = cpu_to_le64(ni->ni_pid.nid); - reply.src_pid = cpu_to_le32(ni->ni_pid.pid); - reply.payload_length = cpu_to_le32(mlength); + msg->msg_hdr.msg.reply.dst_wmd = reply_wmd; - reply.msg.reply.dst_wmd = hdr->msg.get.return_wmd; + msg->msg_ev.type = LNET_EVENT_GET; + msg->msg_ev.target.pid = hdr->dest_pid; + msg->msg_ev.target.nid = hdr->dest_nid; + msg->msg_ev.hdr_data = 0; - /* NB call lib_send() _BEFORE_ lib_recv() completes the incoming - * message. Some NALs _require_ this to implement optimized GET */ + if (rdma_get) { + /* The LND completes the REPLY from her recv procedure */ + lnet_ni_recv(ni, msg->msg_private, msg, 0, + msg->msg_offset, msg->msg_len, msg->msg_len); + return 0; + } - rc = lib_send (nal, private, msg, &reply, PTL_MSG_REPLY, - hdr->src_nid, hdr->src_pid, md, offset, mlength); - if (rc != PTL_OK) - CERROR(LPU64": Unable to send REPLY for GET from "LPU64": %d\n", - ni->ni_pid.nid, hdr->src_nid, rc); + lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0); + msg->msg_receiving = 0; + + rc = lnet_send(ni->ni_nid, msg); + if (rc < 0) { + /* didn't get as far as lnet_ni_send() */ + CERROR("%s: Unable to send REPLY for GET from %s: %d\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), rc); - /* Discard any junk after the hdr */ - if (!loopback) - (void) lib_recv(nal, private, NULL, NULL, 0, 0, - hdr->payload_length); + lnet_finalize(ni, msg, rc); + } - return (rc); + return 0; } -static ptl_err_t -parse_reply(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, - lib_msg_t *msg, int loopback) +static int +lnet_parse_reply(lnet_ni_t *ni, lnet_msg_t *msg) { - lib_ni_t *ni = &nal->libnal_ni; - lib_md_t *md; - int rlength; - int length; - unsigned long flags; - ptl_err_t rc; + void *private = msg->msg_private; + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_process_id_t src = {/* .nid = */ hdr->src_nid, + /* .pid = */ hdr->src_pid}; + lnet_libmd_t *md; + int rlength; + int mlength; - LIB_LOCK(nal, flags); + LNET_LOCK(); /* NB handles only looked up by creator (no flips) */ - md = ptl_wire_handle2md(&hdr->msg.reply.dst_wmd, nal); - if (md == NULL || md->threshold == 0) { - CERROR (LPU64": Dropping REPLY from "LPU64" for %s MD "LPX64"."LPX64"\n", - ni->ni_pid.nid, hdr->src_nid, - md == NULL ? "invalid" : "inactive", - hdr->msg.reply.dst_wmd.wh_interface_cookie, - hdr->msg.reply.dst_wmd.wh_object_cookie); - - LIB_UNLOCK(nal, flags); - return (PTL_FAIL); - } - - LASSERT (md->offset == 0); - - length = rlength = hdr->payload_length; - - if (length > md->length) { - if ((md->options & PTL_MD_TRUNCATE) == 0) { - CERROR (LPU64": Dropping REPLY from "LPU64 - " length %d for MD "LPX64" would overflow (%d)\n", - ni->ni_pid.nid, hdr->src_nid, length, - hdr->msg.reply.dst_wmd.wh_object_cookie, - md->length); - LIB_UNLOCK(nal, flags); - return (PTL_FAIL); - } - length = md->length; + md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd); + if (md == NULL || md->md_threshold == 0) { + CWARN("%s: Dropping REPLY from %s for %s " + "MD "LPX64"."LPX64"\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + (md == NULL) ? 
"invalid" : "inactive", + hdr->msg.reply.dst_wmd.wh_interface_cookie, + hdr->msg.reply.dst_wmd.wh_object_cookie); + + LNET_UNLOCK(); + return ENOENT; /* +ve: OK but no match */ } - CDEBUG(D_NET, "Reply from "LPU64" of length %d/%d into md "LPX64"\n", - hdr->src_nid, length, rlength, - hdr->msg.reply.dst_wmd.wh_object_cookie); + LASSERT (md->md_offset == 0); - lib_commit_md(nal, md, msg); + rlength = hdr->payload_length; + mlength = MIN(rlength, md->md_length); - msg->ev.type = PTL_EVENT_REPLY_END; - msg->ev.initiator.nid = hdr->src_nid; - msg->ev.initiator.pid = hdr->src_pid; - msg->ev.rlength = rlength; - msg->ev.mlength = length; - msg->ev.offset = 0; + if (mlength < rlength && + (md->md_options & LNET_MD_TRUNCATE) == 0) { + CERROR ("%s: Dropping REPLY from %s length %d " + "for MD "LPX64" would overflow (%d)\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + rlength, hdr->msg.reply.dst_wmd.wh_object_cookie, + mlength); + LNET_UNLOCK(); + return ENOENT; /* +ve: OK but no match */ + } - lib_md_deconstruct(nal, md, &msg->ev.md); - ptl_md2handle(&msg->ev.md_handle, nal, md); + CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md "LPX64"\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie); - ni->ni_counters.recv_count++; - ni->ni_counters.recv_length += length; + lnet_commit_md(md, msg); - LIB_UNLOCK(nal, flags); + if (mlength != 0) + lnet_setpayloadbuffer(msg); - if (loopback) - rc = lib_lo_recv(nal, private, msg, md, 0, length, rlength); - else - rc = lib_recv(nal, private, msg, md, 0, length, rlength); + msg->msg_ev.type = LNET_EVENT_REPLY; + msg->msg_ev.target.pid = hdr->dest_pid; + msg->msg_ev.target.nid = hdr->dest_nid; + msg->msg_ev.initiator = src; + msg->msg_ev.rlength = rlength; + msg->msg_ev.mlength = mlength; + msg->msg_ev.offset = 0; - if (rc != PTL_OK) - CERROR(LPU64": error on receiving REPLY from "LPU64": %d\n", - ni->ni_pid.nid, hdr->src_nid, rc); + lnet_md_deconstruct(md, &msg->msg_ev.md); + lnet_md2handle(&msg->msg_ev.md_handle, md); - return (rc); + the_lnet.ln_counters.recv_count++; + the_lnet.ln_counters.recv_length += mlength; + + LNET_UNLOCK(); + + lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength); + return 0; } -static ptl_err_t -parse_ack(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, - lib_msg_t *msg, int loopback) +static int +lnet_parse_ack(lnet_ni_t *ni, lnet_msg_t *msg) { - lib_ni_t *ni = &nal->libnal_ni; - lib_md_t *md; - unsigned long flags; + lnet_hdr_t *hdr = &msg->msg_hdr; + lnet_process_id_t src = {/* .nid = */ hdr->src_nid, + /* .pid = */ hdr->src_pid}; + lnet_libmd_t *md; /* Convert ack fields to host byte order */ hdr->msg.ack.match_bits = le64_to_cpu(hdr->msg.ack.match_bits); hdr->msg.ack.mlength = le32_to_cpu(hdr->msg.ack.mlength); - LIB_LOCK(nal, flags); + LNET_LOCK(); /* NB handles only looked up by creator (no flips) */ - md = ptl_wire_handle2md(&hdr->msg.ack.dst_wmd, nal); - if (md == NULL || md->threshold == 0) { - CDEBUG(D_INFO, LPU64": Dropping ACK from "LPU64" to %s MD " - LPX64"."LPX64"\n", ni->ni_pid.nid, hdr->src_nid, - (md == NULL) ? "invalid" : "inactive", - hdr->msg.ack.dst_wmd.wh_interface_cookie, - hdr->msg.ack.dst_wmd.wh_object_cookie); - - LIB_UNLOCK(nal, flags); - return (PTL_FAIL); + md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd); + if (md == NULL || md->md_threshold == 0) { +#if 0 + /* Don't moan; this is expected */ + CERROR ("%s: Dropping ACK from %s to %s MD "LPX64"."LPX64"\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), + (md == NULL) ? 
"invalid" : "inactive", + hdr->msg.ack.dst_wmd.wh_interface_cookie, + hdr->msg.ack.dst_wmd.wh_object_cookie); +#endif + LNET_UNLOCK(); + return ENOENT; /* +ve! */ } - CDEBUG(D_NET, LPU64": ACK from "LPU64" into md "LPX64"\n", - ni->ni_pid.nid, hdr->src_nid, + CDEBUG(D_NET, "%s: ACK from %s into md "LPX64"\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), hdr->msg.ack.dst_wmd.wh_object_cookie); - lib_commit_md(nal, md, msg); - - msg->ev.type = PTL_EVENT_ACK; - msg->ev.initiator.nid = hdr->src_nid; - msg->ev.initiator.pid = hdr->src_pid; - msg->ev.mlength = hdr->msg.ack.mlength; - msg->ev.match_bits = hdr->msg.ack.match_bits; - - lib_md_deconstruct(nal, md, &msg->ev.md); - ptl_md2handle(&msg->ev.md_handle, nal, md); + lnet_commit_md(md, msg); - ni->ni_counters.recv_count++; + msg->msg_ev.type = LNET_EVENT_ACK; + msg->msg_ev.target.pid = hdr->dest_pid; + msg->msg_ev.target.nid = hdr->dest_nid; + msg->msg_ev.initiator = src; + msg->msg_ev.mlength = hdr->msg.ack.mlength; + msg->msg_ev.match_bits = hdr->msg.ack.match_bits; - LIB_UNLOCK(nal, flags); + lnet_md_deconstruct(md, &msg->msg_ev.md); + lnet_md2handle(&msg->msg_ev.md_handle, md); - /* We have received and matched up the ack OK, create the - * completion event now... */ - lib_finalize(nal, private, msg, PTL_OK); + the_lnet.ln_counters.recv_count++; - /* ...and now discard any junk after the hdr */ - if (!loopback) - (void) lib_recv(nal, private, NULL, NULL, 0, 0, - hdr->payload_length); + LNET_UNLOCK(); - return (PTL_OK); + lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len); + return 0; } -static char * -hdr_type_string (ptl_hdr_t *hdr) +char * +lnet_msgtyp2str (int type) { - switch (hdr->type) { - case PTL_MSG_ACK: + switch (type) { + case LNET_MSG_ACK: return ("ACK"); - case PTL_MSG_PUT: + case LNET_MSG_PUT: return ("PUT"); - case PTL_MSG_GET: + case LNET_MSG_GET: return ("GET"); - case PTL_MSG_REPLY: + case LNET_MSG_REPLY: return ("REPLY"); - case PTL_MSG_HELLO: + case LNET_MSG_HELLO: return ("HELLO"); default: return (""); } } -void print_hdr(lib_nal_t *nal, ptl_hdr_t * hdr) +void +lnet_print_hdr(lnet_hdr_t * hdr) { - char *type_str = hdr_type_string (hdr); + lnet_process_id_t src = {/* .nid = */ hdr->src_nid, + /* .pid = */ hdr->src_pid}; + lnet_process_id_t dst = {/* .nid = */ hdr->dest_nid, + /* .pid = */ hdr->dest_pid}; + char *type_str = lnet_msgtyp2str (hdr->type); CWARN("P3 Header at %p of type %s\n", hdr, type_str); - CWARN(" From nid/pid "LPX64"/%u", hdr->src_nid, hdr->src_pid); - CWARN(" To nid/pid "LPX64"/%u\n", hdr->dest_nid, hdr->dest_pid); + CWARN(" From %s\n", libcfs_id2str(src)); + CWARN(" To %s\n", libcfs_id2str(dst)); switch (hdr->type) { default: break; - case PTL_MSG_PUT: + case LNET_MSG_PUT: CWARN(" Ptl index %d, ack md "LPX64"."LPX64", " - "match bits "LPX64"\n", + "match bits "LPU64"\n", hdr->msg.put.ptl_index, hdr->msg.put.ack_wmd.wh_interface_cookie, hdr->msg.put.ack_wmd.wh_object_cookie, @@ -1254,9 +1898,9 @@ void print_hdr(lib_nal_t *nal, ptl_hdr_t * hdr) hdr->msg.put.hdr_data); break; - case PTL_MSG_GET: + case LNET_MSG_GET: CWARN(" Ptl index %d, return md "LPX64"."LPX64", " - "match bits "LPX64"\n", hdr->msg.get.ptl_index, + "match bits "LPU64"\n", hdr->msg.get.ptl_index, hdr->msg.get.return_wmd.wh_interface_cookie, hdr->msg.get.return_wmd.wh_object_cookie, hdr->msg.get.match_bits); @@ -1265,7 +1909,7 @@ void print_hdr(lib_nal_t *nal, ptl_hdr_t * hdr) hdr->msg.get.src_offset); break; - case PTL_MSG_ACK: + case LNET_MSG_ACK: CWARN(" dst md "LPX64"."LPX64", " "manipulated length %d\n", 
hdr->msg.ack.dst_wmd.wh_interface_cookie, @@ -1273,7 +1917,7 @@ void print_hdr(lib_nal_t *nal, ptl_hdr_t * hdr) hdr->msg.ack.mlength); break; - case PTL_MSG_REPLY: + case LNET_MSG_REPLY: CWARN(" dst md "LPX64"."LPX64", " "length %d\n", hdr->msg.reply.dst_wmd.wh_interface_cookie, @@ -1281,481 +1925,573 @@ void print_hdr(lib_nal_t *nal, ptl_hdr_t * hdr) hdr->payload_length); } -} /* end of print_hdr() */ +} -ptl_err_t -lib_parse(lib_nal_t *nal, ptl_hdr_t *hdr, void *private) +int +lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, + void *private, int rdma_req) { - return do_lib_parse(nal, hdr, private, 0); -} + int rc = 0; + int for_me; + lnet_msg_t *msg; + lnet_nid_t dest_nid; + lnet_nid_t src_nid; + __u32 payload_length; + __u32 type; -ptl_err_t -do_lib_parse(lib_nal_t *nal, ptl_hdr_t *hdr, void *private, int loopback) -{ - unsigned long flags; - ptl_err_t rc; - lib_msg_t *msg; + LASSERT (!in_interrupt ()); - /* NB we return PTL_OK if we manage to parse the header and believe - * it looks OK. Anything that goes wrong with receiving the - * message after that point is the responsibility of the NAL */ + type = le32_to_cpu(hdr->type); + src_nid = le64_to_cpu(hdr->src_nid); + dest_nid = le64_to_cpu(hdr->dest_nid); + payload_length = le32_to_cpu(hdr->payload_length); + + for_me = lnet_ptlcompat_matchnid(ni->ni_nid, dest_nid); + + switch (type) { + case LNET_MSG_ACK: + case LNET_MSG_GET: + if (payload_length > 0) { + CERROR("%s, src %s: bad %s payload %d (0 expected)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), payload_length); + return -EPROTO; + } + break; + + case LNET_MSG_PUT: + case LNET_MSG_REPLY: + if (payload_length > (for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) { + CERROR("%s, src %s: bad %s payload %d " + "(%d max expected)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), + payload_length, + for_me ? LNET_MAX_PAYLOAD : LNET_MTU); + return -EPROTO; + } + break; - /* convert common fields to host byte order */ - hdr->type = le32_to_cpu(hdr->type); - hdr->src_nid = le64_to_cpu(hdr->src_nid); - hdr->src_pid = le32_to_cpu(hdr->src_pid); - hdr->dest_pid = le32_to_cpu(hdr->dest_pid); - hdr->payload_length = le32_to_cpu(hdr->payload_length); + default: + CERROR("%s, src %s: Bad message type 0x%x\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), type); + return -EPROTO; + } - switch (hdr->type) { - case PTL_MSG_HELLO: { - /* dest_nid is really ptl_magicversion_t */ - ptl_magicversion_t *mv = (ptl_magicversion_t *)&hdr->dest_nid; - - mv->magic = le32_to_cpu(mv->magic); - mv->version_major = le16_to_cpu(mv->version_major); - mv->version_minor = le16_to_cpu(mv->version_minor); - - if (mv->magic == PORTALS_PROTO_MAGIC && - mv->version_major == PORTALS_PROTO_VERSION_MAJOR && - mv->version_minor == PORTALS_PROTO_VERSION_MINOR) { - CWARN (LPU64": Dropping unexpected HELLO message: " - "magic %d, version %d.%d from "LPD64"\n", - nal->libnal_ni.ni_pid.nid, mv->magic, - mv->version_major, mv->version_minor, - hdr->src_nid); - - /* it's good but we don't want it */ - lib_drop_message(nal, private, hdr, loopback); - return PTL_OK; + /* Regard a bad destination NID as a protocol error. 
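/* A compact restatement, for reference, of the payload sanity rules the
 * switch above enforces: ACK and GET travel header-only, while PUT and
 * REPLY are bounded by LNET_MAX_PAYLOAD when this node is the final
 * destination and by the single-hop LNET_MTU when the message is being
 * routed.  The helper name is ours; the LNET_MSG_* and size constants are
 * assumed from the lnet headers: */
static int
lnet_payload_length_ok(__u32 type, __u32 payload_length, int for_me)
{
        switch (type) {
        case LNET_MSG_ACK:
        case LNET_MSG_GET:
                return payload_length == 0;
        case LNET_MSG_PUT:
        case LNET_MSG_REPLY:
                return payload_length <=
                       (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU);
        default:
                return 0;       /* unknown type: protocol error */
        }
}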
Senders should + * know what they're doing; if they don't they're misconfigured, buggy + * or malicious so we chop them off at the knees :) */ + + if (!for_me) { + if (the_lnet.ln_ptlcompat > 0) { + /* portals compatibility is single-network */ + CERROR ("%s, src %s: Bad dest nid %s " + "(routing not supported)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; } - /* we got garbage */ - CERROR (LPU64": Bad HELLO message: " - "magic %d, version %d.%d from "LPD64"\n", - nal->libnal_ni.ni_pid.nid, mv->magic, - mv->version_major, mv->version_minor, - hdr->src_nid); - return PTL_FAIL; - } - - case PTL_MSG_ACK: - case PTL_MSG_PUT: - case PTL_MSG_GET: - case PTL_MSG_REPLY: - hdr->dest_nid = le64_to_cpu(hdr->dest_nid); - if (hdr->dest_nid != nal->libnal_ni.ni_pid.nid) { - CERROR(LPU64": BAD dest NID in %s message from" - LPU64" to "LPU64" (not me)\n", - nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr), - hdr->src_nid, hdr->dest_nid); - return PTL_FAIL; + if (the_lnet.ln_ptlcompat == 0 && + LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) { + /* should have gone direct */ + CERROR ("%s, src %s: Bad dest nid %s " + "(should have been sent direct)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; } - break; - default: - CERROR(LPU64": Bad message type 0x%x from "LPU64"\n", - nal->libnal_ni.ni_pid.nid, hdr->type, hdr->src_nid); - return PTL_FAIL; + if (the_lnet.ln_ptlcompat == 0 && + lnet_islocalnid(dest_nid)) { + /* dest is another local NI; sender should have used + * this node's NID on its own network */ + CERROR ("%s, src %s: Bad dest nid %s " + "(it's my nid but on a different network)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (rdma_req && type == LNET_MSG_GET) { + CERROR ("%s, src %s: Bad optimized GET for %s " + "(final destination must be me)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + return -EPROTO; + } + + if (!the_lnet.ln_routing) { + CERROR ("%s, src %s: Dropping message for %s " + "(routing not enabled)\n", + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), + libcfs_nid2str(dest_nid)); + goto drop; + } } - /* We've decided we're not receiving garbage since we can parse the - * header. We will return PTL_OK come what may... */ + /* Message looks OK; we're not going to return an error, so we MUST + * call back lnd_recv() come what may... */ - if (!list_empty (&nal->libnal_ni.ni_test_peers) && /* normally we don't */ - fail_peer (nal, hdr->src_nid, 0)) /* shall we now? */ + if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer (src_nid, 0)) /* shall we now? 
*/ { - CERROR(LPU64": Dropping incoming %s from "LPU64 - ": simulated failure\n", - nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr), - hdr->src_nid); - lib_drop_message(nal, private, hdr, loopback); - return PTL_OK; + CERROR("%s, src %s: Dropping %s to simulate failure\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type)); + goto drop; } - msg = lib_msg_alloc(nal); + msg = lnet_msg_alloc(); if (msg == NULL) { - CERROR(LPU64": Dropping incoming %s from "LPU64 - ": can't allocate a lib_msg_t\n", - nal->libnal_ni.ni_pid.nid, hdr_type_string (hdr), - hdr->src_nid); - lib_drop_message(nal, private, hdr, loopback); - return PTL_OK; + CERROR("%s, src %s: Dropping %s (out of memory)\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid) + , lnet_msgtyp2str(type)); + goto drop; } - switch (hdr->type) { - case PTL_MSG_ACK: - rc = parse_ack(nal, hdr, private, msg, loopback); + /* msg zeroed in lnet_msg_alloc; i.e. flags all clear, pointers NULL etc */ + + msg->msg_type = type; + msg->msg_private = private; + msg->msg_receiving = 1; + msg->msg_len = msg->msg_wanted = payload_length; + msg->msg_offset = 0; + msg->msg_hdr = *hdr; + + LNET_LOCK(); + rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid); + if (rc != 0) { + LNET_UNLOCK(); + CERROR("%s, src %s: Dropping %s " + "(error %d looking up sender)\n", + libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), + lnet_msgtyp2str(type), rc); + goto free_drop; + } + LNET_UNLOCK(); + +#ifndef __KERNEL__ + LASSERT (for_me); +#else + if (!for_me) { + msg->msg_target.pid = le32_to_cpu(hdr->dest_pid); + msg->msg_target.nid = dest_nid; + msg->msg_routing = 1; + msg->msg_offset = 0; + + LNET_LOCK(); + if (msg->msg_rxpeer->lp_rtrcredits <= 0 || + lnet_msg2bufpool(msg)->rbp_credits <= 0) { + rc = lnet_eager_recv_locked(msg); + if (rc != 0) { + LNET_UNLOCK(); + goto free_drop; + } + } + + lnet_commit_routedmsg(msg); + rc = lnet_post_routed_recv_locked(msg, 0); + LNET_UNLOCK(); + + if (rc == 0) + lnet_ni_recv(ni, msg->msg_private, msg, 0, + 0, payload_length, payload_length); + return 0; + } +#endif + /* convert common msg->hdr fields to host byteorder */ + msg->msg_hdr.type = type; + msg->msg_hdr.src_nid = src_nid; + msg->msg_hdr.src_pid = le32_to_cpu(msg->msg_hdr.src_pid); + msg->msg_hdr.dest_nid = dest_nid; + msg->msg_hdr.dest_pid = le32_to_cpu(msg->msg_hdr.dest_pid); + msg->msg_hdr.payload_length = payload_length; + + switch (type) { + case LNET_MSG_ACK: + rc = lnet_parse_ack(ni, msg); break; - case PTL_MSG_PUT: - rc = parse_put(nal, hdr, private, msg, loopback); + case LNET_MSG_PUT: + rc = lnet_parse_put(ni, msg); break; - case PTL_MSG_GET: - rc = parse_get(nal, hdr, private, msg, loopback); + case LNET_MSG_GET: + rc = lnet_parse_get(ni, msg, rdma_req); break; - case PTL_MSG_REPLY: - rc = parse_reply(nal, hdr, private, msg, loopback); + case LNET_MSG_REPLY: + rc = lnet_parse_reply(ni, msg); break; default: LASSERT(0); - rc = PTL_FAIL; /* no compiler warning please */ - break; + goto free_drop; /* prevent an unused label if !kernel */ } - if (rc != PTL_OK) { - if (msg->md != NULL) { - /* committed... 
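/* The sign convention at work here, sketched for reference: lnet_parse()
 * returns a negative errno only while it can still refuse the message
 * (-EPROTO above); once the header is accepted it must see the payload
 * through lnd_recv() and returns 0, and the per-type parsers hand back
 * positive ENOENT for "well-formed but matched no MD", which funnels into
 * the free_drop/drop paths below.  A sketch of a caller honouring this;
 * close_connection() is hypothetical: */
void close_connection(void *priv);

static void
lnd_deliver_hdr(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from, void *priv)
{
        int rc = lnet_parse(ni, hdr, from, priv, 0);

        if (rc < 0)             /* protocol error: peer is misbehaving */
                close_connection(priv);
        /* rc == 0: LNet owns the message and will call back lnd_recv() */
}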
*/ - lib_finalize(nal, private, msg, rc); - } else { - LIB_LOCK(nal, flags); - lib_msg_free(nal, msg); /* expects LIB_LOCK held */ - LIB_UNLOCK(nal, flags); - - lib_drop_message(nal, private, hdr, loopback); - } + if (rc == 0) + return 0; + + LASSERT (rc == ENOENT); + + free_drop: + LASSERT (msg->msg_md == NULL); + LNET_LOCK(); + if (msg->msg_rxpeer != NULL) { + lnet_peer_decref_locked(msg->msg_rxpeer); + msg->msg_rxpeer = NULL; } + lnet_msg_free(msg); /* expects LNET_LOCK held */ + LNET_UNLOCK(); - return PTL_OK; - /* That's "OK I can parse it", not "OK I like it" :) */ + drop: + lnet_drop_message(ni, private, payload_length); + return 0; } int -lib_api_put(nal_t *apinal, ptl_handle_md_t *mdh, - ptl_ack_req_t ack, ptl_process_id_t *id, - ptl_pt_index_t portal, ptl_ac_index_t ac, - ptl_match_bits_t match_bits, - ptl_size_t offset, ptl_hdr_data_t hdr_data) +LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, + lnet_process_id_t target, unsigned int portal, + __u64 match_bits, unsigned int offset, + __u64 hdr_data) { - lib_nal_t *nal = apinal->nal_data; - lib_ni_t *ni = &nal->libnal_ni; - lib_msg_t *msg; - ptl_hdr_t hdr; - lib_md_t *md; - unsigned long flags; + lnet_msg_t *msg; + lnet_libmd_t *md; int rc; - if (!list_empty (&ni->ni_test_peers) && /* normally we don't */ - fail_peer (nal, id->nid, 1)) /* shall we now? */ + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer (target.nid, 1)) /* shall we now? */ { - CERROR("Dropping PUT to "LPU64": simulated failure\n", - id->nid); - return PTL_PROCESS_INVALID; + CERROR("Dropping PUT to %s: simulated failure\n", + libcfs_id2str(target)); + return -EIO; } - msg = lib_msg_alloc(nal); + msg = lnet_msg_alloc(); if (msg == NULL) { - CERROR(LPU64": Dropping PUT to "LPU64": ENOMEM on lib_msg_t\n", - ni->ni_pid.nid, id->nid); - return PTL_NO_SPACE; + CERROR("Dropping PUT to %s: ENOMEM on lnet_msg_t\n", + libcfs_id2str(target)); + return -ENOMEM; } - LIB_LOCK(nal, flags); + LNET_LOCK(); - md = ptl_handle2md(mdh, nal); - if (md == NULL || md->threshold == 0) { - lib_msg_free(nal, msg); - LIB_UNLOCK(nal, flags); + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0) { + lnet_msg_free(msg); + LNET_UNLOCK(); - return PTL_MD_INVALID; + CERROR("Dropping PUT to %s: MD invalid\n", + libcfs_id2str(target)); + return -ENOENT; } - CDEBUG(D_NET, "PtlPut -> "LPX64"\n", id->nid); + CDEBUG(D_NET, "LNetPut -> %s\n", libcfs_id2str(target)); - memset (&hdr, 0, sizeof (hdr)); - hdr.type = cpu_to_le32(PTL_MSG_PUT); - hdr.dest_nid = cpu_to_le64(id->nid); - hdr.dest_pid = cpu_to_le32(id->pid); - hdr.src_nid = cpu_to_le64(ni->ni_pid.nid); - hdr.src_pid = cpu_to_le32(ni->ni_pid.pid); - hdr.payload_length = cpu_to_le32(md->length); + lnet_commit_md(md, msg); + + lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length); + + msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.put.offset = cpu_to_le32(offset); + msg->msg_hdr.msg.put.hdr_data = hdr_data; /* NB handles only looked up by creator (no flips) */ - if (ack == PTL_ACK_REQ) { - hdr.msg.put.ack_wmd.wh_interface_cookie = ni->ni_interface_cookie; - hdr.msg.put.ack_wmd.wh_object_cookie = md->md_lh.lh_cookie; + if (ack == LNET_ACK_REQ) { + msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = + md->md_lh.lh_cookie; } else { - 
hdr.msg.put.ack_wmd = PTL_WIRE_HANDLE_NONE; + msg->msg_hdr.msg.put.ack_wmd = LNET_WIRE_HANDLE_NONE; } - hdr.msg.put.match_bits = cpu_to_le64(match_bits); - hdr.msg.put.ptl_index = cpu_to_le32(portal); - hdr.msg.put.offset = cpu_to_le32(offset); - hdr.msg.put.hdr_data = hdr_data; - - lib_commit_md(nal, md, msg); - - msg->ev.type = PTL_EVENT_SEND_END; - msg->ev.initiator.nid = ni->ni_pid.nid; - msg->ev.initiator.pid = ni->ni_pid.pid; - msg->ev.pt_index = portal; - msg->ev.match_bits = match_bits; - msg->ev.rlength = md->length; - msg->ev.mlength = md->length; - msg->ev.offset = offset; - msg->ev.hdr_data = hdr_data; - - lib_md_deconstruct(nal, md, &msg->ev.md); - ptl_md2handle(&msg->ev.md_handle, nal, md); - - ni->ni_counters.send_count++; - ni->ni_counters.send_length += md->length; - - LIB_UNLOCK(nal, flags); - - rc = lib_send (nal, NULL, msg, &hdr, PTL_MSG_PUT, - id->nid, id->pid, md, 0, md->length); - if (rc != PTL_OK) { - CERROR("Error sending PUT to "LPX64": %d\n", - id->nid, rc); - lib_finalize (nal, NULL, msg, rc); + msg->msg_ev.type = LNET_EVENT_SEND; + msg->msg_ev.initiator.nid = LNET_NID_ANY; + msg->msg_ev.initiator.pid = the_lnet.ln_pid; + msg->msg_ev.target = target; + msg->msg_ev.pt_index = portal; + msg->msg_ev.match_bits = match_bits; + msg->msg_ev.rlength = md->md_length; + msg->msg_ev.mlength = md->md_length; + msg->msg_ev.offset = offset; + msg->msg_ev.hdr_data = hdr_data; + + lnet_md_deconstruct(md, &msg->msg_ev.md); + lnet_md2handle(&msg->msg_ev.md_handle, md); + + the_lnet.ln_counters.send_count++; + the_lnet.ln_counters.send_length += md->md_length; + + LNET_UNLOCK(); + + rc = lnet_send(self, msg); + if (rc != 0) { + CERROR("Error sending PUT to %s: %d\n", + libcfs_id2str(target), rc); + lnet_finalize (NULL, msg, rc); } /* completion will be signalled by an event */ - return PTL_OK; + return 0; } -lib_msg_t * -lib_create_reply_msg (lib_nal_t *nal, ptl_nid_t peer_nid, lib_msg_t *getmsg) +lnet_msg_t * +lnet_create_reply_msg (lnet_ni_t *ni, lnet_msg_t *getmsg) { - /* The NAL can DMA direct to the GET md (i.e. no REPLY msg). This - * returns a msg for the NAL to pass to lib_finalize() when the sink + /* The LND can DMA direct to the GET md (i.e. no REPLY msg). This + * returns a msg for the LND to pass to lnet_finalize() when the sink * data has been received. 
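/* A usage sketch for LNetPut() above: the MD handle must already exist
 * (e.g. from LNetMDBind()), and completion is reported asynchronously as
 * LNET_EVENT_SEND -- plus LNET_EVENT_ACK if requested -- on the MD's event
 * queue.  Portal, match bits and offset are placeholders, and passing
 * LNET_NID_ANY as 'self' is assumed to let LNet pick the source NI: */
static int
sample_put(lnet_handle_md_t mdh, lnet_process_id_t target)
{
        return LNetPut(LNET_NID_ANY, mdh, LNET_ACK_REQ, target,
                       0,               /* portal index */
                       0x0123ULL,       /* match bits */
                       0,               /* offset at the target */
                       0);              /* hdr_data */
}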
* * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when - * lib_finalize() is called on it, so the NAL must call this first */ + * lnet_finalize() is called on it, so the LND must call this first */ + + lnet_msg_t *msg = lnet_msg_alloc(); + lnet_libmd_t *getmd = getmsg->msg_md; + lnet_process_id_t peer_id = getmsg->msg_target; - lib_ni_t *ni = &nal->libnal_ni; - lib_msg_t *msg = lib_msg_alloc(nal); - lib_md_t *getmd = getmsg->md; - unsigned long flags; + LASSERT (!getmsg->msg_target_is_router); + LASSERT (!getmsg->msg_routing); - LIB_LOCK(nal, flags); + LNET_LOCK(); - LASSERT (getmd->pending > 0); + LASSERT (getmd->md_refcount > 0); if (msg == NULL) { - CERROR ("Dropping REPLY from "LPU64": can't allocate msg\n", - peer_nid); + CERROR ("%s: Dropping REPLY from %s: can't allocate msg\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id)); goto drop; } - if (getmd->threshold == 0) { - CERROR ("Dropping REPLY from "LPU64" for inactive MD %p\n", - peer_nid, getmd); + if (getmd->md_threshold == 0) { + CERROR ("%s: Dropping REPLY from %s for inactive MD %p\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), + getmd); goto drop_msg; } - LASSERT (getmd->offset == 0); + LASSERT (getmd->md_offset == 0); - CDEBUG(D_NET, "Reply from "LPU64" md %p\n", peer_nid, getmd); + CDEBUG(D_NET, "%s: Reply from %s md %p\n", + libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd); - lib_commit_md (nal, getmd, msg); + lnet_commit_md (getmd, msg); - msg->ev.type = PTL_EVENT_REPLY_END; - msg->ev.initiator.nid = peer_nid; - msg->ev.initiator.pid = 0; /* XXX FIXME!!! */ - msg->ev.rlength = msg->ev.mlength = getmd->length; - msg->ev.offset = 0; + msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */ - lib_md_deconstruct(nal, getmd, &msg->ev.md); - ptl_md2handle(&msg->ev.md_handle, nal, getmd); + msg->msg_ev.type = LNET_EVENT_REPLY; + msg->msg_ev.initiator = peer_id; + msg->msg_ev.rlength = msg->msg_ev.mlength = getmd->md_length; + msg->msg_ev.offset = 0; - ni->ni_counters.recv_count++; - ni->ni_counters.recv_length += getmd->length; + lnet_md_deconstruct(getmd, &msg->msg_ev.md); + lnet_md2handle(&msg->msg_ev.md_handle, getmd); - LIB_UNLOCK(nal, flags); + the_lnet.ln_counters.recv_count++; + the_lnet.ln_counters.recv_length += getmd->md_length; + + LNET_UNLOCK(); return msg; drop_msg: - lib_msg_free(nal, msg); + lnet_msg_free(msg); drop: - nal->libnal_ni.ni_counters.drop_count++; - nal->libnal_ni.ni_counters.drop_length += getmd->length; + the_lnet.ln_counters.drop_count++; + the_lnet.ln_counters.drop_length += getmd->md_length; - LIB_UNLOCK (nal, flags); + LNET_UNLOCK (); return NULL; } +void +lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len) +{ + /* Set the REPLY length, now the RDMA that elides the REPLY message has + * completed and I know it. */ + LASSERT (reply != NULL); + LASSERT (reply->msg_type == LNET_MSG_GET); + LASSERT (reply->msg_ev.type == LNET_EVENT_REPLY); + + /* NB I trusted my peer to RDMA. If she tells me she's written beyond + * the end of my buffer, I might as well be dead. 
*/ + LASSERT (len <= reply->msg_ev.mlength); + + reply->msg_ev.mlength = len; +} + int -lib_api_get(nal_t *apinal, ptl_handle_md_t *mdh, ptl_process_id_t *id, - ptl_pt_index_t portal, ptl_ac_index_t ac, - ptl_match_bits_t match_bits, ptl_size_t offset) +LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, + lnet_process_id_t target, unsigned int portal, + __u64 match_bits, unsigned int offset) { - lib_nal_t *nal = apinal->nal_data; - lib_ni_t *ni = &nal->libnal_ni; - lib_msg_t *msg; - ptl_hdr_t hdr; - lib_md_t *md; - unsigned long flags; + lnet_msg_t *msg; + lnet_libmd_t *md; int rc; - if (!list_empty (&ni->ni_test_peers) && /* normally we don't */ - fail_peer (nal, id->nid, 1)) /* shall we now? */ + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ + fail_peer (target.nid, 1)) /* shall we now? */ { - CERROR("Dropping PUT to "LPX64": simulated failure\n", - id->nid); - return PTL_PROCESS_INVALID; + CERROR("Dropping GET to %s: simulated failure\n", + libcfs_id2str(target)); + return -EIO; } - msg = lib_msg_alloc(nal); + msg = lnet_msg_alloc(); if (msg == NULL) { - CERROR("Dropping GET to "LPU64": ENOMEM on lib_msg_t\n", - id->nid); - return PTL_NO_SPACE; + CERROR("Dropping GET to %s: ENOMEM on lnet_msg_t\n", + libcfs_id2str(target)); + return -ENOMEM; } - LIB_LOCK(nal, flags); + LNET_LOCK(); - md = ptl_handle2md(mdh, nal); - if (md == NULL || !md->threshold) { - lib_msg_free(nal, msg); - LIB_UNLOCK(nal, flags); + md = lnet_handle2md(&mdh); + if (md == NULL || md->md_threshold == 0) { + lnet_msg_free(msg); + LNET_UNLOCK(); - return PTL_MD_INVALID; + CERROR("Dropping GET to %s: MD invalid\n", + libcfs_id2str(target)); + return -ENOENT; } - CDEBUG(D_NET, "PtlGet -> %Lu: %lu\n", (unsigned long long)id->nid, - (unsigned long)id->pid); + CDEBUG(D_NET, "LNetGet -> %s\n", libcfs_id2str(target)); - memset (&hdr, 0, sizeof (hdr)); - hdr.type = cpu_to_le32(PTL_MSG_GET); - hdr.dest_nid = cpu_to_le64(id->nid); - hdr.dest_pid = cpu_to_le32(id->pid); - hdr.src_nid = cpu_to_le64(ni->ni_pid.nid); - hdr.src_pid = cpu_to_le32(ni->ni_pid.pid); - hdr.payload_length = 0; + lnet_commit_md(md, msg); - /* NB handles only looked up by creator (no flips) */ - hdr.msg.get.return_wmd.wh_interface_cookie = ni->ni_interface_cookie; - hdr.msg.get.return_wmd.wh_object_cookie = md->md_lh.lh_cookie; + lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0); - hdr.msg.get.match_bits = cpu_to_le64(match_bits); - hdr.msg.get.ptl_index = cpu_to_le32(portal); - hdr.msg.get.src_offset = cpu_to_le32(offset); - hdr.msg.get.sink_length = cpu_to_le32(md->length); + msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits); + msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal); + msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset); + msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length); - lib_commit_md(nal, md, msg); + /* NB handles only looked up by creator (no flips) */ + msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie = + the_lnet.ln_interface_cookie; + msg->msg_hdr.msg.get.return_wmd.wh_object_cookie = + md->md_lh.lh_cookie; + + msg->msg_ev.type = LNET_EVENT_SEND; + msg->msg_ev.initiator.nid = LNET_NID_ANY; + msg->msg_ev.initiator.pid = the_lnet.ln_pid; + msg->msg_ev.target = target; + msg->msg_ev.pt_index = portal; + msg->msg_ev.match_bits = match_bits; + msg->msg_ev.rlength = md->md_length; + msg->msg_ev.mlength = md->md_length; + msg->msg_ev.offset = offset; + msg->msg_ev.hdr_data = 0; + + lnet_md_deconstruct(md, &msg->msg_ev.md); + 
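/* How an LND strings lnet_create_reply_msg() and lnet_set_reply_msg_len()
 * together for an RDMA-optimized GET, where no REPLY crosses the wire.  Per
 * the caveat above, the stand-in REPLY must be created before the GET is
 * finalized (finalizing frees it).  A condensed sketch; the callback shape
 * and names are hypothetical: */
static void
lnd_rdma_get_done(lnet_ni_t *ni, lnet_msg_t *getmsg,
                  unsigned int nob, int status)
{
        lnet_msg_t *reply = lnet_create_reply_msg(ni, getmsg);

        lnet_finalize(ni, getmsg, status);      /* completes the GET send */

        if (reply != NULL) {
                /* the peer RDMAed 'nob' bytes straight into the sink MD */
                lnet_set_reply_msg_len(ni, reply, nob);
                lnet_finalize(ni, reply, status);
        }
}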
lnet_md2handle(&msg->msg_ev.md_handle, md); + + the_lnet.ln_counters.send_count++; + + LNET_UNLOCK(); + + rc = lnet_send(self, msg); + if (rc < 0) { + CERROR("error sending GET to %s: %d\n", + libcfs_id2str(target), rc); + lnet_finalize (NULL, msg, rc); + } + + /* completion will be signalled by an event */ + return 0; +} - msg->ev.type = PTL_EVENT_SEND_END; - msg->ev.initiator = ni->ni_pid; - msg->ev.pt_index = portal; - msg->ev.match_bits = match_bits; - msg->ev.rlength = md->length; - msg->ev.mlength = md->length; - msg->ev.offset = offset; - msg->ev.hdr_data = 0; +int +LNetDist (lnet_nid_t dstnid, lnet_nid_t *srcnidp, int *orderp) +{ + struct list_head *e; + lnet_ni_t *ni; + lnet_route_t *route; + lnet_remotenet_t *rnet; + __u32 dstnet = LNET_NIDNET(dstnid); + int hops; + int order = 2; + + /* if !local_nid_dist_zero, I don't return a distance of 0 ever + * (when lustre sees a distance of 0, it substitutes 0@lo), so I + * keep order 0 free for 0@lo and order 1 free for a local NID + * match */ + + LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_refcount > 0); + + LNET_LOCK(); + + list_for_each (e, &the_lnet.ln_nis) { + ni = list_entry(e, lnet_ni_t, ni_list); + + if (ni->ni_nid == dstnid || + (the_lnet.ln_ptlcompat > 0 && + LNET_NIDNET(dstnid) == 0 && + LNET_NIDADDR(dstnid) == LNET_NIDADDR(ni->ni_nid) && + LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) != LOLND)) { + if (srcnidp != NULL) + *srcnidp = dstnid; + if (orderp != NULL) { + if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND) + *orderp = 0; + else + *orderp = 1; + } + LNET_UNLOCK(); - lib_md_deconstruct(nal, md, &msg->ev.md); - ptl_md2handle(&msg->ev.md_handle, nal, md); + return local_nid_dist_zero ? 0 : 1; + } - ni->ni_counters.send_count++; + if (LNET_NIDNET(ni->ni_nid) == dstnet || + (the_lnet.ln_ptlcompat > 0 && + dstnet == 0 && + LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) != LOLND)) { + if (srcnidp != NULL) + *srcnidp = ni->ni_nid; + if (orderp != NULL) + *orderp = order; + LNET_UNLOCK(); + return 1; + } - LIB_UNLOCK(nal, flags); + order++; + } - rc = lib_send (nal, NULL, msg, &hdr, PTL_MSG_GET, - id->nid, id->pid, NULL, 0, 0); - if (rc != PTL_OK) { - CERROR(LPU64": error sending GET to "LPU64": %d\n", - ni->ni_pid.nid, id->nid, rc); - lib_finalize (nal, NULL, msg, rc); + list_for_each (e, &the_lnet.ln_remote_nets) { + rnet = list_entry(e, lnet_remotenet_t, lrn_list); + + if (rnet->lrn_net == dstnet) { + LASSERT (!list_empty(&rnet->lrn_routes)); + route = list_entry(rnet->lrn_routes.next, + lnet_route_t, lr_list); + hops = rnet->lrn_hops; + if (srcnidp != NULL) + *srcnidp = route->lr_gateway->lp_ni->ni_nid; + if (orderp != NULL) + *orderp = order; + LNET_UNLOCK(); + return hops + 1; + } + order++; } - /* completion will be signalled by an event */ - return PTL_OK; + LNET_UNLOCK(); + return -EHOSTUNREACH; } -void lib_assert_wire_constants (void) -{ - /* Wire protocol assertions generated by 'wirecheck' - * running on Linux mdevi 2.4.21-p4smp-55chaos #1 SMP Tue Jun 8 14:38:44 PDT 2004 i686 i686 i - * with gcc version 3.2.3 20030502 (Red Hat Linux 3.2.3-34) */ - - - /* Constants... 
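/* Interpreting LNetDist() above: order 0 is reserved for 0@lo and order 1
 * for one of this node's own NIDs (so lustre can substitute 0@lo); a return
 * of 1 with a higher order means the destination net is directly attached;
 * hops + 1 means routed; -EHOSTUNREACH means nothing matches.  A small,
 * purely illustrative classifier: */
static const char *
lnet_dist_class(lnet_nid_t nid)
{
        lnet_nid_t src;
        int        order;
        int        dist = LNetDist(nid, &src, &order);

        if (dist < 0)
                return "unreachable";
        if (order <= 1)
                return "my own NID";            /* dist is 0 or 1 */
        if (dist == 1)
                return "directly attached net";
        return "via a router";                  /* dist == hops + 1 */
}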
*/ - LASSERT (PORTALS_PROTO_MAGIC == 0xeebc0ded); - LASSERT (PORTALS_PROTO_VERSION_MAJOR == 1); - LASSERT (PORTALS_PROTO_VERSION_MINOR == 0); - LASSERT (PTL_MSG_ACK == 0); - LASSERT (PTL_MSG_PUT == 1); - LASSERT (PTL_MSG_GET == 2); - LASSERT (PTL_MSG_REPLY == 3); - LASSERT (PTL_MSG_HELLO == 4); - - /* Checks for struct ptl_handle_wire_t */ - LASSERT ((int)sizeof(ptl_handle_wire_t) == 16); - LASSERT ((int)offsetof(ptl_handle_wire_t, wh_interface_cookie) == 0); - LASSERT ((int)sizeof(((ptl_handle_wire_t *)0)->wh_interface_cookie) == 8); - LASSERT ((int)offsetof(ptl_handle_wire_t, wh_object_cookie) == 8); - LASSERT ((int)sizeof(((ptl_handle_wire_t *)0)->wh_object_cookie) == 8); - - /* Checks for struct ptl_magicversion_t */ - LASSERT ((int)sizeof(ptl_magicversion_t) == 8); - LASSERT ((int)offsetof(ptl_magicversion_t, magic) == 0); - LASSERT ((int)sizeof(((ptl_magicversion_t *)0)->magic) == 4); - LASSERT ((int)offsetof(ptl_magicversion_t, version_major) == 4); - LASSERT ((int)sizeof(((ptl_magicversion_t *)0)->version_major) == 2); - LASSERT ((int)offsetof(ptl_magicversion_t, version_minor) == 6); - LASSERT ((int)sizeof(((ptl_magicversion_t *)0)->version_minor) == 2); - - /* Checks for struct ptl_hdr_t */ - LASSERT ((int)sizeof(ptl_hdr_t) == 72); - LASSERT ((int)offsetof(ptl_hdr_t, dest_nid) == 0); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->dest_nid) == 8); - LASSERT ((int)offsetof(ptl_hdr_t, src_nid) == 8); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->src_nid) == 8); - LASSERT ((int)offsetof(ptl_hdr_t, dest_pid) == 16); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->dest_pid) == 4); - LASSERT ((int)offsetof(ptl_hdr_t, src_pid) == 20); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->src_pid) == 4); - LASSERT ((int)offsetof(ptl_hdr_t, type) == 24); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->type) == 4); - LASSERT ((int)offsetof(ptl_hdr_t, payload_length) == 28); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->payload_length) == 4); - LASSERT ((int)offsetof(ptl_hdr_t, msg) == 32); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg) == 40); - - /* Ack */ - LASSERT ((int)offsetof(ptl_hdr_t, msg.ack.dst_wmd) == 32); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.ack.dst_wmd) == 16); - LASSERT ((int)offsetof(ptl_hdr_t, msg.ack.match_bits) == 48); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.ack.match_bits) == 8); - LASSERT ((int)offsetof(ptl_hdr_t, msg.ack.mlength) == 56); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.ack.mlength) == 4); - - /* Put */ - LASSERT ((int)offsetof(ptl_hdr_t, msg.put.ack_wmd) == 32); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.ack_wmd) == 16); - LASSERT ((int)offsetof(ptl_hdr_t, msg.put.match_bits) == 48); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.match_bits) == 8); - LASSERT ((int)offsetof(ptl_hdr_t, msg.put.hdr_data) == 56); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.hdr_data) == 8); - LASSERT ((int)offsetof(ptl_hdr_t, msg.put.ptl_index) == 64); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.ptl_index) == 4); - LASSERT ((int)offsetof(ptl_hdr_t, msg.put.offset) == 68); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.put.offset) == 4); - - /* Get */ - LASSERT ((int)offsetof(ptl_hdr_t, msg.get.return_wmd) == 32); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.return_wmd) == 16); - LASSERT ((int)offsetof(ptl_hdr_t, msg.get.match_bits) == 48); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.match_bits) == 8); - LASSERT ((int)offsetof(ptl_hdr_t, msg.get.ptl_index) == 56); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.ptl_index) == 4); - LASSERT ((int)offsetof(ptl_hdr_t, msg.get.src_offset) 
== 60); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.src_offset) == 4); - LASSERT ((int)offsetof(ptl_hdr_t, msg.get.sink_length) == 64); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.get.sink_length) == 4); - - /* Reply */ - LASSERT ((int)offsetof(ptl_hdr_t, msg.reply.dst_wmd) == 32); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.reply.dst_wmd) == 16); - - /* Hello */ - LASSERT ((int)offsetof(ptl_hdr_t, msg.hello.incarnation) == 32); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.hello.incarnation) == 8); - LASSERT ((int)offsetof(ptl_hdr_t, msg.hello.type) == 40); - LASSERT ((int)sizeof(((ptl_hdr_t *)0)->msg.hello.type) == 4); -} diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index 38904c4..a9834b5 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -22,31 +22,17 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET -#ifndef __KERNEL__ -# include -#else -# include -#endif - -#include +#include void -lib_enq_event_locked (lib_nal_t *nal, void *private, - lib_eq_t *eq, ptl_event_t *ev) +lnet_enq_event_locked (lnet_eq_t *eq, lnet_event_t *ev) { - ptl_event_t *eq_slot; + lnet_event_t *eq_slot; /* Allocate the next queue slot */ - ev->link = ev->sequence = eq->eq_enq_seq++; - /* NB we don't support START events yet and we don't create a separate - * UNLINK event unless an explicit unlink succeeds, so the link - * sequence is pretty useless */ - - /* We don't support different uid/jids yet */ - ev->uid = 0; - ev->jid = 0; + ev->sequence = eq->eq_enq_seq++; /* size must be a power of 2 to handle sequence # overflow */ LASSERT (eq->eq_size != 0 && @@ -54,7 +40,7 @@ lib_enq_event_locked (lib_nal_t *nal, void *private, eq_slot = eq->eq_events + (ev->sequence & (eq->eq_size - 1)); /* There is no race since both event consumers and event producers - * take the LIB_LOCK(), so we don't screw around with memory + * take the LNET_LOCK, so we don't screw around with memory * barriers, setting the sequence number last or weird structure * layout assertions. 
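/* Why eq_size must be a power of two, as asserted above: the ring slot is
 * chosen with sequence & (size - 1), which stays correct modulo size even
 * after the sequence counter wraps.  A standalone illustration: */
static unsigned int
eq_slot_of(unsigned long long sequence, unsigned int eq_size)
{
        /* precondition: eq_size != 0 && (eq_size & (eq_size - 1)) == 0 */
        return (unsigned int)(sequence & (eq_size - 1));
}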
*/ *eq_slot = *ev; @@ -63,85 +49,176 @@ lib_enq_event_locked (lib_nal_t *nal, void *private, if (eq->eq_callback != NULL) eq->eq_callback (eq_slot); - /* Wake anyone sleeping for an event (see lib-eq.c) */ #ifdef __KERNEL__ - if (cfs_waitq_active(&nal->libnal_ni.ni_waitq)) - cfs_waitq_broadcast(&nal->libnal_ni.ni_waitq); + /* Wake anyone waiting in LNetEQPoll() */ + if (cfs_waitq_active(&the_lnet.ln_waitq)) + cfs_waitq_broadcast(&the_lnet.ln_waitq); #else - pthread_cond_broadcast(&nal->libnal_ni.ni_cond); +# if !HAVE_LIBPTHREAD + /* LNetEQPoll() calls into _the_ LND to wait for action */ +# else + /* Wake anyone waiting in LNetEQPoll() */ + pthread_cond_broadcast(&the_lnet.ln_cond); +# endif #endif } void -lib_finalize (lib_nal_t *nal, void *private, lib_msg_t *msg, ptl_err_t status) +lnet_complete_msg_locked(lnet_msg_t *msg) +{ + lnet_handle_wire_t ack_wmd; + int rc; + int status = msg->msg_ev.status; + + LASSERT (msg->msg_onactivelist); + + if (status == 0 && msg->msg_ack) { + /* Only send an ACK if the PUT completed successfully */ + + lnet_return_credits_locked(msg); + + msg->msg_ack = 0; + LNET_UNLOCK(); + + LASSERT(msg->msg_ev.type == LNET_EVENT_PUT); + LASSERT(!msg->msg_routing); + + ack_wmd = msg->msg_hdr.msg.put.ack_wmd; + + lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0); + + msg->msg_hdr.msg.ack.dst_wmd = ack_wmd; + msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits; + msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength); + + rc = lnet_send(msg->msg_ev.target.nid, msg); + + LNET_LOCK(); + + if (rc == 0) + return; + } else if (status == 0 && /* OK so far */ + (msg->msg_routing && !msg->msg_sending)) { /* not forwarded */ + + LASSERT (!msg->msg_receiving); /* called back recv already */ + + LNET_UNLOCK(); + + rc = lnet_send(LNET_NID_ANY, msg); + + LNET_LOCK(); + + if (rc == 0) + return; + } + + lnet_return_credits_locked(msg); + + LASSERT (msg->msg_onactivelist); + msg->msg_onactivelist = 0; + list_del (&msg->msg_activelist); + the_lnet.ln_counters.msgs_alloc--; + lnet_msg_free(msg); +} + + +void +lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status) { - lib_md_t *md; - int unlink; - unsigned long flags; - int rc; - ptl_hdr_t ack; +#ifdef __KERNEL__ + int i; + int my_slot; +#endif + lnet_libmd_t *md; + + LASSERT (!in_interrupt ()); if (msg == NULL) return; +#if 0 + CDEBUG(D_WARNING, "%s msg->%s Flags:%s%s%s%s%s%s%s%s%s%s%s txp %s rxp %s\n", + lnet_msgtyp2str(msg->msg_type), libcfs_id2str(msg->msg_target), + msg->msg_target_is_router ? "t" : "", + msg->msg_routing ? "X" : "", + msg->msg_ack ? "A" : "", + msg->msg_sending ? "S" : "", + msg->msg_receiving ? "R" : "", + msg->msg_delayed ? "d" : "", + msg->msg_txcredit ? "C" : "", + msg->msg_peertxcredit ? "c" : "", + msg->msg_rtrcredit ? "F" : "", + msg->msg_peerrtrcredit ? "f" : "", + msg->msg_onactivelist ? "!" : "", + msg->msg_txpeer == NULL ? "" : libcfs_nid2str(msg->msg_txpeer->lp_nid), + msg->msg_rxpeer == NULL ? 
"" : libcfs_nid2str(msg->msg_rxpeer->lp_nid)); +#endif + LNET_LOCK(); - /* Only send an ACK if the PUT completed successfully */ - if (status == PTL_OK && - !ptl_is_wire_handle_none(&msg->ack_wmd)) { - - LASSERT(msg->ev.type == PTL_EVENT_PUT_END); - - memset (&ack, 0, sizeof (ack)); - ack.type = cpu_to_le32(PTL_MSG_ACK); - ack.dest_nid = cpu_to_le64(msg->ev.initiator.nid); - ack.dest_pid = cpu_to_le32(msg->ev.initiator.pid); - ack.src_nid = cpu_to_le64(nal->libnal_ni.ni_pid.nid); - ack.src_pid = cpu_to_le32(nal->libnal_ni.ni_pid.pid); - ack.payload_length = 0; - - ack.msg.ack.dst_wmd = msg->ack_wmd; - ack.msg.ack.match_bits = msg->ev.match_bits; - ack.msg.ack.mlength = cpu_to_le32(msg->ev.mlength); - - rc = lib_send (nal, private, NULL, &ack, PTL_MSG_ACK, - msg->ev.initiator.nid, msg->ev.initiator.pid, - NULL, 0, 0); - if (rc != PTL_OK) { - /* send failed: there's nothing else to clean up. */ - CERROR("Error %d sending ACK to "LPX64"\n", - rc, msg->ev.initiator.nid); - } - } + LASSERT (msg->msg_onactivelist); + + msg->msg_ev.status = status; - md = msg->md; + md = msg->msg_md; + if (md != NULL) { + int unlink; - LIB_LOCK(nal, flags); + /* Now it's safe to drop my caller's ref */ + md->md_refcount--; + LASSERT (md->md_refcount >= 0); - /* Now it's safe to drop my caller's ref */ - md->pending--; - LASSERT (md->pending >= 0); + unlink = lnet_md_unlinkable(md); + + msg->msg_ev.unlinked = unlink; + + if (md->md_eq != NULL) + lnet_enq_event_locked(md->md_eq, &msg->msg_ev); + + if (unlink) + lnet_md_unlink(md); + + msg->msg_md = NULL; + } - /* Should I unlink this MD? */ - if (md->pending != 0) /* other refs */ - unlink = 0; - else if ((md->md_flags & PTL_MD_FLAG_ZOMBIE) != 0) - unlink = 1; - else if ((md->md_flags & PTL_MD_FLAG_AUTO_UNLINK) == 0) - unlink = 0; - else - unlink = lib_md_exhausted(md); + list_add_tail (&msg->msg_list, &the_lnet.ln_finalizeq); - msg->ev.ni_fail_type = status; - msg->ev.unlinked = unlink; + /* Recursion breaker. Don't complete the message here if I am (or + * enough other threads are) already completing messages */ - if (md->eq != NULL) - lib_enq_event_locked(nal, private, md->eq, &msg->ev); +#ifdef __KERNEL__ + my_slot = -1; + for (i = 0; i < the_lnet.ln_nfinalizers; i++) { + if (the_lnet.ln_finalizers[i] == cfs_current()) + goto out; + if (my_slot < 0 && the_lnet.ln_finalizers[i] == NULL) + my_slot = i; + } + if (my_slot < 0) + goto out; - if (unlink) - lib_md_unlink(nal, md); + the_lnet.ln_finalizers[my_slot] = cfs_current(); +#else + if (the_lnet.ln_finalizing) + goto out; +#endif - list_del (&msg->msg_list); - nal->libnal_ni.ni_counters.msgs_alloc--; - lib_msg_free(nal, msg); + while (!list_empty(&the_lnet.ln_finalizeq)) { + msg = list_entry(the_lnet.ln_finalizeq.next, + lnet_msg_t, msg_list); + + list_del(&msg->msg_list); - LIB_UNLOCK(nal, flags); + /* NB drops and regains the lnet lock if it actually does + * anything, so my finalizing friends can chomp along too */ + lnet_complete_msg_locked(msg); + } + +#ifdef __KERNEL__ + the_lnet.ln_finalizers[my_slot] = NULL; +#else + the_lnet.ln_finalizing = 0; +#endif + + out: + LNET_UNLOCK(); } + diff --git a/lnet/lnet/lib-ni.c b/lnet/lnet/lib-ni.c deleted file mode 100644 index e45859a..0000000 --- a/lnet/lnet/lib-ni.c +++ /dev/null @@ -1,29 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * This file is part of Lustre, http://www.lustre.org - * This file is not subject to copyright protection. 
- */ - -#define DEBUG_SUBSYSTEM S_PORTALS -#include - -int lib_api_ni_status (nal_t *apinal, ptl_sr_index_t sr_idx, - ptl_sr_value_t *status) -{ - return PTL_FAIL; -} - - -int lib_api_ni_dist (nal_t *apinal, ptl_process_id_t *pid, unsigned long *dist) -{ - lib_nal_t *nal = apinal->nal_data; - - if (nal->libnal_ni.ni_loopback && - pid->nid == nal->libnal_ni.ni_pid.nid) { - *dist = 0; - return PTL_OK; - } - - return (nal->libnal_dist(nal, pid->nid, dist)); -} diff --git a/lnet/lnet/lib-pid.c b/lnet/lnet/lib-pid.c deleted file mode 100644 index 23d6dd3..0000000 --- a/lnet/lnet/lib-pid.c +++ /dev/null @@ -1,20 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * This file is part of Lustre, http://www.lustre.org - * This file is not subject to copyright protection. - */ - -/* This should be removed. The NAL should have the PID information */ -#define DEBUG_SUBSYSTEM S_PORTALS - -#include - -int -lib_api_get_id(nal_t *apinal, ptl_process_id_t *pid) -{ - lib_nal_t *nal = apinal->nal_data; - - *pid = nal->libnal_ni.ni_pid; - return PTL_OK; -} diff --git a/lnet/lnet/lo.c b/lnet/lnet/lo.c new file mode 100644 index 0000000..e123b3d --- /dev/null +++ b/lnet/lnet/lo.c @@ -0,0 +1,112 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + +int +lolnd_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + LASSERT (!lntmsg->msg_routing); + LASSERT (!lntmsg->msg_target_is_router); + + return lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg, 0); +} + +int +lolnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + lnet_msg_t *sendmsg = private; + + if (lntmsg != NULL) { /* not discarding */ + if (sendmsg->msg_iov != NULL) { + if (iov != NULL) + lnet_copy_iov2iov(niov, iov, offset, + sendmsg->msg_niov, + sendmsg->msg_iov, + sendmsg->msg_offset, mlen); + else + lnet_copy_iov2kiov(niov, kiov, offset, + sendmsg->msg_niov, + sendmsg->msg_iov, + sendmsg->msg_offset, mlen); + } else { + if (iov != NULL) + lnet_copy_kiov2iov(niov, iov, offset, + sendmsg->msg_niov, + sendmsg->msg_kiov, + sendmsg->msg_offset, mlen); + else + lnet_copy_kiov2kiov(niov, kiov, offset, + sendmsg->msg_niov, + sendmsg->msg_kiov, + sendmsg->msg_offset, mlen); + } + + lnet_finalize(ni, lntmsg, 0); + } + + lnet_finalize(ni, sendmsg, 0); + return 0; +} + +static int lolnd_instanced; + +void +lolnd_shutdown(lnet_ni_t *ni) +{ + CDEBUG (D_NET, "shutdown\n"); + LASSERT (lolnd_instanced); + + lolnd_instanced = 0; +} + +int +lolnd_startup (lnet_ni_t *ni) +{ + LASSERT (ni->ni_lnd == &the_lolnd); + LASSERT (!lolnd_instanced); + lolnd_instanced = 1; + + return (0); +} + +lnd_t the_lolnd = { + /* .lnd_list = */ {&the_lolnd.lnd_list, &the_lolnd.lnd_list}, + /* .lnd_refcount = */ 0, + /* .lnd_type = */ LOLND, + /* .lnd_startup = */ lolnd_startup, + /* .lnd_shutdown = */ lolnd_shutdown, + /* .lnt_ctl = */ NULL, + /* .lnd_send = */ lolnd_send, + /* .lnd_recv = */ lolnd_recv, + /* .lnd_eager_recv = */ NULL, + /* .lnd_notify = */ NULL, +#ifdef __KERNEL__ + /* .lnd_accept = */ NULL +#else + /* .lnd_wait = */ NULL +#endif +}; + diff --git a/lnet/lnet/module.c b/lnet/lnet/module.c index 472175b..eff8daa 100644 --- a/lnet/lnet/module.c +++ b/lnet/lnet/module.c @@ -22,196 +22,162 @@ #ifndef EXPORT_SYMTAB # define EXPORT_SYMTAB #endif -#define DEBUG_SUBSYSTEM S_PORTALS +#define DEBUG_SUBSYSTEM S_LNET +#include -#include -#include -#include -#include -#include +static int config_on_load = 0; +CFS_MODULE_PARM(config_on_load, "i", int, 0444, + "configure network at module load"); -extern void (kping_client)(struct portal_ioctl_data *); +static struct semaphore lnet_config_mutex; -static int kportal_ioctl(struct portal_ioctl_data *data, - unsigned int cmd, unsigned long arg) +int +lnet_configure (void *arg) { - int err; - char str[PTL_NALFMT_SIZE]; - ENTRY; + /* 'arg' only there so I can be passed to cfs_kernel_thread() */ + int rc = 0; - switch (cmd) { - case IOC_PORTAL_PING: { - void (*ping)(struct portal_ioctl_data *); - - CDEBUG(D_IOCTL, "doing %d pings to nid "LPX64" (%s)\n", - data->ioc_count, data->ioc_nid, - portals_nid2str(data->ioc_nal, data->ioc_nid, str)); - ping = PORTAL_SYMBOL_GET(kping_client); - if (!ping) - CERROR("PORTAL_SYMBOL_GET failed\n"); - else { - ping(data); - PORTAL_SYMBOL_PUT(kping_client); + LNET_MUTEX_DOWN(&lnet_config_mutex); + + if (!the_lnet.ln_niinit_self) { + rc = LNetNIInit(LUSTRE_SRV_LNET_PID); + if (rc >= 0) { + the_lnet.ln_niinit_self = 1; + rc = 0; } - RETURN(0); } - case IOC_PORTAL_GET_NID: { - ptl_handle_ni_t nih; - ptl_process_id_t pid; - - CDEBUG (D_IOCTL, "Getting nid for nal [%x]\n", data->ioc_nal); - - err = PtlNIInit(data->ioc_nal, 
LUSTRE_SRV_PTL_PID, NULL, - NULL, &nih); - if (!(err == PTL_OK || err == PTL_IFACE_DUP)) - RETURN (-EINVAL); - - err = PtlGetId (nih, &pid); - LASSERT (err == PTL_OK); + LNET_MUTEX_UP(&lnet_config_mutex); + return rc; +} - PtlNIFini(nih); +int +lnet_unconfigure (void) +{ + int refcount; + + LNET_MUTEX_DOWN(&lnet_config_mutex); - data->ioc_nid = pid.nid; - if (copy_to_user ((char *)arg, data, sizeof (*data))) - RETURN (-EFAULT); - RETURN(0); + if (the_lnet.ln_niinit_self) { + the_lnet.ln_niinit_self = 0; + LNetNIFini(); } - case IOC_PORTAL_FAIL_NID: { - ptl_handle_ni_t nih; - - CDEBUG (D_IOCTL, "fail nid: [%d] "LPU64" count %d\n", - data->ioc_nal, data->ioc_nid, data->ioc_count); - - err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, - NULL, &nih); - if (!(err == PTL_OK || err == PTL_IFACE_DUP)) - return (-EINVAL); - - if (err == PTL_OK) { - /* There's no point in failing an interface that - * came into existance just for this */ - err = -EINVAL; - } else { - err = PtlFailNid (nih, data->ioc_nid, data->ioc_count); - if (err != PTL_OK) - err = -EINVAL; - } + LNET_MUTEX_DOWN(&the_lnet.ln_api_mutex); + refcount = the_lnet.ln_refcount; + LNET_MUTEX_UP(&the_lnet.ln_api_mutex); - PtlNIFini(nih); - RETURN (err); - } + LNET_MUTEX_UP(&lnet_config_mutex); + return (refcount == 0) ? 0 : -EBUSY; +} - case IOC_PORTAL_LOOPBACK: { - ptl_handle_ni_t nih; - int enabled = data->ioc_flags; - int set = data->ioc_misc; - - CDEBUG (D_IOCTL, "loopback: [%d] %d %d\n", - data->ioc_nal, enabled, set); - - err = PtlNIInit(data->ioc_nal, LUSTRE_SRV_PTL_PID, NULL, - NULL, &nih); - if (!(err == PTL_OK || err == PTL_IFACE_DUP)) - return (-EINVAL); - - if (err == PTL_OK) { - /* There's no point in failing an interface that - * came into existance just for this */ - err = -EINVAL; - } else { - err = PtlLoopback (nih, set, &enabled); - if (err != PTL_OK) { - err = -EINVAL; - } else { - data->ioc_flags = enabled; - if (copy_to_user ((char *)arg, data, - sizeof (*data))) - err = -EFAULT; - else - err = 0; - } - } +int +lnet_ioctl(unsigned int cmd, struct libcfs_ioctl_data *data) +{ + int rc; - PtlNIFini(nih); - RETURN (err); - } + switch (cmd) { + case IOC_LIBCFS_CONFIGURE: + return lnet_configure(NULL); + + case IOC_LIBCFS_UNCONFIGURE: + return lnet_unconfigure(); + default: - RETURN(-EINVAL); + /* Passing LNET_PID_ANY only gives me a ref if the net is up + * already; I'll need it to ensure the net can't go down while + * I'm called into it */ + rc = LNetNIInit(LNET_PID_ANY); + if (rc >= 0) { + rc = LNetCtl(cmd, data); + LNetNIFini(); + } + return rc; } - /* Not Reached */ } -DECLARE_IOCTL_HANDLER(kportal_ioctl_handler, kportal_ioctl); -extern struct semaphore ptl_mutex; +DECLARE_IOCTL_HANDLER(lnet_ioctl_handler, lnet_ioctl); -static int init_kportals_module(void) +int +init_lnet(void) { - int rc; + int rc; ENTRY; - init_mutex(&ptl_mutex); - rc = PtlInit(NULL); - if (rc) { - CERROR("PtlInit: error %d\n", rc); + init_mutex(&lnet_config_mutex); + + rc = LNetInit(); + if (rc != 0) { + CERROR("LNetInit: error %d\n", rc); RETURN(rc); } - rc = libcfs_register_ioctl(&kportal_ioctl_handler); + rc = libcfs_register_ioctl(&lnet_ioctl_handler); LASSERT (rc == 0); - RETURN(rc); + if (config_on_load) { + /* Have to schedule a separate thread to avoid deadlocking + * in modload */ + (void) cfs_kernel_thread(lnet_configure, NULL, 0); + } + + RETURN(0); } -static void exit_kportals_module(void) +void +fini_lnet(void) { int rc; - rc = libcfs_deregister_ioctl(&kportal_ioctl_handler); + rc = libcfs_deregister_ioctl(&lnet_ioctl_handler); 
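/* The default branch of lnet_ioctl() above relies on a neat idiom:
 * LNetNIInit(LNET_PID_ANY) takes a reference only if the network is already
 * up, pinning it for the duration of the call, and LNetNIFini() drops that
 * reference again.  The same shape as a generic wrapper -- our helper, same
 * calls: */
static int
lnet_with_net_pinned(int (*fn)(void *), void *arg)
{
        int rc = LNetNIInit(LNET_PID_ANY);      /* ref iff the net is up */

        if (rc < 0)
                return rc;

        rc = fn(arg);
        LNetNIFini();                           /* drop the transient ref */
        return rc;
}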
LASSERT (rc == 0); - PtlFini(); + LNetFini(); } -EXPORT_SYMBOL(ptl_register_nal); -EXPORT_SYMBOL(ptl_unregister_nal); - -EXPORT_SYMBOL(ptl_err_str); -EXPORT_SYMBOL(PtlMEAttach); -EXPORT_SYMBOL(PtlMEInsert); -EXPORT_SYMBOL(PtlMEUnlink); -EXPORT_SYMBOL(PtlEQAlloc); -EXPORT_SYMBOL(PtlMDAttach); -EXPORT_SYMBOL(PtlMDUnlink); -EXPORT_SYMBOL(PtlNIInit); -EXPORT_SYMBOL(PtlNIFini); -EXPORT_SYMBOL(PtlInit); -EXPORT_SYMBOL(PtlFini); -EXPORT_SYMBOL(PtlSnprintHandle); -EXPORT_SYMBOL(PtlPut); -EXPORT_SYMBOL(PtlGet); -EXPORT_SYMBOL(PtlEQWait); -EXPORT_SYMBOL(PtlEQFree); -EXPORT_SYMBOL(PtlEQGet); -EXPORT_SYMBOL(PtlGetId); -EXPORT_SYMBOL(PtlMDBind); -EXPORT_SYMBOL(lib_iov_nob); -EXPORT_SYMBOL(lib_copy_iov2buf); -EXPORT_SYMBOL(lib_copy_buf2iov); -EXPORT_SYMBOL(lib_extract_iov); -EXPORT_SYMBOL(lib_kiov_nob); -EXPORT_SYMBOL(lib_copy_kiov2buf); -EXPORT_SYMBOL(lib_copy_buf2kiov); -EXPORT_SYMBOL(lib_extract_kiov); -EXPORT_SYMBOL(lib_finalize); -EXPORT_SYMBOL(lib_parse); -EXPORT_SYMBOL(lib_create_reply_msg); -EXPORT_SYMBOL(lib_init); -EXPORT_SYMBOL(lib_fini); +EXPORT_SYMBOL(lnet_register_lnd); +EXPORT_SYMBOL(lnet_unregister_lnd); + +EXPORT_SYMBOL(LNetMEAttach); +EXPORT_SYMBOL(LNetMEInsert); +EXPORT_SYMBOL(LNetMEUnlink); +EXPORT_SYMBOL(LNetEQAlloc); +EXPORT_SYMBOL(LNetMDAttach); +EXPORT_SYMBOL(LNetMDUnlink); +EXPORT_SYMBOL(LNetNIInit); +EXPORT_SYMBOL(LNetNIFini); +EXPORT_SYMBOL(LNetInit); +EXPORT_SYMBOL(LNetFini); +EXPORT_SYMBOL(LNetSnprintHandle); +EXPORT_SYMBOL(LNetPut); +EXPORT_SYMBOL(LNetGet); +EXPORT_SYMBOL(LNetEQWait); +EXPORT_SYMBOL(LNetEQFree); +EXPORT_SYMBOL(LNetEQGet); +EXPORT_SYMBOL(LNetGetId); +EXPORT_SYMBOL(LNetMDBind); +EXPORT_SYMBOL(LNetDist); +EXPORT_SYMBOL(LNetCtl); +EXPORT_SYMBOL(LNetSetLazyPortal); +EXPORT_SYMBOL(LNetClearLazyPortal); +EXPORT_SYMBOL(the_lnet); +EXPORT_SYMBOL(lnet_iov_nob); +EXPORT_SYMBOL(lnet_extract_iov); +EXPORT_SYMBOL(lnet_kiov_nob); +EXPORT_SYMBOL(lnet_extract_kiov); +EXPORT_SYMBOL(lnet_copy_iov2iov); +EXPORT_SYMBOL(lnet_copy_iov2kiov); +EXPORT_SYMBOL(lnet_copy_kiov2iov); +EXPORT_SYMBOL(lnet_copy_kiov2kiov); +EXPORT_SYMBOL(lnet_finalize); +EXPORT_SYMBOL(lnet_parse); +EXPORT_SYMBOL(lnet_create_reply_msg); +EXPORT_SYMBOL(lnet_set_reply_msg_len); +EXPORT_SYMBOL(lnet_msgtyp2str); +EXPORT_SYMBOL(lnet_net2ni_locked); MODULE_AUTHOR("Peter J. Braam "); MODULE_DESCRIPTION("Portals v3.1"); MODULE_LICENSE("GPL"); -cfs_module(portals, "1.0.0", init_kportals_module, exit_kportals_module); +cfs_module(lnet, "1.0.0", init_lnet, fini_lnet); diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c new file mode 100644 index 0000000..6ac1d1e --- /dev/null +++ b/lnet/lnet/peer.c @@ -0,0 +1,244 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lib/lib-move.c + * Data movement routines + * + * Copyright (c) 2001-2003 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
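/* module.c above exports the whole LNet API to other kernel modules; an LND
 * registers itself through lnet_register_lnd().  A minimal skeleton of such
 * a module -- the registration signature is assumed here, and my_lnd's
 * hooks (startup, shutdown, send, recv) would be filled in the way
 * the_lolnd is above: */
static lnd_t my_lnd;    /* .lnd_type, .lnd_startup, ... set elsewhere */

static int
my_lnd_module_init(void)
{
        lnet_register_lnd(&my_lnd);
        return 0;
}

static void
my_lnd_module_fini(void)
{
        lnet_unregister_lnd(&my_lnd);
}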
+ */ + +#define DEBUG_SUBSYSTEM S_LNET + +#include + +int +lnet_create_peer_table(void) +{ + struct list_head *hash; + int i; + + LASSERT (the_lnet.ln_peer_hash == NULL); + LIBCFS_ALLOC(hash, LNET_PEER_HASHSIZE * sizeof(struct list_head)); + + if (hash == NULL) { + CERROR("Can't allocate peer hash table\n"); + return -ENOMEM; + } + + for (i = 0; i < LNET_PEER_HASHSIZE; i++) + CFS_INIT_LIST_HEAD(&hash[i]); + + the_lnet.ln_peer_hash = hash; + return 0; +} + +void +lnet_destroy_peer_table(void) +{ + int i; + + if (the_lnet.ln_peer_hash == NULL) + return; + + for (i = 0; i < LNET_PEER_HASHSIZE; i++) + LASSERT (list_empty(&the_lnet.ln_peer_hash[i])); + + LIBCFS_FREE(the_lnet.ln_peer_hash, + LNET_PEER_HASHSIZE * sizeof (struct list_head)); + the_lnet.ln_peer_hash = NULL; +} + +void +lnet_clear_peer_table(void) +{ + int i; + + LASSERT (the_lnet.ln_shutdown); /* i.e. no new peers */ + + for (i = 0; i < LNET_PEER_HASHSIZE; i++) { + struct list_head *peers = &the_lnet.ln_peer_hash[i]; + + LNET_LOCK(); + while (!list_empty(peers)) { + lnet_peer_t *lp = list_entry(peers->next, + lnet_peer_t, lp_hashlist); + + list_del(&lp->lp_hashlist); + lnet_peer_decref_locked(lp); /* lose hash table's ref */ + } + LNET_UNLOCK(); + } + + LNET_LOCK(); + for (i = 3; the_lnet.ln_npeers != 0;i++) { + LNET_UNLOCK(); + + if ((i & (i-1)) == 0) + CDEBUG(D_WARNING,"Waiting for %d peers\n", + the_lnet.ln_npeers); + cfs_pause(cfs_time_seconds(1)); + + LNET_LOCK(); + } + LNET_UNLOCK(); +} + +void +lnet_destroy_peer_locked (lnet_peer_t *lp) +{ + lnet_ni_decref_locked(lp->lp_ni); + LNET_UNLOCK(); + + LASSERT (lp->lp_refcount == 0); + LASSERT (lp->lp_rtr_refcount == 0); + LASSERT (list_empty(&lp->lp_txq)); + LASSERT (lp->lp_txqnob == 0); + + LIBCFS_FREE(lp, sizeof(*lp)); + + LNET_LOCK(); + + LASSERT(the_lnet.ln_npeers > 0); + the_lnet.ln_npeers--; +} + +lnet_peer_t * +lnet_find_peer_locked (lnet_nid_t nid) +{ + unsigned int idx = LNET_NIDADDR(nid) % LNET_PEER_HASHSIZE; + struct list_head *peers = &the_lnet.ln_peer_hash[idx]; + struct list_head *tmp; + lnet_peer_t *lp; + + if (the_lnet.ln_shutdown) + return NULL; + + list_for_each (tmp, peers) { + lp = list_entry(tmp, lnet_peer_t, lp_hashlist); + + if (lp->lp_nid == nid) { + lnet_peer_addref_locked(lp); + return lp; + } + } + + return NULL; +} + +int +lnet_nid2peer_locked(lnet_peer_t **lpp, lnet_nid_t nid) +{ + lnet_peer_t *lp; + lnet_peer_t *lp2; + + lp = lnet_find_peer_locked(nid); + if (lp != NULL) { + *lpp = lp; + return 0; + } + + LNET_UNLOCK(); + + LIBCFS_ALLOC(lp, sizeof(*lp)); + if (lp == NULL) { + *lpp = NULL; + LNET_LOCK(); + return -ENOMEM; + } + + memset(lp, 0, sizeof(*lp)); /* zero counters etc */ + + CFS_INIT_LIST_HEAD(&lp->lp_txq); + CFS_INIT_LIST_HEAD(&lp->lp_rtrq); + + lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! */ + lp->lp_notify = 0; + lp->lp_notifylnd = 0; + lp->lp_notifying = 0; + lp->lp_alive_count = 0; + lp->lp_timestamp = 0; + lp->lp_ping_timestamp = 0; + lp->lp_nid = nid; + lp->lp_refcount = 2; /* 1 for caller; 1 for hash */ + lp->lp_rtr_refcount = 0; + + LNET_LOCK(); + + lp2 = lnet_find_peer_locked(nid); + if (lp2 != NULL) { + LNET_UNLOCK(); + LIBCFS_FREE(lp, sizeof(*lp)); + LNET_LOCK(); + + *lpp = lp2; + return 0; + } + + lp->lp_ni = lnet_net2ni_locked(LNET_NIDNET(nid)); + if (lp->lp_ni == NULL) { + LNET_UNLOCK(); + LIBCFS_FREE(lp, sizeof(*lp)); + LNET_LOCK(); + + *lpp = NULL; + return the_lnet.ln_shutdown ? 
-ESHUTDOWN : -EHOSTUNREACH; + } + + lp->lp_txcredits = + lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits; + + /* As a first approximation; allow this peer the same number of router + * buffers as it is allowed outstanding sends */ + lp->lp_rtrcredits = lp->lp_minrtrcredits = lp->lp_txcredits; + + LASSERT (!the_lnet.ln_shutdown); + /* can't add peers after shutdown starts */ + + list_add_tail(&lp->lp_hashlist, lnet_nid2peerhash(nid)); + the_lnet.ln_npeers++; + the_lnet.ln_peertable_version++; + *lpp = lp; + return 0; +} + +void +lnet_debug_peer(lnet_nid_t nid) +{ + int rc; + lnet_peer_t *lp; + + LNET_LOCK(); + + rc = lnet_nid2peer_locked(&lp, nid); + if (rc != 0) { + LNET_UNLOCK(); + CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid)); + return; + } + + CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n", + libcfs_nid2str(lp->lp_nid), lp->lp_refcount, + lp->lp_alive ? "up" : "down", + lp->lp_ni->ni_peertxcredits, + lp->lp_rtrcredits, lp->lp_minrtrcredits, + lp->lp_txcredits, lp->lp_mintxcredits, lp->lp_txqnob); + + lnet_peer_decref_locked(lp); + + LNET_UNLOCK(); +} diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c new file mode 100644 index 0000000..aec3d06 --- /dev/null +++ b/lnet/lnet/router.c @@ -0,0 +1,1135 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
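/* lnet_nid2peer_locked() above is the classic "drop the lock to allocate,
 * retake it and re-check" pattern: LIBCFS_ALLOC() may sleep, so the lock is
 * released around it, a racing thread may create the same peer meanwhile,
 * and the loser frees its fresh allocation.  Skeleton with hypothetical
 * names: */
struct xpeer { unsigned long key; };

struct xpeer *xfind_locked(unsigned long key);
struct xpeer *xalloc(unsigned long key);        /* may sleep */
void xfree(struct xpeer *p);
void xinsert_locked(struct xpeer *p);
void xlock(void);
void xunlock(void);

static struct xpeer *
xlookup_or_create(unsigned long key)            /* called with lock held */
{
        struct xpeer *p = xfind_locked(key);
        struct xpeer *p2;

        if (p != NULL)
                return p;

        xunlock();
        p = xalloc(key);                        /* lock dropped: may sleep */
        xlock();

        if (p == NULL)
                return NULL;

        p2 = xfind_locked(key);                 /* re-check after re-lock */
        if (p2 != NULL) {
                xunlock();
                xfree(p);                       /* we lost the race */
                xlock();
                return p2;
        }

        xinsert_locked(p);
        return p;
}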
+ * + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include + +#if defined(__KERNEL__) && defined(LNET_ROUTER) + +static char *forwarding = ""; +CFS_MODULE_PARM(forwarding, "s", charp, 0444, + "Explicitly enable/disable forwarding between networks"); + +static int tiny_router_buffers = 512; +CFS_MODULE_PARM(tiny_router_buffers, "i", int, 0444, + "# of 0 payload messages to buffer in the router"); +static int small_router_buffers = 256; +CFS_MODULE_PARM(small_router_buffers, "i", int, 0444, + "# of small (1 page) messages to buffer in the router"); +static int large_router_buffers = 32; +CFS_MODULE_PARM(large_router_buffers, "i", int, 0444, + "# of large messages to buffer in the router"); + +static int auto_down = 1; +CFS_MODULE_PARM(auto_down, "i", int, 0444, + "Automatically mark peers down on comms error"); + +static int check_routers_before_use = 0; +CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444, + "Assume routers are down and ping them before use"); + +static int dead_router_check_interval = 0; +CFS_MODULE_PARM(dead_router_check_interval, "i", int, 0444, + "Seconds between dead router health checks (<= 0 to disable)"); + +static int live_router_check_interval = 0; +CFS_MODULE_PARM(live_router_check_interval, "i", int, 0444, + "Seconds between live router health checks (<= 0 to disable)"); + +static int router_ping_timeout = 50; +CFS_MODULE_PARM(router_ping_timeout, "i", int, 0444, + "Seconds to wait for the reply to a router health query"); + +typedef struct +{ + work_struct_t kpru_tq; + lnet_nid_t kpru_nid; + int kpru_alive; + time_t kpru_when; +} kpr_upcall_t; + +void +kpr_do_upcall (void *arg) +{ + kpr_upcall_t *u = (kpr_upcall_t *)arg; + +#ifndef __WINNT__ + + char nidstr[36]; + char whenstr[36]; + char *argv[] = { + NULL, + "ROUTER_NOTIFY", + nidstr, + u->kpru_alive ? "up" : "down", + whenstr, + NULL}; + + snprintf (nidstr, sizeof(nidstr), "%s", libcfs_nid2str(u->kpru_nid)); + snprintf (whenstr, sizeof(whenstr), "%ld", u->kpru_when); + + libcfs_run_upcall (argv); + +#endif /* __WINNT__ */ + + LIBCFS_FREE(u, sizeof(*u)); +} + +void +kpr_upcall (lnet_nid_t gw_nid, int alive, time_t when) +{ + /* May be in arbitrary context */ + kpr_upcall_t *u; + + LIBCFS_ALLOC_ATOMIC(u, sizeof(*u)); + if (u == NULL) { + CERROR ("Upcall out of memory: nid %s %s\n", + libcfs_nid2str(gw_nid), alive ? "up" : "down"); + return; + } + + u->kpru_nid = gw_nid; + u->kpru_alive = alive; + u->kpru_when = when; + + prepare_work (&u->kpru_tq, kpr_do_upcall, u); + schedule_work (&u->kpru_tq); +} + +int +lnet_peers_start_down(void) +{ + return check_routers_before_use; +} + +void +lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, time_t when) +{ + if (when < lp->lp_timestamp) { /* out of date information */ + CDEBUG(D_NET, "Out of date\n"); + return; + } + + lp->lp_timestamp = when; /* update timestamp */ + lp->lp_ping_deadline = 0; /* disable ping timeout */ + + if (lp->lp_alive_count != 0 && /* got old news */ + (!lp->lp_alive) == (!alive)) { /* new date for old news */ + CDEBUG(D_NET, "Old news\n"); + return; + } + + /* Flag that notification is outstanding */ + + lp->lp_alive_count++; + lp->lp_alive = !(!alive); /* 1 bit! */ + lp->lp_notify = 1; + lp->lp_notifylnd = notifylnd; + + CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive); +} + +void +lnet_do_notify (lnet_peer_t *lp) +{ + lnet_ni_t *ni = lp->lp_ni; + int alive; + time_t when; + int lnd; + + LNET_LOCK(); + + /* Notify only in 1 thread at any time to ensure ordered notification. 
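+ * (e.g. a peer that flaps down/up/down while one notification is being
+ * delivered may have the intermediate 'up' coalesced away entirely)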
+ * NB individual events can be missed; the only guarantee is that you + * always get the most recent news */ + + if (lp->lp_notifying) { + LNET_UNLOCK(); + return; + } + + lp->lp_notifying = 1; + + while (lp->lp_notify) { + alive = lp->lp_alive; + when = lp->lp_timestamp; + lnd = lp->lp_notifylnd; + + lp->lp_notify = 0; + + LNET_UNLOCK(); + + /* A new notification could happen now; I'll handle it when + * control returns to me */ + + if (!lnd) { + CDEBUG(D_NET, "Upcall: NID %s is %s\n", + libcfs_nid2str(lp->lp_nid), + alive ? "alive" : "dead"); + kpr_upcall(lp->lp_nid, alive, when); + } else { + if (ni->ni_lnd->lnd_notify != NULL) + (ni->ni_lnd->lnd_notify)(ni, lp->lp_nid, alive); + } + + LNET_LOCK(); + } + + lp->lp_notifying = 0; + + LNET_UNLOCK(); +} + +int +lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when) +{ + lnet_peer_t *lp = NULL; + time_t now = cfs_time_current_sec(); + + LASSERT (!in_interrupt ()); + + CDEBUG (D_NET, "%s notifying %s: %s\n", + (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(nid), + alive ? "up" : "down"); + + if (ni != NULL && + LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) { + CWARN ("Ignoring notification of %s %s by %s (different net)\n", + libcfs_nid2str(nid), alive ? "birth" : "death", + libcfs_nid2str(ni->ni_nid)); + return -EINVAL; + } + + /* can't do predictions... */ + if (when > now) { + CWARN ("Ignoring prediction from %s of %s %s " + "%ld seconds in the future\n", + (ni == NULL) ? "userspace" : libcfs_nid2str(ni->ni_nid), + libcfs_nid2str(nid), alive ? "up" : "down", + when - now); + return -EINVAL; + } + + if (ni != NULL && !alive && /* LND telling me she's down */ + !auto_down) { /* auto-down disabled */ + CDEBUG(D_NET, "Auto-down disabled\n"); + return 0; + } + + LNET_LOCK(); + + lp = lnet_find_peer_locked(nid); + if (lp == NULL) { + /* nid not found */ + LNET_UNLOCK(); + CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid)); + return 0; + } + + lnet_notify_locked(lp, ni == NULL, alive, when); + + LNET_UNLOCK(); + + lnet_do_notify(lp); + + LNET_LOCK(); + + lnet_peer_decref_locked(lp); + + LNET_UNLOCK(); + return 0; +} +EXPORT_SYMBOL(lnet_notify); + +#else + +int +lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when) +{ + return -EOPNOTSUPP; +} + +#endif + +static void +lnet_rtr_addref_locked(lnet_peer_t *lp) +{ + LASSERT (lp->lp_refcount > 0); + LASSERT (lp->lp_rtr_refcount >= 0); + + lp->lp_rtr_refcount++; + if (lp->lp_rtr_refcount == 1) { + struct list_head *pos; + + /* a simple insertion sort */ + list_for_each_prev(pos, &the_lnet.ln_routers) { + lnet_peer_t *rtr = list_entry(pos, lnet_peer_t, + lp_rtr_list); + + if (rtr->lp_nid < lp->lp_nid) + break; + } + + list_add(&lp->lp_rtr_list, pos); + /* addref for the_lnet.ln_routers */ + lnet_peer_addref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +static void +lnet_rtr_decref_locked(lnet_peer_t *lp) +{ + LASSERT (lp->lp_refcount > 0); + LASSERT (lp->lp_rtr_refcount > 0); + + lp->lp_rtr_refcount--; + if (lp->lp_rtr_refcount == 0) { + list_del(&lp->lp_rtr_list); + /* decref for the_lnet.ln_routers */ + lnet_peer_decref_locked(lp); + the_lnet.ln_routers_version++; + } +} + +lnet_remotenet_t * +lnet_find_net_locked (__u32 net) +{ + lnet_remotenet_t *rnet; + struct list_head *tmp; + + LASSERT (!the_lnet.ln_shutdown); + + list_for_each (tmp, &the_lnet.ln_remote_nets) { + rnet = list_entry(tmp, lnet_remotenet_t, lrn_list); + + if (rnet->lrn_net == net) + return rnet; + } + return NULL; +} + +int +lnet_add_route (__u32 net, unsigned int hops, 
lnet_nid_t gateway)
+{
+ struct list_head zombies;
+ struct list_head *e;
+ lnet_remotenet_t *rnet;
+ lnet_remotenet_t *rnet2;
+ lnet_route_t *route;
+ lnet_route_t *route2;
+ lnet_ni_t *ni;
+ int add_route;
+ int rc;
+
+ CDEBUG(D_NET, "Add route: net %s hops %u gw %s\n",
+ libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+
+ if (gateway == LNET_NID_ANY ||
+ LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND ||
+ net == LNET_NIDNET(LNET_NID_ANY) ||
+ LNET_NETTYP(net) == LOLND ||
+ LNET_NIDNET(gateway) == net ||
+ hops < 1 || hops > 255)
+ return (-EINVAL);
+
+ if (lnet_islocalnet(net)) /* it's a local network */
+ return 0; /* ignore the route entry */
+
+ /* Assume net, route, all new */
+ LIBCFS_ALLOC(route, sizeof(*route));
+ LIBCFS_ALLOC(rnet, sizeof(*rnet));
+ if (route == NULL || rnet == NULL) {
+ CERROR("Out of memory creating route %s %d %s\n",
+ libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+ if (route != NULL)
+ LIBCFS_FREE(route, sizeof(*route));
+ if (rnet != NULL)
+ LIBCFS_FREE(rnet, sizeof(*rnet));
+ return -ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&rnet->lrn_routes);
+ rnet->lrn_net = net;
+ rnet->lrn_hops = hops;
+
+ LNET_LOCK();
+
+ rc = lnet_nid2peer_locked(&route->lr_gateway, gateway);
+ if (rc != 0) {
+ LNET_UNLOCK();
+
+ LIBCFS_FREE(route, sizeof(*route));
+ LIBCFS_FREE(rnet, sizeof(*rnet));
+
+ if (rc == -EHOSTUNREACH) /* gateway is not on a local net */
+ return 0; /* ignore the route entry */
+
+ CERROR("Error %d creating route %s %d %s\n", rc,
+ libcfs_net2str(net), hops, libcfs_nid2str(gateway));
+ return rc;
+ }
+
+ LASSERT (!the_lnet.ln_shutdown);
+ CFS_INIT_LIST_HEAD(&zombies);
+
+ rnet2 = lnet_find_net_locked(net);
+ if (rnet2 == NULL) {
+ /* new network */
+ list_add_tail(&rnet->lrn_list, &the_lnet.ln_remote_nets);
+ rnet2 = rnet;
+ }
+
+ if (hops > rnet2->lrn_hops) {
+ /* New route is longer; ignore it */
+ add_route = 0;
+ } else if (hops < rnet2->lrn_hops) {
+ /* new route supersedes all currently known routes to this
+ * net */
+ list_add(&zombies, &rnet2->lrn_routes);
+ list_del_init(&rnet2->lrn_routes);
+ add_route = 1;
+ } else {
+ add_route = 1;
+ /* New route has the same hopcount as existing routes; search
+ * for a duplicate route (the add is a NOOP if one exists) */
+ list_for_each (e, &rnet2->lrn_routes) {
+ route2 = list_entry(e, lnet_route_t, lr_list);
+
+ if (route2->lr_gateway == route->lr_gateway) {
+ add_route = 0;
+ break;
+ }
+
+ /* our lookups must be true */
+ LASSERT (route2->lr_gateway->lp_nid != gateway);
+ }
+ }
+
+ if (add_route) {
+ ni = route->lr_gateway->lp_ni;
+ lnet_ni_addref_locked(ni);
+
+ LASSERT (rc == 0);
+ list_add_tail(&route->lr_list, &rnet2->lrn_routes);
+ the_lnet.ln_remote_nets_version++;
+
+ lnet_rtr_addref_locked(route->lr_gateway);
+
+ LNET_UNLOCK();
+
+ /* XXX Assume alive */
+ if (ni->ni_lnd->lnd_notify != NULL)
+ (ni->ni_lnd->lnd_notify)(ni, gateway, 1);
+
+ lnet_ni_decref(ni);
+ } else {
+ lnet_peer_decref_locked(route->lr_gateway);
+ LNET_UNLOCK();
+ LIBCFS_FREE(route, sizeof(*route));
+ }
+
+ if (rnet != rnet2)
+ LIBCFS_FREE(rnet, sizeof(*rnet));
+
+ while (!list_empty(&zombies)) {
+ route = list_entry(zombies.next, lnet_route_t, lr_list);
+ list_del(&route->lr_list);
+
+ LNET_LOCK();
+ lnet_peer_decref_locked(route->lr_gateway);
+ LNET_UNLOCK();
+ LIBCFS_FREE(route, sizeof(*route));
+ }
+
+ return rc;
+}
+
+int
+lnet_check_routes (void)
+{
+ lnet_remotenet_t *rnet;
+ lnet_route_t *route;
+ lnet_route_t *route2;
+ struct list_head *e1;
+ struct list_head *e2;
+
+ LNET_LOCK();
+
+ list_for_each (e1, &the_lnet.ln_remote_nets)
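+ /* every route to a given remote net must go via a gateway on the
+ * same local NI; mixed local interfaces are rejected below */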
{ + rnet = list_entry(e1, lnet_remotenet_t, lrn_list); + + route2 = NULL; + list_for_each (e2, &rnet->lrn_routes) { + route = list_entry(e2, lnet_route_t, lr_list); + + if (route2 == NULL) + route2 = route; + else if (route->lr_gateway->lp_ni != + route2->lr_gateway->lp_ni) { + LNET_UNLOCK(); + + CERROR("Routes to %s via %s and %s not supported\n", + libcfs_net2str(rnet->lrn_net), + libcfs_nid2str(route->lr_gateway->lp_nid), + libcfs_nid2str(route2->lr_gateway->lp_nid)); + return -EINVAL; + } + } + } + + LNET_UNLOCK(); + return 0; +} + +int +lnet_del_route (__u32 net, lnet_nid_t gw_nid) +{ + lnet_remotenet_t *rnet; + lnet_route_t *route; + struct list_head *e1; + struct list_head *e2; + int rc = -ENOENT; + + CDEBUG(D_NET, "Del route: net %s : gw %s\n", + libcfs_net2str(net), libcfs_nid2str(gw_nid)); + + /* NB Caller may specify either all routes via the given gateway + * or a specific route entry actual NIDs) */ + + again: + LNET_LOCK(); + + list_for_each (e1, &the_lnet.ln_remote_nets) { + rnet = list_entry(e1, lnet_remotenet_t, lrn_list); + + if (!(net == LNET_NIDNET(LNET_NID_ANY) || + net == rnet->lrn_net)) + continue; + + list_for_each (e2, &rnet->lrn_routes) { + route = list_entry(e2, lnet_route_t, lr_list); + + if (!(gw_nid == LNET_NID_ANY || + gw_nid == route->lr_gateway->lp_nid)) + continue; + + list_del(&route->lr_list); + the_lnet.ln_remote_nets_version++; + + if (list_empty(&rnet->lrn_routes)) + list_del(&rnet->lrn_list); + else + rnet = NULL; + + lnet_rtr_decref_locked(route->lr_gateway); + lnet_peer_decref_locked(route->lr_gateway); + LNET_UNLOCK(); + + LIBCFS_FREE(route, sizeof (*route)); + + if (rnet != NULL) + LIBCFS_FREE(rnet, sizeof(*rnet)); + + rc = 0; + goto again; + } + } + + LNET_UNLOCK(); + return rc; +} + +void +lnet_destroy_routes (void) +{ + lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY); +} + +int +lnet_get_route (int idx, __u32 *net, __u32 *hops, + lnet_nid_t *gateway, __u32 *alive) +{ + struct list_head *e1; + struct list_head *e2; + lnet_remotenet_t *rnet; + lnet_route_t *route; + + LNET_LOCK(); + + list_for_each (e1, &the_lnet.ln_remote_nets) { + rnet = list_entry(e1, lnet_remotenet_t, lrn_list); + + list_for_each (e2, &rnet->lrn_routes) { + route = list_entry(e2, lnet_route_t, lr_list); + + if (idx-- == 0) { + *net = rnet->lrn_net; + *hops = rnet->lrn_hops; + *gateway = route->lr_gateway->lp_nid; + *alive = route->lr_gateway->lp_alive; + LNET_UNLOCK(); + return 0; + } + } + } + + LNET_UNLOCK(); + return -ENOENT; +} + +#if defined(__KERNEL__) && defined(LNET_ROUTER) +static void +lnet_router_checker_event (lnet_event_t *event) +{ + /* CAVEAT EMPTOR: I'm called with LNET_LOCKed and I'm not allowed to + * drop it (that's how come I see _every_ event, even ones that would + * overflow my EQ) */ + lnet_peer_t *lp; + lnet_nid_t nid; + + if (event->unlinked) { + /* The router checker thread has unlinked the rc_md + * and exited. */ + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_UNLINKING); + the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKED; + mutex_up(&the_lnet.ln_rc_signal); + return; + } + + LASSERT (event->type == LNET_EVENT_SEND || + event->type == LNET_EVENT_REPLY); + + nid = (event->type == LNET_EVENT_SEND) ? 
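+ /* a SEND event names the router as its target; a REPLY names
+ * the router as the node that originated the reply */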
+ event->target.nid : event->initiator.nid;
+
+ lp = lnet_find_peer_locked(nid);
+ if (lp == NULL) {
+ /* router may have been removed */
+ CDEBUG(D_NET, "Router %s not found\n", libcfs_nid2str(nid));
+ return;
+ }
+
+ if (event->type == LNET_EVENT_SEND) /* re-enable another ping */
+ lp->lp_ping_notsent = 0;
+
+ if (lnet_isrouter(lp) && /* ignore if no longer a router */
+ (event->status != 0 ||
+ event->type == LNET_EVENT_REPLY)) {
+
+ /* A successful REPLY means the router is up. If _any_ comms
+ * to the router fail I assume it's down (this will happen if
+ * we ping alive routers to try to detect router death before
+ * apps get burned). */
+
+ lnet_notify_locked(lp, 1, (event->status == 0),
+ cfs_time_current_sec());
+
+ /* The router checker will wake up very shortly and do the
+ * actual notification.
+ * XXX If 'lp' stops being a router before then, it will still
+ * have the notification pending!!! */
+ }
+
+ /* This decref will NOT drop LNET_LOCK (it had to have 1 ref when it
+ * was in the peer table and I've not dropped the lock, so no-one else
+ * can have reduced the refcount) */
+ LASSERT(lp->lp_refcount > 1);
+
+ lnet_peer_decref_locked(lp);
+}
+
+static int
+lnet_router_checker(void *arg)
+{
+ static lnet_ping_info_t pinginfo;
+
+ int rc;
+ lnet_handle_md_t mdh;
+ lnet_peer_t *rtr;
+ struct list_head *entry;
+ time_t now;
+ lnet_process_id_t rtr_id;
+ int secs;
+
+ cfs_daemonize("router_checker");
+ cfs_block_allsigs();
+
+ rtr_id.pid = LUSTRE_SRV_LNET_PID;
+
+ LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN);
+
+ rc = LNetMDBind((lnet_md_t){.start = &pinginfo,
+ .length = sizeof(pinginfo),
+ .threshold = LNET_MD_THRESH_INF,
+ .options = LNET_MD_TRUNCATE,
+ .eq_handle = the_lnet.ln_rc_eqh},
+ LNET_UNLINK,
+ &mdh);
+
+ if (rc < 0) {
+ CERROR("Can't bind MD: %d\n", rc);
+ the_lnet.ln_rc_state = rc;
+ mutex_up(&the_lnet.ln_rc_signal);
+ return rc;
+ }
+
+ LASSERT (rc == 0);
+
+ the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING;
+ mutex_up(&the_lnet.ln_rc_signal); /* let my parent go */
+
+ while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) {
+ __u64 version;
+
+ LNET_LOCK();
+rescan:
+ version = the_lnet.ln_routers_version;
+
+ list_for_each (entry, &the_lnet.ln_routers) {
+ rtr = list_entry(entry, lnet_peer_t, lp_rtr_list);
+
+ lnet_peer_addref_locked(rtr);
+
+ now = cfs_time_current_sec();
+
+ if (rtr->lp_ping_deadline != 0 && /* ping timed out? */
+ now > rtr->lp_ping_deadline)
+ lnet_notify_locked(rtr, 1, 0, now);
+
+ LNET_UNLOCK();
+
+ /* Run any outstanding notifications */
+ lnet_do_notify(rtr);
+
+ if (rtr->lp_alive) {
+ secs = live_router_check_interval;
+ } else {
+ secs = dead_router_check_interval;
+ }
+ if (secs <= 0)
+ secs = 0;
+
+ if (secs != 0 &&
+ !rtr->lp_ping_notsent &&
+ now > rtr->lp_ping_timestamp + secs) {
+ CDEBUG(D_NET, "Check: %s\n",
+ libcfs_nid2str(rtr->lp_nid));
+
+ LNET_LOCK();
+ rtr_id.nid = rtr->lp_nid;
+ rtr->lp_ping_notsent = 1;
+ rtr->lp_ping_timestamp = now;
+
+ if (rtr->lp_ping_deadline == 0)
+ rtr->lp_ping_deadline =
+ now + router_ping_timeout;
+
+ LNET_UNLOCK();
+
+ LNetGet(LNET_NID_ANY, mdh, rtr_id,
+ LNET_RESERVED_PORTAL,
+ LNET_PROTO_PING_MATCHBITS, 0);
+ }
+
+ LNET_LOCK();
+ lnet_peer_decref_locked(rtr);
+
+ if (version != the_lnet.ln_routers_version) {
+ /* the routers list has changed */
+ goto rescan;
+ }
+ }
+
+ LNET_UNLOCK();
+
+ /* Calling cfs_pause() here would always add 1 to the load average
+ * because the kernel counts # active tasks as nr_running
+ * + nr_uninterruptible.
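+ * An interruptible sleep (below) avoids inflating the load average.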
*/ + set_current_state(CFS_TASK_INTERRUPTIBLE); + cfs_schedule_timeout(CFS_TASK_INTERRUPTIBLE, + cfs_time_seconds(1)); + } + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_STOPTHREAD); + the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKING; + + rc = LNetMDUnlink(mdh); + LASSERT (rc == 0); + + /* The unlink event callback will signal final completion */ + + return 0; +} + + +void +lnet_wait_known_routerstate(void) +{ + lnet_peer_t *rtr; + struct list_head *entry; + int all_known; + + for (;;) { + LNET_LOCK(); + + all_known = 1; + list_for_each (entry, &the_lnet.ln_routers) { + rtr = list_entry(entry, lnet_peer_t, lp_rtr_list); + + if (rtr->lp_alive_count == 0) { + all_known = 0; + break; + } + } + + LNET_UNLOCK(); + + if (all_known) + return; + + cfs_pause(cfs_time_seconds(1)); + } +} + +void +lnet_router_checker_stop(void) +{ + int rc; + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING || + the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); + + if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN) + return; + + the_lnet.ln_rc_state = LNET_RC_STATE_STOPTHREAD; + /* block until event callback signals exit */ + mutex_down(&the_lnet.ln_rc_signal); + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_UNLINKED); + + rc = LNetEQFree(the_lnet.ln_rc_eqh); + LASSERT (rc == 0); + + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; +} + +int +lnet_router_checker_start(void) +{ + int rc; + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); + + if (check_routers_before_use && + dead_router_check_interval <= 0) { + LCONSOLE_ERROR("'dead_router_check_interval' must be set if " + "'check_routers_before_use' is set\n"); + return -EINVAL; + } + + if (live_router_check_interval <= 0 && + dead_router_check_interval <= 0) + return 0; + + init_mutex_locked(&the_lnet.ln_rc_signal); + + /* EQ size doesn't matter; the callback is guaranteed to get every + * event */ + rc = LNetEQAlloc(1, lnet_router_checker_event, + &the_lnet.ln_rc_eqh); + if (rc != 0) { + CERROR("Can't allocate EQ: %d\n", rc); + return -ENOMEM; + } + + rc = (int)cfs_kernel_thread(lnet_router_checker, NULL, 0); + if (rc < 0) { + CERROR("Can't start router checker thread: %d\n", rc); + goto failed; + } + + mutex_down(&the_lnet.ln_rc_signal); /* wait for checker to startup */ + + rc = the_lnet.ln_rc_state; + if (rc < 0) { + the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; + goto failed; + } + + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); + + if (check_routers_before_use) { + /* Note that a helpful side-effect of pinging all known routers + * at startup is that it makes them drop stale connections they + * may have to a previous instance of me. 
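+ * (the ping is just an LNetGet on LNET_RESERVED_PORTAL with
+ * LNET_PROTO_PING_MATCHBITS; see lnet_router_checker() above)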
*/ + lnet_wait_known_routerstate(); + } + + return 0; + + failed: + rc = LNetEQFree(the_lnet.ln_rc_eqh); + LASSERT (rc == 0); + return rc; +} + +void +lnet_destroy_rtrbuf(lnet_rtrbuf_t *rb, int npages) +{ + int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]); + + while (--npages >= 0) + cfs_free_page(rb->rb_kiov[npages].kiov_page); + + LIBCFS_FREE(rb, sz); +} + +lnet_rtrbuf_t * +lnet_new_rtrbuf(lnet_rtrbufpool_t *rbp) +{ + int npages = rbp->rbp_npages; + int sz = offsetof(lnet_rtrbuf_t, rb_kiov[npages]); + struct page *page; + lnet_rtrbuf_t *rb; + int i; + + LIBCFS_ALLOC(rb, sz); + + rb->rb_pool = rbp; + + for (i = 0; i < npages; i++) { + page = cfs_alloc_page(CFS_ALLOC_ZERO | CFS_ALLOC_STD); + if (page == NULL) { + while (--i >= 0) + cfs_free_page(rb->rb_kiov[i].kiov_page); + + LIBCFS_FREE(rb, sz); + return NULL; + } + + rb->rb_kiov[i].kiov_len = CFS_PAGE_SIZE; + rb->rb_kiov[i].kiov_offset = 0; + rb->rb_kiov[i].kiov_page = page; + } + + return rb; +} + +void +lnet_rtrpool_free_bufs(lnet_rtrbufpool_t *rbp) +{ + int npages = rbp->rbp_npages; + int nbuffers = 0; + lnet_rtrbuf_t *rb; + + LASSERT (list_empty(&rbp->rbp_msgs)); + LASSERT (rbp->rbp_credits == rbp->rbp_nbuffers); + + while (!list_empty(&rbp->rbp_bufs)) { + LASSERT (rbp->rbp_credits > 0); + + rb = list_entry(rbp->rbp_bufs.next, + lnet_rtrbuf_t, rb_list); + list_del(&rb->rb_list); + lnet_destroy_rtrbuf(rb, npages); + nbuffers++; + } + + LASSERT (rbp->rbp_nbuffers == nbuffers); + LASSERT (rbp->rbp_credits == nbuffers); + + rbp->rbp_nbuffers = rbp->rbp_credits = 0; +} + +int +lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs) +{ + lnet_rtrbuf_t *rb; + int i; + + if (rbp->rbp_nbuffers != 0) { + LASSERT (rbp->rbp_nbuffers == nbufs); + return 0; + } + + for (i = 0; i < nbufs; i++) { + rb = lnet_new_rtrbuf(rbp); + + if (rb == NULL) { + CERROR("Failed to allocate %d router bufs of %d pages\n", + nbufs, rbp->rbp_npages); + return -ENOMEM; + } + + rbp->rbp_nbuffers++; + rbp->rbp_credits++; + rbp->rbp_mincredits++; + list_add(&rb->rb_list, &rbp->rbp_bufs); + + /* No allocation "under fire" */ + /* Otherwise we'd need code to schedule blocked msgs etc */ + LASSERT (!the_lnet.ln_routing); + } + + LASSERT (rbp->rbp_credits == nbufs); + return 0; +} + +void +lnet_rtrpool_init(lnet_rtrbufpool_t *rbp, int npages) +{ + CFS_INIT_LIST_HEAD(&rbp->rbp_msgs); + CFS_INIT_LIST_HEAD(&rbp->rbp_bufs); + + rbp->rbp_npages = npages; + rbp->rbp_credits = 0; + rbp->rbp_mincredits = 0; +} + +void +lnet_free_rtrpools(void) +{ + lnet_rtrpool_free_bufs(&the_lnet.ln_rtrpools[0]); + lnet_rtrpool_free_bufs(&the_lnet.ln_rtrpools[1]); + lnet_rtrpool_free_bufs(&the_lnet.ln_rtrpools[2]); +} + +void +lnet_init_rtrpools(void) +{ + int small_pages = 1; + int large_pages = (LNET_MTU + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT; + + lnet_rtrpool_init(&the_lnet.ln_rtrpools[0], 0); + lnet_rtrpool_init(&the_lnet.ln_rtrpools[1], small_pages); + lnet_rtrpool_init(&the_lnet.ln_rtrpools[2], large_pages); +} + + +int +lnet_alloc_rtrpools(int im_a_router) +{ + int rc; + + if (!strcmp(forwarding, "")) { + /* not set either way */ + if (!im_a_router) + return 0; + } else if (!strcmp(forwarding, "disabled")) { + /* explicitly disabled */ + return 0; + } else if (!strcmp(forwarding, "enabled")) { + /* explicitly enabled */ + } else { + LCONSOLE_ERROR("'forwarding' not set to either " + "'enabled' or 'disabled'\n"); + return -EINVAL; + } + + if (tiny_router_buffers <= 0) { + LCONSOLE_ERROR("tiny_router_buffers=%d invalid when " + "routing enabled\n", tiny_router_buffers); + rc = -EINVAL; + 
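+ /* (the pool sizes are module parameters; e.g. a hypothetical
+ * "options lnet tiny_router_buffers=1024" line in modprobe.conf
+ * would override this default) */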
goto failed; + } + + rc = lnet_rtrpool_alloc_bufs(&the_lnet.ln_rtrpools[0], + tiny_router_buffers); + if (rc != 0) + goto failed; + + if (small_router_buffers <= 0) { + LCONSOLE_ERROR("small_router_buffers=%d invalid when " + "routing enabled\n", small_router_buffers); + rc = -EINVAL; + goto failed; + } + + rc = lnet_rtrpool_alloc_bufs(&the_lnet.ln_rtrpools[1], + small_router_buffers); + if (rc != 0) + goto failed; + + if (large_router_buffers <= 0) { + LCONSOLE_ERROR("large_router_buffers=%d invalid when " + "routing enabled\n", large_router_buffers); + rc = -EINVAL; + goto failed; + } + + rc = lnet_rtrpool_alloc_bufs(&the_lnet.ln_rtrpools[2], + large_router_buffers); + if (rc != 0) + goto failed; + + LNET_LOCK(); + the_lnet.ln_routing = 1; + LNET_UNLOCK(); + + return 0; + + failed: + lnet_free_rtrpools(); + return rc; +} + +#else + +int +lnet_peers_start_down(void) +{ + return 0; +} + +void +lnet_router_checker_stop(void) +{ + return; +} + +int +lnet_router_checker_start(void) +{ + return 0; +} + +void +lnet_free_rtrpools (void) +{ +} + +void +lnet_init_rtrpools (void) +{ +} + +int +lnet_alloc_rtrpools (int im_a_arouter) +{ + return 0; +} + +#endif diff --git a/lnet/lnet/router_proc.c b/lnet/lnet/router_proc.c new file mode 100644 index 0000000..5be36b18 --- /dev/null +++ b/lnet/lnet/router_proc.c @@ -0,0 +1,1094 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * + * This file is part of Portals + * http://sourceforge.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#define DEBUG_SUBSYSTEM S_LNET +#include +#include + +#if defined(__KERNEL__) && defined(LNET_ROUTER) + +#include +#include + +/* this is really lnet_proc.c */ + +#define LNET_PROC_STATS "sys/lnet/stats" +#define LNET_PROC_ROUTES "sys/lnet/routes" +#define LNET_PROC_ROUTERS "sys/lnet/routers" +#define LNET_PROC_PEERS "sys/lnet/peers" +#define LNET_PROC_BUFFERS "sys/lnet/buffers" +#define LNET_PROC_NIS "sys/lnet/nis" + +static int +lnet_router_proc_stats_read (char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + lnet_counters_t *ctrs; + int rc; + + *start = page; + *eof = 1; + if (off != 0) + return 0; + + LIBCFS_ALLOC(ctrs, sizeof(*ctrs)); + if (ctrs == NULL) + return -ENOMEM; + + LNET_LOCK(); + *ctrs = the_lnet.ln_counters; + LNET_UNLOCK(); + + rc = sprintf(page, + "%u %u %u %u %u %u %u "LPU64" "LPU64" "LPU64" "LPU64"\n", + ctrs->msgs_alloc, ctrs->msgs_max, + ctrs->errors, + ctrs->send_count, ctrs->recv_count, + ctrs->route_count, ctrs->drop_count, + ctrs->send_length, ctrs->recv_length, + ctrs->route_length, ctrs->drop_length); + + LIBCFS_FREE(ctrs, sizeof(*ctrs)); + return rc; +} + +static int +lnet_router_proc_stats_write(struct file *file, const char *ubuffer, + unsigned long count, void *data) +{ + LNET_LOCK(); + memset(&the_lnet.ln_counters, 0, sizeof(the_lnet.ln_counters)); + LNET_UNLOCK(); + + return (count); +} + +typedef struct { + __u64 lrsi_version; + lnet_remotenet_t *lrsi_net; + lnet_route_t *lrsi_route; + loff_t lrsi_off; +} lnet_route_seq_iterator_t; + +int +lnet_route_seq_seek (lnet_route_seq_iterator_t *lrsi, loff_t off) +{ + struct list_head *n; + struct list_head *r; + int rc; + loff_t here; + + if (off == 0) { + lrsi->lrsi_net = NULL; + lrsi->lrsi_route = NULL; + lrsi->lrsi_off = 0; + return 0; + } + + LNET_LOCK(); + + if (lrsi->lrsi_net != NULL && + lrsi->lrsi_version != the_lnet.ln_remote_nets_version) { + /* tables have changed */ + rc = -ESTALE; + goto out; + } + + if (lrsi->lrsi_net == NULL || lrsi->lrsi_off > off) { + /* search from start */ + n = the_lnet.ln_remote_nets.next; + r = NULL; + here = 1; + } else { + /* continue search */ + n = &lrsi->lrsi_net->lrn_list; + r = &lrsi->lrsi_route->lr_list; + here = lrsi->lrsi_off; + } + + lrsi->lrsi_version = the_lnet.ln_remote_nets_version; + lrsi->lrsi_off = off; + + while (n != &the_lnet.ln_remote_nets) { + lnet_remotenet_t *rnet = + list_entry(n, lnet_remotenet_t, lrn_list); + + if (r == NULL) + r = rnet->lrn_routes.next; + + while (r != &rnet->lrn_routes) { + lnet_route_t *re = + list_entry(r, lnet_route_t, + lr_list); + + if (here == off) { + lrsi->lrsi_net = rnet; + lrsi->lrsi_route = re; + rc = 0; + goto out; + } + + r = r->next; + here++; + } + + r = NULL; + n = n->next; + } + + lrsi->lrsi_net = NULL; + lrsi->lrsi_route = NULL; + rc = -ENOENT; + out: + LNET_UNLOCK(); + return rc; +} + +static void * +lnet_route_seq_start (struct seq_file *s, loff_t *pos) +{ + lnet_route_seq_iterator_t *lrsi; + int rc; + + LIBCFS_ALLOC(lrsi, sizeof(*lrsi)); + if (lrsi == NULL) + return NULL; + + lrsi->lrsi_net = NULL; + rc = lnet_route_seq_seek(lrsi, *pos); + if (rc == 0) + return lrsi; + + LIBCFS_FREE(lrsi, sizeof(*lrsi)); + return NULL; +} + +static void +lnet_route_seq_stop (struct seq_file *s, void *iter) +{ + lnet_route_seq_iterator_t *lrsi = iter; + + if (lrsi != NULL) + LIBCFS_FREE(lrsi, sizeof(*lrsi)); +} + +static void * +lnet_route_seq_next (struct seq_file *s, void *iter, loff_t *pos) +{ + lnet_route_seq_iterator_t *lrsi = iter; + int rc; + loff_t next = *pos + 1; + + rc = 
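+ /* re-seek one slot forward; -ESTALE (route table changed) or
+ * -ENOENT (walked off the end) ends the sequence */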
lnet_route_seq_seek(lrsi, next); + if (rc != 0) { + LIBCFS_FREE(lrsi, sizeof(*lrsi)); + return NULL; + } + + *pos = next; + return lrsi; +} + +static int +lnet_route_seq_show (struct seq_file *s, void *iter) +{ + lnet_route_seq_iterator_t *lrsi = iter; + __u32 net; + unsigned int hops; + lnet_nid_t nid; + int alive; + + if (lrsi->lrsi_off == 0) { + seq_printf(s, "Routing %s\n", + the_lnet.ln_routing ? "enabled" : "disabled"); + seq_printf(s, "%-8s %4s %7s %s\n", + "net", "hops", "state", "router"); + return 0; + } + + LASSERT (lrsi->lrsi_net != NULL); + LASSERT (lrsi->lrsi_route != NULL); + + LNET_LOCK(); + + if (lrsi->lrsi_version != the_lnet.ln_remote_nets_version) { + LNET_UNLOCK(); + return -ESTALE; + } + + net = lrsi->lrsi_net->lrn_net; + hops = lrsi->lrsi_net->lrn_hops; + nid = lrsi->lrsi_route->lr_gateway->lp_nid; + alive = lrsi->lrsi_route->lr_gateway->lp_alive; + + LNET_UNLOCK(); + + seq_printf(s, "%-8s %4u %7s %s\n", libcfs_net2str(net), hops, + alive ? "up" : "down", libcfs_nid2str(nid)); + return 0; +} + +static struct seq_operations lnet_routes_sops = { + .start = lnet_route_seq_start, + .stop = lnet_route_seq_stop, + .next = lnet_route_seq_next, + .show = lnet_route_seq_show, +}; + +static int +lnet_route_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *sf; + int rc; + + rc = seq_open(file, &lnet_routes_sops); + if (rc == 0) { + sf = file->private_data; + sf->private = dp->data; + } + + return rc; +} + +static struct file_operations lnet_routes_fops = { + .owner = THIS_MODULE, + .open = lnet_route_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +typedef struct { + __u64 lrtrsi_version; + lnet_peer_t *lrtrsi_router; + loff_t lrtrsi_off; +} lnet_router_seq_iterator_t; + +int +lnet_router_seq_seek (lnet_router_seq_iterator_t *lrtrsi, loff_t off) +{ + struct list_head *r; + lnet_peer_t *lp; + int rc; + loff_t here; + + if (off == 0) { + lrtrsi->lrtrsi_router = NULL; + lrtrsi->lrtrsi_off = 0; + return 0; + } + + LNET_LOCK(); + + lp = lrtrsi->lrtrsi_router; + + if (lp != NULL && + lrtrsi->lrtrsi_version != the_lnet.ln_routers_version) { + /* tables have changed */ + rc = -ESTALE; + goto out; + } + + if (lp == NULL || lrtrsi->lrtrsi_off > off) { + /* search from start */ + r = the_lnet.ln_routers.next; + here = 1; + } else { + /* continue search */ + r = &lp->lp_rtr_list; + here = lrtrsi->lrtrsi_off; + } + + lrtrsi->lrtrsi_version = the_lnet.ln_routers_version; + lrtrsi->lrtrsi_off = off; + + while (r != &the_lnet.ln_routers) { + lnet_peer_t *rtr = list_entry(r, + lnet_peer_t, + lp_rtr_list); + + if (here == off) { + lrtrsi->lrtrsi_router = rtr; + rc = 0; + goto out; + } + + r = r->next; + here++; + } + + lrtrsi->lrtrsi_router = NULL; + rc = -ENOENT; + out: + LNET_UNLOCK(); + return rc; +} + +static void * +lnet_router_seq_start (struct seq_file *s, loff_t *pos) +{ + lnet_router_seq_iterator_t *lrtrsi; + int rc; + + LIBCFS_ALLOC(lrtrsi, sizeof(*lrtrsi)); + if (lrtrsi == NULL) + return NULL; + + lrtrsi->lrtrsi_router = NULL; + rc = lnet_router_seq_seek(lrtrsi, *pos); + if (rc == 0) + return lrtrsi; + + LIBCFS_FREE(lrtrsi, sizeof(*lrtrsi)); + return NULL; +} + +static void +lnet_router_seq_stop (struct seq_file *s, void *iter) +{ + lnet_router_seq_iterator_t *lrtrsi = iter; + + if (lrtrsi != NULL) + LIBCFS_FREE(lrtrsi, sizeof(*lrtrsi)); +} + +static void * +lnet_router_seq_next (struct seq_file *s, void *iter, loff_t *pos) +{ + lnet_router_seq_iterator_t *lrtrsi = iter; + int rc; + 
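+ /* same offset-based walk as the routes file above: re-seek from
+ * the cached router and abort if ln_routers_version has moved */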
loff_t next = *pos + 1; + + rc = lnet_router_seq_seek(lrtrsi, next); + if (rc != 0) { + LIBCFS_FREE(lrtrsi, sizeof(*lrtrsi)); + return NULL; + } + + *pos = next; + return lrtrsi; +} + +static int +lnet_router_seq_show (struct seq_file *s, void *iter) +{ + lnet_router_seq_iterator_t *lrtrsi = iter; + lnet_peer_t *lp; + lnet_nid_t nid; + int alive; + int nrefs; + int nrtrrefs; + + if (lrtrsi->lrtrsi_off == 0) { + seq_printf(s, "%-4s %7s %9s %6s %12s %s\n", + "ref", "rtr_ref", "alive_cnt", "state", "last_ping", "router"); + return 0; + } + + lp = lrtrsi->lrtrsi_router; + LASSERT (lp != NULL); + + LNET_LOCK(); + + if (lrtrsi->lrtrsi_version != the_lnet.ln_routers_version) { + LNET_UNLOCK(); + return -ESTALE; + } + + nrefs = lp->lp_refcount; + nrtrrefs = lp->lp_rtr_refcount; + nid = lp->lp_nid; + alive = lp->lp_alive; + + LNET_UNLOCK(); + + seq_printf(s, + "%-4d %7d %9d %6s %12lu %s\n", + nrefs, nrtrrefs, + lp->lp_alive_count, + alive ? "up" : "down", + lp->lp_ping_timestamp, + libcfs_nid2str(nid)); + return 0; +} + +static struct seq_operations lnet_routers_sops = { + .start = lnet_router_seq_start, + .stop = lnet_router_seq_stop, + .next = lnet_router_seq_next, + .show = lnet_router_seq_show, +}; + +static int +lnet_router_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *sf; + int rc; + + rc = seq_open(file, &lnet_routers_sops); + if (rc == 0) { + sf = file->private_data; + sf->private = dp->data; + } + + return rc; +} + +static struct file_operations lnet_routers_fops = { + .owner = THIS_MODULE, + .open = lnet_router_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +typedef struct { + unsigned long long lpsi_version; + int lpsi_idx; + lnet_peer_t *lpsi_peer; + loff_t lpsi_off; +} lnet_peer_seq_iterator_t; + +int +lnet_peer_seq_seek (lnet_peer_seq_iterator_t *lpsi, loff_t off) +{ + int idx; + struct list_head *p; + loff_t here; + int rc; + + if (off == 0) { + lpsi->lpsi_idx = 0; + lpsi->lpsi_peer = NULL; + lpsi->lpsi_off = 0; + return 0; + } + + LNET_LOCK(); + + if (lpsi->lpsi_peer != NULL && + lpsi->lpsi_version != the_lnet.ln_peertable_version) { + /* tables have changed */ + rc = -ESTALE; + goto out; + } + + if (lpsi->lpsi_peer == NULL || + lpsi->lpsi_off > off) { + /* search from start */ + idx = 0; + p = NULL; + here = 1; + } else { + /* continue search */ + idx = lpsi->lpsi_idx; + p = &lpsi->lpsi_peer->lp_hashlist; + here = lpsi->lpsi_off; + } + + lpsi->lpsi_version = the_lnet.ln_peertable_version; + lpsi->lpsi_off = off; + + while (idx < LNET_PEER_HASHSIZE) { + if (p == NULL) + p = the_lnet.ln_peer_hash[idx].next; + + while (p != &the_lnet.ln_peer_hash[idx]) { + lnet_peer_t *lp = list_entry(p, lnet_peer_t, + lp_hashlist); + + if (here == off) { + lpsi->lpsi_idx = idx; + lpsi->lpsi_peer = lp; + rc = 0; + goto out; + } + + here++; + p = lp->lp_hashlist.next; + } + + p = NULL; + idx++; + } + + lpsi->lpsi_idx = 0; + lpsi->lpsi_peer = NULL; + rc = -ENOENT; + out: + LNET_UNLOCK(); + return rc; +} + +static void * +lnet_peer_seq_start (struct seq_file *s, loff_t *pos) +{ + lnet_peer_seq_iterator_t *lpsi; + int rc; + + LIBCFS_ALLOC(lpsi, sizeof(*lpsi)); + if (lpsi == NULL) + return NULL; + + lpsi->lpsi_idx = 0; + lpsi->lpsi_peer = NULL; + rc = lnet_peer_seq_seek(lpsi, *pos); + if (rc == 0) + return lpsi; + + LIBCFS_FREE(lpsi, sizeof(*lpsi)); + return NULL; +} + +static void +lnet_peer_seq_stop (struct seq_file *s, void *iter) +{ + lnet_peer_seq_iterator_t *lpsi = iter; + + if (lpsi != NULL) + 
LIBCFS_FREE(lpsi, sizeof(*lpsi)); +} + +static void * +lnet_peer_seq_next (struct seq_file *s, void *iter, loff_t *pos) +{ + lnet_peer_seq_iterator_t *lpsi = iter; + int rc; + loff_t next = *pos + 1; + + rc = lnet_peer_seq_seek(lpsi, next); + if (rc != 0) { + LIBCFS_FREE(lpsi, sizeof(*lpsi)); + return NULL; + } + + *pos = next; + return lpsi; +} + +static int +lnet_peer_seq_show (struct seq_file *s, void *iter) +{ + lnet_peer_seq_iterator_t *lpsi = iter; + lnet_peer_t *lp; + lnet_nid_t nid; + int maxcr; + int mintxcr; + int txcr; + int minrtrcr; + int rtrcr; + int alive; + int txqnob; + int nrefs; + + if (lpsi->lpsi_off == 0) { + seq_printf(s, "%-24s %4s %5s %5s %5s %5s %5s %5s %s\n", + "nid", "refs", "state", "max", + "rtr", "min", "tx", "min", "queue"); + return 0; + } + + LASSERT (lpsi->lpsi_peer != NULL); + + LNET_LOCK(); + + if (lpsi->lpsi_version != the_lnet.ln_peertable_version) { + LNET_UNLOCK(); + return -ESTALE; + } + + lp = lpsi->lpsi_peer; + + nid = lp->lp_nid; + maxcr = lp->lp_ni->ni_peertxcredits; + txcr = lp->lp_txcredits; + mintxcr = lp->lp_mintxcredits; + rtrcr = lp->lp_rtrcredits; + minrtrcr = lp->lp_minrtrcredits; + alive = lp->lp_alive; + txqnob = lp->lp_txqnob; + nrefs = lp->lp_refcount; + + LNET_UNLOCK(); + + seq_printf(s, "%-24s %4d %5s %5d %5d %5d %5d %5d %d\n", + libcfs_nid2str(nid), nrefs, alive ? "up" : "down", + maxcr, rtrcr, minrtrcr, txcr, mintxcr, txqnob); + return 0; +} + +static struct seq_operations lnet_peer_sops = { + .start = lnet_peer_seq_start, + .stop = lnet_peer_seq_stop, + .next = lnet_peer_seq_next, + .show = lnet_peer_seq_show, +}; + +static int +lnet_peer_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *sf; + int rc; + + rc = seq_open(file, &lnet_peer_sops); + if (rc == 0) { + sf = file->private_data; + sf->private = dp->data; + } + + return rc; +} + +static struct file_operations lnet_peer_fops = { + .owner = THIS_MODULE, + .open = lnet_peer_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +typedef struct { + int lbsi_idx; + loff_t lbsi_off; +} lnet_buffer_seq_iterator_t; + +int +lnet_buffer_seq_seek (lnet_buffer_seq_iterator_t *lbsi, loff_t off) +{ + int idx; + loff_t here; + int rc; + + if (off == 0) { + lbsi->lbsi_idx = -1; + lbsi->lbsi_off = 0; + return 0; + } + + LNET_LOCK(); + + if (lbsi->lbsi_idx < 0 || + lbsi->lbsi_off > off) { + /* search from start */ + idx = 0; + here = 1; + } else { + /* continue search */ + idx = lbsi->lbsi_idx; + here = lbsi->lbsi_off; + } + + lbsi->lbsi_off = off; + + while (idx < LNET_NRBPOOLS) { + if (here == off) { + lbsi->lbsi_idx = idx; + rc = 0; + goto out; + } + here++; + idx++; + } + + lbsi->lbsi_idx = -1; + rc = -ENOENT; + out: + LNET_UNLOCK(); + return rc; +} + +static void * +lnet_buffer_seq_start (struct seq_file *s, loff_t *pos) +{ + lnet_buffer_seq_iterator_t *lbsi; + int rc; + + LIBCFS_ALLOC(lbsi, sizeof(*lbsi)); + if (lbsi == NULL) + return NULL; + + lbsi->lbsi_idx = -1; + rc = lnet_buffer_seq_seek(lbsi, *pos); + if (rc == 0) + return lbsi; + + LIBCFS_FREE(lbsi, sizeof(*lbsi)); + return NULL; +} + +static void +lnet_buffer_seq_stop (struct seq_file *s, void *iter) +{ + lnet_buffer_seq_iterator_t *lbsi = iter; + + if (lbsi != NULL) + LIBCFS_FREE(lbsi, sizeof(*lbsi)); +} + +static void * +lnet_buffer_seq_next (struct seq_file *s, void *iter, loff_t *pos) +{ + lnet_buffer_seq_iterator_t *lbsi = iter; + int rc; + loff_t next = *pos + 1; + + rc = lnet_buffer_seq_seek(lbsi, next); + if (rc != 0) { + 
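+ /* ran past the last of the LNET_NRBPOOLS buffer pools
+ * (tiny, small and large) */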
LIBCFS_FREE(lbsi, sizeof(*lbsi)); + return NULL; + } + + *pos = next; + return lbsi; +} + +static int +lnet_buffer_seq_show (struct seq_file *s, void *iter) +{ + lnet_buffer_seq_iterator_t *lbsi = iter; + lnet_rtrbufpool_t *rbp; + int npages; + int nbuf; + int cr; + int mincr; + + if (lbsi->lbsi_off == 0) { + seq_printf(s, "%5s %5s %7s %7s\n", + "pages", "count", "credits", "min"); + return 0; + } + + LASSERT (lbsi->lbsi_idx >= 0 && lbsi->lbsi_idx < LNET_NRBPOOLS); + + LNET_LOCK(); + + rbp = &the_lnet.ln_rtrpools[lbsi->lbsi_idx]; + + npages = rbp->rbp_npages; + nbuf = rbp->rbp_nbuffers; + cr = rbp->rbp_credits; + mincr = rbp->rbp_mincredits; + + LNET_UNLOCK(); + + seq_printf(s, "%5d %5d %7d %7d\n", + npages, nbuf, cr, mincr); + return 0; +} + +static struct seq_operations lnet_buffer_sops = { + .start = lnet_buffer_seq_start, + .stop = lnet_buffer_seq_stop, + .next = lnet_buffer_seq_next, + .show = lnet_buffer_seq_show, +}; + +static int +lnet_buffer_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *sf; + int rc; + + rc = seq_open(file, &lnet_buffer_sops); + if (rc == 0) { + sf = file->private_data; + sf->private = dp->data; + } + + return rc; +} + +static struct file_operations lnet_buffers_fops = { + .owner = THIS_MODULE, + .open = lnet_buffer_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +typedef struct { + lnet_ni_t *lnsi_ni; + loff_t lnsi_off; +} lnet_ni_seq_iterator_t; + +int +lnet_ni_seq_seek (lnet_ni_seq_iterator_t *lnsi, loff_t off) +{ + struct list_head *n; + loff_t here; + int rc; + + if (off == 0) { + lnsi->lnsi_ni = NULL; + lnsi->lnsi_off = 0; + return 0; + } + + LNET_LOCK(); + + if (lnsi->lnsi_ni == NULL || + lnsi->lnsi_off > off) { + /* search from start */ + n = NULL; + here = 1; + } else { + /* continue search */ + n = &lnsi->lnsi_ni->ni_list; + here = lnsi->lnsi_off; + } + + lnsi->lnsi_off = off; + + if (n == NULL) + n = the_lnet.ln_nis.next; + + while (n != &the_lnet.ln_nis) { + if (here == off) { + lnsi->lnsi_ni = list_entry(n, lnet_ni_t, ni_list); + rc = 0; + goto out; + } + here++; + n = n->next; + } + + lnsi->lnsi_ni = NULL; + rc = -ENOENT; + out: + LNET_UNLOCK(); + return rc; +} + +static void * +lnet_ni_seq_start (struct seq_file *s, loff_t *pos) +{ + lnet_ni_seq_iterator_t *lnsi; + int rc; + + LIBCFS_ALLOC(lnsi, sizeof(*lnsi)); + if (lnsi == NULL) + return NULL; + + lnsi->lnsi_ni = NULL; + rc = lnet_ni_seq_seek(lnsi, *pos); + if (rc == 0) + return lnsi; + + LIBCFS_FREE(lnsi, sizeof(*lnsi)); + return NULL; +} + +static void +lnet_ni_seq_stop (struct seq_file *s, void *iter) +{ + lnet_ni_seq_iterator_t *lnsi = iter; + + if (lnsi != NULL) + LIBCFS_FREE(lnsi, sizeof(*lnsi)); +} + +static void * +lnet_ni_seq_next (struct seq_file *s, void *iter, loff_t *pos) +{ + lnet_ni_seq_iterator_t *lnsi = iter; + int rc; + loff_t next = *pos + 1; + + rc = lnet_ni_seq_seek(lnsi, next); + if (rc != 0) { + LIBCFS_FREE(lnsi, sizeof(*lnsi)); + return NULL; + } + + *pos = next; + return lnsi; +} + +static int +lnet_ni_seq_show (struct seq_file *s, void *iter) +{ + lnet_ni_seq_iterator_t *lnsi = iter; + lnet_ni_t *ni; + int maxtxcr; + int txcr; + int mintxcr; + int npeertxcr; + lnet_nid_t nid; + int nref; + + if (lnsi->lnsi_off == 0) { + seq_printf(s, "%-24s %4s %4s %5s %5s %5s\n", + "nid", "refs", "peer", "max", "tx", "min"); + return 0; + } + + LASSERT (lnsi->lnsi_ni != NULL); + + LNET_LOCK(); + + ni = lnsi->lnsi_ni; + + maxtxcr = ni->ni_maxtxcredits; + txcr = ni->ni_txcredits; + 
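+ /* snapshot all the counters while still holding LNET_LOCK so
+ * that the printed line is self-consistent */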
mintxcr = ni->ni_mintxcredits; + npeertxcr = ni->ni_peertxcredits; + nid = ni->ni_nid; + nref = ni->ni_refcount; + + LNET_UNLOCK(); + + seq_printf(s, "%-24s %4d %4d %5d %5d %5d\n", + libcfs_nid2str(nid), nref, + npeertxcr, maxtxcr, txcr, mintxcr); + return 0; +} + +static struct seq_operations lnet_ni_sops = { + .start = lnet_ni_seq_start, + .stop = lnet_ni_seq_stop, + .next = lnet_ni_seq_next, + .show = lnet_ni_seq_show, +}; + +static int +lnet_ni_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *dp = PDE(inode); + struct seq_file *sf; + int rc; + + rc = seq_open(file, &lnet_ni_sops); + if (rc == 0) { + sf = file->private_data; + sf->private = dp->data; + } + + return rc; +} + +static struct file_operations lnet_ni_fops = { + .owner = THIS_MODULE, + .open = lnet_ni_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +void +lnet_proc_init(void) +{ + struct proc_dir_entry *stats; + struct proc_dir_entry *routes; + struct proc_dir_entry *routers; + struct proc_dir_entry *peers; + + /* Initialize LNET_PROC_STATS */ + stats = create_proc_entry (LNET_PROC_STATS, 0644, NULL); + if (stats == NULL) { + CERROR("couldn't create proc entry %s\n", LNET_PROC_STATS); + return; + } + + stats->data = NULL; + stats->read_proc = lnet_router_proc_stats_read; + stats->write_proc = lnet_router_proc_stats_write; + + /* Initialize LNET_PROC_ROUTES */ + routes = create_proc_entry (LNET_PROC_ROUTES, 0444, NULL); + if (routes == NULL) { + CERROR("couldn't create proc entry %s\n", LNET_PROC_ROUTES); + return; + } + + routes->proc_fops = &lnet_routes_fops; + routes->data = NULL; + + /* Initialize LNET_PROC_ROUTERS */ + routers = create_proc_entry (LNET_PROC_ROUTERS, 0444, NULL); + if (routers == NULL) { + CERROR("couldn't create proc entry %s\n", LNET_PROC_ROUTERS); + return; + } + + routers->proc_fops = &lnet_routers_fops; + routers->data = NULL; + + /* Initialize LNET_PROC_PEERS */ + peers = create_proc_entry (LNET_PROC_PEERS, 0444, NULL); + if (peers == NULL) { + CERROR("couldn't create proc entry %s\n", LNET_PROC_PEERS); + return; + } + + peers->proc_fops = &lnet_peer_fops; + peers->data = NULL; + + /* Initialize LNET_PROC_BUFFERS */ + peers = create_proc_entry (LNET_PROC_BUFFERS, 0444, NULL); + if (peers == NULL) { + CERROR("couldn't create proc entry %s\n", LNET_PROC_BUFFERS); + return; + } + + peers->proc_fops = &lnet_buffers_fops; + peers->data = NULL; + + /* Initialize LNET_PROC_NIS */ + peers = create_proc_entry (LNET_PROC_NIS, 0444, NULL); + if (peers == NULL) { + CERROR("couldn't create proc entry %s\n", LNET_PROC_NIS); + return; + } + + peers->proc_fops = &lnet_ni_fops; + peers->data = NULL; +} + +void +lnet_proc_fini(void) +{ + remove_proc_entry(LNET_PROC_STATS, 0); + remove_proc_entry(LNET_PROC_ROUTES, 0); + remove_proc_entry(LNET_PROC_ROUTERS, 0); + remove_proc_entry(LNET_PROC_PEERS, 0); + remove_proc_entry(LNET_PROC_BUFFERS, 0); + remove_proc_entry(LNET_PROC_NIS, 0); +} + +#else + +void +lnet_proc_init(void) +{ +} + +void +lnet_proc_fini(void) +{ +} + +#endif diff --git a/lnet/router/Makefile.in b/lnet/router/Makefile.in deleted file mode 100644 index 3bb6cf7..0000000 --- a/lnet/router/Makefile.in +++ /dev/null @@ -1,4 +0,0 @@ -MODULES := kptlrouter -kptlrouter-objs := router.o proc.o - -@INCLUDE_RULES@ diff --git a/lnet/router/proc.c b/lnet/router/proc.c deleted file mode 100644 index 61b6880..0000000 --- a/lnet/router/proc.c +++ /dev/null @@ -1,242 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * 
vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002 Cluster File Systems, Inc. - * - * This file is part of Portals - * http://sourceforge.net/projects/sandiaportals/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include "router.h" - -#define KPR_PROC_ROUTER "sys/portals/router" -#define KPR_PROC_ROUTES "sys/portals/routes" - -/* Used for multi-page route list book keeping */ -struct proc_route_data { - struct list_head *curr; - unsigned int generation; - off_t skip; - rwlock_t proc_route_rwlock; -} kpr_read_routes_data; - -/* nal2name support re-used from utils/portals.c */ -struct name2num { - char *name; - int num; -} nalnames[] = { - { "any", 0}, - { "elan", QSWNAL}, - { "tcp", SOCKNAL}, - { "gm", GMNAL}, - { "ib", OPENIBNAL}, - { "iib", IIBNAL}, - { "lo", LONAL}, - { NULL, -1} -}; - -static struct name2num *name2num_lookup_num(struct name2num *table, int num) -{ - while (table->name != NULL) - if (num == table->num) - return (table); - else - table++; - return (NULL); -} - -static char *nal2name(int nal) -{ - struct name2num *e = name2num_lookup_num(nalnames, nal); - return ((e == NULL) ? "???" : e->name); -} - - -static int kpr_proc_router_read(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - unsigned long long bytes = kpr_fwd_bytes; - unsigned long packets = kpr_fwd_packets; - unsigned long errors = kpr_fwd_errors; - unsigned int qdepth = atomic_read (&kpr_queue_depth); - int len; - - *eof = 1; - if (off != 0) - return (0); - - len = sprintf(page, "%Ld %ld %ld %d\n", bytes, packets, errors, qdepth); - - *start = page; - return (len); -} - -static int kpr_proc_router_write(struct file *file, const char *ubuffer, - unsigned long count, void *data) -{ - /* Ignore what we've been asked to write, and just zero the stats */ - kpr_fwd_bytes = 0; - kpr_fwd_packets = 0; - kpr_fwd_errors = 0; - - return (count); -} - -static int kpr_proc_routes_read(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - struct proc_route_data *prd = data; - kpr_route_entry_t *re; - kpr_gateway_entry_t *ge; - int chunk_len = 0; - int line_len = 0; - int user_len = 0; - int rc = 0; - - *eof = 1; - *start = page; - - write_lock(&(prd->proc_route_rwlock)); - - if (prd->curr == NULL) { - if (off != 0) - goto routes_read_exit; - - /* First pass, initialize our private data */ - prd->curr = kpr_routes.next; - prd->generation = kpr_routes_generation; - prd->skip = 0; - } else { - /* Abort route list generation change */ - if (prd->generation != kpr_routes_generation) { - prd->curr = NULL; - rc = sprintf(page, "\nError: Routes Changed\n"); - goto routes_read_exit; - } - - /* All the routes have been walked */ - if (prd->curr == &kpr_routes) { - prd->curr = NULL; - goto routes_read_exit; - } - } - - read_lock(&kpr_rwlock); - *start = page + prd->skip; - user_len = -prd->skip; - - while ((prd->curr != NULL) && (prd->curr != &kpr_routes)) { - re = list_entry(prd->curr, 
kpr_route_entry_t, kpre_list); - ge = re->kpre_gateway; - - line_len = sprintf(page + chunk_len, - "%12s "LPX64" : "LPX64" - "LPX64", %s\n", - nal2name(ge->kpge_nalid), ge->kpge_nid, - re->kpre_lo_nid, re->kpre_hi_nid, - ge->kpge_alive ? "up" : "down"); - chunk_len += line_len; - user_len += line_len; - - /* Abort the route list changed */ - if (prd->curr->next == NULL) { - prd->curr = NULL; - read_unlock(&kpr_rwlock); - rc = sprintf(page, "\nError: Routes Changed\n"); - goto routes_read_exit; - } - - prd->curr = prd->curr->next; - - /* The route table will exceed one page, break the while loop - * so the function can be re-called with a new page. - */ - if ((chunk_len > (PAGE_SIZE - 80)) || (user_len > count)) - break; - } - - *eof = 0; - - /* Caller received only a portion of the last entry, the - * remaining will be delivered in the next page if asked for. - */ - if (user_len > count) { - prd->curr = prd->curr->prev; - prd->skip = line_len - (user_len - count); - read_unlock(&kpr_rwlock); - rc = count; - goto routes_read_exit; - } - - /* Not enough data to entirely satify callers request */ - prd->skip = 0; - read_unlock(&kpr_rwlock); - rc = user_len; - -routes_read_exit: - write_unlock(&(prd->proc_route_rwlock)); - return rc; -} - -static int kpr_proc_routes_write(struct file *file, const char *ubuffer, - unsigned long count, void *data) -{ - /* no-op; lctl should be used to adjust the routes */ - return (count); -} - -void kpr_proc_init(void) -{ - struct proc_dir_entry *router_entry; - struct proc_dir_entry *routes_entry; - - /* Initialize KPR_PROC_ROUTER */ - router_entry = create_proc_entry (KPR_PROC_ROUTER, - S_IFREG | S_IRUGO | S_IWUSR, NULL); - - if (router_entry == NULL) { - CERROR("couldn't create proc entry %s\n", KPR_PROC_ROUTER); - return; - } - - router_entry->data = NULL; - router_entry->read_proc = kpr_proc_router_read; - router_entry->write_proc = kpr_proc_router_write; - - /* Initialize KPR_PROC_ROUTES */ - routes_entry = create_proc_entry (KPR_PROC_ROUTES, - S_IFREG | S_IRUGO | S_IWUSR, NULL); - - if (routes_entry == NULL) { - CERROR("couldn't create proc entry %s\n", KPR_PROC_ROUTES); - return; - } - - kpr_read_routes_data.curr = NULL; - kpr_read_routes_data.generation = 0; - kpr_read_routes_data.skip = 0; - kpr_read_routes_data.proc_route_rwlock = RW_LOCK_UNLOCKED; - - routes_entry->data = &kpr_read_routes_data; - routes_entry->read_proc = kpr_proc_routes_read; - routes_entry->write_proc = kpr_proc_routes_write; -} - -void kpr_proc_fini(void) -{ - remove_proc_entry(KPR_PROC_ROUTER, 0); - remove_proc_entry(KPR_PROC_ROUTES, 0); -} diff --git a/lnet/router/router.c b/lnet/router/router.c deleted file mode 100644 index 849563b..0000000 --- a/lnet/router/router.c +++ /dev/null @@ -1,824 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002 Cluster File Systems, Inc. - * - * This file is part of Portals - * http://sourceforge.net/projects/sandiaportals/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -#include "router.h" - -LIST_HEAD(kpr_routes); -LIST_HEAD(kpr_gateways); -LIST_HEAD(kpr_nals); - -unsigned int kpr_routes_generation; -unsigned long long kpr_fwd_bytes; -unsigned long kpr_fwd_packets; -unsigned long kpr_fwd_errors; -atomic_t kpr_queue_depth; - -/* Mostly the tables are read-only (thread and interrupt context) - * - * Once in a blue moon we register/deregister NALs and add/remove routing - * entries (thread context only)... */ -rwlock_t kpr_rwlock = RW_LOCK_UNLOCKED; - -kpr_router_interface_t kpr_router_interface = { - kprri_register: kpr_register_nal, - kprri_lookup: kpr_lookup_target, - kprri_fwd_start: kpr_forward_packet, - kprri_fwd_done: kpr_complete_packet, - kprri_notify: kpr_nal_notify, - kprri_shutdown: kpr_shutdown_nal, - kprri_deregister: kpr_deregister_nal, -}; - -int -kpr_register_nal (kpr_nal_interface_t *nalif, void **argp) -{ - unsigned long flags; - struct list_head *e; - kpr_nal_entry_t *ne; - - CDEBUG (D_NET, "Registering NAL %x\n", nalif->kprni_nalid); - - PORTAL_ALLOC (ne, sizeof (*ne)); - if (ne == NULL) - return (-ENOMEM); - - memset (ne, 0, sizeof (*ne)); - memcpy ((void *)&ne->kpne_interface, (void *)nalif, sizeof (*nalif)); - - LASSERT (!in_interrupt()); - write_lock_irqsave (&kpr_rwlock, flags); - - for (e = kpr_nals.next; e != &kpr_nals; e = e->next) - { - kpr_nal_entry_t *ne2 = list_entry (e, kpr_nal_entry_t, kpne_list); - - if (ne2->kpne_interface.kprni_nalid == ne->kpne_interface.kprni_nalid) - { - write_unlock_irqrestore (&kpr_rwlock, flags); - - CERROR ("Attempt to register same NAL %x twice\n", ne->kpne_interface.kprni_nalid); - - PORTAL_FREE (ne, sizeof (*ne)); - return (-EEXIST); - } - } - - list_add (&ne->kpne_list, &kpr_nals); - - write_unlock_irqrestore (&kpr_rwlock, flags); - - *argp = ne; - PORTAL_MODULE_USE; - return (0); -} - -void -kpr_do_upcall (void *arg) -{ - kpr_upcall_t *u = (kpr_upcall_t *)arg; - char nalstr[10]; - char nidstr[36]; - char whenstr[36]; - char *argv[] = { - NULL, - "ROUTER_NOTIFY", - nalstr, - nidstr, - u->kpru_alive ? "up" : "down", - whenstr, - NULL}; - - snprintf (nalstr, sizeof(nalstr), "%d", u->kpru_nal_id); - snprintf (nidstr, sizeof(nidstr), LPX64, u->kpru_nid); - snprintf (whenstr, sizeof(whenstr), "%ld", u->kpru_when); - - portals_run_upcall (argv); - - kfree (u); -} - -void -kpr_upcall (int gw_nalid, ptl_nid_t gw_nid, int alive, time_t when) -{ - char str[PTL_NALFMT_SIZE]; - - /* May be in arbitrary context */ - kpr_upcall_t *u = kmalloc (sizeof (kpr_upcall_t), GFP_ATOMIC); - - if (u == NULL) { - CERROR ("Upcall out of memory: nal %x nid "LPX64" (%s) %s\n", - gw_nalid, gw_nid, - portals_nid2str(gw_nalid, gw_nid, str), - alive ? "up" : "down"); - return; - } - - u->kpru_nal_id = gw_nalid; - u->kpru_nid = gw_nid; - u->kpru_alive = alive; - u->kpru_when = when; - - prepare_work (&u->kpru_tq, kpr_do_upcall, u); - schedule_work (&u->kpru_tq); -} - -int -kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid, - int alive, time_t when) -{ - unsigned long flags; - int found; - kpr_nal_entry_t *ne = NULL; - kpr_gateway_entry_t *ge = NULL; - struct timeval now; - struct list_head *e; - struct list_head *n; - char str[PTL_NALFMT_SIZE]; - - CDEBUG (D_NET, "%s notifying [%x] "LPX64": %s\n", - byNal ? "NAL" : "userspace", - gateway_nalid, gateway_nid, alive ? 
"up" : "down"); - - /* can't do predictions... */ - do_gettimeofday (&now); - if (when > now.tv_sec) { - CWARN ("Ignoring prediction from %s of [%x] "LPX64" %s " - "%ld seconds in the future\n", - byNal ? "NAL" : "userspace", - gateway_nalid, gateway_nid, - alive ? "up" : "down", - when - now.tv_sec); - return (EINVAL); - } - - LASSERT (when <= now.tv_sec); - - /* Serialise with lookups (i.e. write lock) */ - write_lock_irqsave(&kpr_rwlock, flags); - - found = 0; - list_for_each_safe (e, n, &kpr_gateways) { - - ge = list_entry(e, kpr_gateway_entry_t, kpge_list); - if ((gateway_nalid != 0 && - ge->kpge_nalid != gateway_nalid) || - ge->kpge_nid != gateway_nid) - continue; - - found = 1; - break; - } - - if (!found) { - /* gateway not found */ - write_unlock_irqrestore(&kpr_rwlock, flags); - CDEBUG (D_NET, "Gateway not found\n"); - return (0); - } - - if (when < ge->kpge_timestamp) { - /* out of date information */ - write_unlock_irqrestore (&kpr_rwlock, flags); - CDEBUG (D_NET, "Out of date\n"); - return (0); - } - - /* update timestamp */ - ge->kpge_timestamp = when; - - if ((!ge->kpge_alive) == (!alive)) { - /* new date for old news */ - write_unlock_irqrestore (&kpr_rwlock, flags); - CDEBUG (D_NET, "Old news\n"); - return (0); - } - - ge->kpge_alive = alive; - CDEBUG(D_NET, "set "LPX64" [%p] %d\n", gateway_nid, ge, alive); - - if (alive) { - /* Reset all gateway weights so the newly-enabled gateway - * doesn't have to play catch-up */ - list_for_each_safe (e, n, &kpr_gateways) { - kpr_gateway_entry_t *ge = list_entry(e, kpr_gateway_entry_t, - kpge_list); - atomic_set (&ge->kpge_weight, 0); - } - } - - found = 0; - if (!byNal) { - /* userland notified me: notify NAL? */ - ne = kpr_find_nal_entry_locked (ge->kpge_nalid); - if (ne != NULL) { - if (!ne->kpne_shutdown && - ne->kpne_interface.kprni_notify != NULL) { - /* take a ref on this NAL until notifying - * it has completed... */ - atomic_inc (&ne->kpne_refcount); - found = 1; - } - } - } - - write_unlock_irqrestore(&kpr_rwlock, flags); - - if (found) { - ne->kpne_interface.kprni_notify (ne->kpne_interface.kprni_arg, - gateway_nid, alive); - /* 'ne' can disappear now... */ - atomic_dec (&ne->kpne_refcount); - } - - if (byNal) { - /* It wasn't userland that notified me... */ - CWARN ("Upcall: NAL %x NID "LPX64" (%s) is %s\n", - gateway_nalid, gateway_nid, - portals_nid2str(gateway_nalid, gateway_nid, str), - alive ? 
"alive" : "dead"); - kpr_upcall (gateway_nalid, gateway_nid, alive, when); - } else { - CDEBUG (D_NET, " NOT Doing upcall\n"); - } - - return (0); -} - -void -kpr_nal_notify (void *arg, ptl_nid_t peer, int alive, time_t when) -{ - kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; - - kpr_do_notify (1, ne->kpne_interface.kprni_nalid, peer, alive, when); -} - -void -kpr_shutdown_nal (void *arg) -{ - unsigned long flags; - kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; - - CDEBUG (D_NET, "Shutting down NAL %x\n", ne->kpne_interface.kprni_nalid); - - LASSERT (!ne->kpne_shutdown); - LASSERT (!in_interrupt()); - - write_lock_irqsave (&kpr_rwlock, flags); - ne->kpne_shutdown = 1; - write_unlock_irqrestore (&kpr_rwlock, flags); -} - -void -kpr_deregister_nal (void *arg) -{ - unsigned long flags; - kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; - - CDEBUG (D_NET, "Deregister NAL %x\n", ne->kpne_interface.kprni_nalid); - - LASSERT (ne->kpne_shutdown); /* caller must have issued shutdown already */ - LASSERT (!in_interrupt()); - - write_lock_irqsave (&kpr_rwlock, flags); - list_del (&ne->kpne_list); - write_unlock_irqrestore (&kpr_rwlock, flags); - - /* Wait until all outstanding messages/notifications have completed */ - while (atomic_read (&ne->kpne_refcount) != 0) - { - CDEBUG (D_NET, "Waiting for refcount on NAL %x to reach zero (%d)\n", - ne->kpne_interface.kprni_nalid, atomic_read (&ne->kpne_refcount)); - - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (HZ); - } - - PORTAL_FREE (ne, sizeof (*ne)); - PORTAL_MODULE_UNUSE; -} - -int -kpr_ge_isbetter (kpr_gateway_entry_t *ge1, kpr_gateway_entry_t *ge2) -{ - const int significant_bits = 0x00ffffff; - /* We use atomic_t to record/compare route weights for - * load-balancing. Here we limit ourselves to only using - * 'significant_bits' when we do an 'after' comparison */ - - int diff = (atomic_read (&ge1->kpge_weight) - - atomic_read (&ge2->kpge_weight)) & significant_bits; - int rc = (diff > (significant_bits >> 1)); - - CDEBUG(D_INFO, "[%p]"LPX64"=%d %s [%p]"LPX64"=%d\n", - ge1, ge1->kpge_nid, atomic_read (&ge1->kpge_weight), - rc ? ">" : "<", - ge2, ge2->kpge_nid, atomic_read (&ge2->kpge_weight)); - - return (rc); -} - -void -kpr_update_weight (kpr_gateway_entry_t *ge, int nob) -{ - int weight = 1 + (nob + sizeof (ptl_hdr_t)/2)/sizeof (ptl_hdr_t); - - /* We've chosen this route entry (i.e. gateway) to forward payload - * of length 'nob'; update the route's weight to make it less - * favoured. Note that the weight is 1 plus the payload size - * rounded and scaled to the portals header size, so we get better - * use of the significant bits in kpge_weight. 
*/ - - CDEBUG(D_INFO, "gateway [%p]"LPX64" += %d\n", ge, - ge->kpge_nid, weight); - - atomic_add (weight, &ge->kpge_weight); -} - -int -kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob, - ptl_nid_t *gateway_nidp) -{ - kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; - struct list_head *e; - kpr_route_entry_t *re; - kpr_gateway_entry_t *ge = NULL; - int rc = -ENOENT; - - /* Caller wants to know if 'target_nid' can be reached via a gateway - * ON HER OWN NETWORK */ - - CDEBUG (D_INFO, "lookup "LPX64" from NAL %x\n", target_nid, - ne->kpne_interface.kprni_nalid); - LASSERT (!in_interrupt()); - - read_lock (&kpr_rwlock); - - if (ne->kpne_shutdown) { /* caller is shutting down */ - read_unlock (&kpr_rwlock); - return (-ENOENT); - } - - /* Search routes for one that has a gateway to target_nid on the callers network */ - - list_for_each (e, &kpr_routes) { - re = list_entry (e, kpr_route_entry_t, kpre_list); - - if (re->kpre_lo_nid > target_nid || - re->kpre_hi_nid < target_nid) - continue; - - /* found table entry */ - - if (re->kpre_gateway->kpge_nalid != ne->kpne_interface.kprni_nalid || - !re->kpre_gateway->kpge_alive) { - /* different NAL or gateway down */ - rc = -EHOSTUNREACH; - continue; - } - - if (ge == NULL || - kpr_ge_isbetter (re->kpre_gateway, ge)) - ge = re->kpre_gateway; - } - - if (ge != NULL) { - kpr_update_weight (ge, nob); - *gateway_nidp = ge->kpge_nid; - rc = 0; - } - - read_unlock (&kpr_rwlock); - - /* NB can't deref 're' now; it might have been removed! */ - - CDEBUG (D_NET, "lookup "LPX64" from NAL %x: %d ("LPX64")\n", - target_nid, ne->kpne_interface.kprni_nalid, rc, - (rc == 0) ? *gateway_nidp : (ptl_nid_t)0); - return (rc); -} - -kpr_nal_entry_t * -kpr_find_nal_entry_locked (int nal_id) -{ - struct list_head *e; - - /* Called with kpr_rwlock held */ - - list_for_each (e, &kpr_nals) { - kpr_nal_entry_t *ne = list_entry (e, kpr_nal_entry_t, kpne_list); - - if (nal_id != ne->kpne_interface.kprni_nalid) /* no match */ - continue; - - return (ne); - } - - return (NULL); -} - -void -kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd) -{ - kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)arg; - ptl_nid_t target_nid = fwd->kprfd_target_nid; - int nob = fwd->kprfd_nob; - kpr_gateway_entry_t *ge = NULL; - kpr_nal_entry_t *dst_ne = NULL; - struct list_head *e; - kpr_route_entry_t *re; - kpr_nal_entry_t *tmp_ne; - int rc; - - CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %x\n", fwd, - target_nid, src_ne->kpne_interface.kprni_nalid); - - LASSERT (nob == lib_kiov_nob (fwd->kprfd_niov, fwd->kprfd_kiov)); - LASSERT (!in_interrupt()); - - read_lock (&kpr_rwlock); - - kpr_fwd_packets++; /* (loose) stats accounting */ - kpr_fwd_bytes += nob + sizeof(ptl_hdr_t); - - if (src_ne->kpne_shutdown) { /* caller is shutting down */ - rc = -ESHUTDOWN; - goto out; - } - - fwd->kprfd_router_arg = src_ne; /* stash caller's nal entry */ - - /* Search routes for one that has a gateway to target_nid NOT on the caller's network */ - - list_for_each (e, &kpr_routes) { - re = list_entry (e, kpr_route_entry_t, kpre_list); - - if (re->kpre_lo_nid > target_nid || /* no match */ - re->kpre_hi_nid < target_nid) - continue; - - if (re->kpre_gateway->kpge_nalid == src_ne->kpne_interface.kprni_nalid) - continue; /* don't route to same NAL */ - - if (!re->kpre_gateway->kpge_alive) - continue; /* gateway is dead */ - - tmp_ne = kpr_find_nal_entry_locked (re->kpre_gateway->kpge_nalid); - - if (tmp_ne == NULL || - tmp_ne->kpne_shutdown) { - /* NAL must be registered and not shutting down */ - continue; - } - - if 
(ge == NULL || - kpr_ge_isbetter (re->kpre_gateway, ge)) { - ge = re->kpre_gateway; - dst_ne = tmp_ne; - } - } - - if (ge != NULL) { - LASSERT (dst_ne != NULL); - - kpr_update_weight (ge, nob); - - fwd->kprfd_gateway_nid = ge->kpge_nid; - atomic_inc (&src_ne->kpne_refcount); /* source and dest nals are */ - atomic_inc (&dst_ne->kpne_refcount); /* busy until fwd completes */ - atomic_inc (&kpr_queue_depth); - - read_unlock (&kpr_rwlock); - - CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %x: " - "to "LPX64" on NAL %x\n", - fwd, target_nid, src_ne->kpne_interface.kprni_nalid, - fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid); - - dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd); - return; - } - - rc = -EHOSTUNREACH; - out: - kpr_fwd_errors++; - - CDEBUG (D_NET, "Failed to forward [%p] "LPX64" from NAL %x: %d\n", - fwd, target_nid, src_ne->kpne_interface.kprni_nalid, rc); - - (fwd->kprfd_callback)(fwd->kprfd_callback_arg, rc); - - read_unlock (&kpr_rwlock); -} - -void -kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error) -{ - kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg; - kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg; - - CDEBUG (D_NET, "complete(1) [%p] from NAL %x to NAL %x: %d\n", fwd, - src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error); - - atomic_dec (&dst_ne->kpne_refcount); /* CAVEAT EMPTOR dst_ne can disappear now!!! */ - - (fwd->kprfd_callback)(fwd->kprfd_callback_arg, error); - - CDEBUG (D_NET, "complete(2) [%p] from NAL %x: %d\n", fwd, - src_ne->kpne_interface.kprni_nalid, error); - - atomic_dec (&kpr_queue_depth); - atomic_dec (&src_ne->kpne_refcount); /* CAVEAT EMPTOR src_ne can disappear now!!! */ -} - -int -kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, - ptl_nid_t lo_nid, ptl_nid_t hi_nid) -{ - unsigned long flags; - struct list_head *e; - kpr_route_entry_t *re; - kpr_gateway_entry_t *ge; - int dup = 0; - - CDEBUG(D_NET, "Add route: %x "LPX64" : "LPX64" - "LPX64"\n", - gateway_nalid, gateway_nid, lo_nid, hi_nid); - - if (gateway_nalid == PTL_NID_ANY || - lo_nid == PTL_NID_ANY || - hi_nid == PTL_NID_ANY || - lo_nid > hi_nid) - return (-EINVAL); - - PORTAL_ALLOC (ge, sizeof (*ge)); - if (ge == NULL) - return (-ENOMEM); - - ge->kpge_nalid = gateway_nalid; - ge->kpge_nid = gateway_nid; - ge->kpge_alive = 1; - ge->kpge_timestamp = 0; - ge->kpge_refcount = 0; - atomic_set (&ge->kpge_weight, 0); - - PORTAL_ALLOC (re, sizeof (*re)); - if (re == NULL) { - PORTAL_FREE (ge, sizeof (*ge)); - return (-ENOMEM); - } - - re->kpre_lo_nid = lo_nid; - re->kpre_hi_nid = hi_nid; - - LASSERT(!in_interrupt()); - write_lock_irqsave (&kpr_rwlock, flags); - - list_for_each (e, &kpr_gateways) { - kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t, - kpge_list); - - if (ge2->kpge_nalid == gateway_nalid && - ge2->kpge_nid == gateway_nid) { - PORTAL_FREE (ge, sizeof (*ge)); - ge = ge2; - dup = 1; - break; - } - } - - if (!dup) { - /* Adding a new gateway... 
*/ - list_add (&ge->kpge_list, &kpr_gateways); - - /* ...zero all gateway weights so this one doesn't have to - * play catch-up */ - - list_for_each (e, &kpr_gateways) { - kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t, - kpge_list); - atomic_set (&ge2->kpge_weight, 0); - } - } - - re->kpre_gateway = ge; - ge->kpge_refcount++; - list_add (&re->kpre_list, &kpr_routes); - kpr_routes_generation++; - - write_unlock_irqrestore (&kpr_rwlock, flags); - return (0); -} - -int -kpr_sys_notify (int gateway_nalid, ptl_nid_t gateway_nid, - int alive, time_t when) -{ - return (kpr_do_notify (0, gateway_nalid, gateway_nid, alive, when)); -} - -int -kpr_del_route (int gw_nalid, ptl_nid_t gw_nid, - ptl_nid_t lo, ptl_nid_t hi) -{ - int specific = (lo != PTL_NID_ANY); - unsigned long flags; - int rc = -ENOENT; - struct list_head *e; - struct list_head *n; - - CDEBUG(D_NET, "Del route [%x] "LPX64" : "LPX64" - "LPX64"\n", - gw_nalid, gw_nid, lo, hi); - - LASSERT(!in_interrupt()); - - /* NB Caller may specify either all routes via the given gateway - * (lo/hi == PTL_NID_ANY) or a specific route entry (lo/hi are - * actual NIDs) */ - if (specific ? (hi == PTL_NID_ANY || hi < lo) : (hi != PTL_NID_ANY)) - return (-EINVAL); - - write_lock_irqsave(&kpr_rwlock, flags); - - list_for_each_safe (e, n, &kpr_routes) { - kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, - kpre_list); - kpr_gateway_entry_t *ge = re->kpre_gateway; - - if (ge->kpge_nalid != gw_nalid || - ge->kpge_nid != gw_nid || - (specific && - (lo != re->kpre_lo_nid || hi != re->kpre_hi_nid))) - continue; - - rc = 0; - - if (--ge->kpge_refcount == 0) { - list_del (&ge->kpge_list); - PORTAL_FREE (ge, sizeof (*ge)); - } - - list_del (&re->kpre_list); - PORTAL_FREE(re, sizeof (*re)); - - if (specific) - break; - } - - kpr_routes_generation++; - write_unlock_irqrestore(&kpr_rwlock, flags); - - return (rc); -} - -int -kpr_get_route (int idx, __u32 *gateway_nalid, ptl_nid_t *gateway_nid, - ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, __u32 *alive) -{ - struct list_head *e; - - LASSERT (!in_interrupt()); - read_lock(&kpr_rwlock); - - for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { - kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, - kpre_list); - kpr_gateway_entry_t *ge = re->kpre_gateway; - - if (idx-- == 0) { - *gateway_nalid = ge->kpge_nalid; - *gateway_nid = ge->kpge_nid; - *alive = ge->kpge_alive; - *lo_nid = re->kpre_lo_nid; - *hi_nid = re->kpre_hi_nid; - - read_unlock(&kpr_rwlock); - return (0); - } - } - - read_unlock (&kpr_rwlock); - return (-ENOENT); -} - -static int -kpr_nal_cmd(struct portals_cfg *pcfg, void * private) -{ - int err = -EINVAL; - ENTRY; - - switch(pcfg->pcfg_command) { - default: - CDEBUG(D_IOCTL, "Inappropriate cmd: %d\n", pcfg->pcfg_command); - break; - - case NAL_CMD_ADD_ROUTE: - CDEBUG(D_IOCTL, "Adding route: [%x] "LPU64" : "LPU64" - "LPU64"\n", - pcfg->pcfg_nal, pcfg->pcfg_nid, - pcfg->pcfg_nid2, pcfg->pcfg_nid3); - err = kpr_add_route(pcfg->pcfg_gw_nal, pcfg->pcfg_nid, - pcfg->pcfg_nid2, pcfg->pcfg_nid3); - break; - - case NAL_CMD_DEL_ROUTE: - CDEBUG (D_IOCTL, "Removing routes via [%x] "LPU64" : "LPU64" - "LPU64"\n", - pcfg->pcfg_gw_nal, pcfg->pcfg_nid, - pcfg->pcfg_nid2, pcfg->pcfg_nid3); - err = kpr_del_route (pcfg->pcfg_gw_nal, pcfg->pcfg_nid, - pcfg->pcfg_nid2, pcfg->pcfg_nid3); - break; - - case NAL_CMD_NOTIFY_ROUTER: { - CDEBUG (D_IOCTL, "Notifying peer [%x] "LPU64" %s @ %ld\n", - pcfg->pcfg_gw_nal, pcfg->pcfg_nid, - pcfg->pcfg_flags ? 
"Enabling" : "Disabling", - (time_t)pcfg->pcfg_nid3); - - err = kpr_sys_notify (pcfg->pcfg_gw_nal, pcfg->pcfg_nid, - pcfg->pcfg_flags, (time_t)pcfg->pcfg_nid3); - break; - } - - case NAL_CMD_GET_ROUTE: - CDEBUG (D_IOCTL, "Getting route [%d]\n", pcfg->pcfg_count); - err = kpr_get_route(pcfg->pcfg_count, &pcfg->pcfg_gw_nal, - &pcfg->pcfg_nid, - &pcfg->pcfg_nid2, &pcfg->pcfg_nid3, - &pcfg->pcfg_flags); - break; - } - RETURN(err); -} - - -static void /*__exit*/ -kpr_finalise (void) -{ - LASSERT (list_empty (&kpr_nals)); - - libcfs_nal_cmd_unregister(ROUTER); - - PORTAL_SYMBOL_UNREGISTER(kpr_router_interface); - - kpr_proc_fini(); - - while (!list_empty (&kpr_routes)) { - kpr_route_entry_t *re = list_entry(kpr_routes.next, - kpr_route_entry_t, - kpre_list); - - list_del(&re->kpre_list); - PORTAL_FREE(re, sizeof (*re)); - } - - CDEBUG(D_MALLOC, "kpr_finalise: kmem back to %d\n", - atomic_read(&portal_kmemory)); -} - -static int __init -kpr_initialise (void) -{ - int rc; - - CDEBUG(D_MALLOC, "kpr_initialise: kmem %d\n", - atomic_read(&portal_kmemory)); - - kpr_routes_generation = 0; - kpr_proc_init(); - - rc = libcfs_nal_cmd_register(ROUTER, kpr_nal_cmd, NULL); - if (rc != 0) { - CERROR("Can't register nal cmd handler\n"); - return (rc); - } - - PORTAL_SYMBOL_REGISTER(kpr_router_interface); - return (0); -} - -MODULE_AUTHOR("Eric Barton"); -MODULE_DESCRIPTION("Kernel Portals Router v0.01"); -MODULE_LICENSE("GPL"); - -module_init (kpr_initialise); -module_exit (kpr_finalise); - -EXPORT_SYMBOL (kpr_router_interface); diff --git a/lnet/router/router.h b/lnet/router/router.h deleted file mode 100644 index 44f307a..0000000 --- a/lnet/router/router.h +++ /dev/null @@ -1,102 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * This file is part of Lustre, http://www.lustre.org - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - */ - -#ifndef _KPTLROUTER_H -#define _KPTLROUTER_H -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_PTLROUTER - -#include -#include -#include -#include - -typedef struct -{ - struct list_head kpne_list; - kpr_nal_interface_t kpne_interface; - atomic_t kpne_refcount; - int kpne_shutdown; -} kpr_nal_entry_t; - -typedef struct -{ - struct list_head kpge_list; - atomic_t kpge_weight; - time_t kpge_timestamp; - int kpge_alive; - int kpge_nalid; - int kpge_refcount; - ptl_nid_t kpge_nid; -} kpr_gateway_entry_t; - -typedef struct -{ - struct list_head kpre_list; - kpr_gateway_entry_t *kpre_gateway; - ptl_nid_t kpre_lo_nid; - ptl_nid_t kpre_hi_nid; -} kpr_route_entry_t; - -typedef struct -{ - work_struct_t kpru_tq; - int kpru_nal_id; - ptl_nid_t kpru_nid; - int kpru_alive; - time_t kpru_when; -} kpr_upcall_t; - -extern int kpr_register_nal (kpr_nal_interface_t *nalif, void **argp); -extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob, - ptl_nid_t *gateway_nidp); -extern kpr_nal_entry_t *kpr_find_nal_entry_locked (int nal_id); -extern void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd); -extern void kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error); -extern void kpr_nal_notify (void *arg, ptl_nid_t peer, - int alive, time_t when); -extern void kpr_shutdown_nal (void *arg); -extern void kpr_deregister_nal (void *arg); - -extern void kpr_proc_init (void); -extern void kpr_proc_fini (void); - -extern unsigned int kpr_routes_generation; -extern unsigned long long kpr_fwd_bytes; -extern unsigned long kpr_fwd_packets; -extern unsigned long kpr_fwd_errors; -extern atomic_t kpr_queue_depth; - -extern struct list_head kpr_routes; -extern rwlock_t kpr_rwlock; - -#endif /* _KPLROUTER_H */ diff --git a/lnet/tests/Makefile.in b/lnet/tests/Makefile.in index c309db0..5860c3e 100644 --- a/lnet/tests/Makefile.in +++ b/lnet/tests/Makefile.in @@ -1,16 +1,14 @@ -MODULES := pingsrv pingcli spingsrv spingcli +MODULES := pingsrv pingcli +#utcli utsrv pingsrv-objs := ping_srv.o ifeq ($(PATCHLEVEL),6) pingcli-objs := ping_cli.o -spingsrv-objs := sping_srv.o -spingcli-objs := sping_cli.o +#utcli-objs := ut_cli.o +#utsrv-objs := ut_srv.o else ping%.c: ping_%.c ln -sf $< $@ - -sping%.c: sping_%.c - ln -sf $< $@ endif @INCLUDE_RULES@ diff --git a/lnet/tests/autoMakefile.am b/lnet/tests/autoMakefile.am index f611868..f187255 100644 --- a/lnet/tests/autoMakefile.am +++ b/lnet/tests/autoMakefile.am @@ -4,17 +4,39 @@ # See the file COPYING in this distribution if MODULES -if !CRAY_PORTALS if TESTS if LINUX noinst_DATA := pingsrv$(KMODEXT) pingcli$(KMODEXT) -noinst_DATA += spingsrv$(KMODEXT) spingcli$(KMODEXT) +#noinst_DATA += utsrv$(KMODEXT) utcli$(KMODEXT) endif -endif -endif -endif +if DARWIN +macos_PROGRAMS := pingcli +#macos_PROGRAMS := pingsrv + +pingcli_SOURCES := ping_cli.c + +pingcli_CFLAGS := $(EXTRA_KCFLAGS) +pingcli_LDFLAGS := $(EXTRA_KLDFLAGS) +pingcli_LDADD := $(EXTRA_KLIBS) + +#pingsrv_SOURCES := ping_srv.c + +#pingsrv_CFLAGS := $(EXTRA_KCFLAGS) +#pingsrv_LDFLAGS := $(EXTRA_KLDFLAGS) +#pingsrv_LDADD := $(EXTRA_KLIBS) + +plist_DATA := ping_cli/Info.plist +#plist_DATA := ping_srv/Info.plist + +install_data_hook := fix-kext-ownership +endif # Darwin + +endif # TEST +endif # MODULE +install-data-hook: $(install_data_hook) -MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ pingsrv.c pingcli.c spingsrv.c spingcli.c -DIST_SOURCES = ping_srv.c ping_cli.c sping_srv.c sping_cli.c 
ping.h +MOSTLYCLEANFILES = @MOSTLYCLEANFILES@ pingsrv.c pingcli.c +DIST_SOURCES = ping_srv.c ping_cli.c ping.h +#ut_cli.c ut_srv.c ut.h diff --git a/lnet/tests/ping.h b/lnet/tests/ping.h index ef937af..1dde8bc 100644 --- a/lnet/tests/ping.h +++ b/lnet/tests/ping.h @@ -2,7 +2,7 @@ #define _KPING_INCLUDED #include -#include +#include #define PTL_PING_IN_SIZE 256 // n packets per buffer @@ -28,7 +28,7 @@ (((index + 1) >= PTL_PING_IN_BUFFERS) ? 0 : (index + 1)) #define PDEBUG(str, err) \ - CERROR ("%s: error=%s (%d)\n", str, ptl_err_str[err], err) + CERROR ("%s: error=(%d)\n", str, err) /* Ping data to be passed via the ioctl to kernel space */ @@ -36,36 +36,37 @@ #if __KERNEL__ struct pingsrv_data { - - ptl_handle_ni_t ni; - ptl_handle_me_t me; - ptl_handle_eq_t eq; - void *in_buf; - ptl_process_id_t my_id; - ptl_process_id_t id_local; - ptl_md_t mdin; - ptl_md_t mdout; - ptl_handle_md_t mdin_h; - ptl_handle_md_t mdout_h; - ptl_event_t evnt; + lnet_handle_me_t me; + lnet_handle_eq_t eq; + void *in_buf; + lnet_process_id_t my_id; + lnet_process_id_t id_local; + lnet_md_t mdin; + lnet_md_t mdout; + lnet_handle_md_t mdin_h; + lnet_handle_md_t mdout_h; + lnet_event_t evnt; cfs_task_t *tsk; }; /* struct pingsrv_data */ struct pingcli_data { - struct portal_ioctl_data *args; - ptl_handle_me_t me; - ptl_handle_eq_t eq; + int count; + int size; + lnet_nid_t nid; + int timeout; + lnet_handle_me_t me; + lnet_handle_eq_t eq; char *inbuf; char *outbuf; - ptl_process_id_t myid; - ptl_process_id_t id_local; - ptl_process_id_t id_remote; - ptl_md_t md_in_head; - ptl_md_t md_out_head; - ptl_handle_md_t md_in_head_h; - ptl_handle_md_t md_out_head_h; - ptl_event_t ev; + lnet_process_id_t myid; + lnet_process_id_t id_local; + lnet_process_id_t id_remote; + lnet_md_t md_in_head; + lnet_md_t md_out_head; + lnet_handle_md_t md_in_head_h; + lnet_handle_md_t md_out_head_h; + lnet_event_t ev; cfs_task_t *tsk; }; /* struct pingcli_data */ diff --git a/lnet/tests/ping_cli.c b/lnet/tests/ping_cli.c index 2995b46..eaf83c0 100644 --- a/lnet/tests/ping_cli.c +++ b/lnet/tests/ping_cli.c @@ -26,9 +26,9 @@ #define DEBUG_SUBSYSTEM S_PINGER #include -#include +#include #include "ping.h" -/* int portal_debug = D_PING_CLI; */ +/* int libcfs_debug = D_PING_CLI; */ #define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval)) @@ -42,9 +42,8 @@ static struct pingcli_data *client = NULL; static int count = 0; static void -pingcli_shutdown(ptl_handle_ni_t nih, int err) +pingcli_shutdown(int err) { - struct portal_ioctl_data *args = client->args; int rc; /* Yes, we are intentionally allowing us to fall through each @@ -54,32 +53,32 @@ pingcli_shutdown(ptl_handle_ni_t nih, int err) switch (err) { case 1: /* Unlink any memory descriptors we may have used */ - if ((rc = PtlMDUnlink (client->md_out_head_h))) - PDEBUG ("PtlMDUnlink", rc); + if ((rc = LNetMDUnlink (client->md_out_head_h))) + PDEBUG ("LNetMDUnlink", rc); case 2: - if ((rc = PtlMDUnlink (client->md_in_head_h))) - PDEBUG ("PtlMDUnlink", rc); + if ((rc = LNetMDUnlink (client->md_in_head_h))) + PDEBUG ("LNetMDUnlink", rc); /* Free the event queue */ - if ((rc = PtlEQFree (client->eq))) - PDEBUG ("PtlEQFree", rc); + if ((rc = LNetEQFree (client->eq))) + PDEBUG ("LNetEQFree", rc); - if ((rc = PtlMEUnlink (client->me))) - PDEBUG ("PtlMEUnlink", rc); + if ((rc = LNetMEUnlink (client->me))) + PDEBUG ("LNetMEUnlink", rc); case 3: - PtlNIFini(nih); + LNetNIFini(); case 4: /* Free our buffers */ if (client->outbuf != NULL) - PORTAL_FREE (client->outbuf, STDSIZE + args->ioc_size); 
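pingcli_shutdown above relies on deliberate switch fall-through: its argument names the deepest setup stage that succeeded, and each later case releases one more layer, so a single function serves every error path plus normal exit. Reduced to its shape (the release_* helpers are hypothetical stand-ins for the LNetMDUnlink/LNetEQFree/LNetNIFini/LIBCFS_FREE calls above):

        extern void release_md(void);   /* hypothetical helpers */
        extern void release_eq(void);
        extern void release_ni(void);
        extern void free_buffers(void);

        /* 'stage' = deepest setup step that succeeded; each case unwinds
         * one layer and deliberately falls through to the next. */
        static void teardown(int stage)
        {
                switch (stage) {
                case 1:
                        release_md();           /* newest resource first */
                        /* fall through */
                case 2:
                        release_eq();
                        /* fall through */
                case 3:
                        release_ni();
                        /* fall through */
                case 4:
                        free_buffers();         /* oldest resource last */
                }
        }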
+ LIBCFS_FREE (client->outbuf, STDSIZE + client->size); if (client->inbuf != NULL) - PORTAL_FREE (client->inbuf, - (args->ioc_size + STDSIZE) * args->ioc_count); + LIBCFS_FREE (client->inbuf, + (client->size + STDSIZE) * client->count); if (client != NULL) - PORTAL_FREE (client, + LIBCFS_FREE (client, sizeof(struct pingcli_data)); } @@ -87,12 +86,12 @@ pingcli_shutdown(ptl_handle_ni_t nih, int err) CDEBUG (D_OTHER, "ping client released resources\n"); } /* pingcli_shutdown() */ -static void pingcli_callback(ptl_event_t *ev) +static void pingcli_callback(lnet_event_t *ev) { int i; unsigned magic; - i = __le32_to_cpu(*(int *)(ev->md.start + ev->offset + sizeof(unsigned))); - magic = __le32_to_cpu(*(int *)(ev->md.start + ev->offset)); + i = __le32_to_cpu(*(int *)((char *)ev->md.start + ev->offset + sizeof(unsigned))); + magic = __le32_to_cpu(*(int *)((char *)ev->md.start + ev->offset)); if(magic != 0xcafebabe) { CERROR("Unexpected response %x\n", magic); @@ -105,122 +104,121 @@ static void pingcli_callback(ptl_event_t *ev) } -static struct pingcli_data * -pingcli_start(struct portal_ioctl_data *args) +static void +pingcli_start(struct libcfs_ioctl_data *args) { - ptl_handle_ni_t nih = PTL_INVALID_HANDLE; unsigned ping_head_magic = __cpu_to_le32(PING_HEADER_MAGIC); int rc; struct timeval tv1, tv2; - char str[PTL_NALFMT_SIZE]; client->tsk = cfs_current(); - client->args = args; - CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64" (%s), \ - nal %x, size %u, count: %u, timeout: %u\n", - args->ioc_nid, - portals_nid2str(args->ioc_nal, args->ioc_nid, str), - args->ioc_nal, args->ioc_size, - args->ioc_count, args->ioc_timeout); + client->nid = args->ioc_nid; + client->count = args->ioc_count; + client->size = args->ioc_u32[0]; + client->timeout = args->ioc_u32[1]; + + CDEBUG (D_OTHER, "pingcli_setup args: nid %s (%s), \ + size %u, count: %u, timeout: %u\n", + libcfs_nid2str(client->nid), + libcfs_nid2str(client->nid), + client->size, client->count, client->timeout); - PORTAL_ALLOC (client->outbuf, STDSIZE + args->ioc_size) ; + LIBCFS_ALLOC (client->outbuf, STDSIZE + client->size) ; if (client->outbuf == NULL) { CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); - pingcli_shutdown (nih, 4); - return (NULL); + pingcli_shutdown (4); + return; } - PORTAL_ALLOC (client->inbuf, - (args->ioc_size + STDSIZE) * args->ioc_count); + LIBCFS_ALLOC (client->inbuf, + (client->size + STDSIZE) * client->count); if (client->inbuf == NULL) { CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); - pingcli_shutdown (nih, 4); - return (NULL); + pingcli_shutdown (4); + return; } - /* Aquire and initialize the proper nal for portals. */ - rc = PtlNIInit(args->ioc_nal, 0, NULL, NULL, &nih); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) + rc = LNetNIInit(0); + if (rc != 0 && rc != 1) { - CERROR ("NAL %x not loaded\n", args->ioc_nal); - pingcli_shutdown (nih, 4); - return (NULL); + CERROR ("LNetNIInit: error %d\n", rc); + pingcli_shutdown (4); + return; } /* Based on the initialization aquire our unique portal ID. 
*/ - if ((rc = PtlGetId (nih, &client->myid))) + if ((rc = LNetGetId (1, &client->myid))) { - CERROR ("PtlGetId error %d\n", rc); - pingcli_shutdown (nih, 2); - return (NULL); + CERROR ("LNetGetId error %d\n", rc); + pingcli_shutdown (2); + return; } /* Setup the local match entries */ - client->id_local.nid = PTL_NID_ANY; - client->id_local.pid = PTL_PID_ANY; + client->id_local.nid = LNET_NID_ANY; + client->id_local.pid = LNET_PID_ANY; /* Setup the remote match entries */ - client->id_remote.nid = args->ioc_nid; + client->id_remote.nid = client->nid; client->id_remote.pid = 0; - if ((rc = PtlMEAttach (nih, PTL_PING_CLIENT, - client->id_local, 0, ~0, PTL_RETAIN, - PTL_INS_AFTER, &client->me))) + if ((rc = LNetMEAttach (PTL_PING_CLIENT, + client->id_local, 0, ~0, LNET_RETAIN, + LNET_INS_AFTER, &client->me))) { - CERROR ("PtlMEAttach error %d\n", rc); - pingcli_shutdown (nih, 2); - return (NULL); + CERROR ("LNetMEAttach error %d\n", rc); + pingcli_shutdown (2); + return; } /* Allocate the event queue for this network interface */ - if ((rc = PtlEQAlloc (nih, 64, pingcli_callback, &client->eq))) + if ((rc = LNetEQAlloc (64, pingcli_callback, &client->eq))) { - CERROR ("PtlEQAlloc error %d\n", rc); - pingcli_shutdown (nih, 2); - return (NULL); + CERROR ("LNetEQAlloc error %d\n", rc); + pingcli_shutdown (2); + return; } - count = args->ioc_count; + count = client->count; client->md_in_head.start = client->inbuf; - client->md_in_head.length = (args->ioc_size + STDSIZE) - * count; - client->md_in_head.threshold = PTL_MD_THRESH_INF; - client->md_in_head.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; + client->md_in_head.length = (client->size + STDSIZE) * count; + client->md_in_head.threshold = LNET_MD_THRESH_INF; + client->md_in_head.options = LNET_MD_OP_PUT; client->md_in_head.user_ptr = NULL; client->md_in_head.eq_handle = client->eq; - memset (client->inbuf, 0, (args->ioc_size + STDSIZE) * count); + memset (client->inbuf, 0, (client->size + STDSIZE) * count); /* Attach the incoming buffer */ - if ((rc = PtlMDAttach (client->me, client->md_in_head, - PTL_UNLINK, &client->md_in_head_h))) { - CERROR ("PtlMDAttach error %d\n", rc); - pingcli_shutdown (nih, 1); - return (NULL); + if ((rc = LNetMDAttach (client->me, client->md_in_head, + LNET_UNLINK, &client->md_in_head_h))) { + CERROR ("LNetMDAttach error %d\n", rc); + pingcli_shutdown (1); + return; } /* Setup the outgoing ping header */ client->md_out_head.start = client->outbuf; - client->md_out_head.length = STDSIZE + args->ioc_size; - client->md_out_head.threshold = args->ioc_count; - client->md_out_head.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; + client->md_out_head.length = STDSIZE + client->size; + client->md_out_head.threshold = client->count; + client->md_out_head.options = LNET_MD_OP_PUT; client->md_out_head.user_ptr = NULL; - client->md_out_head.eq_handle = PTL_EQ_NONE; + client->md_out_head.eq_handle = LNET_EQ_NONE; memcpy (client->outbuf, &ping_head_magic, sizeof(ping_head_magic)); count = 0; /* Bind the outgoing ping header */ - if ((rc=PtlMDBind (nih, client->md_out_head, - PTL_UNLINK, &client->md_out_head_h))) { - CERROR ("PtlMDBind error %d\n", rc); - pingcli_shutdown (nih, 1); - return NULL; + if ((rc=LNetMDBind (client->md_out_head, + LNET_UNLINK, &client->md_out_head_h))) { + CERROR ("LNetMDBind error %d\n", rc); + pingcli_shutdown (1); + return; } - while ((args->ioc_count - count)) { + while ((client->count - count)) { unsigned __count; __count = __cpu_to_le32(count); @@ -232,16 +230,19 @@ 
pingcli_start(struct portal_ioctl_data *args) memcpy(client->outbuf+sizeof(unsigned)+sizeof(unsigned),&tv1, sizeof(struct timeval)); - if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ, - client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) { - PDEBUG ("PtlPut (header)", rc); - pingcli_shutdown (nih, 1); - return NULL; + if((rc = LNetPut (LNET_NID_ANY, client->md_out_head_h, + LNET_NOACK_REQ, + client->id_remote, PTL_PING_SERVER, + 0, 0, 0))) { + PDEBUG ("LNetPut (header)", rc); + pingcli_shutdown (1); + return; } CWARN ("Lustre: sent msg no %d.\n", count); - set_current_state (TASK_INTERRUPTIBLE); - rc = schedule_timeout (cfs_time_seconds(args->ioc_timeout)); + set_current_state (CFS_TASK_INTERRUPTIBLE); + rc = cfs_schedule_timeout (CFS_TASK_INTERRUPTIBLE, + cfs_time_seconds(client->timeout)); if (rc == 0) { CERROR ("timeout .....\n"); } else { @@ -253,18 +254,16 @@ pingcli_start(struct portal_ioctl_data *args) count++; } - pingcli_shutdown (nih, 2); + pingcli_shutdown (2); - /* Success! */ - return NULL; } /* pingcli_setup() */ /* called by the portals_ioctl for ping requests */ -int kping_client(struct portal_ioctl_data *args) +int kping_client(struct libcfs_ioctl_data *args) { - PORTAL_ALLOC (client, sizeof(struct pingcli_data)); + LIBCFS_ALLOC (client, sizeof(struct pingcli_data)); if (client == NULL) { CERROR ("Unable to allocate client structure\n"); diff --git a/lnet/tests/ping_cli/Info.plist b/lnet/tests/ping_cli/Info.plist index 2dfc37b..4ecee0f 100644 --- a/lnet/tests/ping_cli/Info.plist +++ b/lnet/tests/ping_cli/Info.plist @@ -5,11 +5,11 @@ CFBundleDevelopmentRegion English CFBundleExecutable - ping_cli + pingcli CFBundleIconFile CFBundleIdentifier - com.clusterfs.lustre.portals.tests.ping_cli + com.clusterfs.lustre.pingcli CFBundleInfoDictionaryVersion 6.0 CFBundlePackageType @@ -17,20 +17,22 @@ CFBundleSignature ???? CFBundleVersion - 1.0.0d1 + 1.0.1 + OSBundleCompatibleVersion + 1.0.0 OSBundleLibraries - com.apple.kernel.bsd - 1.1 - com.apple.kernel.iokit - 1.0.0b1 - com.apple.kernel.mach - 1.0.0b1 - com.clusterfs.lustre.portals.libcfs + com.apple.kpi.bsd + 8.0.0b1 + com.apple.kpi.libkern + 8.0.0b1 + com.apple.kpi.mach + 8.0.0b1 + com.apple.kpi.unsupported + 8.0.0b1 + com.clusterfs.lustre.libcfs 1.0.0 - com.clusterfs.lustre.portals.portals - 1.0.0 - com.clusterfs.lustre.portals.knals.ksocknal + com.clusterfs.lustre.lnet 1.0.0 diff --git a/lnet/tests/ping_cli/winnt-pingcli.c b/lnet/tests/ping_cli/winnt-pingcli.c new file mode 100644 index 0000000..7c9a1a1 --- /dev/null +++ b/lnet/tests/ping_cli/winnt-pingcli.c @@ -0,0 +1,634 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Matt Wu + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
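The converted client fixes the setup order for this generation of the LNet API: initialise the NI, attach a match entry for replies, allocate an event queue, attach the receive MD, bind the send MD, then loop on LNetPut. Compressed to a skeleton — error handling and teardown elided, names and signatures taken from this patch (including pingcli_callback) rather than any later API revision:

        /* Client setup/send path as converted above (sketch only;
         * assumes this patch's lnet headers and types). */
        static void ping_once(lnet_process_id_t target, int portal,
                              lnet_md_t mdin, lnet_md_t mdout)
        {
                lnet_handle_me_t  me;
                lnet_handle_eq_t  eq;
                lnet_handle_md_t  mdin_h, mdout_h;
                lnet_process_id_t any = { LNET_NID_ANY, LNET_PID_ANY };

                LNetNIInit(0);                    /* rc 0 or 1 is OK */
                LNetMEAttach(portal, any, 0, ~0,  /* match any reply */
                             LNET_RETAIN, LNET_INS_AFTER, &me);
                LNetEQAlloc(64, pingcli_callback, &eq);
                mdin.eq_handle = eq;
                LNetMDAttach(me, mdin, LNET_UNLINK, &mdin_h);
                mdout.eq_handle = LNET_EQ_NONE;   /* no send events  */
                LNetMDBind(mdout, LNET_UNLINK, &mdout_h);
                LNetPut(LNET_NID_ANY, mdout_h, LNET_NOACK_REQ,
                        target, portal, 0, 0, 0);
        }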
+ */ + +#define DEBUG_SUBSYSTEM S_LNET + +/* + * Included Headers + */ + + +#include + + +/* libcfs module init/exit routines */ +DECLARE_INIT(init_libcfs_module); +DECLARE_EXIT(exit_libcfs_module); + +/* portal module init/exit routines */ +DECLARE_INIT(init_lnet); +DECLARE_EXIT(fini_lnet); + +/* tdinal module init/exit routines */ +DECLARE_INIT(ksocknal_module_init); +DECLARE_EXIT(ksocknal_module_fini); + +/* pingcli module init/exit routines */ +DECLARE_INIT(pingcli_init); +DECLARE_EXIT(pingcli_cleanup); + + +/* pingsrv module init/exit routines */ +DECLARE_INIT(pingsrv_init); +DECLARE_EXIT(pingsrv_cleanup); + +/* + * structure definitions + */ + + +#define LUSTRE_PING_VERSION 0x00010000 /* ping srv/cli version: 0001.0000 */ + +#define LUSTRE_PING_DEVICE L"\\Device\\LNET" /* device object name */ +#define LUSTRE_PING_SYMLNK L"\\DosDevices\\LNET" /* user-visible name for the device*/ + +typedef struct _DEVICE_EXTENSION +{ + BOOLEAN bProcFS; + +} DEVICE_EXTENSION, *PDEVICE_EXTENSION; + + +/* + * global definitions + */ + +PDEVICE_OBJECT PingObject = NULL; /* ping device object */ +PDEVICE_OBJECT ProcObject = NULL; /* procfs emulator device */ + + +/* + * common routines + */ + + +// +// complete Irp request ... +// + +NTSTATUS +UTCompleteIrp( + PIRP Irp, + NTSTATUS Status, + ULONG Info + ) +{ + Irp->IoStatus.Status = Status; + Irp->IoStatus.Information = Info; + IoCompleteRequest(Irp,IO_NO_INCREMENT); + + return Status; +} + +// +// Open/Create Device ... +// + +NTSTATUS +UTCreate( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp + ) +{ + KdPrint(("UTCreate: DeviceCreate ...\n")); + + return UTCompleteIrp(Irp,STATUS_SUCCESS,0); +} + +// +// Close Devcie ... +// + +NTSTATUS +UTClose( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp) +{ + KdPrint(("UTClose: Device Closed.\n")); + + return UTCompleteIrp(Irp, STATUS_SUCCESS, 0); + + UNREFERENCED_PARAMETER(DeviceObject); +} + + + +NTSTATUS +UTShutdown( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp + ) +{ + KdPrint(("UTShutdown: shuting TdiSock ...\n")); + + return UTCompleteIrp(Irp, STATUS_SUCCESS, 0); + + UNREFERENCED_PARAMETER(DeviceObject); +} + +// +// driver frame Routines ... 
+// + + +NTSTATUS +UTDeviceControl( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp + ) +{ + NTSTATUS Status = STATUS_INVALID_DEVICE_REQUEST; + PIO_STACK_LOCATION IrpSp; + + ULONG ControlCode; + ULONG InputLength; + ULONG OutputLength; + + PVOID lpvInBuffer; + + KdPrint(("UTDeviceControl: Device Ioctl ...\n")); + + Irp->IoStatus.Information = 0; + IrpSp = IoGetCurrentIrpStackLocation(Irp); + + ControlCode = IrpSp->Parameters.DeviceIoControl.IoControlCode; + InputLength = IrpSp->Parameters.DeviceIoControl.InputBufferLength; + OutputLength = IrpSp->Parameters.DeviceIoControl.OutputBufferLength; + lpvInBuffer = Irp->AssociatedIrp.SystemBuffer; + + ASSERT (IrpSp->MajorFunction == IRP_MJ_DEVICE_CONTROL); + + switch (ControlCode) + { + case IOCTL_LIBCFS_VERSION: + + *((ULONG *)lpvInBuffer) = (ULONG)(LUSTRE_PING_VERSION); + Irp->IoStatus.Information = sizeof(ULONG); + Status = STATUS_SUCCESS; + break; + + default: + break; + } + + Irp->IoStatus.Status = Status; + + IoCompleteRequest(Irp, IO_NO_INCREMENT); + + KdPrint(("UTDeviceControl: Device Ioctl returned.\n")); + + return Status; +} + +NTSTATUS +ProcCreate( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp + ) +{ + NTSTATUS Status; + PIO_STACK_LOCATION IrpSp; + + FILE_FULL_EA_INFORMATION * ea; + cfs_file_t * fp; + + KdPrint(("ProcCreate: Proc device is being opened ...\n")); + + IrpSp = IoGetCurrentIrpStackLocation(Irp); + ea = (PFILE_FULL_EA_INFORMATION) Irp->AssociatedIrp.SystemBuffer; + + if (!ea) { + Status = STATUS_INVALID_PARAMETER; + } else { + fp = lustre_open_file(&ea->EaName[0]); + if (!fp) { + Status = STATUS_OBJECT_NAME_NOT_FOUND; + } else { + IrpSp->FileObject->FsContext = fp; + IrpSp->FileObject->FsContext2 = fp->private_data; + Status = STATUS_SUCCESS; + } + } + + return UTCompleteIrp(Irp, Status, 0); +} + +// +// Close Devcie ... 
+// + +NTSTATUS +ProcClose( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp) +{ + PIO_STACK_LOCATION IrpSp; + + cfs_file_t * fp; + + KdPrint(("ProcClose: Proc device object is to be closed.\n")); + + IrpSp = IoGetCurrentIrpStackLocation(Irp); + + fp = (cfs_file_t *) IrpSp->FileObject->FsContext; + + ASSERT(fp != NULL); + ASSERT(IrpSp->FileObject->FsContext2 == fp->private_data); + + lustre_close_file(fp); + + return UTCompleteIrp(Irp, STATUS_SUCCESS, 0); + + UNREFERENCED_PARAMETER(DeviceObject); +} + +/* + * proc frame routines + */ + +NTSTATUS +ProcDeviceControl( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp + ) +{ + NTSTATUS Status = STATUS_INVALID_DEVICE_REQUEST; + PIO_STACK_LOCATION IrpSp; + + ULONG ControlCode; + ULONG InputLength; + ULONG OutputLength; + + PVOID lpvInBuffer; + + KdPrint(("ProcDeviceControl: Proc device ioctling ...\n")); + + Irp->IoStatus.Information = 0; + IrpSp = IoGetCurrentIrpStackLocation(Irp); + + ControlCode = IrpSp->Parameters.DeviceIoControl.IoControlCode; + InputLength = IrpSp->Parameters.DeviceIoControl.InputBufferLength; + OutputLength = IrpSp->Parameters.DeviceIoControl.OutputBufferLength; + lpvInBuffer = Irp->AssociatedIrp.SystemBuffer; + + ASSERT (IrpSp->MajorFunction == IRP_MJ_DEVICE_CONTROL); + + switch (ControlCode) + { + case IOCTL_LIBCFS_VERSION: + + *((ULONG *)lpvInBuffer) = (ULONG)(LUSTRE_PING_VERSION); + Irp->IoStatus.Information = sizeof(ULONG); + + Status = STATUS_SUCCESS; + + break; + + case IOCTL_LIBCFS_ENTRY: + { + int rc = 0; + cfs_file_t * fp; + + fp = (cfs_file_t *) IrpSp->FileObject->FsContext; + + if (!fp) { + rc = -EINVAL; + } else { + rc = lustre_ioctl_file(fp, (PCFS_PROC_IOCTL) (lpvInBuffer)); + } + + if (rc == 0) { + Irp->IoStatus.Information = InputLength; + Status = STATUS_SUCCESS; + } + } + } + + Irp->IoStatus.Status = Status; + + IoCompleteRequest(Irp, IO_NO_INCREMENT); + + KdPrint(("ProcDeviceControl: Proc device ioctl returned with status = %xh.\n", Status)); + + return Status; +} + + + +NTSTATUS +ProcReadWrite (PDEVICE_OBJECT DeviceObject, PIRP Irp) +{ + PIO_STACK_LOCATION IrpSp; + NTSTATUS Status; + + cfs_file_t * fp; + int rc; + PCHAR buf; + + IrpSp = IoGetCurrentIrpStackLocation(Irp); + if (Irp->MdlAddress) { + buf = MmGetSystemAddressForMdlSafe( + Irp->MdlAddress, + NormalPagePriority); + } else { + buf = Irp->AssociatedIrp.SystemBuffer; + } + + if (buf == NULL) { + Status = STATUS_SUCCESS; + rc = 0; + } else { + fp = (cfs_file_t *) IrpSp->FileObject->FsContext; + + if (!fp) { + Status = STATUS_INVALID_PARAMETER; + goto errorout; + } + + if (IrpSp->MajorFunction == IRP_MJ_READ) { + rc = lustre_read_file( + fp, IrpSp->Parameters.Read.ByteOffset.LowPart, + IrpSp->Parameters.Read.Length, buf); + } else { + rc = lustre_write_file( + fp, IrpSp->Parameters.Write.ByteOffset.LowPart, + IrpSp->Parameters.Write.Length, buf); + } + if (rc < 0) { + cfs_enter_debugger(); + Status = STATUS_UNSUCCESSFUL; + } else { + Status = STATUS_SUCCESS; + } + } + + +errorout: + return UTCompleteIrp(Irp, Status, rc); +} + + +// +// common dispatch routines +// + +NTSTATUS +UTDispatchRequest( + IN PDEVICE_OBJECT DeviceObject, + IN PIRP Irp + ) +{ + NTSTATUS Status; + PIO_STACK_LOCATION IrpSp; + + Status = STATUS_INVALID_DEVICE_REQUEST; + + __try { + + IrpSp = IoGetCurrentIrpStackLocation(Irp); + + switch (IrpSp->MajorFunction) { + + case IRP_MJ_CREATE: + if (DeviceObject == PingObject) { + Status = UTCreate(DeviceObject, Irp); + } else if (DeviceObject == ProcObject) { + Status = ProcCreate(DeviceObject, Irp); + } + break; + + case IRP_MJ_CLOSE: 
+ if (DeviceObject == PingObject) { + Status = UTClose(DeviceObject, Irp); + } else if (DeviceObject == ProcObject) { + Status = ProcClose(DeviceObject, Irp); + } + break; + + case IRP_MJ_READ: + case IRP_MJ_WRITE: + if (DeviceObject == ProcObject) { + Status = ProcReadWrite(DeviceObject, Irp); + } + break; + + case IRP_MJ_DEVICE_CONTROL: + if (DeviceObject == PingObject) { + Status = UTDeviceControl(DeviceObject, Irp); + } else if (DeviceObject == ProcObject) { + Status = ProcDeviceControl(DeviceObject, Irp); + } + break; + + case IRP_MJ_SHUTDOWN: + Status = UTShutdown(DeviceObject, Irp); + break; + + default: + + KdPrint(("UTDispatchRequest: Major Function: %xh is not supported.\n", + IrpSp->MajorFunction)); + UTCompleteIrp(Irp, Status, 0); + break; + } + } + + __finally { + } + + return Status; +} + +// +// create a device object and a dosdevice symbol link +// + +PDEVICE_OBJECT +CreateDevice( + IN PDRIVER_OBJECT DriverObject, + IN PWCHAR DeviceName, + IN PWCHAR SymlnkName, + IN BOOLEAN bProcFS + ) +{ + NTSTATUS Status; + + UNICODE_STRING NtDevName; + UNICODE_STRING Win32DevName; + + PDEVICE_EXTENSION DeviceExtension; + PDEVICE_OBJECT DeviceObject; + + /* create the device object with the specified name */ + + RtlInitUnicodeString(&NtDevName, DeviceName); + + Status = IoCreateDevice( + DriverObject, + sizeof(DEVICE_EXTENSION), + &NtDevName, + FILE_DEVICE_UNKNOWN, + 0, + FALSE, + &DeviceObject ); + + if (!NT_SUCCESS(Status)) { + + cfs_enter_debugger(); + return NULL; + } + + /* create the symlink to make the device visible to user */ + + RtlInitUnicodeString(&Win32DevName, SymlnkName); + + Status = IoCreateSymbolicLink(&Win32DevName, &NtDevName); + + if (!NT_SUCCESS(Status)) { + + IoDeleteDevice(DeviceObject); + return NULL; + } + + DeviceExtension = (PDEVICE_EXTENSION)DeviceObject->DeviceObjectExtension; + DeviceExtension->bProcFS = bProcFS; + + DeviceObject->Flags |= DO_BUFFERED_IO; + DeviceObject->Flags &= ~DO_DEVICE_INITIALIZING; + + return DeviceObject; +} + + +// +// DriverEntry +// + +NTSTATUS DriverEntry( + IN PDRIVER_OBJECT DriverObject, + IN PUNICODE_STRING RegistryPath + ) +{ + KdPrint(("Lustre ping test: Build Time: " __DATE__ " " __TIME__ "\n")); + KdPrint(("Lustre ping test: DriverEntry ... 
\n")); + + /* initialize libcfs module */ + if (module_init_libcfs_module() != 0) { + KdPrint(("ping: error initialize module: libcfs ...\n")); + goto errorout; + } + + /* initialize lnet module */ + if (module_init_lnet() != 0) { + module_exit_libcfs_module(); + KdPrint(("ping: error initialize module: lnet ...\n")); + goto errorout; + } + + /* initialize tdinal module */ + if (module_ksocknal_module_init() != 0) { + module_fini_lnet(); + module_exit_libcfs_module(); + KdPrint(("ping: error initialize module: tdilnd ...\n")); + goto errorout; + } + +#if defined(LUSTRE_PING_CLI) + /* initialize pingcli module */ + if (module_pingcli_init() != 0) { + module_ksocknal_module_fini(); + module_fini_lnet(); + module_exit_libcfs_module(); + KdPrint(("ping: error initialize module: pingcli ...\n")); + goto errorout; + } +#endif + +#if defined(LUSTRE_PING_SRV) + /* initialize pingsrv module */ + if (module_pingsrv_init() != 0) { + module_ksocknal_module_fini(); + module_fini_lnet(); + module_exit_libcfs_module(); + KdPrint(("ping: error initialize module: pingsrv ...\n")); + goto errorout; + } +#endif + + /* create the ping device object */ + PingObject = CreateDevice( + DriverObject, + LUSTRE_PING_DEVICE, + LUSTRE_PING_SYMLNK, + FALSE ); + if (!PingObject) { +#if defined(LUSTRE_PING_CLI) + module_pingcli_cleanup(); +#endif +#if defined(LUSTRE_PING_SRV) + module_pingsrv_cleanup(); +#endif + module_ksocknal_module_fini(); + module_fini_lnet(); + module_exit_libcfs_module(); + + return STATUS_INSUFFICIENT_RESOURCES; + } + + /* create the libcfs proc fs emultor device object */ + ProcObject = CreateDevice( + DriverObject, + LUSTRE_PROC_DEVICE, + LUSTRE_PROC_SYMLNK, + TRUE ); + if (!ProcObject) { + + IoDeleteDevice(PingObject); +#if defined(LUSTRE_PING_CLI) + module_pingcli_cleanup(); +#endif +#if defined(LUSTRE_PING_SRV) + module_pingsrv_cleanup(); +#endif + module_ksocknal_module_fini(); + module_fini_lnet(); + module_exit_libcfs_module(); + return STATUS_INSUFFICIENT_RESOURCES; + } + + /* initialize the driver callback routines */ + + DriverObject->MajorFunction[IRP_MJ_CREATE] = UTDispatchRequest; + DriverObject->MajorFunction[IRP_MJ_CLOSE] = UTDispatchRequest; + DriverObject->MajorFunction[IRP_MJ_READ] = UTDispatchRequest; + DriverObject->MajorFunction[IRP_MJ_WRITE] = UTDispatchRequest; + DriverObject->MajorFunction[IRP_MJ_SHUTDOWN] = UTDispatchRequest; + DriverObject->MajorFunction[IRP_MJ_DEVICE_CONTROL] = UTDispatchRequest; + + return STATUS_SUCCESS; + +errorout: + + cfs_enter_debugger(); + + return STATUS_UNSUCCESSFUL; +} diff --git a/lnet/tests/ping_srv.c b/lnet/tests/ping_srv.c index ae0d722..22eefbf 100644 --- a/lnet/tests/ping_srv.c +++ b/lnet/tests/ping_srv.c @@ -26,7 +26,7 @@ #define DEBUG_SUBSYSTEM S_PINGER #include -#include +#include #include "ping.h" #define STDSIZE (sizeof(int) + sizeof(int) + sizeof(struct timeval)) @@ -34,7 +34,6 @@ static unsigned ping_head_magic; static unsigned ping_bulk_magic; -static int nal = SOCKNAL; // Your NAL, static unsigned long packets_valid = 0; // Valid packets static int running = 1; atomic_t pkt; @@ -52,28 +51,28 @@ static void *pingsrv_shutdown(int err) switch (err) { case 1: /* Unlink any memory descriptors we may have used */ - if ((rc = PtlMDUnlink (server->mdin_h))) - PDEBUG ("PtlMDUnlink (out head buffer)", rc); + if ((rc = LNetMDUnlink (server->mdin_h))) + PDEBUG ("LNetMDUnlink (out head buffer)", rc); case 2: /* Free the event queue */ - if ((rc = PtlEQFree (server->eq))) - PDEBUG ("PtlEQFree", rc); + if ((rc = LNetEQFree (server->eq))) + 
PDEBUG ("LNetEQFree", rc); /* Unlink the client portal from the ME list */ - if ((rc = PtlMEUnlink (server->me))) - PDEBUG ("PtlMEUnlink", rc); + if ((rc = LNetMEUnlink (server->me))) + PDEBUG ("LNetMEUnlink", rc); case 3: - PtlNIFini (server->ni); + LNetNIFini (); case 4: case 5: if (server->in_buf != NULL) - PORTAL_FREE (server->in_buf, MAXSIZE); + LIBCFS_FREE (server->in_buf, MAXSIZE); if (server != NULL) - PORTAL_FREE (server, + LIBCFS_FREE (server, sizeof (struct pingsrv_data)); } @@ -89,17 +88,18 @@ int pingsrv_thread(void *arg) unsigned long magic; unsigned long ping_bulk_magic = __cpu_to_le32(0xcafebabe); - kportal_daemonize ("pingsrv"); + cfs_daemonize ("pingsrv"); server->tsk = cfs_current(); while (running) { - set_current_state (TASK_INTERRUPTIBLE); + set_current_state (CFS_TASK_INTERRUPTIBLE); if (atomic_read (&pkt) == 0) { - schedule_timeout (MAX_SCHEDULE_TIMEOUT); + cfs_schedule_timeout (CFS_TASK_INTERRUPTIBLE, + MAX_SCHEDULE_TIMEOUT); continue; } - magic = __le32_to_cpu(*((int *)(server->evnt.md.start + magic = __le32_to_cpu(*((int *)((char *)server->evnt.md.start + server->evnt.offset))); @@ -112,14 +112,14 @@ int pingsrv_thread(void *arg) server->mdout.length = server->evnt.rlength; server->mdout.start = server->in_buf; server->mdout.threshold = 1; - server->mdout.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; + server->mdout.options = LNET_MD_OP_PUT; server->mdout.user_ptr = NULL; - server->mdout.eq_handle = PTL_EQ_NONE; + server->mdout.eq_handle = LNET_EQ_NONE; /* Bind the outgoing buffer */ - if ((rc = PtlMDBind (server->ni, server->mdout, - PTL_UNLINK, &server->mdout_h))) { - PDEBUG ("PtlMDBind", rc); + if ((rc = LNetMDBind (server->mdout, + LNET_UNLINK, &server->mdout_h))) { + PDEBUG ("LNetMDBind", rc); pingsrv_shutdown (1); return 1; } @@ -128,19 +128,21 @@ int pingsrv_thread(void *arg) server->mdin.start = server->in_buf; server->mdin.length = MAXSIZE; server->mdin.threshold = 1; - server->mdin.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; + server->mdin.options = LNET_MD_OP_PUT; server->mdin.user_ptr = NULL; server->mdin.eq_handle = server->eq; - if ((rc = PtlMDAttach (server->me, server->mdin, - PTL_UNLINK, &server->mdin_h))) { - PDEBUG ("PtlMDAttach (bulk)", rc); + if ((rc = LNetMDAttach (server->me, server->mdin, + LNET_UNLINK, &server->mdin_h))) { + PDEBUG ("LNetMDAttach (bulk)", rc); CDEBUG (D_OTHER, "ping server resources allocated\n"); } - if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ, - server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0))) - PDEBUG ("PtlPut", rc); + if ((rc = LNetPut (server->evnt.target.nid, server->mdout_h, + LNET_NOACK_REQ, + server->evnt.initiator, PTL_PING_CLIENT, + 0, 0, 0))) + PDEBUG ("LNetPut", rc); atomic_dec (&pkt); @@ -150,13 +152,13 @@ int pingsrv_thread(void *arg) return 0; } -static void pingsrv_packet(ptl_event_t *ev) +static void pingsrv_packet(lnet_event_t *ev) { atomic_inc (&pkt); wake_up_process (server->tsk); } /* pingsrv_head() */ -static void pingsrv_callback(ptl_event_t *ev) +static void pingsrv_callback(lnet_event_t *ev) { if (ev == NULL) { @@ -165,12 +167,13 @@ static void pingsrv_callback(ptl_event_t *ev) } server->evnt = *ev; - CWARN ("received ping from nid "LPX64" " + CWARN ("received ping from nid %s " "(off=%u rlen=%u mlen=%u head=%x seq=%d size=%d)\n", - ev->initiator.nid, ev->offset, ev->rlength, ev->mlength, - __le32_to_cpu(*((int *)(ev->md.start + ev->offset))), - __le32_to_cpu(*((int *)(ev->md.start + ev->offset + sizeof(unsigned)))), - __le32_to_cpu(*((int *)(ev->md.start + ev->offset 
+ 2 * + libcfs_nid2str(ev->initiator.nid), + ev->offset, ev->rlength, ev->mlength, + __le32_to_cpu(*((int *)((char *)ev->md.start + ev->offset))), + __le32_to_cpu(*((int *)((char *)ev->md.start + ev->offset + sizeof(unsigned)))), + __le32_to_cpu(*((int *)((char *)ev->md.start + ev->offset + 2 * sizeof(unsigned))))); packets_valid++; @@ -184,41 +187,38 @@ static struct pingsrv_data *pingsrv_setup(void) { int rc; - server->ni = PTL_INVALID_HANDLE; - - /* Aquire and initialize the proper nal for portals. */ - rc = PtlNIInit(nal, 0, NULL, NULL, &server->ni); - if (!(rc == PTL_OK || rc == PTL_IFACE_DUP)) { - CDEBUG (D_OTHER, "NAL %x not loaded\n", nal); + /* Aquire and initialize the proper nal for portals. */ + rc = LNetNIInit(0); + if (!(rc == 0 || rc == 1)) { + CDEBUG (D_OTHER, "LNetNIInit: error %d\n", rc); return pingsrv_shutdown (4); } /* Based on the initialization aquire our unique portal ID. */ - if ((rc = PtlGetId (server->ni, &server->my_id))) { - PDEBUG ("PtlGetId", rc); + if ((rc = LNetGetId (1, &server->my_id))) { + PDEBUG ("LNetGetId", rc); return pingsrv_shutdown (2); } - server->id_local.nid = PTL_NID_ANY; - server->id_local.pid = PTL_PID_ANY; + server->id_local.nid = LNET_NID_ANY; + server->id_local.pid = LNET_PID_ANY; /* Attach a match entries for header packets */ - if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER, + if ((rc = LNetMEAttach (PTL_PING_SERVER, server->id_local,0, ~0, - PTL_RETAIN, PTL_INS_AFTER, &server->me))) { - PDEBUG ("PtlMEAttach", rc); + LNET_RETAIN, LNET_INS_AFTER, &server->me))) { + PDEBUG ("LNetMEAttach", rc); return pingsrv_shutdown (2); } - if ((rc = PtlEQAlloc (server->ni, 1024, &pingsrv_callback, - &server->eq))) { - PDEBUG ("PtlEQAlloc (callback)", rc); + if ((rc = LNetEQAlloc (1024, &pingsrv_callback, &server->eq))) { + PDEBUG ("LNetEQAlloc (callback)", rc); return pingsrv_shutdown (2); } - PORTAL_ALLOC (server->in_buf, MAXSIZE); + LIBCFS_ALLOC (server->in_buf, MAXSIZE); if(!server->in_buf){ CDEBUG (D_OTHER,"Allocation error\n"); return pingsrv_shutdown(2); @@ -228,29 +228,36 @@ static struct pingsrv_data *pingsrv_setup(void) server->mdin.start = server->in_buf; server->mdin.length = MAXSIZE; server->mdin.threshold = 1; - server->mdin.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; + server->mdin.options = LNET_MD_OP_PUT; server->mdin.user_ptr = NULL; server->mdin.eq_handle = server->eq; memset (server->in_buf, 0, STDSIZE); - - if ((rc = PtlMDAttach (server->me, server->mdin, - PTL_UNLINK, &server->mdin_h))) { - PDEBUG ("PtlMDAttach (bulk)", rc); + + if ((rc = LNetMDAttach (server->me, server->mdin, + LNET_UNLINK, &server->mdin_h))) { + PDEBUG ("LNetMDAttach (bulk)", rc); CDEBUG (D_OTHER, "ping server resources allocated\n"); } - + /* Success! 
*/ - return server; + return server; } /* pingsrv_setup() */ -static int pingsrv_start(void) +static int pingsrv_start(void) { + long pid; + /* Setup our server */ if (!pingsrv_setup()) { CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n"); return -ENOMEM; } - cfs_kernel_thread (pingsrv_thread,NULL,0); + pid = cfs_kernel_thread (pingsrv_thread,NULL,0); + if (pid < 0) { + CERROR("Can't start pingsrv thread: rc = %ld\n", pid); + return (int)pid; + } + return 0; } /* pingsrv_start() */ @@ -258,7 +265,7 @@ static int __init pingsrv_init(void) { ping_head_magic = __cpu_to_le32(PING_HEADER_MAGIC); ping_bulk_magic = __cpu_to_le32(PING_BULK_MAGIC); - PORTAL_ALLOC (server, sizeof(struct pingsrv_data)); + LIBCFS_ALLOC (server, sizeof(struct pingsrv_data)); atomic_set(&pkt, 0); return pingsrv_start (); } /* pingsrv_init() */ @@ -270,17 +277,13 @@ static void /*__exit*/ pingsrv_cleanup(void) running = 0; wake_up_process (server->tsk); while (running != 1) { - set_current_state (TASK_UNINTERRUPTIBLE); - schedule_timeout (cfs_time_seconds(1)); + set_current_state (CFS_TASK_UNINT); + cfs_schedule_timeout (CFS_TASK_UNINT, cfs_time_seconds(1)); } } /* pingsrv_cleanup() */ -MODULE_PARM(nal, "i"); -MODULE_PARM_DESC(nal, "Use the specified NAL " - "(2-ksocknal, 1-kqswnal)"); - MODULE_AUTHOR("Brian Behlendorf (LLNL)"); MODULE_DESCRIPTION("A kernel space ping server for portals testing"); MODULE_LICENSE("GPL"); diff --git a/lnet/tests/ping_srv/Info.plist b/lnet/tests/ping_srv/Info.plist index 21024f0..b08212c 100644 --- a/lnet/tests/ping_srv/Info.plist +++ b/lnet/tests/ping_srv/Info.plist @@ -2,36 +2,39 @@ - CFBundleDevelopmentRegion - English - CFBundleExecutable - ping_srv - CFBundleIconFile - - CFBundleIdentifier - com.clusterfs.lustre.portals.tests.ping_srv - CFBundleInfoDictionaryVersion - 6.0 - CFBundlePackageType - KEXT - CFBundleSignature - ???? - CFBundleVersion - 1.0.0d1 - OSBundleLibraries - - com.apple.kernel.bsd - 1.1 - com.apple.kernel.iokit - 1.0.0b1 - com.apple.kernel.mach - 1.0.0b1 - com.clusterfs.lustre.portals.libcfs - 1.0.0 - com.clusterfs.lustre.portals.portals - 1.0.0 - com.clusterfs.lustre.portals.knals.ksocknal - 1.0.0 - + CFBundleDevelopmentRegion + English + CFBundleExecutable + pingsrv + CFBundleIconFile + + CFBundleIdentifier + com.clusterfs.lustre.pingsrv + CFBundleInfoDictionaryVersion + 6.0 + CFBundlePackageType + KEXT + CFBundleSignature + ???? + CFBundleVersion + 1.0.1 + OSBundleCompatibleVersion + 1.0.0 + OSBundleLibraries + + com.apple.kpi.bsd + 8.0.0b1 + com.apple.kpi.libkern + 8.0.0b1 + com.apple.kpi.mach + 8.0.0b1 + com.apple.kpi.unsupported + 8.0.0b1 + com.clusterfs.lustre.libcfs + 1.0.0 + com.clusterfs.lustre.lnet + 1.0.0 + + diff --git a/lnet/tests/ping_srv/winnt-pingsrv.c b/lnet/tests/ping_srv/winnt-pingsrv.c new file mode 100644 index 0000000..7c9a1a1 --- /dev/null +++ b/lnet/tests/ping_srv/winnt-pingsrv.c @@ -0,0 +1,634 @@ +/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=4:tabstop=4: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Matt Wu + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
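The server deliberately keeps its event callback trivial: LNet may deliver events from a restricted context, so pingsrv_packet only bumps an atomic counter and wakes the service thread, which then re-attaches the bulk MD and sends the reply LNetPut from process context. The deferral pattern, sketched with the same kernel primitives the file uses (handle_one is a hypothetical stand-in for the reply path):

        static atomic_t            pending = ATOMIC_INIT(0);
        static struct task_struct *worker;
        static int                 running = 1;

        /* Callback context may be restricted: record and wake, no more. */
        static void on_event(lnet_event_t *ev)
        {
                atomic_inc(&pending);
                wake_up_process(worker);
        }

        static int worker_fn(void *arg)
        {
                while (running) {
                        /* set state *before* the check, so a wakeup that
                         * lands between check and sleep isn't lost */
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (atomic_read(&pending) == 0) {
                                schedule_timeout(MAX_SCHEDULE_TIMEOUT);
                                continue;
                        }
                        __set_current_state(TASK_RUNNING);
                        handle_one();   /* hypothetical: reply path */
                        atomic_dec(&pending);
                }
                return 0;
        }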
See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LNET
+
+/*
+ * Included Headers
+ */
+
+
+#include
+
+
+/* libcfs module init/exit routines */
+DECLARE_INIT(init_libcfs_module);
+DECLARE_EXIT(exit_libcfs_module);
+
+/* portal module init/exit routines */
+DECLARE_INIT(init_lnet);
+DECLARE_EXIT(fini_lnet);
+
+/* tdinal module init/exit routines */
+DECLARE_INIT(ksocknal_module_init);
+DECLARE_EXIT(ksocknal_module_fini);
+
+/* pingcli module init/exit routines */
+DECLARE_INIT(pingcli_init);
+DECLARE_EXIT(pingcli_cleanup);
+
+
+/* pingsrv module init/exit routines */
+DECLARE_INIT(pingsrv_init);
+DECLARE_EXIT(pingsrv_cleanup);
+
+/*
+ * structure definitions
+ */
+
+
+#define LUSTRE_PING_VERSION  0x00010000   /* ping srv/cli version: 0001.0000 */
+
+#define LUSTRE_PING_DEVICE   L"\\Device\\LNET"      /* device object name */
+#define LUSTRE_PING_SYMLNK   L"\\DosDevices\\LNET"  /* user-visible name for the device */
+
+typedef struct _DEVICE_EXTENSION
+{
+    BOOLEAN    bProcFS;
+
+} DEVICE_EXTENSION, *PDEVICE_EXTENSION;
+
+
+/*
+ * global definitions
+ */
+
+PDEVICE_OBJECT  PingObject = NULL;  /* ping device object */
+PDEVICE_OBJECT  ProcObject = NULL;  /* procfs emulator device */
+
+
+/*
+ * common routines
+ */
+
+
+//
+// complete Irp request ...
+//
+
+NTSTATUS
+UTCompleteIrp(
+    PIRP        Irp,
+    NTSTATUS    Status,
+    ULONG       Info
+    )
+{
+    Irp->IoStatus.Status = Status;
+    Irp->IoStatus.Information = Info;
+    IoCompleteRequest(Irp,IO_NO_INCREMENT);
+
+    return Status;
+}
+
+//
+// Open/Create Device ...
+//
+
+NTSTATUS
+UTCreate(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    KdPrint(("UTCreate: DeviceCreate ...\n"));
+
+    return UTCompleteIrp(Irp,STATUS_SUCCESS,0);
+}
+
+//
+// Close Device ...
+//
+
+NTSTATUS
+UTClose(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp)
+{
+    KdPrint(("UTClose: Device Closed.\n"));
+
+    return UTCompleteIrp(Irp, STATUS_SUCCESS, 0);
+
+    UNREFERENCED_PARAMETER(DeviceObject);
+}
+
+
+
+NTSTATUS
+UTShutdown(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    KdPrint(("UTShutdown: shutting TdiSock ...\n"));
+
+    return UTCompleteIrp(Irp, STATUS_SUCCESS, 0);
+
+    UNREFERENCED_PARAMETER(DeviceObject);
+}
+
+//
+// driver frame Routines ...
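+//
+// The routines below all follow one pattern: decode the current IRP stack
+// location, act on the request, and complete the IRP via UTCompleteIrp().
+// As a minimal user-mode sketch (illustrative only, not part of this
+// driver), the IOCTL_LIBCFS_VERSION path in UTDeviceControl() would be
+// reached through the LUSTRE_PING_SYMLNK name roughly like this:
+//
+//     HANDLE dev = CreateFileW(L"\\\\.\\LNET", GENERIC_READ | GENERIC_WRITE,
+//                              0, NULL, OPEN_EXISTING, 0, NULL);
+//     ULONG version = 0, bytes = 0;
+//     if (dev != INVALID_HANDLE_VALUE) {
+//         /* buffered ioctl: the ULONG the driver stores in the system
+//          * buffer is copied back into 'version' on completion */
+//         DeviceIoControl(dev, IOCTL_LIBCFS_VERSION, NULL, 0,
+//                         &version, sizeof(version), &bytes, NULL);
+//         CloseHandle(dev);
+//     }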
+//
+
+
+NTSTATUS
+UTDeviceControl(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    NTSTATUS            Status = STATUS_INVALID_DEVICE_REQUEST;
+    PIO_STACK_LOCATION  IrpSp;
+
+    ULONG               ControlCode;
+    ULONG               InputLength;
+    ULONG               OutputLength;
+
+    PVOID               lpvInBuffer;
+
+    KdPrint(("UTDeviceControl: Device Ioctl ...\n"));
+
+    Irp->IoStatus.Information = 0;
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+    ControlCode  = IrpSp->Parameters.DeviceIoControl.IoControlCode;
+    InputLength  = IrpSp->Parameters.DeviceIoControl.InputBufferLength;
+    OutputLength = IrpSp->Parameters.DeviceIoControl.OutputBufferLength;
+    lpvInBuffer  = Irp->AssociatedIrp.SystemBuffer;
+
+    ASSERT (IrpSp->MajorFunction == IRP_MJ_DEVICE_CONTROL);
+
+    switch (ControlCode)
+    {
+        case IOCTL_LIBCFS_VERSION:
+
+            *((ULONG *)lpvInBuffer) = (ULONG)(LUSTRE_PING_VERSION);
+            Irp->IoStatus.Information = sizeof(ULONG);
+            Status = STATUS_SUCCESS;
+            break;
+
+        default:
+            break;
+    }
+
+    Irp->IoStatus.Status = Status;
+
+    IoCompleteRequest(Irp, IO_NO_INCREMENT);
+
+    KdPrint(("UTDeviceControl: Device Ioctl returned.\n"));
+
+    return Status;
+}
+
+NTSTATUS
+ProcCreate(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    NTSTATUS            Status;
+    PIO_STACK_LOCATION  IrpSp;
+
+    FILE_FULL_EA_INFORMATION * ea;
+    cfs_file_t *               fp;
+
+    KdPrint(("ProcCreate: Proc device is being opened ...\n"));
+
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+    ea = (PFILE_FULL_EA_INFORMATION) Irp->AssociatedIrp.SystemBuffer;
+
+    if (!ea) {
+        Status = STATUS_INVALID_PARAMETER;
+    } else {
+        fp = lustre_open_file(&ea->EaName[0]);
+        if (!fp) {
+            Status = STATUS_OBJECT_NAME_NOT_FOUND;
+        } else {
+            IrpSp->FileObject->FsContext  = fp;
+            IrpSp->FileObject->FsContext2 = fp->private_data;
+            Status = STATUS_SUCCESS;
+        }
+    }
+
+    return UTCompleteIrp(Irp, Status, 0);
+}
+
+//
+// Close Device ...
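+//
+// Note: ProcCreate() above stashes the cfs_file_t in FileObject->FsContext
+// (and its private data in FsContext2); ProcClose() below is the matching
+// teardown and releases that reference via lustre_close_file().  The pair
+// plays the role of the open/release file operations of the Linux procfs
+// interface this device emulates.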
+//
+
+NTSTATUS
+ProcClose(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp)
+{
+    PIO_STACK_LOCATION  IrpSp;
+
+    cfs_file_t * fp;
+
+    KdPrint(("ProcClose: Proc device object is to be closed.\n"));
+
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+    fp = (cfs_file_t *) IrpSp->FileObject->FsContext;
+
+    ASSERT(fp != NULL);
+    ASSERT(IrpSp->FileObject->FsContext2 == fp->private_data);
+
+    lustre_close_file(fp);
+
+    return UTCompleteIrp(Irp, STATUS_SUCCESS, 0);
+
+    UNREFERENCED_PARAMETER(DeviceObject);
+}
+
+/*
+ * proc frame routines
+ */
+
+NTSTATUS
+ProcDeviceControl(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    NTSTATUS            Status = STATUS_INVALID_DEVICE_REQUEST;
+    PIO_STACK_LOCATION  IrpSp;
+
+    ULONG               ControlCode;
+    ULONG               InputLength;
+    ULONG               OutputLength;
+
+    PVOID               lpvInBuffer;
+
+    KdPrint(("ProcDeviceControl: Proc device ioctl ...\n"));
+
+    Irp->IoStatus.Information = 0;
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+    ControlCode  = IrpSp->Parameters.DeviceIoControl.IoControlCode;
+    InputLength  = IrpSp->Parameters.DeviceIoControl.InputBufferLength;
+    OutputLength = IrpSp->Parameters.DeviceIoControl.OutputBufferLength;
+    lpvInBuffer  = Irp->AssociatedIrp.SystemBuffer;
+
+    ASSERT (IrpSp->MajorFunction == IRP_MJ_DEVICE_CONTROL);
+
+    switch (ControlCode)
+    {
+        case IOCTL_LIBCFS_VERSION:
+
+            *((ULONG *)lpvInBuffer) = (ULONG)(LUSTRE_PING_VERSION);
+            Irp->IoStatus.Information = sizeof(ULONG);
+
+            Status = STATUS_SUCCESS;
+
+            break;
+
+        case IOCTL_LIBCFS_ENTRY:
+        {
+            int rc = 0;
+            cfs_file_t * fp;
+
+            fp = (cfs_file_t *) IrpSp->FileObject->FsContext;
+
+            if (!fp) {
+                rc = -EINVAL;
+            } else {
+                rc = lustre_ioctl_file(fp, (PCFS_PROC_IOCTL) (lpvInBuffer));
+            }
+
+            if (rc == 0) {
+                Irp->IoStatus.Information = InputLength;
+                Status = STATUS_SUCCESS;
+            }
+        }
+    }
+
+    Irp->IoStatus.Status = Status;
+
+    IoCompleteRequest(Irp, IO_NO_INCREMENT);
+
+    KdPrint(("ProcDeviceControl: Proc device ioctl returned with status = %xh.\n", Status));
+
+    return Status;
+}
+
+
+
+NTSTATUS
+ProcReadWrite (PDEVICE_OBJECT DeviceObject, PIRP Irp)
+{
+    PIO_STACK_LOCATION  IrpSp;
+    NTSTATUS            Status;
+
+    cfs_file_t * fp;
+    int          rc = 0;  /* bytes transferred; stays 0 on the early error paths */
+    PCHAR        buf;
+
+    IrpSp = IoGetCurrentIrpStackLocation(Irp);
+    if (Irp->MdlAddress) {
+        buf = MmGetSystemAddressForMdlSafe(
+                        Irp->MdlAddress,
+                        NormalPagePriority);
+    } else {
+        buf = Irp->AssociatedIrp.SystemBuffer;
+    }
+
+    if (buf == NULL) {
+        Status = STATUS_SUCCESS;
+        rc = 0;
+    } else {
+        fp = (cfs_file_t *) IrpSp->FileObject->FsContext;
+
+        if (!fp) {
+            Status = STATUS_INVALID_PARAMETER;
+            goto errorout;
+        }
+
+        if (IrpSp->MajorFunction == IRP_MJ_READ) {
+            rc = lustre_read_file(
+                    fp, IrpSp->Parameters.Read.ByteOffset.LowPart,
+                    IrpSp->Parameters.Read.Length, buf);
+        } else {
+            rc = lustre_write_file(
+                    fp, IrpSp->Parameters.Write.ByteOffset.LowPart,
+                    IrpSp->Parameters.Write.Length, buf);
+        }
+        if (rc < 0) {
+            cfs_enter_debugger();
+            Status = STATUS_UNSUCCESSFUL;
+        } else {
+            Status = STATUS_SUCCESS;
+        }
+    }
+
+
+errorout:
+    return UTCompleteIrp(Irp, Status, rc);
+}
+
+
+//
+// common dispatch routines
+//
+
+NTSTATUS
+UTDispatchRequest(
+    IN PDEVICE_OBJECT   DeviceObject,
+    IN PIRP             Irp
+    )
+{
+    NTSTATUS            Status;
+    PIO_STACK_LOCATION  IrpSp;
+
+    Status = STATUS_INVALID_DEVICE_REQUEST;
+
+    __try {
+
+        IrpSp = IoGetCurrentIrpStackLocation(Irp);
+
+        switch (IrpSp->MajorFunction) {
+
+            case IRP_MJ_CREATE:
+                if (DeviceObject == PingObject) {
+                    Status = UTCreate(DeviceObject, Irp);
+                } else if (DeviceObject == ProcObject) {
+                    Status = ProcCreate(DeviceObject, Irp);
+                }
+                break;
+
+            case IRP_MJ_CLOSE:
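+                /* as with create: fan the request out by device object */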
+ if (DeviceObject == PingObject) { + Status = UTClose(DeviceObject, Irp); + } else if (DeviceObject == ProcObject) { + Status = ProcClose(DeviceObject, Irp); + } + break; + + case IRP_MJ_READ: + case IRP_MJ_WRITE: + if (DeviceObject == ProcObject) { + Status = ProcReadWrite(DeviceObject, Irp); + } + break; + + case IRP_MJ_DEVICE_CONTROL: + if (DeviceObject == PingObject) { + Status = UTDeviceControl(DeviceObject, Irp); + } else if (DeviceObject == ProcObject) { + Status = ProcDeviceControl(DeviceObject, Irp); + } + break; + + case IRP_MJ_SHUTDOWN: + Status = UTShutdown(DeviceObject, Irp); + break; + + default: + + KdPrint(("UTDispatchRequest: Major Function: %xh is not supported.\n", + IrpSp->MajorFunction)); + UTCompleteIrp(Irp, Status, 0); + break; + } + } + + __finally { + } + + return Status; +} + +// +// create a device object and a dosdevice symbol link +// + +PDEVICE_OBJECT +CreateDevice( + IN PDRIVER_OBJECT DriverObject, + IN PWCHAR DeviceName, + IN PWCHAR SymlnkName, + IN BOOLEAN bProcFS + ) +{ + NTSTATUS Status; + + UNICODE_STRING NtDevName; + UNICODE_STRING Win32DevName; + + PDEVICE_EXTENSION DeviceExtension; + PDEVICE_OBJECT DeviceObject; + + /* create the device object with the specified name */ + + RtlInitUnicodeString(&NtDevName, DeviceName); + + Status = IoCreateDevice( + DriverObject, + sizeof(DEVICE_EXTENSION), + &NtDevName, + FILE_DEVICE_UNKNOWN, + 0, + FALSE, + &DeviceObject ); + + if (!NT_SUCCESS(Status)) { + + cfs_enter_debugger(); + return NULL; + } + + /* create the symlink to make the device visible to user */ + + RtlInitUnicodeString(&Win32DevName, SymlnkName); + + Status = IoCreateSymbolicLink(&Win32DevName, &NtDevName); + + if (!NT_SUCCESS(Status)) { + + IoDeleteDevice(DeviceObject); + return NULL; + } + + DeviceExtension = (PDEVICE_EXTENSION)DeviceObject->DeviceObjectExtension; + DeviceExtension->bProcFS = bProcFS; + + DeviceObject->Flags |= DO_BUFFERED_IO; + DeviceObject->Flags &= ~DO_DEVICE_INITIALIZING; + + return DeviceObject; +} + + +// +// DriverEntry +// + +NTSTATUS DriverEntry( + IN PDRIVER_OBJECT DriverObject, + IN PUNICODE_STRING RegistryPath + ) +{ + KdPrint(("Lustre ping test: Build Time: " __DATE__ " " __TIME__ "\n")); + KdPrint(("Lustre ping test: DriverEntry ... 
\n"));
+
+    /* initialize libcfs module */
+    if (module_init_libcfs_module() != 0) {
+        KdPrint(("ping: error initializing module: libcfs ...\n"));
+        goto errorout;
+    }
+
+    /* initialize lnet module */
+    if (module_init_lnet() != 0) {
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initializing module: lnet ...\n"));
+        goto errorout;
+    }
+
+    /* initialize tdinal module */
+    if (module_ksocknal_module_init() != 0) {
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initializing module: tdilnd ...\n"));
+        goto errorout;
+    }
+
+#if defined(LUSTRE_PING_CLI)
+    /* initialize pingcli module */
+    if (module_pingcli_init() != 0) {
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initializing module: pingcli ...\n"));
+        goto errorout;
+    }
+#endif
+
+#if defined(LUSTRE_PING_SRV)
+    /* initialize pingsrv module */
+    if (module_pingsrv_init() != 0) {
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        KdPrint(("ping: error initializing module: pingsrv ...\n"));
+        goto errorout;
+    }
+#endif
+
+    /* create the ping device object */
+    PingObject = CreateDevice(
+                        DriverObject,
+                        LUSTRE_PING_DEVICE,
+                        LUSTRE_PING_SYMLNK,
+                        FALSE );
+    if (!PingObject) {
+#if defined(LUSTRE_PING_CLI)
+        module_pingcli_cleanup();
+#endif
+#if defined(LUSTRE_PING_SRV)
+        module_pingsrv_cleanup();
+#endif
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+
+        return STATUS_INSUFFICIENT_RESOURCES;
+    }
+
+    /* create the libcfs proc fs emulator device object */
+    ProcObject = CreateDevice(
+                        DriverObject,
+                        LUSTRE_PROC_DEVICE,
+                        LUSTRE_PROC_SYMLNK,
+                        TRUE );
+    if (!ProcObject) {
+
+        IoDeleteDevice(PingObject);
+#if defined(LUSTRE_PING_CLI)
+        module_pingcli_cleanup();
+#endif
+#if defined(LUSTRE_PING_SRV)
+        module_pingsrv_cleanup();
+#endif
+        module_ksocknal_module_fini();
+        module_fini_lnet();
+        module_exit_libcfs_module();
+        return STATUS_INSUFFICIENT_RESOURCES;
+    }
+
+    /* initialize the driver callback routines */
+
+    DriverObject->MajorFunction[IRP_MJ_CREATE]         = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_CLOSE]          = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_READ]           = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_WRITE]          = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_SHUTDOWN]       = UTDispatchRequest;
+    DriverObject->MajorFunction[IRP_MJ_DEVICE_CONTROL] = UTDispatchRequest;
+
+    return STATUS_SUCCESS;
+
+errorout:
+
+    cfs_enter_debugger();
+
+    return STATUS_UNSUCCESSFUL;
+}
diff --git a/lnet/tests/sping_cli.c b/lnet/tests/sping_cli.c
deleted file mode 100644
index 71a2a98..0000000
--- a/lnet/tests/sping_cli.c
+++ /dev/null
@@ -1,279 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL)
- * Author: Brian Behlendorf
- *         Kedar Sovani (kedar@calsoftinc.com)
- *         Amey Inamdar (amey@calsoftinc.com)
- *
- * This file is part of Portals, http://www.sf.net/projects/lustre/
- *
- * Portals is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * Portals is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - */ - -/* This is a striped down version of pinger. It follows a single - * request-response protocol. Doesn't do Bulk data pinging. Also doesn't - * send multiple packets in a single ioctl. - */ - - -#define DEBUG_SUBSYSTEM S_PINGER - -#include -#include -#include -#include -#include -#include -#include "ping.h" -/* int portal_debug = D_PING_CLI; */ - - -#define STDSIZE (sizeof(int) + sizeof(int) + 4) /* The data is 4 bytes - assumed */ - -/* This should be enclosed in a structure */ - -static struct pingcli_data *client = NULL; - -static int count = 0; - -static void -pingcli_shutdown(ptl_handle_ni_t nih, int err) -{ - int rc; - - /* Yes, we are intentionally allowing us to fall through each - * case in to the next. This allows us to pass an error - * code to just clean up the right stuff. - */ - switch (err) { - case 1: - /* Unlink any memory descriptors we may have used */ - if ((rc = PtlMDUnlink (client->md_out_head_h))) - PDEBUG ("PtlMDUnlink", rc); - case 2: - /* Free the event queue */ - if ((rc = PtlEQFree (client->eq))) - PDEBUG ("PtlEQFree", rc); - - if ((rc = PtlMEUnlink (client->me))) - PDEBUG ("PtlMEUnlink", rc); - case 3: - PtlNIFini (nih); - - case 4: - /* Free our buffers */ - if (client->outbuf != NULL) - PORTAL_FREE (client->outbuf, STDSIZE); - - if (client->inbuf != NULL) - PORTAL_FREE (client->inbuf, STDSIZE); - - - if (client != NULL) - PORTAL_FREE (client, - sizeof(struct pingcli_data)); - } - - - CDEBUG (D_OTHER, "ping client released resources\n"); -} /* pingcli_shutdown() */ - -static void pingcli_callback(ptl_event_t *ev) -{ - wake_up_process (client->tsk); -} - - -static struct pingcli_data * -pingcli_start(struct portal_ioctl_data *args) -{ - ptl_handle_ni_t nih = PTL_INVALID_HANDLE; - unsigned ping_head_magic = PING_HEADER_MAGIC; - char str[PTL_NALFMT_SIZE]; - int rc; - - client->tsk = current; - client->args = args; - - CDEBUG (D_OTHER, "pingcli_setup args: nid "LPX64" (%s), \ - nal %x, size %u, count: %u, timeout: %u\n", - args->ioc_nid, - portals_nid2str(args->ioc_nid, args->ioc_nal, str), - args->ioc_nal, args->ioc_size, - args->ioc_count, args->ioc_timeout); - - - PORTAL_ALLOC (client->outbuf, STDSIZE) ; - if (client->outbuf == NULL) - { - CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); - pingcli_shutdown (nih, 4); - return (NULL); - } - - PORTAL_ALLOC (client->inbuf, STDSIZE); - - if (client->inbuf == NULL) - { - CERROR ("Unable to allocate out_buf ("LPSZ" bytes)\n", STDSIZE); - pingcli_shutdown (nih, 4); - return (NULL); - } - - /* Aquire and initialize the proper nal for portals. */ - rc = PtlNIInit(args->ioc_nal, 0, NULL, NULL, &nih); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) - { - CERROR ("NAL %x not loaded.\n", args->ioc_nal); - pingcli_shutdown (nih, 4); - return (NULL); - } - - /* Based on the initialization aquire our unique portal ID. 
*/ - if ((rc = PtlGetId (nih, &client->myid))) - { - CERROR ("PtlGetId error %d\n", rc); - pingcli_shutdown (nih, 2); - return (NULL); - } - - /* Setup the local match entries */ - client->id_local.nid = PTL_NID_ANY; - client->id_local.pid = PTL_PID_ANY; - - /* Setup the remote match entries */ - client->id_remote.nid = args->ioc_nid; - client->id_remote.pid = 0; - - if ((rc = PtlMEAttach (nih, PTL_PING_CLIENT, - client->id_local, 0, ~0, PTL_RETAIN, - PTL_INS_AFTER, &client->me))) - { - CERROR ("PtlMEAttach error %d\n", rc); - pingcli_shutdown (nih, 2); - return (NULL); - } - - /* Allocate the event queue for this network interface */ - if ((rc = PtlEQAlloc (nih, 64, pingcli_callback, &client->eq))) - { - CERROR ("PtlEQAlloc error %d\n", rc); - pingcli_shutdown (nih, 2); - return (NULL); - } - - - client->md_in_head.start = client->inbuf; - client->md_in_head.length = STDSIZE; - client->md_in_head.threshold = 1; - client->md_in_head.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; - client->md_in_head.user_ptr = NULL; - client->md_in_head.eq_handle = client->eq; - memset (client->inbuf, 0, STDSIZE); - - /* Attach the incoming buffer */ - if ((rc = PtlMDAttach (client->me, client->md_in_head, - PTL_UNLINK, &client->md_in_head_h))) { - CERROR ("PtlMDAttach error %d\n", rc); - pingcli_shutdown (nih, 1); - return (NULL); - } - - /* Setup the outgoing ping header */ - client->md_out_head.start = client->outbuf; - client->md_out_head.length = STDSIZE; - client->md_out_head.threshold = 1; - client->md_out_head.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; - client->md_out_head.user_ptr = NULL; - client->md_out_head.eq_handle = PTL_EQ_NONE; - - memcpy (client->outbuf, &ping_head_magic, sizeof(ping_head_magic)); - - /* Bind the outgoing ping header */ - if ((rc=PtlMDBind (nih, client->md_out_head, - PTL_UNLINK, &client->md_out_head_h))) { - CERROR ("PtlMDBind error %d\n", rc); - pingcli_shutdown (nih, 1); - return (NULL); - } - /* Put the ping packet */ - if((rc = PtlPut (client->md_out_head_h, PTL_NOACK_REQ, - client->id_remote, PTL_PING_SERVER, 0, 0, 0, 0))) { - PDEBUG ("PtlPut (header)", rc); - pingcli_shutdown (nih, 1); - return NULL; - } - - count = 0; - set_current_state (TASK_INTERRUPTIBLE); - rc = schedule_timeout (20 * args->ioc_timeout); - if (rc == 0) { - CERROR ("Time out on the server\n"); - pingcli_shutdown (nih, 2); - return NULL; - } else { - CWARN("Received respose from the server \n"); - } - - pingcli_shutdown (nih, 2); - - /* Success! 
*/ - return NULL; -} /* pingcli_setup() */ - - - -/* called by the portals_ioctl for ping requests */ -int kping_client(struct portal_ioctl_data *args) -{ - - PORTAL_ALLOC (client, sizeof(struct pingcli_data)); - memset (client, 0, sizeof(struct pingcli_data)); - if (client == NULL) - { - CERROR ("Unable to allocate client structure\n"); - return (0); - } - pingcli_start (args); - - return 0; -} /* kping_client() */ - - -static int __init pingcli_init(void) -{ - PORTAL_SYMBOL_REGISTER(kping_client); - return 0; -} /* pingcli_init() */ - - -static void /*__exit*/ pingcli_cleanup(void) -{ - PORTAL_SYMBOL_UNREGISTER (kping_client); -} /* pingcli_cleanup() */ - - -MODULE_AUTHOR("Brian Behlendorf (LLNL)"); -MODULE_DESCRIPTION("A simple kernel space ping client for portals testing"); -MODULE_LICENSE("GPL"); - -module_init(pingcli_init); -module_exit(pingcli_cleanup); - -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -EXPORT_SYMBOL (kping_client); -#endif diff --git a/lnet/tests/sping_srv.c b/lnet/tests/sping_srv.c deleted file mode 100644 index 30f158c..0000000 --- a/lnet/tests/sping_srv.c +++ /dev/null @@ -1,294 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002, Lawrence Livermore National Labs (LLNL) - * Author: Brian Behlendorf - * Amey Inamdar - * Kedar Sovani - * - * - * This file is part of Portals, http://www.sf.net/projects/lustre/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* This is a striped down version of pinger. It follows a single - * request-response protocol. Doesn't do Bulk data pinging. Also doesn't - * send multiple packets in a single ioctl. - */ - -#define DEBUG_SUBSYSTEM S_PINGER - -#include -#include -#include "ping.h" - -#include -#include -#include -#include -#include -#include -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -#include -#else -#include -#endif -#include -#include - -#include -#include - -#define STDSIZE (sizeof(int) + sizeof(int) + 4) - -static int nal = PTL_IFACE_DEFAULT; // Your NAL, -static unsigned long packets_valid = 0; // Valid packets -static int running = 1; -atomic_t pkt; - -static struct pingsrv_data *server=NULL; // Our ping server - -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -#endif - -static void *pingsrv_shutdown(int err) -{ - int rc; - - /* Yes, we are intentionally allowing us to fall through each - * case in to the next. This allows us to pass an error - * code to just clean up the right stuff. 
- */ - switch (err) { - case 1: - /* Unlink any memory descriptors we may have used */ - if ((rc = PtlMDUnlink (server->mdin_h))) - PDEBUG ("PtlMDUnlink (out head buffer)", rc); - case 2: - /* Free the event queue */ - if ((rc = PtlEQFree (server->eq))) - PDEBUG ("PtlEQFree", rc); - - /* Unlink the client portal from the ME list */ - if ((rc = PtlMEUnlink (server->me))) - PDEBUG ("PtlMEUnlink", rc); - - case 3: - PtlNIFini(server->ni); - - case 4: - - if (server->in_buf != NULL) - PORTAL_FREE (server->in_buf, STDSIZE); - - if (server != NULL) - PORTAL_FREE (server, - sizeof (struct pingsrv_data)); - - } - - CDEBUG (D_OTHER, "ping sever resources released\n"); - return NULL; -} /* pingsrv_shutdown() */ - - -int pingsrv_thread(void *arg) -{ - int rc; - - kportal_daemonize ("pingsrv"); - server->tsk = current; - - while (running) { - set_current_state (TASK_INTERRUPTIBLE); - if (atomic_read (&pkt) == 0) { - schedule_timeout (MAX_SCHEDULE_TIMEOUT); - continue; - } - - server->mdout.start = server->in_buf; - server->mdout.length = STDSIZE; - server->mdout.threshold = 1; - server->mdout.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; - server->mdout.user_ptr = NULL; - server->mdout.eq_handle = PTL_EQ_NONE; - - /* Bind the outgoing buffer */ - if ((rc = PtlMDBind (server->ni, server->mdout, - PTL_UNLINK, &server->mdout_h))) { - PDEBUG ("PtlMDBind", rc); - pingsrv_shutdown (1); - return 1; - } - - - server->mdin.start = server->in_buf; - server->mdin.length = STDSIZE; - server->mdin.threshold = 1; - server->mdin.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT; - server->mdin.user_ptr = NULL; - server->mdin.eq_handle = server->eq; - - if ((rc = PtlMDAttach (server->me, server->mdin, - PTL_UNLINK, &server->mdin_h))) { - PDEBUG ("PtlMDAttach (bulk)", rc); - CDEBUG (D_OTHER, "ping server resources allocated\n"); - } - - if ((rc = PtlPut (server->mdout_h, PTL_NOACK_REQ, - server->evnt.initiator, PTL_PING_CLIENT, 0, 0, 0, 0))) - PDEBUG ("PtlPut", rc); - - atomic_dec (&pkt); - - } - pingsrv_shutdown (1); - running = 1; - return 0; -} - -static void pingsrv_packet(ptl_event_t *ev) -{ - atomic_inc (&pkt); - wake_up_process (server->tsk); -} /* pingsrv_head() */ - -static void pingsrv_callback(ptl_event_t *ev) -{ - - if (ev == NULL) { - CERROR ("null in callback, ev=%p\n", ev); - return; - } - server->evnt = *ev; - - CWARN("Lustre: received ping from nid "LPX64" " - "(off=%u rlen=%u mlen=%u head=%x)\n", - ev->initiator.nid, ev->offset, ev->rlength, ev->mlength, - *((int *)(ev->md.start + ev->offset))); - - packets_valid++; - - pingsrv_packet(ev); - -} /* pingsrv_callback() */ - - -static struct pingsrv_data *pingsrv_setup(void) -{ - int rc; - - /* Aquire and initialize the proper nal for portals. */ - server->ni = PTL_INVALID_HANDLE; - - rc = PtlNIInit(nal, 0, NULL, NULL, &server->ni); - if (rc != PTL_OK && rc != PTL_IFACE_DUP) { - CDEBUG (D_OTHER, "Nal %x not loaded.\n", nal); - return pingsrv_shutdown (4); - } - - /* Based on the initialization aquire our unique portal ID. 
 */
-        if ((rc = PtlGetId (server->ni, &server->my_id))) {
-                PDEBUG ("PtlGetId", rc);
-                return pingsrv_shutdown (2);
-        }
-
-        server->id_local.nid = PTL_NID_ANY;
-        server->id_local.pid = PTL_PID_ANY;
-
-        /* Attach a match entries for header packets */
-        if ((rc = PtlMEAttach (server->ni, PTL_PING_SERVER,
-            server->id_local,0, ~0,
-            PTL_RETAIN, PTL_INS_AFTER, &server->me))) {
-                PDEBUG ("PtlMEAttach", rc);
-                return pingsrv_shutdown (2);
-        }
-
-
-        if ((rc = PtlEQAlloc (server->ni, 64, pingsrv_callback,
-            &server->eq))) {
-                PDEBUG ("PtlEQAlloc (callback)", rc);
-                return pingsrv_shutdown (2);
-        }
-
-        PORTAL_ALLOC (server->in_buf, STDSIZE);
-        if(!server->in_buf){
-                CDEBUG (D_OTHER,"Allocation error\n");
-                return pingsrv_shutdown(2);
-        }
-
-        /* Setup the incoming buffer */
-        server->mdin.start = server->in_buf;
-        server->mdin.length = STDSIZE;
-        server->mdin.threshold = 1;
-        server->mdin.options = PTL_MD_EVENT_START_DISABLE | PTL_MD_OP_PUT;
-        server->mdin.user_ptr = NULL;
-        server->mdin.eq_handle = server->eq;
-        memset (server->in_buf, 0, STDSIZE);
-
-        if ((rc = PtlMDAttach (server->me, server->mdin,
-            PTL_UNLINK, &server->mdin_h))) {
-                PDEBUG ("PtlMDAttach (bulk)", rc);
-                CDEBUG (D_OTHER, "ping server resources allocated\n");
-        }
-
-        /* Success! */
-        return server;
-} /* pingsrv_setup() */
-
-static int pingsrv_start(void)
-{
-        /* Setup our server */
-        if (!pingsrv_setup()) {
-                CDEBUG (D_OTHER, "pingsrv_setup() failed, server stopped\n");
-                return -ENOMEM;
-        }
-        kernel_thread (pingsrv_thread,NULL,0);
-        return 0;
-} /* pingsrv_start() */
-
-
-
-static int __init pingsrv_init(void)
-{
-        PORTAL_ALLOC (server, sizeof(struct pingsrv_data));
-        return pingsrv_start ();
-} /* pingsrv_init() */
-
-
-static void /*__exit*/ pingsrv_cleanup(void)
-{
-        remove_proc_entry ("net/pingsrv", NULL);
-
-        running = 0;
-        wake_up_process (server->tsk);
-        while (running != 1) {
-                set_current_state (TASK_UNINTERRUPTIBLE);
-                schedule_timeout (HZ);
-        }
-
-} /* pingsrv_cleanup() */
-
-
-MODULE_PARM(nal, "i");
-MODULE_PARM_DESC(nal, "Use the specified NAL "
-                "(2-ksocknal, 1-kqswnal)");
-
-MODULE_AUTHOR("Brian Behlendorf (LLNL)");
-MODULE_DESCRIPTION("A kernel space ping server for portals testing");
-MODULE_LICENSE("GPL");
-
-module_init(pingsrv_init);
-module_exit(pingsrv_cleanup);
diff --git a/lnet/tests/startclient.sh b/lnet/tests/startclient.sh
index be60509..2a30a01 100644
--- a/lnet/tests/startclient.sh
+++ b/lnet/tests/startclient.sh
@@ -1,37 +1,10 @@
 #!/bin/sh
-SIMPLE=${SIMPLE:-0}
-
-if [ $SIMPLE -eq 0 ]; then
-    PING=pingcli.o
-else
-    PING=spingcli.o
-fi
+case `uname -r` in
+    2.6.*) ext=.ko;;
+    2.4.*) ext=.o;;
+    *) echo unknown kernel version; exit 1;;
+esac
 
-case "$1" in
-    tcp)
-        /sbin/insmod ../oslib/portals.o
-        /sbin/insmod ../socknal/ksocknal.o
-        /sbin/insmod ./$PING
-        echo ksocknal > /tmp/nal
-        ;;
-
-    elan)
-        /sbin/insmod ../oslib/portals.o
-        /sbin/insmod ../qswnal/kqswnal.o
-        /sbin/insmod ./$PING
-        echo kqswnal > /tmp/nal
-        ;;
+insmod pingcli$ext
 
-    gm)
-        /sbin/insmod portals
-        /sbin/insmod kgmnal
-        /sbin/insmod ./$PING
-        echo kgmnal > /tmp/nal
-        ;;
-
-    *)
-        echo "Usage : ${0} < tcp | elan | gm>"
-        exit 1;
-esac
-exit 0;
diff --git a/lnet/tests/startserver.sh b/lnet/tests/startserver.sh
index 9b5ccf6..355a8ae 100644
--- a/lnet/tests/startserver.sh
+++ b/lnet/tests/startserver.sh
@@ -1,38 +1,9 @@
 #!/bin/sh
-SIMPLE=${SIMPLE:-0}
-
-if [ $SIMPLE -eq 0 ]; then
-    PING=pingsrv.o
-else
-    PING=spingsrv.o
-fi
-
-case "$1" in
-    tcp)
-        /sbin/insmod ../oslib/portals.o
-        /sbin/insmod ../socknal/ksocknal.o
-        /sbin/insmod ./$PING nal=2
-        echo ksocknal > /tmp/nal
-        ;;
-
-    elan)
-        /sbin/insmod ../oslib/portals.o
-        /sbin/insmod ../qswnal/kqswnal.o
-        /sbin/insmod ./$PING nal=4
-        echo kqswnal > /tmp/nal
-        ;;
-
-    gm)
-        /sbin/insmod portals
-        /sbin/insmod kgmnal
-        /sbin/insmod ./$PING nal=3
-        echo kgmnal > /tmp/nal
-        ;;
-
-    *)
-        echo "Usage : ${0} < tcp | elan | gm>"
-        exit 1;
+case `uname -r` in
+    2.6.*) ext=.ko;;
+    2.4.*) ext=.o;;
+    *) echo unknown kernel version; exit 1;;
 esac
-../utils/acceptor 9999&
-exit 0;
+
+insmod pingsrv$ext
diff --git a/lnet/tests/stopclient.sh b/lnet/tests/stopclient.sh
index f7e3aa1..276d374 100644
--- a/lnet/tests/stopclient.sh
+++ b/lnet/tests/stopclient.sh
@@ -1,14 +1,3 @@
 #!/bin/sh
-SIMPLE=${SIMPLE:-1}
-
-if [ $SIMPLE -eq 0 ]; then
-    PING=spingcli
-else
-    PING=pingcli
-fi
-
-rmmod $PING
-NAL=`cat /tmp/nal`;
-rmmod $NAL
-rmmod portals
+rmmod pingcli
diff --git a/lnet/tests/stopserver.sh b/lnet/tests/stopserver.sh
index 3e81831..829afc6 100644
--- a/lnet/tests/stopserver.sh
+++ b/lnet/tests/stopserver.sh
@@ -1,16 +1,3 @@
 #!/bin/sh
-SIMPLE=${SIMPLE:-1}
-
-if [ $SIMPLE -eq 0 ]; then
-    PING=spingsrv
-else
-    PING=pingsrv
-fi
-
-rmmod $PING
-NAL=`cat /tmp/nal`;
-rmmod $NAL
-killall -9 acceptor
-rm -f /var/run/acceptor-9999.pid
-rmmod portals
+rmmod pingsrv
diff --git a/lnet/tests/ut.README b/lnet/tests/ut.README
new file mode 100644
index 0000000..ef70b2f
--- /dev/null
+++ b/lnet/tests/ut.README
@@ -0,0 +1,43 @@
+The utcli (unit test client) and utsrv (unit test server) are very simple
+unit test tools for sending and receiving single gets/puts of a specific
+size, using the LNET API set.
+
+Test Setup
+uml1 ip=192.168.2.1
+uml2 ip=192.168.2.2
+
+--------------------------------------------------------------------------------
+Example Test #1 - small get operation
+
+1) Set up the server for listening
+uml2 $ insmod utsvr.ko
+
+2) Do the get operation.  The NID must be specified, but all other
+parameters take their defaults, which causes a 300-byte get op
+uml1 $ insmod utcli.ko nid=192.168.2.2@tcp
+
+3) Unload utsvr, because currently it only supports a single operation;
+buffers are not reposted after they are consumed
+*** FIX THIS LIMITATION ***
+uml2 $ rmmod utsvr
+
+--------------------------------------------------------------------------------
+Example Test #2 - small put operation
+(The setup and cleanup of the server are left out, because they are the
+same as above)
+
+1) The addition of the "put=1" parameter causes a put rather than a get.  The
+default size of 300 is still used.
+uml1 $ insmod utcli.ko nid=192.168.2.2@tcp put=1
+
+--------------------------------------------------------------------------------
+Example Test #3 - large get operation
+
+1) Set up the server for listening.  The size must be specified on the server,
+or else the default of 300 bytes will be used.
+uml2 $ insmod utsvr.ko pkt_size=5000
+
+2) Do the large get operation with pkt_size=5000.  put=0 is a get operation;
+it is equivalent to just omitting that parameter.
+uml1 $ insmod utcli.ko nid=192.168.2.2@tcp put=0 pkt_size=5000
+
diff --git a/lnet/tests/ut.h b/lnet/tests/ut.h
new file mode 100644
index 0000000..96ccb34
--- /dev/null
+++ b/lnet/tests/ut.h
@@ -0,0 +1,45 @@
+#define DEBUG_SUBSYSTEM S_PINGER
+
+#include
+#include
+
+#define UT_PORTAL 42
+
+#define PJK_UT_MSG(fmt...) \
+        do{printk("<1>" UT_MSG_MODULE_NAME ":%-30s:",__FUNCTION__);printk(fmt);}while(0)
+
+#define DO_TYPE(x) case x: return #x;
+
+const char *get_ev_type_string(int evtype)
+{
+        switch(evtype)
+        {
+                DO_TYPE(LNET_EVENT_GET);
+                DO_TYPE(LNET_EVENT_PUT);
+                DO_TYPE(LNET_EVENT_REPLY);
+                DO_TYPE(LNET_EVENT_ACK);
+                DO_TYPE(LNET_EVENT_SEND);
+                DO_TYPE(LNET_EVENT_UNLINK);
+                default:
+                        return "";
+        }
+}
+
+static volatile int seen = 0;
+static volatile int seen_unlink = 0;
+
+static inline void handler(lnet_event_t *ev)
+{
+        PJK_UT_MSG("-------- EVENT START ------------\n");
+        PJK_UT_MSG("type=%d %s\n",ev->type,get_ev_type_string(ev->type));
+        PJK_UT_MSG("portal=%d\n",ev->pt_index);
+        PJK_UT_MSG("matchbits="LPX64"\n",ev->match_bits);
+        PJK_UT_MSG("request length=%d\n",ev->rlength);
+        PJK_UT_MSG("manipulated length=%d\n",ev->mlength);
+        PJK_UT_MSG("offset=%d\n",ev->offset);
+        PJK_UT_MSG("status=%d\n",ev->status);
+        PJK_UT_MSG("unlinked=%d\n",ev->unlinked);
+        PJK_UT_MSG("md.user_ptr=%p\n",ev->md.user_ptr);
+        PJK_UT_MSG("-------- EVENT END --------------\n");
+        ++seen;
+        if(ev->unlinked)++seen_unlink;
+}
diff --git a/lnet/tests/ut_cli.c b/lnet/tests/ut_cli.c
new file mode 100644
index 0000000..3a6e255
--- /dev/null
+++ b/lnet/tests/ut_cli.c
@@ -0,0 +1,211 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+
+ #define UT_MSG_MODULE_NAME "utcli "
+ #include "ut.h"
+
+int pkt_size = 300;
+module_param(pkt_size,int,S_IRUGO);
+int get=0;
+module_param(get,int,S_IRUGO);
+int put=0;
+module_param(put,int,S_IRUGO);
+int auto_unlink=1;
+module_param(auto_unlink,int,S_IRUGO);
+char* nid=0;
+module_param(nid,charp,S_IRUGO);
+
+static int __init utcli_init(void)
+{
+        lnet_handle_md_t mdh;
+        lnet_process_id_t target;
+        lnet_process_id_t mypid;
+        lnet_handle_eq_t eqh;
+        lnet_md_t md;
+        int rc,i;
+        char* buffer = 0;
+        /*
+         * Put and get really control the same thing
+         */
+        if(put)get=0;
+        /* Default to get */
+        if(!put && !get)get=1;
+
+        PJK_UT_MSG("utcli_init %s\n",get==0?"PUT":"GET");
+        PJK_UT_MSG("pkt_size=%d\n",pkt_size);
+        PJK_UT_MSG("auto_unlink=%d\n",auto_unlink);
+        PJK_UT_MSG("nid=%s\n",nid);
+        if(nid == 0)
+        {
+                CERROR("NID Must be specified\n");
+                return -EINVAL;
+        }
+
+        PJK_UT_MSG("LIBCFS_ALLOC\n");
+        LIBCFS_ALLOC (buffer, pkt_size);
+        if (buffer == NULL)
+        {
+                CERROR ("Unable to allocate out_buf (%d bytes)\n", pkt_size);
+                return -ENOMEM;
+        }
+
+        PJK_UT_MSG("LNetNiInit()\n");
+        rc = LNetNIInit(0);
+        if (rc < 0)
+        {
+                CERROR ("LNetNIInit: error %d\n", rc);
+                goto exit0;
+        }
+
+
+        LNetGetId(0,&mypid);
+        PJK_UT_MSG("my.nid="LPX64"\n",mypid.nid);
+        PJK_UT_MSG("my.pid=0x%x\n",mypid.pid);
+
+
+        PJK_UT_MSG("LNetEQAlloc\n");
+        rc = LNetEQAlloc(
+                64,      /* max number of events; why 64? */
+                handler, /* handler callback */
+                &eqh);   /* output handle */
+        if(rc != 0) {
+                CERROR("LNetEQAlloc failed %d\n",rc);
+                goto exit1;
+        }
+
+        md.start = buffer;
+        md.length = pkt_size;
+        md.threshold = auto_unlink ? (get ?
2 : 1) : 15; + md.max_size = 0; + md.options = 0; + if(get){ + md.options |= LNET_MD_OP_GET; + }else{ + md.options |= LNET_MD_OP_PUT; + md.options |= LNET_MD_ACK_DISABLE; + } + md.user_ptr = 0; + md.eq_handle = eqh; + + PJK_UT_MSG("LNetMDBind()\n"); + if ((rc=LNetMDBind ( + md, + LNET_UNLINK, + &mdh))) /* out handle */ + { + CERROR ("LNetMDBind error %d\n", rc); + goto exit4; + } + + target.pid = 0; + target.nid = libcfs_str2nid(nid); + + PJK_UT_MSG("target.nid="LPX64"\n",target.nid); + + for(i=0;i<1;i++) + { + if(get){ + PJK_UT_MSG("LNetGet()\n"); + if((rc = LNetGet ( + LNET_ID_ANY, + mdh, + target, /* peer "address" */ + UT_PORTAL, /* portal */ + i, /* match bits */ + 0))) /* header data */ + { + CERROR("LNetGet %d error %d\n",i, rc); + goto exit5; + } + }else{ + + PJK_UT_MSG("LNetPut()\n"); + if((rc = LNetPut ( + LNET_ID_ANY, + mdh, + LNET_ACK_REQ, /* we want ack */ + target, /* peer "address" */ + UT_PORTAL, /* portal */ + i, /* match bits */ + 0, /* offset */ + 0))) /* header data */ + { + CERROR("LNetPut %d error %d\n",i, rc); + goto exit5; + } + } + } + + + PJK_UT_MSG("------------Waiting for SEND_END()------------\n"); + i=0; + while(i++ < 10 && seen == 0) + cfs_pause(cfs_time_seconds(1)); + if(seen == 0) + PJK_UT_MSG("------------------TIMEDOUT--------------------\n"); + else{ + int good; + if(get){ + PJK_UT_MSG("------------Waiting for REPLY()------------\n"); + i=0; + while(i++ < 10 && seen == 1) + cfs_pause(cfs_time_seconds(1)); + good = (seen != 1); + }else{ + good = 1; + } + + if(good) + PJK_UT_MSG("------------------COMPLETE--------------------\n"); + else + PJK_UT_MSG("------------------TIMEDOUT--------------------\n"); + } + + + + /* + PJK_UT_MSG("LNetEQWait()\n"); + rc = LNetEQWait(eqh,&ev); + if(rc != 0) + goto exit5; + */ + +exit5: + PJK_UT_MSG("LNetMDUnlink()\n"); + LNetMDUnlink(mdh); + + if(!seen_unlink){ + PJK_UT_MSG("------------Waiting for UNLINK ------------\n"); + i=0; + while(i++ < 120 && seen_unlink == 0) + cfs_pause(cfs_time_seconds(1)); + } + + cfs_pause(cfs_time_seconds(1)); +exit4: + PJK_UT_MSG("LNetEQFree()\n"); + LNetEQFree(eqh); +exit1: + PJK_UT_MSG("LNetNiFini()\n"); + LNetNIFini(); +exit0: + if(buffer) + LIBCFS_FREE(buffer,pkt_size); + + return -1; +} /* utcli_init() */ + + +static void /*__exit*/ utcli_cleanup(void) +{ + PJK_UT_MSG(">>>\n"); + PJK_UT_MSG("<<<\n"); +} /* utcli_cleanup() */ + + +MODULE_AUTHOR("PJ Kirner (CFS)"); +MODULE_DESCRIPTION("A simple LNET Unit Test module"); +MODULE_LICENSE("GPL"); + +cfs_module(ut_cli, "1.0.0", utcli_init, utcli_cleanup); diff --git a/lnet/tests/ut_srv.c b/lnet/tests/ut_srv.c new file mode 100644 index 0000000..3ffbac6 --- /dev/null +++ b/lnet/tests/ut_srv.c @@ -0,0 +1,144 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ + + +#define UT_MSG_MODULE_NAME "utsrv " +#include "ut.h" + + +int pkt_size = 300; +module_param(pkt_size,int,S_IRUGO); +int auto_unlink=1; +module_param(auto_unlink,int,S_IRUGO); + +char *buffer = 0; +lnet_handle_eq_t eqh; +lnet_handle_me_t meh; +lnet_handle_md_t mdh; + +static int __init utsrv_init(void) +{ + int rc; + lnet_process_id_t anypid; + lnet_process_id_t mypid; + lnet_md_t md; + + PJK_UT_MSG(">>>\n"); + PJK_UT_MSG("pkt_size=%d\n",pkt_size); + PJK_UT_MSG("auto_unlink=%d\n",auto_unlink); + + PJK_UT_MSG("LIBCFS_ALLOC\n"); + LIBCFS_ALLOC (buffer, pkt_size); + if (buffer == NULL) + { + CERROR ("Unable to allocate out_buf (%d bytes)\n", pkt_size); + rc = -ENOMEM; + goto exit0; + } + + PJK_UT_MSG("LNetNiInit()\n"); + rc = 
LNetNIInit(0);
+        if (rc < 0)
+        {
+                CERROR ("LNetNIInit: error %d\n", rc);
+                goto exit1;
+        }
+
+        LNetGetId(0,&mypid);
+        PJK_UT_MSG("my.nid="LPX64"\n",mypid.nid);
+        PJK_UT_MSG("my.pid=0x%x\n",mypid.pid);
+
+        PJK_UT_MSG("LNetEQAlloc\n");
+        rc = LNetEQAlloc(
+                64,      /* max number of events; why 64? */
+                handler, /* handler callback */
+                &eqh);   /* output handle */
+        if(rc != 0) {
+                CERROR("LNetEQAlloc failed %d\n",rc);
+                goto exit2;
+        }
+
+        anypid.nid = LNET_NID_ANY;
+        anypid.pid = LNET_PID_ANY;
+
+
+        PJK_UT_MSG("LNetMEAttach\n");
+        rc = LNetMEAttach(
+                UT_PORTAL,      /* ptl index*/
+                anypid,         /* pid - in this case allow any*/
+                0,              /*matchbits*/
+                0x0FFFF,        /*ignorebits - ignore bottom 16 bits*/
+                LNET_UNLINK,    /* unlink vs LNET_RETAIN*/
+                LNET_INS_BEFORE,
+                &meh);
+        if(rc != 0) {
+                CERROR("LNetMeAttach failed %d\n",rc);
+                goto exit3;
+        }
+
+        md.start = buffer;
+        md.length = pkt_size;
+        md.threshold = auto_unlink ? 1 : 100;
+        md.max_size = 0;
+        md.options = 0;
+        md.options |= LNET_MD_OP_GET;
+        md.options |= LNET_MD_OP_PUT;
+        md.options |= LNET_MD_ACK_DISABLE;
+        md.user_ptr= 0;
+        md.eq_handle = eqh;
+
+        PJK_UT_MSG("LNetMDAttach\n");
+        rc = LNetMDAttach(
+                meh,
+                md,
+                LNET_UNLINK,
+                &mdh);
+        if(rc != 0){
+                CERROR("LNetMDAttach failed %d\n",rc);
+                goto exit4;
+        }
+
+        rc = 0;
+        goto exit0;
+
+exit4:
+        PJK_UT_MSG("LNetMEUnlink()\n");
+        LNetMEUnlink(meh);
+exit3:
+        PJK_UT_MSG("LNetEQFree()\n");
+        LNetEQFree(eqh);
+exit2:
+        PJK_UT_MSG("LNetNiFini()\n");
+        LNetNIFini();
+exit1:
+        LIBCFS_FREE(buffer,pkt_size);
+exit0:
+        PJK_UT_MSG("<<< rc=%d\n",rc);
+        return rc;
+
+} /* utsrv_init() */
+
+
+static void /*__exit*/ utsrv_cleanup(void)
+{
+        PJK_UT_MSG(">>>\n");
+        PJK_UT_MSG("LNetMDUnlink()\n");
+        LNetMDUnlink(mdh);
+        PJK_UT_MSG("LNetMEUnlink()\n");
+        LNetMEUnlink(meh);
+        PJK_UT_MSG("LNetEQFree()\n");
+        LNetEQFree(eqh);
+        PJK_UT_MSG("LNetNiFini()\n");
+        LNetNIFini();
+        LIBCFS_FREE(buffer,pkt_size);
+        PJK_UT_MSG("<<<\n");
+} /* utsrv_cleanup() */
+
+
+MODULE_AUTHOR("PJ Kirner (CFS)");
+MODULE_DESCRIPTION("A simple LNET Unit Test module");
+MODULE_LICENSE("GPL");
+
+cfs_module(utsvr, "1.0.0", utsrv_init, utsrv_cleanup);
diff --git a/lnet/ulnds/.cvsignore b/lnet/ulnds/.cvsignore
index e995588..2711a44 100644
--- a/lnet/ulnds/.cvsignore
+++ b/lnet/ulnds/.cvsignore
@@ -1,3 +1,4 @@
 .deps
 Makefile
-Makefile.in
+autoMakefile
+autoMakefile.in
diff --git a/lnet/ulnds/Makefile.am b/lnet/ulnds/Makefile.am
deleted file mode 100644
index 3437d39..0000000
--- a/lnet/ulnds/Makefile.am
+++ /dev/null
@@ -1,10 +0,0 @@
-if LIBLUSTRE
-if !CRAY_PORTALS
-noinst_LIBRARIES = libtcpnal.a
-endif
-endif
-
-noinst_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h
-libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h
-libtcpnal_a_CPPFLAGS = $(LLCPPFLAGS)
-libtcpnal_a_CFLAGS = $(LLCFLAGS)
diff --git a/lnet/ulnds/Makefile.in b/lnet/ulnds/Makefile.in
new file mode 100644
index 0000000..78432ee
--- /dev/null
+++ b/lnet/ulnds/Makefile.in
@@ -0,0 +1,5 @@
+@BUILD_USOCKLND_TRUE@subdir-m += socklnd
+@BUILD_UPTLLND_TRUE@subdir-m += ptllnd
+
+@INCLUDE_RULES@
+
diff --git a/lnet/ulnds/README b/lnet/ulnds/README
deleted file mode 100644
index 6cb93d9..0000000
--- a/lnet/ulnds/README
+++ /dev/null
@@ -1,53 +0,0 @@
-This library implements two NAL interfaces, both running over IP.
-The first, tcpnal, creates TCP connections between participating
-processes in order to transport the portals requests.
The second, -ernal, provides a simple transport protocol which runs over -UDP datagrams. - -The interface functions return both of these values in host order for -convenience and readability. However this means that addresses -exchanged in messages between hosts of different orderings will not -function properly. - -Both NALs use the same support functions in order to schedule events -and communicate with the generic portals implementation. - - ------------------------- - | api | - |_______________________| - | lib | - |_______________________| - | ernal | |tcpnal | - |--------| |----------| - | udpsock| |connection| - |-----------------------| - | timer/select | - ------------------------- - - - These NALs uses the framework from fdnal of a pipe between the api -and library sides. This is wrapped up in the select on the library -side, and blocks on the api side. Performance could be severely -enhanced by collapsing this aritificial barrier, by using shared -memory queues, or by wiring the api layer directly to the library. - - -nid is defined as the low order 24-bits of the IP address of the -physical node left shifted by 8 plus a virtual node number of 0 -through 255 (really only 239). The virtual node number of a tcpnal -application should be specified using the environment variable -PTL_VIRTNODE. pid is now a completely arbitrary number in the -range of 0 to 255. The IP interface used can be overridden by -specifying the appropriate hostid by setting the PTL_HOSTID -environment variable. The value can be either dotted decimal -(n.n.n.n) or hex starting with "0x". -TCPNAL: - As the NAL needs to try to send to a particular nid/pid pair, it - will open up connections on demand. Because the port associated with - the connecting socket is different from the bound port, two - connections will normally be established between a pair of peers, with - data flowing from the anonymous connect (active) port to the advertised - or well-known bound (passive) port of each peer. - - Should the connection fail to open, an error is reported to the - library component, which causes the api request to fail. diff --git a/lnet/ulnds/address.c b/lnet/ulnds/address.c deleted file mode 100644 index 07b4249..0000000 --- a/lnet/ulnds/address.c +++ /dev/null @@ -1,147 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* address.c: - * this file provides functions to aquire the IP address of the node - * and translate them into a NID/PID pair which supports a static - * mapping of virtual nodes into the port range of an IP socket. 
-*/ - -#define DEBUG_SUBSYSTEM S_NAL - -#include -#include -#include -#include -#include -#include -#include - - -/* Function: get_node_id - * Returns: a 32 bit id for this node, actually a big-endian IP address - * - * get_node_id() determines the host name and uses the resolver to - * find out its ip address. This is fairly fragile and inflexible, but - * explicitly asking about interfaces and their addresses is very - * complicated and nonportable. - */ -static unsigned int get_node_id(void) -{ - char buffer[255]; - unsigned int x; - struct hostent *he; - char * host_envp; - - if (!(host_envp = getenv("PTL_HOSTID"))) - { - gethostname(buffer,sizeof(buffer)); - he=gethostbyname(buffer); - if (he) - x=*(unsigned int *)he->h_addr_list[0]; - else - x = 0; - return(ntohl(x)); - } - else - { - if (host_envp[1] != 'x') - { - int a, b, c, d; - sscanf(host_envp, "%d.%d.%d.%d", &a, &b, &c, &d); - return ((a<<24) | (b<<16) | (c<<8) | d); - } - else - { - long long hostid = strtoll(host_envp, 0, 0); - return((unsigned int) hostid); - } - } -} - - -/* Function: set_address - * Arugments: t: a procnal structure to populate with the request - * - * set_address performs the bit manipulations to set the nid, pid, and - * iptop8 fields of the procnal structures. - * - * TODO: fix pidrequest to try to do dynamic binding if PTL_ID_ANY - */ - -#ifdef DIRECT_IP_MODE -void set_address(bridge t,ptl_pid_t pidrequest) -{ - int port; - if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0; - else port=pidrequest; - t->lib_nal->libnal_ni.ni_pid.nid=get_node_id(); - t->lib_nal->libnal_ni.ni_pid.pid=port; -} -#else - -void set_address(bridge t,ptl_pid_t pidrequest) -{ - int virtnode, in_addr, port; - ptl_pid_t pid; - - /* get and remember my node id*/ - if (!getenv("PTL_VIRTNODE")) - virtnode = 0; - else - { - int maxvnode = PNAL_VNODE_MASK - (PNAL_BASE_PORT - >> PNAL_VNODE_SHIFT); - virtnode = atoi(getenv("PTL_VIRTNODE")); - if (virtnode > maxvnode) - { - fprintf(stderr, "PTL_VIRTNODE of %d is too large - max %d\n", - virtnode, maxvnode); - return; - } - } - - in_addr = get_node_id(); - - t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */ - t->lib_nal->libnal_ni.ni_pid.nid = ((in_addr & PNAL_HOSTID_MASK) - << PNAL_VNODE_SHIFT) - + virtnode; - pid=pidrequest; - /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. */ -#ifdef notyet - if (pid==(unsigned short)PTL_PID_ANY) port = 0; -#endif - if (pid==(unsigned short)PTL_PID_ANY) - { - fprintf(stderr, "portal pid PTL_ID_ANY is not currently supported\n"); - return; - } - else if (pid > PNAL_PID_MASK) - { - fprintf(stderr, "portal pid of %d is too large - max %d\n", - pid, PNAL_PID_MASK); - return; - } - else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT; - t->lib_nal->libnal_ni.ni_pid.pid=pid; -} -#endif diff --git a/lnet/ulnds/autoMakefile.am b/lnet/ulnds/autoMakefile.am new file mode 100644 index 0000000..0e7fa4c2 --- /dev/null +++ b/lnet/ulnds/autoMakefile.am @@ -0,0 +1,6 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +SUBDIRS = socklnd ptllnd diff --git a/lnet/ulnds/bridge.h b/lnet/ulnds/bridge.h deleted file mode 100644 index d2f0f2c..0000000 --- a/lnet/ulnds/bridge.h +++ /dev/null @@ -1,34 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. 
- * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -#ifndef TCPNAL_PROCBRIDGE_H -#define TCPNAL_PROCBRIDGE_H - -#include -#include - -#define PTL_IFACE_TCP 1 -#define PTL_IFACE_ER 2 -#define PTL_IFACE_SS 3 -#define PTL_IFACE_MAX 4 - -typedef struct bridge { - int alive; - lib_nal_t *lib_nal; - void *lower; - void *local; - void (*shutdown)(struct bridge *); - /* this doesn't really belong here */ - unsigned char iptop8; -} *bridge; - - -typedef int (*nal_initialize)(bridge); -extern nal_initialize nal_table[PTL_IFACE_MAX]; - -#endif diff --git a/lnet/ulnds/connection.c b/lnet/ulnds/connection.c deleted file mode 100644 index 49cca96..0000000 --- a/lnet/ulnds/connection.c +++ /dev/null @@ -1,507 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* connection.c: - This file provides a simple stateful connection manager which - builds tcp connections on demand and leaves them open for - future use. It also provides the machinery to allow peers - to connect to it -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifndef __CYGWIN__ -#include -#endif - -/* global variable: acceptor port */ -unsigned short tcpnal_acceptor_port = 988; - - -/* Function: compare_connection - * Arguments: connection c: a connection in the hash table - * ptl_process_id_t: an id to verify agains - * Returns: 1 if the connection is the one requested, 0 otherwise - * - * compare_connection() tests for collisions in the hash table - */ -static int compare_connection(void *arg1, void *arg2) -{ - connection c = arg1; - unsigned int * id = arg2; -#if 0 - return((c->ip==id[0]) && (c->port==id[1])); -#else - /* CFS specific hacking */ - return (c->ip == id[0]); -#endif -} - - -/* Function: connection_key - * Arguments: ptl_process_id_t id: an id to hash - * Returns: a not-particularily-well-distributed hash - * of the id - */ -static unsigned int connection_key(unsigned int *id) -{ -#if 0 - return(id[0]^id[1]); -#else - /* CFS specific hacking */ - return (unsigned int) id[0]; -#endif -} - - -/* Function: remove_connection - * Arguments: c: the connection to remove - */ -void remove_connection(void *arg) -{ - connection c = arg; - unsigned int id[2]; - - id[0]=c->ip; - id[1]=c->port; - hash_table_remove(c->m->connections,id); - close(c->fd); - free(c); -} - - -/* Function: read_connection: - * Arguments: c: the connection to read from - * dest: the buffer to read into - * len: the number of bytes to read - * Returns: success as 1, or failure as 0 - * - * read_connection() reads data from the connection, continuing - * to read partial results until the request is 
satisfied or - * it errors. TODO: this read should be covered by signal protection. - */ -int read_connection(connection c, - unsigned char *dest, - int len) -{ - int offset = 0,rc; - - if (len) { - do { -#ifndef __CYGWIN__ - rc = syscall(SYS_read, c->fd, dest+offset, len-offset); -#else - rc = recv(c->fd, dest+offset, len-offset, 0); -#endif - if (rc <= 0) { - if (errno == EINTR) { - rc = 0; - } else { - remove_connection(c); - return (0); - } - } - offset += rc; - } while (offset < len); - } - return (1); -} - -static int connection_input(void *d) -{ - connection c = d; - return((*c->m->handler)(c->m->handler_arg,c)); -} - - -/* Function: allocate_connection - * Arguments: t: tcpnal the allocation is occuring in the context of - * dest: portal endpoint address for this connection - * fd: open file descriptor for the socket - * Returns: an allocated connection structure - * - * just encompasses the action common to active and passive - * connections of allocation and placement in the global table - */ -static connection allocate_connection(manager m, - unsigned int ip, - unsigned short port, - int fd) -{ - connection c=malloc(sizeof(struct connection)); - unsigned int id[2]; - c->m=m; - c->fd=fd; - c->ip=ip; - c->port=port; - id[0]=ip; - id[1]=port; - register_io_handler(fd,READ_HANDLER,connection_input,c); - hash_table_insert(m->connections,c,id); - return(c); -} - - -/* Function: new_connection - * Arguments: t: opaque argument holding the tcpname - * Returns: 1 in order to reregister for new connection requests - * - * called when the bound service socket recieves - * a new connection request, it always accepts and - * installs a new connection - */ -static int new_connection(void *z) -{ - manager m=z; - struct sockaddr_in s; - int len=sizeof(struct sockaddr_in); - int fd=accept(m->bound,(struct sockaddr *)&s,&len); - unsigned int nid=*((unsigned int *)&s.sin_addr); - /* cfs specific hack */ - //unsigned short pid=s.sin_port; - pthread_mutex_lock(&m->conn_lock); - allocate_connection(m,htonl(nid),0/*pid*/,fd); - pthread_mutex_unlock(&m->conn_lock); - return(1); -} - -extern ptl_nid_t tcpnal_mynid; - -int -tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation) -{ - int rc; - int nob; - ptl_hdr_t hdr; - ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid; - - LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid)); - - memset (&hdr, 0, sizeof (hdr)); - hmv->magic = cpu_to_le32(PORTALS_PROTO_MAGIC); - hmv->version_major = cpu_to_le32(PORTALS_PROTO_VERSION_MAJOR); - hmv->version_minor = cpu_to_le32(PORTALS_PROTO_VERSION_MINOR); - - hdr.src_nid = cpu_to_le64(tcpnal_mynid); - hdr.type = cpu_to_le32(PTL_MSG_HELLO); - - hdr.msg.hello.type = cpu_to_le32(type); - hdr.msg.hello.incarnation = cpu_to_le64(incarnation); - - /* I don't send any interface info */ - - /* Assume sufficient socket buffering for this message */ - rc = syscall(SYS_write, sockfd, &hdr, sizeof(hdr)); - if (rc <= 0) { - CERROR ("Error %d sending HELLO to "LPX64"\n", rc, *nid); - return (rc); - } - - rc = syscall(SYS_read, sockfd, hmv, sizeof(*hmv)); - if (rc <= 0) { - CERROR ("Error %d reading HELLO from "LPX64"\n", rc, *nid); - return (rc); - } - - if (hmv->magic != le32_to_cpu(PORTALS_PROTO_MAGIC)) { - CERROR ("Bad magic %#08x (%#08x expected) from "LPX64"\n", - cpu_to_le32(hmv->magic), PORTALS_PROTO_MAGIC, *nid); - return (-EPROTO); - } - - if (hmv->version_major != cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) || - hmv->version_minor != cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) { - CERROR ("Incompatible 
protocol version %d.%d (%d.%d expected)" - " from "LPX64"\n", - le16_to_cpu (hmv->version_major), - le16_to_cpu (hmv->version_minor), - PORTALS_PROTO_VERSION_MAJOR, - PORTALS_PROTO_VERSION_MINOR, - *nid); - return (-EPROTO); - } - -#if (PORTALS_PROTO_VERSION_MAJOR != 1) -# error "This code only understands protocol version 1.x" -#endif - /* version 1 sends magic/version as the dest_nid of a 'hello' header, - * so read the rest of it in now... */ - - rc = syscall(SYS_read, sockfd, hmv + 1, sizeof(hdr) - sizeof(*hmv)); - if (rc <= 0) { - CERROR ("Error %d reading rest of HELLO hdr from "LPX64"\n", - rc, *nid); - return (rc); - } - - /* ...and check we got what we expected */ - if (hdr.type != cpu_to_le32 (PTL_MSG_HELLO)) { - CERROR ("Expecting a HELLO hdr " - " but got type %d with %d payload from "LPX64"\n", - le32_to_cpu (hdr.type), - le32_to_cpu (hdr.payload_length), *nid); - return (-EPROTO); - } - - if (le64_to_cpu(hdr.src_nid) == PTL_NID_ANY) { - CERROR("Expecting a HELLO hdr with a NID, but got PTL_NID_ANY\n"); - return (-EPROTO); - } - - if (*nid == PTL_NID_ANY) { /* don't know peer's nid yet */ - *nid = le64_to_cpu(hdr.src_nid); - } else if (*nid != le64_to_cpu (hdr.src_nid)) { - CERROR ("Connected to nid "LPX64", but expecting "LPX64"\n", - le64_to_cpu (hdr.src_nid), *nid); - return (-EPROTO); - } - - /* Ignore any interface info in the payload */ - nob = le32_to_cpu(hdr.payload_length); - if (nob > getpagesize()) { - CERROR("Unexpected HELLO payload %d from "LPX64"\n", - nob, *nid); - return (-EPROTO); - } - if (nob > 0) { - char *space = (char *)malloc(nob); - - if (space == NULL) { - CERROR("Can't allocate scratch buffer %d\n", nob); - return (-ENOMEM); - } - - rc = syscall(SYS_read, sockfd, space, nob); - if (rc <= 0) { - CERROR("Error %d skipping HELLO payload from " - LPX64"\n", rc, *nid); - return (rc); - } - } - - return (0); -} - -/* Function: force_tcp_connection - * Arguments: t: tcpnal - * dest: portals endpoint for the connection - * Returns: an allocated connection structure, either - * a pre-existing one, or a new connection - */ -connection force_tcp_connection(manager m, - unsigned int ip, - unsigned short port, - procbridge pb) -{ - connection conn; - struct sockaddr_in addr; - struct sockaddr_in locaddr; - unsigned int id[2]; - struct timeval tv; - __u64 incarnation; - - int fd; - int option; - int rc; - int rport; - ptl_nid_t peernid = PTL_NID_ANY; - - port = tcpnal_acceptor_port; - - id[0] = ip; - id[1] = port; - - pthread_mutex_lock(&m->conn_lock); - - conn = hash_table_find(m->connections, id); - if (conn) - goto out; - - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = htonl(ip); - addr.sin_port = htons(port); - - memset(&locaddr, 0, sizeof(locaddr)); - locaddr.sin_family = AF_INET; - locaddr.sin_addr.s_addr = INADDR_ANY; - - for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) { - fd = socket(AF_INET, SOCK_STREAM, 0); - if (fd < 0) { - perror("tcpnal socket failed"); - goto out; - } - - option = 1; - rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, - &option, sizeof(option)); - if (rc != 0) { - perror ("Can't set SO_REUSEADDR for socket"); - close(fd); - goto out; - } - - locaddr.sin_port = htons(rport); - rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr)); - if (rc == 0 || errno == EACCES) { - rc = connect(fd, (struct sockaddr *)&addr, - sizeof(struct sockaddr_in)); - if (rc == 0) { - break; - } else if (errno != EADDRINUSE && errno != EADDRNOTAVAIL) { - perror("Error connecting to remote 
host"); - close(fd); - goto out; - } - } else if (errno != EADDRINUSE) { - perror("Error binding to privileged port"); - close(fd); - goto out; - } - close(fd); - } - - if (rport == IPPORT_RESERVED / 2) { - fprintf(stderr, "Out of ports trying to bind to a reserved port\n"); - goto out; - } - -#if 1 - option = 1; - setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option)); - option = 1<<20; - setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option)); - option = 1<<20; - setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option)); -#endif - - gettimeofday(&tv, NULL); - incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - - /* say hello */ - if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation)) - exit(-1); - - conn = allocate_connection(m, ip, port, fd); - - /* let nal thread know this event right away */ - if (conn) - procbridge_wakeup_nal(pb); - -out: - pthread_mutex_unlock(&m->conn_lock); - return (conn); -} - - -/* Function: bind_socket - * Arguments: t: the nal state for this interface - * port: the port to attempt to bind to - * Returns: 1 on success, or 0 on error - * - * bind_socket() attempts to allocate and bind a socket to the requested - * port, or dynamically assign one from the kernel should the port be - * zero. Sets the bound and bound_handler elements of m. - * - * TODO: The port should be an explicitly sized type. - */ -static int bind_socket(manager m,unsigned short port) -{ - struct sockaddr_in addr; - int alen=sizeof(struct sockaddr_in); - - if ((m->bound = socket(AF_INET, SOCK_STREAM, 0)) < 0) - return(0); - - bzero((char *) &addr, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = 0; - addr.sin_port = htons(port); - - if (bind(m->bound,(struct sockaddr *)&addr,alen)<0){ - fprintf(stderr, "tcpnal bind: %s port %u\n", strerror(errno), port); - return(0); - } - - getsockname(m->bound,(struct sockaddr *)&addr, &alen); - - m->bound_handler=register_io_handler(m->bound,READ_HANDLER, - new_connection,m); - listen(m->bound,5); - m->port=addr.sin_port; - return(1); -} - - -/* Function: shutdown_connections - * Arguments: m: the manager structure - * - * close all connections and reclaim resources - */ -void shutdown_connections(manager m) -{ - close(m->bound); - remove_io_handler(m->bound_handler); - hash_destroy_table(m->connections,remove_connection); - free(m); -} - - -/* Function: init_connections - * Arguments: t: the nal state for this interface - * port: the port to attempt to bind to - * Returns: a newly allocated manager structure, or - * zero if the fixed port could not be bound - */ -manager init_connections(unsigned short pid, - int (*input)(void *, void *), - void *a) -{ - manager m = (manager)malloc(sizeof(struct manager)); - m->connections = hash_create_table(compare_connection,connection_key); - m->handler = input; - m->handler_arg = a; - pthread_mutex_init(&m->conn_lock, 0); - - if (bind_socket(m,pid)) - return(m); - - free(m); - return(0); -} diff --git a/lnet/ulnds/connection.h b/lnet/ulnds/connection.h deleted file mode 100644 index 343ffa6..0000000 --- a/lnet/ulnds/connection.h +++ /dev/null @@ -1,35 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. 
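The bind-and-connect retry in force_tcp_connection() above walks the reserved port range, from IPPORT_RESERVED - 1 down to just above IPPORT_RESERVED / 2, treating EADDRINUSE (and EADDRNOTAVAIL from connect()) as "try the next port" and anything else as fatal. A self-contained sketch of the same pattern; the helper name connect_from_reserved_port() is illustrative, not from the patch:

#include <errno.h>
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Connect to 'peer' from a reserved local port; returns an fd or -1.
 * Mirrors force_tcp_connection(): bind each port in
 * (IPPORT_RESERVED/2, IPPORT_RESERVED) and move on when it is busy. */
static int connect_from_reserved_port(const struct sockaddr_in *peer)
{
        struct sockaddr_in local;
        int rport;

        memset(&local, 0, sizeof(local));
        local.sin_family = AF_INET;
        local.sin_addr.s_addr = INADDR_ANY;

        for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; rport--) {
                int fd = socket(AF_INET, SOCK_STREAM, 0);
                int on = 1;

                if (fd < 0)
                        return -1;
                setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));

                local.sin_port = htons(rport);
                /* EACCES tolerated, as in the original: fall back to an
                 * ephemeral port rather than failing outright */
                if (bind(fd, (struct sockaddr *)&local, sizeof(local)) == 0 ||
                    errno == EACCES) {
                        if (connect(fd, (const struct sockaddr *)peer,
                                    sizeof(*peer)) == 0)
                                return fd;                /* connected */
                        if (errno != EADDRINUSE && errno != EADDRNOTAVAIL) {
                                close(fd);
                                return -1;                /* hard failure */
                        }
                } else if (errno != EADDRINUSE) {
                        close(fd);
                        return -1;                        /* hard failure */
                }
                close(fd);                /* port busy: try the next one */
        }
        return -1;                        /* reserved range exhausted */
}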
- * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -#include -#include - -typedef struct manager { - table connections; - pthread_mutex_t conn_lock; /* protect connections table */ - int bound; - io_handler bound_handler; - int (*handler)(void *, void *); - void *handler_arg; - unsigned short port; -} *manager; - - -typedef struct connection { - unsigned int ip; - unsigned short port; - int fd; - manager m; -} *connection; - -connection force_tcp_connection(manager m, unsigned int ip, unsigned int short, - procbridge pb); -manager init_connections(unsigned short, int (*f)(void *, void *), void *); -void remove_connection(void *arg); -void shutdown_connections(manager m); -int read_connection(connection c, unsigned char *dest, int len); diff --git a/lnet/ulnds/debug.c b/lnet/ulnds/debug.c deleted file mode 100644 index b82bb2f..0000000 --- a/lnet/ulnds/debug.c +++ /dev/null @@ -1,119 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002 Cluster File Systems, Inc. - * Author: Phil Schwan - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include - -int smp_processor_id = 1; -char debug_file_path[1024] = "/tmp/lustre-log"; -char debug_file_name[1024]; -FILE *debug_file_fd; - -int portals_do_debug_dumplog(void *arg) -{ - printf("Look in %s\n", debug_file_name); - return 0; -} - - -void portals_debug_print(void) -{ - return; -} - - -void portals_debug_dumplog(void) -{ - printf("Look in %s\n", debug_file_name); - return; -} - - -int portals_debug_init(unsigned long bufsize) -{ - debug_file_fd = stdout; - return 0; -} - -int portals_debug_cleanup(void) -{ - return 0; //close(portals_debug_fd); -} - -int portals_debug_clear_buffer(void) -{ - return 0; -} - -int portals_debug_mark_buffer(char *text) -{ - - fprintf(debug_file_fd, "*******************************************************************************\n"); - fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text); - fprintf(debug_file_fd, "*******************************************************************************\n"); - - return 0; -} - -int portals_debug_copy_to_user(char *buf, unsigned long len) -{ - return 0; -} - -/* FIXME: I'm not very smart; someone smarter should make this better. */ -void -portals_debug_msg (int subsys, int mask, char *file, const char *fn, - const int line, const char *format, ...) -{ - va_list ap; - unsigned long flags; - struct timeval tv; - int nob; - - - /* NB since we pass a non-zero sized buffer (at least) on the first - * print, we can be assured that by the end of all the snprinting, - * we _do_ have a terminated buffer, even if our message got truncated. 
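init_connections() above builds the table from two callbacks, compare_connection() and connection_key(), which are defined with the hash-table code rather than in this hunk. Given the unsigned int id[2] = {ip, port} key used by allocate_connection(), they presumably look something like this sketch (an assumption, not the removed source):

/* Sketch: match a stored connection against an {ip, port} key... */
static int compare_connection(void *stored, void *key)
{
        connection c = stored;
        unsigned int *id = key;

        return (c->ip == id[0]) && (c->port == id[1]);
}

/* ...and reduce the key to a hash bucket; any mix of ip and port will do. */
static unsigned int connection_key(unsigned int *id)
{
        return (id[0] ^ id[1]);
}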
- */ - - gettimeofday(&tv, NULL); - - nob += fprintf(debug_file_fd, - "%02x:%06x:%d:%lu.%06lu ", - subsys >> 24, mask, smp_processor_id, - tv.tv_sec, tv.tv_usec); - - nob += fprintf(debug_file_fd, - "(%s:%d:%s() %d+%ld): ", - file, line, fn, 0, - 8192 - ((unsigned long)&flags & 8191UL)); - - va_start (ap, format); - nob += fprintf(debug_file_fd, format, ap); - va_end (ap); - - -} - diff --git a/lnet/ulnds/dispatch.h b/lnet/ulnds/dispatch.h deleted file mode 100644 index a8f916d9..0000000 --- a/lnet/ulnds/dispatch.h +++ /dev/null @@ -1,46 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2002 Eric Hoffman - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -/* this file is only called dispatch.h to prevent it - from colliding with /usr/include/sys/select.h */ - -typedef struct io_handler *io_handler; - -struct io_handler{ - io_handler *last; - io_handler next; - int fd; - int type; - int (*function)(void *); - void *argument; - int disabled; -}; - - -#define READ_HANDLER 1 -#define WRITE_HANDLER 2 -#define EXCEPTION_HANDLER 4 -#define ALL_HANDLER (READ_HANDLER | WRITE_HANDLER | EXCEPTION_HANDLER) - -io_handler register_io_handler(int fd, - int type, - int (*function)(void *), - void *arg); - -void remove_io_handler (io_handler i); -void init_unix_timer(void); -void select_timer_block(when until); -when now(void); - -/* - * hacking for CFS internal MPI testing - */ -#if !CRAY_PORTALS -#define ENABLE_SELECT_DISPATCH -#endif diff --git a/lnet/ulnds/ipmap.h b/lnet/ulnds/ipmap.h deleted file mode 100644 index 85b1e18..0000000 --- a/lnet/ulnds/ipmap.h +++ /dev/null @@ -1,38 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -#define DIRECT_IP_MODE -#ifdef DIRECT_IP_MODE -#define PNAL_NID(in_addr, port) (in_addr) -#define PNAL_PID(pid) (pid) -#define PNAL_IP(in_addr, port) (in_addr) -#define PNAL_PORT(nid, pid) (pid) -#else - -#define PNAL_BASE_PORT 4096 -#define PNAL_HOSTID_SHIFT 24 -#define PNAL_HOSTID_MASK ((1 << PNAL_HOSTID_SHIFT) - 1) -#define PNAL_VNODE_SHIFT 8 -#define PNAL_VNODE_MASK ((1 << PNAL_VNODE_SHIFT) - 1) -#define PNAL_PID_SHIFT 8 -#define PNAL_PID_MASK ((1 << PNAL_PID_SHIFT) - 1) - -#define PNAL_NID(in_addr, port) (((ntohl(in_addr) & PNAL_HOSTID_MASK) \ - << PNAL_VNODE_SHIFT) \ - | (((ntohs(port)-PNAL_BASE_PORT) >>\ - PNAL_PID_SHIFT))) -#define PNAL_PID(port) ((ntohs(port) - PNAL_BASE_PORT) & PNAL_PID_MASK) - -#define PNAL_IP(nid,t) (htonl((((unsigned)(nid))\ - >> PNAL_VNODE_SHIFT)\ - | (t->iptop8 << PNAL_HOSTID_SHIFT))) -#define PNAL_PORT(nid, pid) (htons(((((nid) & PNAL_VNODE_MASK) \ - << PNAL_VNODE_SHIFT) \ - | ((pid) & PNAL_PID_MASK)) \ - + PNAL_BASE_PORT)) -#endif diff --git a/lnet/ulnds/pqtimer.c b/lnet/ulnds/pqtimer.c deleted file mode 100644 index 98c48eb..0000000 --- a/lnet/ulnds/pqtimer.c +++ /dev/null @@ -1,226 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2002 Eric Hoffman - * - * This file is part of Lustre, http://www.lustre.org. 
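Two bugs are worth noting in the deleted portals_debug_msg(): nob is summed before ever being initialised, and the va_list is handed to fprintf(), which expects the variadic arguments themselves, so the caller's format arguments are never actually expanded. The portable way to forward a va_list is vfprintf(); a minimal corrected shape:

#include <stdarg.h>
#include <stdio.h>

/* Forward variadic args to a stream: va_start/va_end in the variadic
 * function, vfprintf() to consume the va_list. */
static int debug_msg(FILE *out, const char *fmt, ...)
{
        va_list ap;
        int nob;

        va_start(ap, fmt);
        nob = vfprintf(out, fmt, ap);   /* not fprintf(out, fmt, ap) */
        va_end(ap);

        return nob;
}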
- * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* timer.c: - * this file implements a simple priority-queue based timer system. when - * combined with a file which implements now() and block(), it can - * be used to provide course-grained time-based callbacks. - */ - -#include -#include -#include - -struct timer { - void (*function)(void *); - void *arg; - when w; - int interval; - int disable; -}; - -typedef struct thunk *thunk; -struct thunk { - void (*f)(void *); - void *a; - thunk next; -}; - -extern when now(void); - -static thunk thunks; -static int internal; -static void (*block_function)(when); -static int number_of_timers; -static int size_of_pqueue; -static timer *timers; - - -static void heal(int where) -{ - int left=(where<<1); - int right=(where<<1)+1; - int min=where; - timer temp; - - if (left <= number_of_timers) - if (timers[left]->w < timers[min]->w) min=left; - if (right <= number_of_timers) - if (timers[right]->w < timers[min]->w) min=right; - if (min != where){ - temp=timers[where]; - timers[where]=timers[min]; - timers[min]=temp; - heal(min); - } -} - -static void add_pqueue(int i) -{ - timer temp; - int parent=(i>>1); - if ((i>1) && (timers[i]->w< timers[parent]->w)){ - temp=timers[i]; - timers[i]=timers[parent]; - timers[parent]=temp; - add_pqueue(parent); - } -} - -static void add_timer(timer t) -{ - if (size_of_pqueue<(number_of_timers+2)){ - int oldsize=size_of_pqueue; - timer *new=(void *)malloc(sizeof(struct timer)*(size_of_pqueue+=10)); - memcpy(new,timers,sizeof(timer)*oldsize); - timers=new; - } - timers[++number_of_timers]=t; - add_pqueue(number_of_timers); -} - -/* Function: register_timer - * Arguments: interval: the time interval from the current time when - * the timer function should be called - * function: the function to call when the time has expired - * argument: the argument to call it with. - * Returns: a pointer to a timer structure - */ -timer register_timer(when interval, - void (*function)(void *), - void *argument) -{ - timer t=(timer)malloc(sizeof(struct timer)); - - t->arg=argument; - t->function=function; - t->interval=interval; - t->disable=0; - t->w=now()+interval; - add_timer(t); - if (!internal && (number_of_timers==1)) - block_function(t->w); - return(t); -} - -/* Function: remove_timer - * Arguments: t: - * Returns: nothing - * - * remove_timer removes a timer from the system, insuring - * that it will never be called. It does not actually - * free the timer due to reentrancy issues. 
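heal() and add_pqueue() above are the standard sift-down and sift-up of a 1-based implicit binary min-heap: entry i has children at 2i and 2i+1, so the earliest deadline is always timers[1]. (In passing: add_timer() grows the array with malloc()+memcpy() and never frees the old block, so each resize leaks it.) The same two operations on a bare array of deadlines, for reference:

/* 1-based min-heap: parent of i is i/2, children are 2i and 2i+1. */
static void sift_up(unsigned long long *w, int i)
{
        while (i > 1 && w[i] < w[i / 2]) {
                unsigned long long t = w[i];

                w[i] = w[i / 2];        /* bubble the new entry up... */
                w[i / 2] = t;
                i /= 2;
        }
}

static void sift_down(unsigned long long *w, int n, int i)
{
        for (;;) {
                int l = 2 * i, r = 2 * i + 1, min = i;
                unsigned long long t;

                if (l <= n && w[l] < w[min]) min = l;
                if (r <= n && w[r] < w[min]) min = r;
                if (min == i)
                        break;          /* ...or a displaced one down */
                t = w[i]; w[i] = w[min]; w[min] = t;
                i = min;
        }
}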
- */ - -void remove_timer(timer t) -{ - t->disable=1; -} - - - -void timer_fire() -{ - timer current; - - current=timers[1]; - timers[1]=timers[number_of_timers--]; - heal(1); - if (!current->disable) { - (*current->function)(current->arg); - } - free(current); -} - -when next_timer(void) -{ - when here=now(); - - while (number_of_timers && (timers[1]->w <= here)) timer_fire(); - if (number_of_timers) return(timers[1]->w); - return(0); -} - -/* Function: timer_loop - * Arguments: none - * Returns: never - * - * timer_loop() is the blocking dispatch function for the timer. - * Is calls the block() function registered with init_timer, - * and handles associated with timers that have been registered. - */ -void timer_loop() -{ - when here; - - while (1){ - thunk z; - here=now(); - - for (z=thunks;z;z=z->next) (*z->f)(z->a); - - if (number_of_timers){ - if (timers[1]->w > here){ - (*block_function)(timers[1]->w); - } else { - timer_fire(); - } - } else { - thunk z; - for (z=thunks;z;z=z->next) (*z->f)(z->a); - (*block_function)(0); - } - } -} - - -/* Function: register_thunk - * Arguments: f: the function to call - * a: the single argument to call it with - * - * Thunk functions get called at irregular intervals, they - * should not assume when, or take a particularily long - * amount of time. Thunks are for background cleanup tasks. - */ -void register_thunk(void (*f)(void *),void *a) -{ - thunk t=(void *)malloc(sizeof(struct thunk)); - t->f=f; - t->a=a; - t->next=thunks; - thunks=t; -} - -/* Function: initialize_timer - * Arguments: block: the function to call to block for the specified interval - * - * initialize_timer() must be called before any other timer function, - * including timer_loop. - */ -void initialize_timer(void (*block)(when)) -{ - block_function=block; - number_of_timers=0; - size_of_pqueue=10; - timers=(timer *)malloc(sizeof(timer)*size_of_pqueue); - thunks=0; -} diff --git a/lnet/ulnds/pqtimer.h b/lnet/ulnds/pqtimer.h deleted file mode 100644 index 11efb0e..0000000 --- a/lnet/ulnds/pqtimer.h +++ /dev/null @@ -1,25 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2002 Eric Hoffman - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -typedef unsigned long long when; -when now(void); -typedef struct timer *timer; -timer register_timer(when interval, - void (*function)(void *), - void *argument); -timer register_timer_wait(void); -void remove_timer(timer); -void timer_loop(void); -void initialize_timer(void (*block)(when)); -void timer_fire(void); - - -#define HZ 0x100000000ull - - diff --git a/lnet/ulnds/procapi.c b/lnet/ulnds/procapi.c deleted file mode 100644 index 6b471c0..0000000 --- a/lnet/ulnds/procapi.c +++ /dev/null @@ -1,196 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2003 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
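To see how the pieces fit: the caller hands block() to initialize_timer(), registers one or more timers, then gives the thread to timer_loop(), which fires whatever has expired and blocks until the next deadline. Timers are one-shot (timer_fire() frees them after the callback), so periodic work re-registers itself. A usage sketch that links against the functions above; the busy-wait block function merely stands in for the dispatcher's select-based one:

#include <stdio.h>

typedef unsigned long long when;
#define HZ 0x100000000ull       /* from pqtimer.h: 1s in 32.32 fixed point */

/* prototypes as declared in pqtimer.h */
extern when now(void);
extern void initialize_timer(void (*block)(when));
extern struct timer *register_timer(when interval,
                                    void (*function)(void *), void *arg);
extern void timer_loop(void);

static void tick(void *arg)
{
        printf("tick: %s\n", (const char *)arg);
        register_timer(HZ, tick, arg);  /* one-shot, so re-arm by hand */
}

/* Illustrative only: the real block function sleeps in select(). */
static void block_until(when until)
{
        while (until != 0 && now() < until)
                ;
}

int main(void)
{
        initialize_timer(block_until);
        register_timer(HZ, tick, "one-second timer");
        timer_loop();                   /* never returns */
        return 0;
}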
- * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* api.c: - * This file provides the 'api' side for the process-based nals. - * it is responsible for creating the 'library' side thread, - * and passing wrapped portals transactions to it. - * - * Along with initialization, shutdown, and transport to the library - * side, this file contains some stubs to satisfy the nal definition. - */ -#include -#include -#include -#include -#ifndef __CYGWIN__ -#include -#endif -#include -#include -#include -#include -#include - - -/* XXX CFS workaround, to give a chance to let nal thread wake up - * from waiting in select - */ -static int procbridge_notifier_handler(void *arg) -{ - static char buf[8]; - procbridge p = (procbridge) arg; - - syscall(SYS_read, p->notifier[1], buf, sizeof(buf)); - return 1; -} - -void procbridge_wakeup_nal(procbridge p) -{ - static char buf[8]; - syscall(SYS_write, p->notifier[0], buf, sizeof(buf)); -} - -/* Function: shutdown - * Arguments: nal: a pointer to my top side nal structure - * ni: my network interface index - * - * cleanup nal state, reclaim the lower side thread and - * its state using PTL_FINI codepoint - */ -static void procbridge_shutdown(nal_t *n) -{ - lib_nal_t *nal = n->nal_data; - bridge b=(bridge)nal->libnal_data; - procbridge p=(procbridge)b->local; - - p->nal_flags |= NAL_FLAG_STOPPING; - procbridge_wakeup_nal(p); - - do { - pthread_mutex_lock(&p->mutex); - if (p->nal_flags & NAL_FLAG_STOPPED) { - pthread_mutex_unlock(&p->mutex); - break; - } - pthread_cond_wait(&p->cond, &p->mutex); - pthread_mutex_unlock(&p->mutex); - } while (1); - - free(p); -} - - -/* forward decl */ -extern int procbridge_startup (nal_t *, ptl_pid_t, - ptl_ni_limits_t *, ptl_ni_limits_t *); - -/* api_nal - * the interface vector to allow the generic code to access - * this nal. this is seperate from the library side lib_nal. - * TODO: should be dyanmically allocated - */ -nal_t procapi_nal = { - nal_data: NULL, - nal_ni_init: procbridge_startup, - nal_ni_fini: procbridge_shutdown, -}; - -ptl_nid_t tcpnal_mynid; - -#ifdef ENABLE_SELECT_DISPATCH -procbridge __global_procbridge = NULL; -#endif - -/* Function: procbridge_startup - * - * Arguments: pid: requested process id (port offset) - * PTL_ID_ANY not supported. - * desired: limits passed from the application - * and effectively ignored - * actual: limits actually allocated and returned - * - * Returns: portals rc - * - * initializes the tcp nal. we define unix_failure as an - * error wrapper to cut down clutter. 
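The notifier above is the self-pipe trick done with a socketpair: any thread writes a byte to one end to bounce the NAL thread out of select(), and the registered read handler simply drains the other end. Stripped of the procbridge plumbing, the pattern is just:

#include <sys/socket.h>
#include <unistd.h>

static int notifier[2];         /* [0] written by anyone, [1] select()ed on */

static int notifier_init(void)
{
        return socketpair(AF_UNIX, SOCK_STREAM, 0, notifier);
}

/* Call from any thread: wakes whoever is blocked in select() on notifier[1]. */
static void notifier_wakeup(void)
{
        char c = 0;

        (void)write(notifier[0], &c, 1);
}

/* The READ_HANDLER for notifier[1]: discard the wakeup bytes. */
static int notifier_drain(void *arg)
{
        char buf[8];

        (void)arg;              /* unused; matches the handler signature */
        return read(notifier[1], buf, sizeof(buf)) >= 0;
}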
- */ -int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, - ptl_ni_limits_t *requested_limits, - ptl_ni_limits_t *actual_limits) -{ - nal_init_args_t args; - - procbridge p; - bridge b; - /* XXX nal_type is purely private to tcpnal here */ - int nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */ - - LASSERT(nal == &procapi_nal); - - init_unix_timer(); - - b=(bridge)malloc(sizeof(struct bridge)); - p=(procbridge)malloc(sizeof(struct procbridge)); - b->local=p; - - args.nia_requested_pid = requested_pid; - args.nia_requested_limits = requested_limits; - args.nia_actual_limits = actual_limits; - args.nia_nal_type = nal_type; - args.nia_bridge = b; - args.nia_apinal = nal; - - /* init procbridge */ - pthread_mutex_init(&p->mutex,0); - pthread_cond_init(&p->cond, 0); - p->nal_flags = 0; - - /* initialize notifier */ - if (socketpair(AF_UNIX, SOCK_STREAM, 0, p->notifier)) { - perror("socketpair failed"); - return PTL_FAIL; - } - - if (!register_io_handler(p->notifier[1], READ_HANDLER, - procbridge_notifier_handler, p)) { - perror("fail to register notifier handler"); - return PTL_FAIL; - } - -#ifdef ENABLE_SELECT_DISPATCH - __global_procbridge = p; -#endif - - /* create nal thread */ - if (pthread_create(&p->t, NULL, nal_thread, &args)) { - perror("nal_init: pthread_create"); - return PTL_FAIL; - } - - do { - pthread_mutex_lock(&p->mutex); - if (p->nal_flags & (NAL_FLAG_RUNNING | NAL_FLAG_STOPPED)) { - pthread_mutex_unlock(&p->mutex); - break; - } - pthread_cond_wait(&p->cond, &p->mutex); - pthread_mutex_unlock(&p->mutex); - } while (1); - - if (p->nal_flags & NAL_FLAG_STOPPED) - return PTL_FAIL; - - b->lib_nal->libnal_ni.ni_pid.nid = tcpnal_mynid; - - return PTL_OK; -} diff --git a/lnet/ulnds/procbridge.h b/lnet/ulnds/procbridge.h deleted file mode 100644 index 1f91ced..0000000 --- a/lnet/ulnds/procbridge.h +++ /dev/null @@ -1,56 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2003 Cluster File Systems, Inc. - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -#ifndef _PROCBRIDGE_H_ -#define _PROCBRIDGE_H_ - -#include -#include -#include - - -#define NAL_FLAG_RUNNING 1 -#define NAL_FLAG_STOPPING 2 -#define NAL_FLAG_STOPPED 4 - -typedef struct procbridge { - /* sync between user threads and nal thread */ - pthread_t t; - pthread_cond_t cond; - pthread_mutex_t mutex; - - /* socket pair used to notify nal thread */ - int notifier[2]; - - int nal_flags; - -} *procbridge; - -typedef struct nal_init_args { - ptl_pid_t nia_requested_pid; - ptl_ni_limits_t *nia_requested_limits; - ptl_ni_limits_t *nia_actual_limits; - int nia_nal_type; - bridge nia_bridge; - nal_t *nia_apinal; -} nal_init_args_t; - -extern void *nal_thread(void *); - - -#define PTL_INIT (LIB_MAX_DISPATCH+1) -#define PTL_FINI (LIB_MAX_DISPATCH+2) - -#define MAX_ACLS 1 -#define MAX_PTLS 128 - -extern void set_address(bridge t,ptl_pid_t pidrequest); -extern void procbridge_wakeup_nal(procbridge p); - -#endif diff --git a/lnet/ulnds/proclib.c b/lnet/ulnds/proclib.c deleted file mode 100644 index 7ee7c71..0000000 --- a/lnet/ulnds/proclib.c +++ /dev/null @@ -1,137 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2003 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. 
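procbridge_startup() and procbridge_shutdown() both end in the same handshake: one side sets a bit in nal_flags under the mutex and broadcasts, the other loops in pthread_cond_wait() until the bit it cares about appears. The canonical form of that wait, holding the mutex across the re-check (the loops above get the same effect by re-locking on every pass):

#include <pthread.h>

static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond  = PTHREAD_COND_INITIALIZER;
static int             flags;

/* Waiter: sleep until any bit of 'mask' is set; spurious wakeups just
 * re-test the predicate. */
static int wait_for_flags(int mask)
{
        int seen;

        pthread_mutex_lock(&mutex);
        while ((flags & mask) == 0)
                pthread_cond_wait(&cond, &mutex);
        seen = flags;
        pthread_mutex_unlock(&mutex);

        return seen;
}

/* Signaller: publish a state bit and wake every waiter. */
static void set_flags(int bits)
{
        pthread_mutex_lock(&mutex);
        flags |= bits;
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&mutex);
}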
- * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* lib.c: - * This file provides the 'library' side for the process-based nals. - * it is responsible for communication with the 'api' side and - * providing service to the generic portals 'library' - * implementation. 'library' might be better termed 'communication' - * or 'kernel'. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* the following functions are stubs to satisfy the nal definition - without doing anything particularily useful*/ - -static int nal_dist(lib_nal_t *nal, - ptl_nid_t nid, - unsigned long *dist) -{ - return 0; -} - -static void check_stopping(void *z) -{ - bridge b = z; - procbridge p = b->local; - - if ((p->nal_flags & NAL_FLAG_STOPPING) == 0) - return; - - pthread_mutex_lock(&p->mutex); - p->nal_flags |= NAL_FLAG_STOPPED; - pthread_cond_broadcast(&p->cond); - pthread_mutex_unlock(&p->mutex); - - pthread_exit(0); -} - - -/* Function: nal_thread - * Arguments: z: an opaque reference to a nal control structure - * allocated and partially populated by the api level code - * Returns: nothing, and only on error or explicit shutdown - * - * This function is the entry point of the pthread initiated on - * the api side of the interface. This thread is used to handle - * asynchronous delivery to the application. - * - * We define a limit macro to place a ceiling on limits - * for syntactic convenience - */ -extern int tcpnal_init(bridge); - -nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0}; - -void *nal_thread(void *z) -{ - nal_init_args_t *args = (nal_init_args_t *) z; - bridge b = args->nia_bridge; - procbridge p=b->local; - int rc; - ptl_process_id_t process_id; - int nal_type; - - b->lib_nal=(lib_nal_t *)malloc(sizeof(lib_nal_t)); - b->lib_nal->libnal_data=b; - b->lib_nal->libnal_map=NULL; - b->lib_nal->libnal_unmap=NULL; - b->lib_nal->libnal_dist=nal_dist; - - nal_type = args->nia_nal_type; - - /* Wierd, but this sets b->lib_nal->libnal_ni.ni_pid.{nid,pid}, which - * lib_init() is about to do from the process_id passed to it...*/ - set_address(b,args->nia_requested_pid); - - process_id = b->lib_nal->libnal_ni.ni_pid; - - if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b); - /* initialize the generic 'library' level code */ - - rc = lib_init(b->lib_nal, args->nia_apinal, - process_id, - args->nia_requested_limits, - args->nia_actual_limits); - - /* - * Whatever the initialization returned is passed back to the - * user level code for further interpretation. We just exit if - * it is non-zero since something went wrong. - */ - /* this should perform error checking */ - pthread_mutex_lock(&p->mutex); - p->nal_flags |= (rc != PTL_OK) ? 
NAL_FLAG_STOPPED : NAL_FLAG_RUNNING; - pthread_cond_broadcast(&p->cond); - pthread_mutex_unlock(&p->mutex); - - if (rc == PTL_OK) { - /* the thunk function is called each time the timer loop - performs an operation and returns to blocking mode. we - overload this function to inform the api side that - it may be interested in looking at the event queue */ - register_thunk(check_stopping,b); - timer_loop(); - } - return(0); -} diff --git a/lnet/ulnds/ptllnd/.cvsignore b/lnet/ulnds/ptllnd/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lnet/ulnds/ptllnd/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git a/lnet/ulnds/ptllnd/Makefile.am b/lnet/ulnds/ptllnd/Makefile.am new file mode 100644 index 0000000..e48cb85 --- /dev/null +++ b/lnet/ulnds/ptllnd/Makefile.am @@ -0,0 +1,12 @@ + +if BUILD_UPTLLND +if LIBLUSTRE +noinst_LIBRARIES = libptllnd.a +noinst_HEADERS = ptllnd.h +libptllnd_a_SOURCES = ptllnd.h ptllnd.c ptllnd_cb.c +libptllnd_a_CPPFLAGS= $(LLCPPFLAGS) +# I need $(PTLNDCPPLFLAGS) to be AFTER $(CPPFLAGS) +# Adding them into $(AM_CFLAGS) seems wrong, but lets me get on.. +libptllnd_a_CFLAGS= $(PTLLNDCPPFLAGS) $(LLCFLAGS) +endif +endif diff --git a/lnet/ulnds/ptllnd/ptllnd.c b/lnet/ulnds/ptllnd/ptllnd.c new file mode 100644 index 0000000..b13f520 --- /dev/null +++ b/lnet/ulnds/ptllnd/ptllnd.c @@ -0,0 +1,629 @@ + +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved. + * Author: Eric Barton + * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. + * + * This file is confidential source code owned by Cluster File Systems. + * No viewing, modification, compilation, redistribution, or any other + * form of use is permitted except through a signed license agreement. + * + * If you have not signed such an agreement, then you have no rights to + * this file. Please destroy it immediately and contact CFS. + * + */ + +#include "ptllnd.h" + +lnd_t the_ptllnd = { + .lnd_type = PTLLND, + .lnd_startup = ptllnd_startup, + .lnd_shutdown = ptllnd_shutdown, + .lnd_ctl = ptllnd_ctl, + .lnd_send = ptllnd_send, + .lnd_recv = ptllnd_recv, + .lnd_eager_recv = ptllnd_eager_recv, + .lnd_notify = ptllnd_notify, + .lnd_wait = ptllnd_wait, +}; + +static int ptllnd_ni_count = 0; + +void +ptllnd_assert_wire_constants (void) +{ + /* Wire protocol assertions generated by 'wirecheck' + * running on Linux fedora 2.6.11-co-0.6.4 #1 Mon Jun 19 05:36:13 UTC 2006 i686 i686 i386 GNU + * with gcc version 4.1.1 20060525 (Red Hat 4.1.1-1) */ + + + /* Constants... 
*/ + CLASSERT (PTL_RESERVED_MATCHBITS == 0x100); + CLASSERT (LNET_MSG_MATCHBITS == 0); + CLASSERT (PTLLND_MSG_MAGIC == 0x50746C4E); + CLASSERT (PTLLND_MSG_VERSION == 0x04); + CLASSERT (PTLLND_RDMA_OK == 0x00); + CLASSERT (PTLLND_RDMA_FAIL == 0x01); + CLASSERT (PTLLND_MSG_TYPE_INVALID == 0x00); + CLASSERT (PTLLND_MSG_TYPE_PUT == 0x01); + CLASSERT (PTLLND_MSG_TYPE_GET == 0x02); + CLASSERT (PTLLND_MSG_TYPE_IMMEDIATE == 0x03); + CLASSERT (PTLLND_MSG_TYPE_NOOP == 0x04); + CLASSERT (PTLLND_MSG_TYPE_HELLO == 0x05); + CLASSERT (PTLLND_MSG_TYPE_NAK == 0x06); + + /* Checks for struct kptl_msg_t */ + CLASSERT ((int)sizeof(kptl_msg_t) == 136); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_magic) == 0); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_magic) == 4); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_version) == 4); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_version) == 2); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_type) == 6); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_type) == 1); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_credits) == 7); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_credits) == 1); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_nob) == 8); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_nob) == 4); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_cksum) == 12); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_cksum) == 4); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_srcnid) == 16); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_srcnid) == 8); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_srcstamp) == 24); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_srcstamp) == 8); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_dstnid) == 32); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_dstnid) == 8); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_dststamp) == 40); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_dststamp) == 8); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_srcpid) == 48); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_srcpid) == 4); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_dstpid) == 52); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_dstpid) == 4); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_u.immediate) == 56); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_u.immediate) == 72); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_u.rdma) == 56); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_u.rdma) == 80); + CLASSERT ((int)offsetof(kptl_msg_t, ptlm_u.hello) == 56); + CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_u.hello) == 12); + + /* Checks for struct kptl_immediate_msg_t */ + CLASSERT ((int)sizeof(kptl_immediate_msg_t) == 72); + CLASSERT ((int)offsetof(kptl_immediate_msg_t, kptlim_hdr) == 0); + CLASSERT ((int)sizeof(((kptl_immediate_msg_t *)0)->kptlim_hdr) == 72); + CLASSERT ((int)offsetof(kptl_immediate_msg_t, kptlim_payload[13]) == 85); + CLASSERT ((int)sizeof(((kptl_immediate_msg_t *)0)->kptlim_payload[13]) == 1); + + /* Checks for struct kptl_rdma_msg_t */ + CLASSERT ((int)sizeof(kptl_rdma_msg_t) == 80); + CLASSERT ((int)offsetof(kptl_rdma_msg_t, kptlrm_hdr) == 0); + CLASSERT ((int)sizeof(((kptl_rdma_msg_t *)0)->kptlrm_hdr) == 72); + CLASSERT ((int)offsetof(kptl_rdma_msg_t, kptlrm_matchbits) == 72); + CLASSERT ((int)sizeof(((kptl_rdma_msg_t *)0)->kptlrm_matchbits) == 8); + + /* Checks for struct kptl_hello_msg_t */ + CLASSERT ((int)sizeof(kptl_hello_msg_t) == 12); + CLASSERT ((int)offsetof(kptl_hello_msg_t, kptlhm_matchbits) == 0); + CLASSERT ((int)sizeof(((kptl_hello_msg_t *)0)->kptlhm_matchbits) == 8); + CLASSERT ((int)offsetof(kptl_hello_msg_t, kptlhm_max_msg_size) == 8); + 
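/* An aside on CLASSERT (editorial, not generated by wirecheck): it is
 * libcfs's compile-time assertion, so all of these layout checks cost
 * nothing at runtime -- the build simply fails if a structure drifts.
 * A minimal stand-in, assuming you need one outside libcfs, is the
 * classic duplicate-case trick:
 *
 *     #define MY_CLASSERT(cond) \
 *             do { switch (0) { case 0: case (cond): ; } } while (0)
 *
 * A false 'cond' makes 'case (cond):' collide with 'case 0:' and the
 * compiler rejects the file; e.g. if offsetof(kptl_msg_t, ptlm_u) ever
 * moved away from 56, the checks here would stop the build. */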
CLASSERT ((int)sizeof(((kptl_hello_msg_t *)0)->kptlhm_max_msg_size) == 4); +} + +int +ptllnd_parse_int_tunable(int *value, char *name, int dflt) +{ + char *env = getenv(name); + char *end; + + if (env == NULL) { + *value = dflt; + return 0; + } + + *value = strtoull(env, &end, 0); + if (*end == 0) + return 0; + + CERROR("Can't parse tunable %s=%s\n", name, env); + return -EINVAL; +} + +int +ptllnd_get_tunables(lnet_ni_t *ni) +{ + ptllnd_ni_t *plni = ni->ni_data; + int max_msg_size; + int msgs_per_buffer; + int rc; + int temp; + + rc = ptllnd_parse_int_tunable(&plni->plni_portal, + "PTLLND_PORTAL", PTLLND_PORTAL); + if (rc != 0) + return rc; + + rc = ptllnd_parse_int_tunable(&temp, + "PTLLND_PID", PTLLND_PID); + if (rc != 0) + return rc; + plni->plni_ptllnd_pid = (ptl_pid_t)temp; + + rc = ptllnd_parse_int_tunable(&plni->plni_peer_credits, + "PTLLND_PEERCREDITS", PTLLND_PEERCREDITS); + if (rc != 0) + return rc; + + rc = ptllnd_parse_int_tunable(&max_msg_size, + "PTLLND_MAX_MSG_SIZE", + PTLLND_MAX_MSG_SIZE); + if (rc != 0) + return rc; + + rc = ptllnd_parse_int_tunable(&msgs_per_buffer, + "PTLLND_MSGS_PER_BUFFER", + PTLLND_MSGS_PER_BUFFER); + if (rc != 0) + return rc; + + rc = ptllnd_parse_int_tunable(&plni->plni_msgs_spare, + "PTLLND_MSGS_SPARE", + PTLLND_MSGS_SPARE); + if (rc != 0) + return rc; + + rc = ptllnd_parse_int_tunable(&plni->plni_peer_hash_size, + "PTLLND_PEER_HASH_SIZE", + PTLLND_PEER_HASH_SIZE); + if (rc != 0) + return rc; + + + rc = ptllnd_parse_int_tunable(&plni->plni_eq_size, + "PTLLND_EQ_SIZE", PTLLND_EQ_SIZE); + if (rc != 0) + return rc; + + rc = ptllnd_parse_int_tunable(&plni->plni_checksum, + "PTLLND_CHECKSUM", 0); + if (rc != 0) + return rc; + + rc = ptllnd_parse_int_tunable(&plni->plni_max_tx_history, + "PTLLND_TX_HISTORY", PTLLND_TX_HISTORY); + if (rc != 0) + return rc; + + rc = ptllnd_parse_int_tunable(&plni->plni_abort_on_nak, + "PTLLND_ABORT_ON_NAK", + PTLLND_ABORT_ON_NAK); + if (rc != 0) + return rc; + + plni->plni_max_msg_size = max_msg_size & ~7; + if (plni->plni_max_msg_size < sizeof(kptl_msg_t)) + plni->plni_max_msg_size = (sizeof(kptl_msg_t) + 7) & ~7; + + plni->plni_buffer_size = plni->plni_max_msg_size * msgs_per_buffer; + + CDEBUG(D_NET, "portal = %d\n",plni->plni_portal); + CDEBUG(D_NET, "ptllnd_pid = %d\n",plni->plni_ptllnd_pid); + CDEBUG(D_NET, "max_msg_size = %d\n",max_msg_size); + CDEBUG(D_NET, "msgs_per_buffer = %d\n",msgs_per_buffer); + CDEBUG(D_NET, "msgs_spare = %d\n",plni->plni_msgs_spare); + CDEBUG(D_NET, "peer_hash_size = %d\n",plni->plni_peer_hash_size); + CDEBUG(D_NET, "eq_size = %d\n",plni->plni_eq_size); + CDEBUG(D_NET, "max_msg_size = %d\n",plni->plni_max_msg_size); + CDEBUG(D_NET, "buffer_size = %d\n",plni->plni_buffer_size); + + return 0; +} + +ptllnd_buffer_t * +ptllnd_create_buffer (lnet_ni_t *ni) +{ + ptllnd_ni_t *plni = ni->ni_data; + ptllnd_buffer_t *buf; + + LIBCFS_ALLOC(buf, sizeof(*buf)); + if (buf == NULL) { + CERROR("Can't allocate buffer descriptor\n"); + return NULL; + } + + buf->plb_ni = ni; + buf->plb_posted = 0; + CFS_INIT_LIST_HEAD(&buf->plb_list); + + LIBCFS_ALLOC(buf->plb_buffer, plni->plni_buffer_size); + if (buf->plb_buffer == NULL) { + CERROR("Can't allocate buffer size %d\n", + plni->plni_buffer_size); + LIBCFS_FREE(buf, sizeof(*buf)); + return NULL; + } + + list_add(&buf->plb_list, &plni->plni_buffers); + plni->plni_nbuffers++; + + return buf; +} + +void +ptllnd_destroy_buffer (ptllnd_buffer_t *buf) +{ + ptllnd_ni_t *plni = buf->plb_ni->ni_data; + + LASSERT (!buf->plb_posted); + + plni->plni_nbuffers--; + 
list_del(&buf->plb_list); + LIBCFS_FREE(buf->plb_buffer, plni->plni_buffer_size); + LIBCFS_FREE(buf, sizeof(*buf)); +} + +int +ptllnd_grow_buffers (lnet_ni_t *ni) +{ + ptllnd_ni_t *plni = ni->ni_data; + ptllnd_buffer_t *buf; + int nmsgs; + int nbufs; + int rc; + + CDEBUG(D_NET, "nposted_buffers = %d (before)\n",plni->plni_nposted_buffers); + CDEBUG(D_NET, "nbuffers = %d (before)\n",plni->plni_nbuffers); + + nmsgs = plni->plni_npeers * plni->plni_peer_credits + + plni->plni_msgs_spare; + + nbufs = (nmsgs * plni->plni_max_msg_size + plni->plni_buffer_size - 1) / + plni->plni_buffer_size; + + while (nbufs > plni->plni_nbuffers) { + buf = ptllnd_create_buffer(ni); + + if (buf == NULL) + return -ENOMEM; + + rc = ptllnd_post_buffer(buf); + if (rc != 0){ + /* TODO - this path seems to orpahn the buffer + * in a state where its not posted and will never be + * However it does not leak the buffer as it's + * already been put onto the global buffer list + * and will be cleaned up + */ + return rc; + } + } + + CDEBUG(D_NET, "nposted_buffers = %d (after)\n",plni->plni_nposted_buffers); + CDEBUG(D_NET, "nbuffers = %d (after)\n",plni->plni_nbuffers); + return 0; +} + +void +ptllnd_destroy_buffers (lnet_ni_t *ni) +{ + ptllnd_ni_t *plni = ni->ni_data; + ptllnd_buffer_t *buf; + struct list_head *tmp; + struct list_head *nxt; + + CDEBUG(D_NET, "nposted_buffers = %d (before)\n",plni->plni_nposted_buffers); + CDEBUG(D_NET, "nbuffers = %d (before)\n",plni->plni_nbuffers); + + list_for_each_safe(tmp, nxt, &plni->plni_buffers) { + buf = list_entry(tmp, ptllnd_buffer_t, plb_list); + + //CDEBUG(D_NET, "buf=%p posted=%d\n",buf,buf->plb_posted); + + LASSERT (plni->plni_nbuffers > 0); + if (buf->plb_posted) { + time_t start = cfs_time_current_sec(); + int w = PTLLND_WARN_LONG_WAIT; + + LASSERT (plni->plni_nposted_buffers > 0); + +#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS + (void) PtlMDUnlink(buf->plb_md); + + while (buf->plb_posted) { + if (cfs_time_current_sec() > start + w) { + CWARN("Waited %ds to unlink buffer\n", w); + w *= 2; + } + ptllnd_wait(ni, w*1000); + } +#else + while (buf->plb_posted) { + rc = PtlMDUnlink(buf->plb_md); + if (rc == PTL_OK) { + buf->plb_posted = 0; + plni->plni_nposted_buffers--; + break; + } + LASSERT (rc == PTL_MD_IN_USE); + if (cfs_time_current_sec() > start + w) { + CWARN("Waited %ds to unlink buffer\n", w); + w *= 2; + } + ptllnd_wait(ni, w*1000); + } +#endif + } + ptllnd_destroy_buffer(buf); + } + + CDEBUG(D_NET, "nposted_buffers = %d (after)\n",plni->plni_nposted_buffers); + CDEBUG(D_NET, "nbuffers = %d (after)\n",plni->plni_nbuffers); + + LASSERT (plni->plni_nposted_buffers == 0); + LASSERT (plni->plni_nbuffers == 0); +} + +int +ptllnd_create_peer_hash (lnet_ni_t *ni) +{ + ptllnd_ni_t *plni = ni->ni_data; + int i; + + plni->plni_npeers = 0; + + LIBCFS_ALLOC(plni->plni_peer_hash, + plni->plni_peer_hash_size * sizeof(*plni->plni_peer_hash)); + if (plni->plni_peer_hash == NULL) { + CERROR("Can't allocate ptllnd peer hash (size %d)\n", + plni->plni_peer_hash_size); + return -ENOMEM; + } + + for (i = 0; i < plni->plni_peer_hash_size; i++) + CFS_INIT_LIST_HEAD(&plni->plni_peer_hash[i]); + + return 0; +} + +void +ptllnd_destroy_peer_hash (lnet_ni_t *ni) +{ + ptllnd_ni_t *plni = ni->ni_data; + int i; + + LASSERT( plni->plni_npeers == 0); + + for (i = 0; i < plni->plni_peer_hash_size; i++) + LASSERT (list_empty(&plni->plni_peer_hash[i])); + + LIBCFS_FREE(plni->plni_peer_hash, + plni->plni_peer_hash_size * sizeof(*plni->plni_peer_hash)); +} + +void +ptllnd_close_peers (lnet_ni_t *ni) +{ + 
ptllnd_ni_t *plni = ni->ni_data; + ptllnd_peer_t *plp; + int i; + + for (i = 0; i < plni->plni_peer_hash_size; i++) + while (!list_empty(&plni->plni_peer_hash[i])) { + plp = list_entry(plni->plni_peer_hash[i].next, + ptllnd_peer_t, plp_list); + + ptllnd_close_peer(plp, 0); + } +} + +int +ptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) +{ + switch (cmd) { + case IOC_LIBCFS_DEBUG_PEER: + ptllnd_debug_peer(ni, *((lnet_process_id_t *)arg)); + return 0; + + default: + return -EINVAL; + } +} + +__u64 +ptllnd_get_timestamp(void) +{ + struct timeval tv; + int rc = gettimeofday(&tv, NULL); + + LASSERT (rc == 0); + return ((__u64)tv.tv_sec) * 1000000 + tv.tv_usec; +} + +void +ptllnd_shutdown (lnet_ni_t *ni) +{ + ptllnd_ni_t *plni = ni->ni_data; + int rc; + time_t start = cfs_time_current_sec(); + int w = PTLLND_WARN_LONG_WAIT; + + LASSERT (ptllnd_ni_count == 1); + plni->plni_max_tx_history = 0; + + ptllnd_cull_tx_history(plni); + + ptllnd_destroy_buffers(ni); + ptllnd_close_peers(ni); + + while (plni->plni_npeers > 0) { + if (cfs_time_current_sec() > start + w) { + CWARN("Waited %ds for peers to shutdown\n", w); + w *= 2; + } + ptllnd_wait(ni, w*1000); + } + + LASSERT (plni->plni_ntxs == 0); + LASSERT (plni->plni_nrxs == 0); + + rc = PtlEQFree(plni->plni_eqh); + LASSERT (rc == PTL_OK); + + rc = PtlNIFini(plni->plni_nih); + LASSERT (rc == PTL_OK); + + ptllnd_destroy_peer_hash(ni); + LIBCFS_FREE(plni, sizeof(*plni)); + ptllnd_ni_count--; +} + +int +ptllnd_startup (lnet_ni_t *ni) +{ + ptllnd_ni_t *plni; + int rc; + + /* could get limits from portals I guess... */ + ni->ni_maxtxcredits = + ni->ni_peertxcredits = 1000; + + if (ptllnd_ni_count != 0) { + CERROR("Can't have > 1 instance of ptllnd\n"); + return -EPERM; + } + + ptllnd_ni_count++; + + LIBCFS_ALLOC(plni, sizeof(*plni)); + if (plni == NULL) { + CERROR("Can't allocate ptllnd state\n"); + rc = -ENOMEM; + goto failed0; + } + + ni->ni_data = plni; + + plni->plni_stamp = ptllnd_get_timestamp(); + plni->plni_nrxs = 0; + plni->plni_ntxs = 0; + plni->plni_ntx_history = 0; + CFS_INIT_LIST_HEAD(&plni->plni_zombie_txs); + CFS_INIT_LIST_HEAD(&plni->plni_tx_history); + + /* + * Initilize buffer related data structures + */ + CFS_INIT_LIST_HEAD(&plni->plni_buffers); + plni->plni_nbuffers = 0; + plni->plni_nposted_buffers = 0; + + rc = ptllnd_get_tunables(ni); + if (rc != 0) + goto failed1; + + rc = ptllnd_create_peer_hash(ni); + if (rc != 0) + goto failed1; + + /* NB I most probably won't get the PID I requested here. It doesn't + * matter because I don't need a fixed PID (only connection acceptors + * need a "well known" PID). */ + + rc = PtlNIInit(PTL_IFACE_DEFAULT, plni->plni_ptllnd_pid, + NULL, NULL, &plni->plni_nih); + if (rc != PTL_OK && rc != PTL_IFACE_DUP) { + CERROR("PtlNIInit failed: %d\n", rc); + rc = -ENODEV; + goto failed2; + } + + rc = PtlEQAlloc(plni->plni_nih, plni->plni_eq_size, + PTL_EQ_HANDLER_NONE, &plni->plni_eqh); + if (rc != PTL_OK) { + CERROR("PtlEQAlloc failed: %d\n", rc); + rc = -ENODEV; + goto failed3; + } + + /* + * Fetch the Portals NID + */ + if(rc != PtlGetId(plni->plni_nih,&plni->plni_portals_id)){ + CERROR ("PtlGetID failed : %d\n", rc); + rc = -EINVAL; + goto failed4; + } + + CDEBUG(D_NET, "lnet nid=" LPX64 " (passed in)\n",ni->ni_nid); + + /* + * Create the new NID. Based on the LND network type + * and the lower ni's address data. 
+ */ + ni->ni_nid = ptllnd_ptl2lnetnid(ni, plni->plni_portals_id.nid); + + CDEBUG(D_NET, "ptl id =%s\n", ptllnd_ptlid2str(plni->plni_portals_id)); + CDEBUG(D_NET, "lnet id =%s (passed back)\n", + libcfs_id2str((lnet_process_id_t) { + .nid = ni->ni_nid, .pid = the_lnet.ln_pid})); + + rc = ptllnd_grow_buffers(ni); + if (rc != 0) + goto failed4; + + return 0; + + failed4: + ptllnd_destroy_buffers(ni); + PtlEQFree(plni->plni_eqh); + failed3: + PtlNIFini(plni->plni_nih); + failed2: + ptllnd_destroy_peer_hash(ni); + failed1: + LIBCFS_FREE(plni, sizeof(*plni)); + failed0: + ptllnd_ni_count--; + CDEBUG(D_NET, "<<< rc=%d\n",rc); + return rc; +} + +const char *ptllnd_evtype2str(int type) +{ +#define DO_TYPE(x) case x: return #x; + switch(type) + { + DO_TYPE(PTL_EVENT_GET_START); + DO_TYPE(PTL_EVENT_GET_END); + DO_TYPE(PTL_EVENT_PUT_START); + DO_TYPE(PTL_EVENT_PUT_END); + DO_TYPE(PTL_EVENT_REPLY_START); + DO_TYPE(PTL_EVENT_REPLY_END); + DO_TYPE(PTL_EVENT_ACK); + DO_TYPE(PTL_EVENT_SEND_START); + DO_TYPE(PTL_EVENT_SEND_END); + DO_TYPE(PTL_EVENT_UNLINK); + default: + return ""; + } +#undef DO_TYPE +} + +const char *ptllnd_msgtype2str(int type) +{ +#define DO_TYPE(x) case x: return #x; + switch(type) + { + DO_TYPE(PTLLND_MSG_TYPE_INVALID); + DO_TYPE(PTLLND_MSG_TYPE_PUT); + DO_TYPE(PTLLND_MSG_TYPE_GET); + DO_TYPE(PTLLND_MSG_TYPE_IMMEDIATE); + DO_TYPE(PTLLND_MSG_TYPE_HELLO); + DO_TYPE(PTLLND_MSG_TYPE_NOOP); + DO_TYPE(PTLLND_MSG_TYPE_NAK); + default: + return ""; + } +#undef DO_TYPE +} diff --git a/lnet/ulnds/ptllnd/ptllnd.h b/lnet/ulnds/ptllnd/ptllnd.h new file mode 100644 index 0000000..f637c7d --- /dev/null +++ b/lnet/ulnds/ptllnd/ptllnd.h @@ -0,0 +1,262 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved. + * Author: Eric Barton + * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. + * + * This file is confidential source code owned by Cluster File Systems. + * No viewing, modification, compilation, redistribution, or any other + * form of use is permitted except through a signed license agreement. + * + * If you have not signed such an agreement, then you have no rights to + * this file. Please destroy it immediately and contact CFS. 
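ptllnd_startup() above is a textbook goto unwind ladder: each successfully acquired resource gets a failedN label, and every later error jumps to the label that releases exactly what has been acquired so far, in reverse order. (One fragile spot: the "if(rc != PtlGetId(...))" test appears to work only because rc still holds PTL_OK from the PtlEQAlloc() check just before it; the usual "rc = PtlGetId(...); if (rc != PTL_OK)" would be clearer and would log the real return code.) The ladder in miniature, with hypothetical resources:

#include <stdlib.h>

/* Three-step init demonstrating the failed0/1/2 unwind convention. */
static int setup(void **pa, void **pb, void **pc)
{
        int rc = -1;

        *pa = malloc(64);
        if (*pa == NULL)
                goto failed0;           /* nothing to undo yet */

        *pb = malloc(64);
        if (*pb == NULL)
                goto failed1;           /* undo step 1 only */

        *pc = malloc(64);
        if (*pc == NULL)
                goto failed2;           /* undo steps 2, then 1 */

        return 0;                       /* success: caller owns a, b, c */

 failed2:
        free(*pb);
 failed1:
        free(*pa);
 failed0:
        return rc;
}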
+ * + */ + + +#define DEBUG_SUBSYSTEM S_LND + +#include +#include + +#include +#include /* Depends on portals/p30.h */ + +#define PTLLND_DEBUG_TIMING 0 + +#define PTLLND_MSGS_PER_BUFFER 64 +#define PTLLND_MSGS_SPARE 256 +#define PTLLND_PEER_HASH_SIZE 101 +#define PTLLND_EQ_SIZE 1024 +#if PTLLND_DEBUG_TIMING +# define PTLLND_TX_HISTORY 1024 +#else +# define PTLLND_TX_HISTORY 0 +#endif +#define PTLLND_WARN_LONG_WAIT 5 /* seconds */ +#define PTLLND_ABORT_ON_NAK 1 /* abort app on protocol version mismatch */ + +#define PTLLND_MD_OPTIONS (PTL_MD_LUSTRE_COMPLETION_SEMANTICS |\ + PTL_MD_EVENT_START_DISABLE) +typedef struct +{ + int plni_portal; + ptl_pid_t plni_ptllnd_pid; /* Portals PID of peers I may connect to */ + int plni_peer_credits; + int plni_max_msg_size; + int plni_buffer_size; + int plni_msgs_spare; + int plni_peer_hash_size; + int plni_eq_size; + int plni_checksum; + int plni_max_tx_history; + int plni_abort_on_nak; + + __u64 plni_stamp; + struct list_head plni_active_txs; + struct list_head plni_zombie_txs; + int plni_ntxs; + int plni_nrxs; + + ptl_handle_ni_t plni_nih; + ptl_handle_eq_t plni_eqh; + ptl_process_id_t plni_portals_id; /* Portals ID of interface */ + + struct list_head *plni_peer_hash; + int plni_npeers; + + struct list_head plni_tx_history; + int plni_ntx_history; + + struct list_head plni_buffers; + int plni_nbuffers; + int plni_nposted_buffers; +} ptllnd_ni_t; + +#define PTLLND_CREDIT_HIGHWATER(plni) ((plni)->plni_peer_credits - 1) + +typedef struct +{ + struct list_head plp_list; + lnet_ni_t *plp_ni; + lnet_process_id_t plp_id; + ptl_process_id_t plp_ptlid; + int plp_credits; + int plp_max_credits; + int plp_outstanding_credits; + int plp_max_msg_size; + int plp_refcount; + int plp_recvd_hello:1; + int plp_closing:1; + __u64 plp_match; + __u64 plp_stamp; + struct list_head plp_txq; + struct list_head plp_activeq; +} ptllnd_peer_t; + +typedef struct +{ + struct list_head plb_list; + lnet_ni_t *plb_ni; + int plb_posted; + ptl_handle_md_t plb_md; + char *plb_buffer; +} ptllnd_buffer_t; + +typedef struct +{ + ptllnd_peer_t *rx_peer; + kptl_msg_t *rx_msg; + int rx_nob; +} ptllnd_rx_t; + +typedef struct +{ + struct list_head tx_list; + int tx_type; + int tx_status; + ptllnd_peer_t *tx_peer; + lnet_msg_t *tx_lnetmsg; + lnet_msg_t *tx_lnetreplymsg; + unsigned int tx_niov; + ptl_md_iovec_t *tx_iov; + ptl_handle_md_t tx_bulkmdh; + ptl_handle_md_t tx_reqmdh; +#if PTLLND_DEBUG_TIMING + struct timeval tx_bulk_posted; + struct timeval tx_bulk_done; + struct timeval tx_req_posted; + struct timeval tx_req_done; +#endif + int tx_completing; /* someone already completing */ + int tx_msgsize; /* # bytes in tx_msg */ + kptl_msg_t tx_msg; /* message to send */ +} ptllnd_tx_t; + +#define PTLLND_RDMA_WRITE 0x100 /* pseudo message type */ +#define PTLLND_RDMA_READ 0x101 /* (no msg actually sent) */ + +/* Hack to extract object type from event's user_ptr relies on (and checks) + * that structs are somewhat aligned. 
*/ +#define PTLLND_EVENTARG_TYPE_TX 0x1 +#define PTLLND_EVENTARG_TYPE_BUF 0x2 +#define PTLLND_EVENTARG_TYPE_MASK 0x3 + +static inline void * +ptllnd_obj2eventarg (void *obj, int type) +{ + unsigned long ptr = (unsigned long)obj; + + LASSERT ((ptr & PTLLND_EVENTARG_TYPE_MASK) == 0); + LASSERT ((type & ~PTLLND_EVENTARG_TYPE_MASK) == 0); + + return (void *)(ptr | type); +} + +static inline int +ptllnd_eventarg2type (void *arg) +{ + unsigned long ptr = (unsigned long)arg; + + return (ptr & PTLLND_EVENTARG_TYPE_MASK); +} + +static inline void * +ptllnd_eventarg2obj (void *arg) +{ + unsigned long ptr = (unsigned long)arg; + + return (void *)(ptr & ~PTLLND_EVENTARG_TYPE_MASK); +} + +#if PTLLND_DEBUG_TIMING +# define PTLLND_DBGT_INIT(tv) memset(&(tv), 0, sizeof(tv)) +# define PTLLND_DBGT_STAMP(tv) gettimeofday(&(tv), NULL) +# define DBGT_FMT "%ld.%06ld" +# define DBGT_ARGS(tv) , (long)((tv).tv_sec), (long)((tv).tv_usec) +#else +# define PTLLND_DBGT_INIT(tv) +# define PTLLND_DBGT_STAMP(tv) +# define DBGT_FMT "-" +# define DBGT_ARGS(tv) +#endif + +void ptllnd_cull_tx_history(ptllnd_ni_t *plni); +int ptllnd_startup(lnet_ni_t *ni); +void ptllnd_shutdown(lnet_ni_t *ni); +int ptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); +int ptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *msg); +int ptllnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +int ptllnd_eager_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, + void **new_privatep); + +ptllnd_tx_t *ptllnd_new_tx(ptllnd_peer_t *peer, int type, int payload_nob); +void ptllnd_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive); +void ptllnd_wait(lnet_ni_t *ni, int milliseconds); +void ptllnd_check_sends(ptllnd_peer_t *peer); +void ptllnd_debug_peer(lnet_ni_t *ni, lnet_process_id_t id); +void ptllnd_destroy_peer(ptllnd_peer_t *peer); +void ptllnd_close_peer(ptllnd_peer_t *peer, int error); +int ptllnd_post_buffer(ptllnd_buffer_t *buf); +int ptllnd_grow_buffers (lnet_ni_t *ni); +const char *ptllnd_evtype2str(int type); +const char *ptllnd_msgtype2str(int type); +char *ptllnd_ptlid2str(ptl_process_id_t id); + +static inline void +ptllnd_peer_addref (ptllnd_peer_t *peer) +{ + LASSERT (peer->plp_refcount > 0); + peer->plp_refcount++; +} + +static inline void +ptllnd_peer_decref (ptllnd_peer_t *peer) +{ + LASSERT (peer->plp_refcount > 0); + peer->plp_refcount--; + if (peer->plp_refcount == 0) + ptllnd_destroy_peer(peer); +} + +static inline void +ptllnd_post_tx(ptllnd_tx_t *tx) +{ + ptllnd_peer_t *peer = tx->tx_peer; + LASSERT(tx->tx_peer != NULL); + list_add_tail(&tx->tx_list, &peer->plp_txq); + ptllnd_check_sends(peer); +} + +static inline lnet_nid_t +ptllnd_ptl2lnetnid(lnet_ni_t *ni, ptl_nid_t portals_nid) +{ + return LNET_MKNID(LNET_NIDNET(ni->ni_nid), portals_nid); +} + +static inline ptl_nid_t +ptllnd_lnet2ptlnid(lnet_nid_t lnet_nid) +{ + return LNET_NIDADDR(lnet_nid); +} + +/* + * A note about lprintf(): + * Normally printf() is redirected to stdout of the console + * from which yod launched the catamount application. However + * there is a lot of initilziation code that runs before this + * redirection is hooked up, and printf() seems to go to the bit bucket + * + * To get any kind of debug output and init time lprintf() can + * be used to output to the console from which bookqk was used to + * boot the catamount node. This works for debugging some simple + * cases. 
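The three eventarg helpers above smuggle a two-bit type code into the low bits of an object pointer; that is safe only because the tx and buffer structs are allocated with at least 4-byte alignment, which the LASSERT in ptllnd_obj2eventarg() verifies. The trick in isolation:

#include <assert.h>
#include <stdio.h>

#define TYPE_TX   0x1
#define TYPE_BUF  0x2
#define TYPE_MASK 0x3

int main(void)
{
        static long object;     /* long is at least 4-byte aligned here */
        void *tagged = (void *)((unsigned long)&object | TYPE_TX);

        /* the two low bits are free exactly because of the alignment */
        assert(((unsigned long)&object & TYPE_MASK) == 0);

        printf("type = %lx\n", (unsigned long)tagged & TYPE_MASK);
        printf("obj ok = %d\n",
               (void *)((unsigned long)tagged & ~TYPE_MASK) == (void *)&object);
        return 0;
}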
+ */ + + diff --git a/lnet/ulnds/ptllnd/ptllnd_cb.c b/lnet/ulnds/ptllnd/ptllnd_cb.c new file mode 100644 index 0000000..0114c42 --- /dev/null +++ b/lnet/ulnds/ptllnd/ptllnd_cb.c @@ -0,0 +1,1684 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved. + * Author: Eric Barton + * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. + * + * This file is confidential source code owned by Cluster File Systems. + * No viewing, modification, compilation, redistribution, or any other + * form of use is permitted except through a signed license agreement. + * + * If you have not signed such an agreement, then you have no rights to + * this file. Please destroy it immediately and contact CFS. + * + */ + +#include "ptllnd.h" + +char * +ptllnd_ptlid2str(ptl_process_id_t id) +{ + static char strs[8][32]; + static int idx = 0; + + char *str = strs[idx++]; + + if (idx >= sizeof(strs)/sizeof(strs[0])) + idx = 0; + + snprintf(str, sizeof(strs[0]), FMT_PTLID, id.pid, id.nid); + return str; +} + +void +ptllnd_destroy_peer(ptllnd_peer_t *peer) +{ + lnet_ni_t *ni = peer->plp_ni; + ptllnd_ni_t *plni = ni->ni_data; + + LASSERT (peer->plp_closing); + LASSERT (plni->plni_npeers > 0); + LASSERT (list_empty(&peer->plp_txq)); + LASSERT (list_empty(&peer->plp_activeq)); + plni->plni_npeers--; + LIBCFS_FREE(peer, sizeof(*peer)); +} + +void +ptllnd_abort_txs(ptllnd_ni_t *plni, struct list_head *q) +{ + while (!list_empty(q)) { + ptllnd_tx_t *tx = list_entry(q->next, ptllnd_tx_t, tx_list); + + tx->tx_status = -ESHUTDOWN; + list_del(&tx->tx_list); + list_add_tail(&tx->tx_list, &plni->plni_zombie_txs); + } +} + +void +ptllnd_close_peer(ptllnd_peer_t *peer, int error) +{ + lnet_ni_t *ni = peer->plp_ni; + ptllnd_ni_t *plni = ni->ni_data; + + if (peer->plp_closing) + return; + + peer->plp_closing = 1; + + if (!list_empty(&peer->plp_txq) || + !list_empty(&peer->plp_activeq) || + error != 0) { + CERROR("Closing %s\n", libcfs_id2str(peer->plp_id)); + ptllnd_debug_peer(ni, peer->plp_id); + } + + ptllnd_abort_txs(plni, &peer->plp_txq); + ptllnd_abort_txs(plni, &peer->plp_activeq); + + list_del(&peer->plp_list); + ptllnd_peer_decref(peer); +} + +ptllnd_peer_t * +ptllnd_find_peer(lnet_ni_t *ni, lnet_process_id_t id, int create) +{ + ptllnd_ni_t *plni = ni->ni_data; + unsigned int hash = LNET_NIDADDR(id.nid) % plni->plni_peer_hash_size; + struct list_head *tmp; + ptllnd_peer_t *plp; + ptllnd_tx_t *tx; + int rc; + + LASSERT (LNET_NIDNET(id.nid) == LNET_NIDNET(ni->ni_nid)); + + list_for_each(tmp, &plni->plni_peer_hash[hash]) { + plp = list_entry(tmp, ptllnd_peer_t, plp_list); + + if (plp->plp_id.nid == id.nid && + plp->plp_id.pid == id.pid) { + ptllnd_peer_addref(plp); + return plp; + } + } + + if (!create) + return NULL; + + /* New peer: check first for enough posted buffers */ + plni->plni_npeers++; + rc = ptllnd_grow_buffers(ni); + if (rc != 0) { + plni->plni_npeers--; + return NULL; + } + + LIBCFS_ALLOC(plp, sizeof(*plp)); + if (plp == NULL) { + CERROR("Can't allocate new peer %s\n", libcfs_id2str(id)); + plni->plni_npeers--; + return NULL; + } + + CDEBUG(D_NET, "new peer=%p\n",plp); + + plp->plp_ni = ni; + plp->plp_id = id; + plp->plp_ptlid.nid = LNET_NIDADDR(id.nid); + plp->plp_ptlid.pid = plni->plni_ptllnd_pid; + plp->plp_max_credits = + plp->plp_credits = 1; /* add more later when she gives me credits */ + plp->plp_max_msg_size = 
plni->plni_max_msg_size; /* until I hear from her */ + plp->plp_outstanding_credits = plni->plni_peer_credits - 1; + plp->plp_match = 0; + plp->plp_stamp = 0; + plp->plp_recvd_hello = 0; + plp->plp_closing = 0; + plp->plp_refcount = 1; + CFS_INIT_LIST_HEAD(&plp->plp_list); + CFS_INIT_LIST_HEAD(&plp->plp_txq); + CFS_INIT_LIST_HEAD(&plp->plp_activeq); + + ptllnd_peer_addref(plp); + list_add_tail(&plp->plp_list, &plni->plni_peer_hash[hash]); + + tx = ptllnd_new_tx(plp, PTLLND_MSG_TYPE_HELLO, 0); + if (tx == NULL) { + CERROR("Can't send HELLO to %s\n", libcfs_id2str(id)); + ptllnd_close_peer(plp, -ENOMEM); + ptllnd_peer_decref(plp); + return NULL; + } + + tx->tx_msg.ptlm_u.hello.kptlhm_matchbits = PTL_RESERVED_MATCHBITS; + tx->tx_msg.ptlm_u.hello.kptlhm_max_msg_size = plni->plni_max_msg_size; + + ptllnd_post_tx(tx); + + return plp; +} + +int +ptllnd_count_q(struct list_head *q) +{ + struct list_head *e; + int n = 0; + + list_for_each(e, q) { + n++; + } + + return n; +} + +const char * +ptllnd_tx_typestr(int type) +{ + switch (type) { + case PTLLND_RDMA_WRITE: + return "rdma_write"; + + case PTLLND_RDMA_READ: + return "rdma_read"; + + case PTLLND_MSG_TYPE_PUT: + return "put_req"; + + case PTLLND_MSG_TYPE_GET: + return "get_req"; + + case PTLLND_MSG_TYPE_IMMEDIATE: + return "immediate"; + + case PTLLND_MSG_TYPE_NOOP: + return "noop"; + + case PTLLND_MSG_TYPE_HELLO: + return "hello"; + + default: + return ""; + } +} + +void +ptllnd_debug_tx(ptllnd_tx_t *tx) +{ + CDEBUG(D_WARNING, "%s %s b "DBGT_FMT"/"DBGT_FMT + " r "DBGT_FMT"/"DBGT_FMT" status %d\n", + ptllnd_tx_typestr(tx->tx_type), + libcfs_id2str(tx->tx_peer->plp_id) + DBGT_ARGS(tx->tx_bulk_posted) DBGT_ARGS(tx->tx_bulk_done) + DBGT_ARGS(tx->tx_req_posted) DBGT_ARGS(tx->tx_req_done), + tx->tx_status); +} + +void +ptllnd_debug_peer(lnet_ni_t *ni, lnet_process_id_t id) +{ + ptllnd_peer_t *plp = ptllnd_find_peer(ni, id, 0); + struct list_head *tmp; + ptllnd_ni_t *plni = ni->ni_data; + ptllnd_tx_t *tx; + + if (plp == NULL) { + CDEBUG(D_WARNING, "No peer %s\n", libcfs_id2str(id)); + return; + } + + CDEBUG(D_WARNING, "%s %s%s [%d] "LPD64".%06d m "LPD64" q %d/%d c %d/%d(%d)\n", + libcfs_id2str(id), + plp->plp_recvd_hello ? "H" : "_", + plp->plp_closing ? 
"C" : "_", + plp->plp_refcount, + plp->plp_stamp / 1000000, (int)(plp->plp_stamp % 1000000), + plp->plp_match, + ptllnd_count_q(&plp->plp_txq), + ptllnd_count_q(&plp->plp_activeq), + plp->plp_credits, plp->plp_outstanding_credits, plp->plp_max_credits); + + CDEBUG(D_WARNING, "txq:\n"); + list_for_each (tmp, &plp->plp_txq) { + tx = list_entry(tmp, ptllnd_tx_t, tx_list); + + ptllnd_debug_tx(tx); + } + + CDEBUG(D_WARNING, "activeq:\n"); + list_for_each (tmp, &plp->plp_activeq) { + tx = list_entry(tmp, ptllnd_tx_t, tx_list); + + ptllnd_debug_tx(tx); + } + + CDEBUG(D_WARNING, "zombies:\n"); + list_for_each (tmp, &plni->plni_zombie_txs) { + tx = list_entry(tmp, ptllnd_tx_t, tx_list); + + if (tx->tx_peer->plp_id.nid == id.nid && + tx->tx_peer->plp_id.pid == id.pid) + ptllnd_debug_tx(tx); + } + + CDEBUG(D_WARNING, "history:\n"); + list_for_each (tmp, &plni->plni_tx_history) { + tx = list_entry(tmp, ptllnd_tx_t, tx_list); + + if (tx->tx_peer->plp_id.nid == id.nid && + tx->tx_peer->plp_id.pid == id.pid) + ptllnd_debug_tx(tx); + } + + ptllnd_peer_decref(plp); +} + +void +ptllnd_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive) +{ + lnet_process_id_t id; + ptllnd_peer_t *peer; + time_t start = cfs_time_current_sec(); + int w = PTLLND_WARN_LONG_WAIT; + + /* This is only actually used to connect to routers at startup! */ + if (!alive) { + LBUG(); + return; + } + + id.nid = nid; + id.pid = LUSTRE_SRV_LNET_PID; + + peer = ptllnd_find_peer(ni, id, 1); + if (peer == NULL) + return; + + /* wait for the peer to reply */ + while (!peer->plp_recvd_hello) { + if (cfs_time_current_sec() > start + w) { + CWARN("Waited %ds to connect to %s\n", + w, libcfs_id2str(id)); + w *= 2; + } + + ptllnd_wait(ni, w*1000); + } + + ptllnd_peer_decref(peer); +} + +__u32 +ptllnd_cksum (void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; + + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + /* ensure I don't return 0 (== no checksum) */ + return (sum == 0) ? 
1 : sum; +} + +ptllnd_tx_t * +ptllnd_new_tx(ptllnd_peer_t *peer, int type, int payload_nob) +{ + lnet_ni_t *ni = peer->plp_ni; + ptllnd_ni_t *plni = ni->ni_data; + ptllnd_tx_t *tx; + int msgsize; + + CDEBUG(D_NET, "peer=%p type=%d payload=%d\n",peer,type,payload_nob); + + switch (type) { + default: + LBUG(); + + case PTLLND_RDMA_WRITE: + case PTLLND_RDMA_READ: + LASSERT (payload_nob == 0); + msgsize = 0; + break; + + case PTLLND_MSG_TYPE_PUT: + case PTLLND_MSG_TYPE_GET: + LASSERT (payload_nob == 0); + msgsize = offsetof(kptl_msg_t, ptlm_u) + + sizeof(kptl_rdma_msg_t); + break; + + case PTLLND_MSG_TYPE_IMMEDIATE: + msgsize = offsetof(kptl_msg_t, + ptlm_u.immediate.kptlim_payload[payload_nob]); + break; + + case PTLLND_MSG_TYPE_NOOP: + LASSERT (payload_nob == 0); + msgsize = offsetof(kptl_msg_t, ptlm_u); + break; + + case PTLLND_MSG_TYPE_HELLO: + LASSERT (payload_nob == 0); + msgsize = offsetof(kptl_msg_t, ptlm_u) + + sizeof(kptl_hello_msg_t); + break; + } + + msgsize = (msgsize + 7) & ~7; + LASSERT (msgsize <= peer->plp_max_msg_size); + + CDEBUG(D_NET, "msgsize=%d\n",msgsize); + + LIBCFS_ALLOC(tx, offsetof(ptllnd_tx_t, tx_msg) + msgsize); + + if (tx == NULL) { + CERROR("Can't allocate msg type %d for %s\n", + type, libcfs_id2str(peer->plp_id)); + return NULL; + } + + CFS_INIT_LIST_HEAD(&tx->tx_list); + tx->tx_peer = peer; + tx->tx_type = type; + tx->tx_lnetmsg = tx->tx_lnetreplymsg = NULL; + tx->tx_niov = 0; + tx->tx_iov = NULL; + tx->tx_reqmdh = PTL_INVALID_HANDLE; + tx->tx_bulkmdh = PTL_INVALID_HANDLE; + tx->tx_msgsize = msgsize; + tx->tx_completing = 0; + tx->tx_status = 0; + + PTLLND_DBGT_INIT(tx->tx_bulk_posted); + PTLLND_DBGT_INIT(tx->tx_bulk_done); + PTLLND_DBGT_INIT(tx->tx_req_posted); + PTLLND_DBGT_INIT(tx->tx_req_done); + + if (msgsize != 0) { + tx->tx_msg.ptlm_magic = PTLLND_MSG_MAGIC; + tx->tx_msg.ptlm_version = PTLLND_MSG_VERSION; + tx->tx_msg.ptlm_type = type; + tx->tx_msg.ptlm_credits = 0; + tx->tx_msg.ptlm_nob = msgsize; + tx->tx_msg.ptlm_cksum = 0; + tx->tx_msg.ptlm_srcnid = ni->ni_nid; + tx->tx_msg.ptlm_srcstamp = plni->plni_stamp; + tx->tx_msg.ptlm_dstnid = peer->plp_id.nid; + tx->tx_msg.ptlm_dststamp = peer->plp_stamp; + tx->tx_msg.ptlm_srcpid = the_lnet.ln_pid; + tx->tx_msg.ptlm_dstpid = peer->plp_id.pid; + } + + ptllnd_peer_addref(peer); + plni->plni_ntxs++; + + CDEBUG(D_NET, "tx=%p\n",tx); + + return tx; +} + +void +ptllnd_abort_tx(ptllnd_tx_t *tx, ptl_handle_md_t *mdh) +{ + ptllnd_peer_t *peer = tx->tx_peer; + lnet_ni_t *ni = peer->plp_ni; + int rc; + time_t start = cfs_time_current_sec(); + int w = PTLLND_WARN_LONG_WAIT; + + while (!PtlHandleIsEqual(*mdh, PTL_INVALID_HANDLE)) { + rc = PtlMDUnlink(*mdh); +#ifndef LUSTRE_PORTALS_UNLINK_SEMANTICS + if (rc == PTL_OK) /* unlink successful => no unlinked event */ + return; + LASSERT (rc == PTL_MD_IN_USE); +#endif + if (cfs_time_current_sec() > start + w) { + CWARN("Waited %ds to abort tx to %s\n", + w, libcfs_id2str(peer->plp_id)); + w *= 2; + } + /* Wait for ptllnd_tx_event() to invalidate */ + ptllnd_wait(ni, w*1000); + } +} + +void +ptllnd_cull_tx_history(ptllnd_ni_t *plni) +{ + int max = plni->plni_max_tx_history; + + while (plni->plni_ntx_history > max) { + ptllnd_tx_t *tx = list_entry(plni->plni_tx_history.next, + ptllnd_tx_t, tx_list); + list_del(&tx->tx_list); + + ptllnd_peer_decref(tx->tx_peer); + + LIBCFS_FREE(tx, offsetof(ptllnd_tx_t, tx_msg) + tx->tx_msgsize); + + LASSERT (plni->plni_ntxs > 0); + plni->plni_ntxs--; + plni->plni_ntx_history--; + } +} + +void +ptllnd_tx_done(ptllnd_tx_t *tx) +{ + ptllnd_peer_t 
*peer = tx->tx_peer; + lnet_ni_t *ni = peer->plp_ni; + ptllnd_ni_t *plni = ni->ni_data; + + /* CAVEAT EMPTOR: If this tx is being aborted, I'll continue to get + * events for this tx until it's unlinked. So I set tx_completing to + * flag the tx is getting handled */ + + if (tx->tx_completing) + return; + + tx->tx_completing = 1; + + if (!list_empty(&tx->tx_list)) + list_del_init(&tx->tx_list); + + if (tx->tx_status != 0) { + CERROR("Completing tx with error\n"); + ptllnd_debug_tx(tx); + ptllnd_close_peer(peer, tx->tx_status); + } + + ptllnd_abort_tx(tx, &tx->tx_reqmdh); + ptllnd_abort_tx(tx, &tx->tx_bulkmdh); + + if (tx->tx_niov > 0) { + LIBCFS_FREE(tx->tx_iov, tx->tx_niov * sizeof(*tx->tx_iov)); + tx->tx_niov = 0; + } + + if (tx->tx_lnetreplymsg != NULL) { + LASSERT (tx->tx_type == PTLLND_MSG_TYPE_GET); + LASSERT (tx->tx_lnetmsg != NULL); + /* Simulate GET success always */ + lnet_finalize(ni, tx->tx_lnetmsg, 0); + CDEBUG(D_NET, "lnet_finalize(tx_lnetreplymsg=%p)\n",tx->tx_lnetreplymsg); + lnet_finalize(ni, tx->tx_lnetreplymsg, tx->tx_status); + } else if (tx->tx_lnetmsg != NULL) { + lnet_finalize(ni, tx->tx_lnetmsg, tx->tx_status); + } + + plni->plni_ntx_history++; + list_add_tail(&tx->tx_list, &plni->plni_tx_history); + + ptllnd_cull_tx_history(plni); +} + +int +ptllnd_set_txiov(ptllnd_tx_t *tx, + unsigned int niov, struct iovec *iov, + unsigned int offset, unsigned int len) +{ + ptl_md_iovec_t *piov; + int npiov; + + if (len == 0) { + tx->tx_niov = 0; + return 0; + } + + CDEBUG(D_NET, "niov =%d\n",niov); + CDEBUG(D_NET, "offset=%d\n",offset); + CDEBUG(D_NET, "len =%d\n",len); + + + /* + * Remove iovec's at the beginning that + * are skipped because of the offset. + * Adjust the offset accordingly + */ + for (;;) { + LASSERT (niov > 0); + if (offset < iov->iov_len) + break; + offset -= iov->iov_len; + niov--; + iov++; + } + + CDEBUG(D_NET, "niov =%d (after)\n",niov); + CDEBUG(D_NET, "offset=%d (after)\n",offset); + CDEBUG(D_NET, "len =%d (after)\n",len); + + for (;;) { + int temp_offset = offset; + int resid = len; + LIBCFS_ALLOC(piov, niov * sizeof(*piov)); + if (piov == NULL) + return -ENOMEM; + + for (npiov = 0;; npiov++) { + CDEBUG(D_NET, "npiov=%d\n",npiov); + CDEBUG(D_NET, "offset=%d\n",temp_offset); + CDEBUG(D_NET, "len=%d\n",resid); + CDEBUG(D_NET, "iov[npiov].iov_len=%d\n",iov[npiov].iov_len); + + LASSERT (npiov < niov); + LASSERT (iov->iov_len >= temp_offset); + + piov[npiov].iov_base = iov[npiov].iov_base + temp_offset; + piov[npiov].iov_len = iov[npiov].iov_len - temp_offset; + + if (piov[npiov].iov_len >= resid) { + piov[npiov].iov_len = resid; + npiov++; + break; + } + resid -= piov[npiov].iov_len; + temp_offset = 0; + } + + if (npiov == niov) { + tx->tx_niov = niov; + tx->tx_iov = piov; + CDEBUG(D_NET, "tx->tx_iov=%p\n",tx->tx_iov); + CDEBUG(D_NET, "tx->tx_niov=%d\n",tx->tx_niov); + return 0; + } + + /* Dang! The piov I allocated was too big and it's a drag to + * have to maintain separate 'allocated' and 'used' sizes, so + * I'll just do it again; NB this doesn't happen normally... 
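+ * The retry is bounded: niov shrinks to npiov, so the second pass + * allocates exactly as many entries as it fills and must succeed.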
*/ + LIBCFS_FREE(piov, niov * sizeof(*piov)); + niov = npiov; + } +} + +void +ptllnd_set_md_buffer(ptl_md_t *md, ptllnd_tx_t *tx) +{ + unsigned int niov = tx->tx_niov; + ptl_md_iovec_t *iov = tx->tx_iov; + + LASSERT ((md->options & PTL_MD_IOVEC) == 0); + + if (niov == 0) { + md->start = NULL; + md->length = 0; + } else if (niov == 1) { + md->start = iov[0].iov_base; + md->length = iov[0].iov_len; + } else { + md->start = iov; + md->length = niov; + md->options |= PTL_MD_IOVEC; + } +} + +int +ptllnd_post_buffer(ptllnd_buffer_t *buf) +{ + lnet_ni_t *ni = buf->plb_ni; + ptllnd_ni_t *plni = ni->ni_data; + ptl_process_id_t anyid = { + .nid = PTL_NID_ANY, + .pid = PTL_PID_ANY}; + ptl_md_t md = { + .start = buf->plb_buffer, + .length = plni->plni_buffer_size, + .threshold = PTL_MD_THRESH_INF, + .max_size = plni->plni_max_msg_size, + .options = (PTLLND_MD_OPTIONS | + PTL_MD_OP_PUT | PTL_MD_MAX_SIZE | + PTL_MD_LOCAL_ALIGN8), + .user_ptr = ptllnd_obj2eventarg(buf, PTLLND_EVENTARG_TYPE_BUF), + .eq_handle = plni->plni_eqh}; + ptl_handle_me_t meh; + int rc; + + LASSERT (!buf->plb_posted); + + rc = PtlMEAttach(plni->plni_nih, plni->plni_portal, + anyid, LNET_MSG_MATCHBITS, 0, + PTL_UNLINK, PTL_INS_AFTER, &meh); + if (rc != PTL_OK) { + CERROR("PtlMEAttach failed: %d\n", rc); + return -ENOMEM; + } + + buf->plb_posted = 1; + plni->plni_nposted_buffers++; + + rc = PtlMDAttach(meh, md, LNET_UNLINK, &buf->plb_md); + if (rc == PTL_OK) + return 0; + + CERROR("PtlMDAttach failed: %d\n", rc); + + buf->plb_posted = 0; + plni->plni_nposted_buffers--; + + rc = PtlMEUnlink(meh); + LASSERT (rc == PTL_OK); + + return -ENOMEM; +} + +void +ptllnd_check_sends(ptllnd_peer_t *peer) +{ + lnet_ni_t *ni = peer->plp_ni; + ptllnd_ni_t *plni = ni->ni_data; + ptllnd_tx_t *tx; + ptl_md_t md; + ptl_handle_md_t mdh; + int rc; + + CDEBUG(D_NET, "plp_outstanding_credits=%d\n",peer->plp_outstanding_credits); + + if (list_empty(&peer->plp_txq) && + peer->plp_outstanding_credits >= + PTLLND_CREDIT_HIGHWATER(plni)) { + + tx = ptllnd_new_tx(peer, PTLLND_MSG_TYPE_NOOP, 0); + CDEBUG(D_NET, "NOOP tx=%p\n",tx); + if (tx == NULL) { + CERROR("Can't return credits to %s\n", + libcfs_id2str(peer->plp_id)); + } else { + list_add_tail(&tx->tx_list, &peer->plp_txq); + } + } + + while (!list_empty(&peer->plp_txq)) { + tx = list_entry(peer->plp_txq.next, ptllnd_tx_t, tx_list); + + CDEBUG(D_NET, "Looking at TX=%p\n",tx); + CDEBUG(D_NET, "plp_credits=%d\n",peer->plp_credits); + CDEBUG(D_NET, "plp_outstanding_credits=%d\n",peer->plp_outstanding_credits); + + LASSERT (tx->tx_msgsize > 0); + + LASSERT (peer->plp_outstanding_credits >= 0); + LASSERT (peer->plp_outstanding_credits <= + plni->plni_peer_credits); + LASSERT (peer->plp_credits >= 0); + LASSERT (peer->plp_credits <= peer->plp_max_credits); + + if (peer->plp_credits == 0) /* no credits */ + break; + + if (peer->plp_credits == 1 && /* last credit reserved for */ + peer->plp_outstanding_credits == 0) /* returning credits */ + break; + + list_del(&tx->tx_list); + list_add_tail(&tx->tx_list, &peer->plp_activeq); + + CDEBUG(D_NET, "Sending at TX=%p type=%s (%d)\n",tx, + ptllnd_msgtype2str(tx->tx_type),tx->tx_type); + + if (tx->tx_type == PTLLND_MSG_TYPE_NOOP && + (!list_empty(&peer->plp_txq) || + peer->plp_outstanding_credits < + PTLLND_CREDIT_HIGHWATER(plni))) { + /* redundant NOOP */ + ptllnd_tx_done(tx); + continue; + } + + /* Set stamp at the last minute; on a new peer, I don't know it + * until I receive the HELLO back */ + tx->tx_msg.ptlm_dststamp = peer->plp_stamp; + + CDEBUG(D_NET, "Returning %d 
to peer\n",peer->plp_outstanding_credits); + + /* + * Return all the credits we have + */ + tx->tx_msg.ptlm_credits = peer->plp_outstanding_credits; + peer->plp_outstanding_credits = 0; + + /* + * One less credit + */ + peer->plp_credits--; + + if (plni->plni_checksum) + tx->tx_msg.ptlm_cksum = + ptllnd_cksum(&tx->tx_msg, + offsetof(kptl_msg_t, ptlm_u)); + + md.user_ptr = ptllnd_obj2eventarg(tx, PTLLND_EVENTARG_TYPE_TX); + md.eq_handle = plni->plni_eqh; + md.threshold = 1; + md.options = PTLLND_MD_OPTIONS; + md.start = &tx->tx_msg; + md.length = tx->tx_msgsize; + + rc = PtlMDBind(plni->plni_nih, md, LNET_UNLINK, &mdh); + if (rc != PTL_OK) { + CERROR("PtlMDBind for %s failed: %d\n", + libcfs_id2str(peer->plp_id), rc); + tx->tx_status = -EIO; + ptllnd_tx_done(tx); + break; + } + + tx->tx_reqmdh = mdh; + PTLLND_DBGT_STAMP(tx->tx_req_posted); + + rc = PtlPut(mdh, PTL_NOACK_REQ, peer->plp_ptlid, + plni->plni_portal, 0, LNET_MSG_MATCHBITS, 0, 0); + if (rc != PTL_OK) { + CERROR("PtlPut for %s failed: %d\n", + libcfs_id2str(peer->plp_id), rc); + tx->tx_status = -EIO; + ptllnd_tx_done(tx); + break; + } + } +} + +int +ptllnd_passive_rdma(ptllnd_peer_t *peer, int type, lnet_msg_t *msg, + unsigned int niov, struct iovec *iov, + unsigned int offset, unsigned int len) +{ + lnet_ni_t *ni = peer->plp_ni; + ptllnd_ni_t *plni = ni->ni_data; + ptllnd_tx_t *tx = ptllnd_new_tx(peer, type, 0); + __u64 matchbits; + ptl_md_t md; + ptl_handle_md_t mdh; + ptl_handle_me_t meh; + int rc; + int rc2; + time_t start; + int w; + + CDEBUG(D_NET, "niov=%d offset=%d len=%d\n",niov,offset,len); + + LASSERT (type == PTLLND_MSG_TYPE_GET || + type == PTLLND_MSG_TYPE_PUT); + + if (tx == NULL) { + CERROR("Can't allocate %s tx for %s\n", + type == PTLLND_MSG_TYPE_GET ? "GET" : "PUT/REPLY", + libcfs_id2str(peer->plp_id)); + return -ENOMEM; + } + + rc = ptllnd_set_txiov(tx, niov, iov, offset, len); + if (rc != 0) { + CERROR ("Can't allocate iov %d for %s\n", + niov, libcfs_id2str(peer->plp_id)); + rc = -ENOMEM; + goto failed; + } + + md.user_ptr = ptllnd_obj2eventarg(tx, PTLLND_EVENTARG_TYPE_TX); + md.eq_handle = plni->plni_eqh; + md.threshold = 1; + md.max_size = 0; + md.options = PTLLND_MD_OPTIONS; + if(type == PTLLND_MSG_TYPE_GET) + md.options |= PTL_MD_OP_PUT | PTL_MD_ACK_DISABLE; + else + md.options |= PTL_MD_OP_GET; + ptllnd_set_md_buffer(&md, tx); + + start = cfs_time_current_sec(); + w = PTLLND_WARN_LONG_WAIT; + + while (!peer->plp_recvd_hello) { /* wait to validate plp_match */ + if (peer->plp_closing) { + rc = -EIO; + goto failed; + } + if (cfs_time_current_sec() > start + w) { + CWARN("Waited %ds to connect to %s\n", + w, libcfs_id2str(peer->plp_id)); + w *= 2; + } + ptllnd_wait(ni, w*1000); + } + + if (peer->plp_match < PTL_RESERVED_MATCHBITS) + peer->plp_match = PTL_RESERVED_MATCHBITS; + matchbits = peer->plp_match++; + CDEBUG(D_NET, "matchbits " LPX64 " %s\n", matchbits, + ptllnd_ptlid2str(peer->plp_ptlid)); + + rc = PtlMEAttach(plni->plni_nih, plni->plni_portal, peer->plp_ptlid, + matchbits, 0, PTL_UNLINK, PTL_INS_BEFORE, &meh); + if (rc != PTL_OK) { + CERROR("PtlMEAttach for %s failed: %d\n", + libcfs_id2str(peer->plp_id), rc); + rc = -EIO; + goto failed; + } + + CDEBUG(D_NET, "md.start=%p\n",md.start); + CDEBUG(D_NET, "md.length=%d\n",md.length); + CDEBUG(D_NET, "md.threshold=%d\n",md.threshold); + CDEBUG(D_NET, "md.max_size=%d\n",md.max_size); + CDEBUG(D_NET, "md.options=0x%x\n",md.options); + CDEBUG(D_NET, "md.user_ptr=%p\n",md.user_ptr); + + PTLLND_DBGT_STAMP(tx->tx_bulk_posted); + + rc = PtlMDAttach(meh, md, 
LNET_UNLINK, &mdh); + if (rc != PTL_OK) { + CERROR("PtlMDAttach for %s failed: %d\n", + libcfs_id2str(peer->plp_id), rc); + rc2 = PtlMEUnlink(meh); + LASSERT (rc2 == PTL_OK); + rc = -EIO; + goto failed; + } + tx->tx_bulkmdh = mdh; + + /* + * We need to set the stamp here because + * we could have received a HELLO above that set + * peer->plp_stamp + */ + tx->tx_msg.ptlm_dststamp = peer->plp_stamp; + + tx->tx_msg.ptlm_u.rdma.kptlrm_hdr = msg->msg_hdr; + tx->tx_msg.ptlm_u.rdma.kptlrm_matchbits = matchbits; + + if (type == PTLLND_MSG_TYPE_GET) { + tx->tx_lnetreplymsg = lnet_create_reply_msg(ni, msg); + if (tx->tx_lnetreplymsg == NULL) { + CERROR("Can't create reply for GET to %s\n", + libcfs_id2str(msg->msg_target)); + rc = -ENOMEM; + goto failed; + } + } + + tx->tx_lnetmsg = msg; + ptllnd_post_tx(tx); + return 0; + + failed: + ptllnd_tx_done(tx); + return rc; +} + +int +ptllnd_active_rdma(ptllnd_peer_t *peer, int type, + lnet_msg_t *msg, __u64 matchbits, + unsigned int niov, struct iovec *iov, + unsigned int offset, unsigned int len) +{ + lnet_ni_t *ni = peer->plp_ni; + ptllnd_ni_t *plni = ni->ni_data; + ptllnd_tx_t *tx = ptllnd_new_tx(peer, type, 0); + ptl_md_t md; + ptl_handle_md_t mdh; + int rc; + + LASSERT (type == PTLLND_RDMA_READ || + type == PTLLND_RDMA_WRITE); + + if (tx == NULL) { + CERROR("Can't allocate tx for RDMA %s with %s\n", + (type == PTLLND_RDMA_WRITE) ? "write" : "read", + libcfs_id2str(peer->plp_id)); + ptllnd_close_peer(peer, -ENOMEM); + return -ENOMEM; + } + + rc = ptllnd_set_txiov(tx, niov, iov, offset, len); + if (rc != 0) { + CERROR ("Can't allocate iov %d for %s\n", + niov, libcfs_id2str(peer->plp_id)); + rc = -ENOMEM; + goto failed; + } + + md.user_ptr = ptllnd_obj2eventarg(tx, PTLLND_EVENTARG_TYPE_TX); + md.eq_handle = plni->plni_eqh; + md.max_size = 0; + md.options = PTLLND_MD_OPTIONS; + md.threshold = (type == PTLLND_RDMA_READ) ? 2 : 1; + + ptllnd_set_md_buffer(&md, tx); + + rc = PtlMDBind(plni->plni_nih, md, LNET_UNLINK, &mdh); + if (rc != PTL_OK) { + CERROR("PtlMDBind for %s failed: %d\n", + libcfs_id2str(peer->plp_id), rc); + rc = -EIO; + goto failed; + } + + tx->tx_bulkmdh = mdh; + tx->tx_lnetmsg = msg; + + list_add_tail(&tx->tx_list, &peer->plp_activeq); + PTLLND_DBGT_STAMP(tx->tx_bulk_posted); + + if (type == PTLLND_RDMA_READ) + rc = PtlGet(mdh, peer->plp_ptlid, + plni->plni_portal, 0, matchbits, 0); + else + rc = PtlPut(mdh, PTL_NOACK_REQ, peer->plp_ptlid, + plni->plni_portal, 0, matchbits, 0, + (msg == NULL) ? PTLLND_RDMA_FAIL : PTLLND_RDMA_OK); + + if (rc == PTL_OK) + return 0; + + CERROR("Can't initiate RDMA with %s: %d\n", + libcfs_id2str(peer->plp_id), rc); + + tx->tx_lnetmsg = NULL; + failed: + tx->tx_status = rc; + ptllnd_tx_done(tx); /* this will close peer */ + return rc; +} + +int +ptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *msg) +{ + ptllnd_ni_t *plni = ni->ni_data; + ptllnd_peer_t *plp; + ptllnd_tx_t *tx; + int nob; + int rc; + + LASSERT (!msg->msg_routing); + LASSERT (msg->msg_kiov == NULL); + + LASSERT (msg->msg_niov <= PTL_MD_MAX_IOV); /* !!! */ + + CDEBUG(D_NET, "%s [%d]+%d,%d -> %s%s\n", + lnet_msgtyp2str(msg->msg_type), + msg->msg_niov, msg->msg_offset, msg->msg_len, + libcfs_nid2str(msg->msg_target.nid), + msg->msg_target_is_router ?
"(rtr)" : ""); + + if ((msg->msg_target.pid & LNET_PID_USERFLAG) != 0) { + CERROR("Can't send to non-kernel peer %s\n", + libcfs_id2str(msg->msg_target)); + return -EHOSTUNREACH; + } + + plp = ptllnd_find_peer(ni, msg->msg_target, 1); + if (plp == NULL) + return -ENOMEM; + + switch (msg->msg_type) { + default: + LBUG(); + + case LNET_MSG_ACK: + CDEBUG(D_NET, "LNET_MSG_ACK\n"); + + LASSERT (msg->msg_len == 0); + break; /* send IMMEDIATE */ + + case LNET_MSG_GET: + CDEBUG(D_NET, "LNET_MSG_GET nob=%d\n",msg->msg_md->md_length); + + if (msg->msg_target_is_router) + break; /* send IMMEDIATE */ + + nob = msg->msg_md->md_length; + nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[nob]); + if (nob <= plni->plni_max_msg_size) + break; + + LASSERT ((msg->msg_md->md_options & LNET_MD_KIOV) == 0); + rc = ptllnd_passive_rdma(plp, PTLLND_MSG_TYPE_GET, msg, + msg->msg_md->md_niov, + msg->msg_md->md_iov.iov, + 0, msg->msg_md->md_length); + ptllnd_peer_decref(plp); + return rc; + + case LNET_MSG_REPLY: + case LNET_MSG_PUT: + CDEBUG(D_NET, "LNET_MSG_PUT nob=%d\n",msg->msg_len); + nob = msg->msg_len; + nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[nob]); + CDEBUG(D_NET, "msg_size=%d max=%d\n",msg->msg_len,plp->plp_max_msg_size); + if (nob <= plp->plp_max_msg_size) + break; /* send IMMEDIATE */ + + rc = ptllnd_passive_rdma(plp, PTLLND_MSG_TYPE_PUT, msg, + msg->msg_niov, msg->msg_iov, + msg->msg_offset, msg->msg_len); + ptllnd_peer_decref(plp); + return rc; + } + + /* send IMMEDIATE + * NB copy the payload so we don't have to do a fragmented send */ + + CDEBUG(D_NET, "IMMEDIATE len=%d\n", msg->msg_len); + tx = ptllnd_new_tx(plp, PTLLND_MSG_TYPE_IMMEDIATE, msg->msg_len); + if (tx == NULL) { + CERROR("Can't allocate tx for lnet type %d to %s\n", + msg->msg_type, libcfs_id2str(msg->msg_target)); + ptllnd_peer_decref(plp); + return -ENOMEM; + } + + lnet_copy_iov2flat(tx->tx_msgsize, &tx->tx_msg, + offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload), + msg->msg_niov, msg->msg_iov, msg->msg_offset, + msg->msg_len); + tx->tx_msg.ptlm_u.immediate.kptlim_hdr = msg->msg_hdr; + + tx->tx_lnetmsg = msg; + ptllnd_post_tx(tx); + ptllnd_peer_decref(plp); + return 0; +} + +void +ptllnd_rx_done(ptllnd_rx_t *rx) +{ + ptllnd_peer_t *plp = rx->rx_peer; + lnet_ni_t *ni = plp->plp_ni; + ptllnd_ni_t *plni = ni->ni_data; + + CDEBUG(D_NET, "rx=%p\n", rx); + + plp->plp_outstanding_credits++; + ptllnd_check_sends(rx->rx_peer); + + LASSERT (plni->plni_nrxs > 0); + plni->plni_nrxs--; +} + +int +ptllnd_eager_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, + void **new_privatep) +{ + /* Shouldn't get here; recvs only block for router buffers */ + LBUG(); + return 0; +} + +int +ptllnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + ptllnd_rx_t *rx = private; + int rc = 0; + int nob; + + LASSERT (kiov == NULL); + LASSERT (niov <= PTL_MD_MAX_IOV); /* !!! 
*/ + + switch (rx->rx_msg->ptlm_type) { + default: + LBUG(); + + case PTLLND_MSG_TYPE_IMMEDIATE: + nob = offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload[mlen]); + CDEBUG(D_NET, "PTLLND_MSG_TYPE_IMMEDIATE nob=%d\n",nob); + if (nob > rx->rx_nob) { + CERROR("Immediate message from %s too big: %d(%d)\n", + libcfs_id2str(rx->rx_peer->plp_id), + nob, rx->rx_nob); + rc = -EPROTO; + break; + } + lnet_copy_flat2iov(niov, iov, offset, + rx->rx_nob, rx->rx_msg, + offsetof(kptl_msg_t, ptlm_u.immediate.kptlim_payload), + mlen); + lnet_finalize(ni, msg, 0); + break; + + case PTLLND_MSG_TYPE_PUT: + CDEBUG(D_NET, "PTLLND_MSG_TYPE_PUT offset=%d mlen=%d\n",offset,mlen); + rc = ptllnd_active_rdma(rx->rx_peer, PTLLND_RDMA_READ, msg, + rx->rx_msg->ptlm_u.rdma.kptlrm_matchbits, + niov, iov, offset, mlen); + break; + + case PTLLND_MSG_TYPE_GET: + CDEBUG(D_NET, "PTLLND_MSG_TYPE_GET\n"); + if (msg != NULL) + rc = ptllnd_active_rdma(rx->rx_peer, PTLLND_RDMA_WRITE, msg, + rx->rx_msg->ptlm_u.rdma.kptlrm_matchbits, + msg->msg_niov, msg->msg_iov, + msg->msg_offset, msg->msg_len); + else + rc = ptllnd_active_rdma(rx->rx_peer, PTLLND_RDMA_WRITE, NULL, + rx->rx_msg->ptlm_u.rdma.kptlrm_matchbits, + 0, NULL, 0, 0); + break; + } + + ptllnd_rx_done(rx); + return rc; +} + +void +ptllnd_abort_on_nak(lnet_ni_t *ni) +{ + ptllnd_ni_t *plni = ni->ni_data; + + if (plni->plni_abort_on_nak) + abort(); +} + +void +ptllnd_parse_request(lnet_ni_t *ni, ptl_process_id_t initiator, + kptl_msg_t *msg, unsigned int nob) +{ + ptllnd_ni_t *plni = ni->ni_data; + const int basenob = offsetof(kptl_msg_t, ptlm_u); + lnet_process_id_t srcid; + ptllnd_rx_t rx; + int flip; + __u16 msg_version; + __u32 msg_cksum; + ptllnd_peer_t *plp; + int rc; + + if (nob < 6) { + CERROR("Very short receive from %s\n", + ptllnd_ptlid2str(initiator)); + return; + } + + /* I can at least read MAGIC/VERSION */ + + flip = msg->ptlm_magic == __swab32(PTLLND_MSG_MAGIC); + if (!flip && msg->ptlm_magic != PTLLND_MSG_MAGIC) { + CERROR("Bad protocol magic %08x from %s\n", + msg->ptlm_magic, ptllnd_ptlid2str(initiator)); + return; + } + + msg_version = flip ? __swab16(msg->ptlm_version) : msg->ptlm_version; + + if (msg_version != PTLLND_MSG_VERSION) { + CERROR("Bad protocol version %04x from %s\n", + (__u32)msg_version, ptllnd_ptlid2str(initiator)); + ptllnd_abort_on_nak(ni); + return; + } + + if (nob < basenob) { + CERROR("Short receive from %s: got %d, wanted at least %d\n", + ptllnd_ptlid2str(initiator), nob, basenob); + return; + } + + /* checksum must be computed with + * 1) ptlm_cksum zero and + * 2) BEFORE anything gets modified/flipped + */ + msg_cksum = flip ? 
__swab32(msg->ptlm_cksum) : msg->ptlm_cksum; + msg->ptlm_cksum = 0; + if (msg_cksum != 0 && + msg_cksum != ptllnd_cksum(msg, offsetof(kptl_msg_t, ptlm_u))) { + CERROR("Bad checksum from %s\n", ptllnd_ptlid2str(initiator)); + return; + } + + msg->ptlm_version = msg_version; + msg->ptlm_cksum = msg_cksum; + + if (flip) { + /* NB stamps are opaque cookies */ + __swab32s(&msg->ptlm_nob); + __swab64s(&msg->ptlm_srcnid); + __swab64s(&msg->ptlm_dstnid); + __swab32s(&msg->ptlm_srcpid); + __swab32s(&msg->ptlm_dstpid); + } + + srcid.nid = msg->ptlm_srcnid; + srcid.pid = msg->ptlm_srcpid; + + if (LNET_NIDNET(msg->ptlm_srcnid) != LNET_NIDNET(ni->ni_nid)) { + CERROR("Bad source id %s from %s\n", + libcfs_id2str(srcid), + ptllnd_ptlid2str(initiator)); + return; + } + + if (msg->ptlm_type == PTLLND_MSG_TYPE_NAK) { + CERROR("NAK from %s (%s)\n", + libcfs_id2str(srcid), + ptllnd_ptlid2str(initiator)); + ptllnd_abort_on_nak(ni); + return; + } + + if (msg->ptlm_dstnid != ni->ni_nid || + msg->ptlm_dstpid != the_lnet.ln_pid) { + CERROR("Bad dstid %s (%s expected) from %s\n", + libcfs_id2str((lnet_process_id_t) { + .nid = msg->ptlm_dstnid, + .pid = msg->ptlm_dstpid}), + libcfs_id2str((lnet_process_id_t) { + .nid = ni->ni_nid, + .pid = the_lnet.ln_pid}), + libcfs_id2str(srcid)); + return; + } + + if (msg->ptlm_dststamp != plni->plni_stamp) { + CERROR("Bad dststamp "LPX64"("LPX64" expected) from %s\n", + msg->ptlm_dststamp, plni->plni_stamp, + libcfs_id2str(srcid)); + return; + } + + switch (msg->ptlm_type) { + case PTLLND_MSG_TYPE_PUT: + case PTLLND_MSG_TYPE_GET: + CDEBUG(D_NET, "PTLLND_MSG_TYPE_%s\n", + msg->ptlm_type==PTLLND_MSG_TYPE_PUT ? "PUT" : "GET"); + if (nob < basenob + sizeof(kptl_rdma_msg_t)) { + CERROR("Short rdma request from %s(%s)\n", + libcfs_id2str(srcid), + ptllnd_ptlid2str(initiator)); + return; + } + if (flip) + __swab64s(&msg->ptlm_u.rdma.kptlrm_matchbits); + break; + + case PTLLND_MSG_TYPE_IMMEDIATE: + CDEBUG(D_NET, "PTLLND_MSG_TYPE_IMMEDIATE\n"); + if (nob < offsetof(kptl_msg_t, + ptlm_u.immediate.kptlim_payload)) { + CERROR("Short immediate from %s(%s)\n", + libcfs_id2str(srcid), + ptllnd_ptlid2str(initiator)); + return; + } + break; + + case PTLLND_MSG_TYPE_HELLO: + CDEBUG(D_NET, "PTLLND_MSG_TYPE_HELLO from %s(%s)\n", + libcfs_id2str(srcid), + ptllnd_ptlid2str(initiator)); + if (nob < basenob + sizeof(kptl_hello_msg_t)) { + CERROR("Short hello from %s(%s)\n", + libcfs_id2str(srcid), + ptllnd_ptlid2str(initiator)); + return; + } + if(flip){ + __swab64s(&msg->ptlm_u.hello.kptlhm_matchbits); + __swab32s(&msg->ptlm_u.hello.kptlhm_max_msg_size); + } + break; + + case PTLLND_MSG_TYPE_NOOP: + CDEBUG(D_NET, "PTLLND_MSG_TYPE_NOOP from %s(%s)\n", + libcfs_id2str(srcid), + ptllnd_ptlid2str(initiator)); + break; + + default: + CERROR("Bad message type %d from %s(%s)\n", msg->ptlm_type, + libcfs_id2str(srcid), + ptllnd_ptlid2str(initiator)); + return; + } + + plp = ptllnd_find_peer(ni, srcid, + msg->ptlm_type == PTLLND_MSG_TYPE_HELLO); + if (plp == NULL) { + CERROR("Can't find peer %s\n", libcfs_id2str(srcid)); + return; + } + + if (msg->ptlm_type == PTLLND_MSG_TYPE_HELLO) { + if (plp->plp_recvd_hello) { + CERROR("Unexpected HELLO from %s\n", + libcfs_id2str(srcid)); + ptllnd_peer_decref(plp); + return; + } + + CDEBUG(D_NET, "maxsz %d match "LPX64" stamp "LPX64"\n", + msg->ptlm_u.hello.kptlhm_max_msg_size, + msg->ptlm_u.hello.kptlhm_matchbits, + msg->ptlm_srcstamp); + + plp->plp_max_msg_size = MAX(plni->plni_max_msg_size, + msg->ptlm_u.hello.kptlhm_max_msg_size); + plp->plp_match = 
msg->ptlm_u.hello.kptlhm_matchbits; + plp->plp_stamp = msg->ptlm_srcstamp; + plp->plp_max_credits += msg->ptlm_credits; + plp->plp_recvd_hello = 1; + + CDEBUG(D_NET, "plp_max_msg_size=%d\n",plp->plp_max_msg_size); + + } else if (!plp->plp_recvd_hello) { + + CERROR("Bad message type %d (HELLO expected) from %s\n", + msg->ptlm_type, libcfs_id2str(srcid)); + ptllnd_peer_decref(plp); + return; + + } else if (msg->ptlm_srcstamp != plp->plp_stamp) { + + CERROR("Bad srcstamp "LPX64"("LPX64" expected) from %s\n", + msg->ptlm_srcstamp, plp->plp_stamp, + libcfs_id2str(srcid)); + ptllnd_peer_decref(plp); + return; + } + + if (msg->ptlm_credits > 0) { + CDEBUG(D_NET, "Getting back %d credits from peer\n",msg->ptlm_credits); + if (plp->plp_credits + msg->ptlm_credits > + plp->plp_max_credits) { + CWARN("Too many credits from %s: %d + %d > %d\n", + libcfs_id2str(srcid), + plp->plp_credits, msg->ptlm_credits, + plp->plp_max_credits); + plp->plp_credits = plp->plp_max_credits; + } else { + plp->plp_credits += msg->ptlm_credits; + } + ptllnd_check_sends(plp); + } + + /* All OK so far; assume the message is good... */ + + rx.rx_peer = plp; + rx.rx_msg = msg; + rx.rx_nob = nob; + plni->plni_nrxs++; + + CDEBUG(D_NET, "rx=%p type=%d\n",&rx,msg->ptlm_type); + + switch (msg->ptlm_type) { + default: /* message types have been checked already */ + ptllnd_rx_done(&rx); + break; + + case PTLLND_MSG_TYPE_PUT: + case PTLLND_MSG_TYPE_GET: + CDEBUG(D_NET, "PTLLND_MSG_TYPE_%s\n", + msg->ptlm_type==PTLLND_MSG_TYPE_PUT ? "PUT" : "GET"); + rc = lnet_parse(ni, &msg->ptlm_u.rdma.kptlrm_hdr, + msg->ptlm_srcnid, &rx, 1); + CDEBUG(D_NET, "lnet_parse rc=%d\n",rc); + if (rc < 0) + ptllnd_rx_done(&rx); + break; + + case PTLLND_MSG_TYPE_IMMEDIATE: + CDEBUG(D_NET, "PTLLND_MSG_TYPE_IMMEDIATE\n"); + rc = lnet_parse(ni, &msg->ptlm_u.immediate.kptlim_hdr, + msg->ptlm_srcnid, &rx, 0); + CDEBUG(D_NET, "lnet_parse rc=%d\n",rc); + if (rc < 0) + ptllnd_rx_done(&rx); + break; + } + + ptllnd_peer_decref(plp); +} + +void +ptllnd_buf_event (lnet_ni_t *ni, ptl_event_t *event) +{ + ptllnd_buffer_t *buf = ptllnd_eventarg2obj(event->md.user_ptr); + ptllnd_ni_t *plni = ni->ni_data; + char *msg = &buf->plb_buffer[event->offset]; + int repost; + int unlinked = event->type == PTL_EVENT_UNLINK; + + LASSERT (buf->plb_ni == ni); + LASSERT (event->type == PTL_EVENT_PUT_END || + event->type == PTL_EVENT_UNLINK); + + CDEBUG(D_NET, "buf=%p event=%d\n",buf,event->type); + + if (event->ni_fail_type != PTL_NI_OK) { + + CERROR("event type %d, status %d from %s\n", + event->type, event->ni_fail_type, + ptllnd_ptlid2str(event->initiator)); + + } else if (event->type == PTL_EVENT_PUT_END) { +#if (PTL_MD_LOCAL_ALIGN8 == 0) + /* Portals can't force message alignment - someone sending an + * odd-length message could misalign subsequent messages */ + if ((event->mlength & 7) != 0) { + CERROR("Message from %s has odd length %d: " + "probable version incompatibility\n", + ptllnd_ptlid2str(event->initiator), + event->mlength); + LBUG(); + } +#endif + LASSERT ((event->offset & 7) == 0); + + ptllnd_parse_request(ni, event->initiator, + (kptl_msg_t *)msg, event->mlength); + } + +#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS + /* UNLINK event only on explicit unlink */ + repost = (event->unlinked && event->type != PTL_EVENT_UNLINK); + if (event->unlinked) + unlinked = 1; +#else + /* UNLINK event only on implicit unlink */ + repost = (event->type == PTL_EVENT_UNLINK); +#endif + + CDEBUG(D_NET, "repost=%d unlinked=%d\n",repost,unlinked); + + if (unlinked) { + LASSERT(buf->plb_posted); + 
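+ /* the MD is gone: clear the posted state before any repost */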
buf->plb_posted = 0; + plni->plni_nposted_buffers--; + } + + if (repost) + (void) ptllnd_post_buffer(buf); +} + +void +ptllnd_tx_event (lnet_ni_t *ni, ptl_event_t *event) +{ + ptllnd_ni_t *plni = ni->ni_data; + ptllnd_tx_t *tx = ptllnd_eventarg2obj(event->md.user_ptr); + int error = (event->ni_fail_type != PTL_NI_OK); + int isreq; + int isbulk; +#ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS + int unlinked = event->unlinked; +#else + int unlinked = (event->type == PTL_EVENT_UNLINK); +#endif + + if (error) + CERROR("Error event type %d for %s for %s\n", + event->type, ptllnd_msgtype2str(tx->tx_type), + libcfs_id2str(tx->tx_peer->plp_id)); + + LASSERT (!PtlHandleIsEqual(event->md_handle, PTL_INVALID_HANDLE)); + + CDEBUG(D_NET, "tx=%p type=%s (%d)\n",tx, + ptllnd_msgtype2str(tx->tx_type),tx->tx_type); + CDEBUG(D_NET, "unlinked=%d\n",unlinked); + CDEBUG(D_NET, "error=%d\n",error); + + isreq = PtlHandleIsEqual(event->md_handle, tx->tx_reqmdh); + CDEBUG(D_NET, "isreq=%d\n",isreq); + if (isreq) { + LASSERT (event->md.start == (void *)&tx->tx_msg); + if (unlinked) { + tx->tx_reqmdh = PTL_INVALID_HANDLE; + PTLLND_DBGT_STAMP(tx->tx_req_done); + } + } + + isbulk = PtlHandleIsEqual(event->md_handle, tx->tx_bulkmdh); + CDEBUG(D_NET, "isbulk=%d\n",isbulk); + if ( isbulk && unlinked ) { + tx->tx_bulkmdh = PTL_INVALID_HANDLE; + PTLLND_DBGT_STAMP(tx->tx_bulk_done); + } + + LASSERT (!isreq != !isbulk); /* always one and only 1 match */ + + switch (tx->tx_type) { + default: + LBUG(); + + case PTLLND_MSG_TYPE_NOOP: + case PTLLND_MSG_TYPE_HELLO: + case PTLLND_MSG_TYPE_IMMEDIATE: + LASSERT (event->type == PTL_EVENT_UNLINK || + event->type == PTL_EVENT_SEND_END); + LASSERT (isreq); + break; + + case PTLLND_MSG_TYPE_GET: + LASSERT (event->type == PTL_EVENT_UNLINK || + (isreq && event->type == PTL_EVENT_SEND_END) || + (isbulk && event->type == PTL_EVENT_PUT_END)); + + if (isbulk && !error && event->type == PTL_EVENT_PUT_END) { + /* Check GET matched */ + if (event->hdr_data == PTLLND_RDMA_OK) { + lnet_set_reply_msg_len(ni, + tx->tx_lnetreplymsg, + event->mlength); + } else { + CERROR ("Unmatched GET with %s\n", + libcfs_id2str(tx->tx_peer->plp_id)); + tx->tx_status = -EIO; + } + } + break; + + case PTLLND_MSG_TYPE_PUT: + LASSERT (event->type == PTL_EVENT_UNLINK || + (isreq && event->type == PTL_EVENT_SEND_END) || + (isbulk && event->type == PTL_EVENT_GET_END)); + break; + + case PTLLND_RDMA_READ: + LASSERT (event->type == PTL_EVENT_UNLINK || + event->type == PTL_EVENT_SEND_END || + event->type == PTL_EVENT_REPLY_END); + LASSERT (isbulk); + break; + + case PTLLND_RDMA_WRITE: + LASSERT (event->type == PTL_EVENT_UNLINK || + event->type == PTL_EVENT_SEND_END); + LASSERT (isbulk); + } + + /* Schedule ptllnd_tx_done() on error or last completion event */ + if (error || + (PtlHandleIsEqual(tx->tx_bulkmdh, PTL_INVALID_HANDLE) && + PtlHandleIsEqual(tx->tx_reqmdh, PTL_INVALID_HANDLE))) { + if (error) + tx->tx_status = -EIO; + list_del(&tx->tx_list); + list_add_tail(&tx->tx_list, &plni->plni_zombie_txs); + CDEBUG(D_NET, "tx=%p ONTO ZOMBIE LIST\n",tx); + } +} + +void +ptllnd_wait (lnet_ni_t *ni, int milliseconds) +{ + ptllnd_ni_t *plni = ni->ni_data; + ptllnd_tx_t *tx; + ptl_event_t event; + int which; + int rc; + int blocked = 0; + int found = 0; + int timeout = 0; + + /* Handle any currently queued events, returning immediately if any. + * Otherwise block for the timeout and handle all events queued + * then. 
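+ * Finally, complete any txs that the event handlers have queued on + * the zombie list.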
*/ + + for (;;) { + time_t then = cfs_time_current_sec(); + + CDEBUG(D_NET, "Poll(%d)\n", timeout); + + rc = PtlEQPoll(&plni->plni_eqh, 1, + (timeout < 0) ? PTL_TIME_FOREVER : timeout, + &event, &which); + + if (timeout >= 0 && + (cfs_time_current_sec() - then)*1000 > timeout + 1000) { + /* 1000 mS grace.............................^ */ + CERROR("SLOW PtlEQPoll(%d): %d seconds\n", timeout, + (int)(cfs_time_current_sec() - then)); + } + + CDEBUG(D_NET, "PtlEQPoll rc=%d\n",rc); + timeout = 0; + + if (rc == PTL_EQ_EMPTY) { + if (found || /* handled some events */ + milliseconds == 0 || /* just checking */ + blocked) /* blocked already */ + break; + + blocked = 1; + timeout = milliseconds; + continue; + } + + LASSERT (rc == PTL_OK || rc == PTL_EQ_DROPPED); + + if (rc == PTL_EQ_DROPPED) + CERROR("Event queue: size %d is too small\n", + plni->plni_eq_size); + + CDEBUG(D_NET, "event.type=%s(%d)\n", + ptllnd_evtype2str(event.type),event.type); + + found = 1; + switch (ptllnd_eventarg2type(event.md.user_ptr)) { + default: + LBUG(); + + case PTLLND_EVENTARG_TYPE_TX: + ptllnd_tx_event(ni, &event); + break; + + case PTLLND_EVENTARG_TYPE_BUF: + ptllnd_buf_event(ni, &event); + break; + } + } + + while (!list_empty(&plni->plni_zombie_txs)) { + tx = list_entry(plni->plni_zombie_txs.next, + ptllnd_tx_t, tx_list); + CDEBUG(D_NET, "Process ZOMBIE tx=%p\n",tx); + ptllnd_tx_done(tx); + } +} diff --git a/lnet/ulnds/select.c b/lnet/ulnds/select.c deleted file mode 100644 index 49c770f..0000000 --- a/lnet/ulnds/select.c +++ /dev/null @@ -1,421 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2002 Eric Hoffman - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* select.c: - * Provides a general mechanism for registering and dispatching - * io events through the select system call. - */ - -#define DEBUG_SUBSYSTEM S_NAL - -#ifdef sun -#include -#else -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -static struct timeval beginning_of_epoch; -static io_handler io_handlers; - -/* Function: now - * - * Return: the current time in canonical units: a 64 bit number - * where the most significant 32 bits contains the number - * of seconds, and the least signficant a count of (1/(2^32))ths - * of a second. 
- */ -when now() -{ - struct timeval result; - - gettimeofday(&result,0); - return((((unsigned long long)result.tv_sec)<<32)| - (((unsigned long long)result.tv_usec)<<32)/1000000); -} - - -/* Function: register_io_handler - * Arguments: fd: the file descriptor of interest - * type: a mask of READ_HANDLER, WRITE_HANDLER, EXCEPTION_HANDLER - * function: a function to call when io is available on fd - * arg: an opaque correlator to return to the handler - * Returns: a pointer to the io_handler structure - */ -io_handler register_io_handler(int fd, - int type, - int (*function)(void *), - void *arg) -{ - io_handler i=(io_handler)malloc(sizeof(struct io_handler)); - if ((i->fd=fd)>=0){ - i->type=type; - i->function=function; - i->argument=arg; - i->disabled=0; - i->last=&io_handlers; - if ((i->next=io_handlers)) i->next->last=&i->next; - io_handlers=i; - } - return(i); -} - -/* Function: remove_io_handler - * Arguments: i: a pointer to the handler to stop servicing - * - * remove_io_handler() doesn't actually free the handler, due - * to reentrancy problems. it just marks the handler for - * later cleanup by the blocking function. - */ -void remove_io_handler (io_handler i) -{ - i->disabled=1; -} - -static void set_flag(io_handler n,fd_set *r, fd_set *w, fd_set *e) -{ - if (n->type & READ_HANDLER) FD_SET(n->fd, r); - if (n->type & WRITE_HANDLER) FD_SET(n->fd, w); - if (n->type & EXCEPTION_HANDLER) FD_SET(n->fd, e); -} - -static int prepare_fd_sets(fd_set *r, fd_set *w, fd_set *e) -{ - io_handler j; - io_handler *k; - int max = 0; - - FD_ZERO(r); - FD_ZERO(w); - FD_ZERO(e); - for (k=&io_handlers;*k;){ - if ((*k)->disabled){ - j=*k; - *k=(*k)->next; - free(j); - } - if (*k) { - set_flag(*k,r,w,e); - if ((*k)->fd > max) - max = (*k)->fd; - k=&(*k)->next; - } - } - return max + 1; -} - -static int execute_callbacks(fd_set *r, fd_set *w, fd_set *e) -{ - io_handler j; - int n = 0, t; - - for (j = io_handlers; j; j = j->next) { - if (j->disabled) - continue; - - t = 0; - if (FD_ISSET(j->fd, r) && (j->type & READ_HANDLER)) { - FD_CLR(j->fd, r); - t++; - } - if (FD_ISSET(j->fd, w) && (j->type & WRITE_HANDLER)) { - FD_CLR(j->fd, w); - t++; - } - if (FD_ISSET(j->fd, e) && (j->type & EXCEPTION_HANDLER)) { - FD_CLR(j->fd, e); - t++; - } - if (t == 0) - continue; - - if (!(*j->function)(j->argument)) - j->disabled = 1; - - n += t; - } - - return n; -} - -#ifdef ENABLE_SELECT_DISPATCH - -static struct { - pthread_mutex_t mutex; - pthread_cond_t cond; - int submitted; - int nready; - int maxfd; - fd_set *rset; - fd_set *wset; - fd_set *eset; - struct timeval *timeout; - struct timeval submit_time; -} fd_extra = { - PTHREAD_MUTEX_INITIALIZER, - PTHREAD_COND_INITIALIZER, - 0, 0, 0, - NULL, NULL, NULL, NULL, -}; - -extern int liblustre_wait_event(int timeout); -extern procbridge __global_procbridge; - -/* - * this will intercept syscall select() of user apps - * such as MPI libs. 
- */ -int select(int n, fd_set *rset, fd_set *wset, fd_set *eset, - struct timeval *timeout) -{ - LASSERT(fd_extra.submitted == 0); - - fd_extra.nready = 0; - fd_extra.maxfd = n; - fd_extra.rset = rset; - fd_extra.wset = wset; - fd_extra.eset = eset; - fd_extra.timeout = timeout; - - liblustre_wait_event(0); - pthread_mutex_lock(&fd_extra.mutex); - gettimeofday(&fd_extra.submit_time, NULL); - fd_extra.submitted = 1; - LASSERT(__global_procbridge); - procbridge_wakeup_nal(__global_procbridge); - -again: - if (fd_extra.submitted) - pthread_cond_wait(&fd_extra.cond, &fd_extra.mutex); - pthread_mutex_unlock(&fd_extra.mutex); - - liblustre_wait_event(0); - - pthread_mutex_lock(&fd_extra.mutex); - if (fd_extra.submitted) - goto again; - pthread_mutex_unlock(&fd_extra.mutex); - - LASSERT(fd_extra.nready >= 0); - LASSERT(fd_extra.submitted == 0); - return fd_extra.nready; -} - -static int merge_fds(int max, fd_set *rset, fd_set *wset, fd_set *eset) -{ - int i; - - LASSERT(rset); - LASSERT(wset); - LASSERT(eset); - - for (i = 0; i < __FD_SETSIZE/__NFDBITS; i++) { - LASSERT(!fd_extra.rset || - !(__FDS_BITS(rset)[i] & __FDS_BITS(fd_extra.rset)[i])); - LASSERT(!fd_extra.wset || - !(__FDS_BITS(wset)[i] & __FDS_BITS(fd_extra.wset)[i])); - LASSERT(!fd_extra.eset || - !(__FDS_BITS(eset)[i] & __FDS_BITS(fd_extra.eset)[i])); - - if (fd_extra.rset && __FDS_BITS(fd_extra.rset)[i]) - __FDS_BITS(rset)[i] |= __FDS_BITS(fd_extra.rset)[i]; - if (fd_extra.wset && __FDS_BITS(fd_extra.wset)[i]) - __FDS_BITS(wset)[i] |= __FDS_BITS(fd_extra.wset)[i]; - if (fd_extra.eset && __FDS_BITS(fd_extra.eset)[i]) - __FDS_BITS(eset)[i] |= __FDS_BITS(fd_extra.eset)[i]; - } - - return (fd_extra.maxfd > max ? fd_extra.maxfd : max); -} - -static inline -int timeval_ge(struct timeval *tv1, struct timeval *tv2) -{ - LASSERT(tv1 && tv2); - return ((tv1->tv_sec - tv2->tv_sec) * 1000000 + - (tv1->tv_usec - tv2->tv_usec) >= 0); -} - -/* - * choose the most recent timeout value - */ -static struct timeval *choose_timeout(struct timeval *tv1, - struct timeval *tv2) -{ - if (!tv1) - return tv2; - else if (!tv2) - return tv1; - - if (timeval_ge(tv1, tv2)) - return tv2; - else - return tv1; -} - -/* Function: select_timer_block - * Arguments: until: an absolute time when the select should return - * - * This function dispatches the various file descriptors' handler - * functions, if the kernel indicates there is io available. 
- */ -void select_timer_block(when until) -{ - fd_set fds[3]; - struct timeval timeout; - struct timeval *timeout_pointer, *select_timeout; - int max, nready, nexec; - int fd_handling; - -again: - if (until) { - when interval; - - interval = until - now(); - timeout.tv_sec = (interval >> 32); - timeout.tv_usec = ((interval << 32) / 1000000) >> 32; - timeout_pointer = &timeout; - } else - timeout_pointer = NULL; - - fd_handling = 0; - max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]); - select_timeout = timeout_pointer; - - pthread_mutex_lock(&fd_extra.mutex); - fd_handling = fd_extra.submitted; - pthread_mutex_unlock(&fd_extra.mutex); - if (fd_handling) { - max = merge_fds(max, &fds[0], &fds[1], &fds[2]); - select_timeout = choose_timeout(timeout_pointer, fd_extra.timeout); - } - - /* XXX only compile for linux */ -#if __WORDSIZE == 64 - nready = syscall(SYS_select, max, &fds[0], &fds[1], &fds[2], - select_timeout); -#else - nready = syscall(SYS__newselect, max, &fds[0], &fds[1], &fds[2], - select_timeout); -#endif - if (nready < 0) { - CERROR("select return err %d, errno %d\n", nready, errno); - return; - } - - if (nready) { - nexec = execute_callbacks(&fds[0], &fds[1], &fds[2]); - nready -= nexec; - } else - nexec = 0; - - /* even both nready & nexec are 0, we still need try to wakeup - * upper thread since it may have timed out - */ - if (fd_handling) { - LASSERT(nready >= 0); - - pthread_mutex_lock(&fd_extra.mutex); - if (nready) { - if (fd_extra.rset) - *fd_extra.rset = fds[0]; - if (fd_extra.wset) - *fd_extra.wset = fds[1]; - if (fd_extra.eset) - *fd_extra.eset = fds[2]; - fd_extra.nready = nready; - fd_extra.submitted = 0; - } else { - struct timeval t; - - fd_extra.nready = 0; - if (fd_extra.timeout) { - gettimeofday(&t, NULL); - if (timeval_ge(&t, &fd_extra.submit_time)) - fd_extra.submitted = 0; - } - } - - pthread_cond_signal(&fd_extra.cond); - pthread_mutex_unlock(&fd_extra.mutex); - } - - /* haven't found portals event, go back to loop if time - * is not expired */ - if (!nexec) { - if (timeout_pointer == NULL || now() >= until) - goto again; - } -} - -#else /* !ENABLE_SELECT_DISPATCH */ - -/* Function: select_timer_block - * Arguments: until: an absolute time when the select should return - * - * This function dispatches the various file descriptors' handler - * functions, if the kernel indicates there is io available. 
- */ -void select_timer_block(when until) -{ - fd_set fds[3]; - struct timeval timeout; - struct timeval *timeout_pointer; - int max, nready; - -again: - if (until) { - when interval; - interval = until - now(); - timeout.tv_sec = (interval >> 32); - timeout.tv_usec = ((interval << 32) / 1000000) >> 32; - timeout_pointer = &timeout; - } else - timeout_pointer = NULL; - - max = prepare_fd_sets(&fds[0], &fds[1], &fds[2]); - - nready = select(max, &fds[0], &fds[1], &fds[2], timeout_pointer); - if (nready > 0) - execute_callbacks(&fds[0], &fds[1], &fds[2]); -} -#endif /* ENABLE_SELECT_DISPATCH */ - -/* Function: init_unix_timer() - * is called to initialize the library - */ -void init_unix_timer() -{ - io_handlers=0; - gettimeofday(&beginning_of_epoch, 0); - initialize_timer(select_timer_block); -} diff --git a/lnet/ulnds/socklnd/.cvsignore b/lnet/ulnds/socklnd/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lnet/ulnds/socklnd/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git a/lnet/ulnds/socklnd/Makefile.am b/lnet/ulnds/socklnd/Makefile.am index 3437d39..f970be9 100644 --- a/lnet/ulnds/socklnd/Makefile.am +++ b/lnet/ulnds/socklnd/Makefile.am @@ -1,10 +1,13 @@ if LIBLUSTRE -if !CRAY_PORTALS -noinst_LIBRARIES = libtcpnal.a +if BUILD_USOCKLND +noinst_LIBRARIES = libsocklnd.a endif endif -noinst_HEADERS = pqtimer.h dispatch.h table.h timer.h connection.h ipmap.h bridge.h procbridge.h -libtcpnal_a_SOURCES = debug.c pqtimer.c select.c table.c pqtimer.h dispatch.h table.h timer.h address.c procapi.c proclib.c connection.c tcpnal.c connection.h -libtcpnal_a_CPPFLAGS = $(LLCPPFLAGS) -libtcpnal_a_CFLAGS = $(LLCFLAGS) +noinst_HEADERS = pqtimer.h dispatch.h table.h timer.h \ + connection.h bridge.h procbridge.h +libsocklnd_a_SOURCES = pqtimer.c select.c table.c pqtimer.h \ + dispatch.h table.h timer.h procapi.c proclib.c \ + connection.c tcplnd.c connection.h +libsocklnd_a_CPPFLAGS = $(LLCPPFLAGS) +libsocklnd_a_CFLAGS = $(LLCFLAGS) diff --git a/lnet/ulnds/socklnd/address.c b/lnet/ulnds/socklnd/address.c deleted file mode 100644 index 07b4249..0000000 --- a/lnet/ulnds/socklnd/address.c +++ /dev/null @@ -1,147 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* address.c: - * this file provides functions to aquire the IP address of the node - * and translate them into a NID/PID pair which supports a static - * mapping of virtual nodes into the port range of an IP socket. 
-*/ - -#define DEBUG_SUBSYSTEM S_NAL - -#include -#include -#include -#include -#include -#include -#include - - -/* Function: get_node_id - * Returns: a 32 bit id for this node, actually a big-endian IP address - * - * get_node_id() determines the host name and uses the resolver to - * find out its ip address. This is fairly fragile and inflexible, but - * explicitly asking about interfaces and their addresses is very - * complicated and nonportable. - */ -static unsigned int get_node_id(void) -{ - char buffer[255]; - unsigned int x; - struct hostent *he; - char * host_envp; - - if (!(host_envp = getenv("PTL_HOSTID"))) - { - gethostname(buffer,sizeof(buffer)); - he=gethostbyname(buffer); - if (he) - x=*(unsigned int *)he->h_addr_list[0]; - else - x = 0; - return(ntohl(x)); - } - else - { - if (host_envp[1] != 'x') - { - int a, b, c, d; - sscanf(host_envp, "%d.%d.%d.%d", &a, &b, &c, &d); - return ((a<<24) | (b<<16) | (c<<8) | d); - } - else - { - long long hostid = strtoll(host_envp, 0, 0); - return((unsigned int) hostid); - } - } -} - - -/* Function: set_address - * Arugments: t: a procnal structure to populate with the request - * - * set_address performs the bit manipulations to set the nid, pid, and - * iptop8 fields of the procnal structures. - * - * TODO: fix pidrequest to try to do dynamic binding if PTL_ID_ANY - */ - -#ifdef DIRECT_IP_MODE -void set_address(bridge t,ptl_pid_t pidrequest) -{ - int port; - if (pidrequest==(unsigned short)PTL_PID_ANY) port = 0; - else port=pidrequest; - t->lib_nal->libnal_ni.ni_pid.nid=get_node_id(); - t->lib_nal->libnal_ni.ni_pid.pid=port; -} -#else - -void set_address(bridge t,ptl_pid_t pidrequest) -{ - int virtnode, in_addr, port; - ptl_pid_t pid; - - /* get and remember my node id*/ - if (!getenv("PTL_VIRTNODE")) - virtnode = 0; - else - { - int maxvnode = PNAL_VNODE_MASK - (PNAL_BASE_PORT - >> PNAL_VNODE_SHIFT); - virtnode = atoi(getenv("PTL_VIRTNODE")); - if (virtnode > maxvnode) - { - fprintf(stderr, "PTL_VIRTNODE of %d is too large - max %d\n", - virtnode, maxvnode); - return; - } - } - - in_addr = get_node_id(); - - t->iptop8 = in_addr >> PNAL_HOSTID_SHIFT;/* for making new connections */ - t->lib_nal->libnal_ni.ni_pid.nid = ((in_addr & PNAL_HOSTID_MASK) - << PNAL_VNODE_SHIFT) - + virtnode; - pid=pidrequest; - /* TODO: Support of pid PTL_ID_ANY with virtual nodes needs more work. 
*/ -#ifdef notyet - if (pid==(unsigned short)PTL_PID_ANY) port = 0; -#endif - if (pid==(unsigned short)PTL_PID_ANY) - { - fprintf(stderr, "portal pid PTL_ID_ANY is not currently supported\n"); - return; - } - else if (pid > PNAL_PID_MASK) - { - fprintf(stderr, "portal pid of %d is too large - max %d\n", - pid, PNAL_PID_MASK); - return; - } - else port = ((virtnode << PNAL_VNODE_SHIFT) + pid) + PNAL_BASE_PORT; - t->lib_nal->libnal_ni.ni_pid.pid=pid; -} -#endif diff --git a/lnet/ulnds/socklnd/bridge.h b/lnet/ulnds/socklnd/bridge.h index d2f0f2c..a46cb13 100644 --- a/lnet/ulnds/socklnd/bridge.h +++ b/lnet/ulnds/socklnd/bridge.h @@ -9,26 +9,15 @@ #ifndef TCPNAL_PROCBRIDGE_H #define TCPNAL_PROCBRIDGE_H -#include -#include - -#define PTL_IFACE_TCP 1 -#define PTL_IFACE_ER 2 -#define PTL_IFACE_SS 3 -#define PTL_IFACE_MAX 4 +#include typedef struct bridge { int alive; - lib_nal_t *lib_nal; + lnet_ni_t *b_ni; void *lower; void *local; - void (*shutdown)(struct bridge *); /* this doesn't really belong here */ unsigned char iptop8; } *bridge; - -typedef int (*nal_initialize)(bridge); -extern nal_initialize nal_table[PTL_IFACE_MAX]; - #endif diff --git a/lnet/ulnds/socklnd/connection.c b/lnet/ulnds/socklnd/connection.c index 49cca96..51aa535 100644 --- a/lnet/ulnds/socklnd/connection.c +++ b/lnet/ulnds/socklnd/connection.c @@ -22,8 +22,7 @@ /* connection.c: This file provides a simple stateful connection manager which builds tcp connections on demand and leaves them open for - future use. It also provides the machinery to allow peers - to connect to it + future use. */ #include @@ -38,9 +37,9 @@ #include #include #include -#include -#include -#include +#include +#include +#include #include #include #include @@ -49,45 +48,84 @@ #include #endif -/* global variable: acceptor port */ -unsigned short tcpnal_acceptor_port = 988; +/* tunables (via environment) */ +int tcpnal_acceptor_port = 988; +int tcpnal_buffer_size = 0; +int tcpnal_nagle = 0; +int +tcpnal_env_param (char *name, int *val) +{ + char *env = getenv(name); + int n; + + if (env == NULL) + return 1; + + n = strlen(env); /* scanf may not assign on EOS */ + if (sscanf(env, "%i%n", val, &n) >= 1 && n == strlen(env)) { + CDEBUG(D_INFO, "Environment variable %s set to %d\n", + name, *val); + return 1; + } + + CERROR("Can't parse environment variable '%s=%s'\n", + name, env); + return 0; +} + +int +tcpnal_set_global_params (void) +{ + return tcpnal_env_param("TCPNAL_PORT", + &tcpnal_acceptor_port) && + tcpnal_env_param("TCPLND_PORT", + &tcpnal_acceptor_port) && + tcpnal_env_param("TCPNAL_BUFFER_SIZE", + &tcpnal_buffer_size) && + tcpnal_env_param("TCPLND_BUFFER_SIZE", + &tcpnal_buffer_size) && + tcpnal_env_param("TCPNAL_NAGLE", + &tcpnal_nagle) && + tcpnal_env_param("TCPLND_NAGLE", + &tcpnal_nagle); +} /* Function: compare_connection * Arguments: connection c: a connection in the hash table - * ptl_process_id_t: an id to verify agains + * lnet_process_id_t: an id to verify agains * Returns: 1 if the connection is the one requested, 0 otherwise * * compare_connection() tests for collisions in the hash table */ static int compare_connection(void *arg1, void *arg2) { - connection c = arg1; - unsigned int * id = arg2; -#if 0 - return((c->ip==id[0]) && (c->port==id[1])); -#else - /* CFS specific hacking */ - return (c->ip == id[0]); -#endif -} + connection c = arg1; + lnet_nid_t *nid = arg2; + return (c->peer_nid == *nid); +} /* Function: connection_key - * Arguments: ptl_process_id_t id: an id to hash + * Arguments: lnet_process_id_t id: an id to hash * 
Returns: a not-particularily-well-distributed hash * of the id */ -static unsigned int connection_key(unsigned int *id) +static unsigned int connection_key(void *arg) { -#if 0 - return(id[0]^id[1]); -#else - /* CFS specific hacking */ - return (unsigned int) id[0]; -#endif + lnet_nid_t *nid = arg; + + return (unsigned int)(*nid); } +void +close_connection(void *arg) +{ + connection c = arg; + + close(c->fd); + free(c); +} /* Function: remove_connection * Arguments: c: the connection to remove @@ -95,13 +133,9 @@ static unsigned int connection_key(unsigned int *id) void remove_connection(void *arg) { connection c = arg; - unsigned int id[2]; - id[0]=c->ip; - id[1]=c->port; - hash_table_remove(c->m->connections,id); - close(c->fd); - free(c); + hash_table_remove(c->m->connections,&c->peer_nid); + close_connection(c); } @@ -149,172 +183,179 @@ static int connection_input(void *d) } -/* Function: allocate_connection - * Arguments: t: tcpnal the allocation is occuring in the context of - * dest: portal endpoint address for this connection - * fd: open file descriptor for the socket - * Returns: an allocated connection structure - * - * just encompasses the action common to active and passive - * connections of allocation and placement in the global table - */ -static connection allocate_connection(manager m, - unsigned int ip, - unsigned short port, - int fd) +static connection +allocate_connection(manager m, + lnet_nid_t nid, + int fd) { connection c=malloc(sizeof(struct connection)); - unsigned int id[2]; + c->m=m; c->fd=fd; - c->ip=ip; - c->port=port; - id[0]=ip; - id[1]=port; + c->peer_nid = nid; + register_io_handler(fd,READ_HANDLER,connection_input,c); - hash_table_insert(m->connections,c,id); + hash_table_insert(m->connections,c,&nid); return(c); } - -/* Function: new_connection - * Arguments: t: opaque argument holding the tcpname - * Returns: 1 in order to reregister for new connection requests - * - * called when the bound service socket recieves - * a new connection request, it always accepts and - * installs a new connection - */ -static int new_connection(void *z) +int +tcpnal_write(lnet_nid_t nid, int sockfd, void *buffer, int nob) { - manager m=z; - struct sockaddr_in s; - int len=sizeof(struct sockaddr_in); - int fd=accept(m->bound,(struct sockaddr *)&s,&len); - unsigned int nid=*((unsigned int *)&s.sin_addr); - /* cfs specific hack */ - //unsigned short pid=s.sin_port; - pthread_mutex_lock(&m->conn_lock); - allocate_connection(m,htonl(nid),0/*pid*/,fd); - pthread_mutex_unlock(&m->conn_lock); - return(1); + int rc = syscall(SYS_write, sockfd, buffer, nob); + + /* NB called on an 'empty' socket with huge buffering! 
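+ * A short or failed write is reported as an error; nothing is retried.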
+         */
+        if (rc == nob)
+                return 0;
+
+        if (rc < 0) {
+                CERROR("Failed to send to %s: %s\n",
+                       libcfs_nid2str(nid), strerror(errno));
+                return -1;
+        }
+
+        CERROR("Short send to %s: %d/%d\n",
+               libcfs_nid2str(nid), rc, nob);
+        return -1;
 }
 
-extern ptl_nid_t tcpnal_mynid;
-
 int
-tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation)
+tcpnal_read(lnet_nid_t nid, int sockfd, void *buffer, int nob)
 {
-    int rc;
-    int nob;
-    ptl_hdr_t hdr;
-    ptl_magicversion_t *hmv = (ptl_magicversion_t *)&hdr.dest_nid;
+        int rc;
+
+        while (nob > 0) {
+                rc = syscall(SYS_read, sockfd, buffer, nob);
+
+                if (rc == 0) {
+                        CERROR("Unexpected EOF from %s\n",
+                               libcfs_nid2str(nid));
+                        return -1;
+                }
+
+                if (rc < 0) {
+                        CERROR("Failed to receive from %s: %s\n",
+                               libcfs_nid2str(nid), strerror(errno));
+                        return -1;
+                }
+
+                nob -= rc;
+        }
+        return 0;
+}
 
-    LASSERT (sizeof (*hmv) == sizeof (hdr.dest_nid));
+int
+tcpnal_hello (int sockfd, lnet_nid_t nid)
+{
+        struct timeval tv;
+        __u64 incarnation;
+        int   rc;
+        int   nob;
+        lnet_acceptor_connreq_t cr;
+        lnet_hdr_t hdr;
+        lnet_magicversion_t hmv;
+
+        gettimeofday(&tv, NULL);
+        incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
+        memset(&cr, 0, sizeof(cr));
+        cr.acr_magic   = LNET_PROTO_ACCEPTOR_MAGIC;
+        cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION;
+        cr.acr_nid     = nid;
+
+        /* hmv is initialised and copied separately into hdr; the compiler
+         * "optimize"d this wrongly (likely confused by the pointer aliasing
+         * of hmv and hdr) when it was done in-place. */
+        hmv.magic         = cpu_to_le32(LNET_PROTO_TCP_MAGIC);
+        hmv.version_major = cpu_to_le32(LNET_PROTO_TCP_VERSION_MAJOR);
+        hmv.version_minor = cpu_to_le32(LNET_PROTO_TCP_VERSION_MINOR);
 
    memset (&hdr, 0, sizeof (hdr));
-    hmv->magic = cpu_to_le32(PORTALS_PROTO_MAGIC);
-    hmv->version_major = cpu_to_le32(PORTALS_PROTO_VERSION_MAJOR);
-    hmv->version_minor = cpu_to_le32(PORTALS_PROTO_VERSION_MINOR);
-
-    hdr.src_nid = cpu_to_le64(tcpnal_mynid);
-    hdr.type = cpu_to_le32(PTL_MSG_HELLO);
-    hdr.msg.hello.type = cpu_to_le32(type);
+        CLASSERT (sizeof (hmv) == sizeof (hdr.dest_nid));
+        memcpy(&hdr.dest_nid, &hmv, sizeof(hmv));
+
+        /* hdr.src_nid/src_pid are ignored at dest */
+
+        hdr.type = cpu_to_le32(LNET_MSG_HELLO);
+        hdr.msg.hello.type = cpu_to_le32(SOCKLND_CONN_ANY);
    hdr.msg.hello.incarnation = cpu_to_le64(incarnation);
 
    /* I don't send any interface info */
 
-    /* Assume sufficient socket buffering for this message */
-    rc = syscall(SYS_write, sockfd, &hdr, sizeof(hdr));
-    if (rc <= 0) {
-        CERROR ("Error %d sending HELLO to "LPX64"\n", rc, *nid);
-        return (rc);
-    }
+        /* Assume sufficient socket buffering for these messages...
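+         * The exchange below, in order: write the acceptor connreq
+         * (magic, version, target NID), write our HELLO hdr with the
+         * TCP protocol magic/version packed into hdr.dest_nid, then
+         * read back the peer's magic/version and the rest of its
+         * HELLO hdr, checking its src_nid against the NID we dialled.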
*/ + rc = tcpnal_write(nid, sockfd, &cr, sizeof(cr)); + if (rc != 0) + return -1; - rc = syscall(SYS_read, sockfd, hmv, sizeof(*hmv)); - if (rc <= 0) { - CERROR ("Error %d reading HELLO from "LPX64"\n", rc, *nid); - return (rc); - } + rc = tcpnal_write(nid, sockfd, &hdr, sizeof(hdr)); + if (rc != 0) + return -1; + + rc = tcpnal_read(nid, sockfd, &hmv, sizeof(hmv)); + if (rc != 0) + return -1; - if (hmv->magic != le32_to_cpu(PORTALS_PROTO_MAGIC)) { - CERROR ("Bad magic %#08x (%#08x expected) from "LPX64"\n", - cpu_to_le32(hmv->magic), PORTALS_PROTO_MAGIC, *nid); - return (-EPROTO); + if (hmv.magic != le32_to_cpu(LNET_PROTO_TCP_MAGIC)) { + CERROR ("Bad magic %#08x (%#08x expected) from %s\n", + cpu_to_le32(hmv.magic), LNET_PROTO_TCP_MAGIC, + libcfs_nid2str(nid)); + return -1; } - if (hmv->version_major != cpu_to_le16 (PORTALS_PROTO_VERSION_MAJOR) || - hmv->version_minor != cpu_to_le16 (PORTALS_PROTO_VERSION_MINOR)) { + if (hmv.version_major != cpu_to_le16 (LNET_PROTO_TCP_VERSION_MAJOR) || + hmv.version_minor != cpu_to_le16 (LNET_PROTO_TCP_VERSION_MINOR)) { CERROR ("Incompatible protocol version %d.%d (%d.%d expected)" - " from "LPX64"\n", - le16_to_cpu (hmv->version_major), - le16_to_cpu (hmv->version_minor), - PORTALS_PROTO_VERSION_MAJOR, - PORTALS_PROTO_VERSION_MINOR, - *nid); - return (-EPROTO); + " from %s\n", + le16_to_cpu (hmv.version_major), + le16_to_cpu (hmv.version_minor), + LNET_PROTO_TCP_VERSION_MAJOR, + LNET_PROTO_TCP_VERSION_MINOR, + libcfs_nid2str(nid)); + return -1; } -#if (PORTALS_PROTO_VERSION_MAJOR != 1) +#if (LNET_PROTO_TCP_VERSION_MAJOR != 1) # error "This code only understands protocol version 1.x" #endif /* version 1 sends magic/version as the dest_nid of a 'hello' header, * so read the rest of it in now... */ - rc = syscall(SYS_read, sockfd, hmv + 1, sizeof(hdr) - sizeof(*hmv)); - if (rc <= 0) { - CERROR ("Error %d reading rest of HELLO hdr from "LPX64"\n", - rc, *nid); - return (rc); - } + rc = tcpnal_read(nid, sockfd, ((char *)&hdr) + sizeof (hmv), + sizeof(hdr) - sizeof(hmv)); + if (rc != 0) + return -1; /* ...and check we got what we expected */ - if (hdr.type != cpu_to_le32 (PTL_MSG_HELLO)) { + if (hdr.type != cpu_to_le32 (LNET_MSG_HELLO)) { CERROR ("Expecting a HELLO hdr " - " but got type %d with %d payload from "LPX64"\n", + " but got type %d with %d payload from %s\n", le32_to_cpu (hdr.type), - le32_to_cpu (hdr.payload_length), *nid); - return (-EPROTO); + le32_to_cpu (hdr.payload_length), libcfs_nid2str(nid)); + return -1; } - if (le64_to_cpu(hdr.src_nid) == PTL_NID_ANY) { - CERROR("Expecting a HELLO hdr with a NID, but got PTL_NID_ANY\n"); - return (-EPROTO); + if (le64_to_cpu(hdr.src_nid) == LNET_NID_ANY) { + CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY\n"); + return -1; } - if (*nid == PTL_NID_ANY) { /* don't know peer's nid yet */ - *nid = le64_to_cpu(hdr.src_nid); - } else if (*nid != le64_to_cpu (hdr.src_nid)) { - CERROR ("Connected to nid "LPX64", but expecting "LPX64"\n", - le64_to_cpu (hdr.src_nid), *nid); - return (-EPROTO); + if (nid != le64_to_cpu (hdr.src_nid)) { + CERROR ("Connected to %s, but expecting %s\n", + libcfs_nid2str(le64_to_cpu (hdr.src_nid)), + libcfs_nid2str(nid)); + return -1; } /* Ignore any interface info in the payload */ nob = le32_to_cpu(hdr.payload_length); - if (nob > getpagesize()) { - CERROR("Unexpected HELLO payload %d from "LPX64"\n", - nob, *nid); - return (-EPROTO); - } - if (nob > 0) { - char *space = (char *)malloc(nob); - - if (space == NULL) { - CERROR("Can't allocate scratch buffer %d\n", nob); 
- return (-ENOMEM); - } - - rc = syscall(SYS_read, sockfd, space, nob); - if (rc <= 0) { - CERROR("Error %d skipping HELLO payload from " - LPX64"\n", rc, *nid); - return (rc); - } + if (nob != 0) { + CERROR("Unexpected HELLO payload %d from %s\n", + nob, libcfs_nid2str(nid)); + return -1; } - return (0); + return 0; } /* Function: force_tcp_connection @@ -323,44 +364,81 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation) * Returns: an allocated connection structure, either * a pre-existing one, or a new connection */ -connection force_tcp_connection(manager m, - unsigned int ip, - unsigned short port, +connection force_tcp_connection(manager m, + lnet_nid_t nid, procbridge pb) { - connection conn; + unsigned int ip = LNET_NIDADDR(nid); + connection conn; struct sockaddr_in addr; struct sockaddr_in locaddr; - unsigned int id[2]; - struct timeval tv; - __u64 incarnation; - - int fd; - int option; - int rc; - int rport; - ptl_nid_t peernid = PTL_NID_ANY; - - port = tcpnal_acceptor_port; - - id[0] = ip; - id[1] = port; + int fd; + int option; + int rc; + int sz; pthread_mutex_lock(&m->conn_lock); - conn = hash_table_find(m->connections, id); + conn = hash_table_find(m->connections, &nid); if (conn) goto out; memset(&addr, 0, sizeof(addr)); addr.sin_family = AF_INET; addr.sin_addr.s_addr = htonl(ip); - addr.sin_port = htons(port); + addr.sin_port = htons(tcpnal_acceptor_port); memset(&locaddr, 0, sizeof(locaddr)); locaddr.sin_family = AF_INET; locaddr.sin_addr.s_addr = INADDR_ANY; + locaddr.sin_port = htons(m->port); + +#if 1 /* tcpnal connects from a non-privileged port */ + fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd < 0) { + perror("tcpnal socket failed"); + goto out; + } + + option = 1; + rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, + &option, sizeof(option)); + if (rc != 0) { + perror ("Can't set SO_REUSEADDR for socket"); + close(fd); + goto out; + } + + if (m->port != 0) { + /* Bind all subsequent connections to the same port */ + rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr)); + if (rc != 0) { + perror("Error binding port"); + close(fd); + goto out; + } + } + + rc = connect(fd, (struct sockaddr *)&addr, + sizeof(struct sockaddr_in)); + if (rc != 0) { + perror("Error connecting to remote host"); + close(fd); + goto out; + } + + sz = sizeof(locaddr); + rc = getsockname(fd, (struct sockaddr *)&locaddr, &sz); + if (rc != 0) { + perror ("Error on getsockname"); + close(fd); + goto out; + } + if (m->port == 0) + m->port = ntohs(locaddr.sin_port); + +#else for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) { fd = socket(AF_INET, SOCK_STREAM, 0); if (fd < 0) { @@ -401,24 +479,22 @@ connection force_tcp_connection(manager m, fprintf(stderr, "Out of ports trying to bind to a reserved port\n"); goto out; } +#endif -#if 1 - option = 1; + option = tcpnal_nagle ? 
0 : 1; setsockopt(fd, SOL_TCP, TCP_NODELAY, &option, sizeof(option)); - option = 1<<20; - setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option)); - option = 1<<20; - setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option)); -#endif - - gettimeofday(&tv, NULL); - incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; - + option = tcpnal_buffer_size; + if (option != 0) { + setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option)); + option = tcpnal_buffer_size; + setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option)); + } + /* say hello */ - if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation)) - exit(-1); + if (tcpnal_hello(fd, nid)) + goto out; - conn = allocate_connection(m, ip, port, fd); + conn = allocate_connection(m, nid, fd); /* let nal thread know this event right away */ if (conn) @@ -430,6 +506,30 @@ out: } +#if 0 /* we don't accept connections */ +/* Function: new_connection + * Arguments: t: opaque argument holding the tcpname + * Returns: 1 in order to reregister for new connection requests + * + * called when the bound service socket recieves + * a new connection request, it always accepts and + * installs a new connection + */ +static int new_connection(void *z) +{ + manager m=z; + struct sockaddr_in s; + int len=sizeof(struct sockaddr_in); + int fd=accept(m->bound,(struct sockaddr *)&s,&len); + unsigned int nid=*((unsigned int *)&s.sin_addr); + /* cfs specific hack */ + //unsigned short pid=s.sin_port; + pthread_mutex_lock(&m->conn_lock); + allocate_connection(m,htonl(nid),0/*pid*/,fd); + pthread_mutex_unlock(&m->conn_lock); + return(1); +} + /* Function: bind_socket * Arguments: t: the nal state for this interface * port: the port to attempt to bind to @@ -455,7 +555,7 @@ static int bind_socket(manager m,unsigned short port) addr.sin_port = htons(port); if (bind(m->bound,(struct sockaddr *)&addr,alen)<0){ - fprintf(stderr, "tcpnal bind: %s port %u\n", strerror(errno), port); + perror ("tcpnal bind"); return(0); } @@ -467,6 +567,7 @@ static int bind_socket(manager m,unsigned short port) m->port=addr.sin_port; return(1); } +#endif /* Function: shutdown_connections @@ -476,32 +577,37 @@ static int bind_socket(manager m,unsigned short port) */ void shutdown_connections(manager m) { - close(m->bound); - remove_io_handler(m->bound_handler); - hash_destroy_table(m->connections,remove_connection); - free(m); +#if 0 + /* we don't accept connections */ + close(m->bound); + remove_io_handler(m->bound_handler); +#endif + hash_destroy_table(m->connections,close_connection); + free(m); } /* Function: init_connections * Arguments: t: the nal state for this interface - * port: the port to attempt to bind to * Returns: a newly allocated manager structure, or * zero if the fixed port could not be bound */ -manager init_connections(unsigned short pid, - int (*input)(void *, void *), - void *a) +manager init_connections(int (*input)(void *, void *), void *a) { manager m = (manager)malloc(sizeof(struct manager)); + m->connections = hash_create_table(compare_connection,connection_key); m->handler = input; m->handler_arg = a; + m->port = 0; /* set on first connection */ pthread_mutex_init(&m->conn_lock, 0); + return m; +#if 0 if (bind_socket(m,pid)) return(m); free(m); return(0); +#endif } diff --git a/lnet/ulnds/socklnd/connection.h b/lnet/ulnds/socklnd/connection.h index 343ffa6..0c4718e 100644 --- a/lnet/ulnds/socklnd/connection.h +++ b/lnet/ulnds/socklnd/connection.h @@ -10,26 +10,26 @@ #include typedef struct manager { - table connections; + table 
connections; pthread_mutex_t conn_lock; /* protect connections table */ - int bound; - io_handler bound_handler; - int (*handler)(void *, void *); - void *handler_arg; - unsigned short port; +#if 0 /* we don't accept connections */ + int bound; + io_handler bound_handler; +#endif + int (*handler)(void *, void *); + void *handler_arg; + int port; } *manager; typedef struct connection { - unsigned int ip; - unsigned short port; - int fd; - manager m; + lnet_nid_t peer_nid; + int fd; + manager m; } *connection; -connection force_tcp_connection(manager m, unsigned int ip, unsigned int short, - procbridge pb); -manager init_connections(unsigned short, int (*f)(void *, void *), void *); +connection force_tcp_connection(manager m, lnet_nid_t nid, procbridge pb); +manager init_connections(int (*f)(void *, void *), void *); void remove_connection(void *arg); void shutdown_connections(manager m); int read_connection(connection c, unsigned char *dest, int len); diff --git a/lnet/ulnds/socklnd/debug.c b/lnet/ulnds/socklnd/debug.c deleted file mode 100644 index b82bb2f..0000000 --- a/lnet/ulnds/socklnd/debug.c +++ /dev/null @@ -1,119 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (C) 2002 Cluster File Systems, Inc. - * Author: Phil Schwan - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include - -int smp_processor_id = 1; -char debug_file_path[1024] = "/tmp/lustre-log"; -char debug_file_name[1024]; -FILE *debug_file_fd; - -int portals_do_debug_dumplog(void *arg) -{ - printf("Look in %s\n", debug_file_name); - return 0; -} - - -void portals_debug_print(void) -{ - return; -} - - -void portals_debug_dumplog(void) -{ - printf("Look in %s\n", debug_file_name); - return; -} - - -int portals_debug_init(unsigned long bufsize) -{ - debug_file_fd = stdout; - return 0; -} - -int portals_debug_cleanup(void) -{ - return 0; //close(portals_debug_fd); -} - -int portals_debug_clear_buffer(void) -{ - return 0; -} - -int portals_debug_mark_buffer(char *text) -{ - - fprintf(debug_file_fd, "*******************************************************************************\n"); - fprintf(debug_file_fd, "DEBUG MARKER: %s\n", text); - fprintf(debug_file_fd, "*******************************************************************************\n"); - - return 0; -} - -int portals_debug_copy_to_user(char *buf, unsigned long len) -{ - return 0; -} - -/* FIXME: I'm not very smart; someone smarter should make this better. */ -void -portals_debug_msg (int subsys, int mask, char *file, const char *fn, - const int line, const char *format, ...) 
-{ - va_list ap; - unsigned long flags; - struct timeval tv; - int nob; - - - /* NB since we pass a non-zero sized buffer (at least) on the first - * print, we can be assured that by the end of all the snprinting, - * we _do_ have a terminated buffer, even if our message got truncated. - */ - - gettimeofday(&tv, NULL); - - nob += fprintf(debug_file_fd, - "%02x:%06x:%d:%lu.%06lu ", - subsys >> 24, mask, smp_processor_id, - tv.tv_sec, tv.tv_usec); - - nob += fprintf(debug_file_fd, - "(%s:%d:%s() %d+%ld): ", - file, line, fn, 0, - 8192 - ((unsigned long)&flags & 8191UL)); - - va_start (ap, format); - nob += fprintf(debug_file_fd, format, ap); - va_end (ap); - - -} - diff --git a/lnet/ulnds/socklnd/dispatch.h b/lnet/ulnds/socklnd/dispatch.h index a8f916d9..300f33b 100644 --- a/lnet/ulnds/socklnd/dispatch.h +++ b/lnet/ulnds/socklnd/dispatch.h @@ -41,6 +41,4 @@ when now(void); /* * hacking for CFS internal MPI testing */ -#if !CRAY_PORTALS #define ENABLE_SELECT_DISPATCH -#endif diff --git a/lnet/ulnds/socklnd/ipmap.h b/lnet/ulnds/socklnd/ipmap.h deleted file mode 100644 index 85b1e18..0000000 --- a/lnet/ulnds/socklnd/ipmap.h +++ /dev/null @@ -1,38 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -#define DIRECT_IP_MODE -#ifdef DIRECT_IP_MODE -#define PNAL_NID(in_addr, port) (in_addr) -#define PNAL_PID(pid) (pid) -#define PNAL_IP(in_addr, port) (in_addr) -#define PNAL_PORT(nid, pid) (pid) -#else - -#define PNAL_BASE_PORT 4096 -#define PNAL_HOSTID_SHIFT 24 -#define PNAL_HOSTID_MASK ((1 << PNAL_HOSTID_SHIFT) - 1) -#define PNAL_VNODE_SHIFT 8 -#define PNAL_VNODE_MASK ((1 << PNAL_VNODE_SHIFT) - 1) -#define PNAL_PID_SHIFT 8 -#define PNAL_PID_MASK ((1 << PNAL_PID_SHIFT) - 1) - -#define PNAL_NID(in_addr, port) (((ntohl(in_addr) & PNAL_HOSTID_MASK) \ - << PNAL_VNODE_SHIFT) \ - | (((ntohs(port)-PNAL_BASE_PORT) >>\ - PNAL_PID_SHIFT))) -#define PNAL_PID(port) ((ntohs(port) - PNAL_BASE_PORT) & PNAL_PID_MASK) - -#define PNAL_IP(nid,t) (htonl((((unsigned)(nid))\ - >> PNAL_VNODE_SHIFT)\ - | (t->iptop8 << PNAL_HOSTID_SHIFT))) -#define PNAL_PORT(nid, pid) (htons(((((nid) & PNAL_VNODE_MASK) \ - << PNAL_VNODE_SHIFT) \ - | ((pid) & PNAL_PID_MASK)) \ - + PNAL_BASE_PORT)) -#endif diff --git a/lnet/ulnds/socklnd/procapi.c b/lnet/ulnds/socklnd/procapi.c index 6b471c0..5fd5f46 100644 --- a/lnet/ulnds/socklnd/procapi.c +++ b/lnet/ulnds/socklnd/procapi.c @@ -33,14 +33,22 @@ #include #include #ifndef __CYGWIN__ -#include +# include #endif +#include #include +#include #include #include #include #include +#ifdef HAVE_GETHOSTBYNAME +# include +#endif +#if !HAVE_LIBPTHREAD +# error "This LND requires a multi-threaded runtime" +#endif /* XXX CFS workaround, to give a chance to let nal thread wake up * from waiting in select @@ -60,17 +68,26 @@ void procbridge_wakeup_nal(procbridge p) syscall(SYS_write, p->notifier[0], buf, sizeof(buf)); } +lnd_t the_tcplnd = { + .lnd_type = SOCKLND, + .lnd_startup = procbridge_startup, + .lnd_shutdown = procbridge_shutdown, + .lnd_send = tcpnal_send, + .lnd_recv = tcpnal_recv, + .lnd_notify = tcpnal_notify, +}; +int tcpnal_running; + /* Function: shutdown - * Arguments: nal: a pointer to my top side nal structure - * ni: my network interface index + * Arguments: ni: the instance of me * * cleanup nal state, reclaim the lower side thread and * its state using PTL_FINI codepoint */ -static void 
procbridge_shutdown(nal_t *n)
+void
+procbridge_shutdown(lnet_ni_t *ni)
 {
-    lib_nal_t *nal = n->nal_data;
-    bridge b=(bridge)nal->libnal_data;
+    bridge b=(bridge)ni->ni_data;
    procbridge p=(procbridge)b->local;
 
    p->nal_flags |= NAL_FLAG_STOPPING;
@@ -87,68 +104,51 @@ static void procbridge_shutdown(nal_t *n)
    } while (1);
 
    free(p);
+    tcpnal_running = 0;
 }
 
-
-/* forward decl */
-extern int procbridge_startup (nal_t *, ptl_pid_t,
-                               ptl_ni_limits_t *, ptl_ni_limits_t *);
-
-/* api_nal
- * the interface vector to allow the generic code to access
- * this nal. this is seperate from the library side lib_nal.
- * TODO: should be dyanmically allocated
- */
-nal_t procapi_nal = {
-    nal_data: NULL,
-    nal_ni_init: procbridge_startup,
-    nal_ni_fini: procbridge_shutdown,
-};
-
-ptl_nid_t tcpnal_mynid;
-
 #ifdef ENABLE_SELECT_DISPATCH
 procbridge __global_procbridge = NULL;
 #endif
 
 /* Function: procbridge_startup
 *
- * Arguments: pid: requested process id (port offset)
- *            PTL_ID_ANY not supported.
- *            desired: limits passed from the application
- *            and effectively ignored
- *            actual: limits actually allocated and returned
+ * Arguments: ni: the instance of me
+ *            interfaces: ignored
 *
 * Returns: portals rc
 *
 * initializes the tcp nal. we define unix_failure as an
 * error wrapper to cut down clutter.
 */
-int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid,
-                        ptl_ni_limits_t *requested_limits,
-                        ptl_ni_limits_t *actual_limits)
+int
+procbridge_startup (lnet_ni_t *ni)
 {
-    nal_init_args_t args;
-    procbridge p;
-    bridge b;
-    /* XXX nal_type is purely private to tcpnal here */
-    int nal_type = PTL_IFACE_TCP;/* PTL_IFACE_DEFAULT FIXME hack */
-
-    LASSERT(nal == &procapi_nal);
-
+        procbridge p;
+        bridge b;
+        int rc;
+
+        /* NB The local NID is not assigned. We only ever connect to the socknal,
+         * which assigns the src nid/pid on incoming non-privileged connections
+         * (i.e. us), and we don't accept connections. */
+
+        LASSERT (ni->ni_lnd == &the_tcplnd);
+        LASSERT (!tcpnal_running);              /* only single instance supported */
+        LASSERT (ni->ni_interfaces[0] == NULL); /* explicit interface(s) not supported */
+
+        /* The credit settings here are pretty irrelevant. Userspace tcplnd has no
+         * tx descriptor pool to exhaust and does a blocking send; that's the real
+         * limit on send concurrency.
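+         * (for scale: kernel LNDs size their credit pools from module
+         * parameters; 1000 here just means "effectively unlimited" for
+         * a sender that blocks in write anyway)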
*/ + ni->ni_maxtxcredits = 1000; + ni->ni_peertxcredits = 1000; + init_unix_timer(); b=(bridge)malloc(sizeof(struct bridge)); p=(procbridge)malloc(sizeof(struct procbridge)); b->local=p; - - args.nia_requested_pid = requested_pid; - args.nia_requested_limits = requested_limits; - args.nia_actual_limits = actual_limits; - args.nia_nal_type = nal_type; - args.nia_bridge = b; - args.nia_apinal = nal; + b->b_ni = ni; + ni->ni_data = b; /* init procbridge */ pthread_mutex_init(&p->mutex,0); @@ -158,13 +158,14 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, /* initialize notifier */ if (socketpair(AF_UNIX, SOCK_STREAM, 0, p->notifier)) { perror("socketpair failed"); - return PTL_FAIL; + rc = -errno; + return rc; } if (!register_io_handler(p->notifier[1], READ_HANDLER, procbridge_notifier_handler, p)) { perror("fail to register notifier handler"); - return PTL_FAIL; + return -ENOMEM; } #ifdef ENABLE_SELECT_DISPATCH @@ -172,9 +173,10 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, #endif /* create nal thread */ - if (pthread_create(&p->t, NULL, nal_thread, &args)) { + rc = pthread_create(&p->t, NULL, nal_thread, b); + if (rc != 0) { perror("nal_init: pthread_create"); - return PTL_FAIL; + return -ESRCH; } do { @@ -188,9 +190,9 @@ int procbridge_startup (nal_t *nal, ptl_pid_t requested_pid, } while (1); if (p->nal_flags & NAL_FLAG_STOPPED) - return PTL_FAIL; + return -ENETDOWN; - b->lib_nal->libnal_ni.ni_pid.nid = tcpnal_mynid; + tcpnal_running = 1; - return PTL_OK; + return 0; } diff --git a/lnet/ulnds/socklnd/procbridge.h b/lnet/ulnds/socklnd/procbridge.h index 1f91ced..2dd534b 100644 --- a/lnet/ulnds/socklnd/procbridge.h +++ b/lnet/ulnds/socklnd/procbridge.h @@ -12,7 +12,6 @@ #include #include -#include #define NAL_FLAG_RUNNING 1 @@ -33,24 +32,27 @@ typedef struct procbridge { } *procbridge; typedef struct nal_init_args { - ptl_pid_t nia_requested_pid; - ptl_ni_limits_t *nia_requested_limits; - ptl_ni_limits_t *nia_actual_limits; - int nia_nal_type; + lnet_pid_t nia_requested_pid; bridge nia_bridge; - nal_t *nia_apinal; } nal_init_args_t; extern void *nal_thread(void *); +extern void procbridge_wakeup_nal(procbridge p); + +extern int procbridge_startup (lnet_ni_t *); +extern void procbridge_shutdown (lnet_ni_t *); + +extern void tcpnal_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive); + +extern int tcpnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int tcpnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *cookie, + int delayed, unsigned int niov, + struct iovec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +extern int tcpnal_set_global_params(); -#define PTL_INIT (LIB_MAX_DISPATCH+1) -#define PTL_FINI (LIB_MAX_DISPATCH+2) -#define MAX_ACLS 1 -#define MAX_PTLS 128 -extern void set_address(bridge t,ptl_pid_t pidrequest); -extern void procbridge_wakeup_nal(procbridge p); #endif diff --git a/lnet/ulnds/socklnd/proclib.c b/lnet/ulnds/socklnd/proclib.c index 7ee7c71..01faf05 100644 --- a/lnet/ulnds/socklnd/proclib.c +++ b/lnet/ulnds/socklnd/proclib.c @@ -42,13 +42,8 @@ /* the following functions are stubs to satisfy the nal definition without doing anything particularily useful*/ - -static int nal_dist(lib_nal_t *nal, - ptl_nid_t nid, - unsigned long *dist) -{ - return 0; -} +extern int tcpnal_init(bridge); +extern void tcpnal_shutdown(bridge); static void check_stopping(void *z) { @@ -58,6 +53,8 @@ static void check_stopping(void *z) if ((p->nal_flags & NAL_FLAG_STOPPING) == 0) return; + tcpnal_shutdown(b); + 
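+    /* NB the connections are torn down above, before STOPPED is
+     * advertised below, so procbridge_shutdown() on the api thread
+     * never returns while peer sockets are still live */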
pthread_mutex_lock(&p->mutex); p->nal_flags |= NAL_FLAG_STOPPED; pthread_cond_broadcast(&p->cond); @@ -79,53 +76,27 @@ static void check_stopping(void *z) * We define a limit macro to place a ceiling on limits * for syntactic convenience */ -extern int tcpnal_init(bridge); - -nal_initialize nal_table[PTL_IFACE_MAX]={0,tcpnal_init,0}; void *nal_thread(void *z) { - nal_init_args_t *args = (nal_init_args_t *) z; - bridge b = args->nia_bridge; + bridge b = (bridge) z; procbridge p=b->local; int rc; - ptl_process_id_t process_id; - int nal_type; - b->lib_nal=(lib_nal_t *)malloc(sizeof(lib_nal_t)); - b->lib_nal->libnal_data=b; - b->lib_nal->libnal_map=NULL; - b->lib_nal->libnal_unmap=NULL; - b->lib_nal->libnal_dist=nal_dist; - - nal_type = args->nia_nal_type; - - /* Wierd, but this sets b->lib_nal->libnal_ni.ni_pid.{nid,pid}, which - * lib_init() is about to do from the process_id passed to it...*/ - set_address(b,args->nia_requested_pid); - - process_id = b->lib_nal->libnal_ni.ni_pid; - - if (nal_table[nal_type]) rc=(*nal_table[nal_type])(b); - /* initialize the generic 'library' level code */ - - rc = lib_init(b->lib_nal, args->nia_apinal, - process_id, - args->nia_requested_limits, - args->nia_actual_limits); + rc = tcpnal_init(b); /* * Whatever the initialization returned is passed back to the * user level code for further interpretation. We just exit if * it is non-zero since something went wrong. */ - /* this should perform error checking */ + pthread_mutex_lock(&p->mutex); - p->nal_flags |= (rc != PTL_OK) ? NAL_FLAG_STOPPED : NAL_FLAG_RUNNING; + p->nal_flags |= (rc != 0) ? NAL_FLAG_STOPPED : NAL_FLAG_RUNNING; pthread_cond_broadcast(&p->cond); pthread_mutex_unlock(&p->mutex); - if (rc == PTL_OK) { + if (rc == 0) { /* the thunk function is called each time the timer loop performs an operation and returns to blocking mode. we overload this function to inform the api side that diff --git a/lnet/ulnds/socklnd/select.c b/lnet/ulnds/socklnd/select.c index 49c770f..42c9bc1 100644 --- a/lnet/ulnds/socklnd/select.c +++ b/lnet/ulnds/socklnd/select.c @@ -25,7 +25,7 @@ * io events through the select system call. 
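 * (the handlers registered through register_io_handler(), i.e.
 * connection_input() for each peer socket plus the procbridge
 * notifier pipe written by procbridge_wakeup_nal(), are called
 * from this loop whenever select() reports their fd readable)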
*/ -#define DEBUG_SUBSYSTEM S_NAL +#define DEBUG_SUBSYSTEM S_LND #ifdef sun #include @@ -320,7 +320,7 @@ again: } /* XXX only compile for linux */ -#if __WORDSIZE == 64 +#if (__WORDSIZE == 64) && !defined(__mips64__) nready = syscall(SYS_select, max, &fds[0], &fds[1], &fds[2], select_timeout); #else diff --git a/lnet/ulnds/socklnd/table.c b/lnet/ulnds/socklnd/table.c index 662775a..eb390c4 100644 --- a/lnet/ulnds/socklnd/table.c +++ b/lnet/ulnds/socklnd/table.c @@ -110,7 +110,7 @@ unsigned int key_from_string(char *s) * Returns: a pointer to the new table */ table hash_create_table (int (*compare_function)(void *, void *), - unsigned int (*key_function)(unsigned int *)) + unsigned int (*key_function)(void *)) { table new=(table)malloc(sizeof(struct table)); memset(new, 0, sizeof(struct table)); diff --git a/lnet/ulnds/socklnd/table.h b/lnet/ulnds/socklnd/table.h index 7fab586..0cb9669 100644 --- a/lnet/ulnds/socklnd/table.h +++ b/lnet/ulnds/socklnd/table.h @@ -22,13 +22,14 @@ typedef struct table { int number_of_entries; table_entry *entries; int (*compare_function)(void *, void *); - unsigned int (*key_function)(unsigned int *); + unsigned int (*key_function)(void *); } *table; /* table.c */ unsigned int key_from_int(int i); unsigned int key_from_string(char *s); -table hash_create_table(int (*compare_function)(void *, void *), unsigned int (*key_function)(unsigned int *)); +table hash_create_table(int (*compare_function)(void *, void *), + unsigned int (*key_function)(void *)); void *hash_table_find(table t, void *comparator); void hash_table_insert(table t, void *value, void *comparator); void hash_table_remove(table t, void *comparator); diff --git a/lnet/ulnds/socklnd/tcplnd.c b/lnet/ulnds/socklnd/tcplnd.c index abb6d01..bd73fb2 100644 --- a/lnet/ulnds/socklnd/tcplnd.c +++ b/lnet/ulnds/socklnd/tcplnd.c @@ -33,201 +33,199 @@ #include #include #include -#include -#include +#include #include -#include #include + #ifndef __CYGWIN__ #include #endif -/* Function: tcpnal_send - * Arguments: nal: pointer to my nal control block - * private: unused - * cookie: passed back to the portals library - * hdr: pointer to the portals header - * nid: destination node - * pid: destination process - * data: body of the message - * len: length of the body - * Returns: zero on success - * +void +tcpnal_notify(lnet_ni_t *ni, lnet_nid_t nid, int alive) +{ + bridge b = (bridge)ni->ni_data; + connection c; + + if (!alive) { + LBUG(); + } + + c = force_tcp_connection((manager)b->lower, nid, b->local); + if (c == NULL) + CERROR("Can't create connection to %s\n", + libcfs_nid2str(nid)); +} + +/* * sends a packet to the peer, after insuring that a connection exists */ -ptl_err_t tcpnal_send(lib_nal_t *n, - void *private, - lib_msg_t *cookie, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int niov, - struct iovec *iov, - size_t offset, - size_t len) +int tcpnal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) { - connection c; - bridge b=(bridge)n->libnal_data; - struct iovec tiov[257]; - static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER; - ptl_err_t rc = PTL_OK; - int sysrc; - int total; - int ntiov; - int i; - - if (!(c=force_tcp_connection((manager)b->lower, - PNAL_IP(nid,b), - PNAL_PORT(nid,pid), - b->local))) - return(PTL_FAIL); - - /* TODO: these results should be checked. furthermore, provision - must be made for the SIGPIPE which is delivered when - writing on a tcp socket which has closed underneath - the application. 
there is a linux flag in the sendmsg
-       call which turns off the signally behaviour, but its
-       nonstandard */
-
-    LASSERT (niov <= 256);
-
-    tiov[0].iov_base = hdr;
-    tiov[0].iov_len = sizeof(ptl_hdr_t);
-    ntiov = 1 + lib_extract_iov(256, &tiov[1], niov, iov, offset, len);
-
-    pthread_mutex_lock(&send_lock);
+        lnet_hdr_t *hdr = &lntmsg->msg_hdr;
+        lnet_process_id_t target = lntmsg->msg_target;
+        unsigned int      niov = lntmsg->msg_niov;
+        struct iovec     *iov = lntmsg->msg_iov;
+        unsigned int      offset = lntmsg->msg_offset;
+        unsigned int      len = lntmsg->msg_len;
+
+        connection c;
+        bridge b = (bridge)ni->ni_data;
+        struct iovec tiov[257];
+        static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER;
+        int rc = 0;
+        int sysrc;
+        int total;
+        int ntiov;
+        int i;
+
+        if (!(c = force_tcp_connection((manager)b->lower, target.nid,
+                                       b->local)))
+                return(-EIO);
+
+        /* TODO: these results should be checked. furthermore, provision
+           must be made for the SIGPIPE which is delivered when
+           writing on a tcp socket which has closed underneath
+           the application. there is a linux flag in the sendmsg
+           call which turns off the signalling behaviour, but it's
+           nonstandard */
+
+        LASSERT (niov <= 256);
+        LASSERT (len == 0 || iov != NULL);      /* I don't understand kiovs */
+
+        tiov[0].iov_base = hdr;
+        tiov[0].iov_len = sizeof(lnet_hdr_t);
+        ntiov = 1 + lnet_extract_iov(256, &tiov[1], niov, iov, offset, len);
+
+        pthread_mutex_lock(&send_lock);
 #if 1
-    for (i = total = 0; i < ntiov; i++)
-        total += tiov[i].iov_len;
-
-    sysrc = syscall(SYS_writev, c->fd, tiov, ntiov);
-    if (sysrc != total) {
-        fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
-                 rc, total, errno);
-        rc = PTL_FAIL;
-    }
+        for (i = total = 0; i < ntiov; i++)
+                total += tiov[i].iov_len;
+
+        sysrc = syscall(SYS_writev, c->fd, tiov, ntiov);
+        if (sysrc != total) {
+                fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
+                         sysrc, total, errno);
+                rc = -errno;
+        }
 #else
-    for (i = total = 0; i <= ntiov; i++) {
-        rc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0);
-
-        if (rc != tiov[i].iov_len) {
-            fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
-                     rc, tiov[i].iov_len, errno);
-            rc = PTL_FAIL;
-            break;
-        }
-        total += rc;
-    }
+        for (i = total = 0; i <= ntiov; i++) {
+                rc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0);
+
+                if (rc != tiov[i].iov_len) {
+                        fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n",
+                                 rc, tiov[i].iov_len, errno);
+                        rc = -errno;
+                        break;
+                }
+                total += rc;
+        }
 #endif
 #if 0
-    fprintf (stderr, "sent %s total %d in %d frags\n",
-             hdr->type == PTL_MSG_ACK ? "ACK" :
-             hdr->type == PTL_MSG_PUT ? "PUT" :
-             hdr->type == PTL_MSG_GET ? "GET" :
-             hdr->type == PTL_MSG_REPLY ? "REPLY" :
-             hdr->type == PTL_MSG_HELLO ? "HELLO" : "UNKNOWN",
-             total, niov + 1);
+        fprintf (stderr, "sent %s total %d in %d frags\n",
+                 hdr->type == LNET_MSG_ACK ? "ACK" :
+                 hdr->type == LNET_MSG_PUT ? "PUT" :
+                 hdr->type == LNET_MSG_GET ? "GET" :
+                 hdr->type == LNET_MSG_REPLY ? "REPLY" :
+                 hdr->type == LNET_MSG_HELLO ? "HELLO" : "UNKNOWN",
"HELLO" : "UNKNOWN", + total, niov + 1); #endif - pthread_mutex_unlock(&send_lock); + pthread_mutex_unlock(&send_lock); - if (rc == PTL_OK) { - /* NB the NAL only calls lib_finalize() if it returns PTL_OK - * from cb_send() */ - lib_finalize(n, private, cookie, PTL_OK); - } + if (rc == 0) { + /* NB the NAL only calls lnet_finalize() if it returns 0 + * from cb_send() */ + lnet_finalize(ni, lntmsg, 0); + } - return(rc); + return(rc); } -/* Function: tcpnal_recv - * Arguments: lib_nal_t *nal: pointer to my nal control block - * void *private: connection pointer passed through - * lib_parse() - * lib_msg_t *cookie: passed back to portals library - * user_ptr data: pointer to the destination buffer - * size_t mlen: length of the body - * size_t rlen: length of data in the network - * Returns: zero on success - * - * blocking read of the requested data. must drain out the - * difference of mainpulated and requested lengths from the network - */ -ptl_err_t tcpnal_recv(lib_nal_t *n, - void *private, - lib_msg_t *cookie, - unsigned int niov, - struct iovec *iov, - size_t offset, - size_t mlen, - size_t rlen) - +int tcpnal_recv(lnet_ni_t *ni, + void *private, + lnet_msg_t *cookie, + int delayed, + unsigned int niov, + struct iovec *iov, + lnet_kiov_t *kiov, + unsigned int offset, + unsigned int mlen, + unsigned int rlen) { - struct iovec tiov[256]; - int ntiov; - int i; - - if (!niov) - goto finalize; - - LASSERT(mlen); - LASSERT(rlen); - LASSERT(rlen >= mlen); - - ntiov = lib_extract_iov(256, tiov, niov, iov, offset, mlen); - - /* FIXME - * 1. Is this effecient enough? change to use readv() directly? - * 2. need check return from read_connection() - * - MeiJia - */ - for (i = 0; i < ntiov; i++) - read_connection(private, tiov[i].iov_base, tiov[i].iov_len); + struct iovec tiov[256]; + int ntiov; + int i; + + if (mlen == 0) + goto finalize; + + LASSERT(iov != NULL); /* I don't understand kiovs */ + + ntiov = lnet_extract_iov(256, tiov, niov, iov, offset, mlen); + + /* FIXME + * 1. Is this effecient enough? change to use readv() directly? + * 2. need check return from read_connection() + * - MeiJia + */ + for (i = 0; i < ntiov; i++) + read_connection(private, tiov[i].iov_base, tiov[i].iov_len); finalize: - /* FIXME; we always assume success here... */ - lib_finalize(n, private, cookie, PTL_OK); - - if (mlen!=rlen){ - char *trash=malloc(rlen-mlen); - - /*TODO: check error status*/ - read_connection(private,trash,rlen-mlen); - free(trash); - } - - return(PTL_OK); + /* FIXME; we always assume success here... */ + lnet_finalize(ni, cookie, 0); + + LASSERT(rlen >= mlen); + + if (mlen != rlen){ + char *trash=malloc(rlen - mlen); + + /*TODO: check error status*/ + read_connection(private, trash, rlen - mlen); + free(trash); + } + + return(0); } -/* Function: from_connection: - * Arguments: c: the connection to read from +/* Function: from_connection: + * Arguments: c: the connection to read from * Returns: whether or not to continue reading from this connection, * expressed as a 1 to continue, and a 0 to not * - * from_connection() is called from the select loop when i/o is - * available. It attempts to read the portals header and + * from_connection() is called from the select loop when i/o is + * available. It attempts to read the portals header and * pass it to the generic library for processing. 
*/ static int from_connection(void *a, void *d) { - connection c = d; - bridge b = a; - ptl_hdr_t hdr; - - if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){ - lib_parse(b->lib_nal, &hdr, c); - /*TODO: check error status*/ - return(1); - } - return(0); + connection c = d; + bridge b = a; + lnet_hdr_t hdr; + int rc; + + if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))) { + /* replace dest_nid,pid (socknal sets its own) */ + hdr.dest_nid = cpu_to_le64(b->b_ni->ni_nid); + hdr.dest_pid = cpu_to_le32(the_lnet.ln_pid); + + rc = lnet_parse(b->b_ni, &hdr, c->peer_nid, c, 0); + if (rc < 0) { + CERROR("Error %d from lnet_parse\n", rc); + return 0; + } + + return(1); + } + return(0); } -static void tcpnal_shutdown(bridge b) +void tcpnal_shutdown(bridge b) { - shutdown_connections(b->lower); + shutdown_connections(b->lower); } /* Function: PTL_IFACE_TCP @@ -238,19 +236,14 @@ static void tcpnal_shutdown(bridge b) */ int tcpnal_init(bridge b) { - manager m; - - b->lib_nal->libnal_send=tcpnal_send; - b->lib_nal->libnal_recv=tcpnal_recv; - b->shutdown=tcpnal_shutdown; - - if (!(m=init_connections(PNAL_PORT(b->lib_nal->libnal_ni.ni_pid.nid, - b->lib_nal->libnal_ni.ni_pid.pid), - from_connection,b))){ - /* TODO: this needs to shut down the - newly created junk */ - return(PTL_NAL_FAILED); - } - b->lower=m; - return(PTL_OK); + manager m; + + tcpnal_set_global_params(); + + if (!(m = init_connections(from_connection, b))) { + /* TODO: this needs to shut down the newly created junk */ + return(-ENXIO); + } + b->lower = m; + return(0); } diff --git a/lnet/ulnds/socklnd/utypes.h b/lnet/ulnds/socklnd/utypes.h deleted file mode 100644 index 7eca959..0000000 --- a/lnet/ulnds/socklnd/utypes.h +++ /dev/null @@ -1,12 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -typedef unsigned short uint16; -typedef unsigned long uint32; -typedef unsigned long long uint64; -typedef unsigned char uint8; diff --git a/lnet/ulnds/table.c b/lnet/ulnds/table.c deleted file mode 100644 index 662775a..0000000 --- a/lnet/ulnds/table.c +++ /dev/null @@ -1,264 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2002 Eric Hoffman - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include - - -/* table.c: - * a very simple hash table implementation with paramerterizable - * comparison and key generation functions. 
it does resize - * in order to accomidate more entries, but never collapses - * the table - */ - -static table_entry *table_lookup (table t,void *comparator, - unsigned int k, - int (*compare_function)(void *, void *), - int *success) -{ - unsigned int key=k%t->size; - table_entry *i; - - for (i=&(t->entries[key]);*i;i=&((*i)->next)){ - if (compare_function && ((*i)->key==k)) - if ((*t->compare_function)((*i)->value,comparator)){ - *success=1; - return(i); - } - } - *success=0; - return(&(t->entries[key])); -} - - -static void resize_table(table t, int size) -{ - int old_size=t->size; - table_entry *old_entries=t->entries; - int i; - table_entry j,n; - table_entry *position; - int success; - - t->size=size; - t->entries=(table_entry *)malloc(sizeof(table_entry)*t->size); - memset(t->entries,0,sizeof(table_entry)*t->size); - - for (i=0;inext; - position=table_lookup(t,0,j->key,0,&success); - j->next= *position; - *position=j; - } - free(old_entries); -} - - -/* Function: key_from_int - * Arguments: int i: value to compute the key of - * Returns: the key - */ -unsigned int key_from_int(int i) -{ - return(i); -} - - -/* Function: key_from_string - * Arguments: char *s: the null terminated string - * to compute the key of - * Returns: the key - */ -unsigned int key_from_string(char *s) -{ - unsigned int result=0; - unsigned char *n; - int i; - if (!s) return(1); - for (n=s,i=0;*n;n++,i++) result^=(*n*57)^*n*i; - return(result); -} - - -/* Function: hash_create_table - * Arguments: compare_function: a function to compare - * a table instance with a correlator - * key_function: a function to generate a 32 bit - * hash key from a correlator - * Returns: a pointer to the new table - */ -table hash_create_table (int (*compare_function)(void *, void *), - unsigned int (*key_function)(unsigned int *)) -{ - table new=(table)malloc(sizeof(struct table)); - memset(new, 0, sizeof(struct table)); - - new->compare_function=compare_function; - new->key_function=key_function; - new->number_of_entries=0; - new->size=4; - new->entries=(table_entry *)malloc(sizeof(table_entry)*new->size); - memset(new->entries,0,sizeof(table_entry)*new->size); - return(new); -} - - -/* Function: hash_table_find - * Arguments: t: a table to look in - * comparator: a value to access the table entry - * Returns: the element references to by comparator, or null - */ -void *hash_table_find (table t, void *comparator) -{ - int success; - table_entry* entry=table_lookup(t,comparator, - (*t->key_function)(comparator), - t->compare_function, - &success); - if (success) return((*entry)->value); - return(0); -} - - -/* Function: hash_table_insert - * Arguments: t: a table to insert the object - * value: the object to put in the table - * comparator: the value by which the object - * will be addressed - * Returns: nothing - */ -void hash_table_insert (table t, void *value, void *comparator) -{ - int success; - unsigned int k=(*t->key_function)(comparator); - table_entry *position=table_lookup(t,comparator,k, - t->compare_function,&success); - table_entry entry; - - if (success) { - entry = *position; - } else { - entry = (table_entry)malloc(sizeof(struct table_entry)); - memset(entry, 0, sizeof(struct table_entry)); - entry->next= *position; - *position=entry; - t->number_of_entries++; - } - entry->value=value; - entry->key=k; - if (t->number_of_entries > t->size) resize_table(t,t->size*2); -} - -/* Function: hash_table_remove - * Arguments: t: the table to remove the object from - * comparator: the index value of the object to remove - * 
Returns: - */ -void hash_table_remove (table t, void *comparator) -{ - int success; - table_entry temp; - table_entry *position=table_lookup(t,comparator, - (*t->key_function)(comparator), - t->compare_function,&success); - if(success) { - temp=*position; - *position=(*position)->next; - free(temp); /* the value? */ - t->number_of_entries--; - } -} - -/* Function: hash_iterate_table_entries - * Arguments: t: the table to iterate over - * handler: a function to call with each element - * of the table, along with arg - * arg: the opaque object to pass to handler - * Returns: nothing - */ -void hash_iterate_table_entries(table t, - void (*handler)(void *,void *), - void *arg) -{ - int i; - table_entry *j,*next; - - for (i=0;isize;i++) - for (j=t->entries+i;*j;j=next){ - next=&((*j)->next); - (*handler)(arg,(*j)->value); - } -} - -/* Function: hash_filter_table_entries - * Arguments: t: the table to iterate over - * handler: a function to call with each element - * of the table, along with arg - * arg: the opaque object to pass to handler - * Returns: nothing - * Notes: operations on the table inside handler are not safe - * - * filter_table_entires() calls the handler function for each - * item in the table, passing it and arg. The handler function - * returns 1 if it is to be retained in the table, and 0 - * if it is to be removed. - */ -void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg) -{ - int i; - table_entry *j,*next,v; - - for (i=0;isize;i++) - for (j=t->entries+i;*j;j=next){ - next=&((*j)->next); - if (!(*handler)(arg,(*j)->value)){ - next=j; - v=*j; - *j=(*j)->next; - free(v); - t->number_of_entries--; - } - } -} - -/* Function: destroy_table - * Arguments: t: the table to free - * thunk: a function to call with each element, - * most likely free() - * Returns: nothing - */ -void hash_destroy_table(table t,void (*thunk)(void *)) -{ - table_entry j,next; - int i; - for (i=0;isize;i++) - for (j=t->entries[i];j;j=next){ - next=j->next; - if (thunk) (*thunk)(j->value); - free(j); - } - free(t->entries); - free(t); -} diff --git a/lnet/ulnds/table.h b/lnet/ulnds/table.h deleted file mode 100644 index 7fab586..0000000 --- a/lnet/ulnds/table.h +++ /dev/null @@ -1,39 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. 
- * Copyright (c) 2002 Eric Hoffman - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -#ifndef E_TABLE -#define E_TABLE - -typedef struct table_entry { - unsigned int key; - void *value; - struct table_entry *next; -} *table_entry; - - -typedef struct table { - unsigned int size; - int number_of_entries; - table_entry *entries; - int (*compare_function)(void *, void *); - unsigned int (*key_function)(unsigned int *); -} *table; - -/* table.c */ -unsigned int key_from_int(int i); -unsigned int key_from_string(char *s); -table hash_create_table(int (*compare_function)(void *, void *), unsigned int (*key_function)(unsigned int *)); -void *hash_table_find(table t, void *comparator); -void hash_table_insert(table t, void *value, void *comparator); -void hash_table_remove(table t, void *comparator); -void hash_iterate_table_entries(table t, void (*handler)(void *, void *), void *arg); -void hash_filter_table_entries(table t, int (*handler)(void *, void *), void *arg); -void hash_destroy_table(table t, void (*thunk)(void *)); - -#endif diff --git a/lnet/ulnds/tcplnd.c b/lnet/ulnds/tcplnd.c deleted file mode 100644 index abb6d01..0000000 --- a/lnet/ulnds/tcplnd.c +++ /dev/null @@ -1,256 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2003 Cluster File Systems, Inc. - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -/* tcpnal.c: - This file implements the TCP-based nal by providing glue - between the connection service and the generic NAL implementation */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifndef __CYGWIN__ -#include -#endif - -/* Function: tcpnal_send - * Arguments: nal: pointer to my nal control block - * private: unused - * cookie: passed back to the portals library - * hdr: pointer to the portals header - * nid: destination node - * pid: destination process - * data: body of the message - * len: length of the body - * Returns: zero on success - * - * sends a packet to the peer, after insuring that a connection exists - */ -ptl_err_t tcpnal_send(lib_nal_t *n, - void *private, - lib_msg_t *cookie, - ptl_hdr_t *hdr, - int type, - ptl_nid_t nid, - ptl_pid_t pid, - unsigned int niov, - struct iovec *iov, - size_t offset, - size_t len) -{ - connection c; - bridge b=(bridge)n->libnal_data; - struct iovec tiov[257]; - static pthread_mutex_t send_lock = PTHREAD_MUTEX_INITIALIZER; - ptl_err_t rc = PTL_OK; - int sysrc; - int total; - int ntiov; - int i; - - if (!(c=force_tcp_connection((manager)b->lower, - PNAL_IP(nid,b), - PNAL_PORT(nid,pid), - b->local))) - return(PTL_FAIL); - - /* TODO: these results should be checked. 
furthermore, provision - must be made for the SIGPIPE which is delivered when - writing on a tcp socket which has closed underneath - the application. there is a linux flag in the sendmsg - call which turns off the signally behaviour, but its - nonstandard */ - - LASSERT (niov <= 256); - - tiov[0].iov_base = hdr; - tiov[0].iov_len = sizeof(ptl_hdr_t); - ntiov = 1 + lib_extract_iov(256, &tiov[1], niov, iov, offset, len); - - pthread_mutex_lock(&send_lock); -#if 1 - for (i = total = 0; i < ntiov; i++) - total += tiov[i].iov_len; - - sysrc = syscall(SYS_writev, c->fd, tiov, ntiov); - if (sysrc != total) { - fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n", - rc, total, errno); - rc = PTL_FAIL; - } -#else - for (i = total = 0; i <= ntiov; i++) { - rc = send(c->fd, tiov[i].iov_base, tiov[i].iov_len, 0); - - if (rc != tiov[i].iov_len) { - fprintf (stderr, "BAD SEND rc %d != %d, errno %d\n", - rc, tiov[i].iov_len, errno); - rc = PTL_FAIL; - break; - } - total += rc; - } -#endif -#if 0 - fprintf (stderr, "sent %s total %d in %d frags\n", - hdr->type == PTL_MSG_ACK ? "ACK" : - hdr->type == PTL_MSG_PUT ? "PUT" : - hdr->type == PTL_MSG_GET ? "GET" : - hdr->type == PTL_MSG_REPLY ? "REPLY" : - hdr->type == PTL_MSG_HELLO ? "HELLO" : "UNKNOWN", - total, niov + 1); -#endif - pthread_mutex_unlock(&send_lock); - - if (rc == PTL_OK) { - /* NB the NAL only calls lib_finalize() if it returns PTL_OK - * from cb_send() */ - lib_finalize(n, private, cookie, PTL_OK); - } - - return(rc); -} - - -/* Function: tcpnal_recv - * Arguments: lib_nal_t *nal: pointer to my nal control block - * void *private: connection pointer passed through - * lib_parse() - * lib_msg_t *cookie: passed back to portals library - * user_ptr data: pointer to the destination buffer - * size_t mlen: length of the body - * size_t rlen: length of data in the network - * Returns: zero on success - * - * blocking read of the requested data. must drain out the - * difference of mainpulated and requested lengths from the network - */ -ptl_err_t tcpnal_recv(lib_nal_t *n, - void *private, - lib_msg_t *cookie, - unsigned int niov, - struct iovec *iov, - size_t offset, - size_t mlen, - size_t rlen) - -{ - struct iovec tiov[256]; - int ntiov; - int i; - - if (!niov) - goto finalize; - - LASSERT(mlen); - LASSERT(rlen); - LASSERT(rlen >= mlen); - - ntiov = lib_extract_iov(256, tiov, niov, iov, offset, mlen); - - /* FIXME - * 1. Is this effecient enough? change to use readv() directly? - * 2. need check return from read_connection() - * - MeiJia - */ - for (i = 0; i < ntiov; i++) - read_connection(private, tiov[i].iov_base, tiov[i].iov_len); - -finalize: - /* FIXME; we always assume success here... */ - lib_finalize(n, private, cookie, PTL_OK); - - if (mlen!=rlen){ - char *trash=malloc(rlen-mlen); - - /*TODO: check error status*/ - read_connection(private,trash,rlen-mlen); - free(trash); - } - - return(PTL_OK); -} - - -/* Function: from_connection: - * Arguments: c: the connection to read from - * Returns: whether or not to continue reading from this connection, - * expressed as a 1 to continue, and a 0 to not - * - * from_connection() is called from the select loop when i/o is - * available. It attempts to read the portals header and - * pass it to the generic library for processing. 
- */ -static int from_connection(void *a, void *d) -{ - connection c = d; - bridge b = a; - ptl_hdr_t hdr; - - if (read_connection(c, (unsigned char *)&hdr, sizeof(hdr))){ - lib_parse(b->lib_nal, &hdr, c); - /*TODO: check error status*/ - return(1); - } - return(0); -} - - -static void tcpnal_shutdown(bridge b) -{ - shutdown_connections(b->lower); -} - -/* Function: PTL_IFACE_TCP - * Arguments: pid_request: desired port number to bind to - * desired: passed NAL limits structure - * actual: returned NAL limits structure - * Returns: a nal structure on success, or null on failure - */ -int tcpnal_init(bridge b) -{ - manager m; - - b->lib_nal->libnal_send=tcpnal_send; - b->lib_nal->libnal_recv=tcpnal_recv; - b->shutdown=tcpnal_shutdown; - - if (!(m=init_connections(PNAL_PORT(b->lib_nal->libnal_ni.ni_pid.nid, - b->lib_nal->libnal_ni.ni_pid.pid), - from_connection,b))){ - /* TODO: this needs to shut down the - newly created junk */ - return(PTL_NAL_FAILED); - } - b->lower=m; - return(PTL_OK); -} diff --git a/lnet/ulnds/timer.h b/lnet/ulnds/timer.h deleted file mode 100644 index aaf39d2..0000000 --- a/lnet/ulnds/timer.h +++ /dev/null @@ -1,30 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. - * Copyright (c) 2002 Eric Hoffman - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -/* TODO: make this an explicit type when they become available */ -typedef unsigned long long when; - -typedef struct timer { - void (*function)(void *); - void *arg; - when w; - int interval; - int disable; -} *timer; - -timer register_timer(when, void (*f)(void *), void *a); -void remove_timer(timer t); -void timer_loop(void); -void initialize_timer(void); -void register_thunk(void (*f)(void *),void *a); - - -#define HZ 0x100000000ull - - diff --git a/lnet/ulnds/utypes.h b/lnet/ulnds/utypes.h deleted file mode 100644 index 7eca959..0000000 --- a/lnet/ulnds/utypes.h +++ /dev/null @@ -1,12 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * Copyright (c) 2002 Cray Inc. 
- * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - */ - -typedef unsigned short uint16; -typedef unsigned long uint32; -typedef unsigned long long uint64; -typedef unsigned char uint8; diff --git a/lnet/utils/.cvsignore b/lnet/utils/.cvsignore index e2a0d44..13c2683 100644 --- a/lnet/utils/.cvsignore +++ b/lnet/utils/.cvsignore @@ -6,5 +6,5 @@ ptlctl .deps routerstat wirecheck -gmnalnid +gmlndnid .*.cmd diff --git a/lnet/utils/Makefile.am b/lnet/utils/Makefile.am index 70a9ad8..9cd3f25 100644 --- a/lnet/utils/Makefile.am +++ b/lnet/utils/Makefile.am @@ -11,34 +11,29 @@ if LIBLUSTRE noinst_LIBRARIES = libuptlctl.a endif -libuptlctl_a_SOURCES = portals.c debug.c l_ioctl.c -libuptlctl_a_CPPFLAGS = $(LLCPPFLAGS) -DLUSTRE_UTILS=1 -libuptlctl_a_CFLAGS = $(LLCFLAGS) +libuptlctl_a_SOURCES = portals.c nidstrings.c debug.c l_ioctl.c +libuptlctl_a_CPPFLAGS = $(LLCPPFLAGS) +libuptlctl_a_CFLAGS = $(LLCFLAGS) -DLUSTRE_UTILS=1 sbin_PROGRAMS = debugctl lib_LIBRARIES = libptlctl.a -libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h +libptlctl_a_SOURCES = portals.c nidstrings.c debug.c l_ioctl.c parser.c parser.h if UTILS -if !CRAY_PORTALS -sbin_PROGRAMS += acceptor ptlctl routerstat wirecheck +sbin_PROGRAMS += ptlctl routerstat wirecheck +if BUILD_GMLND +sbin_PROGRAMS += gmlndnid endif -if BUILD_GMNAL -sbin_PROGRAMS += gmnalnid endif -endif - -acceptor_SOURCES = acceptor.c -acceptor_LDADD = $(LIBWRAP) wirecheck_SOURCES = wirecheck.c -gmnalnid_SOURCES = gmnalnid.c -gmnalnid_CFLAGS = $(GMCPPFLAGS) -gmnalnid_LDFLAGS = -static -gmnalnid_LDADD = $(GMLIBS) -lgm +gmlndnid_SOURCES = gmlndnid.c +gmlndnid_CFLAGS = $(GMCPPFLAGS) +gmlndnid_LDFLAGS = -static +gmlndnid_LDADD = $(GMLIBS) -lgm ptlctl_SOURCES = ptlctl.c ptlctl_LDADD = -L. -lptlctl $(LIBREADLINE) $(LIBEFENCE) @@ -49,3 +44,6 @@ routerstat_SOURCES = routerstat.c debugctl_SOURCES = debugctl.c debugctl_LDADD = -L. -lptlctl $(LIBREADLINE) $(LIBEFENCE) debugctl_DEPENDENCIES = libptlctl.a + +nidstrings.c: @top_srcdir@/lnet/libcfs/nidstrings.c + ln -sf $< $@ diff --git a/lnet/utils/acceptor.c b/lnet/utils/acceptor.c deleted file mode 100644 index a270ad2..0000000 --- a/lnet/utils/acceptor.c +++ /dev/null @@ -1,363 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef HAVE_LIBWRAP -#include -#include -#include -#endif - -#include -#include -#include -#include - -/* should get this from autoconf somehow */ -#ifndef PIDFILE_DIR -#define PIDFILE_DIR "/var/run" -#endif - -char progname[] = "acceptor"; -char name_port[40]; /* for signal handler */ - -#ifdef HAVE_LIBWRAP -/* needed because libwrap declares these as externs */ -int allow_severity = LOG_INFO; -int deny_severity = LOG_WARNING; -#endif - -void usage(char *progname) -{ - fprintf(stderr, "usage: %s [-N nal_id] [-p] [-l] port\n\n" - " -l\tKeep stdin/stdout open\n" - " -p\tAllow connections from non-privileged ports\n", progname); - exit (1); -} - -void errlog(int level, const char *fmt, ...) 
-{ - va_list arg; - FILE *out; - - switch (level) { - case LOG_DEBUG: - case LOG_INFO: - case LOG_NOTICE: - out = stdout; - break; - default: - out = stderr; - break; - } - va_start(arg, fmt); - fprintf(out, "%s: ", name_port); - vfprintf(out, fmt, arg); - va_end(arg); - va_start(arg, fmt); - vsyslog(level, fmt, arg); - va_end(arg); -} - -char *pidfile_name(char *name_port) -{ - static char pidfile[1024]; - - snprintf(pidfile, sizeof(pidfile), "%s/%s.pid", PIDFILE_DIR, name_port); - - return pidfile; -} - -int pidfile_create(char *name_port) -{ - char *pidfile = pidfile_name(name_port); - int fd, rc; - - if ((fd = open(pidfile, O_CREAT | O_WRONLY)) >= 0) { - char pid[16]; - int size = snprintf(pid, sizeof(pid), "%u\n", getpid()); - if (write(fd, pid, size) != size) { - /* hard error or short write */ - rc = errno ? : EIO; - } else { - rc = 0; - } - close(fd); - } else { - rc = errno; - } - - if (rc) - errlog(LOG_ERR, " error creating %s: %s\n", - pidfile, strerror(rc)); - - return rc; -} - -int pidfile_cleanup(char *name_port) -{ - char *pidfile = pidfile_name(name_port); - int rc; - - rc = unlink(pidfile); - if (rc && errno != -ENOENT) - fprintf(stderr, "%s: error removing %s: %s\n", - progname, pidfile, strerror(errno)); - errlog(LOG_NOTICE, "exiting\n"); - - return errno; -} - -int pidfile_exists(char *name_port) -{ - char *pidfile = pidfile_name(name_port); - FILE *fpid; - int pid, rc; - - fpid = fopen(pidfile, "r+"); - if (fpid == NULL) { - if (errno == ENOENT) - return 0; - - fprintf(stderr, "%s: error opening %s: %s.\n", - progname, pidfile, strerror(errno)); - return (1); - } - - rc = fscanf(fpid, "%i", &pid); - fclose(fpid); - if (rc != 1) { - fprintf(stderr,"%s: %s didn't contain a valid pid, removing.\n", - progname, pidfile); - goto stale; - } - - if (kill(pid, 0) == 0) { - fprintf(stderr, "%s: %s exists, acceptor pid %d running.\n", - progname, pidfile, pid); - return (1); - } - - fprintf(stderr, "%s: stale %s exists, pid %d doesn't, removing.\n", - progname, pidfile, pid); -stale: - pidfile_cleanup(name_port); - return (0); -} - -void handler(int sig) -{ - exit(sig); -} - -void atexit_handler(void) -{ - pidfile_cleanup(name_port); -} - -void show_connection(int fd, __u32 net_ip) -{ - static long last_time; - static __u32 host_ip; - long now = time(0); - struct hostent *h; - int len; - char host[1024]; - - /* Don't show repeats for same host, it adds no value */ - if (host_ip == ntohl(net_ip) && (now - last_time) < 5) - return; - - h = gethostbyaddr((char *)&net_ip, sizeof(net_ip), AF_INET); - last_time = now; - host_ip = ntohl(net_ip); - - if (h == NULL) - snprintf(host, sizeof(host), "%d.%d.%d.%d", - (host_ip >> 24) & 0xff, (host_ip >> 16) & 0xff, - (host_ip >> 8) & 0xff, host_ip & 0xff); - else - snprintf(host, sizeof(host), "%s", h->h_name); - - syslog(LOG_INFO, "accepted host: %s\n", host); -} - -int main(int argc, char **argv) -{ - int o, fd, rc, port, pfd; - struct sockaddr_in srvaddr; - int c; - int noclose = 0; - int nal = SOCKNAL; - int rport; - int require_privports = 1; - - while ((c = getopt (argc, argv, "N:lp")) != -1) { - switch (c) { - case 'N': - if (sscanf(optarg, "%d", &nal) != 1 || - nal < 0 || nal > NAL_MAX_NR) - usage(argv[0]); - break; - case 'l': - noclose = 1; - break; - case 'p': - require_privports = 0; - break; - default: - usage (argv[0]); - break; - } - } - - if (optind >= argc) - usage (argv[0]); - - port = atol(argv[optind++]); - - snprintf(name_port, sizeof(name_port) - 1, "%s-%d", progname, port); - if (pidfile_exists(name_port)) - 
return(EEXIST); - openlog(name_port, LOG_PID, LOG_DAEMON); - - memset(&srvaddr, 0, sizeof(srvaddr)); - srvaddr.sin_family = AF_INET; - srvaddr.sin_port = htons(port); - srvaddr.sin_addr.s_addr = INADDR_ANY; - - fd = socket(PF_INET, SOCK_STREAM, 0); - if (fd < 0) { - rc = errno; - errlog(LOG_ERR, "error opening socket: %s\n", strerror(errno)); - return(rc); - } - - o = 1; - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &o, sizeof(o))) { - rc = errno; - errlog(LOG_ERR, "cannot set REUSEADDR socket opt: %s\n", - strerror(errno)); - return(rc); - } - - rc = bind(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); - if (rc == -1) { - rc = errno; - errlog(LOG_ERR, "error binding to socket: %s\n", - strerror(errno)); - return(rc); - } - - if (listen(fd, 127)) { - rc = errno; - perror("listen: "); - return(rc); - } - printf("listening on port %d\n", port); - - pfd = open("/dev/portals", O_RDWR); - if (pfd < 0) { - rc = errno; - errlog(LOG_ERR, "opening portals device: %s\n",strerror(errno)); - return(rc); - } - - rc = daemon(0, noclose); - if (rc < 0) { - rc = errno; - errlog(LOG_ERR, "error daemonizing: %s\n", strerror(errno)); - return(rc); - } - - signal(SIGHUP, SIG_IGN); - signal(SIGINT, handler); - signal(SIGQUIT, handler); - signal(SIGTERM, handler); - - errlog(LOG_NOTICE, "started, listening on port %d\n", port); - if (pidfile_create(name_port) == 0) - atexit(atexit_handler); - - while (1) { - struct sockaddr_in clntaddr; - int len = sizeof(clntaddr); - int cfd; - struct portal_ioctl_data data; - struct portals_cfg pcfg; -#ifdef HAVE_LIBWRAP - struct request_info request; -#endif - char addrstr[INET_ADDRSTRLEN]; - - cfd = accept(fd, (struct sockaddr *)&clntaddr, &len); - if (cfd < 0) { - errlog(LOG_ERR, "error accepting connection: %s\n", - strerror(errno)); - break; - //continue; - } - - inet_ntop(AF_INET, &clntaddr.sin_addr, addrstr,INET_ADDRSTRLEN); -#ifdef HAVE_LIBWRAP - /* libwrap access control */ - request_init(&request, RQ_DAEMON, "lustre", RQ_FILE, cfd, 0); - sock_host(&request); - if (!hosts_access(&request)) { - errlog(LOG_WARNING, "unauthorized access from %s:%hd\n", - addrstr, ntohs(clntaddr.sin_port)); - close (cfd); - continue; - } -#endif - - if (require_privports && - ntohs(clntaddr.sin_port) >= IPPORT_RESERVED) { - errlog(LOG_ERR, - "closing non-privileged connection from %s:%d\n", - addrstr, ntohs(clntaddr.sin_port)); - rc = close(cfd); - if (rc) - perror ("close un-privileged client failed"); - continue; - } - - show_connection (cfd, clntaddr.sin_addr.s_addr); - - PCFG_INIT(pcfg, NAL_CMD_REGISTER_PEER_FD); - pcfg.pcfg_nal = nal; - pcfg.pcfg_fd = cfd; - pcfg.pcfg_misc = SOCKNAL_CONN_NONE; /* == incoming connection */ - - PORTAL_IOC_INIT(data); - data.ioc_pbuf1 = (char*)&pcfg; - data.ioc_plen1 = sizeof(pcfg); - - if (ioctl(pfd, IOC_PORTAL_NAL_CMD, &data) < 0) { - errlog(LOG_ERR, "portals ioctl failed for %s: %s\n", - addrstr, strerror(errno)); - } else { - errlog(LOG_DEBUG, "client %s registered\n", addrstr); - } - rc = close(cfd); - if (rc) - perror("close failed"); - } - - closelog(); - - return (0); -} diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c index 857be97..6dec5b8 100644 --- a/lnet/utils/debug.c +++ b/lnet/utils/debug.c @@ -3,19 +3,19 @@ * * Copyright (C) 2001, 2002 Cluster File Systems, Inc. * - * This file is part of Portals, http://www.sf.net/projects/lustre/ + * This file is part of Lustre Networking, http://www.lustre.org. 
* - * Portals is free software; you can redistribute it and/or + * LNET is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. * - * Portals is distributed in the hope that it will be useful, + * LNET is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with Portals; if not, write to the Free Software + * along with LNET; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * Some day I'll split all of this functionality into a cfs_debug module @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -49,8 +50,8 @@ #include #include -#include -#include +#include +#include #include #include "parser.h" @@ -63,22 +64,23 @@ static int max = 8192; static int subsystem_mask = ~0; static int debug_mask = ~0; -#define MAX_MARK_SIZE 100 +#define MAX_MARK_SIZE 256 -static const char *portal_debug_subsystems[] = - {"undefined", "mdc", "mds", "osc", +static const char *libcfs_debug_subsystems[] = + {"undefined", "mdc", "mds", "osc", "ost", "class", "log", "llite", - "rpc", "mgmt", "portals", "nal", - "pinger", "filter", "ptlbd", "echo", - "ldlm", "lov", "router", "cobd", - "sm", "asobd", "confobd", "lmv", - "cmobd", "sec", NULL}; -static const char *portal_debug_masks[] = - {"trace", "inode", "super", "ext2", + "rpc", "", "lnet", "lnd", + "pinger", "filter", "", "echo", + "ldlm", "lov", "", "", + "", "", "", "lmv", + "", "sec", "gss", "", "mgc", "mgs", + "fid", "fld", NULL}; +static const char *libcfs_debug_masks[] = + {"trace", "inode", "super", "ext2", "malloc", "cache", "info", "ioctl", - "blocks", "net", "warning", "buffs", - "other", "dentry", "portals", "page", - "dlmtrace", "error", "emerg", "ha", + "blocks", "net", "warning", "buffs", + "other", "dentry", "lnet", "page", + "dlmtrace", "error", "emerg", "ha", "rpctrace", "vfstrace", "reada", "mmap", "config", "console", "quota", "sec", NULL}; @@ -87,22 +89,101 @@ struct debug_daemon_cmd { unsigned int cmdv; }; -static const struct debug_daemon_cmd portal_debug_daemon_cmd[] = { +static const struct debug_daemon_cmd libcfs_debug_daemon_cmd[] = { {"start", DEBUG_DAEMON_START}, {"stop", DEBUG_DAEMON_STOP}, {0, 0} }; +#ifdef __linux__ + +#define DAEMON_CTL_NAME "/proc/sys/lnet/daemon_file" +#define SUBSYS_DEBUG_CTL_NAME "/proc/sys/lnet/subsystem_debug" +#define DEBUG_CTL_NAME "/proc/sys/lnet/debug" +#define DUMP_KERNEL_CTL_NAME "/proc/sys/lnet/dump_kernel" + +static int +dbg_open_ctlhandle(const char *str) +{ + int fd; + fd = open(str, O_WRONLY); + if (fd < 0) { + fprintf(stderr, "open %s failed: %s\n", str, + strerror(errno)); + return -1; + } + return fd; +} + +static void +dbg_close_ctlhandle(int fd) +{ + close(fd); +} + +static int +dbg_write_cmd(int fd, char *str, int len) +{ + int rc = write(fd, str, len); + + return (rc == len ? 
0 : 1); +} + +#elif defined(__DARWIN__) + +#define DAEMON_CTL_NAME "lnet.trace_daemon" +#define SUBSYS_DEBUG_CTL_NAME "lnet.subsystem_debug" +#define DEBUG_CTL_NAME "lnet.debug" +#define DUMP_KERNEL_CTL_NAME "lnet.trace_dumpkernel" + +static char sysctl_name[128]; +static int +dbg_open_ctlhandle(const char *str) +{ + + if (strlen(str)+1 > 128) { + fprintf(stderr, "sysctl name is too long: %s.\n", str); + return -1; + } + strcpy(sysctl_name, str); + + return 0; +} + +static void +dbg_close_ctlhandle(int fd) +{ + sysctl_name[0] = '\0'; + return; +} + +static int +dbg_write_cmd(int fd, char *str, int len) +{ + int rc; + + rc = sysctlbyname(sysctl_name, NULL, NULL, str, len+1); + if (rc != 0) { + fprintf(stderr, "sysctl %s with cmd (%s) error: %d\n", + sysctl_name, str, errno); + } + return (rc == 0 ? 0: 1); +} + +#else +#error - Unknown sysctl convention. +#endif + static int do_debug_mask(char *name, int enable) { int found = 0, i; - for (i = 0; portal_debug_subsystems[i] != NULL; i++) { - if (strcasecmp(name, portal_debug_subsystems[i]) == 0 || + for (i = 0; libcfs_debug_subsystems[i] != NULL; i++) { + if (strcasecmp(name, libcfs_debug_subsystems[i]) == 0 || strcasecmp(name, "all_subs") == 0) { printf("%s output from subsystem \"%s\"\n", enable ? "Enabling" : "Disabling", - portal_debug_subsystems[i]); + libcfs_debug_subsystems[i]); if (enable) subsystem_mask |= (1 << i); else @@ -110,12 +191,12 @@ static int do_debug_mask(char *name, int enable) found = 1; } } - for (i = 0; portal_debug_masks[i] != NULL; i++) { - if (strcasecmp(name, portal_debug_masks[i]) == 0 || + for (i = 0; libcfs_debug_masks[i] != NULL; i++) { + if (strcasecmp(name, libcfs_debug_masks[i]) == 0 || strcasecmp(name, "all_types") == 0) { printf("%s output of type \"%s\"\n", enable ? 
"Enabling" : "Disabling", - portal_debug_masks[i]); + libcfs_debug_masks[i]); if (enable) debug_mask |= (1 << i); else @@ -173,38 +254,38 @@ static int applymask(char* procpath, int value) char buf[64]; int len = snprintf(buf, 64, "%d", value); - int fd = open(procpath, O_WRONLY); + int fd = dbg_open_ctlhandle(procpath); if (fd == -1) { fprintf(stderr, "Unable to open %s: %s\n", procpath, strerror(errno)); return fd; } - rc = write(fd, buf, len+1); - if (rc<0) { + rc = dbg_write_cmd(fd, buf, len+1); + if (rc != 0) { fprintf(stderr, "Write to %s failed: %s\n", procpath, strerror(errno)); return rc; } - close(fd); + dbg_close_ctlhandle(fd); return 0; } static void applymask_all(unsigned int subs_mask, unsigned int debug_mask) { if (!dump_filename) { - applymask("/proc/sys/portals/subsystem_debug", subs_mask); - applymask("/proc/sys/portals/debug", debug_mask); + applymask(SUBSYS_DEBUG_CTL_NAME, subs_mask); + applymask(DEBUG_CTL_NAME, debug_mask); } else { - struct portals_debug_ioctl_data data; + struct libcfs_debug_ioctl_data data; data.hdr.ioc_len = sizeof(data); data.hdr.ioc_version = 0; data.subs = subs_mask; data.debug = debug_mask; - dump(OBD_DEV_ID, PTL_IOC_DEBUG_MASK, &data); + dump(OBD_DEV_ID, LIBCFS_IOC_DEBUG_MASK, &data); } - printf("Applied subsystem_debug=%d, debug=%d to /proc/sys/portals\n", + printf("Applied subsystem_debug=%d, debug=%d to /proc/sys/lnet\n", subs_mask, debug_mask); } @@ -219,13 +300,14 @@ int jt_dbg_list(int argc, char **argv) if (strcasecmp(argv[1], "subs") == 0) { printf("Subsystems: all_subs"); - for (i = 0; portal_debug_subsystems[i] != NULL; i++) - printf(", %s", portal_debug_subsystems[i]); + for (i = 0; libcfs_debug_subsystems[i] != NULL; i++) + if (libcfs_debug_subsystems[i][0]) + printf(", %s", libcfs_debug_subsystems[i]); printf("\n"); } else if (strcasecmp(argv[1], "types") == 0) { printf("Types: all_types"); - for (i = 0; portal_debug_masks[i] != NULL; i++) - printf(", %s", portal_debug_masks[i]); + for (i = 0; libcfs_debug_masks[i] != NULL; i++) + printf(", %s", libcfs_debug_masks[i]); printf("\n"); } else if (strcasecmp(argv[1], "applymasks") == 0) { applymask_all(subsystem_mask, debug_mask); @@ -275,7 +357,7 @@ static void print_saved_records(struct list_head *list, FILE *out) list_del(&line->chain); hdr = line->hdr; - fprintf(out, "%07x:%06x:%u:%u.%06Lu:%u:%u:%u:(%s:%u:%s()) %s", + fprintf(out, "%08x:%08x:%u:%u.%06llu:%u:%u:%u:(%s:%u:%s()) %s", hdr->ph_subsys, hdr->ph_mask, hdr->ph_cpu_id, hdr->ph_sec, (unsigned long long)hdr->ph_usec, hdr->ph_stack, hdr->ph_pid, hdr->ph_extern_pid, @@ -297,7 +379,7 @@ static int parse_buffer(FILE *in, FILE *out) CFS_INIT_LIST_HEAD(&chunk_list); while (1) { - rc = fread(buf, sizeof(hdr->ph_len), 1, in); + rc = fread(buf, sizeof(hdr->ph_len) + sizeof(hdr->ph_flags), 1, in); if (rc <= 0) break; @@ -316,8 +398,8 @@ static int parse_buffer(FILE *in, FILE *out) assert(list_empty(&chunk_list)); } - rc = fread(buf + sizeof(hdr->ph_len), 1, - hdr->ph_len - sizeof(hdr->ph_len), in); + rc = fread(buf + sizeof(hdr->ph_len) + sizeof(hdr->ph_flags), 1, + hdr->ph_len - sizeof(hdr->ph_len) - sizeof(hdr->ph_flags), in); if (rc <= 0) break; @@ -337,6 +419,7 @@ static int parse_buffer(FILE *in, FILE *out) line->hdr = malloc(hdr->ph_len + 1); if (line->hdr == NULL) { + free(line); fprintf(stderr, "malloc failed; printing accumulated " "records and exiting.\n"); break; @@ -394,21 +477,21 @@ int jt_dbg_debug_kernel(int argc, char **argv) if (stat(filename, &st) == 0 && S_ISREG(st.st_mode)) unlink(filename); - fd = 
open("/proc/sys/portals/dump_kernel", O_WRONLY); + fd = dbg_open_ctlhandle(DUMP_KERNEL_CTL_NAME); if (fd < 0) { fprintf(stderr, "open(dump_kernel) failed: %s\n", strerror(errno)); return 1; } - rc = write(fd, filename, strlen(filename)); - if (rc != strlen(filename)) { + rc = dbg_write_cmd(fd, filename, strlen(filename)); + if (rc != 0) { fprintf(stderr, "write(%s) failed: %s\n", filename, strerror(errno)); close(fd); return 1; } - close(fd); + dbg_close_ctlhandle(fd); if (raw) return 0; @@ -476,8 +559,8 @@ int jt_dbg_debug_file(int argc, char **argv) return 1; } if (argc > 2) { - fdout = open(argv[2], - O_CREAT | O_TRUNC | O_WRONLY | O_LARGEFILE, + fdout = open(argv[2], + O_CREAT | O_TRUNC | O_WRONLY | O_LARGEFILE, 0600); if (fdout == -1) { fprintf(stderr, "open(%s) failed: %s\n", argv[2], @@ -504,17 +587,8 @@ int jt_dbg_debug_file(int argc, char **argv) return rc; } -static int -dbg_write_cmd(int fd, char *str) -{ - int len = strlen(str); - int rc = write(fd, str, len); - - return (rc == len ? 0 : 1); -} - const char debug_daemon_usage[] = "usage: %s {start file [MB]|stop}\n"; -#define DAEMON_FILE "/proc/sys/portals/daemon_file" + int jt_dbg_debug_daemon(int argc, char **argv) { int rc; @@ -525,13 +599,10 @@ int jt_dbg_debug_daemon(int argc, char **argv) return 1; } - fd = open(DAEMON_FILE, O_WRONLY); - if (fd < 0) { - fprintf(stderr, "open %s failed: %s\n", DAEMON_FILE, - strerror(errno)); + fd = dbg_open_ctlhandle(DAEMON_CTL_NAME); + if (fd < 0) return -1; - } - + rc = -1; if (strcasecmp(argv[1], "start") == 0) { if (argc < 3 || argc > 4 || @@ -556,7 +627,7 @@ int jt_dbg_debug_daemon(int argc, char **argv) goto out; } snprintf(buf, sizeof(buf), "size=%ld", size); - rc = dbg_write_cmd(fd, buf); + rc = dbg_write_cmd(fd, buf, strlen(buf)); if (rc != 0) { fprintf(stderr, "set %s failed: %s\n", @@ -565,7 +636,7 @@ int jt_dbg_debug_daemon(int argc, char **argv) } } - rc = dbg_write_cmd(fd, argv[2]); + rc = dbg_write_cmd(fd, argv[2], strlen(argv[2])); if (rc != 0) { fprintf(stderr, "start debug_daemon on %s failed: %s\n", argv[2], strerror(errno)); @@ -575,7 +646,7 @@ int jt_dbg_debug_daemon(int argc, char **argv) goto out; } if (strcasecmp(argv[1], "stop") == 0) { - rc = dbg_write_cmd(fd, "stop"); + rc = dbg_write_cmd(fd, "stop", 4); if (rc != 0) { fprintf(stderr, "stopping debug_daemon failed: %s\n", strerror(errno)); @@ -589,14 +660,14 @@ int jt_dbg_debug_daemon(int argc, char **argv) fprintf(stderr, debug_daemon_usage, argv[0]); rc = -1; out: - close(fd); + dbg_close_ctlhandle(fd); return rc; } int jt_dbg_clear_debug_buf(int argc, char **argv) { int rc; - struct portal_ioctl_data data; + struct libcfs_ioctl_data data; if (argc != 1) { fprintf(stderr, "usage: %s\n", argv[0]); @@ -604,14 +675,14 @@ int jt_dbg_clear_debug_buf(int argc, char **argv) } memset(&data, 0, sizeof(data)); - if (portal_ioctl_pack(&data, &buf, max) != 0) { - fprintf(stderr, "portal_ioctl_pack failed.\n"); + if (libcfs_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "libcfs_ioctl_pack failed.\n"); return -1; } - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_CLEAR_DEBUG, buf); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_CLEAR_DEBUG, buf); if (rc) { - fprintf(stderr, "IOC_PORTAL_CLEAR_DEBUG failed: %s\n", + fprintf(stderr, "IOC_LIBCFS_CLEAR_DEBUG failed: %s\n", strerror(errno)); return -1; } @@ -620,41 +691,37 @@ int jt_dbg_clear_debug_buf(int argc, char **argv) int jt_dbg_mark_debug_buf(int argc, char **argv) { + static char scratch[MAX_MARK_SIZE] = { '\0' }; int rc, max_size = MAX_MARK_SIZE-1; - struct portal_ioctl_data 
data; + struct libcfs_ioctl_data data = { 0 }; char *text; time_t now = time(NULL); if (argc > 1) { - int counter; - text = malloc(MAX_MARK_SIZE); + int count; + text = scratch; strncpy(text, argv[1], max_size); max_size-=strlen(argv[1]); - for(counter = 2; (counter < argc) && (max_size > 0) ; counter++){ - strncat(text, " ", 1); - max_size-=1; - strncat(text, argv[counter], max_size); - max_size-=strlen(argv[counter]); + for (count = 2; (count < argc) && (max_size > 0); count++){ + strncat(text, " ", max_size); + max_size -= 1; + strncat(text, argv[count], max_size); + max_size -= strlen(argv[count]); } } else { text = ctime(&now); - text[strlen(text) - 1] = '\0'; /* stupid \n */ - } - if (!max_size) { - text[MAX_MARK_SIZE - 1] = '\0'; } - memset(&data, 0, sizeof(data)); data.ioc_inllen1 = strlen(text) + 1; data.ioc_inlbuf1 = text; - if (portal_ioctl_pack(&data, &buf, max) != 0) { - fprintf(stderr, "portal_ioctl_pack failed.\n"); + if (libcfs_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "libcfs_ioctl_pack failed.\n"); return -1; } - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_MARK_DEBUG, buf); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_MARK_DEBUG, buf); if (rc) { - fprintf(stderr, "IOC_PORTAL_MARK_DEBUG failed: %s\n", + fprintf(stderr, "IOC_LIBCFS_MARK_DEBUG failed: %s\n", strerror(errno)); return -1; } @@ -664,46 +731,56 @@ int jt_dbg_mark_debug_buf(int argc, char **argv) static struct mod_paths { char *name, *path; } mod_paths[] = { - {"libcfs", "portals/libcfs"}, - {"portals", "portals/portals"}, - {"ksocknal", "portals/knals/socknal"}, - {"kptlrouter", "portals/router"}, + {"libcfs", "lnet/libcfs"}, + {"lnet", "lnet/lnet"}, + {"kciblnd", "lnet/klnds/ciblnd"}, + {"kgmlnd", "lnet/klnds/gmlnd"}, + {"kmxlnd", "lnet/klnds/mxlnd"}, + {"kiiblnd", "lnet/klnds/iiblnd"}, + {"ko2iblnd", "lnet/klnds/o2iblnd"}, + {"kopeniblnd", "lnet/klnds/openiblnd"}, + {"kptllnd", "lnet/klnds/ptllnd"}, + {"kqswlnd", "lnet/klnds/qswlnd"}, + {"kralnd", "lnet/klnds/ralnd"}, + {"ksocklnd", "lnet/klnds/socklnd"}, + {"ktdilnd", "lnet/klnds/tdilnd"}, + {"kviblnd", "lnet/klnds/viblnd"}, {"lvfs", "lustre/lvfs"}, {"obdclass", "lustre/obdclass"}, {"llog_test", "lustre/obdclass"}, - {"ptlrpcs", "lustre/sec"}, - {"ptlrpcs_gss", "lustre/sec/gss"}, + {"ptlrpc_gss", "lustre/ptlrpc/gss"}, + {"ptlrpc", "lustre/ptlrpc"}, {"gks", "lustre/sec/gks"}, {"gkc", "lustre/sec/gks"}, - {"ptlrpc", "lustre/ptlrpc"}, - {"obdext2", "lustre/obdext2"}, {"ost", "lustre/ost"}, {"osc", "lustre/osc"}, {"mds", "lustre/mds"}, {"mdc", "lustre/mdc"}, {"llite", "lustre/llite"}, + {"lustre", "lustre/llite"}, {"ldiskfs", "lustre/ldiskfs"}, {"smfs", "lustre/smfs"}, {"obdecho", "lustre/obdecho"}, {"ldlm", "lustre/ldlm"}, {"obdfilter", "lustre/obdfilter"}, - {"extN", "lustre/extN"}, {"lov", "lustre/lov"}, {"lmv", "lustre/lmv"}, {"fsfilt_ext3", "lustre/lvfs"}, - {"fsfilt_extN", "lustre/lvfs"}, {"fsfilt_reiserfs", "lustre/lvfs"}, {"fsfilt_smfs", "lustre/lvfs"}, {"fsfilt_ldiskfs", "lustre/lvfs"}, - {"mds_ext2", "lustre/mds"}, {"mds_ext3", "lustre/mds"}, - {"mds_extN", "lustre/mds"}, - {"ptlbd", "lustre/ptlbd"}, - {"mgmt_svc", "lustre/mgmt"}, - {"mgmt_cli", "lustre/mgmt"}, {"cobd", "lustre/cobd"}, {"cmobd", "lustre/cmobd"}, - {"confobd", "lustre/obdclass"}, + {"lquota", "lustre/quota"}, + {"mgs", "lustre/mgs"}, + {"mgc", "lustre/mgc"}, + {"mdt", "lustre/mdt"}, + {"mdd", "lustre/mdd"}, + {"osd", "lustre/osd"}, + {"cmm", "lustre/cmm"}, + {"fid", "lustre/fid"}, + {"fld", "lustre/fld"}, {NULL, NULL} }; @@ -724,7 +801,6 @@ static int jt_dbg_modules_2_4(int argc, char 
**argv) return 0; } - printf("dir\n"); for (mp = mod_paths; mp->name != NULL; mp++) { struct module_info info; int rc; @@ -742,8 +818,6 @@ static int jt_dbg_modules_2_4(int argc, char **argv) printf("add-symbol-file %s%s%s/%s.o 0x%0lx\n", path, path[0] ? "/" : "", mp->path, mp->name, info.addr + sizeof(struct module)); - printf("dir %s%s%s\n", path, - path[0] ? "/" : "", mp->path); } } @@ -759,7 +833,7 @@ static int jt_dbg_modules_2_5(int argc, char **argv) char *path = ""; char *kernel = "linux"; const char *proc = "/proc/modules"; - char modname[128], others[128]; + char modname[128], others[4096]; long modaddr; int rc; FILE *file; @@ -779,7 +853,6 @@ static int jt_dbg_modules_2_5(int argc, char **argv) return 0; } - printf("dir\n"); while ((rc = fscanf(file, "%s %s %s %s %s %lx\n", modname, others, others, others, others, &modaddr)) == 6) { for (mp = mod_paths; mp->name != NULL; mp++) { @@ -789,11 +862,10 @@ static int jt_dbg_modules_2_5(int argc, char **argv) if (mp->name) { printf("add-symbol-file %s%s%s/%s.o 0x%0lx\n", path, path[0] ? "/" : "", mp->path, mp->name, modaddr); - printf("dir %s%s%s\n", path, - path[0] ? "/" : "", mp->path); } } + fclose(file); return 0; } @@ -820,7 +892,7 @@ int jt_dbg_modules(int argc, char **argv) int jt_dbg_panic(int argc, char **argv) { int rc; - struct portal_ioctl_data data; + struct libcfs_ioctl_data data; if (argc != 1) { fprintf(stderr, "usage: %s\n", argv[0]); @@ -828,14 +900,14 @@ int jt_dbg_panic(int argc, char **argv) } memset(&data, 0, sizeof(data)); - if (portal_ioctl_pack(&data, &buf, max) != 0) { - fprintf(stderr, "portal_ioctl_pack failed.\n"); + if (libcfs_ioctl_pack(&data, &buf, max) != 0) { + fprintf(stderr, "libcfs_ioctl_pack failed.\n"); return -1; } - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PANIC, buf); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_PANIC, buf); if (rc) { - fprintf(stderr, "IOC_PORTAL_PANIC failed: %s\n", + fprintf(stderr, "IOC_LIBCFS_PANIC failed: %s\n", strerror(errno)); return -1; } diff --git a/lnet/utils/debugctl.c b/lnet/utils/debugctl.c index 1b6cd96..cf70fd8 100644 --- a/lnet/utils/debugctl.c +++ b/lnet/utils/debugctl.c @@ -25,8 +25,8 @@ #include #include -#include -#include +#include +#include #include "parser.h" @@ -53,7 +53,8 @@ int main(int argc, char **argv) if (dbg_initialize(argc, argv) < 0) exit(2); - register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); + register_ioc_dev(LNET_DEV_ID, LNET_DEV_PATH, + LNET_DEV_MAJOR, LNET_DEV_MINOR); Parser_init("debugctl > ", list); if (argc > 1) @@ -61,6 +62,6 @@ int main(int argc, char **argv) Parser_commands(); - unregister_ioc_dev(PORTALS_DEV_ID); + unregister_ioc_dev(LNET_DEV_ID); return 0; } diff --git a/lnet/utils/gmlndnid.c b/lnet/utils/gmlndnid.c index f7e5250..ce5cb14 100644 --- a/lnet/utils/gmlndnid.c +++ b/lnet/utils/gmlndnid.c @@ -32,12 +32,11 @@ #include #include -#include -#include +#include +#include #include -#define GMNAL_IOC_GET_GNID 1 /* * portals always uses unit 0 * Can this be configurable? 
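[Note: an illustrative sketch, not applied by this patch. The gmlndnid.c hunks below rework u_getgmnid() into print_gmid(); this condenses the GM API sequence the new code drives. Every call, constant, and argument shape is taken from the hunks themselves; the hostname variable is hypothetical and error handling is elided:

	gm_status_t      gm_status;
	struct gm_port  *gm_port;
	unsigned int     local_id;
	unsigned int     global_id;
	char            *name = "node42";              /* hypothetical host name */

	gm_status = gm_init();                         /* bring up the GM library */
	gm_status = gm_open(&gm_port, GM_UNIT, 2,      /* port id 2 */
	                    "gmnalnid", GM_API_VERSION);
	gm_status = gm_host_name_to_node_id_ex(gm_port, 1000000, name,
	                                       &local_id);
	gm_status = gm_node_id_to_global_id(gm_port, local_id, &global_id);
	gm_close(gm_port);                             /* teardown, reverse order */
	gm_finalize();

Each call returns a gm_status_t compared against GM_SUCCESS; on failure print_gmid() unwinds through its out_1/out_0 labels, and for the local host it skips the name lookup entirely and uses local_id 1.]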
@@ -47,27 +46,34 @@ void usage(char *prg, int h) { - fprintf(stderr, "usage %s -n hostname | -l | -h\n", prg); - if (h) { - printf("\nGet Myrinet Global network ids for specified host\n" - "-l gets network id for local host\n"); - } + fprintf(stderr, + "usage %s -h\n" + " %s [-l] [-n hostname] [-L] [hostnames]\n", prg); + + if (h) + printf("Print Myrinet Global network ids for specified hosts\n" "-l print local host's ID\n" + "-n hostname print given host's ID\n" + "-L print Myrinet local net ID too\n" + "[hostnames] print ids of given hosts (local if none)\n"); } -unsigned -u_getgmnid(char *name, int get_local_id) +gm_status_t +print_gmid(char *name, int name_fieldlen, int show_local_id) { struct gm_port *gm_port; - int gm_port_id = 2; - gm_status_t gm_status = GM_SUCCESS; - unsigned global_nid = 0, local_nid = 0; /* gm ids never 0 */ + int gm_port_id; + gm_status_t gm_status; + unsigned int local_id; + unsigned int global_id; gm_status = gm_init(); if (gm_status != GM_SUCCESS) { fprintf(stderr, "gm_init: %s\n", gm_strerror(gm_status)); - return(0); + return gm_status; } + gm_port_id = 2; gm_status = gm_open(&gm_port, GM_UNIT, gm_port_id, "gmnalnid", GM_API_VERSION); if (gm_status != GM_SUCCESS) { @@ -83,77 +89,96 @@ u_getgmnid(char *name, int get_local_id) if (gm_status != GM_SUCCESS) { fprintf(stderr, "gm_open: %s\n",gm_strerror(gm_status)); - gm_finalize(); - return(0); + goto out_0; } } - if (get_local_id) { - local_nid = 1; + if (name == NULL) { + local_id = 1; + name = ""; } else { gm_status = gm_host_name_to_node_id_ex(gm_port, 1000000, name, - &local_nid); + &local_id); if (gm_status != GM_SUCCESS) { - fprintf(stderr, "gm_host_name_to_node_id_ex: %s\n", - gm_strerror(gm_status)); - gm_close(gm_port); - gm_finalize(); - return(0); + fprintf(stderr, "gm_host_name_to_node_id_ex(%s): %s\n", + name, gm_strerror(gm_status)); + goto out_1; } } - gm_status = gm_node_id_to_global_id(gm_port, local_nid, &global_nid) ; + gm_status = gm_node_id_to_global_id(gm_port, local_id, &global_id) ; if (gm_status != GM_SUCCESS) { - fprintf(stderr, "gm_node_id_to_global_id: %s\n", - gm_strerror(gm_status)); - gm_close(gm_port); - gm_finalize(); - return(0); + fprintf(stderr, "gm_node_id_to_global_id(%s:%d): %s\n", + name, local_id, gm_strerror(gm_status)); + goto out_1; } + + if (name_fieldlen > 0) + printf ("%*s ", name_fieldlen, name); + + if (!show_local_id) + printf("0x%x\n", global_id); + else + printf("local 0x%x global 0x%x\n", local_id, global_id); + + out_1: gm_close(gm_port); + out_0: gm_finalize(); - return(global_nid); + + return gm_status; } -int main(int argc, char **argv) +int +main (int argc, char **argv) { - unsigned int nid = 0; - char *name = NULL; int c; - int get_local_id = 0; + gm_status_t gmrc; + int rc; + int max_namelen = 0; + int show_local_id = 0; - while ((c = getopt(argc, argv, "n:lh")) != -1) { + while ((c = getopt(argc, argv, "n:lLh")) != -1) switch(c) { - case('n'): - if (get_local_id) { - usage(argv[0], 0); - exit(-1); - } - name = optarg; - break; - case('h'): + case 'h': usage(argv[0], 1); - exit(-1); - break; - case('l'): - if (name) { - usage(argv[0], 0); - exit(-1); - } - get_local_id = 1; + return 0; + + case 'L': + show_local_id = 1; break; + + case 'n': + gmrc = print_gmid(optarg, 0, show_local_id); + return (gmrc == GM_SUCCESS) ? 0 : 1; + + case 'l': + gmrc = print_gmid(NULL, 0, show_local_id); + return (gmrc == GM_SUCCESS) ?
0 : 1; + default: usage(argv[0], 0); - exit(-1); + return 2; } + + if (optind == argc) { + gmrc = print_gmid(NULL, 0, show_local_id); + return (gmrc == GM_SUCCESS) ? 0 : 1; } - if (!name && !get_local_id) { - usage(argv[0], 0); - exit(-1); + if (optind != argc - 1) + for (c = optind; c < argc; c++) + if (strlen(argv[c]) > max_namelen) + max_namelen = strlen(argv[c]); + + rc = 0; + + for (c = optind; c < argc; c++) { + gmrc = print_gmid(argv[c], max_namelen, show_local_id); + + if (gmrc != GM_SUCCESS) + rc = 1; } - nid = u_getgmnid(name, get_local_id); - printf("%u\n", nid); - exit(0); + return rc; } diff --git a/lnet/utils/l_ioctl.c b/lnet/utils/l_ioctl.c index 01dccb1..0bdb782 100644 --- a/lnet/utils/l_ioctl.c +++ b/lnet/utils/l_ioctl.c @@ -33,23 +33,26 @@ #include #include -#include -#include +#include +#include #include + static ioc_handler_t do_ioctl; /* forward ref */ static ioc_handler_t *current_ioc_handler = &do_ioctl; struct ioc_dev { - const char * dev_name; - int dev_fd; + const char * dev_name; + int dev_fd; + int dev_major; + int dev_minor; }; static struct ioc_dev ioc_dev_list[10]; struct dump_hdr { - int magic; - int dev_id; + int magic; + int dev_id; unsigned int opc; }; @@ -64,60 +67,78 @@ set_ioc_handler (ioc_handler_t *handler) current_ioc_handler = handler; } +/* Catamount has no , so just define it here */ +#ifndef MKDEV +# define MKDEV(a,b) (((a) << 8) | (b)) +#endif + static int open_ioc_dev(int dev_id) { - const char * dev_name; + const char * dev_name; - if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list)) - return -EINVAL; + if (dev_id < 0 || + dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0])) + return -EINVAL; - dev_name = ioc_dev_list[dev_id].dev_name; - if (dev_name == NULL) { + dev_name = ioc_dev_list[dev_id].dev_name; + if (dev_name == NULL) { fprintf(stderr, "unknown device id: %d\n", dev_id); - return -EINVAL; - } - - if (ioc_dev_list[dev_id].dev_fd < 0) { - int fd = open(dev_name, O_RDWR); - - if (fd < 0) { - fprintf(stderr, "opening %s failed: %s\n" - "hint: the kernel modules may not be loaded\n", - dev_name, strerror(errno)); - return fd; - } - ioc_dev_list[dev_id].dev_fd = fd; - } - - return ioc_dev_list[dev_id].dev_fd; + return -EINVAL; + } + + if (ioc_dev_list[dev_id].dev_fd < 0) { + int fd = open(dev_name, O_RDWR); + + /* Make the /dev/ node if we need to */ + if (fd < 0 && errno == ENOENT) { + if (mknod(dev_name, + S_IFCHR|S_IWUSR|S_IRUSR, + MKDEV(ioc_dev_list[dev_id].dev_major, + ioc_dev_list[dev_id].dev_minor)) == 0) + fd = open(dev_name, O_RDWR); + else + fprintf(stderr, "mknod %s failed: %s\n", + dev_name, strerror(errno)); + } + + if (fd < 0) { + fprintf(stderr, "opening %s failed: %s\n" + "hint: the kernel modules may not be loaded\n", + dev_name, strerror(errno)); + return fd; + } + ioc_dev_list[dev_id].dev_fd = fd; + } + + return ioc_dev_list[dev_id].dev_fd; } static int do_ioctl(int dev_id, unsigned int opc, void *buf) { - int fd, rc; - - fd = open_ioc_dev(dev_id); - if (fd < 0) - return fd; - - rc = ioctl(fd, opc, buf); - return rc; - + int fd, rc; + + fd = open_ioc_dev(dev_id); + if (fd < 0) + return fd; + + rc = ioctl(fd, opc, buf); + return rc; + } static FILE * get_dump_file() { - FILE *fp = NULL; - - if (!dump_filename) { - fprintf(stderr, "no dump filename\n"); - } else - fp = fopen(dump_filename, "a"); - return fp; + FILE *fp = NULL; + + if (!dump_filename) { + fprintf(stderr, "no dump filename\n"); + } else + fp = fopen(dump_filename, "a"); + return fp; } /* @@ -127,25 +148,25 @@ get_dump_file() int dump(int dev_id, unsigned 
int opc, void *buf) { - FILE *fp; - struct dump_hdr dump_hdr; - struct portal_ioctl_hdr * ioc_hdr = (struct portal_ioctl_hdr *) buf; - int rc; - - printf("dumping opc %x to %s\n", opc, dump_filename); - - - dump_hdr.magic = 0xdeadbeef; - dump_hdr.dev_id = dev_id; - dump_hdr.opc = opc; - - fp = get_dump_file(); - if (fp == NULL) { - fprintf(stderr, "%s: %s\n", dump_filename, - strerror(errno)); - return -EINVAL; - } - + FILE *fp; + struct dump_hdr dump_hdr; + struct libcfs_ioctl_hdr * ioc_hdr = (struct libcfs_ioctl_hdr *) buf; + int rc; + + printf("dumping opc %x to %s\n", opc, dump_filename); + + + dump_hdr.magic = 0xdeadbeef; + dump_hdr.dev_id = dev_id; + dump_hdr.opc = opc; + + fp = get_dump_file(); + if (fp == NULL) { + fprintf(stderr, "%s: %s\n", dump_filename, + strerror(errno)); + return -EINVAL; + } + rc = fwrite(&dump_hdr, sizeof(dump_hdr), 1, fp); if (rc == 1) rc = fwrite(buf, ioc_hdr->ioc_len, 1, fp); @@ -161,32 +182,36 @@ dump(int dev_id, unsigned int opc, void *buf) /* register a device to send ioctls to. */ int -register_ioc_dev(int dev_id, const char * dev_name) +register_ioc_dev(int dev_id, const char * dev_name, int major, int minor) { - if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list)) - return -EINVAL; - - unregister_ioc_dev(dev_id); + if (dev_id < 0 || + dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0])) + return -EINVAL; - ioc_dev_list[dev_id].dev_name = dev_name; - ioc_dev_list[dev_id].dev_fd = -1; + unregister_ioc_dev(dev_id); - return dev_id; + ioc_dev_list[dev_id].dev_name = dev_name; + ioc_dev_list[dev_id].dev_fd = -1; + ioc_dev_list[dev_id].dev_major = major; + ioc_dev_list[dev_id].dev_minor = minor; + + return dev_id; } void unregister_ioc_dev(int dev_id) { - if (dev_id < 0 || dev_id >= sizeof(ioc_dev_list)) - return; - if (ioc_dev_list[dev_id].dev_name != NULL && - ioc_dev_list[dev_id].dev_fd >= 0) - close(ioc_dev_list[dev_id].dev_fd); + if (dev_id < 0 || + dev_id >= sizeof(ioc_dev_list) / sizeof(ioc_dev_list[0])) + return; + if (ioc_dev_list[dev_id].dev_name != NULL && + ioc_dev_list[dev_id].dev_fd >= 0) + close(ioc_dev_list[dev_id].dev_fd); - ioc_dev_list[dev_id].dev_name = NULL; - ioc_dev_list[dev_id].dev_fd = -1; + ioc_dev_list[dev_id].dev_name = NULL; + ioc_dev_list[dev_id].dev_fd = -1; } /* If this file is set, then all ioctl buffers will be @@ -194,15 +219,15 @@ unregister_ioc_dev(int dev_id) int set_ioctl_dump(char * file) { - if (dump_filename) - free(dump_filename); - - dump_filename = strdup(file); + if (dump_filename) + free(dump_filename); + + dump_filename = strdup(file); if (dump_filename == NULL) abort(); set_ioc_handler(&dump); - return 0; + return 0; } int @@ -222,69 +247,69 @@ l_ioctl(int dev_id, unsigned int opc, void *buf) int parse_dump(char * dump_file, ioc_handler_t ioc_func) { - int line =0; - struct stat st; - char *start, *buf, *end; + int line =0; + struct stat st; + char *start, *buf, *end; #ifndef __CYGWIN__ int fd; #else HANDLE fd, hmap; DWORD size; #endif - + #ifndef __CYGWIN__ - fd = syscall(SYS_open, dump_file, O_RDONLY); + fd = syscall(SYS_open, dump_file, O_RDONLY); if (fd < 0) { fprintf(stderr, "couldn't open %s: %s\n", dump_file, strerror(errno)); exit(1); } - if (fstat(fd, &st)) { - perror("stat fails"); - exit(1); - } + if (fstat(fd, &st)) { + perror("stat fails"); + exit(1); + } - if (st.st_size < 1) { - fprintf(stderr, "KML is empty\n"); - exit(1); - } + if (st.st_size < 1) { + fprintf(stderr, "KML is empty\n"); + exit(1); + } - start = buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0); - end = start + 
st.st_size; - close(fd); + start = buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE , fd, 0); + end = start + st.st_size; + close(fd); if (start == MAP_FAILED) { - fprintf(stderr, "can't create file mapping\n"); - exit(1); + fprintf(stderr, "can't create file mapping\n"); + exit(1); } #else fd = CreateFile(dump_file, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); size = GetFileSize(fd, NULL); if (size < 1) { - fprintf(stderr, "KML is empty\n"); - exit(1); - } + fprintf(stderr, "KML is empty\n"); + exit(1); + } hmap = CreateFileMapping(fd, NULL, PAGE_READONLY, 0,0, NULL); start = buf = MapViewOfFile(hmap, FILE_MAP_READ, 0, 0, 0); end = buf + size; CloseHandle(fd); if (start == NULL) { - fprintf(stderr, "can't create file mapping\n"); - exit(1); + fprintf(stderr, "can't create file mapping\n"); + exit(1); } #endif /* __CYGWIN__ */ - while (buf < end) { + while (buf < end) { struct dump_hdr *dump_hdr = (struct dump_hdr *) buf; - struct portal_ioctl_hdr * data; + struct libcfs_ioctl_hdr * data; char tmp[8096]; int rc; line++; - data = (struct portal_ioctl_hdr *) (buf + sizeof(*dump_hdr)); + data = (struct libcfs_ioctl_hdr *) (buf + sizeof(*dump_hdr)); if (buf + data->ioc_len > end ) { fprintf(stderr, "dump file overflow, %p + %d > %p\n", buf, data->ioc_len, end); @@ -307,7 +332,7 @@ parse_dump(char * dump_file, ioc_handler_t ioc_func) } buf += data->ioc_len + sizeof(*dump_hdr); - } + } #ifndef __CYGWIN__ munmap(start, end - start); @@ -316,7 +341,7 @@ parse_dump(char * dump_file, ioc_handler_t ioc_func) CloseHandle(hmap); #endif - return 0; + return 0; } int @@ -326,8 +351,8 @@ jt_ioc_dump(int argc, char **argv) fprintf(stderr, "usage: %s [hostname]\n", argv[0]); return 0; } - printf("setting dumpfile to: %s\n", argv[1]); - - set_ioctl_dump(argv[1]); - return 0; + printf("setting dumpfile to: %s\n", argv[1]); + + set_ioctl_dump(argv[1]); + return 0; } diff --git a/lnet/utils/lbstats b/lnet/utils/lbstats new file mode 100755 index 0000000..a8f0857 --- /dev/null +++ b/lnet/utils/lbstats @@ -0,0 +1,11 @@ +#!/bin/bash + +echo "=== Router Buffers =======" +test -e /proc/sys/lnet/buffers && cat /proc/sys/lnet/buffers +echo +echo "=== NIs ============================================" +test -e /proc/sys/lnet/nis && cat /proc/sys/lnet/nis +echo +echo "=== Peers =============================================================" +test -e /proc/sys/lnet/peers && cat /proc/sys/lnet/peers +echo diff --git a/lnet/utils/parser.c b/lnet/utils/parser.c index b91295b..2f740c1 100644 --- a/lnet/utils/parser.c +++ b/lnet/utils/parser.c @@ -27,20 +27,7 @@ #include #include #include - -#ifdef HAVE_LIBREADLINE -#define READLINE_LIBRARY -#include - -/* completion_matches() is #if 0-ed out in modern glibc */ -#ifndef completion_matches -# define completion_matches rl_completion_matches -#endif -#endif - -extern void using_history(void); -extern void stifle_history(int); -extern void add_history(char *); +#include #include "parser.h" diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index 692342f..671e78c 100644 --- a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -45,54 +45,23 @@ #ifdef HAVE_ENDIAN_H #include #endif -#if CRAY_PORTALS -#ifdef REDSTORM -#define __QK__ -#endif -#include -#endif #include -#include -#include -#include -#include +#include +#include +#include #include "parser.h" -unsigned int portal_debug; -unsigned int portal_printk; - -static unsigned int g_nal = 0; - -typedef struct -{ - char *name; - int num; -} name2num_t; - -static name2num_t nalnames[] = { - 
{"any", 0}, -#if !CRAY_PORTALS - {"tcp", SOCKNAL}, - {"elan", QSWNAL}, - {"gm", GMNAL}, - {"openib", OPENIBNAL}, - {"iib", IIBNAL}, - {"vib", VIBNAL}, - {"lo", LONAL}, - {"ra", RANAL}, -#else - {"cray_kern_nal", CRAY_KERN_NAL}, - {"cray_user_nal", CRAY_USER_NAL}, - {"cray_qk_nal", CRAY_QK_NAL}, -#endif - {NULL, -1} -}; +unsigned int libcfs_debug; +unsigned int libcfs_printk; -static cfg_record_cb_t g_record_cb; +static int g_net_set; +static __u32 g_net; /* Convert a string boolean to an int; "enable" -> 1 */ -int ptl_parse_bool (int *b, char *str) { +int +lnet_parse_bool (int *b, char *str) +{ if (!strcasecmp (str, "no") || !strcasecmp (str, "n") || !strcasecmp (str, "off") || @@ -116,116 +85,18 @@ int ptl_parse_bool (int *b, char *str) { return (-1); } -/* Convert human readable size string to and int; "1k" -> 1000 */ -int ptl_parse_size (int *sizep, char *str) { - int size; - char mod[32]; - - switch (sscanf (str, "%d%1[gGmMkK]", &size, mod)) { - default: - return (-1); - - case 1: - *sizep = size; - return (0); - - case 2: - switch (*mod) { - case 'g': - case 'G': - *sizep = size << 30; - return (0); - - case 'm': - case 'M': - *sizep = size << 20; - return (0); - - case 'k': - case 'K': - *sizep = size << 10; - return (0); - - default: - *sizep = size; - return (0); - } - } -} - -int -ptl_set_cfg_record_cb(cfg_record_cb_t cb) -{ - g_record_cb = cb; - return 0; -} - -int -pcfg_ioctl(struct portals_cfg *pcfg) -{ - int rc; - - if (pcfg->pcfg_nal ==0) - pcfg->pcfg_nal = g_nal; - - if (g_record_cb) { - rc = g_record_cb(PORTALS_CFG_TYPE, sizeof(*pcfg), pcfg); - } else { - struct portal_ioctl_data data; - PORTAL_IOC_INIT (data); - data.ioc_pbuf1 = (char*)pcfg; - data.ioc_plen1 = sizeof(*pcfg); - /* XXX liblustre hack XXX */ - data.ioc_nal_cmd = pcfg->pcfg_command; - data.ioc_nid = pcfg->pcfg_nid; - - rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); - - if (rc == 0 && pcfg->pcfg_version != PORTALS_CFG_VERSION) - return -EINVAL; - } - - return (rc); -} - - - -static name2num_t * -name2num_lookup_name (name2num_t *table, char *str) -{ - while (table->name != NULL) - if (!strcmp (str, table->name)) - return (table); - else - table++; - return (NULL); -} - -static name2num_t * -name2num_lookup_num (name2num_t *table, int num) -{ - while (table->name != NULL) - if (num == table->num) - return (table); - else - table++; - return (NULL); -} - int -ptl_name2nal (char *str) -{ - name2num_t *e = name2num_lookup_name (nalnames, str); - - return ((e == NULL) ? -1 : e->num); -} - -static char * -nal2name (int nal) +lnet_parse_port (int *port, char *str) { - name2num_t *e = name2num_lookup_num (nalnames, nal); + char *end; + + *port = strtol (str, &end, 0); - return ((e == NULL) ? "???" 
: e->name); + if (*end == 0 && /* parsed whole string */ + *port > 0 && *port < 65536) /* minimal sanity check */ + return (0); + + return (-1); } #ifdef HAVE_GETHOSTBYNAME @@ -252,50 +123,7 @@ ptl_gethostbyname(char * hname) { #endif int -ptl_parse_port (int *port, char *str) -{ - char *end; - - *port = strtol (str, &end, 0); - - if (*end == 0 && /* parsed whole string */ - *port > 0 && *port < 65536) /* minimal sanity check */ - return (0); - - return (-1); -} - -int -ptl_parse_time (time_t *t, char *str) -{ - char *end; - int n; - struct tm tm; - - *t = strtol (str, &end, 0); - if (*end == 0) /* parsed whole string */ - return (0); - - memset (&tm, 0, sizeof (tm)); - n = sscanf (str, "%d-%d-%d-%d:%d:%d", - &tm.tm_year, &tm.tm_mon, &tm.tm_mday, - &tm.tm_hour, &tm.tm_min, &tm.tm_sec); - if (n != 6) - return (-1); - - tm.tm_mon--; /* convert to 0 == Jan */ - tm.tm_year -= 1900; /* y2k quirk */ - tm.tm_isdst = -1; /* dunno if it's daylight savings... */ - - *t = mktime (&tm); - if (*t == (time_t)-1) - return (-1); - - return (0); -} - -int -ptl_parse_ipquad (__u32 *ipaddrp, char *str) +lnet_parse_ipquad (__u32 *ipaddrp, char *str) { int a; int b; @@ -314,7 +142,7 @@ ptl_parse_ipquad (__u32 *ipaddrp, char *str) } int -ptl_parse_ipaddr (__u32 *ipaddrp, char *str) +lnet_parse_ipaddr (__u32 *ipaddrp, char *str) { #ifdef HAVE_GETHOSTBYNAME struct hostent *he; @@ -325,7 +153,7 @@ ptl_parse_ipaddr (__u32 *ipaddrp, char *str) return (0); } - if (ptl_parse_ipquad(ipaddrp, str) == 0) + if (lnet_parse_ipquad(ipaddrp, str) == 0) return (0); #ifdef HAVE_GETHOSTBYNAME @@ -366,226 +194,272 @@ ptl_ipaddr_2_str (__u32 ipaddr, char *str, int lookup) } int -ptl_parse_nid (ptl_nid_t *nidp, char *str) +lnet_parse_time (time_t *t, char *str) { - __u32 ipaddr; - char *end; - unsigned long long ullval; + char *end; + int n; + struct tm tm; - if (ptl_parse_ipaddr (&ipaddr, str) == 0) { -#if !CRAY_PORTALS - *nidp = (ptl_nid_t)ipaddr; -#else - *nidp = (((ptl_nid_t)ipaddr & PNAL_HOSTID_MASK) << PNAL_VNODE_SHIFT); -#endif - return (0); - } - - ullval = strtoull(str, &end, 0); - if (end != str && *end == 0) { - /* parsed whole non-empty string */ - *nidp = (ptl_nid_t)ullval; + *t = strtol (str, &end, 0); + if (*end == 0) /* parsed whole string */ return (0); - } + + memset (&tm, 0, sizeof (tm)); + n = sscanf (str, "%d-%d-%d-%d:%d:%d", + &tm.tm_year, &tm.tm_mon, &tm.tm_mday, + &tm.tm_hour, &tm.tm_min, &tm.tm_sec); + if (n != 6) + return (-1); + + tm.tm_mon--; /* convert to 0 == Jan */ + tm.tm_year -= 1900; /* y2k quirk */ + tm.tm_isdst = -1; /* dunno if it's daylight savings... */ + + *t = mktime (&tm); + if (*t == (time_t)-1) + return (-1); + + return (0); +} - return (-1); +int g_net_is_set (char *cmd) +{ + if (g_net_set) + return 1; + + if (cmd != NULL) + fprintf(stderr, + "You must run the 'network' command before '%s'.\n", + cmd); + return 0; } -int -ptl_parse_anynid (ptl_nid_t *nidp, char *str) +int g_net_is_compatible (char *cmd, ...) 
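/* NB: illustrative annotation, not applied by this patch. The new
 * g_net_is_compatible() replaces g_nal_is_compatible(): it scans a
 * zero-terminated vararg list of LND type codes and returns 1 when
 * LNET_NETTYP(g_net), the net selected by the last 'network' command,
 * matches one of them; otherwise callers bail out, e.g.
 *
 *	if (!g_net_is_compatible(argv[0], SOCKLND, O2IBLND, 0))
 *		return -1;
 *
 * The trailing 0 is the list terminator. Passing the command name
 * rather than NULL makes a mismatch print an error naming that command.
 */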
{ - if (!strcmp (str, "_all_")) { - *nidp = PTL_NID_ANY; + va_list ap; + int nal; + + if (!g_net_is_set(cmd)) return 0; - } - return ptl_parse_nid(nidp, str); + va_start(ap, cmd); + + do { + nal = va_arg (ap, int); + if (nal == LNET_NETTYP(g_net)) { + va_end (ap); + return 1; + } + } while (nal != 0); + + va_end (ap); + + if (cmd != NULL) + fprintf (stderr, + "Command %s not compatible with %s NAL\n", + cmd, + libcfs_lnd2str(LNET_NETTYP(g_net))); + return 0; } -__u64 ptl_nid2u64(ptl_nid_t nid) +int ptl_initialize(int argc, char **argv) { - switch (sizeof (nid)) { - case 8: - return (nid); - case 4: - return ((__u32)nid); - default: - fprintf(stderr, "Unexpected sizeof(ptl_nid_t) == %u\n", - (int)sizeof(nid)); - abort(); - /* notreached */ - return (-1); - } + register_ioc_dev(LNET_DEV_ID, LNET_DEV_PATH, + LNET_DEV_MAJOR, LNET_DEV_MINOR); + return 0; } -char * -ptl_nid2str (char *buffer, ptl_nid_t nid) -{ - __u64 nid64 = ptl_nid2u64(nid); -#ifdef HAVE_GETHOSTBYNAME - struct hostent *he = 0; - /* Don't try to resolve NIDs that are e.g. Elan host IDs. Assume - * TCP addresses in the 0.x.x.x subnet are not in use. This can - * happen on routers and slows things down a _lot_. Bug 3442. */ - if (nid & 0xff000000) { - __u32 addr = htonl((__u32)nid); /* back to NETWORK byte order */ +int jt_ptl_network(int argc, char **argv) +{ + struct libcfs_ioctl_data data; + __u32 net = LNET_NIDNET(LNET_NID_ANY); + int rc; - he = gethostbyaddr((const char *)&addr, sizeof(addr), AF_INET); + if (argc < 2) { + fprintf(stderr, "usage: %s |up|down\n", argv[0]); + return 0; } - if (he != NULL) - sprintf(buffer, "%#x:%s", (int)(nid64 >> 32), he->h_name); - else -#endif /* HAVE_GETHOSTBYNAME */ - sprintf(buffer, LPX64, nid64); + if (!strcmp(argv[1], "unconfigure") || + !strcmp(argv[1], "down")) { + LIBCFS_IOC_INIT(data); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_UNCONFIGURE, &data); - return (buffer); -} + if (rc == 0) { + printf ("LNET ready to unload\n"); + return 0; + } -int g_nal_is_set () -{ - if (g_nal == 0) { - fprintf (stderr, "Error: you must run the 'network' command first.\n"); - return (0); + if (errno == EBUSY) + fprintf(stderr, "LNET busy\n"); + else + fprintf(stderr, "LNET unconfigure error %d: %s\n", + errno, strerror(errno)); + return -1; } - return (1); -} - -int g_nal_is_compatible (char *cmd, ...) -{ - va_list ap; - int nal; - - if (!g_nal_is_set ()) - return (0); + if (!strcmp(argv[1], "configure") || + !strcmp(argv[1], "up")) { + LIBCFS_IOC_INIT(data); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_CONFIGURE, &data); - va_start (ap, cmd); + if (rc == 0) { + printf ("LNET configured\n"); + return 0; + } - do { - nal = va_arg (ap, int); - } while (nal != 0 && nal != g_nal); - - va_end (ap); - - if (g_nal == nal) - return (1); + fprintf(stderr, "LNET configure error %d: %s\n", + errno, strerror(errno)); + return -1; + } - if (cmd != NULL) { - /* Don't complain verbosely if we've not been passed a command - * name to complain about! 
*/ - fprintf (stderr, "Command %s not compatible with nal %s\n", - cmd, nal2name (g_nal)); + net = libcfs_str2net(argv[1]); + if (net == LNET_NIDNET(LNET_NID_ANY)) { + fprintf(stderr, "Can't parse net %s\n", argv[1]); + return -1; } - return (0); + + g_net_set = 1; + g_net = net; + return 0; } int -sock_write (int cfd, void *buffer, int nob) +jt_ptl_list_nids(int argc, char **argv) { - while (nob > 0) - { - int rc = write (cfd, buffer, nob); + struct libcfs_ioctl_data data; + int all = 0, return_nid = 0; + int count; + int rc; - if (rc < 0) - { - if (errno == EINTR) - continue; - - return (rc); - } + all = (argc == 2) && (strcmp(argv[1], "all") == 0); + /* Hack to pass back value */ + return_nid = (argc == 2) && (argv[1][0] == 1); - if (rc == 0) - { - fprintf (stderr, "Unexpected zero sock_write\n"); - abort(); + if ((argc > 2) && !(all || return_nid)) { + fprintf(stderr, "usage: %s [all]\n", argv[0]); + return 0; + } + + for (count = 0;; count++) { + LIBCFS_IOC_INIT (data); + data.ioc_count = count; + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_NI, &data); + + if (rc < 0) { + if ((count > 0) && (errno == ENOENT)) + /* We found them all */ + break; + fprintf(stderr,"IOC_LIBCFS_GET_NI error %d: %s\n", + errno, strerror(errno)); + return -1; } - nob -= rc; - buffer = (char *)buffer + nob; + if (all || (LNET_NETTYP(LNET_NIDNET(data.ioc_nid)) != LOLND)) { + printf("%s\n", libcfs_nid2str(data.ioc_nid)); + if (return_nid) { + *(__u64 *)(argv[1]) = data.ioc_nid; + return_nid--; + } + } } - - return (0); + + return 0; } int -sock_read (int cfd, void *buffer, int nob) +jt_ptl_which_nid (int argc, char **argv) { - while (nob > 0) - { - int rc = read (cfd, buffer, nob); + struct libcfs_ioctl_data data; + int best_dist = 0; + int best_order = 0; + lnet_nid_t best_nid = LNET_NID_ANY; + int dist; + int order; + lnet_nid_t nid; + char *nidstr; + int rc; + int i; + + if (argc < 2) { + fprintf(stderr, "usage: %s NID [NID...]\n", argv[0]); + return 0; + } + + for (i = 1; i < argc; i++) { + nidstr = argv[i]; + nid = libcfs_str2nid(nidstr); + if (nid == LNET_NID_ANY) { + fprintf(stderr, "Can't parse NID %s\n", nidstr); + return -1; + } - if (rc < 0) - { - if (errno == EINTR) - continue; - - return (rc); + LIBCFS_IOC_INIT(data); + data.ioc_nid = nid; + + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_LNET_DIST, &data); + if (rc != 0) { + fprintf(stderr, "Can't get distance to %s: %s\n", + nidstr, strerror(errno)); + return -1; } + + dist = data.ioc_u32[0]; + order = data.ioc_u32[1]; + + if (dist < 0) { + if (dist == -EHOSTUNREACH) + continue; - if (rc == 0) /* EOF */ - { - errno = ECONNABORTED; - return (-1); + fprintf(stderr, "Unexpected distance to %s: %d\n", + nidstr, dist); + return -1; } - nob -= rc; - buffer = (char *)buffer + nob; + if (best_nid == LNET_NID_ANY || + dist < best_dist || + (dist == best_dist && order < best_order)) { + best_dist = dist; + best_order = order; + best_nid = nid; + } } - - return (0); -} -int ptl_initialize(int argc, char **argv) -{ - register_ioc_dev(PORTALS_DEV_ID, PORTALS_DEV_PATH); - return 0; -} - - -int jt_ptl_network(int argc, char **argv) -{ - name2num_t *entry; - int nal; - - if (argc == 2 && - (nal = ptl_name2nal (argv[1])) >= 0) { - g_nal = nal; - return (0); + if (best_nid == LNET_NID_ANY) { + fprintf(stderr, "No reachable NID\n"); + return -1; } - - fprintf(stderr, "usage: %s \n", argv[0]); - for (entry = nalnames; entry->name != NULL; entry++) - fprintf (stderr, "%s%s", entry == nalnames ? 
"<" : "|", entry->name); - fprintf(stderr, ">\n"); - return (-1); + + printf("%s\n", libcfs_nid2str(best_nid)); + return 0; } int jt_ptl_print_interfaces (int argc, char **argv) { - struct portals_cfg pcfg; + struct libcfs_ioctl_data data; char buffer[3][64]; int index; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, 0)) + if (!g_net_is_compatible (argv[0], SOCKLND, 0)) return -1; for (index = 0;;index++) { - PCFG_INIT (pcfg, NAL_CMD_GET_INTERFACE); - pcfg.pcfg_count = index; - - rc = pcfg_ioctl (&pcfg); + LIBCFS_IOC_INIT(data); + data.ioc_net = g_net; + data.ioc_count = index; + + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_INTERFACE, &data); if (rc != 0) break; printf ("%s: (%s/%s) npeer %d nroute %d\n", - ptl_ipaddr_2_str(pcfg.pcfg_id, buffer[2], 1), - ptl_ipaddr_2_str(pcfg.pcfg_id, buffer[0], 0), - ptl_ipaddr_2_str(pcfg.pcfg_misc, buffer[1], 0), - pcfg.pcfg_fd, pcfg.pcfg_count); + ptl_ipaddr_2_str(data.ioc_u32[0], buffer[2], 1), + ptl_ipaddr_2_str(data.ioc_u32[0], buffer[0], 0), + ptl_ipaddr_2_str(data.ioc_u32[1], buffer[1], 0), + data.ioc_u32[2], data.ioc_u32[3]); } if (index == 0) { @@ -604,7 +478,7 @@ jt_ptl_print_interfaces (int argc, char **argv) int jt_ptl_add_interface (int argc, char **argv) { - struct portals_cfg pcfg; + struct libcfs_ioctl_data data; __u32 ipaddr; int rc; __u32 netmask = 0xffffff00; @@ -617,10 +491,10 @@ jt_ptl_add_interface (int argc, char **argv) return 0; } - if (!g_nal_is_compatible(argv[0], SOCKNAL, 0)) + if (!g_net_is_compatible(argv[0], SOCKLND, 0)) return -1; - if (ptl_parse_ipaddr(&ipaddr, argv[1]) != 0) { + if (lnet_parse_ipaddr(&ipaddr, argv[1]) != 0) { fprintf (stderr, "Can't parse ip: %s\n", argv[1]); return -1; } @@ -631,17 +505,18 @@ jt_ptl_add_interface (int argc, char **argv) netmask = 0; for (i = count; i > 0; i--) netmask = netmask|(1<<(32-i)); - } else if (ptl_parse_ipquad(&netmask, argv[2]) != 0) { + } else if (lnet_parse_ipquad(&netmask, argv[2]) != 0) { fprintf (stderr, "Can't parse netmask: %s\n", argv[2]); return -1; } } - PCFG_INIT(pcfg, NAL_CMD_ADD_INTERFACE); - pcfg.pcfg_id = ipaddr; - pcfg.pcfg_misc = netmask; + LIBCFS_IOC_INIT(data); + data.ioc_net = g_net; + data.ioc_u32[0] = ipaddr; + data.ioc_u32[1] = netmask; - rc = pcfg_ioctl (&pcfg); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_ADD_INTERFACE, &data); if (rc != 0) { fprintf (stderr, "failed to add interface: %s\n", strerror (errno)); @@ -654,7 +529,7 @@ jt_ptl_add_interface (int argc, char **argv) int jt_ptl_del_interface (int argc, char **argv) { - struct portals_cfg pcfg; + struct libcfs_ioctl_data data; int rc; __u32 ipaddr = 0; @@ -663,19 +538,20 @@ jt_ptl_del_interface (int argc, char **argv) return 0; } - if (!g_nal_is_compatible(argv[0], SOCKNAL, 0)) + if (!g_net_is_compatible(argv[0], SOCKLND, 0)) return -1; if (argc == 2 && - ptl_parse_ipaddr(&ipaddr, argv[1]) != 0) { + lnet_parse_ipaddr(&ipaddr, argv[1]) != 0) { fprintf (stderr, "Can't parse ip: %s\n", argv[1]); return -1; } - PCFG_INIT(pcfg, NAL_CMD_DEL_INTERFACE); - pcfg.pcfg_id = ipaddr; + LIBCFS_IOC_INIT(data); + data.ioc_net = g_net; + data.ioc_u32[0] = ipaddr; - rc = pcfg_ioctl (&pcfg); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_DEL_INTERFACE, &data); if (rc != 0) { fprintf (stderr, "failed to delete interface: %s\n", strerror (errno)); @@ -688,37 +564,64 @@ jt_ptl_del_interface (int argc, char **argv) int jt_ptl_print_peers (int argc, char **argv) { - struct portals_cfg pcfg; + struct libcfs_ioctl_data data; + lnet_process_id_t id; char buffer[2][64]; int index; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, 
RANAL, - OPENIBNAL, IIBNAL, VIBNAL, 0)) + if (!g_net_is_compatible (argv[0], SOCKLND, RALND, PTLLND, + OPENIBLND, CIBLND, IIBLND, VIBLND, O2IBLND, 0)) return -1; for (index = 0;;index++) { - PCFG_INIT (pcfg, NAL_CMD_GET_PEER); - pcfg.pcfg_count = index; - - rc = pcfg_ioctl (&pcfg); + LIBCFS_IOC_INIT(data); + data.ioc_net = g_net; + data.ioc_count = index; + + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_PEER, &data); if (rc != 0) break; - if (g_nal_is_compatible(NULL, SOCKNAL, 0)) - printf (LPX64"[%d]%s@%s:%d #%d\n", - pcfg.pcfg_nid, pcfg.pcfg_wait, - ptl_ipaddr_2_str (pcfg.pcfg_size, buffer[0], 1), - ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1), - pcfg.pcfg_misc, pcfg.pcfg_count); - else if (g_nal_is_compatible(NULL, RANAL, OPENIBNAL, VIBNAL, 0)) - printf (LPX64"[%d]@%s:%d\n", - pcfg.pcfg_nid, pcfg.pcfg_wait, - ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1), - pcfg.pcfg_misc); - else - printf (LPX64"[%d]\n", - pcfg.pcfg_nid, pcfg.pcfg_wait); + if (g_net_is_compatible(NULL, SOCKLND, 0)) { + id.nid = data.ioc_nid; + id.pid = data.ioc_u32[4]; + printf ("%-20s [%d]%s->%s:%d #%d\n", + libcfs_id2str(id), + data.ioc_count, /* persistence */ + ptl_ipaddr_2_str (data.ioc_u32[2], buffer[0], 1), /* my ip */ + ptl_ipaddr_2_str (data.ioc_u32[0], buffer[1], 1), /* peer ip */ + data.ioc_u32[1], /* peer port */ + data.ioc_u32[3]); /* conn_count */ + } else if (g_net_is_compatible(NULL, PTLLND, 0)) { + id.nid = data.ioc_nid; + id.pid = data.ioc_u32[4]; + printf ("%-20s s %d%s [%d] "LPD64".%06d" + " m "LPD64"/"LPD64" q %d/%d c %d/%d\n", + libcfs_id2str(id), + data.ioc_net, /* state */ + data.ioc_flags ? "" : " ~!h", /* sent_hello */ + data.ioc_count, /* refcount */ + data.ioc_u64[0]/1000000, /* incarnation secs */ + (int)(data.ioc_u64[0]%1000000), /* incarnation usecs */ + (((__u64)data.ioc_u32[1])<<32) | + ((__u64)data.ioc_u32[0]), /* next_matchbits */ + (((__u64)data.ioc_u32[3])<<32) | + ((__u64)data.ioc_u32[2]), /* last_matchbits_seen */ + data.ioc_u32[5] >> 16, /* nsendq */ + data.ioc_u32[5] & 0xffff, /* nactiveq */ + data.ioc_u32[6] >> 16, /* credits */ + data.ioc_u32[6] & 0xffff); /* outstanding_credits */ + } else if (g_net_is_compatible(NULL, RALND, OPENIBLND, CIBLND, VIBLND, 0)) { + printf ("%-20s [%d]@%s:%d\n", + libcfs_nid2str(data.ioc_nid), /* peer nid */ + data.ioc_count, /* peer persistence */ + ptl_ipaddr_2_str (data.ioc_u32[0], buffer[1], 1), /* peer ip */ + data.ioc_u32[1]); /* peer port */ + } else { + printf ("%-20s [%d]\n", + libcfs_nid2str(data.ioc_nid), data.ioc_count); + } } if (index == 0) { @@ -736,23 +639,23 @@ jt_ptl_print_peers (int argc, char **argv) int jt_ptl_add_peer (int argc, char **argv) { - struct portals_cfg pcfg; - ptl_nid_t nid; + struct libcfs_ioctl_data data; + lnet_nid_t nid; __u32 ip = 0; int port = 0; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, RANAL, - OPENIBNAL, IIBNAL, VIBNAL, 0)) + if (!g_net_is_compatible (argv[0], SOCKLND, RALND, + OPENIBLND, CIBLND, IIBLND, VIBLND, 0)) return -1; - if (g_nal_is_compatible(NULL, SOCKNAL, OPENIBNAL, RANAL, 0)) { + if (g_net_is_compatible(NULL, SOCKLND, OPENIBLND, CIBLND, RALND, 0)) { if (argc != 4) { - fprintf (stderr, "usage(tcp,openib,ra): %s nid ipaddr port\n", + fprintf (stderr, "usage(tcp,openib,cib,ra): %s nid ipaddr port\n", argv[0]); return 0; } - } else if (g_nal_is_compatible(NULL, VIBNAL, 0)) { + } else if (g_net_is_compatible(NULL, VIBLND, 0)) { if (argc != 3) { fprintf (stderr, "usage(vib): %s nid ipaddr\n", argv[0]); @@ -763,30 +666,31 @@ jt_ptl_add_peer (int argc, char **argv) return 0; } - if 
(ptl_parse_nid (&nid, argv[1]) != 0 || - nid == PTL_NID_ANY) { + nid = libcfs_str2nid(argv[1]); + if (nid == LNET_NID_ANY) { fprintf (stderr, "Can't parse NID: %s\n", argv[1]); return -1; } - if (g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, VIBNAL, RANAL, 0) && - ptl_parse_ipaddr (&ip, argv[2]) != 0) { + if (g_net_is_compatible (NULL, SOCKLND, OPENIBLND, CIBLND, VIBLND, RALND, 0) && + lnet_parse_ipaddr (&ip, argv[2]) != 0) { fprintf (stderr, "Can't parse ip addr: %s\n", argv[2]); return -1; } - if (g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, RANAL, 0) && - ptl_parse_port (&port, argv[3]) != 0) { + if (g_net_is_compatible (NULL, SOCKLND, OPENIBLND, CIBLND, RALND, 0) && + lnet_parse_port (&port, argv[3]) != 0) { fprintf (stderr, "Can't parse port: %s\n", argv[3]); return -1; } - PCFG_INIT(pcfg, NAL_CMD_ADD_PEER); - pcfg.pcfg_nid = nid; - pcfg.pcfg_id = ip; - pcfg.pcfg_misc = port; + LIBCFS_IOC_INIT(data); + data.ioc_net = g_net; + data.ioc_nid = nid; + data.ioc_u32[0] = ip; + data.ioc_u32[1] = port; - rc = pcfg_ioctl (&pcfg); + rc = l_ioctl (LNET_DEV_ID, IOC_LIBCFS_ADD_PEER, &data); if (rc != 0) { fprintf (stderr, "failed to add peer: %s\n", strerror (errno)); @@ -799,60 +703,65 @@ jt_ptl_add_peer (int argc, char **argv) int jt_ptl_del_peer (int argc, char **argv) { - struct portals_cfg pcfg; - ptl_nid_t nid = PTL_NID_ANY; + struct libcfs_ioctl_data data; + lnet_nid_t nid = LNET_NID_ANY; + lnet_pid_t pid = LNET_PID_ANY; __u32 ip = 0; - int single_share = 0; - int argidx; + char *end; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, RANAL, - OPENIBNAL, IIBNAL, VIBNAL, 0)) + if (!g_net_is_compatible (argv[0], SOCKLND, RALND, MXLND, PTLLND, + OPENIBLND, CIBLND, IIBLND, VIBLND, O2IBLND, 0)) return -1; - if (g_nal_is_compatible(NULL, SOCKNAL, 0)) { - if (argc > 4) { - fprintf (stderr, "usage: %s [nid] [ipaddr] [single_share]\n", + if (g_net_is_compatible(NULL, SOCKLND, 0)) { + if (argc > 3) { + fprintf (stderr, "usage: %s [nid] [ipaddr]\n", argv[0]); return 0; } - } else if (argc > 3) { - fprintf (stderr, "usage: %s [nid] [single_share]\n", argv[0]); + } else if (g_net_is_compatible(NULL, PTLLND, 0)) { + if (argc > 3) { + fprintf (stderr, "usage: %s [nid] [pid]\n", + argv[0]); + return 0; + } + } else if (argc > 2) { + fprintf (stderr, "usage: %s [nid]\n", argv[0]); return 0; } if (argc > 1 && - ptl_parse_anynid (&nid, argv[1]) != 0) { + !libcfs_str2anynid(&nid, argv[1])) { fprintf (stderr, "Can't parse nid: %s\n", argv[1]); return -1; } - argidx = 2; - if (g_nal_is_compatible(NULL, SOCKNAL, 0)) { - if (argc > argidx && - ptl_parse_ipaddr (&ip, argv[argidx]) != 0) { + if (g_net_is_compatible(NULL, SOCKLND, 0)) { + if (argc > 2 && + lnet_parse_ipaddr (&ip, argv[2]) != 0) { fprintf (stderr, "Can't parse ip addr: %s\n", - argv[argidx]); + argv[2]); return -1; } - argidx++; - } - - if (argc > argidx) { - if (!strcmp (argv[argidx], "single_share")) { - single_share = 1; - } else { - fprintf (stderr, "Unrecognised arg %s'\n", argv[3]); - return -1; + } else if (g_net_is_compatible(NULL, PTLLND, 0)) { + if (argc > 2) { + pid = strtol(argv[2], &end, 0); + if (end == argv[2] || *end != 0) { + fprintf(stderr, "Can't parse pid %s\n", + argv[2]); + return -1; + } } } + + LIBCFS_IOC_INIT(data); + data.ioc_net = g_net; + data.ioc_nid = nid; + data.ioc_u32[0] = ip; + data.ioc_u32[1] = pid; - PCFG_INIT(pcfg, NAL_CMD_DEL_PEER); - pcfg.pcfg_nid = nid; - pcfg.pcfg_id = ip; - pcfg.pcfg_flags = single_share; - - rc = pcfg_ioctl (&pcfg); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_DEL_PEER, &data); if (rc != 0) { 
fprintf (stderr, "failed to remove peer: %s\n", strerror (errno)); @@ -865,44 +774,48 @@ jt_ptl_del_peer (int argc, char **argv) int jt_ptl_print_connections (int argc, char **argv) { - struct portals_cfg pcfg; + struct libcfs_ioctl_data data; + lnet_process_id_t id; char buffer[2][64]; int index; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, RANAL, - OPENIBNAL, IIBNAL, VIBNAL, 0)) + if (!g_net_is_compatible (argv[0], SOCKLND, RALND, + OPENIBLND, CIBLND, IIBLND, VIBLND, O2IBLND, 0)) return -1; for (index = 0; ; index++) { - PCFG_INIT (pcfg, NAL_CMD_GET_CONN); - pcfg.pcfg_count = index; - - rc = pcfg_ioctl (&pcfg); + LIBCFS_IOC_INIT(data); + data.ioc_net = g_net; + data.ioc_count = index; + + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_CONN, &data); if (rc != 0) break; - if (g_nal_is_compatible (NULL, SOCKNAL, 0)) - printf ("[%d]%s:"LPX64"@%s:%d:%s %d/%d %s\n", - pcfg.pcfg_gw_nal, /* scheduler */ - ptl_ipaddr_2_str (pcfg.pcfg_fd, buffer[0], 1), /* local IP addr */ - pcfg.pcfg_nid, - ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1), /* remote IP addr */ - pcfg.pcfg_misc, /* remote port */ - (pcfg.pcfg_flags == SOCKNAL_CONN_ANY) ? "A" : - (pcfg.pcfg_flags == SOCKNAL_CONN_CONTROL) ? "C" : - (pcfg.pcfg_flags == SOCKNAL_CONN_BULK_IN) ? "I" : - (pcfg.pcfg_flags == SOCKNAL_CONN_BULK_OUT) ? "O" : "?", - pcfg.pcfg_count, /* tx buffer size */ - pcfg.pcfg_size, /* rx buffer size */ - pcfg.pcfg_wait ? "nagle" : "nonagle"); - else if (g_nal_is_compatible (NULL, RANAL, 0)) - printf ("[%d]"LPX64"\n", - pcfg.pcfg_id, /* device id */ - pcfg.pcfg_nid); - else - printf (LPX64"\n", - pcfg.pcfg_nid); + if (g_net_is_compatible (NULL, SOCKLND, 0)) { + id.nid = data.ioc_nid; + id.pid = data.ioc_u32[6]; + printf ("%-20s %s[%d]%s->%s:%d %d/%d %s\n", + libcfs_id2str(id), + (data.ioc_u32[3] == SOCKLND_CONN_ANY) ? "A" : + (data.ioc_u32[3] == SOCKLND_CONN_CONTROL) ? "C" : + (data.ioc_u32[3] == SOCKLND_CONN_BULK_IN) ? "I" : + (data.ioc_u32[3] == SOCKLND_CONN_BULK_OUT) ? "O" : "?", + data.ioc_u32[4], /* scheduler */ + ptl_ipaddr_2_str (data.ioc_u32[2], buffer[0], 1), /* local IP addr */ + ptl_ipaddr_2_str (data.ioc_u32[0], buffer[1], 1), /* remote IP addr */ + data.ioc_u32[1], /* remote port */ + data.ioc_count, /* tx buffer size */ + data.ioc_u32[5], /* rx buffer size */ + data.ioc_flags ? 
"nagle" : "nonagle"); + } else if (g_net_is_compatible (NULL, RALND, 0)) { + printf ("%-20s [%d]\n", + libcfs_nid2str(data.ioc_nid), + data.ioc_u32[0] /* device id */); + } else { + printf ("%s\n", libcfs_nid2str(data.ioc_nid)); + } } if (index == 0) { @@ -917,154 +830,10 @@ jt_ptl_print_connections (int argc, char **argv) return 0; } -int jt_ptl_connect(int argc, char **argv) -{ -#ifndef HAVE_CONNECT - /* no connect() support */ - return -1; -#else /* HAVE_CONNECT */ - struct portals_cfg pcfg; - struct sockaddr_in srvaddr; - struct sockaddr_in locaddr; - __u32 ipaddr; - char *flag; - int fd, rc; - int type = SOCKNAL_CONN_ANY; - int port, rport; - int o; - - if (argc < 3) { - fprintf(stderr, "usage: %s ip port [type]\n", argv[0]); - return 0; - } - - if (!g_nal_is_compatible (argv[0], SOCKNAL, 0)) - return -1; - - rc = ptl_parse_ipaddr (&ipaddr, argv[1]); - if (rc != 0) { - fprintf(stderr, "Can't parse hostname: %s\n", argv[1]); - return -1; - } - - if (ptl_parse_port (&port, argv[2]) != 0) { - fprintf (stderr, "Can't parse port: %s\n", argv[2]); - return -1; - } - - if (argc > 3) - for (flag = argv[3]; *flag != 0; flag++) - switch (*flag) - { - case 'I': - if (type != SOCKNAL_CONN_ANY) { - fprintf(stderr, "Can't flag type twice\n"); - return -1; - } - type = SOCKNAL_CONN_BULK_IN; - break; - - case 'O': - if (type != SOCKNAL_CONN_ANY) { - fprintf(stderr, "Can't flag type twice\n"); - return -1; - } - type = SOCKNAL_CONN_BULK_OUT; - break; - - case 'C': - if (type != SOCKNAL_CONN_ANY) { - fprintf(stderr, "Can't flag type twice\n"); - return -1; - } - type = SOCKNAL_CONN_CONTROL; - break; - - default: - fprintf (stderr, "unrecognised flag '%c'\n", - *flag); - return (-1); - } - - memset(&locaddr, 0, sizeof(locaddr)); - locaddr.sin_family = AF_INET; - locaddr.sin_addr.s_addr = INADDR_ANY; - - memset(&srvaddr, 0, sizeof(srvaddr)); - srvaddr.sin_family = AF_INET; - srvaddr.sin_port = htons(port); - srvaddr.sin_addr.s_addr = htonl(ipaddr); - - - for (rport = IPPORT_RESERVED - 1; rport > IPPORT_RESERVED / 2; --rport) { - fd = socket(PF_INET, SOCK_STREAM, 0); - if ( fd < 0 ) { - fprintf(stderr, "socket() failed: %s\n", strerror(errno)); - return -1; - } - - o = 1; - rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, - &o, sizeof(o)); - - locaddr.sin_port = htons(rport); - rc = bind(fd, (struct sockaddr *)&locaddr, sizeof(locaddr)); - if (rc == 0 || errno == EACCES) { - rc = connect(fd, (struct sockaddr *)&srvaddr, sizeof(srvaddr)); - if (rc == 0) { - break; - } else if (errno != EADDRINUSE) { - fprintf(stderr, "Error connecting to host: %s\n", strerror(errno)); - close(fd); - return -1; - } - } else if (errno != EADDRINUSE) { - fprintf(stderr, "Error binding to port %d: %d: %s\n", port, errno, strerror(errno)); - close(fd); - return -1; - } - } - - if (rport == IPPORT_RESERVED / 2) { - fprintf(stderr, - "Warning: all privileged ports are in use.\n"); - return -1; - } - - printf("Connected host: %s type: %s\n", - argv[1], - (type == SOCKNAL_CONN_ANY) ? "A" : - (type == SOCKNAL_CONN_CONTROL) ? "C" : - (type == SOCKNAL_CONN_BULK_IN) ? "I" : - (type == SOCKNAL_CONN_BULK_OUT) ? 
"O" : "?"); - - PCFG_INIT(pcfg, NAL_CMD_REGISTER_PEER_FD); - pcfg.pcfg_nal = g_nal; - pcfg.pcfg_fd = fd; - pcfg.pcfg_misc = type; - - rc = pcfg_ioctl(&pcfg); - if (rc) { - fprintf(stderr, "failed to register fd with portals: %s\n", - strerror(errno)); - close (fd); - return -1; - } - - printf("Connection to %s registered with socknal\n", argv[1]); - - rc = close(fd); - if (rc) - fprintf(stderr, "close failed: %d\n", rc); - - return 0; -#endif /* HAVE_CONNECT */ -} - int jt_ptl_disconnect(int argc, char **argv) { - struct portals_cfg pcfg; - ptl_nid_t nid = PTL_NID_ANY; + struct libcfs_ioctl_data data; + lnet_nid_t nid = LNET_NID_ANY; __u32 ipaddr = 0; int rc; @@ -1073,29 +842,30 @@ int jt_ptl_disconnect(int argc, char **argv) return 0; } - if (!g_nal_is_compatible (NULL, SOCKNAL, RANAL, - OPENIBNAL, IIBNAL, VIBNAL, 0)) + if (!g_net_is_compatible (NULL, SOCKLND, RALND, MXLND, + OPENIBLND, CIBLND, IIBLND, VIBLND, O2IBLND, 0)) return 0; if (argc >= 2 && - ptl_parse_anynid (&nid, argv[1]) != 0) { + !libcfs_str2anynid(&nid, argv[1])) { fprintf (stderr, "Can't parse nid %s\n", argv[1]); return -1; } - if (g_nal_is_compatible (NULL, SOCKNAL, 0) && + if (g_net_is_compatible (NULL, SOCKLND, 0) && argc >= 3 && - ptl_parse_ipaddr (&ipaddr, argv[2]) != 0) { + lnet_parse_ipaddr (&ipaddr, argv[2]) != 0) { fprintf (stderr, "Can't parse ip addr %s\n", argv[2]); return -1; } - PCFG_INIT(pcfg, NAL_CMD_CLOSE_CONNECTION); - pcfg.pcfg_nid = nid; - pcfg.pcfg_id = ipaddr; + LIBCFS_IOC_INIT(data); + data.ioc_net = g_net; + data.ioc_nid = nid; + data.ioc_u32[0] = ipaddr; - rc = pcfg_ioctl(&pcfg); - if (rc) { + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_CLOSE_CONNECTION, &data); + if (rc != 0) { fprintf(stderr, "failed to remove connection: %s\n", strerror(errno)); return -1; @@ -1106,36 +876,30 @@ int jt_ptl_disconnect(int argc, char **argv) int jt_ptl_push_connection (int argc, char **argv) { - struct portals_cfg pcfg; + struct libcfs_ioctl_data data; int rc; - ptl_nid_t nid = PTL_NID_ANY; - __u32 ipaddr = 0; + lnet_nid_t nid = LNET_NID_ANY; - if (argc > 3) { - fprintf(stderr, "usage: %s [nid] [ip]\n", argv[0]); + if (argc > 2) { + fprintf(stderr, "usage: %s [nid]\n", argv[0]); return 0; } - if (!g_nal_is_compatible (argv[0], SOCKNAL, 0)) + if (!g_net_is_compatible (argv[0], SOCKLND, 0)) return -1; if (argc > 1 && - ptl_parse_anynid (&nid, argv[1]) != 0) { + !libcfs_str2anynid(&nid, argv[1])) { fprintf(stderr, "Can't parse nid: %s\n", argv[1]); return -1; } - if (argc > 2 && - ptl_parse_ipaddr (&ipaddr, argv[2]) != 0) { - fprintf(stderr, "Can't parse ipaddr: %s\n", argv[2]); - } - - PCFG_INIT(pcfg, NAL_CMD_PUSH_CONNECTION); - pcfg.pcfg_nid = nid; - pcfg.pcfg_id = ipaddr; + LIBCFS_IOC_INIT(data); + data.ioc_net = g_net; + data.ioc_nid = nid; - rc = pcfg_ioctl(&pcfg); - if (rc) { + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_PUSH_CONNECTION, &data); + if (rc != 0) { fprintf(stderr, "failed to push connection: %s\n", strerror(errno)); return -1; @@ -1147,33 +911,32 @@ int jt_ptl_push_connection (int argc, char **argv) int jt_ptl_print_active_txs (int argc, char **argv) { - struct portals_cfg pcfg; + struct libcfs_ioctl_data data; int index; int rc; - if (!g_nal_is_compatible (argv[0], QSWNAL, 0)) + if (!g_net_is_compatible (argv[0], QSWLND, 0)) return -1; for (index = 0;;index++) { - PCFG_INIT(pcfg, NAL_CMD_GET_TXDESC); - pcfg.pcfg_count = index; - - rc = pcfg_ioctl(&pcfg); + LIBCFS_IOC_INIT(data); + data.ioc_net = g_net; + data.ioc_count = index; + + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_TXDESC, &data); if (rc != 0) break; - 
printf ("%5s payload %6d bytes to "LPX64" via "LPX64" by pid %6d: %s, %s, state %d\n", - pcfg.pcfg_count == PTL_MSG_ACK ? "ACK" : - pcfg.pcfg_count == PTL_MSG_PUT ? "PUT" : - pcfg.pcfg_count == PTL_MSG_GET ? "GET" : - pcfg.pcfg_count == PTL_MSG_REPLY ? "REPLY" : "", - pcfg.pcfg_size, - pcfg.pcfg_nid, - pcfg.pcfg_nid2, - pcfg.pcfg_misc, - (pcfg.pcfg_flags & 1) ? "delayed" : "immediate", - (pcfg.pcfg_flags & 2) ? "nblk" : "normal", - pcfg.pcfg_flags >> 2); + printf ("type %u payload %6d to %s via %s by pid %6d: " + "%s, %s, state %d\n", + data.ioc_u32[0], + data.ioc_count, + libcfs_nid2str(data.ioc_nid), + libcfs_nid2str(data.ioc_u64[0]), + data.ioc_u32[1], + (data.ioc_flags & 1) ? "delayed" : "immediate", + (data.ioc_flags & 2) ? "nblk" : "normal", + data.ioc_flags >> 2); } if (index == 0) { @@ -1188,25 +951,22 @@ jt_ptl_print_active_txs (int argc, char **argv) return 0; } -int jt_ptl_ping(int argc, char **argv) +int jt_ptl_ping_test(int argc, char **argv) { int rc; - ptl_nid_t nid; + lnet_nid_t nid; long count = 1; long size = 4; long timeout = 1; - struct portal_ioctl_data data; + struct libcfs_ioctl_data data; if (argc < 2) { fprintf(stderr, "usage: %s nid [count] [size] [timeout (secs)]\n", argv[0]); return 0; } - if (!g_nal_is_set()) - return -1; - - if (ptl_parse_nid (&nid, argv[1]) != 0) - { + nid = libcfs_str2nid(argv[1]); + if (nid == LNET_NID_ANY) { fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]); return (-1); } @@ -1228,14 +988,13 @@ int jt_ptl_ping(int argc, char **argv) if (argc > 4) timeout = atol (argv[4]); - PORTAL_IOC_INIT (data); + LIBCFS_IOC_INIT (data); data.ioc_count = count; - data.ioc_size = size; data.ioc_nid = nid; - data.ioc_nal = g_nal; - data.ioc_timeout = timeout; + data.ioc_u32[0] = size; + data.ioc_u32[1] = timeout; - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_PING, &data); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_PING_TEST, &data); if (rc) { fprintf(stderr, "failed to start pinger: %s\n", strerror(errno)); @@ -1244,73 +1003,90 @@ int jt_ptl_ping(int argc, char **argv) return 0; } -int jt_ptl_shownid(int argc, char **argv) +int jt_ptl_ping(int argc, char **argv) { - struct portal_ioctl_data data; int rc; - - if (argc > 1) { - fprintf(stderr, "usage: %s\n", argv[0]); + int timeout; + lnet_process_id_t id; + lnet_process_id_t ids[16]; + int maxids = sizeof(ids)/sizeof(ids[0]); + struct libcfs_ioctl_data data; + int i; + + if (argc < 2) { + fprintf(stderr, "usage: %s nid [timeout (secs)] [pid]\n", argv[0]); return 0; } - - if (!g_nal_is_set()) + + id.nid = libcfs_str2nid(argv[1]); + if (id.nid == LNET_NID_ANY) { + fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]); return -1; + } - PORTAL_IOC_INIT (data); - data.ioc_nal = g_nal; - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data); - if (rc < 0) - fprintf(stderr, "getting my NID failed: %s\n", - strerror (errno)); + if (argc > 2) + timeout = 1000 * atol(argv[2]); else - printf(LPX64"\n", data.ioc_nid); + timeout = 1000; /* default 1 second timeout */ + + if (argc > 3) + id.pid = atol(argv[3]); + else + id.pid = LNET_PID_ANY; + + LIBCFS_IOC_INIT (data); + data.ioc_nid = id.nid; + data.ioc_u32[0] = id.pid; + data.ioc_u32[1] = timeout; + data.ioc_plen1 = sizeof(ids); + data.ioc_pbuf1 = (char *)ids; + + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_PING, &data); + if (rc != 0) { + fprintf(stderr, "failed to ping %s: %s\n", + id.pid == LNET_PID_ANY ? 
+ libcfs_nid2str(id.nid) : libcfs_id2str(id), + strerror(errno)); + return -1; + } + + for (i = 0; i < data.ioc_count && i < maxids; i++) + printf("%s\n", libcfs_id2str(ids[i])); + + if (data.ioc_count > maxids) + printf("%d out of %d ids listed\n", maxids, data.ioc_count); + return 0; } int jt_ptl_mynid(int argc, char **argv) { + struct libcfs_ioctl_data data; + lnet_nid_t nid; int rc; - char hostname[1024]; - char *nidstr; - struct portals_cfg pcfg; - ptl_nid_t mynid; - if (argc > 2) { - fprintf(stderr, "usage: %s [NID]\n", argv[0]); - fprintf(stderr, "NID defaults to the primary IP address of the machine.\n"); + if (argc != 2) { + fprintf(stderr, "usage: %s NID\n", argv[0]); return 0; } - if (!g_nal_is_set()) - return -1; - - if (argc >= 2) - nidstr = argv[1]; - else if (gethostname(hostname, sizeof(hostname)) != 0) { - fprintf(stderr, "gethostname failed: %s\n", - strerror(errno)); + nid = libcfs_str2nid(argv[1]); + if (nid == LNET_NID_ANY) { + fprintf(stderr, "Can't parse NID '%s'\n", argv[1]); return -1; } - else - nidstr = hostname; - rc = ptl_parse_nid (&mynid, nidstr); - if (rc != 0) { - fprintf (stderr, "Can't convert '%s' into a NID\n", nidstr); - return -1; - } - - PCFG_INIT(pcfg, NAL_CMD_REGISTER_MYNID); - pcfg.pcfg_nid = mynid; + LIBCFS_IOC_INIT(data); + data.ioc_net = LNET_NIDNET(nid); + data.ioc_nid = nid; - rc = pcfg_ioctl(&pcfg); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_REGISTER_MYNID, &data); if (rc < 0) fprintf(stderr, "setting my NID failed: %s\n", strerror(errno)); else - printf("registered my nid "LPX64" (%s)\n", - ptl_nid2u64(mynid), hostname); + printf("registered my nid %s\n", libcfs_nid2str(nid)); + return 0; } @@ -1318,42 +1094,36 @@ int jt_ptl_fail_nid (int argc, char **argv) { int rc; - ptl_nid_t nid; + lnet_nid_t nid; unsigned int threshold; - struct portal_ioctl_data data; + struct libcfs_ioctl_data data; if (argc < 2 || argc > 3) { - fprintf (stderr, "usage: %s nid|\"_all_\" [count (0 == mend)]\n", argv[0]); + fprintf (stderr, "usage: %s nid|\"*\" [count (0 == mend)]\n", argv[0]); return (0); } - if (!g_nal_is_set()) - return (-1); - - if (!strcmp (argv[1], "_all_")) - nid = PTL_NID_ANY; - else if (ptl_parse_anynid (&nid, argv[1]) != 0) + if (!libcfs_str2anynid(&nid, argv[1])) { fprintf (stderr, "Can't parse nid \"%s\"\n", argv[1]); return (-1); } - if (argc < 3) - threshold = PTL_MD_THRESH_INF; - else if (sscanf (argv[2], "%i", &threshold) != 1) { + if (argc < 3) { + threshold = LNET_MD_THRESH_INF; + } else if (sscanf (argv[2], "%i", &threshold) != 1) { fprintf (stderr, "Can't parse count \"%s\"\n", argv[2]); return (-1); } - PORTAL_IOC_INIT (data); - data.ioc_nal = g_nal; + LIBCFS_IOC_INIT (data); data.ioc_nid = nid; data.ioc_count = threshold; - rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_FAIL_NID, &data); + rc = l_ioctl (LNET_DEV_ID, IOC_LIBCFS_FAIL_NID, &data); if (rc < 0) - fprintf (stderr, "IOC_PORTAL_FAIL_NID failed: %s\n", + fprintf (stderr, "IOC_LIBCFS_FAIL_NID failed: %s\n", strerror (errno)); else printf ("%s %s\n", threshold == 0 ? 
"Unfailing" : "Failing", argv[1]); @@ -1362,92 +1132,45 @@ jt_ptl_fail_nid (int argc, char **argv) } int -jt_ptl_loopback (int argc, char **argv) -{ - int rc; - int set; - int enable; - struct portal_ioctl_data data; - - if (argc > 2) - { - fprintf (stderr, "usage: %s [on|off]\n", argv[0]); - return (0); - } - - if (!g_nal_is_set()) - return (-1); - - set = argc > 1; - if (set && ptl_parse_bool (&enable, argv[1]) != 0) { - fprintf (stderr, "Can't parse boolean %s\n", argv[1]); - return (-1); - } - - PORTAL_IOC_INIT (data); - data.ioc_nal = g_nal; - data.ioc_flags = enable; - data.ioc_misc = set; - - rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_LOOPBACK, &data); - if (rc < 0) - fprintf (stderr, "IOC_PORTAL_FAIL_NID failed: %s\n", - strerror (errno)); - else - printf ("loopback %s\n", data.ioc_flags ? "enabled" : "disabled"); - - return (0); -} - -int jt_ptl_add_route (int argc, char **argv) { - struct portals_cfg pcfg; - ptl_nid_t nid1; - ptl_nid_t nid2; - ptl_nid_t gateway_nid; + struct libcfs_ioctl_data data; + lnet_nid_t gateway_nid; + unsigned int hops = 1; + char *end; int rc; - if (argc < 3) + if (argc < 2 || argc > 3) { - fprintf (stderr, "usage: %s gateway target [target]\n", argv[0]); + fprintf (stderr, "usage: %s gateway [hopcount]\n", argv[0]); return (0); } - if (!g_nal_is_set()) + if (!g_net_is_set(argv[0])) return (-1); - if (ptl_parse_nid (&gateway_nid, argv[1]) != 0) - { + gateway_nid = libcfs_str2nid(argv[1]); + if (gateway_nid == LNET_NID_ANY) { fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]); return (-1); } - if (ptl_parse_nid (&nid1, argv[2]) != 0) - { - fprintf (stderr, "Can't parse first target NID \"%s\"\n", argv[2]); - return (-1); - } - - if (argc < 4) - nid2 = nid1; - else if (ptl_parse_nid (&nid2, argv[3]) != 0) - { - fprintf (stderr, "Can't parse second target NID \"%s\"\n", argv[4]); - return (-1); + if (argc == 3) { + hops = strtoul(argv[2], &end, 0); + if (hops >= 256 || *end != 0) { + fprintf (stderr, "Can't parse hopcount \"%s\"\n", argv[2]); + return -1; + } } + + LIBCFS_IOC_INIT(data); + data.ioc_net = g_net; + data.ioc_count = hops; + data.ioc_nid = gateway_nid; - PCFG_INIT(pcfg, NAL_CMD_ADD_ROUTE); - pcfg.pcfg_nid = gateway_nid; - pcfg.pcfg_nal = ROUTER; - pcfg.pcfg_gw_nal = g_nal; - pcfg.pcfg_nid2 = MIN (nid1, nid2); - pcfg.pcfg_nid3 = MAX (nid1, nid2); - - rc = pcfg_ioctl(&pcfg); - if (rc != 0) - { - fprintf (stderr, "NAL_CMD_ADD_ROUTE failed: %s\n", strerror (errno)); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_ADD_ROUTE, &data); + if (rc != 0) { + fprintf (stderr, "IOC_LIBCFS_ADD_ROUTE failed: %s\n", strerror (errno)); return (-1); } @@ -1457,62 +1180,29 @@ jt_ptl_add_route (int argc, char **argv) int jt_ptl_del_route (int argc, char **argv) { - struct portals_cfg pcfg; - ptl_nid_t nid; - ptl_nid_t nid1 = PTL_NID_ANY; - ptl_nid_t nid2 = PTL_NID_ANY; + struct libcfs_ioctl_data data; + lnet_nid_t nid; int rc; - if (argc < 2) - { - fprintf (stderr, "usage: %s targetNID\n", argv[0]); + if (argc != 2) { + fprintf (stderr, "usage: %s gatewayNID\n", argv[0]); return (0); } - if (!g_nal_is_set()) - return (-1); - - if (ptl_parse_nid (&nid, argv[1]) != 0) - { - fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]); - return (-1); - } - - if (argc >= 3 && - ptl_parse_nid (&nid1, argv[2]) != 0) - { - fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[2]); - return (-1); + if (!libcfs_str2anynid(&nid, argv[1])) { + fprintf (stderr, "Can't parse gateway NID " + "\"%s\"\n", argv[1]); + return -1; } - if (argc < 4) { - nid2 = nid1; - } else { - 
if (ptl_parse_nid (&nid2, argv[3]) != 0) { - fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[3]); - return (-1); - } + LIBCFS_IOC_INIT(data); + data.ioc_net = g_net_set ? g_net : LNET_NIDNET(LNET_NID_ANY); + data.ioc_nid = nid; - if (nid1 > nid2) { - ptl_nid_t tmp = nid1; - - nid1 = nid2; - nid2 = tmp; - } - } - - PCFG_INIT(pcfg, NAL_CMD_DEL_ROUTE); - pcfg.pcfg_nal = ROUTER; - pcfg.pcfg_gw_nal = g_nal; - pcfg.pcfg_nid = nid; - pcfg.pcfg_nid2 = nid1; - pcfg.pcfg_nid3 = nid2; - - rc = pcfg_ioctl(&pcfg); - if (rc != 0) - { - fprintf (stderr, "NAL_CMD_DEL_ROUTE ("LPX64") failed: %s\n", - ptl_nid2u64(nid), strerror (errno)); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_DEL_ROUTE, &data); + if (rc != 0) { + fprintf (stderr, "IOC_LIBCFS_DEL_ROUTE (%s) failed: %s\n", + libcfs_nid2str(nid), strerror (errno)); return (-1); } @@ -1522,9 +1212,9 @@ jt_ptl_del_route (int argc, char **argv) int jt_ptl_notify_router (int argc, char **argv) { - struct portals_cfg pcfg; + struct libcfs_ioctl_data data; int enable; - ptl_nid_t nid; + lnet_nid_t nid; int rc; struct timeval now; time_t when; @@ -1536,13 +1226,13 @@ jt_ptl_notify_router (int argc, char **argv) return (0); } - if (ptl_parse_nid (&nid, argv[1]) != 0) - { + nid = libcfs_str2nid(argv[1]); + if (nid == LNET_NID_ANY) { fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]); return (-1); } - if (ptl_parse_bool (&enable, argv[2]) != 0) { + if (lnet_parse_bool (&enable, argv[2]) != 0) { fprintf (stderr, "Can't parse boolean %s\n", argv[2]); return (-1); } @@ -1551,7 +1241,7 @@ jt_ptl_notify_router (int argc, char **argv) if (argc < 4) { when = now.tv_sec; - } else if (ptl_parse_time (&when, argv[3]) != 0) { + } else if (lnet_parse_time (&when, argv[3]) != 0) { fprintf(stderr, "Can't parse time %s\n" "Please specify either 'YYYY-MM-DD-HH:MM:SS'\n" "or an absolute unix time in seconds\n", argv[3]); @@ -1562,19 +1252,16 @@ jt_ptl_notify_router (int argc, char **argv) return (-1); } - PCFG_INIT(pcfg, NAL_CMD_NOTIFY_ROUTER); - pcfg.pcfg_nal = ROUTER; - pcfg.pcfg_gw_nal = g_nal; - pcfg.pcfg_nid = nid; - pcfg.pcfg_flags = enable; + LIBCFS_IOC_INIT(data); + data.ioc_nid = nid; + data.ioc_flags = enable; /* Yeuch; 'cept I need a __u64 on 64 bit machines... 
*/ - pcfg.pcfg_nid3 = (__u64)when; + data.ioc_u64[0] = (__u64)when; - rc = pcfg_ioctl(&pcfg); - if (rc != 0) - { - fprintf (stderr, "NAL_CMD_NOTIFY_ROUTER ("LPX64") failed: %s\n", - ptl_nid2u64(nid), strerror (errno)); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_NOTIFY_ROUTER, &data); + if (rc != 0) { + fprintf (stderr, "IOC_LIBCFS_NOTIFY_ROUTER (%s) failed: %s\n", + libcfs_nid2str(nid), strerror (errno)); return (-1); } @@ -1584,105 +1271,96 @@ jt_ptl_notify_router (int argc, char **argv) int jt_ptl_print_routes (int argc, char **argv) { - char buffer[3][128]; - struct portals_cfg pcfg; + struct libcfs_ioctl_data data; int rc; int index; - int gateway_nal; - ptl_nid_t gateway_nid; - ptl_nid_t nid1; - ptl_nid_t nid2; + __u32 net; + lnet_nid_t nid; + unsigned int hops; int alive; for (index = 0;;index++) { - PCFG_INIT(pcfg, NAL_CMD_GET_ROUTE); - pcfg.pcfg_nal = ROUTER; - pcfg.pcfg_count = index; + LIBCFS_IOC_INIT(data); + data.ioc_count = index; - rc = pcfg_ioctl(&pcfg); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_ROUTE, &data); if (rc != 0) break; - gateway_nal = pcfg.pcfg_gw_nal; - gateway_nid = pcfg.pcfg_nid; - nid1 = pcfg.pcfg_nid2; - nid2 = pcfg.pcfg_nid3; - alive = pcfg.pcfg_flags; + net = data.ioc_net; + hops = data.ioc_count; + nid = data.ioc_nid; + alive = data.ioc_flags; - printf ("%8s %18s : %s - %s, %s\n", - nal2name (gateway_nal), - ptl_nid2str (buffer[0], gateway_nid), - ptl_nid2str (buffer[1], nid1), - ptl_nid2str (buffer[2], nid2), - alive ? "up" : "down"); + printf ("net %18s hops %u gw %32s %s\n", + libcfs_net2str(net), hops, + libcfs_nid2str(nid), alive ? "up" : "down"); } - if (index == 0 && errno != ENOENT) { + if (errno != ENOENT) fprintf(stderr, "Error getting routes: %s: check dmesg.\n", strerror(errno)); - } + return (0); } static int lwt_control(int enable, int clear) { - struct portal_ioctl_data data; + struct libcfs_ioctl_data data; int rc; - PORTAL_IOC_INIT(data); - data.ioc_flags = enable; - data.ioc_misc = clear; + LIBCFS_IOC_INIT(data); + data.ioc_flags = (enable ? 1 : 0) | (clear ? 
2 : 0); - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_CONTROL, &data); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_LWT_CONTROL, &data); if (rc == 0) return (0); - fprintf(stderr, "IOC_PORTAL_LWT_CONTROL failed: %s\n", + fprintf(stderr, "IOC_LIBCFS_LWT_CONTROL failed: %s\n", strerror(errno)); return (-1); } static int -lwt_snapshot(cycles_t *now, int *ncpu, int *totalsize, +lwt_snapshot(cycles_t *now, int *ncpu, int *totalsize, lwt_event_t *events, int size) { - struct portal_ioctl_data data; + struct libcfs_ioctl_data data; int rc; - PORTAL_IOC_INIT(data); + LIBCFS_IOC_INIT(data); data.ioc_pbuf1 = (char *)events; data.ioc_plen1 = size; - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_SNAPSHOT, &data); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_LWT_SNAPSHOT, &data); if (rc != 0) { - fprintf(stderr, "IOC_PORTAL_LWT_SNAPSHOT failed: %s\n", + fprintf(stderr, "IOC_LIBCFS_LWT_SNAPSHOT failed: %s\n", strerror(errno)); return (-1); } /* crappy overloads */ - if (data.ioc_nid2 != sizeof(lwt_event_t) || - data.ioc_nid3 != offsetof(lwt_event_t, lwte_where)) { + if (data.ioc_u32[2] != sizeof(lwt_event_t) || + data.ioc_u32[3] != offsetof(lwt_event_t, lwte_where)) { fprintf(stderr,"kernel/user LWT event mismatch %d(%d),%d(%d)\n", - (int)data.ioc_nid2, (int)sizeof(lwt_event_t), - (int)data.ioc_nid3, + (int)data.ioc_u32[2], (int)sizeof(lwt_event_t), + (int)data.ioc_u32[3], (int)offsetof(lwt_event_t, lwte_where)); return (-1); } - LASSERT (data.ioc_count != 0); - LASSERT (data.ioc_misc != 0); - if (now != NULL) - *now = data.ioc_nid; + *now = data.ioc_u64[0]; + LASSERT (data.ioc_u32[0] != 0); if (ncpu != NULL) - *ncpu = data.ioc_count; + *ncpu = data.ioc_u32[0]; + LASSERT (data.ioc_u32[1] != 0); if (totalsize != NULL) - *totalsize = data.ioc_misc; + *totalsize = data.ioc_u32[1]; return (0); } @@ -1691,22 +1369,22 @@ static char * lwt_get_string(char *kstr) { char *ustr; - struct portal_ioctl_data data; + struct libcfs_ioctl_data data; int size; int rc; /* FIXME: this could maintain a symbol table since we expect to be * looking up the same strings all the time... */ - PORTAL_IOC_INIT(data); + LIBCFS_IOC_INIT(data); data.ioc_pbuf1 = kstr; data.ioc_plen1 = 1; /* non-zero just to fool portal_ioctl_is_invalid() */ data.ioc_pbuf2 = NULL; data.ioc_plen2 = 0; - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_LOOKUP_STRING, &data); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_LWT_LOOKUP_STRING, &data); if (rc != 0) { - fprintf(stderr, "IOC_PORTAL_LWT_LOOKUP_STRING failed: %s\n", + fprintf(stderr, "IOC_LIBCFS_LWT_LOOKUP_STRING failed: %s\n", strerror(errno)); return (NULL); } @@ -1719,15 +1397,15 @@ lwt_get_string(char *kstr) return (NULL); } - PORTAL_IOC_INIT(data); + LIBCFS_IOC_INIT(data); data.ioc_pbuf1 = kstr; data.ioc_plen1 = 1; /* non-zero just to fool portal_ioctl_is_invalid() */ data.ioc_pbuf2 = ustr; data.ioc_plen2 = size; - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_LOOKUP_STRING, &data); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_LWT_LOOKUP_STRING, &data); if (rc != 0) { - fprintf(stderr, "IOC_PORTAL_LWT_LOOKUP_STRING failed: %s\n", + fprintf(stderr, "IOC_LIBCFS_LWT_LOOKUP_STRING failed: %s\n", strerror(errno)); return (NULL); } @@ -1994,7 +1672,7 @@ int jt_ptl_memhog(int argc, char **argv) { static int gfp = 0; /* sticky! 
*/ - struct portal_ioctl_data data; + struct libcfs_ioctl_data data; int rc; int count; char *end; @@ -2019,10 +1697,10 @@ int jt_ptl_memhog(int argc, char **argv) gfp = rc; } - PORTAL_IOC_INIT(data); + LIBCFS_IOC_INIT(data); data.ioc_count = count; data.ioc_flags = gfp; - rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_MEMHOG, &data); + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_MEMHOG, &data); if (rc != 0) { fprintf(stderr, "memhog %d failed: %s\n", count, strerror(errno)); @@ -2033,3 +1711,36 @@ int jt_ptl_memhog(int argc, char **argv) return 0; } +int jt_ptl_testprotocompat(int argc, char **argv) +{ + struct libcfs_ioctl_data data; + int rc; + int flags; + char *end; + + if (argc < 2) { + fprintf(stderr, "usage: %s \n", argv[0]); + return 0; + } + + flags = strtol(argv[1], &end, 0); + if (flags < 0 || *end != 0) { + fprintf(stderr, "Can't parse flags '%s'\n", argv[1]); + return -1; + } + + LIBCFS_IOC_INIT(data); + data.ioc_flags = flags; + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_TESTPROTOCOMPAT, &data); + + if (rc != 0) { + fprintf(stderr, "test proto compat %x failed: %s\n", + flags, strerror(errno)); + return -1; + } + + printf("test proto compat %x OK\n", flags); + return 0; +} + + diff --git a/lnet/utils/ptlctl.c b/lnet/utils/ptlctl.c index 3089211..c3ab2b7 100644 --- a/lnet/utils/ptlctl.c +++ b/lnet/utils/ptlctl.c @@ -22,14 +22,17 @@ #include #include -#include -#include +#include +#include #include "parser.h" command_t list[] = { - {"network", jt_ptl_network, 0,"setup the NAL (args: nal name)"}, + {"network", jt_ptl_network, 0,"select/configure network (args: up|down|LND name)"}, + {"net", jt_ptl_network, 0,"select/configure network (args: up|down|LND name)"}, + {"list_nids", jt_ptl_list_nids, 0,"list local NIDs"}, + {"which_nid", jt_ptl_which_nid, 0,"select the closest NID"}, {"print_interfaces", jt_ptl_print_interfaces, 0, "print interface entries (no args)"}, {"add_interface", jt_ptl_add_interface, 0, "add interface entry (args: ip [netmask])"}, {"del_interface", jt_ptl_del_interface, 0, "delete interface entries (args: [ip])"}, @@ -37,12 +40,11 @@ command_t list[] = { {"add_peer", jt_ptl_add_peer, 0, "add peer entry (args: nid host port)"}, {"del_peer", jt_ptl_del_peer, 0, "delete peer entry (args: [nid] [host])"}, {"print_conns", jt_ptl_print_connections, 0, "print connections (no args)"}, - {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: host port [iIOC])"}, {"disconnect", jt_ptl_disconnect, 0, "disconnect from a remote nid (args: [nid] [host]"}, {"push", jt_ptl_push_connection, 0, "flush connection to a remote nid (args: [nid]"}, {"active_tx", jt_ptl_print_active_txs, 0, "print active transmits (no args)"}, - {"ping", jt_ptl_ping, 0, "do a ping test (args: nid [count] [size] [timeout])"}, - {"shownid", jt_ptl_shownid, 0, "print the local NID"}, + {"testping", jt_ptl_ping_test, 0, "do a ping test (args: nid [count] [size] [timeout])"}, + {"ping", jt_ptl_ping, 0, "ping (args: nid [timeout] [pid])"}, {"mynid", jt_ptl_mynid, 0, "inform the socknal of the local NID (args: [hostname])"}, {"add_route", jt_ptl_add_route, 0, "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"}, @@ -53,7 +55,7 @@ command_t list[] = { {"print_routes", jt_ptl_print_routes, 0, "print the routing table (args: none)"}, {"dump", jt_ioc_dump, 0, "usage: dump file, save ioctl buffer to file"}, {"fail", jt_ptl_fail_nid, 0, "usage: fail nid|_all_ [count]"}, - {"loopback", jt_ptl_loopback, 0, "usage: loopback [on|off]"}, + {"testprotocompat", jt_ptl_testprotocompat, 0, "usage: 
testprotocompat count"}, {"help", Parser_help, 0, "help"}, {"exit", Parser_quit, 0, "quit"}, {"quit", Parser_quit, 0, "quit"}, diff --git a/lnet/utils/routerstat.c b/lnet/utils/routerstat.c index 99bc59b..febe89a 100644 --- a/lnet/utils/routerstat.c +++ b/lnet/utils/routerstat.c @@ -16,20 +16,56 @@ timenow () return (tv.tv_sec + tv.tv_usec / 1000000.0); } +typedef struct { + unsigned long msgs_alloc; + unsigned long msgs_max; + unsigned long errors; + unsigned long send_count; + unsigned long recv_count; + unsigned long route_count; + unsigned long drop_count; + unsigned long long send_length; + unsigned long long recv_length; + unsigned long long route_length; + unsigned long long drop_length; +} counters_t; + +unsigned long long subull(unsigned long long a, unsigned long long b) +{ + if (a < b) + return -1ULL - b + a + 1; + + return a - b; +} + +unsigned long long subul(unsigned long a, unsigned long b) +{ + if (a < b) + return -1UL - b + a + 1; + + return a - b; +} + +double rul(unsigned long a, double secs) +{ + return (double)a/secs; +} + +double rull(unsigned long long a, double secs) +{ + return (double)a/secs; +} + void do_stat (int fd) { static char buffer[1024]; static double last = 0.0; - static unsigned long long old_bytes; - static unsigned long old_packets; - static unsigned long old_errors; + static counters_t old_counter; double now; double t; - unsigned long long new_bytes, bytes; - unsigned long new_packets, packets; - unsigned long new_errors, errors; - unsigned long depth; + counters_t new_counter; + counters_t counter; int n; lseek (fd, 0, SEEK_SET); @@ -42,51 +78,53 @@ do_stat (int fd) } buffer[n] = 0; - n = sscanf (buffer, "%Lu %lu %lu %lu", - &new_bytes, &new_packets, &new_errors, &depth); - - if (n < 3) + n = sscanf (buffer, "%u %u %u %u %u %u %u %Lu %Lu %Lu %Lu", + &new_counter.msgs_alloc, &new_counter.msgs_max, + &new_counter.errors, + &new_counter.send_count, &new_counter.recv_count, + &new_counter.route_count, &new_counter.drop_count, + &new_counter.send_length, &new_counter.recv_length, + &new_counter.route_length, &new_counter.drop_length); + if (n < 11) { fprintf (stderr, "Can't parse statfile\n"); exit (1); } - if (last == 0.0) - printf ("%llu bytes, %lu packets (sz %lld), %lu errors", - new_bytes, new_packets, - (long long)((new_packets == 0) ? 0LL : new_bytes/new_packets), - new_errors); - else - { - t = now - last; + if (last == 0.0) { + printf ("M %lu(%lu) E %lu S %lu/%llu R %lu/%llu F %lu/%llu D %lu/%llu\n", + new_counter.msgs_alloc, new_counter.msgs_max, + new_counter.errors, + new_counter.send_count, new_counter.send_length, + new_counter.recv_count, new_counter.recv_length, + new_counter.route_count, new_counter.route_length, + new_counter.drop_count, new_counter.drop_length); + } else { + t = now - last; - if (new_bytes < old_bytes) - bytes = -1ULL - old_bytes + new_bytes + 1; - else - bytes = new_bytes - old_bytes; - if (new_packets < old_packets) - packets = -1UL - old_packets + new_packets + 1; - else - packets = new_packets - old_packets; - if (new_errors < old_errors) - errors = -1UL - old_errors + new_errors + 1; - else - errors = new_errors - old_errors; - - printf ("%9llu bytes (%7.2fMb/s), %7lu packets (sz %5lld, %5ld/s), %lu errors (%ld/s)", - bytes, ((double)bytes)/((1<<20) * t), - packets, (long long)((packets == 0) ? 
0LL : bytes/packets), (long)(packets/t), - errors, (long)(errors/t)); - } - old_bytes = new_bytes; - old_packets = new_packets; - old_errors = new_errors; + counter.msgs_alloc = new_counter.msgs_alloc; + counter.msgs_max = new_counter.msgs_max; + + counter.errors = subul(new_counter.errors, old_counter.errors); + counter.send_count = subul(new_counter.send_count, old_counter.send_count); + counter.recv_count = subul(new_counter.recv_count, old_counter.recv_count); + counter.route_count = subul(new_counter.route_count, old_counter.route_count); + counter.drop_count = subul(new_counter.drop_count, old_counter.drop_count); + counter.send_length = subull(new_counter.send_length, old_counter.send_length); + counter.recv_length = subull(new_counter.recv_length, old_counter.recv_length); + counter.route_length = subull(new_counter.route_length, old_counter.route_length); + counter.drop_length = subull(new_counter.drop_length, old_counter.drop_length); - if (n == 4) - printf (", depth (%ld)\n", depth); - else - printf ("\n"); + printf ("M %3lu(%3lu) E %0.0f S %7.2f/%6.0f R %7.2f/%6.0f F %7.2f/%6.0f D %4.2f/%0.0f\n", + counter.msgs_alloc, counter.msgs_max, + rul(counter.errors,t), + rull(counter.send_length,t*1024.0*1024.0), rul(counter.send_count, t), + rull(counter.recv_length,t*1024.0*1024.0), rul(counter.recv_count, t), + rull(counter.route_length,t*1024.0*1024.0), rul(counter.route_count, t), + rull(counter.drop_length,t*1024.0*1024.0), rul(counter.drop_count, t)); + } + old_counter = new_counter; fflush (stdout); lseek (fd, 0, SEEK_SET); @@ -101,7 +139,7 @@ int main (int argc, char **argv) if (argc > 1) interval = atoi (argv[1]); - fd = open ("/proc/sys/portals/router", O_RDONLY); + fd = open ("/proc/sys/lnet/stats", O_RDONLY); if (fd < 0) { fprintf (stderr, "Can't open stat: %s\n", strerror (errno)); diff --git a/lnet/utils/wirecheck.c b/lnet/utils/wirecheck.c index 986d081..9590b8b 100644 --- a/lnet/utils/wirecheck.c +++ b/lnet/utils/wirecheck.c @@ -4,8 +4,7 @@ #include #include #include -#include -#include +#include #include @@ -25,14 +24,14 @@ do { \ #define STRINGIFY(a) #a -#define CHECK_DEFINE(a) \ -do { \ - printf (" LASSERT ("#a" == "STRINGIFY(a)");\n"); \ +#define CHECK_DEFINE(a) \ +do { \ + printf (" CLASSERT ("#a" == "STRINGIFY(a)");\n"); \ } while (0) #define CHECK_VALUE(a) \ do { \ - printf (" LASSERT ("#a" == %d);\n", a); \ + printf (" CLASSERT ("#a" == %d);\n", a); \ } while (0) #define CHECK_MEMBER_OFFSET(s,m) \ @@ -59,64 +58,64 @@ do { \ } while (0) void -check_ptl_handle_wire (void) +check_lnet_handle_wire (void) { - CHECK_STRUCT (ptl_handle_wire_t); - CHECK_MEMBER (ptl_handle_wire_t, wh_interface_cookie); - CHECK_MEMBER (ptl_handle_wire_t, wh_object_cookie); + CHECK_STRUCT (lnet_handle_wire_t); + CHECK_MEMBER (lnet_handle_wire_t, wh_interface_cookie); + CHECK_MEMBER (lnet_handle_wire_t, wh_object_cookie); } void -check_ptl_magicversion (void) +check_lnet_magicversion (void) { - CHECK_STRUCT (ptl_magicversion_t); - CHECK_MEMBER (ptl_magicversion_t, magic); - CHECK_MEMBER (ptl_magicversion_t, version_major); - CHECK_MEMBER (ptl_magicversion_t, version_minor); + CHECK_STRUCT (lnet_magicversion_t); + CHECK_MEMBER (lnet_magicversion_t, magic); + CHECK_MEMBER (lnet_magicversion_t, version_major); + CHECK_MEMBER (lnet_magicversion_t, version_minor); } void -check_ptl_hdr (void) +check_lnet_hdr (void) { - CHECK_STRUCT (ptl_hdr_t); - CHECK_MEMBER (ptl_hdr_t, dest_nid); - CHECK_MEMBER (ptl_hdr_t, src_nid); - CHECK_MEMBER (ptl_hdr_t, dest_pid); - CHECK_MEMBER (ptl_hdr_t, src_pid); - 
CHECK_MEMBER (ptl_hdr_t, type); - CHECK_MEMBER (ptl_hdr_t, payload_length); - CHECK_MEMBER (ptl_hdr_t, msg); + CHECK_STRUCT (lnet_hdr_t); + CHECK_MEMBER (lnet_hdr_t, dest_nid); + CHECK_MEMBER (lnet_hdr_t, src_nid); + CHECK_MEMBER (lnet_hdr_t, dest_pid); + CHECK_MEMBER (lnet_hdr_t, src_pid); + CHECK_MEMBER (lnet_hdr_t, type); + CHECK_MEMBER (lnet_hdr_t, payload_length); + CHECK_MEMBER (lnet_hdr_t, msg); BLANK_LINE (); COMMENT ("Ack"); - CHECK_MEMBER (ptl_hdr_t, msg.ack.dst_wmd); - CHECK_MEMBER (ptl_hdr_t, msg.ack.match_bits); - CHECK_MEMBER (ptl_hdr_t, msg.ack.mlength); + CHECK_MEMBER (lnet_hdr_t, msg.ack.dst_wmd); + CHECK_MEMBER (lnet_hdr_t, msg.ack.match_bits); + CHECK_MEMBER (lnet_hdr_t, msg.ack.mlength); BLANK_LINE (); COMMENT ("Put"); - CHECK_MEMBER (ptl_hdr_t, msg.put.ack_wmd); - CHECK_MEMBER (ptl_hdr_t, msg.put.match_bits); - CHECK_MEMBER (ptl_hdr_t, msg.put.hdr_data); - CHECK_MEMBER (ptl_hdr_t, msg.put.ptl_index); - CHECK_MEMBER (ptl_hdr_t, msg.put.offset); + CHECK_MEMBER (lnet_hdr_t, msg.put.ack_wmd); + CHECK_MEMBER (lnet_hdr_t, msg.put.match_bits); + CHECK_MEMBER (lnet_hdr_t, msg.put.hdr_data); + CHECK_MEMBER (lnet_hdr_t, msg.put.ptl_index); + CHECK_MEMBER (lnet_hdr_t, msg.put.offset); BLANK_LINE (); COMMENT ("Get"); - CHECK_MEMBER (ptl_hdr_t, msg.get.return_wmd); - CHECK_MEMBER (ptl_hdr_t, msg.get.match_bits); - CHECK_MEMBER (ptl_hdr_t, msg.get.ptl_index); - CHECK_MEMBER (ptl_hdr_t, msg.get.src_offset); - CHECK_MEMBER (ptl_hdr_t, msg.get.sink_length); + CHECK_MEMBER (lnet_hdr_t, msg.get.return_wmd); + CHECK_MEMBER (lnet_hdr_t, msg.get.match_bits); + CHECK_MEMBER (lnet_hdr_t, msg.get.ptl_index); + CHECK_MEMBER (lnet_hdr_t, msg.get.src_offset); + CHECK_MEMBER (lnet_hdr_t, msg.get.sink_length); BLANK_LINE (); COMMENT ("Reply"); - CHECK_MEMBER (ptl_hdr_t, msg.reply.dst_wmd); + CHECK_MEMBER (lnet_hdr_t, msg.reply.dst_wmd); BLANK_LINE (); COMMENT ("Hello"); - CHECK_MEMBER (ptl_hdr_t, msg.hello.incarnation); - CHECK_MEMBER (ptl_hdr_t, msg.hello.type); + CHECK_MEMBER (lnet_hdr_t, msg.hello.incarnation); + CHECK_MEMBER (lnet_hdr_t, msg.hello.type); } void @@ -174,13 +173,13 @@ system_string (char *cmdline, char *str, int len) int main (int argc, char **argv) { - char unameinfo[80]; - char gccinfo[80]; + char unameinfo[256]; + char gccinfo[256]; system_string("uname -a", unameinfo, sizeof(unameinfo)); system_string("gcc -v 2>&1 | tail -1", gccinfo, sizeof(gccinfo)); - printf ("void lib_assert_wire_constants (void)\n" + printf ("void lnet_assert_wire_constants (void)\n" "{\n" " /* Wire protocol assertions generated by 'wirecheck'\n" " * running on %s\n" @@ -190,19 +189,23 @@ main (int argc, char **argv) BLANK_LINE (); COMMENT ("Constants..."); - CHECK_DEFINE (PORTALS_PROTO_MAGIC); - CHECK_DEFINE (PORTALS_PROTO_VERSION_MAJOR); - CHECK_DEFINE (PORTALS_PROTO_VERSION_MINOR); - - CHECK_VALUE (PTL_MSG_ACK); - CHECK_VALUE (PTL_MSG_PUT); - CHECK_VALUE (PTL_MSG_GET); - CHECK_VALUE (PTL_MSG_REPLY); - CHECK_VALUE (PTL_MSG_HELLO); - - check_ptl_handle_wire (); - check_ptl_magicversion (); - check_ptl_hdr (); + + CHECK_DEFINE (LNET_PROTO_OPENIB_MAGIC); + CHECK_DEFINE (LNET_PROTO_RA_MAGIC); + + CHECK_DEFINE (LNET_PROTO_TCP_MAGIC); + CHECK_DEFINE (LNET_PROTO_TCP_VERSION_MAJOR); + CHECK_DEFINE (LNET_PROTO_TCP_VERSION_MINOR); + + CHECK_VALUE (LNET_MSG_ACK); + CHECK_VALUE (LNET_MSG_PUT); + CHECK_VALUE (LNET_MSG_GET); + CHECK_VALUE (LNET_MSG_REPLY); + CHECK_VALUE (LNET_MSG_HELLO); + + check_lnet_handle_wire (); + check_lnet_magicversion (); + check_lnet_hdr (); printf ("}\n\n"); -- 1.8.3.1
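
Notes on the conventions this patch introduces. Every jt_ptl_* handler in the new portals.c follows one shape: zero a struct libcfs_ioctl_data with LIBCFS_IOC_INIT(), stamp in the currently selected network (g_net), load the overloaded ioc_u32[]/ioc_u64[] slots the opcode expects, and make a single l_ioctl() call against LNET_DEV_ID. A minimal sketch of that shape, reusing the slot layout jt_ptl_add_peer() uses for IOC_LIBCFS_ADD_PEER (the helper name itself is hypothetical, and the surrounding portals.c context -- headers, g_net -- is assumed):

    /* Hypothetical helper illustrating the libcfs ioctl convention.
     * Slot layout mirrors jt_ptl_add_peer() above:
     * u32[0] = peer IP, u32[1] = peer port. */
    static int example_add_peer(lnet_nid_t nid, __u32 ipaddr, int port)
    {
            struct libcfs_ioctl_data data;
            int rc;

            LIBCFS_IOC_INIT(data);          /* zero the struct, set the ioctl header */
            data.ioc_net    = g_net;        /* network selected with the "network" command */
            data.ioc_nid    = nid;
            data.ioc_u32[0] = ipaddr;
            data.ioc_u32[1] = port;

            rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_ADD_PEER, &data);
            if (rc != 0)
                    fprintf(stderr, "add peer failed: %s\n", strerror(errno));
            return rc;
    }

The slot meanings are per-opcode, which is why the handlers above annotate each ioc_u32[] index with a comment.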
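The libcfs_str2nid()/libcfs_nid2str() pair that replaces ptl_parse_nid()/ptl_nid2str() works on "address@network" strings and reports a parse failure through the LNET_NID_ANY sentinel rather than a return code, which is why every caller above compares against that value. A small usage sketch (example_parse is illustrative; the libcfs helpers are the real API):

    static int example_parse(const char *str)
    {
            lnet_nid_t nid = libcfs_str2nid(str);   /* e.g. "192.168.0.10@tcp" */

            if (nid == LNET_NID_ANY)                /* parse-failure sentinel */
                    return -1;

            /* libcfs_nid2str() formats into one of a small ring of static
             * buffers, which is what lets two calls sit safely inside a
             * single printf() in jt_ptl_print_active_txs() above. */
            printf("parsed %s on net %s\n",
                   libcfs_nid2str(nid), libcfs_net2str(LNET_NIDNET(nid)));
            return 0;
    }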
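In routerstat.c, subul()/subull() turn two samples of a monotonically increasing kernel counter into the number of events in the interval, even when the counter wraps between samples: for an unsigned type, -1 - b + a + 1 is exactly the modular difference a - b. For instance, on a platform with a 32-bit unsigned long (sample values here are invented):

    /* assumes the subul() helper from routerstat.c above */
    static void wrap_example(void)
    {
            unsigned long old_val = 0xfffffff0UL;   /* sampled just before the wrap */
            unsigned long new_val = 0x00000010UL;   /* sampled just after it */

            /* (-1UL - 0xfffffff0UL) + 0x10 + 1 == 0x20: the 32 events that
             * really happened in the interval */
            unsigned long long delta = subul(new_val, old_val);

            printf("%llu events\n", delta);
    }

Plain unsigned subtraction would wrap the same way; the helpers make the intent explicit and widen the result for printing.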
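wirecheck keeps its old contract under the new names: run against the current headers, it emits a C function (now lnet_assert_wire_constants()) made entirely of compile-time checks, e.g. "CLASSERT (LNET_MSG_ACK == 0);", so wire-protocol drift breaks the build instead of surfacing at runtime. CLASSERT only has to be a construct that refuses to compile when its argument is zero; the classic duplicate-case-label trick below is one realization (the real libcfs definition may differ in detail):

    /* Compile-time assertion: duplicate case labels are a compile error,
     * so this builds only when (cond) is a non-zero constant expression. */
    #define CLASSERT(cond)                  \
    do {                                    \
            switch (42) {                   \
            case (cond):                    \
            case 0:                         \
                    break;                  \
            }                               \
    } while (0)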